add sandbox observability failure tests

2026-04-03 02:04:51 +03:00 · 2026-04-03 02:04:51 +03:00 · b4a2a9ceea
commit b4a2a9ceea
parent 02770bce7d
3 changed files with 268 additions and 6 deletions
--- a/test/test_sandbox_usecase.py
+++ b/test/test_sandbox_usecase.py
@ -281,6 +281,22 @@ class StaleSnapshotRepository(InMemorySandboxSessionRepository):
        return [self._snapshot]


+class FailingSaveRepository(InMemorySandboxSessionRepository):  # test double: save() can be armed to raise once
+    def __init__(self, error: Exception) -> None:
+        super().__init__()
+        self._error = error  # exception instance raised by an armed save()
+        self._fail_next_save = False  # starts disarmed, so save() delegates to the parent
+
+    def fail_next_save(self) -> None:  # arm a failure for the next save() call only
+        self._fail_next_save = True
+
+    def save(self, session: SandboxSession) -> None:
+        if self._fail_next_save:
+            self._fail_next_save = False  # one-shot: disarm before raising so later saves go through
+            raise self._error
+        super().save(session)
+
+
 class FakeRuntime:
    def __init__(self) -> None:
        self.create_calls: list[dict[str, object]] = []
@ -706,6 +722,64 @@ def test_create_sandbox_replace_stop_failure_preserves_separate_identities(
    assert excinfo.value in span.errors


+def test_create_sandbox_replace_save_failure_records_stage_safe_trace_ids(
+    monkeypatch,
+) -> None:  # a save() failure during replacement must surface as error metric + span with previous/new ids
+    now = datetime(2026, 4, 2, 12, 0, tzinfo=UTC)
+    expired_session = SandboxSession(
+        session_id=SESSION_OLD_ID,
+        chat_id=CHAT_ID,
+        container_id='container-old',
+        status=SandboxStatus.RUNNING,
+        created_at=now - timedelta(minutes=10),
+        expires_at=now,  # equals the FakeClock time below; presumably counts as expired - confirm in CreateSandbox
+    )
+    repository = FailingSaveRepository(RuntimeError('save_failed'))
+    repository.save(expired_session)  # seeded before arming, so this save succeeds
+    repository.fail_next_save()  # the next save() (made inside the use case) raises RuntimeError('save_failed')
+    metrics = RecordingMetrics()
+    tracer = RecordingTracer()
+    runtime = FakeRuntime()
+    usecase = CreateSandbox(
+        repository=repository,
+        locker=FakeLocker(),
+        runtime=runtime,
+        clock=FakeClock(now),
+        logger=FakeLogger(),
+        metrics=metrics,
+        tracer=tracer,
+        ttl=timedelta(minutes=5),
+    )
+    monkeypatch.setattr('usecase.sandbox._new_session_id', lambda: SESSION_NEW_ID)  # pin the id asserted below
+
+    with pytest.raises(RuntimeError, match='save_failed') as excinfo:
+        usecase.execute(CreateSandboxCommand(chat_id=CHAT_ID))
+
+    assert runtime.stop_calls == ['container-old']  # only the previous container was stopped
+    assert len(runtime.create_calls) == 1  # exactly one create call was made before the failure surfaced
+    assert repository.get_active_by_chat_id(CHAT_ID) is None  # no active session remains for the chat
+    _assert_increment_metric_present(
+        metrics,
+        'sandbox.create.total',
+        attrs={'result': 'error'},
+    )
+    span = _find_span(
+        tracer,
+        'usecase.create_sandbox',
+        {'chat.id': str(CHAT_ID)},  # NOTE(review): role of this positional dict is defined by _find_span - confirm
+        {
+            'sandbox.previous_session.id': str(SESSION_OLD_ID),
+            'sandbox.previous_container.id': 'container-old',
+            'sandbox.new_session.id': str(SESSION_NEW_ID),
+            'sandbox.new_container.id': f'container-{SESSION_NEW_ID}',
+            'sandbox.result': 'error',
+        },
+    )
+    assert 'session.id' not in span.attrs  # generic key must be absent; only previous/new-prefixed ids expected
+    assert 'container.id' not in span.attrs  # same for the generic container key
+    assert excinfo.value in span.errors  # the raised exception object itself is recorded on the span
+
+
 def test_create_sandbox_serializes_duplicate_concurrent_create_for_chat_id(
    monkeypatch,
 ) -> None:
@ -982,6 +1056,8 @@ def test_cleanup_expired_sandboxes_continues_after_stop_failure() -> None:
    repository.save(cleaned_session)
    runtime = FailingStopRuntime('container-fail')
    logger = FakeLogger()
+    metrics = RecordingMetrics()
+    tracer = RecordingTracer()
    locker = FakeLocker()
    usecase = CleanupExpiredSandboxes(
        repository=repository,
@ -989,8 +1065,8 @@ def test_cleanup_expired_sandboxes_continues_after_stop_failure() -> None:
        runtime=runtime,
        clock=FakeClock(now),
        logger=logger,
-        metrics=NoopMetrics(),
-        tracer=NoopTracer(),
+        metrics=metrics,
+        tracer=tracer,
    )

    result = usecase.execute()
@ -1021,3 +1097,48 @@ def test_cleanup_expired_sandboxes_continues_after_stop_failure() -> None:
            },
        ),
    ]
+    _assert_increment_metric_present(
+        metrics,
+        'sandbox.cleanup.error.total',
+        attrs={'error.type': 'RuntimeError'},
+    )
+    _assert_increment_metric_present(
+        metrics,
+        'sandbox.cleanup.total',
+        attrs={'result': 'cleaned'},
+    )
+    assert _active_count_values(metrics)
+    assert _active_count_values(metrics)[-1] == 1
+    root_span = _find_span(
+        tracer,
+        'usecase.cleanup_expired_sandboxes',
+        span_attrs={
+            'sandbox.expired_count': 2,
+            'sandbox.cleaned_count': 1,
+            'sandbox.error_count': 1,
+            'sandbox.result': 'completed_with_errors',
+        },
+    )
+    assert not root_span.errors
+    failed_span = _find_span(
+        tracer,
+        'usecase.cleanup_expired_sandbox',
+        {
+            'chat.id': str(FAIL_CHAT_ID),
+            'session.id': str(SESSION_FAIL_ID),
+            'container.id': 'container-fail',
+        },
+        {'sandbox.result': 'error'},
+    )
+    assert [str(error) for error in failed_span.errors] == ['stop_failed']
+    cleaned_span = _find_span(
+        tracer,
+        'usecase.cleanup_expired_sandbox',
+        {
+            'chat.id': str(CLEAN_CHAT_ID),
+            'session.id': str(SESSION_CLEAN_ID),
+            'container.id': 'container-clean',
+        },
+        {'sandbox.result': 'cleaned'},
+    )
+    assert not cleaned_span.errors