diff --git a/tasks.md b/tasks.md index 4b9e7e5..e713a66 100644 --- a/tasks.md +++ b/tasks.md @@ -305,7 +305,7 @@ ### M25. Добрать failure-path observability regression tests - Субагент: `test-engineer` -- Статус: pending +- Статус: completed - Зависимости: `M24` - Commit required: yes - Commit message: `add sandbox observability failure tests` diff --git a/test/test_docker_runtime.py b/test/test_docker_runtime.py index 4db1095..352adad 100644 --- a/test/test_docker_runtime.py +++ b/test/test_docker_runtime.py @@ -43,6 +43,27 @@ class FakeListedContainer(FakeContainer): self.attrs = {'Created': created_at} +class FailingStopContainer(FakeListedContainer): + def __init__( + self, + container_id: str, + *, + labels: dict[str, str], + created_at: str, + error: Exception, + ) -> None: + super().__init__( + container_id, + labels=labels, + created_at=created_at, + ) + self._error = error + + def stop(self) -> None: + self.stop_calls += 1 + raise self._error + + class RunKwargs(TypedDict): detach: bool labels: dict[str, str] @@ -435,23 +456,143 @@ def test_runtime_stop_ignores_missing_container(tmp_path: Path) -> None: config = build_config(tmp_path) containers = FakeContainers() containers.get_result = NotFound('missing') - runtime = build_runtime(config, containers) + metrics = RecordingMetrics() + tracer = RecordingTracer() + runtime = DockerSandboxRuntime( + config, + FakeDockerClient(containers), + metrics, + tracer, + ) runtime.stop('container-123') assert containers.get_calls == ['container-123'] + duration_call = _find_record_call( + metrics, + 'sandbox.runtime.stop.duration_ms', + attrs={'operation': 'stop', 'result': 'not_found'}, + ) + assert duration_call[1] >= 0 + span = _find_span( + tracer, + 'adapter.docker.stop_sandbox', + {'container.id': 'container-123'}, + {'sandbox.result': 'not_found'}, + ) + assert not span.errors + stop_error_calls = [ + call + for call in metrics.increment_calls + if call[0] == 'sandbox.runtime.error.total' + and call[2] is not None + and call[2].get('operation') == 'stop' + ] + assert stop_error_calls == [] def test_runtime_stop_wraps_docker_errors(tmp_path: Path) -> None: config = build_config(tmp_path) containers = FakeContainers() - containers.get_result = DockerException('boom') - runtime = build_runtime(config, containers) + containers.get_result = FailingStopContainer( + 'container-123', + labels={ + 'session_id': str(SESSION_ID), + 'chat_id': str(CHAT_ID), + 'expires_at': '2026-04-02T12:05:00+00:00', + }, + created_at='2026-04-02T12:00:00Z', + error=DockerException('boom'), + ) + metrics = RecordingMetrics() + tracer = RecordingTracer() + runtime = DockerSandboxRuntime( + config, + FakeDockerClient(containers), + metrics, + tracer, + ) with pytest.raises(SandboxError) as excinfo: runtime.stop('container-123') assert str(excinfo.value) == 'sandbox_stop_failed' + _find_increment_call( + metrics, + 'sandbox.runtime.error.total', + attrs={'operation': 'stop', 'error.type': 'DockerException'}, + ) + duration_call = _find_record_call( + metrics, + 'sandbox.runtime.stop.duration_ms', + attrs={'operation': 'stop', 'result': 'error'}, + ) + assert duration_call[1] >= 0 + span = _find_span( + tracer, + 'adapter.docker.stop_sandbox', + {'container.id': 'container-123'}, + { + 'session.id': str(SESSION_ID), + 'chat.id': str(CHAT_ID), + 'sandbox.result': 'error', + }, + ) + cause = excinfo.value.__cause__ + assert isinstance(cause, DockerException) + assert cause in span.errors + + +def test_runtime_stop_records_observability_on_success(tmp_path: Path) -> None: + config = build_config(tmp_path) + containers = FakeContainers() + container = FakeListedContainer( + 'container-123', + labels={ + 'session_id': str(SESSION_ID), + 'chat_id': str(CHAT_ID), + 'expires_at': '2026-04-02T12:05:00+00:00', + }, + created_at='2026-04-02T12:00:00Z', + ) + containers.get_result = container + metrics = RecordingMetrics() + tracer = RecordingTracer() + runtime = DockerSandboxRuntime( + config, + FakeDockerClient(containers), + metrics, + tracer, + ) + + runtime.stop('container-123') + + assert container.stop_calls == 1 + duration_call = _find_record_call( + metrics, + 'sandbox.runtime.stop.duration_ms', + attrs={'operation': 'stop', 'result': 'stopped'}, + ) + assert duration_call[1] >= 0 + span = _find_span( + tracer, + 'adapter.docker.stop_sandbox', + {'container.id': 'container-123'}, + { + 'session.id': str(SESSION_ID), + 'chat.id': str(CHAT_ID), + 'sandbox.result': 'stopped', + }, + ) + assert not span.errors + stop_error_calls = [ + call + for call in metrics.increment_calls + if call[0] == 'sandbox.runtime.error.total' + and call[2] is not None + and call[2].get('operation') == 'stop' + ] + assert stop_error_calls == [] def test_runtime_list_active_sessions_reads_valid_labeled_containers( diff --git a/test/test_sandbox_usecase.py b/test/test_sandbox_usecase.py index 92c7937..068204c 100644 --- a/test/test_sandbox_usecase.py +++ b/test/test_sandbox_usecase.py @@ -281,6 +281,22 @@ class StaleSnapshotRepository(InMemorySandboxSessionRepository): return [self._snapshot] +class FailingSaveRepository(InMemorySandboxSessionRepository): + def __init__(self, error: Exception) -> None: + super().__init__() + self._error = error + self._fail_next_save = False + + def fail_next_save(self) -> None: + self._fail_next_save = True + + def save(self, session: SandboxSession) -> None: + if self._fail_next_save: + self._fail_next_save = False + raise self._error + super().save(session) + + class FakeRuntime: def __init__(self) -> None: self.create_calls: list[dict[str, object]] = [] @@ -706,6 +722,64 @@ def test_create_sandbox_replace_stop_failure_preserves_separate_identities( assert excinfo.value in span.errors +def test_create_sandbox_replace_save_failure_records_stage_safe_trace_ids( + monkeypatch, +) -> None: + now = datetime(2026, 4, 2, 12, 0, tzinfo=UTC) + expired_session = SandboxSession( + session_id=SESSION_OLD_ID, + chat_id=CHAT_ID, + container_id='container-old', + status=SandboxStatus.RUNNING, + created_at=now - timedelta(minutes=10), + expires_at=now, + ) + repository = FailingSaveRepository(RuntimeError('save_failed')) + repository.save(expired_session) + repository.fail_next_save() + metrics = RecordingMetrics() + tracer = RecordingTracer() + runtime = FakeRuntime() + usecase = CreateSandbox( + repository=repository, + locker=FakeLocker(), + runtime=runtime, + clock=FakeClock(now), + logger=FakeLogger(), + metrics=metrics, + tracer=tracer, + ttl=timedelta(minutes=5), + ) + monkeypatch.setattr('usecase.sandbox._new_session_id', lambda: SESSION_NEW_ID) + + with pytest.raises(RuntimeError, match='save_failed') as excinfo: + usecase.execute(CreateSandboxCommand(chat_id=CHAT_ID)) + + assert runtime.stop_calls == ['container-old'] + assert len(runtime.create_calls) == 1 + assert repository.get_active_by_chat_id(CHAT_ID) is None + _assert_increment_metric_present( + metrics, + 'sandbox.create.total', + attrs={'result': 'error'}, + ) + span = _find_span( + tracer, + 'usecase.create_sandbox', + {'chat.id': str(CHAT_ID)}, + { + 'sandbox.previous_session.id': str(SESSION_OLD_ID), + 'sandbox.previous_container.id': 'container-old', + 'sandbox.new_session.id': str(SESSION_NEW_ID), + 'sandbox.new_container.id': f'container-{SESSION_NEW_ID}', + 'sandbox.result': 'error', + }, + ) + assert 'session.id' not in span.attrs + assert 'container.id' not in span.attrs + assert excinfo.value in span.errors + + def test_create_sandbox_serializes_duplicate_concurrent_create_for_chat_id( monkeypatch, ) -> None: @@ -982,6 +1056,8 @@ def test_cleanup_expired_sandboxes_continues_after_stop_failure() -> None: repository.save(cleaned_session) runtime = FailingStopRuntime('container-fail') logger = FakeLogger() + metrics = RecordingMetrics() + tracer = RecordingTracer() locker = FakeLocker() usecase = CleanupExpiredSandboxes( repository=repository, @@ -989,8 +1065,8 @@ def test_cleanup_expired_sandboxes_continues_after_stop_failure() -> None: runtime=runtime, clock=FakeClock(now), logger=logger, - metrics=NoopMetrics(), - tracer=NoopTracer(), + metrics=metrics, + tracer=tracer, ) result = usecase.execute() @@ -1021,3 +1097,48 @@ def test_cleanup_expired_sandboxes_continues_after_stop_failure() -> None: }, ), ] + _assert_increment_metric_present( + metrics, + 'sandbox.cleanup.error.total', + attrs={'error.type': 'RuntimeError'}, + ) + _assert_increment_metric_present( + metrics, + 'sandbox.cleanup.total', + attrs={'result': 'cleaned'}, + ) + assert _active_count_values(metrics) + assert _active_count_values(metrics)[-1] == 1 + root_span = _find_span( + tracer, + 'usecase.cleanup_expired_sandboxes', + span_attrs={ + 'sandbox.expired_count': 2, + 'sandbox.cleaned_count': 1, + 'sandbox.error_count': 1, + 'sandbox.result': 'completed_with_errors', + }, + ) + assert not root_span.errors + failed_span = _find_span( + tracer, + 'usecase.cleanup_expired_sandbox', + { + 'chat.id': str(FAIL_CHAT_ID), + 'session.id': str(SESSION_FAIL_ID), + 'container.id': 'container-fail', + }, + {'sandbox.result': 'error'}, + ) + assert [str(error) for error in failed_span.errors] == ['stop_failed'] + cleaned_span = _find_span( + tracer, + 'usecase.cleanup_expired_sandbox', + { + 'chat.id': str(CLEAN_CHAT_ID), + 'session.id': str(SESSION_CLEAN_ID), + 'container.id': 'container-clean', + }, + {'sandbox.result': 'cleaned'}, + ) + assert not cleaned_span.errors