fix: suppress verbose logs during streaming TTS, improve hallucination filter, stop continuous mode on errors

- Add _vprint() helper to suppress log output when stream_callback is active
- Expand Whisper hallucination filter with multi-language phrases and regex pattern for repetitive text
- Stop continuous voice mode when agent returns a failed result (e.g. 429 rate limit)
This commit is contained in:
0xbyt4 2026-03-06 01:51:10 +03:00
parent 3a1b35ed92
commit b00c5949fc
3 changed files with 130 additions and 70 deletions

5
cli.py
View file

@ -4242,6 +4242,11 @@ class HermesCLI:
if result and result.get("failed") and not response: if result and result.get("failed") and not response:
error_detail = result.get("error", "Unknown error") error_detail = result.get("error", "Unknown error")
response = f"Error: {error_detail}" response = f"Error: {error_detail}"
# Stop continuous voice mode on persistent errors (e.g. 429 rate limit)
# to avoid an infinite error → record → error loop
if self._voice_continuous:
self._voice_continuous = False
_cprint(f"\n{_DIM}Continuous voice mode stopped due to error.{_RST}")
# Handle interrupt - check if we were interrupted # Handle interrupt - check if we were interrupted
pending_message = None pending_message = None

View file

@ -493,6 +493,10 @@ class AIAgent:
]: ]:
logging.getLogger(quiet_logger).setLevel(logging.ERROR) logging.getLogger(quiet_logger).setLevel(logging.ERROR)
# Internal stream callback (set during streaming TTS).
# Initialized here so _vprint can reference it before run_conversation.
self._stream_callback = None
# Initialize LLM client via centralized provider router. # Initialize LLM client via centralized provider router.
# The router handles auth resolution, base URL, headers, and # The router handles auth resolution, base URL, headers, and
# Codex/Anthropic wrapping for all known providers. # Codex/Anthropic wrapping for all known providers.
@ -812,6 +816,12 @@ class AIAgent:
else: else:
print(f"📊 Context limit: {self.context_compressor.context_length:,} tokens (auto-compression disabled)") print(f"📊 Context limit: {self.context_compressor.context_length:,} tokens (auto-compression disabled)")
def _vprint(self, *args, **kwargs):
"""Verbose print — suppressed when streaming TTS is active."""
if getattr(self, "_stream_callback", None) is not None:
return
print(*args, **kwargs)
def _max_tokens_param(self, value: int) -> dict: def _max_tokens_param(self, value: int) -> dict:
"""Return the correct max tokens kwarg for the current provider. """Return the correct max tokens kwarg for the current provider.
@ -1340,7 +1350,7 @@ class AIAgent:
encoding="utf-8", encoding="utf-8",
) )
print(f"{self.log_prefix}🧾 Request debug dump written to: {dump_file}") self._vprint(f"{self.log_prefix}🧾 Request debug dump written to: {dump_file}")
if os.getenv("HERMES_DUMP_REQUEST_STDOUT", "").strip().lower() in {"1", "true", "yes", "on"}: if os.getenv("HERMES_DUMP_REQUEST_STDOUT", "").strip().lower() in {"1", "true", "yes", "on"}:
print(json.dumps(dump_payload, ensure_ascii=False, indent=2, default=str)) print(json.dumps(dump_payload, ensure_ascii=False, indent=2, default=str))
@ -1482,7 +1492,7 @@ class AIAgent:
# Replay the items into the store (replace mode) # Replay the items into the store (replace mode)
self._todo_store.write(last_todo_response, merge=False) self._todo_store.write(last_todo_response, merge=False)
if not self.quiet_mode: if not self.quiet_mode:
print(f"{self.log_prefix}📋 Restored {len(last_todo_response)} todo item(s) from history") self._vprint(f"{self.log_prefix}📋 Restored {len(last_todo_response)} todo item(s) from history")
_set_interrupt(False) _set_interrupt(False)
@property @property
@ -3578,7 +3588,7 @@ class AIAgent:
if self._interrupt_requested: if self._interrupt_requested:
remaining_calls = assistant_message.tool_calls[i-1:] remaining_calls = assistant_message.tool_calls[i-1:]
if remaining_calls: if remaining_calls:
print(f"{self.log_prefix}⚡ Interrupt: skipping {len(remaining_calls)} tool call(s)") self._vprint(f"{self.log_prefix}⚡ Interrupt: skipping {len(remaining_calls)} tool call(s)")
for skipped_tc in remaining_calls: for skipped_tc in remaining_calls:
skipped_name = skipped_tc.function.name skipped_name = skipped_tc.function.name
skip_msg = { skip_msg = {
@ -3640,7 +3650,7 @@ class AIAgent:
) )
tool_duration = time.time() - tool_start_time tool_duration = time.time() - tool_start_time
if self.quiet_mode: if self.quiet_mode:
print(f" {_get_cute_tool_message_impl('todo', function_args, tool_duration, result=function_result)}") self._vprint(f" {_get_cute_tool_message_impl('todo', function_args, tool_duration, result=function_result)}")
elif function_name == "session_search": elif function_name == "session_search":
if not self._session_db: if not self._session_db:
function_result = json.dumps({"success": False, "error": "Session database not available."}) function_result = json.dumps({"success": False, "error": "Session database not available."})
@ -3655,7 +3665,7 @@ class AIAgent:
) )
tool_duration = time.time() - tool_start_time tool_duration = time.time() - tool_start_time
if self.quiet_mode: if self.quiet_mode:
print(f" {_get_cute_tool_message_impl('session_search', function_args, tool_duration, result=function_result)}") self._vprint(f" {_get_cute_tool_message_impl('session_search', function_args, tool_duration, result=function_result)}")
elif function_name == "memory": elif function_name == "memory":
target = function_args.get("target", "memory") target = function_args.get("target", "memory")
from tools.memory_tool import memory_tool as _memory_tool from tools.memory_tool import memory_tool as _memory_tool
@ -3671,7 +3681,7 @@ class AIAgent:
self._honcho_save_user_observation(function_args.get("content", "")) self._honcho_save_user_observation(function_args.get("content", ""))
tool_duration = time.time() - tool_start_time tool_duration = time.time() - tool_start_time
if self.quiet_mode: if self.quiet_mode:
print(f" {_get_cute_tool_message_impl('memory', function_args, tool_duration, result=function_result)}") self._vprint(f" {_get_cute_tool_message_impl('memory', function_args, tool_duration, result=function_result)}")
elif function_name == "clarify": elif function_name == "clarify":
from tools.clarify_tool import clarify_tool as _clarify_tool from tools.clarify_tool import clarify_tool as _clarify_tool
function_result = _clarify_tool( function_result = _clarify_tool(
@ -3681,7 +3691,7 @@ class AIAgent:
) )
tool_duration = time.time() - tool_start_time tool_duration = time.time() - tool_start_time
if self.quiet_mode: if self.quiet_mode:
print(f" {_get_cute_tool_message_impl('clarify', function_args, tool_duration, result=function_result)}") self._vprint(f" {_get_cute_tool_message_impl('clarify', function_args, tool_duration, result=function_result)}")
elif function_name == "delegate_task": elif function_name == "delegate_task":
from tools.delegate_tool import delegate_task as _delegate_task from tools.delegate_tool import delegate_task as _delegate_task
tasks_arg = function_args.get("tasks") tasks_arg = function_args.get("tasks")
@ -3714,8 +3724,8 @@ class AIAgent:
if spinner: if spinner:
spinner.stop(cute_msg) spinner.stop(cute_msg)
elif self.quiet_mode: elif self.quiet_mode:
print(f" {cute_msg}") self._vprint(f" {cute_msg}")
elif self.quiet_mode: elif self.quiet_mode and self._stream_callback is None:
face = random.choice(KawaiiSpinner.KAWAII_WAITING) face = random.choice(KawaiiSpinner.KAWAII_WAITING)
tool_emoji_map = { tool_emoji_map = {
'web_search': '🔍', 'web_extract': '📄', 'web_crawl': '🕸️', 'web_search': '🔍', 'web_extract': '📄', 'web_crawl': '🕸️',
@ -3802,7 +3812,7 @@ class AIAgent:
if self._interrupt_requested and i < len(assistant_message.tool_calls): if self._interrupt_requested and i < len(assistant_message.tool_calls):
remaining = len(assistant_message.tool_calls) - i remaining = len(assistant_message.tool_calls) - i
print(f"{self.log_prefix}⚡ Interrupt: skipping {remaining} remaining tool call(s)") self._vprint(f"{self.log_prefix}⚡ Interrupt: skipping {remaining} remaining tool call(s)")
for skipped_tc in assistant_message.tool_calls[i:]: for skipped_tc in assistant_message.tool_calls[i:]:
skipped_name = skipped_tc.function.name skipped_name = skipped_tc.function.name
skip_msg = { skip_msg = {
@ -4344,11 +4354,11 @@ class AIAgent:
thinking_spinner = None thinking_spinner = None
if not self.quiet_mode: if not self.quiet_mode:
print(f"\n{self.log_prefix}🔄 Making API call #{api_call_count}/{self.max_iterations}...") self._vprint(f"\n{self.log_prefix}🔄 Making API call #{api_call_count}/{self.max_iterations}...")
print(f"{self.log_prefix} 📊 Request size: {len(api_messages)} messages, ~{approx_tokens:,} tokens (~{total_chars:,} chars)") self._vprint(f"{self.log_prefix} 📊 Request size: {len(api_messages)} messages, ~{approx_tokens:,} tokens (~{total_chars:,} chars)")
print(f"{self.log_prefix} 🔧 Available tools: {len(self.tools) if self.tools else 0}") self._vprint(f"{self.log_prefix} 🔧 Available tools: {len(self.tools) if self.tools else 0}")
else: elif self._stream_callback is None:
# Animated thinking spinner in quiet mode # Animated thinking spinner in quiet mode (skip during streaming TTS)
face = random.choice(KawaiiSpinner.KAWAII_THINKING) face = random.choice(KawaiiSpinner.KAWAII_THINKING)
verb = random.choice(KawaiiSpinner.THINKING_VERBS) verb = random.choice(KawaiiSpinner.THINKING_VERBS)
if self.thinking_callback: if self.thinking_callback:
@ -4401,7 +4411,7 @@ class AIAgent:
self.thinking_callback("") self.thinking_callback("")
if not self.quiet_mode: if not self.quiet_mode:
print(f"{self.log_prefix}⏱️ API call completed in {api_duration:.2f}s") self._vprint(f"{self.log_prefix}⏱️ API call completed in {api_duration:.2f}s")
if self.verbose_logging: if self.verbose_logging:
# Log response with provider info if available # Log response with provider info if available
@ -4478,17 +4488,17 @@ class AIAgent:
if self.verbose_logging: if self.verbose_logging:
logging.debug(f"Response attributes for invalid response: {resp_attrs}") logging.debug(f"Response attributes for invalid response: {resp_attrs}")
print(f"{self.log_prefix}⚠️ Invalid API response (attempt {retry_count}/{max_retries}): {', '.join(error_details)}") self._vprint(f"{self.log_prefix}⚠️ Invalid API response (attempt {retry_count}/{max_retries}): {', '.join(error_details)}")
print(f"{self.log_prefix} 🏢 Provider: {provider_name}") self._vprint(f"{self.log_prefix} 🏢 Provider: {provider_name}")
print(f"{self.log_prefix} 📝 Provider message: {error_msg[:200]}") self._vprint(f"{self.log_prefix} 📝 Provider message: {error_msg[:200]}")
print(f"{self.log_prefix} ⏱️ Response time: {api_duration:.2f}s (fast response often indicates rate limiting)") self._vprint(f"{self.log_prefix} ⏱️ Response time: {api_duration:.2f}s (fast response often indicates rate limiting)")
if retry_count >= max_retries: if retry_count >= max_retries:
# Try fallback before giving up # Try fallback before giving up
if self._try_activate_fallback(): if self._try_activate_fallback():
retry_count = 0 retry_count = 0
continue continue
print(f"{self.log_prefix}❌ Max retries ({max_retries}) exceeded for invalid responses. Giving up.") self._vprint(f"{self.log_prefix}❌ Max retries ({max_retries}) exceeded for invalid responses. Giving up.", force=True)
logging.error(f"{self.log_prefix}Invalid API response after {max_retries} retries.") logging.error(f"{self.log_prefix}Invalid API response after {max_retries} retries.")
self._persist_session(messages, conversation_history) self._persist_session(messages, conversation_history)
return { return {
@ -4501,14 +4511,14 @@ class AIAgent:
# Longer backoff for rate limiting (likely cause of None choices) # Longer backoff for rate limiting (likely cause of None choices)
wait_time = min(5 * (2 ** (retry_count - 1)), 120) # 5s, 10s, 20s, 40s, 80s, 120s wait_time = min(5 * (2 ** (retry_count - 1)), 120) # 5s, 10s, 20s, 40s, 80s, 120s
print(f"{self.log_prefix}⏳ Retrying in {wait_time}s (extended backoff for possible rate limit)...") self._vprint(f"{self.log_prefix}⏳ Retrying in {wait_time}s (extended backoff for possible rate limit)...")
logging.warning(f"Invalid API response (retry {retry_count}/{max_retries}): {', '.join(error_details)} | Provider: {provider_name}") logging.warning(f"Invalid API response (retry {retry_count}/{max_retries}): {', '.join(error_details)} | Provider: {provider_name}")
# Sleep in small increments to stay responsive to interrupts # Sleep in small increments to stay responsive to interrupts
sleep_end = time.time() + wait_time sleep_end = time.time() + wait_time
while time.time() < sleep_end: while time.time() < sleep_end:
if self._interrupt_requested: if self._interrupt_requested:
print(f"{self.log_prefix}⚡ Interrupt detected during retry wait, aborting.") self._vprint(f"{self.log_prefix}⚡ Interrupt detected during retry wait, aborting.")
self._persist_session(messages, conversation_history) self._persist_session(messages, conversation_history)
self.clear_interrupt() self.clear_interrupt()
return { return {
@ -4541,7 +4551,7 @@ class AIAgent:
finish_reason = response.choices[0].finish_reason finish_reason = response.choices[0].finish_reason
if finish_reason == "length": if finish_reason == "length":
print(f"{self.log_prefix}⚠️ Response truncated (finish_reason='length') - model hit max output tokens") self._vprint(f"{self.log_prefix}⚠️ Response truncated (finish_reason='length') - model hit max output tokens")
if self.api_mode == "chat_completions": if self.api_mode == "chat_completions":
assistant_message = response.choices[0].message assistant_message = response.choices[0].message
@ -4553,7 +4563,7 @@ class AIAgent:
truncated_response_prefix += assistant_message.content truncated_response_prefix += assistant_message.content
if length_continue_retries < 3: if length_continue_retries < 3:
print( self._vprint(
f"{self.log_prefix}↻ Requesting continuation " f"{self.log_prefix}↻ Requesting continuation "
f"({length_continue_retries}/3)..." f"({length_continue_retries}/3)..."
) )
@ -4585,7 +4595,7 @@ class AIAgent:
# If we have prior messages, roll back to last complete state # If we have prior messages, roll back to last complete state
if len(messages) > 1: if len(messages) > 1:
print(f"{self.log_prefix} ⏪ Rolling back to last complete assistant turn") self._vprint(f"{self.log_prefix} ⏪ Rolling back to last complete assistant turn")
rolled_back_messages = self._get_messages_up_to_last_assistant(messages) rolled_back_messages = self._get_messages_up_to_last_assistant(messages)
self._cleanup_task_resources(effective_task_id) self._cleanup_task_resources(effective_task_id)
@ -4601,7 +4611,7 @@ class AIAgent:
} }
else: else:
# First message was truncated - mark as failed # First message was truncated - mark as failed
print(f"{self.log_prefix}❌ First response truncated - cannot recover") self._vprint(f"{self.log_prefix}❌ First response truncated - cannot recover")
self._persist_session(messages, conversation_history) self._persist_session(messages, conversation_history)
return { return {
"final_response": None, "final_response": None,
@ -4661,7 +4671,7 @@ class AIAgent:
prompt = usage_dict["prompt_tokens"] prompt = usage_dict["prompt_tokens"]
hit_pct = (cached / prompt * 100) if prompt > 0 else 0 hit_pct = (cached / prompt * 100) if prompt > 0 else 0
if not self.quiet_mode: if not self.quiet_mode:
print(f"{self.log_prefix} 💾 Cache: {cached:,}/{prompt:,} tokens ({hit_pct:.0f}% hit, {written:,} written)") self._vprint(f"{self.log_prefix} 💾 Cache: {cached:,}/{prompt:,} tokens ({hit_pct:.0f}% hit, {written:,} written)")
break # Success, exit retry loop break # Success, exit retry loop
@ -4672,7 +4682,7 @@ class AIAgent:
if self.thinking_callback: if self.thinking_callback:
self.thinking_callback("") self.thinking_callback("")
api_elapsed = time.time() - api_start_time api_elapsed = time.time() - api_start_time
print(f"{self.log_prefix}⚡ Interrupted during API call.") self._vprint(f"{self.log_prefix}⚡ Interrupted during API call.", force=True)
self._persist_session(messages, conversation_history) self._persist_session(messages, conversation_history)
interrupted = True interrupted = True
final_response = f"Operation interrupted: waiting for model response ({api_elapsed:.1f}s elapsed)." final_response = f"Operation interrupted: waiting for model response ({api_elapsed:.1f}s elapsed)."
@ -4695,7 +4705,7 @@ class AIAgent:
): ):
codex_auth_retry_attempted = True codex_auth_retry_attempted = True
if self._try_refresh_codex_client_credentials(force=True): if self._try_refresh_codex_client_credentials(force=True):
print(f"{self.log_prefix}🔐 Codex auth refreshed after 401. Retrying request...") self._vprint(f"{self.log_prefix}🔐 Codex auth refreshed after 401. Retrying request...")
continue continue
if ( if (
self.api_mode == "chat_completions" self.api_mode == "chat_completions"
@ -4743,14 +4753,14 @@ class AIAgent:
error_type = type(api_error).__name__ error_type = type(api_error).__name__
error_msg = str(api_error).lower() error_msg = str(api_error).lower()
print(f"{self.log_prefix}⚠️ API call failed (attempt {retry_count}/{max_retries}): {error_type}") self._vprint(f"{self.log_prefix}⚠️ API call failed (attempt {retry_count}/{max_retries}): {error_type}")
print(f"{self.log_prefix} ⏱️ Time elapsed before failure: {elapsed_time:.2f}s") self._vprint(f"{self.log_prefix} ⏱️ Time elapsed before failure: {elapsed_time:.2f}s")
print(f"{self.log_prefix} 📝 Error: {str(api_error)[:200]}") self._vprint(f"{self.log_prefix} 📝 Error: {str(api_error)[:200]}")
print(f"{self.log_prefix} 📊 Request context: {len(api_messages)} messages, ~{approx_tokens:,} tokens, {len(self.tools) if self.tools else 0} tools") self._vprint(f"{self.log_prefix} 📊 Request context: {len(api_messages)} messages, ~{approx_tokens:,} tokens, {len(self.tools) if self.tools else 0} tools")
# Check for interrupt before deciding to retry # Check for interrupt before deciding to retry
if self._interrupt_requested: if self._interrupt_requested:
print(f"{self.log_prefix}⚡ Interrupt detected during error handling, aborting retries.") self._vprint(f"{self.log_prefix}⚡ Interrupt detected during error handling, aborting retries.")
self._persist_session(messages, conversation_history) self._persist_session(messages, conversation_history)
self.clear_interrupt() self.clear_interrupt()
return { return {
@ -4775,7 +4785,7 @@ class AIAgent:
if is_payload_too_large: if is_payload_too_large:
compression_attempts += 1 compression_attempts += 1
if compression_attempts > max_compression_attempts: if compression_attempts > max_compression_attempts:
print(f"{self.log_prefix}❌ Max compression attempts ({max_compression_attempts}) reached for payload-too-large error.") self._vprint(f"{self.log_prefix}❌ Max compression attempts ({max_compression_attempts}) reached for payload-too-large error.", force=True)
logging.error(f"{self.log_prefix}413 compression failed after {max_compression_attempts} attempts.") logging.error(f"{self.log_prefix}413 compression failed after {max_compression_attempts} attempts.")
self._persist_session(messages, conversation_history) self._persist_session(messages, conversation_history)
return { return {
@ -4785,7 +4795,7 @@ class AIAgent:
"error": f"Request payload too large: max compression attempts ({max_compression_attempts}) reached.", "error": f"Request payload too large: max compression attempts ({max_compression_attempts}) reached.",
"partial": True "partial": True
} }
print(f"{self.log_prefix}⚠️ Request payload too large (413) — compression attempt {compression_attempts}/{max_compression_attempts}...") self._vprint(f"{self.log_prefix}⚠️ Request payload too large (413) — compression attempt {compression_attempts}/{max_compression_attempts}...")
original_len = len(messages) original_len = len(messages)
messages, active_system_prompt = self._compress_context( messages, active_system_prompt = self._compress_context(
@ -4794,12 +4804,12 @@ class AIAgent:
) )
if len(messages) < original_len: if len(messages) < original_len:
print(f"{self.log_prefix} 🗜️ Compressed {original_len}{len(messages)} messages, retrying...") self._vprint(f"{self.log_prefix} 🗜️ Compressed {original_len}{len(messages)} messages, retrying...")
time.sleep(2) # Brief pause between compression retries time.sleep(2) # Brief pause between compression retries
restart_with_compressed_messages = True restart_with_compressed_messages = True
break break
else: else:
print(f"{self.log_prefix}❌ Payload too large and cannot compress further.") self._vprint(f"{self.log_prefix}❌ Payload too large and cannot compress further.")
logging.error(f"{self.log_prefix}413 payload too large. Cannot compress further.") logging.error(f"{self.log_prefix}413 payload too large. Cannot compress further.")
self._persist_session(messages, conversation_history) self._persist_session(messages, conversation_history)
return { return {
@ -4830,7 +4840,7 @@ class AIAgent:
parsed_limit = parse_context_limit_from_error(error_msg) parsed_limit = parse_context_limit_from_error(error_msg)
if parsed_limit and parsed_limit < old_ctx: if parsed_limit and parsed_limit < old_ctx:
new_ctx = parsed_limit new_ctx = parsed_limit
print(f"{self.log_prefix}⚠️ Context limit detected from API: {new_ctx:,} tokens (was {old_ctx:,})") self._vprint(f"{self.log_prefix}⚠️ Context limit detected from API: {new_ctx:,} tokens (was {old_ctx:,})", force=True)
else: else:
# Step down to the next probe tier # Step down to the next probe tier
new_ctx = get_next_probe_tier(old_ctx) new_ctx = get_next_probe_tier(old_ctx)
@ -4839,13 +4849,13 @@ class AIAgent:
compressor.context_length = new_ctx compressor.context_length = new_ctx
compressor.threshold_tokens = int(new_ctx * compressor.threshold_percent) compressor.threshold_tokens = int(new_ctx * compressor.threshold_percent)
compressor._context_probed = True compressor._context_probed = True
print(f"{self.log_prefix}⚠️ Context length exceeded — stepping down: {old_ctx:,}{new_ctx:,} tokens") self._vprint(f"{self.log_prefix}⚠️ Context length exceeded — stepping down: {old_ctx:,}{new_ctx:,} tokens", force=True)
else: else:
print(f"{self.log_prefix}⚠️ Context length exceeded at minimum tier — attempting compression...") self._vprint(f"{self.log_prefix}⚠️ Context length exceeded at minimum tier — attempting compression...", force=True)
compression_attempts += 1 compression_attempts += 1
if compression_attempts > max_compression_attempts: if compression_attempts > max_compression_attempts:
print(f"{self.log_prefix}❌ Max compression attempts ({max_compression_attempts}) reached.") self._vprint(f"{self.log_prefix}❌ Max compression attempts ({max_compression_attempts}) reached.", force=True)
logging.error(f"{self.log_prefix}Context compression failed after {max_compression_attempts} attempts.") logging.error(f"{self.log_prefix}Context compression failed after {max_compression_attempts} attempts.")
self._persist_session(messages, conversation_history) self._persist_session(messages, conversation_history)
return { return {
@ -4855,7 +4865,7 @@ class AIAgent:
"error": f"Context length exceeded: max compression attempts ({max_compression_attempts}) reached.", "error": f"Context length exceeded: max compression attempts ({max_compression_attempts}) reached.",
"partial": True "partial": True
} }
print(f"{self.log_prefix} 🗜️ Context compression attempt {compression_attempts}/{max_compression_attempts}...") self._vprint(f"{self.log_prefix} 🗜️ Context compression attempt {compression_attempts}/{max_compression_attempts}...")
original_len = len(messages) original_len = len(messages)
messages, active_system_prompt = self._compress_context( messages, active_system_prompt = self._compress_context(
@ -4865,14 +4875,14 @@ class AIAgent:
if len(messages) < original_len or new_ctx and new_ctx < old_ctx: if len(messages) < original_len or new_ctx and new_ctx < old_ctx:
if len(messages) < original_len: if len(messages) < original_len:
print(f"{self.log_prefix} 🗜️ Compressed {original_len}{len(messages)} messages, retrying...") self._vprint(f"{self.log_prefix} 🗜️ Compressed {original_len}{len(messages)} messages, retrying...")
time.sleep(2) # Brief pause between compression retries time.sleep(2) # Brief pause between compression retries
restart_with_compressed_messages = True restart_with_compressed_messages = True
break break
else: else:
# Can't compress further and already at minimum tier # Can't compress further and already at minimum tier
print(f"{self.log_prefix}❌ Context length exceeded and cannot compress further.") self._vprint(f"{self.log_prefix}❌ Context length exceeded and cannot compress further.", force=True)
print(f"{self.log_prefix} 💡 The conversation has accumulated too much content.") self._vprint(f"{self.log_prefix} 💡 The conversation has accumulated too much content.", force=True)
logging.error(f"{self.log_prefix}Context length exceeded: {approx_tokens:,} tokens. Cannot compress further.") logging.error(f"{self.log_prefix}Context length exceeded: {approx_tokens:,} tokens. Cannot compress further.")
self._persist_session(messages, conversation_history) self._persist_session(messages, conversation_history)
return { return {
@ -4908,8 +4918,8 @@ class AIAgent:
self._dump_api_request_debug( self._dump_api_request_debug(
api_kwargs, reason="non_retryable_client_error", error=api_error, api_kwargs, reason="non_retryable_client_error", error=api_error,
) )
print(f"{self.log_prefix}❌ Non-retryable client error detected. Aborting immediately.") self._vprint(f"{self.log_prefix}❌ Non-retryable client error detected. Aborting immediately.")
print(f"{self.log_prefix} 💡 This type of error won't be fixed by retrying.") self._vprint(f"{self.log_prefix} 💡 This type of error won't be fixed by retrying.")
logging.error(f"{self.log_prefix}Non-retryable client error: {api_error}") logging.error(f"{self.log_prefix}Non-retryable client error: {api_error}")
self._persist_session(messages, conversation_history) self._persist_session(messages, conversation_history)
return { return {
@ -4926,7 +4936,7 @@ class AIAgent:
if self._try_activate_fallback(): if self._try_activate_fallback():
retry_count = 0 retry_count = 0
continue continue
print(f"{self.log_prefix}❌ Max retries ({max_retries}) exceeded. Giving up.") self._vprint(f"{self.log_prefix}❌ Max retries ({max_retries}) exceeded. Giving up.", force=True)
logging.error(f"{self.log_prefix}API call failed after {max_retries} retries. Last error: {api_error}") logging.error(f"{self.log_prefix}API call failed after {max_retries} retries. Last error: {api_error}")
logging.error(f"{self.log_prefix}Request details - Messages: {len(api_messages)}, Approx tokens: {approx_tokens:,}") logging.error(f"{self.log_prefix}Request details - Messages: {len(api_messages)}, Approx tokens: {approx_tokens:,}")
raise api_error raise api_error
@ -4934,15 +4944,15 @@ class AIAgent:
wait_time = min(2 ** retry_count, 60) # Exponential backoff: 2s, 4s, 8s, 16s, 32s, 60s, 60s wait_time = min(2 ** retry_count, 60) # Exponential backoff: 2s, 4s, 8s, 16s, 32s, 60s, 60s
logging.warning(f"API retry {retry_count}/{max_retries} after error: {api_error}") logging.warning(f"API retry {retry_count}/{max_retries} after error: {api_error}")
if retry_count >= max_retries: if retry_count >= max_retries:
print(f"{self.log_prefix}⚠️ API call failed after {retry_count} attempts: {str(api_error)[:100]}") self._vprint(f"{self.log_prefix}⚠️ API call failed after {retry_count} attempts: {str(api_error)[:100]}")
print(f"{self.log_prefix}⏳ Final retry in {wait_time}s...") self._vprint(f"{self.log_prefix}⏳ Final retry in {wait_time}s...")
# Sleep in small increments so we can respond to interrupts quickly # Sleep in small increments so we can respond to interrupts quickly
# instead of blocking the entire wait_time in one sleep() call # instead of blocking the entire wait_time in one sleep() call
sleep_end = time.time() + wait_time sleep_end = time.time() + wait_time
while time.time() < sleep_end: while time.time() < sleep_end:
if self._interrupt_requested: if self._interrupt_requested:
print(f"{self.log_prefix}⚡ Interrupt detected during retry wait, aborting.") self._vprint(f"{self.log_prefix}⚡ Interrupt detected during retry wait, aborting.")
self._persist_session(messages, conversation_history) self._persist_session(messages, conversation_history)
self.clear_interrupt() self.clear_interrupt()
return { return {
@ -5006,7 +5016,7 @@ class AIAgent:
# Handle assistant response # Handle assistant response
if assistant_message.content and not self.quiet_mode: if assistant_message.content and not self.quiet_mode:
print(f"{self.log_prefix}🤖 Assistant: {assistant_message.content[:100]}{'...' if len(assistant_message.content) > 100 else ''}") self._vprint(f"{self.log_prefix}🤖 Assistant: {assistant_message.content[:100]}{'...' if len(assistant_message.content) > 100 else ''}")
# Notify progress callback of model's thinking (used by subagent # Notify progress callback of model's thinking (used by subagent
# delegation to relay the child's reasoning to the parent display). # delegation to relay the child's reasoning to the parent display).
@ -5033,15 +5043,15 @@ class AIAgent:
self._incomplete_scratchpad_retries = 0 self._incomplete_scratchpad_retries = 0
self._incomplete_scratchpad_retries += 1 self._incomplete_scratchpad_retries += 1
print(f"{self.log_prefix}⚠️ Incomplete <REASONING_SCRATCHPAD> detected (opened but never closed)") self._vprint(f"{self.log_prefix}⚠️ Incomplete <REASONING_SCRATCHPAD> detected (opened but never closed)")
if self._incomplete_scratchpad_retries <= 2: if self._incomplete_scratchpad_retries <= 2:
print(f"{self.log_prefix}🔄 Retrying API call ({self._incomplete_scratchpad_retries}/2)...") self._vprint(f"{self.log_prefix}🔄 Retrying API call ({self._incomplete_scratchpad_retries}/2)...")
# Don't add the broken message, just retry # Don't add the broken message, just retry
continue continue
else: else:
# Max retries - discard this turn and save as partial # Max retries - discard this turn and save as partial
print(f"{self.log_prefix}❌ Max retries (2) for incomplete scratchpad. Saving as partial.") self._vprint(f"{self.log_prefix}❌ Max retries (2) for incomplete scratchpad. Saving as partial.")
self._incomplete_scratchpad_retries = 0 self._incomplete_scratchpad_retries = 0
rolled_back_messages = self._get_messages_up_to_last_assistant(messages) rolled_back_messages = self._get_messages_up_to_last_assistant(messages)
@ -5084,7 +5094,7 @@ class AIAgent:
if self._codex_incomplete_retries < 3: if self._codex_incomplete_retries < 3:
if not self.quiet_mode: if not self.quiet_mode:
print(f"{self.log_prefix}↻ Codex response incomplete; continuing turn ({self._codex_incomplete_retries}/3)") self._vprint(f"{self.log_prefix}↻ Codex response incomplete; continuing turn ({self._codex_incomplete_retries}/3)")
self._session_messages = messages self._session_messages = messages
self._save_session_log(messages) self._save_session_log(messages)
continue continue
@ -5105,7 +5115,7 @@ class AIAgent:
# Check for tool calls # Check for tool calls
if assistant_message.tool_calls: if assistant_message.tool_calls:
if not self.quiet_mode: if not self.quiet_mode:
print(f"{self.log_prefix}🔧 Processing {len(assistant_message.tool_calls)} tool call(s)...") self._vprint(f"{self.log_prefix}🔧 Processing {len(assistant_message.tool_calls)} tool call(s)...")
if self.verbose_logging: if self.verbose_logging:
for tc in assistant_message.tool_calls: for tc in assistant_message.tool_calls:
@ -5124,11 +5134,30 @@ class AIAgent:
if tc.function.name not in self.valid_tool_names if tc.function.name not in self.valid_tool_names
] ]
if invalid_tool_calls: if invalid_tool_calls:
# Track retries for invalid tool calls
if not hasattr(self, '_invalid_tool_retries'):
self._invalid_tool_retries = 0
self._invalid_tool_retries += 1
# Return helpful error to model — model can self-correct next turn # Return helpful error to model — model can self-correct next turn
available = ", ".join(sorted(self.valid_tool_names)) available = ", ".join(sorted(self.valid_tool_names))
invalid_name = invalid_tool_calls[0] invalid_name = invalid_tool_calls[0]
invalid_preview = invalid_name[:80] + "..." if len(invalid_name) > 80 else invalid_name invalid_preview = invalid_name[:80] + "..." if len(invalid_name) > 80 else invalid_name
print(f"{self.log_prefix}⚠️ Unknown tool '{invalid_preview}' — sending error to model for self-correction") self._vprint(f"{self.log_prefix}⚠️ Unknown tool '{invalid_preview}' — sending error to model for self-correction ({self._invalid_tool_retries}/3)")
if self._invalid_tool_retries >= 3:
self._vprint(f"{self.log_prefix}❌ Max retries (3) for invalid tool calls exceeded. Stopping as partial.")
self._invalid_tool_retries = 0
self._persist_session(messages, conversation_history)
return {
"final_response": None,
"messages": messages,
"api_calls": api_call_count,
"completed": False,
"partial": True,
"error": f"Model generated invalid tool call: {invalid_preview}"
}
assistant_msg = self._build_assistant_message(assistant_message, finish_reason) assistant_msg = self._build_assistant_message(assistant_message, finish_reason)
messages.append(assistant_msg) messages.append(assistant_msg)
for tc in assistant_message.tool_calls: for tc in assistant_message.tool_calls:
@ -5165,15 +5194,15 @@ class AIAgent:
self._invalid_json_retries += 1 self._invalid_json_retries += 1
tool_name, error_msg = invalid_json_args[0] tool_name, error_msg = invalid_json_args[0]
print(f"{self.log_prefix}⚠️ Invalid JSON in tool call arguments for '{tool_name}': {error_msg}") self._vprint(f"{self.log_prefix}⚠️ Invalid JSON in tool call arguments for '{tool_name}': {error_msg}")
if self._invalid_json_retries < 3: if self._invalid_json_retries < 3:
print(f"{self.log_prefix}🔄 Retrying API call ({self._invalid_json_retries}/3)...") self._vprint(f"{self.log_prefix}🔄 Retrying API call ({self._invalid_json_retries}/3)...")
# Don't add anything to messages, just retry the API call # Don't add anything to messages, just retry the API call
continue continue
else: else:
# Instead of returning partial, inject a helpful message and let model recover # Instead of returning partial, inject a helpful message and let model recover
print(f"{self.log_prefix}⚠️ Injecting recovery message for invalid JSON...") self._vprint(f"{self.log_prefix}⚠️ Injecting recovery message for invalid JSON...")
self._invalid_json_retries = 0 # Reset for next attempt self._invalid_json_retries = 0 # Reset for next attempt
# Add a user message explaining the issue # Add a user message explaining the issue
@ -5203,7 +5232,7 @@ class AIAgent:
if self.quiet_mode: if self.quiet_mode:
clean = self._strip_think_blocks(turn_content).strip() clean = self._strip_think_blocks(turn_content).strip()
if clean: if clean:
print(f" ┊ 💬 {clean}") self._vprint(f" ┊ 💬 {clean}")
messages.append(assistant_msg) messages.append(assistant_msg)
@ -5279,19 +5308,19 @@ class AIAgent:
self._empty_content_retries += 1 self._empty_content_retries += 1
reasoning_text = self._extract_reasoning(assistant_message) reasoning_text = self._extract_reasoning(assistant_message)
print(f"{self.log_prefix}⚠️ Response only contains think block with no content after it") self._vprint(f"{self.log_prefix}⚠️ Response only contains think block with no content after it")
if reasoning_text: if reasoning_text:
reasoning_preview = reasoning_text[:500] + "..." if len(reasoning_text) > 500 else reasoning_text reasoning_preview = reasoning_text[:500] + "..." if len(reasoning_text) > 500 else reasoning_text
print(f"{self.log_prefix} Reasoning: {reasoning_preview}") self._vprint(f"{self.log_prefix} Reasoning: {reasoning_preview}")
else: else:
content_preview = final_response[:80] + "..." if len(final_response) > 80 else final_response content_preview = final_response[:80] + "..." if len(final_response) > 80 else final_response
print(f"{self.log_prefix} Content: '{content_preview}'") self._vprint(f"{self.log_prefix} Content: '{content_preview}'")
if self._empty_content_retries < 3: if self._empty_content_retries < 3:
print(f"{self.log_prefix}🔄 Retrying API call ({self._empty_content_retries}/3)...") self._vprint(f"{self.log_prefix}🔄 Retrying API call ({self._empty_content_retries}/3)...")
continue continue
else: else:
print(f"{self.log_prefix}❌ Max retries (3) for empty content exceeded.") self._vprint(f"{self.log_prefix}❌ Max retries (3) for empty content exceeded.")
self._empty_content_retries = 0 self._empty_content_retries = 0
# If a prior tool_calls turn had real content, salvage it: # If a prior tool_calls turn had real content, salvage it:

View file

@ -12,6 +12,7 @@ Dependencies (optional):
import logging import logging
import os import os
import platform import platform
import re
import shutil import shutil
import subprocess import subprocess
import tempfile import tempfile
@ -350,12 +351,37 @@ WHISPER_HALLUCINATIONS = {
"you", "you",
"the end.", "the end.",
"the end", "the end",
# Non-English hallucinations (common on silence)
"продолжение следует",
"продолжение следует...",
"sous-titres",
"sous-titres réalisés par la communauté d'amara.org",
"sottotitoli creati dalla comunità amara.org",
"untertitel von stephanie geiges",
"amara.org",
"www.mooji.org",
"ご視聴ありがとうございました",
} }
# Regex patterns for repetitive hallucinations (e.g. "Thank you. Thank you. Thank you.")
_HALLUCINATION_REPEAT_RE = re.compile(
r'^(?:thank you|thanks|bye|you|ok|okay|the end|\.|\s|,|!)+$',
flags=re.IGNORECASE,
)
def is_whisper_hallucination(transcript: str) -> bool:
    """Return True if *transcript* looks like a Whisper hallucination on silence.

    A transcript is treated as a hallucination when it is empty after
    stripping, matches a known filler phrase (verbatim or with trailing
    '.'/'!' removed), or consists entirely of repeated filler tokens and
    punctuation (e.g. "Thank you. Thank you. Thank you.").
    """
    text = transcript.strip().lower()

    # Empty after stripping — nothing real was said.
    if not text:
        return True

    # Known phrases, tried verbatim and with trailing punctuation removed.
    if text in WHISPER_HALLUCINATIONS or text.rstrip(".!") in WHISPER_HALLUCINATIONS:
        return True

    # Repetitive filler, e.g. "Thank you. Thank you. Thank you. you"
    return _HALLUCINATION_REPEAT_RE.match(text) is not None
# ============================================================================ # ============================================================================