From 0698ddb49618646c6a576fe7d8e15d8503604c5a Mon Sep 17 00:00:00 2001
From: Mibayy <mibayy@github.com>
Date: Sun, 22 Mar 2026 11:20:27 +0000
Subject: [PATCH] fix(compression): remove hardcoded gemini-3-flash-preview as
 default summary model

Closes #2453

The built-in defaults (load_cli_config() in cli.py and DEFAULT_CONFIG in
hermes_cli/config.py) hardcoded google/gemini-3-flash-preview as the
summary_model for context compression. This caused unexpected OpenRouter
charges for users who configured a different provider/model, because the
compression task would silently fall back to Gemini via OpenRouter even
when the user's main model was on a different provider.

Fix: change summary_model default to empty string. When empty,
call_llm() resolves the model through the standard auto-detection chain
(auxiliary.compression config -> env vars -> main provider), which
correctly uses the user's configured provider and model.

Users who want a dedicated cheap model for compression can still
explicitly set compression.summary_model in their config.yaml.
---
 cli.py               | 2 +-
 hermes_cli/config.py | 5 +++--
 2 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/cli.py b/cli.py
index ac9231b0..931610bf 100644
--- a/cli.py
+++ b/cli.py
@@ -180,7 +180,7 @@ def load_cli_config() -> Dict[str, Any]:
         "compression": {
             "enabled": True,      # Auto-compress when approaching context limit
             "threshold": 0.50,    # Compress at 50% of model's context limit
-            "summary_model": "google/gemini-3-flash-preview",  # Fast/cheap model for summaries
+            "summary_model": "",  # Model for summaries (empty = use main model)
         },
         "smart_model_routing": {
             "enabled": False,
diff --git a/hermes_cli/config.py b/hermes_cli/config.py
index b2fd27c1..b0e14f2a 100644
--- a/hermes_cli/config.py
+++ b/hermes_cli/config.py
@@ -159,7 +159,7 @@ DEFAULT_CONFIG = {
     "compression": {
         "enabled": True,
         "threshold": 0.50,
-        "summary_model": "google/gemini-3-flash-preview",
+        "summary_model": "",  # empty = use main configured model
         "summary_provider": "auto",
         "summary_base_url": None,
     },
@@ -1659,7 +1659,8 @@ def show_config():
     print(f"  Enabled:      {'yes' if enabled else 'no'}")
     if enabled:
         print(f"  Threshold:    {compression.get('threshold', 0.50) * 100:.0f}%")
-        print(f"  Model:        {compression.get('summary_model', 'google/gemini-3-flash-preview')}")
+        _sm = compression.get('summary_model', '') or '(main model)'
+        print(f"  Model:        {_sm}")
         comp_provider = compression.get('summary_provider', 'auto')
         if comp_provider != 'auto':
             print(f"  Provider:     {comp_provider}")