fix: detect context length for custom model endpoints via fuzzy matching + config override (#2051)

* fix: detect context length for custom model endpoints via fuzzy matching + config override

Custom model endpoints (non-OpenRouter, non-known-provider) were silently
falling back to 2M tokens when the model name didn't exactly match what the
endpoint's /v1/models reported. This happened because:

1. Endpoint metadata lookup used exact match only — model name mismatches
   (e.g. 'qwen3.5:9b' vs 'Qwen3.5-9B-Q4_K_M.gguf') caused a miss
2. Single-model servers (common for local inference) required exact name
   match even though only one model was loaded
3. No user escape hatch to manually set context length

Changes:
- Add fuzzy matching for endpoint model metadata: single-model servers
  use the only available model regardless of name; multi-model servers
  try substring matching in both directions (sketched below)
- Add model.context_length config override (highest priority) so users
  can explicitly set their model's context length in config.yaml
- Log an informative message when falling back to the 2M probe tier,
  pointing users to the config override option
- Thread config_context_length through ContextCompressor and AIAgent init

Tests: 6 new tests covering fuzzy match, single-model fallback, config
override (including zero/None edge cases).
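
For illustration, the lookup order described above reduces to roughly the
following (a condensed sketch; resolve_context_length and its arguments are
hypothetical names, not the actual Hermes code):

```python
# Illustrative sketch of the lookup order: config override first, then
# exact name, then single-model fallback, then substring match both ways.
from typing import Optional

def resolve_context_length(model: str, endpoint_models: dict,
                           config_context_length: Optional[int] = None) -> Optional[int]:
    if isinstance(config_context_length, int) and config_context_length > 0:
        return config_context_length  # explicit user override wins outright
    entry = endpoint_models.get(model)  # exact match first
    if entry is None and len(endpoint_models) == 1:
        entry = next(iter(endpoint_models.values()))  # only one model loaded
    if entry is None:
        for name, candidate in endpoint_models.items():
            if model in name or name in model:  # substring either direction
                entry = candidate
                break
    return entry.get("context_length") if entry else None

# 'qwen3.5:9b' vs a single loaded 'Qwen3.5-9B-Q4_K_M.gguf' resolves via the
# single-model fallback:
# resolve_context_length("qwen3.5:9b",
#     {"Qwen3.5-9B-Q4_K_M.gguf": {"context_length": 131072}})  # -> 131072
```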

* fix: auto-detect local model name and context length for local servers

Cherry-picked from PR #2043 by sudoingX.

- Auto-detect model name from local server's /v1/models when only one
  model is loaded (no manual model name config needed)
- Add n_ctx_train and n_ctx to context length detection keys for llama.cpp
- Query llama.cpp /props endpoint for actual allocated context, not just
  the training context from GGUF metadata (see the sketch below)
- Strip the .gguf suffix from the model name shown in the banner and
  status bar
- Add _auto_detect_local_model() in runtime_provider.py for CLI init

Co-authored-by: sudo <sudoingx@users.noreply.github.com>
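
The /props handling cherry-picked here can be checked by hand against a
running server. A minimal sketch, assuming a llama.cpp server on
localhost:8080 (URL and port are illustrative):

```python
# Read the allocated context from a llama.cpp server's /props endpoint.
# The field names (default_generation_settings.n_ctx, model_alias) match
# the ones handled in this commit; the URL is an assumption.
import requests

props = requests.get("http://localhost:8080/props", timeout=5).json()
n_ctx = props.get("default_generation_settings", {}).get("n_ctx")
print(props.get("model_alias", "?"), "-> allocated n_ctx:", n_ctx)
```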

* fix: revert accidental summary_target_tokens change + add docs for context_length config

- Revert summary_target_tokens from 2500 back to 500 (accidental change
  during patching)
- Add 'Context Length Detection' section to Custom & Self-Hosted docs
  explaining model.context_length config override

---------

Co-authored-by: Test <test@test.com>
Co-authored-by: sudo <sudoingx@users.noreply.github.com>
Teknium 2026-03-19 06:01:16 -07:00 committed by GitHub
parent 7b6d14e62a
commit d76fa7fc37
8 changed files with 208 additions and 5 deletions


@@ -46,6 +46,7 @@ class ContextCompressor:
         summary_model_override: str = None,
         base_url: str = "",
         api_key: str = "",
+        config_context_length: int | None = None,
     ):
         self.model = model
         self.base_url = base_url
@@ -56,7 +57,10 @@
         self.summary_target_tokens = summary_target_tokens
         self.quiet_mode = quiet_mode
-        self.context_length = get_model_context_length(model, base_url=base_url, api_key=api_key)
+        self.context_length = get_model_context_length(
+            model, base_url=base_url, api_key=api_key,
+            config_context_length=config_context_length,
+        )
         self.threshold_tokens = int(self.context_length * threshold_percent)
         self.compression_count = 0
         self._context_probed = False  # True after a step-down from context error


@@ -136,6 +136,8 @@ _CONTEXT_LENGTH_KEYS = (
     "max_input_tokens",
     "max_sequence_length",
     "max_seq_len",
+    "n_ctx_train",
+    "n_ctx",
 )

 _MAX_COMPLETION_KEYS = (
@@ -342,6 +344,25 @@ def fetch_endpoint_model_metadata(
             entry["pricing"] = pricing
         _add_model_aliases(cache, model_id, entry)
+    # If this is a llama.cpp server, query /props for actual allocated context
+    is_llamacpp = any(
+        m.get("owned_by") == "llamacpp"
+        for m in payload.get("data", []) if isinstance(m, dict)
+    )
+    if is_llamacpp:
+        try:
+            props_url = candidate.rstrip("/").replace("/v1", "") + "/props"
+            props_resp = requests.get(props_url, headers=headers, timeout=5)
+            if props_resp.ok:
+                props = props_resp.json()
+                gen_settings = props.get("default_generation_settings", {})
+                n_ctx = gen_settings.get("n_ctx")
+                model_alias = props.get("model_alias", "")
+                if n_ctx and model_alias and model_alias in cache:
+                    cache[model_alias]["context_length"] = n_ctx
+        except Exception:
+            pass
     _endpoint_model_metadata_cache[normalized] = cache
     _endpoint_model_metadata_cache_time[normalized] = time.time()
     return cache
@@ -439,16 +460,26 @@ def parse_context_limit_from_error(error_msg: str) -> Optional[int]:
     return None

-def get_model_context_length(model: str, base_url: str = "", api_key: str = "") -> int:
+def get_model_context_length(
+    model: str,
+    base_url: str = "",
+    api_key: str = "",
+    config_context_length: int | None = None,
+) -> int:
     """Get the context length for a model.

     Resolution order:
+    0. Explicit config override (model.context_length in config.yaml)
     1. Persistent cache (previously discovered via probing)
     2. Active endpoint metadata (/models for explicit custom endpoints)
     3. OpenRouter API metadata
     4. Hardcoded DEFAULT_CONTEXT_LENGTHS (fuzzy match for hosted routes only)
     5. First probe tier (2M) will be narrowed on first context error
     """
+    # 0. Explicit config override — user knows best
+    if config_context_length is not None and isinstance(config_context_length, int) and config_context_length > 0:
+        return config_context_length
+
     # 1. Check persistent cache (model+provider)
     if base_url:
         cached = get_cached_context_length(model, base_url)
@@ -458,13 +489,30 @@ def get_model_context_length(model: str, base_url: str = "", api_key: str = "")
     # 2. Active endpoint metadata for explicit custom routes
     if _is_custom_endpoint(base_url):
         endpoint_metadata = fetch_endpoint_model_metadata(base_url, api_key=api_key)
-        if model in endpoint_metadata:
-            context_length = endpoint_metadata[model].get("context_length")
+        matched = endpoint_metadata.get(model)
+        if not matched:
+            # Single-model servers: if only one model is loaded, use it
+            if len(endpoint_metadata) == 1:
+                matched = next(iter(endpoint_metadata.values()))
+            else:
+                # Fuzzy match: substring in either direction
+                for key, entry in endpoint_metadata.items():
+                    if model in key or key in model:
+                        matched = entry
+                        break
+        if matched:
+            context_length = matched.get("context_length")
             if isinstance(context_length, int):
                 return context_length
         if not _is_known_provider_base_url(base_url):
             # Explicit third-party endpoints should not borrow fuzzy global
             # defaults from unrelated providers with similarly named models.
+            logger.info(
+                "Could not detect context length for model %r at %s; "
+                "defaulting to %s tokens (probe-down). Set model.context_length "
+                "in config.yaml to override.",
+                model, base_url, f"{CONTEXT_PROBE_TIERS[0]:,}",
+            )
             return CONTEXT_PROBE_TIERS[0]

     # 3. OpenRouter API metadata

cli.py

@@ -1046,6 +1046,14 @@ class HermesCLI:
         _config_model = _model_config.get("default", "") if isinstance(_model_config, dict) else (_model_config or "")
         _FALLBACK_MODEL = "anthropic/claude-opus-4.6"
         self.model = model or _config_model or _FALLBACK_MODEL
+        # Auto-detect model from local server if still on fallback
+        if self.model == _FALLBACK_MODEL:
+            _base_url = _model_config.get("base_url", "") if isinstance(_model_config, dict) else ""
+            if "localhost" in _base_url or "127.0.0.1" in _base_url:
+                from hermes_cli.runtime_provider import _auto_detect_local_model
+                _detected = _auto_detect_local_model(_base_url)
+                if _detected:
+                    self.model = _detected
         # Track whether model was explicitly chosen by the user or fell back
         # to the global default. Provider-specific normalisation may override
         # the default silently but should warn when overriding an explicit choice.
@@ -1251,6 +1259,8 @@ class HermesCLI:
     def _get_status_bar_snapshot(self) -> Dict[str, Any]:
         model_name = self.model or "unknown"
         model_short = model_name.split("/")[-1] if "/" in model_name else model_name
+        if model_short.endswith(".gguf"):
+            model_short = model_short[:-5]
         if len(model_short) > 26:
             model_short = f"{model_short[:23]}..."


@@ -289,6 +289,8 @@ def build_welcome_banner(console: Console, model: str, cwd: str,
     _hero = HERMES_CADUCEUS
     left_lines = ["", _hero, ""]
     model_short = model.split("/")[-1] if "/" in model else model
+    if model_short.endswith(".gguf"):
+        model_short = model_short[:-5]
     if len(model_short) > 28:
         model_short = model_short[:25] + "..."
     ctx_str = f" [dim {dim}]·[/] [dim {dim}]{_format_context_length(context_length)} context[/]" if context_length else ""


@@ -24,11 +24,41 @@ def _normalize_custom_provider_name(value: str) -> str:
     return value.strip().lower().replace(" ", "-")

+def _auto_detect_local_model(base_url: str) -> str:
+    """Query a local server for its model name when only one model is loaded."""
+    if not base_url:
+        return ""
+    try:
+        import requests
+        url = base_url.rstrip("/")
+        if not url.endswith("/v1"):
+            url += "/v1"
+        resp = requests.get(url + "/models", timeout=5)
+        if resp.ok:
+            models = resp.json().get("data", [])
+            if len(models) == 1:
+                model_id = models[0].get("id", "")
+                if model_id:
+                    return model_id
+    except Exception:
+        pass
+    return ""
+
+
 def _get_model_config() -> Dict[str, Any]:
     config = load_config()
     model_cfg = config.get("model")
     if isinstance(model_cfg, dict):
-        return dict(model_cfg)
+        cfg = dict(model_cfg)
+        default = cfg.get("default", "").strip()
+        base_url = cfg.get("base_url", "").strip()
+        is_local = "localhost" in base_url or "127.0.0.1" in base_url
+        is_fallback = not default or default == "anthropic/claude-opus-4.6"
+        if is_local and is_fallback and base_url:
+            detected = _auto_detect_local_model(base_url)
+            if detected:
+                cfg["default"] = detected
+        return cfg
     if isinstance(model_cfg, str) and model_cfg.strip():
         return {"default": model_cfg.strip()}
     return {}


@@ -969,6 +969,18 @@ class AIAgent:
         compression_threshold = float(_compression_cfg.get("threshold", 0.50))
         compression_enabled = str(_compression_cfg.get("enabled", True)).lower() in ("true", "1", "yes")
         compression_summary_model = _compression_cfg.get("summary_model") or None
+        # Read explicit context_length override from model config
+        _model_cfg = _agent_cfg.get("model", {})
+        if isinstance(_model_cfg, dict):
+            _config_context_length = _model_cfg.get("context_length")
+        else:
+            _config_context_length = None
+        if _config_context_length is not None:
+            try:
+                _config_context_length = int(_config_context_length)
+            except (TypeError, ValueError):
+                _config_context_length = None
+
         self.context_compressor = ContextCompressor(
             model=self.model,
@@ -980,6 +992,7 @@
             quiet_mode=self.quiet_mode,
             base_url=self.base_url,
             api_key=getattr(self, "api_key", ""),
+            config_context_length=_config_context_length,
         )
         self.compression_enabled = compression_enabled
         self._user_turn_count = 0


@@ -218,6 +218,79 @@ class TestGetModelContextLength:
         assert result == CONTEXT_PROBE_TIERS[0]

+    @patch("agent.model_metadata.fetch_model_metadata")
+    @patch("agent.model_metadata.fetch_endpoint_model_metadata")
+    def test_custom_endpoint_single_model_fallback(self, mock_endpoint_fetch, mock_fetch):
+        """Single-model servers: use the only model even if name doesn't match."""
+        mock_fetch.return_value = {}
+        mock_endpoint_fetch.return_value = {
+            "Qwen3.5-9B-Q4_K_M.gguf": {"context_length": 131072}
+        }
+        result = get_model_context_length(
+            "qwen3.5:9b",
+            base_url="http://myserver.example.com:8080/v1",
+            api_key="test-key",
+        )
+        assert result == 131072
+
+    @patch("agent.model_metadata.fetch_model_metadata")
+    @patch("agent.model_metadata.fetch_endpoint_model_metadata")
+    def test_custom_endpoint_fuzzy_substring_match(self, mock_endpoint_fetch, mock_fetch):
+        """Fuzzy match: configured model name is substring of endpoint model."""
+        mock_fetch.return_value = {}
+        mock_endpoint_fetch.return_value = {
+            "org/llama-3.3-70b-instruct-fp8": {"context_length": 131072},
+            "org/qwen-2.5-72b": {"context_length": 32768},
+        }
+        result = get_model_context_length(
+            "llama-3.3-70b-instruct",
+            base_url="http://myserver.example.com:8080/v1",
+            api_key="test-key",
+        )
+        assert result == 131072
+
+    @patch("agent.model_metadata.fetch_model_metadata")
+    def test_config_context_length_overrides_all(self, mock_fetch):
+        """Explicit config_context_length takes priority over everything."""
+        mock_fetch.return_value = {
+            "test/model": {"context_length": 200000}
+        }
+        result = get_model_context_length(
+            "test/model",
+            config_context_length=65536,
+        )
+        assert result == 65536
+
+    @patch("agent.model_metadata.fetch_model_metadata")
+    def test_config_context_length_zero_is_ignored(self, mock_fetch):
+        """config_context_length=0 should be treated as unset."""
+        mock_fetch.return_value = {}
+        result = get_model_context_length(
+            "anthropic/claude-sonnet-4",
+            config_context_length=0,
+        )
+        assert result == 200000
+
+    @patch("agent.model_metadata.fetch_model_metadata")
+    def test_config_context_length_none_is_ignored(self, mock_fetch):
+        """config_context_length=None should be treated as unset."""
+        mock_fetch.return_value = {}
+        result = get_model_context_length(
+            "anthropic/claude-sonnet-4",
+            config_context_length=None,
+        )
+        assert result == 200000
+

 # =========================================================================
 # fetch_model_metadata — caching, TTL, slugs, failures


@@ -414,6 +414,29 @@ LLM_MODEL=meta-llama/Llama-3.1-70B-Instruct-Turbo

 ---

+### Context Length Detection
+
+Hermes automatically detects your model's context length by querying the endpoint's `/v1/models` response. For most setups this works out of the box. If detection fails (the model name doesn't match, the endpoint doesn't expose `/v1/models`, etc.), Hermes falls back to a high default and probes downward on context-length errors.
+
+To set the context length explicitly, add `context_length` to your model config:
+
+```yaml
+model:
+  default: "qwen3.5:9b"
+  base_url: "http://localhost:8080/v1"
+  context_length: 131072  # tokens
+```
+
+This takes highest priority — it overrides auto-detection, cached values, and hardcoded defaults.
+
+:::tip When to set this manually
+- Your model shows "2M context" in the status bar (detection failed)
+- You want to limit context below the model's maximum (e.g., 8k on a 128k model to save VRAM)
+- You're running behind a proxy that doesn't expose `/v1/models`
+:::
+
+---
+
 ### Choosing the Right Setup

 | Use Case | Recommended |