fix: preserve Ollama model:tag colons in context length detection (#2149)
The colon-split logic in get_model_context_length() and _query_local_context_length() assumed any colon meant provider:model format (e.g. "local:my-model"). But Ollama uses model:tag format (e.g. "qwen3.5:27b"), so the split turned "qwen3.5:27b" into just "27b" — which matches nothing, causing a fallback to the 2M token probe tier. Now only recognised provider prefixes (local, openrouter, anthropic, etc.) are stripped. Ollama model:tag names pass through intact. Co-authored-by: kshitijk4poor <82637225+kshitijk4poor@users.noreply.github.com>
This commit is contained in:
parent
3a9a1bbb84
commit
471ea81a7d
2 changed files with 77 additions and 7 deletions
|
|
@ -19,6 +19,34 @@ from hermes_constants import OPENROUTER_MODELS_URL
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
# Provider names that can appear as a "provider:" prefix before a model ID.
|
||||||
|
# Only these are stripped — Ollama-style "model:tag" colons (e.g. "qwen3.5:27b")
|
||||||
|
# are preserved so the full model name reaches cache lookups and server queries.
|
||||||
|
_PROVIDER_PREFIXES: frozenset[str] = frozenset({
|
||||||
|
"openrouter", "nous", "openai-codex", "copilot", "copilot-acp",
|
||||||
|
"zai", "kimi-coding", "minimax", "minimax-cn", "anthropic", "deepseek",
|
||||||
|
"opencode-zen", "opencode-go", "ai-gateway", "kilocode", "alibaba",
|
||||||
|
"custom", "local",
|
||||||
|
# Common aliases
|
||||||
|
"glm", "z-ai", "z.ai", "zhipu", "github", "github-copilot",
|
||||||
|
"github-models", "kimi", "moonshot", "claude", "deep-seek",
|
||||||
|
"opencode", "zen", "go", "vercel", "kilo", "dashscope", "aliyun", "qwen",
|
||||||
|
})
|
||||||
|
|
||||||
|
|
||||||
|
def _strip_provider_prefix(model: str) -> str:
|
||||||
|
"""Strip a recognised provider prefix from a model string.
|
||||||
|
|
||||||
|
``"local:my-model"`` → ``"my-model"``
|
||||||
|
``"qwen3.5:27b"`` → ``"qwen3.5:27b"`` (unchanged — not a provider prefix)
|
||||||
|
"""
|
||||||
|
if ":" not in model or model.startswith("http"):
|
||||||
|
return model
|
||||||
|
prefix = model.split(":", 1)[0].strip().lower()
|
||||||
|
if prefix in _PROVIDER_PREFIXES:
|
||||||
|
return model.split(":", 1)[1]
|
||||||
|
return model
|
||||||
|
|
||||||
_model_metadata_cache: Dict[str, Dict[str, Any]] = {}
|
_model_metadata_cache: Dict[str, Dict[str, Any]] = {}
|
||||||
_model_metadata_cache_time: float = 0
|
_model_metadata_cache_time: float = 0
|
||||||
_MODEL_CACHE_TTL = 3600
|
_MODEL_CACHE_TTL = 3600
|
||||||
|
|
@ -579,10 +607,9 @@ def _query_local_context_length(model: str, base_url: str) -> Optional[int]:
|
||||||
"""Query a local server for the model's context length."""
|
"""Query a local server for the model's context length."""
|
||||||
import httpx
|
import httpx
|
||||||
|
|
||||||
# Strip provider prefix (e.g., "local:model-name" → "model-name").
|
# Strip recognised provider prefix (e.g., "local:model-name" → "model-name").
|
||||||
# LM Studio and Ollama don't use provider prefixes in their model IDs.
|
# Ollama "model:tag" colons (e.g. "qwen3.5:27b") are intentionally preserved.
|
||||||
if ":" in model and not model.startswith("http"):
|
model = _strip_provider_prefix(model)
|
||||||
model = model.split(":", 1)[1]
|
|
||||||
|
|
||||||
# Strip /v1 suffix to get the server root
|
# Strip /v1 suffix to get the server root
|
||||||
server_url = base_url.rstrip("/")
|
server_url = base_url.rstrip("/")
|
||||||
|
|
@ -689,9 +716,8 @@ def get_model_context_length(
|
||||||
|
|
||||||
# Normalise provider-prefixed model names (e.g. "local:model-name" →
|
# Normalise provider-prefixed model names (e.g. "local:model-name" →
|
||||||
# "model-name") so cache lookups and server queries use the bare ID that
|
# "model-name") so cache lookups and server queries use the bare ID that
|
||||||
# local servers actually know about.
|
# local servers actually know about. Ollama "model:tag" colons are preserved.
|
||||||
if ":" in model and not model.startswith("http"):
|
model = _strip_provider_prefix(model)
|
||||||
model = model.split(":", 1)[1]
|
|
||||||
|
|
||||||
# 1. Check persistent cache (model+provider)
|
# 1. Check persistent cache (model+provider)
|
||||||
if base_url:
|
if base_url:
|
||||||
|
|
|
||||||
|
|
@ -22,6 +22,7 @@ from unittest.mock import patch, MagicMock
|
||||||
from agent.model_metadata import (
|
from agent.model_metadata import (
|
||||||
CONTEXT_PROBE_TIERS,
|
CONTEXT_PROBE_TIERS,
|
||||||
DEFAULT_CONTEXT_LENGTHS,
|
DEFAULT_CONTEXT_LENGTHS,
|
||||||
|
_strip_provider_prefix,
|
||||||
estimate_tokens_rough,
|
estimate_tokens_rough,
|
||||||
estimate_messages_tokens_rough,
|
estimate_messages_tokens_rough,
|
||||||
get_model_context_length,
|
get_model_context_length,
|
||||||
|
|
@ -292,6 +293,49 @@ class TestGetModelContextLength:
|
||||||
assert result == 200000
|
assert result == 200000
|
||||||
|
|
||||||
|
|
||||||
|
# =========================================================================
# _strip_provider_prefix — Ollama model:tag vs provider:model
# =========================================================================


class TestStripProviderPrefix:
    def test_known_provider_prefix_is_stripped(self):
        """A recognised "provider:" prefix is removed, leaving the bare ID."""
        cases = [
            ("local:my-model", "my-model"),
            ("openrouter:anthropic/claude-sonnet-4", "anthropic/claude-sonnet-4"),
            ("anthropic:claude-sonnet-4", "claude-sonnet-4"),
        ]
        for given, expected in cases:
            assert _strip_provider_prefix(given) == expected

    def test_ollama_model_tag_preserved(self):
        """Ollama model:tag format must NOT be stripped."""
        for name in (
            "qwen3.5:27b",
            "llama3.3:70b",
            "gemma2:9b",
            "codellama:13b-instruct-q4_0",
        ):
            assert _strip_provider_prefix(name) == name

    def test_http_urls_preserved(self):
        """URL-style strings keep their scheme colon untouched."""
        for url in ("http://example.com", "https://example.com"):
            assert _strip_provider_prefix(url) == url

    def test_no_colon_returns_unchanged(self):
        """Colon-free model IDs come back exactly as given."""
        assert _strip_provider_prefix("gpt-4o") == "gpt-4o"
        assert _strip_provider_prefix("anthropic/claude-sonnet-4") == "anthropic/claude-sonnet-4"

    @patch("agent.model_metadata.fetch_model_metadata")
    def test_ollama_model_tag_not_mangled_in_context_lookup(self, mock_fetch):
        """Ensure 'qwen3.5:27b' is NOT reduced to '27b' during context length lookup.

        We mock a custom endpoint that knows 'qwen3.5:27b' — the full name
        must reach the endpoint metadata lookup intact.
        """
        mock_fetch.return_value = {}
        with patch("agent.model_metadata.fetch_endpoint_model_metadata") as mock_ep:
            with patch("agent.model_metadata._is_custom_endpoint", return_value=True):
                mock_ep.return_value = {"qwen3.5:27b": {"context_length": 32768}}
                context = get_model_context_length(
                    "qwen3.5:27b",
                    base_url="http://localhost:11434/v1",
                )
        assert context == 32768
|
||||||
|
|
||||||
|
|
||||||
# =========================================================================
|
# =========================================================================
|
||||||
# fetch_model_metadata — caching, TTL, slugs, failures
|
# fetch_model_metadata — caching, TTL, slugs, failures
|
||||||
# =========================================================================
|
# =========================================================================
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue