fix: 6 bugs in model metadata, reasoning detection, and delegate tool

Cherry-picked from PR #2169 by @0xbyt4.

1. _strip_provider_prefix: leave Ollama model:tag names (e.g. qwen:0.5b) unstripped when the suffix looks like a tag
2. Fuzzy match: remove the reverse substring check (`model in default_model`)
   that made claude-sonnet-4 match claude-sonnet-4-6 and resolve to 1M
   instead of 200K
3. _has_content_after_think_block: reuse _strip_think_blocks() to
   handle all tag variants (thinking, reasoning, REASONING_SCRATCHPAD)
4. models.dev lookup: elif→if so nous provider also queries models.dev
5. Disk cache fallback: use 5-min TTL instead of full hour so network
   is retried soon
6. Delegate build: wrap child construction in try/finally so
   _last_resolved_tool_names is always restored on exception
This commit is contained in:
Test 2026-03-20 08:52:37 -07:00
parent 2ea4dd30c6
commit 55ce601502
4 changed files with 50 additions and 30 deletions

View file

@ -34,17 +34,29 @@ _PROVIDER_PREFIXES: frozenset[str] = frozenset({
}) })
_OLLAMA_TAG_PATTERN = re.compile(
r"^(\d+\.?\d*b|latest|stable|q\d|fp?\d|instruct|chat|coder|vision|text)",
re.IGNORECASE,
)
def _strip_provider_prefix(model: str) -> str: def _strip_provider_prefix(model: str) -> str:
"""Strip a recognised provider prefix from a model string. """Strip a recognised provider prefix from a model string.
``"local:my-model"`` → ``"my-model"`` ``"local:my-model"`` → ``"my-model"``
``"qwen3.5:27b"`` → ``"qwen3.5:27b"`` (unchanged — not a provider prefix) ``"qwen3.5:27b"`` → ``"qwen3.5:27b"`` (unchanged — not a provider prefix)
``"qwen:0.5b"`` → ``"qwen:0.5b"`` (unchanged — Ollama model:tag)
``"deepseek:latest"`` → ``"deepseek:latest"`` (unchanged — Ollama model:tag)
""" """
if ":" not in model or model.startswith("http"): if ":" not in model or model.startswith("http"):
return model return model
prefix = model.split(":", 1)[0].strip().lower() prefix, suffix = model.split(":", 1)
if prefix in _PROVIDER_PREFIXES: prefix_lower = prefix.strip().lower()
return model.split(":", 1)[1] if prefix_lower in _PROVIDER_PREFIXES:
# Don't strip if suffix looks like an Ollama tag (e.g. "7b", "latest", "q4_0")
if _OLLAMA_TAG_PATTERN.match(suffix.strip()):
return model
return suffix
return model return model
_model_metadata_cache: Dict[str, Dict[str, Any]] = {} _model_metadata_cache: Dict[str, Dict[str, Any]] = {}
@ -800,7 +812,7 @@ def get_model_context_length(
ctx = _resolve_nous_context_length(model) ctx = _resolve_nous_context_length(model)
if ctx: if ctx:
return ctx return ctx
elif provider: if provider:
from agent.models_dev import lookup_models_dev_context from agent.models_dev import lookup_models_dev_context
ctx = lookup_models_dev_context(provider, model) ctx = lookup_models_dev_context(provider, model)
if ctx: if ctx:
@ -812,10 +824,13 @@ def get_model_context_length(
return metadata[model].get("context_length", 128000) return metadata[model].get("context_length", 128000)
# 8. Hardcoded defaults (fuzzy match — longest key first for specificity) # 8. Hardcoded defaults (fuzzy match — longest key first for specificity)
# Only check `default_model in model` (is the key a substring of the input).
# The reverse (`model in default_model`) causes shorter names like
# "claude-sonnet-4" to incorrectly match "claude-sonnet-4-6" and return 1M.
for default_model, length in sorted( for default_model, length in sorted(
DEFAULT_CONTEXT_LENGTHS.items(), key=lambda x: len(x[0]), reverse=True DEFAULT_CONTEXT_LENGTHS.items(), key=lambda x: len(x[0]), reverse=True
): ):
if default_model in model or model in default_model: if default_model in model:
return length return length
# 9. Query local server as last resort # 9. Query local server as last resort

View file

@ -107,11 +107,12 @@ def fetch_models_dev(force_refresh: bool = False) -> Dict[str, Any]:
except Exception as e: except Exception as e:
logger.debug("Failed to fetch models.dev: %s", e) logger.debug("Failed to fetch models.dev: %s", e)
# Fall back to disk cache # Fall back to disk cache — use a short TTL (5 min) so we retry
# the network fetch soon instead of serving stale data for a full hour.
if not _models_dev_cache: if not _models_dev_cache:
_models_dev_cache = _load_disk_cache() _models_dev_cache = _load_disk_cache()
if _models_dev_cache: if _models_dev_cache:
_models_dev_cache_time = time.time() _models_dev_cache_time = time.time() - _MODELS_DEV_CACHE_TTL + 300
logger.debug("Loaded models.dev from disk cache (%d providers)", len(_models_dev_cache)) logger.debug("Loaded models.dev from disk cache (%d providers)", len(_models_dev_cache))
return _models_dev_cache return _models_dev_cache

View file

@ -1142,10 +1142,11 @@ class AIAgent:
def _has_content_after_think_block(self, content: str) -> bool: def _has_content_after_think_block(self, content: str) -> bool:
""" """
Check if content has actual text after any <think></think> blocks. Check if content has actual text after any reasoning/thinking blocks.
This detects cases where the model only outputs reasoning but no actual This detects cases where the model only outputs reasoning but no actual
response, which indicates an incomplete generation that should be retried. response, which indicates an incomplete generation that should be retried.
Delegates to _strip_think_blocks(), so every tag variant handled there is covered here.
Args: Args:
content: The assistant message content to check content: The assistant message content to check
@ -1156,8 +1157,8 @@ class AIAgent:
if not content: if not content:
return False return False
# Remove all <think>...</think> blocks (including nested ones, non-greedy) # Remove all reasoning tag variants (must match _strip_think_blocks)
cleaned = re.sub(r'<think>.*?</think>', '', content, flags=re.DOTALL) cleaned = self._strip_think_blocks(content)
# Check if there's any non-whitespace content remaining # Check if there's any non-whitespace content remaining
return bool(cleaned.strip()) return bool(cleaned.strip())

View file

@ -470,22 +470,25 @@ def delegate_task(
_parent_tool_names = list(_model_tools._last_resolved_tool_names) _parent_tool_names = list(_model_tools._last_resolved_tool_names)
# Build all child agents on the main thread (thread-safe construction) # Build all child agents on the main thread (thread-safe construction)
# Wrapped in try/finally so the global is always restored even if a
# child build raises (otherwise _last_resolved_tool_names stays corrupted).
children = [] children = []
for i, t in enumerate(task_list): try:
child = _build_child_agent( for i, t in enumerate(task_list):
task_index=i, goal=t["goal"], context=t.get("context"), child = _build_child_agent(
toolsets=t.get("toolsets") or toolsets, model=creds["model"], task_index=i, goal=t["goal"], context=t.get("context"),
max_iterations=effective_max_iter, parent_agent=parent_agent, toolsets=t.get("toolsets") or toolsets, model=creds["model"],
override_provider=creds["provider"], override_base_url=creds["base_url"], max_iterations=effective_max_iter, parent_agent=parent_agent,
override_api_key=creds["api_key"], override_provider=creds["provider"], override_base_url=creds["base_url"],
override_api_mode=creds["api_mode"], override_api_key=creds["api_key"],
) override_api_mode=creds["api_mode"],
# Override with correct parent tool names (before child construction mutated global) )
child._delegate_saved_tool_names = _parent_tool_names # Override with correct parent tool names (before child construction mutated global)
children.append((i, t, child)) child._delegate_saved_tool_names = _parent_tool_names
children.append((i, t, child))
# Authoritative restore: reset global to parent's tool names after all children built finally:
_model_tools._last_resolved_tool_names = _parent_tool_names # Authoritative restore: reset global to parent's tool names after all children built
_model_tools._last_resolved_tool_names = _parent_tool_names
if n_tasks == 1: if n_tasks == 1:
# Single task -- run directly (no thread pool overhead) # Single task -- run directly (no thread pool overhead)