feat: call_llm/async_call_llm + config slots + migrate all consumers

Add centralized call_llm() and async_call_llm() functions that own the
full LLM request lifecycle:
  1. Resolve provider + model from task config or explicit args
  2. Get or create a cached client for that provider
  3. Format request args (max_tokens handling, provider extra_body)
  4. Make the API call with max_tokens/max_completion_tokens retry
  5. Return the response

Config: expanded auxiliary section with provider:model slots for all
tasks (compression, vision, web_extract, session_search, skills_hub,
mcp, flush_memories). Config version bumped to 7.

Migrated all auxiliary consumers:
- context_compressor.py: uses call_llm(task='compression')
- vision_tools.py: uses async_call_llm(task='vision')
- web_tools.py: uses async_call_llm(task='web_extract')
- session_search_tool.py: uses async_call_llm(task='session_search')
- browser_tool.py: uses call_llm(task='vision'/'web_extract')
- mcp_tool.py: uses call_llm(task='mcp')
- skills_guard.py: uses call_llm(provider='openrouter')
- run_agent.py flush_memories: uses call_llm(task='flush_memories')

Tests updated for context_compressor and MCP tool. Some test mocks
still need updating (15 remaining failures from mock pattern changes,
2 pre-existing).
This commit is contained in:
teknium1 2026-03-11 20:52:19 -07:00
parent 013cc4d2fc
commit 0aa31cd3cb
13 changed files with 552 additions and 375 deletions

View file

@ -456,17 +456,13 @@ class SamplingHandler:
# Resolve model
model = self._resolve_model(getattr(params, "modelPreferences", None))
# Get auxiliary LLM client
from agent.auxiliary_client import get_text_auxiliary_client
client, default_model = get_text_auxiliary_client()
if client is None:
self.metrics["errors"] += 1
return self._error("No LLM provider available for sampling")
# Get auxiliary LLM client via centralized router
from agent.auxiliary_client import call_llm
resolved_model = model or default_model
# Model whitelist check (we need to resolve model before calling)
resolved_model = model or self.model_override or ""
# Model whitelist check
if self.allowed_models and resolved_model not in self.allowed_models:
if self.allowed_models and resolved_model and resolved_model not in self.allowed_models:
logger.warning(
"MCP server '%s' requested model '%s' not in allowed_models",
self.server_name, resolved_model,
@ -484,20 +480,15 @@ class SamplingHandler:
# Build LLM call kwargs
max_tokens = min(params.maxTokens, self.max_tokens_cap)
call_kwargs: dict = {
"model": resolved_model,
"messages": messages,
"max_tokens": max_tokens,
}
call_temperature = None
if hasattr(params, "temperature") and params.temperature is not None:
call_kwargs["temperature"] = params.temperature
if stop := getattr(params, "stopSequences", None):
call_kwargs["stop"] = stop
call_temperature = params.temperature
# Forward server-provided tools
call_tools = None
server_tools = getattr(params, "tools", None)
if server_tools:
call_kwargs["tools"] = [
call_tools = [
{
"type": "function",
"function": {
@ -508,9 +499,6 @@ class SamplingHandler:
}
for t in server_tools
]
if tool_choice := getattr(params, "toolChoice", None):
mode = getattr(tool_choice, "mode", "auto")
call_kwargs["tool_choice"] = {"auto": "auto", "required": "required", "none": "none"}.get(mode, "auto")
logger.log(
self.audit_level,
@ -520,7 +508,15 @@ class SamplingHandler:
# Offload sync LLM call to thread (non-blocking)
def _sync_call():
return client.chat.completions.create(**call_kwargs)
return call_llm(
task="mcp",
model=resolved_model or None,
messages=messages,
temperature=call_temperature,
max_tokens=max_tokens,
tools=call_tools,
timeout=self.timeout,
)
try:
response = await asyncio.wait_for(