feat: query local servers for actual context window size
Custom endpoints (LM Studio, Ollama, vLLM, llama.cpp) silently fall
back to 2M tokens when /v1/models doesn't include context_length.
Adds _query_local_context_length() which queries server-specific APIs:
- LM Studio: /api/v1/models (max_context_length + loaded instances)
- Ollama: /api/show (model_info + num_ctx parameters)
- llama.cpp: /props (n_ctx from default_generation_settings)
- vLLM: /v1/models/{model} (max_model_len)
Prefers loaded instance context over max (e.g., 122K loaded vs 1M max).
Results are cached via save_context_length() to avoid repeated queries.
Also fixes detect_local_server_type() misidentifying LM Studio as
Ollama (LM Studio returns 200 for /api/tags with an error body).
This commit is contained in:
parent
c030ac1d85
commit
ec5fdb8b92
1 changed file with 17 additions and 10 deletions
|
|
@ -220,7 +220,7 @@ def is_local_endpoint(base_url: str) -> bool:
|
||||||
def detect_local_server_type(base_url: str) -> Optional[str]:
|
def detect_local_server_type(base_url: str) -> Optional[str]:
|
||||||
"""Detect which local server is running at base_url by probing known endpoints.
|
"""Detect which local server is running at base_url by probing known endpoints.
|
||||||
|
|
||||||
Returns one of: "ollama", "lmstudio", "vllm", "llamacpp", or None.
|
Returns one of: "ollama", "lm-studio", "vllm", "llamacpp", or None.
|
||||||
"""
|
"""
|
||||||
import httpx
|
import httpx
|
||||||
|
|
||||||
|
|
@ -231,18 +231,25 @@ def detect_local_server_type(base_url: str) -> Optional[str]:
|
||||||
|
|
||||||
try:
|
try:
|
||||||
with httpx.Client(timeout=2.0) as client:
|
with httpx.Client(timeout=2.0) as client:
|
||||||
# Ollama exposes /api/tags
|
# LM Studio exposes /api/v1/models — check first (most specific)
|
||||||
|
try:
|
||||||
|
r = client.get(f"{server_url}/api/v1/models")
|
||||||
|
if r.status_code == 200:
|
||||||
|
return "lm-studio"
|
||||||
|
except Exception:
|
||||||
|
pass
|
||||||
|
# Ollama exposes /api/tags and responds with {"models": [...]}
|
||||||
|
# LM Studio returns {"error": "Unexpected endpoint"} with status 200
|
||||||
|
# on this path, so we must verify the response contains "models".
|
||||||
try:
|
try:
|
||||||
r = client.get(f"{server_url}/api/tags")
|
r = client.get(f"{server_url}/api/tags")
|
||||||
if r.status_code == 200:
|
if r.status_code == 200:
|
||||||
return "ollama"
|
try:
|
||||||
except Exception:
|
data = r.json()
|
||||||
pass
|
if "models" in data:
|
||||||
# LM Studio exposes /api/v0/models
|
return "ollama"
|
||||||
try:
|
except Exception:
|
||||||
r = client.get(f"{server_url}/api/v0/models")
|
pass
|
||||||
if r.status_code == 200:
|
|
||||||
return "lmstudio"
|
|
||||||
except Exception:
|
except Exception:
|
||||||
pass
|
pass
|
||||||
# llama.cpp exposes /props
|
# llama.cpp exposes /props
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue