fix: improve clawhub skill search matching
This commit is contained in:
parent
df9020dfa3
commit
8ccd14a0d4
2 changed files with 177 additions and 31 deletions
|
|
@ -3,7 +3,7 @@
|
||||||
import unittest
|
import unittest
|
||||||
from unittest.mock import patch
|
from unittest.mock import patch
|
||||||
|
|
||||||
from tools.skills_hub import ClawHubSource
|
from tools.skills_hub import ClawHubSource, SkillMeta
|
||||||
|
|
||||||
|
|
||||||
class _MockResponse:
|
class _MockResponse:
|
||||||
|
|
@ -22,8 +22,11 @@ class TestClawHubSource(unittest.TestCase):
|
||||||
|
|
||||||
@patch("tools.skills_hub._write_index_cache")
|
@patch("tools.skills_hub._write_index_cache")
|
||||||
@patch("tools.skills_hub._read_index_cache", return_value=None)
|
@patch("tools.skills_hub._read_index_cache", return_value=None)
|
||||||
|
@patch.object(ClawHubSource, "_load_catalog_index", return_value=[])
|
||||||
@patch("tools.skills_hub.httpx.get")
|
@patch("tools.skills_hub.httpx.get")
|
||||||
def test_search_uses_new_endpoint_and_parses_items(self, mock_get, _mock_read_cache, _mock_write_cache):
|
def test_search_uses_listing_endpoint_as_fallback(
|
||||||
|
self, mock_get, _mock_load_catalog, _mock_read_cache, _mock_write_cache
|
||||||
|
):
|
||||||
def side_effect(url, *args, **kwargs):
|
def side_effect(url, *args, **kwargs):
|
||||||
if url.endswith("/skills"):
|
if url.endswith("/skills"):
|
||||||
return _MockResponse(
|
return _MockResponse(
|
||||||
|
|
@ -52,16 +55,21 @@ class TestClawHubSource(unittest.TestCase):
|
||||||
self.assertEqual(results[0].name, "CalDAV Calendar")
|
self.assertEqual(results[0].name, "CalDAV Calendar")
|
||||||
self.assertEqual(results[0].description, "Calendar integration")
|
self.assertEqual(results[0].description, "Calendar integration")
|
||||||
|
|
||||||
first_call = mock_get.call_args_list[0]
|
self.assertGreaterEqual(mock_get.call_count, 2)
|
||||||
args, kwargs = first_call
|
args, kwargs = mock_get.call_args_list[0]
|
||||||
self.assertTrue(args[0].endswith("/skills"))
|
self.assertTrue(args[0].endswith("/skills"))
|
||||||
self.assertEqual(kwargs["params"], {"search": "caldav", "limit": 5})
|
self.assertEqual(kwargs["params"], {"search": "caldav", "limit": 5})
|
||||||
|
|
||||||
@patch("tools.skills_hub._write_index_cache")
|
@patch("tools.skills_hub._write_index_cache")
|
||||||
@patch("tools.skills_hub._read_index_cache", return_value=None)
|
@patch("tools.skills_hub._read_index_cache", return_value=None)
|
||||||
|
@patch.object(
|
||||||
|
ClawHubSource,
|
||||||
|
"_load_catalog_index",
|
||||||
|
return_value=[],
|
||||||
|
)
|
||||||
@patch("tools.skills_hub.httpx.get")
|
@patch("tools.skills_hub.httpx.get")
|
||||||
def test_search_falls_back_to_exact_slug_when_search_results_are_irrelevant(
|
def test_search_falls_back_to_exact_slug_when_search_results_are_irrelevant(
|
||||||
self, mock_get, _mock_read_cache, _mock_write_cache
|
self, mock_get, _mock_load_catalog, _mock_read_cache, _mock_write_cache
|
||||||
):
|
):
|
||||||
def side_effect(url, *args, **kwargs):
|
def side_effect(url, *args, **kwargs):
|
||||||
if url.endswith("/skills"):
|
if url.endswith("/skills"):
|
||||||
|
|
@ -102,23 +110,7 @@ class TestClawHubSource(unittest.TestCase):
|
||||||
self.assertIn("continuous improvement", results[0].description)
|
self.assertIn("continuous improvement", results[0].description)
|
||||||
|
|
||||||
@patch("tools.skills_hub.httpx.get")
|
@patch("tools.skills_hub.httpx.get")
|
||||||
@patch(
|
def test_search_repairs_poisoned_cache_with_exact_slug_lookup(self, mock_get):
|
||||||
"tools.skills_hub._read_index_cache",
|
|
||||||
return_value=[
|
|
||||||
{
|
|
||||||
"name": "Apple Music DJ",
|
|
||||||
"description": "Unrelated cached result",
|
|
||||||
"source": "clawhub",
|
|
||||||
"identifier": "apple-music-dj",
|
|
||||||
"trust_level": "community",
|
|
||||||
"repo": None,
|
|
||||||
"path": None,
|
|
||||||
"tags": [],
|
|
||||||
"extra": {},
|
|
||||||
}
|
|
||||||
],
|
|
||||||
)
|
|
||||||
def test_search_repairs_poisoned_cache_with_exact_slug_lookup(self, _mock_read_cache, mock_get):
|
|
||||||
mock_get.return_value = _MockResponse(
|
mock_get.return_value = _MockResponse(
|
||||||
status_code=200,
|
status_code=200,
|
||||||
json_data={
|
json_data={
|
||||||
|
|
@ -132,13 +124,43 @@ class TestClawHubSource(unittest.TestCase):
|
||||||
},
|
},
|
||||||
)
|
)
|
||||||
|
|
||||||
results = self.src.search("self-improving-agent", limit=5)
|
poisoned = [
|
||||||
|
SkillMeta(
|
||||||
|
name="Apple Music DJ",
|
||||||
|
description="Unrelated cached result",
|
||||||
|
source="clawhub",
|
||||||
|
identifier="apple-music-dj",
|
||||||
|
trust_level="community",
|
||||||
|
tags=[],
|
||||||
|
)
|
||||||
|
]
|
||||||
|
results = self.src._finalize_search_results("self-improving-agent", poisoned, 5)
|
||||||
|
|
||||||
self.assertEqual(len(results), 1)
|
self.assertEqual(len(results), 1)
|
||||||
self.assertEqual(results[0].identifier, "self-improving-agent")
|
self.assertEqual(results[0].identifier, "self-improving-agent")
|
||||||
mock_get.assert_called_once()
|
mock_get.assert_called_once()
|
||||||
self.assertTrue(mock_get.call_args.args[0].endswith("/skills/self-improving-agent"))
|
self.assertTrue(mock_get.call_args.args[0].endswith("/skills/self-improving-agent"))
|
||||||
|
|
||||||
|
@patch.object(
    ClawHubSource,
    "_exact_slug_meta",
    return_value=SkillMeta(
        name="self-improving-agent",
        description="Captures learnings and errors for continuous improvement.",
        source="clawhub",
        identifier="self-improving-agent",
        trust_level="community",
        tags=["automation"],
    ),
)
def test_search_matches_space_separated_query_to_hyphenated_slug(
    self, _mock_exact_slug
):
    """A space-separated query yields the single skill from the slug lookup.

    ``_exact_slug_meta`` is patched to return a fixed ``SkillMeta``; the
    test checks that ``search`` returns exactly that one entry for the
    two-word query "self improving".
    """
    found_identifiers = [
        hit.identifier for hit in self.src.search("self improving", limit=5)
    ]

    # Exactly one hit, and it is the hyphenated slug supplied by the patch.
    self.assertEqual(found_identifiers, ["self-improving-agent"])
|
||||||
|
|
||||||
@patch("tools.skills_hub.httpx.get")
|
@patch("tools.skills_hub.httpx.get")
|
||||||
def test_inspect_maps_display_name_and_summary(self, mock_get):
|
def test_inspect_maps_display_name_and_summary(self, mock_get):
|
||||||
mock_get.return_value = _MockResponse(
|
mock_get.return_value = _MockResponse(
|
||||||
|
|
|
||||||
|
|
@ -1190,12 +1190,29 @@ class ClawHubSource(SkillSource):
|
||||||
identifier = (meta.identifier or "").lower()
|
identifier = (meta.identifier or "").lower()
|
||||||
name = (meta.name or "").lower()
|
name = (meta.name or "").lower()
|
||||||
description = (meta.description or "").lower()
|
description = (meta.description or "").lower()
|
||||||
|
normalized_identifier = " ".join(cls._query_terms(identifier))
|
||||||
|
normalized_name = " ".join(cls._query_terms(name))
|
||||||
|
query_terms = cls._query_terms(query_norm)
|
||||||
|
identifier_terms = cls._query_terms(identifier)
|
||||||
|
name_terms = cls._query_terms(name)
|
||||||
score = 0
|
score = 0
|
||||||
|
|
||||||
if query_norm == identifier:
|
if query_norm == identifier:
|
||||||
score += 100
|
score += 140
|
||||||
if query_norm == name:
|
if query_norm == name:
|
||||||
|
score += 130
|
||||||
|
if normalized_identifier == query_norm:
|
||||||
|
score += 125
|
||||||
|
if normalized_name == query_norm:
|
||||||
|
score += 120
|
||||||
|
if normalized_identifier.startswith(query_norm):
|
||||||
score += 95
|
score += 95
|
||||||
|
if normalized_name.startswith(query_norm):
|
||||||
|
score += 90
|
||||||
|
if query_terms and identifier_terms[: len(query_terms)] == query_terms:
|
||||||
|
score += 70
|
||||||
|
if query_terms and name_terms[: len(query_terms)] == query_terms:
|
||||||
|
score += 65
|
||||||
if query_norm in identifier:
|
if query_norm in identifier:
|
||||||
score += 40
|
score += 40
|
||||||
if query_norm in name:
|
if query_norm in name:
|
||||||
|
|
@ -1203,10 +1220,10 @@ class ClawHubSource(SkillSource):
|
||||||
if query_norm in description:
|
if query_norm in description:
|
||||||
score += 10
|
score += 10
|
||||||
|
|
||||||
for term in cls._query_terms(query_norm):
|
for term in query_terms:
|
||||||
if term in identifier:
|
if term in identifier_terms:
|
||||||
score += 15
|
score += 15
|
||||||
if term in name:
|
if term in name_terms:
|
||||||
score += 12
|
score += 12
|
||||||
if term in description:
|
if term in description:
|
||||||
score += 3
|
score += 3
|
||||||
|
|
@ -1227,9 +1244,36 @@ class ClawHubSource(SkillSource):
|
||||||
|
|
||||||
def _exact_slug_meta(self, query: str) -> Optional[SkillMeta]:
    """Resolve ``query`` to one skill by probing likely slugs via ``inspect``.

    Candidate slugs are tried in this order, and the first one for which
    ``self.inspect`` returns a truthy value wins:

    1. The last ``/``-separated segment of the stripped query, if it is
       already a well-formed slug.
    2. For queries with two or more terms: the hyphen-joined terms with
       each of the suffixes ``-agent``, ``-skill``, ``-tool``,
       ``-assistant`` and ``-playbook``.
    3. The hyphen-joined terms on their own.

    Returns ``None`` when no candidate resolves.
    """
    raw_slug = query.strip().split("/")[-1]
    terms = self._query_terms(query)

    guesses: List[str] = []
    if raw_slug and re.fullmatch(r"[A-Za-z0-9][A-Za-z0-9._-]*", raw_slug):
        guesses.append(raw_slug)

    if terms:
        joined = "-".join(terms)
        if len(terms) >= 2:
            # Multi-word queries: try the suffixed variants before the
            # bare joined slug.
            guesses += [
                f"{joined}-{suffix}"
                for suffix in ("agent", "skill", "tool", "assistant", "playbook")
            ]
        guesses.append(joined)

    # dict.fromkeys drops duplicates while keeping first-seen order, so each
    # slug is inspected at most once.
    for guess in dict.fromkeys(guesses):
        found = self.inspect(guess)
        if found:
            return found

    return None
|
||||||
|
|
||||||
def _finalize_search_results(self, query: str, results: List[SkillMeta], limit: int) -> List[SkillMeta]:
|
def _finalize_search_results(self, query: str, results: List[SkillMeta], limit: int) -> List[SkillMeta]:
|
||||||
query_norm = query.strip()
|
query_norm = query.strip()
|
||||||
|
|
@ -1260,7 +1304,21 @@ class ClawHubSource(SkillSource):
|
||||||
return self._dedupe_results(results)[:limit]
|
return self._dedupe_results(results)[:limit]
|
||||||
|
|
||||||
def search(self, query: str, limit: int = 10) -> List[SkillMeta]:
|
def search(self, query: str, limit: int = 10) -> List[SkillMeta]:
|
||||||
cache_key = f"clawhub_search_{hashlib.md5(query.encode()).hexdigest()}"
|
query = query.strip()
|
||||||
|
|
||||||
|
if query:
|
||||||
|
query_terms = self._query_terms(query)
|
||||||
|
if len(query_terms) >= 2:
|
||||||
|
direct = self._exact_slug_meta(query)
|
||||||
|
if direct:
|
||||||
|
return [direct]
|
||||||
|
|
||||||
|
results = self._search_catalog(query, limit=limit)
|
||||||
|
if results:
|
||||||
|
return results
|
||||||
|
|
||||||
|
# Empty query or catalog fallback failure: use the lightweight listing API.
|
||||||
|
cache_key = f"clawhub_search_listing_v1_{hashlib.md5(query.encode()).hexdigest()}_{limit}"
|
||||||
cached = _read_index_cache(cache_key)
|
cached = _read_index_cache(cache_key)
|
||||||
if cached is not None:
|
if cached is not None:
|
||||||
return self._finalize_search_results(
|
return self._finalize_search_results(
|
||||||
|
|
@ -1365,6 +1423,72 @@ class ClawHubSource(SkillSource):
|
||||||
tags=tags,
|
tags=tags,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
def _search_catalog(self, query: str, limit: int = 10) -> List[SkillMeta]:
    """Search the full catalog for ``query`` and cache the ranked result.

    The result is stored in the index cache under a key derived from the
    query and the limit. On a cache hit the stored entries are turned back
    into ``SkillMeta`` objects and truncated to ``limit``. On a miss the
    catalog from ``_load_catalog_index`` is passed through
    ``_finalize_search_results``; an empty catalog returns ``[]`` without
    writing anything to the cache.
    """
    digest = hashlib.md5(f"{query}|{limit}".encode()).hexdigest()
    cache_key = f"clawhub_search_catalog_v1_{digest}"

    cached_rows = _read_index_cache(cache_key)
    if cached_rows is not None:
        hydrated = [SkillMeta(**row) for row in cached_rows]
        return hydrated[:limit]

    catalog_entries = self._load_catalog_index()
    if not catalog_entries:
        return []

    matches = self._finalize_search_results(query, catalog_entries, limit)
    serialised = [_skill_meta_to_dict(match) for match in matches]
    _write_index_cache(cache_key, serialised)
    return matches
|
||||||
|
|
||||||
|
def _load_catalog_index(self) -> List[SkillMeta]:
    """Return the ClawHub skill catalog, paging through ``/skills``.

    A cached catalog (key ``clawhub_catalog_v1``) is returned as-is when
    present. Otherwise up to ``max_pages`` pages of 200 items are fetched,
    following ``nextCursor`` until it is missing or the page is empty.
    Entries are de-duplicated by slug; entries that are not dicts or lack
    a non-empty string slug are skipped.

    Whatever was collected is always returned, but the catalog is written
    to the cache only when the crawl ended normally and produced at least
    one skill. A transport error, non-200 response or malformed payload
    therefore never persists an empty or truncated catalog that later
    calls would treat as authoritative.
    """
    cache_key = "clawhub_catalog_v1"
    cached = _read_index_cache(cache_key)
    if cached is not None:
        return [SkillMeta(**s) for s in cached]

    cursor: Optional[str] = None
    results: List[SkillMeta] = []
    seen: set[str] = set()
    max_pages = 50
    fetch_failed = False

    for _ in range(max_pages):
        params: Dict[str, Any] = {"limit": 200}
        if cursor:
            params["cursor"] = cursor

        try:
            resp = httpx.get(f"{self.BASE_URL}/skills", params=params, timeout=30)
            if resp.status_code != 200:
                fetch_failed = True
                break
            data = resp.json()
        except (httpx.HTTPError, json.JSONDecodeError):
            fetch_failed = True
            break

        items = data.get("items", []) if isinstance(data, dict) else None
        if not isinstance(items, list):
            # Malformed payload: keep what we have but do not cache it.
            fetch_failed = True
            break
        if not items:
            # An empty page is the normal end of the listing.
            break

        for item in items:
            if not isinstance(item, dict):
                # Skip junk entries instead of crashing on ``.get``.
                continue
            slug = item.get("slug")
            if not isinstance(slug, str) or not slug or slug in seen:
                continue
            seen.add(slug)
            display_name = item.get("displayName") or item.get("name") or slug
            summary = item.get("summary") or item.get("description") or ""
            tags = self._normalize_tags(item.get("tags", []))
            results.append(SkillMeta(
                name=display_name,
                description=summary,
                source="clawhub",
                identifier=slug,
                trust_level="community",
                tags=tags,
            ))

        cursor = data.get("nextCursor")
        if not isinstance(cursor, str) or not cursor:
            break

    # Only a clean, non-empty crawl is worth caching; see docstring.
    if results and not fetch_failed:
        _write_index_cache(cache_key, [_skill_meta_to_dict(s) for s in results])
    return results
|
||||||
|
|
||||||
def _get_json(self, url: str, timeout: int = 20) -> Optional[Any]:
|
def _get_json(self, url: str, timeout: int = 20) -> Optional[Any]:
|
||||||
try:
|
try:
|
||||||
resp = httpx.get(url, timeout=timeout)
|
resp = httpx.get(url, timeout=timeout)
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue