update to firecrawl

commit 4ece87efb0 (parent 96cff78335)

3 changed files with 145 additions and 43 deletions
@@ -24,7 +24,7 @@ import asyncio
 from typing import Dict, Any, List
 
 # Import toolsets
-from web_tools import web_search_tool, web_extract_tool, web_crawl_tool, check_tavily_api_key
+from web_tools import web_search_tool, web_extract_tool, web_crawl_tool, check_firecrawl_api_key
 from terminal_tool import terminal_tool, check_hecate_requirements, TERMINAL_TOOL_DESCRIPTION
 from vision_tools import vision_analyze_tool, check_vision_requirements
 from mixture_of_agents_tool import mixture_of_agents_tool, check_moa_requirements
@@ -272,7 +272,7 @@ def get_all_tool_names() -> List[str]:
     tool_names = []
 
     # Web tools
-    if check_tavily_api_key():
+    if check_firecrawl_api_key():
         tool_names.extend(["web_search", "web_extract", "web_crawl"])
 
     # Terminal tools
@@ -395,7 +395,7 @@ def get_tool_definitions(
 
     # Collect all available tools from each toolset
     toolset_tools = {
-        "web_tools": get_web_tool_definitions() if check_tavily_api_key() else [],
+        "web_tools": get_web_tool_definitions() if check_firecrawl_api_key() else [],
         "terminal_tools": get_terminal_tool_definitions() if check_hecate_requirements() else [],
        "vision_tools": get_vision_tool_definitions() if check_vision_requirements() else [],
         "moa_tools": get_moa_tool_definitions() if check_moa_requirements() else [],
@@ -687,10 +687,10 @@ def get_available_toolsets() -> Dict[str, Dict[str, Any]]:
     """
     toolsets = {
         "web_tools": {
-            "available": check_tavily_api_key(),
+            "available": check_firecrawl_api_key(),
             "tools": ["web_search_tool", "web_extract_tool", "web_crawl_tool"],
             "description": "Web search, content extraction, and website crawling tools",
-            "requirements": ["TAVILY_API_KEY environment variable"]
+            "requirements": ["FIRECRAWL_API_KEY environment variable"]
         },
         "terminal_tools": {
             "available": check_hecate_requirements(),
@@ -714,7 +714,7 @@ def get_available_toolsets() -> Dict[str, Dict[str, Any]]:
             "available": check_image_generation_requirements(),
             "tools": ["image_generate_tool"],
             "description": "Generate high-quality images from text prompts using FAL.ai's FLUX.1 Krea model with automatic 2x upscaling for enhanced quality",
-            "requirements": ["FAL_API_KEY environment variable", "fal-client package"]
+            "requirements": ["FAL_KEY environment variable", "fal-client package"]
         }
         # Future toolsets can be added here
     }
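Note on the hunk above: the fal-client package reads its credentials from FAL_KEY, not FAL_API_KEY, so the requirements string now matches what the client actually checks. A minimal sketch of a matching availability check follows; this body for check_image_generation_requirements is an assumption, the diff does not show it:

import os

def check_image_generation_requirements() -> bool:
    # Assumed implementation: fal-client reads FAL_KEY from the
    # environment, so availability is gated on that variable plus
    # the package being importable.
    try:
        import fal_client  # noqa: F401 -- presence check for the fal-client package
    except ImportError:
        return False
    return bool(os.getenv("FAL_KEY"))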
@@ -729,7 +729,7 @@ def check_toolset_requirements() -> Dict[str, bool]:
         Dict: Status of each toolset's requirements
     """
     return {
-        "web_tools": check_tavily_api_key(),
+        "web_tools": check_firecrawl_api_key(),
         "terminal_tools": check_hecate_requirements(),
         "vision_tools": check_vision_requirements(),
         "moa_tools": check_moa_requirements(),
@@ -1,3 +1,3 @@
-tavily-python
+firecrawl-py
 openai
 fal-client
web_tools.py (172 changes)
@@ -3,8 +3,8 @@
 Standalone Web Tools Module
 
 This module provides generic web tools that work with multiple backend providers.
-Currently uses Tavily as the backend, but the interface makes it easy to swap
-to other providers like Firecrawl without changing the function signatures.
+Currently uses Firecrawl as the backend, and the interface makes it easy to swap
+providers without changing the function signatures.
 
 Available tools:
 - web_search_tool: Search the web for information
@@ -12,8 +12,7 @@ Available tools:
 - web_crawl_tool: Crawl websites with specific instructions
 
 Backend compatibility:
-- Tavily: https://docs.tavily.com/
-- Firecrawl: https://docs.firecrawl.dev/features/search
+- Firecrawl: https://docs.firecrawl.dev/introduction
 
 LLM Processing:
 - Uses Nous Research API with Gemini 2.5 Flash for intelligent content extraction
@@ -49,11 +48,11 @@ import uuid
 import datetime
 from pathlib import Path
 from typing import List, Dict, Any, Optional
-from tavily import TavilyClient
+from firecrawl import FirecrawlApp, ScrapeOptions
 from openai import AsyncOpenAI
 
-# Initialize Tavily client once at module level
-tavily_client = TavilyClient(api_key=os.getenv("TAVILY_API_KEY"))
+# Initialize Firecrawl client once at module level
+firecrawl_app = FirecrawlApp(api_key=os.getenv("FIRECRAWL_API_KEY"))
 
 # Initialize Nous Research API client for LLM processing (async)
 nous_client = AsyncOpenAI(
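Review note: FirecrawlApp is constructed at import time with whatever FIRECRAWL_API_KEY happens to hold. A minimal sketch of a lazier construction that fails loudly at first use instead of at import; get_firecrawl_app is a hypothetical helper, not part of this patch:

import os
from firecrawl import FirecrawlApp

_firecrawl_app = None

def get_firecrawl_app() -> FirecrawlApp:
    # Hypothetical lazy accessor: build the client on first use so a
    # missing FIRECRAWL_API_KEY surfaces as a clear error message
    # rather than a confusing failure at import time.
    global _firecrawl_app
    if _firecrawl_app is None:
        api_key = os.getenv("FIRECRAWL_API_KEY")
        if not api_key:
            raise RuntimeError("FIRECRAWL_API_KEY environment variable is not set")
        _firecrawl_app = FirecrawlApp(api_key=api_key)
    return _firecrawl_app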
@@ -250,7 +249,7 @@ def web_search_tool(query: str, limit: int = 5) -> str:
     Search the web for information using available search API backend.
 
     This function provides a generic interface for web search that can work
-    with multiple backends. Currently uses Tavily but can be easily swapped.
+    with multiple backends. Currently uses Firecrawl.
 
     Note: Search results are already concise snippets, so no LLM processing is applied.
 
@@ -290,18 +289,36 @@ def web_search_tool(query: str, limit: int = 5) -> str:
     try:
         print(f"🔍 Searching the web for: '{query}' (limit: {limit})")
 
-        # Use Tavily's search functionality
-        response = tavily_client.search(query=query, max_results=limit, search_depth="advanced")
+        # Use Firecrawl's search functionality
+        # Firecrawl Search: search the web and get full content from results
+        # Docs: https://docs.firecrawl.dev/introduction
+        # Note: Firecrawl SDK supports search via app.search(query, limit=...)
+        response = firecrawl_app.search(query=query, limit=limit)
 
-        results_count = len(response.get('results', []))
+        # Determine results count and trim to minimal structure: { success, data: [{markdown}] }
+        results_list = []
+        success_flag = True
+        if isinstance(response, dict):
+            success_flag = bool(response.get("success", True))
+            if "data" in response and isinstance(response["data"], list):
+                results_list = response["data"]
+            elif "results" in response and isinstance(response["results"], list):
+                results_list = response["results"]
+        results_count = len(results_list)
         print(f"✅ Found {results_count} results")
 
         # Capture debug information
         debug_call_data["results_count"] = results_count
         debug_call_data["original_response_size"] = len(json.dumps(response))
 
-        result_json = json.dumps(response, indent=2)
-        # Clean base64 images from search results
+        # Build minimal response
+        minimal_data = []
+        for item in results_list:
+            if isinstance(item, dict) and ("markdown" in item):
+                minimal_data.append({"markdown": item.get("markdown", "")})
+        minimal_response = {"success": success_flag, "data": minimal_data}
+
+        result_json = json.dumps(minimal_response, indent=2)
         cleaned_result = clean_base64_images(result_json)
 
         debug_call_data["final_response_size"] = len(cleaned_result)
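Review note: the hunk above defends against two possible response shapes from app.search ({"data": [...]} versus {"results": [...]}). For reference, a minimal standalone sketch of the same normalization, under the patch's assumption that firecrawl-py returns a plain dict:

import json
import os
from firecrawl import FirecrawlApp

app = FirecrawlApp(api_key=os.getenv("FIRECRAWL_API_KEY"))

def search_markdown(query: str, limit: int = 5) -> str:
    # Sketch only: assumes search returns a dict shaped either
    # {"success": ..., "data": [...]} or {"results": [...]}.
    response = app.search(query=query, limit=limit)
    items = []
    if isinstance(response, dict):
        items = response.get("data") or response.get("results") or []
    # Keep only the markdown body of each hit, mirroring the patch's trim.
    data = [{"markdown": i.get("markdown", "")} for i in items if isinstance(i, dict)]
    return json.dumps({"success": True, "data": data}, indent=2)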
@@ -335,7 +352,7 @@ async def web_extract_tool(
     Extract content from specific web pages using available extraction API backend.
 
     This function provides a generic interface for web content extraction that
-    can work with multiple backends. Currently uses Tavily but can be easily swapped.
+    can work with multiple backends. Currently uses Firecrawl.
 
     Args:
         urls (List[str]): List of URLs to extract content from
@@ -371,8 +388,49 @@ async def web_extract_tool(
     try:
         print(f"📄 Extracting content from {len(urls)} URL(s)")
 
-        # Use Tavily's extract functionality
-        response = tavily_client.extract(urls=urls, format=format)
+        # Use Firecrawl's scrape functionality per URL and normalize to a common shape
+        results: List[Dict[str, Any]] = []
+        for url in urls:
+            try:
+                # Determine requested formats for Firecrawl
+                formats: List[str] = []
+                if format == "markdown":
+                    formats = ["markdown"]
+                elif format == "html":
+                    formats = ["html"]
+                else:
+                    # Default: request markdown for LLM-readiness and include html as backup
+                    formats = ["markdown", "html"]
+
+                scrape_result = firecrawl_app.scrape_url(url, formats=formats)
+
+                # Firecrawl returns {success, data: {markdown?, html?, metadata}}
+                data = scrape_result.get("data", {}) if isinstance(scrape_result, dict) else {}
+                metadata = data.get("metadata", {})
+                title = metadata.get("title", "")
+                content_markdown = data.get("markdown")
+                content_html = data.get("html")
+
+                # Choose content based on requested format
+                chosen_content = content_markdown if (format == "markdown" or (format is None and content_markdown)) else content_html or content_markdown or ""
+
+                results.append({
+                    "url": metadata.get("sourceURL", url),
+                    "title": title,
+                    "content": chosen_content,
+                    "raw_content": chosen_content,
+                    "metadata": metadata
+                })
+            except Exception as scrape_err:
+                results.append({
+                    "url": url,
+                    "title": "",
+                    "content": "",
+                    "raw_content": "",
+                    "error": str(scrape_err)
+                })
+
+        response = {"results": results}
 
         pages_extracted = len(response.get('results', []))
         print(f"✅ Extracted content from {pages_extracted} pages")
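Review note: the hunk above replaces Tavily's batch extract with one Firecrawl scrape per URL, normalized back to the old {"results": [...]} shape. A standalone sketch of the same pattern, under the patch's assumption that scrape_url returns {"success": ..., "data": {"markdown", "html", "metadata"}}:

import os
from typing import Any, Dict, List
from firecrawl import FirecrawlApp

app = FirecrawlApp(api_key=os.getenv("FIRECRAWL_API_KEY"))

def extract_pages(urls: List[str]) -> Dict[str, Any]:
    # Sketch only: per-URL scrape with per-URL error capture, so one
    # failing page does not abort the whole extraction.
    results = []
    for url in urls:
        try:
            raw = app.scrape_url(url, formats=["markdown", "html"])
            data = raw.get("data", {}) if isinstance(raw, dict) else {}
            meta = data.get("metadata", {})
            content = data.get("markdown") or data.get("html") or ""
            results.append({"url": meta.get("sourceURL", url),
                            "title": meta.get("title", ""),
                            "content": content})
        except Exception as err:
            results.append({"url": url, "title": "", "content": "", "error": str(err)})
    return {"results": results}

Since web_extract_tool is async but scrape_url is a blocking call, offloading each scrape with await asyncio.to_thread(firecrawl_app.scrape_url, url, formats=formats) would keep the event loop responsive; the patch as written blocks on each URL in turn.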
@@ -440,7 +498,18 @@ async def web_extract_tool(
             content_length = len(result.get('raw_content', ''))
             print(f" 📝 {url} ({content_length} characters)")
 
-        result_json = json.dumps(response, indent=2)
+        # Trim output to minimal fields per entry: title, content, error
+        trimmed_results = [
+            {
+                "title": r.get("title", ""),
+                "content": r.get("content", ""),
+                "error": r.get("error")
+            }
+            for r in response.get("results", [])
+        ]
+        trimmed_response = {"results": trimmed_results}
+
+        result_json = json.dumps(trimmed_response, indent=2)
         # Clean base64 images from extracted content
         cleaned_result = clean_base64_images(result_json)
 
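Minor note on the trimming above: r.get("error") yields None for successful pages, which serializes as "error": null in the JSON handed to the LLM. A sketch of a hypothetical variant that omits the field when there is no error:

from typing import Any, Dict, List

def trim_results(results: List[Dict[str, Any]]) -> Dict[str, Any]:
    # Hypothetical helper, not in the patch: include "error" only
    # when an error actually occurred, keeping the output free of nulls.
    trimmed = []
    for r in results:
        entry = {"title": r.get("title", ""), "content": r.get("content", "")}
        if r.get("error"):
            entry["error"] = r["error"]
        trimmed.append(entry)
    return {"results": trimmed}

The identical trimming block is duplicated in web_crawl_tool below; a shared helper like this would keep the two copies in sync.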
@@ -476,7 +545,7 @@ async def web_crawl_tool(
     Crawl a website with specific instructions using available crawling API backend.
 
     This function provides a generic interface for web crawling that can work
-    with multiple backends. Currently uses Tavily but can be easily swapped.
+    with multiple backends. Currently uses Firecrawl.
 
     Args:
         url (str): The base URL to crawl (can include or exclude https://)
@@ -516,13 +585,35 @@ async def web_crawl_tool(
         instructions_text = f" with instructions: '{instructions}'" if instructions else ""
         print(f"🕷️ Crawling {url}{instructions_text}")
 
-        # Use Tavily's crawl functionality
-        response = tavily_client.crawl(
-            url=url,
-            limit=20,  # Reasonable limit for most use cases
-            instructions=instructions or "Get all available content",
-            extract_depth=depth
+        # Use Firecrawl's crawl functionality and normalize to a common shape
+        # Firecrawl SDK returns the crawl results directly for synchronous crawl
+        scrape_options = ScrapeOptions(formats=["markdown", "html"])
+        crawl_result = firecrawl_app.crawl_url(
+            url,
+            limit=20,
+            scrape_options=scrape_options,
         )
+
+        pages: List[Dict[str, Any]] = []
+        if isinstance(crawl_result, dict):
+            # Firecrawl returns {success, data: [ {markdown?, html?, metadata} ]}
+            data_list = crawl_result.get("data", [])
+            for item in data_list:
+                metadata = item.get("metadata", {}) if isinstance(item, dict) else {}
+                page_url = metadata.get("sourceURL", "Unknown URL")
+                title = metadata.get("title", "")
+                content_markdown = item.get("markdown") if isinstance(item, dict) else None
+                content_html = item.get("html") if isinstance(item, dict) else None
+                content = content_markdown or content_html or ""
+                pages.append({
+                    "url": page_url,
+                    "title": title,
+                    "content": content,
+                    "raw_content": content,
+                    "metadata": metadata
+                })
+
+        response = {"results": pages}
 
         pages_crawled = len(response.get('results', []))
         print(f"✅ Crawled {pages_crawled} pages")
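Review note: the instructions argument is still echoed in the log line but is no longer forwarded to the backend, since the Firecrawl call here only takes url, limit, and scrape_options; the depth parameter is likewise dropped. For reference, a standalone sketch of the crawl normalization, under the same dict-shape assumption as above:

import os
from typing import Any, Dict, List
from firecrawl import FirecrawlApp, ScrapeOptions

app = FirecrawlApp(api_key=os.getenv("FIRECRAWL_API_KEY"))

def crawl_site(url: str, limit: int = 20) -> Dict[str, Any]:
    # Sketch only: assumes crawl_url returns
    # {"success": ..., "data": [{"markdown", "html", "metadata"}, ...]}.
    result = app.crawl_url(url, limit=limit,
                           scrape_options=ScrapeOptions(formats=["markdown", "html"]))
    pages: List[Dict[str, Any]] = []
    for item in (result.get("data", []) if isinstance(result, dict) else []):
        if not isinstance(item, dict):
            continue
        meta = item.get("metadata", {})
        # Prefer markdown, fall back to html, mirroring the patch.
        content = item.get("markdown") or item.get("html") or ""
        pages.append({"url": meta.get("sourceURL", "Unknown URL"),
                      "title": meta.get("title", ""),
                      "content": content})
    return {"results": pages}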
@@ -590,7 +681,18 @@ async def web_crawl_tool(
             content_length = len(result.get('content', ''))
             print(f" 🌐 {page_url} ({content_length} characters)")
 
-        result_json = json.dumps(response, indent=2)
+        # Trim output to minimal fields per entry: title, content, error
+        trimmed_results = [
+            {
+                "title": r.get("title", ""),
+                "content": r.get("content", ""),
+                "error": r.get("error")
+            }
+            for r in response.get("results", [])
+        ]
+        trimmed_response = {"results": trimmed_results}
+
+        result_json = json.dumps(trimmed_response, indent=2)
         # Clean base64 images from crawled content
         cleaned_result = clean_base64_images(result_json)
 
@@ -615,14 +717,14 @@ async def web_crawl_tool(
 
 
 # Convenience function to check if API key is available
-def check_tavily_api_key() -> bool:
+def check_firecrawl_api_key() -> bool:
     """
-    Check if the Tavily API key is available in environment variables.
+    Check if the Firecrawl API key is available in environment variables.
 
     Returns:
         bool: True if API key is set, False otherwise
     """
-    return bool(os.getenv("TAVILY_API_KEY"))
+    return bool(os.getenv("FIRECRAWL_API_KEY"))
 
 
 def check_nous_api_key() -> bool:
@@ -670,15 +772,15 @@ if __name__ == "__main__":
     print("=" * 40)
 
     # Check if API keys are available
-    tavily_available = check_tavily_api_key()
+    firecrawl_available = check_firecrawl_api_key()
     nous_available = check_nous_api_key()
 
-    if not tavily_available:
-        print("❌ TAVILY_API_KEY environment variable not set")
-        print("Please set your API key: export TAVILY_API_KEY='your-key-here'")
-        print("Get API key at: https://tavily.com/")
+    if not firecrawl_available:
+        print("❌ FIRECRAWL_API_KEY environment variable not set")
+        print("Please set your API key: export FIRECRAWL_API_KEY='your-key-here'")
+        print("Get API key at: https://firecrawl.dev/")
     else:
-        print("✅ Tavily API key found")
+        print("✅ Firecrawl API key found")
 
     if not nous_available:
         print("❌ NOUS_API_KEY environment variable not set")
@@ -688,7 +790,7 @@ if __name__ == "__main__":
     else:
         print("✅ Nous Research API key found")
 
-    if not tavily_available:
+    if not firecrawl_available:
         exit(1)
 
     print("🛠️ Web tools ready for use!")