initital commit
This commit is contained in:
commit
21d80ca683
8 changed files with 865 additions and 0 deletions
265
web_tools.py
Normal file
265
web_tools.py
Normal file
|
|
@ -0,0 +1,265 @@
|
|||
#!/usr/bin/env python3
|
||||
"""
|
||||
Standalone Web Tools Module
|
||||
|
||||
This module provides generic web tools that work with multiple backend providers.
|
||||
Currently uses Tavily as the backend, but the interface makes it easy to swap
|
||||
to other providers like Firecrawl without changing the function signatures.
|
||||
|
||||
Available tools:
|
||||
- web_search_tool: Search the web for information
|
||||
- web_extract_tool: Extract content from specific web pages
|
||||
- web_crawl_tool: Crawl websites with specific instructions
|
||||
|
||||
Backend compatibility:
|
||||
- Tavily: https://docs.tavily.com/
|
||||
- Firecrawl: https://docs.firecrawl.dev/features/search
|
||||
|
||||
Usage:
|
||||
from web_tools import web_search_tool, web_extract_tool, web_crawl_tool
|
||||
|
||||
# Search the web
|
||||
results = web_search_tool("Python machine learning libraries", limit=3)
|
||||
|
||||
# Extract content from URLs
|
||||
content = web_extract_tool(["https://example.com"], format="markdown")
|
||||
|
||||
# Crawl a website
|
||||
crawl_data = web_crawl_tool("example.com", "Find contact information")
|
||||
"""
|
||||
|
||||
#TODO: Search Capabilities over the scraped pages
|
||||
#TODO: Store the pages in something
|
||||
#TODO: Tool to see what pages are available/saved to search over
|
||||
|
||||
import json
|
||||
import os
|
||||
import re
|
||||
from typing import List
|
||||
from tavily import TavilyClient
|
||||
|
||||
# Initialize Tavily client once at module level
|
||||
tavily_client = TavilyClient(api_key=os.getenv("TAVILY_API_KEY"))
|
||||
|
||||
|
||||
def clean_base64_images(text: str) -> str:
|
||||
"""
|
||||
Remove base64 encoded images from text to reduce token count and clutter.
|
||||
|
||||
This function finds and removes base64 encoded images in various formats:
|
||||
- (data:image/png;base64,...)
|
||||
- (data:image/jpeg;base64,...)
|
||||
- (data:image/svg+xml;base64,...)
|
||||
- data:image/[type];base64,... (without parentheses)
|
||||
|
||||
Args:
|
||||
text: The text content to clean
|
||||
|
||||
Returns:
|
||||
Cleaned text with base64 images replaced with placeholders
|
||||
"""
|
||||
# Pattern to match base64 encoded images wrapped in parentheses
|
||||
# Matches: (data:image/[type];base64,[base64-string])
|
||||
base64_with_parens_pattern = r'\(data:image/[^;]+;base64,[A-Za-z0-9+/=]+\)'
|
||||
|
||||
# Pattern to match base64 encoded images without parentheses
|
||||
# Matches: data:image/[type];base64,[base64-string]
|
||||
base64_pattern = r'data:image/[^;]+;base64,[A-Za-z0-9+/=]+'
|
||||
|
||||
# Replace parentheses-wrapped images first
|
||||
cleaned_text = re.sub(base64_with_parens_pattern, '[BASE64_IMAGE_REMOVED]', text)
|
||||
|
||||
# Then replace any remaining non-parentheses images
|
||||
cleaned_text = re.sub(base64_pattern, '[BASE64_IMAGE_REMOVED]', cleaned_text)
|
||||
|
||||
return cleaned_text
|
||||
|
||||
|
||||
def web_search_tool(query: str, limit: int = 5) -> str:
|
||||
"""
|
||||
Search the web for information using available search API backend.
|
||||
|
||||
This function provides a generic interface for web search that can work
|
||||
with multiple backends. Currently uses Tavily but can be easily swapped.
|
||||
|
||||
Args:
|
||||
query (str): The search query to look up
|
||||
limit (int): Maximum number of results to return (default: 5)
|
||||
|
||||
Returns:
|
||||
str: JSON string containing search results with the following structure:
|
||||
{
|
||||
"query": str,
|
||||
"results": [
|
||||
{
|
||||
"title": str,
|
||||
"url": str,
|
||||
"content": str,
|
||||
"score": float
|
||||
},
|
||||
...
|
||||
]
|
||||
}
|
||||
|
||||
Raises:
|
||||
Exception: If search fails or API key is not set
|
||||
"""
|
||||
try:
|
||||
print(f"🔍 Searching the web for: '{query}' (limit: {limit})")
|
||||
|
||||
# Use Tavily's search functionality
|
||||
response = tavily_client.search(query=query, max_results=limit, search_depth="advanced")
|
||||
|
||||
print(f"✅ Found {len(response.get('results', []))} results")
|
||||
result_json = json.dumps(response, indent=2)
|
||||
# Clean base64 images from search results
|
||||
return clean_base64_images(result_json)
|
||||
|
||||
except Exception as e:
|
||||
error_msg = f"Error searching web: {str(e)}"
|
||||
print(f"❌ {error_msg}")
|
||||
return json.dumps({"error": error_msg})
|
||||
|
||||
|
||||
def web_extract_tool(urls: List[str], format: str = None) -> str:
|
||||
"""
|
||||
Extract content from specific web pages using available extraction API backend.
|
||||
|
||||
This function provides a generic interface for web content extraction that
|
||||
can work with multiple backends. Currently uses Tavily but can be easily swapped.
|
||||
|
||||
Args:
|
||||
urls (List[str]): List of URLs to extract content from
|
||||
format (str): Desired output format ("markdown" or "html", optional)
|
||||
|
||||
Returns:
|
||||
str: JSON string containing extracted content with the following structure:
|
||||
{
|
||||
"results": [
|
||||
{
|
||||
"url": str,
|
||||
"title": str,
|
||||
"raw_content": str,
|
||||
"content": str
|
||||
},
|
||||
...
|
||||
]
|
||||
}
|
||||
|
||||
Raises:
|
||||
Exception: If extraction fails or API key is not set
|
||||
"""
|
||||
try:
|
||||
print(f"📄 Extracting content from {len(urls)} URL(s)")
|
||||
|
||||
# Use Tavily's extract functionality
|
||||
response = tavily_client.extract(urls=urls, format=format)
|
||||
|
||||
print(f"✅ Extracted content from {len(response.get('results', []))} pages")
|
||||
|
||||
# Print summary of extracted pages for debugging
|
||||
for result in response.get('results', []):
|
||||
url = result.get('url', 'Unknown URL')
|
||||
content_length = len(result.get('raw_content', ''))
|
||||
print(f" 📝 {url} ({content_length} characters)")
|
||||
|
||||
result_json = json.dumps(response, indent=2)
|
||||
# Clean base64 images from extracted content
|
||||
return clean_base64_images(result_json)
|
||||
|
||||
except Exception as e:
|
||||
error_msg = f"Error extracting content: {str(e)}"
|
||||
print(f"❌ {error_msg}")
|
||||
return json.dumps({"error": error_msg})
|
||||
|
||||
|
||||
def web_crawl_tool(url: str, instructions: str = None, depth: str = "basic") -> str:
|
||||
"""
|
||||
Crawl a website with specific instructions using available crawling API backend.
|
||||
|
||||
This function provides a generic interface for web crawling that can work
|
||||
with multiple backends. Currently uses Tavily but can be easily swapped.
|
||||
|
||||
Args:
|
||||
url (str): The base URL to crawl (can include or exclude https://)
|
||||
instructions (str): Instructions for what to crawl/extract using LLM intelligence (optional)
|
||||
depth (str): Depth of extraction ("basic" or "advanced", default: "basic")
|
||||
|
||||
Returns:
|
||||
str: JSON string containing crawled content with the following structure:
|
||||
{
|
||||
"results": [
|
||||
{
|
||||
"url": str,
|
||||
"title": str,
|
||||
"content": str
|
||||
},
|
||||
...
|
||||
]
|
||||
}
|
||||
|
||||
Raises:
|
||||
Exception: If crawling fails or API key is not set
|
||||
"""
|
||||
try:
|
||||
instructions_text = f" with instructions: '{instructions}'" if instructions else ""
|
||||
print(f"🕷️ Crawling {url}{instructions_text}")
|
||||
|
||||
# Use Tavily's crawl functionality
|
||||
response = tavily_client.crawl(
|
||||
url=url,
|
||||
limit=20, # Reasonable limit for most use cases
|
||||
instructions=instructions or "Get all available content",
|
||||
extract_depth=depth
|
||||
)
|
||||
|
||||
print(f"✅ Crawled {len(response.get('results', []))} pages")
|
||||
|
||||
# Print summary of crawled pages for debugging
|
||||
for result in response.get('results', []):
|
||||
page_url = result.get('url', 'Unknown URL')
|
||||
content_length = len(result.get('content', ''))
|
||||
print(f" 🌐 {page_url} ({content_length} characters)")
|
||||
|
||||
result_json = json.dumps(response, indent=2)
|
||||
# Clean base64 images from crawled content
|
||||
return clean_base64_images(result_json)
|
||||
|
||||
except Exception as e:
|
||||
error_msg = f"Error crawling website: {str(e)}"
|
||||
print(f"❌ {error_msg}")
|
||||
return json.dumps({"error": error_msg})
|
||||
|
||||
|
||||
# Convenience function to check if API key is available
|
||||
def check_tavily_api_key() -> bool:
|
||||
"""
|
||||
Check if the Tavily API key is available in environment variables.
|
||||
|
||||
Returns:
|
||||
bool: True if API key is set, False otherwise
|
||||
"""
|
||||
return bool(os.getenv("TAVILY_API_KEY"))
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
"""
|
||||
Simple test/demo when run directly
|
||||
"""
|
||||
print("🌐 Standalone Web Tools Module")
|
||||
print("=" * 40)
|
||||
|
||||
# Check if API key is available
|
||||
if not check_tavily_api_key():
|
||||
print("❌ TAVILY_API_KEY environment variable not set")
|
||||
print("Please set your API key: export TAVILY_API_KEY='your-key-here'")
|
||||
print("Get API key at: https://tavily.com/")
|
||||
exit(1)
|
||||
|
||||
print("✅ Tavily API key found")
|
||||
print("🛠️ Web tools ready for use!")
|
||||
print("\nExample usage:")
|
||||
print(" from web_tools import web_search_tool, web_extract_tool, web_crawl_tool")
|
||||
print(" results = web_search_tool('Python tutorials')")
|
||||
print(" content = web_extract_tool(['https://example.com'])")
|
||||
print(" crawl_data = web_crawl_tool('example.com', 'Find documentation')")
|
||||
Loading…
Add table
Add a link
Reference in a new issue