initial commit

2025-07-22 18:32:44 -07:00 · 2025-07-22 18:32:44 -07:00 · 21d80ca683
commit 21d80ca683
8 changed files with 865 additions and 0 deletions
--- a/web_tools.py
+++ b/web_tools.py
@ -0,0 +1,265 @@
+#!/usr/bin/env python3
+"""
+Standalone Web Tools Module
+
+This module provides generic web tools that work with multiple backend providers.
+Currently uses Tavily as the backend, but the interface makes it easy to swap
+to other providers like Firecrawl without changing the function signatures.
+
+Available tools:
+- web_search_tool: Search the web for information
+- web_extract_tool: Extract content from specific web pages
+- web_crawl_tool: Crawl websites with specific instructions
+
+Backend compatibility:
+- Tavily: https://docs.tavily.com/
+- Firecrawl: https://docs.firecrawl.dev/features/search
+
+Usage:
+    from web_tools import web_search_tool, web_extract_tool, web_crawl_tool
+    
+    # Search the web
+    results = web_search_tool("Python machine learning libraries", limit=3)
+    
+    # Extract content from URLs  
+    content = web_extract_tool(["https://example.com"], format="markdown")
+    
+    # Crawl a website
+    crawl_data = web_crawl_tool("example.com", "Find contact information")
+"""
+
+#TODO: Search Capabilities over the scraped pages
+#TODO: Store the pages in something
+#TODO: Tool to see what pages are available/saved to search over
+
+import json
+import os
+import re
+from typing import List
+from tavily import TavilyClient
+
+# Shared Tavily client, constructed a single time when this module is imported.
+# NOTE(review): if TavilyClient rejects a missing API key at construction time,
+# importing this module fails before check_tavily_api_key() can run — confirm.
+tavily_client = TavilyClient(api_key=os.environ.get("TAVILY_API_KEY"))
+
+
+# Shape of an image data URI: media type, optional ";name=value" parameters,
+# the ";base64," marker, then the payload. The media type is restricted to
+# token characters so that a stray "data:image/" in ordinary prose cannot
+# swallow the text that follows it up to some later ";base64,".
+_DATA_IMAGE_URI = (
+    r'data:image/[\w.+-]+'
+    r'(?:;[\w.+-]+=[^;,\s()]*)*'
+    r';base64,[A-Za-z0-9+/=]+'
+)
+
+# The parenthesised alternative is listed first so that a Markdown link target
+# such as "(data:image/png;base64,...)" is replaced together with its
+# parentheses; bare data URIs are handled by the second alternative.
+_BASE64_IMAGE_RE = re.compile(r'\(' + _DATA_IMAGE_URI + r'\)|' + _DATA_IMAGE_URI)
+
+
+def clean_base64_images(text: str) -> str:
+    """
+    Remove base64 encoded images from text to reduce token count and clutter.
+
+    Every inline image data URI is replaced with the literal placeholder
+    "[BASE64_IMAGE_REMOVED]". Recognised forms:
+    - (data:image/png;base64,...)          parentheses are removed as well
+    - data:image/jpeg;base64,...           bare data URI
+    - data:image/svg+xml;charset=utf-8;base64,...   with media-type parameters
+
+    Args:
+        text: The text content to clean. An empty (or None) value is returned
+              unchanged.
+
+    Returns:
+        Cleaned text with base64 images replaced with placeholders
+    """
+    if not text:
+        return text
+
+    # Single pass over the text; the placeholder contains no "data:image"
+    # prefix, so it can never be matched again.
+    return _BASE64_IMAGE_RE.sub('[BASE64_IMAGE_REMOVED]', text)
+
+
+def web_search_tool(query: str, limit: int = 5) -> str:
+    """
+    Run a web search through the configured backend and return the result as JSON text.
+
+    The call is forwarded to the module-level Tavily client with the
+    "advanced" search depth. The backend response is serialised with
+    json.dumps and then passed through clean_base64_images.
+
+    Args:
+        query (str): The search query to look up
+        limit (int): Maximum number of results to request (default: 5)
+
+    Returns:
+        str: JSON string. On success it is the backend response, which is
+             expected (not enforced here) to look like:
+             {
+                 "query": str,
+                 "results": [
+                     {
+                         "title": str,
+                         "url": str,
+                         "content": str,
+                         "score": float
+                     },
+                     ...
+                 ]
+             }
+             On failure it is {"error": "Error searching web: <details>"}.
+
+    Note:
+        Exceptions raised during the search are caught, printed and reported
+        through the "error" JSON payload instead of propagating to the caller.
+    """
+    try:
+        print(f"🔍 Searching the web for: '{query}' (limit: {limit})")
+
+        # Delegate to Tavily, always asking for the "advanced" depth
+        search_response = tavily_client.search(
+            query=query,
+            max_results=limit,
+            search_depth="advanced",
+        )
+
+        hits = search_response.get('results', [])
+        print(f"✅ Found {len(hits)} results")
+
+        # Serialise first, then strip inline base64 images from the JSON text
+        return clean_base64_images(json.dumps(search_response, indent=2))
+
+    except Exception as exc:
+        failure = f"Error searching web: {exc!s}"
+        print(f"❌ {failure}")
+        return json.dumps({"error": failure})
+
+
+def web_extract_tool(urls: List[str], format: str = None) -> str:
+    """
+    Extract content from specific web pages using available extraction API backend.
+
+    This function provides a generic interface for web content extraction that
+    can work with multiple backends. Currently uses Tavily but can be easily swapped.
+
+    Args:
+        urls (List[str]): List of URLs to extract content from. A single URL
+            passed as a plain string is accepted and treated as a one-item list.
+        format (str): Desired output format ("markdown" or "html", optional);
+            forwarded unchanged to the backend
+
+    Returns:
+        str: JSON string. On success it is the backend response, which is
+             expected (not enforced here) to look like:
+             {
+                 "results": [
+                     {
+                         "url": str,
+                         "title": str,
+                         "raw_content": str,
+                         "content": str
+                     },
+                     ...
+                 ]
+             }
+             On failure it is {"error": "Error extracting content: <details>"}.
+
+    Note:
+        Exceptions raised during extraction are caught, printed and reported
+        through the "error" JSON payload instead of propagating to the caller.
+    """
+    try:
+        # Tool callers sometimes hand over one URL as a bare string; wrap it so
+        # the count below reports URLs rather than characters.
+        if isinstance(urls, str):
+            urls = [urls]
+
+        print(f"📄 Extracting content from {len(urls)} URL(s)")
+
+        # Use Tavily's extract functionality
+        response = tavily_client.extract(urls=urls, format=format)
+
+        # "or []" also covers a response where "results" is present but null
+        pages = response.get('results') or []
+        print(f"✅ Extracted content from {len(pages)} pages")
+
+        # Print summary of extracted pages for debugging
+        for page in pages:
+            page_url = page.get('url', 'Unknown URL')
+            # A null raw_content must not make this debug line raise and
+            # thereby discard an otherwise successful response.
+            content_length = len(page.get('raw_content') or '')
+            print(f"  📝 {page_url} ({content_length} characters)")
+
+        result_json = json.dumps(response, indent=2)
+        # Clean base64 images from extracted content
+        return clean_base64_images(result_json)
+
+    except Exception as e:
+        error_msg = f"Error extracting content: {str(e)}"
+        print(f"❌ {error_msg}")
+        return json.dumps({"error": error_msg})
+
+
+def web_crawl_tool(url: str, instructions: str = None, depth: str = "basic", limit: int = 20) -> str:
+    """
+    Crawl a website with specific instructions using available crawling API backend.
+
+    This function provides a generic interface for web crawling that can work
+    with multiple backends. Currently uses Tavily but can be easily swapped.
+
+    Args:
+        url (str): The base URL to crawl (can include or exclude https://)
+        instructions (str): Instructions for what to crawl/extract using LLM intelligence
+            (optional; "Get all available content" is sent when omitted or empty)
+        depth (str): Depth of extraction ("basic" or "advanced", default: "basic")
+        limit (int): Maximum number of pages to request from the backend (default: 20)
+
+    Returns:
+        str: JSON string. On success it is the backend response, expected (not
+             enforced here) to contain a "results" list with one entry per
+             crawled page, e.g.:
+             {
+                 "results": [
+                     {
+                         "url": str,
+                         "raw_content": str
+                     },
+                     ...
+                 ]
+             }
+             On failure it is {"error": "Error crawling website: <details>"}.
+
+    Note:
+        Exceptions raised during crawling are caught, printed and reported
+        through the "error" JSON payload instead of propagating to the caller.
+    """
+    try:
+        instructions_text = f" with instructions: '{instructions}'" if instructions else ""
+        print(f"🕷️ Crawling {url}{instructions_text}")
+
+        # Use Tavily's crawl functionality
+        response = tavily_client.crawl(
+            url=url,
+            limit=limit,
+            instructions=instructions or "Get all available content",
+            extract_depth=depth
+        )
+
+        # "or []" also covers a response where "results" is present but null
+        pages = response.get('results') or []
+        print(f"✅ Crawled {len(pages)} pages")
+
+        # Print summary of crawled pages for debugging
+        for page in pages:
+            page_url = page.get('url', 'Unknown URL')
+            # Tavily's crawl results are documented with "raw_content"; fall
+            # back to "content" for other backends, and tolerate null values.
+            page_text = page.get('raw_content') or page.get('content') or ''
+            print(f"  🌐 {page_url} ({len(page_text)} characters)")
+
+        result_json = json.dumps(response, indent=2)
+        # Clean base64 images from crawled content
+        return clean_base64_images(result_json)
+
+    except Exception as e:
+        error_msg = f"Error crawling website: {str(e)}"
+        print(f"❌ {error_msg}")
+        return json.dumps({"error": error_msg})
+
+
+# Helper for callers that want to verify configuration before using the tools
+def check_tavily_api_key() -> bool:
+    """
+    Report whether TAVILY_API_KEY is present in the environment with a non-empty value.
+
+    Returns:
+        bool: True when the variable is set to a non-empty string, False when
+              it is unset or empty
+    """
+    api_key = os.environ.get("TAVILY_API_KEY", "")
+    return api_key != ""
+
+
+if __name__ == "__main__":
+    # Simple self-check/demo when the file is executed directly: report whether
+    # the API key is configured and print example usage.
+    print("🌐 Standalone Web Tools Module")
+    print("=" * 40)
+
+    # Check if API key is available
+    if not check_tavily_api_key():
+        print("❌ TAVILY_API_KEY environment variable not set")
+        print("Please set your API key: export TAVILY_API_KEY='your-key-here'")
+        print("Get API key at: https://tavily.com/")
+        # Raise SystemExit directly instead of calling the exit() helper: that
+        # helper is injected by the `site` module for interactive use and is
+        # not available when Python runs with -S or in some frozen builds.
+        raise SystemExit(1)
+
+    print("✅ Tavily API key found")
+    print("🛠️  Web tools ready for use!")
+    print("\nExample usage:")
+    print("  from web_tools import web_search_tool, web_extract_tool, web_crawl_tool")
+    print("  results = web_search_tool('Python tutorials')")
+    print("  content = web_extract_tool(['https://example.com'])")
+    print("  crawl_data = web_crawl_tool('example.com', 'Find documentation')")