implement first pass of scrape/crawl content compression

This commit is contained in:
parent 1dacd941f6
commit bf4223f381

4 changed files with 272 additions and 45 deletions

web_tools.py (302)
@@ -15,6 +15,10 @@ Backend compatibility:
- Tavily: https://docs.tavily.com/
- Firecrawl: https://docs.firecrawl.dev/features/search

LLM Processing:
- Uses Nous Research API with Gemini 2.5 Flash for intelligent content extraction
- Extracts key excerpts and creates markdown summaries to reduce token usage

Usage:
    from web_tools import web_search_tool, web_extract_tool, web_crawl_tool

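For orientation, a minimal usage sketch of the interface described in the docstring above (illustrative only, not part of the diff; the query and URLs are placeholders, and extract/crawl become awaitable with this change):

    import asyncio
    from web_tools import web_search_tool, web_extract_tool, web_crawl_tool

    results = web_search_tool("python packaging", limit=3)   # search stays synchronous

    async def demo():
        pages = await web_extract_tool(["https://example.com"])
        crawl = await web_crawl_tool("example.com", "Find documentation")
        return pages, crawl

    asyncio.run(demo())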
@@ -35,12 +39,110 @@ Usage:
import json
import os
import re
from typing import List
import asyncio
from typing import List, Dict, Any, Optional
from tavily import TavilyClient
from openai import AsyncOpenAI

# Initialize Tavily client once at module level
tavily_client = TavilyClient(api_key=os.getenv("TAVILY_API_KEY"))

# Initialize Nous Research API client for LLM processing (async)
nous_client = AsyncOpenAI(
    api_key=os.getenv("NOUS_API_KEY"),
    base_url="https://inference-api.nousresearch.com/v1"
)

# Configuration for LLM processing
DEFAULT_SUMMARIZER_MODEL = "gemini-2.5-flash"
DEFAULT_MIN_LENGTH_FOR_SUMMARIZATION = 5000


async def process_content_with_llm(
    content: str,
    url: str = "",
    title: str = "",
    model: str = DEFAULT_SUMMARIZER_MODEL,
    min_length: int = DEFAULT_MIN_LENGTH_FOR_SUMMARIZATION
) -> Optional[str]:
    """
    Process web content using LLM to create intelligent summaries with key excerpts.

    This function uses Gemini 2.5 Flash (or specified model) via Nous Research API
    to intelligently extract key information and create markdown summaries,
    significantly reducing token usage while preserving all important information.

    Args:
        content (str): The raw content to process
        url (str): The source URL (for context, optional)
        title (str): The page title (for context, optional)
        model (str): The model to use for processing (default: gemini-2.5-flash)
        min_length (int): Minimum content length to trigger processing (default: 5000)

    Returns:
        Optional[str]: Processed markdown content, or None if content too short or processing fails
    """
    try:
        # Skip processing if content is too short
        if len(content) < min_length:
            print(f"📏 Content too short ({len(content)} < {min_length} chars), skipping LLM processing")
            return None

        print(f"🧠 Processing content with LLM ({len(content)} characters)")

        # Create context information
        context_info = []
        if title:
            context_info.append(f"Title: {title}")
        if url:
            context_info.append(f"Source: {url}")

        context_str = "\n".join(context_info) + "\n\n" if context_info else ""

        # Simplified prompt for better quality markdown output
        system_prompt = """You are an expert content analyst. Your job is to process web content and create a comprehensive yet concise summary that preserves all important information while dramatically reducing bulk.

Create a well-structured markdown summary that includes:
1. Key excerpts (quotes, code snippets, important facts) in their original format
2. Comprehensive summary of all other important information
3. Proper markdown formatting with headers, bullets, and emphasis

Your goal is to preserve ALL important information while reducing length. Never lose key facts, figures, insights, or actionable information. Make it scannable and well-organized."""

        user_prompt = f"""Please process this web content and create a comprehensive markdown summary:

{context_str}CONTENT TO PROCESS:
{content}

Create a markdown summary that captures all key information in a well-organized, scannable format. Include important quotes and code snippets in their original formatting. Focus on actionable information, specific details, and unique insights."""

        # Call the LLM asynchronously
        response = await nous_client.chat.completions.create(
            model=model,
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": user_prompt}
            ],
            temperature=0.1,  # Low temperature for consistent extraction
            max_tokens=4000  # Generous limit for comprehensive processing
        )

        # Get the markdown response directly
        processed_content = response.choices[0].message.content.strip()

        # Calculate compression metrics for logging
        original_length = len(content)
        processed_length = len(processed_content)
        compression_ratio = processed_length / original_length if original_length > 0 else 1.0

        print(f"✅ Content processed: {original_length} → {processed_length} chars ({compression_ratio:.1%})")

        return processed_content

    except Exception as e:
        print(f"❌ Error processing content with LLM: {str(e)}")
        return None


def clean_base64_images(text: str) -> str:
    """
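The new helper above can also be awaited on its own. A hedged sketch, assuming NOUS_API_KEY is exported and using a hypothetical local file as input; content shorter than min_length comes back as None:

    import asyncio

    async def summarize_file(path: str) -> str:
        # Fall back to the original text when the helper skips or fails.
        text = open(path, encoding="utf-8").read()
        summary = await process_content_with_llm(text, url=path, title=path)
        return summary if summary is not None else text

    print(asyncio.run(summarize_file("scraped_page.txt")))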
@@ -82,6 +184,8 @@ def web_search_tool(query: str, limit: int = 5) -> str:
    This function provides a generic interface for web search that can work
    with multiple backends. Currently uses Tavily but can be easily swapped.

    Note: Search results are already concise snippets, so no LLM processing is applied.

    Args:
        query (str): The search query to look up
        limit (int): Maximum number of results to return (default: 5)
@@ -111,6 +215,7 @@ def web_search_tool(query: str, limit: int = 5) -> str:
        response = tavily_client.search(query=query, max_results=limit, search_depth="advanced")

        print(f"✅ Found {len(response.get('results', []))} results")

        result_json = json.dumps(response, indent=2)
        # Clean base64 images from search results
        return clean_base64_images(result_json)
@@ -121,7 +226,13 @@ def web_search_tool(query: str, limit: int = 5) -> str:
        return json.dumps({"error": error_msg})


def web_extract_tool(urls: List[str], format: str = None) -> str:
async def web_extract_tool(
    urls: List[str],
    format: str = None,
    use_llm_processing: bool = True,
    model: str = DEFAULT_SUMMARIZER_MODEL,
    min_length: int = DEFAULT_MIN_LENGTH_FOR_SUMMARIZATION
) -> str:
    """
    Extract content from specific web pages using available extraction API backend.

@@ -131,20 +242,13 @@ def web_extract_tool(urls: List[str], format: str = None) -> str:
    Args:
        urls (List[str]): List of URLs to extract content from
        format (str): Desired output format ("markdown" or "html", optional)
        use_llm_processing (bool): Whether to process content with LLM for summarization (default: True)
        model (str): The model to use for LLM processing (default: gemini-2.5-flash)
        min_length (int): Minimum content length to trigger LLM processing (default: 5000)

    Returns:
        str: JSON string containing extracted content with the following structure:
            {
                "results": [
                    {
                        "url": str,
                        "title": str,
                        "raw_content": str,
                        "content": str
                    },
                    ...
                ]
            }
        str: JSON string containing extracted content. If LLM processing is enabled and successful,
            the 'content' field will contain the processed markdown summary instead of raw content.

    Raises:
        Exception: If extraction fails or API key is not set
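A sketch of how a caller might consume the return value described above (illustrative, not from the commit):

    import json

    async def show_extracted(urls):
        payload = json.loads(await web_extract_tool(urls))
        for page in payload.get("results", []):
            # 'content' holds the markdown summary when LLM processing ran;
            # 'raw_content' keeps the unprocessed text for reference.
            print(page.get("url"), len(page.get("content", "")))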
@@ -157,11 +261,40 @@ def web_extract_tool(urls: List[str], format: str = None) -> str:

        print(f"✅ Extracted content from {len(response.get('results', []))} pages")

        # Print summary of extracted pages for debugging
        for result in response.get('results', []):
            url = result.get('url', 'Unknown URL')
            content_length = len(result.get('raw_content', ''))
            print(f" 📝 {url} ({content_length} characters)")
        # Process each result with LLM if enabled
        if use_llm_processing and os.getenv("NOUS_API_KEY"):
            print("🧠 Processing extracted content with LLM...")

            for result in response.get('results', []):
                url = result.get('url', 'Unknown URL')
                title = result.get('title', '')
                raw_content = result.get('raw_content', '') or result.get('content', '')

                if raw_content:
                    # Process content with LLM
                    processed = await process_content_with_llm(
                        raw_content, url, title, model, min_length
                    )

                    if processed:
                        # Replace content with processed version
                        result['content'] = processed
                        # Keep raw content in separate field for reference
                        result['raw_content'] = raw_content
                        print(f" 📝 {url} (processed)")
                    else:
                        print(f" 📝 {url} (no processing - content too short)")
                else:
                    print(f" ⚠️ {url} (no content to process)")
        else:
            if use_llm_processing and not os.getenv("NOUS_API_KEY"):
                print("⚠️ LLM processing requested but NOUS_API_KEY not set, returning raw content")

            # Print summary of extracted pages for debugging (original behavior)
            for result in response.get('results', []):
                url = result.get('url', 'Unknown URL')
                content_length = len(result.get('raw_content', ''))
                print(f" 📝 {url} ({content_length} characters)")

        result_json = json.dumps(response, indent=2)
        # Clean base64 images from extracted content
@@ -173,7 +306,14 @@ def web_extract_tool(urls: List[str], format: str = None) -> str:
        return json.dumps({"error": error_msg})


def web_crawl_tool(url: str, instructions: str = None, depth: str = "basic") -> str:
async def web_crawl_tool(
    url: str,
    instructions: str = None,
    depth: str = "basic",
    use_llm_processing: bool = True,
    model: str = DEFAULT_SUMMARIZER_MODEL,
    min_length: int = DEFAULT_MIN_LENGTH_FOR_SUMMARIZATION
) -> str:
    """
    Crawl a website with specific instructions using available crawling API backend.

@@ -184,19 +324,14 @@ def web_crawl_tool(url: str, instructions: str = None, depth: str = "basic") -> str:
        url (str): The base URL to crawl (can include or exclude https://)
        instructions (str): Instructions for what to crawl/extract using LLM intelligence (optional)
        depth (str): Depth of extraction ("basic" or "advanced", default: "basic")
        use_llm_processing (bool): Whether to process content with LLM for summarization (default: True)
        model (str): The model to use for LLM processing (default: gemini-2.5-flash)
        min_length (int): Minimum content length to trigger LLM processing (default: 5000)

    Returns:
        str: JSON string containing crawled content with the following structure:
            {
                "results": [
                    {
                        "url": str,
                        "title": str,
                        "content": str
                    },
                    ...
                ]
            }
        str: JSON string containing crawled content. If LLM processing is enabled and successful,
            the 'content' field will contain the processed markdown summary instead of raw content.
            Each page is processed individually.

    Raises:
        Exception: If crawling fails or API key is not set
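Likewise for the crawler, a hedged example of tuning the processing threshold per the parameters documented above (the URL, instructions, and 3000-character threshold are placeholders):

    import json

    async def crawl_docs():
        raw = await web_crawl_tool(
            "docs.example.com",
            "Find API reference pages",
            depth="advanced",
            min_length=3000,  # summarize pages shorter than the 5000-char default
        )
        return json.loads(raw).get("results", [])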
@@ -215,11 +350,40 @@ def web_crawl_tool(url: str, instructions: str = None, depth: str = "basic") -> str:

        print(f"✅ Crawled {len(response.get('results', []))} pages")

        # Print summary of crawled pages for debugging
        for result in response.get('results', []):
            page_url = result.get('url', 'Unknown URL')
            content_length = len(result.get('content', ''))
            print(f" 🌐 {page_url} ({content_length} characters)")
        # Process each result with LLM if enabled
        if use_llm_processing and os.getenv("NOUS_API_KEY"):
            print("🧠 Processing crawled content with LLM...")

            for result in response.get('results', []):
                page_url = result.get('url', 'Unknown URL')
                title = result.get('title', '')
                content = result.get('content', '')

                if content:
                    # Process content with LLM
                    processed = await process_content_with_llm(
                        content, page_url, title, model, min_length
                    )

                    if processed:
                        # Keep original content in raw_content field
                        result['raw_content'] = content
                        # Replace content with processed version
                        result['content'] = processed
                        print(f" 🌐 {page_url} (processed)")
                    else:
                        print(f" 🌐 {page_url} (no processing - content too short)")
                else:
                    print(f" ⚠️ {page_url} (no content to process)")
        else:
            if use_llm_processing and not os.getenv("NOUS_API_KEY"):
                print("⚠️ LLM processing requested but NOUS_API_KEY not set, returning raw content")

            # Print summary of crawled pages for debugging (original behavior)
            for result in response.get('results', []):
                page_url = result.get('url', 'Unknown URL')
                content_length = len(result.get('content', ''))
                print(f" 🌐 {page_url} ({content_length} characters)")

        result_json = json.dumps(response, indent=2)
        # Clean base64 images from crawled content
@@ -242,6 +406,16 @@ def check_tavily_api_key() -> bool:
    return bool(os.getenv("TAVILY_API_KEY"))


def check_nous_api_key() -> bool:
    """
    Check if the Nous Research API key is available in environment variables.

    Returns:
        bool: True if API key is set, False otherwise
    """
    return bool(os.getenv("NOUS_API_KEY"))


if __name__ == "__main__":
    """
    Simple test/demo when run directly
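A small sketch (not in the diff) of using the new key check to opt into LLM processing only when it can succeed:

    async def safe_extract(urls):
        # Falls back to raw content automatically when NOUS_API_KEY is absent.
        return await web_extract_tool(urls, use_llm_processing=check_nous_api_key())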
@@ -249,17 +423,61 @@ if __name__ == "__main__":
    print("🌐 Standalone Web Tools Module")
    print("=" * 40)

    # Check if API key is available
    if not check_tavily_api_key():
    # Check if API keys are available
    tavily_available = check_tavily_api_key()
    nous_available = check_nous_api_key()

    if not tavily_available:
        print("❌ TAVILY_API_KEY environment variable not set")
        print("Please set your API key: export TAVILY_API_KEY='your-key-here'")
        print("Get API key at: https://tavily.com/")
    else:
        print("✅ Tavily API key found")

    if not nous_available:
        print("❌ NOUS_API_KEY environment variable not set")
        print("Please set your API key: export NOUS_API_KEY='your-key-here'")
        print("Get API key at: https://inference-api.nousresearch.com/")
        print("⚠️ Without Nous API key, LLM content processing will be disabled")
    else:
        print("✅ Nous Research API key found")

    if not tavily_available:
        exit(1)

    print("✅ Tavily API key found")
    print("🛠️ Web tools ready for use!")
    print("\nExample usage:")

    if nous_available:
        print("🧠 LLM content processing available with Gemini 2.5 Flash")
        print(f"   Default min length for processing: {DEFAULT_MIN_LENGTH_FOR_SUMMARIZATION} chars")

    print("\nBasic usage:")
    print("  from web_tools import web_search_tool, web_extract_tool, web_crawl_tool")
    print("  import asyncio")
    print("")
    print("  # Search (synchronous)")
    print("  results = web_search_tool('Python tutorials')")
    print("  content = web_extract_tool(['https://example.com'])")
    print("  crawl_data = web_crawl_tool('example.com', 'Find documentation')")
    print("")
    print("  # Extract and crawl (asynchronous)")
    print("  async def main():")
    print("      content = await web_extract_tool(['https://example.com'])")
    print("      crawl_data = await web_crawl_tool('example.com', 'Find docs')")
    print("  asyncio.run(main())")

    if nous_available:
        print("\nLLM-enhanced usage:")
        print("  # Content automatically processed for pages >5000 chars (default)")
        print("  content = await web_extract_tool(['https://python.org/about/'])")
        print("")
        print("  # Customize processing parameters")
        print("  crawl_data = await web_crawl_tool(")
        print("      'docs.python.org',")
        print("      'Find key concepts',")
        print("      model='gemini-2.5-flash',")
        print("      min_length=3000")
        print("  )")
        print("")
        print("  # Disable LLM processing")
        print("  raw_content = await web_extract_tool(['https://example.com'], use_llm_processing=False)")

    print(f"\n📝 Run 'python test_web_tools_llm.py' to test LLM processing capabilities")