implement first pass of scrape/crawl content compression

This commit is contained in:
parent 1dacd941f6
commit bf4223f381

4 changed files with 272 additions and 45 deletions

web_tools.py (302)
@@ -15,6 +15,10 @@ Backend compatibility:
- Tavily: https://docs.tavily.com/
- Firecrawl: https://docs.firecrawl.dev/features/search

LLM Processing:
- Uses Nous Research API with Gemini 2.5 Flash for intelligent content extraction
- Extracts key excerpts and creates markdown summaries to reduce token usage

Usage:
    from web_tools import web_search_tool, web_extract_tool, web_crawl_tool

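For orientation, a minimal usage sketch of the interface described in the docstring above (illustrative only, not part of the diff; the query and URLs are placeholders, and extract/crawl become awaitable with this change):

    import asyncio
    from web_tools import web_search_tool, web_extract_tool, web_crawl_tool

    results = web_search_tool("python packaging", limit=3)   # search stays synchronous

    async def demo():
        pages = await web_extract_tool(["https://example.com"])
        crawl = await web_crawl_tool("example.com", "Find documentation")
        return pages, crawl

    asyncio.run(demo())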
@@ -35,12 +39,110 @@ Usage:
import json
import os
import re
from typing import List
import asyncio
from typing import List, Dict, Any, Optional
from tavily import TavilyClient
from openai import AsyncOpenAI

# Initialize Tavily client once at module level
tavily_client = TavilyClient(api_key=os.getenv("TAVILY_API_KEY"))

# Initialize Nous Research API client for LLM processing (async)
nous_client = AsyncOpenAI(
    api_key=os.getenv("NOUS_API_KEY"),
    base_url="https://inference-api.nousresearch.com/v1"
)

# Configuration for LLM processing
DEFAULT_SUMMARIZER_MODEL = "gemini-2.5-flash"
DEFAULT_MIN_LENGTH_FOR_SUMMARIZATION = 5000


async def process_content_with_llm(
    content: str,
    url: str = "",
    title: str = "",
    model: str = DEFAULT_SUMMARIZER_MODEL,
    min_length: int = DEFAULT_MIN_LENGTH_FOR_SUMMARIZATION
) -> Optional[str]:
    """
    Process web content using LLM to create intelligent summaries with key excerpts.

    This function uses Gemini 2.5 Flash (or specified model) via Nous Research API
    to intelligently extract key information and create markdown summaries,
    significantly reducing token usage while preserving all important information.

    Args:
        content (str): The raw content to process
        url (str): The source URL (for context, optional)
        title (str): The page title (for context, optional)
        model (str): The model to use for processing (default: gemini-2.5-flash)
        min_length (int): Minimum content length to trigger processing (default: 5000)

    Returns:
        Optional[str]: Processed markdown content, or None if content too short or processing fails
    """
    try:
        # Skip processing if content is too short
        if len(content) < min_length:
            print(f"📏 Content too short ({len(content)} < {min_length} chars), skipping LLM processing")
            return None

        print(f"🧠 Processing content with LLM ({len(content)} characters)")

        # Create context information
        context_info = []
        if title:
            context_info.append(f"Title: {title}")
        if url:
            context_info.append(f"Source: {url}")

        context_str = "\n".join(context_info) + "\n\n" if context_info else ""

        # Simplified prompt for better quality markdown output
        system_prompt = """You are an expert content analyst. Your job is to process web content and create a comprehensive yet concise summary that preserves all important information while dramatically reducing bulk.

Create a well-structured markdown summary that includes:
1. Key excerpts (quotes, code snippets, important facts) in their original format
2. Comprehensive summary of all other important information
3. Proper markdown formatting with headers, bullets, and emphasis

Your goal is to preserve ALL important information while reducing length. Never lose key facts, figures, insights, or actionable information. Make it scannable and well-organized."""

        user_prompt = f"""Please process this web content and create a comprehensive markdown summary:

{context_str}CONTENT TO PROCESS:
{content}

Create a markdown summary that captures all key information in a well-organized, scannable format. Include important quotes and code snippets in their original formatting. Focus on actionable information, specific details, and unique insights."""

        # Call the LLM asynchronously
        response = await nous_client.chat.completions.create(
            model=model,
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": user_prompt}
            ],
            temperature=0.1,  # Low temperature for consistent extraction
            max_tokens=4000  # Generous limit for comprehensive processing
        )

        # Get the markdown response directly
        processed_content = response.choices[0].message.content.strip()

        # Calculate compression metrics for logging
        original_length = len(content)
        processed_length = len(processed_content)
        compression_ratio = processed_length / original_length if original_length > 0 else 1.0

        print(f"✅ Content processed: {original_length} → {processed_length} chars ({compression_ratio:.1%})")

        return processed_content

    except Exception as e:
        print(f"❌ Error processing content with LLM: {str(e)}")
        return None


def clean_base64_images(text: str) -> str:
    """
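The new helper above can also be awaited on its own. A hedged sketch, assuming NOUS_API_KEY is exported and using a hypothetical local file as input; content shorter than min_length comes back as None:

    import asyncio

    async def summarize_file(path: str) -> str:
        # Fall back to the original text when the helper skips or fails.
        text = open(path, encoding="utf-8").read()
        summary = await process_content_with_llm(text, url=path, title=path)
        return summary if summary is not None else text

    print(asyncio.run(summarize_file("scraped_page.txt")))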
@@ -82,6 +184,8 @@ def web_search_tool(query: str, limit: int = 5) -> str:
    This function provides a generic interface for web search that can work
    with multiple backends. Currently uses Tavily but can be easily swapped.

    Note: Search results are already concise snippets, so no LLM processing is applied.

    Args:
        query (str): The search query to look up
        limit (int): Maximum number of results to return (default: 5)
@@ -111,6 +215,7 @@ def web_search_tool(query: str, limit: int = 5) -> str:
        response = tavily_client.search(query=query, max_results=limit, search_depth="advanced")

        print(f"✅ Found {len(response.get('results', []))} results")

        result_json = json.dumps(response, indent=2)
        # Clean base64 images from search results
        return clean_base64_images(result_json)
@@ -121,7 +226,13 @@ def web_search_tool(query: str, limit: int = 5) -> str:
        return json.dumps({"error": error_msg})


def web_extract_tool(urls: List[str], format: str = None) -> str:
async def web_extract_tool(
    urls: List[str],
    format: str = None,
    use_llm_processing: bool = True,
    model: str = DEFAULT_SUMMARIZER_MODEL,
    min_length: int = DEFAULT_MIN_LENGTH_FOR_SUMMARIZATION
) -> str:
    """
    Extract content from specific web pages using available extraction API backend.

@@ -131,20 +242,13 @@ def web_extract_tool(urls: List[str], format: str = None) -> str:
    Args:
        urls (List[str]): List of URLs to extract content from
        format (str): Desired output format ("markdown" or "html", optional)
        use_llm_processing (bool): Whether to process content with LLM for summarization (default: True)
        model (str): The model to use for LLM processing (default: gemini-2.5-flash)
        min_length (int): Minimum content length to trigger LLM processing (default: 5000)

    Returns:
        str: JSON string containing extracted content with the following structure:
            {
                "results": [
                    {
                        "url": str,
                        "title": str,
                        "raw_content": str,
                        "content": str
                    },
                    ...
                ]
            }
        str: JSON string containing extracted content. If LLM processing is enabled and successful,
            the 'content' field will contain the processed markdown summary instead of raw content.

    Raises:
        Exception: If extraction fails or API key is not set
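A sketch of how a caller might consume the return value described above (illustrative, not from the commit):

    import json

    async def show_extracted(urls):
        payload = json.loads(await web_extract_tool(urls))
        for page in payload.get("results", []):
            # 'content' holds the markdown summary when LLM processing ran;
            # 'raw_content' keeps the unprocessed text for reference.
            print(page.get("url"), len(page.get("content", "")))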
@@ -157,11 +261,40 @@ def web_extract_tool(urls: List[str], format: str = None) -> str:

        print(f"✅ Extracted content from {len(response.get('results', []))} pages")

        # Print summary of extracted pages for debugging
        for result in response.get('results', []):
            url = result.get('url', 'Unknown URL')
            content_length = len(result.get('raw_content', ''))
            print(f" 📝 {url} ({content_length} characters)")
        # Process each result with LLM if enabled
        if use_llm_processing and os.getenv("NOUS_API_KEY"):
            print("🧠 Processing extracted content with LLM...")

            for result in response.get('results', []):
                url = result.get('url', 'Unknown URL')
                title = result.get('title', '')
                raw_content = result.get('raw_content', '') or result.get('content', '')

                if raw_content:
                    # Process content with LLM
                    processed = await process_content_with_llm(
                        raw_content, url, title, model, min_length
                    )

                    if processed:
                        # Replace content with processed version
                        result['content'] = processed
                        # Keep raw content in separate field for reference
                        result['raw_content'] = raw_content
                        print(f" 📝 {url} (processed)")
                    else:
                        print(f" 📝 {url} (no processing - content too short)")
                else:
                    print(f" ⚠️ {url} (no content to process)")
        else:
            if use_llm_processing and not os.getenv("NOUS_API_KEY"):
                print("⚠️ LLM processing requested but NOUS_API_KEY not set, returning raw content")

            # Print summary of extracted pages for debugging (original behavior)
            for result in response.get('results', []):
                url = result.get('url', 'Unknown URL')
                content_length = len(result.get('raw_content', ''))
                print(f" 📝 {url} ({content_length} characters)")

        result_json = json.dumps(response, indent=2)
        # Clean base64 images from extracted content
@@ -173,7 +306,14 @@ def web_extract_tool(urls: List[str], format: str = None) -> str:
        return json.dumps({"error": error_msg})


def web_crawl_tool(url: str, instructions: str = None, depth: str = "basic") -> str:
async def web_crawl_tool(
    url: str,
    instructions: str = None,
    depth: str = "basic",
    use_llm_processing: bool = True,
    model: str = DEFAULT_SUMMARIZER_MODEL,
    min_length: int = DEFAULT_MIN_LENGTH_FOR_SUMMARIZATION
) -> str:
    """
    Crawl a website with specific instructions using available crawling API backend.

@@ -184,19 +324,14 @@ def web_crawl_tool(url: str, instructions: str = None, depth: str = "basic") -> str:
        url (str): The base URL to crawl (can include or exclude https://)
        instructions (str): Instructions for what to crawl/extract using LLM intelligence (optional)
        depth (str): Depth of extraction ("basic" or "advanced", default: "basic")
        use_llm_processing (bool): Whether to process content with LLM for summarization (default: True)
        model (str): The model to use for LLM processing (default: gemini-2.5-flash)
        min_length (int): Minimum content length to trigger LLM processing (default: 5000)

    Returns:
        str: JSON string containing crawled content with the following structure:
            {
                "results": [
                    {
                        "url": str,
                        "title": str,
                        "content": str
                    },
                    ...
                ]
            }
        str: JSON string containing crawled content. If LLM processing is enabled and successful,
            the 'content' field will contain the processed markdown summary instead of raw content.
            Each page is processed individually.

    Raises:
        Exception: If crawling fails or API key is not set
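Likewise for the crawler, a hedged example of tuning the processing threshold per the parameters documented above (the URL, instructions, and 3000-character threshold are placeholders):

    import json

    async def crawl_docs():
        raw = await web_crawl_tool(
            "docs.example.com",
            "Find API reference pages",
            depth="advanced",
            min_length=3000,  # summarize pages shorter than the 5000-char default
        )
        return json.loads(raw).get("results", [])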
@@ -215,11 +350,40 @@ def web_crawl_tool(url: str, instructions: str = None, depth: str = "basic") -> str:

        print(f"✅ Crawled {len(response.get('results', []))} pages")

        # Print summary of crawled pages for debugging
        for result in response.get('results', []):
            page_url = result.get('url', 'Unknown URL')
            content_length = len(result.get('content', ''))
            print(f" 🌐 {page_url} ({content_length} characters)")
        # Process each result with LLM if enabled
        if use_llm_processing and os.getenv("NOUS_API_KEY"):
            print("🧠 Processing crawled content with LLM...")

            for result in response.get('results', []):
                page_url = result.get('url', 'Unknown URL')
                title = result.get('title', '')
                content = result.get('content', '')

                if content:
                    # Process content with LLM
                    processed = await process_content_with_llm(
                        content, page_url, title, model, min_length
                    )

                    if processed:
                        # Keep original content in raw_content field
                        result['raw_content'] = content
                        # Replace content with processed version
                        result['content'] = processed
                        print(f" 🌐 {page_url} (processed)")
                    else:
                        print(f" 🌐 {page_url} (no processing - content too short)")
                else:
                    print(f" ⚠️ {page_url} (no content to process)")
        else:
            if use_llm_processing and not os.getenv("NOUS_API_KEY"):
                print("⚠️ LLM processing requested but NOUS_API_KEY not set, returning raw content")

            # Print summary of crawled pages for debugging (original behavior)
            for result in response.get('results', []):
                page_url = result.get('url', 'Unknown URL')
                content_length = len(result.get('content', ''))
                print(f" 🌐 {page_url} ({content_length} characters)")

        result_json = json.dumps(response, indent=2)
        # Clean base64 images from crawled content
@@ -242,6 +406,16 @@ def check_tavily_api_key() -> bool:
    return bool(os.getenv("TAVILY_API_KEY"))


def check_nous_api_key() -> bool:
    """
    Check if the Nous Research API key is available in environment variables.

    Returns:
        bool: True if API key is set, False otherwise
    """
    return bool(os.getenv("NOUS_API_KEY"))


if __name__ == "__main__":
    """
    Simple test/demo when run directly
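A small sketch (not in the diff) of using the new key check to opt into LLM processing only when it can succeed:

    async def safe_extract(urls):
        # Falls back to raw content automatically when NOUS_API_KEY is absent.
        return await web_extract_tool(urls, use_llm_processing=check_nous_api_key())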
@@ -249,17 +423,61 @@ if __name__ == "__main__":
    print("🌐 Standalone Web Tools Module")
    print("=" * 40)

    # Check if API key is available
    if not check_tavily_api_key():
    # Check if API keys are available
    tavily_available = check_tavily_api_key()
    nous_available = check_nous_api_key()

    if not tavily_available:
        print("❌ TAVILY_API_KEY environment variable not set")
        print("Please set your API key: export TAVILY_API_KEY='your-key-here'")
        print("Get API key at: https://tavily.com/")
    else:
        print("✅ Tavily API key found")

    if not nous_available:
        print("❌ NOUS_API_KEY environment variable not set")
        print("Please set your API key: export NOUS_API_KEY='your-key-here'")
        print("Get API key at: https://inference-api.nousresearch.com/")
        print("⚠️ Without Nous API key, LLM content processing will be disabled")
    else:
        print("✅ Nous Research API key found")

    if not tavily_available:
        exit(1)

    print("✅ Tavily API key found")
    print("🛠️ Web tools ready for use!")
    print("\nExample usage:")

    if nous_available:
        print("🧠 LLM content processing available with Gemini 2.5 Flash")
        print(f"   Default min length for processing: {DEFAULT_MIN_LENGTH_FOR_SUMMARIZATION} chars")

    print("\nBasic usage:")
    print("  from web_tools import web_search_tool, web_extract_tool, web_crawl_tool")
    print("  import asyncio")
    print("")
    print("  # Search (synchronous)")
    print("  results = web_search_tool('Python tutorials')")
    print("  content = web_extract_tool(['https://example.com'])")
    print("  crawl_data = web_crawl_tool('example.com', 'Find documentation')")
    print("")
    print("  # Extract and crawl (asynchronous)")
    print("  async def main():")
    print("      content = await web_extract_tool(['https://example.com'])")
    print("      crawl_data = await web_crawl_tool('example.com', 'Find docs')")
    print("  asyncio.run(main())")

    if nous_available:
        print("\nLLM-enhanced usage:")
        print("  # Content automatically processed for pages >5000 chars (default)")
        print("  content = await web_extract_tool(['https://python.org/about/'])")
        print("")
        print("  # Customize processing parameters")
        print("  crawl_data = await web_crawl_tool(")
        print("      'docs.python.org',")
        print("      'Find key concepts',")
        print("      model='gemini-2.5-flash',")
        print("      min_length=3000")
        print("  )")
        print("")
        print("  # Disable LLM processing")
        print("  raw_content = await web_extract_tool(['https://example.com'], use_llm_processing=False)")

    print(f"\n📝 Run 'python test_web_tools_llm.py' to test LLM processing capabilities")