mirror of https://github.com/kortix-ai/suna.git
from tavily import AsyncTavilyClient
import httpx
from dotenv import load_dotenv
from agentpress.tool import Tool, ToolResult, openapi_schema, usage_example
from utils.config import config
from sandbox.tool_base import SandboxToolsBase
from agentpress.thread_manager import ThreadManager
import json
import os
import datetime
import asyncio
import logging

# TODO: add subpages, etc. in filters, as it's sometimes necessary


class SandboxWebSearchTool(SandboxToolsBase):
    """Tool for performing web searches and web scraping using the Tavily API."""

    def __init__(self, project_id: str, thread_manager: ThreadManager):
        super().__init__(project_id, thread_manager)
        # Load environment variables
        load_dotenv()
        # Use API keys from config
        self.tavily_api_key = config.TAVILY_API_KEY

        if not self.tavily_api_key:
            raise ValueError("TAVILY_API_KEY not found in configuration")

        # Tavily asynchronous search client
        self.tavily_client = AsyncTavilyClient(api_key=self.tavily_api_key)

    @openapi_schema({
        "type": "function",
        "function": {
            "name": "web_search",
            "description": "Search the web for up-to-date information on a specific topic using the Tavily API. This tool allows you to gather real-time information from the internet to answer user queries, research topics, validate facts, and find recent developments. Results include titles, URLs, and publication dates. Use this tool for discovering relevant web pages before potentially crawling them for complete content.",
            "parameters": {
                "type": "object",
                "properties": {
                    "query": {
                        "type": "string",
                        "description": "The search query to find relevant web pages. Be specific and include key terms to improve search accuracy. For best results, use natural language questions or keyword combinations that precisely describe what you're looking for."
                    },
                    "num_results": {
                        "type": "integer",
                        "description": "The number of search results to return. Increase for more comprehensive research or decrease for focused, high-relevance results.",
                        "default": 20
                    }
                },
                "required": ["query"]
            }
        }
    })
    @usage_example('''
        <function_calls>
        <invoke name="web_search">
        <parameter name="query">what is Kortix AI and what are they building?</parameter>
        <parameter name="num_results">20</parameter>
        </invoke>
        </function_calls>

        <!-- Another search example -->
        <function_calls>
        <invoke name="web_search">
        <parameter name="query">latest AI research on transformer models</parameter>
        <parameter name="num_results">20</parameter>
        </invoke>
        </function_calls>
        ''')
    async def web_search(
        self,
        query: str,
        num_results: int = 20
    ) -> ToolResult:
        """
        Search the web using the Tavily API to find relevant and up-to-date information.
        """
        try:
            # Ensure we have a valid query
            if not query or not isinstance(query, str):
                return self.fail_response("A valid search query is required.")

            # Normalize num_results
            if num_results is None:
                num_results = 20
            elif isinstance(num_results, int):
                num_results = max(1, min(num_results, 50))
            elif isinstance(num_results, str):
                try:
                    num_results = max(1, min(int(num_results), 50))
                except ValueError:
                    num_results = 20
            else:
                num_results = 20

            # Execute the search with Tavily
            logging.info(f"Executing web search for query: '{query}' with {num_results} results")
            search_response = await self.tavily_client.search(
                query=query,
                max_results=num_results,
                include_images=True,
                include_answer="advanced",
                search_depth="advanced",
            )

            # Check if we have actual results or an answer
            results = search_response.get('results', [])
            answer = search_response.get('answer', '')

            # Return the complete Tavily response
            # This includes the query, answer, results, images and more
            logging.info(f"Retrieved search results for query: '{query}' with answer and {len(results)} results")

            # Consider search successful if we have either results OR an answer
            if len(results) > 0 or (answer and answer.strip()):
                return ToolResult(
                    success=True,
                    output=json.dumps(search_response, ensure_ascii=False)
                )
            else:
                # No results or answer found
                logging.warning(f"No search results or answer found for query: '{query}'")
                return ToolResult(
                    success=False,
                    output=json.dumps(search_response, ensure_ascii=False)
                )

        except Exception as e:
            error_message = str(e)
            logging.error(f"Error performing web search for '{query}': {error_message}")
            simplified_message = f"Error performing web search: {error_message[:200]}"
            if len(error_message) > 200:
                simplified_message += "..."
            return self.fail_response(simplified_message)

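    # A minimal consumption sketch (not part of the original tool): it assumes only what the
    # method above guarantees, namely that on success ToolResult.output is the JSON-serialized
    # Tavily response containing at least 'results' (with titles and URLs) and usually an
    # 'answer' string. The 'tool' variable below is hypothetical.
    #
    #   result = await tool.web_search(query="kortix suna agent", num_results=5)
    #   if result.success:
    #       payload = json.loads(result.output)
    #       print(payload.get("answer", ""))
    #       for item in payload.get("results", []):
    #           print(item.get("title"), item.get("url"))
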
    @openapi_schema({
        "type": "function",
        "function": {
            "name": "scrape_webpage",
            "description": "Extract full text content from multiple webpages in a single operation. IMPORTANT: You should ALWAYS collect multiple relevant URLs from web-search results and scrape them all in a single call for efficiency. This tool saves time by processing multiple pages simultaneously rather than one at a time. The extracted text includes the main content of each page without HTML markup.",
            "parameters": {
                "type": "object",
                "properties": {
                    "urls": {
                        "type": "string",
                        "description": "Multiple URLs to scrape, separated by commas. You should ALWAYS include several URLs when possible for efficiency. Example: 'https://example.com/page1,https://example.com/page2,https://example.com/page3'"
                    }
                },
                "required": ["urls"]
            }
        }
    })
    @usage_example('''
        <function_calls>
        <invoke name="scrape_webpage">
        <parameter name="urls">https://www.kortix.ai/,https://github.com/kortix-ai/suna</parameter>
        </invoke>
        </function_calls>
        ''')
    async def scrape_webpage(
        self,
        urls: str
    ) -> ToolResult:
        """
        Retrieve the complete text content of multiple webpages in a single efficient operation.

        ALWAYS collect multiple relevant URLs from search results and scrape them all at once
        rather than making separate calls for each URL. This is much more efficient.

        Parameters:
        - urls: Multiple URLs to scrape, separated by commas
        """
        try:
            logging.info(f"Starting to scrape webpages: {urls}")

            # Ensure sandbox is initialized
            await self._ensure_sandbox()

            # Parse the URLs parameter
            if not urls:
                logging.warning("Scrape attempt with empty URLs")
                return self.fail_response("Valid URLs are required.")

            # Split the URLs string into a list
            url_list = [url.strip() for url in urls.split(',') if url.strip()]

            if not url_list:
                logging.warning("No valid URLs found in the input")
                return self.fail_response("No valid URLs provided.")

            if len(url_list) == 1:
                logging.warning("Only a single URL provided - for efficiency you should scrape multiple URLs at once")

            logging.info(f"Processing {len(url_list)} URLs: {url_list}")

            # Scrape all URLs in a single batched request and collect results
            # tasks = [self._scrape_single_url(url) for url in url_list]
            # results = await asyncio.gather(*tasks, return_exceptions=True)
            results = await self._scrape_urls(url_list)

            # Summarize results
            successful = sum(1 for r in results if r.get("success", False))
            failed = len(results) - successful

            # Create success/failure message
            if successful == len(results):
                message = f"Successfully scraped all {len(results)} URLs. Results saved to:"
                for r in results:
                    if r.get("file_path"):
                        message += f"\n- {r.get('file_path')}"
            elif successful > 0:
                message = f"Scraped {successful} URLs successfully and {failed} failed. Results saved to:"
                for r in results:
                    if r.get("success", False) and r.get("file_path"):
                        message += f"\n- {r.get('file_path')}"
                message += "\n\nFailed URLs:"
                for r in results:
                    if not r.get("success", False):
                        message += f"\n- {r.get('url')}: {r.get('error', 'Unknown error')}"
            else:
                error_details = "; ".join([f"{r.get('url')}: {r.get('error', 'Unknown error')}" for r in results])
                return self.fail_response(f"Failed to scrape all {len(results)} URLs. Errors: {error_details}")

            return ToolResult(
                success=True,
                output=message
            )

        except Exception as e:
            error_message = str(e)
            logging.error(f"Error in scrape_webpage: {error_message}")
            return self.fail_response(f"Error processing scrape request: {error_message[:200]}")

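    # Note on output format (a sketch, not authoritative): each successfully scraped URL is
    # written by _process_single_result below as a JSON file under f"{self.workspace_path}/scrape/"
    # with the shape {"url": ..., "text": ...}. Code running inside the sandbox workspace could
    # read one back with standard json tooling; the filename below is a placeholder pattern.
    #
    #   with open("/workspace/scrape/<timestamp>_<domain>.json", encoding="utf-8") as f:
    #       page = json.load(f)
    #   print(page["url"], len(page["text"]))
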
    async def _scrape_urls(self, url_list: list[str]) -> list[dict]:
        """
        Helper function to scrape multiple URLs and return per-URL result information.
        """
        logging.info(f"Scraping URLs: {url_list}")

        # ---------- Tavily extract endpoint ----------
        max_retries = 3
        retry_count = 0
        remaining_urls = url_list.copy()
        all_results = []

        while retry_count < max_retries and remaining_urls:
            try:
                logging.info(f"Sending request to Tavily Extract API (attempt {retry_count + 1}/{max_retries}) for {len(remaining_urls)} URLs")
                response = await self.tavily_client.extract(
                    urls=remaining_urls,
                    include_images=True
                )
                logging.info(f"Successfully received response from Tavily for {remaining_urls}")
                successful_urls = []
                if response and "results" in response and response["results"]:
                    logging.info(f"Received {len(response['results'])} results from Tavily")

                    for result_data in response["results"]:
                        try:
                            processed_result = await self._process_single_result(result_data)
                            all_results.append(processed_result)
                            successful_urls.append(result_data["url"])
                        except Exception as e:
                            logging.error(f"Error processing result for {result_data.get('url', 'unknown')}: {str(e)}")
                            all_results.append({
                                "url": result_data.get('url', 'unknown'),
                                "success": False,
                                "error": f"Error processing result: {str(e)}"
                            })

                failed_results = response.get("failed_results", [])
                if failed_results:
                    logging.warning(f"API reported {len(failed_results)} failed results")
                    for failed_result in failed_results:
                        url = failed_result.get("url")
                        error = failed_result.get("error", "Failed to extract content")
                        all_results.append({
                            "url": url,
                            "success": False,
                            "error": error
                        })
                        logging.warning(f"Failed: {url} - {error}")
                remaining_urls = []  # All URLs processed, exit retry loop
                break
            except Exception as e:
                retry_count += 1
                logging.warning(f"Request failed (attempt {retry_count}/{max_retries}): {str(e)}")
                if retry_count >= max_retries:
                    # Add all remaining URLs as failed
                    for url in remaining_urls:
                        all_results.append({
                            "url": url,
                            "success": False,
                            "error": f"Request failed after {max_retries} attempts: {str(e)}"
                        })
                    break
                # Exponential backoff
                logging.info(f"Waiting {2 ** retry_count}s before retry")
                await asyncio.sleep(2 ** retry_count)

        # All URLs should be handled by the API response
        logging.info(f"Batch scraping completed. Total results: {len(all_results)} (successful: {sum(1 for r in all_results if r.get('success'))}, failed: {sum(1 for r in all_results if not r.get('success'))})")
        return all_results

    async def _process_single_result(self, data: dict) -> dict:
        """
        Process a single result from the Tavily API response and save it to a file.
        """
        url = data.get("url", "unknown")

        try:
            # Extract content
            markdown_content = data.get("raw_content", "")
            logging.info(f"Extracted content from {url}: content length={len(markdown_content)}")

            formatted_result = {
                "url": url,
                "text": markdown_content
            }

            # Create a simple filename from the URL domain and date
            timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")

            # Extract domain from URL for the filename
            from urllib.parse import urlparse
            parsed_url = urlparse(url)
            domain = parsed_url.netloc.replace("www.", "")

            # Clean up domain for filename
            domain = "".join([c if c.isalnum() else "_" for c in domain])
            safe_filename = f"{timestamp}_{domain}.json"

            logging.info(f"Generated filename: {safe_filename}")

            # Save results to a file in the /workspace/scrape directory
            scrape_dir = f"{self.workspace_path}/scrape"
            await self.sandbox.fs.create_folder(scrape_dir, "755")

            results_file_path = f"{scrape_dir}/{safe_filename}"
            json_content = json.dumps(formatted_result, ensure_ascii=False, indent=2)
            logging.info(f"Saving content to file: {results_file_path}, size: {len(json_content)} bytes")

            await self.sandbox.fs.upload_file(
                json_content.encode(),
                results_file_path,
            )

            return {
                "url": url,
                "success": True,
                "file_path": results_file_path,
                "content_length": len(markdown_content)
            }

        except Exception as e:
            error_message = str(e)
            logging.error(f"Error scraping URL '{url}': {error_message}")

            # Create an error result
            return {
                "url": url,
                "success": False,
                "error": error_message
            }


if __name__ == "__main__":
    async def test_web_search():
        """Test function for the web search tool"""
        # This test function is not compatible with the sandbox version
        print("Test function needs to be updated for sandbox version")

    async def test_scrape_webpage():
        """Test function for the webpage scrape tool"""
        # This test function is not compatible with the sandbox version
        print("Test function needs to be updated for sandbox version")

    async def run_tests():
        """Run all test functions"""
        await test_web_search()
        await test_scrape_webpage()

    asyncio.run(run_tests())
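
    # A minimal, commented-out usage sketch. It assumes a valid sandbox project_id and an
    # already-configured ThreadManager, neither of which this module sets up itself; the real
    # wiring happens elsewhere in the agent runtime, so treat this only as an illustration.
    #
    # async def example_usage():
    #     thread_manager = ThreadManager()  # hypothetical construction; real setup may differ
    #     tool = SandboxWebSearchTool(project_id="your-project-id", thread_manager=thread_manager)
    #     search = await tool.web_search(query="what is Kortix AI?", num_results=5)
    #     print(search.success, search.output[:200])
    #     scrape = await tool.scrape_webpage(urls="https://www.kortix.ai/,https://github.com/kortix-ai/suna")
    #     print(scrape.success, scrape.output)
    #
    # asyncio.run(example_usage())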