mirror of https://github.com/kortix-ai/suna.git
replace firecrawl with tavily extract
parent cd3ebb5e78
commit 34e7811f12
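
The commit swaps the Firecrawl /v1/scrape HTTP call for the Tavily Extract API exposed by the AsyncTavilyClient the tool already uses for search. A minimal standalone sketch of the new scrape path, assuming the tavily-python package and an API key in the environment (the scrape() helper and the example URL are illustrative, not part of the commit):

# Minimal sketch of the Tavily Extract call this commit adopts.
import asyncio
import os

from tavily import AsyncTavilyClient


async def scrape(url: str) -> str:
    client = AsyncTavilyClient(api_key=os.environ["TAVILY_API_KEY"])
    # extract() takes a list of URLs; the response carries a "results" list
    # whose entries hold the source "url" and the page text in "raw_content".
    response = await client.extract(urls=[url], include_images=True)
    data = next((r for r in response["results"] if r["url"] == url), None)
    if data is None:
        raise Exception(f"URL {url} not found in Tavily Extract API response")
    return data.get("raw_content", "")


if __name__ == "__main__":
    print(asyncio.run(scrape("https://example.com")))
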
@@ -14,7 +14,7 @@ import logging
 # TODO: add subpages, etc... in filters as sometimes its necessary
 
 class SandboxWebSearchTool(SandboxToolsBase):
-    """Tool for performing web searches using Tavily API and web scraping using Firecrawl."""
+    """Tool for performing web searches and web scraping using Tavily API."""
 
     def __init__(self, project_id: str, thread_manager: ThreadManager):
         super().__init__(project_id, thread_manager)
@@ -22,13 +22,9 @@ class SandboxWebSearchTool(SandboxToolsBase):
         load_dotenv()
         # Use API keys from config
         self.tavily_api_key = config.TAVILY_API_KEY
-        self.firecrawl_api_key = config.FIRECRAWL_API_KEY
-        self.firecrawl_url = config.FIRECRAWL_URL
 
         if not self.tavily_api_key:
             raise ValueError("TAVILY_API_KEY not found in configuration")
-        if not self.firecrawl_api_key:
-            raise ValueError("FIRECRAWL_API_KEY not found in configuration")
 
         # Tavily asynchronous search client
         self.tavily_client = AsyncTavilyClient(api_key=self.tavily_api_key)
@@ -263,65 +259,46 @@ class SandboxWebSearchTool(SandboxToolsBase):
         logging.info(f"Scraping single URL: {url}")
 
         try:
-            # ---------- Firecrawl scrape endpoint ----------
-            logging.info(f"Sending request to Firecrawl for URL: {url}")
-            async with httpx.AsyncClient() as client:
-                headers = {
-                    "Authorization": f"Bearer {self.firecrawl_api_key}",
-                    "Content-Type": "application/json",
-                }
-                payload = {
-                    "url": url,
-                    "formats": ["markdown"]
-                }
-
-                # Use longer timeout and retry logic for more reliability
+            # ---------- Tavily scrape endpoint ----------
+            logging.info(f"Sending request to Tavily for URL: {url}")
                 max_retries = 3
-                timeout_seconds = 30
                 retry_count = 0
 
                 while retry_count < max_retries:
                     try:
-                        logging.info(f"Sending request to Firecrawl (attempt {retry_count + 1}/{max_retries})")
-                        response = await client.post(
-                            f"{self.firecrawl_url}/v1/scrape",
-                            json=payload,
-                            headers=headers,
-                            timeout=timeout_seconds,
+                        logging.info(f"Sending request to Tavily Extract API (attempt {retry_count + 1}/{max_retries})")
+                        response = await self.tavily_client.extract(
+                            urls=[url],
+                            include_images=True
                         )
-                        response.raise_for_status()
-                        data = response.json()
-                        logging.info(f"Successfully received response from Firecrawl for {url}")
+                        logging.info(f"Successfully received response from Tavily for {url}")
                         break
-                    except (httpx.ReadTimeout, httpx.ConnectTimeout, httpx.ReadError) as timeout_err:
+                    except Exception as e:
                         retry_count += 1
-                        logging.warning(f"Request timed out (attempt {retry_count}/{max_retries}): {str(timeout_err)}")
+                        logging.warning(f"Request timed out (attempt {retry_count}/{max_retries}): {str(e)}")
                         if retry_count >= max_retries:
-                            raise Exception(f"Request timed out after {max_retries} attempts with {timeout_seconds}s timeout")
+                            raise Exception(f"Request failed after {max_retries} attempts: {str(e)}")
                         # Exponential backoff
                         logging.info(f"Waiting {2 ** retry_count}s before retry")
                         await asyncio.sleep(2 ** retry_count)
-                    except Exception as e:
-                        # Don't retry on non-timeout errors
-                        logging.error(f"Error during scraping: {str(e)}")
-                        raise e
 
+            if not response or "results" not in response or not response["results"]:
+                raise Exception(f"No results returned from Tavily Extract API for URL: {url}")
+
+            # Get the first result for this URL
+            data = next((r for r in response["results"] if r["url"] == url), None)
+
+            if not data:
+                raise Exception(f"URL {url} not found in Tavily Extract API response")
             # Format the response
-            title = data.get("data", {}).get("metadata", {}).get("title", "")
-            markdown_content = data.get("data", {}).get("markdown", "")
-            logging.info(f"Extracted content from {url}: title='{title}', content length={len(markdown_content)}")
+            markdown_content = data.get("raw_content", "")
+            logging.info(f"Extracted content from {url}: content length={len(markdown_content)}")
 
             formatted_result = {
-                "title": title,
                 "url": url,
                 "text": markdown_content
             }
 
-            # Add metadata if available
-            if "metadata" in data.get("data", {}):
-                formatted_result["metadata"] = data["data"]["metadata"]
-                logging.info(f"Added metadata: {data['data']['metadata'].keys()}")
 
             # Create a simple filename from the URL domain and date
             timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
@@ -352,7 +329,6 @@ class SandboxWebSearchTool(SandboxToolsBase):
             return {
                 "url": url,
                 "success": True,
-                "title": title,
                 "file_path": results_file_path,
                 "content_length": len(markdown_content)
             }
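
Because the new parsing only reads raw_content from each result, there is no longer a title (or metadata) to attach, which is why the title key is dropped from the return value above. For reference, a rough sketch of the Extract response shape the new code assumes (field names inferred from the parsing in this commit; the exact payload depends on the Tavily API version):

# Rough shape of the Tavily Extract response assumed by the new code;
# only "results", "url" and "raw_content" are consumed in this commit.
example_response = {
    "results": [
        {
            "url": "https://example.com",
            "raw_content": "Page text extracted by Tavily...",
            "images": ["https://example.com/figure.png"],
        }
    ],
    "failed_results": [],  # assumed: URLs Tavily could not extract (unused here)
}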