mirror of https://github.com/kortix-ai/suna.git
replace firecrawl with tavily extract
This commit is contained in:
parent
cd3ebb5e78
commit
34e7811f12
|
@ -14,7 +14,7 @@ import logging
|
||||||
# TODO: add subpages, etc... in filters as sometimes its necessary
|
# TODO: add subpages, etc... in filters as sometimes its necessary
|
||||||
|
|
||||||
class SandboxWebSearchTool(SandboxToolsBase):
|
class SandboxWebSearchTool(SandboxToolsBase):
|
||||||
"""Tool for performing web searches using Tavily API and web scraping using Firecrawl."""
|
"""Tool for performing web searches and web scraping using Tavily API."""
|
||||||
|
|
||||||
def __init__(self, project_id: str, thread_manager: ThreadManager):
|
def __init__(self, project_id: str, thread_manager: ThreadManager):
|
||||||
super().__init__(project_id, thread_manager)
|
super().__init__(project_id, thread_manager)
|
||||||
|
@ -22,13 +22,9 @@ class SandboxWebSearchTool(SandboxToolsBase):
|
||||||
load_dotenv()
|
load_dotenv()
|
||||||
# Use API keys from config
|
# Use API keys from config
|
||||||
self.tavily_api_key = config.TAVILY_API_KEY
|
self.tavily_api_key = config.TAVILY_API_KEY
|
||||||
self.firecrawl_api_key = config.FIRECRAWL_API_KEY
|
|
||||||
self.firecrawl_url = config.FIRECRAWL_URL
|
|
||||||
|
|
||||||
if not self.tavily_api_key:
|
if not self.tavily_api_key:
|
||||||
raise ValueError("TAVILY_API_KEY not found in configuration")
|
raise ValueError("TAVILY_API_KEY not found in configuration")
|
||||||
if not self.firecrawl_api_key:
|
|
||||||
raise ValueError("FIRECRAWL_API_KEY not found in configuration")
|
|
||||||
|
|
||||||
# Tavily asynchronous search client
|
# Tavily asynchronous search client
|
||||||
self.tavily_client = AsyncTavilyClient(api_key=self.tavily_api_key)
|
self.tavily_client = AsyncTavilyClient(api_key=self.tavily_api_key)
|
||||||
|
@ -263,65 +259,46 @@ class SandboxWebSearchTool(SandboxToolsBase):
|
||||||
logging.info(f"Scraping single URL: {url}")
|
logging.info(f"Scraping single URL: {url}")
|
||||||
|
|
||||||
try:
|
try:
|
||||||
# ---------- Firecrawl scrape endpoint ----------
|
# ---------- Tavily scrape endpoint ----------
|
||||||
logging.info(f"Sending request to Firecrawl for URL: {url}")
|
logging.info(f"Sending request to Tavily for URL: {url}")
|
||||||
async with httpx.AsyncClient() as client:
|
|
||||||
headers = {
|
|
||||||
"Authorization": f"Bearer {self.firecrawl_api_key}",
|
|
||||||
"Content-Type": "application/json",
|
|
||||||
}
|
|
||||||
payload = {
|
|
||||||
"url": url,
|
|
||||||
"formats": ["markdown"]
|
|
||||||
}
|
|
||||||
|
|
||||||
# Use longer timeout and retry logic for more reliability
|
|
||||||
max_retries = 3
|
max_retries = 3
|
||||||
timeout_seconds = 30
|
|
||||||
retry_count = 0
|
retry_count = 0
|
||||||
|
|
||||||
while retry_count < max_retries:
|
while retry_count < max_retries:
|
||||||
try:
|
try:
|
||||||
logging.info(f"Sending request to Firecrawl (attempt {retry_count + 1}/{max_retries})")
|
logging.info(f"Sending request to Tavily Extract API (attempt {retry_count + 1}/{max_retries})")
|
||||||
response = await client.post(
|
response = await self.tavily_client.extract(
|
||||||
f"{self.firecrawl_url}/v1/scrape",
|
urls=[url],
|
||||||
json=payload,
|
include_images=True
|
||||||
headers=headers,
|
|
||||||
timeout=timeout_seconds,
|
|
||||||
)
|
)
|
||||||
response.raise_for_status()
|
logging.info(f"Successfully received response from Tavily for {url}")
|
||||||
data = response.json()
|
|
||||||
logging.info(f"Successfully received response from Firecrawl for {url}")
|
|
||||||
break
|
break
|
||||||
except (httpx.ReadTimeout, httpx.ConnectTimeout, httpx.ReadError) as timeout_err:
|
except Exception as e:
|
||||||
retry_count += 1
|
retry_count += 1
|
||||||
logging.warning(f"Request timed out (attempt {retry_count}/{max_retries}): {str(timeout_err)}")
|
logging.warning(f"Request timed out (attempt {retry_count}/{max_retries}): {str(e)}")
|
||||||
if retry_count >= max_retries:
|
if retry_count >= max_retries:
|
||||||
raise Exception(f"Request timed out after {max_retries} attempts with {timeout_seconds}s timeout")
|
raise Exception(f"Request failed after {max_retries} attempts: {str(e)}")
|
||||||
# Exponential backoff
|
# Exponential backoff
|
||||||
logging.info(f"Waiting {2 ** retry_count}s before retry")
|
logging.info(f"Waiting {2 ** retry_count}s before retry")
|
||||||
await asyncio.sleep(2 ** retry_count)
|
await asyncio.sleep(2 ** retry_count)
|
||||||
except Exception as e:
|
|
||||||
# Don't retry on non-timeout errors
|
|
||||||
logging.error(f"Error during scraping: {str(e)}")
|
|
||||||
raise e
|
|
||||||
|
|
||||||
|
if not response or "results" not in response or not response["results"]:
|
||||||
|
raise Exception(f"No results returned from Tavily Extract API for URL: {url}")
|
||||||
|
|
||||||
|
# Get the first result for this URL
|
||||||
|
data = next((r for r in response["results"] if r["url"] == url), None)
|
||||||
|
|
||||||
|
if not data:
|
||||||
|
raise Exception(f"URL {url} not found in Tavily Extract API response")
|
||||||
# Format the response
|
# Format the response
|
||||||
title = data.get("data", {}).get("metadata", {}).get("title", "")
|
markdown_content = data.get("raw_content", "")
|
||||||
markdown_content = data.get("data", {}).get("markdown", "")
|
logging.info(f"Extracted content from {url}: content length={len(markdown_content)}")
|
||||||
logging.info(f"Extracted content from {url}: title='{title}', content length={len(markdown_content)}")
|
|
||||||
|
|
||||||
formatted_result = {
|
formatted_result = {
|
||||||
"title": title,
|
|
||||||
"url": url,
|
"url": url,
|
||||||
"text": markdown_content
|
"text": markdown_content
|
||||||
}
|
}
|
||||||
|
|
||||||
# Add metadata if available
|
|
||||||
if "metadata" in data.get("data", {}):
|
|
||||||
formatted_result["metadata"] = data["data"]["metadata"]
|
|
||||||
logging.info(f"Added metadata: {data['data']['metadata'].keys()}")
|
|
||||||
|
|
||||||
# Create a simple filename from the URL domain and date
|
# Create a simple filename from the URL domain and date
|
||||||
timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
|
timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
|
||||||
|
|
||||||
|
@ -352,7 +329,6 @@ class SandboxWebSearchTool(SandboxToolsBase):
|
||||||
return {
|
return {
|
||||||
"url": url,
|
"url": url,
|
||||||
"success": True,
|
"success": True,
|
||||||
"title": title,
|
|
||||||
"file_path": results_file_path,
|
"file_path": results_file_path,
|
||||||
"content_length": len(markdown_content)
|
"content_length": len(markdown_content)
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in New Issue