feat: pass multiple urls at once

Krishav Raj Singh 2025-08-22 02:06:49 +05:30
parent 34e7811f12
commit f093b859ab
1 changed file with 77 additions and 57 deletions


@@ -194,24 +194,9 @@ class SandboxWebSearchTool(SandboxToolsBase):
             logging.info(f"Processing {len(url_list)} URLs: {url_list}")
-            # Process each URL concurrently and collect results
-            tasks = [self._scrape_single_url(url) for url in url_list]
-            results = await asyncio.gather(*tasks, return_exceptions=True)
-            # Process results, handling exceptions
-            processed_results = []
-            for i, result in enumerate(results):
-                if isinstance(result, Exception):
-                    logging.error(f"Error processing URL {url_list[i]}: {str(result)}")
-                    processed_results.append({
-                        "url": url_list[i],
-                        "success": False,
-                        "error": str(result)
-                    })
-                else:
-                    processed_results.append(result)
-            results = processed_results
+            # tasks = [self._scrape_single_url(url) for url in url_list]
+            # results = await asyncio.gather(*tasks, return_exceptions=True)
+            results = await self._scrape_urls(url_list)
             # Summarize results
             successful = sum(1 for r in results if r.get("success", False))
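
For orientation, a minimal, self-contained sketch of the two call shapes, assuming a hypothetical scrape_one coroutine in place of the tool's real scraper: the removed code fanned out one task per URL with asyncio.gather(..., return_exceptions=True) and turned exceptions into per-URL error dicts, while the new code hands the whole list to a single batch helper.

import asyncio

# Hypothetical single-URL scraper, for illustration only.
async def scrape_one(url: str) -> dict:
    return {"url": url, "success": True}

async def scrape_per_url(url_list: list[str]) -> list[dict]:
    # Old shape: one task per URL; exceptions become per-URL error dicts.
    results = await asyncio.gather(
        *(scrape_one(url) for url in url_list), return_exceptions=True
    )
    processed = []
    for url, result in zip(url_list, results):
        if isinstance(result, Exception):
            processed.append({"url": url, "success": False, "error": str(result)})
        else:
            processed.append(result)
    return processed

# New shape: a single batched call replaces the per-URL fan-out, e.g.
#     results = await self._scrape_urls(url_list)

The batched shape trades gather's per-task isolation for one provider round trip, which is why _scrape_urls below has to fold per-URL failures back into the combined result list itself.
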
@@ -246,51 +231,86 @@ class SandboxWebSearchTool(SandboxToolsBase):
             logging.error(f"Error in scrape_webpage: {error_message}")
             return self.fail_response(f"Error processing scrape request: {error_message[:200]}")
-    async def _scrape_single_url(self, url: str) -> dict:
+    async def _scrape_urls(self, url_list: list[str]) -> dict:
         """
-        Helper function to scrape a single URL and return the result information.
+        Helper function to scrape multiple URLs and return the result information.
         """
+        logging.info(f"Scraping URLs: {url_list}")
         # # Add protocol if missing
         # if not (url.startswith('http://') or url.startswith('https://')):
         #     url = 'https://' + url
         #     logging.info(f"Added https:// protocol to URL: {url}")
-        logging.info(f"Scraping single URL: {url}")
-        try:
-            # ---------- Tavily scrape endpoint ----------
-            logging.info(f"Sending request to Tavily for URL: {url}")
         max_retries = 3
         retry_count = 0
+        remaining_urls = url_list.copy()
+        all_results = []
-        while retry_count < max_retries:
+        while retry_count < max_retries and remaining_urls:
             try:
-                logging.info(f"Sending request to Tavily Extract API (attempt {retry_count + 1}/{max_retries})")
+                logging.info(f"Sending request to Tavily Extract API (attempt {retry_count + 1}/{max_retries}) for {len(remaining_urls)} URLs")
                 response = await self.tavily_client.extract(
-                    urls=[url],
+                    urls=remaining_urls,
                     include_images=True
                 )
-                logging.info(f"Successfully received response from Tavily for {url}")
+                logging.info(f"Successfully received response from Tavily for {remaining_urls}")
+                successful_urls = []
+                if response and "results" in response and response["results"]:
+                    logging.info(f"Received {len(response['results'])} results from Tavily")
+                    for result_data in response["results"]:
+                        try:
+                            processed_result = await self._process_single_result(result_data)
+                            all_results.append(processed_result)
+                            successful_urls.append(result_data["url"])
+                        except Exception as e:
+                            logging.error(f"Error processing result for {result_data.get('url', 'unknown')}: {str(e)}")
+                            all_results.append({
+                                "url": result_data.get('url', 'unknown'),
+                                "success": False,
+                                "error": f"Error processing result: {str(e)}"
+                            })
+                failed_results = response.get("failed_results", [])
+                if failed_results:
+                    logging.warning(f"API reported {len(failed_results)} failed results")
+                    for failed_result in failed_results:
+                        url = failed_result.get("url")
+                        error = failed_result.get("error", "Failed to extract content")
+                        all_results.append({
+                            "url": url,
+                            "success": False,
+                            "error": error
+                        })
+                        logging.warning(f"Failed: {url} - {error}")
+                remaining_urls = []  # All URLs processed, exit retry loop
                 break
             except Exception as e:
                 retry_count += 1
-                logging.warning(f"Request timed out (attempt {retry_count}/{max_retries}): {str(e)}")
+                logging.warning(f"Request failed (attempt {retry_count}/{max_retries}): {str(e)}")
                 if retry_count >= max_retries:
-                    raise Exception(f"Request failed after {max_retries} attempts: {str(e)}")
+                    # add all remaining urls as failed
+                    for url in remaining_urls:
+                        all_results.append({
+                            "url": url,
+                            "success": False,
+                            "error": f"Request failed after {max_retries} attempts: {str(e)}"
+                        })
+                    break
                 # Exponential backoff
                 logging.info(f"Waiting {2 ** retry_count}s before retry")
                 await asyncio.sleep(2 ** retry_count)
-        if not response or "results" not in response or not response["results"]:
-            raise Exception(f"No results returned from Tavily Extract API for URL: {url}")
+        # All URLs should be handled by the API response
+        logging.info(f"Batch scraping completed. Total results: {len(all_results)} (successful: {sum(1 for r in all_results if r.get('success'))}, failed: {sum(1 for r in all_results if not r.get('success'))})")
+        return all_results
-        # Get the first result for this URL
-        data = next((r for r in response["results"] if r["url"] == url), None)
-        if not data:
-            raise Exception(f"URL {url} not found in Tavily Extract API response")
-        # Format the response
+    async def _process_single_result(self, data: dict) -> dict:
+        """
+        Process a single result from the Tavily API response and save to file.
+        """
+        url = data.get("url", "unknown")
+        try:
             # Extract content
             markdown_content = data.get("raw_content", "")
             logging.info(f"Extracted content from {url}: content length={len(markdown_content)}")
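
The retry loop in the new _scrape_urls follows a common batch pattern: keep a worklist of remaining URLs, retry the whole remainder with exponential backoff when the request itself fails, and after max_retries mark everything still outstanding as failed instead of raising. A minimal standalone sketch of that pattern, with a hypothetical extract() coroutine standing in for the Tavily client:

import asyncio

# Hypothetical batch extractor standing in for tavily_client.extract().
async def extract(urls: list[str]) -> dict:
    return {"results": [{"url": u, "raw_content": "..."} for u in urls],
            "failed_results": []}

async def scrape_with_retries(url_list: list[str], max_retries: int = 3) -> list[dict]:
    remaining = list(url_list)
    results: list[dict] = []
    attempt = 0
    while attempt < max_retries and remaining:
        try:
            response = await extract(remaining)
            # Fold successes and provider-reported failures into one list.
            for item in response.get("results", []):
                results.append({"url": item["url"], "success": True})
            for failed in response.get("failed_results", []):
                results.append({"url": failed.get("url"), "success": False,
                                "error": failed.get("error", "Failed to extract content")})
            remaining = []  # every URL is now accounted for
        except Exception as exc:
            attempt += 1
            if attempt >= max_retries:
                # Give up: mark whatever is still outstanding as failed.
                results.extend({"url": url, "success": False, "error": str(exc)}
                               for url in remaining)
                break
            await asyncio.sleep(2 ** attempt)  # exponential backoff
    return results

# Example: asyncio.run(scrape_with_retries(["https://example.com"]))

Because the provider reports per-URL problems in failed_results, only a transport-level exception triggers a retry, and each retry re-sends every URL that has not yet produced a result.
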