mirror of https://github.com/kortix-ai/suna.git
446 lines
22 KiB
Python
446 lines
22 KiB
Python
from agentpress.tool import ToolResult, openapi_schema, usage_example
|
|
from agentpress.thread_manager import ThreadManager
|
|
from sandbox.tool_base import SandboxToolsBase
|
|
from utils.logger import logger
|
|
from utils.s3_upload_utils import upload_base64_image
|
|
import asyncio
|
|
import json
|
|
import base64
|
|
import io
|
|
import traceback
|
|
from PIL import Image
|
|
from utils.config import config
|
|
|
|
class BrowserTool(SandboxToolsBase):
|
|
"""
|
|
Browser Tool for browser automation using local Stagehand API.
|
|
|
|
This tool provides browser automation capabilities using a local Stagehand API server,
|
|
replacing the sandbox browser tool functionality.
|
|
|
|
Only 4 core functions that can handle everything:
|
|
- browser_navigate_to: Navigate to URLs
|
|
- browser_act: Perform any action (click, type, scroll, dropdowns etc.)
|
|
- browser_extract_content: Extract content from pages
|
|
- browser_screenshot: Take screenshots
|
|
"""
|
|
|
|
|
|
def __init__(self, project_id: str, thread_id: str, thread_manager: ThreadManager):
|
|
super().__init__(project_id, thread_manager)
|
|
self.thread_id = thread_id
|
|
|
|
def _validate_base64_image(self, base64_string: str, max_size_mb: int = 10) -> tuple[bool, str]:
|
|
"""
|
|
Comprehensive validation of base64 image data.
|
|
|
|
Args:
|
|
base64_string (str): The base64 encoded image data
|
|
max_size_mb (int): Maximum allowed image size in megabytes
|
|
|
|
Returns:
|
|
tuple[bool, str]: (is_valid, error_message)
|
|
"""
|
|
try:
|
|
# Check if data exists and has reasonable length
|
|
if not base64_string or len(base64_string) < 10:
|
|
return False, "Base64 string is empty or too short"
|
|
|
|
# Remove data URL prefix if present (data:image/jpeg;base64,...)
|
|
if base64_string.startswith('data:'):
|
|
try:
|
|
base64_string = base64_string.split(',', 1)[1]
|
|
except (IndexError, ValueError):
|
|
return False, "Invalid data URL format"
|
|
|
|
# Check if string contains only valid base64 characters
|
|
# Base64 alphabet: A-Z, a-z, 0-9, +, /, = (padding)
|
|
import re
|
|
if not re.match(r'^[A-Za-z0-9+/]*={0,2}$', base64_string):
|
|
return False, "Invalid base64 characters detected"
|
|
|
|
# Check if base64 string length is valid (must be multiple of 4)
|
|
if len(base64_string) % 4 != 0:
|
|
return False, "Invalid base64 string length"
|
|
|
|
# Attempt to decode base64
|
|
try:
|
|
image_data = base64.b64decode(base64_string, validate=True)
|
|
except Exception as e:
|
|
return False, f"Base64 decoding failed: {str(e)}"
|
|
|
|
# Check decoded data size
|
|
if len(image_data) == 0:
|
|
return False, "Decoded image data is empty"
|
|
|
|
# Check if decoded data size exceeds limit
|
|
max_size_bytes = max_size_mb * 1024 * 1024
|
|
if len(image_data) > max_size_bytes:
|
|
return False, f"Image size ({len(image_data)} bytes) exceeds limit ({max_size_bytes} bytes)"
|
|
|
|
# Validate that decoded data is actually a valid image using PIL
|
|
try:
|
|
image_stream = io.BytesIO(image_data)
|
|
with Image.open(image_stream) as img:
|
|
# Verify the image by attempting to load it
|
|
img.verify()
|
|
|
|
# Check if image format is supported
|
|
supported_formats = {'JPEG', 'PNG', 'GIF', 'BMP', 'WEBP', 'TIFF'}
|
|
if img.format not in supported_formats:
|
|
return False, f"Unsupported image format: {img.format}"
|
|
|
|
return True, "Image validation successful"
|
|
|
|
except Exception as e:
|
|
return False, f"Image validation failed: {str(e)}"
|
|
|
|
except Exception as e:
|
|
return False, f"Image validation error: {str(e)}"
|
|
|
|
async def _debug_sandbox_services(self) -> str:
|
|
"""Debug method to check what services are running in the sandbox"""
|
|
try:
|
|
await self._ensure_sandbox()
|
|
|
|
# Check what processes are running
|
|
ps_cmd = "ps aux | grep -E '(python|uvicorn|stagehand|node)' | grep -v grep"
|
|
response = await self.sandbox.process.exec(ps_cmd, timeout=10)
|
|
|
|
processes = response.result if response.exit_code == 0 else "Failed to get process list"
|
|
|
|
# Check what ports are listening
|
|
netstat_cmd = "netstat -tlnp 2>/dev/null | grep -E ':(8003|8004)' || ss -tlnp 2>/dev/null | grep -E ':(8003|8004)' || echo 'No netstat/ss available'"
|
|
response2 = await self.sandbox.process.exec(netstat_cmd, timeout=10)
|
|
|
|
ports = response2.result if response2.exit_code == 0 else "Failed to get port list"
|
|
|
|
debug_info = f"""
|
|
=== Sandbox Services Debug Info ===
|
|
Running processes:
|
|
{processes}
|
|
|
|
Listening ports:
|
|
{ports}
|
|
|
|
=== End Debug Info ===
|
|
"""
|
|
return debug_info
|
|
|
|
except Exception as e:
|
|
return f"Error getting debug info: {e}"
|
|
|
|
async def _check_stagehand_api_health(self) -> bool:
|
|
"""Check if the Stagehand API server is running and accessible"""
|
|
try:
|
|
await self._ensure_sandbox()
|
|
|
|
|
|
# Simple health check curl command
|
|
curl_cmd = "curl -s -X GET 'http://localhost:8004/api' -H 'Content-Type: application/json'"
|
|
|
|
logger.debug(f"Checking Stagehand API health with: {curl_cmd}")
|
|
|
|
response = await self.sandbox.process.exec(curl_cmd, timeout=10)
|
|
if response.exit_code == 0:
|
|
try:
|
|
result = json.loads(response.result)
|
|
if result.get("status") == "healthy":
|
|
logger.debug("✅ Stagehand API server is running and healthy")
|
|
return True
|
|
else:
|
|
# If the browser api is not healthy, we need to restart the browser api
|
|
model_api_key = config.GEMINI_API_KEY
|
|
|
|
response = await self.sandbox.process.exec(f"curl -X POST 'http://localhost:8004/api/init' -H 'Content-Type: application/json' -d '{{\"api_key\": \"{model_api_key}\"}}'", timeout=90)
|
|
if response.exit_code == 0:
|
|
logger.debug("Stagehand API server restarted successfully")
|
|
return True
|
|
else:
|
|
logger.warning(f"Stagehand API server restart failed: {response.result}")
|
|
return False
|
|
except json.JSONDecodeError:
|
|
logger.warning(f"Stagehand API server responded but with invalid JSON: {response.result}")
|
|
return False
|
|
else:
|
|
logger.warning(f"Stagehand API server health check failed with exit code {response.exit_code}")
|
|
return False
|
|
|
|
except Exception as e:
|
|
logger.error(f"Error checking Stagehand API health: {e}")
|
|
return False
|
|
|
|
async def _execute_stagehand_api(self, endpoint: str, params: dict = None, method: str = "POST") -> ToolResult:
|
|
"""Execute a Stagehand action through the sandbox API"""
|
|
try:
|
|
# Ensure sandbox is initialized
|
|
await self._ensure_sandbox()
|
|
|
|
# Check if Stagehand API server is running
|
|
stagehand_healthy = await self._check_stagehand_api_health()
|
|
|
|
if not stagehand_healthy:
|
|
error_msg = "Stagehand API server is not running. Please ensure the Stagehand API server is running. Error: {response}"
|
|
|
|
# Add debug information
|
|
debug_info = await self._debug_sandbox_services()
|
|
error_msg += f"\n\nDebug information:\n{debug_info}"
|
|
|
|
logger.error(error_msg)
|
|
return self.fail_response(error_msg)
|
|
|
|
|
|
# Build the curl command to call the local Stagehand API
|
|
url = f"http://localhost:8004/api/{endpoint}" # Fixed localhost as curl runs inside container
|
|
|
|
if method == "GET" and params:
|
|
query_params = "&".join([f"{k}={v}" for k, v in params.items()])
|
|
url = f"{url}?{query_params}"
|
|
curl_cmd = f"curl -s -X {method} '{url}' -H 'Content-Type: application/json'"
|
|
else:
|
|
curl_cmd = f"curl -s -X {method} '{url}' -H 'Content-Type: application/json'"
|
|
if params:
|
|
json_data = json.dumps(params)
|
|
curl_cmd += f" -d '{json_data}'"
|
|
|
|
logger.debug(f"\033[95mExecuting curl command:\033[0m\n{curl_cmd}")
|
|
|
|
response = await self.sandbox.process.exec(curl_cmd, timeout=30) # Execute curl inside sandbox
|
|
|
|
if response.exit_code == 0:
|
|
try:
|
|
result = json.loads(response.result)
|
|
logger.debug(f"Stagehand API result: {result}")
|
|
|
|
logger.debug("Stagehand API request completed successfully")
|
|
|
|
if "screenshot_base64" in result:
|
|
try:
|
|
screenshot_data = result["screenshot_base64"]
|
|
is_valid, validation_message = self._validate_base64_image(screenshot_data)
|
|
|
|
if is_valid:
|
|
logger.debug(f"Screenshot validation passed: {validation_message}")
|
|
image_url = await upload_base64_image(screenshot_data)
|
|
result["image_url"] = image_url
|
|
logger.debug(f"Uploaded screenshot to {image_url}")
|
|
else:
|
|
logger.warning(f"Screenshot validation failed: {validation_message}")
|
|
result["image_validation_error"] = validation_message
|
|
|
|
del result["screenshot_base64"]
|
|
|
|
except Exception as e:
|
|
logger.error(f"Failed to process screenshot: {e}")
|
|
result["image_upload_error"] = str(e)
|
|
|
|
result["input"] = params
|
|
added_message = await self.thread_manager.add_message(
|
|
thread_id=self.thread_id,
|
|
type="browser_state",
|
|
content=result,
|
|
is_llm_message=False
|
|
)
|
|
|
|
# Prepare clean response for agent (filter out internal metadata)
|
|
# Only include data that's useful for the agent's decision making
|
|
clean_result = {
|
|
"success": result.get("success", True),
|
|
"message": result.get("message", "Stagehand action completed successfully")
|
|
}
|
|
|
|
# Include only data that actually comes from browserApi.ts
|
|
if result.get("url"):
|
|
clean_result["url"] = result["url"]
|
|
if result.get("title"):
|
|
clean_result["title"] = result["title"]
|
|
if result.get("action"):
|
|
clean_result["action"] = result["action"]
|
|
if result.get("image_url"): # This is screenshot_base64 converted to image_url
|
|
clean_result["image_url"] = result["image_url"]
|
|
|
|
# Include any error context that's useful for the agent
|
|
if result.get("image_validation_error"):
|
|
clean_result["screenshot_issue"] = f"Screenshot processing issue: {result['image_validation_error']}"
|
|
if result.get("image_upload_error"):
|
|
clean_result["screenshot_issue"] = f"Screenshot upload issue: {result['image_upload_error']}"
|
|
clean_result["message_id"] = added_message.get("message_id")
|
|
|
|
if clean_result.get("success"):
|
|
return self.success_response(clean_result)
|
|
else:
|
|
# Handle error responses with helpful context
|
|
error_msg = result.get("error", result.get("message", "Unknown error"))
|
|
clean_result["message"] = error_msg
|
|
return self.fail_response(clean_result)
|
|
|
|
except json.JSONDecodeError as e:
|
|
logger.error(f"Failed to parse response JSON: {response.result} {e}")
|
|
return self.fail_response(f"Failed to parse response JSON: {response.result} {e}")
|
|
else:
|
|
# Check if it's a connection error (exit code 7)
|
|
if response.exit_code == 7:
|
|
error_msg = f"Stagehand API server is not available on port 8004. Please ensure the Stagehand API server is running. Error: {response}"
|
|
logger.error(error_msg)
|
|
return self.fail_response(error_msg)
|
|
else:
|
|
logger.error(f"Stagehand API request failed: {response}")
|
|
return self.fail_response(f"Stagehand API request failed: {response}")
|
|
|
|
except Exception as e:
|
|
logger.error(f"Error executing Stagehand action: {e}")
|
|
logger.debug(traceback.format_exc())
|
|
return self.fail_response(f"Error executing Stagehand action: {e}")
|
|
|
|
# Core Functions Only
|
|
|
|
@openapi_schema({
|
|
"type": "function",
|
|
"function": {
|
|
"name": "browser_navigate_to",
|
|
"description": "Navigate to a specific url",
|
|
"parameters": {
|
|
"type": "object",
|
|
"properties": {
|
|
"url": {
|
|
"type": "string",
|
|
"description": "The url to navigate to"
|
|
}
|
|
},
|
|
"required": ["url"]
|
|
}
|
|
}
|
|
})
|
|
@usage_example('''
|
|
<function_calls>
|
|
<invoke name="browser_navigate_to">
|
|
<parameter name="url">https://example.com</parameter>
|
|
</invoke>
|
|
</function_calls>
|
|
''')
|
|
async def browser_navigate_to(self, url: str) -> ToolResult:
|
|
"""Navigate to a URL using Stagehand."""
|
|
logger.debug(f"Browser navigating to: {url}")
|
|
return await self._execute_stagehand_api("navigate", {"url": url})
|
|
|
|
@openapi_schema({
|
|
"type": "function",
|
|
"function": {
|
|
"name": "browser_act",
|
|
"description": "Perform any browser action using natural language description. CRITICAL: This tool automatically provides a screenshot with every action. For data entry actions (filling forms, entering text, selecting options), you MUST review the provided screenshot to verify that displayed values exactly match what was intended. Report mismatches immediately. CRITICAL FILE UPLOAD RULE: ANY action that involves clicking, interacting with, or locating upload buttons, file inputs, resume upload sections, or any element that might trigger a choose file dialog MUST include the filePath parameter with filePath. This includes actions like 'click upload button', 'locate resume section', 'find file input' etc. Always err on the side of caution - if there's any possibility the action might lead to a file dialog, include filePath. This prevents accidental file dialog triggers without proper file handling.",
|
|
"parameters": {
|
|
"type": "object",
|
|
"properties": {
|
|
"action": {
|
|
"type": "string",
|
|
"description": "The action to perform. Examples: 'click the login button', 'fill in the email field with %email%', 'scroll down to see more content', 'select option 2 from the dropdown', 'press Enter', 'go back', 'wait 5 seconds', 'click at coordinates 100,200', 'select United States from the country dropdown'"
|
|
},
|
|
"variables": {
|
|
"type": "object",
|
|
"description": "Variables to use in the action. Variables in the action string are referenced using %variable_name%. These variables are NOT shared with LLM providers for security.",
|
|
"additionalProperties": {"type": "string"},
|
|
"default": {}
|
|
},
|
|
"iframes": {
|
|
"type": "boolean",
|
|
"description": "Whether to include iframe content in the action. Set to true if the target element is inside an iframe.",
|
|
"default": True
|
|
},
|
|
"filePath": {
|
|
"type": "string",
|
|
"description": "CRITICAL: REQUIRED for ANY action that might involve file uploads. This includes: clicking upload buttons, locating resume sections, finding file inputs, scrolling to upload areas, or any action that could potentially trigger a file dialog. Always include this parameter when dealing with upload-related elements to prevent accidental file dialog triggers. The tool will automatically handle the file upload after the action is performed.",
|
|
}
|
|
},
|
|
"required": ["action"]
|
|
}
|
|
}
|
|
})
|
|
@usage_example('''
|
|
<function_calls>
|
|
<invoke name="browser_act">
|
|
<parameter name="action">fill in the login form with %username% and %password%</parameter>
|
|
<parameter name="variables">{"username": "john.doe", "password": "secret123"}</parameter>
|
|
<parameter name="iframes">true</parameter>
|
|
</invoke>
|
|
</function_calls>
|
|
|
|
<function_calls>
|
|
<invoke name="browser_act">
|
|
<parameter name="action">click on upload resume button</parameter>
|
|
<parameter name="filePath">/workspace/downloads/document.pdf</parameter>
|
|
</invoke>
|
|
</function_calls>
|
|
''')
|
|
async def browser_act(self, action: str, variables: dict = None, iframes: bool = False, filePath: dict = None) -> ToolResult:
|
|
"""Perform any browser action using Stagehand."""
|
|
logger.debug(f"Browser acting: {action} (variables={'***' if variables else None}, iframes={iframes}), filePath={filePath}")
|
|
params = {"action": action, "iframes": iframes, "variables": variables}
|
|
if filePath:
|
|
params["filePath"] = filePath
|
|
return await self._execute_stagehand_api("act", params)
|
|
|
|
@openapi_schema({
|
|
"type": "function",
|
|
"function": {
|
|
"name": "browser_extract_content",
|
|
"description": "Extract structured content from the current page using Stagehand",
|
|
"parameters": {
|
|
"type": "object",
|
|
"properties": {
|
|
"instruction": {
|
|
"type": "string",
|
|
"description": "What content to extract (e.g., 'extract all product prices', 'get the main heading', 'extract apartment listings with address and price')"
|
|
},
|
|
"iframes": {
|
|
"type": "boolean",
|
|
"description": "Whether to include iframe content in the extraction. Set to true if the target content is inside an iframe.",
|
|
"default": True
|
|
}
|
|
},
|
|
"required": ["instruction"]
|
|
}
|
|
}
|
|
})
|
|
@usage_example('''
|
|
<function_calls>
|
|
<invoke name="browser_extract_content">
|
|
<parameter name="instruction">extract all product names and prices from the main product list</parameter>
|
|
<parameter name="iframes">true</parameter>
|
|
</invoke>
|
|
</function_calls>
|
|
''')
|
|
async def browser_extract_content(self, instruction: str, iframes: bool = False) -> ToolResult:
|
|
"""Extract structured content from the current page using Stagehand."""
|
|
logger.debug(f"Browser extracting: {instruction} (iframes={iframes})")
|
|
params = {"instruction": instruction, "iframes": iframes}
|
|
return await self._execute_stagehand_api("extract", params)
|
|
|
|
@openapi_schema({
|
|
"type": "function",
|
|
"function": {
|
|
"name": "browser_screenshot",
|
|
"description": "Take a screenshot of the current page",
|
|
"parameters": {
|
|
"type": "object",
|
|
"properties": {
|
|
"name": {
|
|
"type": "string",
|
|
"description": "Name for the screenshot",
|
|
"default": "screenshot"
|
|
}
|
|
}
|
|
}
|
|
}
|
|
})
|
|
@usage_example('''
|
|
<function_calls>
|
|
<invoke name="browser_screenshot">
|
|
<parameter name="name">page_screenshot</parameter>
|
|
</invoke>
|
|
</function_calls>
|
|
''')
|
|
async def browser_screenshot(self, name: str = "screenshot") -> ToolResult:
|
|
"""Take a screenshot using Stagehand."""
|
|
logger.debug(f"Browser taking screenshot: {name}")
|
|
return await self._execute_stagehand_api("screenshot", {"name": name})
|