diff --git a/backend/core/prompts/prompt.py b/backend/core/prompts/prompt.py
index 84ea9232..f5964732 100644
--- a/backend/core/prompts/prompt.py
+++ b/backend/core/prompts/prompt.py
@@ -148,18 +148,35 @@ You have the ability to execute operations using both Python and CLI tools:
- Finding recent news, articles, and information beyond training data
- Scraping webpage content for detailed information extraction when needed
-### 2.3.5 BROWSER TOOLS AND CAPABILITIES
-- BROWSER OPERATIONS:
- * Navigate to URLs and manage history
- * Fill forms and submit data
- * Click elements and interact with pages
- * Extract text and HTML content
- * Wait for elements to load
- * Scroll pages and handle infinite scroll
- * YOU CAN DO ANYTHING ON THE BROWSER - including clicking on elements, filling forms, submitting data, etc.
- * The browser is in a sandboxed environment, so nothing to worry about.
+### 2.3.5 BROWSER AUTOMATION CAPABILITIES
+- **CORE BROWSER FUNCTIONS:**
+ * `browser_navigate_to(url)` - Navigate to any URL
+ * `browser_act(action, variables, iframes, filePath)` - Perform ANY browser action using natural language
+ - Examples: "click the login button", "fill in email with user@example.com", "scroll down", "select option from dropdown"
+ - Supports variables for secure data entry (not shared with LLM providers)
+ - Handles iframes when needed
+ - CRITICAL: Include filePath parameter for ANY action involving file uploads to prevent accidental file dialog triggers
+ * `browser_extract_content(instruction, iframes)` - Extract structured content from pages
+ - Examples: "extract all product prices", "get apartment listings with address and price"
+ * `browser_screenshot(name)` - Take screenshots of the current page
-- CRITICAL BROWSER VALIDATION WORKFLOW:
+- **WHAT YOU CAN DO:**
+ * Navigate to any URL and browse websites
+ * Click buttons, links, and any interactive elements
+ * Fill out forms with text, numbers, emails, etc.
+ * Select options from dropdowns and menus
+ * Scroll pages (up, down, to specific elements)
+ * Handle dynamic content and JavaScript-heavy sites
+ * Extract structured data from pages
+ * Take screenshots at any point
+ * Press keyboard keys (Enter, Escape, Tab, etc.)
+ * Handle iframes and embedded content
+ * Upload files (use filePath parameter in browser_act)
+ * Navigate browser history (go back, forward)
+ * Wait for content to load
+ * The browser runs in a sandboxed environment, so these actions are safe to perform
+
+- **CRITICAL BROWSER VALIDATION WORKFLOW:**
* Every browser action automatically provides a screenshot - ALWAYS review it carefully
* When entering values (phone numbers, emails, text), explicitly verify the screenshot shows the exact values you intended
* Only report success when visual confirmation shows the exact intended values are present
@@ -699,7 +716,12 @@ IMPORTANT: Use the `cat` command to view contents of small files (100 kb or less
- Only if you need specific details not found in search results:
* Use scrape-webpage on specific URLs from web-search results
- Only if scrape-webpage fails or if the page requires interaction:
- * Use direct browser tools (browser_navigate_to, browser_go_back, browser_wait, browser_click_element, browser_input_text, browser_send_keys, browser_switch_tab, browser_close_tab, browser_scroll_down, browser_scroll_up, browser_scroll_to_text, browser_get_dropdown_options, browser_select_dropdown_option, browser_drag_drop, browser_click_coordinates etc.)
+ * Use browser automation tools:
+ - `browser_navigate_to(url)` - Navigate to the page
+ - `browser_act(action)` - Perform any action using natural language
+ Examples: "click the login button", "fill in email", "scroll down", "select option from dropdown", "press Enter", "go back"
+ - `browser_extract_content(instruction)` - Extract structured content
+ - `browser_screenshot(name)` - Take screenshots
* This is needed for:
- Dynamic content loading
- JavaScript-heavy sites
@@ -731,22 +753,21 @@ IMPORTANT: Use the `cat` command to view contents of small files (100 kb or less
- Only basic facts or information are needed
- Only a high-level overview is needed
4. Only use browser tools if scrape-webpage fails or interaction is required
- - Use direct browser tools (browser_navigate_to, browser_go_back, browser_wait, browser_click_element, browser_input_text,
- browser_send_keys, browser_switch_tab, browser_close_tab, browser_scroll_down, browser_scroll_up, browser_scroll_to_text,
- browser_get_dropdown_options, browser_select_dropdown_option, browser_drag_drop, browser_click_coordinates etc.)
+ - Use browser automation tools:
+ * `browser_navigate_to(url)` - Navigate to pages
+ * `browser_act(action, variables, iframes, filePath)` - Perform any action with natural language
+ Examples: "click login", "fill form field with email@example.com", "scroll to bottom", "select dropdown option", "press Enter", "go back", "wait 3 seconds"
+ * `browser_extract_content(instruction, iframes)` - Extract structured content
+ * `browser_screenshot(name)` - Capture screenshots
- This is needed for:
* Dynamic content loading
* JavaScript-heavy sites
* Pages requiring login
* Interactive elements
* Infinite scroll pages
+ * Form submissions and data entry
DO NOT use browser tools directly unless interaction is required.
5. Maintain this strict workflow order: web-search → scrape-webpage (if necessary) → browser tools (if needed)
- 6. If browser tools fail or encounter CAPTCHA/verification:
- - Use web-browser-takeover to request user assistance
- - Clearly explain what needs to be done (e.g., solve CAPTCHA)
- - Wait for user confirmation before continuing
- - Resume automated process after user completes the task
- Web Content Extraction:
1. Verify URL validity before scraping
diff --git a/backend/core/run.py b/backend/core/run.py
index bdfd356e..cea14287 100644
--- a/backend/core/run.py
+++ b/backend/core/run.py
@@ -654,7 +654,7 @@ class AgentRunner:
'sb_shell_tool', 'sb_files_tool', 'sb_deploy_tool', 'sb_expose_tool',
'web_search_tool', 'image_search_tool', 'sb_vision_tool', 'sb_presentation_tool', 'sb_image_edit_tool',
'sb_sheets_tool', 'sb_kb_tool', 'sb_design_tool', 'sb_presentation_outline_tool', 'sb_upload_file_tool',
- 'sb_docs_tool', 'sb_browser_tool', 'sb_templates_tool', 'computer_use_tool', 'sb_web_dev_tool',
+ 'sb_docs_tool', 'sb_templates_tool', 'computer_use_tool', 'sb_web_dev_tool',
'data_providers_tool', 'browser_tool', 'people_search_tool', 'company_search_tool',
'agent_config_tool', 'mcp_search_tool', 'credential_profile_tool', 'trigger_tool',
'agent_creation_tool'
@@ -809,8 +809,6 @@ class AgentRunner:
last_tool_call = 'ask'
elif '</complete>' in assistant_text:
last_tool_call = 'complete'
- elif '</web-browser-takeover>' in assistant_text:
- last_tool_call = 'web-browser-takeover'
except (json.JSONDecodeError, Exception):
pass
@@ -834,7 +832,7 @@ class AgentRunner:
generation.end(status_message="error_detected", level="ERROR")
break
- if agent_should_terminate or last_tool_call in ['ask', 'complete', 'web-browser-takeover', 'present_presentation']:
+ if agent_should_terminate or last_tool_call in ['ask', 'complete', 'present_presentation']:
if generation:
generation.end(status_message="agent_stopped")
continue_execution = False
diff --git a/backend/core/suna_config.py b/backend/core/suna_config.py
index 310f3013..f9f6d852 100644
--- a/backend/core/suna_config.py
+++ b/backend/core/suna_config.py
@@ -37,9 +37,8 @@ SUNA_CONFIG = {
"people_search_tool": True,
"company_search_tool": True,
- # Browser automation (both variants)
+ # Browser automation
"browser_tool": True,
- "sb_browser_tool": True,
# Web development tools
"sb_web_dev_tool": False,
diff --git a/backend/core/tools/message_tool.py b/backend/core/tools/message_tool.py
index d46356b1..7fdc6ac7 100644
--- a/backend/core/tools/message_tool.py
+++ b/backend/core/tools/message_tool.py
@@ -4,9 +4,6 @@ from core.utils.logger import logger
class MessageTool(Tool):
"""Tool for user communication and interaction.
-
- This tool provides methods for asking questions, with support for
- attachments and user takeover suggestions.
"""
def __init__(self):
diff --git a/backend/core/tools/sb_browser_tool.py b/backend/core/tools/sb_browser_tool.py
deleted file mode 100644
index 58fb3c11..00000000
--- a/backend/core/tools/sb_browser_tool.py
+++ /dev/null
@@ -1,848 +0,0 @@
-import traceback
-import json
-import base64
-import io
-from PIL import Image
-
-from core.agentpress.tool import ToolResult, openapi_schema
-from core.agentpress.thread_manager import ThreadManager
-from core.sandbox.tool_base import SandboxToolsBase
-from core.utils.logger import logger
-from core.utils.s3_upload_utils import upload_base64_image
-
-
-class SandboxBrowserTool(SandboxToolsBase):
- """Tool for executing tasks in a Daytona sandbox with browser-use capabilities."""
-
- def __init__(self, project_id: str, thread_id: str, thread_manager: ThreadManager):
- super().__init__(project_id, thread_manager)
- self.thread_id = thread_id
-
- def _validate_base64_image(self, base64_string: str, max_size_mb: int = 10) -> tuple[bool, str]:
- """
- Comprehensive validation of base64 image data.
-
- Args:
- base64_string (str): The base64 encoded image data
- max_size_mb (int): Maximum allowed image size in megabytes
-
- Returns:
- tuple[bool, str]: (is_valid, error_message)
- """
- try:
- # Check if data exists and has reasonable length
- if not base64_string or len(base64_string) < 10:
- return False, "Base64 string is empty or too short"
-
- # Remove data URL prefix if present (data:image/jpeg;base64,...)
- if base64_string.startswith('data:'):
- try:
- base64_string = base64_string.split(',', 1)[1]
- except (IndexError, ValueError):
- return False, "Invalid data URL format"
-
- # Check if string contains only valid base64 characters
- # Base64 alphabet: A-Z, a-z, 0-9, +, /, = (padding)
- import re
- if not re.match(r'^[A-Za-z0-9+/]*={0,2}$', base64_string):
- return False, "Invalid base64 characters detected"
-
- # Check if base64 string length is valid (must be multiple of 4)
- if len(base64_string) % 4 != 0:
- return False, "Invalid base64 string length"
-
- # Attempt to decode base64
- try:
- image_data = base64.b64decode(base64_string, validate=True)
- except Exception as e:
- return False, f"Base64 decoding failed: {str(e)}"
-
- # Check decoded data size
- if len(image_data) == 0:
- return False, "Decoded image data is empty"
-
- # Check if decoded data size exceeds limit
- max_size_bytes = max_size_mb * 1024 * 1024
- if len(image_data) > max_size_bytes:
- return False, f"Image size ({len(image_data)} bytes) exceeds limit ({max_size_bytes} bytes)"
-
- # Validate that decoded data is actually a valid image using PIL
- try:
- image_stream = io.BytesIO(image_data)
- with Image.open(image_stream) as img:
- # Verify the image by attempting to load it
- img.verify()
-
- # Check if image format is supported
- supported_formats = {'JPEG', 'PNG', 'GIF', 'BMP', 'WEBP', 'TIFF'}
- if img.format not in supported_formats:
- return False, f"Unsupported image format: {img.format}"
-
- # Re-open for dimension checks (verify() closes the image)
- image_stream.seek(0)
- with Image.open(image_stream) as img_check:
- width, height = img_check.size
-
- # Check reasonable dimension limits
- max_dimension = 8192 # 8K resolution limit
- if width > max_dimension or height > max_dimension:
- return False, f"Image dimensions ({width}x{height}) exceed limit ({max_dimension}x{max_dimension})"
-
- # Check minimum dimensions
- if width < 1 or height < 1:
- return False, f"Invalid image dimensions: {width}x{height}"
-
- logger.debug(f"Valid image detected: {img.format}, {width}x{height}, {len(image_data)} bytes")
-
- except Exception as e:
- return False, f"Invalid image data: {str(e)}"
-
- return True, "Valid image"
-
- except Exception as e:
- logger.error(f"Unexpected error during base64 image validation: {e}")
- return False, f"Validation error: {str(e)}"
-
- async def _execute_browser_action(self, endpoint: str, params: dict = None, method: str = "POST") -> ToolResult:
- """Execute a browser automation action through the API
-
- Args:
- endpoint (str): The API endpoint to call
- params (dict, optional): Parameters to send. Defaults to None.
- method (str, optional): HTTP method to use. Defaults to "POST".
-
- Returns:
- ToolResult: Result of the execution
- """
- try:
- # Ensure sandbox is initialized
- await self._ensure_sandbox()
-
- # Build the curl command
- url = f"http://localhost:8003/api/automation/{endpoint}"
-
- if method == "GET" and params:
- query_params = "&".join([f"{k}={v}" for k, v in params.items()])
- url = f"{url}?{query_params}"
- curl_cmd = f"curl -s -X {method} '{url}' -H 'Content-Type: application/json'"
- else:
- curl_cmd = f"curl -s -X {method} '{url}' -H 'Content-Type: application/json'"
- if params:
- json_data = json.dumps(params)
- curl_cmd += f" -d '{json_data}'"
-
- logger.debug("\033[95mExecuting curl command:\033[0m")
- logger.debug(f"{curl_cmd}")
-
- response = await self.sandbox.process.exec(curl_cmd, timeout=30)
-
- if response.exit_code == 0:
- try:
- result = json.loads(response.result)
-
- if not "content" in result:
- result["content"] = ""
-
- if not "role" in result:
- result["role"] = "assistant"
-
- logger.debug("Browser automation request completed successfully")
-
- if "screenshot_base64" in result:
- try:
- # Comprehensive validation of the base64 image data
- screenshot_data = result["screenshot_base64"]
- is_valid, validation_message = self._validate_base64_image(screenshot_data)
-
- if is_valid:
- logger.debug(f"Screenshot validation passed: {validation_message}")
- image_url = await upload_base64_image(screenshot_data, "browser-screenshots")
- result["image_url"] = image_url
- logger.debug(f"Uploaded screenshot to {image_url}")
- else:
- logger.warning(f"Screenshot validation failed: {validation_message}")
- result["image_validation_error"] = validation_message
-
- # Remove base64 data from result to keep it clean
- del result["screenshot_base64"]
-
- except Exception as e:
- logger.error(f"Failed to process screenshot: {e}")
- result["image_upload_error"] = str(e)
-
- added_message = await self.thread_manager.add_message(
- thread_id=self.thread_id,
- type="browser_state",
- content=result,
- is_llm_message=False
- )
-
- success_response = {}
-
- if result.get("success"):
- success_response["success"] = result["success"]
- success_response["message"] = result.get("message", "Browser action completed successfully")
- else:
- success_response["success"] = False
- success_response["message"] = result.get("message", "Browser action failed")
-
- if added_message and 'message_id' in added_message:
- success_response['message_id'] = added_message['message_id']
- if result.get("url"):
- success_response["url"] = result["url"]
- if result.get("title"):
- success_response["title"] = result["title"]
- if result.get("element_count"):
- success_response["elements_found"] = result["element_count"]
- if result.get("pixels_below"):
- success_response["scrollable_content"] = result["pixels_below"] > 0
- if result.get("ocr_text"):
- success_response["ocr_text"] = result["ocr_text"]
- if result.get("image_url"):
- success_response["image_url"] = result["image_url"]
-
- if success_response.get("success"):
- return self.success_response(success_response)
- else:
- return self.fail_response(success_response)
-
- except json.JSONDecodeError as e:
- logger.error(f"Failed to parse response JSON: {response.result} {e}")
- return self.fail_response(f"Failed to parse response JSON: {response.result} {e}")
- else:
- logger.error(f"Browser automation request failed 2: {response}")
- return self.fail_response(f"Browser automation request failed 2: {response}")
-
- except Exception as e:
- logger.error(f"Error executing browser action: {e}")
- logger.debug(traceback.format_exc())
- return self.fail_response(f"Error executing browser action: {e}")
-
-
- @openapi_schema({
- "type": "function",
- "function": {
- "name": "browser_navigate_to",
- "description": "Navigate to a specific url",
- "parameters": {
- "type": "object",
- "properties": {
- "url": {
- "type": "string",
- "description": "The url to navigate to"
- }
- },
- "required": ["url"]
- }
- }
- })
- async def browser_navigate_to(self, url: str) -> ToolResult:
- """Navigate to a specific url
-
- Args:
- url (str): The url to navigate to
-
- Returns:
- dict: Result of the execution
- """
- return await self._execute_browser_action("navigate_to", {"url": url})
-
- # @openapi_schema({
- # "type": "function",
- # "function": {
- # "name": "browser_search_google",
- # "description": "Search Google with the provided query",
- # "parameters": {
- # "type": "object",
- # "properties": {
- # "query": {
- # "type": "string",
- # "description": "The search query to use"
- # }
- # },
- # "required": ["query"]
- # }
- # }
- # })
- # @xml_schema(
- # tag_name="browser-search-google",
- # mappings=[
- # {"param_name": "query", "node_type": "content", "path": "."}
- # ],
- # example='''
- #
- # artificial intelligence news
- #
- # '''
- # )
- # async def browser_search_google(self, query: str) -> ToolResult:
- # """Search Google with the provided query
-
- # Args:
- # query (str): The search query to use
-
- # Returns:
- # dict: Result of the execution
- # """
- # logger.debug(f"\033[95mSearching Google for: {query}\033[0m")
- # return await self._execute_browser_action("search_google", {"query": query})
-
- @openapi_schema({
- "type": "function",
- "function": {
- "name": "browser_go_back",
- "description": "Navigate back in browser history",
- "parameters": {
- "type": "object",
- "properties": {}
- }
- }
- })
- async def browser_go_back(self) -> ToolResult:
- """Navigate back in browser history
-
- Returns:
- dict: Result of the execution
- """
- logger.debug(f"\033[95mNavigating back in browser history\033[0m")
- return await self._execute_browser_action("go_back", {})
-
- @openapi_schema({
- "type": "function",
- "function": {
- "name": "browser_wait",
- "description": "Wait for the specified number of seconds",
- "parameters": {
- "type": "object",
- "properties": {
- "seconds": {
- "type": "integer",
- "description": "Number of seconds to wait (default: 3)"
- }
- }
- }
- }
- })
- async def browser_wait(self, seconds: int = 3) -> ToolResult:
- """Wait for the specified number of seconds
-
- Args:
- seconds (int, optional): Number of seconds to wait. Defaults to 3.
-
- Returns:
- dict: Result of the execution
- """
- logger.debug(f"\033[95mWaiting for {seconds} seconds\033[0m")
- return await self._execute_browser_action("wait", {"seconds": seconds})
-
- @openapi_schema({
- "type": "function",
- "function": {
- "name": "browser_click_element",
- "description": "Click on an element by index",
- "parameters": {
- "type": "object",
- "properties": {
- "index": {
- "type": "integer",
- "description": "The index of the element to click"
- }
- },
- "required": ["index"]
- }
- }
- })
- async def browser_click_element(self, index: int) -> ToolResult:
- """Click on an element by index
-
- Args:
- index (int): The index of the element to click
-
- Returns:
- dict: Result of the execution
- """
- logger.debug(f"\033[95mClicking element with index: {index}\033[0m")
- return await self._execute_browser_action("click_element", {"index": index})
-
- @openapi_schema({
- "type": "function",
- "function": {
- "name": "browser_input_text",
- "description": "Input text into an element",
- "parameters": {
- "type": "object",
- "properties": {
- "index": {
- "type": "integer",
- "description": "The index of the element to input text into"
- },
- "text": {
- "type": "string",
- "description": "The text to input"
- }
- },
- "required": ["index", "text"]
- }
- }
- })
- async def browser_input_text(self, index: int, text: str) -> ToolResult:
- """Input text into an element
-
- Args:
- index (int): The index of the element to input text into
- text (str): The text to input
-
- Returns:
- dict: Result of the execution
- """
- logger.debug(f"\033[95mInputting text into element {index}: {text}\033[0m")
- return await self._execute_browser_action("input_text", {"index": index, "text": text})
-
- @openapi_schema({
- "type": "function",
- "function": {
- "name": "browser_send_keys",
- "description": "Send keyboard keys such as Enter, Escape, or keyboard shortcuts",
- "parameters": {
- "type": "object",
- "properties": {
- "keys": {
- "type": "string",
- "description": "The keys to send (e.g., 'Enter', 'Escape', 'Control+a')"
- }
- },
- "required": ["keys"]
- }
- }
- })
- async def browser_send_keys(self, keys: str) -> ToolResult:
- """Send keyboard keys
-
- Args:
- keys (str): The keys to send (e.g., 'Enter', 'Escape', 'Control+a')
-
- Returns:
- dict: Result of the execution
- """
- logger.debug(f"\033[95mSending keys: {keys}\033[0m")
- return await self._execute_browser_action("send_keys", {"keys": keys})
-
- @openapi_schema({
- "type": "function",
- "function": {
- "name": "browser_switch_tab",
- "description": "Switch to a different browser tab",
- "parameters": {
- "type": "object",
- "properties": {
- "page_id": {
- "type": "integer",
- "description": "The ID of the tab to switch to"
- }
- },
- "required": ["page_id"]
- }
- }
- })
- async def browser_switch_tab(self, page_id: int) -> ToolResult:
- """Switch to a different browser tab
-
- Args:
- page_id (int): The ID of the tab to switch to
-
- Returns:
- dict: Result of the execution
- """
- logger.debug(f"\033[95mSwitching to tab: {page_id}\033[0m")
- return await self._execute_browser_action("switch_tab", {"page_id": page_id})
-
- # @openapi_schema({
- # "type": "function",
- # "function": {
- # "name": "browser_open_tab",
- # "description": "Open a new browser tab with the specified URL",
- # "parameters": {
- # "type": "object",
- # "properties": {
- # "url": {
- # "type": "string",
- # "description": "The URL to open in the new tab"
- # }
- # },
- # "required": ["url"]
- # }
- # }
- # })
- # @xml_schema(
- # tag_name="browser-open-tab",
- # mappings=[
- # {"param_name": "url", "node_type": "content", "path": "."}
- # ],
- # example='''
- #
- # https://example.com
- #
- # '''
- # )
- # async def browser_open_tab(self, url: str) -> ToolResult:
- # """Open a new browser tab with the specified URL
-
- # Args:
- # url (str): The URL to open in the new tab
-
- # Returns:
- # dict: Result of the execution
- # """
- # logger.debug(f"\033[95mOpening new tab with URL: {url}\033[0m")
- # return await self._execute_browser_action("open_tab", {"url": url})
-
- @openapi_schema({
- "type": "function",
- "function": {
- "name": "browser_close_tab",
- "description": "Close a browser tab",
- "parameters": {
- "type": "object",
- "properties": {
- "page_id": {
- "type": "integer",
- "description": "The ID of the tab to close"
- }
- },
- "required": ["page_id"]
- }
- }
- })
- async def browser_close_tab(self, page_id: int) -> ToolResult:
- """Close a browser tab
-
- Args:
- page_id (int): The ID of the tab to close
-
- Returns:
- dict: Result of the execution
- """
- logger.debug(f"\033[95mClosing tab: {page_id}\033[0m")
- return await self._execute_browser_action("close_tab", {"page_id": page_id})
-
- # @openapi_schema({
- # "type": "function",
- # "function": {
- # "name": "browser_extract_content",
- # "description": "Extract content from the current page based on the provided goal",
- # "parameters": {
- # "type": "object",
- # "properties": {
- # "goal": {
- # "type": "string",
- # "description": "The extraction goal (e.g., 'extract all links', 'find product information')"
- # }
- # },
- # "required": ["goal"]
- # }
- # }
- # })
- # @xml_schema(
- # tag_name="browser-extract-content",
- # mappings=[
- # {"param_name": "goal", "node_type": "content", "path": "."}
- # ],
- # example='''
- #
- # Extract all links on the page
- #
- # '''
- # )
- # async def browser_extract_content(self, goal: str) -> ToolResult:
- # """Extract content from the current page based on the provided goal
-
- # Args:
- # goal (str): The extraction goal
-
- # Returns:
- # dict: Result of the execution
- # """
- # logger.debug(f"\033[95mExtracting content with goal: {goal}\033[0m")
- # result = await self._execute_browser_action("extract_content", {"goal": goal})
-
- # # Format content for better readability
- # if result.get("success"):
- # logger.debug(f"\033[92mContent extraction successful\033[0m")
- # content = result.data.get("content", "")
- # url = result.data.get("url", "")
- # title = result.data.get("title", "")
-
- # if content:
- # content_preview = content[:200] + "..." if len(content) > 200 else content
- # logger.debug(f"\033[95mExtracted content from {title} ({url}):\033[0m")
- # logger.debug(f"\033[96m{content_preview}\033[0m")
- # logger.debug(f"\033[95mTotal content length: {len(content)} characters\033[0m")
- # else:
- # logger.debug(f"\033[93mNo content extracted from {url}\033[0m")
- # else:
- # logger.debug(f"\033[91mFailed to extract content: {result.data.get('error', 'Unknown error')}\033[0m")
-
- # return result
-
- @openapi_schema({
- "type": "function",
- "function": {
- "name": "browser_scroll_down",
- "description": "Scroll down the page",
- "parameters": {
- "type": "object",
- "properties": {
- "amount": {
- "type": "integer",
- "description": "Pixel amount to scroll (if not specified, scrolls one page)"
- }
- }
- }
- }
- })
- async def browser_scroll_down(self, amount: int = None) -> ToolResult:
- """Scroll down the page
-
- Args:
- amount (int, optional): Pixel amount to scroll. If None, scrolls one page.
-
- Returns:
- dict: Result of the execution
- """
- params = {}
- if amount is not None:
- params["amount"] = amount
- logger.debug(f"\033[95mScrolling down by {amount} pixels\033[0m")
- else:
- logger.debug(f"\033[95mScrolling down one page\033[0m")
-
- return await self._execute_browser_action("scroll_down", params)
-
- @openapi_schema({
- "type": "function",
- "function": {
- "name": "browser_scroll_up",
- "description": "Scroll up the page",
- "parameters": {
- "type": "object",
- "properties": {
- "amount": {
- "type": "integer",
- "description": "Pixel amount to scroll (if not specified, scrolls one page)"
- }
- }
- }
- }
- })
- async def browser_scroll_up(self, amount: int = None) -> ToolResult:
- """Scroll up the page
-
- Args:
- amount (int, optional): Pixel amount to scroll. If None, scrolls one page.
-
- Returns:
- dict: Result of the execution
- """
- params = {}
- if amount is not None:
- params["amount"] = amount
- logger.debug(f"\033[95mScrolling up by {amount} pixels\033[0m")
- else:
- logger.debug(f"\033[95mScrolling up one page\033[0m")
-
- return await self._execute_browser_action("scroll_up", params)
-
- @openapi_schema({
- "type": "function",
- "function": {
- "name": "browser_scroll_to_text",
- "description": "Scroll to specific text on the page",
- "parameters": {
- "type": "object",
- "properties": {
- "text": {
- "type": "string",
- "description": "The text to scroll to"
- }
- },
- "required": ["text"]
- }
- }
- })
- async def browser_scroll_to_text(self, text: str) -> ToolResult:
- """Scroll to specific text on the page
-
- Args:
- text (str): The text to scroll to
-
- Returns:
- dict: Result of the execution
- """
- logger.debug(f"\033[95mScrolling to text: {text}\033[0m")
- return await self._execute_browser_action("scroll_to_text", {"text": text})
-
- @openapi_schema({
- "type": "function",
- "function": {
- "name": "browser_get_dropdown_options",
- "description": "Get all options from a dropdown element",
- "parameters": {
- "type": "object",
- "properties": {
- "index": {
- "type": "integer",
- "description": "The index of the dropdown element"
- }
- },
- "required": ["index"]
- }
- }
- })
- async def browser_get_dropdown_options(self, index: int) -> ToolResult:
- """Get all options from a dropdown element
-
- Args:
- index (int): The index of the dropdown element
-
- Returns:
- dict: Result of the execution with the dropdown options
- """
- logger.debug(f"\033[95mGetting options from dropdown with index: {index}\033[0m")
- return await self._execute_browser_action("get_dropdown_options", {"index": index})
-
- @openapi_schema({
- "type": "function",
- "function": {
- "name": "browser_select_dropdown_option",
- "description": "Select an option from a dropdown by text",
- "parameters": {
- "type": "object",
- "properties": {
- "index": {
- "type": "integer",
- "description": "The index of the dropdown element"
- },
- "text": {
- "type": "string",
- "description": "The text of the option to select"
- }
- },
- "required": ["index", "text"]
- }
- }
- })
- async def browser_select_dropdown_option(self, index: int, text: str) -> ToolResult:
- """Select an option from a dropdown by text
-
- Args:
- index (int): The index of the dropdown element
- text (str): The text of the option to select
-
- Returns:
- dict: Result of the execution
- """
- logger.debug(f"\033[95mSelecting option '{text}' from dropdown with index: {index}\033[0m")
- return await self._execute_browser_action("select_dropdown_option", {"index": index, "text": text})
-
- @openapi_schema({
- "type": "function",
- "function": {
- "name": "browser_drag_drop",
- "description": "Perform drag and drop operation between elements or coordinates",
- "parameters": {
- "type": "object",
- "properties": {
- "element_source": {
- "type": "string",
- "description": "The source element selector"
- },
- "element_target": {
- "type": "string",
- "description": "The target element selector"
- },
- "coord_source_x": {
- "type": "integer",
- "description": "The source X coordinate"
- },
- "coord_source_y": {
- "type": "integer",
- "description": "The source Y coordinate"
- },
- "coord_target_x": {
- "type": "integer",
- "description": "The target X coordinate"
- },
- "coord_target_y": {
- "type": "integer",
- "description": "The target Y coordinate"
- }
- }
- }
- }
- })
- async def browser_drag_drop(self, element_source: str = None, element_target: str = None,
- coord_source_x: int = None, coord_source_y: int = None,
- coord_target_x: int = None, coord_target_y: int = None) -> ToolResult:
- """Perform drag and drop operation between elements or coordinates
-
- Args:
- element_source (str, optional): The source element selector
- element_target (str, optional): The target element selector
- coord_source_x (int, optional): The source X coordinate
- coord_source_y (int, optional): The source Y coordinate
- coord_target_x (int, optional): The target X coordinate
- coord_target_y (int, optional): The target Y coordinate
-
- Returns:
- dict: Result of the execution
- """
- params = {}
-
- if element_source and element_target:
- params["element_source"] = element_source
- params["element_target"] = element_target
- logger.debug(f"\033[95mDragging from element '{element_source}' to '{element_target}'\033[0m")
- elif all(coord is not None for coord in [coord_source_x, coord_source_y, coord_target_x, coord_target_y]):
- params["coord_source_x"] = coord_source_x
- params["coord_source_y"] = coord_source_y
- params["coord_target_x"] = coord_target_x
- params["coord_target_y"] = coord_target_y
- logger.debug(f"\033[95mDragging from coordinates ({coord_source_x}, {coord_source_y}) to ({coord_target_x}, {coord_target_y})\033[0m")
- else:
- return self.fail_response("Must provide either element selectors or coordinates for drag and drop")
-
- return await self._execute_browser_action("drag_drop", params)
-
- @openapi_schema({
- "type": "function",
- "function": {
- "name": "browser_click_coordinates",
- "description": "Click at specific X,Y coordinates on the page",
- "parameters": {
- "type": "object",
- "properties": {
- "x": {
- "type": "integer",
- "description": "The X coordinate to click"
- },
- "y": {
- "type": "integer",
- "description": "The Y coordinate to click"
- }
- },
- "required": ["x", "y"]
- }
- }
- })
- async def browser_click_coordinates(self, x: int, y: int) -> ToolResult:
- """Click at specific X,Y coordinates on the page
-
- Args:
- x (int): The X coordinate to click
- y (int): The Y coordinate to click
-
- Returns:
- dict: Result of the execution
- """
- logger.debug(f"\033[95mClicking at coordinates: ({x}, {y})\033[0m")
- return await self._execute_browser_action("click_coordinates", {"x": x, "y": y})
\ No newline at end of file
diff --git a/backend/core/utils/tool_groups.py b/backend/core/utils/tool_groups.py
index 1224781b..335d3726 100644
--- a/backend/core/utils/tool_groups.py
+++ b/backend/core/utils/tool_groups.py
@@ -745,105 +745,6 @@ TOOL_GROUPS: Dict[str, ToolGroup] = {
]
),
- "sb_browser_tool": ToolGroup(
- name="sb_browser_tool",
- display_name="Browser Automation (Advanced)",
- description="Advanced browser automation with full UI interaction capabilities",
- tool_class="SandboxBrowserTool",
- methods=[
- ToolMethod(
- name="browser_navigate_to",
- display_name="Navigate To URL",
- description="Navigate browser to specific URL",
- enabled=True
- ),
- ToolMethod(
- name="browser_go_back",
- display_name="Go Back",
- description="Navigate back in browser history",
- enabled=True
- ),
- ToolMethod(
- name="browser_wait",
- display_name="Wait",
- description="Wait for specified number of seconds",
- enabled=True
- ),
- ToolMethod(
- name="browser_click_element",
- display_name="Click Element",
- description="Click on an element by index",
- enabled=True
- ),
- ToolMethod(
- name="browser_input_text",
- display_name="Input Text",
- description="Input text into an element",
- enabled=True
- ),
- ToolMethod(
- name="browser_send_keys",
- display_name="Send Keys",
- description="Send keyboard keys (Enter, Escape, shortcuts)",
- enabled=True
- ),
- ToolMethod(
- name="browser_switch_tab",
- display_name="Switch Tab",
- description="Switch to a different browser tab",
- enabled=True
- ),
- ToolMethod(
- name="browser_close_tab",
- display_name="Close Tab",
- description="Close a browser tab",
- enabled=True
- ),
- ToolMethod(
- name="browser_scroll_down",
- display_name="Scroll Down",
- description="Scroll down the page",
- enabled=True
- ),
- ToolMethod(
- name="browser_scroll_up",
- display_name="Scroll Up",
- description="Scroll up the page",
- enabled=True
- ),
- ToolMethod(
- name="browser_scroll_to_text",
- display_name="Scroll To Text",
- description="Scroll to specific text on the page",
- enabled=True
- ),
- ToolMethod(
- name="browser_get_dropdown_options",
- display_name="Get Dropdown Options",
- description="Get all options from a dropdown element",
- enabled=True
- ),
- ToolMethod(
- name="browser_select_dropdown_option",
- display_name="Select Dropdown Option",
- description="Select an option from a dropdown by text",
- enabled=True
- ),
- ToolMethod(
- name="browser_drag_drop",
- display_name="Drag and Drop",
- description="Perform drag and drop operation",
- enabled=True
- ),
- ToolMethod(
- name="browser_click_coordinates",
- display_name="Click Coordinates",
- description="Click at specific X,Y coordinates",
- enabled=True
- ),
- ]
- ),
-
"people_search_tool": ToolGroup(
name="people_search_tool",
display_name="People Search",
@@ -1017,12 +918,6 @@ TOOL_GROUPS: Dict[str, ToolGroup] = {
enabled=True,
is_core=True
),
- ToolMethod(
- name="web_browser_takeover",
- display_name="Request Browser Takeover",
- description="Request user takeover of browser interaction",
- enabled=True
- ),
ToolMethod(
name="complete",
display_name="Complete Task",