preview

2025-04-15 15:34:26 +01:00 · 2025-04-15 15:34:26 +01:00 · c4d30e270b
parent ad78a0d4f3
commit c4d30e270b
10 changed files with 1725 additions and 560 deletions
--- a/backend/agent/run.py
+++ b/backend/agent/run.py
@ -58,7 +58,8 @@ async def run_agent(thread_id: str, project_id: str, stream: bool = True, thread
        await client.table('projects').update({
            'sandbox': {
                'id': sandbox_id,
-                'pass': sandbox_pass
+                'pass': sandbox_pass,
+                'vnc_preview': sandbox.get_preview_link(6080)
            }
        }).eq('project_id', project_id).execute()
    
@ -114,6 +115,12 @@ async def run_agent(thread_id: str, project_id: str, stream: bool = True, thread
                print(f"Last message was from assistant, stopping execution")
                continue_execution = False
                break
+        # Get the latest message of type browser_state for this thread from the messages table
+        latest_browser_state = await client.table('messages').select('*').eq('thread_id', thread_id).eq('type', 'browser_state').order('created_at', desc=True).limit(1).execute()
+        if latest_browser_state.data and len(latest_browser_state.data) > 0:
+            temporary_message = latest_browser_state.data[0].get('content', '')
+        else:
+            temporary_message = None

        response = await thread_manager.run_thread(
            thread_id=thread_id,
@ -124,6 +131,7 @@ async def run_agent(thread_id: str, project_id: str, stream: bool = True, thread
            llm_max_tokens=64000,
            tool_choice="auto",
            max_xml_tool_calls=1,
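+            # TODO(review): temporary_message (fetched above) does not appear to be passed to run_thread yet; confirm and wire up `temporary_message=`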
            processor_config=ProcessorConfig(
                xml_tool_calling=True,
                native_tool_calling=False,
--- a/backend/agent/tools/sb_browser_tool.py
+++ b/backend/agent/tools/sb_browser_tool.py
@ -30,9 +30,9 @@ class SandboxBrowserTool(SandboxToolsBase):
            if method == "GET" and params:
                query_params = "&".join([f"{k}={v}" for k, v in params.items()])
                url = f"{url}?{query_params}"
-                curl_cmd = f"curl -X {method} '{url}' -H 'Content-Type: application/json'"
+                curl_cmd = f"curl -s -X {method} '{url}' -H 'Content-Type: application/json'"
            else:
-                curl_cmd = f"curl -X {method} '{url}' -H 'Content-Type: application/json'"
+                curl_cmd = f"curl -s -X {method} '{url}' -H 'Content-Type: application/json'"
                if params:
                    json_data = json.dumps(params)
                    curl_cmd += f" -d '{json_data}'"
@ -46,7 +46,43 @@ class SandboxBrowserTool(SandboxToolsBase):
                try:
                    result = json.loads(response.result)
                    logger.info("Browser automation request completed successfully")
-                    return self.success_response(result)
+
+                    # Create a cleaned version of the result based on BrowserActionResult schema
+                    cleaned_result = {
+                        "success": result.get("success", False),
+                        "message": result.get("message", ""),
+                        "error": result.get("error", ""),
+                        "url": result.get("url"),
+                        "title": result.get("title"),
+                        "elements": result.get("elements"),
+                        "pixels_above": result.get("pixels_above", 0),
+                        "pixels_below": result.get("pixels_below", 0),
+                        "content": result.get("content"),
+                        "element_count": result.get("element_count", 0),
+                        "interactive_elements": result.get("interactive_elements"),
+                        "viewport_width": result.get("viewport_width"),
+                        "viewport_height": result.get("viewport_height")
+                    }
+
+                    # Print screenshot info to console but don't return it
+                    if "screenshot_base64" in result:
+                        has_screenshot = bool(result.get("screenshot_base64"))
+                        print(f"\033[95mScreenshot captured: {has_screenshot}\033[0m")
+
+                    # Print viewport info if available
+                    if cleaned_result["viewport_width"] and cleaned_result["viewport_height"]:
+                        print(f"\033[95mViewport size: {cleaned_result['viewport_width']}x{cleaned_result['viewport_height']}\033[0m")
+
+                    # Print interactive elements count
+                    if cleaned_result["element_count"] > 0:
+                        print(f"\033[95mFound {cleaned_result['element_count']} interactive elements\033[0m")
+
+                    print("************************************************")
+                    print(cleaned_result)
+                    print("************************************************")
+
+                    return self.success_response(cleaned_result)
+
                except json.JSONDecodeError:
                    logger.error(f"Failed to parse response JSON: {response.result}")
                    return self.fail_response(f"Failed to parse response JSON: {response.result}")
@ -99,45 +135,45 @@ class SandboxBrowserTool(SandboxToolsBase):
        print(f"\033[95mNavigating to: {url}\033[0m")
        return await self._execute_browser_action("navigate_to", {"url": url})

-    @openapi_schema({
-        "type": "function",
-        "function": {
-            "name": "browser_search_google",
-            "description": "Search Google with the provided query",
-            "parameters": {
-                "type": "object",
-                "properties": {
-                    "query": {
-                        "type": "string",
-                        "description": "The search query to use"
-                    }
-                },
-                "required": ["query"]
-            }
-        }
-    })
-    @xml_schema(
-        tag_name="browser-search-google",
-        mappings=[
-            {"param_name": "query", "node_type": "content", "path": "."}
-        ],
-        example='''
-        <browser-search-google>
-        artificial intelligence news
-        </browser-search-google>
-        '''
-    )
-    async def browser_search_google(self, query: str) -> ToolResult:
-        """Search Google with the provided query
+    # @openapi_schema({
+    #     "type": "function",
+    #     "function": {
+    #         "name": "browser_search_google",
+    #         "description": "Search Google with the provided query",
+    #         "parameters": {
+    #             "type": "object",
+    #             "properties": {
+    #                 "query": {
+    #                     "type": "string",
+    #                     "description": "The search query to use"
+    #                 }
+    #             },
+    #             "required": ["query"]
+    #         }
+    #     }
+    # })
+    # @xml_schema(
+    #     tag_name="browser-search-google",
+    #     mappings=[
+    #         {"param_name": "query", "node_type": "content", "path": "."}
+    #     ],
+    #     example='''
+    #     <browser-search-google>
+    #     artificial intelligence news
+    #     </browser-search-google>
+    #     '''
+    # )
+    # async def browser_search_google(self, query: str) -> ToolResult:
+    #     """Search Google with the provided query
        
-        Args:
-            query (str): The search query to use
+    #     Args:
+    #         query (str): The search query to use
            
-        Returns:
-            dict: Result of the execution
-        """
-        print(f"\033[95mSearching Google for: {query}\033[0m")
-        return await self._execute_browser_action("search_google", {"query": query})
+    #     Returns:
+    #         dict: Result of the execution
+    #     """
+    #     print(f"\033[95mSearching Google for: {query}\033[0m")
+    #     return await self._execute_browser_action("search_google", {"query": query})

    @openapi_schema({
        "type": "function",
@ -269,7 +305,7 @@ class SandboxBrowserTool(SandboxToolsBase):
    @xml_schema(
        tag_name="browser-input-text",
        mappings=[
-            {"param_name": "index", "node_type": "attribute", "path": "@index"},
+            {"param_name": "index", "node_type": "attribute", "path": "."},
            {"param_name": "text", "node_type": "content", "path": "."}
        ],
        example='''
@ -371,45 +407,45 @@ class SandboxBrowserTool(SandboxToolsBase):
        print(f"\033[95mSwitching to tab: {page_id}\033[0m")
        return await self._execute_browser_action("switch_tab", {"page_id": page_id})

-    @openapi_schema({
-        "type": "function",
-        "function": {
-            "name": "browser_open_tab",
-            "description": "Open a new browser tab with the specified URL",
-            "parameters": {
-                "type": "object",
-                "properties": {
-                    "url": {
-                        "type": "string",
-                        "description": "The URL to open in the new tab"
-                    }
-                },
-                "required": ["url"]
-            }
-        }
-    })
-    @xml_schema(
-        tag_name="browser-open-tab",
-        mappings=[
-            {"param_name": "url", "node_type": "content", "path": "."}
-        ],
-        example='''
-        <browser-open-tab>
-        https://example.com
-        </browser-open-tab>
-        '''
-    )
-    async def browser_open_tab(self, url: str) -> ToolResult:
-        """Open a new browser tab with the specified URL
+    # @openapi_schema({
+    #     "type": "function",
+    #     "function": {
+    #         "name": "browser_open_tab",
+    #         "description": "Open a new browser tab with the specified URL",
+    #         "parameters": {
+    #             "type": "object",
+    #             "properties": {
+    #                 "url": {
+    #                     "type": "string",
+    #                     "description": "The URL to open in the new tab"
+    #                 }
+    #             },
+    #             "required": ["url"]
+    #         }
+    #     }
+    # })
+    # @xml_schema(
+    #     tag_name="browser-open-tab",
+    #     mappings=[
+    #         {"param_name": "url", "node_type": "content", "path": "."}
+    #     ],
+    #     example='''
+    #     <browser-open-tab>
+    #     https://example.com
+    #     </browser-open-tab>
+    #     '''
+    # )
+    # async def browser_open_tab(self, url: str) -> ToolResult:
+    #     """Open a new browser tab with the specified URL
        
-        Args:
-            url (str): The URL to open in the new tab
+    #     Args:
+    #         url (str): The URL to open in the new tab
            
-        Returns:
-            dict: Result of the execution
-        """
-        print(f"\033[95mOpening new tab with URL: {url}\033[0m")
-        return await self._execute_browser_action("open_tab", {"url": url})
+    #     Returns:
+    #         dict: Result of the execution
+    #     """
+    #     print(f"\033[95mOpening new tab with URL: {url}\033[0m")
+    #     return await self._execute_browser_action("open_tab", {"url": url})

    @openapi_schema({
        "type": "function",
@ -451,72 +487,64 @@ class SandboxBrowserTool(SandboxToolsBase):
        print(f"\033[95mClosing tab: {page_id}\033[0m")
        return await self._execute_browser_action("close_tab", {"page_id": page_id})

-    @openapi_schema({
-        "type": "function",
-        "function": {
-            "name": "browser_extract_content",
-            "description": "Extract content from the current page based on the provided goal",
-            "parameters": {
-                "type": "object",
-                "properties": {
-                    "goal": {
-                        "type": "string",
-                        "description": "The extraction goal (e.g., 'extract all links', 'find product information')"
-                    }
-                },
-                "required": ["goal"]
-            }
-        }
-    })
-    @xml_schema(
-        tag_name="browser-extract-content",
-        mappings=[
-            {"param_name": "goal", "node_type": "content", "path": "."}
-        ],
-        example='''
-        <browser-extract-content>
-        Extract all links on the page
-        </browser-extract-content>
-        '''
-    )
-    async def browser_extract_content(self, goal: str) -> ToolResult:
-        """Extract content from the current page based on the provided goal
+    # @openapi_schema({
+    #     "type": "function",
+    #     "function": {
+    #         "name": "browser_extract_content",
+    #         "description": "Extract content from the current page based on the provided goal",
+    #         "parameters": {
+    #             "type": "object",
+    #             "properties": {
+    #                 "goal": {
+    #                     "type": "string",
+    #                     "description": "The extraction goal (e.g., 'extract all links', 'find product information')"
+    #                 }
+    #             },
+    #             "required": ["goal"]
+    #         }
+    #     }
+    # })
+    # @xml_schema(
+    #     tag_name="browser-extract-content",
+    #     mappings=[
+    #         {"param_name": "goal", "node_type": "content", "path": "."}
+    #     ],
+    #     example='''
+    #     <browser-extract-content>
+    #     Extract all links on the page
+    #     </browser-extract-content>
+    #     '''
+    # )
+    # async def browser_extract_content(self, goal: str) -> ToolResult:
+    #     """Extract content from the current page based on the provided goal
        
-        Args:
-            goal (str): The extraction goal
+    #     Args:
+    #         goal (str): The extraction goal
            
-        Returns:
-            dict: Result of the execution
-        """
-        print(f"\033[95mExtracting content with goal: {goal}\033[0m")
-        return await self._execute_browser_action("extract_content", {"goal": goal})
-
-    @openapi_schema({
-        "type": "function",
-        "function": {
-            "name": "browser_save_pdf",
-            "description": "Save the current page as a PDF file",
-            "parameters": {
-                "type": "object",
-                "properties": {}
-            }
-        }
-    })
-    @xml_schema(
-        tag_name="browser-save-pdf",
-        mappings=[],
-        example='''
-        <browser-save-pdf></browser-save-pdf>
-        '''
-    )
-    async def browser_save_pdf(self) -> ToolResult:
-        """Save the current page as a PDF file
+    #     Returns:
+    #         dict: Result of the execution
+    #     """
+    #     print(f"\033[95mExtracting content with goal: {goal}\033[0m")
+    #     result = await self._execute_browser_action("extract_content", {"goal": goal})
        
-        Returns:
-            dict: Result of the execution
-        """
-        print(f"\033[95mSaving current page as PDF\033[0m")
-        return await self._execute_browser_action("save_pdf")
+    #     # Format content for better readability
+    #     if result.get("success"):
+    #         print(f"\033[92mContent extraction successful\033[0m")
+    #         content = result.data.get("content", "")
+    #         url = result.data.get("url", "")
+    #         title = result.data.get("title", "")
+            
+    #         if content:
+    #             content_preview = content[:200] + "..." if len(content) > 200 else content
+    #             print(f"\033[95mExtracted content from {title} ({url}):\033[0m")
+    #             print(f"\033[96m{content_preview}\033[0m")
+    #             print(f"\033[95mTotal content length: {len(content)} characters\033[0m")
+    #         else:
+    #             print(f"\033[93mNo content extracted from {url}\033[0m")
+    #     else:
+    #         print(f"\033[91mFailed to extract content: {result.data.get('error', 'Unknown error')}\033[0m")
+        
+    #     return result

    @openapi_schema({
        "type": "function",
@ -712,7 +740,7 @@ class SandboxBrowserTool(SandboxToolsBase):
    @xml_schema(
        tag_name="browser-select-dropdown-option",
        mappings=[
-            {"param_name": "index", "node_type": "attribute", "path": "@index"},
+            {"param_name": "index", "node_type": "attribute", "path": "."},
            {"param_name": "text", "node_type": "content", "path": "."}
        ],
        example='''
@ -773,12 +801,12 @@ class SandboxBrowserTool(SandboxToolsBase):
    @xml_schema(
        tag_name="browser-drag-drop",
        mappings=[
-            {"param_name": "element_source", "node_type": "attribute", "path": "@element_source"},
-            {"param_name": "element_target", "node_type": "attribute", "path": "@element_target"},
-            {"param_name": "coord_source_x", "node_type": "attribute", "path": "@coord_source_x"},
-            {"param_name": "coord_source_y", "node_type": "attribute", "path": "@coord_source_y"},
-            {"param_name": "coord_target_x", "node_type": "attribute", "path": "@coord_target_x"},
-            {"param_name": "coord_target_y", "node_type": "attribute", "path": "@coord_target_y"}
+            {"param_name": "element_source", "node_type": "attribute", "path": "."},
+            {"param_name": "element_target", "node_type": "attribute", "path": "."},
+            {"param_name": "coord_source_x", "node_type": "attribute", "path": "."},
+            {"param_name": "coord_source_y", "node_type": "attribute", "path": "."},
+            {"param_name": "coord_target_x", "node_type": "attribute", "path": "."},
+            {"param_name": "coord_target_y", "node_type": "attribute", "path": "."}
        ],
        example='''
        <browser-drag-drop element_source="#draggable" element_target="#droppable"></browser-drag-drop>
--- a/backend/sandbox/docker/browser_api.py
+++ b/backend/sandbox/docker/browser_api.py
--- a/backend/sandbox/docker/browser_automation_service.py
+++ b/backend/sandbox/docker/browser_automation_service.py
@ -1,272 +0,0 @@
-import asyncio
-from typing import List, Dict, Any, Optional, Union
-from fastapi import APIRouter
-from pydantic import BaseModel
-from enum import Enum
-from playwright.async_api import async_playwright, Browser, Page, Mouse, Keyboard
-import base64
-
-class MouseButton(str, Enum):
-    left = "left"
-    middle = "middle"
-    right = "right"
-
-class Position(BaseModel):
-    x: Optional[int] = None
-    y: Optional[int] = None
-
-class MouseAction(BaseModel):
-    x: Optional[int] = None
-    y: Optional[int] = None
-    clicks: Optional[int] = 1
-    button: MouseButton = MouseButton.left
-    delay: Optional[float] = 0.0
-
-class KeyboardAction(BaseModel):
-    key: str
-
-class KeyboardPress(BaseModel):
-    keys: Union[str, List[str]]
-    delay: Optional[float] = 0.0
-
-class WriteAction(BaseModel):
-    message: str
-    delay: Optional[float] = 0.0
-
-class HotkeyAction(BaseModel):
-    keys: List[str]
-    delay: Optional[float] = 0.0
-
-class BrowserAutomation:
-    def __init__(self):
-        self.router = APIRouter()
-        self.browser: Optional[Browser] = None
-        self.page: Optional[Page] = None
-        self.mouse: Optional[Mouse] = None
-        self.keyboard: Optional[Keyboard] = None
-        
-        # Register routes
-        self.router.on_startup.append(self.startup)
-        self.router.on_shutdown.append(self.shutdown)
-        
-        self.router.get("/automation/mouse/position")(self.get_mouse_position)
-        self.router.post("/automation/mouse/move")(self.move_mouse)
-        self.router.post("/automation/mouse/click")(self.click_mouse)
-        self.router.post("/automation/mouse/down")(self.mouse_down)
-        self.router.post("/automation/mouse/up")(self.mouse_up)
-        self.router.post("/automation/keyboard/press")(self.press_key)
-        self.router.post("/automation/keyboard/write")(self.write_text)
-        self.router.post("/automation/keyboard/hotkey")(self.press_hotkey)
-        self.router.post("/automation/navigate_to")(self.navigate_to)
-        self.router.post("/automation/screenshot")(self.take_screenshot)
-
-    async def startup(self):
-        """Initialize the browser instance on startup"""
-        playwright = await async_playwright().start()
-        # Connect to the persistent browser running on port 9222
-        self.browser = await playwright.chromium.connect_over_cdp("http://localhost:9222")
-        # self.browser = await playwright.chromium.launch(headless=False)
-        self.page = await self.browser.new_page()
-        # await self.page.goto('about:blank')
-        self.mouse = self.page.mouse
-        self.keyboard = self.page.keyboard
-
-    async def shutdown(self):
-        """Clean up browser instance on shutdown"""
-        if self.browser:
-            await self.browser.close()
-
-    async def get_mouse_position(self):
-        """Get current mouse position"""
-        try:
-            # Playwright doesn't provide direct mouse position
-            # We'll return the last known position from our tracking
-            return {"x": 0, "y": 0}  # Default position
-        except Exception as e:
-            return {"error": str(e), "x": 0, "y": 0}
-
-    async def move_mouse(self, action: Position):
-        """Move mouse to specified position"""
-        try:
-            await self.mouse.move(action.x, action.y)
-            return {"success": True}
-        except Exception as e:
-            return {"success": False, "error": str(e)}
-
-    async def click_mouse(self, action: MouseAction):
-        """Click at the specified position"""
-        try:
-            await self.mouse.click(
-                action.x, 
-                action.y, 
-                button=action.button,
-                click_count=action.clicks,
-                delay=action.delay * 1000 if action.delay else None
-            )
-            return {"success": True}
-        except Exception as e:
-            return {"success": False, "error": str(e)}
-
-    async def mouse_down(self, action: MouseAction):
-        """Press mouse button down"""
-        try:
-            await self.mouse.down(button=action.button)
-            return {"success": True}
-        except Exception as e:
-            return {"success": False, "error": str(e)}
-
-    async def mouse_up(self, action: MouseAction):
-        """Release mouse button"""
-        try:
-            await self.mouse.up(button=action.button)
-            return {"success": True}
-        except Exception as e:
-            return {"success": False, "error": str(e)}
-
-    async def press_key(self, action: KeyboardPress):
-        """Press specified key(s)"""
-        try:
-            if isinstance(action.keys, list):
-                for key in action.keys:
-                    await self.keyboard.press(key)
-                    if action.delay:
-                        await asyncio.sleep(action.delay)
-            else:
-                await self.keyboard.press(action.keys)
-            return {"success": True}
-        except Exception as e:
-            return {"success": False, "error": str(e)}
-
-    async def write_text(self, action: WriteAction):
-        """Type specified text"""
-        try:
-            await self.keyboard.type(action.message, delay=action.delay * 1000 if action.delay else undefined)
-            return {"success": True}
-        except Exception as e:
-            return {"success": False, "error": str(e)}
-
-    async def press_hotkey(self, action: HotkeyAction):
-        """Press multiple keys simultaneously"""
-        try:
-            # Press all keys in sequence
-            for key in action.keys:
-                await self.keyboard.down(key)
-            
-            # Release all keys in reverse order
-            for key in reversed(action.keys):
-                await self.keyboard.up(key)
-                
-            if action.delay:
-                await asyncio.sleep(action.delay)
-                
-            return {"success": True}
-        except Exception as e:
-            return {"success": False, "error": str(e)}
-
-    async def navigate_to(self, url: str):
-        """Navigate to a specified URL"""
-        try:
-            await self.page.goto(url)
-            return {"success": True}
-        except Exception as e:
-            return {"success": False, "error": str(e)}
-    
-    async def take_screenshot(self) -> Dict[str, str]:
-        """Take a screenshot of the current page"""
-        try:
-            screenshot_bytes = await self.page.screenshot()
-            return {"image": base64.b64encode(screenshot_bytes).decode()}
-        except Exception as e:
-            return {"error": str(e)}
-
-# Create a singleton instance
-automation_service = BrowserAutomation()
-
-
-async def run_demo():
-    """Run a demonstration of browser automation capabilities"""
-    print("Starting browser automation demo...")
-    
-    # Initialize the automation service
-    service = BrowserAutomation()
-    await service.startup()
-    
-    try:
-        # 1. Navigate to a test website
-        await service.page.goto('https://playwright.dev')
-        print("✓ Navigated to playwright.dev")
-        await asyncio.sleep(2)
-        
-        # 2. Take a screenshot
-        result = await service.take_screenshot()
-        if 'image' in result:
-            print("✓ Took initial screenshot")
-        
-        # 3. Move mouse to center and click
-        center_pos = MouseAction(
-            x=500,
-            y=300,
-            clicks=1
-        )
-        await service.move_mouse(Position(x=center_pos.x, y=center_pos.y))
-        print("✓ Moved mouse to center")
-        await asyncio.sleep(1)
-        
-        await service.click_mouse(center_pos)
-        print("✓ Clicked at center")
-        await asyncio.sleep(1)
-        
-        # 4. Type some text into search box
-        # First, click the search button
-        await service.page.click('button[type="button"]:has-text("Search")')
-        print("✓ Clicked search button")
-        await asyncio.sleep(1)
-        
-        # Type search term
-        write_action = WriteAction(
-            message="browser automation",
-            delay=0.1
-        )
-        await service.write_text(write_action)
-        print("✓ Typed search text")
-        await asyncio.sleep(2)
-        
-        # 5. Press Enter
-        enter_action = KeyboardPress(
-            keys="Enter"
-        )
-        await service.press_key(enter_action)
-        print("✓ Pressed Enter")
-        await asyncio.sleep(2)
-        
-        # 6. Demonstrate hotkeys (e.g., Ctrl+A to select all)
-        hotkey_action = HotkeyAction(
-            keys=["Control", "a"]
-        )
-        await service.press_hotkey(hotkey_action)
-        print("✓ Pressed Ctrl+A")
-        await asyncio.sleep(1)
-        
-        # 7. Take another screenshot after interactions
-        result = await service.take_screenshot()
-        if 'image' in result:
-            print("✓ Took final screenshot")
-        
-        print("\nDemo completed successfully! 🎉")
-        
-    except Exception as e:
-        print(f"Error during demo: {str(e)}", file=sys.stderr)
-        raise
-    finally:
-        # Clean up
-        await service.shutdown()
-        print("Browser closed.")
-
-def main():
-    """Main entry point"""
-    print("Browser Automation Demo")
-    print("======================")
-    asyncio.run(run_demo())
-
-if __name__ == "__main__":
-    main() 
--- a/backend/sandbox/docker/docker-compose.yml
+++ b/backend/sandbox/docker/docker-compose.yml
@ -6,7 +6,7 @@ services:
      dockerfile: ${DOCKERFILE:-Dockerfile}
      args:
        TARGETPLATFORM: ${TARGETPLATFORM:-linux/amd64}
-    image: adamcohenhillel/kortix-suna:0.0.10
+    image: adamcohenhillel/kortix-suna:0.0.13
    ports:
      - "6080:6080"  # noVNC web interface
      - "5901:5901"  # VNC port
--- a/backend/sandbox/sandbox.py
+++ b/backend/sandbox/sandbox.py
@ -78,7 +78,7 @@ def create_sandbox(password: str):
        logger.debug("OPENAI_API_KEY configured for sandbox")
    
    sandbox = daytona.create(CreateSandboxParams(
-        image="adamcohenhillel/kortix-suna:0.0.10",
+        image="adamcohenhillel/kortix-suna:0.0.13",
        public=True,
        env_vars={
            "CHROME_PERSISTENT_SESSION": "true",
--- a/frontend/src/app/dashboard/agents/[threadId]/page.tsx
+++ b/frontend/src/app/dashboard/agents/[threadId]/page.tsx
@ -282,6 +282,12 @@ export default function AgentPage({ params }: AgentPageProps) {
          part.isToolCall = !isUserMessage;
          part.status = part.isClosing ? 'completed' : 'running';
          
+          // Check if this is a browser-related tool and add VNC preview
+          if (part.tagName.includes('browser') && agent?.sandbox?.vnc_preview) {
+            console.log(`[TOOLS] Adding VNC preview from sandbox to browser tool ${part.tagName}`);
+            part.vncPreview = agent.sandbox.vnc_preview + "/vnc_lite.html?password=" + agent.sandbox.pass;
+          }
+          
          // Use ID for deduplication
          if (!seenTagIds.has(part.id)) {
            seenTagIds.add(part.id);
@ -307,6 +313,12 @@ export default function AgentPage({ params }: AgentPageProps) {
        tag.isToolCall = !isUserMessage;
        tag.status = tag.isClosing ? 'completed' : 'running';
        
+        // Check if this is a browser-related tool and add VNC preview
+        if (tag.tagName.includes('browser') && agent?.sandbox?.vnc_preview) {
+          console.log(`[TOOLS] Adding VNC preview from sandbox to browser tool ${tag.tagName}`);
+          tag.vncPreview = agent.sandbox.vnc_preview + "/vnc_lite.html?password=" + agent.sandbox.pass;
+        }
+        
        // Use ID for deduplication
        if (!seenTagIds.has(tag.id)) {
          seenTagIds.add(tag.id);
@ -381,7 +393,7 @@ export default function AgentPage({ params }: AgentPageProps) {
    
    // Update tool calls in the shared context
    setToolCalls(pairedTags);
-  }, [messages, streamContent, setToolCalls]);
+  }, [messages, streamContent, setToolCalls, agent]);
  
  // Scroll to bottom of messages
  const scrollToBottom = useCallback(() => {
--- a/frontend/src/components/chat/tool-components.tsx
+++ b/frontend/src/components/chat/tool-components.tsx
@ -4,7 +4,7 @@ import React from 'react';
 import { ParsedTag, ToolComponentProps } from '@/lib/types/tool-calls';
 import { 
  File, FileText, Terminal, FolderPlus, Folder, Code, Search as SearchIcon, 
-  Bell, Replace, Plus, Minus
+  Bell, Replace, Plus, Minus, Globe
 } from 'lucide-react';
 import { cn } from '@/lib/utils';
 import { diffLines } from 'diff';
@ -458,6 +458,69 @@ export const SearchCodeTool: React.FC<ToolComponentProps> = ({ tag, mode }) => {
  );
 };

+/**
+ * Browser Tool Component
+ *
+ * Renderer used for the `browser-*` tags (navigate, click, scroll, ...).
+ * `tag.content` is shown as the tool input and, when `tag.vncPreview` is set,
+ * that URL is embedded as a live preview in the expanded view.
+ *
+ * @param tag  - parsed tool tag (reads tagName, content, status, vncPreview)
+ * @param mode - 'compact' renders a one-line summary; anything else renders the full card
+ */
+export const BrowserNavigateTool: React.FC<ToolComponentProps> = ({ tag, mode }) => {
+  const url = tag.content || '';
+  const isRunning = tag.status === 'running';
+  // Only navigation should read "Navigating to"; other browser tools are labelled from their tag name.
+  const label = tag.tagName === 'browser-navigate-to'
+    ? (isRunning ? 'Navigating to' : 'Navigated to')
+    : `Browser ${tag.tagName.replace(/^browser-/, '').replace(/-/g, ' ')}`;
+
+  if (mode === 'compact') {
+    return <CompactToolDisplay icon={<Globe className="h-4 w-4 mr-2" />} name={label} input={url} isRunning={isRunning} />;
+  }
+
+  return (
+    <div className="border rounded-lg overflow-hidden border-subtle dark:border-white/10">
+      <div className="flex items-center px-2 py-1 text-xs font-medium border-b border-subtle dark:border-white/10 bg-background-secondary dark:bg-background-secondary text-foreground">
+        <Globe className="h-4 w-4 mr-2" />
+        <div className="flex-1">{label}: {url}</div>
+        {isRunning && (
+          <div className="flex items-center gap-2">
+            <span className="text-amber-500">Running</span>
+            <div className="h-2 w-2 rounded-full bg-amber-500 animate-pulse"></div>
+          </div>
+        )}
+      </div>
+      <div className="p-3 bg-card-bg dark:bg-background-secondary text-foreground">
+        <div className="space-y-2">
+          <div className="flex items-center gap-1 text-xs text-muted-foreground mb-1">
+            <Globe className="h-3 w-3" />
+            <span className="font-mono">{url}</span>
+          </div>
+
+          {/* Display VNC preview if available */}
+          {tag.vncPreview && (
+            <div className="mt-2 border border-subtle dark:border-white/10 rounded-md overflow-hidden">
+              <div className="text-xs bg-black text-white p-1">VNC Preview</div>
+              <div className="relative w-full h-[300px] overflow-hidden">
+                {/* Laid out at 200% and scaled by 0.5, so the page is shown at half size inside the fixed-height box. */}
+                <iframe
+                  src={tag.vncPreview}
+                  title="Browser preview"
+                  className="absolute top-0 left-0 border-0"
+                  style={{ width: '200%', height: '200%', transform: 'scale(0.5)', transformOrigin: '0 0' }}
+                  sandbox="allow-same-origin allow-scripts"
+                />
+              </div>
+            </div>
+          )}
+        </div>
+      </div>
+    </div>
+  );
+};
+
 // Tool component registry
 export const ToolComponentRegistry: Record<string, React.FC<ToolComponentProps>> = {
  'create-file': CreateFileTool,
@ -471,6 +534,19 @@ export const ToolComponentRegistry: Record<string, React.FC<ToolComponentProps>>
  'ask': NotifyTool,  // Handle ask similar to notify for now
  'complete': NotifyTool, // Handle complete similar to notify for now
  'full-file-rewrite': FullFileRewriteTool,
+  'browser-navigate-to': BrowserNavigateTool,
+  'browser-click-element': BrowserNavigateTool,
+  'browser-input-text': BrowserNavigateTool,
+  'browser-go-back': BrowserNavigateTool,
+  'browser-wait': BrowserNavigateTool,
+  'browser-scroll-down': BrowserNavigateTool,
+  'browser-scroll-up': BrowserNavigateTool,
+  'browser-scroll-to-text': BrowserNavigateTool,
+  'browser-switch-tab': BrowserNavigateTool,
+  'browser-close-tab': BrowserNavigateTool,
+  'browser-get-dropdown-options': BrowserNavigateTool,
+  'browser-select-dropdown-option': BrowserNavigateTool,
+  'browser-drag-drop': BrowserNavigateTool,
 };

 // Helper function to get the appropriate component for a tag
--- a/frontend/src/lib/api.ts
+++ b/frontend/src/lib/api.ts
@ -80,8 +80,11 @@ export type Project = {
  description: string;
  account_id: string;
  created_at: string;
-  sandbox_id?: string;
-  sandbox_pass?: string;
+  sandbox: {
+    vnc_preview?: string;
+    id?: string;
+    pass?: string;
+  };
 }

 export type Thread = {
@ -214,7 +217,8 @@ export const createProject = async (
    name: data.name,
    description: data.description || '',
    account_id: data.account_id,
-    created_at: data.created_at
+    created_at: data.created_at,
+    sandbox: { id: "", pass: "", vnc_preview: "" }
  };
 };

--- a/frontend/src/lib/types/tool-calls.ts
+++ b/frontend/src/lib/types/tool-calls.ts
@ -13,6 +13,9 @@ export interface ParsedTag {
  isToolCall?: boolean; // Whether this is a tool call (vs a result)
  isPaired?: boolean; // Whether this tag has been paired with its call/result
  status?: 'running' | 'completed' | 'error'; // Status of the tool call
+  
+  // VNC preview for browser-related tools
+  vncPreview?: string; // URL of the sandbox VNC viewer page (used as the preview iframe src)
 }

 // Display mode for tool components
@ -37,7 +40,20 @@ export const SUPPORTED_XML_TAGS = [
  'list-directory',
  'search-code',
  'complete',
-  'full-file-rewrite'
+  'full-file-rewrite',
+  'browser-navigate-to',
+  'browser-click-element',
+  'browser-input-text',
+  'browser-go-back',
+  'browser-wait',
+  'browser-scroll-down',
+  'browser-scroll-up',
+  'browser-scroll-to-text',
+  'browser-switch-tab',
+  'browser-close-tab',
+  'browser-get-dropdown-options',
+  'browser-select-dropdown-option',
+  'browser-drag-drop'
 ];

 // Tool status labels