From e51b1076a79c5b652786f1be089b66f11d433da1 Mon Sep 17 00:00:00 2001
From: Adam Cohen Hillel <adamcohenhillel@gmail.com>
Date: Tue, 15 Apr 2025 17:36:01 +0100
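Subject: [PATCH] Re-enable agent tools and feed browser state to the LLM

- prompt.py: tell the agent it may click, fill forms and submit data in
  the sandboxed browser.
- run.py: re-enable the shell, files, deploy, message and web-search
  tools; pass thread_id and thread_manager to SandboxBrowserTool; build
  a temporary user message from the latest `browser_state` message
  (state as text plus the screenshot as a base64 image_url) and pass it
  to run_thread.
- sb_browser_tool.py: store the full browser result on the thread as a
  `browser_state` message and return only a trimmed success response
  (message, url, title, element count, scrollable flag) to the caller.
- frontend: skip rendering messages that have no content or role, and
  remove the getToolTitle helper from use-tools-panel.tsx.
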
---
 backend/agent/prompt.py                       |  3 +-
 backend/agent/run.py                          | 45 ++++++++++---
 backend/agent/tools/sb_browser_tool.py        | 66 +++++++++----------
 .../app/dashboard/agents/[threadId]/page.tsx  |  7 ++
 frontend/src/hooks/use-tools-panel.tsx        | 26 --------
 5 files changed, 77 insertions(+), 70 deletions(-)

diff --git a/backend/agent/prompt.py b/backend/agent/prompt.py
index 08c46314..153515ae 100644
--- a/backend/agent/prompt.py
+++ b/backend/agent/prompt.py
@@ -65,7 +65,8 @@ You have the ability to execute operations using both Python and CLI tools:
   * Extract text and HTML content
   * Wait for elements to load
   * Scroll pages and handle infinite scroll
-
+  * YOU CAN DO ANYTHING ON THE BROWSER - including clicking on elements, filling forms, submitting data, etc.
+  * The browser is in a sandboxed environment, so nothing to worry about.
 
 # 3. TOOLKIT & METHODOLOGY
 
diff --git a/backend/agent/run.py b/backend/agent/run.py
index f89f6f01..4f997278 100644
--- a/backend/agent/run.py
+++ b/backend/agent/run.py
@@ -63,12 +63,12 @@ async def run_agent(thread_id: str, project_id: str, stream: bool = True, thread
             }
         }).eq('project_id', project_id).execute()
     
-    # thread_manager.add_tool(SandboxShellTool, sandbox=sandbox)
-    # thread_manager.add_tool(SandboxFilesTool, sandbox=sandbox)
-    thread_manager.add_tool(SandboxBrowserTool, sandbox=sandbox)
-    # thread_manager.add_tool(SandboxDeployTool, sandbox=sandbox)
-    # thread_manager.add_tool(MessageTool)
-    # thread_manager.add_tool(WebSearchTool)
+    thread_manager.add_tool(SandboxShellTool, sandbox=sandbox)
+    thread_manager.add_tool(SandboxFilesTool, sandbox=sandbox)
+    thread_manager.add_tool(SandboxBrowserTool, sandbox=sandbox, thread_id=thread_id, thread_manager=thread_manager)
+    thread_manager.add_tool(SandboxDeployTool, sandbox=sandbox)
+    thread_manager.add_tool(MessageTool)
+    thread_manager.add_tool(WebSearchTool)
 
     xml_examples = ""
     for tag_name, example in thread_manager.tool_registry.get_xml_examples().items():
@@ -116,11 +116,36 @@ async def run_agent(thread_id: str, project_id: str, stream: bool = True, thread
                 continue_execution = False
                 break
         # Get the latest message from messages table that its tpye is browser_state
+        
         latest_browser_state = await client.table('messages').select('*').eq('thread_id', thread_id).eq('type', 'browser_state').order('created_at', desc=True).limit(1).execute()
+        temporary_message = None
         if latest_browser_state.data and len(latest_browser_state.data) > 0:
-            temporary_message = latest_browser_state.data[0].get('content', '')
-        else:
-            temporary_message = None
+            try:
+                content = json.loads(latest_browser_state.data[0]["content"])
+                screenshot_base64 = content["screenshot_base64"]
+                # Create a copy of the browser state without screenshot
+                browser_state = content.copy()
+                browser_state.pop('screenshot_base64', None)
+                browser_state.pop('screenshot_url', None) 
+                browser_state.pop('screenshot_url_base64', None)
+                temporary_message = { "role": "user", "content": [] }
+                if browser_state:
+                    temporary_message["content"].append({
+                        "type": "text",
+                        "text": f"The following is the current state of the browser:\n{browser_state}"
+                    })
+                if screenshot_base64:
+                    temporary_message["content"].append({
+                        "type": "image_url",
+                            "image_url": {
+                                "url": f"data:image/jpeg;base64,{screenshot_base64}",
+                            }
+                    })
+                else:
+                    print("@@@@@ THIS TIME NO SCREENSHOT!!")
+            except Exception as e:
+                print(f"Error parsing browser state: {e}")
+                # print(latest_browser_state.data[0])
 
         response = await thread_manager.run_thread(
             thread_id=thread_id,
@@ -131,7 +156,7 @@ async def run_agent(thread_id: str, project_id: str, stream: bool = True, thread
             llm_max_tokens=64000,
             tool_choice="auto",
             max_xml_tool_calls=1,
-            # temporary_message=
+            temporary_message=temporary_message,
             processor_config=ProcessorConfig(
                 xml_tool_calling=True,
                 native_tool_calling=False,
diff --git a/backend/agent/tools/sb_browser_tool.py b/backend/agent/tools/sb_browser_tool.py
index 55f23864..0e37a5e9 100644
--- a/backend/agent/tools/sb_browser_tool.py
+++ b/backend/agent/tools/sb_browser_tool.py
@@ -2,6 +2,7 @@ import traceback
 import json
 
 from agentpress.tool import ToolResult, openapi_schema, xml_schema
+from agentpress.thread_manager import ThreadManager
 from sandbox.sandbox import SandboxToolsBase, Sandbox
 from utils.logger import logger
 
@@ -9,8 +10,10 @@ from utils.logger import logger
 class SandboxBrowserTool(SandboxToolsBase):
     """Tool for executing tasks in a Daytona sandbox with browser-use capabilities."""
     
-    def __init__(self, sandbox: Sandbox):
+    def __init__(self, sandbox: Sandbox, thread_id: str, thread_manager: ThreadManager):
         super().__init__(sandbox)
+        self.thread_id = thread_id
+        self.thread_manager = thread_manager
 
     async def _execute_browser_action(self, endpoint: str, params: dict = None, method: str = "POST") -> ToolResult:
         """Execute a browser automation action through the API
@@ -45,43 +48,40 @@ class SandboxBrowserTool(SandboxToolsBase):
             if response.exit_code == 0:
                 try:
                     result = json.loads(response.result)
+
+                    if not "content" in result:
+                        result["content"] = ""
+                    
+                    if not "role" in result:
+                        result["role"] = "assistant"
+
                     logger.info("Browser automation request completed successfully")
 
-                    # Create a cleaned version of the result based on BrowserActionResult schema
-                    cleaned_result = {
-                        "success": result.get("success", False),
-                        "message": result.get("message", ""),
-                        "error": result.get("error", ""),
-                        "url": result.get("url"),
-                        "title": result.get("title"),
-                        "elements": result.get("elements"),
-                        "pixels_above": result.get("pixels_above", 0),
-                        "pixels_below": result.get("pixels_below", 0),
-                        "content": result.get("content"),
-                        "element_count": result.get("element_count", 0),
-                        "interactive_elements": result.get("interactive_elements"),
-                        "viewport_width": result.get("viewport_width"),
-                        "viewport_height": result.get("viewport_height")
+                    # Add full result to thread messages for state tracking
+                    await self.thread_manager.add_message(
+                        thread_id=self.thread_id,
+                        type="browser_state",
+                        content=result,
+                        is_llm_message=False
+                    )
+
+                    # Return tool-specific success response
+                    success_response = {
+                        "success": True,
+                        "message": result.get("message", "Browser action completed successfully")
                     }
 
-                    # Print screenshot info to console but don't return it
-                    if "screenshot_base64" in result:
-                        has_screenshot = bool(result.get("screenshot_base64"))
-                        print(f"\033[95mScreenshot captured: {has_screenshot}\033[0m")
+                    # Add relevant browser-specific info
+                    if result.get("url"):
+                        success_response["url"] = result["url"]
+                    if result.get("title"):
+                        success_response["title"] = result["title"]
+                    if result.get("element_count"):
+                        success_response["elements_found"] = result["element_count"]
+                    if result.get("pixels_below"):
+                        success_response["scrollable_content"] = result["pixels_below"] > 0
 
-                    # Print viewport info if available
-                    if cleaned_result["viewport_width"] and cleaned_result["viewport_height"]:
-                        print(f"\033[95mViewport size: {cleaned_result['viewport_width']}x{cleaned_result['viewport_height']}\033[0m")
-
-                    # Print interactive elements count
-                    if cleaned_result["element_count"] > 0:
-                        print(f"\033[95mFound {cleaned_result['element_count']} interactive elements\033[0m")
-
-                    print("************************************************")
-                    print(cleaned_result)
-                    print("************************************************")
-
-                    return self.success_response(cleaned_result)
+                    return self.success_response(success_response)
 
                 except json.JSONDecodeError:
                     logger.error(f"Failed to parse response JSON: {response.result}")
diff --git a/frontend/src/app/dashboard/agents/[threadId]/page.tsx b/frontend/src/app/dashboard/agents/[threadId]/page.tsx
index b3826236..c2fe6273 100644
--- a/frontend/src/app/dashboard/agents/[threadId]/page.tsx
+++ b/frontend/src/app/dashboard/agents/[threadId]/page.tsx
@@ -764,6 +764,10 @@ export default function AgentPage({ params }: AgentPageProps) {
               <>
                 {messages.map((message, index) => {
                   // Skip messages containing "ToolResult("
+                  if (!message || !message?.content || !message?.role) {
+                    return null;
+                  }
+
                   if (message.content.includes("ToolResult(")) {
                     return null;
                   }
@@ -939,6 +943,9 @@ export default function AgentPage({ params }: AgentPageProps) {
           <>
             {messages.map((message, index) => {
               // Skip messages containing "ToolResult("
+              if (!message || !message?.content || !message?.role) {
+                return null;
+              }
               if (message.content.includes("ToolResult(")) {
                 return null;
               }
diff --git a/frontend/src/hooks/use-tools-panel.tsx b/frontend/src/hooks/use-tools-panel.tsx
index 5552c206..6247ff66 100644
--- a/frontend/src/hooks/use-tools-panel.tsx
+++ b/frontend/src/hooks/use-tools-panel.tsx
@@ -175,29 +175,3 @@ export function useToolsPanel() {
     prevTool,
   };
 }
-
-// Helper function to get a friendly title for a tool call
-function getToolTitle(tag: ParsedTag): string {
-  switch (tag.tagName) {
-    case 'create-file':
-      return `Creating file: ${tag.attributes.file_path || ''}`;
-    case 'read-file':
-      return `Reading file: ${tag.attributes.file_path || ''}`;
-    case 'execute-command':
-      return `Executing: ${tag.attributes.command || ''}`;
-    case 'create-directory':
-      return `Creating directory: ${tag.attributes.path || ''}`;
-    case 'list-directory':
-      return `Listing directory: ${tag.attributes.path || ''}`;
-    case 'search-code':
-      return `Searching code: ${tag.attributes.query || ''}`;
-    case 'notify':
-      return `Notification: ${tag.attributes.message || ''}`;
-    case 'str-replace':
-      return `String replace: ${tag.attributes.pattern || ''}`;
-    case 'full-file-rewrite':
-      return `Full file rewrite: ${tag.attributes.file_path || ''}`;
-    default:
-      return `${tag.tagName} operation`;
-  }
-} 
\ No newline at end of file