From e51b1076a79c5b652786f1be089b66f11d433da1 Mon Sep 17 00:00:00 2001 From: Adam Cohen Hillel Date: Tue, 15 Apr 2025 17:36:01 +0100 Subject: [PATCH] fuck yeah --- backend/agent/prompt.py | 3 +- backend/agent/run.py | 45 ++++++++++--- backend/agent/tools/sb_browser_tool.py | 66 +++++++++---------- .../app/dashboard/agents/[threadId]/page.tsx | 7 ++ frontend/src/hooks/use-tools-panel.tsx | 26 -------- 5 files changed, 77 insertions(+), 70 deletions(-) diff --git a/backend/agent/prompt.py b/backend/agent/prompt.py index 08c46314..153515ae 100644 --- a/backend/agent/prompt.py +++ b/backend/agent/prompt.py @@ -65,7 +65,8 @@ You have the ability to execute operations using both Python and CLI tools: * Extract text and HTML content * Wait for elements to load * Scroll pages and handle infinite scroll - + * YOU CAN DO ANYTHING ON THE BROWSER - including clicking on elements, filling forms, submitting data, etc. + * The browser is in a sandboxed environment, so nothing to worry about. # 3. TOOLKIT & METHODOLOGY diff --git a/backend/agent/run.py b/backend/agent/run.py index f89f6f01..4f997278 100644 --- a/backend/agent/run.py +++ b/backend/agent/run.py @@ -63,12 +63,12 @@ async def run_agent(thread_id: str, project_id: str, stream: bool = True, thread } }).eq('project_id', project_id).execute() - # thread_manager.add_tool(SandboxShellTool, sandbox=sandbox) - # thread_manager.add_tool(SandboxFilesTool, sandbox=sandbox) - thread_manager.add_tool(SandboxBrowserTool, sandbox=sandbox) - # thread_manager.add_tool(SandboxDeployTool, sandbox=sandbox) - # thread_manager.add_tool(MessageTool) - # thread_manager.add_tool(WebSearchTool) + thread_manager.add_tool(SandboxShellTool, sandbox=sandbox) + thread_manager.add_tool(SandboxFilesTool, sandbox=sandbox) + thread_manager.add_tool(SandboxBrowserTool, sandbox=sandbox, thread_id=thread_id, thread_manager=thread_manager) + thread_manager.add_tool(SandboxDeployTool, sandbox=sandbox) + thread_manager.add_tool(MessageTool) + thread_manager.add_tool(WebSearchTool) xml_examples = "" for tag_name, example in thread_manager.tool_registry.get_xml_examples().items(): @@ -116,11 +116,36 @@ async def run_agent(thread_id: str, project_id: str, stream: bool = True, thread continue_execution = False break # Get the latest message from messages table that its tpye is browser_state + latest_browser_state = await client.table('messages').select('*').eq('thread_id', thread_id).eq('type', 'browser_state').order('created_at', desc=True).limit(1).execute() + temporary_message = None if latest_browser_state.data and len(latest_browser_state.data) > 0: - temporary_message = latest_browser_state.data[0].get('content', '') - else: - temporary_message = None + try: + content = json.loads(latest_browser_state.data[0]["content"]) + screenshot_base64 = content["screenshot_base64"] + # Create a copy of the browser state without screenshot + browser_state = content.copy() + browser_state.pop('screenshot_base64', None) + browser_state.pop('screenshot_url', None) + browser_state.pop('screenshot_url_base64', None) + temporary_message = { "role": "user", "content": [] } + if browser_state: + temporary_message["content"].append({ + "type": "text", + "text": f"The following is the current state of the browser:\n{browser_state}" + }) + if screenshot_base64: + temporary_message["content"].append({ + "type": "image_url", + "image_url": { + "url": f"data:image/jpeg;base64,{screenshot_base64}", + } + }) + else: + print("@@@@@ THIS TIME NO SCREENSHOT!!") + except Exception as e: + print(f"Error parsing browser state: {e}") + # print(latest_browser_state.data[0]) response = await thread_manager.run_thread( thread_id=thread_id, @@ -131,7 +156,7 @@ async def run_agent(thread_id: str, project_id: str, stream: bool = True, thread llm_max_tokens=64000, tool_choice="auto", max_xml_tool_calls=1, - # temporary_message= + temporary_message=temporary_message, processor_config=ProcessorConfig( xml_tool_calling=True, native_tool_calling=False, diff --git a/backend/agent/tools/sb_browser_tool.py b/backend/agent/tools/sb_browser_tool.py index 55f23864..0e37a5e9 100644 --- a/backend/agent/tools/sb_browser_tool.py +++ b/backend/agent/tools/sb_browser_tool.py @@ -2,6 +2,7 @@ import traceback import json from agentpress.tool import ToolResult, openapi_schema, xml_schema +from agentpress.thread_manager import ThreadManager from sandbox.sandbox import SandboxToolsBase, Sandbox from utils.logger import logger @@ -9,8 +10,10 @@ from utils.logger import logger class SandboxBrowserTool(SandboxToolsBase): """Tool for executing tasks in a Daytona sandbox with browser-use capabilities.""" - def __init__(self, sandbox: Sandbox): + def __init__(self, sandbox: Sandbox, thread_id: str, thread_manager: ThreadManager): super().__init__(sandbox) + self.thread_id = thread_id + self.thread_manager = thread_manager async def _execute_browser_action(self, endpoint: str, params: dict = None, method: str = "POST") -> ToolResult: """Execute a browser automation action through the API @@ -45,43 +48,40 @@ class SandboxBrowserTool(SandboxToolsBase): if response.exit_code == 0: try: result = json.loads(response.result) + + if not "content" in result: + result["content"] = "" + + if not "role" in result: + result["role"] = "assistant" + logger.info("Browser automation request completed successfully") - # Create a cleaned version of the result based on BrowserActionResult schema - cleaned_result = { - "success": result.get("success", False), - "message": result.get("message", ""), - "error": result.get("error", ""), - "url": result.get("url"), - "title": result.get("title"), - "elements": result.get("elements"), - "pixels_above": result.get("pixels_above", 0), - "pixels_below": result.get("pixels_below", 0), - "content": result.get("content"), - "element_count": result.get("element_count", 0), - "interactive_elements": result.get("interactive_elements"), - "viewport_width": result.get("viewport_width"), - "viewport_height": result.get("viewport_height") + # Add full result to thread messages for state tracking + await self.thread_manager.add_message( + thread_id=self.thread_id, + type="browser_state", + content=result, + is_llm_message=False + ) + + # Return tool-specific success response + success_response = { + "success": True, + "message": result.get("message", "Browser action completed successfully") } - # Print screenshot info to console but don't return it - if "screenshot_base64" in result: - has_screenshot = bool(result.get("screenshot_base64")) - print(f"\033[95mScreenshot captured: {has_screenshot}\033[0m") + # Add relevant browser-specific info + if result.get("url"): + success_response["url"] = result["url"] + if result.get("title"): + success_response["title"] = result["title"] + if result.get("element_count"): + success_response["elements_found"] = result["element_count"] + if result.get("pixels_below"): + success_response["scrollable_content"] = result["pixels_below"] > 0 - # Print viewport info if available - if cleaned_result["viewport_width"] and cleaned_result["viewport_height"]: - print(f"\033[95mViewport size: {cleaned_result['viewport_width']}x{cleaned_result['viewport_height']}\033[0m") - - # Print interactive elements count - if cleaned_result["element_count"] > 0: - print(f"\033[95mFound {cleaned_result['element_count']} interactive elements\033[0m") - - print("************************************************") - print(cleaned_result) - print("************************************************") - - return self.success_response(cleaned_result) + return self.success_response(success_response) except json.JSONDecodeError: logger.error(f"Failed to parse response JSON: {response.result}") diff --git a/frontend/src/app/dashboard/agents/[threadId]/page.tsx b/frontend/src/app/dashboard/agents/[threadId]/page.tsx index b3826236..c2fe6273 100644 --- a/frontend/src/app/dashboard/agents/[threadId]/page.tsx +++ b/frontend/src/app/dashboard/agents/[threadId]/page.tsx @@ -764,6 +764,10 @@ export default function AgentPage({ params }: AgentPageProps) { <> {messages.map((message, index) => { // Skip messages containing "ToolResult(" + if (!message || !message?.content || !message?.role) { + return null; + } + if (message.content.includes("ToolResult(")) { return null; } @@ -939,6 +943,9 @@ export default function AgentPage({ params }: AgentPageProps) { <> {messages.map((message, index) => { // Skip messages containing "ToolResult(" + if (!message || !message?.content || !message?.role) { + return null; + } if (message.content.includes("ToolResult(")) { return null; } diff --git a/frontend/src/hooks/use-tools-panel.tsx b/frontend/src/hooks/use-tools-panel.tsx index 5552c206..6247ff66 100644 --- a/frontend/src/hooks/use-tools-panel.tsx +++ b/frontend/src/hooks/use-tools-panel.tsx @@ -175,29 +175,3 @@ export function useToolsPanel() { prevTool, }; } - -// Helper function to get a friendly title for a tool call -function getToolTitle(tag: ParsedTag): string { - switch (tag.tagName) { - case 'create-file': - return `Creating file: ${tag.attributes.file_path || ''}`; - case 'read-file': - return `Reading file: ${tag.attributes.file_path || ''}`; - case 'execute-command': - return `Executing: ${tag.attributes.command || ''}`; - case 'create-directory': - return `Creating directory: ${tag.attributes.path || ''}`; - case 'list-directory': - return `Listing directory: ${tag.attributes.path || ''}`; - case 'search-code': - return `Searching code: ${tag.attributes.query || ''}`; - case 'notify': - return `Notification: ${tag.attributes.message || ''}`; - case 'str-replace': - return `String replace: ${tag.attributes.pattern || ''}`; - case 'full-file-rewrite': - return `Full file rewrite: ${tag.attributes.file_path || ''}`; - default: - return `${tag.tagName} operation`; - } -} \ No newline at end of file