Merge pull request #36 from kortix-ai/bring-back-browser-use

Bring back using browser with pure playwright
This commit is contained in:
Adam Cohen Hillel 2025-04-15 19:11:29 +01:00 committed by GitHub
commit f0d7392d3b
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
17 changed files with 2915 additions and 281 deletions

View File

@ -57,6 +57,17 @@ You have the ability to execute operations using both Python and CLI tools:
- Finding recent news, articles, and information beyond training data
- Crawling webpage content for detailed information extraction
### 2.2.5 BROWSER TOOLS AND CAPABILITIES
- BROWSER OPERATIONS:
* Navigate to URLs and manage history
* Fill forms and submit data
* Click elements and interact with pages
* Extract text and HTML content
* Wait for elements to load
* Scroll pages and handle infinite scroll
* YOU CAN DO ANYTHING ON THE BROWSER - including clicking on elements, filling forms, submitting data, etc.
* The browser is in a sandboxed environment, so nothing to worry about.
# 3. TOOLKIT & METHODOLOGY
## 3.1 TOOL SELECTION PRINCIPLES

View File

@ -12,6 +12,7 @@ from agentpress.thread_manager import ThreadManager
from agentpress.response_processor import ProcessorConfig
from agent.tools.sb_shell_tool import SandboxShellTool
from agent.tools.sb_files_tool import SandboxFilesTool
from agent.tools.sb_browser_tool import SandboxBrowserTool
from agent.prompt import get_system_prompt
from sandbox.sandbox import daytona, create_sandbox, get_or_start_sandbox
from utils.billing import check_billing_status, get_account_id_from_thread
@ -52,22 +53,28 @@ async def run_agent(thread_id: str, project_id: str, stream: bool = True, thread
else:
sandbox_pass = str(uuid4())
sandbox = create_sandbox(sandbox_pass)
print(f"\033[91m{sandbox.get_preview_link(6080)}/vnc_lite.html?password={sandbox_pass}\033[0m")
sandbox_id = sandbox.id
await client.table('projects').update({
'sandbox': {
'id': sandbox_id,
'pass': sandbox_pass
'pass': sandbox_pass,
'vnc_preview': sandbox.get_preview_link(6080)
}
}).eq('project_id', project_id).execute()
# thread_manager.add_tool(SandboxBrowseTool, sandbox=sandbox)
thread_manager.add_tool(SandboxShellTool, sandbox=sandbox)
thread_manager.add_tool(SandboxFilesTool, sandbox=sandbox)
thread_manager.add_tool(SandboxBrowserTool, sandbox=sandbox, thread_id=thread_id, thread_manager=thread_manager)
thread_manager.add_tool(SandboxDeployTool, sandbox=sandbox)
thread_manager.add_tool(MessageTool)
thread_manager.add_tool(WebSearchTool)
thread_manager.add_tool(SandboxDeployTool, sandbox=sandbox)
system_message = { "role": "system", "content": get_system_prompt() }
xml_examples = ""
for tag_name, example in thread_manager.tool_registry.get_xml_examples().items():
xml_examples += f"{example}\n"
system_message = { "role": "system", "content": get_system_prompt() + "\n\n" + f"<tool_examples>\n{xml_examples}\n</tool_examples>" }
model_name = "anthropic/claude-3-7-sonnet-latest"
# model_name = "groq/llama-3.3-70b-versatile"
@ -108,6 +115,37 @@ async def run_agent(thread_id: str, project_id: str, stream: bool = True, thread
print(f"Last message was from assistant, stopping execution")
continue_execution = False
break
# Get the latest message from the messages table whose type is browser_state
latest_browser_state = await client.table('messages').select('*').eq('thread_id', thread_id).eq('type', 'browser_state').order('created_at', desc=True).limit(1).execute()
temporary_message = None
if latest_browser_state.data and len(latest_browser_state.data) > 0:
try:
content = json.loads(latest_browser_state.data[0]["content"])
screenshot_base64 = content["screenshot_base64"]
# Create a copy of the browser state without screenshot
browser_state = content.copy()
browser_state.pop('screenshot_base64', None)
browser_state.pop('screenshot_url', None)
browser_state.pop('screenshot_url_base64', None)
temporary_message = { "role": "user", "content": [] }
if browser_state:
temporary_message["content"].append({
"type": "text",
"text": f"The following is the current state of the browser:\n{browser_state}"
})
if screenshot_base64:
temporary_message["content"].append({
"type": "image_url",
"image_url": {
"url": f"data:image/jpeg;base64,{screenshot_base64}",
}
})
else:
print("@@@@@ THIS TIME NO SCREENSHOT!!")
except Exception as e:
print(f"Error parsing browser state: {e}")
# print(latest_browser_state.data[0])
response = await thread_manager.run_thread(
thread_id=thread_id,
@ -115,9 +153,10 @@ async def run_agent(thread_id: str, project_id: str, stream: bool = True, thread
stream=stream,
llm_model=model_name,
llm_temperature=0,
llm_max_tokens=64000,
llm_max_tokens=128000,
tool_choice="auto",
max_xml_tool_calls=1,
temporary_message=temporary_message,
processor_config=ProcessorConfig(
xml_tool_calling=True,
native_tool_calling=False,

View File

@ -0,0 +1,846 @@
import traceback
import json
from agentpress.tool import ToolResult, openapi_schema, xml_schema
from agentpress.thread_manager import ThreadManager
from sandbox.sandbox import SandboxToolsBase, Sandbox
from utils.logger import logger
class SandboxBrowserTool(SandboxToolsBase):
"""Tool for executing tasks in a Daytona sandbox with browser-use capabilities."""
def __init__(self, sandbox: Sandbox, thread_id: str, thread_manager: ThreadManager):
    """Bind the tool to a sandbox and to the thread it reports into.

    Args:
        sandbox: Sandbox handed to the base class; browser API calls are
            executed through it.
        thread_id: Thread that receives the ``browser_state`` messages.
        thread_manager: Manager used to append those messages.
    """
    super().__init__(sandbox)
    self.thread_manager = thread_manager
    self.thread_id = thread_id
async def _execute_browser_action(self, endpoint: str, params: dict = None, method: str = "POST") -> ToolResult:
    """Call the in-sandbox browser automation API through ``curl``.

    The request is sent to ``http://localhost:8002/api/automation/<endpoint>``
    by running ``curl`` with ``self.sandbox.process.exec``. On success the full
    parsed JSON reply is stored on the thread as a ``browser_state`` message
    and a trimmed summary is returned to the caller.

    Args:
        endpoint (str): The API endpoint to call.
        params (dict, optional): Parameters to send. For ``GET`` they become
            the query string; otherwise they are sent as a JSON body.
            Defaults to None.
        method (str, optional): HTTP method to use. Defaults to "POST".

    Returns:
        ToolResult: Success summary (message plus url/title/element count/
        scrollability when present in the reply) or a failure response.
    """
    # Function-scope imports keep this block self-contained; the file's
    # import header does not provide these names.
    import shlex
    from urllib.parse import urlencode

    try:
        url = f"http://localhost:8002/api/automation/{endpoint}"
        argv = ["curl", "-s", "-X", method]

        if method == "GET" and params:
            # URL-encode so values containing '&', '=', spaces, etc. survive.
            url = f"{url}?{urlencode(params)}"

        argv += [url, "-H", "Content-Type: application/json"]

        if method != "GET" and params:
            argv += ["-d", json.dumps(params)]

        # Quote every argument instead of pasting values between hand-written
        # single quotes: a "'" inside the JSON body (e.g. text "it's") used to
        # terminate the quoted argument and break or hijack the command.
        # NOTE(review): assumes process.exec runs the string through a shell,
        # as the original hand-quoted command implied - confirm.
        curl_cmd = " ".join(shlex.quote(arg) for arg in argv)

        print("\033[95mExecuting curl command:\033[0m")
        print(f"{curl_cmd}")

        response = self.sandbox.process.exec(curl_cmd, timeout=30)

        if response.exit_code != 0:
            logger.error(f"Browser automation request failed: {response.result}")
            return self.fail_response(f"Browser automation request failed: {response.result}")

        # Only the decode itself can raise JSONDecodeError; keep the try tight.
        try:
            result = json.loads(response.result)
        except json.JSONDecodeError:
            logger.error(f"Failed to parse response JSON: {response.result}")
            return self.fail_response(f"Failed to parse response JSON: {response.result}")

        # The code below treats the reply as a JSON object; reject anything
        # else explicitly rather than failing later with a TypeError.
        if not isinstance(result, dict):
            logger.error(f"Unexpected browser automation response: {response.result}")
            return self.fail_response(f"Unexpected browser automation response: {response.result}")

        result.setdefault("content", "")
        result.setdefault("role", "assistant")

        logger.info("Browser automation request completed successfully")

        # Add full result to thread messages for state tracking
        await self.thread_manager.add_message(
            thread_id=self.thread_id,
            type="browser_state",
            content=result,
            is_llm_message=False
        )

        # Return tool-specific success response
        success_response = {
            "success": True,
            "message": result.get("message", "Browser action completed successfully")
        }

        # Add relevant browser-specific info
        if result.get("url"):
            success_response["url"] = result["url"]
        if result.get("title"):
            success_response["title"] = result["title"]
        if result.get("element_count"):
            success_response["elements_found"] = result["element_count"]
        if result.get("pixels_below"):
            success_response["scrollable_content"] = result["pixels_below"] > 0

        return self.success_response(success_response)

    except Exception as e:
        # Tool boundary: report any failure to the caller as a ToolResult.
        logger.error(f"Error executing browser action: {e}")
        print(traceback.format_exc())
        return self.fail_response(f"Error executing browser action: {e}")
@openapi_schema({
    "type": "function",
    "function": {
        "name": "browser_navigate_to",
        "description": "Navigate to a specific url",
        "parameters": {
            "type": "object",
            "properties": {
                "url": {
                    "type": "string",
                    "description": "The url to navigate to"
                }
            },
            "required": ["url"]
        }
    }
})
@xml_schema(
    tag_name="browser-navigate-to",
    mappings=[
        {"param_name": "url", "node_type": "content", "path": "."}
    ],
    example='''
<browser-navigate-to>
https://example.com
</browser-navigate-to>
'''
)
async def browser_navigate_to(self, url: str) -> ToolResult:
    """Load ``url`` in the sandbox browser.

    Args:
        url (str): Address to open.

    Returns:
        ToolResult: Outcome reported by the ``navigate_to`` endpoint.
    """
    print(f"\033[95mNavigating to: {url}\033[0m")
    payload = {"url": url}
    return await self._execute_browser_action("navigate_to", payload)
# @openapi_schema({
# "type": "function",
# "function": {
# "name": "browser_search_google",
# "description": "Search Google with the provided query",
# "parameters": {
# "type": "object",
# "properties": {
# "query": {
# "type": "string",
# "description": "The search query to use"
# }
# },
# "required": ["query"]
# }
# }
# })
# @xml_schema(
# tag_name="browser-search-google",
# mappings=[
# {"param_name": "query", "node_type": "content", "path": "."}
# ],
# example='''
# <browser-search-google>
# artificial intelligence news
# </browser-search-google>
# '''
# )
# async def browser_search_google(self, query: str) -> ToolResult:
# """Search Google with the provided query
# Args:
# query (str): The search query to use
# Returns:
# dict: Result of the execution
# """
# print(f"\033[95mSearching Google for: {query}\033[0m")
# return await self._execute_browser_action("search_google", {"query": query})
@openapi_schema({
    "type": "function",
    "function": {
        "name": "browser_go_back",
        "description": "Navigate back in browser history",
        "parameters": {
            "type": "object",
            "properties": {}
        }
    }
})
@xml_schema(
    tag_name="browser-go-back",
    mappings=[],
    example='''
<browser-go-back></browser-go-back>
'''
)
async def browser_go_back(self) -> ToolResult:
    """Step one entry back in the browser history.

    Returns:
        ToolResult: Outcome reported by the ``go_back`` endpoint.
    """
    print("\033[95mNavigating back in browser history\033[0m")
    no_params = {}
    return await self._execute_browser_action("go_back", no_params)
@openapi_schema({
    "type": "function",
    "function": {
        "name": "browser_wait",
        "description": "Wait for the specified number of seconds",
        "parameters": {
            "type": "object",
            "properties": {
                "seconds": {
                    "type": "integer",
                    "description": "Number of seconds to wait (default: 3)"
                }
            }
        }
    }
})
@xml_schema(
    tag_name="browser-wait",
    mappings=[
        {"param_name": "seconds", "node_type": "content", "path": "."}
    ],
    example='''
<browser-wait>
5
</browser-wait>
'''
)
async def browser_wait(self, seconds: int = 3) -> ToolResult:
    """Ask the browser API to pause for a number of seconds.

    Args:
        seconds (int, optional): How long to wait. Defaults to 3.

    Returns:
        ToolResult: Outcome reported by the ``wait`` endpoint.
    """
    print(f"\033[95mWaiting for {seconds} seconds\033[0m")
    payload = {"seconds": seconds}
    return await self._execute_browser_action("wait", payload)
@openapi_schema({
    "type": "function",
    "function": {
        "name": "browser_click_element",
        "description": "Click on an element by index",
        "parameters": {
            "type": "object",
            "properties": {
                "index": {
                    "type": "integer",
                    "description": "The index of the element to click"
                }
            },
            "required": ["index"]
        }
    }
})
@xml_schema(
    tag_name="browser-click-element",
    mappings=[
        {"param_name": "index", "node_type": "content", "path": "."}
    ],
    example='''
<browser-click-element>
2
</browser-click-element>
'''
)
async def browser_click_element(self, index: int) -> ToolResult:
    """Click the page element identified by ``index``.

    Args:
        index (int): Index of the element to click.

    Returns:
        ToolResult: Outcome reported by the ``click_element`` endpoint.
    """
    print(f"\033[95mClicking element with index: {index}\033[0m")
    payload = {"index": index}
    return await self._execute_browser_action("click_element", payload)
@openapi_schema({
    "type": "function",
    "function": {
        "name": "browser_input_text",
        "description": "Input text into an element",
        "parameters": {
            "type": "object",
            "properties": {
                "index": {
                    "type": "integer",
                    "description": "The index of the element to input text into"
                },
                "text": {
                    "type": "string",
                    "description": "The text to input"
                }
            },
            "required": ["index", "text"]
        }
    }
})
@xml_schema(
    tag_name="browser-input-text",
    mappings=[
        {"param_name": "index", "node_type": "attribute", "path": "."},
        {"param_name": "text", "node_type": "content", "path": "."}
    ],
    example='''
<browser-input-text index="2">
Hello, world!
</browser-input-text>
'''
)
async def browser_input_text(self, index: int, text: str) -> ToolResult:
    """Type ``text`` into the element identified by ``index``.

    Args:
        index (int): Index of the target element.
        text (str): Text to enter.

    Returns:
        ToolResult: Outcome reported by the ``input_text`` endpoint.
    """
    print(f"\033[95mInputting text into element {index}: {text}\033[0m")
    payload = {"index": index, "text": text}
    return await self._execute_browser_action("input_text", payload)
@openapi_schema({
    "type": "function",
    "function": {
        "name": "browser_send_keys",
        "description": "Send keyboard keys such as Enter, Escape, or keyboard shortcuts",
        "parameters": {
            "type": "object",
            "properties": {
                "keys": {
                    "type": "string",
                    "description": "The keys to send (e.g., 'Enter', 'Escape', 'Control+a')"
                }
            },
            "required": ["keys"]
        }
    }
})
@xml_schema(
    tag_name="browser-send-keys",
    mappings=[
        {"param_name": "keys", "node_type": "content", "path": "."}
    ],
    example='''
<browser-send-keys>
Enter
</browser-send-keys>
'''
)
async def browser_send_keys(self, keys: str) -> ToolResult:
    """Send a key or key combination to the browser.

    Args:
        keys (str): Keys to send, e.g. 'Enter', 'Escape', 'Control+a'.

    Returns:
        ToolResult: Outcome reported by the ``send_keys`` endpoint.
    """
    print(f"\033[95mSending keys: {keys}\033[0m")
    payload = {"keys": keys}
    return await self._execute_browser_action("send_keys", payload)
@openapi_schema({
    "type": "function",
    "function": {
        "name": "browser_switch_tab",
        "description": "Switch to a different browser tab",
        "parameters": {
            "type": "object",
            "properties": {
                "page_id": {
                    "type": "integer",
                    "description": "The ID of the tab to switch to"
                }
            },
            "required": ["page_id"]
        }
    }
})
@xml_schema(
    tag_name="browser-switch-tab",
    mappings=[
        {"param_name": "page_id", "node_type": "content", "path": "."}
    ],
    example='''
<browser-switch-tab>
1
</browser-switch-tab>
'''
)
async def browser_switch_tab(self, page_id: int) -> ToolResult:
    """Make the tab identified by ``page_id`` the active one.

    Args:
        page_id (int): ID of the tab to switch to.

    Returns:
        ToolResult: Outcome reported by the ``switch_tab`` endpoint.
    """
    print(f"\033[95mSwitching to tab: {page_id}\033[0m")
    payload = {"page_id": page_id}
    return await self._execute_browser_action("switch_tab", payload)
# @openapi_schema({
# "type": "function",
# "function": {
# "name": "browser_open_tab",
# "description": "Open a new browser tab with the specified URL",
# "parameters": {
# "type": "object",
# "properties": {
# "url": {
# "type": "string",
# "description": "The URL to open in the new tab"
# }
# },
# "required": ["url"]
# }
# }
# })
# @xml_schema(
# tag_name="browser-open-tab",
# mappings=[
# {"param_name": "url", "node_type": "content", "path": "."}
# ],
# example='''
# <browser-open-tab>
# https://example.com
# </browser-open-tab>
# '''
# )
# async def browser_open_tab(self, url: str) -> ToolResult:
# """Open a new browser tab with the specified URL
# Args:
# url (str): The URL to open in the new tab
# Returns:
# dict: Result of the execution
# """
# print(f"\033[95mOpening new tab with URL: {url}\033[0m")
# return await self._execute_browser_action("open_tab", {"url": url})
@openapi_schema({
    "type": "function",
    "function": {
        "name": "browser_close_tab",
        "description": "Close a browser tab",
        "parameters": {
            "type": "object",
            "properties": {
                "page_id": {
                    "type": "integer",
                    "description": "The ID of the tab to close"
                }
            },
            "required": ["page_id"]
        }
    }
})
@xml_schema(
    tag_name="browser-close-tab",
    mappings=[
        {"param_name": "page_id", "node_type": "content", "path": "."}
    ],
    example='''
<browser-close-tab>
1
</browser-close-tab>
'''
)
async def browser_close_tab(self, page_id: int) -> ToolResult:
    """Close the tab identified by ``page_id``.

    Args:
        page_id (int): ID of the tab to close.

    Returns:
        ToolResult: Outcome reported by the ``close_tab`` endpoint.
    """
    print(f"\033[95mClosing tab: {page_id}\033[0m")
    payload = {"page_id": page_id}
    return await self._execute_browser_action("close_tab", payload)
# @openapi_schema({
# "type": "function",
# "function": {
# "name": "browser_extract_content",
# "description": "Extract content from the current page based on the provided goal",
# "parameters": {
# "type": "object",
# "properties": {
# "goal": {
# "type": "string",
# "description": "The extraction goal (e.g., 'extract all links', 'find product information')"
# }
# },
# "required": ["goal"]
# }
# }
# })
# @xml_schema(
# tag_name="browser-extract-content",
# mappings=[
# {"param_name": "goal", "node_type": "content", "path": "."}
# ],
# example='''
# <browser-extract-content>
# Extract all links on the page
# </browser-extract-content>
# '''
# )
# async def browser_extract_content(self, goal: str) -> ToolResult:
# """Extract content from the current page based on the provided goal
# Args:
# goal (str): The extraction goal
# Returns:
# dict: Result of the execution
# """
# print(f"\033[95mExtracting content with goal: {goal}\033[0m")
# result = await self._execute_browser_action("extract_content", {"goal": goal})
# # Format content for better readability
# if result.get("success"):
# print(f"\033[92mContent extraction successful\033[0m")
# content = result.data.get("content", "")
# url = result.data.get("url", "")
# title = result.data.get("title", "")
# if content:
# content_preview = content[:200] + "..." if len(content) > 200 else content
# print(f"\033[95mExtracted content from {title} ({url}):\033[0m")
# print(f"\033[96m{content_preview}\033[0m")
# print(f"\033[95mTotal content length: {len(content)} characters\033[0m")
# else:
# print(f"\033[93mNo content extracted from {url}\033[0m")
# else:
# print(f"\033[91mFailed to extract content: {result.data.get('error', 'Unknown error')}\033[0m")
# return result
@openapi_schema({
    "type": "function",
    "function": {
        "name": "browser_scroll_down",
        "description": "Scroll down the page",
        "parameters": {
            "type": "object",
            "properties": {
                "amount": {
                    "type": "integer",
                    "description": "Pixel amount to scroll (if not specified, scrolls one page)"
                }
            }
        }
    }
})
@xml_schema(
    tag_name="browser-scroll-down",
    mappings=[
        {"param_name": "amount", "node_type": "content", "path": "."}
    ],
    example='''
<browser-scroll-down>
500
</browser-scroll-down>
'''
)
async def browser_scroll_down(self, amount: int = None) -> ToolResult:
    """Scroll the page downwards.

    Args:
        amount (int, optional): Pixels to scroll. When None, no amount is
            sent and the request is described as scrolling one page.

    Returns:
        ToolResult: Outcome reported by the ``scroll_down`` endpoint.
    """
    if amount is None:
        payload = {}
        print("\033[95mScrolling down one page\033[0m")
    else:
        payload = {"amount": amount}
        print(f"\033[95mScrolling down by {amount} pixels\033[0m")
    return await self._execute_browser_action("scroll_down", payload)
@openapi_schema({
    "type": "function",
    "function": {
        "name": "browser_scroll_up",
        "description": "Scroll up the page",
        "parameters": {
            "type": "object",
            "properties": {
                "amount": {
                    "type": "integer",
                    "description": "Pixel amount to scroll (if not specified, scrolls one page)"
                }
            }
        }
    }
})
@xml_schema(
    tag_name="browser-scroll-up",
    mappings=[
        {"param_name": "amount", "node_type": "content", "path": "."}
    ],
    example='''
<browser-scroll-up>
500
</browser-scroll-up>
'''
)
async def browser_scroll_up(self, amount: int = None) -> ToolResult:
    """Scroll the page upwards.

    Args:
        amount (int, optional): Pixels to scroll. When None, no amount is
            sent and the request is described as scrolling one page.

    Returns:
        ToolResult: Outcome reported by the ``scroll_up`` endpoint.
    """
    if amount is None:
        payload = {}
        print("\033[95mScrolling up one page\033[0m")
    else:
        payload = {"amount": amount}
        print(f"\033[95mScrolling up by {amount} pixels\033[0m")
    return await self._execute_browser_action("scroll_up", payload)
@openapi_schema({
    "type": "function",
    "function": {
        "name": "browser_scroll_to_text",
        "description": "Scroll to specific text on the page",
        "parameters": {
            "type": "object",
            "properties": {
                "text": {
                    "type": "string",
                    "description": "The text to scroll to"
                }
            },
            "required": ["text"]
        }
    }
})
@xml_schema(
    tag_name="browser-scroll-to-text",
    mappings=[
        {"param_name": "text", "node_type": "content", "path": "."}
    ],
    example='''
<browser-scroll-to-text>
Contact Us
</browser-scroll-to-text>
'''
)
async def browser_scroll_to_text(self, text: str) -> ToolResult:
    """Scroll until the given text is reached on the page.

    Args:
        text (str): Text to scroll to.

    Returns:
        ToolResult: Outcome reported by the ``scroll_to_text`` endpoint.
    """
    print(f"\033[95mScrolling to text: {text}\033[0m")
    payload = {"text": text}
    return await self._execute_browser_action("scroll_to_text", payload)
@openapi_schema({
    "type": "function",
    "function": {
        "name": "browser_get_dropdown_options",
        "description": "Get all options from a dropdown element",
        "parameters": {
            "type": "object",
            "properties": {
                "index": {
                    "type": "integer",
                    "description": "The index of the dropdown element"
                }
            },
            "required": ["index"]
        }
    }
})
@xml_schema(
    tag_name="browser-get-dropdown-options",
    mappings=[
        {"param_name": "index", "node_type": "content", "path": "."}
    ],
    example='''
<browser-get-dropdown-options>
2
</browser-get-dropdown-options>
'''
)
async def browser_get_dropdown_options(self, index: int) -> ToolResult:
    """Request the options of the dropdown identified by ``index``.

    Args:
        index (int): Index of the dropdown element.

    Returns:
        ToolResult: Outcome reported by the ``get_dropdown_options`` endpoint.
    """
    print(f"\033[95mGetting options from dropdown with index: {index}\033[0m")
    payload = {"index": index}
    return await self._execute_browser_action("get_dropdown_options", payload)
@openapi_schema({
    "type": "function",
    "function": {
        "name": "browser_select_dropdown_option",
        "description": "Select an option from a dropdown by text",
        "parameters": {
            "type": "object",
            "properties": {
                "index": {
                    "type": "integer",
                    "description": "The index of the dropdown element"
                },
                "text": {
                    "type": "string",
                    "description": "The text of the option to select"
                }
            },
            "required": ["index", "text"]
        }
    }
})
@xml_schema(
    tag_name="browser-select-dropdown-option",
    mappings=[
        {"param_name": "index", "node_type": "attribute", "path": "."},
        {"param_name": "text", "node_type": "content", "path": "."}
    ],
    example='''
<browser-select-dropdown-option index="2">
Option 1
</browser-select-dropdown-option>
'''
)
async def browser_select_dropdown_option(self, index: int, text: str) -> ToolResult:
    """Choose the dropdown option whose text is ``text``.

    Args:
        index (int): Index of the dropdown element.
        text (str): Text of the option to select.

    Returns:
        ToolResult: Outcome reported by the ``select_dropdown_option`` endpoint.
    """
    print(f"\033[95mSelecting option '{text}' from dropdown with index: {index}\033[0m")
    payload = {"index": index, "text": text}
    return await self._execute_browser_action("select_dropdown_option", payload)
@openapi_schema({
    "type": "function",
    "function": {
        "name": "browser_drag_drop",
        "description": "Perform drag and drop operation between elements or coordinates",
        "parameters": {
            "type": "object",
            "properties": {
                "element_source": {
                    "type": "string",
                    "description": "The source element selector"
                },
                "element_target": {
                    "type": "string",
                    "description": "The target element selector"
                },
                "coord_source_x": {
                    "type": "integer",
                    "description": "The source X coordinate"
                },
                "coord_source_y": {
                    "type": "integer",
                    "description": "The source Y coordinate"
                },
                "coord_target_x": {
                    "type": "integer",
                    "description": "The target X coordinate"
                },
                "coord_target_y": {
                    "type": "integer",
                    "description": "The target Y coordinate"
                }
            }
        }
    }
})
@xml_schema(
    tag_name="browser-drag-drop",
    mappings=[
        {"param_name": "element_source", "node_type": "attribute", "path": "."},
        {"param_name": "element_target", "node_type": "attribute", "path": "."},
        {"param_name": "coord_source_x", "node_type": "attribute", "path": "."},
        {"param_name": "coord_source_y", "node_type": "attribute", "path": "."},
        {"param_name": "coord_target_x", "node_type": "attribute", "path": "."},
        {"param_name": "coord_target_y", "node_type": "attribute", "path": "."}
    ],
    example='''
<browser-drag-drop element_source="#draggable" element_target="#droppable"></browser-drag-drop>
'''
)
async def browser_drag_drop(self, element_source: str = None, element_target: str = None,
                            coord_source_x: int = None, coord_source_y: int = None,
                            coord_target_x: int = None, coord_target_y: int = None) -> ToolResult:
    """Drag from a source to a target, given as selectors or as coordinates.

    Selectors take precedence: when both ``element_source`` and
    ``element_target`` are truthy only they are sent. Otherwise all four
    coordinates must be provided.

    Args:
        element_source (str, optional): The source element selector.
        element_target (str, optional): The target element selector.
        coord_source_x (int, optional): The source X coordinate.
        coord_source_y (int, optional): The source Y coordinate.
        coord_target_x (int, optional): The target X coordinate.
        coord_target_y (int, optional): The target Y coordinate.

    Returns:
        ToolResult: Outcome reported by the ``drag_drop`` endpoint, or a
        failure response when neither complete pair/quad was supplied.
    """
    # Selector mode.
    if element_source and element_target:
        payload = {
            "element_source": element_source,
            "element_target": element_target,
        }
        print(f"\033[95mDragging from element '{element_source}' to '{element_target}'\033[0m")
        return await self._execute_browser_action("drag_drop", payload)

    # Coordinate mode needs every coordinate (0 is a valid value, so test
    # against None rather than truthiness).
    coords = (coord_source_x, coord_source_y, coord_target_x, coord_target_y)
    if any(value is None for value in coords):
        return self.fail_response("Must provide either element selectors or coordinates for drag and drop")

    payload = {
        "coord_source_x": coord_source_x,
        "coord_source_y": coord_source_y,
        "coord_target_x": coord_target_x,
        "coord_target_y": coord_target_y,
    }
    print(f"\033[95mDragging from coordinates ({coord_source_x}, {coord_source_y}) to ({coord_target_x}, {coord_target_y})\033[0m")
    return await self._execute_browser_action("drag_drop", payload)

View File

@ -94,7 +94,9 @@ COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt
# Copy server script
COPY . /app
COPY server.py /app/server.py
COPY browser_api.py /app/browser_api.py
# Install Playwright and browsers with system dependencies
ENV PLAYWRIGHT_BROWSERS_PATH=/ms-playwright
@ -106,9 +108,6 @@ RUN playwright install chromium
# Verify installation
RUN python -c "from playwright.sync_api import sync_playwright; print('Playwright installation verified')"
# Copy the application code
# COPY . .
# Set environment variables
ENV PYTHONUNBUFFERED=1
ENV CHROME_PATH=/ms-playwright/chromium-*/chrome-linux/chrome

View File

@ -1,18 +0,0 @@
from fastapi import FastAPI
from automation_service import automation_service
# Create API app
api_app = FastAPI()
@api_app.get("/api")
async def health_check():
    """Liveness probe: report that the API server is up."""
    status_payload = {"status": "ok", "message": "API server is running"}
    return status_payload
# Include automation service router with /api prefix
api_app.include_router(automation_service.router, prefix="/api")
# This is needed for the import string approach with uvicorn
if __name__ == '__main__':
import uvicorn
print("Starting API server")
uvicorn.run("api:api_app", host="0.0.0.0", port=8000)

View File

@ -1,195 +0,0 @@
import pyautogui
import time
import os
import sys
from typing import List, Dict, Any, Optional, Union
import io
import base64
from PIL import Image
from fastapi import APIRouter, HTTPException
from pydantic import BaseModel
from enum import Enum
# Set environment variable for the display if not already set
if 'DISPLAY' not in os.environ:
os.environ['DISPLAY'] = ':99'
# Try to initialize pyautogui with error handling
try:
pyautogui.FAILSAFE = False
except Exception as e:
print(f"Warning: Could not initialize pyautogui: {e}", file=sys.stderr)
print("This may be due to X11 authentication issues. Continuing anyway.", file=sys.stderr)
## Input Models
class MouseButton(str, Enum):
    """Mouse button names accepted in ``MouseAction.button``."""
    left = "left"
    middle = "middle"
    right = "right"
class Position(BaseModel):
    """Request body for the mouse-move endpoint; both coordinates are optional."""
    x: Optional[int] = None
    y: Optional[int] = None
class MouseAction(BaseModel):
    """Request body shared by the click, down, up, drag and scroll endpoints.

    Each handler forwards only the fields it needs to pyautogui.
    """
    x: Optional[int] = None
    y: Optional[int] = None
    # Passed as ``clicks`` to both pyautogui.click and pyautogui.scroll.
    clicks: Optional[int] = 1
    interval: Optional[float] = 0.0
    button: MouseButton = MouseButton.left
    duration: Optional[float] = 0.0
class KeyboardAction(BaseModel):
    """Request body for the key-down and key-up endpoints."""
    key: str
class KeyboardPress(BaseModel):
    """Request body for the key-press endpoint (one key or a list of keys)."""
    keys: Union[str, List[str]]
    presses: Optional[int] = 1
    interval: Optional[float] = 0.0
class WriteAction(BaseModel):
    """Request body for the write-text endpoint."""
    message: str
    interval: Optional[float] = 0.0
class HotkeyAction(BaseModel):
    """Request body for the hotkey endpoint; ``keys`` are passed together to pyautogui.hotkey."""
    keys: List[str]
    interval: Optional[float] = 0.0
class AutomationService:
def __init__(self):
self.router = APIRouter()
# Set fallback to avoid crashes
pyautogui.FAILSAFE = False
# X error handling
try:
# Test if we can get the screen size
self.screen_width, self.screen_height = pyautogui.size()
print(f"Screen size detected: {self.screen_width}x{self.screen_height}")
self.x11_available = True
except Exception as e:
print(f"Warning: Could not get screen size: {e}", file=sys.stderr)
print("X11 functionality may be limited. Using fallback values.", file=sys.stderr)
self.screen_width = 1920
self.screen_height = 1080
self.x11_available = False
self.router.get("/automation/mouse/position")(self.get_mouse_position)
self.router.post("/automation/mouse/move")(self.move_mouse)
self.router.post("/automation/mouse/click")(self.click_mouse)
self.router.post("/automation/mouse/down")(self.mouse_down)
self.router.post("/automation/mouse/up")(self.mouse_up)
self.router.post("/automation/mouse/drag")(self.drag_mouse)
self.router.post("/automation/mouse/scroll")(self.scroll_mouse)
self.router.post("/automation/keyboard/down")(self.key_down)
self.router.post("/automation/keyboard/up")(self.key_up)
self.router.post("/automation/keyboard/press")(self.press_key)
self.router.post("/automation/keyboard/write")(self.write_text)
self.router.post("/automation/keyboard/hotkey")(self.press_hotkey)
self.router.post("/automation/screenshot")(self.take_screenshot)
async def get_mouse_position(self):
try:
x, y = pyautogui.position()
return {"x": x, "y": y}
except Exception as e:
return {"error": str(e), "x": 0, "y": 0}
async def move_mouse(self, action: Position):
try:
pyautogui.moveTo(x=action.x, y=action.y)
return {"success": True}
except Exception as e:
return {"success": False, "error": str(e)}
async def click_mouse(self, action: MouseAction):
try:
pyautogui.click(x=action.x, y=action.y, clicks=action.clicks,
interval=action.interval, button=action.button,
duration=action.duration)
return {"success": True}
except Exception as e:
return {"success": False, "error": str(e)}
async def mouse_down(self, action: MouseAction):
try:
pyautogui.mouseDown(x=action.x, y=action.y,
button=action.button, duration=action.duration)
return {"success": True}
except Exception as e:
return {"success": False, "error": str(e)}
async def mouse_up(self, action: MouseAction):
try:
pyautogui.mouseUp(x=action.x, y=action.y,
button=action.button, duration=action.duration)
return {"success": True}
except Exception as e:
return {"success": False, "error": str(e)}
async def drag_mouse(self, action: MouseAction):
try:
pyautogui.dragTo(x=action.x, y=action.y,
duration=action.duration, button=action.button)
return {"success": True}
except Exception as e:
return {"success": False, "error": str(e)}
async def scroll_mouse(self, action: MouseAction):
try:
pyautogui.scroll(clicks=action.clicks, x=action.x, y=action.y)
return {"success": True}
except Exception as e:
return {"success": False, "error": str(e)}
async def key_down(self, action: KeyboardAction):
try:
pyautogui.keyDown(action.key)
return {"success": True}
except Exception as e:
return {"success": False, "error": str(e)}
async def key_up(self, action: KeyboardAction):
try:
pyautogui.keyUp(action.key)
return {"success": True}
except Exception as e:
return {"success": False, "error": str(e)}
async def press_key(self, action: KeyboardPress):
try:
pyautogui.press(keys=action.keys, presses=action.presses,
interval=action.interval)
return {"success": True}
except Exception as e:
return {"success": False, "error": str(e)}
async def write_text(self, action: WriteAction):
try:
pyautogui.write(message=action.message, interval=action.interval)
return {"success": True}
except Exception as e:
return {"success": False, "error": str(e)}
async def press_hotkey(self, action: HotkeyAction):
try:
pyautogui.hotkey(*action.keys, interval=action.interval)
return {"success": True}
except Exception as e:
return {"success": False, "error": str(e)}
async def take_screenshot(self) -> Dict[str, str]:
    """Capture a screenshot and return it as a base64-encoded PNG.

    The image from ``pyautogui.screenshot()`` is saved in PNG format to an
    in-memory buffer, and the buffer contents are base64-encoded.

    Returns:
        ``{"image": <base64 string>}`` on success, or
        ``{"error": <message>}`` if any step raises.
    """
    try:
        shot = pyautogui.screenshot()
        buffer = io.BytesIO()
        shot.save(buffer, format='PNG')
        png_bytes = buffer.getvalue()
        encoded = base64.b64encode(png_bytes).decode()
    except Exception as exc:
        return {"error": str(exc)}
    return {"image": encoded}
# Create a single module-level AutomationService instance at import time,
# intended to be used as the shared (singleton) service object.
automation_service = AutomationService()

File diff suppressed because it is too large Load Diff

View File

@ -6,7 +6,7 @@ services:
dockerfile: ${DOCKERFILE:-Dockerfile}
args:
TARGETPLATFORM: ${TARGETPLATFORM:-linux/amd64}
image: kortixmarko/kortix-suna:0.0.5
image: adamcohenhillel/kortix-suna:0.0.13
ports:
- "6080:6080" # noVNC web interface
- "5901:5901" # VNC port

View File

@ -65,21 +65,6 @@ startretries=5
startsecs=3
depends_on=x11vnc
[program:persistent_browser]
environment=START_URL="data:text/html,<html><body><h1>Browser Ready</h1></body></html>"
command=bash -c "mkdir -p /app/data/chrome_data && sleep 8 && $(find /ms-playwright/chromium-*/chrome-linux -name chrome) --user-data-dir=/app/data/chrome_data --window-position=0,0 --window-size=%(ENV_RESOLUTION_WIDTH)s,%(ENV_RESOLUTION_HEIGHT)s --start-maximized --no-sandbox --disable-dev-shm-usage --disable-gpu --disable-software-rasterizer --disable-setuid-sandbox --no-first-run --no-default-browser-check --no-experiments --ignore-certificate-errors --remote-debugging-port=9222 --remote-debugging-address=0.0.0.0 \"$START_URL\""
autorestart=true
stdout_logfile=/dev/stdout
stdout_logfile_maxbytes=0
stderr_logfile=/dev/stderr
stderr_logfile_maxbytes=0
priority=350
startretries=5
startsecs=10
stopsignal=TERM
stopwaitsecs=15
depends_on=novnc
[program:http_server]
command=python /app/server.py
directory=/app
@ -94,8 +79,8 @@ startsecs=5
stopsignal=TERM
stopwaitsecs=10
[program:api_server]
command=python /app/api.py
[program:browser_api]
command=python /app/browser_api.py
directory=/app
autorestart=true
stdout_logfile=/dev/stdout

View File

@ -121,11 +121,12 @@ def prepare_params(
logger.debug(f"Added {len(tools)} tools to API parameters")
# # Add Claude-specific headers
# if "claude" in model_name.lower() or "anthropic" in model_name.lower():
# params["extra_headers"] = {
# "anthropic-beta": "max-tokens-3-5-sonnet-2024-07-15"
# }
# logger.debug("Added Claude-specific headers")
if "claude" in model_name.lower() or "anthropic" in model_name.lower():
params["extra_headers"] = {
# "anthropic-beta": "max-tokens-3-5-sonnet-2024-07-15"
"anthropic-beta": "output-128k-2025-02-19"
}
logger.debug("Added Claude-specific headers")
# Add OpenRouter-specific parameters
if model_name.startswith("openrouter/"):

View File

@ -4,9 +4,9 @@ from services.supabase import DBConnection
# Define subscription tiers and their monthly hour limits
SUBSCRIPTION_TIERS = {
'price_1RDQbOG6l1KZGqIrgrYzMbnL': {'name': 'free', 'hours': 1},
'price_1RC2PYG6l1KZGqIrpbzFB9Lp': {'name': 'base', 'hours': 1},
'price_1RDQWqG6l1KZGqIrChli4Ys4': {'name': 'extra', 'hours': 1}
'price_1RDQbOG6l1KZGqIrgrYzMbnL': {'name': 'free', 'hours': 100},
'price_1RC2PYG6l1KZGqIrpbzFB9Lp': {'name': 'base', 'hours': 100},
'price_1RDQWqG6l1KZGqIrChli4Ys4': {'name': 'extra', 'hours': 100}
}
async def get_account_subscription(client, account_id: str) -> Optional[Dict]:

View File

@ -282,6 +282,12 @@ export default function AgentPage({ params }: AgentPageProps) {
part.isToolCall = !isUserMessage;
part.status = part.isClosing ? 'completed' : 'running';
// Check if this is a browser-related tool and add VNC preview
if (part.tagName.includes('browser') && agent?.sandbox?.vnc_preview) {
console.log(`[TOOLS] Adding VNC preview from sandbox to browser tool ${part.tagName}`);
part.vncPreview = agent.sandbox.vnc_preview + "/vnc_lite.html?password=" + agent.sandbox.pass;
}
// Use ID for deduplication
if (!seenTagIds.has(part.id)) {
seenTagIds.add(part.id);
@ -307,6 +313,12 @@ export default function AgentPage({ params }: AgentPageProps) {
tag.isToolCall = !isUserMessage;
tag.status = tag.isClosing ? 'completed' : 'running';
// Check if this is a browser-related tool and add VNC preview
if (tag.tagName.includes('browser') && agent?.sandbox?.vnc_preview) {
console.log(`[TOOLS] Adding VNC preview from sandbox to browser tool ${tag.tagName}`);
tag.vncPreview = agent.sandbox.vnc_preview + "/vnc_lite.html?password=" + agent.sandbox.pass;
}
// Use ID for deduplication
if (!seenTagIds.has(tag.id)) {
seenTagIds.add(tag.id);
@ -381,7 +393,7 @@ export default function AgentPage({ params }: AgentPageProps) {
// Update tool calls in the shared context
setToolCalls(pairedTags);
}, [messages, streamContent, setToolCalls]);
}, [messages, streamContent, setToolCalls, agent]);
// Scroll to bottom of messages
const scrollToBottom = useCallback(() => {
@ -752,6 +764,10 @@ export default function AgentPage({ params }: AgentPageProps) {
<>
{messages.map((message, index) => {
// Skip messages containing "ToolResult("
if (!message || !message?.content || !message?.role) {
return null;
}
if (message.content.includes("ToolResult(")) {
return null;
}
@ -927,6 +943,9 @@ export default function AgentPage({ params }: AgentPageProps) {
<>
{messages.map((message, index) => {
// Skip messages containing "ToolResult("
if (!message || !message?.content || !message?.role) {
return null;
}
if (message.content.includes("ToolResult(")) {
return null;
}

View File

@ -16,12 +16,12 @@ export const SUBSCRIPTION_PLANS = {
const PLAN_DETAILS = {
[SUBSCRIPTION_PLANS.FREE]: {
name: 'Free',
limit: 1,
limit: 100,
price: 0
},
[SUBSCRIPTION_PLANS.BASIC]: {
name: 'Basic',
limit: 10,
limit: 100,
price: 10
},
[SUBSCRIPTION_PLANS.PRO]: {

View File

@ -4,7 +4,7 @@ import React from 'react';
import { ParsedTag, ToolComponentProps } from '@/lib/types/tool-calls';
import {
File, FileText, Terminal, FolderPlus, Folder, Code, Search as SearchIcon,
Bell, Replace, Plus, Minus
Bell, Replace, Plus, Minus, Globe, Search
} from 'lucide-react';
import { cn } from '@/lib/utils';
import { diffLines } from 'diff';
@ -458,6 +458,128 @@ export const SearchCodeTool: React.FC<ToolComponentProps> = ({ tag, mode }) => {
);
};
/**
 * Browser Navigate Tool Component
 *
 * Renders a browser tool call. `tag.content` is displayed as the URL and
 * `tag.status === 'running'` drives the "Navigating to" / "Navigated to"
 * label. In compact mode a single-line CompactToolDisplay is returned;
 * otherwise a card is rendered which, when `tag.vncPreview` is set, embeds
 * that URL in an iframe scaled to 50%.
 */
export const BrowserNavigateTool: React.FC<ToolComponentProps> = ({ tag, mode }) => {
  const targetUrl = tag.content || '';
  const running = tag.status === 'running';
  const statusLabel = running ? "Navigating to" : "Navigated to";

  // Compact mode: one-line summary only.
  if (mode === 'compact') {
    return (
      <CompactToolDisplay
        icon={<Globe className="h-4 w-4 mr-2" />}
        name={statusLabel}
        input={targetUrl}
        isRunning={running}
      />
    );
  }

  // Pulsing "Running" indicator, shown only while the tool call is in progress.
  const runningBadge = running && (
    <div className="flex items-center gap-2">
      <span className="text-amber-500">Running</span>
      <div className="h-2 w-2 rounded-full bg-amber-500 animate-pulse"></div>
    </div>
  );

  // The iframe is laid out at twice the container size and scaled down by
  // half so the remote page fits inside the 300px-high preview box.
  const halfScaleFrame: React.CSSProperties = {
    width: '200%',
    height: '200%',
    transform: 'scale(0.5)',
    transformOrigin: '0 0'
  };

  // VNC preview panel, rendered only when a preview URL is attached to the tag.
  const vncPanel = tag.vncPreview && (
    <div className="mt-2 border border-subtle dark:border-white/10 rounded-md overflow-hidden">
      <div className="text-xs bg-black text-white p-1">VNC Preview</div>
      <div className="relative w-full h-[300px] overflow-hidden">
        <iframe
          src={tag.vncPreview}
          title="Browser preview"
          className="absolute top-0 left-0 border-0"
          style={halfScaleFrame}
          sandbox="allow-same-origin allow-scripts"
        />
      </div>
    </div>
  );

  return (
    <div className="border rounded-lg overflow-hidden border-subtle dark:border-white/10">
      <div className="flex items-center px-2 py-1 text-xs font-medium border-b border-subtle dark:border-white/10 bg-background-secondary dark:bg-background-secondary text-foreground">
        <Globe className="h-4 w-4 mr-2" />
        <div className="flex-1">{statusLabel}: {targetUrl}</div>
        {runningBadge}
      </div>
      <div className="p-3 bg-card-bg dark:bg-background-secondary text-foreground">
        <div className="space-y-2">
          <div className="flex items-center gap-1 text-xs text-muted-foreground mb-1">
            <Globe className="h-3 w-3" />
            <span className="font-mono">{targetUrl}</span>
          </div>
          {vncPanel}
        </div>
      </div>
    </div>
  );
};
/**
 * Parse the raw web-search tool output into a list of result objects.
 *
 * Returns an empty list when the output is missing, is not a string, is not
 * valid JSON, or does not decode to an array, so that a malformed tool
 * result cannot throw while the component is rendering.
 */
const parseWebSearchResults = (output: unknown): any[] => {
  if (typeof output !== 'string' || !output) {
    return [];
  }
  try {
    const parsed = JSON.parse(output);
    return Array.isArray(parsed) ? parsed : [];
  } catch {
    return [];
  }
};

/**
 * Web Search Tool Component
 *
 * Renders a web-search tool call. The query is read from
 * `tag.attributes.query`. In compact mode a single-line CompactToolDisplay
 * is returned; otherwise a card lists each result's `Title`, `URL` and, when
 * present, its `Published Date`. Results come from `tag.result.output`,
 * which is expected to be a JSON-encoded array; anything else is treated as
 * "no results".
 */
export const WebSearchTool: React.FC<ToolComponentProps> = ({ tag, mode }) => {
  const query = tag.attributes.query || '';
  const isRunning = tag.status === 'running';

  if (mode === 'compact') {
    return (
      <CompactToolDisplay
        icon={<Search className="h-4 w-4 mr-2" />}
        name={isRunning ? "Web search in progress..." : "Web search complete"}
        input={query}
        isRunning={isRunning}
      />
    );
  }

  // Tolerant parse: partial or non-JSON output must not crash the render.
  const results = parseWebSearchResults(tag.result?.output);

  return (
    <div className="border rounded-lg overflow-hidden border-subtle dark:border-white/10">
      <div className="flex items-center px-2 py-1 text-xs font-medium border-b border-subtle dark:border-white/10 bg-background-secondary dark:bg-background-secondary text-foreground">
        <Search className="h-4 w-4 mr-2" />
        <div className="flex-1">Web Search: {query}</div>
        {isRunning && (
          <div className="flex items-center gap-2">
            <span className="text-amber-500">Searching</span>
            <div className="h-2 w-2 rounded-full bg-amber-500 animate-pulse"></div>
          </div>
        )}
      </div>
      <div className="p-3 bg-card-bg dark:bg-background-secondary text-foreground">
        {results.length > 0 ? (
          <div className="space-y-3">
            {results.map((result: any, index: number) => (
              <div key={index} className="text-sm">
                <a href={result.URL} target="_blank" rel="noopener noreferrer" className="font-medium text-blue-600 hover:underline">
                  {result.Title}
                </a>
                <div className="text-xs text-muted-foreground mt-1">
                  {result.URL}
                  {result['Published Date'] && (
                    <span className="ml-2">
                      ({new Date(result['Published Date']).toLocaleDateString()})
                    </span>
                  )}
                </div>
              </div>
            ))}
          </div>
        ) : (
          <div className="text-sm text-muted-foreground">No results found</div>
        )}
      </div>
    </div>
  );
};
// Tool component registry
export const ToolComponentRegistry: Record<string, React.FC<ToolComponentProps>> = {
'create-file': CreateFileTool,
@ -471,10 +593,28 @@ export const ToolComponentRegistry: Record<string, React.FC<ToolComponentProps>>
'ask': NotifyTool, // Handle ask similar to notify for now
'complete': NotifyTool, // Handle complete similar to notify for now
'full-file-rewrite': FullFileRewriteTool,
'browser-navigate-to': BrowserNavigateTool,
'browser-click-element': BrowserNavigateTool,
'browser-input-text': BrowserNavigateTool,
'browser-go-back': BrowserNavigateTool,
'browser-wait': BrowserNavigateTool,
'browser-scroll-down': BrowserNavigateTool,
'browser-scroll-up': BrowserNavigateTool,
'browser-scroll-to-text': BrowserNavigateTool,
'browser-switch-tab': BrowserNavigateTool,
'browser-close-tab': BrowserNavigateTool,
'browser-get-dropdown-options': BrowserNavigateTool,
'browser-select-dropdown-option': BrowserNavigateTool,
'browser-drag-drop': BrowserNavigateTool,
'web-search': WebSearchTool,
};
// Helper function to get the appropriate component for a tag
export function getComponentForTag(tag: ParsedTag): React.FC<ToolComponentProps> {
console.log("getComponentForTag", tag);
if (!tag || !tag?.tagName) {
console.warn(`No tag name for tag: ${tag}`);
}
if (!ToolComponentRegistry[tag.tagName]) {
console.warn(`No component registered for tag type: ${tag.tagName}`);
}

View File

@ -175,29 +175,3 @@ export function useToolsPanel() {
prevTool,
};
}
// Build a short, human-readable title for a tool call from its tag name and
// the relevant attribute; unknown tag names fall back to "<tagName> operation".
function getToolTitle(tag: ParsedTag): string {
  const kind = tag.tagName;

  if (kind === 'create-file') {
    return `Creating file: ${tag.attributes.file_path || ''}`;
  }
  if (kind === 'read-file') {
    return `Reading file: ${tag.attributes.file_path || ''}`;
  }
  if (kind === 'execute-command') {
    return `Executing: ${tag.attributes.command || ''}`;
  }
  if (kind === 'create-directory') {
    return `Creating directory: ${tag.attributes.path || ''}`;
  }
  if (kind === 'list-directory') {
    return `Listing directory: ${tag.attributes.path || ''}`;
  }
  if (kind === 'search-code') {
    return `Searching code: ${tag.attributes.query || ''}`;
  }
  if (kind === 'notify') {
    return `Notification: ${tag.attributes.message || ''}`;
  }
  if (kind === 'str-replace') {
    return `String replace: ${tag.attributes.pattern || ''}`;
  }
  if (kind === 'full-file-rewrite') {
    return `Full file rewrite: ${tag.attributes.file_path || ''}`;
  }

  return `${kind} operation`;
}

View File

@ -80,8 +80,11 @@ export type Project = {
description: string;
account_id: string;
created_at: string;
sandbox_id?: string;
sandbox_pass?: string;
sandbox: {
vnc_preview?: string;
id?: string;
pass?: string;
};
}
export type Thread = {
@ -214,7 +217,8 @@ export const createProject = async (
name: data.name,
description: data.description || '',
account_id: data.account_id,
created_at: data.created_at
created_at: data.created_at,
sandbox: { id: "", pass: "", vnc_preview: "" }
};
};

View File

@ -13,6 +13,9 @@ export interface ParsedTag {
isToolCall?: boolean; // Whether this is a tool call (vs a result)
isPaired?: boolean; // Whether this tag has been paired with its call/result
status?: 'running' | 'completed' | 'error'; // Status of the tool call
// VNC preview for browser-related tools
vncPreview?: string; // VNC preview image URL
}
// Display mode for tool components
@ -37,7 +40,21 @@ export const SUPPORTED_XML_TAGS = [
'list-directory',
'search-code',
'complete',
'full-file-rewrite'
'full-file-rewrite',
'browser-navigate-to',
'browser-click-element',
'browser-input-text',
'browser-go-back',
'browser-wait',
'browser-scroll-down',
'browser-scroll-up',
'browser-scroll-to-text',
'browser-switch-tab',
'browser-close-tab',
'browser-get-dropdown-options',
'browser-select-dropdown-option',
'browser-drag-drop',
'web-search'
];
// Tool status labels