mirror of https://github.com/kortix-ai/suna.git
Merge pull request #64 from kortix-ai/browser-click-onxy-and-ocr
Browser click XY and OCR
This commit is contained in:
commit
04ce0e499b
|
@ -126,7 +126,7 @@ You'll need the following components:
|
|||
- Generate an API key from your account settings
|
||||
- Go to [Images](https://app.daytona.io/dashboard/images)
|
||||
- Click "Add Image"
|
||||
- Enter `adamcohenhillel/kortix-suna:0.0.16` as the image name
|
||||
- Enter `adamcohenhillel/kortix-suna:0.0.18` as the image name
|
||||
- Set `exec /usr/bin/supervisord -n -c /etc/supervisor/conf.d/supervisord.conf` as the Entrypoint
|
||||
|
||||
4. **LLM API Keys**:
|
||||
|
|
|
@ -84,15 +84,18 @@ class SandboxBrowserTool(SandboxToolsBase):
|
|||
success_response["elements_found"] = result["element_count"]
|
||||
if result.get("pixels_below"):
|
||||
success_response["scrollable_content"] = result["pixels_below"] > 0
|
||||
# Add OCR text when available
|
||||
if result.get("ocr_text"):
|
||||
success_response["ocr_text"] = result["ocr_text"]
|
||||
|
||||
return self.success_response(success_response)
|
||||
|
||||
except json.JSONDecodeError:
|
||||
logger.error(f"Failed to parse response JSON: {response.result}")
|
||||
return self.fail_response(f"Failed to parse response JSON: {response.result}")
|
||||
except json.JSONDecodeError as e:
|
||||
logger.error(f"Failed to parse response JSON: {response.result} {e}")
|
||||
return self.fail_response(f"Failed to parse response JSON: {response.result} {e}")
|
||||
else:
|
||||
logger.error(f"Browser automation request failed: {response.result}")
|
||||
return self.fail_response(f"Browser automation request failed: {response.result}")
|
||||
logger.error(f"Browser automation request failed 2: {response}")
|
||||
return self.fail_response(f"Browser automation request failed 2: {response}")
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Error executing browser action: {e}")
|
||||
|
@ -847,4 +850,48 @@ class SandboxBrowserTool(SandboxToolsBase):
|
|||
else:
|
||||
return self.fail_response("Must provide either element selectors or coordinates for drag and drop")
|
||||
|
||||
return await self._execute_browser_action("drag_drop", params)
|
||||
return await self._execute_browser_action("drag_drop", params)
|
||||
|
||||
@openapi_schema({
    "type": "function",
    "function": {
        "name": "browser_click_coordinates",
        "description": "Click at specific X,Y coordinates on the page",
        "parameters": {
            "type": "object",
            "properties": {
                "x": {
                    "type": "integer",
                    "description": "The X coordinate to click"
                },
                "y": {
                    "type": "integer",
                    "description": "The Y coordinate to click"
                }
            },
            "required": ["x", "y"]
        }
    }
})
@xml_schema(
    tag_name="browser-click-coordinates",
    mappings=[
        {"param_name": "x", "node_type": "attribute", "path": "."},
        {"param_name": "y", "node_type": "attribute", "path": "."}
    ],
    example='''
<browser-click-coordinates x="100" y="200"></browser-click-coordinates>
'''
)
async def browser_click_coordinates(self, x: int, y: int) -> ToolResult:
    """Click the page at an absolute (x, y) position.

    Forwards the coordinates to the sandbox browser API through
    ``_execute_browser_action`` using the ``click_coordinates`` action.

    Args:
        x (int): The X coordinate to click
        y (int): The Y coordinate to click

    Returns:
        ToolResult: Whatever ``_execute_browser_action`` returns for the
        ``click_coordinates`` action.
    """
    # Payload sent as the body of the "click_coordinates" action.
    click_params = {"x": x, "y": y}
    print(f"\033[95mClicking at coordinates: ({x}, {y})\033[0m")
    return await self._execute_browser_action("click_coordinates", click_params)
|
File diff suppressed because it is too large
Load Diff
|
@ -50,6 +50,7 @@ streamlit = "^1.44.1"
|
|||
nest-asyncio = "^1.6.0"
|
||||
vncdotool = "^1.2.0"
|
||||
tavily-python = "^0.5.4"
|
||||
pytesseract = "^0.3.13"
|
||||
|
||||
[tool.poetry.scripts]
|
||||
agentpress = "agentpress.cli:main"
|
||||
|
|
|
@ -23,4 +23,5 @@ python-ripgrep==0.0.6
|
|||
daytona_sdk>=0.12.0
|
||||
boto3>=1.34.0
|
||||
pydantic
|
||||
tavily-python>=0.5.4
|
||||
tavily-python>=0.5.4
|
||||
pytesseract==0.3.13
|
|
@ -13,6 +13,9 @@ import os
|
|||
import random
|
||||
from functools import cached_property
|
||||
import traceback
|
||||
import pytesseract
|
||||
from PIL import Image
|
||||
import io
|
||||
|
||||
#######################################################
|
||||
# Action model definitions
|
||||
|
@ -25,6 +28,10 @@ class Position(BaseModel):
|
|||
class ClickElementAction(BaseModel):
|
||||
index: int
|
||||
|
||||
# Request body for the "/automation/click_coordinates" endpoint.
class ClickCoordinatesAction(BaseModel):
    # Horizontal position passed to page.mouse.click.
    x: int
    # Vertical position passed to page.mouse.click.
    y: int
|
||||
|
||||
class GoToUrlAction(BaseModel):
|
||||
url: str
|
||||
|
||||
|
@ -257,6 +264,7 @@ class BrowserActionResult(BaseModel):
|
|||
pixels_above: int = 0
|
||||
pixels_below: int = 0
|
||||
content: Optional[str] = None
|
||||
ocr_text: Optional[str] = None # Added field for OCR text
|
||||
|
||||
# Additional metadata
|
||||
element_count: int = 0 # Number of interactive elements found
|
||||
|
@ -294,6 +302,7 @@ class BrowserAutomation:
|
|||
|
||||
# Element interaction
|
||||
self.router.post("/automation/click_element")(self.click_element)
|
||||
self.router.post("/automation/click_coordinates")(self.click_coordinates)
|
||||
self.router.post("/automation/input_text")(self.input_text)
|
||||
self.router.post("/automation/send_keys")(self.send_keys)
|
||||
|
||||
|
@ -626,6 +635,28 @@ class BrowserAutomation:
|
|||
print(f"Error saving screenshot: {e}")
|
||||
return ""
|
||||
|
||||
async def extract_ocr_text_from_screenshot(self, screenshot_base64: str) -> str:
    """Extract text from a base64-encoded screenshot using OCR.

    The decode + OCR work is synchronous, so it is executed in the event
    loop's default executor instead of directly in this coroutine; that
    keeps other requests served by the same loop responsive while the OCR
    runs.

    Args:
        screenshot_base64: Base64-encoded image data. Empty/None input is
            treated as "no screenshot".

    Returns:
        The recognized text with surrounding whitespace stripped, or an
        empty string when there is no screenshot or OCR fails for any
        reason (this helper is best-effort and never raises).
    """
    if not screenshot_base64:
        return ""

    def _run_ocr() -> str:
        # Decode base64 to image; the context manager releases the image
        # resources once OCR is done.
        image_bytes = base64.b64decode(screenshot_base64)
        with Image.open(io.BytesIO(image_bytes)) as image:
            # Extract text using pytesseract and clean it up.
            return pytesseract.image_to_string(image).strip()

    try:
        loop = asyncio.get_running_loop()
        return await loop.run_in_executor(None, _run_ocr)
    except Exception as e:
        # Deliberately broad: OCR is optional metadata and must not break
        # the browser action that requested it.
        print(f"Error performing OCR: {e}")
        traceback.print_exc()
        return ""
|
||||
|
||||
async def get_updated_browser_state(self, action_name: str) -> tuple:
|
||||
"""Helper method to get updated browser state after any action
|
||||
Returns a tuple of (dom_state, screenshot, elements, metadata)
|
||||
|
@ -686,6 +717,12 @@ class BrowserAutomation:
|
|||
metadata['viewport_width'] = 0
|
||||
metadata['viewport_height'] = 0
|
||||
|
||||
# Extract OCR text from screenshot if available
|
||||
ocr_text = ""
|
||||
if screenshot:
|
||||
ocr_text = await self.extract_ocr_text_from_screenshot(screenshot)
|
||||
metadata['ocr_text'] = ocr_text
|
||||
|
||||
print(f"Got updated state after {action_name}: {len(dom_state.selector_map)} elements")
|
||||
return dom_state, screenshot, elements, metadata
|
||||
except Exception as e:
|
||||
|
@ -713,6 +750,7 @@ class BrowserAutomation:
|
|||
pixels_above=dom_state.pixels_above if dom_state else 0,
|
||||
pixels_below=dom_state.pixels_below if dom_state else 0,
|
||||
content=content,
|
||||
ocr_text=metadata.get('ocr_text', ""),
|
||||
element_count=metadata.get('element_count', 0),
|
||||
interactive_elements=metadata.get('interactive_elements', []),
|
||||
viewport_width=metadata.get('viewport_width', 0),
|
||||
|
@ -885,6 +923,59 @@ class BrowserAutomation:
|
|||
|
||||
# Element Interaction Actions
|
||||
|
||||
async def click_coordinates(self, action: ClickCoordinatesAction = Body(...)):
    """Click at specific x,y coordinates on the page.

    Args:
        action: Request body carrying the ``x`` and ``y`` coordinates.

    Returns:
        The value of ``self.build_action_result(...)``. On success it is
        built from the browser state gathered after the click. On failure
        it carries the error text, together with a freshly gathered state
        when that is still obtainable, or empty placeholders otherwise.
    """
    try:
        page = await self.get_current_page()

        # Perform the click at the specified coordinates
        await page.mouse.click(action.x, action.y)

        # Give time for any navigation or DOM updates to occur. This wait is
        # best-effort: the click has already happened, so a page that never
        # becomes network-idle within the timeout must not turn a successful
        # click into a reported failure.
        try:
            await page.wait_for_load_state("networkidle", timeout=5000)
        except Exception as wait_error:
            print(f"Timeout or error waiting for network idle after click_coordinates: {wait_error}")

        # Get updated state after action
        dom_state, screenshot, elements, metadata = await self.get_updated_browser_state(
            f"click_coordinates({action.x}, {action.y})"
        )

        return self.build_action_result(
            True,
            f"Clicked at coordinates ({action.x}, {action.y})",
            dom_state,
            screenshot,
            elements,
            metadata,
            error="",
            content=None
        )
    except Exception as e:
        print(f"Error in click_coordinates: {e}")
        traceback.print_exc()

        # Try to get state even after error
        try:
            dom_state, screenshot, elements, metadata = await self.get_updated_browser_state(
                "click_coordinates_error_recovery"
            )
            return self.build_action_result(
                False,
                str(e),
                dom_state,
                screenshot,
                elements,
                metadata,
                error=str(e),
                content=None
            )
        except Exception:
            # Not a bare ``except:`` so task cancellation and interpreter
            # shutdown signals (BaseException subclasses) still propagate.
            return self.build_action_result(
                False,
                str(e),
                None,
                "",
                "",
                {},
                error=str(e),
                content=None
            )
|
||||
|
||||
async def click_element(self, action: ClickElementAction = Body(...)):
|
||||
"""Click on an element by index"""
|
||||
try:
|
||||
|
@ -1730,6 +1821,18 @@ async def test_browser_api():
|
|||
print(f"\nScreenshot captured: {'Yes' if result.screenshot_base64 else 'No'}")
|
||||
print(f"Viewport size: {result.viewport_width}x{result.viewport_height}")
|
||||
|
||||
# Test OCR extraction from screenshot
|
||||
print("\n--- Testing OCR Text Extraction ---")
|
||||
if result.ocr_text:
|
||||
print("OCR text extracted from screenshot:")
|
||||
print("=== OCR TEXT START ===")
|
||||
print(result.ocr_text)
|
||||
print("=== OCR TEXT END ===")
|
||||
print(f"OCR text length: {len(result.ocr_text)} characters")
|
||||
print(result.ocr_text)
|
||||
else:
|
||||
print("No OCR text extracted from screenshot")
|
||||
|
||||
await asyncio.sleep(2)
|
||||
|
||||
# Test search functionality
|
||||
|
@ -1741,6 +1844,15 @@ async def test_browser_api():
|
|||
else:
|
||||
print(f"Found {result.element_count} elements after search")
|
||||
print(f"Page title: {result.title}")
|
||||
|
||||
# Test OCR extraction from search results
|
||||
if result.ocr_text:
|
||||
print("\nOCR text from search results:")
|
||||
print("=== OCR TEXT START ===")
|
||||
print(result.ocr_text)
|
||||
print("=== OCR TEXT END ===")
|
||||
else:
|
||||
print("\nNo OCR text extracted from search results")
|
||||
|
||||
await asyncio.sleep(2)
|
||||
|
||||
|
@ -1766,6 +1878,15 @@ async def test_browser_api():
|
|||
|
||||
await asyncio.sleep(2)
|
||||
|
||||
# Test clicking on coordinates
|
||||
print("\n--- Testing Click Coordinates ---")
|
||||
coord_click_result = await automation_service.click_coordinates(ClickCoordinatesAction(x=100, y=100))
|
||||
print(f"Coordinate click status: {'✅ Success' if coord_click_result.success else '❌ Failed'}")
|
||||
print(f"Message: {coord_click_result.message}")
|
||||
print(f"URL after coordinate click: {coord_click_result.url}")
|
||||
|
||||
await asyncio.sleep(2)
|
||||
|
||||
# Test extracting content
|
||||
print("\n--- Testing Content Extraction ---")
|
||||
content_result = await automation_service.extract_content("test goal")
|
||||
|
|
|
@ -6,7 +6,7 @@ services:
|
|||
dockerfile: ${DOCKERFILE:-Dockerfile}
|
||||
args:
|
||||
TARGETPLATFORM: ${TARGETPLATFORM:-linux/amd64}
|
||||
image: adamcohenhillel/kortix-suna:0.0.16
|
||||
image: adamcohenhillel/kortix-suna:0.0.18
|
||||
ports:
|
||||
- "6080:6080" # noVNC web interface
|
||||
- "5901:5901" # VNC port
|
||||
|
|
|
@ -2,4 +2,5 @@ fastapi==0.115.12
|
|||
uvicorn==0.34.0
|
||||
pyautogui==0.9.54
|
||||
pillow==10.2.0
|
||||
pydantic==2.6.1
|
||||
pydantic==2.6.1
|
||||
pytesseract==0.3.13
|
|
@ -96,7 +96,7 @@ def create_sandbox(password: str):
|
|||
logger.debug("OPENAI_API_KEY configured for sandbox")
|
||||
|
||||
sandbox = daytona.create(CreateSandboxParams(
|
||||
image="adamcohenhillel/kortix-suna:0.0.16",
|
||||
image="adamcohenhillel/kortix-suna:0.0.18",
|
||||
public=True,
|
||||
env_vars={
|
||||
"CHROME_PERSISTENT_SESSION": "true",
|
||||
|
@ -116,7 +116,8 @@ def create_sandbox(password: str):
|
|||
5900, # VNC port
|
||||
5901, # VNC port
|
||||
9222, # Chrome remote debugging port
|
||||
8080 # HTTP website port
|
||||
8080, # HTTP website port
|
||||
8002, # The browser api port
|
||||
]
|
||||
))
|
||||
logger.info(f"Sandbox created with ID: {sandbox.id}")
|
||||
|
|
Loading…
Reference in New Issue