Merge pull request #64 from kortix-ai/browser-click-onxy-and-ocr

Browser click XY and OCR
This commit is contained in:
Marko Kraemer 2025-04-20 21:45:45 -07:00 committed by GitHub
commit 04ce0e499b
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
9 changed files with 368 additions and 49 deletions

View File

@ -126,7 +126,7 @@ You'll need the following components:
- Generate an API key from your account settings
- Go to [Images](https://app.daytona.io/dashboard/images)
- Click "Add Image"
- Enter `adamcohenhillel/kortix-suna:0.0.16` as the image name
- Enter `adamcohenhillel/kortix-suna:0.0.18` as the image name
- Set `exec /usr/bin/supervisord -n -c /etc/supervisor/conf.d/supervisord.conf` as the Entrypoint
4. **LLM API Keys**:

View File

@ -84,15 +84,18 @@ class SandboxBrowserTool(SandboxToolsBase):
success_response["elements_found"] = result["element_count"]
if result.get("pixels_below"):
success_response["scrollable_content"] = result["pixels_below"] > 0
# Add OCR text when available
if result.get("ocr_text"):
success_response["ocr_text"] = result["ocr_text"]
return self.success_response(success_response)
except json.JSONDecodeError:
logger.error(f"Failed to parse response JSON: {response.result}")
return self.fail_response(f"Failed to parse response JSON: {response.result}")
except json.JSONDecodeError as e:
logger.error(f"Failed to parse response JSON: {response.result} {e}")
return self.fail_response(f"Failed to parse response JSON: {response.result} {e}")
else:
logger.error(f"Browser automation request failed: {response.result}")
return self.fail_response(f"Browser automation request failed: {response.result}")
logger.error(f"Browser automation request failed 2: {response}")
return self.fail_response(f"Browser automation request failed 2: {response}")
except Exception as e:
logger.error(f"Error executing browser action: {e}")
@ -847,4 +850,48 @@ class SandboxBrowserTool(SandboxToolsBase):
else:
return self.fail_response("Must provide either element selectors or coordinates for drag and drop")
return await self._execute_browser_action("drag_drop", params)
return await self._execute_browser_action("drag_drop", params)
@openapi_schema({
    "type": "function",
    "function": {
        "name": "browser_click_coordinates",
        "description": "Click at specific X,Y coordinates on the page",
        "parameters": {
            "type": "object",
            "properties": {
                "x": {
                    "type": "integer",
                    "description": "The X coordinate to click"
                },
                "y": {
                    "type": "integer",
                    "description": "The Y coordinate to click"
                }
            },
            "required": ["x", "y"]
        }
    }
})
@xml_schema(
    tag_name="browser-click-coordinates",
    mappings=[
        {"param_name": "x", "node_type": "attribute", "path": "."},
        {"param_name": "y", "node_type": "attribute", "path": "."}
    ],
    example='''
<browser-click-coordinates x="100" y="200"></browser-click-coordinates>
'''
)
async def browser_click_coordinates(self, x: int, y: int) -> ToolResult:
    """Click at specific X,Y coordinates on the page.

    Forwards the coordinates to the sandbox browser API as a
    "click_coordinates" action via ``_execute_browser_action``.

    Args:
        x (int): The X coordinate to click
        y (int): The Y coordinate to click

    Returns:
        ToolResult: Whatever ``_execute_browser_action`` returns for the action
    """
    # Use the module logger (as the rest of this tool does) instead of a raw
    # ANSI-coloured print, so the trace respects the configured log level.
    logger.debug(f"Clicking at coordinates: ({x}, {y})")
    return await self._execute_browser_action("click_coordinates", {"x": x, "y": y})

221
backend/poetry.lock generated

File diff suppressed because it is too large Load Diff

View File

@ -50,6 +50,7 @@ streamlit = "^1.44.1"
nest-asyncio = "^1.6.0"
vncdotool = "^1.2.0"
tavily-python = "^0.5.4"
pytesseract = "^0.3.13"
[tool.poetry.scripts]
agentpress = "agentpress.cli:main"

View File

@ -23,4 +23,5 @@ python-ripgrep==0.0.6
daytona_sdk>=0.12.0
boto3>=1.34.0
pydantic
tavily-python>=0.5.4
tavily-python>=0.5.4
pytesseract==0.3.13

View File

@ -13,6 +13,9 @@ import os
import random
from functools import cached_property
import traceback
import pytesseract
from PIL import Image
import io
#######################################################
# Action model definitions
@ -25,6 +28,10 @@ class Position(BaseModel):
# Request body for the click_element action ("Click on an element by index").
class ClickElementAction(BaseModel):
# Index of the element to click.
index: int
# Request body for the click_coordinates action; the handler passes x and y
# straight to page.mouse.click(x, y).
class ClickCoordinatesAction(BaseModel):
# X coordinate of the click.
x: int
# Y coordinate of the click.
y: int
class GoToUrlAction(BaseModel):
url: str
@ -257,6 +264,7 @@ class BrowserActionResult(BaseModel):
pixels_above: int = 0
pixels_below: int = 0
content: Optional[str] = None
ocr_text: Optional[str] = None # Added field for OCR text
# Additional metadata
element_count: int = 0 # Number of interactive elements found
@ -294,6 +302,7 @@ class BrowserAutomation:
# Element interaction
self.router.post("/automation/click_element")(self.click_element)
self.router.post("/automation/click_coordinates")(self.click_coordinates)
self.router.post("/automation/input_text")(self.input_text)
self.router.post("/automation/send_keys")(self.send_keys)
@ -626,6 +635,28 @@ class BrowserAutomation:
print(f"Error saving screenshot: {e}")
return ""
async def extract_ocr_text_from_screenshot(self, screenshot_base64: str) -> str:
    """Extract text from a base64-encoded screenshot using OCR.

    Decoding and OCR run in the default executor because
    ``pytesseract.image_to_string`` is a blocking call; running it directly
    in this coroutine would stall the event loop (and every other request)
    for the duration of the OCR pass.

    Args:
        screenshot_base64: Base64-encoded image data. Empty or ``None``
            short-circuits to an empty result.

    Returns:
        The recognised text with surrounding whitespace stripped, or ``""``
        when there is no screenshot or when decoding/OCR fails (the error is
        printed together with a traceback).
    """
    if not screenshot_base64:
        return ""

    def _run_ocr() -> str:
        # Decode base64 to image; the context manager releases the image
        # resources as soon as OCR is finished.
        image_bytes = base64.b64decode(screenshot_base64)
        with Image.open(io.BytesIO(image_bytes)) as image:
            return pytesseract.image_to_string(image).strip()

    try:
        loop = asyncio.get_running_loop()
        return await loop.run_in_executor(None, _run_ocr)
    except Exception as e:
        # OCR is a best-effort enrichment of the browser state: never let it
        # fail the surrounding browser action.
        print(f"Error performing OCR: {e}")
        traceback.print_exc()
        return ""
async def get_updated_browser_state(self, action_name: str) -> tuple:
"""Helper method to get updated browser state after any action
Returns a tuple of (dom_state, screenshot, elements, metadata)
@ -686,6 +717,12 @@ class BrowserAutomation:
metadata['viewport_width'] = 0
metadata['viewport_height'] = 0
# Extract OCR text from screenshot if available
ocr_text = ""
if screenshot:
ocr_text = await self.extract_ocr_text_from_screenshot(screenshot)
metadata['ocr_text'] = ocr_text
print(f"Got updated state after {action_name}: {len(dom_state.selector_map)} elements")
return dom_state, screenshot, elements, metadata
except Exception as e:
@ -713,6 +750,7 @@ class BrowserAutomation:
pixels_above=dom_state.pixels_above if dom_state else 0,
pixels_below=dom_state.pixels_below if dom_state else 0,
content=content,
ocr_text=metadata.get('ocr_text', ""),
element_count=metadata.get('element_count', 0),
interactive_elements=metadata.get('interactive_elements', []),
viewport_width=metadata.get('viewport_width', 0),
@ -885,6 +923,59 @@ class BrowserAutomation:
# Element Interaction Actions
async def click_coordinates(self, action: ClickCoordinatesAction = Body(...)):
    """Click at specific x,y coordinates on the page.

    Args:
        action: Request body carrying the ``x`` and ``y`` coordinates that
            are passed to ``page.mouse.click``.

    Returns:
        The result of ``build_action_result``: a success result with the
        refreshed browser state when the click went through, otherwise a
        failure result carrying the error message (with refreshed state when
        it could still be collected, or empty state when it could not).
    """
    try:
        page = await self.get_current_page()

        # Perform the click at the specified coordinates
        await page.mouse.click(action.x, action.y)

        # Give time for any navigation or DOM updates to occur. This wait is
        # best-effort: a page that keeps polling never becomes network-idle,
        # and that timeout must not turn a click that already happened into
        # a reported failure.
        try:
            await page.wait_for_load_state("networkidle", timeout=5000)
        except Exception as wait_error:
            print(f"Page did not reach network idle after click_coordinates: {wait_error}")

        # Get updated state after action
        dom_state, screenshot, elements, metadata = await self.get_updated_browser_state(
            f"click_coordinates({action.x}, {action.y})"
        )

        return self.build_action_result(
            True,
            f"Clicked at coordinates ({action.x}, {action.y})",
            dom_state,
            screenshot,
            elements,
            metadata,
            error="",
            content=None
        )
    except Exception as e:
        print(f"Error in click_coordinates: {e}")
        traceback.print_exc()

        # Try to get state even after error
        try:
            dom_state, screenshot, elements, metadata = await self.get_updated_browser_state(
                "click_coordinates_error_recovery"
            )
            return self.build_action_result(
                False,
                str(e),
                dom_state,
                screenshot,
                elements,
                metadata,
                error=str(e),
                content=None
            )
        except Exception:
            # State recovery failed as well: report the original error with
            # empty state. `except Exception` (not a bare `except`) lets task
            # cancellation and KeyboardInterrupt propagate.
            return self.build_action_result(
                False,
                str(e),
                None,
                "",
                "",
                {},
                error=str(e),
                content=None
            )
async def click_element(self, action: ClickElementAction = Body(...)):
"""Click on an element by index"""
try:
@ -1730,6 +1821,18 @@ async def test_browser_api():
print(f"\nScreenshot captured: {'Yes' if result.screenshot_base64 else 'No'}")
print(f"Viewport size: {result.viewport_width}x{result.viewport_height}")
# Test OCR extraction from screenshot
print("\n--- Testing OCR Text Extraction ---")
if result.ocr_text:
print("OCR text extracted from screenshot:")
print("=== OCR TEXT START ===")
print(result.ocr_text)
print("=== OCR TEXT END ===")
print(f"OCR text length: {len(result.ocr_text)} characters")
print(result.ocr_text)
else:
print("No OCR text extracted from screenshot")
await asyncio.sleep(2)
# Test search functionality
@ -1741,6 +1844,15 @@ async def test_browser_api():
else:
print(f"Found {result.element_count} elements after search")
print(f"Page title: {result.title}")
# Test OCR extraction from search results
if result.ocr_text:
print("\nOCR text from search results:")
print("=== OCR TEXT START ===")
print(result.ocr_text)
print("=== OCR TEXT END ===")
else:
print("\nNo OCR text extracted from search results")
await asyncio.sleep(2)
@ -1766,6 +1878,15 @@ async def test_browser_api():
await asyncio.sleep(2)
# Test clicking on coordinates
print("\n--- Testing Click Coordinates ---")
coord_click_result = await automation_service.click_coordinates(ClickCoordinatesAction(x=100, y=100))
print(f"Coordinate click status: {'✅ Success' if coord_click_result.success else '❌ Failed'}")
print(f"Message: {coord_click_result.message}")
print(f"URL after coordinate click: {coord_click_result.url}")
await asyncio.sleep(2)
# Test extracting content
print("\n--- Testing Content Extraction ---")
content_result = await automation_service.extract_content("test goal")

View File

@ -6,7 +6,7 @@ services:
dockerfile: ${DOCKERFILE:-Dockerfile}
args:
TARGETPLATFORM: ${TARGETPLATFORM:-linux/amd64}
image: adamcohenhillel/kortix-suna:0.0.16
image: adamcohenhillel/kortix-suna:0.0.18
ports:
- "6080:6080" # noVNC web interface
- "5901:5901" # VNC port

View File

@ -2,4 +2,5 @@ fastapi==0.115.12
uvicorn==0.34.0
pyautogui==0.9.54
pillow==10.2.0
pydantic==2.6.1
pydantic==2.6.1
pytesseract==0.3.13

View File

@ -96,7 +96,7 @@ def create_sandbox(password: str):
logger.debug("OPENAI_API_KEY configured for sandbox")
sandbox = daytona.create(CreateSandboxParams(
image="adamcohenhillel/kortix-suna:0.0.16",
image="adamcohenhillel/kortix-suna:0.0.18",
public=True,
env_vars={
"CHROME_PERSISTENT_SESSION": "true",
@ -116,7 +116,8 @@ def create_sandbox(password: str):
5900, # VNC port
5901, # VNC port
9222, # Chrome remote debugging port
8080 # HTTP website port
8080, # HTTP website port
8002, # The browser api port
]
))
logger.info(f"Sandbox created with ID: {sandbox.id}")