Merge pull request #64 from kortix-ai/browser-click-onxy-and-ocr

Browser click XY and OCR
2025-04-20 21:45:45 -07:00 · 2025-04-20 21:45:45 -07:00 · 04ce0e499b
parent 48e8c0ffa9 79740ddf0e
commit 04ce0e499b
9 changed files with 368 additions and 49 deletions
--- a/README.md
+++ b/README.md
@ -126,7 +126,7 @@ You'll need the following components:
   - Generate an API key from your account settings
   - Go to [Images](https://app.daytona.io/dashboard/images)
   - Click "Add Image"
-   - Enter `adamcohenhillel/kortix-suna:0.0.16` as the image name
+   - Enter `adamcohenhillel/kortix-suna:0.0.18` as the image name
   - Set `exec /usr/bin/supervisord -n -c /etc/supervisor/conf.d/supervisord.conf` as the Entrypoint

 4. **LLM API Keys**:
--- a/backend/agent/tools/sb_browser_tool.py
+++ b/backend/agent/tools/sb_browser_tool.py
@ -84,15 +84,18 @@ class SandboxBrowserTool(SandboxToolsBase):
                        success_response["elements_found"] = result["element_count"]
                    if result.get("pixels_below"):
                        success_response["scrollable_content"] = result["pixels_below"] > 0
+                    # Add OCR text when available
+                    if result.get("ocr_text"):
+                        success_response["ocr_text"] = result["ocr_text"]

                    return self.success_response(success_response)

-                except json.JSONDecodeError:
-                    logger.error(f"Failed to parse response JSON: {response.result}")
-                    return self.fail_response(f"Failed to parse response JSON: {response.result}")
+                except json.JSONDecodeError as e:
+                    logger.error(f"Failed to parse response JSON: {response.result} {e}")
+                    return self.fail_response(f"Failed to parse response JSON: {response.result} {e}")
            else:
-                logger.error(f"Browser automation request failed: {response.result}")
-                return self.fail_response(f"Browser automation request failed: {response.result}")
+                logger.error(f"Browser automation request failed 2: {response}")
+                return self.fail_response(f"Browser automation request failed 2: {response}")

        except Exception as e:
            logger.error(f"Error executing browser action: {e}")
@ -847,4 +850,48 @@ class SandboxBrowserTool(SandboxToolsBase):
        else:
            return self.fail_response("Must provide either element selectors or coordinates for drag and drop")
        
-        return await self._execute_browser_action("drag_drop", params)
+        return await self._execute_browser_action("drag_drop", params)
+
+    @openapi_schema({
+        "type": "function",
+        "function": {
+            "name": "browser_click_coordinates",
+            "description": "Click at specific X,Y coordinates on the page",
+            "parameters": {
+                "type": "object",
+                "properties": {
+                    "x": {"type": "integer", "description": "The X coordinate to click"},
+                    "y": {"type": "integer", "description": "The Y coordinate to click"}
+                },
+                "required": ["x", "y"]
+            }
+        }
+    })
+    @xml_schema(
+        tag_name="browser-click-coordinates",
+        mappings=[
+            {"param_name": "x", "node_type": "attribute", "path": "."},
+            {"param_name": "y", "node_type": "attribute", "path": "."}
+        ],
+        example='''
+        <browser-click-coordinates x="100" y="200"></browser-click-coordinates>
+        '''
+    )
+    async def browser_click_coordinates(self, x: int, y: int) -> ToolResult:
+        """Issue a mouse click at the given X,Y coordinates on the page.
+
+        Args:
+            x (int): The X coordinate of the click target
+            y (int): The Y coordinate of the click target
+
+        Returns:
+            ToolResult: Outcome of the "click_coordinates" browser action, as
+                produced by `_execute_browser_action`
+        """
+        click_params = {"x": x, "y": y}
+        # Magenta console trace of the requested click position.
+        print(f"\033[95mClicking at coordinates: ({x}, {y})\033[0m")
+        return await self._execute_browser_action("click_coordinates", click_params)
--- a/backend/poetry.lock
+++ b/backend/poetry.lock
--- a/backend/pyproject.toml
+++ b/backend/pyproject.toml
@ -50,6 +50,7 @@ streamlit = "^1.44.1"
 nest-asyncio = "^1.6.0"
 vncdotool = "^1.2.0"
 tavily-python = "^0.5.4"
+pytesseract = "^0.3.13"

 [tool.poetry.scripts]
 agentpress = "agentpress.cli:main"
--- a/backend/requirements.txt
+++ b/backend/requirements.txt
@ -23,4 +23,5 @@ python-ripgrep==0.0.6
 daytona_sdk>=0.12.0
 boto3>=1.34.0
 pydantic
-tavily-python>=0.5.4
+tavily-python>=0.5.4
+pytesseract==0.3.13
--- a/backend/sandbox/docker/browser_api.py
+++ b/backend/sandbox/docker/browser_api.py
@ -13,6 +13,9 @@ import os
 import random
 from functools import cached_property
 import traceback
+import pytesseract
+from PIL import Image
+import io

 #######################################################
 # Action model definitions
@ -25,6 +28,10 @@ class Position(BaseModel):
 class ClickElementAction(BaseModel):
    index: int

+class ClickCoordinatesAction(BaseModel):
+    """Request payload for the click-by-coordinates action: integer x and y of the click."""
+    x: int
+    y: int
+
 class GoToUrlAction(BaseModel):
    url: str

@ -257,6 +264,7 @@ class BrowserActionResult(BaseModel):
    pixels_above: int = 0
    pixels_below: int = 0
    content: Optional[str] = None
+    ocr_text: Optional[str] = None  # Added field for OCR text
    
    # Additional metadata
    element_count: int = 0  # Number of interactive elements found
@ -294,6 +302,7 @@ class BrowserAutomation:
        
        # Element interaction
        self.router.post("/automation/click_element")(self.click_element)
+        self.router.post("/automation/click_coordinates")(self.click_coordinates)
        self.router.post("/automation/input_text")(self.input_text)
        self.router.post("/automation/send_keys")(self.send_keys)
        
@ -626,6 +635,28 @@ class BrowserAutomation:
            print(f"Error saving screenshot: {e}")
            return ""
    
+    async def extract_ocr_text_from_screenshot(self, screenshot_base64: str) -> str:
+        """Extract text from a base64-encoded screenshot using OCR.
+
+        The base64 decode, image load and Tesseract call are blocking, so they
+        run in the default executor instead of on the event loop; other
+        requests keep being served while OCR is in progress.
+
+        Args:
+            screenshot_base64: Base64-encoded image data. An empty value
+                short-circuits to an empty result.
+
+        Returns:
+            The recognized text with surrounding whitespace stripped, or an
+            empty string when there is no screenshot or OCR fails for any reason.
+        """
+        if not screenshot_base64:
+            return ""
+
+        def _run_ocr() -> str:
+            # Decode base64 to image; the context manager releases the PIL
+            # image handle as soon as the text has been extracted.
+            image_bytes = base64.b64decode(screenshot_base64)
+            with Image.open(io.BytesIO(image_bytes)) as image:
+                return pytesseract.image_to_string(image).strip()
+
+        try:
+            loop = asyncio.get_running_loop()
+            return await loop.run_in_executor(None, _run_ocr)
+        except Exception as e:
+            # OCR is best-effort: report the problem and fall back to no text.
+            print(f"Error performing OCR: {e}")
+            traceback.print_exc()
+            return ""
+    
    async def get_updated_browser_state(self, action_name: str) -> tuple:
        """Helper method to get updated browser state after any action
        Returns a tuple of (dom_state, screenshot, elements, metadata)
@ -686,6 +717,12 @@ class BrowserAutomation:
                metadata['viewport_width'] = 0
                metadata['viewport_height'] = 0
            
+            # Extract OCR text from screenshot if available
+            ocr_text = ""
+            if screenshot:
+                ocr_text = await self.extract_ocr_text_from_screenshot(screenshot)
+                metadata['ocr_text'] = ocr_text
+            
            print(f"Got updated state after {action_name}: {len(dom_state.selector_map)} elements")
            return dom_state, screenshot, elements, metadata
        except Exception as e:
@ -713,6 +750,7 @@ class BrowserAutomation:
            pixels_above=dom_state.pixels_above if dom_state else 0,
            pixels_below=dom_state.pixels_below if dom_state else 0,
            content=content,
+            ocr_text=metadata.get('ocr_text', ""),
            element_count=metadata.get('element_count', 0),
            interactive_elements=metadata.get('interactive_elements', []),
            viewport_width=metadata.get('viewport_width', 0),
@ -885,6 +923,59 @@ class BrowserAutomation:
    
    # Element Interaction Actions
    
+    async def click_coordinates(self, action: ClickCoordinatesAction = Body(...)):
+        """Click at specific x,y coordinates on the page.
+
+        Args:
+            action: Request body carrying the integer `x` and `y` coordinates
+                of the click.
+
+        Returns:
+            The value of `build_action_result`: a success result with the
+            refreshed browser state when the click goes through; otherwise a
+            failure result carrying the error text, with the refreshed state
+            when it can still be fetched and empty state when it cannot.
+        """
+        try:
+            page = await self.get_current_page()
+
+            # Perform the click at the specified coordinates
+            await page.mouse.click(action.x, action.y)
+
+            # Give time for any navigation or DOM updates to occur. This wait is
+            # best-effort: pages that never become network-idle would otherwise
+            # turn an already-performed click into a reported failure.
+            try:
+                await page.wait_for_load_state("networkidle", timeout=5000)
+            except Exception as wait_error:
+                print(f"Timeout or error waiting for network idle after click_coordinates: {wait_error}")
+
+            # Get updated state after action
+            dom_state, screenshot, elements, metadata = await self.get_updated_browser_state(f"click_coordinates({action.x}, {action.y})")
+
+            return self.build_action_result(
+                True,
+                f"Clicked at coordinates ({action.x}, {action.y})",
+                dom_state,
+                screenshot,
+                elements,
+                metadata,
+                error="",
+                content=None
+            )
+        except Exception as e:
+            print(f"Error in click_coordinates: {e}")
+            traceback.print_exc()
+
+            # Try to get state even after error
+            try:
+                dom_state, screenshot, elements, metadata = await self.get_updated_browser_state("click_coordinates_error_recovery")
+                return self.build_action_result(
+                    False,
+                    str(e),
+                    dom_state,
+                    screenshot,
+                    elements,
+                    metadata,
+                    error=str(e),
+                    content=None
+                )
+            except Exception as recovery_error:
+                # Only ordinary errors are absorbed here; cancellation and
+                # interpreter-exit signals propagate instead of being swallowed.
+                print(f"Error recovering state after click_coordinates failure: {recovery_error}")
+                return self.build_action_result(
+                    False,
+                    str(e),
+                    None,
+                    "",
+                    "",
+                    {},
+                    error=str(e),
+                    content=None
+                )
+    
    async def click_element(self, action: ClickElementAction = Body(...)):
        """Click on an element by index"""
        try:
@ -1730,6 +1821,18 @@ async def test_browser_api():
        print(f"\nScreenshot captured: {'Yes' if result.screenshot_base64 else 'No'}")
        print(f"Viewport size: {result.viewport_width}x{result.viewport_height}")
        
+        # Test OCR extraction from screenshot
+        print("\n--- Testing OCR Text Extraction ---")
+        if result.ocr_text:
+            print("OCR text extracted from screenshot:")
+            print("=== OCR TEXT START ===")
+            print(result.ocr_text)
+            print("=== OCR TEXT END ===")
+            print(f"OCR text length: {len(result.ocr_text)} characters")
+            print(result.ocr_text)
+        else:
+            print("No OCR text extracted from screenshot")
+        
        await asyncio.sleep(2)
        
        # Test search functionality
@ -1741,6 +1844,15 @@ async def test_browser_api():
        else:
            print(f"Found {result.element_count} elements after search")
            print(f"Page title: {result.title}")
+            
+            # Test OCR extraction from search results
+            if result.ocr_text:
+                print("\nOCR text from search results:")
+                print("=== OCR TEXT START ===")
+                print(result.ocr_text)
+                print("=== OCR TEXT END ===")
+            else:
+                print("\nNo OCR text extracted from search results")
        
        await asyncio.sleep(2)

@ -1766,6 +1878,15 @@ async def test_browser_api():
        
        await asyncio.sleep(2)

+        # Test clicking on coordinates
+        print("\n--- Testing Click Coordinates ---")
+        coord_click_result = await automation_service.click_coordinates(ClickCoordinatesAction(x=100, y=100))
+        print(f"Coordinate click status: {'✅ Success' if coord_click_result.success else '❌ Failed'}")
+        print(f"Message: {coord_click_result.message}")
+        print(f"URL after coordinate click: {coord_click_result.url}")
+        
+        await asyncio.sleep(2)
+
        # Test extracting content
        print("\n--- Testing Content Extraction ---")
        content_result = await automation_service.extract_content("test goal")
--- a/backend/sandbox/docker/docker-compose.yml
+++ b/backend/sandbox/docker/docker-compose.yml
@ -6,7 +6,7 @@ services:
      dockerfile: ${DOCKERFILE:-Dockerfile}
      args:
        TARGETPLATFORM: ${TARGETPLATFORM:-linux/amd64}
-    image: adamcohenhillel/kortix-suna:0.0.16
+    image: adamcohenhillel/kortix-suna:0.0.18
    ports:
      - "6080:6080"  # noVNC web interface
      - "5901:5901"  # VNC port
--- a/backend/sandbox/docker/requirements.txt
+++ b/backend/sandbox/docker/requirements.txt
@ -2,4 +2,5 @@ fastapi==0.115.12
 uvicorn==0.34.0
 pyautogui==0.9.54
 pillow==10.2.0
-pydantic==2.6.1
+pydantic==2.6.1
+pytesseract==0.3.13
--- a/backend/sandbox/sandbox.py
+++ b/backend/sandbox/sandbox.py
@ -96,7 +96,7 @@ def create_sandbox(password: str):
        logger.debug("OPENAI_API_KEY configured for sandbox")
    
    sandbox = daytona.create(CreateSandboxParams(
-        image="adamcohenhillel/kortix-suna:0.0.16",
+        image="adamcohenhillel/kortix-suna:0.0.18",
        public=True,
        env_vars={
            "CHROME_PERSISTENT_SESSION": "true",
@ -116,7 +116,8 @@ def create_sandbox(password: str):
            5900,  # VNC port
            5901,  # VNC port
            9222,  # Chrome remote debugging port
-            8080   # HTTP website port
+            8080,   # HTTP website port
+            8002,  # The browser api port
        ]
    ))
    logger.info(f"Sandbox created with ID: {sandbox.id}")