Merge branch 'PRODUCTION' into sync/production

2025-06-05 07:31:54 +00:00 · 2025-06-05 07:31:54 +00:00 · 4ef4eeceb1
parent 11e8b4f7bd a1a71a05de
commit 4ef4eeceb1
9 changed files with 61 additions and 22 deletions
--- a/backend/agent/run.py
+++ b/backend/agent/run.py
@ -290,7 +290,9 @@ async def run_agent(

    latest_user_message = await client.table('messages').select('*').eq('thread_id', thread_id).eq('type', 'user').order('created_at', desc=True).limit(1).execute()
    if latest_user_message.data and len(latest_user_message.data) > 0:
-        data = json.loads(latest_user_message.data[0]['content'])
+        data = latest_user_message.data[0]['content']
+        if isinstance(data, str):
+            data = json.loads(data)
        trace.update(input=data['content'])

    while continue_execution and iteration_count < max_iterations:
@ -327,14 +329,16 @@ async def run_agent(
        latest_browser_state_msg = await client.table('messages').select('*').eq('thread_id', thread_id).eq('type', 'browser_state').order('created_at', desc=True).limit(1).execute()
        if latest_browser_state_msg.data and len(latest_browser_state_msg.data) > 0:
            try:
-                browser_content = json.loads(latest_browser_state_msg.data[0]["content"])
+                browser_content = latest_browser_state_msg.data[0]["content"]
+                if isinstance(browser_content, str):
+                    browser_content = json.loads(browser_content)
                screenshot_base64 = browser_content.get("screenshot_base64")
-                screenshot_url = browser_content.get("screenshot_url")
+                screenshot_url = browser_content.get("image_url")
                
                # Create a copy of the browser state without screenshot data
                browser_state_text = browser_content.copy()
                browser_state_text.pop('screenshot_base64', None)
-                browser_state_text.pop('screenshot_url', None)
+                browser_state_text.pop('image_url', None)

                if browser_state_text:
                    temp_message_content_list.append({
@ -348,6 +352,7 @@ async def run_agent(
                        "type": "image_url",
                        "image_url": {
                            "url": screenshot_url,
+                            "format": "image/jpeg"
                        }
                    })
                elif screenshot_base64:
@ -369,7 +374,7 @@ async def run_agent(
        latest_image_context_msg = await client.table('messages').select('*').eq('thread_id', thread_id).eq('type', 'image_context').order('created_at', desc=True).limit(1).execute()
        if latest_image_context_msg.data and len(latest_image_context_msg.data) > 0:
            try:
-                image_context_content = json.loads(latest_image_context_msg.data[0]["content"])
+                image_context_content = latest_image_context_msg.data[0]["content"] if isinstance(latest_image_context_msg.data[0]["content"], dict) else json.loads(latest_image_context_msg.data[0]["content"])
                base64_image = image_context_content.get("base64")
                mime_type = image_context_content.get("mime_type")
                file_path = image_context_content.get("file_path", "unknown file")
--- a/backend/sandbox/README.md
+++ b/backend/sandbox/README.md
@ -20,7 +20,7 @@ You can modify the sandbox environment for development or to add new capabilitie
   ```
   cd backend/sandbox/docker
   docker compose build
-   docker push kortix/suna:0.1.2
+   docker push kortix/suna:0.1.3
   ```
 3. Test your changes locally using docker-compose

--- a/backend/sandbox/docker/Dockerfile
+++ b/backend/sandbox/docker/Dockerfile
@ -96,11 +96,6 @@ WORKDIR /app
 COPY requirements.txt .
 RUN pip install --no-cache-dir -r requirements.txt

-# Copy server script
-COPY . /app
-COPY server.py /app/server.py
-COPY browser_api.py /app/browser_api.py
-
 # Install Playwright and browsers with system dependencies
 ENV PLAYWRIGHT_BROWSERS_PATH=/ms-playwright
 # Install Playwright package first
@ -111,6 +106,11 @@ RUN playwright install chromium
 # Verify installation
 RUN python -c "from playwright.sync_api import sync_playwright; print('Playwright installation verified')"

+# Copy server script
+COPY . /app
+COPY server.py /app/server.py
+COPY browser_api.py /app/browser_api.py
+
 # Set environment variables
 ENV PYTHONUNBUFFERED=1
 ENV CHROME_PATH=/ms-playwright/chromium-*/chrome-linux/chrome
--- a/backend/sandbox/docker/browser_api.py
+++ b/backend/sandbox/docker/browser_api.py
@ -1,5 +1,5 @@
 from fastapi import FastAPI, APIRouter, HTTPException, Body
-from playwright.async_api import async_playwright, Browser, Page
+from playwright.async_api import async_playwright, Browser, BrowserContext, Page
 from pydantic import BaseModel
 from typing import Optional, List, Dict, Any
 import asyncio
@ -282,6 +282,7 @@ class BrowserAutomation:
    def __init__(self):
        self.router = APIRouter()
        self.browser: Browser = None
+        self.browser_context: BrowserContext = None
        self.pages: List[Page] = []
        self.current_page_index: int = 0
        self.logger = logging.getLogger("browser_automation")
@ -341,6 +342,7 @@ class BrowserAutomation:
            
            try:
                self.browser = await playwright.chromium.launch(**launch_options)
+                self.browser_context = await self.browser.new_context(viewport={'width': 1024, 'height': 768})
                print("Browser launched successfully")
            except Exception as browser_error:
                print(f"Failed to launch browser: {browser_error}")
@ -348,6 +350,7 @@ class BrowserAutomation:
                print("Retrying with minimal options...")
                launch_options = {"timeout": 90000}
                self.browser = await playwright.chromium.launch(**launch_options)
+                self.browser_context = await self.browser.new_context(viewport={'width': 1024, 'height': 768})
                print("Browser launched with minimal options")

            try:
@ -356,13 +359,20 @@ class BrowserAutomation:
                self.current_page_index = 0
            except Exception as page_error:
                print(f"Error finding existing page, creating new one. ( {page_error})")
-                page = await self.browser.new_page(viewport={'width': 1024, 'height': 768})
+                page = await self.browser_context.new_page()
                print("New page created successfully")
                self.pages.append(page)
                self.current_page_index = 0
                # Navigate directly to google.com instead of about:blank
                await page.goto("https://www.google.com", wait_until="domcontentloaded", timeout=30000)
                print("Navigated to google.com")
+            
+            try:
+                self.browser_context.on("page", self.handle_page_created)
+            except Exception as e:
+                print(f"Error setting up page event handler: {e}")
+                traceback.print_exc()
+
                
                print("Browser initialization completed successfully")
        except Exception as e:
@ -372,8 +382,17 @@ class BrowserAutomation:
            
    async def shutdown(self):
        """Clean up browser instance on shutdown"""
+        if self.browser_context:
+            await self.browser_context.close()
        if self.browser:
            await self.browser.close()
+
+    async def handle_page_created(self, page: Page):
+        """Handle new page creation"""
+        await asyncio.sleep(0.5)
+        self.pages.append(page)
+        self.current_page_index = len(self.pages) - 1
+        print(f"Page created: {page.url}; current page index: {self.current_page_index}")
    
    async def get_current_page(self) -> Page:
        """Get the current active page"""
@ -958,6 +977,7 @@ class BrowserAutomation:
            # Give time for any navigation or DOM updates to occur
            await page.wait_for_load_state("networkidle", timeout=5000)
            
+            await asyncio.sleep(1)
            # Get updated state after action
            dom_state, screenshot, elements, metadata = await self.get_updated_browser_state(f"click_coordinates({action.x}, {action.y})")
            
@ -977,6 +997,7 @@ class BrowserAutomation:
            
            # Try to get state even after error
            try:
+                await asyncio.sleep(1)
                dom_state, screenshot, elements, metadata = await self.get_updated_browser_state("click_coordinates_error_recovery")
                return self.build_action_result(
                    False,
@ -1076,7 +1097,7 @@ class BrowserAutomation:
                await page.wait_for_load_state("networkidle", timeout=5000)
            except Exception as wait_error:
                print(f"Timeout or error waiting for network idle after click: {wait_error}")
-                await asyncio.sleep(1) # Fallback wait
+            await asyncio.sleep(1)

            # Get updated state after action
            dom_state, screenshot, elements, metadata = await self.get_updated_browser_state(f"click_element({action.index})")
@ -1161,6 +1182,7 @@ class BrowserAutomation:
                # Fallback to xpath
                await page.fill(f"//{element.tag_name}[{action.index}]", action.text)
            
+            await asyncio.sleep(1)
            # Get updated state after action
            dom_state, screenshot, elements, metadata = await self.get_updated_browser_state(f"input_text({action.index}, '{action.text}')")
            
@ -1192,6 +1214,7 @@ class BrowserAutomation:
            page = await self.get_current_page()
            await page.keyboard.press(action.keys)
            
+            await asyncio.sleep(1)
            # Get updated state after action
            dom_state, screenshot, elements, metadata = await self.get_updated_browser_state(f"send_keys({action.keys})")
            
@ -1267,7 +1290,7 @@ class BrowserAutomation:
        try:
            print(f"Attempting to open new tab with URL: {action.url}")
            # Create new page in same browser instance
-            new_page = await self.browser.new_page()
+            new_page = await self.browser_context.new_page()
            print(f"New page created successfully")
            
            # Navigate to the URL
--- a/backend/sandbox/docker/docker-compose.yml
+++ b/backend/sandbox/docker/docker-compose.yml
@ -6,12 +6,12 @@ services:
      dockerfile: ${DOCKERFILE:-Dockerfile}
      args:
        TARGETPLATFORM: ${TARGETPLATFORM:-linux/amd64}
-    image: kortix/suna:0.1.2.8
+    image: kortix/suna:0.1.3
    ports:
      - "6080:6080"  # noVNC web interface
      - "5901:5901"  # VNC port
      - "9222:9222"  # Chrome remote debugging port
-      - "8000:8000"  # API server port
+      - "8003:8003"  # API server port
      - "8080:8080"  # HTTP server port
    environment:
      - ANONYMIZED_TELEMETRY=${ANONYMIZED_TELEMETRY:-false}
--- a/backend/utils/config.py
+++ b/backend/utils/config.py
@ -159,7 +159,7 @@ class Configuration:
    STRIPE_PRODUCT_ID_STAGING: str = 'prod_SCgIj3G7yPOAWY'
    
    # Sandbox configuration
-    SANDBOX_IMAGE_NAME = "kortix/suna:0.1.2.8"
+    SANDBOX_IMAGE_NAME = "kortix/suna:0.1.3"
    SANDBOX_ENTRYPOINT = "/usr/bin/supervisord -n -c /etc/supervisor/conf.d/supervisord.conf"

    # LangFuse configuration
--- a/docs/SELF-HOSTING.md
+++ b/docs/SELF-HOSTING.md
@ -115,7 +115,7 @@ As part of the setup, you'll need to:
 1. Create a Daytona account
 2. Generate an API key
 3. Create a Docker image:
-   - Image name: `kortix/suna:0.1.2.8`
+   - Image name: `kortix/suna:0.1.3`
   - Entrypoint: `/usr/bin/supervisord -n -c /etc/supervisor/conf.d/supervisord.conf`

 ## Manual Configuration
--- a/frontend/src/components/thread/tool-views/BrowserToolView.tsx
+++ b/frontend/src/components/thread/tool-views/BrowserToolView.tsx
@ -65,7 +65,7 @@ export function BrowserToolView({
  const [imageError, setImageError] = React.useState(false);

  try {
-    const topLevelParsed = safeJsonParse<{ content?: string }>(toolContent, {});
+    const topLevelParsed = safeJsonParse<{ content?: any }>(toolContent, {});
    const innerContentString = topLevelParsed?.content || toolContent;
    if (innerContentString && typeof innerContentString === 'string') {
      const toolResultMatch = innerContentString.match(/ToolResult\([^)]*output='([\s\S]*?)'(?:\s*,|\s*\))/);
@ -116,7 +116,18 @@ export function BrowserToolView({
          screenshotUrl = finalParsedOutput?.image_url || null;
        }
      }
-    }
+    } else if (innerContentString && typeof innerContentString === "object") {
+        screenshotUrl = (() => {
+          if (!innerContentString) return null;
+          if (!("tool_execution" in innerContentString)) return null;
+          if (!("result" in innerContentString.tool_execution)) return null;
+          if (!("output" in innerContentString.tool_execution.result)) return null;
+          if (!("image_url" in innerContentString.tool_execution.result.output)) return null;
+          if (typeof innerContentString.tool_execution.result.output.image_url !== "string") return null;
+          return innerContentString.tool_execution.result.output.image_url;
+        })()
+      }
+    
  } catch (error) {
  }

--- a/setup.py
+++ b/setup.py
@ -237,7 +237,7 @@ def collect_daytona_info():
    print_info("Then, generate an API key from 'Keys' menu")
    print_info("After that, go to Images (https://app.daytona.io/dashboard/images)")
    print_info("Click '+ Create Image'")
-    print_info(f"Enter 'kortix/suna:0.1.2.8' as the image name")
+    print_info(f"Enter 'kortix/suna:0.1.3' as the image name")
    print_info(f"Set '/usr/bin/supervisord -n -c /etc/supervisor/conf.d/supervisord.conf' as the Entrypoint")

    input("Press Enter to continue once you've completed these steps...")