suna/backend/agent/tools/sb_vision_tool.py

import os
import base64
import mimetypes
from typing import Optional

from agentpress.tool import ToolResult, openapi_schema, xml_schema
from sandbox.tool_base import SandboxToolsBase
from agentpress.thread_manager import ThreadManager
import json

# Register the image extensions this tool relies on, in case the platform's
# MIME database does not already know them.
for _extension, _mime_type in (
    (".webp", "image/webp"),
    (".jpg", "image/jpeg"),
    (".jpeg", "image/jpeg"),
    (".png", "image/png"),
    (".gif", "image/gif"),
):
    mimetypes.add_type(_mime_type, _extension)

# Upper bound on the size of an image the tool will load, in bytes (10 MiB).
MAX_IMAGE_SIZE = 10 * 1024 ** 2

class SandboxVisionTool(SandboxToolsBase):
    """Tool for allowing the agent to 'see' images within the sandbox.

    The single exposed action, ``see_image``, reads an image file from the
    sandbox workspace, base64-encodes it and stores it on the thread as an
    ``image_context`` message.
    """

    # Extension -> MIME type for the formats advertised in the tool schema.
    # Used as a fallback when ``mimetypes`` cannot classify the file.
    _EXTENSION_MIME_TYPES = {
        ".jpg": "image/jpeg",
        ".jpeg": "image/jpeg",
        ".png": "image/png",
        ".gif": "image/gif",
        ".webp": "image/webp",
    }
    # The only MIME types this tool accepts (JPG, PNG, GIF, WEBP).
    _SUPPORTED_MIME_TYPES = frozenset(_EXTENSION_MIME_TYPES.values())

    def __init__(self, project_id: str, thread_id: str, thread_manager: ThreadManager):
        """Create the tool for one project/thread pair.

        Args:
            project_id: Identifier forwarded to ``SandboxToolsBase``.
            thread_id: Thread that receives the ``image_context`` messages.
            thread_manager: Manager used to append messages to the thread.
        """
        super().__init__(project_id, thread_manager)
        self.thread_id = thread_id
        # Make thread_manager accessible within the tool instance
        self.thread_manager = thread_manager

    @classmethod
    def _resolve_mime_type(cls, path: str) -> Optional[str]:
        """Return the supported image MIME type for ``path``, or ``None``."""
        guessed, _ = mimetypes.guess_type(path)
        if guessed in cls._SUPPORTED_MIME_TYPES:
            return guessed
        # mimetypes either failed or reported a type outside the supported
        # set (e.g. image/svg+xml); fall back to a plain extension lookup.
        extension = os.path.splitext(path)[1].lower()
        return cls._EXTENSION_MIME_TYPES.get(extension)

    @openapi_schema({
        "type": "function",
        "function": {
            "name": "see_image",
            "description": "Allows the agent to 'see' an image file located in the /workspace directory. Provide the relative path to the image. The image content will be made available in the next turn's context.",
            "parameters": {
                "type": "object",
                "properties": {
                    "file_path": {
                        "type": "string",
                        # The advertised limit is derived from MAX_IMAGE_SIZE so
                        # it cannot drift from the limit that is enforced.
                        "description": f"The relative path to the image file within the /workspace directory (e.g., 'screenshots/image.png'). Supported formats: JPG, PNG, GIF, WEBP. Max size: {MAX_IMAGE_SIZE // (1024 * 1024)}MB."
                    }
                },
                "required": ["file_path"]
            }
        }
    })
    @xml_schema(
        tag_name="see-image",
        mappings=[
            {"param_name": "file_path", "node_type": "attribute", "path": "."}
        ],
        example='''
        <!-- Example: Request to see an image named 'diagram.png' inside the 'docs' folder -->
        <see-image file_path="docs/diagram.png"></see-image>
        '''
    )
    async def see_image(self, file_path: str) -> ToolResult:
        """Read an image from the sandbox and attach it to the thread.

        The file is validated (exists, is not a directory, is within
        ``MAX_IMAGE_SIZE``, has a supported format) before it is downloaded,
        base64-encoded and stored as an ``image_context`` message.

        Args:
            file_path: Path of the image relative to the workspace directory.

        Returns:
            A success response naming the loaded image, or a failure response
            describing why the image could not be loaded. Exceptions are
            converted into failure responses rather than propagated.
        """
        try:
            # Ensure sandbox is initialized
            await self._ensure_sandbox()

            # Clean and construct full path
            cleaned_path = self.clean_path(file_path)
            full_path = f"{self.workspace_path}/{cleaned_path}"

            # Check if file exists and get info
            try:
                file_info = self.sandbox.fs.get_file_info(full_path)
            except Exception:
                return self.fail_response(f"Image file not found at path: '{cleaned_path}'")

            if file_info.is_dir:
                return self.fail_response(f"Path '{cleaned_path}' is a directory, not an image file.")

            # Check file size
            if file_info.size > MAX_IMAGE_SIZE:
                return self.fail_response(f"Image file '{cleaned_path}' is too large ({file_info.size / (1024*1024):.2f}MB). Maximum size is {MAX_IMAGE_SIZE / (1024*1024)}MB.")

            # Validate the format before downloading so unsupported files are
            # rejected without transferring and encoding their content.
            mime_type = self._resolve_mime_type(full_path)
            if mime_type is None:
                return self.fail_response(f"Unsupported or unknown image format for file: '{cleaned_path}'. Supported: JPG, PNG, GIF, WEBP.")

            # Read image file content
            try:
                image_bytes = self.sandbox.fs.download_file(full_path)
            except Exception:
                return self.fail_response(f"Could not read image file: {cleaned_path}")

            # Convert to base64
            base64_image = base64.b64encode(image_bytes).decode('utf-8')

            # Prepare the temporary message content
            image_context_data = {
                "mime_type": mime_type,
                "base64": base64_image,
                "file_path": cleaned_path,  # Include path for context
            }

            # Add the temporary message using the thread_manager callback,
            # under the distinct 'image_context' type.
            await self.thread_manager.add_message(
                thread_id=self.thread_id,
                type="image_context",
                content=image_context_data,  # Store the dict directly
                is_llm_message=False,  # This is context generated by a tool
            )

            # Inform the agent the image will be available next turn
            return self.success_response(f"Successfully loaded the image '{cleaned_path}'.")

        except Exception as e:
            return self.fail_response(f"An unexpected error occurred while trying to see the image: {str(e)}")
support images 2025-04-28 08:28:21 +08:00			`import os`
			`import base64`
			`import mimetypes`
			`from typing import Optional`

			`from agentpress.tool import ToolResult, openapi_schema, xml_schema`
sandbox tool base 2025-05-10 10:26:42 +08:00			`from sandbox.tool_base import SandboxToolsBase`
support images 2025-04-28 08:28:21 +08:00			`from agentpress.thread_manager import ThreadManager`
			`import json`

			`# Add common image MIME types if mimetypes module is limited`
			`mimetypes.add_type("image/webp", ".webp")`
			`mimetypes.add_type("image/jpeg", ".jpg")`
			`mimetypes.add_type("image/jpeg", ".jpeg")`
			`mimetypes.add_type("image/png", ".png")`
			`mimetypes.add_type("image/gif", ".gif")`

			`# Maximum file size in bytes (e.g., 5MB)`
			`MAX_IMAGE_SIZE = 10 * 1024 * 1024`

			`class SandboxVisionTool(SandboxToolsBase):`
			`"""Tool for allowing the agent to 'see' images within the sandbox."""`

			`def __init__(self, project_id: str, thread_id: str, thread_manager: ThreadManager):`
			`super().__init__(project_id, thread_manager)`
			`self.thread_id = thread_id`
			`# Make thread_manager accessible within the tool instance`
			`self.thread_manager = thread_manager`

			`@openapi_schema({`
			`"type": "function",`
			`"function": {`
			`"name": "see_image",`
			`"description": "Allows the agent to 'see' an image file located in the /workspace directory. Provide the relative path to the image. The image content will be made available in the next turn's context.",`
			`"parameters": {`
			`"type": "object",`
			`"properties": {`
			`"file_path": {`
			`"type": "string",`
			`"description": "The relative path to the image file within the /workspace directory (e.g., 'screenshots/image.png'). Supported formats: JPG, PNG, GIF, WEBP. Max size: 5MB."`
			`}`
			`},`
			`"required": ["file_path"]`
			`}`
			`}`
			`})`
			`@xml_schema(`
			`tag_name="see-image",`
			`mappings=[`
			`{"param_name": "file_path", "node_type": "attribute", "path": "."}`
			`],`
			`example='''`
			`<!-- Example: Request to see an image named 'diagram.png' inside the 'docs' folder -->`
			`<see-image file_path="docs/diagram.png"></see-image>`
			`'''`
			`)`
			`async def see_image(self, file_path: str) -> ToolResult:`
			`"""Reads an image file, converts it to base64, and adds it as a temporary message."""`
			`try:`
			`# Ensure sandbox is initialized`
			`await self._ensure_sandbox()`

			`# Clean and construct full path`
			`cleaned_path = self.clean_path(file_path)`
			`full_path = f"{self.workspace_path}/{cleaned_path}"`

			`# Check if file exists and get info`
			`try:`
			`file_info = self.sandbox.fs.get_file_info(full_path)`
			`if file_info.is_dir:`
			`return self.fail_response(f"Path '{cleaned_path}' is a directory, not an image file.")`
			`except Exception as e:`
			`return self.fail_response(f"Image file not found at path: '{cleaned_path}'")`

			`# Check file size`
			`if file_info.size > MAX_IMAGE_SIZE:`
			`return self.fail_response(f"Image file '{cleaned_path}' is too large ({file_info.size / (1024*1024):.2f}MB). Maximum size is {MAX_IMAGE_SIZE / (1024*1024)}MB.")`

			`# Read image file content`
			`try:`
			`image_bytes = self.sandbox.fs.download_file(full_path)`
			`except Exception as e:`
			`return self.fail_response(f"Could not read image file: {cleaned_path}")`

			`# Convert to base64`
			`base64_image = base64.b64encode(image_bytes).decode('utf-8')`

			`# Determine MIME type`
			`mime_type, _ = mimetypes.guess_type(full_path)`
			`if not mime_type or not mime_type.startswith('image/'):`
			`# Basic fallback based on extension if mimetypes fails`
			`ext = os.path.splitext(cleaned_path)[1].lower()`
			`if ext == '.jpg' or ext == '.jpeg': mime_type = 'image/jpeg'`
			`elif ext == '.png': mime_type = 'image/png'`
			`elif ext == '.gif': mime_type = 'image/gif'`
			`elif ext == '.webp': mime_type = 'image/webp'`
			`else:`
			`return self.fail_response(f"Unsupported or unknown image format for file: '{cleaned_path}'. Supported: JPG, PNG, GIF, WEBP.")`

			`# Prepare the temporary message content`
			`image_context_data = {`
			`"mime_type": mime_type,`
			`"base64": base64_image,`
			`"file_path": cleaned_path # Include path for context`
			`}`

			`# Add the temporary message using the thread_manager callback`
			`# Use a distinct type like 'image_context'`
			`await self.thread_manager.add_message(`
			`thread_id=self.thread_id,`
			`type="image_context", # Use a specific type for this`
			`content=image_context_data, # Store the dict directly`
			`is_llm_message=False # This is context generated by a tool`
			`)`

			`# Inform the agent the image will be available next turn`
			`return self.success_response(f"Successfully loaded the image '{cleaned_path}'.")`

			`except Exception as e:`
			`return self.fail_response(f"An unexpected error occurred while trying to see the image: {str(e)}")`