diff --git a/backend/core/tools/image_context_manager.py b/backend/core/tools/image_context_manager.py index 65685e06..111183bb 100644 --- a/backend/core/tools/image_context_manager.py +++ b/backend/core/tools/image_context_manager.py @@ -69,23 +69,79 @@ class ImageContextManager: }).execute() result = db_result.data[0] if db_result.data and len(db_result.data) > 0 else None - if result: - logger.debug(f"Added image to context: {file_path}") - return result - else: + if not result: logger.error("Failed to insert image message") return None + + logger.debug(f"Added image to context: {file_path}") + + # ===== CRITICAL: Add image context management instruction ===== + # This hardcoded instruction ensures images are properly managed and removed + context_instruction = { + "role": "user", + "content": """⚠️ IMPORTANT - IMAGE CONTEXT MANAGEMENT: + + You are now viewing an image that has been loaded into context. Due to context window limitations, this image WILL BE AUTOMATICALLY REMOVED after you analyze it. + + REQUIRED ACTIONS: + 1. **Analyze the image thoroughly** - Look at all details, text, UI elements, colors, layout, etc. + 2. **Write a DETAILED SUMMARY** - Describe what you see in comprehensive detail so you can reference it later. Include: + - All visible text and labels + - UI components and their states + - Colors, layout, and visual hierarchy + - Any errors, warnings, or important information + - Relationships between elements + 3. **Call clear_images_from_context** - You MUST call this tool after your analysis to free up context tokens + + WHY THIS MATTERS: + - Images consume significant context tokens + - You will NOT see this image again after it's cleared (unless explicitly reloaded with load_image) + - Your written summary is your only future reference to this image + - Failing to clear images will cause context overflow + + REMEMBER: Be thorough in your summary - it's your permanent record of what you saw!""" + } + + context_instruction_metadata = { + "image_context": True, + "instruction_type": "context_management", + "related_file": file_path + } + + # Add the context management instruction + if self.thread_manager: + await self.thread_manager.add_message( + thread_id=thread_id, + type='user', + content=context_instruction, + is_llm_message=True, + metadata=context_instruction_metadata + ) + else: + # Fallback to direct DB access + client = await self.db.client + await client.table('messages').insert({ + 'thread_id': thread_id, + 'type': 'user', + 'content': context_instruction, + 'is_llm_message': True, + 'metadata': context_instruction_metadata + }).execute() + + logger.debug(f"Added context management instruction for image: {file_path}") + + return result except Exception as e: logger.error(f"Failed to add image to context: {str(e)}", exc_info=True) return None async def clear_images_from_context(self, thread_id: str) -> int: - """Remove all image context messages from a thread.""" + """Remove all image context messages from a thread, including images and their management instructions.""" try: client = await self.db.client - # Delete all messages with image_context metadata + # Delete all messages with image_context metadata (includes both images and instructions) result = await client.table('messages').delete().eq( 'thread_id', thread_id ).eq( @@ -95,7 +151,7 @@ class ImageContextManager: ).execute() deleted_count = len(result.data) if result.data else 0 - logger.debug(f"Cleared {deleted_count} images from context") + logger.debug(f"Cleared {deleted_count} image-related messages from context (images + instructions)") return deleted_count except Exception as e: diff --git a/backend/core/tools/sb_vision_tool.py b/backend/core/tools/sb_vision_tool.py index 66634eab..3a1a5f15 100644 --- a/backend/core/tools/sb_vision_tool.py +++ b/backend/core/tools/sb_vision_tool.py @@ -165,11 +165,9 @@ class SandboxVisionTool(SandboxToolsBase): os.unlink(temp_svg_path) except ImportError: - print(f"[SeeImage] SVG conversion not available - using original SVG file '{file_path}'") - return image_bytes, mime_type + raise Exception(f"SVG conversion libraries not available. Cannot display SVG file '{file_path}'. Please convert to PNG manually.") except Exception as e: - print(f"[SeeImage] SVG conversion failed - using original SVG file '{file_path}': {str(e)}") - return image_bytes, mime_type + raise Exception(f"SVG conversion failed for '{file_path}': {str(e)}. Please convert to PNG manually.") # Open image from bytes img = Image.open(BytesIO(image_bytes)) @@ -220,8 +218,14 @@ class SandboxVisionTool(SandboxToolsBase): return compressed_bytes, output_mime except Exception as e: - print(f"[SeeImage] Failed to compress image: {str(e)}. Using original.") - return image_bytes, mime_type + # CRITICAL: Never return unsupported formats + # If compression fails, we need to ensure we still return a supported format + if mime_type in ['image/jpeg', 'image/png', 'image/gif', 'image/webp']: + print(f"[SeeImage] Failed to compress image: {str(e)}. Using original (format is supported).") + return image_bytes, mime_type + else: + # Unsupported format and compression failed - must fail + raise Exception(f"Failed to process image '{file_path}' with unsupported format '{mime_type}': {str(e)}") def is_url(self, file_path: str) -> bool: """check if the file path is url""" @@ -265,7 +269,7 @@ class SandboxVisionTool(SandboxToolsBase): "type": "function", "function": { "name": "load_image", - "description": "Loads an image file into conversation context from the /workspace directory or from a URL. Provide either a relative path to a local image or the URL to an image. The image will be compressed before sending to reduce token usage. IMPORTANT: If you previously loaded an image but cleared context, you can load it again by calling this tool with the same file path - no need to ask user to re-upload.", + "description": "Loads an image file into conversation context from the /workspace directory or from a URL. CRITICAL: After loading, you MUST analyze the image thoroughly, write a detailed summary, and then call clear_images_from_context to free context tokens. Images consume significant tokens and must be actively managed. You can reload any image later with the same file path if needed.", "parameters": { "type": "object", "properties": { @@ -353,6 +357,15 @@ class SandboxVisionTool(SandboxToolsBase): print(f"[SeeImage] Warning: Could not save converted PNG to sandbox: {e}") # Continue with original path if save fails + # CRITICAL: Validate MIME type before upload - Anthropic only accepts 4 formats + SUPPORTED_MIME_TYPES = ['image/jpeg', 'image/png', 'image/gif', 'image/webp'] + if compressed_mime_type not in SUPPORTED_MIME_TYPES: + return self.fail_response( + f"Invalid image format '{compressed_mime_type}' after compression. " + f"Only {', '.join(SUPPORTED_MIME_TYPES)} are supported for viewing by the AI. " + f"Original file: '{cleaned_path}'. Please convert the image to a supported format." + ) + # Upload to Supabase Storage instead of base64 try: # Generate unique filename @@ -418,7 +431,7 @@ class SandboxVisionTool(SandboxToolsBase): "type": "function", "function": { "name": "clear_images_from_context", - "description": "Clears all images from conversation memory. Use when done analyzing images or to free up context tokens. IMPORTANT: Files remain accessible - use load_image with the same path to load any image again instead of asking user to re-upload.", + "description": "REQUIRED after viewing images: Removes all images and their instructions from context to free up tokens. You MUST call this after analyzing images. The image files remain accessible in the sandbox - you can reload them later with load_image if needed. This is critical for context management.", "parameters": { "type": "object", "properties": {}, @@ -435,7 +448,13 @@ class SandboxVisionTool(SandboxToolsBase): deleted_count = await self.image_context_manager.clear_images_from_context(self.thread_id) if deleted_count > 0: - return self.success_response(f"Successfully cleared {deleted_count} image(s) from conversation context. Visual memory has been reset.") + # Typically 2 messages per image: the image itself + the context instruction + image_count = deleted_count // 2 + return self.success_response( + f"Successfully cleared approximately {image_count} image(s) and their instructions from conversation context " + f"({deleted_count} total messages removed). Context tokens freed up. " + f"You can reload any image again using load_image if needed." + ) else: return self.success_response("No images found in conversation context to clear.")