Merge pull request #1786 from KrishavRajSingh/main

Remove image from context
2025-10-08 07:43:20 +05:30 · 2025-10-08 07:43:20 +05:30 · cc5cc54ad9
parent ae04dddf65 a5d8edabc9
commit cc5cc54ad9
2 changed files with 91 additions and 16 deletions
--- a/backend/core/tools/image_context_manager.py
+++ b/backend/core/tools/image_context_manager.py
@ -69,23 +69,79 @@ class ImageContextManager:
                }).execute()
                result = db_result.data[0] if db_result.data and len(db_result.data) > 0 else None
            
-            if result:
-                logger.debug(f"Added image to context: {file_path}")
-                return result
-            else:
+            if not result:
                logger.error("Failed to insert image message")
                return None
+            
+            logger.debug(f"Added image to context: {file_path}")
+            
+            # ===== CRITICAL: Add image context management instruction =====
+            # This hardcoded instruction ensures images are properly managed and removed
+            context_instruction = {
+                "role": "user",
+                "content": """⚠️ IMPORTANT - IMAGE CONTEXT MANAGEMENT:
+
+                You are now viewing an image that has been loaded into context. Due to context window limitations, this image WILL BE AUTOMATICALLY REMOVED after you analyze it.
+
+                REQUIRED ACTIONS:
+                1. **Analyze the image thoroughly** - Look at all details, text, UI elements, colors, layout, etc.
+                2. **Write a DETAILED SUMMARY** - Describe what you see in comprehensive detail so you can reference it later. Include:
+                - All visible text and labels
+                - UI components and their states
+                - Colors, layout, and visual hierarchy
+                - Any errors, warnings, or important information
+                - Relationships between elements
+                3. **Call clear_images_from_context** - You MUST call this tool after your analysis to free up context tokens
+
+                WHY THIS MATTERS:
+                - Images consume significant context tokens
+                - You will NOT see this image again after it's cleared (unless explicitly reloaded with load_image)
+                - Your written summary is your only future reference to this image
+                - Failing to clear images will cause context overflow
+
+                REMEMBER: Be thorough in your summary - it's your permanent record of what you saw!"""
+            }
+            
+            context_instruction_metadata = {
+                "image_context": True,
+                "instruction_type": "context_management",
+                "related_file": file_path
+            }
+            
+            # Add the context management instruction
+            if self.thread_manager:
+                await self.thread_manager.add_message(
+                    thread_id=thread_id,
+                    type='user',
+                    content=context_instruction,
+                    is_llm_message=True,
+                    metadata=context_instruction_metadata
+                )
+            else:
+                # Fallback to direct DB access
+                client = await self.db.client
+                await client.table('messages').insert({
+                    'thread_id': thread_id,
+                    'type': 'user',
+                    'content': context_instruction,
+                    'is_llm_message': True,
+                    'metadata': context_instruction_metadata
+                }).execute()
+            
+            logger.debug(f"Added context management instruction for image: {file_path}")
+            
+            return result
                
        except Exception as e:
            logger.error(f"Failed to add image to context: {str(e)}", exc_info=True)
            return None
    
    async def clear_images_from_context(self, thread_id: str) -> int:
-        """Remove all image context messages from a thread."""
+        """Remove all image context messages from a thread, including images and their management instructions."""
        try:
            client = await self.db.client
            
-            # Delete all messages with image_context metadata
+            # Delete all messages with image_context metadata (includes both images and instructions)
            result = await client.table('messages').delete().eq(
                'thread_id', thread_id
            ).eq(
@ -95,7 +151,7 @@ class ImageContextManager:
            ).execute()
            
            deleted_count = len(result.data) if result.data else 0
-            logger.debug(f"Cleared {deleted_count} images from context")
+            logger.debug(f"Cleared {deleted_count} image-related messages from context (images + instructions)")
            return deleted_count
            
        except Exception as e:
--- a/backend/core/tools/sb_vision_tool.py
+++ b/backend/core/tools/sb_vision_tool.py
@ -165,11 +165,9 @@ class SandboxVisionTool(SandboxToolsBase):
                            os.unlink(temp_svg_path)
                            
                    except ImportError:
-                        print(f"[SeeImage] SVG conversion not available - using original SVG file '{file_path}'")
-                        return image_bytes, mime_type
+                        raise Exception(f"SVG conversion libraries not available. Cannot display SVG file '{file_path}'. Please convert to PNG manually.")
                    except Exception as e:
-                        print(f"[SeeImage] SVG conversion failed - using original SVG file '{file_path}': {str(e)}")
-                        return image_bytes, mime_type
+                        raise Exception(f"SVG conversion failed for '{file_path}': {str(e)}. Please convert to PNG manually.")
            
            # Open image from bytes
            img = Image.open(BytesIO(image_bytes))
@ -220,8 +218,14 @@ class SandboxVisionTool(SandboxToolsBase):
            return compressed_bytes, output_mime
            
        except Exception as e:
-            print(f"[SeeImage] Failed to compress image: {str(e)}. Using original.")
-            return image_bytes, mime_type
+            # CRITICAL: Never return unsupported formats
+            # If compression fails, we need to ensure we still return a supported format
+            if mime_type in ['image/jpeg', 'image/png', 'image/gif', 'image/webp']:
+                print(f"[SeeImage] Failed to compress image: {str(e)}. Using original (format is supported).")
+                return image_bytes, mime_type
+            else:
+                # Unsupported format and compression failed - must fail
+                raise Exception(f"Failed to process image '{file_path}' with unsupported format '{mime_type}': {str(e)}")

    def is_url(self, file_path: str) -> bool:
        """check if the file path is url"""
@ -265,7 +269,7 @@ class SandboxVisionTool(SandboxToolsBase):
        "type": "function",
        "function": {
            "name": "load_image",
-            "description": "Loads an image file into conversation context from the /workspace directory or from a URL. Provide either a relative path to a local image or the URL to an image. The image will be compressed before sending to reduce token usage. IMPORTANT: If you previously loaded an image but cleared context, you can load it again by calling this tool with the same file path - no need to ask user to re-upload.",
+            "description": "Loads an image file into conversation context from the /workspace directory or from a URL. CRITICAL: After loading, you MUST analyze the image thoroughly, write a detailed summary, and then call clear_images_from_context to free context tokens. Images consume significant tokens and must be actively managed. You can reload any image later with the same file path if needed.",
            "parameters": {
                "type": "object",
                "properties": {
@ -353,6 +357,15 @@ class SandboxVisionTool(SandboxToolsBase):
                    print(f"[SeeImage] Warning: Could not save converted PNG to sandbox: {e}")
                    # Continue with original path if save fails

+            # CRITICAL: Validate MIME type before upload - Anthropic only accepts 4 formats
+            SUPPORTED_MIME_TYPES = ['image/jpeg', 'image/png', 'image/gif', 'image/webp']
+            if compressed_mime_type not in SUPPORTED_MIME_TYPES:
+                return self.fail_response(
+                    f"Invalid image format '{compressed_mime_type}' after compression. "
+                    f"Only {', '.join(SUPPORTED_MIME_TYPES)} are supported for viewing by the AI. "
+                    f"Original file: '{cleaned_path}'. Please convert the image to a supported format."
+                )
+
            # Upload to Supabase Storage instead of base64
            try:
                # Generate unique filename
@ -418,7 +431,7 @@ class SandboxVisionTool(SandboxToolsBase):
        "type": "function",
        "function": {
            "name": "clear_images_from_context",
-            "description": "Clears all images from conversation memory. Use when done analyzing images or to free up context tokens. IMPORTANT: Files remain accessible - use load_image with the same path to load any image again instead of asking user to re-upload.",
+            "description": "REQUIRED after viewing images: Removes all images and their instructions from context to free up tokens. You MUST call this after analyzing images. The image files remain accessible in the sandbox - you can reload them later with load_image if needed. This is critical for context management.",
            "parameters": {
                "type": "object",
                "properties": {},
@ -435,7 +448,13 @@ class SandboxVisionTool(SandboxToolsBase):
            deleted_count = await self.image_context_manager.clear_images_from_context(self.thread_id)
            
            if deleted_count > 0:
-                return self.success_response(f"Successfully cleared {deleted_count} image(s) from conversation context. Visual memory has been reset.")
+                # Typically 2 messages per image: the image itself + the context instruction
+                image_count = deleted_count // 2
+                return self.success_response(
+                    f"Successfully cleared approximately {image_count} image(s) and their instructions from conversation context "
+                    f"({deleted_count} total messages removed). Context tokens freed up. "
+                    f"You can reload any image again using load_image if needed."
+                )
            else:
                return self.success_response("No images found in conversation context to clear.")