diff --git a/backend/agent/run.py b/backend/agent/run.py index 5ada055a..3301e498 100644 --- a/backend/agent/run.py +++ b/backend/agent/run.py @@ -122,18 +122,29 @@ async def run_agent( try: browser_content = json.loads(latest_browser_state_msg.data[0]["content"]) screenshot_base64 = browser_content.get("screenshot_base64") - # Create a copy of the browser state without screenshot + screenshot_url = browser_content.get("screenshot_url") + + # Create a copy of the browser state without screenshot data browser_state_text = browser_content.copy() browser_state_text.pop('screenshot_base64', None) browser_state_text.pop('screenshot_url', None) - browser_state_text.pop('screenshot_url_base64', None) if browser_state_text: temp_message_content_list.append({ "type": "text", "text": f"The following is the current state of the browser:\n{json.dumps(browser_state_text, indent=2)}" }) - if screenshot_base64: + + # Prioritize screenshot_url if available + if screenshot_url: + temp_message_content_list.append({ + "type": "image_url", + "image_url": { + "url": screenshot_url, + } + }) + elif screenshot_base64: + # Fallback to base64 if URL not available temp_message_content_list.append({ "type": "image_url", "image_url": { @@ -141,7 +152,7 @@ async def run_agent( } }) else: - logger.warning("Browser state found but no screenshot base64 data.") + logger.warning("Browser state found but no screenshot data.") await client.table('messages').delete().eq('message_id', latest_browser_state_msg.data[0]["message_id"]).execute() except Exception as e: diff --git a/backend/sandbox/docker/browser_api.py b/backend/sandbox/docker/browser_api.py index 471fc6b0..f122cdfc 100644 --- a/backend/sandbox/docker/browser_api.py +++ b/backend/sandbox/docker/browser_api.py @@ -15,6 +15,8 @@ import traceback import pytesseract from PIL import Image import io +from utils.logger import logger +from services.supabase import DBConnection ####################################################### # Action model 
definitions @@ -259,15 +261,16 @@ class BrowserActionResult(BaseModel): url: Optional[str] = None title: Optional[str] = None elements: Optional[str] = None # Formatted string of clickable elements - screenshot_base64: Optional[str] = None + screenshot_base64: Optional[str] = None # For backward compatibility + screenshot_url: Optional[str] = None pixels_above: int = 0 pixels_below: int = 0 content: Optional[str] = None - ocr_text: Optional[str] = None # Added field for OCR text + ocr_text: Optional[str] = None # Additional metadata - element_count: int = 0 # Number of interactive elements found - interactive_elements: Optional[List[Dict[str, Any]]] = None # Simplified list of interactive elements + element_count: int = 0 + interactive_elements: Optional[List[Dict[str, Any]]] = None viewport_width: Optional[int] = None viewport_height: Optional[int] = None @@ -288,6 +291,7 @@ class BrowserAutomation: self.include_attributes = ["id", "href", "src", "alt", "aria-label", "placeholder", "name", "role", "title", "value"] self.screenshot_dir = os.path.join(os.getcwd(), "screenshots") os.makedirs(self.screenshot_dir, exist_ok=True) + self.db = DBConnection() # Initialize DB connection # Register routes self.router.on_startup.append(self.startup) @@ -609,15 +613,85 @@ class BrowserAutomation: ) async def take_screenshot(self) -> str: - """Take a screenshot and return as base64 encoded string""" + """Take a screenshot; return it as a base64-encoded string, or as a dict {"url": ..., "is_s3": True} when the Supabase Storage upload succeeds and is verified (returns "" on error)""" try: page = await self.get_current_page() screenshot_bytes = await page.screenshot(type='jpeg', quality=60, full_page=False) - return base64.b64encode(screenshot_bytes).decode('utf-8') + + client = await self.db.client + + if client: + timestamp = datetime.now().strftime('%Y%m%d_%H%M%S') + random_id = random.randint(1000, 9999) + filename = f"screenshot_{timestamp}_{random_id}.jpg" + + logger.info(f"Attempting to upload screenshot: {filename}") + result = await self.upload_to_storage(client, screenshot_bytes, 
filename) + + if isinstance(result, dict) and result.get("is_s3") and result.get("url"): + if await self.verify_file_exists(client, filename): + logger.info(f"Screenshot upload verified: {filename}") + else: + logger.error(f"Screenshot upload failed verification: {filename}") + return base64.b64encode(screenshot_bytes).decode('utf-8') + + return result + else: + logger.warning("No Supabase client available, falling back to base64") + return base64.b64encode(screenshot_bytes).decode('utf-8') except Exception as e: - print(f"Error taking screenshot: {e}") - # Return an empty string rather than failing + logger.error(f"Error taking screenshot: {str(e)}") + traceback.print_exc() return "" + + async def upload_to_storage(self, client, file_bytes: bytes, filename: str) -> str: + """Upload file to Supabase Storage; return a dict {"url": <public URL>, "is_s3": True}, or the base64-encoded file contents as a fallback if the upload fails""" + try: + bucket_name = 'screenshots' + + buckets = client.storage.list_buckets() + if not any(bucket.name == bucket_name for bucket in buckets): + logger.info(f"Creating bucket: {bucket_name}") + try: + client.storage.create_bucket(bucket_name) + logger.info("Bucket created successfully") + except Exception as e: + logger.error(f"Failed to create bucket: {str(e)}") + raise + + logger.info(f"Uploading file: {filename}") + try: + result = client.storage.from_(bucket_name).upload( + path=filename, + file=file_bytes, + file_options={"content-type": "image/jpeg"} + ) + logger.info("File upload successful") + except Exception as e: + logger.error(f"Failed to upload file: {str(e)}") + raise + + file_url = client.storage.from_(bucket_name).get_public_url(filename) + logger.info(f"Generated URL: {file_url}") + + return {"url": file_url, "is_s3": True} + except Exception as e: + logger.error(f"Error in upload_to_storage: {str(e)}") + traceback.print_exc() + return base64.b64encode(file_bytes).decode('utf-8') + + async def verify_file_exists(self, client, filename: str) -> bool: + """Verify that a file exists in the storage bucket""" + 
logger.info(f"=== Verifying file exists: {filename} ===") + try: + bucket_name = 'screenshots' + files = client.storage.from_(bucket_name).list() + exists = any(f['name'] == filename for f in files) + logger.info(f"File verification result: {'exists' if exists else 'not found'}") + return exists + except Exception as e: + logger.error(f"Error verifying file: {str(e)}") + return False async def save_screenshot_to_file(self) -> str: """Take a screenshot and save to file, returning the path""" @@ -660,20 +734,32 @@ class BrowserAutomation: """Helper method to get updated browser state after any action Returns a tuple of (dom_state, screenshot, elements, metadata) """ + logger.info(f"=== Starting get_updated_browser_state for action: {action_name} ===") try: # Wait a moment for any potential async processes to settle + logger.info("Waiting for async processes to settle") await asyncio.sleep(0.5) # Get updated state + logger.info("Getting current DOM state") dom_state = await self.get_current_dom_state() + logger.info(f"DOM state retrieved - URL: {dom_state.url}, Title: {dom_state.title}") + + logger.info("Taking screenshot") screenshot = await self.take_screenshot() + logger.info(f"Screenshot result type: {'dict' if isinstance(screenshot, dict) else 'base64 string'}") + if isinstance(screenshot, dict) and screenshot.get("url"): + logger.info(f"Screenshot URL: {screenshot['url']}") # Format elements for output + logger.info("Formatting clickable elements") elements = dom_state.element_tree.clickable_elements_to_string( include_attributes=self.include_attributes ) + logger.info(f"Found {len(dom_state.selector_map)} clickable elements") # Collect additional metadata + logger.info("Collecting metadata") page = await self.get_current_page() metadata = {} @@ -699,8 +785,9 @@ class BrowserAutomation: metadata['interactive_elements'] = interactive_elements - # Get viewport dimensions - Fix syntax error in JavaScript + # Get viewport dimensions try: + logger.info("Getting 
viewport dimensions") viewport = await page.evaluate(""" () => { return { @@ -711,33 +798,43 @@ class BrowserAutomation: """) metadata['viewport_width'] = viewport.get('width', 0) metadata['viewport_height'] = viewport.get('height', 0) + logger.info(f"Viewport dimensions: {metadata['viewport_width']}x{metadata['viewport_height']}") except Exception as e: - print(f"Error getting viewport dimensions: {e}") + logger.error(f"Error getting viewport dimensions: {e}") metadata['viewport_width'] = 0 metadata['viewport_height'] = 0 # Extract OCR text from screenshot if available ocr_text = "" if screenshot: + logger.info("Extracting OCR text from screenshot") ocr_text = await self.extract_ocr_text_from_screenshot(screenshot) metadata['ocr_text'] = ocr_text + logger.info(f"OCR text length: {len(ocr_text)} characters") - print(f"Got updated state after {action_name}: {len(dom_state.selector_map)} elements") + logger.info(f"=== Completed get_updated_browser_state for {action_name} ===") return dom_state, screenshot, elements, metadata except Exception as e: - print(f"Error getting updated state after {action_name}: {e}") + logger.error(f"Error in get_updated_browser_state for {action_name}: {e}") traceback.print_exc() # Return empty values in case of error return None, "", "", {} def build_action_result(self, success: bool, message: str, dom_state, screenshot: str, - elements: str, metadata: dict, error: str = "", content: str = None, - fallback_url: str = None) -> BrowserActionResult: + elements: str, metadata: dict, error: str = "", content: str = None, + fallback_url: str = None) -> BrowserActionResult: """Helper method to build a consistent BrowserActionResult""" - # Ensure elements is never None to avoid display issues if elements is None: elements = "" + screenshot_base64 = None + screenshot_url = None + + if isinstance(screenshot, dict) and screenshot.get("is_s3"): + screenshot_url = screenshot.get("url") + else: + screenshot_base64 = screenshot + return 
BrowserActionResult( success=success, message=message, @@ -745,7 +842,8 @@ class BrowserAutomation: url=dom_state.url if dom_state else fallback_url or "", title=dom_state.title if dom_state else "", elements=elements, - screenshot_base64=screenshot, + screenshot_base64=screenshot_base64, + screenshot_url=screenshot_url, pixels_above=dom_state.pixels_above if dom_state else 0, pixels_below=dom_state.pixels_below if dom_state else 0, content=content, diff --git a/backend/utils/logger.py b/backend/utils/logger.py index 51574ed6..32fae2fd 100644 --- a/backend/utils/logger.py +++ b/backend/utils/logger.py @@ -100,22 +100,23 @@ def setup_logger(name: str = 'agentpress') -> logging.Logger: except Exception as e: print(f"Error setting up file handler: {e}") - # Console handler - WARNING in production, INFO in other environments + # Console handler - WARNING in production, DEBUG in other environments try: console_handler = logging.StreamHandler(sys.stdout) if config.ENV_MODE == EnvMode.PRODUCTION: console_handler.setLevel(logging.WARNING) else: - console_handler.setLevel(logging.INFO) + console_handler.setLevel(logging.DEBUG) console_formatter = logging.Formatter( - '%(asctime)s - %(levelname)s - %(message)s' + '%(asctime)s - %(levelname)s - %(name)s - %(message)s' ) console_handler.setFormatter(console_formatter) # Add console handler to logger logger.addHandler(console_handler) - print(f"Added console handler with level: {console_handler.level}") + logger.info(f"Added console handler with level: {console_handler.level}") + logger.info(f"Log file will be created at: {log_dir}") except Exception as e: print(f"Error setting up console handler: {e}") diff --git a/frontend/src/components/payment/paywall-dialog.tsx b/frontend/src/components/payment/paywall-dialog.tsx index 8b401684..1ce5a0bd 100644 --- a/frontend/src/components/payment/paywall-dialog.tsx +++ b/frontend/src/components/payment/paywall-dialog.tsx @@ -58,7 +58,7 @@ export const PaywallDialog: React.FC = ({ 
strayBackdrops.forEach(element => element.remove()); }; }, []); - + useEffect(() => { if (!open) { document.body.classList.remove('overflow-hidden'); diff --git a/frontend/src/components/thread/tool-views/BrowserToolView.tsx b/frontend/src/components/thread/tool-views/BrowserToolView.tsx index e18cf711..2fcb45e9 100644 --- a/frontend/src/components/thread/tool-views/BrowserToolView.tsx +++ b/frontend/src/components/thread/tool-views/BrowserToolView.tsx @@ -84,6 +84,7 @@ export function BrowserToolView({ browserStateMessage.content, {}, ); + console.log('Browser state content: ', browserStateContent) screenshotBase64 = browserStateContent?.screenshot_base64 || null; } }