suna/backend/core/utils/run_management.py

"""Agent run management utilities - starting, stopping, and monitoring agent runs."""
import json
from typing import Optional, List
from fastapi import HTTPException
from core.services import redis
from ..utils.logger import logger
from run_agent_background import update_agent_run_status, _cleanup_redis_response_list


async def cleanup_instance_runs(instance_id: str):
    """Clean up all running agents for a specific instance.

    Looks up every Redis key matching ``active_run:{instance_id}:*`` and stops
    the agent run named in each key, passing an "instance shutting down" error
    message. Failures are logged and never propagated to the caller; a failure
    while stopping one run does not prevent the remaining runs from being
    processed.

    Args:
        instance_id: ID of the instance whose agent runs should be stopped.
            When empty, a warning is logged and nothing else happens.
    """
    logger.debug(f"Starting cleanup of agent runs for instance {instance_id}")

    if not instance_id:
        logger.warning("Instance ID not set, cannot clean up instance-specific agent runs.")
        return

    try:
        running_keys = await redis.keys(f"active_run:{instance_id}:*")
        logger.debug(f"Found {len(running_keys)} running agent runs for instance {instance_id} to clean up")
    except Exception as e:
        logger.error(f"Failed to clean up running agent runs for instance {instance_id}: {str(e)}")
        return

    for key in running_keys:
        # Each key is handled in isolation so that one run that cannot be
        # stopped (e.g. its DB update fails) does not leave the others running.
        try:
            # Key format: active_run:{instance_id}:{agent_run_id}
            parts = key.split(":")
            if len(parts) != 3:
                logger.warning(f"Unexpected key format found: {key}")
                continue

            agent_run_id = parts[2]
            await stop_agent_run_with_helpers(agent_run_id, error_message=f"Instance {instance_id} shutting down")
        except Exception as e:
            logger.error(f"Failed to clean up agent run for key {key} on instance {instance_id}: {str(e)}")


async def stop_agent_run_with_helpers(agent_run_id: str, error_message: Optional[str] = None):
    """
    Stop an agent run and clean up all associated resources.

    This function:
    1. Fetches final responses from Redis (currently only their count is logged)
    2. Updates database status ("failed" when error_message is given, otherwise "stopped")
    3. Publishes STOP signals to the global and instance-specific control channels
    4. Cleans up the Redis response list

    Steps 1, 3 and 4 are best-effort: their failures are logged and do not
    abort the remaining steps.

    Args:
        agent_run_id: The ID of the agent run to stop
        error_message: Optional error message if run failed

    Raises:
        HTTPException: With status 500 when the database status update
            reports failure. No STOP signal is published in that case.
    """
    logger.debug(f"Stopping agent run: {agent_run_id}")

    # Import here to avoid circular dependency
    from ..core_utils import db

    client = await db.client
    final_status = "failed" if error_message else "stopped"

    # Attempt to fetch final responses from Redis. The parsed responses are not
    # forwarded anywhere; the read only feeds the debug log below.
    response_list_key = f"agent_run:{agent_run_id}:responses"
    try:
        all_responses_json = await redis.lrange(response_list_key, 0, -1)
        all_responses = [json.loads(r) for r in all_responses_json]
        logger.debug(f"Fetched {len(all_responses)} responses from Redis for DB update on stop/fail: {agent_run_id}")
    except Exception as e:
        logger.error(f"Failed to fetch responses from Redis for {agent_run_id} during stop/fail: {e}")

    # Update the agent run status in the database
    update_success = await update_agent_run_status(
        client, agent_run_id, final_status, error=error_message
    )

    if not update_success:
        logger.error(f"Failed to update database status for stopped/failed run {agent_run_id}")
        raise HTTPException(status_code=500, detail="Failed to update agent run status in database")

    # Send STOP signal to the global control channel
    global_control_channel = f"agent_run:{agent_run_id}:control"
    try:
        await redis.publish(global_control_channel, "STOP")
        logger.debug(f"Published STOP signal to global channel {global_control_channel}")
    except Exception as e:
        logger.error(f"Failed to publish STOP signal to global channel {global_control_channel}: {str(e)}")

    # Find all instances handling this agent run and send STOP to instance-specific channels
    try:
        instance_keys = await redis.keys(f"active_run:*:{agent_run_id}")
        logger.debug(f"Found {len(instance_keys)} active instance keys for agent run {agent_run_id}")

        for key in instance_keys:
            # Key format: active_run:{instance_id}:{agent_run_id}
            parts = key.split(":")
            if len(parts) == 3:
                instance_id_from_key = parts[1]
                instance_control_channel = f"agent_run:{agent_run_id}:control:{instance_id_from_key}"
                try:
                    await redis.publish(instance_control_channel, "STOP")
                    logger.debug(f"Published STOP signal to instance channel {instance_control_channel}")
                except Exception as e:
                    logger.warning(f"Failed to publish STOP signal to instance channel {instance_control_channel}: {str(e)}")
            else:
                logger.warning(f"Unexpected key format found: {key}")
    except Exception as e:
        logger.error(f"Failed to find or signal active instances for {agent_run_id}: {str(e)}")

    # Clean up the response list immediately on stop/fail. This runs in its own
    # try block so that a failure while discovering or signalling instances
    # above cannot skip the cleanup, and so a cleanup failure is reported as such.
    try:
        await _cleanup_redis_response_list(agent_run_id)
    except Exception as e:
        logger.error(f"Failed to clean up Redis response list for {agent_run_id}: {str(e)}")

    logger.debug(f"Successfully initiated stop process for agent run: {agent_run_id}")


async def check_for_active_project_agent_run(client, project_id: str) -> Optional[str]:
    """
    Look up a running agent run that belongs to the given project.

    The project's threads are read from the 'threads' table, then 'agent_runs'
    is queried for rows on those threads whose status is 'running'.

    Args:
        client: Database client exposing the table/select/eq/execute query chain
        project_id: The project whose threads are inspected

    Returns:
        The ID of the first running agent run returned by the query, or None
        when the project has no threads or no running agent runs
    """
    thread_query = client.table('threads').select('thread_id').eq('project_id', project_id)
    thread_rows = (await thread_query.execute()).data
    thread_ids = [row['thread_id'] for row in thread_rows]

    # Without threads there is nothing to look up in agent_runs.
    if not thread_ids:
        return None

    # Deferred import: only needed once there are thread IDs to query.
    from .query_utils import batch_query_in

    running_runs = await batch_query_in(
        client=client,
        table_name='agent_runs',
        select_fields='id',
        in_field='thread_id',
        in_values=thread_ids,
        additional_filters={'status': 'running'}
    )

    return running_runs[0]['id'] if running_runs else None
refactor: massive backend simplification - eliminate 450+ lines of duplicate code - Created unified AgentLoader for all agent data operations - Eliminated 8 manual AgentResponse constructions across codebase - Reduced agent_runs.py complexity by 163 lines (85→35 lines in 2 functions) - Reduced core_utils.py from 553→69 lines (-87%) - Extracted focused utility modules: * utils/icon_generator.py (179 lines) * utils/limits_checker.py (245 lines) * utils/run_management.py (145 lines) * utils/project_helpers.py (113 lines) * utils/mcp_helpers.py (41 lines) - Fixed duplicate router inclusions in api.py - Renamed template_service.py → marketplace_service.py for clarity - Reduced template transformer from 73→17 lines (-77%) - Inlined module-specific helpers (moved agent_run access check) - Fixed version data false alarm warnings - Updated AgentResponse model to support optional fields for list operations Total impact: ~450 lines of duplicate code eliminated, dramatically improved maintainability 2025-10-05 04:32:51 +08:00			`"""Agent run management utilities - starting, stopping, and monitoring agent runs."""`
			`import json`
			`from typing import Optional, List`
			`from fastapi import HTTPException`
			`from core.services import redis`
			`from ..utils.logger import logger`
			`from run_agent_background import update_agent_run_status, _cleanup_redis_response_list`


			`async def cleanup_instance_runs(instance_id: str):`
			`"""Clean up all running agents for a specific instance."""`
			`logger.debug(f"Starting cleanup of agent runs for instance {instance_id}")`

			`try:`
			`if not instance_id:`
			`logger.warning("Instance ID not set, cannot clean up instance-specific agent runs.")`
			`return`

			`running_keys = await redis.keys(f"active_run:{instance_id}:*")`
			`logger.debug(f"Found {len(running_keys)} running agent runs for instance {instance_id} to clean up")`

			`for key in running_keys:`
			`# Key format: active_run:{instance_id}:{agent_run_id}`
			`parts = key.split(":")`
			`if len(parts) == 3:`
			`agent_run_id = parts[2]`
			`await stop_agent_run_with_helpers(agent_run_id, error_message=f"Instance {instance_id} shutting down")`
			`else:`
			`logger.warning(f"Unexpected key format found: {key}")`

			`except Exception as e:`
			`logger.error(f"Failed to clean up running agent runs for instance {instance_id}: {str(e)}")`


			`async def stop_agent_run_with_helpers(agent_run_id: str, error_message: Optional[str] = None):`
			`"""`
			`Stop an agent run and clean up all associated resources.`

			`This function:`
			`1. Fetches final responses from Redis`
			`2. Updates database status`
			`3. Publishes STOP signals to all control channels`
			`4. Cleans up Redis keys`

			`Args:`
			`agent_run_id: The ID of the agent run to stop`
			`error_message: Optional error message if run failed`
			`"""`
			`logger.debug(f"Stopping agent run: {agent_run_id}")`

			`# Import here to avoid circular dependency`
			`from ..core_utils import db`

			`client = await db.client`
			`final_status = "failed" if error_message else "stopped"`

			`# Attempt to fetch final responses from Redis`
			`response_list_key = f"agent_run:{agent_run_id}:responses"`
			`all_responses = []`
			`try:`
			`all_responses_json = await redis.lrange(response_list_key, 0, -1)`
			`all_responses = [json.loads(r) for r in all_responses_json]`
			`logger.debug(f"Fetched {len(all_responses)} responses from Redis for DB update on stop/fail: {agent_run_id}")`
			`except Exception as e:`
			`logger.error(f"Failed to fetch responses from Redis for {agent_run_id} during stop/fail: {e}")`

			`# Update the agent run status in the database`
			`update_success = await update_agent_run_status(`
			`client, agent_run_id, final_status, error=error_message`
			`)`

			`if not update_success:`
			`logger.error(f"Failed to update database status for stopped/failed run {agent_run_id}")`
			`raise HTTPException(status_code=500, detail="Failed to update agent run status in database")`

			`# Send STOP signal to the global control channel`
			`global_control_channel = f"agent_run:{agent_run_id}:control"`
			`try:`
			`await redis.publish(global_control_channel, "STOP")`
			`logger.debug(f"Published STOP signal to global channel {global_control_channel}")`
			`except Exception as e:`
			`logger.error(f"Failed to publish STOP signal to global channel {global_control_channel}: {str(e)}")`

			`# Find all instances handling this agent run and send STOP to instance-specific channels`
			`try:`
			`instance_keys = await redis.keys(f"active_run:*:{agent_run_id}")`
			`logger.debug(f"Found {len(instance_keys)} active instance keys for agent run {agent_run_id}")`

			`for key in instance_keys:`
			`# Key format: active_run:{instance_id}:{agent_run_id}`
			`parts = key.split(":")`
			`if len(parts) == 3:`
			`instance_id_from_key = parts[1]`
			`instance_control_channel = f"agent_run:{agent_run_id}:control:{instance_id_from_key}"`
			`try:`
			`await redis.publish(instance_control_channel, "STOP")`
			`logger.debug(f"Published STOP signal to instance channel {instance_control_channel}")`
			`except Exception as e:`
			`logger.warning(f"Failed to publish STOP signal to instance channel {instance_control_channel}: {str(e)}")`
			`else:`
			`logger.warning(f"Unexpected key format found: {key}")`

			`# Clean up the response list immediately on stop/fail`
			`await _cleanup_redis_response_list(agent_run_id)`

			`except Exception as e:`
			`logger.error(f"Failed to find or signal active instances for {agent_run_id}: {str(e)}")`

			`logger.debug(f"Successfully initiated stop process for agent run: {agent_run_id}")`


			`async def check_for_active_project_agent_run(client, project_id: str) -> Optional[str]:`
			`"""`
			`Check if there are any active agent runs for a project.`

			`Args:`
			`client: Database client`
			`project_id: The project ID to check`

			`Returns:`
			`The ID of an active agent run, or None if no active runs`
			`"""`
			`project_threads = await client.table('threads').select('thread_id').eq('project_id', project_id).execute()`
			`project_thread_ids = [t['thread_id'] for t in project_threads.data]`

			`if project_thread_ids:`
			`from .query_utils import batch_query_in`

			`active_runs = await batch_query_in(`
			`client=client,`
			`table_name='agent_runs',`
			`select_fields='id',`
			`in_field='thread_id',`
			`in_values=project_thread_ids,`
			`additional_filters={'status': 'running'}`
			`)`

			`if active_runs:`
			`return active_runs[0]['id']`
			`return None`