suna/agentpress/llm.py

from typing import Union, Dict, Any, Optional, List
import litellm
import os
import json
import openai
from openai import OpenAIError
import asyncio
import logging

OPENAI_API_KEY = os.environ.get('OPENAI_API_KEY', None)
ANTHROPIC_API_KEY = os.environ.get('ANTHROPIC_API_KEY', None)
GROQ_API_KEY = os.environ.get('GROQ_API_KEY', None)
AGENTOPS_API_KEY = os.environ.get('AGENTOPS_API_KEY', None)
FIREWORKS_API_KEY = os.environ.get('FIREWORKS_AI_API_KEY', None)
DEEPSEEK_API_KEY = os.environ.get('DEEPSEEK_API_KEY', None)
OPENROUTER_API_KEY = os.environ.get('OPENROUTER_API_KEY', None)
GEMINI_API_KEY = os.environ.get('GEMINI_API_KEY', None)

AWS_ACCESS_KEY_ID = os.environ.get('AWS_ACCESS_KEY_ID', None)
AWS_SECRET_ACCESS_KEY = os.environ.get('AWS_SECRET_ACCESS_KEY', None)
AWS_REGION_NAME = os.environ.get('AWS_REGION_NAME', None)

if OPENAI_API_KEY:
    os.environ['OPENAI_API_KEY'] = OPENAI_API_KEY
if ANTHROPIC_API_KEY:
    os.environ['ANTHROPIC_API_KEY'] = ANTHROPIC_API_KEY
if GROQ_API_KEY:
    os.environ['GROQ_API_KEY'] = GROQ_API_KEY
if FIREWORKS_API_KEY:
    os.environ['FIREWORKS_AI_API_KEY'] = FIREWORKS_API_KEY
if DEEPSEEK_API_KEY:
    os.environ['DEEPSEEK_API_KEY'] = DEEPSEEK_API_KEY
if OPENROUTER_API_KEY:
    os.environ['OPENROUTER_API_KEY'] = OPENROUTER_API_KEY
if GEMINI_API_KEY:
    os.environ['GEMINI_API_KEY'] = GEMINI_API_KEY

# Add AWS environment variables if they exist
if AWS_ACCESS_KEY_ID:
    os.environ['AWS_ACCESS_KEY_ID'] = AWS_ACCESS_KEY_ID
if AWS_SECRET_ACCESS_KEY:
    os.environ['AWS_SECRET_ACCESS_KEY'] = AWS_SECRET_ACCESS_KEY
if AWS_REGION_NAME:
    os.environ['AWS_REGION_NAME'] = AWS_REGION_NAME

async def make_llm_api_call(
    messages: list,
    model_name: str,
    response_format: Any = None,
    temperature: float = 0,
    max_tokens: int = None,
    tools: list = None,
    tool_choice: str = "auto",
    api_key: str = None,
    api_base: str = None,
    agentops_session: Any = None,
    stream: bool = False,
    top_p: float = None,
    stop: Optional[Union[str, List[str]]] = None  # Add stop parameter
) -> Union[Dict[str, Any], Any]:
    """
    Make an API call to a language model using litellm.

    This function provides a unified interface for making calls to various LLM providers
    (OpenAI, Anthropic, Groq, etc.) with support for streaming, tool calls, and retry logic.

    Args:
        messages (list): List of message dictionaries for the conversation
        model_name (str): Name of the model to use (e.g., "gpt-4", "claude-3")
        response_format (Any, optional): Desired format for the response
        temperature (float, optional): Sampling temperature. Defaults to 0
        max_tokens (int, optional): Maximum tokens in the response
        tools (list, optional): List of tool definitions for function calling
        tool_choice (str, optional): How to select tools ("auto" or "none")
        api_key (str, optional): Override default API key
        api_base (str, optional): Override default API base URL
        agentops_session (Any, optional): Session for agentops integration
        stream (bool, optional): Whether to stream the response. Defaults to False
        top_p (float, optional): Top-p sampling parameter
        stop (Union[str, List[str]], optional): Up to 4 sequences where the API will stop generating tokens

    Returns:
        Union[Dict[str, Any], Any]: API response, either complete or streaming

    Raises:
        Exception: If API call fails after retries
    """
    litellm.set_verbose = False

    async def attempt_api_call(api_call_func, max_attempts=3):
        """
        Attempt an API call with retries.

        Args:
            api_call_func: Async function that makes the API call
            max_attempts (int): Maximum number of retry attempts

        Returns:
            API response if successful

        Raises:
            Exception: If all retry attempts fail
        """
        nonlocal model_name  # Add this to access model_name
        for attempt in range(max_attempts):
            try:
                return await api_call_func()
            except litellm.exceptions.RateLimitError as e:
                # Check if it's Bedrock Claude and switch to direct Anthropic
                if "bedrock/anthropic.claude-3-5-sonnet-20241022-v2:0" in model_name:
                    logging.info("Rate limit hit with Bedrock Claude, falling back to direct Anthropic API...")
                    model_name = "anthropic/claude-3-5-sonnet-latest"
                    continue

                logging.warning(f"Rate limit exceeded. Waiting for 30 seconds before retrying...")
                await asyncio.sleep(30)
            except OpenAIError as e:
                logging.info(f"API call failed, retrying attempt {attempt + 1}. Error: {e}")
                await asyncio.sleep(5)
            except json.JSONDecodeError:
                logging.error(f"JSON decoding failed, retrying attempt {attempt + 1}")
                await asyncio.sleep(5)
        raise Exception("Failed to make API call after multiple attempts.")

    async def api_call():
        """
        Prepare and execute the API call with the specified parameters.

        Returns:
            API response from the language model
        """
        api_call_params = {
            "model": model_name,
            "messages": messages,
            "temperature": temperature,
            "response_format": response_format,
            "top_p": top_p,
            "stream": stream,
        }

        # Add stop parameter if provided
        if stop is not None:
            api_call_params["stop"] = stop

        # Add optional parameters if provided
        if api_key:
            api_call_params["api_key"] = api_key
        if api_base:
            api_call_params["api_base"] = api_base

        # Handle token limits differently for different models
        if 'o1' in model_name:
            if max_tokens is not None:
                api_call_params["max_completion_tokens"] = max_tokens
        else:
            if max_tokens is not None:
                api_call_params["max_tokens"] = max_tokens

        if tools:
            api_call_params["tools"] = tools
            api_call_params["tool_choice"] = tool_choice

        # Add special headers for Claude models
        if "claude" in model_name.lower() or "anthropic" in model_name.lower():
            api_call_params["extra_headers"] = {
                "anthropic-beta": "max-tokens-3-5-sonnet-2024-07-15"
            }

        # Add OpenRouter specific parameters
        if "openrouter" in model_name.lower():
            if settings.or_site_url:
                api_call_params["headers"] = {
                    "HTTP-Referer": settings.or_site_url
                }
            if settings.or_app_name:
                api_call_params["headers"] = {
                    "X-Title": settings.or_app_name
                }

        # Add special handling for Deepseek
        if "deepseek" in model_name.lower():
            api_call_params["frequency_penalty"] = 0.5
            api_call_params["temperature"] = 0.7
            api_call_params["presence_penalty"] = 0.1

        # Add Bedrock-specific parameters
        if "bedrock" in model_name.lower():
            if settings.aws_access_key_id:
                api_call_params["aws_access_key_id"] = settings.aws_access_key_id
            if settings.aws_secret_access_key:
                api_call_params["aws_secret_access_key"] = settings.aws_secret_access_key
            if settings.aws_region_name:
                api_call_params["aws_region_name"] = settings.aws_region_name

        # Log the API request
        # logging.info(f"Sending API request: {json.dumps(api_call_params, indent=2)}")

        # Make the API call using either agentops session or direct litellm
        if agentops_session:
            response = await agentops_session.patch(litellm.acompletion)(**api_call_params)
        else:
            response = await litellm.acompletion(**api_call_params)

        # logging.info(f"Received API response: {response}")

        # # For streaming responses, attach cost tracking
        # if stream:
        #     # Create a wrapper object to track costs across chunks
        #     cost_tracker = {
        #         "prompt_tokens": 0,
        #         "completion_tokens": 0,
        #         "total_tokens": 0,
        #         "cost": 0.0
        #     }

        #     # Get the cost per token for the model
        #     model_cost = litellm.model_cost.get(model_name, {})
        #     input_cost = model_cost.get('input_cost_per_token', 0)
        #     output_cost = model_cost.get('output_cost_per_token', 0)

        #     # Attach the cost tracker to the response
        #     response.cost_tracker = cost_tracker
        #     response.model_info = {
        #         "input_cost_per_token": input_cost,
        #         "output_cost_per_token": output_cost
        #     }
        # else:
        #     # For non-streaming, cost is already included in the response
        #     response._hidden_params = {
        #         "response_cost": litellm.completion_cost(completion_response=response)
        #     }

        return response

    return await attempt_api_call(api_call)

if __name__ == "__main__":
    import asyncio
    async def test_llm_api_call(stream=True):
        """
        Test function for the LLM API call functionality.

        Args:
            stream (bool): Whether to test streaming mode
        """
        messages = [
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": "Complex essay on economics"}
        ]
        model_name = "gpt-4o"

        response = await make_llm_api_call(messages, model_name, stream=stream)

        if stream:
            print("\n🤖 Streaming response:\n")
            buffer = ""
            async for chunk in response:
                if isinstance(chunk, dict) and 'choices' in chunk:
                    content = chunk['choices'][0]['delta'].get('content', '')
                else:
                    content = chunk.choices[0].delta.content

                if content:
                    buffer += content
                    if content[-1].isspace():
                        print(buffer, end='', flush=True)
                        buffer = ""

            if buffer:
                print(buffer, flush=True)
            print("\n✨ Stream completed.\n")
        else:
            print("\n🤖 Response:\n")
            if isinstance(response, dict) and 'choices' in response:
                print(response['choices'][0]['message']['content'])
            else:
                print(response.choices[0].message.content)
            print()

    # asyncio.run(test_llm_api_call())

    async def test_bedrock():
        """
        Test function for Bedrock API call.
        """
        messages = [
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": "Hello from Bedrock!"}
        ]
        model_name = "bedrock/anthropic.claude-3-5-sonnet-20241022-v2:0"

        response = await make_llm_api_call(messages, model_name, stream=True)

        print("\n🤖 Streaming response from Bedrock:\n")
        buffer = ""
        async for chunk in response:
            if isinstance(chunk, dict) and 'choices' in chunk:
                content = chunk['choices'][0]['delta'].get('content', '')
            else:
                content = chunk.choices[0].delta.content

            if content:
                buffer += content
                if content[-1].isspace():
                    print(buffer, end='', flush=True)
                    buffer = ""

        if buffer:
            print(buffer, flush=True)
        print("\n✨ Stream completed.\n")

    # Add test_bedrock to the test runs
    # asyncio.run(test_bedrock())