suna/backend/services/transcription.py

import os
import openai
import tempfile
from fastapi import APIRouter, UploadFile, File, HTTPException, Depends
from pydantic import BaseModel
from typing import Optional
from utils.logger import logger
from utils.auth_utils import get_current_user_id_from_jwt

router = APIRouter(tags=["transcription"])


class TranscriptionResponse(BaseModel):
    text: str


@router.post("/transcription", response_model=TranscriptionResponse)
async def transcribe_audio(
    audio_file: UploadFile = File(...),
    user_id: str = Depends(get_current_user_id_from_jwt)
):
"""Transcribe audio file to text using OpenAI Whisper."""
    try:
        # Validate file type - OpenAI supports these formats
        allowed_types = [
            'audio/mp3', 'audio/mpeg', 'audio/mp4', 'audio/m4a',
            'audio/wav', 'audio/webm', 'audio/mpga'
        ]
        logger.info(f"Received audio file: {audio_file.filename}, content_type: {audio_file.content_type}")
        if audio_file.content_type not in allowed_types:
            raise HTTPException(
                status_code=400,
                detail=f"Unsupported file type: {audio_file.content_type}. Supported types: {', '.join(allowed_types)}"
            )

        # Check file size (25MB limit)
        content = await audio_file.read()
        if len(content) > 25 * 1024 * 1024:  # 25MB
            raise HTTPException(status_code=400, detail="File size exceeds 25MB limit")

        # Reset file pointer
        await audio_file.seek(0)

        # Initialize OpenAI client
        client = openai.OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

        # Create a temporary file with the correct extension
        file_extension = audio_file.filename.split('.')[-1] if audio_file.filename and '.' in audio_file.filename else 'webm'
        with tempfile.NamedTemporaryFile(delete=False, suffix=f'.{file_extension}') as temp_file:
            temp_file.write(content)
            temp_file_path = temp_file.name
        try:
            # Transcribe audio from the temporary file; the OpenAI transcription API
            # enforces its own limits (25MB file size, duration) internally
            with open(temp_file_path, 'rb') as f:
                transcription = client.audio.transcriptions.create(
                    model="gpt-4o-mini-transcribe",
                    file=f,
                    response_format="text"
                )

            logger.info(f"Successfully transcribed audio for user {user_id}")
            return TranscriptionResponse(text=transcription)
        finally:
            # Clean up the temporary file
            try:
                os.unlink(temp_file_path)
            except Exception as e:
                logger.warning(f"Failed to delete temporary file {temp_file_path}: {e}")
    except HTTPException:
        # Re-raise validation errors (e.g. the 400s above) rather than wrapping them in a 500
        raise
    except Exception as e:
        logger.error(f"Error transcribing audio for user {user_id}: {str(e)}")
        raise HTTPException(status_code=500, detail=f"Transcription failed: {str(e)}")
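

# --- Illustrative usage sketch (an assumption, not part of the service) ---
# A minimal example of exercising this router locally with FastAPI's TestClient,
# assuming OPENAI_API_KEY is set and a local file named sample.webm exists
# (both are hypothetical here). The JWT dependency is overridden so no real
# token is needed; running this will make a real OpenAI API call.
if __name__ == "__main__":
    from fastapi import FastAPI
    from fastapi.testclient import TestClient

    app = FastAPI()
    app.include_router(app=None) if False else app.include_router(router)
    # Bypass JWT auth for this local sketch with a stand-in user id
    app.dependency_overrides[get_current_user_id_from_jwt] = lambda: "local-dev-user"

    test_client = TestClient(app)
    with open("sample.webm", "rb") as f:
        resp = test_client.post(
            "/transcription",
            files={"audio_file": ("sample.webm", f, "audio/webm")},
        )
    print(resp.status_code, resp.json())  # expects {"text": "..."} on success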