# Source: Hugging Face Space daswer123/sonic-tts-webui (Space status: Sleeping)
# File size: 16,778 bytes; revision cbaf23b

from typing import List
import gradio as gr
from pathlib import Path
from sonic_api_wrapper import CartesiaVoiceManager, VoiceAccessibility, improve_tts_text
import os
import json
import datetime

# Shared voice manager instance. It stays None until initialize_manager()
# builds one from an API key, and every handler checks it before use.
manager = None

# Language codes offered by the voice filter; "all" means "do not filter".
LANGUAGE_CHOICES = [
    "all",
    "ru", "en", "es", "pl", "de",
    "fr", "tr", "pt", "zh", "ja",
    "hi", "it", "ko", "nl", "sv",
]

# UI label of the "Type" filter -> accessibility value handed to the manager.
ACCESS_TYPE_MAP = {
    "All": VoiceAccessibility.ALL,
    "Custom Only": VoiceAccessibility.ONLY_CUSTOM,
    "Private Only": VoiceAccessibility.ONLY_PRIVATE,
    "API": VoiceAccessibility.ONLY_PUBLIC,
}

# Labels shown in the speed, emotion and emotion-intensity dropdowns.
SPEED_CHOICES = [
    "Very Slow",
    "Slow",
    "Normal",
    "Fast",
    "Very Fast",
]
EMOTION_CHOICES = [
    "Neutral",
    "Happy",
    "Sad",
    "Angry",
    "Surprised",
    "Curious",
]
EMOTION_INTENSITY = [
    "Very Weak",
    "Weak",
    "Medium",
    "Strong",
    "Very Strong",
]

def map_speed(speed_type: str) -> float:
    """Translate a speed label from the UI into its numeric speed value.

    The five labels are spread evenly from -1.0 ("Very Slow") to 1.0
    ("Very Fast"), with "Normal" at 0.0.

    Raises:
        KeyError: if ``speed_type`` is not one of the known labels.
    """
    labels = ("Very Slow", "Slow", "Normal", "Fast", "Very Fast")
    values = (-1.0, -0.5, 0.0, 0.5, 1.0)
    return dict(zip(labels, values))[speed_type]

def generate_output_filename(language: str) -> str:
    """Build a timestamped ``.wav`` path under ``output/`` for *language*.

    The result has the form ``output/YYYYMMDD_HHMMSS_<language>.wav`` and
    uses the current local time.
    """
    now = datetime.datetime.now()
    stamp = f"{now:%Y%m%d_%H%M%S}"
    return f"output/{stamp}_{language}.wav"

def extract_voice_id_from_label(voice_label: str) -> str:
    """Resolve a dropdown label to the voice ID it stands for.

    The label (for example "John (en) [Custom]") is looked up among the
    entries of ``manager.get_voice_choices()`` and the ``"value"`` of the
    first matching entry is returned.

    Returns ``None`` when the manager is not initialized, when no entry has
    that label, or when the lookup raises (the error is printed, not
    re-raised).
    """
    try:
        if not manager:
            return None

        # First entry whose label matches wins.
        for entry in manager.get_voice_choices():
            if entry["label"] == voice_label:
                return entry["value"]
        return None
    except Exception as e:
        print(f"❌ Error getting voices: {str(e)}")
        return None

def initialize_manager(api_key: str) -> str:
    """(Re)create the global voice manager from *api_key*.

    Surrounding whitespace is stripped from the key before use. When the key
    is missing or blank the global manager is reset to ``None`` so that a
    manager built from an earlier key does not stay active while the UI
    reports that a key is required.

    Args:
        api_key: Cartesia API key as typed by the user.

    Returns:
        A human-readable status string: success, "key required", or the
        error raised while constructing the manager.
    """
    global manager

    key = api_key.strip() if isinstance(api_key, str) else api_key
    if not key:
        # Drop any stale manager: the status below says none is available.
        manager = None
        return "❌ API key is required to initialize the manager"

    try:
        manager = CartesiaVoiceManager(api_key=key, base_dir=Path("voice2voice"))
    except Exception as e:
        manager = None
        return f"❌ Error: {str(e)}"
    return "✅ Manager initialized"

def get_initial_voices():
    """Return the voice labels and default selection for the voice dropdown.

    Returns:
        A ``(labels, default_label)`` tuple. ``labels`` holds the label of
        every choice reported by ``manager.get_voice_choices()`` and
        ``default_label`` is the first of them. ``([], None)`` is returned
        when the manager is not initialized or reports no voices.
    """
    if not manager:
        return [], None

    choices = manager.get_voice_choices()
    if not choices:
        return [], None

    labels = [c["label"] for c in choices]
    return labels, labels[0]

def update_voice_list(language: str, access_type: str, current_voice: str = None):
    """Rebuild the voice dropdown for the given language and type filters.

    The language "all" is passed to the manager as ``None`` (no language
    filter) and ``access_type`` is translated through ``ACCESS_TYPE_MAP``.
    The current selection is kept when it is still among the filtered
    voices; otherwise the first voice (or ``None`` if there are none) is
    selected.

    Returns:
        ``(dropdown_update, status_message)``. On any error the dropdown is
        emptied and the message carries the error text.
    """
    if not manager:
        return gr.update(choices=[], value=None), "❌ Manager is not initialized"

    try:
        language_filter = None if language == "all" else language
        voices = manager.get_voice_choices(
            language=language_filter,
            accessibility=ACCESS_TYPE_MAP[access_type],
        )
        labels = [voice["label"] for voice in voices]

        if current_voice in labels:
            selected = current_voice
        elif labels:
            selected = labels[0]
        else:
            selected = None

        return gr.update(choices=labels, value=selected), "✅ Voice list updated"
    except Exception as e:
        return gr.update(choices=[], value=None), f"❌ Error: {str(e)}"

def update_voice_info(voice_label: str) -> str:
    """Describe the voice behind *voice_label* as multi-line text.

    Returns an empty string when the manager is not initialized or no label
    is given, a "not found" message when the label cannot be resolved to an
    ID, and otherwise four lines: name, language, type (Custom/API) and ID.
    Any exception is reported as an error string instead of being raised.
    """
    if not (manager and voice_label):
        return ""

    try:
        voice_id = extract_voice_id_from_label(voice_label)
        if not voice_id:
            return "❌ Voice not found"

        info = manager.get_voice_info(voice_id)
        details = (
            ("Name", info["name"]),
            ("Language", info["language"]),
            ("Type", "Custom" if info.get("is_custom") else "API"),
            ("ID", info["id"]),
        )
        return "\n".join(f"{field}: {value}" for field, value in details)
    except Exception as e:
        return f"❌ Error: {str(e)}"

def create_custom_voice(name: str, language: str, audio_data: tuple) -> tuple:
    """Create a custom voice from an audio sample and refresh the voice list.

    Args:
        name: Display name for the new voice.
        language: Language code passed through to the manager.
        audio_data: The uploaded sample; either the source itself or a tuple
            whose first element is used as the source.

    Returns:
        ``(status, dropdown_update, voice_info_text)``. On success the
        dropdown receives the refreshed choices and, when the new voice is
        present among them, selects it. On failure the dropdown is left
        unchanged and the info text is empty.
    """
    if not manager:
        return "❌ Manager is not initialized", gr.update(), ""

    if not name or not audio_data:
        return "❌ Name and voice file are required", gr.update(), ""

    try:
        audio_path = audio_data[0] if isinstance(audio_data, tuple) else audio_data

        voice_id = manager.create_custom_voice(
            name=name,
            source=audio_path,
            language=language,
        )

        # Refresh the dropdown contents now that a voice has been added.
        choices = manager.get_voice_choices()
        choice_labels = [c["label"] for c in choices]

        # Use a default here: a bare next() would raise StopIteration, which
        # the handler below would report as an error with an empty message
        # even though the voice was created successfully.
        new_voice_label = next(
            (c["label"] for c in choices if c["value"] == voice_id), None
        )
        if new_voice_label is None:
            # New voice is not listed: refresh choices, keep the selection.
            dropdown = gr.update(choices=choice_labels)
        else:
            dropdown = gr.update(choices=choice_labels, value=new_voice_label)

        voice_info = manager.get_voice_info(voice_id)
        info_text = (
            f"Name: {voice_info['name']}\n"
            f"Language: {voice_info['language']}\n"
            f"Type: Custom\n"
            f"ID: {voice_info['id']}"
        )

        return f"✅ Custom voice created: {voice_id}", dropdown, info_text

    except Exception as e:
        return f"❌ Error creating voice: {str(e)}", gr.update(), ""

def on_auto_language_change(auto_language: bool):
    """Return an update that is visible only while auto-detection is off."""
    show_manual_choice = not auto_language
    return gr.update(visible=show_manual_choice)

def map_emotions(selected_emotions, intensity):
    """Convert UI emotion labels into ``{"name", "level"}`` dictionaries.

    "Neutral" and any label without a known mapping are skipped. Every
    emitted entry carries the same level, derived from *intensity*.

    Raises:
        KeyError: if *intensity* is unknown and at least one emotion maps.
    """
    names_by_label = {
        "Happy": "positivity",
        "Sad": "sadness",
        "Angry": "anger",
        "Surprised": "surprise",
        "Curious": "curiosity",
    }
    levels_by_label = {
        "Very Weak": "lowest",
        "Weak": "low",
        "Medium": "medium",
        "Strong": "high",
        "Very Strong": "highest",
    }

    # The level is looked up per entry so an unknown intensity only fails
    # when there is actually something to emit.
    return [
        {"name": names_by_label[label], "level": levels_by_label[intensity]}
        for label in selected_emotions
        if label != "Neutral" and label in names_by_label
    ]

def generate_speech(
    text: str,
    voice_label: str,
    improve_text: bool,
    auto_language: bool,
    manual_language: str,
    speed_type: str,
    use_custom_speed: bool,
    custom_speed: float,
    emotions: List[str],
    emotion_intensity: str,
):
    """Synthesize *text* with the selected voice and return the audio path.

    Args:
        text: Text to speak.
        voice_label: Dropdown label of the voice to use.
        improve_text: When true, the text is passed through
            ``improve_tts_text`` before synthesis.
        auto_language: When false, ``manual_language`` is set on the manager
            and used in the output file name.
        manual_language: Language code used when ``auto_language`` is off.
        speed_type: Speed label, converted with ``map_speed``.
        use_custom_speed: When true, ``custom_speed`` is used instead of
            ``speed_type``.
        custom_speed: Numeric speed assigned to the manager as-is.
        emotions: Selected emotion labels; empty or only "Neutral" resets
            the manager's emotions.
        emotion_intensity: Intensity label applied to every emotion.

    Returns:
        ``(output_path, status_message)``. ``output_path`` is ``None`` when
        validation fails or an error occurs.
    """
    if not manager:
        return None, "❌ Manager is not initialized"

    if not text or not voice_label:
        return None, "❌ Text and voice are required"

    try:
        # The dropdown only carries labels; resolve the label to a voice ID.
        voice_id = extract_voice_id_from_label(voice_label)
        if not voice_id:
            return None, "❌ Voice not found"

        manager.set_voice(voice_id)

        # If auto-detect is off, set the language manually.
        if not auto_language:
            manager.set_language(manual_language)

        manager.speed = custom_speed if use_custom_speed else map_speed(speed_type)

        if emotions and emotions != ["Neutral"]:
            manager.set_emotions(map_emotions(emotions, emotion_intensity))
        else:
            manager.set_emotions()  # Reset emotions

        output_file = generate_output_filename(
            manual_language if not auto_language else manager.current_language
        )

        # generate_output_filename() places files under "output/".
        os.makedirs("output", exist_ok=True)

        output_path = manager.speak(
            text=text if not improve_text else improve_tts_text(text, manager.current_language),
            output_file=output_file
        )

        return output_path, "✅ Audio generated successfully"

    except Exception as e:
        return None, f"❌ Error generating speech: {str(e)}"

def initialize_manager_and_update(api_key: str, language: str, access_type: str, current_voice: str = None):
    """Initialize the manager from *api_key*, then refresh the voice dropdown.

    Returns:
        ``(status, dropdown_update)``. When a manager is available the status
        combines the initialization and voice-list messages on separate
        lines; otherwise only the initialization status is returned and the
        dropdown is emptied.
    """
    status = initialize_manager(api_key)
    if not manager:
        return status, gr.update(choices=[], value=None)

    voice_update, voice_status = update_voice_list(language, access_type, current_voice)
    return f"{status}\n{voice_status}", voice_update

# Create the interface.
# Layout: an API-key box on top, then a two-column row. The left column holds
# the text input and all settings; the right column holds the status label,
# the generated audio and the "Generate" button. Event handlers are wired up
# at the end of the block.
with gr.Blocks() as demo:
    # API key (masked input; changing it re-initializes the manager, see Events)
    cartesia_api_key = gr.Textbox(
        label="Cartesia API Key",
        value="",  # No default API key
        type='password'
    )
    
    with gr.Row():
        # Left column
        with gr.Column():
            # Text to synthesize
            cartesia_text = gr.TextArea(label="Text")
            
            with gr.Accordion(label="Settings", open=True):
                # Filters applied to the voice dropdown (language and voice type)
                with gr.Accordion("Filters", open=True):
                    cartesia_setting_filter_lang = gr.Dropdown(
                        label="Language",
                        choices=LANGUAGE_CHOICES,
                        value="all"
                    )
                    cartesia_setting_filter_type = gr.Dropdown(
                        label="Type",
                        choices=list(ACCESS_TYPE_MAP.keys()),
                        value="All"
                    )
                
                # Settings tabs
                # "Standard": pick an existing voice and its language handling
                with gr.Tab("Standard"):
                    # Read-only text filled by update_voice_info / create_custom_voice
                    cartesia_setting_voice_info = gr.Textbox(
                        label="Voice Information",
                        interactive=False
                    )
                    with gr.Row():
                        # Evaluated once while the UI is built; returns ([], None)
                        # while no manager has been initialized.
                        initial_choices, initial_value = get_initial_voices()
                        cartesia_setting_voice = gr.Dropdown(
                            label="Voice",
                            choices=initial_choices,
                            value=initial_value
                        )
                    cartesia_setting_voice_update = gr.Button("Refresh")
                    cartesia_setting_auto_language = gr.Checkbox(
                         label="Automatically detect language from voice",
                         value=True
                     )
                    # Shown only when the auto-detect checkbox is unchecked
                    # (toggled by on_auto_language_change).
                    cartesia_setting_manual_language = gr.Dropdown(
                         label="Speech Language",
                         choices=["ru", "en", "es", "fr", "de", "pl", "it", "ja", "ko", "zh", "hi"],
                         value="en",
                         visible=False  # Initially hidden
                     )
                
                # "Custom": create a new voice from an uploaded audio sample
                with gr.Tab("Custom"):
                    cartesia_setting_custom_name = gr.Textbox(label="Name")
                    cartesia_setting_custom_lang = gr.Dropdown(
                        label="Language",
                        choices=LANGUAGE_CHOICES[1:]  # Exclude "all"
                    )
                    # type='filepath': the handler receives the sample as a path
                    cartesia_setting_custom_voice = gr.Audio(label="Voice File", type='filepath')
                    cartesia_setting_custom_add = gr.Button("Add")
            
            # Emotion control (multi-select labels plus one shared intensity)
            with gr.Accordion(label="Emotion Control (Beta)", open=False):
                cartesia_emotions = gr.Dropdown(
                    label="Emotions",
                    multiselect=True,
                    choices=EMOTION_CHOICES
                )
                cartesia_emotions_intensity = gr.Dropdown(
                    label="Intensity",
                    choices=EMOTION_INTENSITY,
                    value="Medium"
                )
            
            # Speed settings: a preset label, or a custom numeric value
            with gr.Accordion("Speed", open=True):
                cartesia_speed_speed = gr.Dropdown(
                    label="Speech Speed",
                    choices=SPEED_CHOICES,
                    value="Normal"
                )
                cartesia_speed_speed_allow_custom = gr.Checkbox(
                    label="Use custom speed value"
                )
                # Hidden until "Use custom speed value" is checked; same -1..1
                # range as the presets in map_speed.
                cartesia_speed_speed_custom = gr.Slider(
                    label="Speed",
                    value=0,
                    minimum=-1,
                    maximum=1,
                    step=0.1,
                    visible=False
                )
            
            # When checked, generate_speech runs the text through improve_tts_text
            cartesia_setting_improve_text = gr.Checkbox(
                label="Improve text according to recommendations",
                value=True
            )
        
        # Right column: status, result audio and the generate button
        with gr.Column():
            # Shared status output written by most handlers below
            cartessia_status_bar = gr.Label(value="Status")
            cartesia_output_audio = gr.Audio(
                label="Result",
                interactive=False
            )
            cartesia_output_button = gr.Button("Generate")

    # Events
    # Changing the API key re-initializes the manager and refreshes the voices.
    cartesia_api_key.change(
        initialize_manager_and_update,
        inputs=[cartesia_api_key, cartesia_setting_filter_lang, cartesia_setting_filter_type, cartesia_setting_voice],
        outputs=[cartessia_status_bar, cartesia_setting_voice]
    )
    
    # Changing either filter re-queries the voice list.
    cartesia_setting_filter_lang.change(
        update_voice_list,
        inputs=[
            cartesia_setting_filter_lang,
            cartesia_setting_filter_type,
            cartesia_setting_voice  # Pass the current selection
        ],
        outputs=[cartesia_setting_voice, cartessia_status_bar]
    )

    cartesia_setting_filter_type.change(
        update_voice_list,
        inputs=[
            cartesia_setting_filter_lang,
            cartesia_setting_filter_type,
            cartesia_setting_voice  # Pass the current selection
        ],
        outputs=[cartesia_setting_voice, cartessia_status_bar]
    )
    
    # Selecting a voice shows its details in the info box.
    cartesia_setting_voice.change(
        update_voice_info,
        inputs=[cartesia_setting_voice],
        outputs=[cartesia_setting_voice_info]
    )
    
    # "Refresh" re-queries the voice list with the current filters.
    cartesia_setting_voice_update.click(
        update_voice_list,
        inputs=[cartesia_setting_filter_lang, cartesia_setting_filter_type, cartesia_setting_voice],
        outputs=[cartesia_setting_voice, cartessia_status_bar]
    )
    
    # Show the custom speed slider only while the checkbox is ticked.
    cartesia_speed_speed_allow_custom.change(
        lambda x: gr.update(visible=x),
        inputs=[cartesia_speed_speed_allow_custom],
        outputs=[cartesia_speed_speed_custom]
    )
    
    # "Add" creates a custom voice from the name, language and audio sample.
    cartesia_setting_custom_add.click(
        create_custom_voice,
        inputs=[
            cartesia_setting_custom_name,
            cartesia_setting_custom_lang,
            cartesia_setting_custom_voice
        ],
        outputs=[
            cartessia_status_bar,
            cartesia_setting_voice,  # Update dropdown
            cartesia_setting_voice_info  # Update voice info
        ]
    )
    
    # Toggle the manual language dropdown opposite to the auto-detect checkbox.
    cartesia_setting_auto_language.change(
        on_auto_language_change,
        inputs=[cartesia_setting_auto_language],
        outputs=[cartesia_setting_manual_language]
    )

    # "Generate": inputs are listed in the parameter order of generate_speech.
    cartesia_output_button.click(
        generate_speech,
        inputs=[
            cartesia_text,
            cartesia_setting_voice,
            cartesia_setting_improve_text,
            cartesia_setting_auto_language,
            cartesia_setting_manual_language,
            cartesia_speed_speed,
            cartesia_speed_speed_allow_custom,
            cartesia_speed_speed_custom,
            cartesia_emotions,
            cartesia_emotions_intensity
        ],
        outputs=[
            cartesia_output_audio,
            cartessia_status_bar
        ]
    )

# Script entry point: enable queuing on the app, then launch it with
# share=True (Gradio's option for requesting a public share link).
if __name__ == "__main__":
    demo.queue().launch(share=True)