import gradio as gr
import torch
import os
from kokoro import generate
from models import build_model

# Pick the compute device once and hand it to build_model together with the
# checkpoint path.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
MODEL = build_model('kokoro-v0_19.pth', device)

# Load each voice file voices/<name>.pt with torch.load, keyed by voice name.
voices = {
    voice_id: torch.load(f"voices/{voice_id}.pt", weights_only=True)
    for voice_id in (
        'af',
        'af_bella',
        'af_sarah',
        'am_adam',
        'am_michael',
        'bf_emma',
        'bf_isabella',
        'bm_george',
        'bm_lewis',
        'af_nicole',
        'af_sky',
    )
}


def parse_voice_formula(formula):
    """Parse a voice formula string and return the combined voice tensor.

    The formula is a ``+``-separated list of ``<weight> * <voice_name>``
    terms, for example ``"0.600 * af_bella + 0.400 * am_adam"``.

    Args:
        formula: Formula string. Every voice name must be a key of the
            module-level ``voices`` mapping.

    Returns:
        The sum over all terms of ``weight * voices[voice_name]``.

    Raises:
        ValueError: If the formula is blank, a term is not of the form
            ``<weight> * <voice_name>``, a weight is not a number, or a
            voice name is not in ``voices``.
    """
    if not formula.strip():
        raise ValueError("Empty voice formula")

    weighted_sum = None

    for term in formula.split('+'):
        # Validate the shape of the term explicitly so a stray "+" or a
        # missing "*" is reported clearly instead of as an unpacking error.
        parts = term.split('*')
        if len(parts) != 2:
            raise ValueError(
                f"Invalid term {term.strip()!r}: expected '<weight> * <voice_name>'"
            )

        raw_weight, voice_name = (part.strip() for part in parts)
        try:
            weight = float(raw_weight)
        except ValueError:
            raise ValueError(
                f"Invalid weight {raw_weight!r} in term {term.strip()!r}"
            ) from None

        if voice_name not in voices:
            raise ValueError(f"Unknown voice: {voice_name}")

        contribution = weight * voices[voice_name]
        if weighted_sum is None:
            weighted_sum = contribution
        else:
            # In-place add is safe: weighted_sum is the product created
            # above, never one of the tensors stored in ``voices``.
            weighted_sum += contribution

    return weighted_sum

def get_new_voice(formula):
    """Build the blended voice described by ``formula`` and move it to ``device``.

    Args:
        formula: Voice formula string accepted by ``parse_voice_formula``.

    Returns:
        The combined voice tensor, moved to the module-level ``device``.

    Raises:
        gr.Error: If the formula cannot be parsed or the voice cannot be
            saved or moved to the device. The original exception is chained.
    """
    try:
        weighted_voices = parse_voice_formula(formula)

        # The blended voice is still written to disk as before, but the
        # in-memory tensor is what gets returned. Re-reading the file with
        # weights_only=False would unpickle whatever is on disk, and another
        # request could have overwritten the shared filename in between.
        torch.save(weighted_voices, "weighted_normalised_voices.pt")
        return weighted_voices.to(device)
    except Exception as e:
        raise gr.Error(f"Failed to create voice: {str(e)}") from e

def text_to_speech(text, formula):
    """Synthesize ``text`` using the blended voice described by ``formula``.

    Args:
        text: Text to convert to speech.
        formula: Voice formula string, as shown in the formula textbox.

    Returns:
        The tuple ``(24000, audio)``, where ``audio`` is the first value
        returned by ``generate``. The tuple feeds the ``gr.Audio`` output,
        which takes ``(sample_rate, data)`` for ``type="numpy"``.

    Raises:
        gr.Error: If ``text`` or ``formula`` is blank, if the voice cannot
            be created, or if generation fails.
    """
    # Validate before the try block so these user-facing messages are shown
    # as written instead of being re-wrapped by the generic handler below.
    if not text or not text.strip():
        raise gr.Error("Please enter some text")

    if not formula or not formula.strip():
        raise gr.Error("Please select at least one voice")

    try:
        voicepack = get_new_voice(formula)
        audio, _ = generate(MODEL, text, voicepack, lang='a')
    except gr.Error:
        # Already a user-facing error (raised by get_new_voice); pass it
        # through unchanged rather than wrapping it a second time.
        raise
    except Exception as e:
        raise gr.Error(f"Failed to generate speech: {str(e)}") from e

    return (24000, audio)
        

# Stylesheet passed to gr.Blocks(css=...) below. .container-wrap and
# .vert-group are the classes set via elem_classes on the voice row and its
# per-voice columns; the range-input rule rotates each slider by 90 degrees
# inside a fixed 50px x 200px container.
# NOTE(review): the other selectors (.slider_input_container, .min_value,
# .max_value, .tab-like-container, .gradio-row, .gradio-column, .heading,
# .description) are not assigned anywhere in this file; presumably they target
# Gradio's own markup -- confirm against the Gradio version in use.
custom_css = """
.container-wrap {
    display: flex !important;
    gap: 5px !important;
    justify-content: center !important;
    margin: 0 auto !important;
    max-width: 1400px !important;  /* Increased max-width */
}

.vert-group {
    min-width: 100px !important;   /* Increased from 80px */
    width: 120px !important;       /* Increased from 90px */
    flex: 0 0 auto !important;
}

.vert-group label {
    white-space: nowrap !important;
    overflow: visible !important;
    width: auto !important;
    font-size: 0.85em !important;  /* Slightly increased font size */
    transform-origin: left center !important;
    transform: rotate(0deg) translateX(-50%) !important;
    position: relative !important;
    left: 50% !important;
    display: inline-block !important;
    text-align: center !important;
    margin-bottom: 5px !important;
    padding: 0 5px !important;     /* Added padding */
}

.vert-group .wrap label {
    text-align: center !important;
    width: 100% !important;
    display: block !important;
}

.slider_input_container {
    height: 200px !important;
    position: relative !important;
    width: 50px !important;        /* Increased from 40px */
    margin: 0 auto !important;
    overflow: hidden !important;
}

.slider_input_container input[type="range"] {
    position: absolute !important;
    width: 200px !important;
    left: -75px !important;        /* Adjusted from -80px */
    top: 100px !important;
    transform: rotate(90deg) !important;
}

.min_value {
    position: absolute !important;
    bottom: 0 !important;
    left: 10px !important;
}

.max_value {
    position: absolute !important;
    top: 0 !important;
    left: 10px !important;
}

.tab-like-container {
    transform: scale(0.8) !important;
}

.gradio-row, .gradio-column {
    background: none !important;
    border: none !important;
    min-width: unset !important;
}

.heading {
    text-align: center !important;
    margin-bottom: 1rem !important;
}

.description {
    text-align: center !important;
    margin-bottom: 2rem !important;
    color: rgba(255, 255, 255, 0.7) !important;
}
"""

# Build the UI: a header, one checkbox + slider column per voice, a read-only
# formula box next to the text input and the generate button, and an audio
# output. Components are created in the order they should appear.
with gr.Blocks(css=custom_css, theme="ocean") as demo:
    gr.Markdown(
        """
        # 🎙️ Voice Mixer - Kokoro TTS
        ### Mix and match different voices to create your perfect text-to-speech voice
        
        This app lets you combine multiple voices with different weights to create custom voice combinations. 
        Select voices using checkboxes and adjust their weights using the sliders below.
        """
    )
    
    with gr.Row(variant="default", equal_height=True, elem_classes="container-wrap"):
        # Parallel lists, index-aligned with slider_configs; the event wiring
        # further down passes them to the handlers as inputs/outputs.
        checkboxes = []
        sliders = []
        
        # (voice key, display label) pairs. The key is what ends up in the
        # formula string and is looked up in the module-level `voices` dict.
        slider_configs = [
            ("af", "Default 👩‍🦰"),
            ("af_bella", "Bella 👩‍🦰 🇺🇸"), 
            ("af_sarah", "Sarah 👩‍🦰 🇺🇸"),
            ("af_nicole", "Nicole 👩‍🦰 🇺🇸"), 
            ("af_sky", "Sky 👩‍🦰 🇺🇸"),
            ("am_adam", "Adam 👨 🇺🇸"),
            ("am_michael", "Michael 👨 🇺🇸"),
            ("bf_emma", "Emma 👩‍🦰 🇬🇧"),
            ("bf_isabella", "Isabella 👩‍🦰 🇬🇧"),
            ("bm_george", "George 👨 🇬🇧"),
            ("bm_lewis", "Lewis 👨 🇬🇧")
        ]

        # One column per voice: an unlabeled checkbox above a slider that
        # starts at 0 and non-interactive (the check_box handler enables it).
        for value, label in slider_configs:
            with gr.Column(min_width=70, scale=1, variant="default", elem_classes="vert-group"):
                checkbox = gr.Checkbox(label='')
                slider = gr.Slider(label=label, minimum=0, maximum=1, interactive=False, value=0, step=0.01)
                checkboxes.append(checkbox)
                sliders.append(slider)

    # Read-only formula display (filled by generate_voice_formula and read by
    # text_to_speech), the text to speak, and the button that triggers TTS.
    with gr.Row(equal_height=True):
        formula_display = gr.Textbox(
            label="Voice Combination Formula", 
            value="", 
            lines=2, 
            scale=4, 
            interactive=False
        )
        input_text = gr.Textbox(
            label="Input Text", 
            placeholder="Enter text to convert to speech", 
            lines=2, 
            scale=4
        )
        button_tts = gr.Button("🎙️ Generate Voice", scale=2, min_width=100)

    # Audio output that receives the value returned by text_to_speech
    with gr.Row(equal_height=True):
        kokoro_tts = gr.Audio(label="Generated Speech", type="numpy")

    def generate_voice_formula(*values):
        """Build the normalized voice formula shown in the formula textbox.

        Args:
            *values: All checkbox states followed by all slider values
                (first half checkboxes, second half sliders), each half in
                the same order as ``slider_configs``.

        Returns:
            A string such as ``"0.600 * af_bella + 0.400 * am_adam"`` in
            which each weight is the slider value divided by the sum of all
            active slider values, formatted to three decimals. A single
            active voice therefore yields ``"1.000 * <name>"``. Returns
            ``""`` when no voice is both checked and above zero.
        """
        half = len(values) // 2
        checked_flags = values[:half]
        weights = values[half:]

        # A voice takes part only if its box is ticked and its slider holds
        # a positive number (a missing slider value counts as inactive).
        active = [
            (weight, voice_id)
            for is_checked, weight, (voice_id, _label)
            in zip(checked_flags, weights, slider_configs)
            if is_checked and weight is not None and weight > 0
        ]
        if not active:
            return ""

        # Every active weight is > 0, so the total is > 0 and the division
        # below is safe.
        total = sum(weight for weight, _ in active)
        return " + ".join(
            f"{weight / total:.3f} * {voice_id}" for weight, voice_id in active
        )

    def check_box(checkbox):
        """Return the slider update that matches a checkbox state.

        A ticked box makes the slider interactive at value 1.0; an unticked
        box makes it non-interactive at value 0.
        """
        if not checkbox:
            return gr.Slider(interactive=False, value=0)
        return gr.Slider(interactive=True, value=1.0)

    # Inputs for the formula handler: every checkbox first, then every slider.
    all_inputs = [*checkboxes, *sliders]

    # Every formula refresh uses the same handler, inputs and output.
    refresh_formula = dict(
        fn=generate_voice_formula,
        inputs=all_inputs,
        outputs=[formula_display],
    )

    # Each checkbox gets two listeners on its change event: one toggles the
    # paired slider, the other refreshes the formula text.
    for box, knob in zip(checkboxes, sliders):
        box.change(check_box, [box], [knob])
        box.change(**refresh_formula)

    # Any slider change refreshes the formula text as well.
    for knob in sliders:
        knob.change(**refresh_formula)

    # The button sends the input text and the current formula to the TTS
    # function and routes the result to the audio output.
    button_tts.click(
        text_to_speech,
        [input_text, formula_display],
        [kokoro_tts],
    )


# Launch the Gradio app only when this file is executed as a script.
if __name__ == "__main__":
    demo.launch()