"""Gradio app that mixes Kokoro TTS voice packs and synthesises speech.

The UI exposes one checkbox + slider per voice.  The checked voices and their
slider weights are rendered as a normalised formula string such as
``"0.600 * af_bella + 0.400 * am_adam"``; that string is then parsed back into
a weighted sum of the voice tensors and handed to ``kokoro.generate``.
"""

import os

import gradio as gr
import torch

from kokoro import generate
from models import build_model

# Initialize model and device
device = 'cuda' if torch.cuda.is_available() else 'cpu'
MODEL = build_model('kokoro-v0_19.pth', device)

# Voice identifiers, in the order the voice packs are loaded.
_VOICE_NAMES = (
    'af',
    'af_bella',
    'af_sarah',
    'am_adam',
    'am_michael',
    'bf_emma',
    'bf_isabella',
    'bm_george',
    'bm_lewis',
    'af_nicole',
    'af_sky',
)

# Load the voice models (one tensor per voice, keyed by voice identifier).
voices = {
    name: torch.load(f"voices/{name}.pt", weights_only=True)
    for name in _VOICE_NAMES
}

# Emoji used in UI labels.  They are spelled as escape sequences so the source
# file stays pure ASCII and cannot be corrupted by a wrong-encoding round trip.
_MIC = "\U0001F399\uFE0F"               # studio microphone
_WOMAN = "\U0001F469\u200D\U0001F9B0"   # woman, red hair (ZWJ sequence)
_MAN = "\U0001F468"                     # man
_FLAG_US = "\U0001F1FA\U0001F1F8"       # regional indicators U + S
_FLAG_GB = "\U0001F1EC\U0001F1E7"       # regional indicators G + B


def parse_voice_formula(formula):
    """Parse a voice formula string and return the combined voice tensor.

    Args:
        formula: Terms of the form ``"<weight> * <voice_name>"`` joined by
            ``+``, e.g. ``"0.333 * af + 0.667 * am_adam"``.

    Returns:
        The sum of ``weight * voices[voice_name]`` over all terms.  Weights are
        applied as written; they are not re-normalised here.

    Raises:
        ValueError: If the formula is empty, a term is malformed, a weight is
            not a number, or a voice name is not in ``voices``.
    """
    if not formula or not formula.strip():
        raise ValueError("Empty voice formula")

    weighted_sum = None
    for term in formula.split('+'):
        # partition() never raises, so malformed terms get a readable message
        # instead of a tuple-unpacking error.
        weight_text, separator, voice_name = term.partition('*')
        voice_name = voice_name.strip()
        if not separator or not voice_name:
            raise ValueError(
                f"Malformed term {term.strip()!r} (expected 'weight * voice_name')"
            )

        try:
            weight = float(weight_text)
        except ValueError:
            raise ValueError(
                f"Invalid weight {weight_text.strip()!r} in term {term.strip()!r}"
            ) from None

        if voice_name not in voices:
            raise ValueError(f"Unknown voice: {voice_name}")

        contribution = weight * voices[voice_name]
        if weighted_sum is None:
            weighted_sum = contribution
        else:
            # In-place is safe: weighted_sum is a fresh tensor, never an entry
            # of ``voices``.
            weighted_sum += contribution

    return weighted_sum


def get_new_voice(formula):
    """Build the mixed voice pack for ``formula`` and move it to ``device``.

    The mixed tensor is also written to ``weighted_normalised_voices.pt`` in
    the working directory, as the original implementation did.

    Raises:
        gr.Error: If the formula cannot be parsed or the voice cannot be saved.
    """
    try:
        weighted_voices = parse_voice_formula(formula)

        # Keep the on-disk copy, but do not read it back: reloading a shared
        # file races with concurrent requests and required an unrestricted
        # (weights_only=False) unpickle.  The in-memory tensor is the same data.
        torch.save(weighted_voices, "weighted_normalised_voices.pt")
        return weighted_voices.to(device)
    except Exception as e:
        raise gr.Error(f"Failed to create voice: {str(e)}") from e


def text_to_speech(text, formula):
    """Synthesise ``text`` with the voice described by ``formula``.

    Returns:
        A ``(sample_rate, audio)`` tuple with a sample rate of 24000, which is
        the format passed to the ``gr.Audio(type="numpy")`` output.

    Raises:
        gr.Error: If the text or formula is empty, the voice cannot be built,
            or speech generation fails.
    """
    # Validation and voice construction raise gr.Error themselves; they stay
    # outside the try block so their messages are not re-wrapped as
    # "Failed to generate speech: ...".
    if not text or not text.strip():
        raise gr.Error("Please enter some text")
    if not formula or not formula.strip():
        raise gr.Error("Please select at least one voice")

    VOICEPACK = get_new_voice(formula)

    try:
        audio, _phonemes = generate(MODEL, text, VOICEPACK, lang='a')
    except Exception as e:
        raise gr.Error(f"Failed to generate speech: {str(e)}") from e
    return (24000, audio)


custom_css = """
.container-wrap {
    display: flex !important;
    gap: 5px !important;
    justify-content: center !important;
    margin: 0 auto !important;
    max-width: 1400px !important; /* Increased max-width */
}
.vert-group {
    min-width: 100px !important; /* Increased from 80px */
    width: 120px !important; /* Increased from 90px */
    flex: 0 0 auto !important;
}
.vert-group label {
    white-space: nowrap !important;
    overflow: visible !important;
    width: auto !important;
    font-size: 0.85em !important; /* Slightly increased font size */
    transform-origin: left center !important;
    transform: rotate(0deg) translateX(-50%) !important;
    position: relative !important;
    left: 50% !important;
    display: inline-block !important;
    text-align: center !important;
    margin-bottom: 5px !important;
    padding: 0 5px !important; /* Added padding */
}
.vert-group .wrap label {
    text-align: center !important;
    width: 100% !important;
    display: block !important;
}
.slider_input_container {
    height: 200px !important;
    position: relative !important;
    width: 50px !important; /* Increased from 40px */
    margin: 0 auto !important;
    overflow: hidden !important;
}
.slider_input_container input[type="range"] {
    position: absolute !important;
    width: 200px !important;
    left: -75px !important; /* Adjusted from -80px */
    top: 100px !important;
    transform: rotate(90deg) !important;
}
.min_value {
    position: absolute !important;
    bottom: 0 !important;
    left: 10px !important;
}
.max_value {
    position: absolute !important;
    top: 0 !important;
    left: 10px !important;
}
.tab-like-container {
    transform: scale(0.8) !important;
}
.gradio-row, .gradio-column {
    background: none !important;
    border: none !important;
    min-width: unset !important;
}
.heading {
    text-align: center !important;
    margin-bottom: 1rem !important;
}
.description {
    text-align: center !important;
    margin-bottom: 2rem !important;
    color: rgba(255, 255, 255, 0.7) !important;
}
"""

# Kept at column 0 so the Markdown heading markers start their lines.
_HEADER_MARKDOWN = f"""
# {_MIC} Voice Mixer - Kokoro TTS
### Mix and match different voices to create your perfect text-to-speech voice
This app lets you combine multiple voices with different weights to create custom voice combinations.
Select voices using checkboxes and adjust their weights using the sliders below.
"""

# Define slider configurations with emojis: (voice identifier, slider label).
# The order here is the left-to-right order of the columns in the UI.
slider_configs = [
    ("af", f"Default {_WOMAN}"),
    ("af_bella", f"Bella {_WOMAN} {_FLAG_US}"),
    ("af_sarah", f"Sarah {_WOMAN} {_FLAG_US}"),
    ("af_nicole", f"Nicole {_WOMAN} {_FLAG_US}"),
    ("af_sky", f"Sky {_WOMAN} {_FLAG_US}"),
    ("am_adam", f"Adam {_MAN} {_FLAG_US}"),
    ("am_michael", f"Michael {_MAN} {_FLAG_US}"),
    ("bf_emma", f"Emma {_WOMAN} {_FLAG_GB}"),
    ("bf_isabella", f"Isabella {_WOMAN} {_FLAG_GB}"),
    ("bm_george", f"George {_MAN} {_FLAG_GB}"),
    ("bm_lewis", f"Lewis {_MAN} {_FLAG_GB}"),
]


def generate_voice_formula(*values):
    """Render the checked voices and their weights as a normalised formula.

    Args:
        *values: All checkbox values followed by all slider values, each group
            in ``slider_configs`` order (first half checkboxes, second half
            sliders).

    Returns:
        A string like ``"0.600 * voice1 + 0.400 * voice2"`` whose weights are
        each slider value divided by the sum of the active slider values, or
        ``""`` when no voice is both checked and weighted above zero.
    """
    n = len(values) // 2
    checkbox_values = values[:n]
    slider_values = values[n:]

    # Use the voice identifier (not the display label) in the formula, because
    # parse_voice_formula looks terms up by identifier.
    active_pairs = [
        (weight, voice_id)
        for checked, weight, (voice_id, _label) in zip(
            checkbox_values, slider_values, slider_configs
        )
        if checked and weight and weight > 0
    ]
    if not active_pairs:
        return ""

    # Every active weight is > 0, so the total is strictly positive.  A single
    # active voice naturally normalises to "1.000".
    total_sum = sum(weight for weight, _ in active_pairs)
    return " + ".join(
        f"{weight / total_sum:.3f} * {name}" for weight, name in active_pairs
    )


def check_box(checkbox):
    """Handle checkbox changes.

    Checking a voice enables its slider and sets it to 1.0; unchecking it
    disables the slider and resets it to 0.
    """
    if checkbox:
        return gr.Slider(interactive=True, value=1.0)
    return gr.Slider(interactive=False, value=0)


with gr.Blocks(css=custom_css, theme="ocean") as demo:
    gr.Markdown(_HEADER_MARKDOWN)

    with gr.Row(variant="default", equal_height=True, elem_classes="container-wrap"):
        checkboxes = []
        sliders = []

        # Create columns for each slider
        for value, label in slider_configs:
            with gr.Column(min_width=70, scale=1, variant="default", elem_classes="vert-group"):
                checkbox = gr.Checkbox(label='')
                slider = gr.Slider(
                    label=label,
                    minimum=0,
                    maximum=1,
                    interactive=False,
                    value=0,
                    step=0.01,
                )
                checkboxes.append(checkbox)
                sliders.append(slider)

    # Add voice combination formula display
    with gr.Row(equal_height=True):
        formula_display = gr.Textbox(
            label="Voice Combination Formula",
            value="",
            lines=2,
            scale=4,
            interactive=False,
        )
        input_text = gr.Textbox(
            label="Input Text",
            placeholder="Enter text to convert to speech",
            lines=2,
            scale=4,
        )
        button_tts = gr.Button(f"{_MIC} Generate Voice", scale=2, min_width=100)

    # Generate speech from the selected custom voice
    with gr.Row(equal_height=True):
        kokoro_tts = gr.Audio(label="Generated Speech", type="numpy")

    # Connect all checkboxes and sliders
    all_inputs = checkboxes + sliders

    # On a checkbox change, first update its slider, then recompute the
    # formula.  Chaining with .then() makes the formula step run after the
    # slider update instead of concurrently with it.
    for checkbox, slider in zip(checkboxes, sliders):
        checkbox.change(
            fn=check_box,
            inputs=[checkbox],
            outputs=[slider],
        ).then(
            fn=generate_voice_formula,
            inputs=all_inputs,
            outputs=[formula_display],
        )

    # Update formula on slider changes
    for slider in sliders:
        slider.change(
            fn=generate_voice_formula,
            inputs=all_inputs,
            outputs=[formula_display],
        )

    button_tts.click(
        fn=text_to_speech,
        inputs=[input_text, formula_display],
        outputs=[kokoro_tts],
    )

if __name__ == "__main__":
    demo.launch()