import gradio as gr import torch import os from kokoro import generate from models import build_model # Initialize model and device device = 'cuda' if torch.cuda.is_available() else 'cpu' MODEL = build_model('kokoro-v0_19.pth', device) # Load the voice models voices = { 'af': torch.load("voices/af.pt", weights_only=True), 'af_bella': torch.load("voices/af_bella.pt", weights_only=True), 'af_sarah': torch.load("voices/af_sarah.pt", weights_only=True), 'am_adam': torch.load("voices/am_adam.pt", weights_only=True), 'am_michael': torch.load("voices/am_michael.pt", weights_only=True), 'bf_emma': torch.load("voices/bf_emma.pt", weights_only=True), 'bf_isabella': torch.load("voices/bf_isabella.pt", weights_only=True), 'bm_george': torch.load("voices/bm_george.pt", weights_only=True), 'bm_lewis': torch.load("voices/bm_lewis.pt", weights_only=True), 'af_nicole': torch.load("voices/af_nicole.pt", weights_only=True), 'af_sky': torch.load("voices/af_sky.pt", weights_only=True) } def parse_voice_formula(formula): """Parse the voice formula string and return the combined voice tensor.""" if not formula.strip(): raise ValueError("Empty voice formula") # Initialize the weighted sum weighted_sum = None # Split the formula into terms terms = formula.split('+') for term in terms: # Parse each term (format: "0.333 * voice_name") weight, voice_name = term.strip().split('*') weight = float(weight.strip()) voice_name = voice_name.strip() # Get the voice tensor if voice_name not in voices: raise ValueError(f"Unknown voice: {voice_name}") voice_tensor = voices[voice_name] # Add to weighted sum if weighted_sum is None: weighted_sum = weight * voice_tensor else: weighted_sum += weight * voice_tensor return weighted_sum def get_new_voice(formula): try: # Parse the formula and get the combined voice tensor weighted_voices = parse_voice_formula(formula) # Save and load the combined voice torch.save(weighted_voices, "weighted_normalised_voices.pt") VOICEPACK = torch.load("weighted_normalised_voices.pt", weights_only=False).to(device) return VOICEPACK except Exception as e: raise gr.Error(f"Failed to create voice: {str(e)}") def text_to_speech(text, formula): try: if not text.strip(): raise gr.Error("Please enter some text") if not formula.strip(): raise gr.Error("Please select at least one voice") # Get the combined voice VOICEPACK = get_new_voice(formula) # Generate audio audio, phonemes = generate(MODEL, text, VOICEPACK, lang='a') return (24000, audio) except Exception as e: raise gr.Error(f"Failed to generate speech: {str(e)}") custom_css = """ /* Main title */ .heading { color: rgb(76, 175, 147) !important; font-size: 2em !important; font-weight: 600 !important; text-align: center !important; margin: 20px 0 10px 0 !important; } /* Description text */ .description { color: rgba(76, 175, 147, 0.7) !important; text-align: center !important; max-width: 800px !important; margin: 0 auto 30px auto !important; font-size: 0.9em !important; line-height: 1.6 !important; } .container-wrap { display: flex !important; gap: 5px !important; justify-content: center !important; margin: 0 auto !important; max-width: 1400px !important; /* Increased max-width */ } .vert-group { min-width: 100px !important; /* Increased from 80px */ width: 120px !important; /* Increased from 90px */ flex: 0 0 auto !important; } .vert-group label { white-space: nowrap !important; overflow: visible !important; width: auto !important; font-size: 0.85em !important; /* Slightly increased font size */ transform-origin: left center !important; transform: rotate(0deg) translateX(-50%) !important; position: relative !important; left: 50% !important; display: inline-block !important; text-align: center !important; margin-bottom: 5px !important; padding: 0 5px !important; /* Added padding */ } .vert-group .wrap label { text-align: center !important; width: 100% !important; display: block !important; } /* Hover effect */ .vert-group:hover { transform: translateY(-5px) !important; box-shadow: 0 5px 15px rgba(0, 0, 0, 0.2) !important; } .slider_input_container { height: 200px !important; position: relative !important; width: 50px !important; /* Increased from 40px */ margin: 0 auto !important; overflow: hidden !important; } .slider_input_container input[type="range"] { position: absolute !important; width: 200px !important; left: -75px !important; /* Adjusted from -80px */ top: 100px !important; transform: rotate(90deg) !important; } .min_value { position: absolute !important; bottom: 0 !important; left: 10px !important; } .max_value { position: absolute !important; top: 0 !important; left: 10px !important; } .tab-like-container { transform: scale(0.8) !important; } .gradio-row, .gradio-column { background: none !important; border: none !important; min-width: unset !important; } .heading { text-align: center !important; margin-bottom: 1rem !important; } .description { text-align: center !important; margin-bottom: 2rem !important; color: rgba(255, 255, 255, 0.7) !important; } /* Generate button */ #generate-btn { background: linear-gradient(90deg, rgb(76, 175, 147), rgb(76, 147, 175)) !important; border: none !important; border-radius: 8px !important; padding: 12px 24px !important; color: white !important; font-weight: 600 !important; transition: transform 0.2s, box-shadow 0.2s !important; } #generate-btn:hover { transform: translateY(-2px) !important; box-shadow: 0 5px 15px rgba(76, 175, 147, 0.3) !important; } """ with gr.Blocks(css=custom_css, theme="ocean") as demo: gr.HTML( """