ysharma's picture
ysharma HF staff
Update app.py
c845ca2 verified
raw
history blame
9.81 kB
import gradio as gr
import torch
import os
from kokoro import generate
from models import build_model
# Choose the compute device once at import time: the GPU when PyTorch
# reports CUDA as available, otherwise the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

# Build the TTS model from the 'kokoro-v0_19.pth' file on the chosen device.
MODEL = build_model('kokoro-v0_19.pth', device)
# Voice tensors keyed by voice id. Each id maps to the file
# "voices/<id>.pt"; ``weights_only=True`` restricts unpickling to plain
# tensor data. The tuple order fixes the dictionary's insertion order.
voices = {
    voice_id: torch.load(f"voices/{voice_id}.pt", weights_only=True)
    for voice_id in (
        'af',
        'af_bella',
        'af_sarah',
        'am_adam',
        'am_michael',
        'bf_emma',
        'bf_isabella',
        'bm_george',
        'bm_lewis',
        'af_nicole',
        'af_sky',
    )
}
def parse_voice_formula(formula, voice_table=None):
    """Parse a voice formula and return the weighted sum of the named voices.

    The formula is a ``+``-separated list of ``weight * voice_name`` terms,
    for example ``"0.600 * af_bella + 0.400 * am_adam"``.

    Args:
        formula: Formula string to parse.
        voice_table: Optional mapping from voice name to the value to blend
            (anything supporting ``weight * value`` and ``+``). Defaults to
            the module-level ``voices`` dictionary.

    Returns:
        The sum of ``weight * voice_table[voice_name]`` over all terms.

    Raises:
        ValueError: If the formula is empty, a term is not of the form
            ``weight * voice_name``, a weight is not a number, or a voice
            name is not present in the table.
    """
    if not formula.strip():
        raise ValueError("Empty voice formula")
    if voice_table is None:
        voice_table = voices

    weighted_sum = None
    for term in formula.split('+'):
        # Each term must contain exactly one '*' separating weight and name;
        # this also rejects empty terms left by a leading/trailing '+'.
        parts = term.split('*')
        if len(parts) != 2:
            raise ValueError(
                f"Invalid term {term.strip()!r}: expected 'weight * voice_name'"
            )
        weight_text, voice_name = (part.strip() for part in parts)

        try:
            weight = float(weight_text)
        except ValueError:
            raise ValueError(
                f"Invalid weight {weight_text!r} in term {term.strip()!r}"
            ) from None

        if voice_name not in voice_table:
            raise ValueError(f"Unknown voice: {voice_name}")

        contribution = weight * voice_table[voice_name]
        # Rebind instead of using ``+=`` so no operand is modified in place.
        if weighted_sum is None:
            weighted_sum = contribution
        else:
            weighted_sum = weighted_sum + contribution
    return weighted_sum
def get_new_voice(formula):
    """Build the blended voice tensor for *formula* on the active device.

    Args:
        formula: Voice formula string understood by ``parse_voice_formula``.

    Returns:
        The combined voice tensor, moved to the module-level ``device``.

    Raises:
        gr.Error: If the formula cannot be parsed or the tensor cannot be
            moved to the device. The original exception is chained as the
            cause.
    """
    try:
        # Parse the formula and get the combined voice tensor
        weighted_voices = parse_voice_formula(formula)
        # Move the blend straight to the device. Round-tripping it through a
        # fixed file name on disk would add I/O to every request and let
        # concurrent requests overwrite each other's voice.
        return weighted_voices.to(device)
    except Exception as e:
        raise gr.Error(f"Failed to create voice: {str(e)}") from e
def text_to_speech(text, formula):
    """Synthesize *text* with the voice blend described by *formula*.

    Args:
        text: Text to convert to speech.
        formula: Voice formula string (``weight * voice_name`` terms joined
            by ``+``), passed to ``get_new_voice``.

    Returns:
        A ``(24000, audio)`` tuple: the fixed sample rate followed by the
        audio returned by ``generate``.

    Raises:
        gr.Error: If *text* or *formula* is empty, if the voice cannot be
            built, or if generation fails.
    """
    # Validate before the try block so these messages reach the user as
    # written instead of being re-wrapped by the generic handler below.
    if not text or not text.strip():
        raise gr.Error("Please enter some text")
    if not formula or not formula.strip():
        raise gr.Error("Please select at least one voice")

    try:
        # Get the combined voice
        voicepack = get_new_voice(formula)
        # Generate audio; the second return value (phonemes) is not needed.
        # NOTE(review): lang='a' presumably selects the language variant in
        # kokoro.generate — confirm against that module.
        audio, _ = generate(MODEL, text, voicepack, lang='a')
    except gr.Error:
        # Already a user-facing error (e.g. from get_new_voice); pass it on
        # without adding a second prefix.
        raise
    except Exception as e:
        raise gr.Error(f"Failed to generate speech: {str(e)}") from e
    return (24000, audio)
# Stylesheet passed to gr.Blocks(css=...).
# - .container-wrap and .vert-group are the classes attached through
#   elem_classes to the voice row and to each per-voice column.
# - The input[type="range"] rule rotates the range input by 90 degrees inside
#   a 200px-tall container, so each weight slider is drawn vertically.
# - NOTE(review): .slider_input_container, .min_value, .max_value,
#   .tab-like-container, .gradio-row and .gradio-column are presumably class
#   names emitted by Gradio itself — confirm against the Gradio version in use.
# - .heading and .description are not referenced by any component in this file.
custom_css = """
.container-wrap {
display: flex !important;
gap: 5px !important;
justify-content: center !important;
margin: 0 auto !important;
max-width: 1400px !important; /* Increased max-width */
}
.vert-group {
min-width: 100px !important; /* Increased from 80px */
width: 120px !important; /* Increased from 90px */
flex: 0 0 auto !important;
}
.vert-group label {
white-space: nowrap !important;
overflow: visible !important;
width: auto !important;
font-size: 0.85em !important; /* Slightly increased font size */
transform-origin: left center !important;
transform: rotate(0deg) translateX(-50%) !important;
position: relative !important;
left: 50% !important;
display: inline-block !important;
text-align: center !important;
margin-bottom: 5px !important;
padding: 0 5px !important; /* Added padding */
}
.vert-group .wrap label {
text-align: center !important;
width: 100% !important;
display: block !important;
}
.slider_input_container {
height: 200px !important;
position: relative !important;
width: 50px !important; /* Increased from 40px */
margin: 0 auto !important;
overflow: hidden !important;
}
.slider_input_container input[type="range"] {
position: absolute !important;
width: 200px !important;
left: -75px !important; /* Adjusted from -80px */
top: 100px !important;
transform: rotate(90deg) !important;
}
.min_value {
position: absolute !important;
bottom: 0 !important;
left: 10px !important;
}
.max_value {
position: absolute !important;
top: 0 !important;
left: 10px !important;
}
.tab-like-container {
transform: scale(0.8) !important;
}
.gradio-row, .gradio-column {
background: none !important;
border: none !important;
min-width: unset !important;
}
.heading {
text-align: center !important;
margin-bottom: 1rem !important;
}
.description {
text-align: center !important;
margin-bottom: 2rem !important;
color: rgba(255, 255, 255, 0.7) !important;
}
"""
with gr.Blocks(css=custom_css, theme="ocean") as demo:
    # Page title and short usage instructions. The string's lines are kept
    # flush-left so the Markdown text stays exactly as written.
    gr.Markdown(
        """
# 🎙️ Voice Mixer - Kokoro TTS
### Mix and match different voices to create your perfect text-to-speech voice
This app lets you combine multiple voices with different weights to create custom voice combinations.
Select voices using checkboxes and adjust their weights using the sliders below.
"""
    )

    # One narrow column per voice: an enabling checkbox above a weight slider.
    with gr.Row(variant="default", equal_height=True, elem_classes="container-wrap"):
        checkboxes = []
        sliders = []
        # (voice id used in the formula, label shown above the slider)
        slider_configs = [
            ("af", "Default 👩‍🦰"),
            ("af_bella", "Bella 👩‍🦰 🇺🇸"),
            ("af_sarah", "Sarah 👩‍🦰 🇺🇸"),
            ("af_nicole", "Nicole 👩‍🦰 🇺🇸"),
            ("af_sky", "Sky 👩‍🦰 🇺🇸"),
            ("am_adam", "Adam 👨 🇺🇸"),
            ("am_michael", "Michael 👨 🇺🇸"),
            ("bf_emma", "Emma 👩‍🦰 🇬🇧"),
            ("bf_isabella", "Isabella 👩‍🦰 🇬🇧"),
            ("bm_george", "George 👨 🇬🇧"),
            ("bm_lewis", "Lewis 👨 🇬🇧")
        ]
        for _voice_id, caption in slider_configs:
            with gr.Column(min_width=70, scale=1, variant="default", elem_classes="vert-group"):
                toggle = gr.Checkbox(label='')
                # Sliders start locked at 0 until their checkbox is ticked.
                weight_slider = gr.Slider(
                    label=caption,
                    minimum=0,
                    maximum=1,
                    interactive=False,
                    value=0,
                    step=0.01,
                )
            checkboxes.append(toggle)
            sliders.append(weight_slider)

    # Read-only formula display, text input and the generate button.
    with gr.Row(equal_height=True):
        formula_display = gr.Textbox(
            label="Voice Combination Formula",
            value="",
            lines=2,
            scale=4,
            interactive=False
        )
        input_text = gr.Textbox(
            label="Input Text",
            placeholder="Enter text to convert to speech",
            lines=2,
            scale=4
        )
        button_tts = gr.Button("🎙️ Generate Voice", scale=2, min_width=100)

    # Output player for the synthesized audio.
    with gr.Row(equal_height=True):
        kokoro_tts = gr.Audio(label="Generated Speech", type="numpy")

    def generate_voice_formula(*values):
        """Render the checkbox/slider state as a normalized formula string.

        ``values`` holds every checkbox state followed by every slider
        value, both in ``slider_configs`` order. Only voices that are ticked
        and have a weight above zero take part; each weight is divided by
        the total of the participating weights.

        Returns: a string like "0.600 * voice1 + 0.400 * voice2", or "" when
        no voice is active.
        """
        half = len(values) // 2
        ticked = values[:half]
        weights = list(values[half:])

        # Collect (weight, voice id) for every enabled, non-zero slider.
        selected = []
        for idx, (voice_id, _caption) in enumerate(slider_configs):
            if ticked[idx] and weights[idx] > 0:
                selected.append((weights[idx], voice_id))
        if not selected:
            return ""

        weight_total = sum(weight for weight, _ in selected)
        if weight_total == 0:
            return ""

        # A lone voice is always reported with weight 1.000.
        if len(selected) == 1:
            return f"1.000 * {selected[0][1]}"

        return " + ".join(
            f"{weight / weight_total:.3f} * {voice_id}"
            for weight, voice_id in selected
        )

    def check_box(checkbox):
        """Return the slider update for a checkbox state.

        Ticked: slider becomes editable with value 1.0. Unticked: slider is
        locked again and reset to 0.
        """
        if not checkbox:
            return gr.Slider(interactive=False, value=0)
        return gr.Slider(interactive=True, value=1.0)

    # Every checkbox followed by every slider; generate_voice_formula relies
    # on this ordering to split its arguments in half.
    all_inputs = checkboxes + sliders
    formula_refresh = dict(
        fn=generate_voice_formula,
        inputs=all_inputs,
        outputs=[formula_display],
    )

    # A checkbox change first updates its own slider, then refreshes the formula.
    for toggle, weight_slider in zip(checkboxes, sliders):
        toggle.change(fn=check_box, inputs=[toggle], outputs=[weight_slider])
        toggle.change(**formula_refresh)

    # Any slider change refreshes the formula as well.
    for weight_slider in sliders:
        weight_slider.change(**formula_refresh)

    # Synthesize speech from the input text and the displayed formula.
    button_tts.click(
        fn=text_to_speech,
        inputs=[input_text, formula_display],
        outputs=[kokoro_tts],
    )
# Launch the Gradio app only when this file is executed as a script, not
# when it is imported as a module.
if __name__ == "__main__":
    demo.launch()