File size: 11,787 Bytes
b2fcaf7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
c845ca2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
b2fcaf7
97e6cf3
 
 
 
 
 
 
b9b94a3
97e6cf3
0760cc6
97e6cf3
0760cc6
97e6cf3
 
 
 
 
 
0760cc6
 
 
 
 
 
 
 
 
 
 
b2fcaf7
 
 
c845ca2
 
 
b2fcaf7
 
c845ca2
fc3064c
b2fcaf7
 
 
 
 
 
c845ca2
b2fcaf7
 
 
 
 
 
 
fc3064c
b2fcaf7
 
 
 
 
 
97e6cf3
 
 
 
 
b2fcaf7
 
 
c845ca2
b2fcaf7
 
 
 
 
 
c845ca2
b2fcaf7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
c845ca2
 
 
 
 
 
 
 
 
97e6cf3
 
 
 
 
 
 
 
 
 
 
 
 
 
c845ca2
b2fcaf7
 
35e8908
c845ca2
97e6cf3
 
6d8bd12
 
 
97e6cf3
c845ca2
 
97e6cf3
b2fcaf7
 
 
c845ca2
 
b2fcaf7
c845ca2
fc3064c
 
 
 
 
 
 
 
 
 
b2fcaf7
 
 
c845ca2
b2fcaf7
 
c845ca2
b2fcaf7
 
 
 
 
c845ca2
 
 
 
 
35e8908
a8b6d55
 
c845ca2
 
 
 
 
 
 
97e6cf3
b2fcaf7
 
 
ae9a123
b2fcaf7
 
 
 
e71a91a
b2fcaf7
 
 
 
 
 
e71a91a
 
 
b2fcaf7
 
 
 
e71a91a
 
 
 
 
 
b2fcaf7
 
 
 
 
c845ca2
b2fcaf7
 
 
 
 
 
 
 
 
 
c845ca2
b2fcaf7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
c845ca2
b2fcaf7
 
 
c845ca2
b2fcaf7
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
import gradio as gr
import torch
import os
from kokoro import generate
from models import build_model

# Choose the compute device once: CUDA when torch reports it, CPU otherwise.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

# Build the TTS model from the local checkpoint file on the chosen device.
MODEL = build_model('kokoro-v0_19.pth', device)

# Load each bundled voice from voices/<name>.pt, keyed by its voice name.
# The names are listed in the order the voices are loaded.
voices = {
    voice_id: torch.load(f"voices/{voice_id}.pt", weights_only=True)
    for voice_id in (
        'af',
        'af_bella',
        'af_sarah',
        'am_adam',
        'am_michael',
        'bf_emma',
        'bf_isabella',
        'bm_george',
        'bm_lewis',
        'af_nicole',
        'af_sky',
    )
}


def parse_voice_formula(formula, voice_bank=None):
    """Parse a voice formula string and return the weighted sum of voices.

    The formula is a ``+``-separated list of ``weight * voice_name`` terms,
    for example ``"0.333 * af_bella + 0.667 * am_adam"``.

    Args:
        formula: The formula string to parse.
        voice_bank: Optional mapping from voice name to the value to blend
            (a tensor in this app). Defaults to the module-level ``voices``.

    Returns:
        The sum of ``weight * voice_bank[voice_name]`` over all terms.

    Raises:
        ValueError: If the formula is empty, a term is not of the form
            ``weight * voice_name``, a weight is not a number, or a voice
            name is not present in ``voice_bank``.
    """
    if not formula or not formula.strip():
        raise ValueError("Empty voice formula")

    if voice_bank is None:
        voice_bank = voices

    weighted_sum = None

    for term in formula.split('+'):
        # Each term must look like "0.333 * voice_name".
        weight_text, separator, voice_name = term.partition('*')
        weight_text = weight_text.strip()
        voice_name = voice_name.strip()

        # Reject missing '*', empty sides (e.g. a trailing '+'), or extra '*'.
        if not separator or not weight_text or not voice_name or '*' in voice_name:
            raise ValueError(
                f"Malformed term {term.strip()!r}; expected 'weight * voice_name'"
            )

        try:
            weight = float(weight_text)
        except ValueError:
            raise ValueError(
                f"Invalid weight {weight_text!r} for voice {voice_name!r}"
            ) from None

        if voice_name not in voice_bank:
            raise ValueError(f"Unknown voice: {voice_name}")

        contribution = weight * voice_bank[voice_name]

        # Build a new value on every step so stored voices are never mutated.
        if weighted_sum is None:
            weighted_sum = contribution
        else:
            weighted_sum = weighted_sum + contribution

    return weighted_sum

def get_new_voice(formula):
    """Build the blended voice described by ``formula`` on ``device``.

    Args:
        formula: Voice formula string accepted by ``parse_voice_formula``.

    Returns:
        The combined voice tensor moved to the module-level ``device``.

    Raises:
        gr.Error: If the formula cannot be parsed, or saving/moving the
            combined voice fails. The original exception is chained.
    """
    try:
        # Parse the formula and get the combined voice tensor
        weighted_voices = parse_voice_formula(formula)

        # The blend is still written to the same file as before, but it is no
        # longer read back. The file name is fixed, so overlapping calls could
        # read each other's voice, and the reload used weights_only=False
        # (the unrestricted pickle loader). The in-memory tensor already
        # holds the same values.
        torch.save(weighted_voices, "weighted_normalised_voices.pt")
        return weighted_voices.to(device)
    except Exception as e:
        raise gr.Error(f"Failed to create voice: {str(e)}") from e
def text_to_speech(text, formula):
    """Synthesize ``text`` with the voice blend described by ``formula``.

    Args:
        text: The text to speak; must contain non-whitespace characters.
        formula: Voice formula string (see ``parse_voice_formula``); must be
            non-empty.

    Returns:
        A ``(24000, audio)`` tuple, where ``audio`` is the first value
        returned by ``generate``.

    Raises:
        gr.Error: If an input is empty, the voice cannot be built, or
            generation fails.
    """
    # Validate before the try block so these messages reach the user as
    # written instead of being re-wrapped as "Failed to generate speech: ...".
    if not text or not text.strip():
        raise gr.Error("Please enter some text")

    if not formula or not formula.strip():
        raise gr.Error("Please select at least one voice")

    try:
        # Get the combined voice
        VOICEPACK = get_new_voice(formula)

        # Generate audio; the second return value (phonemes) is not used here.
        audio, _phonemes = generate(MODEL, text, VOICEPACK, lang='a')
    except gr.Error:
        # Already a user-facing error (e.g. from get_new_voice); do not wrap twice.
        raise
    except Exception as e:
        raise gr.Error(f"Failed to generate speech: {str(e)}") from e

    return (24000, audio)
        

# Stylesheet handed to gr.Blocks(css=...) below. It styles the page heading
# and description, lays the per-voice columns out side by side
# (.container-wrap / .vert-group), rotates the range inputs by 90 degrees
# inside .slider_input_container, and themes the #generate-btn button.
# NOTE(review): .heading and .description are each declared twice; the later
# declarations near the end look like they take precedence for the properties
# they repeat (e.g. the .description color) - confirm this is intended.
custom_css = """
/* Main title */
.heading {
    color: rgb(76, 175, 147) !important;
    font-size: 2em !important;
    font-weight: 600 !important;
    text-align: center !important;
    margin: 20px 0 10px 0 !important;
    width: 100% !important;
}
/* Description text - Dark mode */
.description {
    color: var(--body-text-color, rgba(76, 175, 147, 0.7)) !important;
    text-align: center !important;
    max-width: 800px !important;
    margin: 0 auto 30px auto !important;
    font-size: 0.9em !important;
    line-height: 1.6 !important;
}
/* Description text - Light mode override */
.light-mode .description, 
[data-theme="light"] .description {
    color: rgba(55, 65, 81, 0.9) !important;
}
/* Description text - Bold elements in light mode */
.light-mode .description b, 
[data-theme="light"] .description b {
    color: rgb(55, 65, 81) !important;
    font-weight: 600 !important;
}
.container-wrap {
    display: flex !important;
    gap: 5px !important;
    justify-content: center !important;
    margin: 0 auto !important;
    max-width: 1400px !important;  /* Increased max-width */
}
.vert-group {
    min-width: 100px !important;   /* Increased from 80px */
    width: 105px !important;       /* Increased from 90px */
    flex: 0 0 auto !important;
}
.vert-group label {
    white-space: nowrap !important;
    overflow: visible !important;
    width: auto !important;
    font-size: 0.85em !important;  /* Slightly increased font size */
    transform-origin: left center !important;
    transform: rotate(0deg) translateX(-50%) !important;
    position: relative !important;
    left: 50% !important;
    display: inline-block !important;
    text-align: center !important;
    margin-bottom: 5px !important;
    padding: 0 1px !important;     /* Added padding */
}
.vert-group .wrap label {
    text-align: center !important;
    width: 100% !important;
    display: block !important;
}
/* Hover effect */
.vert-group:hover {
    transform: translateY(-5px) !important;
    box-shadow: 0 5px 15px rgba(0, 0, 0, 0.2) !important;
}
.slider_input_container {
    height: 200px !important;
    position: relative !important;
    width: 50px !important;        /* Increased from 40px */
    margin: 0 auto !important;
    overflow: hidden !important;
}
.slider_input_container input[type="range"] {
    position: absolute !important;
    width: 200px !important;
    left: -75px !important;        /* Adjusted from -80px */
    top: 100px !important;
    transform: rotate(90deg) !important;
}
.min_value {
    position: absolute !important;
    bottom: 0 !important;
    left: 10px !important;
}
.max_value {
    position: absolute !important;
    top: 0 !important;
    left: 10px !important;
}
.tab-like-container {
    transform: scale(0.8) !important;
}
.gradio-row, .gradio-column {
    background: none !important;
    border: none !important;
    min-width: unset !important;
}
.heading {
    text-align: center !important;
    margin-bottom: 1rem !important;
}
.description {
    text-align: center !important;
    margin-bottom: 2rem !important;
    color: rgba(255, 255, 255, 0.7) !important;
}
/* Generate button */
#generate-btn {
    background: linear-gradient(90deg, rgb(76, 175, 147), rgb(76, 147, 175)) !important;
    border: none !important;
    border-radius: 8px !important;
    padding: 12px 24px !important;
    color: white !important;
    font-weight: 600 !important;
    transition: transform 0.2s, box-shadow 0.2s !important;
}
#generate-btn:hover {
    transform: translateY(-2px) !important;
    box-shadow: 0 5px 15px rgba(76, 175, 147, 0.3) !important;
}
"""

# Build the app. Everything indented under this `with` is created inside the
# Blocks context bound to `demo`, which is launched at the bottom of the file.
with gr.Blocks(css=custom_css, theme="ocean") as demo:
    # Static banner: the title and description use the .heading/.description
    # classes defined in custom_css.
    gr.HTML(
        """
        <div class="heading">🎙️ AI Voice Mixer Studio - Kokoro TTS</div>
        <div class="description">
            <b>Mix and match different voices to create your perfect text-to-speech voice.<br>Each slider represents a 
            unique voice with distinct characteristics. This app lets you combine multiple voices with different weights 
            to create custom voice combinations. Select voices using checkboxes and adjust their weights using the sliders below!</b>
        </div>
        """
    )

    # One row holding a narrow column per voice. `checkboxes` and `sliders`
    # collect the created components in slider_configs order, so index i in
    # either list refers to slider_configs[i].
    with gr.Row(variant="default", equal_height=True, elem_classes="container-wrap"):
        checkboxes = []
        sliders = []
        
        # Define slider configurations with emojis
        # Each entry is (voice key, display label). The key is the name
        # written into the formula and looked up in `voices`.
        slider_configs = [
            ("af", "Default 👩‍🦰"),
            ("af_bella", "Bella 👩‍🦰🇺🇸"), 
            ("af_sarah", "Sarah 👩‍🦰🇺🇸"),
            ("af_nicole", "Nicole 👩‍🦰🇺🇸"), 
            ("af_sky", "Sky 👩‍🦰🇺🇸"),
            ("am_adam", "Adam 👨🇺🇸"),
            ("am_michael", "Michael 👨🇺🇸"),
            ("bf_emma", "Emma 👩‍🦰🇬🇧"),
            ("bf_isabella", "Isabella 👩‍🦰🇬🇧"),
            ("bm_george", "George 👨🇬🇧"),
            ("bm_lewis", "Lewis 👨🇬🇧")
        ]

        # Create columns for each slider
        for value, label in slider_configs:
            with gr.Column(min_width=70, scale=1, variant="default", elem_classes="vert-group"):
                # Unlabelled checkbox that selects this voice.
                checkbox = gr.Checkbox(label='')
                # Sliders start disabled at 0; check_box() enables them when
                # the matching checkbox is ticked.
                slider = gr.Slider(label=label, minimum=0, maximum=1, interactive=False, value=0, step=0.01)
                checkboxes.append(checkbox)
                sliders.append(slider)

    # Row with the read-only formula, the text to speak, and the generate button.
    with gr.Row(equal_height=True):
        # Read-only: filled by generate_voice_formula and passed to
        # text_to_speech as the voice formula.
        formula_display = gr.Textbox(
            label="Voice Combination Formula", 
            value="", 
            lines=2, 
            scale=4, 
            interactive=False,
            # Fixed a duplicated word ("selected selected") in this user-facing hint.
            placeholder="This will begin to display immediately once any of the voice checkboxes is selected",
            info="Slider values are normalized to create this voice formula. Use the Sliders to intuitively increase or decrease a voice effect."
        )
        input_text = gr.Textbox(
            label="Input Text", 
            placeholder="Enter text to convert to speech", 
            lines=2, 
            scale=4
        )
        # elem_id matches the #generate-btn rules in custom_css.
        button_tts = gr.Button("🎙️ Generate Voice", scale=2, min_width=100, elem_id="generate-btn")

    # Generate speech from the selected custom voice
    with gr.Row(equal_height=True):
        # Receives the (sample_rate, audio) tuple returned by text_to_speech.
        kokoro_tts = gr.Audio(label="Generated Speech", type="numpy", autoplay=True)

    def generate_voice_formula(*values):
        """
        Build the formula text for the currently ticked voices.

        ``values`` carries every checkbox state followed by every slider
        value (first half, then second half), in slider_configs order.
        A single ticked voice keeps its raw slider value; several ticked
        voices are scaled so their weights add up to 1.
        Returns "" when nothing is ticked, or when several voices are ticked
        and their slider values sum to 0.
        Example results: "0.600 * voice1", "0.400 * voice1 + 0.600 * voice2"
        """
        half = len(values) // 2
        ticked = values[:half]
        weights = list(values[half:])

        # Collect (weight, voice key) for each ticked voice.
        selected = []
        for idx, config in enumerate(slider_configs):
            if ticked[idx]:
                selected.append((weights[idx], config[0]))

        if len(selected) == 0:
            return ""

        # A lone voice is shown with the slider value as-is (no normalization).
        if len(selected) == 1:
            only_weight, only_name = selected[0]
            return f"{only_weight:.3f} * {only_name}"

        total = sum(weight for weight, _ in selected)
        if total == 0:
            return ""

        # Several voices: divide each weight by the total before formatting.
        return " + ".join(
            f"{weight / total:.3f} * {name}" for weight, name in selected
        )

    def check_box(checkbox):
        """Return the slider update for a checkbox state.

        Unticked: slider locked and reset to 0. Ticked: slider unlocked and
        set to 1.0.
        """
        if not checkbox:
            return gr.Slider(interactive=False, value=0)
        return gr.Slider(interactive=True, value=1.0)  # ticked voices start at full weight

    # Inputs for the formula callback: every checkbox first, then every slider.
    all_inputs = checkboxes + sliders

    # Per voice: a checkbox change first updates its own slider, then
    # refreshes the formula text.
    for toggle, weight_slider in zip(checkboxes, sliders):
        toggle.change(fn=check_box, inputs=[toggle], outputs=[weight_slider])
        toggle.change(fn=generate_voice_formula, inputs=all_inputs, outputs=[formula_display])

    # Any slider change also refreshes the formula text.
    for weight_slider in sliders:
        weight_slider.change(fn=generate_voice_formula, inputs=all_inputs, outputs=[formula_display])

    # The button sends the typed text plus the current formula to the
    # synthesizer and routes the result to the audio player.
    button_tts.click(fn=text_to_speech, inputs=[input_text, formula_display], outputs=[kokoro_tts])


# Launch the Gradio app only when this file is run as a script, not on import.
if __name__ == "__main__":
    demo.launch()