Spaces:
Sleeping
Sleeping
Upload 2 files
Browse files- app.py +481 -962
- sonic_api_wrapper.py +174 -115
app.py
CHANGED
@@ -1,962 +1,481 @@
|
|
1 |
-
import
|
2 |
-
import
|
3 |
-
from pathlib import Path
|
4 |
-
from
|
5 |
-
|
6 |
-
|
7 |
-
|
8 |
-
|
9 |
-
|
10 |
-
|
11 |
-
|
12 |
-
|
13 |
-
|
14 |
-
|
15 |
-
|
16 |
-
|
17 |
-
|
18 |
-
|
19 |
-
|
20 |
-
|
21 |
-
|
22 |
-
|
23 |
-
|
24 |
-
|
25 |
-
|
26 |
-
|
27 |
-
|
28 |
-
|
29 |
-
|
30 |
-
|
31 |
-
|
32 |
-
|
33 |
-
|
34 |
-
|
35 |
-
|
36 |
-
|
37 |
-
|
38 |
-
|
39 |
-
|
40 |
-
|
41 |
-
|
42 |
-
|
43 |
-
|
44 |
-
|
45 |
-
|
46 |
-
|
47 |
-
|
48 |
-
|
49 |
-
#
|
50 |
-
|
51 |
-
|
52 |
-
|
53 |
-
|
54 |
-
|
55 |
-
|
56 |
-
|
57 |
-
|
58 |
-
|
59 |
-
|
60 |
-
|
61 |
-
if
|
62 |
-
return
|
63 |
-
|
64 |
-
|
65 |
-
|
66 |
-
|
67 |
-
|
68 |
-
|
69 |
-
|
70 |
-
|
71 |
-
|
72 |
-
|
73 |
-
|
74 |
-
|
75 |
-
|
76 |
-
|
77 |
-
|
78 |
-
|
79 |
-
|
80 |
-
|
81 |
-
|
82 |
-
|
83 |
-
|
84 |
-
|
85 |
-
|
86 |
-
|
87 |
-
|
88 |
-
|
89 |
-
|
90 |
-
|
91 |
-
|
92 |
-
|
93 |
-
|
94 |
-
|
95 |
-
""
|
96 |
-
|
97 |
-
|
98 |
-
|
99 |
-
|
100 |
-
|
101 |
-
|
102 |
-
|
103 |
-
|
104 |
-
|
105 |
-
|
106 |
-
|
107 |
-
|
108 |
-
|
109 |
-
|
110 |
-
|
111 |
-
|
112 |
-
|
113 |
-
|
114 |
-
|
115 |
-
|
116 |
-
|
117 |
-
|
118 |
-
|
119 |
-
|
120 |
-
|
121 |
-
|
122 |
-
|
123 |
-
|
124 |
-
|
125 |
-
|
126 |
-
|
127 |
-
|
128 |
-
|
129 |
-
|
130 |
-
|
131 |
-
|
132 |
-
|
133 |
-
|
134 |
-
|
135 |
-
|
136 |
-
|
137 |
-
|
138 |
-
|
139 |
-
|
140 |
-
|
141 |
-
|
142 |
-
|
143 |
-
|
144 |
-
|
145 |
-
|
146 |
-
|
147 |
-
|
148 |
-
|
149 |
-
|
150 |
-
|
151 |
-
|
152 |
-
|
153 |
-
|
154 |
-
|
155 |
-
|
156 |
-
|
157 |
-
|
158 |
-
|
159 |
-
|
160 |
-
|
161 |
-
|
162 |
-
|
163 |
-
|
164 |
-
|
165 |
-
|
166 |
-
|
167 |
-
|
168 |
-
|
169 |
-
|
170 |
-
|
171 |
-
|
172 |
-
|
173 |
-
|
174 |
-
|
175 |
-
|
176 |
-
|
177 |
-
|
178 |
-
|
179 |
-
|
180 |
-
|
181 |
-
|
182 |
-
|
183 |
-
|
184 |
-
|
185 |
-
|
186 |
-
|
187 |
-
|
188 |
-
|
189 |
-
|
190 |
-
|
191 |
-
|
192 |
-
|
193 |
-
|
194 |
-
|
195 |
-
|
196 |
-
|
197 |
-
|
198 |
-
|
199 |
-
|
200 |
-
|
201 |
-
|
202 |
-
|
203 |
-
|
204 |
-
|
205 |
-
if
|
206 |
-
|
207 |
-
|
208 |
-
|
209 |
-
|
210 |
-
|
211 |
-
|
212 |
-
|
213 |
-
|
214 |
-
|
215 |
-
|
216 |
-
|
217 |
-
|
218 |
-
|
219 |
-
|
220 |
-
|
221 |
-
|
222 |
-
|
223 |
-
|
224 |
-
|
225 |
-
|
226 |
-
|
227 |
-
|
228 |
-
|
229 |
-
|
230 |
-
|
231 |
-
|
232 |
-
|
233 |
-
|
234 |
-
|
235 |
-
|
236 |
-
|
237 |
-
|
238 |
-
|
239 |
-
|
240 |
-
|
241 |
-
|
242 |
-
|
243 |
-
|
244 |
-
|
245 |
-
|
246 |
-
|
247 |
-
|
248 |
-
|
249 |
-
|
250 |
-
|
251 |
-
|
252 |
-
|
253 |
-
|
254 |
-
|
255 |
-
|
256 |
-
|
257 |
-
|
258 |
-
|
259 |
-
|
260 |
-
|
261 |
-
|
262 |
-
|
263 |
-
|
264 |
-
|
265 |
-
|
266 |
-
|
267 |
-
|
268 |
-
|
269 |
-
|
270 |
-
|
271 |
-
|
272 |
-
|
273 |
-
|
274 |
-
|
275 |
-
|
276 |
-
|
277 |
-
|
278 |
-
|
279 |
-
|
280 |
-
|
281 |
-
|
282 |
-
|
283 |
-
|
284 |
-
|
285 |
-
|
286 |
-
|
287 |
-
|
288 |
-
|
289 |
-
|
290 |
-
|
291 |
-
|
292 |
-
|
293 |
-
|
294 |
-
|
295 |
-
|
296 |
-
|
297 |
-
|
298 |
-
|
299 |
-
|
300 |
-
|
301 |
-
|
302 |
-
|
303 |
-
|
304 |
-
|
305 |
-
|
306 |
-
|
307 |
-
|
308 |
-
|
309 |
-
|
310 |
-
|
311 |
-
|
312 |
-
|
313 |
-
|
314 |
-
|
315 |
-
|
316 |
-
|
317 |
-
|
318 |
-
|
319 |
-
|
320 |
-
|
321 |
-
|
322 |
-
|
323 |
-
|
324 |
-
|
325 |
-
|
326 |
-
|
327 |
-
|
328 |
-
|
329 |
-
|
330 |
-
|
331 |
-
|
332 |
-
|
333 |
-
|
334 |
-
|
335 |
-
|
336 |
-
|
337 |
-
|
338 |
-
|
339 |
-
|
340 |
-
|
341 |
-
|
342 |
-
|
343 |
-
|
344 |
-
|
345 |
-
|
346 |
-
|
347 |
-
|
348 |
-
|
349 |
-
|
350 |
-
|
351 |
-
|
352 |
-
|
353 |
-
|
354 |
-
|
355 |
-
|
356 |
-
|
357 |
-
|
358 |
-
|
359 |
-
|
360 |
-
|
361 |
-
|
362 |
-
|
363 |
-
|
364 |
-
|
365 |
-
|
366 |
-
|
367 |
-
|
368 |
-
|
369 |
-
|
370 |
-
|
371 |
-
|
372 |
-
|
373 |
-
|
374 |
-
|
375 |
-
|
376 |
-
|
377 |
-
|
378 |
-
|
379 |
-
|
380 |
-
|
381 |
-
|
382 |
-
|
383 |
-
|
384 |
-
|
385 |
-
|
386 |
-
|
387 |
-
|
388 |
-
|
389 |
-
|
390 |
-
|
391 |
-
|
392 |
-
|
393 |
-
|
394 |
-
|
395 |
-
|
396 |
-
|
397 |
-
|
398 |
-
|
399 |
-
|
400 |
-
|
401 |
-
|
402 |
-
|
403 |
-
|
404 |
-
|
405 |
-
|
406 |
-
|
407 |
-
|
408 |
-
|
409 |
-
|
410 |
-
|
411 |
-
|
412 |
-
|
413 |
-
|
414 |
-
|
415 |
-
|
416 |
-
|
417 |
-
|
418 |
-
|
419 |
-
|
420 |
-
|
421 |
-
|
422 |
-
|
423 |
-
|
424 |
-
|
425 |
-
|
426 |
-
|
427 |
-
|
428 |
-
|
429 |
-
|
430 |
-
|
431 |
-
|
432 |
-
|
433 |
-
|
434 |
-
|
435 |
-
|
436 |
-
|
437 |
-
|
438 |
-
|
439 |
-
|
440 |
-
|
441 |
-
|
442 |
-
|
443 |
-
|
444 |
-
|
445 |
-
|
446 |
-
|
447 |
-
|
448 |
-
|
449 |
-
|
450 |
-
|
451 |
-
|
452 |
-
|
453 |
-
|
454 |
-
|
455 |
-
|
456 |
-
|
457 |
-
|
458 |
-
|
459 |
-
|
460 |
-
|
461 |
-
|
462 |
-
|
463 |
-
|
464 |
-
|
465 |
-
|
466 |
-
|
467 |
-
|
468 |
-
|
469 |
-
|
470 |
-
|
471 |
-
|
472 |
-
|
473 |
-
|
474 |
-
|
475 |
-
|
476 |
-
|
477 |
-
|
478 |
-
|
479 |
-
|
480 |
-
|
481 |
-
|
482 |
-
|
483 |
-
def generate_output_filename(language: str) -> str:
|
484 |
-
"""Генерация имени файла с временной меткой и языком"""
|
485 |
-
timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
|
486 |
-
return f"output/{timestamp}_{language}.wav"
|
487 |
-
|
488 |
-
def extract_voice_id_from_label(voice_label: str) -> str:
|
489 |
-
"""
|
490 |
-
Извлекает ID голоса из метки в dropdown
|
491 |
-
Например: "John (en) [Custom]" -> извлечет ID из словаря голосов
|
492 |
-
"""
|
493 |
-
global manager
|
494 |
-
try:
|
495 |
-
if not manager:
|
496 |
-
return None
|
497 |
-
|
498 |
-
# Получаем все голоса и их метки
|
499 |
-
choices = manager.get_voice_choices()
|
500 |
-
# Находим голос по метке и берем его ID
|
501 |
-
voice_data = next((c for c in choices if c["label"] == voice_label), None)
|
502 |
-
return voice_data["value"] if voice_data else None
|
503 |
-
except Exception as e:
|
504 |
-
print(f"❌ Ошибка при получении голосов: {str(e)}")
|
505 |
-
return None
|
506 |
-
|
507 |
-
def initialize_manager(api_key: str) -> str:
|
508 |
-
global manager
|
509 |
-
try:
|
510 |
-
manager = CartesiaVoiceManager(api_key=api_key, base_dir=Path("voice2voice"))
|
511 |
-
return "✅ Менеджер инициализирован"
|
512 |
-
except Exception as e:
|
513 |
-
return f"❌ Ошибка: {str(e)}"
|
514 |
-
|
515 |
-
def get_initial_voices():
|
516 |
-
global manager
|
517 |
-
"""Получение начального списка голосов"""
|
518 |
-
if not manager:
|
519 |
-
# initialize_manager(DEFAULT_API_KEY)
|
520 |
-
return None
|
521 |
-
choices = manager.get_voice_choices()
|
522 |
-
if not choices:
|
523 |
-
return None
|
524 |
-
return [c["label"] for c in choices], choices[0]["label"] if choices else None
|
525 |
-
|
526 |
-
def update_voice_list (language: str, access_type: str, current_voice: str = None):
|
527 |
-
"""
|
528 |
-
Обновление списка голосов с сохранением текущего выбора
|
529 |
-
"""
|
530 |
-
global manager
|
531 |
-
if not manager:
|
532 |
-
return gr.update(choices=[], value=None), "❌ Менеджер не инициализирован"
|
533 |
-
|
534 |
-
try:
|
535 |
-
choices = manager.get_voice_choices(
|
536 |
-
language=None if language == "all" else language,
|
537 |
-
accessibility=ACCESS_TYPE_MAP[access_type]
|
538 |
-
)
|
539 |
-
|
540 |
-
# Преобразуем в список меток
|
541 |
-
choice_labels = [c["label"] for c in choices]
|
542 |
-
|
543 |
-
# Определяем значение для выбора
|
544 |
-
if current_voice in choice_labels:
|
545 |
-
# Сохраняем текущий выбор, если он доступен
|
546 |
-
new_value = current_voice
|
547 |
-
else:
|
548 |
-
# Иначе берем первый доступный голос
|
549 |
-
new_value = choice_labels[0] if choice_labels else None
|
550 |
-
|
551 |
-
return gr.update(choices=choice_labels, value=new_value), "✅ Список голосов обновлен"
|
552 |
-
except Exception as e:
|
553 |
-
return gr.update(choices=[], value=None), f"❌ Ошибка: {str(e)}"
|
554 |
-
|
555 |
-
def update_voice_info(voice_label: str) -> str:
|
556 |
-
"""Обновление информации о голосе"""
|
557 |
-
global manager
|
558 |
-
if not manager or not voice_label:
|
559 |
-
return ""
|
560 |
-
|
561 |
-
try:
|
562 |
-
voice_id = extract_voice_id_from_label(voice_label)
|
563 |
-
if not voice_id:
|
564 |
-
return "❌ Голос не найден"
|
565 |
-
|
566 |
-
info = manager.get_voice_info(voice_id)
|
567 |
-
return (
|
568 |
-
f"Имя: {info['name']}\n"
|
569 |
-
f"Язык: {info['language']}\n"
|
570 |
-
f"Тип: {'Кастомный' if info.get('is_custom') else 'API'}\n"
|
571 |
-
f"ID: {info['id']}"
|
572 |
-
)
|
573 |
-
except Exception as e:
|
574 |
-
return f"❌ Ошибка: {str(e)}"
|
575 |
-
|
576 |
-
def create_custom_voice(name: str, language: str, audio_data: tuple) -> tuple:
|
577 |
-
"""
|
578 |
-
Создание кастомного голоса и обновление списка голосов
|
579 |
-
Возвращает: (статус, обновленный dropdown, информация о голосе)
|
580 |
-
"""
|
581 |
-
global manager
|
582 |
-
if not manager:
|
583 |
-
return "❌ Менеджер не инициализирован", gr.update(), ""
|
584 |
-
|
585 |
-
if not name or not audio_data:
|
586 |
-
return "❌ Необходимо указать имя и файл голоса", gr.update(), ""
|
587 |
-
|
588 |
-
try:
|
589 |
-
# Получаем путь к аудио файлу
|
590 |
-
audio_path = audio_data[0] if isinstance(audio_data, tuple) else audio_data
|
591 |
-
|
592 |
-
# Создаем голос
|
593 |
-
voice_id = manager.create_custom_embedding(
|
594 |
-
file_path=audio_path,
|
595 |
-
name=name,
|
596 |
-
language=language
|
597 |
-
)
|
598 |
-
|
599 |
-
print(voice_id)
|
600 |
-
|
601 |
-
# Получаем обновленный список голосов
|
602 |
-
choices = manager.get_voice_choices()
|
603 |
-
choice_labels = [c["label"] for c in choices]
|
604 |
-
|
605 |
-
# Находим метку для нового голоса
|
606 |
-
new_voice_label = next(c["label"] for c in choices if c["value"] == voice_id)
|
607 |
-
|
608 |
-
# Получаем информацию о новом голосе
|
609 |
-
voice_info = manager.get_voice_info(voice_id)
|
610 |
-
info_text = (
|
611 |
-
f"Имя: {voice_info['name']}\n"
|
612 |
-
f"Язык: {voice_info['language']}\n"
|
613 |
-
f"Тип: Кастомный\n"
|
614 |
-
f"ID: {voice_info['id']}"
|
615 |
-
)
|
616 |
-
|
617 |
-
return (
|
618 |
-
f"✅ Создан кастомный голос: {voice_id}",
|
619 |
-
gr.update(choices=choice_labels, value=new_voice_label),
|
620 |
-
info_text
|
621 |
-
)
|
622 |
-
|
623 |
-
except Exception as e:
|
624 |
-
return f"❌ Ошибка создания голоса: {str(e)}", gr.update(), ""
|
625 |
-
|
626 |
-
def on_auto_language_change(auto_language: bool):
|
627 |
-
"""Обработчик изменения галочки автоопределения языка"""
|
628 |
-
return gr.update(visible=not auto_language)
|
629 |
-
|
630 |
-
def map_emotions(selected_emotions, intensity):
|
631 |
-
emotion_map = {
|
632 |
-
"Весело": "positivity",
|
633 |
-
"Грустно": "sadness",
|
634 |
-
"Злобно": "anger",
|
635 |
-
"Удивленно": "surprise",
|
636 |
-
"Любопытно": "curiosity"
|
637 |
-
}
|
638 |
-
|
639 |
-
intensity_map = {
|
640 |
-
"Очень слабая": "lowest",
|
641 |
-
"Слабая": "low",
|
642 |
-
"Средняя": "medium",
|
643 |
-
"Сильная": "high",
|
644 |
-
"Очень сильная": "highest"
|
645 |
-
}
|
646 |
-
|
647 |
-
emotions = []
|
648 |
-
for emotion in selected_emotions:
|
649 |
-
if emotion == "Нейтрально":
|
650 |
-
continue
|
651 |
-
if emotion in emotion_map:
|
652 |
-
emotions.append({
|
653 |
-
"name": emotion_map[emotion],
|
654 |
-
"level": intensity_map[intensity]
|
655 |
-
})
|
656 |
-
return emotions
|
657 |
-
|
658 |
-
def generate_speech(
|
659 |
-
text: str,
|
660 |
-
voice_label: str,
|
661 |
-
improve_text: bool,
|
662 |
-
auto_language: bool,
|
663 |
-
manual_language: str,
|
664 |
-
speed_type: str,
|
665 |
-
use_custom_speed: bool,
|
666 |
-
custom_speed: float,
|
667 |
-
emotions: List[str],
|
668 |
-
emotion_intensity: str
|
669 |
-
):
|
670 |
-
global manager
|
671 |
-
"""Генерация речи с учетом настроек языка"""
|
672 |
-
if not manager:
|
673 |
-
return None, "❌ Менеджер не инициализирован"
|
674 |
-
|
675 |
-
if not text or not voice_label:
|
676 |
-
return None, "❌ Необходимо указать текст и голос"
|
677 |
-
|
678 |
-
try:
|
679 |
-
# Извлекаем ID голоса из метки
|
680 |
-
voice_id = extract_voice_id_from_label(voice_label)
|
681 |
-
if not voice_id:
|
682 |
-
return None, "❌ Голос не найден"
|
683 |
-
|
684 |
-
# Устанавливаем голос по ID
|
685 |
-
manager.set_voice(voice_id)
|
686 |
-
|
687 |
-
# Если автоопределение выключено, устанавливаем язык вручную
|
688 |
-
if not auto_language:
|
689 |
-
manager.set_language(manual_language)
|
690 |
-
|
691 |
-
# В функции generate_speech обновите установку скорости:
|
692 |
-
if use_custom_speed:
|
693 |
-
manager.speed = custom_speed
|
694 |
-
else:
|
695 |
-
manager.speed = map_speed(speed_type)
|
696 |
-
|
697 |
-
# Установка эмоций
|
698 |
-
emotion_map = {
|
699 |
-
"Нейтрально": None,
|
700 |
-
"Весело": "positivity",
|
701 |
-
"Грустно": "sadness",
|
702 |
-
"Злобно": "anger",
|
703 |
-
"Удивленно": "surprise",
|
704 |
-
"Любопытно": "curiosity"
|
705 |
-
}
|
706 |
-
|
707 |
-
intensity_map = {
|
708 |
-
"Слабая": "low",
|
709 |
-
"Средняя": "medium",
|
710 |
-
"Сильная": "high"
|
711 |
-
}
|
712 |
-
|
713 |
-
if emotions and emotions != ["Нейтрально"]:
|
714 |
-
manager.set_emotions(map_emotions(emotions, emotion_intensity))
|
715 |
-
else:
|
716 |
-
manager.set_emotions() # Сброс эмоций
|
717 |
-
|
718 |
-
# Генерация имени файла
|
719 |
-
output_file = generate_output_filename(
|
720 |
-
manual_language if not auto_language else manager.current_language
|
721 |
-
)
|
722 |
-
|
723 |
-
# Создаем директорию для выходных файлов, если её нет
|
724 |
-
os.makedirs("output", exist_ok=True)
|
725 |
-
|
726 |
-
# Генерация речи
|
727 |
-
output_path = manager.speak(
|
728 |
-
text=text if not improve_text else improve_tts_text(text, manager.current_language),
|
729 |
-
output_file=output_file
|
730 |
-
)
|
731 |
-
|
732 |
-
return output_path, "✅ Аудио сгенерировано успешно"
|
733 |
-
|
734 |
-
except Exception as e:
|
735 |
-
return None, f"❌ Ошибка генерации: {str(e)}"
|
736 |
-
|
737 |
-
# Создание интерфейса
|
738 |
-
with gr.Blocks() as demo:
|
739 |
-
# API ключ
|
740 |
-
cartesia_api_key = gr.Textbox(
|
741 |
-
label="API ключ Cartesia",
|
742 |
-
value=DEFAULT_API_KEY,
|
743 |
-
type='password'
|
744 |
-
)
|
745 |
-
|
746 |
-
with gr.Row():
|
747 |
-
# Левая колонка
|
748 |
-
with gr.Column():
|
749 |
-
cartesia_text = gr.TextArea(label="Текст")
|
750 |
-
|
751 |
-
with gr.Accordion(label="Настройки", open=True):
|
752 |
-
# Фильтры
|
753 |
-
with gr.Accordion("Фильтры", open=True):
|
754 |
-
cartesia_setting_filter_lang = gr.Dropdown(
|
755 |
-
label="Язык",
|
756 |
-
choices=LANGUAGE_CHOICES,
|
757 |
-
value="all"
|
758 |
-
)
|
759 |
-
cartesia_setting_filter_type = gr.Dropdown(
|
760 |
-
label="Тип",
|
761 |
-
choices=ACCESS_TYPE_MAP,
|
762 |
-
value="Все"
|
763 |
-
)
|
764 |
-
|
765 |
-
# Вкладки настроек
|
766 |
-
with gr.Tab("Стандарт"):
|
767 |
-
cartesia_setting_voice_info = gr.Textbox(
|
768 |
-
label="Информация о голосе",
|
769 |
-
interactive=False
|
770 |
-
)
|
771 |
-
with gr.Row():
|
772 |
-
if not manager:
|
773 |
-
initial_choices = None
|
774 |
-
initial_value = None
|
775 |
-
else:
|
776 |
-
initial_choices, initial_value = get_initial_voices()
|
777 |
-
cartesia_setting_voice = gr.Dropdown(
|
778 |
-
label="Голос",
|
779 |
-
choices=initial_choices,
|
780 |
-
value=initial_value
|
781 |
-
)
|
782 |
-
cartesia_setting_voice_update = gr.Button("Обновить")
|
783 |
-
cartesia_setting_auto_language = gr.Checkbox(
|
784 |
-
label="Автоматически определять язык из голоса",
|
785 |
-
value=True
|
786 |
-
)
|
787 |
-
cartesia_setting_manual_language = gr.Dropdown(
|
788 |
-
label="Язык озвучки",
|
789 |
-
choices=["ru", "en", "es", "fr", "de", "pl", "it", "ja", "ko", "zh", "hi"],
|
790 |
-
value="en",
|
791 |
-
visible=False # Изначально скрыт
|
792 |
-
)
|
793 |
-
|
794 |
-
with gr.Tab("Кастомный"):
|
795 |
-
cartesia_setting_custom_name = gr.Textbox(label="Имя")
|
796 |
-
cartesia_setting_custom_lang = gr.Dropdown(
|
797 |
-
label="Язык",
|
798 |
-
choices=LANGUAGE_CHOICES[1:] # Исключаем "all"
|
799 |
-
)
|
800 |
-
cartesia_setting_custom_voice = gr.Audio(label="Файл голоса",type='filepath')
|
801 |
-
cartesia_setting_custom_add = gr.Button("Добавить")
|
802 |
-
|
803 |
-
# with gr.Tab("Микс"):
|
804 |
-
# cartesia_setting_custom_mix = gr.Dropdown(
|
805 |
-
# label="Выберите голоса",
|
806 |
-
# multiselect=True,
|
807 |
-
# choices=[]
|
808 |
-
# )
|
809 |
-
# cartesia_setting_custom_mix_update = gr.Button("Обновить")
|
810 |
-
# for i in range(5):
|
811 |
-
# setattr(
|
812 |
-
# demo,
|
813 |
-
# f'mix_voice_{i+1}',
|
814 |
-
# gr.Slider(
|
815 |
-
# label=f"Голос {i+1}",
|
816 |
-
# value=0.5,
|
817 |
-
# minimum=0,
|
818 |
-
# maximum=1,
|
819 |
-
# step=0.01,
|
820 |
-
# visible=False
|
821 |
-
# )
|
822 |
-
# )
|
823 |
-
|
824 |
-
# Контроль эмоций
|
825 |
-
with gr.Accordion(label="Контроль эмоций (Beta)", open=False):
|
826 |
-
cartesia_emotions = gr.Dropdown(
|
827 |
-
label="Эмоции",
|
828 |
-
multiselect=True,
|
829 |
-
choices=EMOTION_CHOICES
|
830 |
-
)
|
831 |
-
cartesia_emotions_intensity = gr.Dropdown(
|
832 |
-
label="Интенсивность",
|
833 |
-
choices=EMOTION_INTENSITY,
|
834 |
-
value="Средняя"
|
835 |
-
)
|
836 |
-
|
837 |
-
# Настройки скорости
|
838 |
-
with gr.Accordion("Скорость", open=True):
|
839 |
-
cartesia_speed_speed = gr.Dropdown(
|
840 |
-
label="Скорость речи",
|
841 |
-
choices=SPEED_CHOICES,
|
842 |
-
value="Нормально"
|
843 |
-
)
|
844 |
-
cartesia_speed_speed_allow_custom = gr.Checkbox(
|
845 |
-
label="Использовать кастомное значение скорости"
|
846 |
-
)
|
847 |
-
cartesia_speed_speed_custom = gr.Slider(
|
848 |
-
label="Скорость",
|
849 |
-
value=0,
|
850 |
-
minimum=-1,
|
851 |
-
maximum=1,
|
852 |
-
step=0.1,
|
853 |
-
visible=False
|
854 |
-
)
|
855 |
-
|
856 |
-
cartesia_setting_improve_text = gr.Checkbox(
|
857 |
-
label="Улучшить текст согласно рекомендациям",
|
858 |
-
value=True
|
859 |
-
)
|
860 |
-
|
861 |
-
# Правая колонка
|
862 |
-
with gr.Column():
|
863 |
-
cartessia_status_bar = gr.Label(value="Статус")
|
864 |
-
cartesia_output_audio = gr.Audio(
|
865 |
-
label="Результат",
|
866 |
-
interactive=False
|
867 |
-
)
|
868 |
-
cartesia_output_button = gr.Button("Генерация")
|
869 |
-
|
870 |
-
# События
|
871 |
-
cartesia_api_key.change(
|
872 |
-
initialize_manager,
|
873 |
-
inputs=[cartesia_api_key],
|
874 |
-
outputs=[cartessia_status_bar]
|
875 |
-
)
|
876 |
-
|
877 |
-
cartesia_setting_filter_lang.change(
|
878 |
-
update_voice_list,
|
879 |
-
inputs=[
|
880 |
-
cartesia_setting_filter_lang,
|
881 |
-
cartesia_setting_filter_type,
|
882 |
-
cartesia_setting_voice # Передаем текущий выбор
|
883 |
-
],
|
884 |
-
outputs=[cartesia_setting_voice, cartessia_status_bar]
|
885 |
-
)
|
886 |
-
|
887 |
-
cartesia_setting_filter_type.change(
|
888 |
-
update_voice_list,
|
889 |
-
inputs=[
|
890 |
-
cartesia_setting_filter_lang,
|
891 |
-
cartesia_setting_filter_type,
|
892 |
-
cartesia_setting_voice # Передаем текущий выбор
|
893 |
-
],
|
894 |
-
outputs=[cartesia_setting_voice, cartessia_status_bar]
|
895 |
-
)
|
896 |
-
|
897 |
-
cartesia_setting_voice.change(
|
898 |
-
update_voice_info,
|
899 |
-
inputs=[cartesia_setting_voice],
|
900 |
-
outputs=[cartesia_setting_voice_info]
|
901 |
-
)
|
902 |
-
|
903 |
-
cartesia_setting_voice_update.click(
|
904 |
-
update_voice_list,
|
905 |
-
inputs=[cartesia_setting_filter_lang, cartesia_setting_filter_type],
|
906 |
-
outputs=[cartesia_setting_voice]
|
907 |
-
)
|
908 |
-
|
909 |
-
cartesia_speed_speed_allow_custom.change(
|
910 |
-
lambda x: gr.update(visible=x),
|
911 |
-
inputs=[cartesia_speed_speed_allow_custom],
|
912 |
-
outputs=[cartesia_speed_speed_custom]
|
913 |
-
)
|
914 |
-
|
915 |
-
cartesia_setting_custom_add.click(
|
916 |
-
create_custom_voice,
|
917 |
-
inputs=[
|
918 |
-
cartesia_setting_custom_name,
|
919 |
-
cartesia_setting_custom_lang,
|
920 |
-
cartesia_setting_custom_voice
|
921 |
-
],
|
922 |
-
outputs=[
|
923 |
-
cartessia_status_bar,
|
924 |
-
cartesia_setting_voice, # Обновляем dropdown
|
925 |
-
cartesia_setting_voice_info # Обновляем информацию о голосе
|
926 |
-
]
|
927 |
-
)
|
928 |
-
|
929 |
-
# Обновляем привязки событий
|
930 |
-
cartesia_setting_auto_language.change(
|
931 |
-
on_auto_language_change,
|
932 |
-
inputs=[cartesia_setting_auto_language],
|
933 |
-
outputs=[cartesia_setting_manual_language]
|
934 |
-
)
|
935 |
-
|
936 |
-
cartesia_output_button.click(
|
937 |
-
generate_speech,
|
938 |
-
inputs=[
|
939 |
-
cartesia_text,
|
940 |
-
cartesia_setting_voice,
|
941 |
-
cartesia_setting_improve_text,
|
942 |
-
cartesia_setting_auto_language,
|
943 |
-
cartesia_setting_manual_language,
|
944 |
-
cartesia_speed_speed,
|
945 |
-
cartesia_speed_speed_allow_custom,
|
946 |
-
cartesia_speed_speed_custom,
|
947 |
-
cartesia_emotions,
|
948 |
-
cartesia_emotions_intensity
|
949 |
-
],
|
950 |
-
outputs=[
|
951 |
-
cartesia_output_audio,
|
952 |
-
cartessia_status_bar
|
953 |
-
]
|
954 |
-
)
|
955 |
-
|
956 |
-
# Запуск приложения
|
957 |
-
if __name__ == "__main__":
|
958 |
-
# global manager
|
959 |
-
# Инициализация менеджера при запуске
|
960 |
-
# initialize_manager(DEFAULT_API_KEY)
|
961 |
-
# Запуск интерфейса
|
962 |
-
demo.launch()
|
|
|
1 |
+
from typing import List
|
2 |
+
import gradio as gr
|
3 |
+
from pathlib import Path
|
4 |
+
from sonic_api_wrapper import CartesiaVoiceManager, VoiceAccessibility, improve_tts_text
|
5 |
+
import os
|
6 |
+
import json
|
7 |
+
import datetime
|
8 |
+
|
9 |
+
# Shared voice manager instance. It is None until initialize_manager()
# succeeds, and the handlers below check it before doing any work.
manager = None

# Language filter values; update_voice_list() treats "all" as "no filter".
LANGUAGE_CHOICES = [
    "all",
    "ru", "en", "es", "pl", "de", "fr", "tr", "pt",
    "zh", "ja", "hi", "it", "ko", "nl", "sv",
]

# UI label -> accessibility filter handed to manager.get_voice_choices().
ACCESS_TYPE_MAP = {
    "All": VoiceAccessibility.ALL,
    "Custom Only": VoiceAccessibility.ONLY_CUSTOM,
    "Private Only": VoiceAccessibility.ONLY_PRIVATE,
    "API": VoiceAccessibility.ONLY_PUBLIC,
}

# Labels accepted by map_speed().
SPEED_CHOICES = ["Very Slow", "Slow", "Normal", "Fast", "Very Fast"]

# Emotion labels offered in the UI; map_emotions() skips "Neutral".
EMOTION_CHOICES = ["Neutral", "Happy", "Sad", "Angry", "Surprised", "Curious"]

# Intensity labels accepted by map_emotions().
EMOTION_INTENSITY = ["Very Weak", "Weak", "Medium", "Strong", "Very Strong"]
|
23 |
+
|
24 |
+
def map_speed(speed_type: str) -> float:
    """Translate a speed label from the UI into a numeric speed value.

    Args:
        speed_type: One of "Very Slow", "Slow", "Normal", "Fast", "Very Fast".

    Returns:
        The matching speed, from -1.0 ("Very Slow") to 1.0 ("Very Fast"),
        with 0.0 for "Normal".

    Raises:
        KeyError: If ``speed_type`` is not one of the known labels (for
            example ``None``). The message names the rejected value and the
            accepted labels so it is useful when shown as a status message.
    """
    speed_map = {
        "Very Slow": -1.0,
        "Slow": -0.5,
        "Normal": 0.0,
        "Fast": 0.5,
        "Very Fast": 1.0,
    }
    try:
        return speed_map[speed_type]
    except KeyError:
        # Same exception type as before, but with a message that explains
        # what went wrong instead of just repr(speed_type).
        raise KeyError(
            f"Unknown speed {speed_type!r}; expected one of {list(speed_map)}"
        ) from None
|
33 |
+
|
34 |
+
def generate_output_filename(language: str, output_dir: str = "output") -> str:
    """Build a timestamped .wav path for a generated audio file.

    Args:
        language: Language tag embedded in the file name.
        output_dir: Directory prefix of the returned path. Defaults to
            "output", which is the previously hard-coded location.

    Returns:
        A path of the form "<output_dir>/<YYYYmmdd_HHMMSS>_<language>.wav",
        using the current local time. The directory is not created here.
    """
    timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
    return f"{output_dir}/{timestamp}_{language}.wav"
|
38 |
+
|
39 |
+
def extract_voice_id_from_label(voice_label: str) -> str:
    """Look up the voice ID that belongs to a dropdown label.

    The label (for example "John (en) [Custom]") is compared with the
    "label" of every entry returned by ``manager.get_voice_choices()``; the
    "value" of the first matching entry is returned.

    Returns None when the manager is not initialized, when no entry has
    that label, or when the lookup raises (the error is printed).
    """
    global manager
    try:
        if not manager:
            return None

        # Scan the available voices and stop at the first label match.
        for candidate in manager.get_voice_choices():
            if candidate["label"] == voice_label:
                return candidate["value"]
        return None
    except Exception as e:
        print(f"❌ Error getting voices: {str(e)}")
        return None
|
57 |
+
|
58 |
+
def initialize_manager(api_key: str) -> str:
    """Create the shared CartesiaVoiceManager from an API key.

    Args:
        api_key: The Cartesia API key. Surrounding whitespace is removed
            before use.

    Returns:
        A status message: success, "API key is required" when the key is
        missing or blank, or the error text when construction fails.

    Side effects:
        On success the module-level ``manager`` is replaced with the new
        instance; if construction raises, ``manager`` is reset to None. A
        missing or blank key leaves ``manager`` untouched.
    """
    global manager

    # Keys pasted into the textbox often carry stray spaces or a newline;
    # a whitespace-only value is treated the same as no key at all.
    if isinstance(api_key, str):
        api_key = api_key.strip()
    if not api_key:
        return "❌ API key is required to initialize the manager"

    try:
        manager = CartesiaVoiceManager(api_key=api_key, base_dir=Path("voice2voice"))
    except Exception as e:
        manager = None
        return f"❌ Error: {str(e)}"
    return "✅ Manager initialized"
|
69 |
+
|
70 |
+
def get_initial_voices():
    """Return the voice labels and the default selection for the dropdown.

    Returns:
        A ``(labels, default_label)`` tuple. ``labels`` holds the "label" of
        every entry from ``manager.get_voice_choices()`` and ``default_label``
        is the first of them. ``([], None)`` is returned when the manager is
        not initialized or no voices are available.
    """
    global manager
    if not manager:
        return [], None

    choices = manager.get_voice_choices()
    if not choices:
        return [], None

    # choices is known to be non-empty here, so the first label always exists.
    labels = [c["label"] for c in choices]
    return labels, labels[0]
|
79 |
+
|
80 |
+
def update_voice_list(language: str, access_type: str, current_voice: str = None):
    """Rebuild the voice dropdown for the given filters.

    Args:
        language: Language filter; "all" means no language filter.
        access_type: A key of ACCESS_TYPE_MAP selecting the accessibility filter.
        current_voice: Label currently selected in the dropdown, if any.

    Returns:
        ``(dropdown_update, status_message)``. The current selection is kept
        when it is still among the filtered labels; otherwise the first label
        is selected, or None when the list is empty. When the manager is not
        initialized, or anything raises, the dropdown is cleared and the
        status carries the reason.
    """
    global manager
    if not manager:
        return gr.update(choices=[], value=None), "❌ Manager is not initialized"

    try:
        language_filter = None if language == "all" else language
        voices = manager.get_voice_choices(
            language=language_filter,
            accessibility=ACCESS_TYPE_MAP[access_type],
        )
        labels = [voice["label"] for voice in voices]

        # Keep the user's selection when possible, else fall back to the
        # first available voice.
        if current_voice in labels:
            selected = current_voice
        elif labels:
            selected = labels[0]
        else:
            selected = None

        return gr.update(choices=labels, value=selected), "✅ Voice list updated"
    except Exception as e:
        return gr.update(choices=[], value=None), f"❌ Error: {str(e)}"
|
108 |
+
|
109 |
+
def update_voice_info(voice_label: str) -> str:
    """Describe the voice behind a dropdown label as display text.

    Returns an empty string when the manager is not initialized or no label
    is given, "Voice not found" when the label cannot be resolved to an ID,
    and otherwise a four-line summary (name, language, type, ID) built from
    ``manager.get_voice_info()``. Any exception is returned as an error
    message instead of being raised.
    """
    global manager
    if not (manager and voice_label):
        return ""

    try:
        voice_id = extract_voice_id_from_label(voice_label)
        if not voice_id:
            return "❌ Voice not found"

        info = manager.get_voice_info(voice_id)
        summary_parts = (
            f"Name: {info['name']}\n",
            f"Language: {info['language']}\n",
            f"Type: {'Custom' if info.get('is_custom') else 'API'}\n",
            f"ID: {info['id']}",
        )
        return "".join(summary_parts)
    except Exception as e:
        return f"❌ Error: {str(e)}"
|
129 |
+
|
130 |
+
def create_custom_voice(name: str, language: str, audio_data: tuple) -> tuple:
    """Create a custom voice from an audio sample and refresh the voice list.

    Args:
        name: Name for the new voice.
        language: Language passed to ``manager.create_custom_voice``.
        audio_data: The audio sample. Either the file path itself or a tuple
            whose first element is used as the path.

    Returns:
        ``(status, dropdown_update, voice_info_text)``. On success the
        dropdown receives the full voice list with the new voice selected.
        If the new voice has no entry in that list, the list is still
        refreshed but the selection is left as it was. When the manager is
        missing, the inputs are incomplete, or anything raises, the dropdown
        update is empty and the info text is "".
    """
    global manager
    if not manager:
        return "❌ Manager is not initialized", gr.update(), ""

    if not name or not audio_data:
        return "❌ Name and voice file are required", gr.update(), ""

    try:
        # Accept either a bare path or a tuple carrying the path first.
        audio_path = audio_data[0] if isinstance(audio_data, tuple) else audio_data

        voice_id = manager.create_custom_voice(
            name=name,
            source=audio_path,
            language=language
        )

        # Refresh the list so the dropdown includes the new voice.
        choices = manager.get_voice_choices()
        choice_labels = [c["label"] for c in choices]

        # A default is required here: a bare next() raises StopIteration when
        # the voice is missing from the list, which the handler below would
        # report as an empty error message even though creation succeeded.
        new_voice_label = next(
            (c["label"] for c in choices if c["value"] == voice_id),
            None,
        )

        voice_info = manager.get_voice_info(voice_id)
        info_text = (
            f"Name: {voice_info['name']}\n"
            f"Language: {voice_info['language']}\n"
            f"Type: Custom\n"
            f"ID: {voice_info['id']}"
        )

        if new_voice_label is None:
            # Keep the current selection; only the choices are refreshed.
            dropdown_update = gr.update(choices=choice_labels)
        else:
            dropdown_update = gr.update(choices=choice_labels, value=new_voice_label)

        return (
            f"✅ Custom voice created: {voice_id}",
            dropdown_update,
            info_text
        )

    except Exception as e:
        return f"❌ Error creating voice: {str(e)}", gr.update(), ""
|
179 |
+
|
180 |
+
def on_auto_language_change(auto_language: bool):
    """Handle a change of the auto-detect language checkbox.

    Returns a visibility update: the target component is shown only while
    auto-detection is switched off.
    """
    show_target = not auto_language
    return gr.update(visible=show_target)
|
183 |
+
|
184 |
+
def map_emotions(selected_emotions, intensity):
    """Convert UI emotion labels into emotion settings for the manager.

    Args:
        selected_emotions: Iterable of emotion labels from the UI. ``None``
            or an empty selection yields an empty result.
        intensity: Intensity label applied to every selected emotion; one of
            "Very Weak", "Weak", "Medium", "Strong", "Very Strong".

    Returns:
        A list of ``{"name": ..., "level": ...}`` dicts, one per recognized
        emotion, in selection order. "Neutral" and any other label without a
        mapping are skipped.

    Raises:
        KeyError: If at least one recognized emotion is selected and
            ``intensity`` is not a known label.
    """
    emotion_map = {
        "Happy": "positivity",
        "Sad": "sadness",
        "Angry": "anger",
        "Surprised": "surprise",
        "Curious": "curiosity"
    }

    intensity_map = {
        "Very Weak": "lowest",
        "Weak": "low",
        "Medium": "medium",
        "Strong": "high",
        "Very Strong": "highest"
    }

    # "Neutral" has no entry in emotion_map, so the membership test alone
    # filters it out. The intensity lookup stays inside the comprehension so
    # it only runs when an emotion actually matches.
    return [
        {"name": emotion_map[emotion], "level": intensity_map[intensity]}
        for emotion in (selected_emotions or [])
        if emotion in emotion_map
    ]
|
211 |
+
|
212 |
+
def generate_speech(
    text: str,
    voice_label: str,
    improve_text: bool,
    auto_language: bool,
    manual_language: str,
    speed_type: str,
    use_custom_speed: bool,
    custom_speed: float,
    emotions: List[str],
    emotion_intensity: str
):
    """Generate speech audio for the given text and settings.

    Args:
        text: Text to speak. Empty or whitespace-only text is rejected.
        voice_label: Dropdown label of the voice to use.
        improve_text: When true, the text is passed through
            ``improve_tts_text`` before synthesis.
        auto_language: When false, ``manual_language`` is set on the manager
            and used in the output file name; when true, the manager's
            ``current_language`` is used for the file name.
        manual_language: Language to use when ``auto_language`` is false.
        speed_type: Speed label, used unless ``use_custom_speed`` is set.
        use_custom_speed: When true, ``custom_speed`` is used as the speed.
        custom_speed: Numeric speed applied when ``use_custom_speed`` is true.
        emotions: Selected emotion labels; empty or just "Neutral" resets
            the manager's emotions.
        emotion_intensity: Intensity label applied to the selected emotions.

    Returns:
        ``(output_path, status_message)``. ``output_path`` is None whenever
        generation did not happen; the status message explains why.
    """
    global manager
    if not manager:
        return None, "❌ Manager is not initialized"

    # Whitespace-only text would pass a plain truthiness check and be sent
    # for synthesis, so it is rejected together with empty text.
    if not text or not text.strip() or not voice_label:
        return None, "❌ Text and voice are required"

    try:
        # Resolve the dropdown label to a voice ID.
        voice_id = extract_voice_id_from_label(voice_label)
        if not voice_id:
            return None, "❌ Voice not found"

        manager.set_voice(voice_id)

        # If auto-detect is off, set the language manually.
        if not auto_language:
            manager.set_language(manual_language)

        manager.speed = custom_speed if use_custom_speed else map_speed(speed_type)

        if emotions and emotions != ["Neutral"]:
            manager.set_emotions(map_emotions(emotions, emotion_intensity))
        else:
            manager.set_emotions()  # Reset emotions

        output_file = generate_output_filename(
            manual_language if not auto_language else manager.current_language
        )

        # Create whatever directory the generated path points into, rather
        # than repeating the directory name here.
        os.makedirs(os.path.dirname(output_file) or ".", exist_ok=True)

        output_path = manager.speak(
            text=text if not improve_text else improve_tts_text(text, manager.current_language),
            output_file=output_file
        )

        return output_path, "✅ Audio generated successfully"

    except Exception as e:
        return None, f"❌ Error generating speech: {str(e)}"
|
275 |
+
|
276 |
+
def initialize_manager_and_update(api_key: str, language: str, access_type: str, current_voice: str = None):
    """Initialize the manager from an API key and refresh the voice dropdown.

    Calls ``initialize_manager(api_key)`` and then inspects the module-level
    ``manager``. When it is truthy, the voice list is refreshed through
    ``update_voice_list`` and both status messages are joined with a newline.

    :param api_key: API key forwarded to ``initialize_manager``.
    :param language: Language filter forwarded to ``update_voice_list``.
    :param access_type: Access-type filter forwarded to ``update_voice_list``.
    :param current_voice: Currently selected voice, forwarded unchanged.
    :return: ``(status_text, dropdown_update)``; the dropdown update clears
        the choices when no manager is available.
    """
    status = initialize_manager(api_key)

    # Without a manager there is nothing to list: empty the dropdown.
    if not manager:
        return status, gr.update(choices=[], value=None)

    voice_update, voice_status = update_voice_list(language, access_type, current_voice)
    return f"{status}\n{voice_status}", voice_update
|
284 |
+
|
285 |
+
# Create the interface
|
286 |
+
with gr.Blocks() as demo:
|
287 |
+
# API key
|
288 |
+
cartesia_api_key = gr.Textbox(
|
289 |
+
label="Cartesia API Key",
|
290 |
+
value="", # No default API key
|
291 |
+
type='password'
|
292 |
+
)
|
293 |
+
|
294 |
+
with gr.Row():
|
295 |
+
# Left column
|
296 |
+
with gr.Column():
|
297 |
+
cartesia_text = gr.TextArea(label="Text")
|
298 |
+
|
299 |
+
with gr.Accordion(label="Settings", open=True):
|
300 |
+
# Filters
|
301 |
+
with gr.Accordion("Filters", open=True):
|
302 |
+
cartesia_setting_filter_lang = gr.Dropdown(
|
303 |
+
label="Language",
|
304 |
+
choices=LANGUAGE_CHOICES,
|
305 |
+
value="all"
|
306 |
+
)
|
307 |
+
cartesia_setting_filter_type = gr.Dropdown(
|
308 |
+
label="Type",
|
309 |
+
choices=list(ACCESS_TYPE_MAP.keys()),
|
310 |
+
value="All"
|
311 |
+
)
|
312 |
+
|
313 |
+
# Settings tabs
|
314 |
+
with gr.Tab("Standard"):
|
315 |
+
cartesia_setting_voice_info = gr.Textbox(
|
316 |
+
label="Voice Information",
|
317 |
+
interactive=False
|
318 |
+
)
|
319 |
+
with gr.Row():
|
320 |
+
initial_choices, initial_value = get_initial_voices()
|
321 |
+
cartesia_setting_voice = gr.Dropdown(
|
322 |
+
label="Voice",
|
323 |
+
choices=initial_choices,
|
324 |
+
value=initial_value
|
325 |
+
)
|
326 |
+
cartesia_setting_voice_update = gr.Button("Refresh")
|
327 |
+
cartesia_setting_auto_language = gr.Checkbox(
|
328 |
+
label="Automatically detect language from voice",
|
329 |
+
value=True
|
330 |
+
)
|
331 |
+
cartesia_setting_manual_language = gr.Dropdown(
|
332 |
+
label="Speech Language",
|
333 |
+
choices=["ru", "en", "es", "fr", "de", "pl", "it", "ja", "ko", "zh", "hi"],
|
334 |
+
value="en",
|
335 |
+
visible=False # Initially hidden
|
336 |
+
)
|
337 |
+
|
338 |
+
with gr.Tab("Custom"):
|
339 |
+
cartesia_setting_custom_name = gr.Textbox(label="Name")
|
340 |
+
cartesia_setting_custom_lang = gr.Dropdown(
|
341 |
+
label="Language",
|
342 |
+
choices=LANGUAGE_CHOICES[1:] # Exclude "all"
|
343 |
+
)
|
344 |
+
cartesia_setting_custom_voice = gr.Audio(label="Voice File", type='filepath')
|
345 |
+
cartesia_setting_custom_add = gr.Button("Add")
|
346 |
+
|
347 |
+
# Emotion control
|
348 |
+
with gr.Accordion(label="Emotion Control (Beta)", open=False):
|
349 |
+
cartesia_emotions = gr.Dropdown(
|
350 |
+
label="Emotions",
|
351 |
+
multiselect=True,
|
352 |
+
choices=EMOTION_CHOICES
|
353 |
+
)
|
354 |
+
cartesia_emotions_intensity = gr.Dropdown(
|
355 |
+
label="Intensity",
|
356 |
+
choices=EMOTION_INTENSITY,
|
357 |
+
value="Medium"
|
358 |
+
)
|
359 |
+
|
360 |
+
# Speed settings
|
361 |
+
with gr.Accordion("Speed", open=True):
|
362 |
+
cartesia_speed_speed = gr.Dropdown(
|
363 |
+
label="Speech Speed",
|
364 |
+
choices=SPEED_CHOICES,
|
365 |
+
value="Normal"
|
366 |
+
)
|
367 |
+
cartesia_speed_speed_allow_custom = gr.Checkbox(
|
368 |
+
label="Use custom speed value"
|
369 |
+
)
|
370 |
+
cartesia_speed_speed_custom = gr.Slider(
|
371 |
+
label="Speed",
|
372 |
+
value=0,
|
373 |
+
minimum=-1,
|
374 |
+
maximum=1,
|
375 |
+
step=0.1,
|
376 |
+
visible=False
|
377 |
+
)
|
378 |
+
|
379 |
+
cartesia_setting_improve_text = gr.Checkbox(
|
380 |
+
label="Improve text according to recommendations",
|
381 |
+
value=True
|
382 |
+
)
|
383 |
+
|
384 |
+
# Right column
|
385 |
+
with gr.Column():
|
386 |
+
cartessia_status_bar = gr.Label(value="Status")
|
387 |
+
cartesia_output_audio = gr.Audio(
|
388 |
+
label="Result",
|
389 |
+
interactive=False
|
390 |
+
)
|
391 |
+
cartesia_output_button = gr.Button("Generate")
|
392 |
+
|
393 |
+
# Events
|
394 |
+
cartesia_api_key.change(
|
395 |
+
initialize_manager_and_update,
|
396 |
+
inputs=[cartesia_api_key, cartesia_setting_filter_lang, cartesia_setting_filter_type, cartesia_setting_voice],
|
397 |
+
outputs=[cartessia_status_bar, cartesia_setting_voice]
|
398 |
+
)
|
399 |
+
|
400 |
+
cartesia_setting_filter_lang.change(
|
401 |
+
update_voice_list,
|
402 |
+
inputs=[
|
403 |
+
cartesia_setting_filter_lang,
|
404 |
+
cartesia_setting_filter_type,
|
405 |
+
cartesia_setting_voice # Pass the current selection
|
406 |
+
],
|
407 |
+
outputs=[cartesia_setting_voice, cartessia_status_bar]
|
408 |
+
)
|
409 |
+
|
410 |
+
cartesia_setting_filter_type.change(
|
411 |
+
update_voice_list,
|
412 |
+
inputs=[
|
413 |
+
cartesia_setting_filter_lang,
|
414 |
+
cartesia_setting_filter_type,
|
415 |
+
cartesia_setting_voice # Pass the current selection
|
416 |
+
],
|
417 |
+
outputs=[cartesia_setting_voice, cartessia_status_bar]
|
418 |
+
)
|
419 |
+
|
420 |
+
cartesia_setting_voice.change(
|
421 |
+
update_voice_info,
|
422 |
+
inputs=[cartesia_setting_voice],
|
423 |
+
outputs=[cartesia_setting_voice_info]
|
424 |
+
)
|
425 |
+
|
426 |
+
cartesia_setting_voice_update.click(
|
427 |
+
update_voice_list,
|
428 |
+
inputs=[cartesia_setting_filter_lang, cartesia_setting_filter_type, cartesia_setting_voice],
|
429 |
+
outputs=[cartesia_setting_voice, cartessia_status_bar]
|
430 |
+
)
|
431 |
+
|
432 |
+
cartesia_speed_speed_allow_custom.change(
|
433 |
+
lambda x: gr.update(visible=x),
|
434 |
+
inputs=[cartesia_speed_speed_allow_custom],
|
435 |
+
outputs=[cartesia_speed_speed_custom]
|
436 |
+
)
|
437 |
+
|
438 |
+
cartesia_setting_custom_add.click(
|
439 |
+
create_custom_voice,
|
440 |
+
inputs=[
|
441 |
+
cartesia_setting_custom_name,
|
442 |
+
cartesia_setting_custom_lang,
|
443 |
+
cartesia_setting_custom_voice
|
444 |
+
],
|
445 |
+
outputs=[
|
446 |
+
cartessia_status_bar,
|
447 |
+
cartesia_setting_voice, # Update dropdown
|
448 |
+
cartesia_setting_voice_info # Update voice info
|
449 |
+
]
|
450 |
+
)
|
451 |
+
|
452 |
+
cartesia_setting_auto_language.change(
|
453 |
+
on_auto_language_change,
|
454 |
+
inputs=[cartesia_setting_auto_language],
|
455 |
+
outputs=[cartesia_setting_manual_language]
|
456 |
+
)
|
457 |
+
|
458 |
+
cartesia_output_button.click(
|
459 |
+
generate_speech,
|
460 |
+
inputs=[
|
461 |
+
cartesia_text,
|
462 |
+
cartesia_setting_voice,
|
463 |
+
cartesia_setting_improve_text,
|
464 |
+
cartesia_setting_auto_language,
|
465 |
+
cartesia_setting_manual_language,
|
466 |
+
cartesia_speed_speed,
|
467 |
+
cartesia_speed_speed_allow_custom,
|
468 |
+
cartesia_speed_speed_custom,
|
469 |
+
cartesia_emotions,
|
470 |
+
cartesia_emotions_intensity
|
471 |
+
],
|
472 |
+
outputs=[
|
473 |
+
cartesia_output_audio,
|
474 |
+
cartessia_status_bar
|
475 |
+
]
|
476 |
+
)
|
477 |
+
|
478 |
+
# Run the app
|
479 |
+
if __name__ == "__main__":
    # Enable the request queue, then start the server with a share link.
    demo.queue().launch(share=True)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
sonic_api_wrapper.py
CHANGED
@@ -3,12 +3,16 @@ import json
|
|
3 |
from pathlib import Path
|
4 |
from typing import List, Dict, Union, Optional
|
5 |
from enum import Enum
|
6 |
-
from cartesia import Cartesia
|
7 |
from tqdm import tqdm
|
8 |
from loguru import logger
|
9 |
from datetime import datetime
|
10 |
import re
|
11 |
|
|
|
|
|
|
|
|
|
|
|
12 |
class VoiceAccessibility(Enum):
|
13 |
ALL = "all"
|
14 |
ONLY_PUBLIC = "only_public"
|
@@ -28,49 +32,72 @@ class CartesiaVoiceManager:
|
|
28 |
|
29 |
def __init__(self, api_key: str = None, base_dir: Path = None):
|
30 |
self.api_key = api_key or os.environ.get("CARTESIA_API_KEY")
|
31 |
-
if
|
32 |
-
|
33 |
-
|
34 |
-
|
|
|
|
|
|
|
35 |
self.current_voice = None
|
36 |
self.current_model = None
|
37 |
self.current_language = None
|
38 |
self.current_mix = None
|
39 |
-
|
40 |
-
#
|
41 |
self.base_dir = base_dir or Path("voice2voice")
|
42 |
self.api_dir = self.base_dir / "api"
|
43 |
self.custom_dir = self.base_dir / "custom"
|
44 |
-
|
45 |
-
#
|
46 |
self.api_dir.mkdir(parents=True, exist_ok=True)
|
47 |
self.custom_dir.mkdir(parents=True, exist_ok=True)
|
48 |
-
|
49 |
-
#
|
50 |
self.voices = {}
|
51 |
self.loaded_voices = set()
|
52 |
-
|
53 |
-
#
|
54 |
self._speed = 0.0 # normal speed
|
55 |
self._emotions = {}
|
56 |
-
|
57 |
logger.add("cartesia_voice_manager.log", rotation="10 MB")
|
58 |
logger.info("CartesiaVoiceManager initialized")
|
59 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
60 |
def load_voice(self, voice_id: str) -> Dict:
|
61 |
if voice_id in self.loaded_voices:
|
62 |
return self.voices[voice_id]
|
63 |
-
|
64 |
voice_file = None
|
65 |
-
#
|
66 |
api_file = self.api_dir / f"{voice_id}.json"
|
67 |
custom_file = self.custom_dir / f"{voice_id}.json"
|
68 |
-
|
69 |
if api_file.exists():
|
70 |
voice_file = api_file
|
71 |
elif custom_file.exists():
|
72 |
voice_file = custom_file
|
73 |
-
|
74 |
if voice_file:
|
75 |
with open(voice_file, "r") as f:
|
76 |
voice_data = json.load(f)
|
@@ -79,32 +106,36 @@ class CartesiaVoiceManager:
|
|
79 |
logger.info(f"Loaded voice {voice_id} from {voice_file}")
|
80 |
return voice_data
|
81 |
else:
|
82 |
-
#
|
83 |
-
|
84 |
-
|
85 |
-
|
86 |
-
|
87 |
-
|
88 |
-
|
89 |
-
|
90 |
-
|
91 |
-
|
92 |
-
|
|
|
|
|
|
|
|
|
93 |
|
94 |
def extract_voice_id_from_label(self, voice_label: str) -> Optional[str]:
    """Return the voice ID whose dropdown label equals ``voice_label``.

    The candidates are the entries returned by ``get_voice_choices()`` called
    with its default arguments; the first entry with an exactly matching
    ``"label"`` wins.

    :param voice_label: Label text as shown in the dropdown.
    :return: The entry's ``"value"`` (the voice ID), or ``None`` if no label
        matches.
    """
    for choice in self.get_voice_choices():
        if choice["label"] == voice_label:
            return choice["value"]
    return None
|
104 |
-
|
105 |
def get_voice_choices(self, language: str = None, accessibility: VoiceAccessibility = VoiceAccessibility.ALL) -> List[Dict]:
|
106 |
"""
|
107 |
-
|
108 |
"""
|
109 |
voices = self.list_available_voices(
|
110 |
languages=[language] if language else None,
|
@@ -113,17 +144,17 @@ class CartesiaVoiceManager:
|
|
113 |
|
114 |
choices = []
|
115 |
for voice in voices:
|
116 |
-
#
|
117 |
choices.append({
|
118 |
"label": f"{voice['name']} ({voice['language']}){' [Custom]' if voice.get('is_custom') else ''}",
|
119 |
-
"value": voice['id'] #
|
120 |
})
|
121 |
|
122 |
return sorted(choices, key=lambda x: x['label'])
|
123 |
|
124 |
def get_voice_info(self, voice_id: str) -> Dict:
|
125 |
"""
|
126 |
-
|
127 |
"""
|
128 |
voice = self.load_voice(voice_id)
|
129 |
return {
|
@@ -133,7 +164,7 @@ class CartesiaVoiceManager:
|
|
133 |
"is_public": voice.get('is_public', True),
|
134 |
"id": voice['id']
|
135 |
}
|
136 |
-
|
137 |
def _save_voice_to_api(self, voice_data: Dict):
|
138 |
voice_id = voice_data["id"]
|
139 |
file_path = self.api_dir / f"{voice_id}.json"
|
@@ -149,37 +180,47 @@ class CartesiaVoiceManager:
|
|
149 |
logger.info(f"Saved custom voice {voice_id} to {file_path}")
|
150 |
|
151 |
def update_voices_from_api(self):
|
|
|
|
|
|
|
|
|
152 |
logger.info("Updating voices from API")
|
153 |
-
|
154 |
-
|
155 |
-
|
156 |
-
|
157 |
-
|
158 |
-
|
159 |
-
|
160 |
-
|
|
|
|
|
|
|
161 |
|
162 |
def list_available_voices(self, languages: List[str] = None, accessibility: VoiceAccessibility = VoiceAccessibility.ALL) -> List[Dict]:
|
163 |
filtered_voices = []
|
164 |
|
165 |
-
#
|
166 |
if accessibility in [VoiceAccessibility.ALL, VoiceAccessibility.ONLY_PUBLIC]:
|
167 |
-
|
168 |
-
|
169 |
-
|
170 |
-
|
171 |
-
|
172 |
-
|
173 |
-
|
174 |
-
|
175 |
-
|
176 |
-
|
177 |
-
|
178 |
-
|
179 |
-
|
180 |
-
|
|
|
|
|
|
|
181 |
|
182 |
-
#
|
183 |
if accessibility in [VoiceAccessibility.ALL, VoiceAccessibility.ONLY_PRIVATE, VoiceAccessibility.ONLY_CUSTOM]:
|
184 |
for file in self.custom_dir.glob("*.json"):
|
185 |
with open(file, "r") as f:
|
@@ -197,7 +238,7 @@ class CartesiaVoiceManager:
|
|
197 |
return filtered_voices
|
198 |
|
199 |
def set_voice(self, voice_id: str):
|
200 |
-
#
|
201 |
voice_file = None
|
202 |
api_file = self.api_dir / f"{voice_id}.json"
|
203 |
custom_file = self.custom_dir / f"{voice_id}.json"
|
@@ -208,19 +249,23 @@ class CartesiaVoiceManager:
|
|
208 |
voice_file = custom_file
|
209 |
|
210 |
if voice_file:
|
211 |
-
#
|
212 |
with open(voice_file, "r") as f:
|
213 |
self.current_voice = json.load(f)
|
214 |
else:
|
215 |
-
#
|
216 |
-
|
217 |
-
|
218 |
-
|
219 |
-
|
220 |
-
|
221 |
-
|
222 |
-
|
223 |
-
|
|
|
|
|
|
|
|
|
224 |
|
225 |
self.set_language(self.current_voice['language'])
|
226 |
logger.info(f"Set current voice to {voice_id}")
|
@@ -261,32 +306,35 @@ class CartesiaVoiceManager:
|
|
261 |
self._emotions = {}
|
262 |
logger.info("Cleared all emotions")
|
263 |
return
|
264 |
-
|
265 |
self._emotions = {}
|
266 |
for emotion in emotions:
|
267 |
name = emotion.get("name")
|
268 |
level = emotion.get("level")
|
269 |
-
|
270 |
if name not in self.EMOTION_NAMES:
|
271 |
raise ValueError(f"Invalid emotion name. Choose from: {self.EMOTION_NAMES}")
|
272 |
if level not in self.EMOTION_LEVELS:
|
273 |
raise ValueError(f"Invalid emotion level. Choose from: {self.EMOTION_LEVELS}")
|
274 |
-
|
275 |
self._emotions[name] = level
|
276 |
-
|
277 |
logger.info(f"Set emotions: {self._emotions}")
|
278 |
|
279 |
def _get_voice_controls(self):
|
280 |
controls = {"speed": self._speed}
|
281 |
-
|
282 |
if self._emotions:
|
283 |
controls["emotion"] = [f"{name}:{level}" for name, level in self._emotions.items()]
|
284 |
-
|
285 |
return controls
|
286 |
|
287 |
def speak(self, text: str, output_file: str = None):
|
288 |
if not self.current_model or not (self.current_voice or self.current_mix):
|
289 |
raise ValueError("Please set a model and a voice or voice mix before speaking.")
|
|
|
|
|
|
|
290 |
|
291 |
voice_embedding = self.current_voice['embedding'] if self.current_voice else self.current_mix
|
292 |
|
@@ -299,7 +347,7 @@ class CartesiaVoiceManager:
|
|
299 |
}
|
300 |
|
301 |
voice_controls = self._get_voice_controls()
|
302 |
-
|
303 |
logger.info(f"Generating audio for text: {text[:50]}... with voice controls: {voice_controls}")
|
304 |
if self.current_language == 'en':
|
305 |
audio_data = self.client.tts.bytes(
|
@@ -313,13 +361,14 @@ class CartesiaVoiceManager:
|
|
313 |
)
|
314 |
else:
|
315 |
audio_data = self.client.tts.bytes(
|
316 |
-
|
317 |
-
|
318 |
-
|
319 |
-
|
320 |
-
|
321 |
-
|
322 |
-
|
|
|
323 |
|
324 |
if output_file is None:
|
325 |
output_file = f"output_{self.current_language}.wav"
|
@@ -333,16 +382,19 @@ class CartesiaVoiceManager:
|
|
333 |
|
334 |
def _get_embedding(self, source: Union[str, Dict]) -> Dict:
|
335 |
"""
|
336 |
-
|
337 |
"""
|
338 |
if isinstance(source, dict) and 'embedding' in source:
|
339 |
return source['embedding']
|
340 |
elif isinstance(source, str):
|
341 |
if os.path.isfile(source):
|
342 |
-
#
|
|
|
|
|
|
|
343 |
return self.client.voices.clone(filepath=source)
|
344 |
else:
|
345 |
-
#
|
346 |
voice = self.load_voice(source)
|
347 |
return voice['embedding']
|
348 |
else:
|
@@ -350,11 +402,15 @@ class CartesiaVoiceManager:
|
|
350 |
|
351 |
def create_mixed_embedding(self, components: List[Dict[str, Union[str, float, Dict]]]) -> Dict:
|
352 |
"""
|
353 |
-
|
354 |
-
|
355 |
-
:param components:
|
356 |
-
:return:
|
357 |
"""
|
|
|
|
|
|
|
|
|
358 |
mix_components = []
|
359 |
for component in components:
|
360 |
embedding = self._get_embedding(component.get('id') or component.get('path') or component)
|
@@ -362,32 +418,35 @@ class CartesiaVoiceManager:
|
|
362 |
"embedding": embedding,
|
363 |
"weight": component['weight']
|
364 |
})
|
365 |
-
|
366 |
return self.client.voices.mix(mix_components)
|
367 |
|
368 |
def create_custom_voice(self, name: str, source: Union[str, List[Dict]], description: str = "", language: str = "en"):
|
369 |
"""
|
370 |
-
|
371 |
-
|
372 |
-
:param name:
|
373 |
-
:param source:
|
374 |
-
:param description:
|
375 |
-
:param language:
|
376 |
-
:return: ID
|
377 |
"""
|
378 |
logger.info(f"Creating custom voice: {name}")
|
379 |
-
|
380 |
if isinstance(source, str):
|
381 |
-
#
|
|
|
|
|
|
|
382 |
embedding = self.client.voices.clone(filepath=source)
|
383 |
elif isinstance(source, list):
|
384 |
-
#
|
385 |
embedding = self.create_mixed_embedding(source)
|
386 |
else:
|
387 |
raise ValueError("Invalid source type. Expected file path or list of components.")
|
388 |
|
389 |
voice_id = f"custom_{len([f for f in self.custom_dir.glob('*.json')])}"
|
390 |
-
|
391 |
voice_data = {
|
392 |
"id": voice_id,
|
393 |
"name": name,
|
@@ -397,30 +456,30 @@ class CartesiaVoiceManager:
|
|
397 |
"is_public": False,
|
398 |
"is_custom": True
|
399 |
}
|
400 |
-
|
401 |
self._save_voice_to_custom(voice_data)
|
402 |
self.voices[voice_id] = voice_data
|
403 |
self.loaded_voices.add(voice_id)
|
404 |
-
|
405 |
logger.info(f"Created custom voice with id: {voice_id}")
|
406 |
return voice_id
|
407 |
|
408 |
def get_voice_id_by_name(self, name: str) -> List[str]:
    """Collect the IDs of all locally stored voices named exactly ``name``.

    Every ``*.json`` file in ``self.api_dir`` and then ``self.custom_dir`` is
    parsed and its ``'name'`` field compared with ``name``. A warning is
    logged when nothing matches, otherwise an info message with the count.

    :param name: Voice name to look for (exact, case-sensitive comparison).
    :return: List of matching voice IDs; empty when there is no match.
    """
    found = []

    # Scan API-provided voices first, then custom ones.
    for folder in (self.api_dir, self.custom_dir):
        for json_path in folder.glob("*.json"):
            with open(json_path, "r") as handle:
                record = json.load(handle)
            if record['name'] == name:
                found.append(record['id'])

    if found:
        logger.info(f"Found {len(found)} voice(s) with name: {name}")
    else:
        logger.warning(f"No voices found with name: {name}")

    return found
|
425 |
|
426 |
def improve_tts_text(text: str, language: str = 'en') -> str:
|
@@ -430,7 +489,7 @@ def improve_tts_text(text: str, language: str = 'en') -> str:
|
|
430 |
def format_date(match):
|
431 |
date = datetime.strptime(match.group(), '%Y-%m-%d')
|
432 |
return date.strftime('%m/%d/%Y')
|
433 |
-
|
434 |
text = re.sub(r'\d{4}-\d{2}-\d{2}', format_date, text)
|
435 |
text = text.replace(' - ', ' - - ')
|
436 |
text = re.sub(r'\?(?![\s\n])', '??', text)
|
@@ -443,4 +502,4 @@ def improve_tts_text(text: str, language: str = 'en') -> str:
|
|
443 |
elif language.lower() in ['fr', 'fra', 'french']:
|
444 |
text = text.replace('M.', 'Monsieur')
|
445 |
|
446 |
-
return text
|
|
|
3 |
from pathlib import Path
|
4 |
from typing import List, Dict, Union, Optional
|
5 |
from enum import Enum
|
|
|
6 |
from tqdm import tqdm
|
7 |
from loguru import logger
|
8 |
from datetime import datetime
|
9 |
import re
|
10 |
|
11 |
+
try:
|
12 |
+
from cartesia import Cartesia
|
13 |
+
except ImportError:
|
14 |
+
Cartesia = None # Handle the case where Cartesia is not installed
|
15 |
+
|
16 |
class VoiceAccessibility(Enum):
|
17 |
ALL = "all"
|
18 |
ONLY_PUBLIC = "only_public"
|
|
|
32 |
|
33 |
def __init__(self, api_key: str = None, base_dir: Path = None):
    """Create a voice manager, optionally connected to the Cartesia API.

    When no API key is available or the ``cartesia`` library could not be
    imported, ``self.client`` is set to ``None`` and a warning naming the
    actual cause is logged.

    :param api_key: Cartesia API key; when falsy, the ``CARTESIA_API_KEY``
        environment variable is used instead.
    :param base_dir: Root directory for stored voice files (a ``Path`` or a
        path string); defaults to ``voice2voice``. The ``api`` and ``custom``
        sub-directories are created if missing.
    """
    self.api_key = api_key or os.environ.get("CARTESIA_API_KEY")
    if self.api_key and Cartesia:
        self.client = Cartesia(api_key=self.api_key)
        logger.info("Cartesia client initialized with API key.")
    else:
        self.client = None
        # Report the real reason: a missing key and a missing library are
        # different problems with different remedies.
        if not self.api_key:
            logger.warning("API key not provided. Cartesia client is not initialized. Some features will be unavailable.")
        else:
            logger.warning("Cartesia library is not installed. Cartesia client is not initialized. Some features will be unavailable.")

    self.current_voice = None
    self.current_model = None
    self.current_language = None
    self.current_mix = None

    # Setting up directories (accept plain strings as well as Path objects)
    self.base_dir = Path(base_dir) if base_dir else Path("voice2voice")
    self.api_dir = self.base_dir / "api"
    self.custom_dir = self.base_dir / "custom"

    # Create necessary directories
    self.api_dir.mkdir(parents=True, exist_ok=True)
    self.custom_dir.mkdir(parents=True, exist_ok=True)

    # Initialize voices
    self.voices = {}
    self.loaded_voices = set()

    # Speed and emotion settings
    self._speed = 0.0  # normal speed
    self._emotions = {}

    # Register the log-file sink only once per process: every logger.add()
    # call installs an additional sink, so adding one per instance would
    # write each message to the file several times.
    if not getattr(CartesiaVoiceManager, "_log_sink_registered", False):
        logger.add("cartesia_voice_manager.log", rotation="10 MB")
        CartesiaVoiceManager._log_sink_registered = True
    logger.info("CartesiaVoiceManager initialized")
|
66 |
|
67 |
+
|
68 |
+
def set_api_key(self, api_key: str):
    """Set the API key, create the Cartesia client and refresh the voices.

    The key is stored on the instance first. If the ``cartesia`` library is
    available, a new client is created and ``update_voices_from_api()`` is
    called.

    :param api_key: Cartesia API key to use from now on.
    :raises ImportError: If the ``cartesia`` library is not available.
    :raises ValueError: If creating the client or refreshing the voices
        raises; ``self.client`` is reset to ``None`` and the original
        exception is attached as ``__cause__``.
    """
    self.api_key = api_key

    if not Cartesia:
        logger.error("Cartesia library is not available. Cannot initialize Cartesia client.")
        raise ImportError("Cartesia library is not installed.")

    try:
        self.client = Cartesia(api_key=self.api_key)
        logger.info("Cartesia client initialized with new API key.")
        self.update_voices_from_api()
    except Exception as e:
        logger.error(f"Failed to initialize Cartesia client with the provided API key: {e}")
        # Do not keep a client that may be unusable.
        self.client = None
        # Chain the cause so the underlying error stays visible in tracebacks.
        raise ValueError("Failed to initialize Cartesia client with the provided API key.") from e
|
86 |
+
|
87 |
def load_voice(self, voice_id: str) -> Dict:
|
88 |
if voice_id in self.loaded_voices:
|
89 |
return self.voices[voice_id]
|
90 |
+
|
91 |
voice_file = None
|
92 |
+
# Search for voice file in api and custom directories
|
93 |
api_file = self.api_dir / f"{voice_id}.json"
|
94 |
custom_file = self.custom_dir / f"{voice_id}.json"
|
95 |
+
|
96 |
if api_file.exists():
|
97 |
voice_file = api_file
|
98 |
elif custom_file.exists():
|
99 |
voice_file = custom_file
|
100 |
+
|
101 |
if voice_file:
|
102 |
with open(voice_file, "r") as f:
|
103 |
voice_data = json.load(f)
|
|
|
106 |
logger.info(f"Loaded voice {voice_id} from {voice_file}")
|
107 |
return voice_data
|
108 |
else:
|
109 |
+
# If voice not found locally, try to load from API
|
110 |
+
if self.client:
|
111 |
+
try:
|
112 |
+
voice_data = self.client.voices.get(id=voice_id)
|
113 |
+
self._save_voice_to_api(voice_data)
|
114 |
+
self.voices[voice_id] = voice_data
|
115 |
+
self.loaded_voices.add(voice_id)
|
116 |
+
logger.info(f"Loaded voice {voice_id} from API")
|
117 |
+
return voice_data
|
118 |
+
except Exception as e:
|
119 |
+
logger.error(f"Failed to load voice {voice_id}: {e}")
|
120 |
+
raise ValueError(f"Voice with id {voice_id} not found")
|
121 |
+
else:
|
122 |
+
logger.error(f"Cannot load voice {voice_id} without API client.")
|
123 |
+
raise ValueError(f"Voice with id {voice_id} not found and API client is not available.")
|
124 |
|
125 |
def extract_voice_id_from_label(self, voice_label: str) -> Optional[str]:
    """Return the voice ID whose dropdown label equals ``voice_label``.

    For example, the label ``"John (en) [Custom]"`` is looked up among the
    entries returned by ``get_voice_choices()`` (called with its default
    arguments); the first entry with an exactly matching ``"label"`` wins.

    :param voice_label: Label text as shown in the dropdown.
    :return: The entry's ``"value"`` (the voice ID), or ``None`` if no label
        matches.
    """
    for choice in self.get_voice_choices():
        if choice["label"] == voice_label:
            return choice["value"]
    return None
|
135 |
+
|
136 |
def get_voice_choices(self, language: str = None, accessibility: VoiceAccessibility = VoiceAccessibility.ALL) -> List[Dict]:
|
137 |
"""
|
138 |
+
Returns a list of voices for dropdown menu
|
139 |
"""
|
140 |
voices = self.list_available_voices(
|
141 |
languages=[language] if language else None,
|
|
|
144 |
|
145 |
choices = []
|
146 |
for voice in voices:
|
147 |
+
# Keep only ID in value
|
148 |
choices.append({
|
149 |
"label": f"{voice['name']} ({voice['language']}){' [Custom]' if voice.get('is_custom') else ''}",
|
150 |
+
"value": voice['id'] # Only ID here
|
151 |
})
|
152 |
|
153 |
return sorted(choices, key=lambda x: x['label'])
|
154 |
|
155 |
def get_voice_info(self, voice_id: str) -> Dict:
|
156 |
"""
|
157 |
+
Returns voice information for display
|
158 |
"""
|
159 |
voice = self.load_voice(voice_id)
|
160 |
return {
|
|
|
164 |
"is_public": voice.get('is_public', True),
|
165 |
"id": voice['id']
|
166 |
}
|
167 |
+
|
168 |
def _save_voice_to_api(self, voice_data: Dict):
|
169 |
voice_id = voice_data["id"]
|
170 |
file_path = self.api_dir / f"{voice_id}.json"
|
|
|
180 |
logger.info(f"Saved custom voice {voice_id} to {file_path}")
|
181 |
|
182 |
def update_voices_from_api(self):
    """Fetch every voice from the API and store its full data locally.

    Does nothing except log a warning when ``self.client`` is not set.
    Otherwise each voice returned by ``client.voices.list()`` is fetched in
    full with ``client.voices.get`` and passed to ``_save_voice_to_api``;
    entries already present in ``self.loaded_voices`` are refreshed in
    ``self.voices`` as well. Any exception is logged and not re-raised.
    """
    if not self.client:
        logger.warning("Cannot update voices from API without API client.")
        return

    logger.info("Updating voices from API")
    try:
        listing = self.client.voices.list()
        for summary in tqdm(listing, desc="Updating voices"):
            vid = summary["id"]
            details = self.client.voices.get(id=vid)
            self._save_voice_to_api(details)
            # Keep voices that are already loaded in memory in sync.
            if vid in self.loaded_voices:
                self.voices[vid] = details
        logger.info(f"Updated {len(listing)} voices from API")
    except Exception as exc:
        logger.error(f"Failed to update voices from API: {exc}")
|
199 |
|
200 |
def list_available_voices(self, languages: List[str] = None, accessibility: VoiceAccessibility = VoiceAccessibility.ALL) -> List[Dict]:
|
201 |
filtered_voices = []
|
202 |
|
203 |
+
# Get only metadata from API (without embeddings)
|
204 |
if accessibility in [VoiceAccessibility.ALL, VoiceAccessibility.ONLY_PUBLIC]:
|
205 |
+
if self.client:
|
206 |
+
try:
|
207 |
+
api_voices = self.client.voices.list()
|
208 |
+
# Keep only metadata
|
209 |
+
for voice in api_voices:
|
210 |
+
metadata = {
|
211 |
+
'id': voice['id'],
|
212 |
+
'name': voice['name'],
|
213 |
+
'language': voice['language'],
|
214 |
+
'is_public': True
|
215 |
+
}
|
216 |
+
if languages is None or metadata['language'] in languages:
|
217 |
+
filtered_voices.append(metadata)
|
218 |
+
except Exception as e:
|
219 |
+
logger.error(f"Failed to fetch voices from API: {e}")
|
220 |
+
else:
|
221 |
+
logger.warning("API client is not available. Skipping public voices.")
|
222 |
|
223 |
+
# Add custom voices if needed
|
224 |
if accessibility in [VoiceAccessibility.ALL, VoiceAccessibility.ONLY_PRIVATE, VoiceAccessibility.ONLY_CUSTOM]:
|
225 |
for file in self.custom_dir.glob("*.json"):
|
226 |
with open(file, "r") as f:
|
|
|
238 |
return filtered_voices
|
239 |
|
240 |
def set_voice(self, voice_id: str):
|
241 |
+
# Check for local file with embedding
|
242 |
voice_file = None
|
243 |
api_file = self.api_dir / f"{voice_id}.json"
|
244 |
custom_file = self.custom_dir / f"{voice_id}.json"
|
|
|
249 |
voice_file = custom_file
|
250 |
|
251 |
if voice_file:
|
252 |
+
# Use local data
|
253 |
with open(voice_file, "r") as f:
|
254 |
self.current_voice = json.load(f)
|
255 |
else:
|
256 |
+
# Get full data with embedding from API
|
257 |
+
if self.client:
|
258 |
+
try:
|
259 |
+
voice_data = self.client.voices.get(id=voice_id)
|
260 |
+
# Save for future use
|
261 |
+
self._save_voice_to_api(voice_data)
|
262 |
+
self.current_voice = voice_data
|
263 |
+
except Exception as e:
|
264 |
+
logger.error(f"Failed to get voice {voice_id}: {e}")
|
265 |
+
raise ValueError(f"Voice with id {voice_id} not found")
|
266 |
+
else:
|
267 |
+
logger.error(f"Cannot set voice {voice_id} without API client.")
|
268 |
+
raise ValueError(f"Voice with id {voice_id} not found and API client is not available.")
|
269 |
|
270 |
self.set_language(self.current_voice['language'])
|
271 |
logger.info(f"Set current voice to {voice_id}")
|
|
|
306 |
self._emotions = {}
|
307 |
logger.info("Cleared all emotions")
|
308 |
return
|
309 |
+
|
310 |
self._emotions = {}
|
311 |
for emotion in emotions:
|
312 |
name = emotion.get("name")
|
313 |
level = emotion.get("level")
|
314 |
+
|
315 |
if name not in self.EMOTION_NAMES:
|
316 |
raise ValueError(f"Invalid emotion name. Choose from: {self.EMOTION_NAMES}")
|
317 |
if level not in self.EMOTION_LEVELS:
|
318 |
raise ValueError(f"Invalid emotion level. Choose from: {self.EMOTION_LEVELS}")
|
319 |
+
|
320 |
self._emotions[name] = level
|
321 |
+
|
322 |
logger.info(f"Set emotions: {self._emotions}")
|
323 |
|
324 |
def _get_voice_controls(self):
|
325 |
controls = {"speed": self._speed}
|
326 |
+
|
327 |
if self._emotions:
|
328 |
controls["emotion"] = [f"{name}:{level}" for name, level in self._emotions.items()]
|
329 |
+
|
330 |
return controls
|
331 |
|
332 |
def speak(self, text: str, output_file: str = None):
|
333 |
if not self.current_model or not (self.current_voice or self.current_mix):
|
334 |
raise ValueError("Please set a model and a voice or voice mix before speaking.")
|
335 |
+
if not self.client:
|
336 |
+
logger.error("Cannot generate speech without API client.")
|
337 |
+
raise ValueError("API client is not initialized. Cannot generate speech.")
|
338 |
|
339 |
voice_embedding = self.current_voice['embedding'] if self.current_voice else self.current_mix
|
340 |
|
|
|
347 |
}
|
348 |
|
349 |
voice_controls = self._get_voice_controls()
|
350 |
+
|
351 |
logger.info(f"Generating audio for text: {text[:50]}... with voice controls: {voice_controls}")
|
352 |
if self.current_language == 'en':
|
353 |
audio_data = self.client.tts.bytes(
|
|
|
361 |
)
|
362 |
else:
|
363 |
audio_data = self.client.tts.bytes(
|
364 |
+
model_id='sonic-multilingual',
|
365 |
+
transcript=improved_text,
|
366 |
+
voice_embedding=voice_embedding,
|
367 |
+
duration=None,
|
368 |
+
output_format=output_format,
|
369 |
+
language=self.current_language,
|
370 |
+
_experimental_voice_controls=voice_controls
|
371 |
+
)
|
372 |
|
373 |
if output_file is None:
|
374 |
output_file = f"output_{self.current_language}.wav"
|
|
|
382 |
|
383 |
def _get_embedding(self, source: Union[str, Dict]) -> Dict:
|
384 |
"""
|
385 |
+
Gets embedding from various sources: ID, file path, or existing embedding
|
386 |
"""
|
387 |
if isinstance(source, dict) and 'embedding' in source:
|
388 |
return source['embedding']
|
389 |
elif isinstance(source, str):
|
390 |
if os.path.isfile(source):
|
391 |
+
# If it's a file path, create a new embedding
|
392 |
+
if not self.client:
|
393 |
+
logger.error("Cannot clone voice without API client.")
|
394 |
+
raise ValueError("API client is not initialized. Cannot clone voice.")
|
395 |
return self.client.voices.clone(filepath=source)
|
396 |
else:
|
397 |
+
# If it's an ID, load the voice and return its embedding
|
398 |
voice = self.load_voice(source)
|
399 |
return voice['embedding']
|
400 |
else:
|
|
|
402 |
|
403 |
def create_mixed_embedding(self, components: List[Dict[str, Union[str, float, Dict]]]) -> Dict:
|
404 |
"""
|
405 |
+
Creates a mixed embedding from multiple components
|
406 |
+
|
407 |
+
:param components: List of dictionaries, each containing 'id' (or 'path', or embedding) and 'weight'
|
408 |
+
:return: New mixed embedding
|
409 |
"""
|
410 |
+
if not self.client:
|
411 |
+
logger.error("Cannot create mixed embedding without API client.")
|
412 |
+
raise ValueError("API client is not initialized. Cannot create mixed embedding.")
|
413 |
+
|
414 |
mix_components = []
|
415 |
for component in components:
|
416 |
embedding = self._get_embedding(component.get('id') or component.get('path') or component)
|
|
|
418 |
"embedding": embedding,
|
419 |
"weight": component['weight']
|
420 |
})
|
421 |
+
|
422 |
return self.client.voices.mix(mix_components)
|
423 |
|
424 |
def create_custom_voice(self, name: str, source: Union[str, List[Dict]], description: str = "", language: str = "en"):
|
425 |
"""
|
426 |
+
Creates a custom voice from a file or a mix of voices
|
427 |
+
|
428 |
+
:param name: Name of the new voice
|
429 |
+
:param source: File path or list of components to mix
|
430 |
+
:param description: Description of the voice
|
431 |
+
:param language: Language of the voice
|
432 |
+
:return: ID of the new voice
|
433 |
"""
|
434 |
logger.info(f"Creating custom voice: {name}")
|
435 |
+
|
436 |
if isinstance(source, str):
|
437 |
+
# If source is a string, assume it's a file path
|
438 |
+
if not self.client:
|
439 |
+
logger.error("Cannot clone voice without API client.")
|
440 |
+
raise ValueError("API client is not initialized. Cannot clone voice.")
|
441 |
embedding = self.client.voices.clone(filepath=source)
|
442 |
elif isinstance(source, list):
|
443 |
+
# If source is a list, create a mixed embedding
|
444 |
embedding = self.create_mixed_embedding(source)
|
445 |
else:
|
446 |
raise ValueError("Invalid source type. Expected file path or list of components.")
|
447 |
|
448 |
voice_id = f"custom_{len([f for f in self.custom_dir.glob('*.json')])}"
|
449 |
+
|
450 |
voice_data = {
|
451 |
"id": voice_id,
|
452 |
"name": name,
|
|
|
456 |
"is_public": False,
|
457 |
"is_custom": True
|
458 |
}
|
459 |
+
|
460 |
self._save_voice_to_custom(voice_data)
|
461 |
self.voices[voice_id] = voice_data
|
462 |
self.loaded_voices.add(voice_id)
|
463 |
+
|
464 |
logger.info(f"Created custom voice with id: {voice_id}")
|
465 |
return voice_id
|
466 |
|
467 |
def get_voice_id_by_name(self, name: str) -> List[str]:
    """
    Return the ids of every stored voice whose name equals ``name``.

    Each ``*.json`` file in ``self.api_dir`` and then ``self.custom_dir`` is
    read and its ``'name'`` field is compared with ``name`` (exact match).
    Files that cannot be read, are not valid JSON, or lack the ``'name'`` /
    ``'id'`` keys are skipped with a warning so that a single bad file does
    not abort the whole lookup.

    :param name: Voice name to look for
    :return: List of matching voice ids (empty if nothing matches)
    """
    matching_voices = []

    # Check both directories
    for directory in (self.api_dir, self.custom_dir):
        for file in directory.glob("*.json"):
            try:
                # Read as UTF-8 explicitly instead of relying on the
                # platform's locale-dependent default encoding.
                with open(file, "r", encoding="utf-8") as f:
                    voice_data = json.load(f)
                if voice_data['name'] == name:
                    matching_voices.append(voice_data['id'])
            except (OSError, ValueError, KeyError, TypeError) as e:
                # ValueError covers json.JSONDecodeError and UnicodeDecodeError;
                # KeyError/TypeError cover JSON that is not a voice record.
                logger.warning(f"Skipping unreadable voice file {file}: {e}")

    if not matching_voices:
        logger.warning(f"No voices found with name: {name}")
    else:
        logger.info(f"Found {len(matching_voices)} voice(s) with name: {name}")

    return matching_voices
|
484 |
|
485 |
def improve_tts_text(text: str, language: str = 'en') -> str:
|
|
|
489 |
def format_date(match):
|
490 |
date = datetime.strptime(match.group(), '%Y-%m-%d')
|
491 |
return date.strftime('%m/%d/%Y')
|
492 |
+
|
493 |
text = re.sub(r'\d{4}-\d{2}-\d{2}', format_date, text)
|
494 |
text = text.replace(' - ', ' - - ')
|
495 |
text = re.sub(r'\?(?![\s\n])', '??', text)
|
|
|
502 |
elif language.lower() in ['fr', 'fra', 'french']:
|
503 |
text = text.replace('M.', 'Monsieur')
|
504 |
|
505 |
+
return text
|