daswer123 commited on
Commit
cbaf23b
1 Parent(s): 8b52829

Upload 2 files

Browse files
Files changed (2) hide show
  1. app.py +481 -962
  2. sonic_api_wrapper.py +174 -115
app.py CHANGED
@@ -1,962 +1,481 @@
1
- import os
2
- import json
3
- from pathlib import Path
4
- from typing import List, Dict, Union, Optional
5
- from enum import Enum
6
- from cartesia import Cartesia
7
- from tqdm import tqdm
8
- from loguru import logger
9
- from datetime import datetime
10
- import re
11
-
12
class VoiceAccessibility(Enum):
    """Filter that selects which groups of voices a listing should include."""

    # Every voice: the ones returned by the API plus locally stored custom ones.
    ALL = "all"
    # Only the voices returned by the API.
    ONLY_PUBLIC = "only_public"
    # Only voices stored in the local custom directory.
    ONLY_PRIVATE = "only_private"
    # Only voices stored in the local custom directory.
    ONLY_CUSTOM = "only_custom"
17
-
18
class CartesiaVoiceManager:
    """Manage Cartesia voices and synthesize speech through the Cartesia client.

    Voices fetched from the API are cached as JSON files under
    ``<base_dir>/api``; locally created voices live under ``<base_dir>/custom``.
    """

    # Named presets accepted by the ``speed`` setter.
    SPEED_OPTIONS = {
        "slowest": -1.0,
        "slow": -0.5,
        "normal": 0.0,
        "fast": 0.5,
        "fastest": 1.0,
    }
    EMOTION_NAMES = ["anger", "positivity", "surprise", "sadness", "curiosity"]
    EMOTION_LEVELS = ["lowest", "low", "medium", "high", "highest"]

    # Becomes True once the log file sink is registered, so that creating
    # several managers does not attach the same file sink again and again.
    _log_sink_registered = False

    def __init__(self, api_key: str = None, base_dir: Path = None):
        """Create the API client and the on-disk voice directories.

        :param api_key: Cartesia API key; falls back to the
            ``CARTESIA_API_KEY`` environment variable.
        :param base_dir: Root directory for voice files (default ``voice2voice``).
        :raises ValueError: If no API key is available.
        """
        self.api_key = api_key or os.environ.get("CARTESIA_API_KEY")
        if not self.api_key:
            raise ValueError("API key is required. Please provide it as an argument or set CARTESIA_API_KEY environment variable.")

        self.client = Cartesia(api_key=self.api_key)
        self.current_voice = None
        self.current_model = None
        self.current_language = None
        self.current_mix = None

        # Directory layout
        self.base_dir = base_dir or Path("voice2voice")
        self.api_dir = self.base_dir / "api"
        self.custom_dir = self.base_dir / "custom"

        # Create the required directories
        self.api_dir.mkdir(parents=True, exist_ok=True)
        self.custom_dir.mkdir(parents=True, exist_ok=True)

        # In-memory voice cache
        self.voices = {}
        self.loaded_voices = set()

        # Speed and emotion settings
        self._speed = 0.0  # normal speed
        self._emotions = {}

        if not CartesiaVoiceManager._log_sink_registered:
            logger.add("cartesia_voice_manager.log", rotation="10 MB")
            CartesiaVoiceManager._log_sink_registered = True
        logger.info("CartesiaVoiceManager initialized")

    def _find_local_voice_file(self, voice_id: str) -> Optional[Path]:
        """Return the local JSON file for *voice_id* (api dir first), or None."""
        for directory in (self.api_dir, self.custom_dir):
            candidate = directory / f"{voice_id}.json"
            if candidate.exists():
                return candidate
        return None

    def load_voice(self, voice_id: str) -> Dict:
        """Return the data of a voice, using memory, then disk, then the API.

        :raises ValueError: If the voice is not available locally and the API
            lookup (or caching its result) fails.
        """
        if voice_id in self.loaded_voices:
            return self.voices[voice_id]

        voice_file = self._find_local_voice_file(voice_id)
        if voice_file:
            with open(voice_file, "r", encoding="utf-8") as f:
                voice_data = json.load(f)
            self.voices[voice_id] = voice_data
            self.loaded_voices.add(voice_id)
            logger.info(f"Loaded voice {voice_id} from {voice_file}")
            return voice_data

        # Not found locally: try to download it from the API
        try:
            voice_data = self.client.voices.get(id=voice_id)
            self._save_voice_to_api(voice_data)
            self.voices[voice_id] = voice_data
            self.loaded_voices.add(voice_id)
            logger.info(f"Loaded voice {voice_id} from API")
            return voice_data
        except Exception as e:
            logger.error(f"Failed to load voice {voice_id}: {e}")
            raise ValueError(f"Voice with id {voice_id} not found") from e

    def extract_voice_id_from_label(self, voice_label: str) -> Optional[str]:
        """Resolve a dropdown label to a voice ID.

        For example ``"John (en) [Custom]"`` is looked up among the labels
        produced by :meth:`get_voice_choices`; returns None when no label matches.
        """
        choices = self.get_voice_choices()
        voice_data = next((c for c in choices if c["label"] == voice_label), None)
        return voice_data["value"] if voice_data else None

    def get_voice_choices(self, language: str = None, accessibility: VoiceAccessibility = VoiceAccessibility.ALL) -> List[Dict]:
        """Return ``{"label", "value"}`` entries for a dropdown, sorted by label.

        ``value`` holds only the voice ID.
        """
        voices = self.list_available_voices(
            languages=[language] if language else None,
            accessibility=accessibility,
        )

        choices = [
            {
                "label": f"{voice['name']} ({voice['language']}){' [Custom]' if voice.get('is_custom') else ''}",
                "value": voice['id'],
            }
            for voice in voices
        ]
        return sorted(choices, key=lambda x: x['label'])

    def get_voice_info(self, voice_id: str) -> Dict:
        """Return the display fields (name, language, flags, id) of a voice."""
        voice = self.load_voice(voice_id)
        return {
            "name": voice['name'],
            "language": voice['language'],
            "is_custom": voice.get('is_custom', False),
            "is_public": voice.get('is_public', True),
            "id": voice['id'],
        }

    def _save_voice_to_api(self, voice_data: Dict):
        """Write *voice_data* to ``<api_dir>/<id>.json``."""
        voice_id = voice_data["id"]
        file_path = self.api_dir / f"{voice_id}.json"
        with open(file_path, "w", encoding="utf-8") as f:
            json.dump(voice_data, f, indent=2)
        logger.info(f"Saved API voice {voice_id} to {file_path}")

    def _save_voice_to_custom(self, voice_data: Dict):
        """Write *voice_data* to ``<custom_dir>/<id>.json``."""
        voice_id = voice_data["id"]
        file_path = self.custom_dir / f"{voice_id}.json"
        with open(file_path, "w", encoding="utf-8") as f:
            json.dump(voice_data, f, indent=2)
        logger.info(f"Saved custom voice {voice_id} to {file_path}")

    def update_voices_from_api(self):
        """Re-download every API voice to disk and refresh already loaded ones."""
        logger.info("Updating voices from API")
        api_voices = self.client.voices.list()
        for voice in tqdm(api_voices, desc="Updating voices"):
            voice_id = voice["id"]
            full_voice_data = self.client.voices.get(id=voice_id)
            self._save_voice_to_api(full_voice_data)
            if voice_id in self.loaded_voices:
                self.voices[voice_id] = full_voice_data
        logger.info(f"Updated {len(api_voices)} voices from API")

    def list_available_voices(self, languages: List[str] = None, accessibility: VoiceAccessibility = VoiceAccessibility.ALL) -> List[Dict]:
        """Return voice metadata (no embeddings) filtered by language and source.

        API failures are logged and yield no API voices; custom voice files
        that cannot be read are logged and skipped.
        """
        filtered_voices = []

        # Metadata only from the API (no embeddings)
        if accessibility in [VoiceAccessibility.ALL, VoiceAccessibility.ONLY_PUBLIC]:
            try:
                api_voices = self.client.voices.list()
                for voice in api_voices:
                    metadata = {
                        'id': voice['id'],
                        'name': voice['name'],
                        'language': voice['language'],
                        'is_public': True,
                    }
                    if languages is None or metadata['language'] in languages:
                        filtered_voices.append(metadata)
            except Exception as e:
                logger.error(f"Failed to fetch voices from API: {e}")

        # Add locally stored custom voices when requested
        if accessibility in [VoiceAccessibility.ALL, VoiceAccessibility.ONLY_PRIVATE, VoiceAccessibility.ONLY_CUSTOM]:
            for file in self.custom_dir.glob("*.json"):
                try:
                    with open(file, "r", encoding="utf-8") as f:
                        voice_data = json.load(f)
                    metadata = {
                        'id': voice_data['id'],
                        'name': voice_data['name'],
                        'language': voice_data['language'],
                        'is_public': False,
                        'is_custom': True,
                    }
                except (OSError, json.JSONDecodeError, KeyError) as e:
                    # One broken file must not hide all the other voices.
                    logger.warning(f"Skipping unreadable custom voice file {file}: {e}")
                    continue
                if languages is None or metadata['language'] in languages:
                    filtered_voices.append(metadata)

        logger.info(f"Found {len(filtered_voices)} voices matching criteria")
        return filtered_voices

    def set_voice(self, voice_id: str):
        """Make *voice_id* the current voice and switch language/model to match.

        The voice is read from its local JSON file when one exists, otherwise
        it is fetched from the API and cached.

        :raises ValueError: If the voice cannot be obtained.
        """
        voice_file = self._find_local_voice_file(voice_id)

        if voice_file:
            # Use the local data
            with open(voice_file, "r", encoding="utf-8") as f:
                self.current_voice = json.load(f)
        else:
            # Fetch the full data (with embedding) from the API
            try:
                voice_data = self.client.voices.get(id=voice_id)
                # Keep it for future use
                self._save_voice_to_api(voice_data)
                self.current_voice = voice_data
            except Exception as e:
                logger.error(f"Failed to get voice {voice_id}: {e}")
                raise ValueError(f"Voice with id {voice_id} not found") from e

        self.set_language(self.current_voice['language'])
        logger.info(f"Set current voice to {voice_id}")

    def set_model(self, language: str):
        """Pick ``sonic-english`` for English, ``sonic-multilingual`` otherwise."""
        if language.lower() in ['en', 'eng', 'english']:
            self.current_model = "sonic-english"
        else:
            self.current_model = "sonic-multilingual"
        self.current_language = language
        logger.info(f"Set model to {self.current_model} for language {language}")

    def set_language(self, language: str):
        """Set the current language and the model that goes with it."""
        self.current_language = language
        self.set_model(language)
        logger.info(f"Set language to {language}")

    @property
    def speed(self):
        """Current speech speed as a number in [-1, 1]."""
        return self._speed

    @speed.setter
    def speed(self, value):
        """Accept a ``SPEED_OPTIONS`` name or a number in [-1, 1].

        :raises ValueError: For unknown names, out-of-range numbers or other types.
        """
        if isinstance(value, str):
            if value not in self.SPEED_OPTIONS:
                raise ValueError(f"Invalid speed value. Use one of: {list(self.SPEED_OPTIONS.keys())}")
            self._speed = self.SPEED_OPTIONS[value]
        elif isinstance(value, (int, float)):
            if not -1 <= value <= 1:
                raise ValueError("Speed value must be between -1 and 1")
            self._speed = value
        else:
            raise ValueError("Speed must be a string from SPEED_OPTIONS or a number between -1 and 1")
        logger.info(f"Set speed to {self._speed}")

    def set_emotions(self, emotions: List[Dict[str, str]] = None):
        """Replace the emotion settings; ``None`` clears them.

        :param emotions: Dicts with ``"name"`` (one of ``EMOTION_NAMES``) and
            ``"level"`` (one of ``EMOTION_LEVELS``).
        :raises ValueError: On an invalid name or level; the previous
            settings are left untouched in that case.
        """
        if emotions is None:
            self._emotions = {}
            logger.info("Cleared all emotions")
            return

        # Validate into a scratch dict first so a bad entry cannot leave
        # the manager with a half-applied emotion set.
        validated = {}
        for emotion in emotions:
            name = emotion.get("name")
            level = emotion.get("level")

            if name not in self.EMOTION_NAMES:
                raise ValueError(f"Invalid emotion name. Choose from: {self.EMOTION_NAMES}")
            if level not in self.EMOTION_LEVELS:
                raise ValueError(f"Invalid emotion level. Choose from: {self.EMOTION_LEVELS}")

            validated[name] = level

        self._emotions = validated
        logger.info(f"Set emotions: {self._emotions}")

    def _get_voice_controls(self):
        """Build the voice-controls dict (speed plus optional ``name:level`` emotions)."""
        controls = {"speed": self._speed}

        if self._emotions:
            controls["emotion"] = [f"{name}:{level}" for name, level in self._emotions.items()]

        return controls

    def speak(self, text: str, output_file: str = None):
        """Synthesize *text* with the current voice (or mix) and save it as WAV.

        The text is passed through ``improve_tts_text`` before synthesis.

        :param output_file: Target path; defaults to ``output_<language>.wav``.
        :return: The path the audio was written to.
        :raises ValueError: If no model or no voice/mix has been set.
        """
        if not self.current_model or not (self.current_voice or self.current_mix):
            raise ValueError("Please set a model and a voice or voice mix before speaking.")

        voice_embedding = self.current_voice['embedding'] if self.current_voice else self.current_mix

        improved_text = improve_tts_text(text, self.current_language)

        output_format = {
            "container": "wav",
            "encoding": "pcm_f32le",
            "sample_rate": 44100,
        }

        voice_controls = self._get_voice_controls()

        logger.info(f"Generating audio for text: {text[:50]}... with voice controls: {voice_controls}")

        # Use the model chosen by set_model() so that every English alias
        # ('en', 'eng', 'english') is handled the same way here.
        request = {
            "model_id": self.current_model,
            "transcript": improved_text,
            "voice_embedding": voice_embedding,
            "duration": None,
            "output_format": output_format,
            "_experimental_voice_controls": voice_controls,
        }
        if self.current_model != "sonic-english":
            # The language is only sent for the multilingual model.
            request["language"] = self.current_language
        audio_data = self.client.tts.bytes(**request)

        if output_file is None:
            output_file = f"output_{self.current_language}.wav"

        # Make sure the target directory exists before writing.
        Path(output_file).parent.mkdir(parents=True, exist_ok=True)
        with open(output_file, "wb") as f:
            f.write(audio_data)
        logger.info(f"Audio saved to {output_file}")
        print(f"Audio generated and saved to {output_file}")

        return output_file

    def _get_embedding(self, source: Union[str, Dict]) -> Dict:
        """Obtain an embedding from a voice dict, an audio file path or a voice ID.

        :raises ValueError: If *source* is neither a str nor a dict with an
            ``'embedding'`` key.
        """
        if isinstance(source, dict) and 'embedding' in source:
            return source['embedding']
        elif isinstance(source, str):
            if os.path.isfile(source):
                # A file path: clone a new embedding from the audio
                return self.client.voices.clone(filepath=source)
            else:
                # A voice ID: load the voice and use its embedding
                voice = self.load_voice(source)
                return voice['embedding']
        else:
            raise ValueError(f"Invalid source type: {type(source)}")

    def create_mixed_embedding(self, components: List[Dict[str, Union[str, float, Dict]]]) -> Dict:
        """Create a mixed embedding from several weighted components.

        :param components: Dicts holding ``'id'`` (or ``'path'``, or an
            ``'embedding'``) together with a ``'weight'``.
        :return: The result of the client's ``voices.mix`` call.
        """
        mix_components = []
        for component in components:
            embedding = self._get_embedding(component.get('id') or component.get('path') or component)
            mix_components.append({
                "embedding": embedding,
                "weight": component['weight'],
            })

        return self.client.voices.mix(mix_components)

    def create_custom_voice(self, name: str, source: Union[str, List[Dict]], description: str = "", language: str = "en"):
        """Create a custom voice from an audio file or from a mix of voices.

        :param name: Name of the new voice.
        :param source: Path to an audio file, or a list of mix components.
        :param description: Description of the voice.
        :param language: Language of the voice.
        :return: ID of the new voice.
        :raises ValueError: If *source* is neither a str nor a list.
        """
        logger.info(f"Creating custom voice: {name}")

        if isinstance(source, str):
            # A string is treated as a path to an audio file
            embedding = self.client.voices.clone(filepath=source)
        elif isinstance(source, list):
            # A list is treated as components to mix
            embedding = self.create_mixed_embedding(source)
        else:
            raise ValueError("Invalid source type. Expected file path or list of components.")

        # Start from the file count, then skip ids that are already taken so
        # an existing voice is never overwritten after files were removed.
        index = len(list(self.custom_dir.glob('*.json')))
        voice_id = f"custom_{index}"
        while (self.custom_dir / f"{voice_id}.json").exists():
            index += 1
            voice_id = f"custom_{index}"

        voice_data = {
            "id": voice_id,
            "name": name,
            "description": description,
            "embedding": embedding,
            "language": language,
            "is_public": False,
            "is_custom": True,
        }

        self._save_voice_to_custom(voice_data)
        self.voices[voice_id] = voice_data
        self.loaded_voices.add(voice_id)

        logger.info(f"Created custom voice with id: {voice_id}")
        return voice_id

    def get_voice_id_by_name(self, name: str) -> List[str]:
        """Return the IDs of all locally stored voices whose name equals *name*."""
        matching_voices = []

        # Look in both directories
        for directory in [self.api_dir, self.custom_dir]:
            for file in directory.glob("*.json"):
                with open(file, "r", encoding="utf-8") as f:
                    voice_data = json.load(f)
                if voice_data['name'] == name:
                    matching_voices.append(voice_data['id'])

        if not matching_voices:
            logger.warning(f"No voices found with name: {name}")
        else:
            logger.info(f"Found {len(matching_voices)} voice(s) with name: {name}")

        return matching_voices
425
-
426
def improve_tts_text(text: str, language: str = 'en') -> str:
    """Normalize *text* for speech synthesis.

    Steps, in order: add a period after a word that ends the text or a line,
    rewrite ``YYYY-MM-DD`` dates as ``MM/DD/YYYY``, double the dash in
    `` - ``, double a ``?`` that is not followed by whitespace, drop quote
    characters, put a space before a ``?`` that follows a URL or e-mail
    address, and expand one abbreviation for Russian or French.

    :param text: Text to normalize.
    :param language: Language code or name; ``None`` is treated as "no
        language-specific rules".
    :return: The normalized text.
    """
    # Imported locally on purpose: the module-level name ``datetime`` is
    # rebound to the module itself by a later ``import datetime`` in this
    # file, which would make ``datetime.strptime`` fail here.
    from datetime import datetime as _datetime

    text = re.sub(r'(\w+)(\s*)$', r'\1.\2', text)
    text = re.sub(r'(\w+)(\s*\n)', r'\1.\2', text)

    def format_date(match):
        try:
            date = _datetime.strptime(match.group(), '%Y-%m-%d')
        except ValueError:
            # Looks like a date but is not a real one (e.g. month 13): keep as is.
            return match.group()
        return date.strftime('%m/%d/%Y')

    text = re.sub(r'\d{4}-\d{2}-\d{2}', format_date, text)
    text = text.replace(' - ', ' - - ')
    text = re.sub(r'\?(?![\s\n])', '??', text)
    text = text.replace('"', '')
    text = text.replace("'", '')
    text = re.sub(r'(https?://\S+|\S+@\S+\.\S+)\?', r'\1 ?', text)

    normalized_language = (language or '').lower()
    if normalized_language in ['ru', 'rus', 'russian']:
        text = text.replace('г.', 'году')
    elif normalized_language in ['fr', 'fra', 'french']:
        text = text.replace('M.', 'Monsieur')

    return text
447
-
448
-
449
- from typing import List
450
- import gradio as gr
451
- from pathlib import Path
452
- import os
453
- import json
454
-
455
# Base settings
DEFAULT_API_KEY = ""
LANGUAGE_CHOICES = ["all", "ru", "en", "es", "pl", "de", "fr"]
# Maps the "type" filter labels shown in the UI to accessibility filters
ACCESS_TYPE_MAP = {
    "Все": VoiceAccessibility.ALL,
    "Только кастомные": VoiceAccessibility.ONLY_CUSTOM,
    "Апи": VoiceAccessibility.ONLY_PUBLIC,
}
# Choice lists for the speed and emotion controls
SPEED_CHOICES = ["Очень медленно", "Медленно", "Нормально", "Быстро", "Очень быстро"]
EMOTION_CHOICES = ["Нейтрально", "Весело", "Грустно", "Злобно", "Удивленно", "Любопытно"]
EMOTION_INTENSITY = ["Очень слабая", "Слабая", "Средняя", "Сильная", "Очень сильная"]

# Global holder for the voice manager instance (None until it is initialized)
manager = None
470
-
471
- import datetime
472
-
473
def map_speed(speed_type: str) -> float:
    """Translate a speed label from the UI into a numeric speed.

    :param speed_type: One of the five Russian speed labels below.
    :return: The speed value between -1.0 and 1.0.
    :raises KeyError: If the label is not one of the known ones.
    """
    speed_map = {
        "Очень медленно": -1.0,
        "Медленно": -0.5,
        "Нормально": 0.0,
        "Быстро": 0.5,
        "Очень быстро": 1.0,
    }
    return speed_map[speed_type]
482
-
483
def generate_output_filename(language: str, output_dir: str = "output") -> str:
    """Build an output WAV path from the current local time and the language.

    :param language: Language tag appended to the file name.
    :param output_dir: Directory part of the path (default ``"output"``).
    :return: ``"<output_dir>/<YYYYmmdd_HHMMSS>_<language>.wav"``.
    """
    timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
    return f"{output_dir}/{timestamp}_{language}.wav"
487
-
488
def extract_voice_id_from_label(voice_label: str) -> Optional[str]:
    """Resolve a dropdown label to a voice ID through the global manager.

    For example ``"John (en) [Custom]"`` is matched against the labels the
    manager currently offers.

    :return: The voice ID, or None when the manager is not initialized, the
        label is unknown, or the lookup fails (the error is printed).
    """
    try:
        if not manager:
            return None
        # The manager already implements this exact lookup.
        return manager.extract_voice_id_from_label(voice_label)
    except Exception as e:
        print(f"❌ Ошибка при получении голосов: {str(e)}")
        return None
506
-
507
def initialize_manager(api_key: str) -> str:
    """(Re)create the global voice manager and return a status message.

    On failure the global manager is reset to None, so the rest of the UI
    does not keep working with a manager built from an earlier key while the
    status bar reports an error.
    """
    global manager
    try:
        manager = CartesiaVoiceManager(api_key=api_key, base_dir=Path("voice2voice"))
        return "✅ Менеджер инициализирован"
    except Exception as e:
        manager = None
        return f"❌ Ошибка: {str(e)}"
514
-
515
def get_initial_voices():
    """Return ``(labels, default_label)`` for the initial voice dropdown.

    When the manager is not initialized or offers no voices the result is
    ``([], None)``, so callers can always unpack two values.
    """
    if not manager:
        return [], None
    choices = manager.get_voice_choices()
    if not choices:
        return [], None
    return [c["label"] for c in choices], choices[0]["label"]
525
-
526
def update_voice_list(language: str, access_type: str, current_voice: str = None):
    """Rebuild the voice dropdown for the given filters.

    The current selection is kept when it is still offered; otherwise the
    first label (or None) is selected.

    :return: ``(dropdown update, status message)``.
    """
    if not manager:
        return gr.update(choices=[], value=None), "❌ Менеджер не инициализирован"

    try:
        language_filter = None if language == "all" else language
        available = manager.get_voice_choices(
            language=language_filter,
            accessibility=ACCESS_TYPE_MAP[access_type],
        )

        # Only the labels are shown in the dropdown
        labels = [entry["label"] for entry in available]

        if current_voice in labels:
            selected = current_voice
        elif labels:
            selected = labels[0]
        else:
            selected = None

        return gr.update(choices=labels, value=selected), "✅ Список голосов обновлен"
    except Exception as e:
        return gr.update(choices=[], value=None), f"❌ Ошибка: {str(e)}"
554
-
555
def update_voice_info(voice_label: str) -> str:
    """Return the info text for the selected voice, or an error message.

    An empty string is returned when there is no manager or no selection.
    """
    if not (manager and voice_label):
        return ""

    try:
        selected_id = extract_voice_id_from_label(voice_label)
        if not selected_id:
            return "❌ Голос не найден"

        info = manager.get_voice_info(selected_id)
        kind = 'Кастомный' if info.get('is_custom') else 'API'
        return (
            f"Имя: {info['name']}\n"
            f"Язык: {info['language']}\n"
            f"Тип: {kind}\n"
            f"ID: {info['id']}"
        )
    except Exception as e:
        return f"❌ Ошибка: {str(e)}"
575
-
576
def create_custom_voice(name: str, language: str, audio_data: tuple) -> tuple:
    """Create a custom voice from an audio file and refresh the voice list.

    :param name: Name of the new voice.
    :param language: Language of the new voice; "en" is used when empty.
    :param audio_data: Path to the audio file, or a tuple whose first item is
        that path.
    :return: ``(status message, dropdown update, voice info text)``.
    """
    if not manager:
        return "❌ Менеджер не инициализирован", gr.update(), ""

    if not name or not audio_data:
        return "❌ Необходимо указать имя и файл голоса", gr.update(), ""

    try:
        # Path to the audio file
        audio_path = audio_data[0] if isinstance(audio_data, tuple) else audio_data

        # The manager exposes create_custom_voice(name, source, description,
        # language); a string source is treated as a path to clone from.
        voice_id = manager.create_custom_voice(
            name=name,
            source=audio_path,
            # Same default as the manager method; a missing language would
            # later break model selection for this voice.
            language=language or "en",
        )

        # Refreshed list of voices
        choices = manager.get_voice_choices()
        choice_labels = [c["label"] for c in choices]

        # Label of the new voice (None if it is unexpectedly missing)
        new_voice_label = next(
            (c["label"] for c in choices if c["value"] == voice_id), None
        )

        # Info text for the new voice
        voice_info = manager.get_voice_info(voice_id)
        info_text = (
            f"Имя: {voice_info['name']}\n"
            f"Язык: {voice_info['language']}\n"
            f"Тип: Кастомный\n"
            f"ID: {voice_info['id']}"
        )

        return (
            f"✅ Создан кастомный голос: {voice_id}",
            gr.update(choices=choice_labels, value=new_voice_label),
            info_text,
        )

    except Exception as e:
        return f"❌ Ошибка создания голоса: {str(e)}", gr.update(), ""
625
-
626
def on_auto_language_change(auto_language: bool):
    """Show the manual language selector only while auto-detection is off."""
    show_manual_selector = not auto_language
    return gr.update(visible=show_manual_selector)
629
-
630
def map_emotions(selected_emotions, intensity):
    """Convert UI emotion labels and an intensity label into emotion dicts.

    "Нейтрально" and unknown labels are skipped. Every returned entry has the
    form ``{"name": <emotion>, "level": <level>}``.

    :param selected_emotions: Iterable of emotion labels; ``None`` means none.
    :param intensity: Intensity label applied to every selected emotion.
    :raises KeyError: If an emotion is selected and *intensity* is unknown.
    """
    emotion_map = {
        "Весело": "positivity",
        "Грустно": "sadness",
        "Злобно": "anger",
        "Удивленно": "surprise",
        "Любопытно": "curiosity",
    }

    intensity_map = {
        "Очень слабая": "lowest",
        "Слабая": "low",
        "Средняя": "medium",
        "Сильная": "high",
        "Очень сильная": "highest",
    }

    emotions = []
    # A cleared multiselect may arrive as None rather than an empty list.
    for emotion in selected_emotions or []:
        if emotion == "Нейтрально":
            continue
        if emotion in emotion_map:
            emotions.append({
                "name": emotion_map[emotion],
                "level": intensity_map[intensity],
            })
    return emotions
657
-
658
def generate_speech(
    text: str,
    voice_label: str,
    improve_text: bool,
    auto_language: bool,
    manual_language: str,
    speed_type: str,
    use_custom_speed: bool,
    custom_speed: float,
    emotions: List[str],
    emotion_intensity: str
):
    """Generate speech for *text* with the selected voice and settings.

    :param voice_label: Dropdown label of the voice to use.
    :param improve_text: UI flag for text normalization. NOTE(review):
        ``manager.speak`` always normalizes the text itself, so this flag
        cannot switch normalization off from here.
    :param auto_language: Use the language of the voice when True, otherwise
        *manual_language*.
    :param speed_type: Speed label, used unless *use_custom_speed* is set.
    :param custom_speed: Numeric speed used when *use_custom_speed* is set.
    :param emotions: Selected emotion labels.
    :param emotion_intensity: Intensity label for the selected emotions.
    :return: ``(path to the audio file or None, status message)``.
    """
    if not manager:
        return None, "❌ Менеджер не инициализирован"

    if not text or not voice_label:
        return None, "❌ Необходимо указать текст и голос"

    try:
        # Resolve the voice ID from the label
        voice_id = extract_voice_id_from_label(voice_label)
        if not voice_id:
            return None, "❌ Голос не найден"

        # Select the voice by ID
        manager.set_voice(voice_id)

        # With auto-detection off, the language is set manually
        if not auto_language:
            manager.set_language(manual_language)

        # Speed
        if use_custom_speed:
            manager.speed = custom_speed
        else:
            manager.speed = map_speed(speed_type)

        # Emotions
        if emotions and emotions != ["Нейтрально"]:
            manager.set_emotions(map_emotions(emotions, emotion_intensity))
        else:
            manager.set_emotions()  # reset emotions

        # Output file name
        output_file = generate_output_filename(
            manual_language if not auto_language else manager.current_language
        )

        # Create the output directory if it does not exist yet
        os.makedirs("output", exist_ok=True)

        # Pass the raw text: speak() already runs improve_tts_text, and
        # running it a second time here would distort the text (" - " and
        # "?" sequences grow on every pass).
        output_path = manager.speak(
            text=text,
            output_file=output_file
        )

        return output_path, "✅ Аудио сгенерировано успешно"

    except Exception as e:
        return None, f"❌ Ошибка генерации: {str(e)}"
736
-
737
- # Создание интерфейса
738
# NOTE(review): the original indentation of this block is not visible in the
# reviewed text; the layout nesting below is reconstructed from the section
# comments and should be confirmed against the running UI.
with gr.Blocks() as demo:
    # API key
    cartesia_api_key = gr.Textbox(
        label="API ключ Cartesia",
        value=DEFAULT_API_KEY,
        type='password'
    )

    with gr.Row():
        # Left column
        with gr.Column():
            cartesia_text = gr.TextArea(label="Текст")

            with gr.Accordion(label="Настройки", open=True):
                # Filters
                with gr.Accordion("Фильтры", open=True):
                    cartesia_setting_filter_lang = gr.Dropdown(
                        label="Язык",
                        choices=LANGUAGE_CHOICES,
                        value="all"
                    )
                    cartesia_setting_filter_type = gr.Dropdown(
                        label="Тип",
                        # Pass the labels explicitly instead of the dict itself
                        choices=list(ACCESS_TYPE_MAP),
                        value="Все"
                    )

                # Settings tabs
                with gr.Tab("Стандарт"):
                    cartesia_setting_voice_info = gr.Textbox(
                        label="Информация о голосе",
                        interactive=False
                    )
                    with gr.Row():
                        if not manager:
                            initial_choices = None
                            initial_value = None
                        else:
                            # get_initial_voices() may return None; fall back
                            # to an empty dropdown instead of failing to unpack.
                            initial_choices, initial_value = get_initial_voices() or (None, None)
                        cartesia_setting_voice = gr.Dropdown(
                            label="Голос",
                            choices=initial_choices,
                            value=initial_value
                        )
                        cartesia_setting_voice_update = gr.Button("Обновить")
                    cartesia_setting_auto_language = gr.Checkbox(
                        label="Автоматически определять язык из голоса",
                        value=True
                    )
                    cartesia_setting_manual_language = gr.Dropdown(
                        label="Язык озвучки",
                        choices=["ru", "en", "es", "fr", "de", "pl", "it", "ja", "ko", "zh", "hi"],
                        value="en",
                        visible=False  # hidden initially
                    )

                with gr.Tab("Кастомный"):
                    cartesia_setting_custom_name = gr.Textbox(label="Имя")
                    cartesia_setting_custom_lang = gr.Dropdown(
                        label="Язык",
                        choices=LANGUAGE_CHOICES[1:]  # everything except "all"
                    )
                    cartesia_setting_custom_voice = gr.Audio(label="Файл голоса", type='filepath')
                    cartesia_setting_custom_add = gr.Button("Добавить")

                # Disabled "mix" tab, kept for reference:
                # with gr.Tab("Микс"):
                #     cartesia_setting_custom_mix = gr.Dropdown(
                #         label="Выберите голоса",
                #         multiselect=True,
                #         choices=[]
                #     )
                #     cartesia_setting_custom_mix_update = gr.Button("Обновить")
                #     for i in range(5):
                #         setattr(
                #             demo,
                #             f'mix_voice_{i+1}',
                #             gr.Slider(
                #                 label=f"Голос {i+1}",
                #                 value=0.5,
                #                 minimum=0,
                #                 maximum=1,
                #                 step=0.01,
                #                 visible=False
                #             )
                #         )

            # Emotion control
            with gr.Accordion(label="Контроль эмоций (Beta)", open=False):
                cartesia_emotions = gr.Dropdown(
                    label="Эмоции",
                    multiselect=True,
                    choices=EMOTION_CHOICES
                )
                cartesia_emotions_intensity = gr.Dropdown(
                    label="Интенсивность",
                    choices=EMOTION_INTENSITY,
                    value="Средняя"
                )

            # Speed settings
            with gr.Accordion("Скорость", open=True):
                cartesia_speed_speed = gr.Dropdown(
                    label="Скорость речи",
                    choices=SPEED_CHOICES,
                    value="Нормально"
                )
                cartesia_speed_speed_allow_custom = gr.Checkbox(
                    label="Использовать кастомное значение скорости"
                )
                cartesia_speed_speed_custom = gr.Slider(
                    label="Скорость",
                    value=0,
                    minimum=-1,
                    maximum=1,
                    step=0.1,
                    visible=False
                )

            cartesia_setting_improve_text = gr.Checkbox(
                label="Улучшить текст согласно рекомендациям",
                value=True
            )

        # Right column
        with gr.Column():
            cartessia_status_bar = gr.Label(value="Статус")
            cartesia_output_audio = gr.Audio(
                label="Результат",
                interactive=False
            )
            cartesia_output_button = gr.Button("Генерация")

    # Events
    cartesia_api_key.change(
        initialize_manager,
        inputs=[cartesia_api_key],
        outputs=[cartessia_status_bar]
    )

    cartesia_setting_filter_lang.change(
        update_voice_list,
        inputs=[
            cartesia_setting_filter_lang,
            cartesia_setting_filter_type,
            cartesia_setting_voice  # pass the current selection
        ],
        outputs=[cartesia_setting_voice, cartessia_status_bar]
    )

    cartesia_setting_filter_type.change(
        update_voice_list,
        inputs=[
            cartesia_setting_filter_lang,
            cartesia_setting_filter_type,
            cartesia_setting_voice  # pass the current selection
        ],
        outputs=[cartesia_setting_voice, cartessia_status_bar]
    )

    cartesia_setting_voice.change(
        update_voice_info,
        inputs=[cartesia_setting_voice],
        outputs=[cartesia_setting_voice_info]
    )

    # update_voice_list returns two values (dropdown update and status), so
    # the refresh button needs both outputs; the current selection is passed
    # as well so that refreshing keeps it, like the filter handlers above.
    cartesia_setting_voice_update.click(
        update_voice_list,
        inputs=[
            cartesia_setting_filter_lang,
            cartesia_setting_filter_type,
            cartesia_setting_voice
        ],
        outputs=[cartesia_setting_voice, cartessia_status_bar]
    )

    cartesia_speed_speed_allow_custom.change(
        lambda x: gr.update(visible=x),
        inputs=[cartesia_speed_speed_allow_custom],
        outputs=[cartesia_speed_speed_custom]
    )

    cartesia_setting_custom_add.click(
        create_custom_voice,
        inputs=[
            cartesia_setting_custom_name,
            cartesia_setting_custom_lang,
            cartesia_setting_custom_voice
        ],
        outputs=[
            cartessia_status_bar,
            cartesia_setting_voice,  # refresh the dropdown
            cartesia_setting_voice_info  # refresh the voice info
        ]
    )

    # Language auto-detection toggle
    cartesia_setting_auto_language.change(
        on_auto_language_change,
        inputs=[cartesia_setting_auto_language],
        outputs=[cartesia_setting_manual_language]
    )

    cartesia_output_button.click(
        generate_speech,
        inputs=[
            cartesia_text,
            cartesia_setting_voice,
            cartesia_setting_improve_text,
            cartesia_setting_auto_language,
            cartesia_setting_manual_language,
            cartesia_speed_speed,
            cartesia_speed_speed_allow_custom,
            cartesia_speed_speed_custom,
            cartesia_emotions,
            cartesia_emotions_intensity
        ],
        outputs=[
            cartesia_output_audio,
            cartessia_status_bar
        ]
    )
955
-
956
- # Запуск приложения
957
if __name__ == "__main__":
    # The manager is not created at startup; it is initialized from the UI
    # once an API key is entered.
    # initialize_manager(DEFAULT_API_KEY)
    # Start the interface
    demo.launch()
 
1
+ from typing import List
2
+ import gradio as gr
3
+ from pathlib import Path
4
+ from sonic_api_wrapper import CartesiaVoiceManager, VoiceAccessibility, improve_tts_text
5
+ import os
6
+ import json
7
+ import datetime
8
+
9
# Holder for the voice manager instance (None until it is initialized)
manager = None

# Choices offered by the UI controls
LANGUAGE_CHOICES = [
    "all", "ru", "en", "es", "pl", "de", "fr", "tr",
    "pt", "zh", "ja", "hi", "it", "ko", "nl", "sv",
]
# Maps the "type" filter labels to accessibility filters
ACCESS_TYPE_MAP = {
    "All": VoiceAccessibility.ALL,
    "Custom Only": VoiceAccessibility.ONLY_CUSTOM,
    "Private Only": VoiceAccessibility.ONLY_PRIVATE,
    "API": VoiceAccessibility.ONLY_PUBLIC,
}
SPEED_CHOICES = ["Very Slow", "Slow", "Normal", "Fast", "Very Fast"]
EMOTION_CHOICES = ["Neutral", "Happy", "Sad", "Angry", "Surprised", "Curious"]
EMOTION_INTENSITY = ["Very Weak", "Weak", "Medium", "Strong", "Very Strong"]
23
+
24
# Preset speed labels shown in the UI mapped to the numeric speed value that
# generate_speech assigns to ``manager.speed``.
_SPEED_PRESETS = {
    "Very Slow": -1.0,
    "Slow": -0.5,
    "Normal": 0.0,
    "Fast": 0.5,
    "Very Fast": 1.0,
}


def map_speed(speed_type: str) -> float:
    """Translate a UI speed label into its numeric speed value.

    Args:
        speed_type: One of "Very Slow", "Slow", "Normal", "Fast", "Very Fast".

    Returns:
        The matching value, between -1.0 and 1.0.

    Raises:
        KeyError: If ``speed_type`` is not a known label (``None`` included).
            The message lists the valid labels so that whatever displays the
            error shows something actionable instead of just the bad key.
    """
    try:
        return _SPEED_PRESETS[speed_type]
    except KeyError:
        valid = ", ".join(_SPEED_PRESETS)
        raise KeyError(
            f"Unknown speed option {speed_type!r}; expected one of: {valid}"
        ) from None
33
+
34
def generate_output_filename(language: str) -> str:
    """Build a path under ``output/`` for a newly generated audio file.

    The name has the form ``output/<YYYYmmdd_HHMMSS_ffffff>_<language>.wav``.
    Microseconds are part of the timestamp so that two requests handled within
    the same second do not get the same path and overwrite each other.

    Args:
        language: Language code appended to the name. A falsy value is written
            as ``"unknown"`` rather than the literal text ``"None"``.

    Returns:
        The relative path as a string. The directory itself is not created
        here.
    """
    timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S_%f")
    return f"output/{timestamp}_{language or 'unknown'}.wav"
38
+
39
def extract_voice_id_from_label(voice_label: str) -> str:
    """Resolve a voice dropdown label to the voice ID behind it.

    The label (for example "John (en) [Custom]") is compared against the
    labels returned by ``manager.get_voice_choices()``.

    Returns:
        The ``"value"`` of the first entry whose label matches, or ``None``
        when there is no manager, no entry matches, or the lookup raises
        (the error is printed in that case).
    """
    try:
        if not manager:
            return None

        # Walk the available voices and stop at the first matching label.
        for entry in manager.get_voice_choices():
            if entry["label"] == voice_label:
                return entry["value"]
        return None
    except Exception as err:
        print(f"❌ Error getting voices: {str(err)}")
        return None
57
+
58
def initialize_manager(api_key: str) -> str:
    """Create the global ``manager`` from an API key.

    Args:
        api_key: Cartesia API key as typed into the UI. Surrounding
            whitespace is ignored.

    Returns:
        A status string: "✅ Manager initialized" on success, a "❌ ..."
        message when the key is missing/blank or construction raises.
        When construction raises, the global ``manager`` is reset to None.
    """
    global manager
    try:
        # Pasted keys often carry a stray space or newline; a key made only of
        # whitespace must count as missing, and a padded key must not be
        # forwarded as-is.
        if isinstance(api_key, str):
            api_key = api_key.strip()
        if not api_key:
            return "❌ API key is required to initialize the manager"

        manager = CartesiaVoiceManager(api_key=api_key, base_dir=Path("voice2voice"))
        return "✅ Manager initialized"
    except Exception as e:
        manager = None
        return f"❌ Error: {str(e)}"
69
+
70
def get_initial_voices():
    """Get initial list of voices.

    Returns:
        A ``(labels, first_label)`` tuple for the voice dropdown, or
        ``([], None)`` when there is no manager or it lists no voices.
    """
    if not manager:
        return [], None
    choices = manager.get_voice_choices()
    if not choices:
        return [], None
    # choices is known to be non-empty here, so the first label always exists.
    return [c["label"] for c in choices], choices[0]["label"]
79
+
80
def update_voice_list(language: str, access_type: str, current_voice: str = None):
    """Rebuild the voice dropdown for the current filters.

    Args:
        language: Language filter; "all" means no language filter.
        access_type: Key of ``ACCESS_TYPE_MAP`` selecting the voice type.
        current_voice: Label selected right now; it stays selected if it is
            still present after filtering.

    Returns:
        A ``(dropdown_update, status_message)`` tuple. On any failure the
        dropdown is emptied and the status carries the error text.
    """
    if not manager:
        return gr.update(choices=[], value=None), "❌ Manager is not initialized"

    try:
        language_filter = language if language != "all" else None
        entries = manager.get_voice_choices(
            language=language_filter,
            accessibility=ACCESS_TYPE_MAP[access_type],
        )
        labels = [entry["label"] for entry in entries]

        # Keep the user's selection when possible, else fall back to the
        # first voice, else select nothing.
        if current_voice in labels:
            selected = current_voice
        elif labels:
            selected = labels[0]
        else:
            selected = None

        dropdown = gr.update(choices=labels, value=selected)
        return dropdown, "✅ Voice list updated"
    except Exception as err:
        return gr.update(choices=[], value=None), f"❌ Error: {str(err)}"
108
+
109
def update_voice_info(voice_label: str) -> str:
    """Build the text for the "Voice Information" box.

    Args:
        voice_label: Label currently selected in the voice dropdown.

    Returns:
        A four-line summary (name, language, type, ID); an empty string when
        there is no manager or no label; or a "❌ ..." message when the label
        cannot be resolved or the lookup raises.
    """
    if not manager or not voice_label:
        return ""

    try:
        voice_id = extract_voice_id_from_label(voice_label)
        if not voice_id:
            # Same marker and wording that generate_speech reports for an
            # unresolvable label, so failures read consistently.
            return "❌ Voice not found"

        info = manager.get_voice_info(voice_id)
        return (
            f"Name: {info['name']}\n"
            f"Language: {info['language']}\n"
            f"Type: {'Custom' if info.get('is_custom') else 'API'}\n"
            f"ID: {info['id']}"
        )
    except Exception as e:
        return f"❌ Error: {str(e)}"
129
+
130
def create_custom_voice(name: str, language: str, audio_data: tuple) -> tuple:
    """Create a custom voice from an audio sample and refresh the UI.

    Args:
        name: Display name for the new voice.
        language: Language code stored with the voice.
        audio_data: The uploaded sample: either the path itself, or a tuple
            whose first item is used as the path.

    Returns:
        A ``(status, dropdown_update, info_text)`` tuple. On failure the
        dropdown update is empty (no change) and the info text is "".
    """
    if not manager:
        return "❌ Manager is not initialized", gr.update(), ""

    if not name or not audio_data:
        return "❌ Name and voice file are required", gr.update(), ""

    try:
        # Get the audio file path
        audio_path = audio_data[0] if isinstance(audio_data, tuple) else audio_data

        # Create the voice
        voice_id = manager.create_custom_voice(
            name=name,
            source=audio_path,
            language=language
        )

        # Get updated list of voices
        choices = manager.get_voice_choices()
        choice_labels = [c["label"] for c in choices]

        # Find label for the new voice. A bare next() would raise
        # StopIteration when the voice is missing from the list, and since
        # that exception has an empty message the user would see a blank
        # error even though the voice was created.
        new_voice_label = next(
            (c["label"] for c in choices if c["value"] == voice_id), None
        )
        if new_voice_label is None:
            # Refresh the choices but leave the current selection alone.
            dropdown_update = gr.update(choices=choice_labels)
        else:
            dropdown_update = gr.update(choices=choice_labels, value=new_voice_label)

        # Get info of the new voice
        voice_info = manager.get_voice_info(voice_id)
        info_text = (
            f"Name: {voice_info['name']}\n"
            f"Language: {voice_info['language']}\n"
            f"Type: Custom\n"
            f"ID: {voice_info['id']}"
        )

        return (
            f"✅ Custom voice created: {voice_id}",
            dropdown_update,
            info_text
        )

    except Exception as e:
        return f"❌ Error creating voice: {str(e)}", gr.update(), ""
179
+
180
def on_auto_language_change(auto_language: bool):
    """Show the manual language dropdown only while auto-detection is off."""
    show_manual_language = not auto_language
    return gr.update(visible=show_manual_language)
183
+
184
# UI emotion labels mapped to the emotion names sent to the voice manager.
# "Neutral" is intentionally absent: it contributes no emotion.
_EMOTION_NAMES = {
    "Happy": "positivity",
    "Sad": "sadness",
    "Angry": "anger",
    "Surprised": "surprise",
    "Curious": "curiosity",
}

# UI intensity labels mapped to the level names sent to the voice manager.
_INTENSITY_LEVELS = {
    "Very Weak": "lowest",
    "Weak": "low",
    "Medium": "medium",
    "Strong": "high",
    "Very Strong": "highest",
}


def map_emotions(selected_emotions, intensity):
    """Convert UI emotion labels and one intensity label to emotion dicts.

    Args:
        selected_emotions: Iterable of UI labels, or ``None`` / empty when
            nothing is selected. "Neutral" and unknown labels are skipped.
        intensity: UI intensity label applied to every selected emotion.

    Returns:
        A list of ``{"name": ..., "level": ...}`` dicts in selection order;
        empty when no mappable emotion was selected.

    Raises:
        KeyError: If at least one mappable emotion is selected and
            ``intensity`` is not a known label.
    """
    if not selected_emotions:
        return []

    names = [
        _EMOTION_NAMES[label]
        for label in selected_emotions
        if label in _EMOTION_NAMES
    ]
    if not names:
        # Nothing to emit, so the intensity is never consulted.
        return []

    try:
        level = _INTENSITY_LEVELS[intensity]
    except KeyError:
        valid = ", ".join(_INTENSITY_LEVELS)
        raise KeyError(
            f"Unknown emotion intensity {intensity!r}; expected one of: {valid}"
        ) from None

    return [{"name": name, "level": level} for name in names]
211
+
212
def generate_speech(
    text: str,
    voice_label: str,
    improve_text: bool,
    auto_language: bool,
    manual_language: str,
    speed_type: str,
    use_custom_speed: bool,
    custom_speed: float,
    emotions: List[str],
    emotion_intensity: str
):
    """Generate speech considering language settings.

    Args:
        text: Text to speak.
        voice_label: Label selected in the voice dropdown.
        improve_text: Whether to pass the text through improve_tts_text first.
        auto_language: When True the language set by the voice is kept;
            otherwise ``manual_language`` is applied.
        manual_language: Language code used when ``auto_language`` is False.
        speed_type: Preset speed label, used unless ``use_custom_speed``.
        use_custom_speed: Whether ``custom_speed`` replaces the preset.
        custom_speed: Numeric speed used when ``use_custom_speed`` is True.
        emotions: Selected emotion labels.
        emotion_intensity: Intensity label applied to the emotions.

    Returns:
        ``(output_path, status)`` on success, ``(None, error_message)`` when
        a precondition fails or anything raises.
    """
    if not manager:
        return None, "❌ Manager is not initialized"

    if not text or not voice_label:
        return None, "❌ Text and voice are required"

    try:
        # Extract voice ID from label
        voice_id = extract_voice_id_from_label(voice_label)
        if not voice_id:
            return None, "❌ Voice not found"

        # Set the voice by ID
        manager.set_voice(voice_id)

        # If auto-detect is off, set language manually
        if not auto_language:
            manager.set_language(manual_language)

        # Set speed
        if use_custom_speed:
            manager.speed = custom_speed
        else:
            manager.speed = map_speed(speed_type)

        # Set emotions
        if emotions and emotions != ["Neutral"]:
            manager.set_emotions(map_emotions(emotions, emotion_intensity))
        else:
            manager.set_emotions()  # Reset emotions

        # Generate output file name
        output_file = generate_output_filename(
            manual_language if not auto_language else manager.current_language
        )

        # Create output directory if it doesn't exist
        os.makedirs("output", exist_ok=True)

        # Generate speech
        output_path = manager.speak(
            text=text if not improve_text else improve_tts_text(text, manager.current_language),
            output_file=output_file
        )

        # Success marker matches the other success messages in this module.
        return output_path, "✅ Audio generated successfully"

    except Exception as e:
        return None, f"❌ Error generating speech: {str(e)}"
275
+
276
def initialize_manager_and_update(api_key: str, language: str, access_type: str, current_voice: str = None):
    """Initialize the manager and, if that worked, refresh the voice list.

    Returns:
        ``(status, dropdown_update)``. When a manager exists after
        initialization, the status joins the initialization and voice-list
        messages with a newline; otherwise the dropdown is emptied and only
        the initialization status is returned.
    """
    status = initialize_manager(api_key)
    if not manager:
        return status, gr.update(choices=[], value=None)

    voice_update, voice_status = update_voice_list(language, access_type, current_voice)
    return f"{status}\n{voice_status}", voice_update
284
+
285
+ # Create the interface
286
+ with gr.Blocks() as demo:
287
+ # API key
288
+ cartesia_api_key = gr.Textbox(
289
+ label="Cartesia API Key",
290
+ value="", # No default API key
291
+ type='password'
292
+ )
293
+
294
+ with gr.Row():
295
+ # Left column
296
+ with gr.Column():
297
+ cartesia_text = gr.TextArea(label="Text")
298
+
299
+ with gr.Accordion(label="Settings", open=True):
300
+ # Filters
301
+ with gr.Accordion("Filters", open=True):
302
+ cartesia_setting_filter_lang = gr.Dropdown(
303
+ label="Language",
304
+ choices=LANGUAGE_CHOICES,
305
+ value="all"
306
+ )
307
+ cartesia_setting_filter_type = gr.Dropdown(
308
+ label="Type",
309
+ choices=list(ACCESS_TYPE_MAP.keys()),
310
+ value="All"
311
+ )
312
+
313
+ # Settings tabs
314
+ with gr.Tab("Standard"):
315
+ cartesia_setting_voice_info = gr.Textbox(
316
+ label="Voice Information",
317
+ interactive=False
318
+ )
319
+ with gr.Row():
320
+ initial_choices, initial_value = get_initial_voices()
321
+ cartesia_setting_voice = gr.Dropdown(
322
+ label="Voice",
323
+ choices=initial_choices,
324
+ value=initial_value
325
+ )
326
+ cartesia_setting_voice_update = gr.Button("Refresh")
327
+ cartesia_setting_auto_language = gr.Checkbox(
328
+ label="Automatically detect language from voice",
329
+ value=True
330
+ )
331
+ cartesia_setting_manual_language = gr.Dropdown(
332
+ label="Speech Language",
333
+ choices=["ru", "en", "es", "fr", "de", "pl", "it", "ja", "ko", "zh", "hi"],
334
+ value="en",
335
+ visible=False # Initially hidden
336
+ )
337
+
338
+ with gr.Tab("Custom"):
339
+ cartesia_setting_custom_name = gr.Textbox(label="Name")
340
+ cartesia_setting_custom_lang = gr.Dropdown(
341
+ label="Language",
342
+ choices=LANGUAGE_CHOICES[1:] # Exclude "all"
343
+ )
344
+ cartesia_setting_custom_voice = gr.Audio(label="Voice File", type='filepath')
345
+ cartesia_setting_custom_add = gr.Button("Add")
346
+
347
+ # Emotion control
348
+ with gr.Accordion(label="Emotion Control (Beta)", open=False):
349
+ cartesia_emotions = gr.Dropdown(
350
+ label="Emotions",
351
+ multiselect=True,
352
+ choices=EMOTION_CHOICES
353
+ )
354
+ cartesia_emotions_intensity = gr.Dropdown(
355
+ label="Intensity",
356
+ choices=EMOTION_INTENSITY,
357
+ value="Medium"
358
+ )
359
+
360
+ # Speed settings
361
+ with gr.Accordion("Speed", open=True):
362
+ cartesia_speed_speed = gr.Dropdown(
363
+ label="Speech Speed",
364
+ choices=SPEED_CHOICES,
365
+ value="Normal"
366
+ )
367
+ cartesia_speed_speed_allow_custom = gr.Checkbox(
368
+ label="Use custom speed value"
369
+ )
370
+ cartesia_speed_speed_custom = gr.Slider(
371
+ label="Speed",
372
+ value=0,
373
+ minimum=-1,
374
+ maximum=1,
375
+ step=0.1,
376
+ visible=False
377
+ )
378
+
379
+ cartesia_setting_improve_text = gr.Checkbox(
380
+ label="Improve text according to recommendations",
381
+ value=True
382
+ )
383
+
384
+ # Right column
385
+ with gr.Column():
386
+ cartessia_status_bar = gr.Label(value="Status")
387
+ cartesia_output_audio = gr.Audio(
388
+ label="Result",
389
+ interactive=False
390
+ )
391
+ cartesia_output_button = gr.Button("Generate")
392
+
393
+ # Events
394
+ cartesia_api_key.change(
395
+ initialize_manager_and_update,
396
+ inputs=[cartesia_api_key, cartesia_setting_filter_lang, cartesia_setting_filter_type, cartesia_setting_voice],
397
+ outputs=[cartessia_status_bar, cartesia_setting_voice]
398
+ )
399
+
400
+ cartesia_setting_filter_lang.change(
401
+ update_voice_list,
402
+ inputs=[
403
+ cartesia_setting_filter_lang,
404
+ cartesia_setting_filter_type,
405
+ cartesia_setting_voice # Pass the current selection
406
+ ],
407
+ outputs=[cartesia_setting_voice, cartessia_status_bar]
408
+ )
409
+
410
+ cartesia_setting_filter_type.change(
411
+ update_voice_list,
412
+ inputs=[
413
+ cartesia_setting_filter_lang,
414
+ cartesia_setting_filter_type,
415
+ cartesia_setting_voice # Pass the current selection
416
+ ],
417
+ outputs=[cartesia_setting_voice, cartessia_status_bar]
418
+ )
419
+
420
+ cartesia_setting_voice.change(
421
+ update_voice_info,
422
+ inputs=[cartesia_setting_voice],
423
+ outputs=[cartesia_setting_voice_info]
424
+ )
425
+
426
+ cartesia_setting_voice_update.click(
427
+ update_voice_list,
428
+ inputs=[cartesia_setting_filter_lang, cartesia_setting_filter_type, cartesia_setting_voice],
429
+ outputs=[cartesia_setting_voice, cartessia_status_bar]
430
+ )
431
+
432
+ cartesia_speed_speed_allow_custom.change(
433
+ lambda x: gr.update(visible=x),
434
+ inputs=[cartesia_speed_speed_allow_custom],
435
+ outputs=[cartesia_speed_speed_custom]
436
+ )
437
+
438
+ cartesia_setting_custom_add.click(
439
+ create_custom_voice,
440
+ inputs=[
441
+ cartesia_setting_custom_name,
442
+ cartesia_setting_custom_lang,
443
+ cartesia_setting_custom_voice
444
+ ],
445
+ outputs=[
446
+ cartessia_status_bar,
447
+ cartesia_setting_voice, # Update dropdown
448
+ cartesia_setting_voice_info # Update voice info
449
+ ]
450
+ )
451
+
452
+ cartesia_setting_auto_language.change(
453
+ on_auto_language_change,
454
+ inputs=[cartesia_setting_auto_language],
455
+ outputs=[cartesia_setting_manual_language]
456
+ )
457
+
458
+ cartesia_output_button.click(
459
+ generate_speech,
460
+ inputs=[
461
+ cartesia_text,
462
+ cartesia_setting_voice,
463
+ cartesia_setting_improve_text,
464
+ cartesia_setting_auto_language,
465
+ cartesia_setting_manual_language,
466
+ cartesia_speed_speed,
467
+ cartesia_speed_speed_allow_custom,
468
+ cartesia_speed_speed_custom,
469
+ cartesia_emotions,
470
+ cartesia_emotions_intensity
471
+ ],
472
+ outputs=[
473
+ cartesia_output_audio,
474
+ cartessia_status_bar
475
+ ]
476
+ )
477
+
478
# Run the app
if __name__ == "__main__":
    # Turn on Gradio's request queue before starting the server.
    demo.queue()
    # share=True additionally requests a public share link from Gradio.
    # NOTE(review): confirm a public link is intended for this deployment.
    demo.launch(share=True)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
sonic_api_wrapper.py CHANGED
@@ -3,12 +3,16 @@ import json
3
  from pathlib import Path
4
  from typing import List, Dict, Union, Optional
5
  from enum import Enum
6
- from cartesia import Cartesia
7
  from tqdm import tqdm
8
  from loguru import logger
9
  from datetime import datetime
10
  import re
11
 
 
 
 
 
 
12
  class VoiceAccessibility(Enum):
13
  ALL = "all"
14
  ONLY_PUBLIC = "only_public"
@@ -28,49 +32,72 @@ class CartesiaVoiceManager:
28
 
29
  def __init__(self, api_key: str = None, base_dir: Path = None):
30
  self.api_key = api_key or os.environ.get("CARTESIA_API_KEY")
31
- if not self.api_key:
32
- raise ValueError("API key is required. Please provide it as an argument or set CARTESIA_API_KEY environment variable.")
33
-
34
- self.client = Cartesia(api_key=self.api_key)
 
 
 
35
  self.current_voice = None
36
  self.current_model = None
37
  self.current_language = None
38
  self.current_mix = None
39
-
40
- # Настройка директорий
41
  self.base_dir = base_dir or Path("voice2voice")
42
  self.api_dir = self.base_dir / "api"
43
  self.custom_dir = self.base_dir / "custom"
44
-
45
- # Создание необходимых директорий
46
  self.api_dir.mkdir(parents=True, exist_ok=True)
47
  self.custom_dir.mkdir(parents=True, exist_ok=True)
48
-
49
- # Инициализация голосов
50
  self.voices = {}
51
  self.loaded_voices = set()
52
-
53
- # Настройки скорости и эмоций
54
  self._speed = 0.0 # normal speed
55
  self._emotions = {}
56
-
57
  logger.add("cartesia_voice_manager.log", rotation="10 MB")
58
  logger.info("CartesiaVoiceManager initialized")
59
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
60
  def load_voice(self, voice_id: str) -> Dict:
61
  if voice_id in self.loaded_voices:
62
  return self.voices[voice_id]
63
-
64
  voice_file = None
65
- # Поиск файла голоса в api и custom директориях
66
  api_file = self.api_dir / f"{voice_id}.json"
67
  custom_file = self.custom_dir / f"{voice_id}.json"
68
-
69
  if api_file.exists():
70
  voice_file = api_file
71
  elif custom_file.exists():
72
  voice_file = custom_file
73
-
74
  if voice_file:
75
  with open(voice_file, "r") as f:
76
  voice_data = json.load(f)
@@ -79,32 +106,36 @@ class CartesiaVoiceManager:
79
  logger.info(f"Loaded voice {voice_id} from {voice_file}")
80
  return voice_data
81
  else:
82
- # Если голос не найден локально, пытаемся загрузить из API
83
- try:
84
- voice_data = self.client.voices.get(id=voice_id)
85
- self._save_voice_to_api(voice_data)
86
- self.voices[voice_id] = voice_data
87
- self.loaded_voices.add(voice_id)
88
- logger.info(f"Loaded voice {voice_id} from API")
89
- return voice_data
90
- except Exception as e:
91
- logger.error(f"Failed to load voice {voice_id}: {e}")
92
- raise ValueError(f"Voice with id {voice_id} not found")
 
 
 
 
93
 
94
  def extract_voice_id_from_label(self, voice_label: str) -> Optional[str]:
95
  """
96
- Извлекает ID голоса из метки в dropdown
97
- Например: "John (en) [Custom]" -> извлечет ID из словаря голосов
98
  """
99
- # Получаем все голоса и их метки
100
  choices = self.get_voice_choices()
101
- # Находим голос по метке и берем его ID
102
  voice_data = next((c for c in choices if c["label"] == voice_label), None)
103
  return voice_data["value"] if voice_data else None
104
-
105
  def get_voice_choices(self, language: str = None, accessibility: VoiceAccessibility = VoiceAccessibility.ALL) -> List[Dict]:
106
  """
107
- Возвращает список голосов для dropdown меню
108
  """
109
  voices = self.list_available_voices(
110
  languages=[language] if language else None,
@@ -113,17 +144,17 @@ class CartesiaVoiceManager:
113
 
114
  choices = []
115
  for voice in voices:
116
- # Сохраняем только ID в value
117
  choices.append({
118
  "label": f"{voice['name']} ({voice['language']}){' [Custom]' if voice.get('is_custom') else ''}",
119
- "value": voice['id'] # Здесь только ID
120
  })
121
 
122
  return sorted(choices, key=lambda x: x['label'])
123
 
124
  def get_voice_info(self, voice_id: str) -> Dict:
125
  """
126
- Возвращает информацию о голосе для отображения
127
  """
128
  voice = self.load_voice(voice_id)
129
  return {
@@ -133,7 +164,7 @@ class CartesiaVoiceManager:
133
  "is_public": voice.get('is_public', True),
134
  "id": voice['id']
135
  }
136
-
137
  def _save_voice_to_api(self, voice_data: Dict):
138
  voice_id = voice_data["id"]
139
  file_path = self.api_dir / f"{voice_id}.json"
@@ -149,37 +180,47 @@ class CartesiaVoiceManager:
149
  logger.info(f"Saved custom voice {voice_id} to {file_path}")
150
 
151
  def update_voices_from_api(self):
 
 
 
 
152
  logger.info("Updating voices from API")
153
- api_voices = self.client.voices.list()
154
- for voice in tqdm(api_voices, desc="Updating voices"):
155
- voice_id = voice["id"]
156
- full_voice_data = self.client.voices.get(id=voice_id)
157
- self._save_voice_to_api(full_voice_data)
158
- if voice_id in self.loaded_voices:
159
- self.voices[voice_id] = full_voice_data
160
- logger.info(f"Updated {len(api_voices)} voices from API")
 
 
 
161
 
162
  def list_available_voices(self, languages: List[str] = None, accessibility: VoiceAccessibility = VoiceAccessibility.ALL) -> List[Dict]:
163
  filtered_voices = []
164
 
165
- # Получаем только метаданные из API (без эмбеддингов)
166
  if accessibility in [VoiceAccessibility.ALL, VoiceAccessibility.ONLY_PUBLIC]:
167
- try:
168
- api_voices = self.client.voices.list()
169
- # Сохраняем только метаданные
170
- for voice in api_voices:
171
- metadata = {
172
- 'id': voice['id'],
173
- 'name': voice['name'],
174
- 'language': voice['language'],
175
- 'is_public': True
176
- }
177
- if languages is None or metadata['language'] in languages:
178
- filtered_voices.append(metadata)
179
- except Exception as e:
180
- logger.error(f"Failed to fetch voices from API: {e}")
 
 
 
181
 
182
- # Добавляем кастомные голоса если нужно
183
  if accessibility in [VoiceAccessibility.ALL, VoiceAccessibility.ONLY_PRIVATE, VoiceAccessibility.ONLY_CUSTOM]:
184
  for file in self.custom_dir.glob("*.json"):
185
  with open(file, "r") as f:
@@ -197,7 +238,7 @@ class CartesiaVoiceManager:
197
  return filtered_voices
198
 
199
  def set_voice(self, voice_id: str):
200
- # Проверяем наличие локального файла с эмбеддингом
201
  voice_file = None
202
  api_file = self.api_dir / f"{voice_id}.json"
203
  custom_file = self.custom_dir / f"{voice_id}.json"
@@ -208,19 +249,23 @@ class CartesiaVoiceManager:
208
  voice_file = custom_file
209
 
210
  if voice_file:
211
- # Используем локальные данные
212
  with open(voice_file, "r") as f:
213
  self.current_voice = json.load(f)
214
  else:
215
- # Получаем полные данные с эмбеддингом из API
216
- try:
217
- voice_data = self.client.voices.get(id=voice_id)
218
- # Сохраняем для будущего использования
219
- self._save_voice_to_api(voice_data)
220
- self.current_voice = voice_data
221
- except Exception as e:
222
- logger.error(f"Failed to get voice {voice_id}: {e}")
223
- raise ValueError(f"Voice with id {voice_id} not found")
 
 
 
 
224
 
225
  self.set_language(self.current_voice['language'])
226
  logger.info(f"Set current voice to {voice_id}")
@@ -261,32 +306,35 @@ class CartesiaVoiceManager:
261
  self._emotions = {}
262
  logger.info("Cleared all emotions")
263
  return
264
-
265
  self._emotions = {}
266
  for emotion in emotions:
267
  name = emotion.get("name")
268
  level = emotion.get("level")
269
-
270
  if name not in self.EMOTION_NAMES:
271
  raise ValueError(f"Invalid emotion name. Choose from: {self.EMOTION_NAMES}")
272
  if level not in self.EMOTION_LEVELS:
273
  raise ValueError(f"Invalid emotion level. Choose from: {self.EMOTION_LEVELS}")
274
-
275
  self._emotions[name] = level
276
-
277
  logger.info(f"Set emotions: {self._emotions}")
278
 
279
  def _get_voice_controls(self):
280
  controls = {"speed": self._speed}
281
-
282
  if self._emotions:
283
  controls["emotion"] = [f"{name}:{level}" for name, level in self._emotions.items()]
284
-
285
  return controls
286
 
287
  def speak(self, text: str, output_file: str = None):
288
  if not self.current_model or not (self.current_voice or self.current_mix):
289
  raise ValueError("Please set a model and a voice or voice mix before speaking.")
 
 
 
290
 
291
  voice_embedding = self.current_voice['embedding'] if self.current_voice else self.current_mix
292
 
@@ -299,7 +347,7 @@ class CartesiaVoiceManager:
299
  }
300
 
301
  voice_controls = self._get_voice_controls()
302
-
303
  logger.info(f"Generating audio for text: {text[:50]}... with voice controls: {voice_controls}")
304
  if self.current_language == 'en':
305
  audio_data = self.client.tts.bytes(
@@ -313,13 +361,14 @@ class CartesiaVoiceManager:
313
  )
314
  else:
315
  audio_data = self.client.tts.bytes(
316
- model_id='sonic-multilingual',
317
- transcript=improved_text,
318
- voice_embedding=voice_embedding,
319
- duration=None,
320
- output_format=output_format,
321
- language=self.current_language,
322
- _experimental_voice_controls=voice_controls)
 
323
 
324
  if output_file is None:
325
  output_file = f"output_{self.current_language}.wav"
@@ -333,16 +382,19 @@ class CartesiaVoiceManager:
333
 
334
  def _get_embedding(self, source: Union[str, Dict]) -> Dict:
335
  """
336
- Получает эмбеддинг из различных источников: ID, путь к файлу или существующий эмбеддинг
337
  """
338
  if isinstance(source, dict) and 'embedding' in source:
339
  return source['embedding']
340
  elif isinstance(source, str):
341
  if os.path.isfile(source):
342
- # Если это путь к файлу, создаем новый эмбеддинг
 
 
 
343
  return self.client.voices.clone(filepath=source)
344
  else:
345
- # Если это ID, загружаем голос и возвращаем его эмбеддинг
346
  voice = self.load_voice(source)
347
  return voice['embedding']
348
  else:
@@ -350,11 +402,15 @@ class CartesiaVoiceManager:
350
 
351
  def create_mixed_embedding(self, components: List[Dict[str, Union[str, float, Dict]]]) -> Dict:
352
  """
353
- Создает смешанный эмбеддинг из нескольких компонентов
354
-
355
- :param components: Список словарей, каждый содержит 'id' (или 'path', или эмбеддинг) и 'weight'
356
- :return: Новый смешанный эмбеддинг
357
  """
 
 
 
 
358
  mix_components = []
359
  for component in components:
360
  embedding = self._get_embedding(component.get('id') or component.get('path') or component)
@@ -362,32 +418,35 @@ class CartesiaVoiceManager:
362
  "embedding": embedding,
363
  "weight": component['weight']
364
  })
365
-
366
  return self.client.voices.mix(mix_components)
367
 
368
  def create_custom_voice(self, name: str, source: Union[str, List[Dict]], description: str = "", language: str = "en"):
369
  """
370
- Создает кастомный голос из файла или смеси голосов
371
-
372
- :param name: Имя нового голоса
373
- :param source: Путь к файлу или список компонентов для смешивания
374
- :param description: Описание голоса
375
- :param language: Язык голоса
376
- :return: ID нового голоса
377
  """
378
  logger.info(f"Creating custom voice: {name}")
379
-
380
  if isinstance(source, str):
381
- # Если источник - строка, считаем это путем к файлу
 
 
 
382
  embedding = self.client.voices.clone(filepath=source)
383
  elif isinstance(source, list):
384
- # Если источник - список, создаем смешанный эмбеддинг
385
  embedding = self.create_mixed_embedding(source)
386
  else:
387
  raise ValueError("Invalid source type. Expected file path or list of components.")
388
 
389
  voice_id = f"custom_{len([f for f in self.custom_dir.glob('*.json')])}"
390
-
391
  voice_data = {
392
  "id": voice_id,
393
  "name": name,
@@ -397,30 +456,30 @@ class CartesiaVoiceManager:
397
  "is_public": False,
398
  "is_custom": True
399
  }
400
-
401
  self._save_voice_to_custom(voice_data)
402
  self.voices[voice_id] = voice_data
403
  self.loaded_voices.add(voice_id)
404
-
405
  logger.info(f"Created custom voice with id: {voice_id}")
406
  return voice_id
407
 
408
  def get_voice_id_by_name(self, name: str) -> List[str]:
409
  matching_voices = []
410
-
411
- # Проверяем оба каталога
412
  for directory in [self.api_dir, self.custom_dir]:
413
  for file in directory.glob("*.json"):
414
  with open(file, "r") as f:
415
  voice_data = json.load(f)
416
  if voice_data['name'] == name:
417
  matching_voices.append(voice_data['id'])
418
-
419
  if not matching_voices:
420
  logger.warning(f"No voices found with name: {name}")
421
  else:
422
  logger.info(f"Found {len(matching_voices)} voice(s) with name: {name}")
423
-
424
  return matching_voices
425
 
426
  def improve_tts_text(text: str, language: str = 'en') -> str:
@@ -430,7 +489,7 @@ def improve_tts_text(text: str, language: str = 'en') -> str:
430
  def format_date(match):
431
  date = datetime.strptime(match.group(), '%Y-%m-%d')
432
  return date.strftime('%m/%d/%Y')
433
-
434
  text = re.sub(r'\d{4}-\d{2}-\d{2}', format_date, text)
435
  text = text.replace(' - ', ' - - ')
436
  text = re.sub(r'\?(?![\s\n])', '??', text)
@@ -443,4 +502,4 @@ def improve_tts_text(text: str, language: str = 'en') -> str:
443
  elif language.lower() in ['fr', 'fra', 'french']:
444
  text = text.replace('M.', 'Monsieur')
445
 
446
- return text
 
3
  from pathlib import Path
4
  from typing import List, Dict, Union, Optional
5
  from enum import Enum
 
6
  from tqdm import tqdm
7
  from loguru import logger
8
  from datetime import datetime
9
  import re
10
 
11
+ try:
12
+ from cartesia import Cartesia
13
+ except ImportError:
14
+ Cartesia = None # Handle the case where Cartesia is not installed
15
+
16
  class VoiceAccessibility(Enum):
17
  ALL = "all"
18
  ONLY_PUBLIC = "only_public"
 
32
 
33
  def __init__(self, api_key: str = None, base_dir: Path = None):
34
  self.api_key = api_key or os.environ.get("CARTESIA_API_KEY")
35
+ if self.api_key and Cartesia:
36
+ self.client = Cartesia(api_key=self.api_key)
37
+ logger.info("Cartesia client initialized with API key.")
38
+ else:
39
+ self.client = None
40
+ logger.warning("API key not provided. Cartesia client is not initialized. Some features will be unavailable.")
41
+
42
  self.current_voice = None
43
  self.current_model = None
44
  self.current_language = None
45
  self.current_mix = None
46
+
47
+ # Setting up directories
48
  self.base_dir = base_dir or Path("voice2voice")
49
  self.api_dir = self.base_dir / "api"
50
  self.custom_dir = self.base_dir / "custom"
51
+
52
+ # Create necessary directories
53
  self.api_dir.mkdir(parents=True, exist_ok=True)
54
  self.custom_dir.mkdir(parents=True, exist_ok=True)
55
+
56
+ # Initialize voices
57
  self.voices = {}
58
  self.loaded_voices = set()
59
+
60
+ # Speed and emotion settings
61
  self._speed = 0.0 # normal speed
62
  self._emotions = {}
63
+
64
  logger.add("cartesia_voice_manager.log", rotation="10 MB")
65
  logger.info("CartesiaVoiceManager initialized")
66
 
67
+
68
+ def set_api_key(self, api_key: str):
69
+ """
70
+ Устанавливает API ключ и инициализирует клиент Cartesia.
71
+ Затем обновляет список голосов из API.
72
+ """
73
+ self.api_key = api_key
74
+ if Cartesia:
75
+ try:
76
+ self.client = Cartesia(api_key=self.api_key)
77
+ logger.info("Cartesia client initialized with new API key.")
78
+ self.update_voices_from_api()
79
+ except Exception as e:
80
+ logger.error(f"Failed to initialize Cartesia client with the provided API key: {e}")
81
+ self.client = None
82
+ raise ValueError("Failed to initialize Cartesia client with the provided API key.")
83
+ else:
84
+ logger.error("Cartesia library is not available. Cannot initialize Cartesia client.")
85
+ raise ImportError("Cartesia library is not installed.")
86
+
87
  def load_voice(self, voice_id: str) -> Dict:
88
  if voice_id in self.loaded_voices:
89
  return self.voices[voice_id]
90
+
91
  voice_file = None
92
+ # Search for voice file in api and custom directories
93
  api_file = self.api_dir / f"{voice_id}.json"
94
  custom_file = self.custom_dir / f"{voice_id}.json"
95
+
96
  if api_file.exists():
97
  voice_file = api_file
98
  elif custom_file.exists():
99
  voice_file = custom_file
100
+
101
  if voice_file:
102
  with open(voice_file, "r") as f:
103
  voice_data = json.load(f)
 
106
  logger.info(f"Loaded voice {voice_id} from {voice_file}")
107
  return voice_data
108
  else:
109
+ # If voice not found locally, try to load from API
110
+ if self.client:
111
+ try:
112
+ voice_data = self.client.voices.get(id=voice_id)
113
+ self._save_voice_to_api(voice_data)
114
+ self.voices[voice_id] = voice_data
115
+ self.loaded_voices.add(voice_id)
116
+ logger.info(f"Loaded voice {voice_id} from API")
117
+ return voice_data
118
+ except Exception as e:
119
+ logger.error(f"Failed to load voice {voice_id}: {e}")
120
+ raise ValueError(f"Voice with id {voice_id} not found")
121
+ else:
122
+ logger.error(f"Cannot load voice {voice_id} without API client.")
123
+ raise ValueError(f"Voice with id {voice_id} not found and API client is not available.")
124
 
125
  def extract_voice_id_from_label(self, voice_label: str) -> Optional[str]:
126
  """
127
+ Extracts voice ID from label in dropdown
128
+ For example: "John (en) [Custom]" -> extract ID from voices dictionary
129
  """
130
+ # Get all voices and their labels
131
  choices = self.get_voice_choices()
132
+ # Find voice by label and get its ID
133
  voice_data = next((c for c in choices if c["label"] == voice_label), None)
134
  return voice_data["value"] if voice_data else None
135
+
136
  def get_voice_choices(self, language: str = None, accessibility: VoiceAccessibility = VoiceAccessibility.ALL) -> List[Dict]:
137
  """
138
+ Returns a list of voices for dropdown menu
139
  """
140
  voices = self.list_available_voices(
141
  languages=[language] if language else None,
 
144
 
145
  choices = []
146
  for voice in voices:
147
+ # Keep only ID in value
148
  choices.append({
149
  "label": f"{voice['name']} ({voice['language']}){' [Custom]' if voice.get('is_custom') else ''}",
150
+ "value": voice['id'] # Only ID here
151
  })
152
 
153
  return sorted(choices, key=lambda x: x['label'])
154
 
155
  def get_voice_info(self, voice_id: str) -> Dict:
156
  """
157
+ Returns voice information for display
158
  """
159
  voice = self.load_voice(voice_id)
160
  return {
 
164
  "is_public": voice.get('is_public', True),
165
  "id": voice['id']
166
  }
167
+
168
  def _save_voice_to_api(self, voice_data: Dict):
169
  voice_id = voice_data["id"]
170
  file_path = self.api_dir / f"{voice_id}.json"
 
180
  logger.info(f"Saved custom voice {voice_id} to {file_path}")
181
 
182
  def update_voices_from_api(self):
183
+ if not self.client:
184
+ logger.warning("Cannot update voices from API without API client.")
185
+ return
186
+
187
  logger.info("Updating voices from API")
188
+ try:
189
+ api_voices = self.client.voices.list()
190
+ for voice in tqdm(api_voices, desc="Updating voices"):
191
+ voice_id = voice["id"]
192
+ full_voice_data = self.client.voices.get(id=voice_id)
193
+ self._save_voice_to_api(full_voice_data)
194
+ if voice_id in self.loaded_voices:
195
+ self.voices[voice_id] = full_voice_data
196
+ logger.info(f"Updated {len(api_voices)} voices from API")
197
+ except Exception as e:
198
+ logger.error(f"Failed to update voices from API: {e}")
199
 
200
  def list_available_voices(self, languages: List[str] = None, accessibility: VoiceAccessibility = VoiceAccessibility.ALL) -> List[Dict]:
201
  filtered_voices = []
202
 
203
+ # Get only metadata from API (without embeddings)
204
  if accessibility in [VoiceAccessibility.ALL, VoiceAccessibility.ONLY_PUBLIC]:
205
+ if self.client:
206
+ try:
207
+ api_voices = self.client.voices.list()
208
+ # Keep only metadata
209
+ for voice in api_voices:
210
+ metadata = {
211
+ 'id': voice['id'],
212
+ 'name': voice['name'],
213
+ 'language': voice['language'],
214
+ 'is_public': True
215
+ }
216
+ if languages is None or metadata['language'] in languages:
217
+ filtered_voices.append(metadata)
218
+ except Exception as e:
219
+ logger.error(f"Failed to fetch voices from API: {e}")
220
+ else:
221
+ logger.warning("API client is not available. Skipping public voices.")
222
 
223
+ # Add custom voices if needed
224
  if accessibility in [VoiceAccessibility.ALL, VoiceAccessibility.ONLY_PRIVATE, VoiceAccessibility.ONLY_CUSTOM]:
225
  for file in self.custom_dir.glob("*.json"):
226
  with open(file, "r") as f:
 
238
  return filtered_voices
239
 
240
  def set_voice(self, voice_id: str):
241
+ # Check for local file with embedding
242
  voice_file = None
243
  api_file = self.api_dir / f"{voice_id}.json"
244
  custom_file = self.custom_dir / f"{voice_id}.json"
 
249
  voice_file = custom_file
250
 
251
  if voice_file:
252
+ # Use local data
253
  with open(voice_file, "r") as f:
254
  self.current_voice = json.load(f)
255
  else:
256
+ # Get full data with embedding from API
257
+ if self.client:
258
+ try:
259
+ voice_data = self.client.voices.get(id=voice_id)
260
+ # Save for future use
261
+ self._save_voice_to_api(voice_data)
262
+ self.current_voice = voice_data
263
+ except Exception as e:
264
+ logger.error(f"Failed to get voice {voice_id}: {e}")
265
+ raise ValueError(f"Voice with id {voice_id} not found")
266
+ else:
267
+ logger.error(f"Cannot set voice {voice_id} without API client.")
268
+ raise ValueError(f"Voice with id {voice_id} not found and API client is not available.")
269
 
270
  self.set_language(self.current_voice['language'])
271
  logger.info(f"Set current voice to {voice_id}")
 
306
  self._emotions = {}
307
  logger.info("Cleared all emotions")
308
  return
309
+
310
  self._emotions = {}
311
  for emotion in emotions:
312
  name = emotion.get("name")
313
  level = emotion.get("level")
314
+
315
  if name not in self.EMOTION_NAMES:
316
  raise ValueError(f"Invalid emotion name. Choose from: {self.EMOTION_NAMES}")
317
  if level not in self.EMOTION_LEVELS:
318
  raise ValueError(f"Invalid emotion level. Choose from: {self.EMOTION_LEVELS}")
319
+
320
  self._emotions[name] = level
321
+
322
  logger.info(f"Set emotions: {self._emotions}")
323
 
324
  def _get_voice_controls(self):
325
  controls = {"speed": self._speed}
326
+
327
  if self._emotions:
328
  controls["emotion"] = [f"{name}:{level}" for name, level in self._emotions.items()]
329
+
330
  return controls
331
 
332
  def speak(self, text: str, output_file: str = None):
333
  if not self.current_model or not (self.current_voice or self.current_mix):
334
  raise ValueError("Please set a model and a voice or voice mix before speaking.")
335
+ if not self.client:
336
+ logger.error("Cannot generate speech without API client.")
337
+ raise ValueError("API client is not initialized. Cannot generate speech.")
338
 
339
  voice_embedding = self.current_voice['embedding'] if self.current_voice else self.current_mix
340
 
 
347
  }
348
 
349
  voice_controls = self._get_voice_controls()
350
+
351
  logger.info(f"Generating audio for text: {text[:50]}... with voice controls: {voice_controls}")
352
  if self.current_language == 'en':
353
  audio_data = self.client.tts.bytes(
 
361
  )
362
  else:
363
  audio_data = self.client.tts.bytes(
364
+ model_id='sonic-multilingual',
365
+ transcript=improved_text,
366
+ voice_embedding=voice_embedding,
367
+ duration=None,
368
+ output_format=output_format,
369
+ language=self.current_language,
370
+ _experimental_voice_controls=voice_controls
371
+ )
372
 
373
  if output_file is None:
374
  output_file = f"output_{self.current_language}.wav"
 
382
 
383
  def _get_embedding(self, source: Union[str, Dict]) -> Dict:
384
  """
385
+ Gets embedding from various sources: ID, file path, or existing embedding
386
  """
387
  if isinstance(source, dict) and 'embedding' in source:
388
  return source['embedding']
389
  elif isinstance(source, str):
390
  if os.path.isfile(source):
391
+ # If it's a file path, create a new embedding
392
+ if not self.client:
393
+ logger.error("Cannot clone voice without API client.")
394
+ raise ValueError("API client is not initialized. Cannot clone voice.")
395
  return self.client.voices.clone(filepath=source)
396
  else:
397
+ # If it's an ID, load the voice and return its embedding
398
  voice = self.load_voice(source)
399
  return voice['embedding']
400
  else:
 
402
 
403
  def create_mixed_embedding(self, components: List[Dict[str, Union[str, float, Dict]]]) -> Dict:
404
  """
405
+ Creates a mixed embedding from multiple components
406
+
407
+ :param components: List of dictionaries, each containing 'id' (or 'path', or embedding) and 'weight'
408
+ :return: New mixed embedding
409
  """
410
+ if not self.client:
411
+ logger.error("Cannot create mixed embedding without API client.")
412
+ raise ValueError("API client is not initialized. Cannot create mixed embedding.")
413
+
414
  mix_components = []
415
  for component in components:
416
  embedding = self._get_embedding(component.get('id') or component.get('path') or component)
 
418
  "embedding": embedding,
419
  "weight": component['weight']
420
  })
421
+
422
  return self.client.voices.mix(mix_components)
423
 
424
  def create_custom_voice(self, name: str, source: Union[str, List[Dict]], description: str = "", language: str = "en"):
425
  """
426
+ Creates a custom voice from a file or a mix of voices
427
+
428
+ :param name: Name of the new voice
429
+ :param source: File path or list of components to mix
430
+ :param description: Description of the voice
431
+ :param language: Language of the voice
432
+ :return: ID of the new voice
433
  """
434
  logger.info(f"Creating custom voice: {name}")
435
+
436
  if isinstance(source, str):
437
+ # If source is a string, assume it's a file path
438
+ if not self.client:
439
+ logger.error("Cannot clone voice without API client.")
440
+ raise ValueError("API client is not initialized. Cannot clone voice.")
441
  embedding = self.client.voices.clone(filepath=source)
442
  elif isinstance(source, list):
443
+ # If source is a list, create a mixed embedding
444
  embedding = self.create_mixed_embedding(source)
445
  else:
446
  raise ValueError("Invalid source type. Expected file path or list of components.")
447
 
448
  voice_id = f"custom_{len([f for f in self.custom_dir.glob('*.json')])}"
449
+
450
  voice_data = {
451
  "id": voice_id,
452
  "name": name,
 
456
  "is_public": False,
457
  "is_custom": True
458
  }
459
+
460
  self._save_voice_to_custom(voice_data)
461
  self.voices[voice_id] = voice_data
462
  self.loaded_voices.add(voice_id)
463
+
464
  logger.info(f"Created custom voice with id: {voice_id}")
465
  return voice_id
466
 
467
  def get_voice_id_by_name(self, name: str) -> List[str]:
468
  matching_voices = []
469
+
470
+ # Check both directories
471
  for directory in [self.api_dir, self.custom_dir]:
472
  for file in directory.glob("*.json"):
473
  with open(file, "r") as f:
474
  voice_data = json.load(f)
475
  if voice_data['name'] == name:
476
  matching_voices.append(voice_data['id'])
477
+
478
  if not matching_voices:
479
  logger.warning(f"No voices found with name: {name}")
480
  else:
481
  logger.info(f"Found {len(matching_voices)} voice(s) with name: {name}")
482
+
483
  return matching_voices
484
 
485
  def improve_tts_text(text: str, language: str = 'en') -> str:
 
489
  def format_date(match):
490
  date = datetime.strptime(match.group(), '%Y-%m-%d')
491
  return date.strftime('%m/%d/%Y')
492
+
493
  text = re.sub(r'\d{4}-\d{2}-\d{2}', format_date, text)
494
  text = text.replace(' - ', ' - - ')
495
  text = re.sub(r'\?(?![\s\n])', '??', text)
 
502
  elif language.lower() in ['fr', 'fra', 'french']:
503
  text = text.replace('M.', 'Monsieur')
504
 
505
+ return text