Spaces:
Sleeping
Sleeping
import datetime
import json
import os
from pathlib import Path
from typing import List, Optional

import gradio as gr

from sonic_api_wrapper import CartesiaVoiceManager, VoiceAccessibility, improve_tts_text
# Shared voice-manager instance. It stays None until initialize_manager()
# succeeds; the handlers below test it before doing any work.
manager = None

# Language codes offered by the language filter. The leading "all" entry is a
# "no filter" sentinel rather than a real language code.
LANGUAGE_CHOICES = [
    "all",
    "ru",
    "en",
    "es",
    "pl",
    "de",
    "fr",
    "tr",
    "pt",
    "zh",
    "ja",
    "hi",
    "it",
    "ko",
    "nl",
    "sv",
]
# Maps the label shown in the "Type" filter dropdown to the VoiceAccessibility
# member that update_voice_list() passes to manager.get_voice_choices().
ACCESS_TYPE_MAP = {
    "All": VoiceAccessibility.ALL,
    "Custom Only": VoiceAccessibility.ONLY_CUSTOM,
    "Private Only": VoiceAccessibility.ONLY_PRIVATE,
    # Non-custom voices are labelled "API" throughout the UI.
    "API": VoiceAccessibility.ONLY_PUBLIC
}
# Preset labels for the speed dropdown; map_speed() converts each to a number.
SPEED_CHOICES = [
    "Very Slow",
    "Slow",
    "Normal",
    "Fast",
    "Very Fast",
]

# Emotion labels for the multiselect. "Neutral" means "no emotion" and is
# dropped by map_emotions() instead of being sent on.
EMOTION_CHOICES = [
    "Neutral",
    "Happy",
    "Sad",
    "Angry",
    "Surprised",
    "Curious",
]

# Intensity labels, weakest to strongest; map_emotions() translates them.
EMOTION_INTENSITY = [
    "Very Weak",
    "Weak",
    "Medium",
    "Strong",
    "Very Strong",
]
def map_speed(speed_type: str) -> float:
    """Translate a speed preset label into its numeric speed value.

    The five presets are evenly spaced from -1.0 ("Very Slow") to 1.0
    ("Very Fast"), with "Normal" at 0.0.

    Args:
        speed_type: One of the preset labels.

    Returns:
        The numeric speed for that preset.

    Raises:
        KeyError: If ``speed_type`` is not a known preset label.
    """
    labels = ("Very Slow", "Slow", "Normal", "Fast", "Very Fast")
    values = (-1.0, -0.5, 0.0, 0.5, 1.0)
    return dict(zip(labels, values))[speed_type]
def generate_output_filename(language: str) -> str:
    """Build a relative output path of the form ``output/<timestamp>_<language>.wav``.

    The timestamp is the current local time formatted as ``YYYYMMDD_HHMMSS``.
    The directory is not created here.

    Args:
        language: Value embedded after the timestamp (formatted with ``str``).

    Returns:
        The relative path as a string.
    """
    now = datetime.datetime.now()
    return "output/{:%Y%m%d_%H%M%S}_{}.wav".format(now, language)
def extract_voice_id_from_label(voice_label: str) -> Optional[str]:
    """Resolve a voice dropdown label to its voice ID.

    The label is looked up among the unfiltered choices returned by
    ``manager.get_voice_choices()``; each choice is read as a mapping with
    ``"label"`` and ``"value"`` keys, and the first exact label match wins.

    Args:
        voice_label: Label as shown in the voice dropdown
            (e.g. something like ``"John (en) [Custom]"``).

    Returns:
        The matching voice ID, or ``None`` when the manager is not
        initialized, no choice carries that label, or the lookup raises.
    """
    try:
        if not manager:
            return None
        for choice in manager.get_voice_choices():
            if choice["label"] == voice_label:
                return choice["value"]
        return None
    except Exception as e:
        # Best-effort lookup: callers treat None as "voice not found", so the
        # failure is logged here instead of being propagated.
        print(f"❌ Error getting voices: {str(e)}")
        return None
def initialize_manager(api_key: str) -> str:
    """Create the global voice manager from an API key.

    Surrounding whitespace is stripped from the key first, so a key pasted
    with a trailing newline still works and a whitespace-only value is
    rejected like an empty one.

    Args:
        api_key: Cartesia API key entered by the user.

    Returns:
        A status line for the UI: success, "key required", or the
        constructor's error message.

    Side effects:
        Rebinds the module-level ``manager``. It is set to ``None`` when
        construction fails and left untouched when the key is missing.
    """
    global manager
    if isinstance(api_key, str):
        api_key = api_key.strip()
    if not api_key:
        return "❌ API key is required to initialize the manager"
    try:
        manager = CartesiaVoiceManager(api_key=api_key, base_dir=Path("voice2voice"))
    except Exception as e:
        # Do not keep a half-built or stale manager around after a failure.
        manager = None
        return f"❌ Error: {str(e)}"
    return "✅ Manager initialized"
def get_initial_voices():
    """Return the voice dropdown's starting ``(labels, selected_label)`` pair.

    Returns:
        ``([], None)`` when the manager is not initialized or reports no
        voices; otherwise the list of every choice's ``"label"`` and the first
        label as the default selection.
    """
    if not manager:
        return [], None
    choices = manager.get_voice_choices()
    if not choices:
        return [], None
    labels = [choice["label"] for choice in choices]
    return labels, labels[0]
def update_voice_list(language: str, access_type: str, current_voice: Optional[str] = None):
    """Refresh the voice dropdown for the current filters.

    Args:
        language: Language filter value; ``"all"`` disables language filtering.
        access_type: A key of ``ACCESS_TYPE_MAP`` selecting the accessibility
            filter.
        current_voice: Label currently selected in the dropdown, if any.

    Returns:
        ``(dropdown_update, status)``. The update carries the filtered labels
        and keeps ``current_voice`` selected when it is still in the list,
        otherwise it selects the first label (or ``None`` for an empty list).
        On any failure the dropdown is cleared and the status carries the
        error text.
    """
    if not manager:
        return gr.update(choices=[], value=None), "❌ Manager is not initialized"
    try:
        choices = manager.get_voice_choices(
            language=None if language == "all" else language,
            accessibility=ACCESS_TYPE_MAP[access_type],
        )
        labels = [choice["label"] for choice in choices]
        if current_voice in labels:
            # Keep the user's selection when the new filter still includes it.
            selected = current_voice
        else:
            selected = labels[0] if labels else None
        return gr.update(choices=labels, value=selected), "✅ Voice list updated"
    except Exception as e:
        return gr.update(choices=[], value=None), f"❌ Error: {str(e)}"
def update_voice_info(voice_label: str) -> str:
    """Build the text shown in the "Voice Information" box.

    Args:
        voice_label: Label currently selected in the voice dropdown.

    Returns:
        An empty string when nothing is selected or the manager is not
        initialized; a "not found" message when the label cannot be resolved
        to an ID; otherwise a four-line summary (name, language, type, ID)
        built from ``manager.get_voice_info()``. Errors are returned as text
        rather than raised.
    """
    # The label is checked first so an empty selection returns immediately.
    if not voice_label or not manager:
        return ""
    try:
        voice_id = extract_voice_id_from_label(voice_label)
        if not voice_id:
            return "❌ Voice not found"
        info = manager.get_voice_info(voice_id)
        voice_type = "Custom" if info.get("is_custom") else "API"
        return (
            f"Name: {info['name']}\n"
            f"Language: {info['language']}\n"
            f"Type: {voice_type}\n"
            f"ID: {info['id']}"
        )
    except Exception as e:
        return f"❌ Error: {str(e)}"
def create_custom_voice(name: str, language: str, audio_data: tuple) -> tuple:
    """Create a custom voice and refresh the voice dropdown.

    Args:
        name: Display name for the new voice.
        language: Language passed through to ``manager.create_custom_voice``.
        audio_data: Reference audio. A tuple is accepted (its first element is
            used as the source); any other value, such as a file path string,
            is passed through unchanged.

    Returns:
        ``(status, dropdown_update, voice_info_text)``. On validation or
        creation failure the dropdown update is a no-op and the info text is
        empty.
    """
    if not manager:
        return "❌ Manager is not initialized", gr.update(), ""
    if not name or not audio_data:
        return "❌ Name and voice file are required", gr.update(), ""
    try:
        audio_path = audio_data[0] if isinstance(audio_data, tuple) else audio_data
        voice_id = manager.create_custom_voice(
            name=name,
            source=audio_path,
            language=language,
        )

        # Re-read the (unfiltered) voice list so it includes the new voice.
        choices = manager.get_voice_choices()
        labels = [choice["label"] for choice in choices]
        # A default is required here: a bare next() would raise StopIteration
        # and report an empty-message error even though the voice was created.
        new_label = next(
            (choice["label"] for choice in choices if choice["value"] == voice_id),
            None,
        )

        voice_info = manager.get_voice_info(voice_id)
        info_text = (
            f"Name: {voice_info['name']}\n"
            f"Language: {voice_info['language']}\n"
            f"Type: Custom\n"
            f"ID: {voice_info['id']}"
        )

        if new_label is None:
            # The voice exists but is not listed yet: refresh the choices and
            # leave the current selection alone.
            dropdown_update = gr.update(choices=labels)
        else:
            dropdown_update = gr.update(choices=labels, value=new_label)
        return (
            f"✅ Custom voice created: {voice_id}",
            dropdown_update,
            info_text,
        )
    except Exception as e:
        return f"❌ Error creating voice: {str(e)}", gr.update(), ""
def on_auto_language_change(auto_language: bool):
    """Show the manual language picker only while auto-detection is off.

    Args:
        auto_language: Current state of the auto-detect checkbox.

    Returns:
        A Gradio update whose ``visible`` flag is the negation of
        ``auto_language``.
    """
    show_manual_picker = not auto_language
    return gr.update(visible=show_manual_picker)
def map_emotions(selected_emotions, intensity):
    """Convert UI emotion labels into ``{"name", "level"}`` dictionaries.

    Args:
        selected_emotions: Iterable of emotion labels from the multiselect.
            ``None`` or an empty selection yields an empty list. "Neutral"
            and unknown labels are skipped.
        intensity: Intensity label applied to every selected emotion.

    Returns:
        One ``{"name": ..., "level": ...}`` dict per recognised emotion, in
        selection order.

    Raises:
        KeyError: If ``intensity`` is not a known label and at least one
            recognised emotion is selected.
    """
    # "Neutral" has no entry on purpose: it means "send no emotion".
    emotion_names = {
        "Happy": "positivity",
        "Sad": "sadness",
        "Angry": "anger",
        "Surprised": "surprise",
        "Curious": "curiosity",
    }
    intensity_levels = {
        "Very Weak": "lowest",
        "Weak": "low",
        "Medium": "medium",
        "Strong": "high",
        "Very Strong": "highest",
    }
    # The intensity lookup stays inside the comprehension so an unknown
    # intensity only matters when an emotion is actually emitted.
    return [
        {"name": emotion_names[emotion], "level": intensity_levels[intensity]}
        for emotion in (selected_emotions or [])
        if emotion in emotion_names
    ]
def generate_speech(
    text: str,
    voice_label: str,
    improve_text: bool,
    auto_language: bool,
    manual_language: str,
    speed_type: str,
    use_custom_speed: bool,
    custom_speed: float,
    emotions: List[str],
    emotion_intensity: str
):
    """Synthesize ``text`` with the selected voice and write it to a WAV file.

    Args:
        text: Text to speak.
        voice_label: Label selected in the voice dropdown.
        improve_text: When true, ``text`` is passed through
            ``improve_tts_text`` before synthesis.
        auto_language: When true, the manager's current language is used;
            otherwise ``manual_language`` is set on the manager.
        manual_language: Language applied when ``auto_language`` is false.
        speed_type: Speed preset label, used unless ``use_custom_speed`` is set.
        use_custom_speed: Use ``custom_speed`` instead of the preset.
        custom_speed: Numeric speed assigned to ``manager.speed`` as-is.
        emotions: Selected emotion labels; empty or exactly ``["Neutral"]``
            resets the manager's emotions.
        emotion_intensity: Intensity label applied to the selected emotions.

    Returns:
        ``(output_path, status)``; ``output_path`` is ``None`` on any failure
        and ``status`` then carries the reason.
    """
    if not manager:
        return None, "❌ Manager is not initialized"
    if not text or not voice_label:
        return None, "❌ Text and voice are required"
    try:
        voice_id = extract_voice_id_from_label(voice_label)
        if not voice_id:
            return None, "❌ Voice not found"

        # Configure the manager: voice first, then language, speed, emotions.
        manager.set_voice(voice_id)
        if not auto_language:
            manager.set_language(manual_language)
        manager.speed = custom_speed if use_custom_speed else map_speed(speed_type)
        if emotions and emotions != ["Neutral"]:
            manager.set_emotions(map_emotions(emotions, emotion_intensity))
        else:
            # No arguments: clear emotions left over from a previous call.
            manager.set_emotions()

        file_language = manager.current_language if auto_language else manual_language
        output_file = generate_output_filename(file_language)
        os.makedirs("output", exist_ok=True)

        spoken_text = (
            improve_tts_text(text, manager.current_language) if improve_text else text
        )
        output_path = manager.speak(text=spoken_text, output_file=output_file)
        return output_path, "✅ Audio generated successfully"
    except Exception as e:
        return None, f"❌ Error generating speech: {str(e)}"
def initialize_manager_and_update(api_key: str, language: str, access_type: str, current_voice: str = None):
    """Initialize the manager, then refresh the voice dropdown in one step.

    Args:
        api_key: Key forwarded to ``initialize_manager``.
        language: Language filter forwarded to ``update_voice_list``.
        access_type: Accessibility filter forwarded to ``update_voice_list``.
        current_voice: Currently selected voice label, if any.

    Returns:
        ``(status, dropdown_update)``, in the opposite order to
        ``update_voice_list``. With a manager available the status is the
        initialization status and the voice-list status joined by a newline;
        otherwise the dropdown is cleared.
    """
    init_status = initialize_manager(api_key)
    if not manager:
        return init_status, gr.update(choices=[], value=None)
    dropdown_update, list_status = update_voice_list(language, access_type, current_voice)
    return f"{init_status}\n{list_status}", dropdown_update
# Build the interface: one API-key box on top, settings in the left column,
# status and audio output in the right column, event wiring at the end.
# NOTE(review): container nesting follows the section comments; confirm it
# matches the intended layout.
with gr.Blocks() as demo:
    # API key (never pre-filled)
    cartesia_api_key = gr.Textbox(
        label="Cartesia API Key",
        value="",  # No default API key
        type='password'
    )

    with gr.Row():
        # ---- Left column: text input and all settings ----
        with gr.Column():
            cartesia_text = gr.TextArea(label="Text")

            with gr.Accordion(label="Settings", open=True):
                # Filters that narrow the voice dropdown
                with gr.Accordion("Filters", open=True):
                    cartesia_setting_filter_lang = gr.Dropdown(
                        label="Language",
                        choices=LANGUAGE_CHOICES,
                        value="all"
                    )
                    cartesia_setting_filter_type = gr.Dropdown(
                        label="Type",
                        choices=list(ACCESS_TYPE_MAP.keys()),
                        value="All"
                    )

                # Tab 1: pick an existing voice
                with gr.Tab("Standard"):
                    cartesia_setting_voice_info = gr.Textbox(
                        label="Voice Information",
                        interactive=False
                    )
                    with gr.Row():
                        # Empty until a manager exists; the API-key handler
                        # fills the dropdown later.
                        initial_choices, initial_value = get_initial_voices()
                        cartesia_setting_voice = gr.Dropdown(
                            label="Voice",
                            choices=initial_choices,
                            value=initial_value
                        )
                        cartesia_setting_voice_update = gr.Button("Refresh")
                    cartesia_setting_auto_language = gr.Checkbox(
                        label="Automatically detect language from voice",
                        value=True
                    )
                    cartesia_setting_manual_language = gr.Dropdown(
                        label="Speech Language",
                        choices=["ru", "en", "es", "fr", "de", "pl", "it", "ja", "ko", "zh", "hi"],
                        value="en",
                        visible=False  # Shown only when auto-detect is off
                    )

                # Tab 2: create a custom voice from an audio sample
                with gr.Tab("Custom"):
                    cartesia_setting_custom_name = gr.Textbox(label="Name")
                    cartesia_setting_custom_lang = gr.Dropdown(
                        label="Language",
                        choices=LANGUAGE_CHOICES[1:]  # Exclude "all"
                    )
                    cartesia_setting_custom_voice = gr.Audio(label="Voice File", type='filepath')
                    cartesia_setting_custom_add = gr.Button("Add")

            # Emotion control
            with gr.Accordion(label="Emotion Control (Beta)", open=False):
                cartesia_emotions = gr.Dropdown(
                    label="Emotions",
                    multiselect=True,
                    choices=EMOTION_CHOICES
                )
                cartesia_emotions_intensity = gr.Dropdown(
                    label="Intensity",
                    choices=EMOTION_INTENSITY,
                    value="Medium"
                )

            # Speed settings
            with gr.Accordion("Speed", open=True):
                cartesia_speed_speed = gr.Dropdown(
                    label="Speech Speed",
                    choices=SPEED_CHOICES,
                    value="Normal"
                )
                cartesia_speed_speed_allow_custom = gr.Checkbox(
                    label="Use custom speed value"
                )
                cartesia_speed_speed_custom = gr.Slider(
                    label="Speed",
                    value=0,
                    minimum=-1,
                    maximum=1,
                    step=0.1,
                    visible=False  # Revealed by the checkbox above
                )

            cartesia_setting_improve_text = gr.Checkbox(
                label="Improve text according to recommendations",
                value=True
            )

        # ---- Right column: status, result, trigger ----
        with gr.Column():
            cartessia_status_bar = gr.Label(value="Status")
            cartesia_output_audio = gr.Audio(
                label="Result",
                interactive=False
            )
            cartesia_output_button = gr.Button("Generate")

    # ---- Events ----
    # The filter dropdowns and the Refresh button all call update_voice_list
    # with the same inputs and outputs. The current voice is passed so the
    # selection survives a refresh.
    voice_list_inputs = [
        cartesia_setting_filter_lang,
        cartesia_setting_filter_type,
        cartesia_setting_voice,
    ]
    voice_list_outputs = [cartesia_setting_voice, cartessia_status_bar]

    # Entering a key (re)creates the manager and fills the voice dropdown.
    # Output order here is (status, dropdown), the reverse of update_voice_list.
    cartesia_api_key.change(
        initialize_manager_and_update,
        inputs=[cartesia_api_key, *voice_list_inputs],
        outputs=[cartessia_status_bar, cartesia_setting_voice]
    )
    cartesia_setting_filter_lang.change(
        update_voice_list,
        inputs=voice_list_inputs,
        outputs=voice_list_outputs
    )
    cartesia_setting_filter_type.change(
        update_voice_list,
        inputs=voice_list_inputs,
        outputs=voice_list_outputs
    )
    cartesia_setting_voice_update.click(
        update_voice_list,
        inputs=voice_list_inputs,
        outputs=voice_list_outputs
    )

    # Selecting a voice refreshes its info box.
    cartesia_setting_voice.change(
        update_voice_info,
        inputs=[cartesia_setting_voice],
        outputs=[cartesia_setting_voice_info]
    )

    # The custom-speed slider is visible only while its checkbox is ticked.
    cartesia_speed_speed_allow_custom.change(
        lambda x: gr.update(visible=x),
        inputs=[cartesia_speed_speed_allow_custom],
        outputs=[cartesia_speed_speed_custom]
    )

    # Creating a custom voice updates status, dropdown and voice info.
    cartesia_setting_custom_add.click(
        create_custom_voice,
        inputs=[
            cartesia_setting_custom_name,
            cartesia_setting_custom_lang,
            cartesia_setting_custom_voice
        ],
        outputs=[
            cartessia_status_bar,
            cartesia_setting_voice,  # Update dropdown
            cartesia_setting_voice_info  # Update voice info
        ]
    )

    # The manual language picker appears when auto-detect is switched off.
    cartesia_setting_auto_language.change(
        on_auto_language_change,
        inputs=[cartesia_setting_auto_language],
        outputs=[cartesia_setting_manual_language]
    )

    # Input order must match generate_speech's positional parameters.
    cartesia_output_button.click(
        generate_speech,
        inputs=[
            cartesia_text,
            cartesia_setting_voice,
            cartesia_setting_improve_text,
            cartesia_setting_auto_language,
            cartesia_setting_manual_language,
            cartesia_speed_speed,
            cartesia_speed_speed_allow_custom,
            cartesia_speed_speed_custom,
            cartesia_emotions,
            cartesia_emotions_intensity
        ],
        outputs=[
            cartesia_output_audio,
            cartessia_status_bar
        ]
    )
# Script entry point: enable request queuing, then start the server with a
# public share link.
if __name__ == "__main__":
    demo.queue().launch(share=True)