LivePortrait2 / elevenlabs_utils.py
yerang
add files
d98c79a
raw
history blame
4.98 kB
import os
from io import BytesIO
from typing import IO, Optional
import time
import uuid
from pathlib import Path
from pydub import AudioSegment
import gradio as gr
from elevenlabs import Voice, VoiceSettings, save
from elevenlabs.client import ElevenLabs
def generate_random_filename(parent, extension="txt"):
    """
    Build a unique file path located under ``parent``.

    The file name has the form ``<uuid4>_<unix-seconds>.<extension>``. Nothing
    is created on disk; only the path string is produced.

    Args:
        parent (str): Directory the generated file name is joined onto.
        extension (str): Text placed after the final dot. Default is 'txt'.

    Returns:
        str: ``parent`` joined with the generated file name.
    """
    # Whole seconds since the epoch, appended after the random UUID.
    stamp = int(time.time())
    unique_name = "{}_{}.{}".format(uuid.uuid4(), stamp, extension)
    return os.path.join(parent, unique_name)
# Model identifier passed as ``model=`` when generating speech; the
# ELEVEN_LABS_MODEL environment variable overrides the built-in default.
ELEVEN_LABS_MODEL = os.getenv("ELEVEN_LABS_MODEL", "eleven_multilingual_v2")

# Display names of the languages listed as supported.
# NOTE(review): presumably mirrors the language list of the default model --
# confirm against the ElevenLabs documentation before relying on it.
ELEVEN_LABS_LANGUAGE_SUPPORTS = [
    "English", "Chinese", "Spanish", "Hindi", "Portuguese",
    "French", "German", "Japanese", "Arabic", "Korean",
    "Indonesian", "Italian", "Dutch", "Turkish", "Polish",
    "Swedish", "Filipino", "Malay", "Russian", "Romanian",
    "Ukrainian", "Greek", "Czech", "Danish", "Finnish",
    "Bulgarian", "Croatian", "Slovak", "Tamil",
]
class ElevenLabsPipeline:
def __init__(self):
eleven_labs_api_key = os.getenv("ELEVENLABS_API_KEY", "sk_f4f7d77bc8065b15824cf52ea46c7d99e0e5db2a0f93b673")
if eleven_labs_api_key is None:
raise Exception("ELEVENLABS_API_KEY ํ™˜๊ฒฝ๋ณ€์ˆ˜๋ฅผ ์„ค์ •ํ•ด์ฃผ์„ธ์š”.")
self.client = ElevenLabs(
api_key=eleven_labs_api_key, # Defaults to ELEVEN_API_KEY
)
os.makedirs("./tmp", exist_ok=True)
def clone_voice(self, audio, name, description=None):
response = self.client.voices.get_all()
for voice in response.voices:
if voice.name == name:
return "์กด์žฌํ•˜๋Š” ์Œ์„ฑ์ž…๋‹ˆ๋‹ค. ์Œ์„ฑ ์ƒ์„ฑ์„ ์‹œ์ž‘ํ•ด์ฃผ์„ธ์š”."
try:
voice = self.client.clone(
name=name,
description=description, # Optional
files=[audio],
)
return "Voice Clone์„ ์„ฑ๊ณต์ ์œผ๋กœ ์ƒ์„ฑํ–ˆ์Šต๋‹ˆ๋‹ค."
except Exception as e:
return str(e)
def _get_voice(self, name: str):
response = self.client.voices.get_all()
current_voice = None
for voice in response.voices:
if voice.name == name:
current_voice = voice
break
return current_voice
def generate_voice(
self,
text: str,
audio: str = None,
language: str = "ko",
mute_before_ms: Optional[int] = 0,
mute_after_ms: Optional[int] = 0,
stability: float = 0.5,
similarity_boost: float = 0.75,
style: float = 0.0,
use_speaker_boost=True,
) -> str:
if audio is not None:
name = Path(audio).stem
self.clone_voice(audio, name)
else:
gr.Info("์Œ์„ฑ์ด ์•ˆ์ฃผ์–ด์กŒ์Šต๋‹ˆ๋‹ค. ๊ธฐ๋ณธ ์Œ์„ฑ์œผ๋กœ ์ƒ์„ฑํ•˜๊ฒ ์Šต๋‹ˆ๋‹ค.", duration=2)
name = "Laura"
current_voice = self._get_voice(name)
if current_voice is None:
current_voice = self._get_voice(name)
response = self.client.generate(
text=text,
model=ELEVEN_LABS_MODEL,
voice=Voice(
voice_id=current_voice.voice_id,
settings=VoiceSettings(
stability=stability,
similarity_boost=similarity_boost,
style=style,
use_speaker_boost=use_speaker_boost,
language=language,
),
),
)
# Create a BytesIO object to hold the audio data in memory
audio_stream = BytesIO()
# Write each chunk of audio data to the stream
for chunk in response:
if chunk:
audio_stream.write(chunk)
# Reset stream position to the beginning
audio_stream.seek(0)
# Load the audio stream into an AudioSegment
audio_segment = AudioSegment.from_file(audio_stream, format="mp3")
# Create silent segments for before and after
mute_before = AudioSegment.silent(duration=mute_before_ms)
mute_after = AudioSegment.silent(duration=mute_after_ms)
# Concatenate the segments
combined_segment = mute_before + audio_segment + mute_after
tmp_file = generate_random_filename("./tmp", "mp3")
# Export the combined audio to the specified file
combined_segment.export(tmp_file, format="mp3", bitrate="128k")
return tmp_file