import gradio as gr import os import time import azure.cognitiveservices.speech as speechsdk from pyht import Client from pyht.client import TTSOptions import requests text = 'Today is Sunday, the weather is sunny. I am here to test the delay of various TTS services thoroughly' def azure_tts(text): speech_key = os.getenv('SPEECH_KEY') speech_regoion = os.getenv('SPEECH_REGION') if speech_key is None or speech_regoion is None: print('Please set the environment variables SPEECH_KEY and SPEECH_REGION') exit(1) speech_config = speechsdk.SpeechConfig(subscription=speech_key, region=speech_regoion) speech_config.speech_synthesis_voice_name = 'en-US-JennyNeural' speech_config.speech_synthesis_language = "en-US" speech_config.set_speech_synthesis_output_format(speechsdk.SpeechSynthesisOutputFormat.Audio16Khz32KBitRateMonoMp3) pull_stream = speechsdk.audio.PullAudioOutputStream() stream_config = speechsdk.audio.AudioOutputConfig(stream=pull_stream) speech_synthesizer = speechsdk.SpeechSynthesizer(speech_config=speech_config, audio_config=stream_config) speech_synthesizer.speak_text_async(text) azure_latency = 0 start = time.perf_counter() audio_buffer = bytes(512) filled_size = pull_stream.read(audio_buffer) end = time.perf_counter() azure_latency = end - start return azure_latency def coqui_tts(text): voice_id = 'c791b5b5-0558-42b8-bb0b-602ac5efc0b9' COQUI_API_TOKEN = os.getenv["COQUI_TOKEN"] start = time.perf_counter() res = requests.post( "https://app.coqui.ai/api/v2/samples/xtts/stream", json={ "text": text, "language": 'en', "voice_id": voice_id}, headers={"Authorization": f"Bearer {COQUI_API_TOKEN}"}, stream=True, ) if res.status_code != 201: print(f"Endpoint failed with status code {res.status_code}:", res.content.decode("utf-8")) return 0 first = True for chunk in res.iter_content(chunk_size=512): if first: end = time.perf_counter() coqui_latency = end-start return coqui_latency def elevenlab_tts(text): voice_id = '21m00Tcm4TlvDq8ikWAM' CHUNK_SIZE = 512 url = f'https://api.elevenlabs.io/v1/text-to-speech/{voice_id}/stream' xi_api_key = os.getenv['ELEVENLAB_KEY'] if xi_api_key is None: print('Please set the environment variable ELEVENLAB_KEY') exit(1) headers = { "Accept": "audio/mpeg", "Content-Type": "application/json", "xi-api-key": xi_api_key } data = { "text": text, "model_id": "eleven_multilingual_v2", "voice_settings": { "stability": 0.5, "similarity_boost": 0.5 } } start = time.perf_counter() response = requests.post(url, json=data, headers=headers, stream=True) first = True for chunk in response.iter_content(chunk_size=CHUNK_SIZE): if first: first = False end = time.perf_counter() elevenlab_latency = end - start return elevenlab_latency def playht_tts(text): userid = os.getenv("PLAY_HT_USER_ID") api_key = os.getenv("PLAY_HT_API_KEY") if userid is None or api_key is None: print('Please set the environment variables PLAY_HT_USER_ID and PLAY_HT_API_KEY') exit(1) client = Client( user_id=userid, api_key=api_key) options = TTSOptions(voice="s3://voice-cloning-zero-shot/d9ff78ba-d016-47f6-b0ef-dd630f59414e/female-cs/manifest.json",speed=5.0) first = True start = time.perf_counter() res = client.tts(text, options) for chunk in res: # do something with the audio chunk if first: first = False end = time.perf_counter() playht_latency = end - start return playht_latency title = """