Spaces:
Runtime error
Runtime error
# # Import required libraries | |
import numpy as np | |
from pydub import AudioSegment | |
import vocode | |
from vocode import getenv | |
import gradio as gr | |
import os | |
import logging | |
from vocode.turn_based.agent.chat_gpt_agent import ChatGPTAgent | |
from vocode.turn_based.synthesizer import CoquiSynthesizer | |
from vocode.turn_based.transcriber.whisper_transcriber import WhisperTranscriber | |
from vocode.turn_based.output_device.speaker_output import SpeakerOutput | |
# Persona prompt handed to ChatGPTAgent as its system prompt (see main()).
# The numbered lines are sample quotes the model is invited to reuse, so
# spelling mistakes in them can surface verbatim in the spoken reply.
system_prompt = """
You are the evil lord Darth Vader. You answer all questions your subjects ask you, but do so with immense contempt and loathing.
You may use any of the following lines or take inspiration from this conversational style where appropriate in your response.
Examples:
1|You have failed me for the last time Admiral.
2|You should not have come back.
3|I find your lack of faith disturbing.
4|Release your anger.
5|Commander, tear this ship apart until you've found those plans! And bring me the passengers, I want them alive!
6|The force is strong with this one.
7|If you only knew the power of the Dark Side.
8|It is pointless to resist, my son.
9|Give yourself to the dark side.
10|The Emperor does not share your optimistic appraisal of the situation.
11|Obi-Wan has taught you well.
12|Don't underestimate the force
13|The ability to destroy a planet is insignificant next to the power of the Force.
14|I find your lack of faith disturbing.
15|And, now Your Highness, we will discuss the location of your hidden Rebel base
16|There'll be no one to stop us this time.
17|I am your father.
18|If you only knew the power of the dark side.
19|He will join us or die, master.
20|The emperor is not as forgiving as I am.
21|Indeed you are powerful as the emperor has foreseen.
22|Perhaps you feel you are being treated unfairly?
23|The Force is with you young Skywalker, but you are not a jedi yet.
24|What is thy bidding my master?
25|The Emperor has been expecting you.
26|We would be honored if you would join us.
27|Leave them to me. I will deal with them myself.
28|Your powers are weak, old man.
29|If this is a councilor ship, where is the ambassador? Commander, tear this ship apart until you've found those plans. And bring me the passengers - I want them alive!
30|I sense something. A presence I have not felt since...
31|Don't make me destroy you.
32|I've been waiting for you, Obi-Wan. We meet again at last. The circuit is now complete - When I left you, I was but the learner. Now, I am the master.
33|Escape is not his plan. I must face him...alone.
34|Don't get too proud of this technological terror you've constructed.
Answer the question accurately in less than 150 words. Remember you are Darth Vader.
"""
# # 1. Setup Vocode
# Abort start-up unless both mandatory API keys are present and non-empty.
if not (os.getenv("OPENAI_API_KEY") and os.getenv("COQUI_API_KEY")):
    raise EnvironmentError("Required environment variables not set")

# Hand the credentials to vocode; main() reads them back through vocode's
# getenv. COQUI_VOICE_ID is forwarded as-is and is not validated above.
vocode.setenv(
    **{
        name: os.getenv(name)
        for name in ("OPENAI_API_KEY", "COQUI_API_KEY", "COQUI_VOICE_ID")
    }
)

# configure logger: default root handler, DEBUG and above for this module
logging.basicConfig()
logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG)

# Frame rate that send_audio() resamples synthesized audio to.
DEFAULT_SAMPLING_RATE = 44100
def convert_to_audio_segment(input_audio):
    """Turn the value delivered by a Gradio audio input into an AudioSegment.

    Args:
        input_audio: Either a ``(sample_rate, samples)`` pair, where
            ``samples`` is a NumPy array, or a path to an audio file (the
            "Audio File" tab is declared with ``type="filepath"``).

    Returns:
        A pydub ``AudioSegment`` built from the input.
    """
    # A path cannot be unpacked into (rate, samples); let pydub decode it.
    if isinstance(input_audio, (str, os.PathLike)):
        return AudioSegment.from_file(input_audio)

    sample_rate, audio_data = input_audio
    audio_data = audio_data.astype(np.int16)  # Convert to 16-bit data
    # A 2-D array is laid out as (frames, channels); tobytes() then yields
    # interleaved samples, so the channel count must match that layout.
    channels = audio_data.shape[1] if audio_data.ndim == 2 else 1
    return AudioSegment(
        audio_data.tobytes(),  # Convert numpy array to bytes
        frame_rate=sample_rate,
        sample_width=audio_data.dtype.itemsize,  # 2 bytes for 16-bit audio
        channels=channels,
    )
def send_audio(audio_segment: AudioSegment):
    """Convert a pydub segment into int16 samples at DEFAULT_SAMPLING_RATE.

    Args:
        audio_segment: The audio to convert.

    Returns:
        A NumPy ``int16`` array: 1-D for mono audio, shaped
        ``(frames, channels)`` when the segment has more than one channel.
    """
    logger.info("now processing output")
    if audio_segment.frame_rate != DEFAULT_SAMPLING_RATE:
        audio_segment = audio_segment.set_frame_rate(DEFAULT_SAMPLING_RATE)
    # The buffer is decoded as int16 below, so the segment must really hold
    # 2-byte samples; any other width would be misread as noise.
    if audio_segment.sample_width != 2:
        audio_segment = audio_segment.set_sample_width(2)
    samples = np.frombuffer(audio_segment.raw_data, dtype=np.int16)
    # raw_data interleaves channels; expose them as columns rather than as
    # one long mono stream.
    if audio_segment.channels > 1:
        samples = samples.reshape(-1, audio_segment.channels)
    return samples
def main(input_audio):
    """Handle one voice turn: transcribe, query the agent, synthesize a reply.

    Args:
        input_audio: The value supplied by the Gradio audio input; it is
            passed unchanged to ``convert_to_audio_segment``.

    Returns:
        ``(DEFAULT_SAMPLING_RATE, samples)`` for the Gradio audio output, or
        ``None`` when any step fails (the failure is logged).
    """
    logger.info(f"Type of input_audio: {type(input_audio)}")
    logger.info(f"input_audio: {input_audio}")
    transcriber = WhisperTranscriber(api_key=getenv("OPENAI_API_KEY"))
    # Initialize ChatGPTAgent
    agent = ChatGPTAgent(
        system_prompt=system_prompt,
        initial_message="What up",
        api_key=getenv("OPENAI_API_KEY"),
    )
    # Initialize CoquiSynthesizer
    synthesizer = CoquiSynthesizer(
        voice_id=getenv("COQUI_VOICE_ID"),
        api_key=getenv("COQUI_API_KEY"),
    )
    print("Starting conversation. Press Ctrl+C to exit.")
    # Each call handles exactly one turn, so the work runs once; the former
    # `while True` wrapper always left on its first pass.
    try:
        # Transcribe the input_audio using WhisperTranscriber
        input_audio_segment = convert_to_audio_segment(input_audio)
        logger.info(f"Input Audio Segment: {input_audio_segment}")
        logger.info(
            f"Type of input_audio_segment: {type(input_audio_segment)}")
        transcript = transcriber.transcribe(input_audio_segment)
        logger.info(f"Transcription: {transcript}")
        response = agent.respond(transcript)
        logger.info(f"Agent response: {response}")
        output_audio = synthesizer.synthesize(response)
        logger.info(f"output audio: {output_audio}")
        samples = send_audio(output_audio)
    except Exception as e:
        # Same message as before, now with the traceback attached.
        logger.exception("Failed to synthesize response: %s", e)
        return None
    # A numpy-typed gr.Audio output needs the sample rate alongside the
    # data; send_audio() has already resampled to DEFAULT_SAMPLING_RATE.
    return DEFAULT_SAMPLING_RATE, samples
# Refer @link https://huggingface.co/spaces/course-demos/speech-to-speech-translation/blob/main/app.py
# Two-tab UI: both tabs call main() and differ only in their audio input.
demo = gr.Blocks()
title = "Chatty Vader"
description = "Darth Vader resurrected with all the knowledge of humanity"


def _vader_interface(audio_input):
    """Wrap main() in an Interface that reads from the given audio input."""
    return gr.Interface(
        fn=main,
        inputs=audio_input,
        outputs=gr.Audio(label="Generated Speech", type="numpy"),
        title=title,
        description=description,
    )


mic_translate = _vader_interface(gr.Audio(source="microphone"))
file_translate = _vader_interface(gr.Audio(source="upload", type="filepath"))

with demo:
    gr.TabbedInterface(
        [mic_translate, file_translate],
        ["Microphone", "Audio File"],
    )

demo.launch()