File size: 6,254 Bytes
9fa0a2a
 
5042d26
 
531e3ab
 
2930d08
9fa0a2a
 
5a1ed1a
531e3ab
 
 
7dde39d
9fa0a2a
 
5cb623a
9fa0a2a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5cb623a
9fa0a2a
 
2930d08
9fa0a2a
 
5a1ed1a
 
 
9fa0a2a
13b4882
9fa0a2a
 
 
2930d08
9fa0a2a
 
 
 
2930d08
5a1ed1a
 
2930d08
5042d26
 
 
 
 
 
 
 
 
 
 
 
5a1ed1a
 
 
 
 
 
 
 
 
 
 
 
 
 
9e7aeb0
98164ac
ae363d1
13b4882
 
 
 
 
 
 
 
 
 
 
55cb018
13b4882
 
9e7aeb0
531e3ab
 
 
9e7aeb0
5042d26
bf4c978
 
 
5042d26
924bf44
13d31cd
924bf44
9e7aeb0
5a1ed1a
7fa5768
9e7aeb0
 
531e3ab
 
7bb2b6f
 
 
 
 
 
 
 
 
98164ac
 
7bb2b6f
 
 
 
 
 
13b4882
 
7bb2b6f
 
 
 
 
 
 
22898c6
531e3ab
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
# Import required libraries

import numpy as np
from pydub import AudioSegment
import vocode
from vocode import getenv
import gradio as gr
import os
import logging
import sounddevice as sd
from vocode.turn_based.agent.chat_gpt_agent import ChatGPTAgent
from vocode.turn_based.synthesizer import CoquiSynthesizer
from vocode.turn_based.transcriber.whisper_transcriber import WhisperTranscriber
from vocode.turn_based.output_device.speaker_output import SpeakerOutput
# System prompt for the chat agent: sets the Darth Vader persona and lists
# numbered example lines the model may reuse. The examples can end up in the
# spoken reply, so their spelling matters for the synthesized speech.
system_prompt = """
You are the evil lord Darth Vader. You answer all questions your subjects ask you, but do so with immense contempt and loathing.
You may use any of the following lines or take inspiration from this conversational style where appropriate in your response.
Examples:
1|You have failed me for the last time Admiral.
2|You should not have come back.
3|I find your lack of faith disturbing.
4|Release your anger.
5|Commander, tear this ship apart until you've found those plans! And bring me the passengers, I want them alive!
6|The force is strong with this one.
7|If you only knew the power of the Dark Side.
8|It is pointless to resist, my son.
9|Give yourself to the dark side.
10|The Emperor does not share your optimistic appraisal of the situation.
11|Obi-Wan has taught you well.
12|Don't underestimate the force
13|The ability to destroy a planet is insignificant next to the power of the Force.
14|I find your lack of faith disturbing.
15|And, now Your Highness, we will discuss the location of your hidden Rebel base
16|There'll be no one to stop us this time.
17|I am your father.
18|If you only knew the power of the dark side.
19|He will join us or die, master.
20|The emperor is not as forgiving as I am.
21|Indeed you are powerful as the emperor has foreseen.
22|Perhaps you feel you are being treated unfairly?
23|The Force is with you young Skywalker, but you are not a jedi yet.
24|What is thy bidding my master?
25|The Emperor has been expecting you.
26|We would be honored if you would join us.
27|Leave them to me. I will deal with them myself.
28|Your powers are weak, old man.
29|If this is a consular ship, where is the ambassador? Commander, tear this ship apart until you've found those plans. And bring me the passengers - I want them alive!
30|I sense something. A presence I have not felt since...
31|Don't make me destroy you.
32|I've been waiting for you, Obi-Wan. We meet again at last. The circuit is now complete - When I left you, I was but the learner. Now, I am the master.
33|Escape is not his plan. I must face him...alone.
34|Don't get too proud of this technological terror you've constructed.
Answer the question accurately in less than 150 words. Remember you are Darth Vader.
"""


# 1. Set up Vocode
# Both API keys are mandatory: stop at import time if either is unset or empty.
if not (os.getenv("OPENAI_API_KEY") and os.getenv("COQUI_API_KEY")):
    raise EnvironmentError("Required environment variables not set")

# Forward the credentials and the voice id to vocode. COQUI_VOICE_ID is not
# validated above, so it is passed through as None when the variable is unset.
vocode.setenv(
    **{
        name: os.getenv(name)
        for name in ("OPENAI_API_KEY", "COQUI_API_KEY", "COQUI_VOICE_ID")
    }
)

# Configure logging: default root handler, module logger at DEBUG verbosity.
logging.basicConfig()
logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG)

# Sample rate (Hz) used when playing audio on the local output device.
DEFAULT_SAMPLING_RATE = 44_100


def convert_to_audio_segment(input_audio):
    """Build a pydub ``AudioSegment`` from a ``(sample_rate, samples)`` pair.

    Args:
        input_audio: Two-item sequence ``(sample_rate, audio_data)`` where
            ``audio_data`` is a NumPy array of samples, either 1-D (mono) or
            2-D with shape ``(samples, channels)``.

    Returns:
        A 16-bit ``AudioSegment`` with the same sample rate and channel count
        as the input.
    """
    sample_rate, audio_data = input_audio
    audio_data = np.asarray(audio_data)

    if np.issubdtype(audio_data.dtype, np.floating):
        # A bare astype(int16) would truncate float samples to (almost) all
        # zeros. Assumes floats are normalized to [-1.0, 1.0] -- TODO confirm.
        audio_data = np.clip(audio_data, -1.0, 1.0) * np.iinfo(np.int16).max

    audio_data = audio_data.astype(np.int16)  # 16-bit PCM

    # A 2-D array carries one column per channel; its C-order bytes are the
    # interleaved frames AudioSegment expects, so only the count must be set.
    channels = audio_data.shape[1] if audio_data.ndim == 2 else 1

    return AudioSegment(
        audio_data.tobytes(),
        frame_rate=sample_rate,
        sample_width=audio_data.dtype.itemsize,  # 2 bytes for 16-bit audio
        channels=channels,
    )


def send_audio(audio_segment: AudioSegment):
    """Play ``audio_segment`` on the default local output device.

    The segment is converted to mono, 16-bit, ``DEFAULT_SAMPLING_RATE`` audio
    before playback so that it matches the stream configuration.

    Args:
        audio_segment: The audio to play.

    Returns:
        None. The call blocks until the audio has been written to the device.
    """
    sampling_rate = DEFAULT_SAMPLING_RATE

    # The stream below is mono int16 at a fixed rate, so bring the segment to
    # exactly that layout. Otherwise the raw bytes would be misinterpreted
    # (wrong speed for other rates, garbled for stereo or non-16-bit data).
    if audio_segment.frame_rate != sampling_rate:
        audio_segment = audio_segment.set_frame_rate(sampling_rate)
    if audio_segment.channels != 1:
        audio_segment = audio_segment.set_channels(1)
    if audio_segment.sample_width != 2:
        audio_segment = audio_segment.set_sample_width(2)

    samples = np.frombuffer(audio_segment.raw_data, dtype=np.int16)

    # The context manager starts the stream on entry and stops and closes it
    # on exit. A stream must be started before it can be written to, and it
    # should be closed so the device handle is not leaked on every call.
    with sd.OutputStream(
        channels=1,
        samplerate=sampling_rate,
        dtype=np.int16,
        device=None,
    ) as stream:
        stream.write(samples)


def main(input_audio):
    """Run one speech-to-speech turn and return the spoken reply.

    The input is transcribed, the transcript is sent to the chat agent, and
    the agent's answer is synthesized to speech. The result is also played on
    the local output device on a best-effort basis.

    Args:
        input_audio: Either a ``(sample_rate, samples)`` pair or a path to an
            audio file (the upload tab is configured with ``type="filepath"``).
            ``None`` means no audio was provided.

    Returns:
        ``(sample_rate, samples)`` for the generated speech, suitable for a
        Gradio audio output, or ``None`` when there was no input or a step of
        the pipeline failed.
    """
    logger.info("Type of input_audio: %s", type(input_audio))
    logger.info("input_audio: %s", input_audio)

    if input_audio is None:
        logger.warning("No input audio received")
        return None

    transcriber = WhisperTranscriber(api_key=getenv("OPENAI_API_KEY"))

    # Initialize ChatGPTAgent
    agent = ChatGPTAgent(
        system_prompt=system_prompt,
        initial_message="What up",
        api_key=getenv("OPENAI_API_KEY"),
    )

    # Initialize CoquiSynthesizer
    synthesizer = CoquiSynthesizer(
        voice_id=getenv("COQUI_VOICE_ID"),
        api_key=getenv("COQUI_API_KEY"),
    )

    print("Starting conversation. Press Ctrl+C to exit.")
    try:
        # The upload tab hands over a file path rather than raw samples.
        if isinstance(input_audio, (str, os.PathLike)):
            input_audio_segment = AudioSegment.from_file(input_audio)
        else:
            input_audio_segment = convert_to_audio_segment(input_audio)
        logger.info("Input Audio Segment: %s", input_audio_segment)
        logger.info(
            "Type of input_audio_segment: %s", type(input_audio_segment))

        transcript = transcriber.transcribe(input_audio_segment)
        logger.info("Transcription: %s", transcript)
        response = agent.respond(transcript)
        logger.info("Agent response: %s", response)
        output_audio = synthesizer.synthesize(response)
    except Exception:
        # Request-handler boundary: log with traceback and show no output
        # rather than crashing the UI callback.
        logger.exception("Failed to generate a spoken response")
        return None

    # Local playback is optional (the host may have no sound device), so a
    # failure here must not discard the audio returned to the UI.
    try:
        send_audio(output_audio)
    except Exception:
        logger.exception("Local playback failed; returning audio to the UI only")

    samples = np.array(output_audio.get_array_of_samples())
    if output_audio.channels > 1:
        # pydub stores frames interleaved; expose one column per channel.
        samples = samples.reshape(-1, output_audio.channels)
    return output_audio.frame_rate, samples

# Reference: https://huggingface.co/spaces/course-demos/speech-to-speech-translation/blob/main/app.py


# Gradio UI: two tabs (microphone recording / uploaded file) that both call
# `main`, hosted inside a single Blocks app.
demo = gr.Blocks()
title = "Chatty Vader"
description = "Darth Vader resurrected with all the knowledge of humanity"

# Tab 1: record from the microphone.
mic_translate = gr.Interface(
    title=title,
    description=description,
    fn=main,
    inputs=gr.Audio(source="microphone", format="wav"),
    outputs=gr.Audio(label="Generated Speech", format="wav"),
)

# Tab 2: upload an audio file; the handler receives its path.
file_translate = gr.Interface(
    title=title,
    description=description,
    fn=main,
    inputs=gr.Audio(source="upload", type="filepath", format="wav"),
    outputs=gr.Audio(label="Generated Speech", type="filepath", format="wav"),
)

with demo:
    gr.TabbedInterface(
        [mic_translate, file_translate],
        ["Microphone", "Audio File"],
    )

# Start the web server as soon as the module is executed.
demo.launch()