# Hugging Face Space (page status when captured: Running)
# Standard library
import json
import os
import tempfile
import uuid
import warnings

# Third party
import gradio as gr
import numpy as np
from gradio_client import Client, FileData, handle_file

# Silence every Python warning from this point on.  scipy is imported after
# the filter is installed (same statement order as before), so warnings
# raised while importing it are suppressed as well.
warnings.filterwarnings("ignore")

import scipy.io.wavfile as wavfile  # noqa: E402

# Address of the backend app, read from the `src` environment variable.
# Fail with an explicit message instead of a bare ``KeyError: 'src'``.
try:
    _backend_src = os.environ["src"]
except KeyError:
    raise KeyError(
        "Environment variable 'src' is not set; it must hold the backend "
        "Space name or URL passed to gradio_client.Client"
    ) from None

# Single shared connection used by every handler in this module.
client = Client(_backend_src)
def create_frontend_demo():
    """Build the Gradio Blocks UI that proxies chat and audio to the backend.

    Every handler forwards its inputs to the module-level ``client`` and maps
    the reply onto the local components; no model work happens here.

    NOTE(review): the container nesting below (which components sit inside
    the ``Row`` / ``Column``) should be checked against the deployed layout.

    Returns:
        The assembled ``gr.Blocks`` app (not yet launched).
    """

    def chat_function(message, history, session_id):
        """Send one text message to the backend.

        Args:
            message: Text typed by the user.
            history: Current chatbot history, forwarded unchanged.
            session_id: Session identifier; a fresh one is generated when it
                is empty.

        Returns:
            ``("", new_history, audio_path, session_id, display_text)`` in
            the order expected by the ``msg.submit`` outputs.
        """
        if not session_id:
            # No session yet: mint a short random identifier.
            session_id = "user_" + uuid.uuid4().hex[:8]

        # The backend returns: empty_string, history, audio_path, display_text
        _, new_history, audio_path, display_text = client.predict(
            message,
            history,
            session_id,
            fn_index=0,
        )
        # For audio, the path string is returned directly.
        return "", new_history, audio_path, session_id, display_text

    # NOTE(review): the version part of this theme string looks mangled by
    # e-mail obfuscation ("[email protected]"); confirm the real
    # "<theme>@<version>" identifier before deploying.
    with gr.Blocks(theme="Respair/[email protected]") as demo:
        session_id_state = gr.State("")

        with gr.Tabs():
            with gr.Tab("Chat"):
                session_display = gr.Markdown(
                    "Current Session ID: None",
                    label="Session ID",
                )
                chatbot = gr.Chatbot(
                    label="Conversation History",
                    height=400,
                    avatar_images=[
                        "photo_2024-03-01_22-30-42.jpg",
                        "colored_blured.png",
                    ],
                    placeholder="Start chatting with Aira...",
                )
                msg = gr.Textbox(
                    show_label=False,
                    placeholder="Enter text and press enter",
                    container=False,
                )
                audio_output = gr.Audio(
                    label="Aira's Response",
                    type="filepath",
                    streaming=False,
                    autoplay=True,
                )
                with gr.Row():
                    audio_input = gr.Audio(
                        sources=["microphone"],
                        type="numpy",
                        label="Audio Input",
                        streaming=False,
                    )

            with gr.Tab("Options"):
                with gr.Column():
                    session_input = gr.Textbox(
                        value="",
                        label="Session ID (leave blank for new session)",
                    )
                    gen_id_btn = gr.Button("Set Session ID")
                    session_msg = gr.Markdown("")
                    clear_btn = gr.Button("Clear Conversation")
                    gr.Markdown("""
                    This is a personal project I wanted to do for a while (G̶o̶t̶t̶a̶ ̶m̶a̶k̶e̶ ̶u̶s̶e̶ ̶o̶f̶ ̶t̶h̶i̶s̶ ̶P̶r̶o̶ ̶s̶u̶b̶ ̶p̶e̶r̶k̶s̶ ̶w̶h̶i̶l̶e̶ ̶I̶ ̶h̶a̶v̶e̶ ̶i̶t̶). <br>
                    Aira's voice is made to be unique, it doesn't belong to any real person out there. <br>
                    You can talk to her in English or Japanese, but she will only respond in Japanese (Subs over dubs, bros) ask her to give you a Subtitle if you can't talk in Japanese. <br>
                    The majority of the latency depends on the HF's inference api.
                    LLM is not fine-tuned or optimized at all. the current state of conversational off-the-shelf japanese LLM seem to be less than remarkable, please beware of that.
                    1. Enter your Session ID above or leave blank for a new one
                    2. Click 'Set Session ID' to confirm
                    3. Use 'Clear Conversation' to reset the chat
                    4. Your conversation history is saved based on your Session ID
                    I'll try to keep this demo up for a while.
                    """)

        def respond(message, chat_history, session_id):
            """Textbox submit handler; delegates to ``chat_function``."""
            return chat_function(message, chat_history, session_id)

        msg.submit(
            respond,
            inputs=[msg, chatbot, session_id_state],
            outputs=[msg, chatbot, audio_output, session_id_state, session_display],
        )

        def set_session(user_id):
            """Ask the backend to set the session id entered on the Options tab.

            Returns:
                ``(new_id, "", display_text)`` — the id to store, an empty
                status message, and the text for the session display.
            """
            new_id, display_text = client.predict(
                user_id,
                fn_index=1,
            )
            return new_id, "", display_text

        gen_id_btn.click(
            set_session,
            inputs=[session_input],
            outputs=[session_id_state, session_msg, session_display],
        )

        def handle_audio(audio_data, history, session_id):
            """Upload a finished microphone recording to the backend.

            Args:
                audio_data: ``(sample_rate, samples)`` as produced by the
                    ``type="numpy"`` audio input, or ``None`` when nothing
                    was recorded.
                history: Current chatbot history, forwarded unchanged.
                session_id: Current session identifier.

            Returns:
                ``(audio_path, history, session_id, display_text)``.  On any
                failure the previous history and session id are returned
                with an error message and no audio.
            """
            if audio_data is None:
                return None, history, session_id, f"Current Session ID: {session_id}"

            try:
                sample_rate, audio_array = audio_data

                # Write the WAV into a private temporary directory rather
                # than re-opening a still-open NamedTemporaryFile by name,
                # which is not possible on every platform (e.g. Windows).
                with tempfile.TemporaryDirectory() as tmp_dir:
                    wav_path = os.path.join(tmp_dir, "recording.wav")
                    wavfile.write(wav_path, sample_rate, audio_array)

                    # The request must finish while the file still exists.
                    result = client.predict(
                        handle_file(wav_path),
                        history,
                        session_id,
                        api_name="/handle_audio",
                    )

                # The backend returns 3 values; the display text is built here.
                audio_path, new_history, new_session_id = result
                display_text = f"Current Session ID: {new_session_id}"
                return audio_path, new_history, new_session_id, display_text
            except Exception as e:
                # UI boundary: log the full traceback and keep the app usable.
                import traceback

                print(f"Error processing audio: {str(e)}")
                traceback.print_exc()
                return (
                    None,
                    history,
                    session_id,
                    f"Error processing audio. Session ID: {session_id}",
                )

        audio_input.stop_recording(
            handle_audio,
            inputs=[audio_input, chatbot, session_id_state],
            outputs=[audio_output, chatbot, session_id_state, session_display],
        )

        # Clears only the chat history shown in this UI.
        clear_btn.click(
            lambda: [],
            None,
            [chatbot],
        )

    return demo
if __name__ == "__main__":
    # Script entry point: build the UI, then serve it.
    demo = create_frontend_demo()
    demo.launch(
        server_name="0.0.0.0",  # bind address
        server_port=7861,
        show_error=True,
    )