Spaces:
Running
on
T4
Running
on
T4
from __future__ import annotations | |
import os | |
import gradio as gr | |
import numpy as np | |
import torch | |
import torchaudio | |
from seamless_communication.models.inference.translator import Translator | |
from m4t_app import * | |
from pydub import AudioSegment | |
import time | |
from time import sleep | |
# m4t_demo() | |
# Feature flag read by blocks(): when True the microphone input is created
# without the format="mp3" argument; when False that argument is added.
USE_M4T = True
def translate_audio_file_segment(
    audio_file,
    source_language="English",
    target_language="Portuguese",
):
    """Translate one recorded audio segment via the ``predict`` helper.

    Runs the speech-to-speech task ("S2ST") with ``audio_file`` supplied as
    the microphone input; no uploaded file or text input is passed.

    Args:
        audio_file: The recorded segment, forwarded unchanged to ``predict``
            as ``input_audio_mic``.
        source_language: Name of the language spoken in ``audio_file``.
            Defaults to "English", the value that used to be hard-coded.
        target_language: Name of the language to translate into. Defaults to
            "Portuguese", the value that used to be hard-coded.

    Returns:
        Whatever ``predict`` returns. The caller in this file unpacks it as
        ``(translated_audio, translated_text)``.
    """
    print("translate_m4t state")
    # NOTE(review): `predict` is presumably provided by the `m4t_app` star
    # import at the top of the file -- confirm.
    return predict(
        task_name="S2ST",
        audio_source="microphone",
        input_audio_mic=audio_file,
        input_audio_file=None,
        input_text="",
        source_language=source_language,
        target_language=target_language,
    )
def translate_m4t_callback(
    audio_file, translated_audio_bytes_state, translated_text_state
):
    """Translate the latest recording and fold it into the running state.

    Args:
        audio_file: Latest input recording; passed to
            ``translate_audio_file_segment`` and echoed back as the first
            output.
        translated_audio_bytes_state: Combined translated audio so far. A
            tuple is treated as accumulated audio (presumably
            ``(sample_rate, samples)`` -- confirm against ``predict``); any
            other value (``None``, empty bytes) means nothing has been
            accumulated yet.
        translated_text_state: Translated text accumulated so far.

    Returns:
        A six-item list: the input recording, the newly translated segment,
        the combined audio, the combined text, and then the combined audio
        and combined text again. The last two entries are the updated state
        values; the two before them are the same values for display.
    """
    translated_wav_segment, translated_text = translate_audio_file_segment(audio_file)
    print('translated_audio_bytes_state', translated_audio_bytes_state)
    print('translated_wav_segment', translated_wav_segment)

    if not isinstance(translated_audio_bytes_state, tuple):
        # Nothing accumulated yet: the new segment becomes the combined audio.
        translated_audio_bytes_state = translated_wav_segment
    elif translated_wav_segment is not None:
        # Keep the first element of the existing state and append the new
        # samples. np.append flattens both operands, as before.
        translated_audio_bytes_state = (
            translated_audio_bytes_state[0],
            np.append(translated_audio_bytes_state[1], translated_wav_segment[1]),
        )
    # else: no audio came back for this segment; keep the combined audio as is
    # instead of failing on `None[1]`.

    # Only insert the separator between entries, so the first translation is
    # not prefixed with a stray " | ".
    segment_text = str(translated_text)
    if translated_text_state:
        translated_text_state = f"{translated_text_state} | {segment_text}"
    else:
        translated_text_state = segment_text

    return [
        audio_file,
        translated_wav_segment,
        translated_audio_bytes_state,
        translated_text_state,
        translated_audio_bytes_state,
        translated_text_state,
    ]
def clear():
    """Return fresh values for the two session states.

    Returns:
        A two-item list: an empty ``bytes`` object for the combined-audio
        state and an empty string for the combined-text state.
    """
    print("Clearing State")
    empty_audio, empty_text = b"", ""
    return [empty_audio, empty_text]
def blocks():
    """Build the translation demo UI, wire its events, and launch it.

    Creates two session states (combined translated audio and combined
    translated text), a streaming microphone input, an audio component that
    echoes the most recent input segment, a translate button, two translated
    audio outputs (latest segment and combined), and a text box. The button
    click and input-audio change events both run ``translate_m4t_callback``;
    clearing the input or starting a recording runs ``clear`` to reset both
    states. Finally the app is queued and launched.
    """
    # The two microphone configurations differ only in the mp3 format request.
    mic_options = {
        "label": "Input Audio",
        "type": "filepath",
        "source": "microphone",
        "streaming": True,
    }
    if not USE_M4T:
        mic_options["format"] = "mp3"

    # Settings shared by both translated-audio outputs.
    playback_options = {"autoplay": False, "streaming": True, "type": "numpy"}

    with gr.Blocks() as demo:
        translated_audio_bytes_state = gr.State(None)
        translated_text_state = gr.State("")

        input_audio = gr.Audio(**mic_options)
        most_recent_input_audio_segment = gr.Audio(
            label="Recent Input Audio Segment segments",
            format="bytes",
            streaming=True,
        )
        # TODO: Should add combined input audio segments...
        stream_as_bytes_btn = gr.Button("Translate most recent recording segment")
        output_translation_segment = gr.Audio(
            label="Translated audio segment", **playback_options
        )
        output_translation_combined = gr.Audio(
            label="Translated audio combined", **playback_options
        )
        # Could add output text segment
        stream_output_text = gr.Textbox(label="Translated text")

        session_state = [translated_audio_bytes_state, translated_text_state]
        translate_inputs = [input_audio, *session_state]
        translate_outputs = [
            most_recent_input_audio_segment,
            output_translation_segment,
            output_translation_combined,
            stream_output_text,
            *session_state,
        ]

        # Same callback and wiring for the button click and the input change;
        # each registration gets its own list objects, as in the original.
        for register in (stream_as_bytes_btn.click, input_audio.change):
            register(
                translate_m4t_callback,
                list(translate_inputs),
                list(translate_outputs),
            )

        # Clearing the input or starting a new recording resets both states.
        for register in (input_audio.clear, input_audio.start_recording):
            register(clear, None, list(session_state))

    demo.queue().launch()
# NOTE(review): the `if __name__ == "__main__":` guard below is commented out,
# so the UI is built and launched whenever this module is executed or imported.
# if __name__ == "__main__":
blocks()