Spaces:

espnet
/

cvss-c_es-en_s2st

Running

App Files Files Community

cvss-c_es-en_s2st / app.py

tjysdsg

Try to implement s2st

4defacc over 1 year ago

raw

history blame

4.47 kB

	import os
	import gradio as gr
	import numpy as np
	import torch
	import torchaudio
	from typing import Tuple, Optional
	import soundfile as sf
	from s2st_inference import s2st_inference

	# Sample rate (Hz) that loaded input audio is resampled to before inference.
	SAMPLE_RATE = 16000
	# Inputs longer than this many seconds are truncated (with a UI warning).
	MAX_INPUT_LENGTH = 60 # seconds

	# Hub repository id of the translation model and the local directory it is
	# downloaded into.
	S2UT_TAG = 'espnet/jiyang_tang_cvss-c_es-en_discrete_unit'
	S2UT_DIR = 'model'
	# Hub repository id of the vocoder and the local directory it is
	# downloaded into.
	VOCODER_TAG = 'espnet/cvss-c_en_wavegan_hubert_vocoder'
	VOCODER_DIR = 'vocoder'


	def download_model(tag: str, out_dir: str):
	    """Download a Hugging Face Hub repository snapshot into a local directory.

	    Args:
	        tag: Repository id on the Hub, forwarded as ``repo_id``.
	        out_dir: Target directory, forwarded as ``local_dir``.

	    Returns:
	        The value returned by ``snapshot_download`` (callers use it as the
	        path of the downloaded snapshot).
	    """
	    # Function-scope import: the hub client is only needed in this helper.
	    import huggingface_hub

	    snapshot_path = huggingface_hub.snapshot_download(
	        repo_id=tag,
	        local_dir=out_dir,
	    )
	    return snapshot_path


	def s2st(
	    audio_source: str,
	    input_audio_mic: Optional[str],
	    input_audio_file: Optional[str],
	):
	    """Translate the selected input recording and save the result as a WAV file.

	    Args:
	        audio_source: ``'file'`` selects ``input_audio_file``; any other value
	            selects ``input_audio_mic``.
	        input_audio_mic: Path of the microphone recording, or ``None``.
	        input_audio_file: Path of the uploaded audio file, or ``None``.

	    Returns:
	        A tuple ``(output_path, text)``: the path of the written WAV file and
	        the string ``'Source: <audio_source>'``.

	    Raises:
	        gr.Error: If the selected input is ``None`` (nothing uploaded or
	            recorded).
	    """
	    input_path = input_audio_file if audio_source == 'file' else input_audio_mic

	    if input_path is None:
	        # gr.Error is an exception: it only reaches the user when raised.
	        raise gr.Error(
	            "No input audio provided. Please upload a file or record from the microphone."
	        )

	    orig_wav, orig_sr = torchaudio.load(input_path)
	    wav = torchaudio.functional.resample(orig_wav, orig_freq=orig_sr, new_freq=SAMPLE_RATE)
	    max_length = int(MAX_INPUT_LENGTH * SAMPLE_RATE)
	    if wav.shape[1] > max_length:
	        wav = wav[:, :max_length]
	        gr.Warning(f"Input audio is too long. Truncated to {MAX_INPUT_LENGTH} seconds.")

	    wav = wav[0]  # mono: keep only the first channel

	    # Download models
	    os.makedirs(S2UT_DIR, exist_ok=True)
	    os.makedirs(VOCODER_DIR, exist_ok=True)
	    s2ut_path = download_model(S2UT_TAG, S2UT_DIR)
	    vocoder_path = download_model(VOCODER_TAG, VOCODER_DIR)

	    # Both the training config and the checkpoint live in this directory.
	    exp_dir = os.path.join(
	        s2ut_path,
	        'exp',
	        's2st_train_s2st_discrete_unit_raw_fbank_es_en',
	    )

	    # Temporarily change cwd to the model dir so that it loads correctly.
	    # The try/finally guarantees the working directory is restored even when
	    # inference fails, so later requests are not left in the model dir.
	    cwd = os.getcwd()
	    os.chdir(s2ut_path)
	    try:
	        # Translate wav
	        out_wav = s2st_inference(
	            wav,
	            train_config=os.path.join(exp_dir, 'config.yaml'),
	            model_file=os.path.join(exp_dir, '500epoch.pth'),
	            vocoder_file=os.path.join(vocoder_path, 'checkpoint-400000steps.pkl'),
	            vocoder_config=os.path.join(vocoder_path, 'config.yml'),
	        )
	    finally:
	        # Restore working directory
	        os.chdir(cwd)

	    # Save result
	    # NOTE(review): the output rate is hard-coded; presumably it is the
	    # vocoder's sample rate -- confirm against the vocoder config.
	    output_path = 'output.wav'
	    sf.write(
	        output_path,
	        out_wav,
	        16000,
	        "PCM_16",
	    )

	    return output_path, f'Source: {audio_source}'


	def update_audio_ui(audio_source: str) -> Tuple[dict, dict]:
	    """Build the visibility updates for the two audio input widgets.

	    Args:
	        audio_source: The selected source; ``"microphone"`` shows the
	            microphone widget, any other value shows the file widget.

	    Returns:
	        A pair of ``gr.update`` results, microphone widget first and file
	        widget second. Both also reset the widget value to ``None``.
	    """
	    show_mic = audio_source == "microphone"
	    show_file = not show_mic

	    mic_update = gr.update(visible=show_mic, value=None)
	    file_update = gr.update(visible=show_file, value=None)
	    return mic_update, file_update


	def main():
	"""Build the Gradio Blocks interface, wire its events, and launch it."""
	with gr.Blocks() as demo:
	with gr.Group():
	# Input widgets: a source selector plus one audio input per source.
	# Only the file upload input is visible initially.
	with gr.Row() as audio_box:
	audio_source = gr.Radio(
	label="Audio source",
	choices=["file", "microphone"],
	value="file",
	)
	input_audio_mic = gr.Audio(
	label="Input speech",
	type="filepath",
	source="microphone",
	visible=False,
	)
	input_audio_file = gr.Audio(
	label="Input speech",
	type="filepath",
	source="upload",
	visible=True,
	)

	btn = gr.Button("Translate")

	# Output widgets filled by the s2st callback.
	with gr.Column():
	output_audio = gr.Audio(
	label="Translated speech",
	autoplay=False,
	streaming=False,
	type="numpy",
	)
	output_text = gr.Textbox(label="Translated text")

	# When the source selection changes, update the visibility of the two
	# audio inputs and clear their values (see update_audio_ui).
	audio_source.change(
	fn=update_audio_ui,
	inputs=audio_source,
	outputs=[
	input_audio_mic,
	input_audio_file,
	],
	queue=False,
	api_name=False,
	)

	# Run the translation when the button is clicked; registered under the
	# API name "run".
	btn.click(
	fn=s2st,
	inputs=[
	audio_source,
	input_audio_mic,
	input_audio_file,
	],
	outputs=[output_audio, output_text],
	api_name="run",
	)

	# Enable the request queue (max_size=50) and start the app.
	demo.queue(max_size=50).launch()


	# Start the demo only when this file is executed as a script.
	if __name__ == '__main__':
	main()