import os
import tempfile

import gradio as gr
import librosa
import soundfile as sf


def inference(audio_1, audio_2):
    """Mix a vocal/translated track with a background track into one WAV.

    Parameters
    ----------
    audio_1 : str
        Filepath of the vocal (translated) audio clip.
    audio_2 : str
        Filepath of the background audio clip.

    Returns
    -------
    str
        Filepath of the mixed, peak-normalized WAV file.
    """
    # Load both files at their native sample rates (sr=None disables resampling).
    translated_audio, sr_tr = librosa.load(audio_1, sr=None)
    background_audio, sr_bg = librosa.load(audio_2, sr=None)

    # Resample the vocal track to the background's rate if they differ.
    # NOTE: orig_sr/target_sr are keyword-only in librosa >= 0.10 —
    # the old positional form raises TypeError.
    if sr_bg != sr_tr:
        translated_audio = librosa.resample(
            translated_audio, orig_sr=sr_tr, target_sr=sr_bg
        )
    sr = sr_bg  # both signals are now at the background rate

    # Zero-pad the shorter signal so the two can be summed elementwise.
    # `size` is keyword-only in librosa >= 0.10.
    max_len = max(len(background_audio), len(translated_audio))
    background_audio = librosa.util.fix_length(background_audio, size=max_len)
    translated_audio = librosa.util.fix_length(translated_audio, size=max_len)

    # Mix, then peak-normalize to prevent clipping of the summed signal.
    full_audio = librosa.util.normalize(background_audio + translated_audio)

    # Write to a unique temp file instead of a fixed "test.wav" in CWD,
    # so overlapping requests cannot clobber each other's output.
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
        out_path = tmp.name
    sf.write(out_path, full_audio, sr)
    return out_path


title = "음성 합성"

demo = gr.Interface(
    inference,
    [
        gr.Audio(type="filepath", label="Vocals"),
        gr.Audio(type="filepath", label="배경음"),
    ],
    gr.Audio(type="filepath", label="합성 결과"),
    title=title,
)

demo.queue(max_size=1)
demo.launch(debug=True)