import os
import tempfile

import gradio as gr
import librosa
import soundfile as sf


def inference(audio_1, audio_2):
    """Mix a vocal/translated track with a background track into one WAV.

    Parameters
    ----------
    audio_1 : str
        Filepath of the vocal (translated) audio clip.
    audio_2 : str
        Filepath of the background audio clip.

    Returns
    -------
    str
        Filepath of the mixed, peak-normalized WAV file.
    """
    # Load both files at their native sample rates (sr=None disables resampling).
    translated_audio, sr_tr = librosa.load(audio_1, sr=None)
    background_audio, sr_bg = librosa.load(audio_2, sr=None)

    # Resample the vocal track to the background's rate if they differ.
    # NOTE: orig_sr/target_sr are keyword-only in librosa >= 0.10 —
    # the old positional form raises TypeError.
    if sr_bg != sr_tr:
        translated_audio = librosa.resample(
            translated_audio, orig_sr=sr_tr, target_sr=sr_bg
        )
    sr = sr_bg  # both signals are now at the background rate

    # Zero-pad the shorter signal so the two can be summed elementwise.
    # `size` is keyword-only in librosa >= 0.10.
    max_len = max(len(background_audio), len(translated_audio))
    background_audio = librosa.util.fix_length(background_audio, size=max_len)
    translated_audio = librosa.util.fix_length(translated_audio, size=max_len)

    # Mix, then peak-normalize to prevent clipping of the summed signal.
    full_audio = librosa.util.normalize(background_audio + translated_audio)

    # Write to a unique temp file instead of a fixed "test.wav" in CWD,
    # so overlapping requests cannot clobber each other's output.
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
        out_path = tmp.name
    sf.write(out_path, full_audio, sr)
    return out_path


title = "음성 합성"

demo = gr.Interface(
    inference,
    [
        gr.Audio(type="filepath", label="Vocals"),
        gr.Audio(type="filepath", label="배경음"),
    ],
    gr.Audio(type="filepath", label="합성 결과"),
    title=title,
)

demo.queue(max_size=1)
demo.launch(debug=True)