import gradio as gr
import torch
from diffusers import MidiProcessor, SpectrogramDiffusionPipeline

pipe = SpectrogramDiffusionPipeline.from_pretrained("google/music-spectrogram-diffusion")
pipe = pipe.to("cuda" if torch.cuda.is_available() else "cpu")
processor = MidiProcessor()


def predict(midi_file):
    # The processor segments the MIDI file into note sequences; only the
    # first two segments are synthesised to keep the demo responsive.
    output = pipe(processor(midi_file.name)[:2])
    audio = output.audios[0]
    # gr.Audio with type="numpy" expects (sample_rate, frames) or
    # (sample_rate, (frames, channels)); the pipeline outputs 16 kHz audio.
    return (16000, audio.ravel())


title = "Music Spectrogram Diffusion: Multi-instrument Music Synthesis with Spectrogram Diffusion"

description = """
In this work, the authors focus on a middle ground of neural synthesizers that can generate audio
from MIDI sequences with arbitrary combinations of instruments in real time. This enables training
on a wide range of transcription datasets with a single model, which in turn offers note-level
control of composition and instrumentation across many instruments. They use a simple two-stage
process: MIDI to spectrograms with an encoder-decoder Transformer, then spectrograms to audio with
a generative adversarial network (GAN) spectrogram inverter.
"""

gr.Interface(
    fn=predict,
    inputs=[
        gr.File(file_count="single", file_types=[".mid"]),
    ],
    outputs=[
        gr.Audio(label="Synthesised Music", type="numpy"),
    ],
    title=title,
    description=description,
    theme="gstaff/xkcd",
).launch(debug=True)
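
# For reference, a minimal sketch of the same two-stage pipeline without the
# Gradio UI, kept commented out so it does not interfere with the running app.
# It assumes a local MIDI file named "example.mid" (a placeholder) and uses
# scipy's WAV writer, which is an assumption; any WAV writer would do.
#
# import scipy.io.wavfile
# import torch
# from diffusers import MidiProcessor, SpectrogramDiffusionPipeline
#
# pipe = SpectrogramDiffusionPipeline.from_pretrained(
#     "google/music-spectrogram-diffusion"
# ).to("cuda" if torch.cuda.is_available() else "cpu")
# processor = MidiProcessor()
#
# # Stage 1: MIDI -> spectrogram (encoder-decoder Transformer);
# # stage 2: spectrogram -> audio (GAN inverter). Both run inside the pipeline.
# output = pipe(processor("example.mid"))  # "example.mid" is hypothetical
# scipy.io.wavfile.write("synth.wav", 16000, output.audios[0].ravel())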