import gradio as gr
import torch
from diffusers import MidiProcessor, SpectrogramDiffusionPipeline

pipe = SpectrogramDiffusionPipeline.from_pretrained("google/music-spectrogram-diffusion")
pipe = pipe.to("cuda" if torch.cuda.is_available() else "cpu")
processor = MidiProcessor()


def predict(midi_file):
    # The processor segments the MIDI file into note sequences; only the
    # first two segments are synthesised to keep the demo responsive.
    output = pipe(processor(midi_file.name)[:2])
    audio = output.audios[0]
    # gr.Audio with type="numpy" expects (sample_rate, frames) or
    # (sample_rate, (frames, channels)); the pipeline outputs 16 kHz audio.
    return (16000, audio.ravel())


title = "Music Spectrogram Diffusion: Multi-instrument Music Synthesis with Spectrogram Diffusion"

description = """
In this work, the authors focus on a middle ground of neural synthesizers that can generate audio
from MIDI sequences with arbitrary combinations of instruments in real time. This enables training
on a wide range of transcription datasets with a single model, which in turn offers note-level
control of composition and instrumentation across many instruments. They use a simple two-stage
process: MIDI to spectrograms with an encoder-decoder Transformer, then spectrograms to audio with
a generative adversarial network (GAN) spectrogram inverter.
"""

gr.Interface(
    fn=predict,
    inputs=[
        gr.File(file_count="single", file_types=[".mid"]),
    ],
    outputs=[
        gr.Audio(label="Synthesised Music", type="numpy"),
    ],
    title=title,
    description=description,
    theme="gstaff/xkcd",
).launch(debug=True)
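
# For reference, a minimal sketch of the same two-stage pipeline without the
# Gradio UI, kept commented out so it does not interfere with the running app.
# It assumes a local MIDI file named "example.mid" (a placeholder) and uses
# scipy's WAV writer, which is an assumption; any WAV writer would do.
#
# import scipy.io.wavfile
# import torch
# from diffusers import MidiProcessor, SpectrogramDiffusionPipeline
#
# pipe = SpectrogramDiffusionPipeline.from_pretrained(
#     "google/music-spectrogram-diffusion"
# ).to("cuda" if torch.cuda.is_available() else "cpu")
# processor = MidiProcessor()
#
# # Stage 1: MIDI -> spectrogram (encoder-decoder Transformer);
# # stage 2: spectrogram -> audio (GAN inverter). Both run inside the pipeline.
# output = pipe(processor("example.mid"))  # "example.mid" is hypothetical
# scipy.io.wavfile.write("synth.wav", 16000, output.audios[0].ravel())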