import gradio as gr
import librosa
import numpy as np
import torch
import random

from diffusers import SpectrogramDiffusionPipeline, MidiProcessor

# Load the pretrained spectrogram diffusion pipeline in fp16 on the GPU.
pipe = SpectrogramDiffusionPipeline.from_pretrained(
    "google/music-spectrogram-diffusion", torch_dtype=torch.float16
).to("cuda")
pipe.enable_xformers_memory_efficient_attention()

# MidiProcessor tokenizes a MIDI file into a list of note-sequence segments.
processor = MidiProcessor()

# Gradient color pairs for the rendered waveform video.
COLORS = [
    ["#ff0000", "#00ff00"],
    ["#00ff00", "#0000ff"],
    ["#0000ff", "#ff0000"],
]


def predict(audio_file_pth):
    with torch.inference_mode():
        # Synthesize only the first 5 segments of the MIDI file to keep inference time bounded.
        output = pipe(processor(audio_file_pth.name)[:5])
    audio = output.audios[0]
    # The pipeline outputs 16 kHz audio; render it as a waveform video.
    return gr.make_waveform(
        (16000, audio.ravel()),
        bars_color=random.choice(COLORS),
        bar_count=75,
    )


title = "Music Spectrogram Diffusion: Multi-instrument Music Synthesis with Spectrogram Diffusion"

description = """
In this work, the authors focus on a middle ground of neural synthesizers that can generate audio
from MIDI sequences with arbitrary combinations of instruments in real time. This enables training
on a wide range of transcription datasets with a single model, which in turn offers note-level
control of composition and instrumentation across a wide range of instruments. They use a simple
two-stage process: MIDI to spectrograms with an encoder-decoder Transformer, then spectrograms to
audio with a generative adversarial network (GAN) spectrogram inverter.
"""

examples = ["examples/beethoven_mond_2.mid", "examples/beethoven_hammerklavier_2.mid"]

gr.HTML("""
<p>
  For faster inference without waiting in the queue, you can duplicate this Space and upgrade to a GPU in the settings.
</p>
<p>
  References: Music Spectrogram Diffusion paper | original GitHub | original weights
</p>
<pre>
@article{hawthorne2022multi,
  title={Multi-instrument music synthesis with spectrogram diffusion},
  author={Hawthorne, Curtis and Simon, Ian and Roberts, Adam and Zeghidour, Neil and Gardner, Josh and Manilow, Ethan and Engel, Jesse},
  journal={arXiv preprint arXiv:2206.05408},
  year={2022}
}
</pre>
""")
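
# The original Space's interface layout and launch code is not included above, so the wiring
# below is only a minimal sketch of how the pieces already defined (predict, title, description,
# examples) are typically combined in a gr.Interface. The component choices (gr.File input,
# gr.Video output for the waveform video returned by gr.make_waveform) are assumptions, not the
# original author's code; the footer HTML above is presumably rendered by the original layout
# that is not shown here.
demo = gr.Interface(
    fn=predict,
    inputs=gr.File(label="MIDI file (.mid)"),
    outputs=gr.Video(label="Synthesized audio"),
    title=title,
    description=description,
    examples=examples,
)

if __name__ == "__main__":
    demo.launch()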