import random

import gradio as gr
import librosa
import numpy as np
import torch
from diffusers import MidiProcessor, SpectrogramDiffusionPipeline

# Load the pretrained spectrogram-diffusion pipeline in half precision on the GPU.
pipe = SpectrogramDiffusionPipeline.from_pretrained(
    "google/music-spectrogram-diffusion", torch_dtype=torch.float16
).to("cuda")
pipe.enable_xformers_memory_efficient_attention()
processor = MidiProcessor()

# Gradient color pairs for the rendered waveform video.
COLORS = [
    ["#ff0000", "#00ff00"],
    ["#00ff00", "#0000ff"],
    ["#0000ff", "#ff0000"],
]


def predict(audio_file_pth):
    # Tokenize the uploaded MIDI file and synthesize only its first five segments.
    with torch.inference_mode():
        output = pipe(processor(audio_file_pth.name)[:5])
    audio = output.audios[0]
    # Render the 16 kHz audio as a waveform video with a random color gradient.
    return gr.make_waveform(
        (16000, audio.ravel()),
        bars_color=random.choice(COLORS),
        bar_count=75,
    )


title = "Music Spectrogram Diffusion: Multi-instrument Music Synthesis with Spectrogram Diffusion"
description = """
For faster inference without waiting in the queue, duplicate this Space and upgrade it to a GPU in its settings.
References: [Music Spectrogram Diffusion paper](https://arxiv.org/abs/2206.05408) | [original GitHub](https://github.com/magenta/music-spectrogram-diffusion) | [original weights](https://huggingface.co/google/music-spectrogram-diffusion)
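
To run the same synthesis outside this Space, the pipeline can be called directly (a minimal sketch of the calls this demo makes; `example.mid` is a placeholder for your own MIDI file):

```python
import torch
from diffusers import MidiProcessor, SpectrogramDiffusionPipeline

pipe = SpectrogramDiffusionPipeline.from_pretrained(
    "google/music-spectrogram-diffusion", torch_dtype=torch.float16
).to("cuda")
processor = MidiProcessor()

# Encode the MIDI file into note-sequence segments and synthesize the first five.
output = pipe(processor("example.mid")[:5])
audio = output.audios[0]  # 16 kHz waveform as a NumPy array
```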
@article{hawthorne2022multi,
    title={Multi-instrument music synthesis with spectrogram diffusion},
    author={Hawthorne, Curtis and Simon, Ian and Roberts, Adam and Zeghidour, Neil and Gardner, Josh and Manilow, Ethan and Engel, Jesse},
    journal={arXiv preprint arXiv:2206.05408},
    year={2022}
}