import random

import gradio as gr
import librosa
import numpy as np
import torch
from diffusers import MidiProcessor, SpectrogramDiffusionPipeline

# Load the pretrained spectrogram-diffusion pipeline in half precision on the GPU.
pipe = SpectrogramDiffusionPipeline.from_pretrained(
    "google/music-spectrogram-diffusion", torch_dtype=torch.float16
).to("cuda")
pipe.enable_xformers_memory_efficient_attention()
processor = MidiProcessor()

# Gradient color pairs for the rendered waveform video.
COLORS = [
    ["#ff0000", "#00ff00"],
    ["#00ff00", "#0000ff"],
    ["#0000ff", "#ff0000"],
]


def predict(audio_file_pth):
    # Tokenize the uploaded MIDI file and synthesize only its first five segments.
    with torch.inference_mode():
        output = pipe(processor(audio_file_pth.name)[:5])
    audio = output.audios[0]
    # Render the 16 kHz audio as a waveform video with a random color gradient.
    return gr.make_waveform(
        (16000, audio.ravel()),
        bars_color=random.choice(COLORS),
        bar_count=75,
    )


title = "Music Spectrogram Diffusion: Multi-instrument Music Synthesis with Spectrogram Diffusion"
description = """
For faster inference without waiting in the queue, duplicate this Space and upgrade it to a GPU in its settings.
References: [Music Spectrogram Diffusion paper](https://arxiv.org/abs/2206.05408) | [original GitHub](https://github.com/magenta/music-spectrogram-diffusion) | [original weights](https://huggingface.co/google/music-spectrogram-diffusion)
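
To run the same synthesis outside this Space, the pipeline can be called directly (a minimal sketch of the calls this demo makes; `example.mid` is a placeholder for your own MIDI file):

```python
import torch
from diffusers import MidiProcessor, SpectrogramDiffusionPipeline

pipe = SpectrogramDiffusionPipeline.from_pretrained(
    "google/music-spectrogram-diffusion", torch_dtype=torch.float16
).to("cuda")
processor = MidiProcessor()

# Encode the MIDI file into note-sequence segments and synthesize the first five.
output = pipe(processor("example.mid")[:5])
audio = output.audios[0]  # 16 kHz waveform as a NumPy array
```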
@article{hawthorne2022multi,
    title={Multi-instrument music synthesis with spectrogram diffusion},
    author={Hawthorne, Curtis and Simon, Ian and Roberts, Adam and Zeghidour, Neil and Gardner, Josh and Manilow, Ethan and Engel, Jesse},
    journal={arXiv preprint arXiv:2206.05408},
    year={2022}
}