import random

import gradio as gr
import torch
from diffusers import MidiProcessor, SpectrogramDiffusionPipeline

# Load the MIDI-to-spectrogram diffusion pipeline in half precision on the GPU.
pipe = SpectrogramDiffusionPipeline.from_pretrained(
    "google/music-spectrogram-diffusion", torch_dtype=torch.float16
).to("cuda")
pipe.enable_xformers_memory_efficient_attention()
processor = MidiProcessor()

# Gradient colour pairs used for the rendered waveform video.
COLORS = [
    ["#ff0000", "#00ff00"],
    ["#00ff00", "#0000ff"],
    ["#0000ff", "#ff0000"],
]


def predict(midi_file):
    # Tokenise the uploaded MIDI file and synthesise its first two segments.
    with torch.inference_mode():
        output = pipe(processor(midi_file.name)[:2])
    audio = output.audios[0]
    # Render the 16 kHz audio as a waveform video for the gr.Video output.
    return gr.make_waveform(
        (16000, audio.ravel()),
        bars_color=random.choice(COLORS),
        bar_count=75,
    )


title = "Music Spectrogram Diffusion: Multi-instrument Music Synthesis with Spectrogram Diffusion"

description = """
In this work, the authors focus on a middle ground of neural synthesizers that can generate
audio from MIDI sequences with arbitrary combinations of instruments in real time. This enables
training on a wide range of transcription datasets with a single model, which in turn offers
note-level control of composition and instrumentation across a wide range of instruments. They
use a simple two-stage process: MIDI to spectrograms with an encoder-decoder Transformer, then
spectrograms to audio with a generative adversarial network (GAN) spectrogram inverter.
"""

examples = ["examples/beethoven_mond_2.mid", "examples/beethoven_hammerklavier_2.mid"]

gr.Interface(
    fn=predict,
    inputs=[
        gr.File(label="Upload MIDI", file_count="single", file_types=[".mid"]),
    ],
    outputs=[
        gr.Video(label="Synthesised Audio"),
    ],
    title=title,
    description=description,
    theme="gradio/monochrome",
    examples=examples,
).launch(debug=True)
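
# Sketch (not part of the Space): the same two-stage synthesis can be driven
# directly, without the Gradio UI, to render a MIDI file to a 16 kHz waveform.
# The scipy import, the input path, and the output filename below are
# illustrative assumptions, not part of the original demo.
#
#     import scipy.io.wavfile
#
#     with torch.inference_mode():
#         output = pipe(processor("examples/beethoven_mond_2.mid"))
#     scipy.io.wavfile.write("rendered.wav", 16000, output.audios[0].ravel())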