import gradio as gr
import torch
import random

from diffusers import SpectrogramDiffusionPipeline, MidiProcessor

# Load the pretrained spectrogram diffusion pipeline in half precision on the GPU;
# memory-efficient attention requires xformers to be installed.
pipe = SpectrogramDiffusionPipeline.from_pretrained(
    "google/music-spectrogram-diffusion", torch_dtype=torch.float16
).to("cuda")
pipe.enable_xformers_memory_efficient_attention()

# MidiProcessor tokenizes a MIDI file into note-event segments the pipeline can consume.
processor = MidiProcessor()

# Bar-color pairs used to style the rendered waveform video.
COLORS = [
    ["#ff0000", "#00ff00"],
    ["#00ff00", "#0000ff"],
    ["#0000ff", "#ff0000"],
]

def predict(audio_file_pth):
    # Tokenize the uploaded MIDI file and keep only the first five segments
    # to bound inference time on a single GPU.
    with torch.inference_mode():
        output = pipe(processor(audio_file_pth.name)[:5])
        audio = output.audios[0]

    # Render the generated 16 kHz audio as a waveform video for the gr.Video output.
    return gr.make_waveform(
        (16000, audio.ravel()),
        bars_color=random.choice(COLORS),
        bar_count=75,
    )


title = "Music Spectrogram Diffusion: Multi-instrument Music Synthesis with Spectrogram Diffusion"

description = """
<p>For faster inference without waiting in the queue, duplicate this Space and upgrade it to a GPU in the settings.
<br/>
<a href="https://huggingface.co/spaces/reach-vb/music-spectrogram-diffusion?duplicate=true">
<img style="margin-top: 0em; margin-bottom: 0em" src="https://bit.ly/3gLdBN6" alt="Duplicate Space"></a>
</p>

In this work, the authors focus on a middle ground of neural synthesizers that can generate audio from MIDI sequences with arbitrary combinations of instruments in real time.
This makes it possible to train a single model on a wide range of transcription datasets, which in turn offers note-level control of composition and instrumentation across a wide range of instruments.

They use a simple two-stage process: an encoder-decoder Transformer maps MIDI to spectrograms, and a generative adversarial network (GAN) spectrogram inverter then maps spectrograms to audio.
"""

examples = ["examples/beethoven_mond_2.mid", "examples/beethoven_hammerklavier_2.mid"]


article = """
<div style='margin:20px auto;'>
<p>References: <a href="https://arxiv.org/abs/2206.05408">Music Spectrogram Diffusion paper</a> |
<a href="https://github.com/magenta/music-spectrogram-diffusion">original GitHub</a> |
<a href="https://github.com/magenta/music-spectrogram-diffusion#pretrained-models">original weights</a></p>
<pre>
@article{hawthorne2022multi,
  title={Multi-instrument music synthesis with spectrogram diffusion},
  author={Hawthorne, Curtis and Simon, Ian and Roberts, Adam and Zeghidour, Neil and Gardner, Josh and Manilow, Ethan and Engel, Jesse},
  journal={arXiv preprint arXiv:2206.05408},
  year={2022}
}
</pre>
</div>
"""

gr.Interface(
    fn=predict,
    inputs=[
        gr.File(label="Upload MIDI", file_count="single", file_types=[".mid"]),
    ],
    outputs=[
        gr.Video(label="Synthesised Music"),
    ],
    title=title,
    description=description,
    theme="gradio/monochrome",
    examples=examples,
    article=article,
).launch(debug=True)