import random

import gradio as gr
import torch

from diffusers import SpectrogramDiffusionPipeline, MidiProcessor

# Load the pretrained MIDI-to-audio pipeline in half precision on the GPU and
# enable memory-efficient attention to keep VRAM usage down.
pipe = SpectrogramDiffusionPipeline.from_pretrained(
    "google/music-spectrogram-diffusion", torch_dtype=torch.float16
).to("cuda")
pipe.enable_xformers_memory_efficient_attention()
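
# Portability note (a sketch, not used by this Space): on hardware without a
# CUDA GPU, the pipeline could instead be loaded in full precision on the CPU,
# at a large cost in inference speed:
#
#   pipe = SpectrogramDiffusionPipeline.from_pretrained(
#       "google/music-spectrogram-diffusion", torch_dtype=torch.float32
#   ).to("cpu")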

# Converts a MIDI file into the note-sequence segments the pipeline expects.
processor = MidiProcessor()

# Colour gradients used for the bars of the rendered waveform video.
COLORS = [
    ["#ff0000", "#00ff00"],
    ["#00ff00", "#0000ff"],
    ["#0000ff", "#ff0000"],
]

def predict(audio_file_pth):
    # gr.File hands the function a tempfile-like object; .name is its path on
    # disk. Only the first five note segments are synthesised to keep latency
    # manageable.
    with torch.inference_mode():
        output = pipe(processor(audio_file_pth.name)[:5])
        audio = output.audios[0]

    # The pipeline generates 16 kHz audio; render it as a waveform video.
    return gr.make_waveform((16000, audio.ravel()), bars_color=random.choice(COLORS), bar_count=75)
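

# Optional offline helper (a sketch, not wired into the UI): synthesise a MIDI
# file end to end and save the result as a WAV. Assumes scipy is available in
# the environment; useful for debugging the pipeline without launching Gradio.
def synthesize_to_wav(midi_path, out_path="out.wav"):
    import scipy.io.wavfile

    with torch.inference_mode():
        output = pipe(processor(midi_path)[:5])
    audio = output.audios[0].ravel()
    scipy.io.wavfile.write(out_path, 16000, audio)
    return out_path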


title = "Music Spectrogram Diffusion: Multi-instrument Music Synthesis with Spectrogram Diffusion"

description = """
In this work, the authors focus on a middle ground of neural synthesizers that can generate audio from MIDI sequences with arbitrary combinations of instruments in real time.
This enables training on a wide range of transcription datasets with a single model, which in turn offers note-level control of composition and instrumentation across many instruments.

They use a simple two-stage process: MIDI to spectrograms with an encoder-decoder Transformer, then spectrograms to audio with a generative adversarial network (GAN) spectrogram inverter.
"""

examples = ["examples/beethoven_mond_2.mid", "examples/beethoven_hammerklavier_2.mid"]

# A bare gr.HTML() call outside a Blocks context is never rendered, so the
# duplicate-Space notice is appended to the Interface description instead.
description += """
<p>For faster inference without waiting in the queue, you can duplicate this Space and upgrade to a GPU in the settings.
<br/>
<a href="https://huggingface.co/spaces/reach-vb/music-spectrogram-diffusion?duplicate=true">
<img style="margin-top: 0em; margin-bottom: 0em" src="https://bit.ly/3gLdBN6" alt="Duplicate Space"></a>
</p>"""

article = """
<div style='margin:20px auto;'>
<p>References: <a href="https://arxiv.org/abs/2206.05408">Music Spectrogram Diffusion paper</a> |
<a href="https://github.com/magenta/music-spectrogram-diffusion">original GitHub</a> |
<a href="https://github.com/magenta/music-spectrogram-diffusion#pretrained-models">original weights</a></p>
<pre>
@article{hawthorne2022multi,
  title={Multi-instrument music synthesis with spectrogram diffusion},
  author={Hawthorne, Curtis and Simon, Ian and Roberts, Adam and Zeghidour, Neil and Gardner, Josh and Manilow, Ethan and Engel, Jesse},
  journal={arXiv preprint arXiv:2206.05408},
  year={2022}
}
</pre>
</div>
"""

gr.Interface(
    fn=predict,
    inputs=[
        gr.File(label="Upload MIDI", file_count="single", file_types=[".mid"]),
    ],
    outputs=[
        gr.Video(label="Synthesised Music"),
    ],
    title=title,
    description=description,
    theme="gradio/monochrome",
    examples=examples,
    article=article,
).launch(debug=True)