File size: 1,549 Bytes
0291473
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
431cf64
7c54836
 
 
 
0a716a3
431cf64
 
 
 
 
 
 
4518a48
431cf64
ba020f3
5d8dc18
431cf64
 
 
4518a48
431cf64
 
 
 
 
709af2a
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
import gradio as gr
from diffusers import AudioLDMControlNetPipeline, ControlNetModel
from pretty_midi import PrettyMIDI
import torch

# Run on the GPU in half precision when CUDA is available; otherwise fall
# back to the CPU in full precision.
device = "cuda" if torch.cuda.is_available() else "cpu"
torch_dtype = torch.float16 if device == "cuda" else torch.float32

# Load the ControlNet checkpoint, attach it to the AudioLDM pipeline, and move
# the assembled pipeline to the selected device.
controlnet = ControlNetModel.from_pretrained(
    "lauraibnz/midi-audioldm",
    torch_dtype=torch_dtype,
)
pipe = AudioLDMControlNetPipeline.from_pretrained(
    "cvssp/audioldm-m-full",
    controlnet=controlnet,
    torch_dtype=torch_dtype,
).to(device)

def predict(midi_file=None, prompt="", negative_prompt="", audio_length_in_s=5, controlnet_conditioning_scale=1, num_inference_steps=20, guess_mode=False):
    """Generate audio from a text prompt, conditioned on a MIDI file.

    Args:
        midi_file: Uploaded file, given either as an object exposing a
            ``.name`` path attribute or as a plain path string. A falsy
            value falls back to the bundled ``"test.mid"``.
        prompt: Text prompt passed to the pipeline.
        negative_prompt: Negative text prompt passed to the pipeline.
        audio_length_in_s: Requested audio duration, forwarded unchanged.
        controlnet_conditioning_scale: Conditioning strength; converted to
            ``float`` before being forwarded.
        num_inference_steps: Number of denoising steps. UI sliders may hand
            over fractional or zero values, so this is rounded to a whole
            number and clamped to at least 1.
        guess_mode: Forwarded unchanged to the pipeline.

    Returns:
        A ``(16000, samples)`` tuple, where ``samples`` is the transpose of
        the pipeline output's ``audios`` attribute. 16000 is the sample rate
        reported to the audio output (presumably the model's native rate;
        confirm against the vocoder config).
    """
    if midi_file:
        # Accept both a tempfile-like wrapper (``.name``) and a bare path.
        midi_path = getattr(midi_file, "name", midi_file)
    else:
        midi_path = "test.mid"
    midi = PrettyMIDI(midi_path)

    # The pipeline needs a positive integer step count; a slider value such
    # as 20.3 or 0 would otherwise be passed through as-is.
    steps = max(1, int(round(num_inference_steps)))

    audio = pipe(
        prompt,
        negative_prompt=negative_prompt,
        midi=midi,
        audio_length_in_s=audio_length_in_s,
        num_inference_steps=steps,
        controlnet_conditioning_scale=float(controlnet_conditioning_scale),
        guess_mode=guess_mode,
    )
    return (16000, audio.audios.T)

# Build the web UI. Inputs are listed in the same order as predict's
# parameters, since the interface passes them positionally.
demo = gr.Interface(
    fn=predict,
    inputs=[
        gr.File(file_types=[".mid"]),
        "text",
        gr.Textbox(label="negative prompt"),
        # Start at 5 s so a zero-length clip cannot be requested.
        gr.Slider(5, 30, value=5, step=5, label="duration (seconds)"),
        gr.Slider(0.0, 1.0, value=1.0, step=0.1, label="conditioning scale"),
        # Whole numbers only, and at least one step: the value is used as
        # the pipeline's num_inference_steps.
        gr.Slider(1, 50, value=20, step=1, label="inference steps"),
        gr.Checkbox(label="guess mode"),
    ],
    outputs="audio",
)
demo.launch()