Spaces: Running on Zero
Update app.py
app.py CHANGED
@@ -34,52 +34,62 @@ class InferRunner:
 
 device = "cuda" if torch.cuda.is_available() else "cpu"
 runner = InferRunner(device)
-
+event_list = [
+    "burping_belching",           # 0
+    "car_horn_honking",           #
+    "cat_meowing",                #
+    "cow_mooing",                 #
+    "dog_barking",                #
+    "door_knocking",              #
+    "door_slamming",              #
+    "explosion",                  #
+    "gunshot",                    # 8
+    "sheep_goat_bleating",        #
+    "sneeze",                     #
+    "spraying",                   #
+    "thump_thud",                 #
+    "train_horn",                 #
+    "tapping_clicking_clanking",  #
+    "woman_laughing",             #
+    "duck_quacking",              # 16
+    "whistling",                  #
+]
 def infer(caption, num_steps=200, guidance_scale=3.0, audio_len=16000*10):
     with torch.no_grad():
         latents = runner.pico_model.demo_inference(caption, runner.scheduler, num_steps=num_steps, guidance_scale=guidance_scale, num_samples_per_prompt=1, disable_progress=True)
         mel = runner.vae.decode_first_stage(latents)
         wave = runner.vae.decode_to_waveform(mel)[0][:audio_len]
-    outpath = f"
+    outpath = f"output.wav"
     sf.write(outpath, wave, samplerate=16000, subtype='PCM_16')
     return outpath
-with gr.Blocks() as demo:
-    with gr.Row():
-        gr.Markdown("## PicoAudio")
 
-    with gr.Row():
-        with gr.Column():
-            prompt = gr.Textbox(label="Prompt: Input your caption formatted as 'event1 at onset1-offset1_onset2-offset2 and event2 at onset1-offset1'.",
-                                value="spraying at 0.38-1.176_3.06-3.856 and gunshot at 1.729-3.729_4.367-6.367_7.031-9.031.",)
-            run_button = gr.Button()
 
-
-
-
-
-
-
-
-
-
+gr.Markdown("## PicoAudio")
+gr.Markdown("18 events: " + ", ".join(event_list))
+prompt = gr.Textbox(label="Prompt: Input your caption formatted as 'event1 at onset1-offset1_onset2-offset2 and event2 at onset1-offset1'.",
+                    value="spraying at 0.38-1.176_3.06-3.856 and gunshot at 1.729-3.729_4.367-6.367_7.031-9.031.",)
+
+
+num_steps = gr.Slider(label="num_steps",
+                      minimum=1, maximum=300, value=200, step=1)
+guidance_scale = gr.Slider(label="guidance_scale Scale:(Large => more relevant to text but the quality may drop)",
+                           minimum=0.1, maximum=8.0, value=3.0, step=0.1)
+
+
+gr_interface = gr.Interface(
+    fn=infer,
+    inputs=[prompt, num_steps, guidance_scale],
+    outputs=[outaudio],
+    # title="
+    allow_flagging=False,
+    examples=[
+        ["spraying at 0.38-1.176_3.06-3.856 and gunshot at 1.729-3.729_4.367-6.367_7.031-9.031."],
+        ["dog_barking at 0.562-2.562_4.25-6.25."],
+        ["cow_mooing at 0.958-3.582_5.272-7.896."],
+    ],
+    cache_examples="lazy",  # Turn on to cache.
+)
 
-    run_button.click(fn=infer,
-                     inputs=[prompt, num_steps, guidance_scale],
-                     outputs=[outaudio])
-    # with gr.Row():
-    #     with gr.Column():
-    #         gr.Examples(
-    #             examples = [['An amateur recording features a steel drum playing in a higher register',25,5,55],
-    #                         ['An instrumental song with a caribbean feel, happy mood, and featuring steel pan music, programmed percussion, and bass',25,5,55],
-    #                         ['This musical piece features a playful and emotionally melodic male vocal accompanied by piano',25,5,55],
-    #                         ['A eerie yet calming experimental electronic track featuring haunting synthesizer strings and pads',25,5,55],
-    #                         ['A slow tempo pop instrumental piece featuring only acoustic guitar with fingerstyle and percussive strumming techniques',25,5,55]],
-    #             inputs = [prompt, ddim_steps, scale, seed],
-    #             outputs = [outaudio],
-    #         )
-    #         cache_examples="lazy", # Turn on to cache.
-    # with gr.Column():
-    #     pass
 
 demo.launch()
 
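
Note on the new UI wiring: the added lines build a gr.Interface assigned to gr_interface while the file still ends with demo.launch(), and the outaudio output component is not defined anywhere in this hunk. Below is a minimal, self-contained sketch of how the intended interface would run end to end; it is an assumption rather than the committed code, with infer() stubbed in place of the PicoAudio model call and outaudio assumed to be a gr.Audio output.

# Minimal sketch (assumption, not the committed code): wiring the updated gr.Interface.
import gradio as gr
import numpy as np
import soundfile as sf

def infer(caption, num_steps=200, guidance_scale=3.0, audio_len=16000 * 10):
    # Stand-in for the model call in the diff (pico_model.demo_inference -> VAE decode):
    # write 10 s of silence at 16 kHz and return the file path, mirroring the real infer().
    wave = np.zeros(audio_len, dtype=np.float32)
    outpath = "output.wav"
    sf.write(outpath, wave, samplerate=16000, subtype="PCM_16")
    return outpath

prompt = gr.Textbox(
    label="Caption: 'event1 at onset1-offset1_onset2-offset2 and event2 at onset1-offset1'.",
    value="spraying at 0.38-1.176_3.06-3.856 and gunshot at 1.729-3.729_4.367-6.367_7.031-9.031.",
)
num_steps = gr.Slider(label="num_steps", minimum=1, maximum=300, value=200, step=1)
guidance_scale = gr.Slider(label="guidance_scale", minimum=0.1, maximum=8.0, value=3.0, step=0.1)
outaudio = gr.Audio(label="generated audio", type="filepath")  # assumed output component

demo = gr.Interface(
    fn=infer,
    inputs=[prompt, num_steps, guidance_scale],
    outputs=[outaudio],
    # Each example row fills all three inputs (caption, num_steps, guidance_scale).
    examples=[
        ["spraying at 0.38-1.176_3.06-3.856 and gunshot at 1.729-3.729_4.367-6.367_7.031-9.031.", 200, 3.0],
        ["dog_barking at 0.562-2.562_4.25-6.25.", 200, 3.0],
        ["cow_mooing at 0.958-3.582_5.272-7.896.", 200, 3.0],
    ],
)

if __name__ == "__main__":
    demo.launch()

Captions follow the format described by the textbox label: each event name is followed by "at" and an underscore-separated list of onset-offset pairs in seconds, with multiple events joined by "and".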