Spaces: Running on Zero
Update app.py
app.py CHANGED
@@ -34,52 +34,62 @@ class InferRunner:
 
 device = "cuda" if torch.cuda.is_available() else "cpu"
 runner = InferRunner(device)
-
+event_list = [
+    "burping_belching",           # 0
+    "car_horn_honking",           #
+    "cat_meowing",                #
+    "cow_mooing",                 #
+    "dog_barking",                #
+    "door_knocking",              #
+    "door_slamming",              #
+    "explosion",                  #
+    "gunshot",                    # 8
+    "sheep_goat_bleating",        #
+    "sneeze",                     #
+    "spraying",                   #
+    "thump_thud",                 #
+    "train_horn",                 #
+    "tapping_clicking_clanking",  #
+    "woman_laughing",             #
+    "duck_quacking",              # 16
+    "whistling",                  #
+]
 def infer(caption, num_steps=200, guidance_scale=3.0, audio_len=16000*10):
     with torch.no_grad():
         latents = runner.pico_model.demo_inference(caption, runner.scheduler, num_steps=num_steps, guidance_scale=guidance_scale, num_samples_per_prompt=1, disable_progress=True)
         mel = runner.vae.decode_first_stage(latents)
         wave = runner.vae.decode_to_waveform(mel)[0][:audio_len]
-    outpath = f"
+    outpath = f"output.wav"
     sf.write(outpath, wave, samplerate=16000, subtype='PCM_16')
     return outpath
-with gr.Blocks() as demo:
-    with gr.Row():
-        gr.Markdown("## PicoAudio")
 
-    with gr.Row():
-        with gr.Column():
-            prompt = gr.Textbox(label="Prompt: Input your caption formatted as 'event1 at onset1-offset1_onset2-offset2 and event2 at onset1-offset1'.",
-                                value="spraying at 0.38-1.176_3.06-3.856 and gunshot at 1.729-3.729_4.367-6.367_7.031-9.031.",)
-            run_button = gr.Button()
 
-
-
-
-
-
-
-
-
-
+gr.Markdown("## PicoAudio")
+gr.Markdown("18 events: " + ", ".join(event_list))
+prompt = gr.Textbox(label="Prompt: Input your caption formatted as 'event1 at onset1-offset1_onset2-offset2 and event2 at onset1-offset1'.",
+                    value="spraying at 0.38-1.176_3.06-3.856 and gunshot at 1.729-3.729_4.367-6.367_7.031-9.031.",)
+
+
+num_steps = gr.Slider(label="num_steps",
+                      minimum=1, maximum=300, value=200, step=1)
+guidance_scale = gr.Slider(label="guidance_scale Scale:(Large => more relevant to text but the quality may drop)",
+                           minimum=0.1, maximum=8.0, value=3.0, step=0.1)
+
+
+gr_interface = gr.Interface(
+    fn=infer,
+    inputs=[prompt, num_steps, guidance_scale],
+    outputs=[outaudio],
+    # title="
+    allow_flagging=False,
+    examples=[
+        ["spraying at 0.38-1.176_3.06-3.856 and gunshot at 1.729-3.729_4.367-6.367_7.031-9.031."],
+        ["dog_barking at 0.562-2.562_4.25-6.25."],
+        ["cow_mooing at 0.958-3.582_5.272-7.896."],
+    ],
+    cache_examples="lazy",  # Turn on to cache.
+)
 
-    run_button.click(fn=infer,
-                     inputs=[prompt, num_steps, guidance_scale],
-                     outputs=[outaudio])
-    # with gr.Row():
-    #     with gr.Column():
-    #         gr.Examples(
-    #             examples = [['An amateur recording features a steel drum playing in a higher register',25,5,55],
-    #                         ['An instrumental song with a caribbean feel, happy mood, and featuring steel pan music, programmed percussion, and bass',25,5,55],
-    #                         ['This musical piece features a playful and emotionally melodic male vocal accompanied by piano',25,5,55],
-    #                         ['A eerie yet calming experimental electronic track featuring haunting synthesizer strings and pads',25,5,55],
-    #                         ['A slow tempo pop instrumental piece featuring only acoustic guitar with fingerstyle and percussive strumming techniques',25,5,55]],
-    #             inputs = [prompt, ddim_steps, scale, seed],
-    #             outputs = [outaudio],
-    #         )
-    #         cache_examples="lazy", # Turn on to cache.
-    # with gr.Column():
-    #     pass
 
 demo.launch()
 
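
Note on the new UI wiring: the added lines build a gr.Interface assigned to gr_interface while the file still ends with demo.launch(), and the outaudio output component is not defined anywhere in this hunk. Below is a minimal, self-contained sketch of how the intended interface would run end to end; it is an assumption rather than the committed code, with infer() stubbed in place of the PicoAudio model call and outaudio assumed to be a gr.Audio output.

# Minimal sketch (assumption, not the committed code): wiring the updated gr.Interface.
import gradio as gr
import numpy as np
import soundfile as sf

def infer(caption, num_steps=200, guidance_scale=3.0, audio_len=16000 * 10):
    # Stand-in for the model call in the diff (pico_model.demo_inference -> VAE decode):
    # write 10 s of silence at 16 kHz and return the file path, mirroring the real infer().
    wave = np.zeros(audio_len, dtype=np.float32)
    outpath = "output.wav"
    sf.write(outpath, wave, samplerate=16000, subtype="PCM_16")
    return outpath

prompt = gr.Textbox(
    label="Caption: 'event1 at onset1-offset1_onset2-offset2 and event2 at onset1-offset1'.",
    value="spraying at 0.38-1.176_3.06-3.856 and gunshot at 1.729-3.729_4.367-6.367_7.031-9.031.",
)
num_steps = gr.Slider(label="num_steps", minimum=1, maximum=300, value=200, step=1)
guidance_scale = gr.Slider(label="guidance_scale", minimum=0.1, maximum=8.0, value=3.0, step=0.1)
outaudio = gr.Audio(label="generated audio", type="filepath")  # assumed output component

demo = gr.Interface(
    fn=infer,
    inputs=[prompt, num_steps, guidance_scale],
    outputs=[outaudio],
    # Each example row fills all three inputs (caption, num_steps, guidance_scale).
    examples=[
        ["spraying at 0.38-1.176_3.06-3.856 and gunshot at 1.729-3.729_4.367-6.367_7.031-9.031.", 200, 3.0],
        ["dog_barking at 0.562-2.562_4.25-6.25.", 200, 3.0],
        ["cow_mooing at 0.958-3.582_5.272-7.896.", 200, 3.0],
    ],
)

if __name__ == "__main__":
    demo.launch()

Captions follow the format described by the textbox label: each event name is followed by "at" and an underscore-separated list of onset-offset pairs in seconds, with multiple events joined by "and".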