ZeyuXie committed
Commit 9a7b1fe · verified · 1 parent: 5f61b1b

Update app.py

Files changed (1):
  1. app.py +46 -36
app.py CHANGED
@@ -34,52 +34,62 @@ class InferRunner:
 
 device = "cuda" if torch.cuda.is_available() else "cpu"
 runner = InferRunner(device)
-
 def infer(caption, num_steps=200, guidance_scale=3.0, audio_len=16000*10):
     with torch.no_grad():
         latents = runner.pico_model.demo_inference(caption, runner.scheduler, num_steps=num_steps, guidance_scale=guidance_scale, num_samples_per_prompt=1, disable_progress=True)
         mel = runner.vae.decode_first_stage(latents)
         wave = runner.vae.decode_to_waveform(mel)[0][:audio_len]
-    outpath = f"synthesized/output.wav"
     sf.write(outpath, wave, samplerate=16000, subtype='PCM_16')
     return outpath
-with gr.Blocks() as demo:
-    with gr.Row():
-        gr.Markdown("## PicoAudio")
 
-    with gr.Row():
-        with gr.Column():
-            prompt = gr.Textbox(label="Prompt: Input your caption formatted as 'event1 at onset1-offset1_onset2-offset2 and event2 at onset1-offset1'.",
-                                value="spraying at 0.38-1.176_3.06-3.856 and gunshot at 1.729-3.729_4.367-6.367_7.031-9.031.",)
-            run_button = gr.Button()
 
-            with gr.Accordion("Advanced options", open=False):
-                num_steps = gr.Slider(label="num_steps", minimum=1,
-                                      maximum=300, value=200, step=1)
-                guidance_scale = gr.Slider(
-                    label="guidance_scale Scale:(Large => more relevant to text but the quality may drop)", minimum=0.1, maximum=8.0, value=3.0, step=0.1
-                )
-
-        with gr.Column():
-            outaudio = gr.Audio()
 
-    run_button.click(fn=infer,
-                     inputs=[prompt, num_steps, guidance_scale],
-                     outputs=[outaudio])
-    # with gr.Row():
-    #     with gr.Column():
-    #         gr.Examples(
-    #             examples = [['An amateur recording features a steel drum playing in a higher register',25,5,55],
-    #                         ['An instrumental song with a caribbean feel, happy mood, and featuring steel pan music, programmed percussion, and bass',25,5,55],
-    #                         ['This musical piece features a playful and emotionally melodic male vocal accompanied by piano',25,5,55],
-    #                         ['A eerie yet calming experimental electronic track featuring haunting synthesizer strings and pads',25,5,55],
-    #                         ['A slow tempo pop instrumental piece featuring only acoustic guitar with fingerstyle and percussive strumming techniques',25,5,55]],
-    #             inputs = [prompt, ddim_steps, scale, seed],
-    #             outputs = [outaudio],
-    #             )
-    #             cache_examples="lazy", # Turn on to cache.
-    #     with gr.Column():
-    #         pass
 
 demo.launch()
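
The Textbox label above spells out the caption grammar: each event name is followed by "at" and an underscore-separated list of onset-offset pairs in seconds, and multiple events are joined with "and". A minimal helper along these lines could assemble such captions; the make_caption name and its dict input are illustrative assumptions, not part of app.py:

def make_caption(events):
    # Build a PicoAudio-style caption from {event: [(onset, offset), ...]}.
    parts = []
    for name, spans in events.items():
        # "onset-offset" pairs joined by "_", e.g. "0.38-1.176_3.06-3.856"
        timing = "_".join(f"{onset}-{offset}" for onset, offset in spans)
        parts.append(f"{name} at {timing}")
    return " and ".join(parts) + "."

caption = make_caption({
    "spraying": [(0.38, 1.176), (3.06, 3.856)],
    "gunshot": [(1.729, 3.729), (4.367, 6.367), (7.031, 9.031)],
})
# -> "spraying at 0.38-1.176_3.06-3.856 and gunshot at 1.729-3.729_4.367-6.367_7.031-9.031."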
 
 device = "cuda" if torch.cuda.is_available() else "cpu"
 runner = InferRunner(device)
+event_list = [
+    "burping_belching", # 0
+    "car_horn_honking", #
+    "cat_meowing", #
+    "cow_mooing", #
+    "dog_barking", #
+    "door_knocking", #
+    "door_slamming", #
+    "explosion", #
+    "gunshot", # 8
+    "sheep_goat_bleating", #
+    "sneeze", #
+    "spraying", #
+    "thump_thud", #
+    "train_horn", #
+    "tapping_clicking_clanking", #
+    "woman_laughing", #
+    "duck_quacking", # 16
+    "whistling", #
+]
 def infer(caption, num_steps=200, guidance_scale=3.0, audio_len=16000*10):
     with torch.no_grad():
         latents = runner.pico_model.demo_inference(caption, runner.scheduler, num_steps=num_steps, guidance_scale=guidance_scale, num_samples_per_prompt=1, disable_progress=True)
         mel = runner.vae.decode_first_stage(latents)
         wave = runner.vae.decode_to_waveform(mel)[0][:audio_len]
+    outpath = f"output.wav"
     sf.write(outpath, wave, samplerate=16000, subtype='PCM_16')
     return outpath
 
 
+gr.Markdown("## PicoAudio")
+gr.Markdown("18 events: " + ", ".join(event_list))
+prompt = gr.Textbox(label="Prompt: Input your caption formatted as 'event1 at onset1-offset1_onset2-offset2 and event2 at onset1-offset1'.",
+                    value="spraying at 0.38-1.176_3.06-3.856 and gunshot at 1.729-3.729_4.367-6.367_7.031-9.031.",)
+
+
+num_steps = gr.Slider(label="num_steps",
+                      minimum=1, maximum=300, value=200, step=1)
+guidance_scale = gr.Slider(label="guidance_scale Scale:(Large => more relevant to text but the quality may drop)",
+                           minimum=0.1, maximum=8.0, value=3.0, step=0.1)
+
+
+gr_interface = gr.Interface(
+    fn=infer,
+    inputs=[prompt, num_steps, guidance_scale],
+    outputs=[outaudio],
+    # title="
+    allow_flagging=False,
+    examples=[
+        ["spraying at 0.38-1.176_3.06-3.856 and gunshot at 1.729-3.729_4.367-6.367_7.031-9.031."],
+        ["dog_barking at 0.562-2.562_4.25-6.25."],
+        ["cow_mooing at 0.958-3.582_5.272-7.896."],
+    ],
+    cache_examples="lazy", # Turn on to cache.
+)
 
 
 demo.launch()
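
The updated layout builds a gr.Interface from module-level components, while within this hunk outaudio is defined only in the removed gr.Blocks code and the script still calls demo.launch(). Below is a minimal, self-contained sketch of how the same wiring could run on its own; the dummy_infer stub, the explicit outaudio component, the title= argument, the padded example rows, and the gr_interface.launch() call are assumptions made for the sketch, not part of the commit:

import numpy as np
import soundfile as sf
import gradio as gr

def dummy_infer(caption, num_steps=200, guidance_scale=3.0):
    # Stand-in for infer(): writes one second of silence instead of running the model.
    outpath = "output.wav"
    sf.write(outpath, np.zeros(16000, dtype="float32"), samplerate=16000, subtype="PCM_16")
    return outpath

prompt = gr.Textbox(
    label="Prompt: 'event1 at onset1-offset1_onset2-offset2 and event2 at onset1-offset1'.",
    value="spraying at 0.38-1.176_3.06-3.856 and gunshot at 1.729-3.729_4.367-6.367_7.031-9.031.",
)
num_steps = gr.Slider(label="num_steps", minimum=1, maximum=300, value=200, step=1)
guidance_scale = gr.Slider(label="guidance_scale", minimum=0.1, maximum=8.0, value=3.0, step=0.1)
outaudio = gr.Audio(label="Synthesized audio")  # output component referenced by the commit

gr_interface = gr.Interface(
    fn=dummy_infer,
    inputs=[prompt, num_steps, guidance_scale],
    outputs=[outaudio],
    title="PicoAudio",       # the commit renders the heading via gr.Markdown instead
    allow_flagging="never",  # string form; the commit passes allow_flagging=False
    examples=[               # each row supplies values for all three inputs
        ["spraying at 0.38-1.176_3.06-3.856 and gunshot at 1.729-3.729_4.367-6.367_7.031-9.031.", 200, 3.0],
        ["dog_barking at 0.562-2.562_4.25-6.25.", 200, 3.0],
        ["cow_mooing at 0.958-3.582_5.272-7.896.", 200, 3.0],
    ],
    cache_examples=False,    # the commit uses cache_examples="lazy"
)

if __name__ == "__main__":
    gr_interface.launch()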