Spaces:
Running
on
Zero
Running
on
Zero
Update app.py
Browse files
app.py
CHANGED
@@ -66,32 +66,32 @@ class Tango:
|
|
66 |
else:
|
67 |
return list(self.chunks(outputs, samples))
|
68 |
|
69 |
-
# Initialize
|
70 |
-
|
|
|
|
|
|
|
71 |
|
72 |
-
def gradio_generate(prompt):
|
73 |
-
|
74 |
-
output_wave = tango.generate(prompt)
|
75 |
-
|
76 |
-
# Save the output_wave as a temporary WAV file
|
77 |
output_filename = "temp_output.wav"
|
78 |
wavio.write(output_filename, output_wave, rate=16000, sampwidth=2)
|
79 |
|
80 |
return output_filename
|
81 |
|
82 |
-
# Add the description text box
|
83 |
description_text = '''
|
84 |
TANGO is a latent diffusion model (LDM) for text-to-audio (TTA) generation. TANGO can generate realistic audios including human sounds, animal sounds, natural and artificial sounds and sound effects from textual prompts. We use the frozen instruction-tuned LLM Flan-T5 as the text encoder and train a UNet based diffusion model for audio generation. We perform comparably to current state-of-the-art models for TTA across both objective and subjective metrics, despite training the LDM on a 63 times smaller dataset. We release our model, training, inference code, and pre-trained checkpoints for the research community.
|
85 |
'''
|
86 |
|
87 |
-
#
|
88 |
input_text = gr.inputs.Textbox(lines=2, label="Prompt")
|
89 |
output_audio = gr.outputs.Audio(label="Generated Audio", type="filepath")
|
|
|
90 |
|
91 |
-
#
|
92 |
gr_interface = gr.Interface(
|
93 |
fn=gradio_generate,
|
94 |
-
inputs=input_text,
|
95 |
outputs=[output_audio],
|
96 |
title="TANGO: Text to Audio using Instruction-Guided Diffusion",
|
97 |
description="Generate audio using TANGO by providing a text prompt.",
|
@@ -99,16 +99,17 @@ gr_interface = gr.Interface(
|
|
99 |
examples=[
|
100 |
["An audience cheering and clapping"],
|
101 |
["Rolling thunder with lightning strikes"],
|
|
|
102 |
["A car engine revving"],
|
103 |
["A dog barking"],
|
104 |
["A cat meowing"],
|
105 |
["Emergency sirens wailing"],
|
106 |
["Whistling with birds chirping"],
|
107 |
-
["A
|
108 |
["Motor vehicles are driving with loud engines and a person whistles"],
|
109 |
-
["People cheering in a stadium while
|
110 |
["A helicopter is in flight"],
|
111 |
-
["A
|
112 |
],
|
113 |
cache_examples=False,
|
114 |
)
|
|
|
66 |
else:
|
67 |
return list(self.chunks(outputs, samples))
|
68 |
|
69 |
+
# Initialize TANGO: build it with its default arguments when CUDA is
# available, otherwise explicitly request the CPU device.
tango = Tango() if torch.cuda.is_available() else Tango(device="cpu")
|
74 |
|
75 |
+
def gradio_generate(prompt, steps):
    """Generate audio for a text prompt and return the path of the WAV file.

    Args:
        prompt: Text prompt passed to ``tango.generate``.
        steps: Step count passed to ``tango.generate`` after ``int()``
            coercion.

    Returns:
        Path of the WAV file (16000 Hz sample rate, 2-byte samples) that
        holds the generated waveform.
    """
    import tempfile

    output_wave = tango.generate(prompt, int(steps))

    # A single fixed file name would let overlapping requests overwrite each
    # other's audio, so every call writes to its own temporary file.
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
        output_filename = tmp.name
    wavio.write(output_filename, output_wave, rate=16000, sampwidth=2)

    return output_filename
|
81 |
|
|
|
82 |
description_text = '''
|
83 |
TANGO is a latent diffusion model (LDM) for text-to-audio (TTA) generation. TANGO can generate realistic audios including human sounds, animal sounds, natural and artificial sounds and sound effects from textual prompts. We use the frozen instruction-tuned LLM Flan-T5 as the text encoder and train a UNet based diffusion model for audio generation. We perform comparably to current state-of-the-art models for TTA across both objective and subjective metrics, despite training the LDM on a 63 times smaller dataset. We release our model, training, inference code, and pre-trained checkpoints for the research community.
|
84 |
'''
|
85 |
|
86 |
+
# Gradio input and output components.
# Use the top-level component classes throughout: the gr.inputs / gr.outputs
# namespaces are deprecated, and gr.Number below is already top-level.
input_text = gr.Textbox(lines=2, label="Prompt")
output_audio = gr.Audio(label="Generated Audio", type="filepath")
denoising_steps = gr.Number(value=100, label="Steps", interactive=True, precision=0)
|
90 |
|
91 |
+
# Gradio interface
|
92 |
gr_interface = gr.Interface(
|
93 |
fn=gradio_generate,
|
94 |
+
inputs=[input_text, denoising_steps],
|
95 |
outputs=[output_audio],
|
96 |
title="TANGO: Text to Audio using Instruction-Guided Diffusion",
|
97 |
description="Generate audio using TANGO by providing a text prompt.",
|
|
|
99 |
examples=[
    # Sample prompts; each inner list is one example row.
    ["An audience cheering and clapping"],
    ["Rolling thunder with lightning strikes"],
    # The trailing comma on the next row is required: without it Python
    # subscripts this list with the following one and raises TypeError.
    ["Gentle water stream, birds chirping and sudden gun shot"],
    ["A car engine revving"],
    ["A dog barking"],
    ["A cat meowing"],
    ["Emergency sirens wailing"],
    ["Whistling with birds chirping"],
    ["A person snoring"],
    ["Motor vehicles are driving with loud engines and a person whistles"],
    ["People cheering in a stadium while thunder and lightning strikes"],
    ["A helicopter is in flight"],
    ["A dog barking and a man talking and a racing car passes by"],
],
|
114 |
cache_examples=False,
|
115 |
)
|