deepanway committed on
Commit
37952eb
·
1 Parent(s): fa2eee3

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +15 -14
app.py CHANGED
@@ -66,32 +66,32 @@ class Tango:
66
  else:
67
  return list(self.chunks(outputs, samples))
68
 
69
- # Initialize Tango model
70
- tango = Tango()
 
 
 
71
 
72
- def gradio_generate(prompt):
73
-
74
- output_wave = tango.generate(prompt)
75
-
76
- # Save the output_wave as a temporary WAV file
77
  output_filename = "temp_output.wav"
78
  wavio.write(output_filename, output_wave, rate=16000, sampwidth=2)
79
 
80
  return output_filename
81
 
82
- # Add the description text box
83
  description_text = '''
84
  TANGO is a latent diffusion model (LDM) for text-to-audio (TTA) generation. TANGO can generate realistic audios including human sounds, animal sounds, natural and artificial sounds and sound effects from textual prompts. We use the frozen instruction-tuned LLM Flan-T5 as the text encoder and train a UNet based diffusion model for audio generation. We perform comparably to current state-of-the-art models for TTA across both objective and subjective metrics, despite training the LDM on a 63 times smaller dataset. We release our model, training, inference code, and pre-trained checkpoints for the research community.
85
  '''
86
 
87
- # Define Gradio input and output components
88
  input_text = gr.inputs.Textbox(lines=2, label="Prompt")
89
  output_audio = gr.outputs.Audio(label="Generated Audio", type="filepath")
 
90
 
91
- # Create Gradio interface
92
  gr_interface = gr.Interface(
93
  fn=gradio_generate,
94
- inputs=input_text,
95
  outputs=[output_audio],
96
  title="TANGO: Text to Audio using Instruction-Guided Diffusion",
97
  description="Generate audio using TANGO by providing a text prompt.",
@@ -99,16 +99,17 @@ gr_interface = gr.Interface(
99
  examples=[
100
  ["An audience cheering and clapping"],
101
  ["Rolling thunder with lightning strikes"],
 
102
  ["A car engine revving"],
103
  ["A dog barking"],
104
  ["A cat meowing"],
105
  ["Emergency sirens wailing"],
106
  ["Whistling with birds chirping"],
107
- ["A dog barking and a man talking and a racing car passes by"],
108
  ["Motor vehicles are driving with loud engines and a person whistles"],
109
- ["People cheering in a stadium while rolling thunder and lightning strikes"],
110
  ["A helicopter is in flight"],
111
- ["A person snoring"]
112
  ],
113
  cache_examples=False,
114
  )
 
66
  else:
67
  return list(self.chunks(outputs, samples))
68
 
69
+ # Initialize TANGO
70
+ if torch.cuda.is_available():
71
+ tango = Tango()
72
+ else:
73
+ tango = Tango(device="cpu")
74
 
75
+ def gradio_generate(prompt, steps):
76
+ output_wave = tango.generate(prompt, int(steps))
 
 
 
77
  output_filename = "temp_output.wav"
78
  wavio.write(output_filename, output_wave, rate=16000, sampwidth=2)
79
 
80
  return output_filename
81
 
 
82
  description_text = '''
83
  TANGO is a latent diffusion model (LDM) for text-to-audio (TTA) generation. TANGO can generate realistic audios including human sounds, animal sounds, natural and artificial sounds and sound effects from textual prompts. We use the frozen instruction-tuned LLM Flan-T5 as the text encoder and train a UNet based diffusion model for audio generation. We perform comparably to current state-of-the-art models for TTA across both objective and subjective metrics, despite training the LDM on a 63 times smaller dataset. We release our model, training, inference code, and pre-trained checkpoints for the research community.
84
  '''
85
 
86
+ # Gradio input and output components
87
  input_text = gr.inputs.Textbox(lines=2, label="Prompt")
88
  output_audio = gr.outputs.Audio(label="Generated Audio", type="filepath")
89
+ denoising_steps = gr.Number(value=100, label="Steps", interactive=True, precision=0)
90
 
91
+ # Gradio interface
92
  gr_interface = gr.Interface(
93
  fn=gradio_generate,
94
+ inputs=[input_text, denoising_steps],
95
  outputs=[output_audio],
96
  title="TANGO: Text to Audio using Instruction-Guided Diffusion",
97
  description="Generate audio using TANGO by providing a text prompt.",
 
99
  examples=[
100
  ["An audience cheering and clapping"],
101
  ["Rolling thunder with lightning strikes"],
102
+ ["Gentle water stream, birds chirping and sudden gun shot"],
103
  ["A car engine revving"],
104
  ["A dog barking"],
105
  ["A cat meowing"],
106
  ["Emergency sirens wailing"],
107
  ["Whistling with birds chirping"],
108
+ ["A person snoring"],
109
  ["Motor vehicles are driving with loud engines and a person whistles"],
110
+ ["People cheering in a stadium while thunder and lightning strikes"],
111
  ["A helicopter is in flight"],
112
+ ["A dog barking and a man talking and a racing car passes by"],
113
  ],
114
  cache_examples=False,
115
  )