Bils committed
Commit 18fbeec · verified · 1 Parent(s): f4d6ba6

Update app.py

Files changed (1):
  1. app.py +104 -181
app.py CHANGED
@@ -7,216 +7,139 @@ import torch
 from scipy.io.wavfile import write
 from diffusers import DiffusionPipeline
 from transformers import pipeline
-from pydub import AudioSegment
-import numpy as np

-# Load environment variables
 load_dotenv()
 hf_token = os.getenv("HF_TKN")

-# Device configuration
-device = "cuda" if torch.cuda.is_available() else "cpu"
-torch_dtype = torch.float16 if device == "cuda" else torch.float32

-# Initialize models with automatic device detection
-@spaces.GPU(duration=120)
-def load_models():
-    global captioning_pipeline, pipe
-    captioning_pipeline = pipeline(
-        "image-to-text",
-        model="nlpconnect/vit-gpt2-image-captioning",
-        device=0 if torch.cuda.is_available() else -1
-    )
-    pipe = DiffusionPipeline.from_pretrained(
-        "cvssp/audioldm2",
-        use_auth_token=hf_token,
-        torch_dtype=torch_dtype
-    ).to(device)

-load_models()

-@spaces.GPU(duration=60)
-def analyze_image(image_file):
-    """Generate caption from image with error handling"""
     try:
-        results = captioning_pipeline(image_file)
-        if results and isinstance(results, list):
-            return results[0].get("generated_text", "").strip()
-        return "Could not generate caption"
     except Exception as e:
-        return f"Error: {str(e)}"

 @spaces.GPU(duration=120)
-def generate_audio(prompt):
-    """Generate audio from text prompt"""
     try:
-        return pipe(
-            prompt=prompt,
             num_inference_steps=50,
             guidance_scale=7.5
-        ).audios[0]
-    except Exception as e:
-        print(f"Audio generation error: {str(e)}")
-        return None

-def blend_audios(audio_list):
-    """Mix multiple audio arrays into one"""
-    try:
-        valid_audios = [arr for arr in audio_list if arr is not None]
-        if not valid_audios:
-            return None
-
-        max_length = max(arr.shape[0] for arr in valid_audios)
-        mixed = np.zeros(max_length)
-
-        for arr in valid_audios:
-            if arr.shape[0] < max_length:
-                padded = np.pad(arr, (0, max_length - arr.shape[0]))
-            else:
-                padded = arr[:max_length]
-            mixed += padded
-
-        mixed = mixed / np.max(np.abs(mixed))
-        _, tmp_path = tempfile.mkstemp(suffix=".wav")
-        write(tmp_path, 16000, mixed)
-        return tmp_path
     except Exception as e:
-        print(f"Blending error: {str(e)}")
         return None

 css = """
-#col-container { max-width: 800px; margin: 0 auto; }
-.toggle-row { margin: 1rem 0; }
-.prompt-box { margin-bottom: 0.5rem; }
-.danger { color: #ff4444; font-weight: bold; }
 """
 with gr.Blocks(css=css) as demo:
     with gr.Column(elem_id="col-container"):
-        # Header Section
         gr.HTML("""
-            <h1 style="text-align: center;">🎶 Generate Sound Effects from Image or Text</h1>
-            <p style="text-align: center;">
-                ⚡ Powered by <a href="https://bilsimaging.com" target="_blank">Bilsimaging</a>
-            </p>
         """)

-        # Input Mode Toggle
-        input_mode = gr.Radio(
-            choices=["Image Input", "Text Input"],
-            value="Image Input",
-            label="Select Input Mode",
-            elem_classes="toggle-row"
-        )
-
-        # Image Input Section
-        with gr.Column(visible=True) as image_col:
-            image_upload = gr.Image(type="filepath", label="Upload Image")
-            generate_desc_btn = gr.Button("Generate Description from Image", variant="primary")
-            caption_display = gr.Textbox(label="Generated Description", interactive=False)
-
-        # Text Input Section
-        with gr.Column(visible=False) as text_col:
-            with gr.Row():
-                prompt1 = gr.Textbox(label="Sound Prompt 1", lines=2, placeholder="Enter sound description...")
-                prompt2 = gr.Textbox(label="Sound Prompt 2", lines=2, placeholder="Enter sound description...")
-            additional_prompts = gr.Column()
-            add_prompt_btn = gr.Button("➕ Add Another Prompt", variant="secondary")
-            gr.Markdown("<div class='danger'>Max 5 prompts for stability</div>")
-
-        # Generation Controls
-        generate_sound_btn = gr.Button("Generate Sound Effect", variant="primary")
-        audio_output = gr.Audio(label="Generated Sound Effect", interactive=False)
-
-        # Documentation Section
-        gr.Markdown("""
-        ## 👥 How You Can Contribute
-        We welcome contributions! Contact us at [contact@bilsimaging.com](mailto:contact@bilsimaging.com).
-        Support us on [Ko-fi](https://ko-fi.com/bilsimaging) - Bilel Aroua
-        """)
-
-        # Visitor Badge
-        gr.HTML("""
-        <div style="text-align: center;">
-            <a href="https://visitorbadge.io/status?path=https://huggingface.co/spaces/Bils/Generate-Sound-Effects-from-Image">
-                <img src="https://api.visitorbadge.io/api/visitors?path=https://huggingface.co/spaces/Bils/Generate-Sound-Effects-from-Image&countColor=%23263759"/>
-            </a>
-        </div>
-        """)

-        # Input Mode Toggle Handler
-        input_mode.change(
-            lambda mode: (gr.update(visible=mode == "Image Input"), gr.update(visible=mode == "Text Input")),
-            inputs=input_mode,
-            outputs=[image_col, text_col],
-            concurrency_limit=1
-        )

-        # Image Description Generation
-        generate_desc_btn.click(
-            analyze_image,
             inputs=image_upload,
-            outputs=caption_display,
-            concurrency_limit=2
-        )
-
-        # Dynamic Prompt Addition
-        def add_prompt(current_count):
-            if current_count >= 5:
-                return current_count, gr.update()
-            new_count = current_count + 1
-            new_prompt = gr.Textbox(
-                label=f"Sound Prompt {new_count}",
-                lines=2,
-                visible=True,
-                placeholder="Enter sound description..."
-            )
-            return new_count, new_prompt
-
-        prompt_count = gr.State(2)
-        add_prompt_btn.click(
-            add_prompt,
-            inputs=prompt_count,
-            outputs=[prompt_count, additional_prompts],
-            concurrency_limit=1
         )

-        # Sound Generation Handler
-        def process_inputs(mode, image_file, caption, *prompts):
-            try:
-                if mode == "Image Input":
-                    if not image_file:
-                        raise gr.Error("Please upload an image")
-                    caption = analyze_image(image_file)
-                    prompts = [caption]
-                else:
-                    prompts = [p.strip() for p in prompts if p.strip()]
-                    if not prompts:
-                        raise gr.Error("Please enter at least one valid prompt")
-
-                # Generate individual audio tracks
-                audio_tracks = []
-                for prompt in prompts:
-                    if not prompt:
-                        continue
-                    audio = generate_audio(prompt)
-                    if audio is not None:
-                        audio_tracks.append(audio)
-
-                # Blend audio tracks
-                if not audio_tracks:
-                    return None
-                return blend_audios(audio_tracks)
-
-            except Exception as e:
-                raise gr.Error(f"Processing error: {str(e)}")
-
-        generate_sound_btn.click(
-            process_inputs,
-            inputs=[input_mode, image_upload, caption_display, prompt1, prompt2],
-            outputs=audio_output,
-            concurrency_limit=2
         )

-if __name__ == "__main__":
-    demo.launch(max_threads=4)
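
For reference, the blend_audios() helper removed above mixed an arbitrary number of generated tracks by zero-padding each array to the longest one, summing them, and peak-normalizing the result. A minimal standalone sketch of that pad-and-sum technique (mix_tracks and the tone variables are illustrative names, not part of the app):

import numpy as np

def mix_tracks(tracks):
    # Pad every track with trailing zeros to the longest length, sum them,
    # then peak-normalize so the mix stays within [-1.0, 1.0].
    tracks = [t for t in tracks if t is not None]
    if not tracks:
        return None
    length = max(t.shape[0] for t in tracks)
    mixed = np.zeros(length)
    for t in tracks:
        mixed[:t.shape[0]] += t  # shorter tracks are implicitly zero-padded
    return mixed / np.max(np.abs(mixed))

# Example: mixing a 1 s tone and a 0.5 s tone at 16 kHz yields one 1 s array.
sr = 16000
tone_a = np.sin(2 * np.pi * 440 * np.arange(sr) / sr)
tone_b = 0.5 * np.sin(2 * np.pi * 220 * np.arange(sr // 2) / sr)
print(mix_tracks([tone_a, tone_b]).shape)  # (16000,)

The rewritten file below (the `+` side of the diff) drops this mixing path entirely in favor of a single caption-to-audio pass.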
 
 from scipy.io.wavfile import write
 from diffusers import DiffusionPipeline
 from transformers import pipeline
+from pathlib import Path

 load_dotenv()
 hf_token = os.getenv("HF_TKN")

+device_id = 0 if torch.cuda.is_available() else -1

+captioning_pipeline = pipeline(
+    "image-to-text",
+    model="nlpconnect/vit-gpt2-image-captioning",
+    device=device_id
+)

+pipe = DiffusionPipeline.from_pretrained(
+    "cvssp/audioldm2",
+    use_auth_token=hf_token
+)

+@spaces.GPU(duration=120)
+def analyze_image_with_free_model(image_file):
     try:
+        with tempfile.NamedTemporaryFile(suffix=".jpg", delete=False) as temp_file:
+            temp_file.write(image_file)
+            temp_image_path = temp_file.name
+
+        results = captioning_pipeline(temp_image_path)
+        if not results or not isinstance(results, list):
+            return "Error: Could not generate caption.", True
+
+        caption = results[0].get("generated_text", "").strip()
+        if not caption:
+            return "No caption was generated.", True
+        return caption, False
+
     except Exception as e:
+        return f"Error analyzing image: {e}", True

 @spaces.GPU(duration=120)
+def get_audioldm_from_caption(caption):
     try:
+        pipe.to("cuda")
+        audio_output = pipe(
+            prompt=caption,
             num_inference_steps=50,
             guidance_scale=7.5
+        )
+        pipe.to("cpu")
+        audio = audio_output.audios[0]
+
+        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_wav:
+            write(temp_wav.name, 16000, audio)
+            return temp_wav.name

     except Exception as e:
+        print(f"Error generating audio from caption: {e}")
         return None

 css = """
+#col-container{
+    margin: 0 auto;
+    max-width: 800px;
+}
 """

 with gr.Blocks(css=css) as demo:
     with gr.Column(elem_id="col-container"):
         gr.HTML("""
+            <h1 style="text-align: center;">🎶 Generate Sound Effects from Image</h1>
+            <p style="text-align: center;">
+                ⚡ Powered by <a href="https://bilsimaging.com" target="_blank">Bilsimaging</a>
+            </p>
         """)

+        gr.Markdown("""
+        Welcome to this unique sound effect generator! This tool allows you to upload an image and generate a
+        descriptive caption and a corresponding sound effect, all using free, open-source models on Hugging Face.
+
+        **💡 How it works:**
+        1. **Upload an image**: Choose an image that you'd like to analyze.
+        2. **Generate Description**: Click on 'Generate Description' to get a textual description of your uploaded image.
+        3. **Generate Sound Effect**: Based on the image description, click on 'Generate Sound Effect' to create a
+           sound effect that matches the image context.
+
+        Enjoy the journey from visual to auditory sensation with just a few clicks!
+        """)

+        image_upload = gr.File(label="Upload Image", type="binary")
+        generate_description_button = gr.Button("Generate Description")
+        caption_display = gr.Textbox(label="Image Description", interactive=False)
+        generate_sound_button = gr.Button("Generate Sound Effect")
+        audio_output = gr.Audio(label="Generated Sound Effect")

+        gr.Markdown("""
+        ## 👥 How You Can Contribute
+        We welcome contributions and suggestions for improvements. Your feedback is invaluable
+        to the continuous enhancement of this application.
+
+        For support, questions, or to contribute, please contact us at
+        [contact@bilsimaging.com](mailto:contact@bilsimaging.com).
+
+        Support our work and get involved by donating through
+        [Ko-fi](https://ko-fi.com/bilsimaging). - Bilel Aroua
+        """)

+        gr.Markdown("""
+        ## 📢 Stay Connected
+        This app is a testament to the creative possibilities that emerge when technology meets art.
+        Enjoy exploring the auditory landscape of your images!
+        """)

+        def update_caption(image_file):
+            description, _ = analyze_image_with_free_model(image_file)
+            return description

+        def generate_sound(description):
+            if not description or description.startswith("Error"):
+                return None
+            audio_path = get_audioldm_from_caption(description)
+            return audio_path

+        generate_description_button.click(
+            fn=update_caption,
             inputs=image_upload,
+            outputs=caption_display
         )

+        generate_sound_button.click(
+            fn=generate_sound,
+            inputs=caption_display,
+            outputs=audio_output
         )
+
+        gr.HTML('<a href="https://visitorbadge.io/status?path=https%3A%2F%2Fhuggingface.co%2Fspaces%2FBils%2FGenerate-Sound-Effects-from-Image"><img src="https://api.visitorbadge.io/api/visitors?path=https%3A%2F%2Fhuggingface.co%2Fspaces%2FBils%2FGenerate-Sound-Effects-from-Image&countColor=%23263759" /></a>')
+        html = gr.HTML()

+demo.launch(debug=True, share=True)
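
Stripped of the Gradio UI, the flow this commit settles on is a two-step pipeline: caption the image with vit-gpt2-image-captioning, then hand the caption to AudioLDM 2. A minimal, Gradio-free sketch of that flow, assuming the same model IDs as above (describe and sonify are illustrative names, not functions in the app):

import torch
from transformers import pipeline
from diffusers import DiffusionPipeline
from scipy.io.wavfile import write

captioner = pipeline("image-to-text", model="nlpconnect/vit-gpt2-image-captioning",
                     device=0 if torch.cuda.is_available() else -1)
audio_pipe = DiffusionPipeline.from_pretrained("cvssp/audioldm2")

def describe(image_path):
    # Step 1: image -> caption, using the same captioning model as the Space.
    return captioner(image_path)[0]["generated_text"].strip()

def sonify(caption, out_path="effect.wav"):
    # Step 2: caption -> waveform, written as a 16 kHz WAV like the app does.
    audio = audio_pipe(prompt=caption, num_inference_steps=50,
                       guidance_scale=7.5).audios[0]
    write(out_path, 16000, audio)
    return out_path

# print(sonify(describe("example.jpg")))  # caption drives the generated effect

In the Space itself, the commit keeps the actual model calls inside @spaces.GPU-decorated functions, so ZeroGPU only holds a device for the duration of each call.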