Spaces:

Bils
/

Generate-Sound-Effects-from-Image

Running on Zero

App Files Files Community

Bils committed 11 days ago

Commit

18fbeec

verified ·

1 Parent(s): f4d6ba6

Update app.py

Browse files

Files changed (1) hide show

app.py +104 -181

app.py CHANGED Viewed

@@ -7,216 +7,139 @@ import torch
 from scipy.io.wavfile import write
 from diffusers import DiffusionPipeline
 from transformers import pipeline
-from pydub import AudioSegment
-import numpy as np
-# Load environment variables
 load_dotenv()
 hf_token = os.getenv("HF_TKN")
-# Device configuration
-device = "cuda" if torch.cuda.is_available() else "cpu"
-torch_dtype = torch.float16 if device == "cuda" else torch.float32
-# Initialize models with automatic device detection
-@spaces.GPU(duration=120)
-def load_models():
-    global captioning_pipeline, pipe
-    captioning_pipeline = pipeline(
-        "image-to-text",
-        model="nlpconnect/vit-gpt2-image-captioning",
-        device=0 if torch.cuda.is_available() else -1
-    )
-    pipe = DiffusionPipeline.from_pretrained(
-        "cvssp/audioldm2",
-        use_auth_token=hf_token,
-        torch_dtype=torch_dtype
-    ).to(device)
-load_models()
-@spaces.GPU(duration=60)
-def analyze_image(image_file):
-    """Generate caption from image with error handling"""
     try:
-        results = captioning_pipeline(image_file)
-        if results and isinstance(results, list):
-            return results[0].get("generated_text", "").strip()
-        return "Could not generate caption"
     except Exception as e:
-        return f"Error: {str(e)}"
 @spaces.GPU(duration=120)
-def generate_audio(prompt):
-    """Generate audio from text prompt"""
     try:
-        return pipe(
-            prompt=prompt,
             num_inference_steps=50,
             guidance_scale=7.5
-        ).audios[0]
-    except Exception as e:
-        print(f"Audio generation error: {str(e)}")
-        return None
-def blend_audios(audio_list):
-    """Mix multiple audio arrays into one"""
-    try:
-        valid_audios = [arr for arr in audio_list if arr is not None]
-        if not valid_audios:
-            return None
-        max_length = max(arr.shape[0] for arr in valid_audios)
-        mixed = np.zeros(max_length)
-        for arr in valid_audios:
-            if arr.shape[0] < max_length:
-                padded = np.pad(arr, (0, max_length - arr.shape[0]))
-            else:
-                padded = arr[:max_length]
-            mixed += padded
-        mixed = mixed / np.max(np.abs(mixed))
-        _, tmp_path = tempfile.mkstemp(suffix=".wav")
-        write(tmp_path, 16000, mixed)
-        return tmp_path
     except Exception as e:
-        print(f"Blending error: {str(e)}")
         return None
 css = """
-#col-container { max-width: 800px; margin: 0 auto; }
-.toggle-row { margin: 1rem 0; }
-.prompt-box { margin-bottom: 0.5rem; }
-.danger { color: #ff4444; font-weight: bold; }
 """
 with gr.Blocks(css=css) as demo:
     with gr.Column(elem_id="col-container"):
-        # Header Section
         gr.HTML("""
-        <h1 style="text-align: center;">🎶 Generate Sound Effects from Image or Text</h1>
-        <p style="text-align: center;">
-            ⚡ Powered by <a href="https://bilsimaging.com" target="_blank">Bilsimaging</a>
-        </p>
         """)
-        # Input Mode Toggle
-        input_mode = gr.Radio(
-            choices=["Image Input", "Text Input"],
-            value="Image Input",
-            label="Select Input Mode",
-            elem_classes="toggle-row"
-        )
-        # Image Input Section
-        with gr.Column(visible=True) as image_col:
-            image_upload = gr.Image(type="filepath", label="Upload Image")
-            generate_desc_btn = gr.Button("Generate Description from Image", variant="primary")
-            caption_display = gr.Textbox(label="Generated Description", interactive=False)
-        # Text Input Section
-        with gr.Column(visible=False) as text_col:
-            with gr.Row():
-                prompt1 = gr.Textbox(label="Sound Prompt 1", lines=2, placeholder="Enter sound description...")
-                prompt2 = gr.Textbox(label="Sound Prompt 2", lines=2, placeholder="Enter sound description...")
-            additional_prompts = gr.Column()
-            add_prompt_btn = gr.Button("➕ Add Another Prompt", variant="secondary")
-            gr.Markdown("<div class='danger'>Max 5 prompts for stability</div>")
-        # Generation Controls
-        generate_sound_btn = gr.Button("Generate Sound Effect", variant="primary")
-        audio_output = gr.Audio(label="Generated Sound Effect", interactive=False)
-        # Documentation Section
-        gr.Markdown("""
-        ## 👥 How You Can Contribute
-        We welcome contributions! Contact us at [contact@bilsimaging.com](mailto:contact@bilsimaging.com).
-        Support us on [Ko-fi](https://ko-fi.com/bilsimaging) - Bilel Aroua
-        """)
-        # Visitor Badge
-        gr.HTML("""
-        <div style="text-align: center;">
-            <a href="https://visitorbadge.io/status?path=https://huggingface.co/spaces/Bils/Generate-Sound-Effects-from-Image">
-                <img src="https://api.visitorbadge.io/api/visitors?path=https://huggingface.co/spaces/Bils/Generate-Sound-Effects-from-Image&countColor=%23263759"/>
-            </a>
-        </div>
-        """)
-    # Input Mode Toggle Handler
-    input_mode.change(
-        lambda mode: (gr.update(visible=mode == "Image Input"), gr.update(visible=mode == "Text Input")),
-        inputs=input_mode,
-        outputs=[image_col, text_col],
-        concurrency_limit=1
-    )
-    # Image Description Generation
-    generate_desc_btn.click(
-        analyze_image,
         inputs=image_upload,
-        outputs=caption_display,
-        concurrency_limit=2
-    )
-    # Dynamic Prompt Addition
-    def add_prompt(current_count):
-        if current_count >= 5:
-            return current_count, gr.update()
-        new_count = current_count + 1
-        new_prompt = gr.Textbox(
-            label=f"Sound Prompt {new_count}",
-            lines=2,
-            visible=True,
-            placeholder="Enter sound description..."
-        )
-        return new_count, new_prompt
-    prompt_count = gr.State(2)
-    add_prompt_btn.click(
-        add_prompt,
-        inputs=prompt_count,
-        outputs=[prompt_count, additional_prompts],
-        concurrency_limit=1
     )
-    # Sound Generation Handler
-    def process_inputs(mode, image_file, caption, *prompts):
-        try:
-            if mode == "Image Input":
-                if not image_file:
-                    raise gr.Error("Please upload an image")
-                caption = analyze_image(image_file)
-                prompts = [caption]
-            else:
-                prompts = [p.strip() for p in prompts if p.strip()]
-                if not prompts:
-                    raise gr.Error("Please enter at least one valid prompt")
-            # Generate individual audio tracks
-            audio_tracks = []
-            for prompt in prompts:
-                if not prompt:
-                    continue
-                audio = generate_audio(prompt)
-                if audio is not None:
-                    audio_tracks.append(audio)
-            # Blend audio tracks
-            if not audio_tracks:
-                return None
-            return blend_audios(audio_tracks)
-        except Exception as e:
-            raise gr.Error(f"Processing error: {str(e)}")
-    generate_sound_btn.click(
-        process_inputs,
-        inputs=[input_mode, image_upload, caption_display, prompt1, prompt2],
-        outputs=audio_output,
-        concurrency_limit=2
     )
-if __name__ == "__main__":
-    demo.launch(max_threads=4)

 from scipy.io.wavfile import write
 from diffusers import DiffusionPipeline
 from transformers import pipeline
+from pathlib import Path
 load_dotenv()
 hf_token = os.getenv("HF_TKN")
+# Device index handed to the captioning pipeline: 0 when CUDA is visible,
+# otherwise -1 (the transformers convention for running on CPU).
+device_id = 0 if torch.cuda.is_available() else -1
+# Image-to-text pipeline used by analyze_image_with_free_model to caption uploads.
+captioning_pipeline = pipeline(
+    "image-to-text",
+    model="nlpconnect/vit-gpt2-image-captioning",
+    device=device_id
+)
+# AudioLDM2 text-to-audio pipeline, authenticated with the HF_TKN value read above.
+# No device is selected here; get_audioldm_from_caption moves it per request.
+# NOTE(review): `use_auth_token` is deprecated in recent Hugging Face libraries in
+# favour of `token` — confirm against the pinned diffusers version before switching.
+pipe = DiffusionPipeline.from_pretrained(
+    "cvssp/audioldm2",
+    use_auth_token=hf_token
+)
+@spaces.GPU(duration=120)
+def analyze_image_with_free_model(image_file):
+    """Caption an uploaded image with the module-level captioning pipeline.
+
+    Args:
+        image_file: Raw image bytes, as passed by the ``gr.File(type="binary")``
+            upload widget.
+
+    Returns:
+        A ``(text, is_error)`` tuple: the caption and ``False`` on success, or a
+        human-readable message and ``True`` when no caption could be produced.
+    """
+    if not image_file:
+        # Nothing uploaded yet: report it directly instead of failing inside write().
+        return "Error: No image was provided.", True
+    temp_image_path = None
     try:
+        # The pipeline is given a file path, so spill the bytes to disk first.
+        with tempfile.NamedTemporaryFile(suffix=".jpg", delete=False) as temp_file:
+            # Record the path before writing so cleanup still happens if write() fails.
+            temp_image_path = temp_file.name
+            temp_file.write(image_file)
+        results = captioning_pipeline(temp_image_path)
+        if not results or not isinstance(results, list):
+            return "Error: Could not generate caption.", True
+        caption = results[0].get("generated_text", "").strip()
+        if not caption:
+            return "No caption was generated.", True
+        return caption, False
     except Exception as e:
+        return f"Error analyzing image: {e}", True
+    finally:
+        # delete=False leaves the file on disk; remove it once captioning is over
+        # so repeated requests do not fill the temp directory.
+        if temp_image_path is not None:
+            try:
+                os.remove(temp_image_path)
+            except OSError:
+                # Best-effort cleanup: a leftover temp file must not mask the result.
+                pass
 @spaces.GPU(duration=120)
+def get_audioldm_from_caption(caption):
+    """Generate a sound effect for a text prompt and save it as a WAV file.
+
+    Args:
+        caption: Text prompt passed to the AudioLDM2 pipeline.
+
+    Returns:
+        Path of a temporary ``.wav`` file (written at 16000 Hz), or ``None`` if
+        generation or writing failed; the error is printed in that case.
+    """
+    # Use the GPU when one is visible, but still work on CPU-only hosts instead
+    # of failing on a hard-coded "cuda".
+    target_device = "cuda" if torch.cuda.is_available() else "cpu"
     try:
+        pipe.to(target_device)
+        audio_output = pipe(
+            prompt=caption,
             num_inference_steps=50,
             guidance_scale=7.5
+        )
+        audio = audio_output.audios[0]
+        # Reserve a unique path and close the descriptor before SciPy reopens the
+        # file by name, rather than writing through a second handle to an open file.
+        fd, wav_path = tempfile.mkstemp(suffix=".wav")
+        os.close(fd)
+        write(wav_path, 16000, audio)
+        return wav_path
     except Exception as e:
+        print(f"Error generating audio from caption: {e}")
         return None
+    finally:
+        # Always hand the pipeline back to the CPU, including when inference raised.
+        pipe.to("cpu")
+# Page-level CSS: centres the element with id "col-container" and caps its width at 800px.
 css = """
+#col-container{
+    margin: 0 auto;
+    max-width: 800px;
+    }
 """
 with gr.Blocks(css=css) as demo:
+    # Only the header HTML is nested inside this column; the statements after this
+    # block are indented one level less and so sit directly under the Blocks root.
+    # NOTE(review): that leaves the 800px cap applying to the header alone — confirm intended.
     with gr.Column(elem_id="col-container"):
         gr.HTML("""
+    <h1 style="text-align: center;">🎶 Generate Sound Effects from Image</h1>
+    <p style="text-align: center;">
+        ⚡ Powered by <a href="https://bilsimaging.com" target="_blank">Bilsimaging</a>
+    </p>
         """)
+    gr.Markdown("""
+    Welcome to this unique sound effect generator! This tool allows you to upload an image and generate a
+    descriptive caption and a corresponding sound effect, all using free, open-source models on Hugging Face.
+    **💡 How it works:**
+    1. **Upload an image**: Choose an image that you'd like to analyze.
+    2. **Generate Description**: Click on 'Generate Description' to get a textual description of your uploaded image.
+    3. **Generate Sound Effect**: Based on the image description, click on 'Generate Sound Effect' to create a
+       sound effect that matches the image context.
+    Enjoy the journey from visual to auditory sensation with just a few clicks!
+    """)
+    # Upload widget; type="binary" makes Gradio hand the file's raw bytes to the handler.
+    image_upload = gr.File(label="Upload Image", type="binary")
+    generate_description_button = gr.Button("Generate Description")
+    # Read-only box: filled by the description button and read back by the sound button.
+    caption_display = gr.Textbox(label="Image Description", interactive=False)
+    generate_sound_button = gr.Button("Generate Sound Effect")
+    # Receives whatever generate_sound returns: a .wav file path, or None on failure.
+    audio_output = gr.Audio(label="Generated Sound Effect")
+    gr.Markdown("""
+    ## 👥 How You Can Contribute
+    We welcome contributions and suggestions for improvements. Your feedback is invaluable
+    to the continuous enhancement of this application.
+    For support, questions, or to contribute, please contact us at
+    [contact@bilsimaging.com](mailto:contact@bilsimaging.com).
+    Support our work and get involved by donating through
+    [Ko-fi](https://ko-fi.com/bilsimaging). - Bilel Aroua
+    """)
+    gr.Markdown("""
+    ## 📢 Stay Connected
+    This app is a testament to the creative possibilities that emerge when technology meets art.
+    Enjoy exploring the auditory landscape of your images!
+    """)
+    def update_caption(image_file):
+        """Caption the upload and return only the text half of the (text, is_error) pair."""
+        result = analyze_image_with_free_model(image_file)
+        caption_text, _is_error = result
+        return caption_text
+    def generate_sound(description):
+        """Turn the caption textbox content into a sound effect.
+
+        Args:
+            description: Current text of the caption box; may be empty or hold a
+                failure message written by the captioning step.
+
+        Returns:
+            Path of the generated audio file, or ``None`` when there is no usable
+            caption or audio generation failed.
+        """
+        prompt = (description or "").strip()
+        # Captioning failures arrive as plain text in the same textbox. Besides the
+        # "Error..." messages, "No caption was generated." is also a failure and must
+        # not be sent to the audio model as if it were a prompt.
+        if not prompt or prompt.startswith(("Error", "No caption was generated")):
+            return None
+        return get_audioldm_from_caption(prompt)
+    # Description button: uploaded image bytes in, caption text out.
+    generate_description_button.click(update_caption, image_upload, caption_display)
+    # Sound button: caption text in, generated audio file out.
+    generate_sound_button.click(generate_sound, caption_display, audio_output)
+    # Visitor-counter badge for this Space.
+    gr.HTML('<a href="https://visitorbadge.io/status?path=https%3A%2F%2Fhuggingface.co%2Fspaces%2FBils%2FGenerate-Sound-Effects-from-Image"><img src="https://api.visitorbadge.io/api/visitors?path=https%3A%2F%2Fhuggingface.co%2Fspaces%2FBils%2FGenerate-Sound-Effects-from-Image&countColor=%23263759" /></a>')
+    # Empty HTML component; nothing in the visible code writes to it.
+    html = gr.HTML()
+# Start the app as soon as the module runs, in debug mode with a share link requested.
+demo.launch(debug=True, share=True)