# Source: Hugging Face Space "Bton/Image2AudioButBetter"
# (Space status when captured: Runtime error; file size: 4,161 bytes)

import gradio as gr
from gradio_client import Client
import json
import re
from moviepy.editor import VideoFileClip
from moviepy.audio.AudioClip import AudioClip

def extract_audio(video_in, output_audio=None):
    """Extract the audio track of a video file and save it as a WAV file.

    Args:
        video_in: Path of the video file to read.
        output_audio: Destination path for the WAV file. When omitted, a
            uniquely named temporary ``.wav`` file is created so that
            concurrent calls do not overwrite each other's output.

    Returns:
        The path of the written WAV file.

    Raises:
        ValueError: If the video contains no audio track.
    """
    if output_audio is None:
        # Local import keeps this block self-contained.
        import tempfile

        # delete=False: the file must outlive this function because the
        # caller consumes it by path.
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
            output_audio = tmp.name

    # Open the video file and extract the audio
    video_clip = VideoFileClip(video_in)
    try:
        audio_clip = video_clip.audio
        if audio_clip is None:
            raise ValueError(f"No audio track found in {video_in!r}")

        # Save the audio as a .wav file, using 44100 Hz as the sample rate
        audio_clip.write_audiofile(output_audio, fps=44100)
    finally:
        # Always release the reader resources held by the clip.
        video_clip.close()

    print("Audio extraction complete.")
    return output_audio

def _extract_kosmos_description(full_sentence):
    """Strip the instruction prefix and any trailing unfinished sentence."""
    # Expected format: "Describe this image in detail:" followed by optional
    # whitespace and then the actual description.
    match = re.search(r'^Describe this image in detail:\s*(.*)$', full_sentence)
    if match:
        description = match.group(1)
        print(description)
    else:
        print("Unable to locate valid description.")
        # Fall back to the raw text instead of failing on an unbound name.
        description = full_sentence

    # Truncate after the last period to drop a dangling partial sentence.
    last_period_index = description.rfind('.')
    if last_period_index == -1:
        # No period at all: keep the text rather than returning "".
        return description
    return description[:last_period_index + 1]


def get_caption_from_kosmos(image_in):
    """Request a detailed image caption from the remote Kosmos-2 Space.

    Args:
        image_in: Filepath or URL of the image to describe.

    Returns:
        The caption text with the "Describe this image in detail:" prefix
        removed and cut after its last period. If the prefix is missing, the
        full reconstructed text is used; if there is no period, the text is
        returned untruncated.
    """
    kosmos2_client = Client("https://ydshieh-kosmos-2.hf.space/")

    kosmos2_result = kosmos2_client.predict(
        image_in,    # str (filepath or URL to image) in 'Test Image' Image component
        "Detailed",  # str in 'Description Type' Radio component
        fn_index=4
    )

    print(f"KOSMOS2 RETURNS: {kosmos2_result}")

    # The second element of the result is read as a path to a JSON file.
    with open(kosmos2_result[1], 'r', encoding='utf-8') as f:
        data = json.load(f)

    # Each entry's first item is a text fragment; join them with spaces.
    full_sentence = ' '.join(sublist[0] for sublist in data)

    truncated_caption = _extract_kosmos_description(full_sentence)
    print(f"\n—\nIMAGE CAPTION: {truncated_caption}")

    return truncated_caption

def get_caption(image_in):
    """Ask the remote moondream1 Space to describe a fitting sound effect.

    Args:
        image_in: Filepath of the image, forwarded as the first input of the
            Space's ``/answer_question`` endpoint.

    Returns:
        Whatever the endpoint returns for the question; it is also printed.
    """
    question = "provided the given image caption, generate a one sentence long description of an appropriate sound effect for the context"
    moondream = Client("https://vikhyatk-moondream1.hf.space/")
    answer = moondream.predict(image_in, question, api_name="/answer_question")
    print(answer)
    return answer

def get_audioldm(prompt):
    """Generate audio for a text prompt via the remote AudioLDM-2 Space.

    Args:
        prompt: Text sent as the first input of the remote endpoint.

    Returns:
        The path returned by ``extract_audio`` for the generated result.
    """
    # Positional inputs of the endpoint at fn_index=1.
    # NOTE(review): after the prompt these are presumably negative prompt,
    # duration, guidance scale, seed and number of candidates — confirm
    # against the Space's API page.
    endpoint_inputs = (prompt, "low quality", 10, 3.5, 45, 3)

    audioldm = Client("https://haoheliu-audioldm2-text2audio-text2music.hf.space/")
    generated = audioldm.predict(*endpoint_inputs, fn_index=1)
    print(generated)

    # The result is handed to extract_audio, which opens it as a video file.
    return extract_audio(generated)

def infer(image_in, chosen_model):
    """Caption an image and turn the caption into audio with one model.

    Args:
        image_in: Filepath of the input image.
        chosen_model: One of "MAGNet", "AudioLDM-2" or "AudioGen".

    Returns:
        The result of the selected model's generator function.

    Raises:
        ValueError: If no image is given, the model label is unknown, or the
            generator function for the model is not defined in this module.
    """
    if not image_in:
        raise ValueError("No input image provided.")

    # UI label -> name of the module-level generator function.
    backend_names = {
        "MAGNet": "get_magnet",
        "AudioLDM-2": "get_audioldm",
        "AudioGen": "get_audiogen",
    }
    backend_name = backend_names.get(chosen_model)
    if backend_name is None:
        raise ValueError(f"Unknown model: {chosen_model!r}")

    # Resolve lazily so a missing backend yields a readable error rather
    # than a NameError in the middle of the request.
    generate = globals().get(backend_name)
    if generate is None:
        raise ValueError(f"Model {chosen_model!r} is not available in this app.")

    # Validate first, then pay for the remote captioning call.
    caption = get_caption(image_in)
    return generate(caption)

# Custom CSS for the page: centers the element with id "col-container"
# and caps its width at 800px.
css="""
#col-container{
    margin: 0 auto;
    max-width: 800px;
}
"""

# Build the UI: a single centered column holding the title, the inputs
# (image + model choice + submit button) and the audio output.
with gr.Blocks(css=css) as demo:
    with gr.Column(elem_id="col-container"):
        gr.HTML("""
        <h2 style="text-align: center;">
            Image to SFX
        </h2>
        <p style="text-align: center;">
            Compare MAGNet, AudioLDM2 and AudioGen sound effects generation from image caption.
        </p>
        """)
        
        with gr.Column():
            # The image is passed to the handler as a filepath and is
            # pre-filled with "doggy.jpg".
            # NOTE(review): presumably doggy.jpg ships next to this script — confirm it exists.
            image_in = gr.Image(sources=["upload"], type="filepath", label="Image input", value="doggy.jpg")
            with gr.Row():
                # Only "AudioLDM-2" is selectable here, although infer() also
                # has branches for "MAGNet" and "AudioGen".
                chosen_model = gr.Radio(label="Choose a model", choices=["AudioLDM-2"], value="AudioLDM-2")
                submit_btn = gr.Button("Submit")
        with gr.Column():
            audio_o = gr.Audio(label="Audio output")
    
    # Clicking "Submit" calls infer(image_in, chosen_model) and sends its
    # return value to the audio output component.
    # NOTE(review): concurrency_limit presumably allows up to 4 simultaneous
    # runs of this handler — confirm against the Gradio version in use.
    submit_btn.click(
        fn=infer,
        inputs=[image_in, chosen_model],
        outputs=[audio_o],
        concurrency_limit = 4
    )

# Enable the request queue (max_size=10) and start the app in debug mode.
# This runs at import time; there is no __main__ guard.
demo.queue(max_size=10).launch(debug=True)