# Bton's picture
# Update app.py
# 953ffb1 verified
import gradio as gr
from gradio_client import Client
import json
import re
from moviepy.editor import VideoFileClip
from moviepy.audio.AudioClip import AudioClip
def extract_audio(video_in):
    """Extract the audio track of a video file into a 44.1 kHz ``.wav`` file.

    Args:
        video_in: Path of the video file to read.

    Returns:
        Path of the ``.wav`` file that was written. Every call writes to its
        own temporary file, so concurrent requests cannot overwrite each
        other's output.

    Raises:
        ValueError: If the video has no audio track.
    """
    import tempfile

    video_clip = VideoFileClip(video_in)
    try:
        audio_clip = video_clip.audio
        if audio_clip is None:
            # Without this check the failure would be an opaque AttributeError.
            raise ValueError(f"No audio track found in {video_in!r}")

        # A fixed 'audio.wav' would be shared by every request, and the click
        # handler allows several concurrent calls; reserve a unique path.
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
            output_audio = tmp.name

        # Use 44100 Hz as the sample rate for .wav files
        audio_clip.write_audiofile(output_audio, fps=44100)
    finally:
        # Always release the resources held by the clip, even on failure.
        video_clip.close()

    print("Audio extraction complete.")
    return output_audio
def _clean_kosmos_caption(full_sentence):
    """Strip the instruction prefix and any trailing incomplete sentence."""
    # DOTALL lets the capture group span line breaks inside the caption.
    pattern = r'^Describe this image in detail:\s*(.*)$'
    match = re.search(pattern, full_sentence, re.DOTALL)
    if match:
        description = match.group(1)
        print(description)
    else:
        print("Unable to locate valid description.")
        # Fall back to the raw text instead of failing on an unbound name.
        description = full_sentence

    last_period_index = description.rfind('.')
    if last_period_index == -1:
        # No sentence terminator: keep the text rather than returning "".
        return description
    # Truncate the string up to (and including) the last period.
    return description[:last_period_index + 1]


def get_caption_from_kosmos(image_in):
    """Caption an image through the Kosmos-2 Space and return the cleaned text.

    The second element of the Space's response is opened as a JSON file; the
    first item of every entry in that file is joined with spaces to rebuild
    the sentence. The "Describe this image in detail:" prefix is removed when
    present, and the text is cut after its last period.

    Args:
        image_in: Filepath or URL of the image to caption.

    Returns:
        The cleaned caption. If the expected prefix is missing the full text
        is used; if the text contains no period it is returned untruncated.
    """
    kosmos2_client = Client("https://ydshieh-kosmos-2.hf.space/")
    kosmos2_result = kosmos2_client.predict(
        image_in,  # str (filepath or URL to image) in 'Test Image' Image component
        "Detailed",  # str in 'Description Type' Radio component
        fn_index=4
    )
    print(f"KOSMOS2 RETURNS: {kosmos2_result}")

    with open(kosmos2_result[1], 'r', encoding="utf-8") as f:
        data = json.load(f)

    full_sentence = ' '.join(sublist[0] for sublist in data)

    truncated_caption = _clean_kosmos_caption(full_sentence)
    print(f"\n—\nIMAGE CAPTION: {truncated_caption}")
    return truncated_caption
def get_caption(image_in):
    """Ask the moondream1 Space to describe a fitting sound effect for an image.

    Args:
        image_in: Filepath of the image, sent to the Space's 'image' input.

    Returns:
        Whatever the Space's ``/answer_question`` endpoint returns for the
        fixed sound-effect question (also printed to stdout).
    """
    # The question is fixed: it is the text sent to the 'Question' textbox.
    question = "provided the given image caption, generate a one sentence long description of an appropriate sound effect for the context"

    moondream = Client("https://vikhyatk-moondream1.hf.space/")
    answer = moondream.predict(image_in, question, api_name="/answer_question")

    print(answer)
    return answer
def get_audioldm(prompt):
    """Generate audio for a text prompt with the AudioLDM-2 Space.

    Args:
        prompt: Text description of the sound to generate.

    Returns:
        Path of the ``.wav`` file produced by ``extract_audio`` from the
        Space's result (the result is handed to it as a video file path).
    """
    audioldm = Client("https://haoheliu-audioldm2-text2audio-text2music.hf.space/")

    # Positional inputs of the Space's endpoint (fn_index=1).
    # NOTE(review): after the prompt these are presumably negative prompt,
    # duration, guidance scale, seed and candidate count -- confirm against
    # the Space's API page.
    generation_args = (prompt, "low quality", 10, 3.5, 45, 3)
    generated = audioldm.predict(*generation_args, fn_index=1)
    print(generated)

    return extract_audio(generated)
def infer(image_in, chosen_model):
    """Caption an image, then generate a sound effect with the chosen model.

    Args:
        image_in: Filepath of the input image.
        chosen_model: One of "MAGNet", "AudioLDM-2" or "AudioGen".

    Returns:
        The result of the selected backend called with the caption.

    Raises:
        gr.Error: If ``chosen_model`` is not a recognised model name.
    """
    supported_models = ("MAGNet", "AudioLDM-2", "AudioGen")
    # Validate before captioning: an unknown model used to fall through and
    # return None, but only after paying for the remote captioning call.
    if chosen_model not in supported_models:
        raise gr.Error(f"Unsupported model: {chosen_model!r}")

    caption = get_caption(image_in)

    # NOTE(review): get_magnet and get_audiogen are not defined in the visible
    # part of this file -- confirm they exist before offering those choices.
    if chosen_model == "MAGNet":
        return get_magnet(caption)
    if chosen_model == "AudioLDM-2":
        return get_audioldm(caption)
    return get_audiogen(caption)
# Stylesheet passed to gr.Blocks; targets the element with id "col-container".
css = """
#col-container{
margin: 0 auto;
max-width: 800px;
}
"""

# NOTE(review): the nesting below is reconstructed -- indentation was lost in
# the source this was recovered from. Confirm the layout renders as intended.
with gr.Blocks(css=css) as demo:
    with gr.Column(elem_id="col-container"):
        # Page title and short description.
        gr.HTML("""
<h2 style="text-align: center;">
Image to SFX
</h2>
<p style="text-align: center;">
Compare MAGNet, AudioLDM2 and AudioGen sound effects generation from image caption.
</p>
""")

        # Inputs: the image, the model selector and the submit button.
        with gr.Column():
            image_in = gr.Image(
                sources=["upload"],
                type="filepath",
                label="Image input",
                value="doggy.jpg",
            )
            with gr.Row():
                chosen_model = gr.Radio(
                    label="Choose a model",
                    choices=["AudioLDM-2"],
                    value="AudioLDM-2",
                )
                submit_btn = gr.Button("Submit")

        # Output: the generated audio.
        with gr.Column():
            audio_o = gr.Audio(label="Audio output")

    # Run infer on the image and selected model when Submit is clicked.
    submit_btn.click(
        fn=infer,
        inputs=[image_in, chosen_model],
        outputs=[audio_o],
        concurrency_limit=4,
    )

# Enable the request queue (max 10 entries) and start the app in debug mode.
demo.queue(max_size=10).launch(debug=True)