# Image_to_Music / app.py
import streamlit as st
from transformers import AutoProcessor, BlipForConditionalGeneration, pipeline, AutoModelForCausalLM, AutoTokenizer
from PIL import Image as PILImage
import scipy.io.wavfile as wavfile
import os
import uuid
# Set page config at the very beginning
st.set_page_config(page_title="Image to Music", layout="wide")
# Load models outside of functions
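# st.cache_resource keeps these heavyweight models in memory across Streamlit
# reruns, so they are downloaded and initialised only once per server process.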
@st.cache_resource
def load_models():
    # BLIP: image captioning model and its processor
    model_id = "Salesforce/blip-image-captioning-large"
    processor = AutoProcessor.from_pretrained(model_id)
    blip_model = BlipForConditionalGeneration.from_pretrained(model_id)
    # MusicGen: text-to-audio generation
    synthesiser = pipeline("text-to-audio", model="facebook/musicgen-small")
    # Phi-3.5-mini-instruct: rewrites captions into detailed music prompts
    phi_model = AutoModelForCausalLM.from_pretrained(
        "microsoft/Phi-3.5-mini-instruct",
        device_map="auto",
        torch_dtype="auto",
        trust_remote_code=True
    )
    phi_tokenizer = AutoTokenizer.from_pretrained("microsoft/Phi-3.5-mini-instruct")
    return processor, blip_model, synthesiser, phi_model, phi_tokenizer

processor, blip_model, synthesiser, phi_model, phi_tokenizer = load_models()
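
# Captioning is not cached with st.cache_data: a PIL image argument would have
# to be excluded from Streamlit's argument hashing (the leading-underscore
# convention), and a cache keyed on no arguments would return the first
# image's caption for every later upload.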
def image_to_text(image: PILImage.Image):
    try:
        # Prepare the image for the model
        inputs = processor(images=image, return_tensors="pt")
        # Generate caption
        output = blip_model.generate(**inputs, max_new_tokens=100)
        # Decode the output
        caption = processor.decode(output[0], skip_special_tokens=True)
        return caption
        # # Create a music generation prompt based on the caption
        # music_prompt = f"Generate music inspired by this scene: {caption}. Consider elements like tempo, instrumentation, genre, and emotions evoked by the scene."
        # return music_prompt
    except Exception as e:
        return f"Error in image_to_text: {str(e)}"
@st.cache_data
def refine_prompt(caption: str):
    try:
        messages = [
            {"role": "system", "content": "You are a helpful AI assistant for generating music prompts."},
            {"role": "user", "content": f"Generate a detailed music prompt based on this scene: {caption}. Consider elements like tempo, instrumentation, genre, and emotions."}
        ]
        pipe = pipeline(
            "text-generation",
            model=phi_model,
            tokenizer=phi_tokenizer,
        )
        generation_args = {
            "max_new_tokens": 500,
            "return_full_text": False,
            "temperature": 0.7,
            "do_sample": True,
        }
        output = pipe(messages, **generation_args)
        refined_prompt = output[0]['generated_text']
        return refined_prompt
    except Exception as e:
        return f"Error in refine_prompt: {str(e)}"
def text_to_music(response: str):
    try:
        music = synthesiser(response, forward_params={"do_sample": True})
        output_path = f"musicgen_out_{uuid.uuid4()}.wav"
        wavfile.write(output_path, rate=music["sampling_rate"], data=music["audio"])
        return output_path
    except Exception as e:
        return f"Error in text_to_music: {str(e)}"

def cleanup_old_files():
    # Delete generated WAV files left over from earlier runs
    for file in os.listdir():
        if file.startswith("musicgen_out_") and file.endswith(".wav"):
            os.remove(file)

def main():
    # st.set_page_config(page_title="Image to Music", layout="wide")
    st.title("Image to Music")
    st.write("""
Generate music inspired by an image.

This app chains multiple AI models to create music inspired by the content of an image.

## How It Works
1. **Image to Text Description**
   - Use Salesforce BLIP to convert the image into a caption.
2. **Text to Refined Music Prompt**
   - Use Microsoft Phi-3.5-mini-instruct to generate a detailed music prompt based on the caption.
3. **Music Prompt to Music**
   - Use Facebook MusicGen to generate music from the refined prompt.

## Steps
1. **Image -> [ Salesforce BLIP ] -> Caption**
2. **Caption -> [ Microsoft Phi-3.5-mini ] -> Refined Music Prompt**
3. **Refined Music Prompt -> [ Facebook MusicGen ] -> Music**

Let's turn your visual inspirations into beautiful melodies!

**Please Note:**
The music generation process may take several minutes to complete.
This is due to the complex AI models working behind the scenes to create unique music based on your image.
Thank you for your patience!
""")
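
    # UI flow: upload an image, preview it, then run the caption -> prompt ->
    # music pipeline when the "Generate Music" button is pressed.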
    uploaded_file = st.file_uploader("Choose an image...", type=["jpg", "jpeg", "png"])

    if uploaded_file is not None:
        image = PILImage.open(uploaded_file)
        st.image(image, caption="Uploaded Image", use_column_width=True)

        if st.button("Generate Music"):
            # Remove WAV files from previous runs before generating a new one
            cleanup_old_files()

            with st.spinner("Processing image..."):
                caption = image_to_text(image)
                st.text_area("Generated Caption", caption, height=100)

            with st.spinner("Refining music prompt..."):
                refined_prompt = refine_prompt(caption)
                st.text_area("Refined Music Prompt", refined_prompt, height=150)

            with st.spinner("Generating music..."):
                music_file = text_to_music(refined_prompt)
                st.audio(music_file)

if __name__ == "__main__":
    main()