import os
import uuid

import scipy.io.wavfile as wavfile
import streamlit as st
from PIL import Image as PILImage
from transformers import (
    AutoModelForCausalLM,
    AutoProcessor,
    AutoTokenizer,
    BlipForConditionalGeneration,
    pipeline,
)

# Set page config at the very beginning
st.set_page_config(page_title="Image to Music", layout="wide")


# Load the models once and cache them across Streamlit reruns
@st.cache_resource
def load_models():
    model_id = "Salesforce/blip-image-captioning-large"
    processor = AutoProcessor.from_pretrained(model_id)
    blip_model = BlipForConditionalGeneration.from_pretrained(model_id)

    synthesiser = pipeline("text-to-audio", model="facebook/musicgen-small")

    phi_model = AutoModelForCausalLM.from_pretrained(
        "microsoft/Phi-3.5-mini-instruct",
        device_map="auto",
        torch_dtype="auto",
        trust_remote_code=True,
    )
    phi_tokenizer = AutoTokenizer.from_pretrained("microsoft/Phi-3.5-mini-instruct")

    return processor, blip_model, synthesiser, phi_model, phi_tokenizer


processor, blip_model, synthesiser, phi_model, phi_tokenizer = load_models()


# Not cached: with @st.cache_data the PIL image would have to be passed as an
# underscore-prefixed (unhashed) argument, so every upload would get the first
# image's cached caption.
def image_to_text(image: PILImage.Image):
    """Generate a caption for the uploaded image with BLIP."""
    try:
        # Prepare the image for the model
        inputs = processor(images=image, return_tensors="pt")
        # Generate and decode the caption
        output = blip_model.generate(**inputs, max_new_tokens=100)
        caption = processor.decode(output[0], skip_special_tokens=True)
        return caption
    except Exception as e:
        return f"Error in image_to_text: {str(e)}"


@st.cache_data
def refine_prompt(caption: str):
    """Turn the image caption into a detailed music prompt with Phi-3.5-mini-instruct."""
    try:
        messages = [
            {
                "role": "system",
                "content": "You are a helpful AI assistant for generating music prompts.",
            },
            {
                "role": "user",
                "content": (
                    f"Generate a detailed music prompt based on this scene: {caption}. "
                    "Consider elements like tempo, instrumentation, genre, and emotions."
                ),
            },
        ]

        pipe = pipeline(
            "text-generation",
            model=phi_model,
            tokenizer=phi_tokenizer,
        )
        generation_args = {
            "max_new_tokens": 500,
            "return_full_text": False,
            "temperature": 0.7,
            "do_sample": True,
        }

        output = pipe(messages, **generation_args)
        refined_prompt = output[0]["generated_text"]
        return refined_prompt
    except Exception as e:
        return f"Error in refine_prompt: {str(e)}"


def text_to_music(response: str):
    """Generate a WAV file from the refined prompt with MusicGen."""
    try:
        music = synthesiser(response, forward_params={"do_sample": True})
        output_path = f"musicgen_out_{uuid.uuid4()}.wav"
        wavfile.write(output_path, rate=music["sampling_rate"], data=music["audio"])
        return output_path
    except Exception as e:
        return f"Error in text_to_music: {str(e)}"


def cleanup_old_files():
    """Remove WAV files left over from earlier generations."""
    for file in os.listdir():
        if file.startswith("musicgen_out_") and file.endswith(".wav"):
            os.remove(file)


def main():
    st.title("Image to Music")

    st.write("""
Generate music inspired by an image. This app turns the inspiration drawn from an
image into music by chaining several AI models.

## How It Works

1. **Image to Text Description** - Salesforce BLIP converts the image into a caption.
2. **Text to Refined Music Prompt** - Microsoft Phi-3.5-mini-instruct expands the caption into a detailed music prompt.
3. **Music Prompt to Music** - Facebook MusicGen generates music from the refined prompt.

## Steps

1. **Image -> [ Salesforce BLIP ] -> Caption**
2. **Caption -> [ Microsoft Phi-3.5-mini-instruct ] -> Refined Music Prompt**
3. **Refined Music Prompt -> [ Facebook MusicGen ] -> Music**

Let's turn your visual inspirations into beautiful melodies!

**Please note:** Music generation may take several minutes to complete, because several
large AI models run behind the scenes to create unique music from your image. Thank you
for your patience!
""")

    uploaded_file = st.file_uploader("Choose an image...", type=["jpg", "jpeg", "png"])

    if uploaded_file is not None:
        image = PILImage.open(uploaded_file)
        st.image(image, caption="Uploaded Image", use_column_width=True)

        if st.button("Generate Music"):
            with st.spinner("Processing image..."):
                caption = image_to_text(image)
                st.text_area("Generated Caption", caption, height=100)

            with st.spinner("Refining music prompt..."):
                refined_prompt = refine_prompt(caption)
                st.text_area("Refined Music Prompt", refined_prompt, height=150)

            with st.spinner("Generating music..."):
                music_file = text_to_music(refined_prompt)
                st.audio(music_file)

            # st.audio has already read the file into the page, so the generated
            # WAVs can be removed to keep the working directory clean.
            cleanup_old_files()


if __name__ == "__main__":
    main()
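
# Usage sketch (assumptions: the filename app.py is not given in the original,
# and the dependency list below is inferred from the imports and from
# device_map="auto", which needs accelerate):
#
#   pip install streamlit transformers torch scipy pillow accelerate
#   streamlit run app.py
#
# The first run downloads the BLIP, Phi-3.5-mini-instruct, and MusicGen
# checkpoints, so expect a long initial startup.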