image-to-music-v2

Running on Zero

File size: 3,967 Bytes

import gradio as gr
from gradio_client import Client

# Client for the FuseCap image-captioning Space; get_caption() sends every
# uploaded image through it.
fusecap_client = Client(
    "https://noamrot-fusecap-image-captioning.hf.space/",
)

def get_caption(image_in):
    """Ask the FuseCap Space for a caption describing ``image_in``.

    Args:
        image_in: Image reference forwarded unchanged to the Space's
            ``/predict`` endpoint.

    Returns:
        Whatever the endpoint returns; the value is also printed to stdout
        for debugging.
    """
    caption = fusecap_client.predict(image_in, api_name="/predict")
    print(f"IMAGE CAPTION: {caption}")
    return caption

import re
import torch
from transformers import pipeline

# Text-generation pipeline used by infer() to turn a caption into a chatbot
# spec. Loaded once at import time.
pipe = pipeline(
    "text-generation",
    model="HuggingFaceH4/zephyr-7b-beta",
    torch_dtype=torch.bfloat16,
    device_map="auto",
)

# System prompt for the LLM: it is told to turn the user's image description
# into a friendly reply followed by a "Title:", "System prompt:" and
# "Example input:" section, with two worked examples of the exact format.
# Plain (non-f) string on purpose: there is nothing to interpolate, and a
# literal brace added to the prompt later must not be parsed as a format field.
agent_maker_sys = """
You are an AI whose job it is to help users create their own chatbots, based on the image description the user provide. In particular, you need to respond succintly in a friendly tone, write a system prompt for an LLM, a catchy title for the chatbot, and a very short example user input. Make sure each part is included.
First, user will provide an image description, from which you'll take inspiration for your job. WITHOUT mentioning any image, use user provided description to imagine a llm with a personality reflecting informations given. 
For example, if a user says, "make a bot that gives advice on how to grow your startup", first do a friendly response, then add the title, system prompt, and example user input. Immediately STOP after the example input. It should be EXACTLY in this format:
Sure, I'd be happy to help you build a bot! I'm generating a title, system prompt, and an example input. How do they sound? Feel free to give me feedback!
Title: Startup Coach
System prompt: Your job as an LLM is to provide good startup advice. Do not provide extraneous comments on other topics. Be succinct but useful. 
Example input: Risks of setting up a non-profit board
Here's another example. If a user types, "Make a chatbot that roasts tech ceos", respond: 
Sure, I'd be happy to help you build a bot! I'm generating a title, system prompt, and an example input. How do they sound? Feel free to give me feedback!
Title: Tech Roaster
System prompt: As an LLM, your primary function is to deliver hilarious and biting critiques of technology CEOs. Keep it witty and entertaining, but also make sure your jokes aren't too mean-spirited or factually incorrect. 
Example input: Elon Musk
"""

# Chat prefix sent before every caption: the system turn (closed with </s>)
# followed by an opened user turn. infer() strips it and appends the caption.
instruction = "\n<|system|>\n" + agent_maker_sys + "</s>\n<|user|>\n"

def infer(image_in):
    """Turn an uploaded image into a suggested chatbot title/system prompt.

    The image is captioned via ``get_caption``; the caption is then placed in
    the user turn of the chat prompt and completed by the ``pipe``
    text-generation pipeline.

    Args:
        image_in: Image reference coming from the UI's image component.

    Returns:
        The model's reply only (prompt removed, surrounding whitespace
        stripped).

    Raises:
        gr.Error: If no image was provided.
    """
    if image_in is None:
        # Nothing to caption; surface a readable message in the UI instead of
        # forwarding None to the remote captioning service.
        raise gr.Error("Please provide an image first.")

    gr.Info("Getting image caption from Fuse Cap...")
    user_prompt = get_caption(image_in)

    # Close the user turn AND open the assistant turn. Without the trailing
    # <|assistant|> tag the model has to invent it itself, and the prompt
    # removal below would silently fail whenever it did not.
    prompt = f"{instruction.strip()}\n{user_prompt}</s>\n<|assistant|>\n"
    print(f"PROMPT: {prompt}")

    gr.Info("Building a system according to the image caption ...")
    outputs = pipe(prompt, max_new_tokens=256, do_sample=True, temperature=0.7, top_k=50, top_p=0.95)
    print(outputs)

    generated_text = outputs[0]["generated_text"]
    if generated_text.startswith(prompt):
        # Usual case: the pipeline echoes the prompt in front of the reply.
        cleaned_text = generated_text[len(prompt):]
    else:
        # Fallback: drop everything from the system tag through the
        # assistant tag, as the original implementation did.
        pattern = r'\<\|system\|\>(.*?)\<\|assistant\|\>'
        cleaned_text = re.sub(pattern, '', generated_text, flags=re.DOTALL)

    return cleaned_text.strip()

# Page title and HTML blurb for the demo.
# NOTE: `title` used to end with a trailing comma, which made it a 1-tuple
# rather than a string; both values are plain strings (nothing to interpolate).
title = "LLM Agent from a Picture"
description = "Get a LLM system prompt from a picture so you can use it in <a href='https://huggingface.co/spaces/abidlabs/GPT-Baker'>GPT-Baker</a>."

# Stylesheet handed to gr.Blocks: centers the #col-container column and caps
# its width. The leading/trailing empty entries keep the surrounding newlines.
css = "\n".join([
    "",
    "#col-container{",
    "    margin: 0 auto;",
    "    max-width: 840px;",
    "    text-align: left;",
    "}",
    "",
])

# UI definition: a header, then a two-column row (image + button on the left,
# generated text on the right), with the button wired to infer().
with gr.Blocks(css=css) as demo:
    # Header lives in the column targeted by the #col-container CSS rule.
    with gr.Column(elem_id="col-container"):
        gr.HTML(f"""
        <h2 style="text-align: center;">LLM Agent from a Picture</h2>
        <p style="text-align: center;">{description}</p>
        """)

    with gr.Row():
        # Input side: the image is handed to infer() as a file path.
        with gr.Column():
            image_in = gr.Image(label="Image reference", type="filepath")
            submit_btn = gr.Button("Make LLM system from my pic !")
        # Output side: text returned by infer().
        with gr.Column():
            result = gr.Textbox(label="Suggested System")

    submit_btn.click(fn=infer, inputs=[image_in], outputs=[result])

# Enable request queuing, then start the app.
demo.queue().launch()