Spaces:

jadechoghari
/

ferret-demo

Running on Zero

File size: 8,686 Bytes

import gradio as gr
from inference import inference_and_run
import spaces
import os
import re
import shutil

# Display name used for the page title and inside the header markup.
model_name = 'Ferret-UI'
# Absolute directory that contains this script; asset paths are built from it.
cur_dir = os.path.split(os.path.abspath(__file__))[0]

@spaces.GPU()
def inference_with_gradio(chatbot, image, prompt, model_path, box=None, temperature=0.2, top_p=0.7, max_new_tokens=512):
    """Run one inference turn and return the chat history with the new reply.

    Args:
        chatbot: Existing history as a list of ``(prompt, reply)`` pairs, or a
            falsy value when the conversation is empty. It is not mutated.
        image: Filesystem path of the uploaded image.
        prompt: User prompt text.
        model_path: Model identifier selected in the UI. It is forwarded to
            ``inference_and_run`` and also decides the conversation template
            (``"gemma"`` in the name selects the Gemma template, anything
            else the Llama-3 one).
        box: Optional bounding-box string. A blank string is treated the same
            as ``None``.
        temperature: Sampling temperature, forwarded unchanged.
        top_p: Nucleus-sampling value, forwarded unchanged.
        max_new_tokens: Generation length limit, forwarded unchanged.

    Returns:
        A new list containing the previous history followed by
        ``(prompt, reply)``.

    Raises:
        gr.Error: If no image was provided.
    """
    if not image:
        # Without this guard os.path.basename(None) fails with an opaque TypeError.
        raise gr.Error("Please upload an image before sending a prompt.")

    # The image is copied into the current directory because inference_and_run
    # is given a bare file name plus the directory that holds it.
    filename = os.path.basename(image)
    dir_path = "./"
    image_path = os.path.join(dir_path, filename)
    try:
        shutil.copy(image, image_path)
    except shutil.SameFileError:
        # The upload already lives in the working directory; nothing to copy.
        pass
    print("filename path: ", filename)

    conv_mode = "ferret_gemma_instruct" if "gemma" in model_path.lower() else "ferret_llama_3"

    # An empty textbox yields "" rather than None; normalise it to "no box".
    # NOTE(review): assumes inference_and_run treats box=None as "no region" -- confirm.
    if isinstance(box, str):
        box = box.strip() or None

    inference_text = inference_and_run(
        image_path=filename,
        image_dir=dir_path,
        prompt=prompt,
        # Forward the selected model so it always matches conv_mode; previously
        # the Gemma checkpoint was hard-coded regardless of the dropdown.
        model_path=model_path,
        conv_mode=conv_mode,
        temperature=temperature,
        top_p=top_p,
        box=box,
        max_new_tokens=max_new_tokens,
    )

    # A list/tuple result is reduced to its first element, rendered as text.
    if isinstance(inference_text, (list, tuple)):
        inference_text = str(inference_text[0])

    # Build a fresh history list so the caller's object is left untouched.
    new_history = list(chatbot) if chatbot else []
    new_history.append((prompt, inference_text))
    return new_history

def submit_chat(chatbot, text_input):
    """Hand the chat history back unchanged together with an empty prompt.

    ``text_input`` is accepted but never read; the second returned value is
    always the empty string.
    """
    cleared_prompt = ''
    return chatbot, cleared_prompt

def clear_chat():
    """Return the reset values for the widgets wired to the Clear button."""
    reset_values = (
        [],    # chat history
        None,  # image
        "",    # prompt text
        "",    # bounding box text
        0.2,   # temperature
        0.7,   # top-p
        512,   # max new tokens
    )
    return reset_values

# Size applied to both the logo and the heading text.
font_size = "2.5em"

# Read the SVG logo stored next to this script.
with open(f"{cur_dir}/logo.svg", mode="r", encoding="utf-8") as svg_file:
    svg_content = svg_file.read()

# Inject a height and inline-alignment style just before the closing ">" of
# each opening <svg ...> tag.
svg_content = re.compile(r'(<svg[^>]*)(>)').sub(
    rf'\1 height="{font_size}" style="vertical-align: middle; display: inline-block;"\2',
    svg_content,
)

# Logo + model-name header.
# NOTE(review): `html` is assigned again further down in this file, so this
# value appears to be unused by the time the page is built.
html = f"""
<p align="center" style="font-size: {font_size}; line-height: 1;">
    <span style="display: inline-block; vertical-align: middle;">{svg_content}</span>
    <span style="display: inline-block; vertical-align: middle;">{model_name}</span>
</p>
<center><font size=3><b>{model_name}</b> Demo: Upload an image, provide a prompt, and get insights using advanced AI models. <a href='https://huggingface.co/jadechoghari/Ferret-UI-Gemma2b'>😊 Huggingface</a></font></center>
"""

# Raw bytes of the PNG icon stored next to this script.
# NOTE(review): nothing visible in this file reads `image_data`; it was only
# referenced by a disabled header variant that inlined the icon as base64.
with open(f"{cur_dir}/ferretui_icon.png", mode="rb") as image_file:
    image_data = image_file.read()

# Header markup shown at the top of the page (passed to gr.HTML below).
# This assignment replaces any value `html` held before this point.
# Only {model_name} is interpolated; the rest is static, inline-styled HTML.
html = f"""
<div style="text-align: center; padding: 20px;">
    <div style="display: inline-block; background-color: #f5f5f7; padding: 20px; border-radius: 20px; box-shadow: 0px 6px 20px rgba(0, 0, 0, 0.1);">
        <div style="display: flex; align-items: center;">
            <img src='https://github.com/apple/ml-ferret/blob/main/ferretui/figs/ferretui_icon.png?raw=true' alt='Ferret-UI' 
                style='width: 80px; height: 80px; border-radius: 20px; box-shadow: 0px 8px 16px rgba(0, 0, 0, 0.2);'/>
            <div style="margin-left: 15px;">
                <h1 style="font-size: 2.8em; font-family: -apple-system, BlinkMacSystemFont, sans-serif; color: #1D1D1F; 
                font-weight: bold; margin-bottom: 0;"> {model_name}</h1>
                <p style="font-size: 1.2em; color: #6e6e73; font-family: -apple-system, BlinkMacSystemFont, sans-serif; margin-top: 5px;">
                    📱 Grounded Mobile UI Understanding with Multimodal LLMs.<br>
                    A new MLLM tailored for enhanced understanding of mobile UI screens, equipped with referring, grounding, and reasoning capabilities.
                </p>
                <a href='https://huggingface.co/jadechoghari/Ferret-UI-Gemma2b' style='text-decoration: none;'>
                    <button style="background-color: #007aff; color: white; font-size: 1.2em; padding: 10px 20px; border-radius: 10px; border: none; margin-top: 10px; box-shadow: 0px 4px 12px rgba(0, 122, 255, 0.4); cursor: pointer;">
                        🤗 Try on Hugging Face
                    </button>
                </a>
            </div>
        </div>
    </div>
    <p style="font-size: 1.2em; color: #86868B; font-family: -apple-system, BlinkMacSystemFont, sans-serif; margin-top: 30px;">
        We release two Ferret-UI checkpoints, built on gemma-2b and Llama-3-8B models respectively, for public exploration. 🚀
    </p>
</div>
"""

# LaTeX delimiter pairs handed to the chatbot component: one inline form and
# two display-mode environments.
latex_delimiters_set = [
    {"left": opening, "right": closing, "display": is_display}
    for opening, closing, is_display in (
        ("\\(", "\\)", False),
        ("\\begin{equation}", "\\end{equation}", True),
        ("\\begin{align}", "\\end{align}", True),
    )
]

# Components are constructed up front and placed into the layout later with
# .render().
image_input = gr.Image(
    label="Upload Image",
    type="filepath",
    height=350,
)
text_input = gr.Textbox(
    lines=2,
    placeholder="Enter your prompt here...",
    label="Prompt",
)
model_dropdown = gr.Dropdown(
    choices=[
        "jadechoghari/Ferret-UI-Gemma2b",
        "jadechoghari/Ferret-UI-Llama8b",
    ],
    label="Model Path",
    value="jadechoghari/Ferret-UI-Gemma2b",
)

bounding_box_input = gr.Textbox(
    placeholder="Optional bounding box (x1, y1, x2, y2)",
    label="Bounding Box (optional)",
)

# Generation settings exposed as sliders.
temperature_input = gr.Slider(
    minimum=0.1, maximum=2.0, step=0.1, value=0.2, label="Temperature",
)
top_p_input = gr.Slider(
    minimum=0.0, maximum=1.0, step=0.05, value=0.7, label="Top P",
)
max_new_tokens_input = gr.Slider(
    minimum=1, maximum=1024, step=1, value=512, label="Max New Tokens",
)

# Conversation display.
chatbot = gr.Chatbot(
    label="Chat with Ferret-UI",
    height=400,
    show_copy_button=True,
    latex_delimiters=latex_delimiters_set,
)

with gr.Blocks(title=model_name, theme=gr.themes.Ocean()) as demo:
    gr.HTML(html)
    with gr.Row():
        # Left column: every input widget in display order, then the examples.
        with gr.Column(scale=3):
            for widget in (
                image_input,
                text_input,
                model_dropdown,
                bounding_box_input,
                temperature_input,
                top_p_input,
                max_new_tokens_input,
            ):
                widget.render()
            gr.Examples(
                examples=[
                    ["appstore_reminders.png", "Describe the image in details", "jadechoghari/Ferret-UI-Gemma2b", None],
                    ["appstore_reminders.png", "What's inside the selected region?", "jadechoghari/Ferret-UI-Gemma2b", "189, 906, 404, 970"],
                    ["appstore_reminders.png", "Where is the Game Tab?", "jadechoghari/Ferret-UI-Gemma2b", None],
                ],
                inputs=[image_input, text_input, model_dropdown, bounding_box_input],
            )
        # Right column: the conversation and its action buttons.
        with gr.Column(scale=7):
            chatbot.render()
            with gr.Row():
                send_btn = gr.Button("Send", variant="primary")
                clear_btn = gr.Button("Clear", variant="secondary")

    # The Send button and pressing Enter in the prompt box share one pipeline:
    # run inference into the chatbot, then blank the prompt box.
    inference_inputs = [
        chatbot,
        image_input,
        text_input,
        model_dropdown,
        bounding_box_input,
        temperature_input,
        top_p_input,
        max_new_tokens_input,
    ]
    send_click_event = send_btn.click(
        fn=inference_with_gradio,
        inputs=inference_inputs,
        outputs=chatbot,
    ).then(
        fn=submit_chat,
        inputs=[chatbot, text_input],
        outputs=[chatbot, text_input],
    )
    submit_event = text_input.submit(
        fn=inference_with_gradio,
        inputs=inference_inputs,
        outputs=chatbot,
    ).then(
        fn=submit_chat,
        inputs=[chatbot, text_input],
        outputs=[chatbot, text_input],
    )

    # Clear resets the history, image, both text boxes and the three sliders.
    clear_btn.click(
        fn=clear_chat,
        outputs=[
            chatbot,
            image_input,
            text_input,
            bounding_box_input,
            temperature_input,
            top_p_input,
            max_new_tokens_input,
        ],
    )

demo.launch()