Spaces:

llava-hf
/

llava-4bit

Running on T4

File size: 5,333 Bytes

8400add
 
 
 
 
 
ee95e21
8400add
 
 
 
e5327ee
 
 
 
 
 
 
 
11e466e
8400add
458ccb5
 
 
 
 
 
 
 
8400add
 
 
 
 
 
 
 
 
08bcb47
11e466e
08bcb47
8400add
11e466e
8400add
 
08bcb47
 
 
 
 
 
8400add
11e466e
 
86f426b
8400add
 
 
 
 
 
 
 
 
 
 
 
 
 
a3a174a
 
11e466e
08bcb47
8400add
08bcb47
8400add
08bcb47
8400add
08bcb47
 
 
8400add
 
 
 
 
 
08bcb47
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8400add
 
 
35dad4a
8400add
35dad4a
08bcb47
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8400add
 
 
 
 
 
 
08bcb47
 
 
8400add
08bcb47
 
8400add
 
 
 
 
 
 
 
08bcb47
 
 
8400add
08bcb47
 
8400add
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
11e466e
 
3586d9f
11e466e
 
 
8400add
 
 
08bcb47

import os
import string

import gradio as gr
import PIL.Image
import torch
from transformers import BitsAndBytesConfig, pipeline
import re

# Markdown heading rendered at the top of the demo page.
DESCRIPTION = "# LLaVA 🌋"

# Model identifier passed to the pipeline below.
model_id = "llava-hf/llava-1.5-7b-hf"
# Quantization settings: load weights in 4-bit, use float16 as the compute dtype.
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16
)
# Built once at import time; chat() reuses this single pipeline for every request.
pipe = pipeline("image-to-text", model=model_id, model_kwargs={"quantization_config": quantization_config})



def extract_response_pairs(text):
    """Split a ``USER: ... ASSISTANT: ...`` transcript into chat pairs.

    Args:
        text: Transcript in which every turn is introduced by a literal
            ``USER:`` or ``ASSISTANT:`` marker.

    Returns:
        A list of ``(user_message, assistant_reply)`` tuples in transcript
        order, each side stripped of surrounding whitespace. A ``USER:`` turn
        that is not followed by an ``ASSISTANT:`` turn is omitted.
    """
    # With a capturing group, re.split returns
    # [preamble, marker, content, marker, content, ...]; dropping the preamble
    # leaves strictly alternating marker/content items.
    pieces = re.split(r'(USER:|ASSISTANT:)', text)[1:]

    pairs = []
    pending_user = None
    # Pair by role rather than by position so that an empty turn (e.g. an
    # empty assistant reply) cannot shift markers into the content slots.
    for role, content in zip(pieces[0::2], pieces[1::2]):
        content = content.strip()
        if role == "USER:":
            pending_user = content
        elif pending_user is not None:
            pairs.append((pending_user, content))
            pending_user = None
    return pairs


def postprocess_output(output: str) -> str:
    """Return *output* terminated by punctuation.

    A falsy value (such as the empty string) is returned unchanged. Otherwise
    a period is appended unless the last character already appears in
    ``string.punctuation``.
    """
    if not output:
        return output
    ends_with_punctuation = output[-1] in string.punctuation
    return output if ends_with_punctuation else output + "."



def chat(image, text, temperature, length_penalty,
         repetition_penalty, max_length, min_length, top_p,
         history_chat):
    """Run one chat turn through the LLaVA pipeline and update the history.

    Args:
        image: Image passed to the pipeline; must not be ``None``.
        text: The user's message for this turn.
        temperature: Forwarded to generation as ``temperature``.
        length_penalty: Forwarded to generation as ``length_penalty``.
        repetition_penalty: Forwarded to generation as ``repetition_penalty``.
        max_length: Upper bound on the number of newly generated tokens.
        min_length: Lower bound on the number of newly generated tokens.
        top_p: Forwarded to generation as ``top_p``.
        history_chat: List of earlier turns, each stored as
            ``"USER: ...\\nASSISTANT: ..."``; extended in place.

    Returns:
        ``(pairs, history_chat)`` where ``pairs`` is the list of
        ``(user_message, assistant_reply)`` tuples for the whole conversation.

    Raises:
        gr.Error: If no image has been provided.
    """
    if image is None:
        raise gr.Error("Please upload an image before sending a message.")

    # Join earlier turns and the new one with the same separator so the new
    # "USER:" marker is never glued to the previous reply.
    prompt = " ".join([*history_chat, f"USER: <image>\n{text}\nASSISTANT:"])

    # NOTE(review): temperature/top_p are forwarded as-is, but sampling is not
    # explicitly enabled here -- confirm whether do_sample=True is intended.
    outputs = pipe(
        image,
        prompt=prompt,
        generate_kwargs={
            "temperature": temperature,
            "length_penalty": length_penalty,
            "repetition_penalty": repetition_penalty,
            # Bound only the newly generated tokens; a total-length budget
            # would be consumed by the growing multi-turn prompt.
            "max_new_tokens": int(max_length),
            "min_new_tokens": int(min_length),
            "top_p": top_p,
        },
    )

    # The generated text may echo the prompt (and with it every earlier turn)
    # ahead of the new reply. Keeping only what follows the final "ASSISTANT:"
    # marker stores each turn exactly once either way.
    generated = outputs[0]["generated_text"]
    reply = generated.rsplit("ASSISTANT:", 1)[-1].strip()
    history_chat.append(f"USER: {text}\nASSISTANT: {reply}")

    chat_val = extract_response_pairs(" ".join(history_chat))
    return chat_val, history_chat


# NOTE(review): this inline stylesheet is not passed to gr.Blocks below, which
# is given "style.css" instead, and no component in this file uses the "mkd"
# id -- confirm which stylesheet is intended.
css = """
  #mkd {
    height: 500px; 
    overflow: auto; 
    border: 1px solid #ccc; 
  }
  """
with gr.Blocks(css="style.css") as demo:
    # --- Page header -------------------------------------------------------
    gr.Markdown(DESCRIPTION)
    gr.Markdown("""## LLaVA, one of the greatest multimodal chat models is now available in Transformers with 4-bit quantization! ⚡️
    See the docs here: https://huggingface.co/docs/transformers/main/en/model_doc/llava.""")
    gr.Markdown("## Try it 4-bit quantized LLaVA this demo 🤗")

    # --- Conversation view and user inputs ---------------------------------
    chatbot = gr.Chatbot(label="Chat", show_label=False)
    gr.Markdown("Input image and text and start chatting 👇")
    with gr.Row():
        image = gr.Image(type="pil")
        text_input = gr.Text(label="Chat Input", show_label=False, max_lines=3, container=False)

    # Per-session list of earlier turns, passed to and returned from chat().
    history_chat = gr.State(value=[])
    with gr.Row():
        clear_chat_button = gr.Button("Clear")
        chat_button = gr.Button("Submit", variant="primary")

    # --- Generation settings -----------------------------------------------
    with gr.Accordion(label="Advanced settings", open=False):
        temperature = gr.Slider(
            label="Temperature",
            info="Used with nucleus sampling.",
            minimum=0.5,
            maximum=1.0,
            step=0.1,
            value=1.0,
        )
        length_penalty = gr.Slider(
            label="Length Penalty",
            info="Set to larger for longer sequence, used with beam search.",
            minimum=-1.0,
            maximum=2.0,
            step=0.2,
            value=1.0,
        )
        repetition_penalty = gr.Slider(
            label="Repetition Penalty",
            info="Larger value prevents repetition.",
            minimum=1.0,
            maximum=5.0,
            step=0.5,
            value=1.5,
        )
        max_length = gr.Slider(
            label="Max Length",
            minimum=1,
            maximum=500,
            step=1,
            value=200,
        )
        min_length = gr.Slider(
            label="Minimum Length",
            minimum=1,
            maximum=100,
            step=1,
            value=1,
        )
        top_p = gr.Slider(
            label="Top P",
            info="Used with nucleus sampling.",
            minimum=0.5,
            maximum=1.0,
            step=0.1,
            value=0.9,
        )

    # --- Event wiring ------------------------------------------------------
    # Order matches the positional parameters of chat().
    chat_inputs = [
        image,
        text_input,
        temperature,
        length_penalty,
        repetition_penalty,
        max_length,
        min_length,
        top_p,
        history_chat
    ]
    # chat() returns (pairs for the chatbot, updated history).
    chat_output = [
        chatbot,
        history_chat
    ]
    chat_button.click(
        fn=chat,
        inputs=chat_inputs,
        outputs=chat_output,
        api_name="Chat",
    )
    text_input.submit(
        fn=chat,
        inputs=chat_inputs,
        outputs=chat_output
    ).success(
        # Clear the text box after a successful turn. The callback returns a
        # single value, so it must target exactly one output component.
        fn=lambda: "",
        outputs=text_input,
        queue=False,
        api_name=False,
    )
    clear_chat_button.click(
        fn=lambda: ([], []),
        inputs=None,
        outputs=[
            chatbot,
            history_chat
        ],
        queue=False,
        api_name="clear",
    )
    # Changing the image resets both the visible chat and the stored history.
    image.change(
        fn=lambda: ([], []),
        inputs=None,
        outputs=[
            chatbot,
            history_chat
        ],
        queue=False)

    # Each example row is [image path, question], so it maps onto exactly
    # these two input components.
    examples = [["./examples/baklava.png", "How to make this pastry?"],["./examples/bee.png","Describe this image."]]
    gr.Examples(examples=examples, inputs=[image, text_input])


    

# Start the app only when this file is executed as a script, not on import.
if __name__ == "__main__":
    # Enable the request queue (max_size=10), then launch the demo.
    demo.queue(max_size=10).launch()