import gradio as gr
from transformers import AutoProcessor, AutoModelForVision2Seq, TextIteratorStreamer
from transformers.image_utils import load_image
from threading import Thread
import re
import time
import torch
import spaces
#import subprocess
#subprocess.run('pip install flash-attn --no-build-isolation', env={'FLASH_ATTENTION_SKIP_CUDA_BUILD': "TRUE"}, shell=True)
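# Load the SmolVLM processor and model once at startup; weights are kept in
# bfloat16 and moved to the GPU provided by the ZeroGPU Space.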
processor = AutoProcessor.from_pretrained("HuggingFaceTB/SmolVLM-Instruct-250M")
model = AutoModelForVision2Seq.from_pretrained(
    "HuggingFaceTB/SmolVLM-Instruct-250M",
    torch_dtype=torch.bfloat16,
    # _attn_implementation="flash_attention_2",
).to("cuda")
# @spaces.GPU requests a GPU for the duration of each call on ZeroGPU Spaces.
@spaces.GPU
def model_inference(input_dict, history):
    text = input_dict["text"]
    print(input_dict["files"])

    # Load any attached images (the MultimodalTextbox passes file paths).
    if len(input_dict["files"]) > 1:
        images = [load_image(image) for image in input_dict["files"]]
    elif len(input_dict["files"]) == 1:
        images = [load_image(input_dict["files"][0])]
    else:
        images = []

    # Validate the query before running the model.
    if text == "" and not images:
        raise gr.Error("Please input a query and optionally image(s).")
    if text == "" and images:
        raise gr.Error("Please input a text query along with the image(s).")

    # Build a single-turn chat message: one image placeholder per uploaded
    # image, followed by the text query.
    resulting_messages = [
        {
            "role": "user",
            "content": [{"type": "image"} for _ in range(len(images))] + [
                {"type": "text", "text": text}
            ]
        }
    ]
    prompt = processor.apply_chat_template(resulting_messages, add_generation_prompt=True)
    inputs = processor(text=prompt, images=[images] if images else None, return_tensors="pt")
    inputs = inputs.to("cuda")

    # Generate in a background thread and stream tokens as they are produced
    # so the UI updates incrementally.
    streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
    generation_args = dict(
        inputs,
        streamer=streamer,
        max_new_tokens=500,
        min_new_tokens=10,
        no_repeat_ngram_size=2,
        num_return_sequences=1,
    )
    thread = Thread(target=model.generate, kwargs=generation_args)
    thread.start()

    yield "..."
    buffer = ""
    for new_text in streamer:
        buffer += new_text
        time.sleep(0.01)
        yield buffer
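# Example queries shown under the chat box; image paths are relative to the Space repository.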
examples = [
    [{"text": "Describe this image.", "files": ["example_images/newyork.jpg"]}],
    [{"text": "Describe this image.", "files": ["example_images/dogs.jpg"]}],
    [{"text": "Where do the severe droughts happen according to this diagram?", "files": ["example_images/examples_weather_events.png"]}],
    [{"text": "What art era do these art pieces belong to?", "files": ["example_images/rococo.jpg", "example_images/rococo_1.jpg"]}],
    [{"text": "Describe this image.", "files": ["example_images/campeones.jpg"]}],
    [{"text": "What does this say?", "files": ["example_images/math.jpg"]}],
    [{"text": "What is the date in this document?", "files": ["example_images/document.jpg"]}],
    [{"text": "What is this UI about?", "files": ["example_images/s2w_example.png"]}],
]
demo = gr.ChatInterface(
    fn=model_inference,
    title="SmolVLM-250M: The Smollest VLM ever 💫",
    description="Play with [HuggingFaceTB/SmolVLM-Instruct-250M](https://huggingface.co/HuggingFaceTB/SmolVLM-Instruct-250M) in this demo. To get started, upload an image and text or try one of the examples. This demo doesn't use chat history, so every message you send starts a new conversation.",
    examples=examples,
    textbox=gr.MultimodalTextbox(label="Query Input", file_types=["image"], file_count="multiple"),
    stop_btn="Stop Generation",
    multimodal=True,
    cache_examples=False,
)
demo.launch(debug=True)