# moondream-05 / app.py
import gradio as gr
from PIL import Image
import spaces
import moondream as md
#import subprocess
#subprocess.run('pip install flash-attn --no-build-isolation', env={'FLASH_ATTENTION_SKIP_CUDA_BUILD': "TRUE"}, shell=True)
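
# Load Moondream 0.5B from a local int8 .mf weights file (the file is expected to sit next to app.py in the Space)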
model = md.vl(model="moondream-0_5b-int8.mf")

def model_inference(input_dict, history):
    # Extract image from message if present
    if input_dict.get("files"):
        image_path = input_dict["files"][0]
        if isinstance(image_path, dict) and "path" in image_path:
            image_path = image_path["path"]
        image = Image.open(image_path)
        encoded_image = model.encode_image(image)

        # If there's a question, use query
        text = input_dict.get("text", "")
        if text not in ["", "Caption"]:
            response = model.query(encoded_image, text)["answer"]
        # Otherwise generate a caption
        else:
            response = model.caption(encoded_image)["caption"]

        return response
    else:
        return "Please provide an image to analyze."
examples = [
    [{"text": "What art era does this artwork belong to?", "files": ["example_images/rococo.jpg"]}, []],
    [{"text": "Caption", "files": ["example_images/rococo.jpg"]}, []],
    [{"text": "I'm planning a visit to this temple. Give me travel tips.", "files": ["example_images/examples_wat_arun.jpg"]}, []],
    [{"text": "Caption", "files": ["example_images/examples_wat_arun.jpg"]}, []],
    [{"text": "What is the due date and the invoice date?", "files": ["example_images/examples_invoice.png"]}, []],
    [{"text": "Caption", "files": ["example_images/examples_invoice.png"]}, []],
    [{"text": "What is this UI about?", "files": ["example_images/s2w_example.png"]}, []],
    [{"text": "Caption", "files": ["example_images/s2w_example.png"]}, []],
    [{"text": "Where do severe droughts happen according to this diagram?", "files": ["example_images/examples_weather_events.png"]}, []],
    [{"text": "Caption", "files": ["example_images/examples_weather_events.png"]}, []],
]
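
# Multimodal chat UI: each message takes a single image plus optional text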
demo = gr.ChatInterface(
    fn=model_inference,
    title="Moondream 0.5B: The World's Smallest Vision-Language Model",
    description="Play with [Moondream 0.5B](https://huggingface.co/vikhyatk/moondream2) in this demo. To get started, upload an image and ask a question, or try one of the examples.",
    examples=examples,
    textbox=gr.MultimodalTextbox(label="Query Input", file_types=["image"], file_count="single"),
    stop_btn="Stop Generation", multimodal=True,
    additional_inputs=[], cache_examples=False,
)
demo.launch(debug=True)