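"""Gradio demo: detailed image descriptions with DeepSeek-VL 1.3B chat.

Runs as a Hugging Face ZeroGPU Space: the processor and model load once at
startup, and the @spaces.GPU-decorated handler is given a GPU per request.
"""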
import gradio as gr
import torch
from transformers import AutoModelForCausalLM
from deepseek_vl.models import VLChatProcessor, MultiModalityCausalLM

import spaces  # Import spaces for ZeroGPU support

# Load the processor and model once at startup; ZeroGPU attaches the GPU
# only while a @spaces.GPU-decorated function is running.
model_path = "deepseek-ai/deepseek-vl-1.3b-chat"
vl_chat_processor = VLChatProcessor.from_pretrained(model_path)
tokenizer = vl_chat_processor.tokenizer

vl_gpt: MultiModalityCausalLM = AutoModelForCausalLM.from_pretrained(
    model_path, trust_remote_code=True
)
vl_gpt = vl_gpt.to(torch.bfloat16).cuda().eval()

# Define the function for image description
@spaces.GPU  # Ensures GPU allocation for this function
def describe_image(image):
    # Bail out early if no image was uploaded
    if image is None:
        return "Please upload an image first."

    # Define the conversation in the DeepSeek-VL chat format
    conversation = [
        {
            "role": "User",
            "content": "<image_placeholder>Describe this image in great detail.",
            "images": [image],
        },
        {"role": "Assistant", "content": ""},
    ]

    # Gradio already hands us a PIL image, so use it directly; load_pil_images()
    # is only needed when the "images" field holds file paths.
    pil_images = [image.convert("RGB")]
    prepare_inputs = vl_chat_processor(
        conversations=conversation,
        images=pil_images,
        force_batchify=True
    ).to('cuda')
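    # The processor output bundles the tokenized prompt, pixel values, and
    # image masks; its attention_mask is reused by generate() below.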

    # Run the image encoder to fuse image and text tokens into one embedding
    # sequence (the model itself was loaded once at startup, not per call)
    inputs_embeds = vl_gpt.prepare_inputs_embeds(**prepare_inputs)

    # Generate the response from the model (greedy decoding, do_sample=False)
    outputs = vl_gpt.language_model.generate(
        inputs_embeds=inputs_embeds,
        attention_mask=prepare_inputs.attention_mask,
        pad_token_id=tokenizer.eos_token_id,
        bos_token_id=tokenizer.bos_token_id,
        eos_token_id=tokenizer.eos_token_id,
        max_new_tokens=512,
        do_sample=False,
        use_cache=True
    )

    # Decode the generated tokens into text
    answer = tokenizer.decode(outputs[0].cpu().tolist(), skip_special_tokens=True)
    return answer


# Gradio interface
def gradio_app():
    with gr.Blocks() as demo:
        gr.Markdown(
            "# Image Description with DeepSeek VL 1.3b\n"
            "### Upload an image to receive a detailed description."
        )
        with gr.Row():
            image_input = gr.Image(type="pil", label="Upload an Image")
            output_text = gr.Textbox(label="Image Description", interactive=False)
        submit_btn = gr.Button("Generate Description")
        submit_btn.click(
            fn=describe_image,
            inputs=[image_input],
            outputs=output_text
        )
    demo.launch()
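
# Entry point: build the UI and start the server. Running this requires the
# deepseek_vl package and a CUDA device (or a ZeroGPU allocation on Spaces).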
if __name__ == "__main__":
    gradio_app()