zamal committed
Commit e3ac72c · verified · 1 Parent(s): 0cb0426

Create app.py

Files changed (1)
  1. app.py +78 -0
app.py ADDED
@@ -0,0 +1,78 @@
+ import gradio as gr
+ import torch
+ from transformers import AutoModelForCausalLM
+ from deepseek_vl.models import VLChatProcessor, MultiModalityCausalLM
+ import spaces  # Import spaces for ZeroGPU support
+
+ # Load the processor and model once at startup, rather than on every request;
+ # ZeroGPU defers actual GPU placement until a @spaces.GPU function runs
+ model_path = "deepseek-ai/deepseek-vl-1.3b-chat"
+ vl_chat_processor = VLChatProcessor.from_pretrained(model_path)
+ tokenizer = vl_chat_processor.tokenizer
+ vl_gpt: MultiModalityCausalLM = AutoModelForCausalLM.from_pretrained(model_path, trust_remote_code=True).to(torch.bfloat16).cuda().eval()
+
+ # Define the function for image description
+ @spaces.GPU  # Ensures GPU allocation for this function
+ def describe_image(image):
+     # Define the conversation
+     conversation = [
+         {
+             "role": "User",
+             "content": "<image_placeholder>Describe this image in great detail.",
+             "images": [image]
+         },
+         {
+             "role": "Assistant",
+             "content": ""
+         }
+     ]
+
+     # Gradio already supplies a PIL image, so use it directly
+     # (load_pil_images expects file paths, not PIL objects)
+     pil_images = [image.convert("RGB")]
+     prepare_inputs = vl_chat_processor(
+         conversations=conversation,
+         images=pil_images,
+         force_batchify=True
+     ).to('cuda')
+
+     # Run the image encoder to get input embeddings
+     inputs_embeds = vl_gpt.prepare_inputs_embeds(**prepare_inputs)
+
+     # Generate response from the model
+     outputs = vl_gpt.language_model.generate(
+         inputs_embeds=inputs_embeds,
+         attention_mask=prepare_inputs.attention_mask,
+         pad_token_id=tokenizer.eos_token_id,
+         bos_token_id=tokenizer.bos_token_id,
+         eos_token_id=tokenizer.eos_token_id,
+         max_new_tokens=512,
+         do_sample=False,
+         use_cache=True
+     )
+
+     # Decode the generated tokens into text
+     answer = tokenizer.decode(outputs[0].cpu().tolist(), skip_special_tokens=True)
+     return answer
+
+ # Gradio interface
+ def gradio_app():
+     with gr.Blocks() as demo:
+         gr.Markdown("# Image Description with DeepSeek VL 1.3b\n### Upload an image to receive a detailed description.")
+
+         with gr.Row():
+             image_input = gr.Image(type="pil", label="Upload an Image")
+
+         output_text = gr.Textbox(label="Image Description", interactive=False)
+
+         submit_btn = gr.Button("Generate Description")
+
+         submit_btn.click(
+             fn=describe_image,
+             inputs=[image_input],
+             outputs=output_text
+         )
+
+     demo.launch()
+
+ # Launch the Gradio app
+ gradio_app()
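
Once this Space is running, the endpoint can also be exercised programmatically. A minimal sketch using gradio_client, assuming a hypothetical Space id (zamal/deepseek-vl-chat) and Gradio's auto-generated endpoint name derived from the handler function:

import gradio_client  # pip install gradio_client
from gradio_client import Client, handle_file

# Hypothetical Space id; substitute the actual one
client = Client("zamal/deepseek-vl-chat")

# The click handler is exposed under the function's auto-generated name
result = client.predict(
    handle_file("example.jpg"),  # any local image path or URL
    api_name="/describe_image",
)
print(result)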