Spaces: Running on Zero
Update app.py
app.py CHANGED
@@ -1,9 +1,9 @@
 import gradio as gr
 from transformers import AutoProcessor, AutoModelForVision2Seq, TextIteratorStreamer
+from transformers.image_utils import load_image
 from threading import Thread
 import re
 import time
-from PIL import Image
 import torch
 import spaces
 #import subprocess
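
For context on the new import: `load_image` accepts a URL, a local path, or an already-open PIL image and always returns an RGB `PIL.Image`, which is why `from PIL import Image` can be dropped in this hunk. A minimal sketch of that behavior (the in-memory image is just for illustration):

```python
from PIL import Image
from transformers.image_utils import load_image

# load_image normalizes its input to an RGB PIL.Image:
img = load_image(Image.new("RGBA", (8, 8)))  # a PIL image passes through, converted to RGB
print(img.mode)                              # "RGB"
# load_image("photo.jpg") and load_image("https://.../photo.jpg") work the same way,
# opening the file or fetching the URL before decoding.
```
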
@@ -18,15 +18,14 @@ model = AutoModelForVision2Seq.from_pretrained("HuggingFaceTB/SmolVLM-Instruct-2
 
 @spaces.GPU
 def model_inference(
-    input_dict, history, decoding_strategy, temperature, max_new_tokens,
-    repetition_penalty, top_p
+    input_dict, history
 ):
     text = input_dict["text"]
     print(input_dict["files"])
     if len(input_dict["files"]) > 1:
-        images = [Image.open(image).convert("RGB") for image in input_dict["files"]]
+        images = [load_image(image) for image in input_dict["files"]]
     elif len(input_dict["files"]) == 1:
-        images = [Image.open(input_dict["files"][0]).convert("RGB")]
+        images = [load_image(input_dict["files"][0])]
     else:
         images = []
 
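
The slimmed-down two-argument signature matches what a multimodal `gr.ChatInterface` with no `additional_inputs` passes to its `fn`: the textbox payload and the chat history. A sketch of the payload shape, with illustrative values:

```python
from transformers.image_utils import load_image

# Shape of the payload a multimodal gr.ChatInterface delivers to fn:
input_dict = {
    "text": "What is in these images?",
    "files": [],  # e.g. ["/tmp/gradio/a.png", "/tmp/gradio/b.png"] for two uploads
}
history = []      # prior chat turns; unused by model_inference

# The three branches in the hunk above collapse to a single comprehension:
images = [load_image(f) for f in input_dict["files"]]
```
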
@@ -52,26 +51,19 @@ def model_inference(
     inputs = processor(text=prompt, images=[images], return_tensors="pt")
     inputs = {k: v.to("cuda") for k, v in inputs.items()}
     generation_args = {
-        "max_new_tokens": max_new_tokens,
-        "repetition_penalty": repetition_penalty,
-
+        "input_ids": inputs.input_ids,
+        "pixel_values": inputs.pixel_values,
+        "attention_mask": inputs.attention_mask,
+        "num_return_sequences": 1,
+        "no_repeat_ngram_size": 2,
+        "temperature": 0.7,
+        "max_new_tokens": 500,
+        "min_new_tokens": 10,
     }
 
-    assert decoding_strategy in [
-        "Greedy",
-        "Top P Sampling",
-    ]
-    if decoding_strategy == "Greedy":
-        generation_args["do_sample"] = False
-    elif decoding_strategy == "Top P Sampling":
-        generation_args["temperature"] = temperature
-        generation_args["do_sample"] = True
-        generation_args["top_p"] = top_p
-
-    generation_args.update(inputs)
     # Generate
-    streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
-    generation_args = dict(inputs, streamer=streamer, max_new_tokens=max_new_tokens)
+    streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
+    generation_args = dict(inputs, streamer=streamer, max_new_tokens=500)
     generated_text = ""
 
     thread = Thread(target=model.generate, kwargs=generation_args)
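
The streamer-plus-worker-thread pattern this hunk keeps is the standard way to stream tokens into a Gradio chat: `generate()` blocks, so it runs on a background thread while the handler iterates the streamer and yields partial text. A minimal sketch, assuming `model`, `processor`, and a device-mapped `inputs` dict as in the app (`stream_reply` is a hypothetical standalone helper):

```python
from threading import Thread
from transformers import TextIteratorStreamer

def stream_reply(model, processor, inputs, max_new_tokens=500):
    # The streamer is a queue that yields decoded text chunks as generate() produces them.
    streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
    thread = Thread(target=model.generate,
                    kwargs=dict(inputs, streamer=streamer, max_new_tokens=max_new_tokens))
    thread.start()
    buffer = ""
    for chunk in streamer:  # blocks until the next decoded chunk arrives
        buffer += chunk
        yield buffer        # each yield redraws the chat bubble in place
    thread.join()
```
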
@@ -99,48 +91,8 @@ examples=[
 demo = gr.ChatInterface(fn=model_inference, title="SmolVLM: Small yet Mighty 💫",
                 description="Play with [HuggingFaceTB/SmolVLM-Instruct](https://huggingface.co/HuggingFaceTB/SmolVLM-Instruct) in this demo. To get started, upload an image and text or try one of the examples. This checkpoint works best with single turn conversations, so clear the conversation after a single turn.",
                 examples=examples,
-                textbox=gr.MultimodalTextbox(label="Query Input", file_types=["image"], file_count="multiple"), stop_btn="Stop Generation", multimodal=True,
-                additional_inputs=[gr.Radio(["Top P Sampling",
-                        "Greedy"],
-                        value="Greedy",
-                        label="Decoding strategy",
-                        #interactive=True,
-                        info="Higher values is equivalent to sampling more low-probability tokens.",
-
-                ), gr.Slider(
-                        minimum=0.0,
-                        maximum=5.0,
-                        value=0.4,
-                        step=0.1,
-                        interactive=True,
-                        label="Sampling temperature",
-                        info="Higher values will produce more diverse outputs.",
-                ),
-                gr.Slider(
-                        minimum=8,
-                        maximum=1024,
-                        value=512,
-                        step=1,
-                        interactive=True,
-                        label="Maximum number of new tokens to generate",
-                ), gr.Slider(
-                        minimum=0.01,
-                        maximum=5.0,
-                        value=1.2,
-                        step=0.01,
-                        interactive=True,
-                        label="Repetition penalty",
-                        info="1.0 is equivalent to no penalty",
-                ),
-                gr.Slider(
-                        minimum=0.01,
-                        maximum=0.99,
-                        value=0.8,
-                        step=0.01,
-                        interactive=True,
-                        label="Top P",
-                        info="Higher values is equivalent to sampling more low-probability tokens.",
-                )],cache_examples=False
+                textbox=gr.MultimodalTextbox(label="Query Input", file_types=["image"], file_count="multiple"), stop_btn="Stop Generation", multimodal=True,
+                ],cache_examples=False
 )
 
 
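With `additional_inputs` removed, the UI keeps only the multimodal textbox, which is consistent with `model_inference` now taking just `(input_dict, history)`. A minimal standalone sketch of that wiring (the echo handler is a placeholder, not the app's function):

```python
import gradio as gr

def fn(input_dict, history):
    # A multimodal ChatInterface with no additional_inputs calls fn
    # with exactly these two arguments.
    return f"text={input_dict['text']!r}, files={len(input_dict['files'])}"

demo = gr.ChatInterface(
    fn=fn,
    multimodal=True,
    textbox=gr.MultimodalTextbox(label="Query Input", file_types=["image"], file_count="multiple"),
)

if __name__ == "__main__":
    demo.launch()
```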