andito (HF staff) committed
Commit fe4fa5b · verified · 1 Parent(s): 0b32b82

Update app.py

Files changed (1): app.py (+16 −64)
app.py CHANGED
@@ -1,9 +1,9 @@
 import gradio as gr
 from transformers import AutoProcessor, AutoModelForVision2Seq, TextIteratorStreamer
+from transformers.image_utils import load_image
 from threading import Thread
 import re
 import time
-from PIL import Image
 import torch
 import spaces
 #import subprocess
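
Note on the import swap: `load_image` from `transformers.image_utils` accepts a local path, an http(s) URL, or an already-open `PIL.Image`, and returns a PIL image converted to RGB, which is why the explicit `Image.open(...).convert("RGB")` calls below can be dropped. A minimal sketch (the file name and URL are placeholders):

```python
from transformers.image_utils import load_image

# Both calls return a PIL.Image in RGB mode; the old
# Image.open(...).convert("RGB") pattern is handled internally.
img_from_path = load_image("example.jpg")                  # placeholder local file
img_from_url = load_image("https://example.com/cat.png")   # placeholder URL
```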
@@ -18,15 +18,14 @@ model = AutoModelForVision2Seq.from_pretrained("HuggingFaceTB/SmolVLM-Instruct-2
 
 @spaces.GPU
 def model_inference(
-    input_dict, history, decoding_strategy, temperature, max_new_tokens,
-    repetition_penalty, top_p
+    input_dict, history
 ):
     text = input_dict["text"]
     print(input_dict["files"])
     if len(input_dict["files"]) > 1:
-        images = [Image.open(image).convert("RGB") for image in input_dict["files"]]
+        images = [load_image(image) for image in input_dict["files"]]
     elif len(input_dict["files"]) == 1:
-        images = [Image.open(input_dict["files"][0]).convert("RGB")]
+        images = [load_image(input_dict["files"][0])]
     else:
         images = []
 
@@ -52,26 +51,19 @@ def model_inference(
     inputs = processor(text=prompt, images=[images], return_tensors="pt")
     inputs = {k: v.to("cuda") for k, v in inputs.items()}
     generation_args = {
-        "max_new_tokens": max_new_tokens,
-        "repetition_penalty": repetition_penalty,
-
+        "input_ids": inputs.input_ids,
+        "pixel_values": inputs.pixel_values,
+        "attention_mask": inputs.attention_mask,
+        "num_return_sequences": 1,
+        "no_repeat_ngram_size": 2,
+        "temperature": 0.7,
+        "max_new_tokens": 500,
+        "min_new_tokens": 10,
     }
 
-    assert decoding_strategy in [
-        "Greedy",
-        "Top P Sampling",
-    ]
-    if decoding_strategy == "Greedy":
-        generation_args["do_sample"] = False
-    elif decoding_strategy == "Top P Sampling":
-        generation_args["temperature"] = temperature
-        generation_args["do_sample"] = True
-        generation_args["top_p"] = top_p
-
-    generation_args.update(inputs)
     # Generate
-    streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens= True)
-    generation_args = dict(inputs, streamer=streamer, max_new_tokens=max_new_tokens)
+    streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
+    generation_args = dict(inputs, streamer=streamer, max_new_tokens=500)
     generated_text = ""
 
     thread = Thread(target=model.generate, kwargs=generation_args)
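
Review note on this hunk: the later `generation_args = dict(inputs, streamer=streamer, max_new_tokens=500)` rebinds the name, so the dict built just above is discarded and `num_return_sequences`, `no_repeat_ngram_size`, `temperature`, and `min_new_tokens` never reach `model.generate`. Note also that `temperature` only takes effect when sampling is enabled. A sketch that keeps those settings, assuming they are actually intended:

```python
# Reuse the generation_args dict built above instead of replacing it.
generation_args["streamer"] = streamer
generation_args["do_sample"] = True  # assumption: sampling is intended; without it, temperature is ignored

thread = Thread(target=model.generate, kwargs=generation_args)
```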
@@ -99,48 +91,8 @@ examples=[
 demo = gr.ChatInterface(fn=model_inference, title="SmolVLM: Small yet Mighty 💫",
     description="Play with [HuggingFaceTB/SmolVLM-Instruct](https://huggingface.co/HuggingFaceTB/SmolVLM-Instruct) in this demo. To get started, upload an image and text or try one of the examples. This checkpoint works best with single turn conversations, so clear the conversation after a single turn.",
     examples=examples,
     textbox=gr.MultimodalTextbox(label="Query Input", file_types=["image"], file_count="multiple"), stop_btn="Stop Generation", multimodal=True,
-    additional_inputs=[gr.Radio(["Top P Sampling",
-        "Greedy"],
-        value="Greedy",
-        label="Decoding strategy",
-        #interactive=True,
-        info="Higher values is equivalent to sampling more low-probability tokens.",
-
-    ), gr.Slider(
-        minimum=0.0,
-        maximum=5.0,
-        value=0.4,
-        step=0.1,
-        interactive=True,
-        label="Sampling temperature",
-        info="Higher values will produce more diverse outputs.",
-    ),
-    gr.Slider(
-        minimum=8,
-        maximum=1024,
-        value=512,
-        step=1,
-        interactive=True,
-        label="Maximum number of new tokens to generate",
-    ), gr.Slider(
-        minimum=0.01,
-        maximum=5.0,
-        value=1.2,
-        step=0.01,
-        interactive=True,
-        label="Repetition penalty",
-        info="1.0 is equivalent to no penalty",
-    ),
-    gr.Slider(
-        minimum=0.01,
-        maximum=0.99,
-        value=0.8,
-        step=0.01,
-        interactive=True,
-        label="Top P",
-        info="Higher values is equivalent to sampling more low-probability tokens.",
-    )],cache_examples=False
+    cache_examples=False
 )
 
 
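
With the decoding controls removed, `model_inference` takes only `(input_dict, history)`, the default signature `gr.ChatInterface` expects for a multimodal chat function, so `additional_inputs` is no longer needed and the decoding settings are fixed in code. A stripped-down sketch of the resulting wiring (illustrative; title, description, and examples omitted):

```python
import gradio as gr

demo = gr.ChatInterface(
    fn=model_inference,  # signature: (input_dict, history)
    textbox=gr.MultimodalTextbox(label="Query Input", file_types=["image"], file_count="multiple"),
    multimodal=True,
    stop_btn="Stop Generation",
    cache_examples=False,
)
demo.launch()
```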