Niki Zhang committed
Commit 044e873 · verified · 1 Parent(s): e3f0a5f

submit button

Files changed (1)
app.py +28 -14
app.py CHANGED
@@ -171,7 +171,6 @@ def upload_callback(image_input, state, visual_chatgpt=None):
     return state, state, image_input, click_state, image_input, image_input, image_embedding, \
         original_size, input_size
 
-
 def inference_click(image_input, point_prompt, click_mode, enable_wiki, language, sentiment, factuality,
                     length, image_embedding, state, click_state, original_size, input_size, text_refiner, visual_chatgpt,
                     evt: gr.SelectData):
@@ -186,6 +185,17 @@ def inference_click(image_input, point_prompt, click_mode, enable_wiki, language
     input_points = prompt['input_point']
     input_labels = prompt['input_label']
 
+    click_state[0] = input_points
+    click_state[1] = input_labels
+
+    return state, click_state
+
+def submit_caption(image_input, enable_wiki, language, sentiment, factuality, length, image_embedding, state, click_state, original_size, input_size, text_refiner, visual_chatgpt):
+    coordinate = json.dumps(list(zip(click_state[0], click_state[1])))
+    prompt = get_click_prompt(coordinate, click_state, 'Single')
+    input_points = prompt['input_point']
+    input_labels = prompt['input_label']
+
     controls = {'length': length,
                 'sentiment': sentiment,
                 'factuality': factuality,
@@ -213,31 +223,31 @@ def inference_click(image_input, point_prompt, click_mode, enable_wiki, language
     input_mask = np.array(out['mask'].convert('P'))
     image_input = mask_painter(np.array(image_input), input_mask)
     origin_image_input = image_input
-    image_input = create_bubble_frame(image_input, text, (click_index[0], click_index[1]), input_mask,
+    image_input = create_bubble_frame(image_input, text, (input_points[-1][0], input_points[-1][1]), input_mask,
                                       input_points=input_points, input_labels=input_labels)
-    x, y = input_points[-1]
-
     if visual_chatgpt is not None:
-        print('inference_click: add caption to chatGPT memory')
-        new_crop_save_path = get_new_image_name('chat_image', func_name='crop')
-        Image.open(out["crop_save_path"]).save(new_crop_save_path)
-        point_prompt = f'You should primarly use tools on the selected regional image (description: {text}, path: {new_crop_save_path}), which is a part of the whole image (path: {visual_chatgpt.current_image}). If human mentioned some objects not in the selected region, you can use tools on the whole image.'
-        visual_chatgpt.point_prompt = point_prompt
+        new_image_path = get_new_image_name('chat_image', func_name='upload')
+        image_input.save(new_image_path)
+        visual_chatgpt.current_image = new_image_path
+        img_caption = model.captioner.inference(image_input, filter=False, args={'text_prompt':''})['caption']
+        Human_prompt = f'\nHuman: The description of the image with path {new_image_path} is: {img_caption}. This information helps you to understand this image, but you should use tools to finish following tasks, rather than directly imagine from my description. If you understand, say \"Received\". \n'
+        AI_prompt = "Received."
+        visual_chatgpt.global_prompt = Human_prompt + 'AI: ' + AI_prompt
+        visual_chatgpt.agent.memory.buffer = visual_chatgpt.agent.memory.buffer + visual_chatgpt.global_prompt
 
-    yield state, state, click_state, image_input
     if not args.disable_gpt and model.text_refiner:
         refined_caption = model.text_refiner.inference(query=text, controls=controls, context=out['context_captions'],
                                                        enable_wiki=enable_wiki)
-        # new_cap = 'Original: ' + text + '. Refined: ' + refined_caption['caption']
         new_cap = refined_caption['caption']
         if refined_caption['wiki']:
             state = state + [(None, "Wiki: {}".format(refined_caption['wiki']))]
         state = state + [(None, f"caption: {new_cap}")]
-        refined_image_input = create_bubble_frame(origin_image_input, new_cap, (click_index[0], click_index[1]),
+        refined_image_input = create_bubble_frame(origin_image_input, new_cap, (input_points[-1][0], input_points[-1][1]),
                                                   input_mask,
                                                   input_points=input_points, input_labels=input_labels)
-        yield state, state, click_state, refined_image_input
-
+        return state, state, click_state, refined_image_input
+
+    return state, state, click_state, image_input
 
 def get_sketch_prompt(mask: Image.Image):
     """
@@ -543,6 +553,10 @@ def create_ui():
         )
         clear_button_text.click(clear_chat_memory, inputs=[visual_chatgpt])
 
+        submit_button_click.click(submit_caption,
+                                  [origin_image, enable_wiki, language, sentiment, factuality, length, image_embedding, state, click_state, original_size, input_size, text_refiner, visual_chatgpt],
+                                  [chatbot, state, click_state, image_input])
+
         image_input.clear(
             lambda: (None, [], [], [[], [], []], "", "", ""),
             [],
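
For context, the change this diff makes: inference_click no longer paints the bubble frame and yields captions itself; it records the clicked points and labels in click_state and returns, while the new submit_caption handler, wired to a Submit button in create_ui, performs the captioning on demand. Below is a minimal, self-contained sketch of that two-step Gradio pattern, assuming a recent Gradio release; the names record_click, submit, caption_out, and demo are illustrative stand-ins, not identifiers from app.py.

import gradio as gr

def record_click(click_state, evt: gr.SelectData):
    # Mirrors the new inference_click: record the clicked point and a
    # positive label, then return immediately; no inference runs on click.
    click_state[0].append([evt.index[0], evt.index[1]])
    click_state[1].append(1)
    return click_state

def submit(image, click_state):
    # Stand-in for submit_caption: consume the accumulated points; a real
    # implementation would run segmentation and captioning here.
    points, labels = click_state
    return f"caption for {len(points)} point(s)"

with gr.Blocks() as demo:
    image_input = gr.Image(type="pil")
    caption_out = gr.Textbox(label="caption")
    click_state = gr.State([[], []])  # [points, labels]
    submit_button = gr.Button("Submit")

    image_input.select(record_click, [click_state], [click_state])
    submit_button.click(submit, [image_input, click_state], [caption_out])

demo.launch()

The practical effect of switching from yield to return is that segmentation and caption refinement run once per explicit Submit instead of on every click, at the cost of the old streaming update that showed the unrefined caption first.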