Niki Zhang committed on
Commit d29ec53 · verified · 1 Parent(s): 135ce14

Update app.py


Add painting introduction part
Add Focus Type

Files changed (1)
  1. app.py +158 -50
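
In brief: the commit adds a GPT-4o vision helper (get_image_gpt) that is called twice — once at upload time to fetch a painting introduction (name, artist, year, material) plus a descriptive paragraph, and once in submit_caption, where the new Focus Type radio selects one of three prompt templates. The standalone sketch below condenses that flow from the diff; painting_intro and focus_caption are illustrative names (in app.py this logic lives inside upload_callback and submit_caption), the prompt texts are abbreviated, and the real handlers also thread Gradio state, bubble frames and TTS.

import base64
import json

import requests


def get_image_gpt(api_key: str, image_path: str, prompt: str) -> str:
    """Send one image plus a text prompt to OpenAI's chat-completions endpoint (gpt-4o)."""
    with open(image_path, "rb") as f:
        b64_image = base64.b64encode(f.read()).decode("utf-8")
    payload = {
        "model": "gpt-4o",
        "messages": [{
            "role": "user",
            "content": [
                {"type": "text", "text": prompt},
                {"type": "image_url",
                 "image_url": {"url": f"data:image/jpeg;base64,{b64_image}"}},
            ],
        }],
        "max_tokens": 300,
    }
    resp = requests.post(
        "https://api.openai.com/v1/chat/completions",
        headers={"Authorization": f"Bearer {api_key}"},
        json=payload,
    )
    return resp.json()["choices"][0]["message"]["content"]


# The new Focus Type radio maps to one of three prompt templates (abbreviated here).
FOCUS_MAP = {"Inside the Mark": 0, "Around the Mark": 1, "Outside the Mark": 2}
PROMPT_LIST = [
    'Select sentences closely related to the raw caption "{raw_caption}" from the wiki caption "{Wiki_caption}" ...',
    'Pick sentences from the wiki caption "{Wiki_caption}" related to objects mentioned in "{raw_caption}" ...',
    'Choose sentences from the wiki caption "{Wiki_caption}" on topics unrelated to "{raw_caption}" ...',
]


def painting_intro(api_key: str, image_path: str) -> str:
    """Upload-time painting introduction: ask for name/artist/year/material, render as HTML."""
    raw = get_image_gpt(
        api_key, image_path,
        'Provide the name, artist, year of creation, and material of this painting as a '
        'dictionary: { "name": ..., "artist": ..., "year": ..., "material": ... }.')
    info = json.loads(raw.replace("'", '"'))  # same quote normalisation as the committed code
    return (f"<div>Painting: {info['name']}<br>Artist name: {info['artist']}"
            f"<br>Year: {info['year']}<br>Material: {info['material']}</div>")


def focus_caption(api_key: str, image_path: str, focus_type: str,
                  raw_caption: str, wiki_caption: str) -> str:
    """Click-time refinement: the selected Focus Type picks the prompt template."""
    prompt = PROMPT_LIST[FOCUS_MAP.get(focus_type, 0)].format(
        raw_caption=raw_caption, Wiki_caption=wiki_caption)
    return get_image_gpt(api_key, image_path, prompt)
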
app.py CHANGED
@@ -1,4 +1,5 @@
 import os
+import base64
 import json
 import gradio as gr
 import numpy as np
@@ -117,9 +118,12 @@ def init_openai_api_key(api_key=""):
         gpt_state=1
         return [gr.update(visible=True)]+[gr.update(visible=False)]+[gr.update(visible=True)]*3+[gr.update(visible=False)]+ [gr.update(visible=True)]+ [gr.update(visible=False)]*2 + [text_refiner, visual_chatgpt, None]
     else:
+        gpt_state=0
         return [gr.update(visible=False)]*7 + [gr.update(visible=True)]*2 + [text_refiner, visual_chatgpt, 'Your OpenAI API Key is not available']
 
 def init_wo_openai_api_key():
+    global gpt_state
+    gpt_state=0
     return [gr.update(visible=False)]*4 + [gr.update(visible=True)]+ [gr.update(visible=False)]+[gr.update(visible=True)]+[gr.update(visible=False)]*2 + [None, None, None]
 
 def get_click_prompt(chat_input, click_state, click_mode):
@@ -169,14 +173,13 @@ def chat_input_callback(*args):
 
 
 
-def upload_callback(image_input, state, visual_chatgpt=None):
-
+def upload_callback(image_input, state, visual_chatgpt=None, openai_api_key=None):
     if isinstance(image_input, dict):  # if upload from sketcher_input, input contains image and mask
         image_input, mask = image_input['image'], image_input['mask']
 
     click_state = [[], [], []]
     image_input = image_resize(image_input, res=1024)
-
+
     model = build_caption_anything_with_models(
         args,
         api_key="",
@@ -189,7 +192,7 @@ def upload_callback(image_input, state, visual_chatgpt=None):
     image_embedding = model.image_embedding
     original_size = model.original_size
     input_size = model.input_size
-
+
     if visual_chatgpt is not None:
         print('upload_callback: add caption to chatGPT memory')
         new_image_path = get_new_image_name('chat_image', func_name='upload')
@@ -200,10 +203,17 @@ def upload_callback(image_input, state, visual_chatgpt=None):
         AI_prompt = "Received."
         visual_chatgpt.global_prompt = Human_prompt + 'AI: ' + AI_prompt
         visual_chatgpt.agent.memory.buffer = visual_chatgpt.agent.memory.buffer + visual_chatgpt.global_prompt
+        parsed_data = get_image_gpt(openai_api_key, new_image_path,"Please provide the name, artist, year of creation, and material used for this painting. Return the information in dictionary format without any newline characters. If any information is unavailable, return \"None\" for that field. Format as follows: { \"name\": \"Name of the painting\",\"artist\": \"Name of the artist\", \"year\": \"Year of creation\", \"material\": \"Material used in the painting\" }.")
+        parsed_data = json.loads(parsed_data.replace("'", "\""))
+        name, artist, year, material= parsed_data["name"],parsed_data["artist"],parsed_data["year"], parsed_data["material"]
+        artwork_info = f"<div>Painting: {name}<br>Artist name: {artist}<br>Year: {year}<br>Material: {material}</div>"
+        paragraph = get_image_gpt(openai_api_key, new_image_path,"Imagine you are a intelligent image captioner. You should generate a descriptive, coherent and human-like paragraph based on the given image instead of imagination. There are some rules for your response: Show objects with their attributes (e.g. position, color, size, shape, texture).\nPrimarily describe common objects with large size.\nProvide context of the image.\nShow relative position between objects.\nLess than 6 sentences.\nDo not appear number.\nDo not describe any individual letter.\nDo not show the image resolution.\nIngore the white background.")
+
     state = [(None, 'Received new image, resize it to width {} and height {}: '.format(image_input.size[0], image_input.size[1]))]
 
-    return state, state, image_input, click_state, image_input, image_input, image_embedding, \
-        original_size, input_size
+    return state, state, image_input, click_state, image_input, image_input, image_input, image_embedding, \
+        original_size, input_size, artwork_info,artwork_info,paragraph
+
 
 
 
@@ -237,7 +247,7 @@ def inference_click(image_input, point_prompt, click_mode, enable_wiki, language
     )
 
     model.setup(image_embedding, original_size, input_size, is_image_set=True)
-
+
     enable_wiki = True if enable_wiki in ['True', 'TRUE', 'true', True, 'Yes', 'YES', 'yes'] else False
     out = model.inference(image_input, prompt, controls, disable_gpt=True, enable_wiki=enable_wiki, verbose=True, args={'clip_filter': False})[0]
 
@@ -270,7 +280,7 @@ def inference_click(image_input, point_prompt, click_mode, enable_wiki, language
 
 def submit_caption(image_input, state, generated_caption, text_refiner, visual_chatgpt, enable_wiki, length, sentiment, factuality, language,
                    out_state, click_index_state, input_mask_state, input_points_state, input_labels_state,
-                   input_text, input_language, input_audio, input_mic, use_mic, agree):
+                   input_text, input_language, input_audio, input_mic, use_mic, agree,paragraph,focus_type,openai_api_key):
     print("state",state)
 
     click_index = click_index_state
@@ -278,9 +288,14 @@ def submit_caption(image_input, state, generated_caption, text_refiner, visual_c
     input_points = input_points_state
     input_labels = input_labels_state
     out = out_state
-    print("click",click_index)
+    focus_map = {
+        "Inside the Mark": 0,
+        "Around the Mark": 1,
+        "Outside the Mark": 2
+    }
 
-    origin_image_input = image_input
+    mapped_value = focus_map.get(focus_type, -1)
+    print("mapped value",mapped_value)
 
     controls = {
         'length': length,
@@ -289,6 +304,37 @@ def submit_caption(image_input, state, generated_caption, text_refiner, visual_c
         'language': language
     }
 
+    prompt_list = [
+        'Select sentences closely related to the raw caption: "{raw_caption}" from the wiki caption: "{Wiki_caption}" around {length} words of {sentiment} sentiment in {language}.',
+        'Pick sentences from the wiki caption: "{Wiki_caption}" that may be related to objects mentioned in the "{raw_caption}" around {length} words of {sentiment} sentiment in {language}.',
+        'Choose sentences from the wiki caption: "{Wiki_caption}" that describe unrelated topics to the raw caption: "{raw_caption}" around {length} words of {sentiment} sentiment in {language}.'
+    ]
+
+
+    if mapped_value != -1:
+        prompt= prompt_list[mapped_value].format(
+            raw_caption=generated_caption,
+            Wiki_caption=paragraph,
+            length=controls['length'],
+            sentiment=controls['sentiment'],
+            language=controls['language']
+        )
+        prompt+="You should generate a descriptive, coherent and human-like paragraph"
+
+    else:
+        print("error prompting")
+        prompt = "Invalid focus type."
+
+    if controls['factuality'] == "Imagination":
+        prompt += " The new sentence could extend the original description by using your imagination to create additional details, or think about what might have happened before or after the scene in the image, but should not conflict with the original sentence."
+
+    print("Prompt:", prompt)
+    print("click",click_index)
+
+    origin_image_input = image_input
+
+
+
     image_input = create_bubble_frame(np.array(image_input), generated_caption, click_index, input_mask,
                                       input_points=input_points, input_labels=input_labels)
 
@@ -297,20 +343,14 @@ def submit_caption(image_input, state, generated_caption, text_refiner, visual_c
 
 
     if not args.disable_gpt and text_refiner:
-        refined_caption = text_refiner.inference(query=generated_caption, controls=controls, context=out['context_captions'], enable_wiki=enable_wiki)
-        print("generated caption",generated_caption)
-        print("controls",controls)
-        print("context_catpions",out['context_captions'])
-
-        new_cap = refined_caption['caption']
-        if refined_caption.get('wiki'):
-            state = state + [(None, "Wiki: {}".format(refined_caption['wiki']))]
-        state = state + [(None, f"GPT_Caption: {new_cap}")]
-        print("new_cap",new_cap)
-        refined_image_input = create_bubble_frame(np.array(origin_image_input), new_cap, click_index, input_mask,
+        focus_info=get_image_gpt(openai_api_key,visual_chatgpt.current_image,prompt)
+        state = state + [(None, f"Wiki: {paragraph}")]
+        state = state + [(None, f"Focus_Caption: {focus_info}")]
+        print("new_cap",focus_info)
+        refined_image_input = create_bubble_frame(np.array(origin_image_input), focus_info, click_index, input_mask,
                                                   input_points=input_points, input_labels=input_labels)
         try:
-            waveform_visual, audio_output = tts.predict(new_cap, input_language, input_audio, input_mic, use_mic, agree)
+            waveform_visual, audio_output = tts.predict(focus_info, input_language, input_audio, input_mic, use_mic, agree)
             return state, state, refined_image_input, click_index_state, input_mask_state, input_points_state, input_labels_state, out_state, waveform_visual, audio_output
         except Exception as e:
             state = state + [(None, f"Error during TTS prediction: {str(e)}")]
@@ -327,17 +367,56 @@ def submit_caption(image_input, state, generated_caption, text_refiner, visual_c
     return state, state, image_input, click_index_state, input_mask_state, input_points_state, input_labels_state, out_state, None, None
 
 
+def encode_image(image_path):
+    with open(image_path, "rb") as image_file:
+        return base64.b64encode(image_file.read()).decode('utf-8')
+
+def get_image_gpt(api_key, image_path,prompt,enable_wiki=None):
+    # Getting the base64 string
+    base64_image = encode_image(image_path)
+
+
 
+    headers = {
+        "Content-Type": "application/json",
+        "Authorization": f"Bearer {api_key}"
+    }
+
+    prompt_text = prompt
+
+    payload = {
+        "model": "gpt-4o",
+        "messages": [
+            {
+                "role": "user",
+                "content": [
+                    {
+                        "type": "text",
+                        "text": prompt_text
+                    },
+                    {
+                        "type": "image_url",
+                        "image_url": {
+                            "url": f"data:image/jpeg;base64,{base64_image}"
+                        }
+                    }
+                ]
+            }
+        ],
+        "max_tokens": 300
+    }
+
+    # Sending the request to the OpenAI API
+    response = requests.post("https://api.openai.com/v1/chat/completions", headers=headers, json=payload)
+    result = response.json()
+    print(result)
+    content = result['choices'][0]['message']['content']
+    # Assume the model returns a valid JSON string in 'content'
+    try:
+        return content
+    except json.JSONDecodeError:
+        return {"error": "Failed to parse model output"}
 
-def txt2speech(text):
-    print("Initializing text-to-speech conversion...")
-    # API_URL = "https://api-inference.huggingface.co/models/espnet/kan-bayashi_ljspeech_vits"
-    # headers = {"Authorization": f"Bearer {os.environ['HUGGINGFACEHUB_API_TOKEN']}"}
-    # payloads = {'inputs': text}
-    # response = requests.post(API_URL, headers=headers, json=payloads)
-    # with open('audio_story.mp3', 'wb') as file:
-    #     file.write(response.content)
-    print("Text-to-speech conversion completed.")
 
 
 
@@ -447,6 +526,26 @@ def cap_everything(image_input, visual_chatgpt, text_refiner,input_language, inp
     waveform_visual, audio_output=tts.predict(paragraph, input_language, input_audio, input_mic, use_mic, agree)
     return paragraph,waveform_visual, audio_output
 
+def cap_everything_withoutsound(image_input, visual_chatgpt, text_refiner,paragraph):
+
+    model = build_caption_anything_with_models(
+        args,
+        api_key="",
+        captioner=shared_captioner,
+        sam_model=shared_sam_model,
+        ocr_reader=shared_ocr_reader,
+        text_refiner=text_refiner,
+        session_id=iface.app_id
+    )
+    paragraph = model.inference_cap_everything(image_input, verbose=True)
+    # state = state + [(None, f"Caption Everything: {paragraph}")]
+    Human_prompt = f'\nThe description of the image with path {visual_chatgpt.current_image} is:\n{paragraph}\nThis information helps you to understand this image, but you should use tools to finish following tasks, rather than directly imagine from my description. If you understand, say \"Received\". \n'
+    AI_prompt = "Received."
+    visual_chatgpt.global_prompt = Human_prompt + 'AI: ' + AI_prompt
+    visual_chatgpt.agent.memory.buffer = visual_chatgpt.agent.memory.buffer + visual_chatgpt.global_prompt
+    return paragraph
+
+
 
 def get_style():
     current_version = version.parse(gr.__version__)
@@ -496,11 +595,13 @@ def create_ui():
     original_size = gr.State(None)
     input_size = gr.State(None)
     generated_caption = gr.State("")
+    paragraph = gr.State("")
     aux_state = gr.State([])
     click_index_state = gr.State((0, 0))
     input_mask_state = gr.State(np.zeros((1, 1)))
     input_points_state = gr.State([])
     input_labels_state = gr.State([])
+
 
 
     gr.Markdown(title)
@@ -510,14 +611,20 @@ def create_ui():
         with gr.Column(scale=1.0):
             with gr.Column(visible=False) as modules_not_need_gpt:
                 with gr.Tab("Base(GPT Power)",visible=False) as base_tab:
+                    image_intro=gr.HTML()
                     image_input_base = gr.Image(type="pil", interactive=True, elem_id="image_upload")
                     example_image = gr.Image(type="pil", interactive=False, visible=False)
-
-
+
                 with gr.Tab("Click") as click_tab:
-                    modules_not_need_gpt2=True
+                    image_intro_click=gr.HTML()
                     image_input = gr.Image(type="pil", interactive=True, elem_id="image_upload")
                     example_image = gr.Image(type="pil", interactive=False, visible=False)
+                    with gr.Row(scale=1.0):
+                        focus_type = gr.Radio(
+                            choices=["Inside the Mark", "Around the Mark", "Outside the Mark"],
+                            value="Inside the Mark",
+                            label="Focus Type",
+                            interactive=True)
                     with gr.Row(scale=1.0):
                         with gr.Row(scale=0.4):
                             point_prompt = gr.Radio(
@@ -572,6 +679,7 @@ def create_ui():
                                 value="No",
                                 label="Enable Wiki",
                                 interactive=True)
+
                 # with gr.Column(visible=True) as modules_not_need_gpt3:
                 gr.Examples(
                     examples=examples,
@@ -708,29 +816,29 @@ def create_ui():
 
         image_input.clear(clear_chat_memory, inputs=[visual_chatgpt])
 
-        image_input_base.upload(upload_callback, [image_input_base, state, visual_chatgpt],
-                                [chatbot, state, origin_image, click_state, image_input_base, sketcher_input,
-                                 image_embedding, original_size, input_size])
+
 
 
-        image_input.upload(upload_callback, [image_input, state, visual_chatgpt],
-                           [chatbot, state, origin_image, click_state, image_input, sketcher_input,
-                            image_embedding, original_size, input_size])
-        sketcher_input.upload(upload_callback, [sketcher_input, state, visual_chatgpt],
-                              [chatbot, state, origin_image, click_state, image_input, sketcher_input,
-                               image_embedding, original_size, input_size])
+        image_input_base.upload(upload_callback, [image_input_base, state, visual_chatgpt,openai_api_key],
+                                [chatbot, state, origin_image, click_state, image_input, image_input_base, sketcher_input,
+                                 image_embedding, original_size, input_size,image_intro,image_intro_click,paragraph])
+
+        image_input.upload(upload_callback, [image_input, state, visual_chatgpt, openai_api_key],
+                           [chatbot, state, origin_image, click_state, image_input, image_input_base, sketcher_input,
+                            image_embedding, original_size, input_size,image_intro,image_intro_click,paragraph])
+        sketcher_input.upload(upload_callback, [sketcher_input, state, visual_chatgpt, openai_api_key],
+                              [chatbot, state, origin_image, click_state, image_input, image_input_base, sketcher_input,
+                               image_embedding, original_size, input_size,image_intro,image_intro_click,paragraph])
        chat_input.submit(chat_input_callback, [visual_chatgpt, chat_input, click_state, state, aux_state],
                          [chatbot, state, aux_state])
        chat_input.submit(lambda: "", None, chat_input)
        submit_button_text.click(chat_input_callback, [visual_chatgpt, chat_input, click_state, state, aux_state],
                                 [chatbot, state, aux_state])
        submit_button_text.click(lambda: "", None, chat_input)
-        example_image.change(upload_callback, [example_image, state, visual_chatgpt],
-                             [chatbot, state, origin_image, click_state, image_input, sketcher_input,
-                              image_embedding, original_size, input_size])
-        example_image.change(upload_callback, [example_image, state, visual_chatgpt],
-                             [chatbot, state, origin_image, click_state, image_input_base, sketcher_input,
-                              image_embedding, original_size, input_size])
+        example_image.change(upload_callback, [example_image, state, visual_chatgpt, openai_api_key],
+                             [chatbot, state, origin_image, click_state, image_input, image_input_base, sketcher_input,
+                              image_embedding, original_size, input_size,image_intro,image_intro_click,paragraph])
+
        example_image.change(clear_chat_memory, inputs=[visual_chatgpt])
 
        def on_click_tab_selected():
@@ -776,7 +884,7 @@ def create_ui():
            inputs=[
                image_input, state, generated_caption, text_refiner, visual_chatgpt, enable_wiki, length, sentiment, factuality, language,
                out_state, click_index_state, input_mask_state, input_points_state, input_labels_state,
-               input_text, input_language, input_audio, input_mic, use_mic, agree
+               input_text, input_language, input_audio, input_mic, use_mic, agree,paragraph,focus_type,openai_api_key
            ],
            outputs=[
                chatbot, state, image_input, click_index_state, input_mask_state, input_points_state, input_labels_state, out_state,