Niki Zhang committed on
Commit b9eedbd · verified · 1 parent: d29ec53

Update app.py


Update prompt
Using selected objects as image input
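
Note: the gist of this change is that the focus caption is now generated from a saved crop of the selected object (new_crop_save_path) rather than from the full canvas. A minimal sketch of how such a crop might be produced — the helper name and bbox logic below are illustrative assumptions, not code from this commit:

    import numpy as np
    from PIL import Image

    def crop_selected_object(image: Image.Image, mask: np.ndarray, save_path: str) -> str:
        # Crop to the bounding box of a binary segmentation mask so that only
        # the selected object is sent to the vision model.
        ys, xs = np.nonzero(mask)
        box = (xs.min(), ys.min(), xs.max() + 1, ys.max() + 1)  # (left, upper, right, lower)
        image.crop(box).save(save_path)
        return save_path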

Files changed (1): app.py (+16 -13)
app.py CHANGED
@@ -173,7 +173,7 @@ def chat_input_callback(*args):
 
 
 
-def upload_callback(image_input, state, visual_chatgpt=None, openai_api_key=None):
+def upload_callback(image_input, state, visual_chatgpt=None, openai_api_key=None,language="English"):
     if isinstance(image_input, dict):  # if upload from sketcher_input, input contains image and mask
         image_input, mask = image_input['image'], image_input['mask']
 
@@ -207,7 +207,7 @@ def upload_callback(image_input, state, visual_chatgpt=None, openai_api_key=None
     parsed_data = json.loads(parsed_data.replace("'", "\""))
     name, artist, year, material= parsed_data["name"],parsed_data["artist"],parsed_data["year"], parsed_data["material"]
     artwork_info = f"<div>Painting: {name}<br>Artist name: {artist}<br>Year: {year}<br>Material: {material}</div>"
-    paragraph = get_image_gpt(openai_api_key, new_image_path,"Imagine you are a intelligent image captioner. You should generate a descriptive, coherent and human-like paragraph based on the given image instead of imagination. There are some rules for your response: Show objects with their attributes (e.g. position, color, size, shape, texture).\nPrimarily describe common objects with large size.\nProvide context of the image.\nShow relative position between objects.\nLess than 6 sentences.\nDo not appear number.\nDo not describe any individual letter.\nDo not show the image resolution.\nIngore the white background.")
+    paragraph = get_image_gpt(openai_api_key, new_image_path,f"What's going on in this picture? in {language}")
 
     state = [(None, 'Received new image, resize it to width {} and height {}: '.format(image_input.size[0], image_input.size[1]))]
 
@@ -272,15 +272,16 @@ def inference_click(image_input, point_prompt, click_mode, enable_wiki, language
 
     generated_caption = text
     print(generated_caption)
+    print("new crop save",new_crop_save_path)
 
-    yield state, state, click_state, image_input, generated_caption, click_index_state, input_mask_state, input_points_state, input_labels_state, out_state
+    yield state, state, click_state, image_input, generated_caption, click_index_state, input_mask_state, input_points_state, input_labels_state, out_state,new_crop_save_path
 
 
 
 
 def submit_caption(image_input, state, generated_caption, text_refiner, visual_chatgpt, enable_wiki, length, sentiment, factuality, language,
                    out_state, click_index_state, input_mask_state, input_points_state, input_labels_state,
-                   input_text, input_language, input_audio, input_mic, use_mic, agree,paragraph,focus_type,openai_api_key):
+                   input_text, input_language, input_audio, input_mic, use_mic, agree,paragraph,focus_type,openai_api_key,new_crop_save_path):
     print("state",state)
 
     click_index = click_index_state
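
Note: inference_click is a generator, and in Gradio each yielded tuple must line up one-to-one with the outputs= list of its event binding — which is why both the yield above and the outputs list in the create_ui hunks below gain new_crop_save_path. A toy illustration of that contract, assuming recent Gradio Blocks (none of this is code from the repo):

    import gradio as gr

    def stepper(counter):
        # One yielded value per component listed in `outputs`, in order.
        counter = (counter or 0) + 1
        yield f"step {counter}", counter  # -> (textbox, state)

    with gr.Blocks() as demo:
        counter = gr.State(None)  # session-scoped slot, like new_crop_save_path
        box = gr.Textbox()
        gr.Button("Go").click(stepper, inputs=[counter], outputs=[box, counter])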
@@ -305,10 +306,10 @@ def submit_caption(image_input, state, generated_caption, text_refiner, visual_c
     }
 
     prompt_list = [
-        'Select sentences closely related to the raw caption: "{raw_caption}" from the wiki caption: "{Wiki_caption}" around {length} words of {sentiment} sentiment in {language}.',
-        'Pick sentences from the wiki caption: "{Wiki_caption}" that may be related to objects mentioned in the "{raw_caption}" around {length} words of {sentiment} sentiment in {language}.',
-        'Choose sentences from the wiki caption: "{Wiki_caption}" that describe unrelated topics to the raw caption: "{raw_caption}" around {length} words of {sentiment} sentiment in {language}.'
-    ]
+        'Wiki_caption: {Wiki_caption}, you have to generate a caption according to the image and wiki caption. Around {length} words of {sentiment} sentiment in {language}.',
+        'Wiki_caption: {Wiki_caption}, you have to select sentences from wiki caption that describe the surrounding objects that may be associated with the picture object. Around {length} words of {sentiment} sentiment in {language}.',
+        'Wiki_caption: {Wiki_caption}. You have to choose sentences from the wiki caption that describe unrelated objects to the image. Around {length} words of {sentiment} sentiment in {language}.'
+    ]
 
 
     if mapped_value != -1:
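
Note: each prompt_list entry is a str.format template. Judging from the .format call visible in the next hunk, the focus type maps to an index (mapped_value) and the UI controls fill the placeholders, roughly as below; binding Wiki_caption to the paragraph stored at upload time is an assumption on my part:

    # Illustrative only; mirrors the .format(...) call in the next hunk.
    prompt = prompt_list[mapped_value].format(
        Wiki_caption=paragraph,          # assumed: the paragraph from upload_callback
        length=controls['length'],
        sentiment=controls['sentiment'],
        language=controls['language'],
    )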
@@ -319,14 +320,13 @@ def submit_caption(image_input, state, generated_caption, text_refiner, visual_c
            sentiment=controls['sentiment'],
            language=controls['language']
        )
-        prompt+="You should generate a descriptive, coherent and human-like paragraph"
 
    else:
        print("error prompting")
        prompt = "Invalid focus type."
 
    if controls['factuality'] == "Imagination":
-        prompt += " The new sentence could extend the original description by using your imagination to create additional details, or think about what might have happened before or after the scene in the image, but should not conflict with the original sentence."
+        prompt += "Assuming that I am someone who has viewed a lot of art and has a lot of experience viewing art. Explain artistic features (composition, color, style, or use of light) and discuss the symbolism of the content and its influence on later artistic movements"
 
    print("Prompt:", prompt)
    print("click",click_index)
@@ -343,7 +343,9 @@ def submit_caption(image_input, state, generated_caption, text_refiner, visual_c
 
 
    if not args.disable_gpt and text_refiner:
-        focus_info=get_image_gpt(openai_api_key,visual_chatgpt.current_image,prompt)
+        print("new crop save",new_crop_save_path)
+        focus_info=get_image_gpt(openai_api_key,new_crop_save_path,prompt)
+
        state = state + [(None, f"Wiki: {paragraph}")]
        state = state + [(None, f"Focus_Caption: {focus_info}")]
        print("new_cap",focus_info)
@@ -601,6 +603,7 @@ def create_ui():
        input_mask_state = gr.State(np.zeros((1, 1)))
        input_points_state = gr.State([])
        input_labels_state = gr.State([])
+        new_crop_save_path = gr.State(None)
 
 
 
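Note: gr.State(None) creates a per-session holder with no visible UI; for new_crop_save_path to flow between handlers it must appear in the producer's outputs and the consumer's inputs, which is exactly what the next two hunks wire up. A stripped-down illustration (not repo code):

    import gradio as gr

    with gr.Blocks() as demo:
        crop_path = gr.State(None)        # starts empty for each session
        img = gr.Image(type="filepath")
        out = gr.Textbox()

        def remember(path):               # producer: writes the state
            return f"saved {path}", path

        def recall(saved):                # consumer: reads the state
            return f"last crop: {saved}"

        img.upload(remember, inputs=[img], outputs=[out, crop_path])
        gr.Button("Recall").click(recall, inputs=[crop_path], outputs=[out])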
 
@@ -874,7 +877,7 @@ def create_ui():
                image_embedding, state, click_state, original_size, input_size, text_refiner, visual_chatgpt,
                out_state, click_index_state, input_mask_state, input_points_state, input_labels_state
            ],
-            outputs=[chatbot, state, click_state, image_input, generated_caption, click_index_state, input_mask_state, input_points_state, input_labels_state, out_state],
+            outputs=[chatbot, state, click_state, image_input, generated_caption, click_index_state, input_mask_state, input_points_state, input_labels_state, out_state,new_crop_save_path],
            show_progress=False, queue=True
        )
 
@@ -884,7 +887,7 @@ def create_ui():
            inputs=[
                image_input, state, generated_caption, text_refiner, visual_chatgpt, enable_wiki, length, sentiment, factuality, language,
                out_state, click_index_state, input_mask_state, input_points_state, input_labels_state,
-                input_text, input_language, input_audio, input_mic, use_mic, agree,paragraph,focus_type,openai_api_key
+                input_text, input_language, input_audio, input_mic, use_mic, agree,paragraph,focus_type,openai_api_key,new_crop_save_path
            ],
            outputs=[
                chatbot, state, image_input, click_index_state, input_mask_state, input_points_state, input_labels_state, out_state,
 