Niki Zhang committed on
Commit b9eedbd · verified · 1 parent: d29ec53

Update app.py


Update prompt
Using selected objects as image input
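
Note: the gist of this change is that the focus caption is now generated from a saved crop of the selected object (new_crop_save_path) rather than from the full canvas. A minimal sketch of how such a crop might be produced — the helper name and bbox logic below are illustrative assumptions, not code from this commit:

    import numpy as np
    from PIL import Image

    def crop_selected_object(image: Image.Image, mask: np.ndarray, save_path: str) -> str:
        # Crop to the bounding box of a binary segmentation mask so that only
        # the selected object is sent to the vision model.
        ys, xs = np.nonzero(mask)
        box = (xs.min(), ys.min(), xs.max() + 1, ys.max() + 1)  # (left, upper, right, lower)
        image.crop(box).save(save_path)
        return save_path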

Files changed (1): app.py (+16 -13)
app.py CHANGED
@@ -173,7 +173,7 @@ def chat_input_callback(*args):
 
 
 
-def upload_callback(image_input, state, visual_chatgpt=None, openai_api_key=None):
+def upload_callback(image_input, state, visual_chatgpt=None, openai_api_key=None,language="English"):
     if isinstance(image_input, dict):  # if upload from sketcher_input, input contains image and mask
         image_input, mask = image_input['image'], image_input['mask']
 
@@ -207,7 +207,7 @@ def upload_callback(image_input, state, visual_chatgpt=None, openai_api_key=None
     parsed_data = json.loads(parsed_data.replace("'", "\""))
     name, artist, year, material= parsed_data["name"],parsed_data["artist"],parsed_data["year"], parsed_data["material"]
     artwork_info = f"<div>Painting: {name}<br>Artist name: {artist}<br>Year: {year}<br>Material: {material}</div>"
-    paragraph = get_image_gpt(openai_api_key, new_image_path,"Imagine you are a intelligent image captioner. You should generate a descriptive, coherent and human-like paragraph based on the given image instead of imagination. There are some rules for your response: Show objects with their attributes (e.g. position, color, size, shape, texture).\nPrimarily describe common objects with large size.\nProvide context of the image.\nShow relative position between objects.\nLess than 6 sentences.\nDo not appear number.\nDo not describe any individual letter.\nDo not show the image resolution.\nIngore the white background.")
+    paragraph = get_image_gpt(openai_api_key, new_image_path,f"What's going on in this picture? in {language}")
 
     state = [(None, 'Received new image, resize it to width {} and height {}: '.format(image_input.size[0], image_input.size[1]))]
 
@@ -272,15 +272,16 @@ def inference_click(image_input, point_prompt, click_mode, enable_wiki, language
 
     generated_caption = text
     print(generated_caption)
+    print("new crop save",new_crop_save_path)
 
-    yield state, state, click_state, image_input, generated_caption, click_index_state, input_mask_state, input_points_state, input_labels_state, out_state
+    yield state, state, click_state, image_input, generated_caption, click_index_state, input_mask_state, input_points_state, input_labels_state, out_state,new_crop_save_path
 
 
 
 
 def submit_caption(image_input, state, generated_caption, text_refiner, visual_chatgpt, enable_wiki, length, sentiment, factuality, language,
                    out_state, click_index_state, input_mask_state, input_points_state, input_labels_state,
-                   input_text, input_language, input_audio, input_mic, use_mic, agree,paragraph,focus_type,openai_api_key):
+                   input_text, input_language, input_audio, input_mic, use_mic, agree,paragraph,focus_type,openai_api_key,new_crop_save_path):
     print("state",state)
 
     click_index = click_index_state
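
Note: inference_click is a generator, and in Gradio each yielded tuple must line up one-to-one with the outputs= list of its event binding — which is why both the yield above and the outputs list in the create_ui hunks below gain new_crop_save_path. A toy illustration of that contract, assuming recent Gradio Blocks (none of this is code from the repo):

    import gradio as gr

    def stepper(counter):
        # One yielded value per component listed in `outputs`, in order.
        counter = (counter or 0) + 1
        yield f"step {counter}", counter  # -> (textbox, state)

    with gr.Blocks() as demo:
        counter = gr.State(None)  # session-scoped slot, like new_crop_save_path
        box = gr.Textbox()
        gr.Button("Go").click(stepper, inputs=[counter], outputs=[box, counter])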
@@ -305,10 +306,10 @@ def submit_caption(image_input, state, generated_caption, text_refiner, visual_c
     }
 
     prompt_list = [
-        'Select sentences closely related to the raw caption: "{raw_caption}" from the wiki caption: "{Wiki_caption}" around {length} words of {sentiment} sentiment in {language}.',
-        'Pick sentences from the wiki caption: "{Wiki_caption}" that may be related to objects mentioned in the "{raw_caption}" around {length} words of {sentiment} sentiment in {language}.',
-        'Choose sentences from the wiki caption: "{Wiki_caption}" that describe unrelated topics to the raw caption: "{raw_caption}" around {length} words of {sentiment} sentiment in {language}.'
-    ]
+        'Wiki_caption: {Wiki_caption}, you have to generate a caption according to the image and wiki caption. Around {length} words of {sentiment} sentiment in {language}.',
+        'Wiki_caption: {Wiki_caption}, you have to select sentences from wiki caption that describe the surrounding objects that may be associated with the picture object. Around {length} words of {sentiment} sentiment in {language}.',
+        'Wiki_caption: {Wiki_caption}. You have to choose sentences from the wiki caption that describe unrelated objects to the image. Around {length} words of {sentiment} sentiment in {language}.'
+    ]
 
 
     if mapped_value != -1:
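
Note: each prompt_list entry is a str.format template. Judging from the .format call visible in the next hunk, the focus type maps to an index (mapped_value) and the UI controls fill the placeholders, roughly as below; binding Wiki_caption to the paragraph stored at upload time is an assumption on my part:

    # Illustrative only; mirrors the .format(...) call in the next hunk.
    prompt = prompt_list[mapped_value].format(
        Wiki_caption=paragraph,          # assumed: the paragraph from upload_callback
        length=controls['length'],
        sentiment=controls['sentiment'],
        language=controls['language'],
    )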
@@ -319,14 +320,13 @@ def submit_caption(image_input, state, generated_caption, text_refiner, visual_c
            sentiment=controls['sentiment'],
            language=controls['language']
        )
-        prompt+="You should generate a descriptive, coherent and human-like paragraph"
 
    else:
        print("error prompting")
        prompt = "Invalid focus type."
 
    if controls['factuality'] == "Imagination":
-        prompt += " The new sentence could extend the original description by using your imagination to create additional details, or think about what might have happened before or after the scene in the image, but should not conflict with the original sentence."
+        prompt += "Assuming that I am someone who has viewed a lot of art and has a lot of experience viewing art. Explain artistic features (composition, color, style, or use of light) and discuss the symbolism of the content and its influence on later artistic movements"
 
    print("Prompt:", prompt)
    print("click",click_index)
@@ -343,7 +343,9 @@ def submit_caption(image_input, state, generated_caption, text_refiner, visual_c
 
 
    if not args.disable_gpt and text_refiner:
-        focus_info=get_image_gpt(openai_api_key,visual_chatgpt.current_image,prompt)
+        print("new crop save",new_crop_save_path)
+        focus_info=get_image_gpt(openai_api_key,new_crop_save_path,prompt)
+
        state = state + [(None, f"Wiki: {paragraph}")]
        state = state + [(None, f"Focus_Caption: {focus_info}")]
        print("new_cap",focus_info)
@@ -601,6 +603,7 @@ def create_ui():
        input_mask_state = gr.State(np.zeros((1, 1)))
        input_points_state = gr.State([])
        input_labels_state = gr.State([])
+        new_crop_save_path = gr.State(None)
 
 
 
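Note: gr.State(None) creates a per-session holder with no visible UI; for new_crop_save_path to flow between handlers it must appear in the producer's outputs and the consumer's inputs, which is exactly what the next two hunks wire up. A stripped-down illustration (not repo code):

    import gradio as gr

    with gr.Blocks() as demo:
        crop_path = gr.State(None)        # starts empty for each session
        img = gr.Image(type="filepath")
        out = gr.Textbox()

        def remember(path):               # producer: writes the state
            return f"saved {path}", path

        def recall(saved):                # consumer: reads the state
            return f"last crop: {saved}"

        img.upload(remember, inputs=[img], outputs=[out, crop_path])
        gr.Button("Recall").click(recall, inputs=[crop_path], outputs=[out])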
 
@@ -874,7 +877,7 @@ def create_ui():
                image_embedding, state, click_state, original_size, input_size, text_refiner, visual_chatgpt,
                out_state, click_index_state, input_mask_state, input_points_state, input_labels_state
            ],
-            outputs=[chatbot, state, click_state, image_input, generated_caption, click_index_state, input_mask_state, input_points_state, input_labels_state, out_state],
+            outputs=[chatbot, state, click_state, image_input, generated_caption, click_index_state, input_mask_state, input_points_state, input_labels_state, out_state,new_crop_save_path],
            show_progress=False, queue=True
        )
 
@@ -884,7 +887,7 @@ def create_ui():
            inputs=[
                image_input, state, generated_caption, text_refiner, visual_chatgpt, enable_wiki, length, sentiment, factuality, language,
                out_state, click_index_state, input_mask_state, input_points_state, input_labels_state,
-                input_text, input_language, input_audio, input_mic, use_mic, agree,paragraph,focus_type,openai_api_key
+                input_text, input_language, input_audio, input_mic, use_mic, agree,paragraph,focus_type,openai_api_key,new_crop_save_path
            ],
            outputs=[
                chatbot, state, image_input, click_index_state, input_mask_state, input_points_state, input_labels_state, out_state,
 