Niki Zhang committed: Update app.py

Add painting introduction part
Add Focus Type

app.py CHANGED
@@ -1,4 +1,5 @@
 import os
+import base64
 import json
 import gradio as gr
 import numpy as np
@@ -117,9 +118,12 @@ def init_openai_api_key(api_key=""):
         gpt_state=1
         return [gr.update(visible=True)]+[gr.update(visible=False)]+[gr.update(visible=True)]*3+[gr.update(visible=False)]+ [gr.update(visible=True)]+ [gr.update(visible=False)]*2 + [text_refiner, visual_chatgpt, None]
     else:
+        gpt_state=0
         return [gr.update(visible=False)]*7 + [gr.update(visible=True)]*2 + [text_refiner, visual_chatgpt, 'Your OpenAI API Key is not available']
 
 def init_wo_openai_api_key():
+    global gpt_state
+    gpt_state=0
     return [gr.update(visible=False)]*4 + [gr.update(visible=True)]+ [gr.update(visible=False)]+[gr.update(visible=True)]+[gr.update(visible=False)]*2 + [None, None, None]
 
 def get_click_prompt(chat_input, click_state, click_mode):
@@ -169,14 +173,13 @@ def chat_input_callback(*args):
 
 
 
-def upload_callback(image_input, state, visual_chatgpt=None):
-
+def upload_callback(image_input, state, visual_chatgpt=None, openai_api_key=None):
     if isinstance(image_input, dict):  # if upload from sketcher_input, input contains image and mask
         image_input, mask = image_input['image'], image_input['mask']
 
     click_state = [[], [], []]
     image_input = image_resize(image_input, res=1024)
-
+
     model = build_caption_anything_with_models(
         args,
         api_key="",
@@ -189,7 +192,7 @@ def upload_callback(image_input, state, visual_chatgpt=None):
     image_embedding = model.image_embedding
     original_size = model.original_size
     input_size = model.input_size
-
+
     if visual_chatgpt is not None:
         print('upload_callback: add caption to chatGPT memory')
         new_image_path = get_new_image_name('chat_image', func_name='upload')
@@ -200,10 +203,17 @@ def upload_callback(image_input, state, visual_chatgpt=None):
         AI_prompt = "Received."
         visual_chatgpt.global_prompt = Human_prompt + 'AI: ' + AI_prompt
         visual_chatgpt.agent.memory.buffer = visual_chatgpt.agent.memory.buffer + visual_chatgpt.global_prompt
+        parsed_data = get_image_gpt(openai_api_key, new_image_path,"Please provide the name, artist, year of creation, and material used for this painting. Return the information in dictionary format without any newline characters. If any information is unavailable, return \"None\" for that field. Format as follows: { \"name\": \"Name of the painting\",\"artist\": \"Name of the artist\", \"year\": \"Year of creation\", \"material\": \"Material used in the painting\" }.")
+        parsed_data = json.loads(parsed_data.replace("'", "\""))
+        name, artist, year, material= parsed_data["name"],parsed_data["artist"],parsed_data["year"], parsed_data["material"]
+        artwork_info = f"<div>Painting: {name}<br>Artist name: {artist}<br>Year: {year}<br>Material: {material}</div>"
+        paragraph = get_image_gpt(openai_api_key, new_image_path,"Imagine you are a intelligent image captioner. You should generate a descriptive, coherent and human-like paragraph based on the given image instead of imagination. There are some rules for your response: Show objects with their attributes (e.g. position, color, size, shape, texture).\nPrimarily describe common objects with large size.\nProvide context of the image.\nShow relative position between objects.\nLess than 6 sentences.\nDo not appear number.\nDo not describe any individual letter.\nDo not show the image resolution.\nIngore the white background.")
+
     state = [(None, 'Received new image, resize it to width {} and height {}: '.format(image_input.size[0], image_input.size[1]))]
 
-    return state, state, image_input, click_state, image_input, image_input, image_embedding, \
-        original_size, input_size
+    return state, state, image_input, click_state, image_input, image_input, image_input, image_embedding, \
+        original_size, input_size, artwork_info,artwork_info,paragraph
+
 
 
 
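Note on the metadata parse added above: `json.loads(parsed_data.replace("'", "\""))` assumes the model answers with a bare, single-quoted dictionary and no apostrophes inside the values. A small defensive variant is sketched below; `parse_artwork_reply` is a hypothetical helper for illustration, not part of this commit.

import json

def parse_artwork_reply(reply):
    # Best-effort parse of the dictionary-formatted reply requested from the model.
    fallback = {"name": "None", "artist": "None", "year": "None", "material": "None"}
    try:
        return json.loads(reply)                        # reply is already valid JSON
    except json.JSONDecodeError:
        try:
            return json.loads(reply.replace("'", '"'))  # single-quoted dict from the model
        except json.JSONDecodeError:
            return fallback                             # keep the UI alive on odd replies

The fields could then be read with `parse_artwork_reply(parsed_data).get("name", "None")` and so on.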
@@ -237,7 +247,7 @@ def inference_click(image_input, point_prompt, click_mode, enable_wiki, language
     )
 
     model.setup(image_embedding, original_size, input_size, is_image_set=True)
-
+
     enable_wiki = True if enable_wiki in ['True', 'TRUE', 'true', True, 'Yes', 'YES', 'yes'] else False
     out = model.inference(image_input, prompt, controls, disable_gpt=True, enable_wiki=enable_wiki, verbose=True, args={'clip_filter': False})[0]
 
@@ -270,7 +280,7 @@ def inference_click(image_input, point_prompt, click_mode, enable_wiki, language
 
 def submit_caption(image_input, state, generated_caption, text_refiner, visual_chatgpt, enable_wiki, length, sentiment, factuality, language,
                    out_state, click_index_state, input_mask_state, input_points_state, input_labels_state,
-                   input_text, input_language, input_audio, input_mic, use_mic, agree):
+                   input_text, input_language, input_audio, input_mic, use_mic, agree,paragraph,focus_type,openai_api_key):
     print("state",state)
 
     click_index = click_index_state
@@ -278,9 +288,14 @@ def submit_caption(image_input, state, generated_caption, text_refiner, visual_c
     input_points = input_points_state
     input_labels = input_labels_state
     out = out_state
-
+    focus_map = {
+        "Inside the Mark": 0,
+        "Around the Mark": 1,
+        "Outside the Mark": 2
+    }
 
-
+    mapped_value = focus_map.get(focus_type, -1)
+    print("mapped value",mapped_value)
 
     controls = {
         'length': length,
@@ -289,6 +304,37 @@ def submit_caption(image_input, state, generated_caption, text_refiner, visual_c
         'language': language
     }
 
+    prompt_list = [
+        'Select sentences closely related to the raw caption: "{raw_caption}" from the wiki caption: "{Wiki_caption}" around {length} words of {sentiment} sentiment in {language}.',
+        'Pick sentences from the wiki caption: "{Wiki_caption}" that may be related to objects mentioned in the "{raw_caption}" around {length} words of {sentiment} sentiment in {language}.',
+        'Choose sentences from the wiki caption: "{Wiki_caption}" that describe unrelated topics to the raw caption: "{raw_caption}" around {length} words of {sentiment} sentiment in {language}.'
+    ]
+
+    if mapped_value != -1:
+        prompt= prompt_list[mapped_value].format(
+            raw_caption=generated_caption,
+            Wiki_caption=paragraph,
+            length=controls['length'],
+            sentiment=controls['sentiment'],
+            language=controls['language']
+        )
+        prompt+="You should generate a descriptive, coherent and human-like paragraph"
+
+    else:
+        print("error prompting")
+        prompt = "Invalid focus type."
+
+    if controls['factuality'] == "Imagination":
+        prompt += " The new sentence could extend the original description by using your imagination to create additional details, or think about what might have happened before or after the scene in the image, but should not conflict with the original sentence."
+
+    print("Prompt:", prompt)
+    print("click",click_index)
+
+    origin_image_input = image_input
+
     image_input = create_bubble_frame(np.array(image_input), generated_caption, click_index, input_mask,
                                       input_points=input_points, input_labels=input_labels)
 
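The focus logic added above reduces to: map the radio value to an index, pick one of three templates, and fill it with the click-level caption (`generated_caption`) and the image-level `paragraph`. A standalone sketch of that dispatch, with shortened stand-in templates rather than the committed `prompt_list` strings:

# Minimal sketch of the focus-type dispatch; the template strings here are
# shortened placeholders, not the committed prompt_list entries.
FOCUS_MAP = {"Inside the Mark": 0, "Around the Mark": 1, "Outside the Mark": 2}
TEMPLATES = [
    'Describe what lies inside the mark, using "{raw_caption}" and "{Wiki_caption}".',
    'Describe what lies around the mark, using "{raw_caption}" and "{Wiki_caption}".',
    'Describe what lies outside the mark, using "{raw_caption}" and "{Wiki_caption}".',
]

def build_focus_prompt(focus_type, raw_caption, wiki_caption):
    idx = FOCUS_MAP.get(focus_type, -1)
    if idx == -1:
        return "Invalid focus type."   # same fallback string the commit uses
    return TEMPLATES[idx].format(raw_caption=raw_caption, Wiki_caption=wiki_caption)

print(build_focus_prompt("Around the Mark", "a woman in blue", "The painting shows a quiet room."))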
@@ -297,20 +343,14 @@ def submit_caption(image_input, state, generated_caption, text_refiner, visual_c
 
 
     if not args.disable_gpt and text_refiner:
-
-
-
-        print("
-
-        new_cap = refined_caption['caption']
-        if refined_caption.get('wiki'):
-            state = state + [(None, "Wiki: {}".format(refined_caption['wiki']))]
-        state = state + [(None, f"GPT_Caption: {new_cap}")]
-        print("new_cap",new_cap)
-        refined_image_input = create_bubble_frame(np.array(origin_image_input), new_cap, click_index, input_mask,
+        focus_info=get_image_gpt(openai_api_key,visual_chatgpt.current_image,prompt)
+        state = state + [(None, f"Wiki: {paragraph}")]
+        state = state + [(None, f"Focus_Caption: {focus_info}")]
+        print("new_cap",focus_info)
+        refined_image_input = create_bubble_frame(np.array(origin_image_input), focus_info, click_index, input_mask,
                                                   input_points=input_points, input_labels=input_labels)
         try:
-            waveform_visual, audio_output = tts.predict(
+            waveform_visual, audio_output = tts.predict(focus_info, input_language, input_audio, input_mic, use_mic, agree)
             return state, state, refined_image_input, click_index_state, input_mask_state, input_points_state, input_labels_state, out_state, waveform_visual, audio_output
         except Exception as e:
             state = state + [(None, f"Error during TTS prediction: {str(e)}")]
@@ -327,17 +367,56 @@ def submit_caption(image_input, state, generated_caption, text_refiner, visual_c
     return state, state, image_input, click_index_state, input_mask_state, input_points_state, input_labels_state, out_state, None, None
 
 
+def encode_image(image_path):
+    with open(image_path, "rb") as image_file:
+        return base64.b64encode(image_file.read()).decode('utf-8')
+
+def get_image_gpt(api_key, image_path,prompt,enable_wiki=None):
+    # Getting the base64 string
+    base64_image = encode_image(image_path)
+
+    headers = {
+        "Content-Type": "application/json",
+        "Authorization": f"Bearer {api_key}"
+    }
+
+    prompt_text = prompt
+
+    payload = {
+        "model": "gpt-4o",
+        "messages": [
+            {
+                "role": "user",
+                "content": [
+                    {
+                        "type": "text",
+                        "text": prompt_text
+                    },
+                    {
+                        "type": "image_url",
+                        "image_url": {
+                            "url": f"data:image/jpeg;base64,{base64_image}"
+                        }
+                    }
+                ]
+            }
+        ],
+        "max_tokens": 300
+    }
+
+    # Sending the request to the OpenAI API
+    response = requests.post("https://api.openai.com/v1/chat/completions", headers=headers, json=payload)
+    result = response.json()
+    print(result)
+    content = result['choices'][0]['message']['content']
+    # Assume the model returns a valid JSON string in 'content'
+    try:
+        return content
+    except json.JSONDecodeError:
+        return {"error": "Failed to parse model output"}
 
-def txt2speech(text):
-    print("Initializing text-to-speech conversion...")
-    # API_URL = "https://api-inference.huggingface.co/models/espnet/kan-bayashi_ljspeech_vits"
-    # headers = {"Authorization": f"Bearer {os.environ['HUGGINGFACEHUB_API_TOKEN']}"}
-    # payloads = {'inputs': text}
-    # response = requests.post(API_URL, headers=headers, json=payloads)
-    # with open('audio_story.mp3', 'wb') as file:
-    #     file.write(response.content)
-    print("Text-to-speech conversion completed.")
 
 
 
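One quirk of the new `get_image_gpt` above: its trailing `try: return content / except json.JSONDecodeError` can never trigger, because nothing in that block parses JSON. Below is a sketch of a variant that guards both the HTTP call and an optional JSON parse; `ask_gpt_about_image` is an illustrative name, and the endpoint and payload shape simply mirror what the commit already uses.

import base64
import json

import requests

def ask_gpt_about_image(api_key, image_path, prompt, parse_json=False):
    # Send one user message containing a text prompt plus a base64-encoded image.
    with open(image_path, "rb") as f:
        b64 = base64.b64encode(f.read()).decode("utf-8")

    payload = {
        "model": "gpt-4o",
        "messages": [{
            "role": "user",
            "content": [
                {"type": "text", "text": prompt},
                {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{b64}"}},
            ],
        }],
        "max_tokens": 300,
    }
    response = requests.post(
        "https://api.openai.com/v1/chat/completions",
        headers={"Authorization": f"Bearer {api_key}"},
        json=payload,
        timeout=60,
    )
    response.raise_for_status()                      # surface HTTP errors instead of a KeyError later
    content = response.json()["choices"][0]["message"]["content"]
    if not parse_json:
        return content
    try:
        return json.loads(content)                   # only parse when the prompt asked for a dict
    except json.JSONDecodeError:
        return {"error": "Failed to parse model output", "raw": content}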
@@ -447,6 +526,26 @@ def cap_everything(image_input, visual_chatgpt, text_refiner,input_language, inp
     waveform_visual, audio_output=tts.predict(paragraph, input_language, input_audio, input_mic, use_mic, agree)
     return paragraph,waveform_visual, audio_output
 
+def cap_everything_withoutsound(image_input, visual_chatgpt, text_refiner,paragraph):
+
+    model = build_caption_anything_with_models(
+        args,
+        api_key="",
+        captioner=shared_captioner,
+        sam_model=shared_sam_model,
+        ocr_reader=shared_ocr_reader,
+        text_refiner=text_refiner,
+        session_id=iface.app_id
+    )
+    paragraph = model.inference_cap_everything(image_input, verbose=True)
+    # state = state + [(None, f"Caption Everything: {paragraph}")]
+    Human_prompt = f'\nThe description of the image with path {visual_chatgpt.current_image} is:\n{paragraph}\nThis information helps you to understand this image, but you should use tools to finish following tasks, rather than directly imagine from my description. If you understand, say \"Received\". \n'
+    AI_prompt = "Received."
+    visual_chatgpt.global_prompt = Human_prompt + 'AI: ' + AI_prompt
+    visual_chatgpt.agent.memory.buffer = visual_chatgpt.agent.memory.buffer + visual_chatgpt.global_prompt
+    return paragraph
+
+
 
 def get_style():
     current_version = version.parse(gr.__version__)
@@ -496,11 +595,13 @@ def create_ui():
         original_size = gr.State(None)
         input_size = gr.State(None)
         generated_caption = gr.State("")
+        paragraph = gr.State("")
         aux_state = gr.State([])
         click_index_state = gr.State((0, 0))
         input_mask_state = gr.State(np.zeros((1, 1)))
         input_points_state = gr.State([])
         input_labels_state = gr.State([])
+
 
 
         gr.Markdown(title)
@@ -510,14 +611,20 @@ def create_ui():
             with gr.Column(scale=1.0):
                 with gr.Column(visible=False) as modules_not_need_gpt:
                     with gr.Tab("Base(GPT Power)",visible=False) as base_tab:
+                        image_intro=gr.HTML()
                         image_input_base = gr.Image(type="pil", interactive=True, elem_id="image_upload")
                         example_image = gr.Image(type="pil", interactive=False, visible=False)
-
-
+
                     with gr.Tab("Click") as click_tab:
-
+                        image_intro_click=gr.HTML()
                         image_input = gr.Image(type="pil", interactive=True, elem_id="image_upload")
                         example_image = gr.Image(type="pil", interactive=False, visible=False)
+                        with gr.Row(scale=1.0):
+                            focus_type = gr.Radio(
+                                choices=["Inside the Mark", "Around the Mark", "Outside the Mark"],
+                                value="Inside the Mark",
+                                label="Focus Type",
+                                interactive=True)
                         with gr.Row(scale=1.0):
                             with gr.Row(scale=0.4):
                                 point_prompt = gr.Radio(
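The new Focus Type control is a standard `gr.Radio`; its current value later reaches `submit_caption` through the `inputs=[...]` list wired further down. A minimal, self-contained Gradio sketch of the same pattern (component and function names here are illustrative, not the app's):

import gradio as gr

def echo_focus(focus_type):
    # In app.py the selected value is forwarded to submit_caption instead.
    return f"Selected focus: {focus_type}"

with gr.Blocks() as demo:
    focus = gr.Radio(
        choices=["Inside the Mark", "Around the Mark", "Outside the Mark"],
        value="Inside the Mark",
        label="Focus Type",
        interactive=True,
    )
    out = gr.Textbox(label="Result")
    focus.change(echo_focus, inputs=[focus], outputs=[out])

# demo.launch()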
@@ -572,6 +679,7 @@ def create_ui():
                                 value="No",
                                 label="Enable Wiki",
                                 interactive=True)
+
                 # with gr.Column(visible=True) as modules_not_need_gpt3:
                 gr.Examples(
                     examples=examples,
@@ -708,29 +816,29 @@ def create_ui():
 
         image_input.clear(clear_chat_memory, inputs=[visual_chatgpt])
 
-
-            [chatbot, state, origin_image, click_state, image_input_base, sketcher_input,
-             image_embedding, original_size, input_size])
-
-
-            [chatbot, state, origin_image, click_state, image_input, sketcher_input,
-             image_embedding, original_size, input_size])
-
-
-
+
+        image_input_base.upload(upload_callback, [image_input_base, state, visual_chatgpt,openai_api_key],
+                                [chatbot, state, origin_image, click_state, image_input, image_input_base, sketcher_input,
+                                 image_embedding, original_size, input_size,image_intro,image_intro_click,paragraph])
+
+        image_input.upload(upload_callback, [image_input, state, visual_chatgpt, openai_api_key],
+                           [chatbot, state, origin_image, click_state, image_input, image_input_base, sketcher_input,
+                            image_embedding, original_size, input_size,image_intro,image_intro_click,paragraph])
+        sketcher_input.upload(upload_callback, [sketcher_input, state, visual_chatgpt, openai_api_key],
+                              [chatbot, state, origin_image, click_state, image_input, image_input_base, sketcher_input,
+                               image_embedding, original_size, input_size,image_intro,image_intro_click,paragraph])
         chat_input.submit(chat_input_callback, [visual_chatgpt, chat_input, click_state, state, aux_state],
                           [chatbot, state, aux_state])
         chat_input.submit(lambda: "", None, chat_input)
         submit_button_text.click(chat_input_callback, [visual_chatgpt, chat_input, click_state, state, aux_state],
                                  [chatbot, state, aux_state])
         submit_button_text.click(lambda: "", None, chat_input)
-        example_image.change(upload_callback, [example_image, state, visual_chatgpt],
-            [chatbot, state, origin_image, click_state, image_input, sketcher_input,
-             image_embedding, original_size, input_size])
-
-            [chatbot, state, origin_image, click_state, image_input_base, sketcher_input,
-             image_embedding, original_size, input_size])
+        example_image.change(upload_callback, [example_image, state, visual_chatgpt, openai_api_key],
+                             [chatbot, state, origin_image, click_state, image_input, image_input_base, sketcher_input,
+                              image_embedding, original_size, input_size,image_intro,image_intro_click,paragraph])
+
         example_image.change(clear_chat_memory, inputs=[visual_chatgpt])
 
     def on_click_tab_selected():
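Each `upload_callback` binding above now lists 13 outputs, and that list has to line up position by position with the 13 values the function returns (`state, state, image_input, click_state, image_input, image_input, image_input, image_embedding, original_size, input_size, artwork_info, artwork_info, paragraph`). A reduced, self-contained sketch of that positional contract, with hypothetical component names:

import gradio as gr

def on_upload(image, who_uploaded):
    # Return values map positionally onto the outputs=[...] list below.
    summary = f"Got an image from {who_uploaded}"
    return summary, image

with gr.Blocks() as demo:
    uploader = gr.Image(type="pil", label="Upload")
    preview = gr.Image(type="pil", label="Preview")
    info = gr.Textbox(label="Info")
    source = gr.State("uploader")

    # 2 return values -> 2 outputs, in the same order.
    uploader.upload(on_upload, inputs=[uploader, source], outputs=[info, preview])

# demo.launch()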
@@ -776,7 +884,7 @@ def create_ui():
             inputs=[
                 image_input, state, generated_caption, text_refiner, visual_chatgpt, enable_wiki, length, sentiment, factuality, language,
                 out_state, click_index_state, input_mask_state, input_points_state, input_labels_state,
-                input_text, input_language, input_audio, input_mic, use_mic, agree
+                input_text, input_language, input_audio, input_mic, use_mic, agree,paragraph,focus_type,openai_api_key
             ],
             outputs=[
                 chatbot, state, image_input, click_index_state, input_mask_state, input_points_state, input_labels_state, out_state,