Niki Zhang committed on
Commit 8708def · verified · 1 Parent(s): c9593df

Update app.py

Files changed (1)
  1. app.py +207 -150
app.py CHANGED
@@ -37,8 +37,9 @@ import requests
37
  import spaces
38
  # Print the current version of LangChain
39
  print(f"Current LangChain version: {__version__}")
40
-
41
  print("testing testing")
 
 
42
  # import tts
43
 
44
  ###############################################################################
@@ -94,21 +95,6 @@ from huggingface_hub import hf_hub_download
94
 
95
 
96
 
97
- # import logging
98
-
99
- # logging.basicConfig(level=logging.DEBUG)
100
-
101
- # logger = logging.getLogger(__name__)
102
-
103
- # def my_function(input_text):
104
- # logger.info(f'Received input: {input_text}')
105
- # return "Output: " + input_text
106
-
107
- import sys
108
-
109
- # Set unbuffered output
110
- sys.stdout = os.fdopen(sys.stdout.fileno(), 'w', buffering=1)
111
- sys.stderr = os.fdopen(sys.stderr.fileno(), 'w', buffering=1)
112
 
113
  # def get_render_cameras(batch_size=1, M=120, radius=2.5, elevation=10.0, is_flexicubes=False):
114
  # """
@@ -385,97 +371,97 @@ def infer(image_path):
385
  ############# this part is for text to image #############
386
  ###############################################################################
387
 
388
- # Use environment variables for flexibility
389
  MODEL_ID = os.getenv("MODEL_ID", "sd-community/sdxl-flash")
390
  MAX_IMAGE_SIZE = int(os.getenv("MAX_IMAGE_SIZE", "4096"))
391
  USE_TORCH_COMPILE = os.getenv("USE_TORCH_COMPILE", "0") == "1"
392
  ENABLE_CPU_OFFLOAD = os.getenv("ENABLE_CPU_OFFLOAD", "0") == "1"
393
  BATCH_SIZE = int(os.getenv("BATCH_SIZE", "1")) # Allow generating multiple images at once
394
 
395
- # Determine device and load model outside of function for efficiency
396
- device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
397
- pipe = StableDiffusionXLPipeline.from_pretrained(
398
- MODEL_ID,
399
- torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
400
- use_safetensors=True,
401
- add_watermarker=False,
402
- ).to(device)
403
- pipe.scheduler = EulerAncestralDiscreteScheduler.from_config(pipe.scheduler.config)
404
 
405
- # Torch compile for potential speedup (experimental)
406
- if USE_TORCH_COMPILE:
407
- pipe.compile()
408
 
409
- # CPU offloading for larger RAM capacity (experimental)
410
- if ENABLE_CPU_OFFLOAD:
411
- pipe.enable_model_cpu_offload()
412
 
413
  MAX_SEED = np.iinfo(np.int32).max
414
 
415
- def save_image(img):
416
- unique_name = str(uuid.uuid4()) + ".png"
417
- img.save(unique_name)
418
- return unique_name
419
 
420
- def randomize_seed_fn(seed: int, randomize_seed: bool) -> int:
421
- if randomize_seed:
422
- seed = random.randint(0, MAX_SEED)
423
- return seed
424
 
425
  # @spaces.GPU(duration=30, queue=False)
426
- def generate(
427
- prompt: str,
428
- negative_prompt: str = "",
429
- use_negative_prompt: bool = False,
430
- seed: int = 1,
431
- width: int = 200,
432
- height: int = 200,
433
- guidance_scale: float = 3,
434
- num_inference_steps: int = 30,
435
- randomize_seed: bool = False,
436
- num_images: int = 4, # Number of images to generate
437
- use_resolution_binning: bool = True,
438
- progress=gr.Progress(track_tqdm=True),
439
- ):
440
- seed = int(randomize_seed_fn(seed, randomize_seed))
441
- generator = torch.Generator(device=device).manual_seed(seed)
442
-
443
- # Improved options handling
444
- options = {
445
- "prompt": [prompt] * num_images,
446
- "negative_prompt": [negative_prompt] * num_images if use_negative_prompt else None,
447
- "width": width,
448
- "height": height,
449
- "guidance_scale": guidance_scale,
450
- "num_inference_steps": num_inference_steps,
451
- "generator": generator,
452
- "output_type": "pil",
453
- }
454
-
455
- # Use resolution binning for faster generation with less VRAM usage
456
- # if use_resolution_binning:
457
- # options["use_resolution_binning"] = True
458
-
459
- # Generate images potentially in batches
460
- images = []
461
- for i in range(0, num_images, BATCH_SIZE):
462
- batch_options = options.copy()
463
- batch_options["prompt"] = options["prompt"][i:i+BATCH_SIZE]
464
- if "negative_prompt" in batch_options:
465
- batch_options["negative_prompt"] = options["negative_prompt"][i:i+BATCH_SIZE]
466
- images.extend(pipe(**batch_options).images)
467
-
468
- image_paths = [save_image(img) for img in images]
469
- return image_paths, seed
470
-
471
- examples = [
472
- "a cat eating a piece of cheese",
473
- "a ROBOT riding a BLUE horse on Mars, photorealistic, 4k",
474
- "Ironman VS Hulk, ultrarealistic",
475
- "Astronaut in a jungle, cold color palette, oil pastel, detailed, 8k",
476
- "An alien holding a sign board containing the word 'Flash', futuristic, neonpunk",
477
- "Kids going to school, Anime style"
478
- ]
479
 
480
 
481
 
@@ -485,6 +471,8 @@ examples = [
485
  ###############################################################################
486
 
487
 
 
 
488
  css = """
489
  #warning {background-color: #FFCCCB}
490
  .tools_button {
@@ -492,6 +480,18 @@ css = """
492
  border: none !important;
493
  box-shadow: none !important;
494
  }
495
  #tool_box {max-width: 50px}
496
 
497
  """
@@ -547,18 +547,48 @@ args = parse_augment()
547
  args.segmenter = "huge"
548
  args.segmenter_checkpoint = "sam_vit_h_4b8939.pth"
549
  args.clip_filter = True
550
- if args.segmenter_checkpoint is None:
551
- _, segmenter_checkpoint = prepare_segmenter(args.segmenter)
552
- else:
553
- segmenter_checkpoint = args.segmenter_checkpoint
554
-
555
- shared_captioner = build_captioner(args.captioner, args.device, args)
556
- shared_sam_model = sam_model_registry[seg_model_map[args.segmenter]](checkpoint=segmenter_checkpoint).to(args.device)
557
- ocr_lang = ["ch_tra", "en"]
558
- shared_ocr_reader = easyocr.Reader(ocr_lang)
559
- tools_dict = {e.split('_')[0].strip(): e.split('_')[1].strip() for e in args.chat_tools_dict.split(',')}
560
- shared_chatbot_tools = build_chatbot_tools(tools_dict)
561
 
562
 
563
  # class ImageSketcher(gr.Image):
564
  # """
@@ -595,15 +625,15 @@ def build_caption_anything_with_models(args, api_key="", captioner=None, sam_mod
595
 
596
  def validate_api_key(api_key):
597
  api_key = str(api_key).strip()
598
- print(api_key, flush=True)
599
  try:
600
  test_llm = ChatOpenAI(model_name="gpt-4o", temperature=0, openai_api_key=api_key)
601
- print("test_llm", flush=True)
602
  response = test_llm([HumanMessage(content='Hello')])
603
- print(response, flush=True)
604
  return True
605
  except Exception as e:
606
- print(f"API key validation failed: {e}", flush=True)
607
  return False
608
 
609
 
@@ -612,23 +642,23 @@ def init_openai_api_key(api_key=""):
612
  text_refiner = None
613
  visual_chatgpt = None
614
  if api_key and len(api_key) > 30:
615
- print(api_key, flush=True)
616
  if validate_api_key(api_key):
617
  try:
618
  # text_refiner = build_text_refiner(args.text_refiner, args.device, args, api_key)
619
  # assert len(text_refiner.llm('hi')) > 0 # test
620
  text_refiner = None
621
- print("text refiner", flush=True)
622
  visual_chatgpt = ConversationBot(shared_chatbot_tools, api_key=api_key)
623
  except Exception as e:
624
- print(f"Error initializing TextRefiner or ConversationBot: {e}", flush=True)
625
  text_refiner = None
626
  visual_chatgpt = None
627
  else:
628
- print("Invalid API key.", flush=True)
629
  else:
630
- print("API key is too short.", flush=True)
631
- print(text_refiner, flush=True)
632
  openai_available = text_refiner is not None
633
  if visual_chatgpt:
634
 
@@ -704,7 +734,8 @@ async def chat_input_callback(*args):
704
 
705
 
706
 
707
- def upload_callback(image_input, state, visual_chatgpt=None, openai_api_key=None,language="English"):
 
708
  if isinstance(image_input, dict): # if upload from sketcher_input, input contains image and mask
709
  image_input = image_input['background']
710
 
@@ -748,13 +779,29 @@ def upload_callback(image_input, state, visual_chatgpt=None, openai_api_key=None
748
  name, artist, year, material= parsed_data["name"],parsed_data["artist"],parsed_data["year"], parsed_data["style"]
749
  # artwork_info = f"<div>Painting: {name}<br>Artist name: {artist}<br>Year: {year}<br>Material: {material}</div>"
750
 
751
-
752
- state = [
753
- (
754
- None,
755
- f"🤖 Hi, I am EyeSee. Let's explore this painting {name} together. You can click on the area you're interested in and choose from four types of information: Description, Analysis, Interpretation, and Judgment. Based on your selection, I will provide you with the relevant information."
756
- )
757
- ]
758
 
759
  return [state, state, image_input, click_state, image_input, image_input, image_input, image_input, image_embedding, \
760
  original_size, input_size] + [f"Name: {name}", f"Artist: {artist}", f"Year: {year}", f"Style: {material}"]*4 + [paragraph,artist]
@@ -795,9 +842,9 @@ def inference_click(image_input, point_prompt, click_mode, enable_wiki, language
795
 
796
  enable_wiki = True if enable_wiki in ['True', 'TRUE', 'true', True, 'Yes', 'YES', 'yes'] else False
797
  out = model.inference(image_input, prompt, controls, disable_gpt=True, enable_wiki=enable_wiki, verbose=True, args={'clip_filter': False})[0]
798
- # state = state + [("You've selected image point at {}, ".format(prompt["input_point"]), None)]
799
 
800
- state = state + [("Image point: {}, Input label: {}".format(prompt["input_point"], prompt["input_label"]), None)]
801
  update_click_state(click_state, out['generated_captions']['raw_caption'], click_mode)
802
  text = out['generated_captions']['raw_caption']
803
  input_mask = np.array(out['mask'].convert('P'))
@@ -823,11 +870,20 @@ def inference_click(image_input, point_prompt, click_mode, enable_wiki, language
823
  yield state, state, click_state, image_input_nobackground, click_index_state, input_mask_state, input_points_state, input_labels_state, out_state,new_crop_save_path,image_input_nobackground
824
 
825
 
826
 
827
  async def submit_caption(state,length, sentiment, factuality, language,
828
  out_state, click_index_state, input_mask_state, input_points_state, input_labels_state,
829
  autoplay,paragraph,focus_type,openai_api_key,new_crop_save_path):
830
- print("state",state)
 
 
831
 
832
  click_index = click_index_state
833
 
@@ -1110,8 +1166,7 @@ def clear_chat_memory(visual_chatgpt, keep_global=False):
1110
  visual_chatgpt.point_prompt = ""
1111
  if keep_global:
1112
  # visual_chatgpt.agent.memory.buffer = visual_chatgpt.global_prompt
1113
- visual_chatgpt.agent.memory.save_context({"input": visual_chatgpt.global_prompt}, {"output": None})
1114
- print("test")
1115
  else:
1116
  visual_chatgpt.current_image = None
1117
  visual_chatgpt.global_prompt = ""
@@ -1345,10 +1400,10 @@ def print_like_dislike(x: gr.LikeData,like_res,dislike_res,state):
1345
 
1346
  def toggle_icons_and_update_prompt(point_prompt):
1347
  new_prompt = "Negative" if point_prompt == "Positive" else "Positive"
1348
- new_add_icon = "assets/icons/plus-square-blue.png" if point_prompt == "Positive" else "assets/icons/plus-square.png"
1349
- new_minus_icon = "assets/icons/minus-square.png" if point_prompt == "Positive" else "assets/icons/minus-square-blue.png"
1350
- print(point_prompt)
1351
- print(new_prompt)
1352
 
1353
  return new_prompt, gr.update(icon=new_add_icon), gr.update(icon=new_minus_icon)
1354
 
@@ -1358,6 +1413,7 @@ minus_icon_path="assets/icons/minus-square.png"
1358
  print("this is a print test")
1359
 
1360
  def create_ui():
 
1361
  title = """<p><h1 align="center">EyeSee Anything in Art</h1></p>
1362
  """
1363
  description = """<p>Gradio demo for EyeSee Anything in Art, image to dense captioning generation with various language styles. To use it, simply upload your image, or click one of the examples to load them. """
@@ -1453,38 +1509,38 @@ def create_ui():
1453
  with gr.Tab("Base(GPT Power)") as base_tab:
1454
  image_input_base = gr.Image(type="pil", interactive=True, elem_id="image_upload")
1455
  with gr.Row():
1456
- name_label_base = gr.Button(value="Name: ")
1457
- artist_label_base = gr.Button(value="Artist: ")
1458
- year_label_base = gr.Button(value="Year: ")
1459
- material_label_base = gr.Button(value="Style: ")
1460
 
1461
  with gr.Tab("Base2") as base_tab2:
1462
  image_input_base_2 = gr.Image(type="pil", interactive=True, elem_id="image_upload")
1463
  with gr.Row():
1464
- name_label_base2 = gr.Button(value="Name: ")
1465
- artist_label_base2 = gr.Button(value="Artist: ")
1466
- year_label_base2 = gr.Button(value="Year: ")
1467
- material_label_base2 = gr.Button(value="Style: ")
1468
 
1469
  with gr.Tab("Click") as click_tab:
1470
  with gr.Row():
1471
- with gr.Column(scale=10,min_width=450):
1472
  image_input = gr.Image(type="pil", interactive=True, elem_id="image_upload")
1473
  example_image = gr.Image(type="pil", interactive=False, visible=False)
1474
  with gr.Row():
1475
- name_label = gr.Button(value="Name: ")
1476
- artist_label = gr.Button(value="Artist: ")
1477
- year_label = gr.Button(value="Year: ")
1478
- material_label = gr.Button(value="Style: ")
1479
 
1480
 
1481
  # example_image_click = gr.Image(type="pil", interactive=False, visible=False)
1482
  # the tool column
1483
- with gr.Column(scale=1,elem_id="tool_box",min_width=100):
1484
  add_button = gr.Button(value="", interactive=True,elem_classes="tools_button",icon=add_icon_path)
1485
  minus_button = gr.Button(value="", interactive=True,elem_classes="tools_button",icon=minus_icon_path)
1486
  clear_button_click = gr.Button(value="Reset", interactive=True,elem_classes="tools_button")
1487
- clear_button_image = gr.Button(value="Change Image", interactive=True,elem_classes="tools_button")
1488
  focus_d = gr.Button(value="D",interactive=True,elem_classes="function_button")
1489
  focus_da = gr.Button(value="DA",interactive=True,elem_classes="function_button")
1490
  focus_dai = gr.Button(value="DAI",interactive=True,elem_classes="function_button")
@@ -2017,7 +2073,7 @@ def create_ui():
2017
 
2018
 
2019
 
2020
- image_input_base.upload(upload_callback, [image_input_base, state, visual_chatgpt,openai_api_key],
2021
  [chatbot, state, origin_image, click_state, image_input, image_input_base, sketcher_input,image_input_base_2,
2022
  image_embedding, original_size, input_size,name_label,artist_label,year_label,material_label,name_label_base, artist_label_base, year_label_base, material_label_base, \
2023
  name_label_base2, artist_label_base2, year_label_base2, material_label_base2,name_label_traj, artist_label_traj, year_label_traj, material_label_traj, \
@@ -2049,11 +2105,12 @@ def create_ui():
2049
  # image_embedding, original_size, input_size,name_label,artist_label,year_label,material_label,name_label_base, artist_label_base, year_label_base, material_label_base,paragraph,artist])
2050
  chat_input.submit(chat_input_callback, [visual_chatgpt, chat_input, click_state, state, aux_state,language,auto_play],
2051
  [chatbot, state, aux_state,output_audio])
2052
- chat_input.submit(lambda: "", None, chat_input)
 
2053
  # submit_button_text.click(chat_input_callback, [visual_chatgpt, chat_input, click_state, state, aux_state,language,auto_play],
2054
  # [chatbot, state, aux_state,output_audio])
2055
  # submit_button_text.click(lambda: "", None, chat_input)
2056
- example_image.change(upload_callback, [example_image, state, visual_chatgpt, openai_api_key],
2057
  [chatbot, state, origin_image, click_state, image_input, image_input_base, sketcher_input,image_input_base_2,
2058
  image_embedding, original_size, input_size,name_label,artist_label,year_label,material_label,name_label_base, artist_label_base, year_label_base, material_label_base, \
2059
  name_label_base2, artist_label_base2, year_label_base2, material_label_base2,name_label_traj, artist_label_traj, year_label_traj, material_label_traj, \
@@ -2220,7 +2277,7 @@ def create_ui():
2220
 
2221
 
2222
  if __name__ == '__main__':
2223
- # logger.info("Starting Gradio app")
2224
  iface = create_ui()
2225
  iface.queue(api_open=False, max_size=10)
2226
  # iface.queue(concurrency_count=5, api_open=False, max_size=10)
 
37
  import spaces
38
  # Print the current version of LangChain
39
  print(f"Current LangChain version: {__version__}")
 
40
  print("testing testing")
41
+
42
+
43
  # import tts
44
 
45
  ###############################################################################
 
95
 
96
 
97
 
98
 
99
  # def get_render_cameras(batch_size=1, M=120, radius=2.5, elevation=10.0, is_flexicubes=False):
100
  # """
 
371
  ############# this part is for text to image #############
372
  ###############################################################################
373
 
374
+ # # Use environment variables for flexibility
375
  MODEL_ID = os.getenv("MODEL_ID", "sd-community/sdxl-flash")
376
  MAX_IMAGE_SIZE = int(os.getenv("MAX_IMAGE_SIZE", "4096"))
377
  USE_TORCH_COMPILE = os.getenv("USE_TORCH_COMPILE", "0") == "1"
378
  ENABLE_CPU_OFFLOAD = os.getenv("ENABLE_CPU_OFFLOAD", "0") == "1"
379
  BATCH_SIZE = int(os.getenv("BATCH_SIZE", "1")) # Allow generating multiple images at once
380
 
381
+ # # Determine device and load model outside of function for efficiency
382
+ # device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
383
+ # pipe = StableDiffusionXLPipeline.from_pretrained(
384
+ # MODEL_ID,
385
+ # torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
386
+ # use_safetensors=True,
387
+ # add_watermarker=False,
388
+ # ).to(device)
389
+ # pipe.scheduler = EulerAncestralDiscreteScheduler.from_config(pipe.scheduler.config)
390
 
391
+ # # Torch compile for potential speedup (experimental)
392
+ # if USE_TORCH_COMPILE:
393
+ # pipe.compile()
394
 
395
+ # # CPU offloading for larger RAM capacity (experimental)
396
+ # if ENABLE_CPU_OFFLOAD:
397
+ # pipe.enable_model_cpu_offload()
398
 
399
  MAX_SEED = np.iinfo(np.int32).max
400
 
401
+ # def save_image(img):
402
+ # unique_name = str(uuid.uuid4()) + ".png"
403
+ # img.save(unique_name)
404
+ # return unique_name
405
 
406
+ # def randomize_seed_fn(seed: int, randomize_seed: bool) -> int:
407
+ # if randomize_seed:
408
+ # seed = random.randint(0, MAX_SEED)
409
+ # return seed
410
 
411
  # @spaces.GPU(duration=30, queue=False)
412
+ # def generate(
413
+ # prompt: str,
414
+ # negative_prompt: str = "",
415
+ # use_negative_prompt: bool = False,
416
+ # seed: int = 1,
417
+ # width: int = 200,
418
+ # height: int = 200,
419
+ # guidance_scale: float = 3,
420
+ # num_inference_steps: int = 30,
421
+ # randomize_seed: bool = False,
422
+ # num_images: int = 4, # Number of images to generate
423
+ # use_resolution_binning: bool = True,
424
+ # progress=gr.Progress(track_tqdm=True),
425
+ # ):
426
+ # seed = int(randomize_seed_fn(seed, randomize_seed))
427
+ # generator = torch.Generator(device=device).manual_seed(seed)
428
+
429
+ # # Improved options handling
430
+ # options = {
431
+ # "prompt": [prompt] * num_images,
432
+ # "negative_prompt": [negative_prompt] * num_images if use_negative_prompt else None,
433
+ # "width": width,
434
+ # "height": height,
435
+ # "guidance_scale": guidance_scale,
436
+ # "num_inference_steps": num_inference_steps,
437
+ # "generator": generator,
438
+ # "output_type": "pil",
439
+ # }
440
+
441
+ # # Use resolution binning for faster generation with less VRAM usage
442
+ # # if use_resolution_binning:
443
+ # # options["use_resolution_binning"] = True
444
+
445
+ # # Generate images potentially in batches
446
+ # images = []
447
+ # for i in range(0, num_images, BATCH_SIZE):
448
+ # batch_options = options.copy()
449
+ # batch_options["prompt"] = options["prompt"][i:i+BATCH_SIZE]
450
+ # if "negative_prompt" in batch_options:
451
+ # batch_options["negative_prompt"] = options["negative_prompt"][i:i+BATCH_SIZE]
452
+ # images.extend(pipe(**batch_options).images)
453
+
454
+ # image_paths = [save_image(img) for img in images]
455
+ # return image_paths, seed
456
+
457
+ # examples = [
458
+ # "a cat eating a piece of cheese",
459
+ # "a ROBOT riding a BLUE horse on Mars, photorealistic, 4k",
460
+ # "Ironman VS Hulk, ultrarealistic",
461
+ # "Astronaut in a jungle, cold color palette, oil pastel, detailed, 8k",
462
+ # "An alien holding a sign board containing the word 'Flash', futuristic, neonpunk",
463
+ # "Kids going to school, Anime style"
464
+ # ]
465
 
466
 
467
 
 
471
  ###############################################################################
472
 
473
 
474
+ print("4")
475
+
476
  css = """
477
  #warning {background-color: #FFCCCB}
478
  .tools_button {
 
480
  border: none !important;
481
  box-shadow: none !important;
482
  }
483
+
484
+ .info_btn {
485
+ background: white;
486
+ border: none !important;
487
+ box-shadow: none !important;
488
+ }
489
+
490
+ .function_button {
491
+ border: none !important;
492
+ box-shadow: none !important;
493
+ }
494
+
495
  #tool_box {max-width: 50px}
496
 
497
  """
 
547
  args.segmenter = "huge"
548
  args.segmenter_checkpoint = "sam_vit_h_4b8939.pth"
549
  args.clip_filter = True
550
 
551
+ try:
552
+ print("Before preparing segmenter")
553
+ if args.segmenter_checkpoint is None:
554
+ _, segmenter_checkpoint = prepare_segmenter(args.segmenter)
555
+ else:
556
+ segmenter_checkpoint = args.segmenter_checkpoint
557
+ print("After preparing segmenter")
558
+ except Exception as e:
559
+ print(f"Error in preparing segmenter: {e}")
560
+
561
+ try:
562
+ print("Before building captioner")
563
+ shared_captioner = build_captioner(args.captioner, args.device, args)
564
+ print("After building captioner")
565
+ except Exception as e:
566
+ print(f"Error in building captioner: {e}")
567
+
568
+ try:
569
+ print("Before loading SAM model")
570
+ shared_sam_model = sam_model_registry[seg_model_map[args.segmenter]](checkpoint=segmenter_checkpoint).to(args.device)
571
+ print("After loading SAM model")
572
+ except Exception as e:
573
+ print(f"Error in loading SAM model: {e}")
574
+
575
+ try:
576
+ print("Before initializing OCR reader")
577
+ ocr_lang = ["ch_tra", "en"]
578
+ shared_ocr_reader = easyocr.Reader(ocr_lang,model_storage_directory=".EasyOCR/model")
579
+ print("After initializing OCR reader")
580
+ except Exception as e:
581
+ print(f"Error in initializing OCR reader: {e}")
582
+
583
+ try:
584
+ print("Before building chatbot tools")
585
+ tools_dict = {e.split('_')[0].strip(): e.split('_')[1].strip() for e in args.chat_tools_dict.split(',')}
586
+ shared_chatbot_tools = build_chatbot_tools(tools_dict)
587
+ print("After building chatbot tools")
588
+ except Exception as e:
589
+ print(f"Error in building chatbot tools: {e}")
590
+
591
+ print(5)
592
 
593
  # class ImageSketcher(gr.Image):
594
  # """
 
625
 
626
  def validate_api_key(api_key):
627
  api_key = str(api_key).strip()
628
+ print(api_key)
629
  try:
630
  test_llm = ChatOpenAI(model_name="gpt-4o", temperature=0, openai_api_key=api_key)
631
+ print("test_llm")
632
  response = test_llm([HumanMessage(content='Hello')])
633
+ print(response)
634
  return True
635
  except Exception as e:
636
+ print(f"API key validation failed: {e}")
637
  return False
638
 
639
 
 
642
  text_refiner = None
643
  visual_chatgpt = None
644
  if api_key and len(api_key) > 30:
645
+ print(api_key)
646
  if validate_api_key(api_key):
647
  try:
648
  # text_refiner = build_text_refiner(args.text_refiner, args.device, args, api_key)
649
  # assert len(text_refiner.llm('hi')) > 0 # test
650
  text_refiner = None
651
+ print("text refiner")
652
  visual_chatgpt = ConversationBot(shared_chatbot_tools, api_key=api_key)
653
  except Exception as e:
654
+ print(f"Error initializing TextRefiner or ConversationBot: {e}")
655
  text_refiner = None
656
  visual_chatgpt = None
657
  else:
658
+ print("Invalid API key.")
659
  else:
660
+ print("API key is too short.")
661
+ print(text_refiner)
662
  openai_available = text_refiner is not None
663
  if visual_chatgpt:
664
 
 
734
 
735
 
736
 
737
+ def upload_callback(image_input, state, visual_chatgpt=None, openai_api_key=None,language="English",narritive=None):
738
+ print("narritive", narritive)
739
  if isinstance(image_input, dict): # if upload from sketcher_input, input contains image and mask
740
  image_input = image_input['background']
741
 
 
779
  name, artist, year, material= parsed_data["name"],parsed_data["artist"],parsed_data["year"], parsed_data["style"]
780
  # artwork_info = f"<div>Painting: {name}<br>Artist name: {artist}<br>Year: {year}<br>Material: {material}</div>"
781
 
782
+ if narritive==None or narritive=="Third":
783
+ state = [
784
+ (
785
+ None,
786
+ f"🤖 Hi, I am EyeSee. Let's explore this painting '{name}' together. You can click on the area you're interested in and choose from four types of information: Description, Analysis, Interpretation, and Judgment. Based on your selection, I will provide you with the relevant information."
787
+ )
788
+ ]
789
+ elif narritive=="Artist":
790
+ state = [
791
+ (
792
+ None,
793
+ f"🧑‍🎨 Hello, I am the {artist}. Welcome to explore my painting, '{name}'. You can click on the area you're interested in and choose from four types of information: Description, Analysis, Interpretation, and Judgment. Based on your selection, I will provide you with the relevant insights and thoughts behind my creation."
794
+ )
795
+ ]
796
+ elif narritive=="Item":
797
+ state = [
798
+ (
799
+ None,
800
+ f"🎨 Hello, I am the Item. Let's explore this painting '{name}' together. You can click on the area you're interested in and choose from four types of information: Description, Analysis, Interpretation, and Judgment. Based on your selection, I will provide you with the relevant insights and thoughts behind my creation."
801
+ )
802
+ ]
803
+
804
+
805
 
806
  return [state, state, image_input, click_state, image_input, image_input, image_input, image_input, image_embedding, \
807
  original_size, input_size] + [f"Name: {name}", f"Artist: {artist}", f"Year: {year}", f"Style: {material}"]*4 + [paragraph,artist]
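The narrative-specific openings above can also live in a lookup table, which keeps upload_callback short as more personas are added. A sketch under the assumption that narritive is one of None, "Third", "Artist", or "Item" (GREETINGS and opening_state are hypothetical names; templates shortened):

GREETINGS = {
    "Third":  "🤖 Hi, I am EyeSee. Let's explore this painting '{name}' together.",
    "Artist": "🧑‍🎨 Hello, I am the {artist}. Welcome to explore my painting, '{name}'.",
    "Item":   "🎨 Hello, I am the Item. Let's explore this painting '{name}' together.",
}

def opening_state(narritive, name, artist):
    # None falls back to the third-person narrator, mirroring the if/elif chain above.
    template = GREETINGS.get(narritive or "Third", GREETINGS["Third"])
    return [(None, template.format(name=name, artist=artist))]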
 
842
 
843
  enable_wiki = True if enable_wiki in ['True', 'TRUE', 'true', True, 'Yes', 'YES', 'yes'] else False
844
  out = model.inference(image_input, prompt, controls, disable_gpt=True, enable_wiki=enable_wiki, verbose=True, args={'clip_filter': False})[0]
845
+ state = state + [("You've selected image point at {}, ".format(prompt["input_point"]), None)]
846
 
847
+ # state = state + [("Image point: {}, Input label: {}".format(prompt["input_point"], prompt["input_label"]), None)]
848
  update_click_state(click_state, out['generated_captions']['raw_caption'], click_mode)
849
  text = out['generated_captions']['raw_caption']
850
  input_mask = np.array(out['mask'].convert('P'))
 
870
  yield state, state, click_state, image_input_nobackground, click_index_state, input_mask_state, input_points_state, input_labels_state, out_state,new_crop_save_path,image_input_nobackground
871
 
872
 
873
+ query_focus = {
874
+ "D": "Provide a description of the item.",
875
+ "DA": "Provide a description and analysis of the item.",
876
+ "DAI": "Provide a description, analysis, and interpretation of the item.",
877
+ "DDA": "Evaluate the item."
878
+ }
879
+
880
 
881
  async def submit_caption(state,length, sentiment, factuality, language,
882
  out_state, click_index_state, input_mask_state, input_points_state, input_labels_state,
883
  autoplay,paragraph,focus_type,openai_api_key,new_crop_save_path):
884
+
885
+
886
+ state = state + [(query_focus[focus_type], None)]
887
 
888
  click_index = click_index_state
889
 
 
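submit_caption indexes query_focus directly, so an unexpected focus_type would raise a KeyError. A defensive variant (a sketch, not the committed code) that falls back to the plain description query:

query_focus = {
    "D": "Provide a description of the item.",
    "DA": "Provide a description and analysis of the item.",
    "DAI": "Provide a description, analysis, and interpretation of the item.",
    "DDA": "Evaluate the item.",
}

def focus_query(focus_type):
    # Unknown focus types fall back to "D" instead of raising a KeyError.
    return query_focus.get(focus_type, query_focus["D"])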
1166
  visual_chatgpt.point_prompt = ""
1167
  if keep_global:
1168
  # visual_chatgpt.agent.memory.buffer = visual_chatgpt.global_prompt
1169
+ visual_chatgpt.agent.memory.save_context({"input": visual_chatgpt.global_prompt}, {"output": ""})
 
1170
  else:
1171
  visual_chatgpt.current_image = None
1172
  visual_chatgpt.global_prompt = ""
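save_context stores both sides of the exchange as message text, so the output value should be a string; the empty string above records the global prompt without fabricating a model reply. A minimal, self-contained illustration with ConversationBufferMemory (the agent's actual memory class may differ):

from langchain.memory import ConversationBufferMemory

memory = ConversationBufferMemory()
# The output must be text; an empty string keeps the prompt in memory with no AI reply attached.
memory.save_context({"input": "global prompt goes here"}, {"output": ""})
print(memory.buffer)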
 
1400
 
1401
  def toggle_icons_and_update_prompt(point_prompt):
1402
  new_prompt = "Negative" if point_prompt == "Positive" else "Positive"
1403
+ new_add_icon = "assets/icons/plus-square-blue.png" if new_prompt == "Positive" else "assets/icons/plus-square.png"
1404
+ new_minus_icon = "assets/icons/minus-square.png" if new_prompt == "Positive" else "assets/icons/minus-square-blue.png"
1405
+ print(point_prompt,flush=True)
1406
+ print(new_prompt,flush=True)
1407
 
1408
  return new_prompt, gr.update(icon=new_add_icon), gr.update(icon=new_minus_icon)
1409
 
 
1413
  print("this is a print test")
1414
 
1415
  def create_ui():
1416
+ print(6)
1417
  title = """<p><h1 align="center">EyeSee Anything in Art</h1></p>
1418
  """
1419
  description = """<p>Gradio demo for EyeSee Anything in Art, image to dense captioning generation with various language styles. To use it, simply upload your image, or click one of the examples to load them. """
 
1509
  with gr.Tab("Base(GPT Power)") as base_tab:
1510
  image_input_base = gr.Image(type="pil", interactive=True, elem_id="image_upload")
1511
  with gr.Row():
1512
+ name_label_base = gr.Button(value="Name: ",elem_classes="info_btn")
1513
+ artist_label_base = gr.Button(value="Artist: ",elem_classes="info_btn")
1514
+ year_label_base = gr.Button(value="Year: ",elem_classes="info_btn")
1515
+ material_label_base = gr.Button(value="Style: ",elem_classes="info_btn")
1516
 
1517
  with gr.Tab("Base2") as base_tab2:
1518
  image_input_base_2 = gr.Image(type="pil", interactive=True, elem_id="image_upload")
1519
  with gr.Row():
1520
+ name_label_base2 = gr.Button(value="Name: ",elem_classes="info_btn")
1521
+ artist_label_base2 = gr.Button(value="Artist: ",elem_classes="info_btn")
1522
+ year_label_base2 = gr.Button(value="Year: ",elem_classes="info_btn")
1523
+ material_label_base2 = gr.Button(value="Style: ",elem_classes="info_btn")
1524
 
1525
  with gr.Tab("Click") as click_tab:
1526
  with gr.Row():
1527
+ with gr.Column(scale=10,min_width=600):
1528
  image_input = gr.Image(type="pil", interactive=True, elem_id="image_upload")
1529
  example_image = gr.Image(type="pil", interactive=False, visible=False)
1530
  with gr.Row():
1531
+ name_label = gr.Button(value="Name: ",elem_classes="info_btn")
1532
+ artist_label = gr.Button(value="Artist: ",elem_classes="info_btn")
1533
+ year_label = gr.Button(value="Year: ",elem_classes="info_btn")
1534
+ material_label = gr.Button(value="Style: ",elem_classes="info_btn")
1535
 
1536
 
1537
  # example_image_click = gr.Image(type="pil", interactive=False, visible=False)
1538
  # the tool column
1539
+ with gr.Column(scale=1,elem_id="tool_box",min_width=80):
1540
  add_button = gr.Button(value="", interactive=True,elem_classes="tools_button",icon=add_icon_path)
1541
  minus_button = gr.Button(value="", interactive=True,elem_classes="tools_button",icon=minus_icon_path)
1542
  clear_button_click = gr.Button(value="Reset", interactive=True,elem_classes="tools_button")
1543
+ clear_button_image = gr.Button(value="Change", interactive=True,elem_classes="tools_button")
1544
  focus_d = gr.Button(value="D",interactive=True,elem_classes="function_button")
1545
  focus_da = gr.Button(value="DA",interactive=True,elem_classes="function_button")
1546
  focus_dai = gr.Button(value="DAI",interactive=True,elem_classes="function_button")
 
2073
 
2074
 
2075
 
2076
+ image_input_base.upload(upload_callback, [image_input_base, state, visual_chatgpt,openai_api_key,language,naritive],
2077
  [chatbot, state, origin_image, click_state, image_input, image_input_base, sketcher_input,image_input_base_2,
2078
  image_embedding, original_size, input_size,name_label,artist_label,year_label,material_label,name_label_base, artist_label_base, year_label_base, material_label_base, \
2079
  name_label_base2, artist_label_base2, year_label_base2, material_label_base2,name_label_traj, artist_label_traj, year_label_traj, material_label_traj, \
 
2105
  # image_embedding, original_size, input_size,name_label,artist_label,year_label,material_label,name_label_base, artist_label_base, year_label_base, material_label_base,paragraph,artist])
2106
  chat_input.submit(chat_input_callback, [visual_chatgpt, chat_input, click_state, state, aux_state,language,auto_play],
2107
  [chatbot, state, aux_state,output_audio])
2108
+ # chat_input.submit(lambda: "", None, chat_input)
2109
+ chat_input.submit(lambda: {"text": ""}, None, chat_input)
2110
  # submit_button_text.click(chat_input_callback, [visual_chatgpt, chat_input, click_state, state, aux_state,language,auto_play],
2111
  # [chatbot, state, aux_state,output_audio])
2112
  # submit_button_text.click(lambda: "", None, chat_input)
2113
+ example_image.change(upload_callback, [example_image, state, visual_chatgpt, openai_api_key,language,naritive],
2114
  [chatbot, state, origin_image, click_state, image_input, image_input_base, sketcher_input,image_input_base_2,
2115
  image_embedding, original_size, input_size,name_label,artist_label,year_label,material_label,name_label_base, artist_label_base, year_label_base, material_label_base, \
2116
  name_label_base2, artist_label_base2, year_label_base2, material_label_base2,name_label_traj, artist_label_traj, year_label_traj, material_label_traj, \
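Clearing chat_input with a bare empty string only works for a plain Textbox; if chat_input is a gr.MultimodalTextbox its value is a dict, which is presumably why the clear callback above returns {"text": ""}. A small self-contained sketch of that pattern (component names here are illustrative, and Gradio 4.x is assumed):

import gradio as gr

with gr.Blocks() as demo:
    chat_box = gr.MultimodalTextbox(placeholder="Ask about the painting...")
    # MultimodalTextbox values are dicts ({"text": ..., "files": [...]}),
    # so clearing the box means returning a dict with an empty text field.
    chat_box.submit(lambda: {"text": "", "files": []}, None, chat_box)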
 
2277
 
2278
 
2279
  if __name__ == '__main__':
2280
+ print("main")
2281
  iface = create_ui()
2282
  iface.queue(api_open=False, max_size=10)
2283
  # iface.queue(concurrency_count=5, api_open=False, max_size=10)