Niki Zhang committed: Update app.py

Add painting introduction part
Add Focus Type

app.py CHANGED
@@ -1,4 +1,5 @@
 import os
+import base64
 import json
 import gradio as gr
 import numpy as np
@@ -117,9 +118,12 @@ def init_openai_api_key(api_key=""):
         gpt_state=1
         return [gr.update(visible=True)]+[gr.update(visible=False)]+[gr.update(visible=True)]*3+[gr.update(visible=False)]+ [gr.update(visible=True)]+ [gr.update(visible=False)]*2 + [text_refiner, visual_chatgpt, None]
     else:
+        gpt_state=0
         return [gr.update(visible=False)]*7 + [gr.update(visible=True)]*2 + [text_refiner, visual_chatgpt, 'Your OpenAI API Key is not available']
 
 def init_wo_openai_api_key():
+    global gpt_state
+    gpt_state=0
     return [gr.update(visible=False)]*4 + [gr.update(visible=True)]+ [gr.update(visible=False)]+[gr.update(visible=True)]+[gr.update(visible=False)]*2 + [None, None, None]
 
 def get_click_prompt(chat_input, click_state, click_mode):
@@ -169,14 +173,13 @@ def chat_input_callback(*args):
 
 
 
-def upload_callback(image_input, state, visual_chatgpt=None):
-
+def upload_callback(image_input, state, visual_chatgpt=None, openai_api_key=None):
     if isinstance(image_input, dict):  # if upload from sketcher_input, input contains image and mask
         image_input, mask = image_input['image'], image_input['mask']
 
     click_state = [[], [], []]
     image_input = image_resize(image_input, res=1024)
-
+
     model = build_caption_anything_with_models(
         args,
         api_key="",
@@ -189,7 +192,7 @@ def upload_callback(image_input, state, visual_chatgpt=None):
     image_embedding = model.image_embedding
     original_size = model.original_size
     input_size = model.input_size
-
+
     if visual_chatgpt is not None:
         print('upload_callback: add caption to chatGPT memory')
         new_image_path = get_new_image_name('chat_image', func_name='upload')
@@ -200,10 +203,17 @@ def upload_callback(image_input, state, visual_chatgpt=None):
         AI_prompt = "Received."
         visual_chatgpt.global_prompt = Human_prompt + 'AI: ' + AI_prompt
         visual_chatgpt.agent.memory.buffer = visual_chatgpt.agent.memory.buffer + visual_chatgpt.global_prompt
+        parsed_data = get_image_gpt(openai_api_key, new_image_path,"Please provide the name, artist, year of creation, and material used for this painting. Return the information in dictionary format without any newline characters. If any information is unavailable, return \"None\" for that field. Format as follows: { \"name\": \"Name of the painting\",\"artist\": \"Name of the artist\", \"year\": \"Year of creation\", \"material\": \"Material used in the painting\" }.")
+        parsed_data = json.loads(parsed_data.replace("'", "\""))
+        name, artist, year, material= parsed_data["name"],parsed_data["artist"],parsed_data["year"], parsed_data["material"]
+        artwork_info = f"<div>Painting: {name}<br>Artist name: {artist}<br>Year: {year}<br>Material: {material}</div>"
+        paragraph = get_image_gpt(openai_api_key, new_image_path,"Imagine you are a intelligent image captioner. You should generate a descriptive, coherent and human-like paragraph based on the given image instead of imagination. There are some rules for your response: Show objects with their attributes (e.g. position, color, size, shape, texture).\nPrimarily describe common objects with large size.\nProvide context of the image.\nShow relative position between objects.\nLess than 6 sentences.\nDo not appear number.\nDo not describe any individual letter.\nDo not show the image resolution.\nIngore the white background.")
+
     state = [(None, 'Received new image, resize it to width {} and height {}: '.format(image_input.size[0], image_input.size[1]))]
 
-    return state, state, image_input, click_state, image_input, image_input, image_embedding, \
-        original_size, input_size
+    return state, state, image_input, click_state, image_input, image_input, image_input, image_embedding, \
+        original_size, input_size, artwork_info,artwork_info,paragraph
+
 
 
 
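Note on the metadata parse added above: `json.loads(parsed_data.replace("'", "\""))` assumes the model answers with a bare, single-quoted dictionary and no apostrophes inside the values. A small defensive variant is sketched below; `parse_artwork_reply` is a hypothetical helper for illustration, not part of this commit.

import json

def parse_artwork_reply(reply):
    # Best-effort parse of the dictionary-formatted reply requested from the model.
    fallback = {"name": "None", "artist": "None", "year": "None", "material": "None"}
    try:
        return json.loads(reply)                        # reply is already valid JSON
    except json.JSONDecodeError:
        try:
            return json.loads(reply.replace("'", '"'))  # single-quoted dict from the model
        except json.JSONDecodeError:
            return fallback                             # keep the UI alive on odd replies

The fields could then be read with `parse_artwork_reply(parsed_data).get("name", "None")` and so on.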
@@ -237,7 +247,7 @@ def inference_click(image_input, point_prompt, click_mode, enable_wiki, language
     )
 
     model.setup(image_embedding, original_size, input_size, is_image_set=True)
-
+
     enable_wiki = True if enable_wiki in ['True', 'TRUE', 'true', True, 'Yes', 'YES', 'yes'] else False
     out = model.inference(image_input, prompt, controls, disable_gpt=True, enable_wiki=enable_wiki, verbose=True, args={'clip_filter': False})[0]
 
@@ -270,7 +280,7 @@ def inference_click(image_input, point_prompt, click_mode, enable_wiki, language
 
 def submit_caption(image_input, state, generated_caption, text_refiner, visual_chatgpt, enable_wiki, length, sentiment, factuality, language,
                    out_state, click_index_state, input_mask_state, input_points_state, input_labels_state,
-                   input_text, input_language, input_audio, input_mic, use_mic, agree):
+                   input_text, input_language, input_audio, input_mic, use_mic, agree,paragraph,focus_type,openai_api_key):
     print("state",state)
 
     click_index = click_index_state
@@ -278,9 +288,14 @@ def submit_caption(image_input, state, generated_caption, text_refiner, visual_c
     input_points = input_points_state
     input_labels = input_labels_state
     out = out_state
-
+    focus_map = {
+        "Inside the Mark": 0,
+        "Around the Mark": 1,
+        "Outside the Mark": 2
+    }
 
-
+    mapped_value = focus_map.get(focus_type, -1)
+    print("mapped value",mapped_value)
 
     controls = {
         'length': length,
@@ -289,6 +304,37 @@ def submit_caption(image_input, state, generated_caption, text_refiner, visual_c
         'language': language
     }
 
+    prompt_list = [
+        'Select sentences closely related to the raw caption: "{raw_caption}" from the wiki caption: "{Wiki_caption}" around {length} words of {sentiment} sentiment in {language}.',
+        'Pick sentences from the wiki caption: "{Wiki_caption}" that may be related to objects mentioned in the "{raw_caption}" around {length} words of {sentiment} sentiment in {language}.',
+        'Choose sentences from the wiki caption: "{Wiki_caption}" that describe unrelated topics to the raw caption: "{raw_caption}" around {length} words of {sentiment} sentiment in {language}.'
+    ]
+
+    if mapped_value != -1:
+        prompt= prompt_list[mapped_value].format(
+            raw_caption=generated_caption,
+            Wiki_caption=paragraph,
+            length=controls['length'],
+            sentiment=controls['sentiment'],
+            language=controls['language']
+        )
+        prompt+="You should generate a descriptive, coherent and human-like paragraph"
+
+    else:
+        print("error prompting")
+        prompt = "Invalid focus type."
+
+    if controls['factuality'] == "Imagination":
+        prompt += " The new sentence could extend the original description by using your imagination to create additional details, or think about what might have happened before or after the scene in the image, but should not conflict with the original sentence."
+
+    print("Prompt:", prompt)
+    print("click",click_index)
+
+    origin_image_input = image_input
+
     image_input = create_bubble_frame(np.array(image_input), generated_caption, click_index, input_mask,
                                       input_points=input_points, input_labels=input_labels)
 
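The focus logic added above reduces to: map the radio value to an index, pick one of three templates, and fill it with the click-level caption (`generated_caption`) and the image-level `paragraph`. A standalone sketch of that dispatch, with shortened stand-in templates rather than the committed `prompt_list` strings:

# Minimal sketch of the focus-type dispatch; the template strings here are
# shortened placeholders, not the committed prompt_list entries.
FOCUS_MAP = {"Inside the Mark": 0, "Around the Mark": 1, "Outside the Mark": 2}
TEMPLATES = [
    'Describe what lies inside the mark, using "{raw_caption}" and "{Wiki_caption}".',
    'Describe what lies around the mark, using "{raw_caption}" and "{Wiki_caption}".',
    'Describe what lies outside the mark, using "{raw_caption}" and "{Wiki_caption}".',
]

def build_focus_prompt(focus_type, raw_caption, wiki_caption):
    idx = FOCUS_MAP.get(focus_type, -1)
    if idx == -1:
        return "Invalid focus type."   # same fallback string the commit uses
    return TEMPLATES[idx].format(raw_caption=raw_caption, Wiki_caption=wiki_caption)

print(build_focus_prompt("Around the Mark", "a woman in blue", "The painting shows a quiet room."))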
@@ -297,20 +343,14 @@ def submit_caption(image_input, state, generated_caption, text_refiner, visual_c
 
 
     if not args.disable_gpt and text_refiner:
-
-
-
-        print("
-
-        new_cap = refined_caption['caption']
-        if refined_caption.get('wiki'):
-            state = state + [(None, "Wiki: {}".format(refined_caption['wiki']))]
-        state = state + [(None, f"GPT_Caption: {new_cap}")]
-        print("new_cap",new_cap)
-        refined_image_input = create_bubble_frame(np.array(origin_image_input), new_cap, click_index, input_mask,
+        focus_info=get_image_gpt(openai_api_key,visual_chatgpt.current_image,prompt)
+        state = state + [(None, f"Wiki: {paragraph}")]
+        state = state + [(None, f"Focus_Caption: {focus_info}")]
+        print("new_cap",focus_info)
+        refined_image_input = create_bubble_frame(np.array(origin_image_input), focus_info, click_index, input_mask,
                                                   input_points=input_points, input_labels=input_labels)
         try:
-            waveform_visual, audio_output = tts.predict(
+            waveform_visual, audio_output = tts.predict(focus_info, input_language, input_audio, input_mic, use_mic, agree)
             return state, state, refined_image_input, click_index_state, input_mask_state, input_points_state, input_labels_state, out_state, waveform_visual, audio_output
         except Exception as e:
             state = state + [(None, f"Error during TTS prediction: {str(e)}")]
@@ -327,17 +367,56 @@ def submit_caption(image_input, state, generated_caption, text_refiner, visual_c
     return state, state, image_input, click_index_state, input_mask_state, input_points_state, input_labels_state, out_state, None, None
 
 
+def encode_image(image_path):
+    with open(image_path, "rb") as image_file:
+        return base64.b64encode(image_file.read()).decode('utf-8')
+
+def get_image_gpt(api_key, image_path,prompt,enable_wiki=None):
+    # Getting the base64 string
+    base64_image = encode_image(image_path)
+
+    headers = {
+        "Content-Type": "application/json",
+        "Authorization": f"Bearer {api_key}"
+    }
+
+    prompt_text = prompt
+
+    payload = {
+        "model": "gpt-4o",
+        "messages": [
+            {
+                "role": "user",
+                "content": [
+                    {
+                        "type": "text",
+                        "text": prompt_text
+                    },
+                    {
+                        "type": "image_url",
+                        "image_url": {
+                            "url": f"data:image/jpeg;base64,{base64_image}"
+                        }
+                    }
+                ]
+            }
+        ],
+        "max_tokens": 300
+    }
+
+    # Sending the request to the OpenAI API
+    response = requests.post("https://api.openai.com/v1/chat/completions", headers=headers, json=payload)
+    result = response.json()
+    print(result)
+    content = result['choices'][0]['message']['content']
+    # Assume the model returns a valid JSON string in 'content'
+    try:
+        return content
+    except json.JSONDecodeError:
+        return {"error": "Failed to parse model output"}
 
-def txt2speech(text):
-    print("Initializing text-to-speech conversion...")
-    # API_URL = "https://api-inference.huggingface.co/models/espnet/kan-bayashi_ljspeech_vits"
-    # headers = {"Authorization": f"Bearer {os.environ['HUGGINGFACEHUB_API_TOKEN']}"}
-    # payloads = {'inputs': text}
-    # response = requests.post(API_URL, headers=headers, json=payloads)
-    # with open('audio_story.mp3', 'wb') as file:
-    #     file.write(response.content)
-    print("Text-to-speech conversion completed.")
 
 
 
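One quirk of the new `get_image_gpt` above: its trailing `try: return content / except json.JSONDecodeError` can never trigger, because nothing in that block parses JSON. Below is a sketch of a variant that guards both the HTTP call and an optional JSON parse; `ask_gpt_about_image` is an illustrative name, and the endpoint and payload shape simply mirror what the commit already uses.

import base64
import json

import requests

def ask_gpt_about_image(api_key, image_path, prompt, parse_json=False):
    # Send one user message containing a text prompt plus a base64-encoded image.
    with open(image_path, "rb") as f:
        b64 = base64.b64encode(f.read()).decode("utf-8")

    payload = {
        "model": "gpt-4o",
        "messages": [{
            "role": "user",
            "content": [
                {"type": "text", "text": prompt},
                {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{b64}"}},
            ],
        }],
        "max_tokens": 300,
    }
    response = requests.post(
        "https://api.openai.com/v1/chat/completions",
        headers={"Authorization": f"Bearer {api_key}"},
        json=payload,
        timeout=60,
    )
    response.raise_for_status()                      # surface HTTP errors instead of a KeyError later
    content = response.json()["choices"][0]["message"]["content"]
    if not parse_json:
        return content
    try:
        return json.loads(content)                   # only parse when the prompt asked for a dict
    except json.JSONDecodeError:
        return {"error": "Failed to parse model output", "raw": content}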
@@ -447,6 +526,26 @@ def cap_everything(image_input, visual_chatgpt, text_refiner,input_language, inp
     waveform_visual, audio_output=tts.predict(paragraph, input_language, input_audio, input_mic, use_mic, agree)
     return paragraph,waveform_visual, audio_output
 
+def cap_everything_withoutsound(image_input, visual_chatgpt, text_refiner,paragraph):
+
+    model = build_caption_anything_with_models(
+        args,
+        api_key="",
+        captioner=shared_captioner,
+        sam_model=shared_sam_model,
+        ocr_reader=shared_ocr_reader,
+        text_refiner=text_refiner,
+        session_id=iface.app_id
+    )
+    paragraph = model.inference_cap_everything(image_input, verbose=True)
+    # state = state + [(None, f"Caption Everything: {paragraph}")]
+    Human_prompt = f'\nThe description of the image with path {visual_chatgpt.current_image} is:\n{paragraph}\nThis information helps you to understand this image, but you should use tools to finish following tasks, rather than directly imagine from my description. If you understand, say \"Received\". \n'
+    AI_prompt = "Received."
+    visual_chatgpt.global_prompt = Human_prompt + 'AI: ' + AI_prompt
+    visual_chatgpt.agent.memory.buffer = visual_chatgpt.agent.memory.buffer + visual_chatgpt.global_prompt
+    return paragraph
+
+
 
 def get_style():
     current_version = version.parse(gr.__version__)
@@ -496,11 +595,13 @@ def create_ui():
         original_size = gr.State(None)
         input_size = gr.State(None)
         generated_caption = gr.State("")
+        paragraph = gr.State("")
         aux_state = gr.State([])
         click_index_state = gr.State((0, 0))
         input_mask_state = gr.State(np.zeros((1, 1)))
         input_points_state = gr.State([])
         input_labels_state = gr.State([])
+
 
 
         gr.Markdown(title)
@@ -510,14 +611,20 @@ def create_ui():
             with gr.Column(scale=1.0):
                 with gr.Column(visible=False) as modules_not_need_gpt:
                     with gr.Tab("Base(GPT Power)",visible=False) as base_tab:
+                        image_intro=gr.HTML()
                         image_input_base = gr.Image(type="pil", interactive=True, elem_id="image_upload")
                         example_image = gr.Image(type="pil", interactive=False, visible=False)
-
-
+
                     with gr.Tab("Click") as click_tab:
-
+                        image_intro_click=gr.HTML()
                         image_input = gr.Image(type="pil", interactive=True, elem_id="image_upload")
                         example_image = gr.Image(type="pil", interactive=False, visible=False)
+                        with gr.Row(scale=1.0):
+                            focus_type = gr.Radio(
+                                choices=["Inside the Mark", "Around the Mark", "Outside the Mark"],
+                                value="Inside the Mark",
+                                label="Focus Type",
+                                interactive=True)
                         with gr.Row(scale=1.0):
                             with gr.Row(scale=0.4):
                                 point_prompt = gr.Radio(
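The new Focus Type control is a standard `gr.Radio`; its current value later reaches `submit_caption` through the `inputs=[...]` list wired further down. A minimal, self-contained Gradio sketch of the same pattern (component and function names here are illustrative, not the app's):

import gradio as gr

def echo_focus(focus_type):
    # In app.py the selected value is forwarded to submit_caption instead.
    return f"Selected focus: {focus_type}"

with gr.Blocks() as demo:
    focus = gr.Radio(
        choices=["Inside the Mark", "Around the Mark", "Outside the Mark"],
        value="Inside the Mark",
        label="Focus Type",
        interactive=True,
    )
    out = gr.Textbox(label="Result")
    focus.change(echo_focus, inputs=[focus], outputs=[out])

# demo.launch()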
@@ -572,6 +679,7 @@ def create_ui():
                                 value="No",
                                 label="Enable Wiki",
                                 interactive=True)
+
                 # with gr.Column(visible=True) as modules_not_need_gpt3:
                 gr.Examples(
                     examples=examples,
@@ -708,29 +816,29 @@ def create_ui():
 
         image_input.clear(clear_chat_memory, inputs=[visual_chatgpt])
 
-
-            [chatbot, state, origin_image, click_state, image_input_base, sketcher_input,
-             image_embedding, original_size, input_size])
-
-
-            [chatbot, state, origin_image, click_state, image_input, sketcher_input,
-             image_embedding, original_size, input_size])
-
-
-
+
+        image_input_base.upload(upload_callback, [image_input_base, state, visual_chatgpt,openai_api_key],
+                                [chatbot, state, origin_image, click_state, image_input, image_input_base, sketcher_input,
+                                 image_embedding, original_size, input_size,image_intro,image_intro_click,paragraph])
+
+        image_input.upload(upload_callback, [image_input, state, visual_chatgpt, openai_api_key],
+                           [chatbot, state, origin_image, click_state, image_input, image_input_base, sketcher_input,
+                            image_embedding, original_size, input_size,image_intro,image_intro_click,paragraph])
+        sketcher_input.upload(upload_callback, [sketcher_input, state, visual_chatgpt, openai_api_key],
+                              [chatbot, state, origin_image, click_state, image_input, image_input_base, sketcher_input,
+                               image_embedding, original_size, input_size,image_intro,image_intro_click,paragraph])
         chat_input.submit(chat_input_callback, [visual_chatgpt, chat_input, click_state, state, aux_state],
                           [chatbot, state, aux_state])
         chat_input.submit(lambda: "", None, chat_input)
         submit_button_text.click(chat_input_callback, [visual_chatgpt, chat_input, click_state, state, aux_state],
                                  [chatbot, state, aux_state])
         submit_button_text.click(lambda: "", None, chat_input)
-        example_image.change(upload_callback, [example_image, state, visual_chatgpt],
-            [chatbot, state, origin_image, click_state, image_input, sketcher_input,
-             image_embedding, original_size, input_size])
-
-            [chatbot, state, origin_image, click_state, image_input_base, sketcher_input,
-             image_embedding, original_size, input_size])
+        example_image.change(upload_callback, [example_image, state, visual_chatgpt, openai_api_key],
+                             [chatbot, state, origin_image, click_state, image_input, image_input_base, sketcher_input,
+                              image_embedding, original_size, input_size,image_intro,image_intro_click,paragraph])
+
         example_image.change(clear_chat_memory, inputs=[visual_chatgpt])
 
     def on_click_tab_selected():
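Each `upload_callback` binding above now lists 13 outputs, and that list has to line up position by position with the 13 values the function returns (`state, state, image_input, click_state, image_input, image_input, image_input, image_embedding, original_size, input_size, artwork_info, artwork_info, paragraph`). A reduced, self-contained sketch of that positional contract, with hypothetical component names:

import gradio as gr

def on_upload(image, who_uploaded):
    # Return values map positionally onto the outputs=[...] list below.
    summary = f"Got an image from {who_uploaded}"
    return summary, image

with gr.Blocks() as demo:
    uploader = gr.Image(type="pil", label="Upload")
    preview = gr.Image(type="pil", label="Preview")
    info = gr.Textbox(label="Info")
    source = gr.State("uploader")

    # 2 return values -> 2 outputs, in the same order.
    uploader.upload(on_upload, inputs=[uploader, source], outputs=[info, preview])

# demo.launch()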
@@ -776,7 +884,7 @@ def create_ui():
             inputs=[
                 image_input, state, generated_caption, text_refiner, visual_chatgpt, enable_wiki, length, sentiment, factuality, language,
                 out_state, click_index_state, input_mask_state, input_points_state, input_labels_state,
-                input_text, input_language, input_audio, input_mic, use_mic, agree
+                input_text, input_language, input_audio, input_mic, use_mic, agree,paragraph,focus_type,openai_api_key
             ],
             outputs=[
                 chatbot, state, image_input, click_index_state, input_mask_state, input_points_state, input_labels_state, out_state,