|
"""Visual Iterative Prompting Demo.""" |
|
|
|
import gradio as gr |
|
import numpy as np |
|
from vip_runner import vip_runner |
|
from vlms import GPT4V |
|
|
|
|
|
radius_per_pixel = 0.05 |
|
|
|
|
|
def run_vip(
    im,
    query,
    n_samples_init,
    n_samples_opt,
    n_iters,
    n_recursion,
    openai_api_key,
    progress=gr.Progress(track_tqdm=True),
):
  """Runs VIP on an image and query, returning annotated images and info."""
  if not openai_api_key:
    return [], 'Must provide OpenAI API Key'
  if im is None:
    return [], 'Must specify image'
  if not query:
    return [], 'Must specify description'

  img_size = np.min(im.shape[:2])
|
  # Drawing style for the visual prompts (marker counts, opacities, and
  # marker/font sizes, scaled to the image resolution).
  style = {
      'num_samples': 12,
      'circle_alpha': 0.6,
      'alpha': 0.8,
      'arrow_alpha': 0.0,
      'radius': int(img_size * radius_per_pixel),
      'thickness': 2,
      'fontsize': int(img_size * radius_per_pixel),
      'rgb_scale': 255,
      'focal_offset': 1,
  }
|
  # Parameters of the action sampling distribution (location, scale, and
  # bounds), plus the action-to-pixel-coordinate mapping.
  action_spec = {
      'loc': [0, 0, 0],
      'scale': [0.0, 100, 100],
      'min_scale': [0.0, 30, 30],
      'min': [0, -300.0, -300],
      'max': [0, 300, 300],
      'action_to_coord': 250,
      'robot': 'meta',
  }
|
  vlm = GPT4V(openai_api_key=openai_api_key)
  ims, center, _ = vip_runner(
      vlm,
      im,
      query,
      style,
      action_spec,
      n_samples_init=n_samples_init,
      n_samples_opt=n_samples_opt,
      n_iters=n_iters,
      recursion_level=n_recursion,
  )
  return ims, f'Final selected coordinate: {np.round(center, decimals=0)}'
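
# A hypothetical programmatic call, bypassing the UI (the image path and API
# key below are placeholders, and `imageio` is an assumed dependency, not
# part of this demo):
#
#   import imageio.v3 as iio
#   ims, info = run_vip(
#       iio.imread('ims/robot.png'),
#       'the toy in the middle of the table',
#       n_samples_init=25, n_samples_opt=10, n_iters=3, n_recursion=0,
#       openai_api_key='sk-...')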
|
|
|
|
|
examples = [
    {
        'im_path': 'ims/aloha.png',
        'desc': 'a point between the fork and the cup',
    },
    {
        'im_path': 'ims/robot.png',
        'desc': 'the toy in the middle of the table',
    },
    {
        'im_path': 'ims/parking.jpg',
        'desc': 'a place to park if I am handicapped',
    },
    {
        'im_path': 'ims/tools.png',
        'desc': 'what should I use to pull a nail',
    },
]
|
|
with gr.Blocks() as demo:
  gr.Markdown("""
  # Visual Iterative Prompting Demo
  The demo below showcases the Visual Iterative Prompting (VIP) algorithm.
  Given an image and a description of an object or region,
  VIP leverages a Vision-Language Model (VLM) to iteratively search for the point in the image that best corresponds to the description.
  This is done through visual prompting: instead of reasoning in text, the VLM reasons over images annotated with sampled points in order to pick the best ones.
  In each iteration, we take the points previously selected by the VLM, resample new points around their mean, and repeat the process.

  To get started, you can use the provided example image and query pairs, or upload your own images.
  This demo uses GPT-4V, so it requires an OpenAI API key.

  To use the provided example images, right click on the image -> Copy Image, then click the clipboard icon in the Input Image box.

  Hyperparameters to set:
  * N Samples for Initialization - how many points are sampled for the first VIP iteration.
  * N Samples for Optimization - how many points are sampled for each subsequent iteration.
  * N Iterations - how many optimization iterations to perform.
  * N Ensemble Recursions - how many ensemble recursions to run for recursive VIP.

  Note that each iteration takes roughly 10 seconds, and each additional ensemble recursion multiplies the total number of iterations run.

  After VIP finishes, the image gallery below will visualize the VIP results across all iterations.
  There are two images per iteration: the first shows all the sampled points, and the second shows the point VIP picked.
  The Info textbox will show the final selected pixel coordinate that VIP converged to.
  """.strip())
|
|
|
  gr.Markdown(
      '## Example Images and Queries\nDrag images into the image box below.'
  )
  with gr.Row(equal_height=True):
    for example in examples:
      gr.Image(value=example['im_path'], label=example['desc'])
|
  gr.Markdown('## New Query')
  with gr.Row():
    with gr.Column():
      inp_im = gr.Image(label='Input Image', type='numpy', show_label=True)
      inp_query = gr.Textbox(label='Description', lines=1)

    with gr.Column():
      inp_openai_api_key = gr.Textbox(
          label='OpenAI API Key (not saved)', lines=1
      )
      with gr.Group():
        inp_n_samples_init = gr.Slider(
            label='N Samples for Initialization',
            minimum=10,
            maximum=40,
            value=25,
            step=1,
        )
        inp_n_samples_opt = gr.Slider(
            label='N Samples for Optimization',
            minimum=3,
            maximum=20,
            value=10,
            step=1,
        )
        inp_n_iters = gr.Slider(
            label='N Iterations', minimum=1, maximum=5, value=3, step=1
        )
        inp_n_recursions = gr.Slider(
            label='N Ensemble Recursions', minimum=0, maximum=3, value=0, step=1
        )
      btn_run = gr.Button('Run')

  with gr.Group():
    out_ims = gr.Gallery(
        label='Images with Sampled and Chosen Points',
        columns=4,
        rows=1,
        interactive=False,
    )
    out_info = gr.Textbox(label='Info', lines=1)
|
  btn_run.click(
      run_vip,
      inputs=[
          inp_im,
          inp_query,
          inp_n_samples_init,
          inp_n_samples_opt,
          inp_n_iters,
          inp_n_recursions,
          inp_openai_api_key,
      ],
      outputs=[out_ims, out_info],
  )
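
  # Note: the example image/query pairs above could alternatively be exposed
  # for one-click loading via Gradio's built-in gr.Examples component, e.g.
  # (hypothetical, not part of the original demo):
  #   gr.Examples(
  #       examples=[[e['im_path'], e['desc']] for e in examples],
  #       inputs=[inp_im, inp_query],
  #   )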
|
|
|
demo.launch()
|
|