# Page residue captured together with this file (not part of the program):
# "Spaces: Runtime error".
# Copyright (c) Tencent Inc. All rights reserved. | |
import time | |
import os | |
# Set PYTORCH_JIT=0 in the process environment before torch is imported
# further down in this file.
os.environ.update(PYTORCH_JIT="0")
# Install pinned dependencies at start-up; exit codes are not checked.
os.system('mim install mmcv==2.0.1')
os.system('pip install numpy==1.26.4')
# import spaces | |
import sys | |
import argparse | |
import os.path as osp | |
from io import BytesIO | |
from functools import partial | |
# import spaces | |
# from mmengine.runner import Runner | |
from mmengine.dataset import Compose | |
from mmengine.runner.amp import autocast | |
from mmengine.config import Config, DictAction, ConfigDict | |
from mmdet.datasets import CocoDataset | |
from mmyolo.registry import RUNNERS | |
import cv2 | |
# import onnx | |
import torch | |
# import onnxsim | |
import numpy as np | |
import gradio as gr | |
from PIL import Image | |
import supervision as sv | |
from torchvision.ops import nms | |
from transformers import (AutoTokenizer, CLIPTextModelWithProjection) | |
from transformers import (AutoProcessor, CLIPImageProcessor, CLIPVisionModelWithProjection) | |
# Module-level supervision annotators, created once and reused by run_image:
# a mask overlay with default settings and a box outline of thickness 2.
MASK_ANNOTATOR = sv.MaskAnnotator()
BOUNDING_BOX_ANNOTATOR = sv.BoundingBoxAnnotator(thickness=2)
class LabelAnnotator(sv.LabelAnnotator):
    """Label annotator whose text box starts at the anchor point.

    The background rectangle is placed with its top-left corner on the
    given coordinates and extends right/down by the text size.
    """

    # Declared static because the method takes no ``self``: without the
    # decorator, a call through an instance would bind the instance to
    # ``center_coordinates``.
    # NOTE(review): supervision's base class is expected to declare this hook
    # as a static method too - confirm against the installed release.
    @staticmethod
    def resolve_text_background_xyxy(
        center_coordinates,
        text_wh,
        position,
    ):
        """Return the label background rectangle as ``(x1, y1, x2, y2)``.

        Args:
            center_coordinates: ``(x, y)`` pair used as the top-left corner.
            text_wh: ``(width, height)`` of the text box.
            position: accepted for signature compatibility; not used.

        Returns:
            Tuple ``(x, y, x + width, y + height)``.
        """
        left, top = center_coordinates
        width, height = text_wh
        return left, top, left + width, top + height
# Module-level label annotator reused by run_image for every request.
LABEL_ANNOTATOR = LabelAnnotator(
    text_scale=0.5,
    text_thickness=1,
    text_padding=4,
)
# @spaces.GPU | |
def generate_image_embeddings(prompt_image,
                              vision_encoder,
                              vision_processor,
                              projector,
                              device='cuda:0'):
    """Encode one prompt image into a single embedding row.

    Args:
        prompt_image: image object exposing ``convert('RGB')``.
        vision_encoder: callable that receives the processor output as
            keyword arguments and returns an object with ``image_embeds``.
        vision_processor: callable invoked with ``images``,
            ``return_tensors`` and ``padding``; its result must offer
            ``.to(device)`` and be usable as ``**kwargs``.
        projector: optional callable; when given, its output is added to the
            L2-normalised embedding (residual connection).
        device: device the processed inputs are moved to.

    Returns:
        Tensor reshaped to one row (``view(1, -1)``), L2-normalised before
        the optional projector is applied. It carries no autograd history.
    """
    rgb_image = prompt_image.convert('RGB')
    # Inference only: without no_grad the returned tensor would keep the
    # whole encoder graph (and its activations) alive for as long as the
    # embedding itself is referenced.
    with torch.no_grad():
        batch = vision_processor(images=[rgb_image],
                                 return_tensors="pt",
                                 padding=False)
        batch = batch.to(device)
        embeds = vision_encoder(**batch).image_embeds.view(1, -1)
        embeds = embeds / embeds.norm(p=2, dim=-1, keepdim=True)
        if projector is not None:
            embeds = projector(embeds) + embeds
    return embeds
# @spaces.GPU | |
def run_image(runner,
              vision_encoder,
              vision_processor,
              padding_token,
              image,
              text,
              prompt_image,
              add_padding,
              max_num_boxes,
              score_thr,
              nms_thr,
              image_path='./work_dirs/demo.png'):
    """Run detection on ``image`` and return the annotated result.

    When ``prompt_image`` is given, its embedding is installed on the model
    through ``setembeddings`` and the text prompt is ignored; otherwise the
    comma-separated ``text`` supplies the class names.

    Args:
        runner: object exposing ``model`` and ``pipeline``.
        vision_encoder: forwarded to ``generate_image_embeddings``.
        vision_processor: forwarded to ``generate_image_embeddings``.
        padding_token: embedding row appended to the prompt embedding when
            ``add_padding == 'padding'``.
        image: input image exposing ``convert('RGB')``.
        text: comma-separated class names (used only without a prompt image).
        prompt_image: optional image prompt.
        add_padding: ``'padding'`` to append ``padding_token``; any other
            value leaves the prompt embedding alone.
        max_num_boxes: upper bound on the number of returned detections.
        score_thr: detections scoring at or below this value are dropped.
        nms_thr: IoU threshold handed to ``torchvision.ops.nms``.
        image_path: unused; kept so existing callers keep working.

    Returns:
        ``PIL.Image`` with boxes, labels and (when predicted) masks drawn.

    Raises:
        gr.Error: if no input image was supplied.
    """
    if image is None:
        # Surface a readable message in the UI instead of an AttributeError
        # from calling .convert on None.
        raise gr.Error('Please upload an input image first.')
    image = image.convert('RGB')

    if prompt_image is not None:
        # Two label slots: the prompted object and the optional padding row.
        texts = [['object'], [' ']]
        projector = None
        if hasattr(runner.model, 'image_prompt_encoder'):
            projector = runner.model.image_prompt_encoder.projector
        prompt_embeddings = generate_image_embeddings(
            prompt_image,
            vision_encoder=vision_encoder,
            vision_processor=vision_processor,
            projector=projector)
        if add_padding == 'padding':
            prompt_embeddings = torch.cat([prompt_embeddings, padding_token],
                                          dim=0)
        prompt_embeddings = prompt_embeddings / prompt_embeddings.norm(
            p=2, dim=-1, keepdim=True)
        runner.model.num_test_classes = prompt_embeddings.shape[0]
        runner.model.setembeddings(prompt_embeddings[None])
    else:
        # NOTE(review): num_test_classes is only updated in the image-prompt
        # branch - confirm the model does not need it reset for text prompts.
        runner.model.setembeddings(None)
        texts = [[t.strip()] for t in (text or '').split(',')]

    data_info = runner.pipeline(
        dict(img_id=0, img=np.array(image), texts=texts))
    data_batch = dict(inputs=data_info['inputs'].unsqueeze(0),
                      data_samples=[data_info['data_samples']])

    with autocast(enabled=False), torch.no_grad():
        sample = data_batch['data_samples'][0]
        # With an image prompt the texts entry is removed from the sample
        # before the forward pass.
        if (prompt_image is not None) and ('texts' in sample):
            del sample['texts']
        output = runner.model.test_step(data_batch)[0]
        pred_instances = output.pred_instances

    # NMS over all boxes together (labels are not considered), then the
    # score threshold, then the top-k cap.
    keep = nms(pred_instances.bboxes,
               pred_instances.scores,
               iou_threshold=nms_thr)
    pred_instances = pred_instances[keep]
    pred_instances = pred_instances[pred_instances.scores.float() > score_thr]

    # topk needs an integer k; coerce in case the UI hands over a float.
    max_boxes = int(max_num_boxes)
    if len(pred_instances.scores) > max_boxes:
        indices = pred_instances.scores.float().topk(max_boxes)[1]
        pred_instances = pred_instances[indices]

    pred_instances = pred_instances.cpu().numpy()
    masks = pred_instances['masks'] if 'masks' in pred_instances else None
    detections = sv.Detections(xyxy=pred_instances['bboxes'],
                               class_id=pred_instances['labels'],
                               confidence=pred_instances['scores'],
                               mask=masks)
    labels = [
        f"{texts[class_id][0]} {confidence:0.2f}"
        for class_id, confidence in zip(detections.class_id,
                                        detections.confidence)
    ]

    # Annotation happens on a BGR copy; convert back to RGB for PIL.
    canvas = cv2.cvtColor(np.array(image), cv2.COLOR_RGB2BGR)
    canvas = BOUNDING_BOX_ANNOTATOR.annotate(canvas, detections)
    canvas = LABEL_ANNOTATOR.annotate(canvas, detections, labels=labels)
    if masks is not None:
        canvas = MASK_ANNOTATOR.annotate(canvas, detections)
    canvas = cv2.cvtColor(canvas, cv2.COLOR_BGR2RGB)
    return Image.fromarray(canvas)
# Header text rendered at the top of the demo page. Blank lines separate the
# HTML badge block from the surrounding Markdown so the heading and list that
# follow it are parsed as Markdown; sub-bullets are indented so they nest.
MARKDOWN = """
# YOLO-World-Image 👀: YOLO-World with Image Prompts

Now everyone can use YOLO-World and some image prompts to detect corresponding objects in images, no longer relying on text prompts!
This demo is actively under construction and the YOLO-World-Image is an initial (**beta**) version!

<div style='display:flex; gap: 0.25rem; align-items: center'>
<a href="https://yoloworld.cc"><img src="https://img.shields.io/badge/Project-Page-green"></a>
<a href="https://arxiv.org/abs/2401.17270"><img src="https://img.shields.io/badge/arXiv-Paper-red"></a>
<a href="https://github.com/AILab-CVC/YOLO-World"><img src="https://img.shields.io/badge/GitHub-Code-blue"></a>
<a href="https://huggingface.co/spaces/stevengrove/YOLO-World"><img src="https://img.shields.io/badge/🤗HuggingFace-Spaces-orange"></a>
</div>

### Simple guide:

* upload the image prompts
* adjust the hyperparameters:
    - score threshold
    - nms threshold
    - max number of detections
    - whether using padding: adding padding might improve the confidence scores of the predictions. It will be removed in the coming updates.
* run with 'submit'!
"""
def demo(runner, vision_encoder, vision_processor, padding_embed):
    """Build the Gradio interface for the detector and launch it.

    Args:
        runner: passed through to ``run_image`` as its first argument.
        vision_encoder: passed through to ``run_image``.
        vision_processor: passed through to ``run_image``.
        padding_embed: passed through to ``run_image`` as ``padding_token``.

    The call blocks in ``launch()`` with Gradio's default host and port.
    """
    # The Blocks object gets its own name so it does not shadow this function.
    with gr.Blocks(title="YOLO-World") as app:
        with gr.Row():
            gr.Markdown(MARKDOWN)
        with gr.Row():
            image = gr.Image(type='pil', label='input image')
            output_image = gr.Image(type='pil', label='output image')
        with gr.Row():
            # Integer scales 3 / 3 / 4 keep the original 0.3 / 0.3 / 0.4
            # width ratio; Gradio expects ``scale`` to be an integer.
            with gr.Column(scale=3):
                with gr.Row():
                    prompt_image = gr.Image(type='pil',
                                            label='Image Prompts',
                                            height=300)
                with gr.Row():
                    add_padding = gr.Radio(["padding", "none"],
                                           label="Padding Prompt",
                                           info="whether add padding prompt")
            with gr.Column(scale=3):
                with gr.Row():
                    input_text = gr.Textbox(
                        lines=7,
                        label='Text Prompts:\nEnter the classes to be detected, '
                        'separated by comma',
                        value=', '.join(CocoDataset.METAINFO['classes']),
                        elem_id='textbox')
            with gr.Column(scale=4):
                max_num_boxes = gr.Slider(minimum=1,
                                          maximum=300,
                                          value=100,
                                          step=1,
                                          interactive=True,
                                          label='Maximum Number Boxes')
                score_thr = gr.Slider(minimum=0,
                                      maximum=1,
                                      value=0.05,
                                      step=0.001,
                                      interactive=True,
                                      label='Score Threshold')
                nms_thr = gr.Slider(minimum=0,
                                    maximum=1,
                                    value=0.7,
                                    step=0.001,
                                    interactive=True,
                                    label='NMS Threshold')
        with gr.Row():
            submit = gr.Button('Submit')
            clear = gr.Button('Clear')

        exp_image_dir = "./gradio_examples/image_prompts/images/"
        exp_prompt_dir = "./gradio_examples/image_prompts/prompts/"
        # Each example row follows the order of ``inputs`` below:
        # image, prompt image, text, padding mode, score thr, nms thr, max boxes.
        gr.Examples(
            examples=[
                [
                    exp_image_dir + "0.jpeg", exp_prompt_dir + "0.png", "",
                    "none", 0.3, 0.5, 100
                ],
                [
                    exp_image_dir + "1.png", exp_prompt_dir + "1.png", "",
                    "padding", 0.2, 0.1, 100
                ],
                [
                    exp_image_dir + "3.png", exp_prompt_dir + "3.png", "",
                    "padding", 0.3, 0.5, 100
                ],
            ],
            inputs=[
                image, prompt_image, input_text, add_padding, score_thr,
                nms_thr, max_num_boxes
            ],
        )

        # The model-side arguments are bound up front; the UI supplies the
        # remaining positional arguments of run_image in this order.
        submit.click(
            partial(run_image, runner, vision_encoder, vision_processor,
                    padding_embed), [
                        image,
                        input_text,
                        prompt_image,
                        add_padding,
                        max_num_boxes,
                        score_thr,
                        nms_thr,
                    ], [output_image])
        clear.click(lambda: [None, None, '', None], None,
                    [image, prompt_image, input_text, output_image])

    # Launched once the layout is complete (original author's note: port 80
    # did not work for them, hence the default port).
    app.launch()
if __name__ == '__main__':
    config = "configs/prompt_tuning_coco/yolo_world_v2_l_vlpan_bn_2e-4_80e_8gpus_image_prompt_demo.py"
    checkpoint = "weights/yolo_world_v2_l_image_prompt_adapter-719a7afb.pth"

    # Load the config and point it at the checkpoint.
    cfg = Config.fromfile(config)
    cfg.compile = False
    if cfg.get('work_dir', None) is None:
        # Default work dir is derived from the config file name.
        cfg.work_dir = osp.join('./work_dirs',
                                osp.splitext(osp.basename(config))[0])
    cfg.load_from = checkpoint

    # Build the runner and load weights without starting a test loop.
    runner = RUNNERS.build(cfg)
    runner.call_hook('before_run')
    runner.load_or_resume()

    # Reuse the test pipeline, but feed it in-memory arrays instead of paths.
    pipeline = cfg.test_dataloader.dataset.pipeline
    pipeline[0].type = 'mmdet.LoadImageFromNDArray'
    runner.pipeline = Compose(pipeline)
    runner.model.eval()

    # Vision encoder used for image prompts.
    clip_model = "openai/clip-vit-base-patch32"
    device = 'cuda'
    vision_model = CLIPVisionModelWithProjection.from_pretrained(clip_model)
    processor = CLIPImageProcessor.from_pretrained(clip_model)
    vision_model.to(device)

    # Padding embedding: the text embedding of a single blank prompt.
    tokenizer = AutoTokenizer.from_pretrained(clip_model)
    text_model = CLIPTextModelWithProjection.from_pretrained(clip_model)
    text_model.to(device)
    # Computed once and kept for the app's lifetime, so it must not carry the
    # text encoder's autograd graph along with it.
    with torch.no_grad():
        tokens = tokenizer(text=[' '], return_tensors='pt', padding=True)
        tokens = tokens.to(device)
        txt_feats = text_model(**tokens).text_embeds
        txt_feats = txt_feats / txt_feats.norm(p=2, dim=-1, keepdim=True)
        txt_feats = txt_feats.reshape(-1, txt_feats.shape[-1])
        txt_feats = txt_feats[0].unsqueeze(0)

    demo(runner, vision_model, processor, txt_feats)