# Copyright (c) Tencent Inc. All rights reserved.
"""Gradio demo: YOLO-World detection driven by image prompts or text prompts.

The script builds an mmengine runner from a fixed config/checkpoint pair,
loads CLIP vision/text encoders, and serves a Gradio UI whose "Submit" button
calls :func:`run_image`.
"""
import time
import os

# NOTE: these side effects run at import time and must stay ahead of the
# torch / mm* imports below (presumably required by the hosting environment —
# confirm before moving them).
os.environ['PYTORCH_JIT'] = "0"
os.system('mim install mmcv==2.0.1')
os.system('pip install numpy==1.26.4')
# import spaces
import sys
import argparse
import os.path as osp
from io import BytesIO
from functools import partial

# import spaces
# from mmengine.runner import Runner
from mmengine.dataset import Compose
from mmengine.runner.amp import autocast
from mmengine.config import Config, DictAction, ConfigDict
from mmdet.datasets import CocoDataset
from mmyolo.registry import RUNNERS

import cv2
# import onnx
import torch
# import onnxsim
import numpy as np
import gradio as gr
from PIL import Image
import supervision as sv
from torchvision.ops import nms
from transformers import (AutoTokenizer, CLIPTextModelWithProjection)
from transformers import (AutoProcessor, CLIPImageProcessor,
                          CLIPVisionModelWithProjection)

BOUNDING_BOX_ANNOTATOR = sv.BoundingBoxAnnotator(thickness=2)
MASK_ANNOTATOR = sv.MaskAnnotator()


class LabelAnnotator(sv.LabelAnnotator):
    """Label annotator that places the text box at the given coordinates."""

    @staticmethod
    def resolve_text_background_xyxy(
        center_coordinates,
        text_wh,
        position,
    ):
        """Return the ``(x1, y1, x2, y2)`` background box for a label.

        The box starts at ``center_coordinates`` and extends by ``text_wh``
        to the right and downwards; ``position`` is accepted but ignored.
        Presumably this overrides the parent's placement logic — confirm
        against the installed ``supervision`` version.
        """
        center_x, center_y = center_coordinates
        text_w, text_h = text_wh
        return center_x, center_y, center_x + text_w, center_y + text_h


LABEL_ANNOTATOR = LabelAnnotator(text_padding=4,
                                 text_scale=0.5,
                                 text_thickness=1)


# @spaces.GPU
def generate_image_embeddings(prompt_image,
                              vision_encoder,
                              vision_processor,
                              projector,
                              device='cuda:0'):
    """Encode a prompt image into a single L2-normalised embedding row.

    Args:
        prompt_image: PIL image used as the visual prompt.
        vision_encoder: Model called as ``vision_encoder(**inputs)`` whose
            output exposes ``image_embeds``.
        vision_processor: Callable turning the image into encoder inputs.
        projector: Optional module applied residually
            (``projector(x) + x``) to the normalised embedding; ``None``
            skips this step.
        device: Device the processed inputs are moved to.

    Returns:
        Tensor reshaped to ``(1, -1)`` holding the (optionally projected)
        embedding.
    """
    prompt_image = prompt_image.convert('RGB')
    inputs = vision_processor(images=[prompt_image],
                              return_tensors="pt",
                              padding=False)
    inputs = inputs.to(device)
    # Inference only: without no_grad every request would build (and keep
    # alive) an autograd graph through the encoder and the projector.
    with torch.no_grad():
        image_outputs = vision_encoder(**inputs)
        img_feats = image_outputs.image_embeds.view(1, -1)
        img_feats = img_feats / img_feats.norm(p=2, dim=-1, keepdim=True)
        if projector is not None:
            img_feats = projector(img_feats) + img_feats
    return img_feats


# @spaces.GPU
def run_image(runner,
              vision_encoder,
              vision_processor,
              padding_token,
              image,
              text,
              prompt_image,
              add_padding,
              max_num_boxes,
              score_thr,
              nms_thr,
              image_path='./work_dirs/demo.png'):
    """Run detection on ``image`` and return the annotated PIL image.

    When ``prompt_image`` is given, its embedding (optionally concatenated
    with ``padding_token`` when ``add_padding == 'padding'``) is installed on
    the model via ``runner.model.setembeddings``; otherwise the embeddings
    are cleared and the comma-separated ``text`` is used as class prompts.

    Args:
        runner: Object exposing ``model`` and ``pipeline``.
        vision_encoder: Vision encoder passed to
            :func:`generate_image_embeddings`.
        vision_processor: Matching image processor.
        padding_token: Embedding row appended when padding is requested.
        image: Input PIL image.
        text: Comma-separated class names (used only without a prompt image).
        prompt_image: Optional PIL image prompt.
        add_padding: ``'padding'`` to append ``padding_token``.
        max_num_boxes: Keep at most this many highest-scoring boxes.
        score_thr: Minimum score for a box to be kept.
        nms_thr: IoU threshold passed to ``nms``.
        image_path: Unused; kept for interface compatibility.

    Returns:
        PIL image with boxes, labels and (if predicted) masks drawn.

    Raises:
        gr.Error: If no input image was provided.
    """
    if image is None:
        # Surface a readable message in the UI instead of an AttributeError.
        raise gr.Error('Please upload an input image first.')
    image = image.convert('RGB')

    if prompt_image is not None:
        texts = [['object'], [' ']]
        projector = None
        if hasattr(runner.model, 'image_prompt_encoder'):
            projector = runner.model.image_prompt_encoder.projector
        print(projector)
        prompt_embeddings = generate_image_embeddings(
            prompt_image,
            vision_encoder=vision_encoder,
            vision_processor=vision_processor,
            projector=projector)
        if add_padding == 'padding':
            prompt_embeddings = torch.cat([prompt_embeddings, padding_token],
                                          dim=0)
        # Re-normalise: the residual projection changes the vector norm.
        prompt_embeddings = prompt_embeddings / prompt_embeddings.norm(
            p=2, dim=-1, keepdim=True)
        runner.model.num_test_classes = prompt_embeddings.shape[0]
        runner.model.setembeddings(prompt_embeddings[None])
    else:
        runner.model.setembeddings(None)
        texts = [[t.strip()] for t in text.split(',')]

    data_info = dict(img_id=0, img=np.array(image), texts=texts)
    data_info = runner.pipeline(data_info)
    data_batch = dict(inputs=data_info['inputs'].unsqueeze(0),
                      data_samples=[data_info['data_samples']])

    with autocast(enabled=False), torch.no_grad():
        # With an image prompt the text entries are removed from the sample
        # before inference.
        if (prompt_image is not None) and ('texts'
                                           in data_batch['data_samples'][0]):
            del data_batch['data_samples'][0]['texts']
        output = runner.model.test_step(data_batch)[0]
        pred_instances = output.pred_instances

    keep = nms(pred_instances.bboxes,
               pred_instances.scores,
               iou_threshold=nms_thr)
    pred_instances = pred_instances[keep]
    pred_instances = pred_instances[pred_instances.scores.float() > score_thr]

    if len(pred_instances.scores) > max_num_boxes:
        indices = pred_instances.scores.float().topk(max_num_boxes)[1]
        pred_instances = pred_instances[indices]

    pred_instances = pred_instances.cpu().numpy()
    if 'masks' in pred_instances:
        masks = pred_instances['masks']
    else:
        masks = None

    detections = sv.Detections(xyxy=pred_instances['bboxes'],
                               class_id=pred_instances['labels'],
                               confidence=pred_instances['scores'],
                               mask=masks)
    labels = [
        f"{texts[class_id][0]} {confidence:0.2f}" for class_id, confidence in
        zip(detections.class_id, detections.confidence)
    ]

    image = np.array(image)
    image = cv2.cvtColor(image, cv2.COLOR_RGB2BGR)  # Convert RGB to BGR
    image = BOUNDING_BOX_ANNOTATOR.annotate(image, detections)
    image = LABEL_ANNOTATOR.annotate(image, detections, labels=labels)
    if masks is not None:
        image = MASK_ANNOTATOR.annotate(image, detections)
    image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)  # Convert BGR to RGB
    image = Image.fromarray(image)
    return image


MARKDOWN = """
# YOLO-World-Image 👀: YOLO-World with Image Prompts

Now everyone can use YOLO-World and some image prompts to detect corresponding objects in images, no longer relying on text prompts!
This demo is actively under construction and the YOLO-World-Image is an initial (**beta**) version!

### Simple guide:

* upload the image prompts
* adjust the hyperparameters:
    - score threshold
    - nms threshold
    - max number of detections
    - whether using padding: adding padding might improve the confidence scores of the predictions. It will be removed in the coming updates.
* run with 'submit'!
"""


def demo(runner, vision_encoder, vision_processor, padding_embed):
    """Build the Gradio interface and launch it.

    Args:
        runner: Runner forwarded to :func:`run_image`.
        vision_encoder: Vision encoder forwarded to :func:`run_image`.
        vision_processor: Image processor forwarded to :func:`run_image`.
        padding_embed: Padding embedding forwarded as ``padding_token``.
    """
    # The Blocks object is named `app` so it does not shadow this function.
    with gr.Blocks(title="YOLO-World") as app:
        with gr.Row():
            gr.Markdown(MARKDOWN)
        with gr.Row():
            image = gr.Image(type='pil', label='input image')
            output_image = gr.Image(type='pil', label='output image')
        with gr.Row():
            with gr.Column(scale=0.3):
                with gr.Row():
                    prompt_image = gr.Image(type='pil',
                                            label='Image Prompts',
                                            height=300)
                with gr.Row():
                    add_padding = gr.Radio(["padding", "none"],
                                           label="Padding Prompt",
                                           info="whether add padding prompt")
            with gr.Column(scale=0.3):
                with gr.Row():
                    input_text = gr.Textbox(
                        lines=7,
                        label='Text Prompts:\nEnter the classes to be detected, '
                        'separated by comma',
                        value=', '.join(CocoDataset.METAINFO['classes']),
                        elem_id='textbox')
            with gr.Column(scale=0.4):
                max_num_boxes = gr.Slider(minimum=1,
                                          maximum=300,
                                          value=100,
                                          step=1,
                                          interactive=True,
                                          label='Maximum Number Boxes')
                score_thr = gr.Slider(minimum=0,
                                      maximum=1,
                                      value=0.05,
                                      step=0.001,
                                      interactive=True,
                                      label='Score Threshold')
                nms_thr = gr.Slider(minimum=0,
                                    maximum=1,
                                    value=0.7,
                                    step=0.001,
                                    interactive=True,
                                    label='NMS Threshold')
        with gr.Row():
            submit = gr.Button('Submit')
            clear = gr.Button('Clear')

        exp_image_dir = "./gradio_examples/image_prompts/images/"
        exp_prompt_dir = "./gradio_examples/image_prompts/prompts/"
        # Each example row follows the order of `inputs` below.
        example = gr.Examples(
            examples=[
                [
                    exp_image_dir + "0.jpeg", exp_prompt_dir + "0.png", "",
                    "none", 0.3, 0.5, 100
                ],
                [
                    exp_image_dir + "1.png", exp_prompt_dir + "1.png", "",
                    "padding", 0.2, 0.1, 100
                ],
                [
                    exp_image_dir + "3.png", exp_prompt_dir + "3.png", "",
                    "padding", 0.3, 0.5, 100
                ],
            ],
            inputs=[
                image, prompt_image, input_text, add_padding, score_thr,
                nms_thr, max_num_boxes
            ],
        )

        # The first four arguments of run_image are bound here; the component
        # list supplies the remaining positional arguments in order.
        submit.click(
            partial(run_image, runner, vision_encoder, vision_processor,
                    padding_embed), [
                        image,
                        input_text,
                        prompt_image,
                        add_padding,
                        max_num_boxes,
                        score_thr,
                        nms_thr,
                    ], [output_image])
        clear.click(lambda: [None, None, '', None], None,
                    [image, prompt_image, input_text, output_image])

        app.launch()  # port 80 does not work for me


if __name__ == '__main__':
    # args = parse_args()
    config = "configs/prompt_tuning_coco/yolo_world_v2_l_vlpan_bn_2e-4_80e_8gpus_image_prompt_demo.py"
    checkpoint = "weights/yolo_world_v2_l_image_prompt_adapter-719a7afb.pth"

    # load config
    cfg = Config.fromfile(config)
    cfg.compile = False
    if cfg.get('work_dir', None) is None:
        cfg.work_dir = osp.join('./work_dirs',
                                osp.splitext(osp.basename(config))[0])
    cfg.load_from = checkpoint

    # Build the runner. This was previously commented out, which left
    # `runner` undefined and made the script fail with NameError below.
    if 'runner_type' not in cfg:
        from mmengine.runner import Runner
        runner = Runner.from_cfg(cfg)
    else:
        runner = RUNNERS.build(cfg)

    # runner.test()
    runner.call_hook('before_run')
    runner.load_or_resume()
    pipeline = cfg.test_dataloader.dataset.pipeline
    # Images arrive as in-memory arrays from Gradio, not as file paths.
    pipeline[0].type = 'mmdet.LoadImageFromNDArray'
    runner.pipeline = Compose(pipeline)
    runner.model.eval()

    # init vision encoder
    clip_model = "openai/clip-vit-base-patch32"
    vision_model = CLIPVisionModelWithProjection.from_pretrained(clip_model)
    processor = CLIPImageProcessor.from_pretrained(clip_model)
    device = 'cuda'
    vision_model.to(device)

    # Embed the single-space prompt ' ' once; it is passed to the UI as the
    # padding embedding.
    texts = [' ']
    tokenizer = AutoTokenizer.from_pretrained(clip_model)
    text_model = CLIPTextModelWithProjection.from_pretrained(clip_model)
    # device = 'cuda:0'
    text_model.to(device)
    texts = tokenizer(text=texts, return_tensors='pt', padding=True)
    texts = texts.to(device)
    with torch.no_grad():  # constant embedding, no autograd graph needed
        text_outputs = text_model(**texts)
        txt_feats = text_outputs.text_embeds
        txt_feats = txt_feats / txt_feats.norm(p=2, dim=-1, keepdim=True)
        txt_feats = txt_feats.reshape(-1, txt_feats.shape[-1])
        txt_feats = txt_feats[0].unsqueeze(0)

    demo(runner, vision_model, processor, txt_feats)