"""
Gradio app for pollen-vision.

This script creates a Gradio app for pollen-vision. The app allows users to perform
object detection and object segmentation using the OWL-ViT and MobileSAM models.
"""

from typing import Any, Dict, List

import gradio as gr
import numpy as np
import numpy.typing as npt
from datasets import load_dataset

from pollen_vision.utils import Annotator, get_bboxes
from pollen_vision.vision_models.object_detection import OwlVitWrapper
from pollen_vision.vision_models.object_segmentation import MobileSamWrapper
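

# The model wrappers and the annotator are instantiated once, at import time,
# so the underlying models are loaded a single time for the whole app.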
owl_vit = OwlVitWrapper()
mobile_sam = MobileSamWrapper()
annotator = Annotator()


def object_detection(
    img: npt.NDArray[np.uint8], text_queries: List[str], score_threshold: float
) -> List[Dict[str, Any]]:
    """Detect the objects described by ``text_queries`` in ``img`` with OWL-ViT.

    Only detections scoring above ``score_threshold`` are returned.
    """
    predictions: List[Dict[str, Any]] = owl_vit.infer(
        im=img, candidate_labels=text_queries, detection_threshold=score_threshold
    )
    return predictions


def object_segmentation(
    img: npt.NDArray[np.uint8], object_detection_predictions: List[Dict[str, Any]]
) -> List[npt.NDArray[np.uint8]]:
    """Segment the detected objects with MobileSAM, using their bounding boxes as prompts."""
    bboxes = get_bboxes(predictions=object_detection_predictions)
    masks: List[npt.NDArray[np.uint8]] = mobile_sam.infer(im=img, bboxes=bboxes)
    return masks


def query(
    task: str,
    img: npt.NDArray[np.uint8],
    text_queries: List[str],
    score_threshold: float,
) -> npt.NDArray[np.uint8]:
    """Gradio callback: run detection, optionally segmentation, and return the annotated image."""
    object_detection_predictions = object_detection(
        img=img, text_queries=text_queries, score_threshold=score_threshold
    )

    if task == "Object detection + segmentation (OWL-ViT + MobileSAM)":
        masks = object_segmentation(
            img=img, object_detection_predictions=object_detection_predictions
        )
        img = annotator.annotate(
            im=img, detection_predictions=object_detection_predictions, masks=masks
        )
        return img

    img = annotator.annotate(im=img, detection_predictions=object_detection_predictions)
    return img


description = """
Welcome to the demo of pollen-vision, a simple and unified Python library providing zero-shot computer vision models curated
for robotics use cases. **Pollen-vision** is designed for ease of installation and use. It is composed of independent modules
that can be combined to create a 3D object detection pipeline, giving the position of objects in 3D space (x, y, z).

\n\nIn this demo, you can choose between two tasks: object detection, and object detection + segmentation.
The models available are:

- **OWL-ViT** (Open World Localization - Vision Transformer, by Google Research): this model performs text-conditioned
zero-shot 2D object localization in RGB images.
- **MobileSAM**: a lightweight version of the Segment Anything Model (SAM) by Meta AI. SAM is a zero-shot image
segmentation model that can be prompted with bounding boxes or points. (https://github.com/ChaoningZhang/MobileSAM)

\n\nYou can provide images to this demo in three ways: try out the provided examples, upload an image
of your choice, or capture an image from your computer's webcam.
Additionally, you should provide text queries representing the list of objects to detect, separating each object with a comma.
The last input parameter is the detection threshold (ranging from 0 to 1), which defaults to 0.1.

\n\nCheck out our blog post introducing pollen-vision or its <a href="https://github.com/pollen-robotics/pollen-vision">
GitHub repository</a> for more info!
"""
demo_inputs = [
    gr.Dropdown(
        [
            "Object detection (OWL-ViT)",
            "Object detection + segmentation (OWL-ViT + MobileSAM)",
        ],
        label="Choose a task",
        value="Object detection (OWL-ViT)",
    ),
    gr.Image(),
    "text",
    gr.Slider(0, 1, value=0.1),
]
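
# Example images come from the "reachy-doing-things" dataset on the Hugging Face Hub.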
rdt_dataset = load_dataset("pollen-robotics/reachy-doing-things", split="train")

img_kitchen_detection = rdt_dataset[11]["image"]
img_kitchen_segmentation = rdt_dataset[12]["image"]

demo_examples = [
    [
        "Object detection (OWL-ViT)",
        img_kitchen_detection,
        ["kettle", "black mug", "sink", "blue mug", "sponge", "bag of chips"],
        0.15,
    ],
    [
        "Object detection + segmentation (OWL-ViT + MobileSAM)",
        img_kitchen_segmentation,
        ["blue mug", "paper cup", "kettle", "sponge"],
        0.12,
    ],
]
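
# Build the Gradio interface and launch the demo server.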
demo = gr.Interface(
    fn=query,
    inputs=demo_inputs,
    outputs="image",
    title="Use zero-shot computer vision models with pollen-vision",
    description=description,
    examples=demo_examples,
)

demo.launch()
|