ravi.naik committed · Commit 10f4748
1 Parent(s): 3b41a3f

Added FastSAM code and supporting UI

Files changed:
- FastSAM-x.pt +3 -0
- app.py +168 -47
- experiments/clip.ipynb +0 -0
- experiments/sam.ipynb +121 -0
- fastsam/__init__.py +9 -0
- fastsam/decoder.py +131 -0
- fastsam/model.py +106 -0
- fastsam/predict.py +56 -0
- fastsam/prompt.py +455 -0
- fastsam/utils.py +86 -0
- requirements.txt +20 -3
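
Taken together, the new fastsam package provides the model wrapper (FastSAM), the predictor, and the prompt-based post-processing (FastSAMPrompt), while app.py wires them into the existing Gradio/CLIP Space. A minimal sketch of the inference flow the UI drives, assuming the checkpoint and sample images from this commit (paths are illustrative):

from fastsam import FastSAM, FastSAMPrompt

# Load the weights added in this commit and segment one of the sample images.
model = FastSAM("FastSAM-x.pt")
results = model(
    "sample_images/0.jpg", device="cpu", retina_masks=True, imgsz=1024, conf=0.4, iou=0.9
)
prompt = FastSAMPrompt("sample_images/0.jpg", results, device="cpu")
ann = prompt.text_prompt(text="a photo of a dog")  # or prompt.everything_prompt()
prompt.plot(annotations=ann, output_path="output/result.jpg")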
FastSAM-x.pt
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c0be4e7ddbe4c15333d15a859c676d053c486d0a746a3be6a7a9790d52a9b6d7
+size 144943063
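
The checkpoint is tracked with Git LFS, so the repository stores only the pointer above and the real ~145 MB weight file is fetched separately. A small sanity check, sketched from the oid and size in the pointer, can confirm the object was actually materialized rather than left as a pointer stub:

import hashlib
import os

# Values copied from the LFS pointer above.
EXPECTED_SHA256 = "c0be4e7ddbe4c15333d15a859c676d053c486d0a746a3be6a7a9790d52a9b6d7"
EXPECTED_SIZE = 144943063  # bytes

path = "FastSAM-x.pt"
assert os.path.getsize(path) == EXPECTED_SIZE, "pointer not resolved; run git lfs pull"
digest = hashlib.sha256()
with open(path, "rb") as f:
    for chunk in iter(lambda: f.read(1 << 20), b""):
        digest.update(chunk)
assert digest.hexdigest() == EXPECTED_SHA256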
app.py
CHANGED
@@ -2,8 +2,15 @@ import gradio as gr
 from PIL import Image
 from transformers import CLIPProcessor, CLIPModel

+from fastsam import FastSAM, FastSAMPrompt
+
+project_path = "/home/ravi.naik/learning/era/s19"
+sam_model = FastSAM(f"{project_path}/FastSAM-x.pt")
+
+DEVICE = "cpu"
+sample_images = [f"{project_path}/sample_images/{i}.jpg" for i in range(5)]
 prediction_image = None
+sam_prediction_image = None

 clip_model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
 clip_processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
@@ -48,59 +55,173 @@ def predict(text):
     return {output: gr.update(value=results)}


+def show_hide_sam_text(status):
+    if status == "Text Based":
+        return {sam_input_text: gr.update(visible=True)}
+    return {sam_input_text: gr.update(visible=False)}
+
+
+def set_prediction_image_sam(evt: gr.SelectData, gallery):
+    global sam_prediction_image
+    if isinstance(gallery[evt.index], dict):
+        sam_prediction_image = gallery[evt.index]["name"]
+    else:
+        sam_prediction_image = gallery[evt.index][0]["name"]
+
+
+def sam_predict(radio, text):
+    output_path = f"{project_path}/output/sam_results.jpg"
+    everything_results = sam_model(
+        sam_prediction_image,
+        device=DEVICE,
+        retina_masks=True,
+        imgsz=1024,
+        conf=0.4,
+        iou=0.9,
+    )
+    prompt_process = FastSAMPrompt(
+        sam_prediction_image, everything_results, device=DEVICE
+    )
+    ann = prompt_process.everything_prompt()
+    if radio == "Text Based":
+        ann = prompt_process.text_prompt(text=text)
+
+    prompt_process.plot(
+        annotations=ann,
+        output_path=output_path,
     )
+
+    return {sam_output: gr.update(value=output_path)}
+
+
+with gr.Blocks() as app:
+    gr.Markdown("## ERA Session19 - FastSAM & CLIP Inference with Gradio")
+    with gr.Tab("FastSAM"):
+        gr.Markdown("### ERA Session19 - Image Segmentation with FastSAM")
+        gr.Markdown(
+            """Please an image or select one of the sample images.
+            Select either segment everything or text based segmentation.
+            Enter the text if you opt for segment based on text and hit Submit.
+            """
+        )
+        with gr.Row():
+            with gr.Column():
+                with gr.Box():
+                    with gr.Group():
+                        upload_gallery = gr.Gallery(
+                            value=None,
+                            label="Uploaded images",
+                            show_label=False,
+                            elem_id="gallery_upload",
+                            columns=5,
+                            rows=2,
+                            height="auto",
+                            object_fit="contain",
+                        )
+                        upload_button = gr.UploadButton(
+                            "Click to Upload images",
+                            file_types=["image"],
+                            file_count="multiple",
+                        )
+                        upload_button.upload(upload_file, upload_button, upload_gallery)
+
+                    with gr.Group():
+                        sample_gallery = gr.Gallery(
+                            value=sample_images,
+                            label="Sample images",
+                            show_label=False,
+                            elem_id="gallery_sample",
+                            columns=3,
+                            rows=2,
+                            height="auto",
+                            object_fit="contain",
+                        )
+
+                    upload_gallery.select(
+                        set_prediction_image_sam, inputs=[upload_gallery]
                     )
+                    sample_gallery.select(
+                        set_prediction_image_sam, inputs=[sample_gallery]
                     )
+                with gr.Box():
+                    radio = gr.Radio(
+                        choices=["Segment Everything", "Text Based"],
+                        value="Segment Everything",
+                        type="value",
+                        label="Select a Segmentation approach",
+                        interactive=True,
                     )
+                    sam_input_text = gr.TextArea(
+                        label="Segementation Input",
+                        placeholder="Please enter some text",
+                        interactive=True,
+                        visible=False,
+                    )
+                    radio.change(
+                        show_hide_sam_text, inputs=[radio], outputs=[sam_input_text]
+                    )

+                    sam_submit_btn = gr.Button(value="Submit")
+            with gr.Column():
+                with gr.Box():
+                    sam_output = gr.Image(value=None, label="Segmentation Results")
+
+        sam_submit_btn.click(
+            sam_predict, inputs=[radio, sam_input_text], outputs=[sam_output]
         )
+    with gr.Tab("CLIP"):
+        gr.Markdown("### ERA Session19 - Zero Shot Classification with CLIP")
+        gr.Markdown(
+            "Please an image or select one of the sample images. Type some classification labels separated by comma. For ex: dog, cat"
+        )
+        with gr.Row():
+            with gr.Column():
+                with gr.Box():
+                    with gr.Group():
+                        upload_gallery = gr.Gallery(
+                            value=None,
+                            label="Uploaded images",
+                            show_label=False,
+                            elem_id="gallery_upload",
+                            columns=5,
+                            rows=2,
+                            height="auto",
+                            object_fit="contain",
+                        )
+                        upload_button = gr.UploadButton(
+                            "Click to Upload images",
+                            file_types=["image"],
+                            file_count="multiple",
+                        )
+                        upload_button.upload(upload_file, upload_button, upload_gallery)
+
+                    with gr.Group():
+                        sample_gallery = gr.Gallery(
+                            value=sample_images,
+                            label="Sample images",
+                            show_label=False,
+                            elem_id="gallery_sample",
+                            columns=3,
+                            rows=2,
+                            height="auto",
+                            object_fit="contain",
+                        )
+
+                    upload_gallery.select(set_prediction_image, inputs=[upload_gallery])
+                    sample_gallery.select(set_prediction_image, inputs=[sample_gallery])
+                with gr.Box():
+                    input_text = gr.TextArea(
+                        label="Classification Text",
+                        placeholder="Please enter comma separated text",
+                        interactive=True,
+                    )

+                    submit_btn = gr.Button(value="Submit")
+            with gr.Column():
+                with gr.Box():
+                    output = gr.Label(value=None, label="Classification Results")

+        submit_btn.click(predict, inputs=[input_text], outputs=[output])


+app.launch(debug=True, show_error=True)
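
The gallery-to-prediction wiring above relies on Gradio's selection events: .select() passes a gr.SelectData whose .index identifies the clicked thumbnail, and the handler stores the matching file path in a module-level variable that the Submit handler reads later. A stripped-down sketch of that same pattern (component names here are illustrative and not part of the commit):

import gradio as gr

chosen_image = None  # module-level slot, read by the submit handler

def remember_choice(evt: gr.SelectData, gallery):
    global chosen_image
    item = gallery[evt.index]
    chosen_image = item["name"] if isinstance(item, dict) else item[0]["name"]

def describe():
    return f"selected: {chosen_image}"

with gr.Blocks() as demo:
    gallery = gr.Gallery(value=["a.jpg", "b.jpg"])
    label = gr.Textbox()
    gallery.select(remember_choice, inputs=[gallery])
    gr.Button("Submit").click(describe, outputs=[label])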
experiments/clip.ipynb
ADDED
The diff for this file is too large to render.
See raw diff
experiments/sam.ipynb
ADDED
@@ -0,0 +1,121 @@
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "from fastsam import FastSAM, FastSAMPrompt\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "model = FastSAM(\"FastSAM-x.pt\")\n",
    "IMAGE_PATH = \"./sample_images/3.jpg\"\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "DEVICE = \"cpu\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\n",
      "image 1/1 /home/ravi.naik/learning/era/s19/sample_images/3.jpg: 704x1024 5 objects, 5524.6ms\n",
      "Speed: 77.9ms preprocess, 5524.6ms inference, 75.1ms postprocess per image at shape (1, 3, 1024, 1024)\n"
     ]
    }
   ],
   "source": [
    "everything_results = model(\n",
    "    IMAGE_PATH,\n",
    "    device=DEVICE,\n",
    "    retina_masks=True,\n",
    "    imgsz=1024,\n",
    "    conf=0.4,\n",
    "    iou=0.9,\n",
    ")\n",
    "prompt_process = FastSAMPrompt(IMAGE_PATH, everything_results, device=DEVICE)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|███████████████████████████████████████| 338M/338M [00:32<00:00, 10.9MiB/s]\n"
     ]
    }
   ],
   "source": [
    "# everything prompt\n",
    "ann = prompt_process.everything_prompt()\n",
    "\n",
    "# bbox default shape [0,0,0,0] -> [x1,y1,x2,y2]\n",
    "# ann = prompt_process.box_prompt(bbox=[[200, 200, 300, 300]])\n",
    "\n",
    "# text prompt\n",
    "ann = prompt_process.text_prompt(text=\"a photo of a dog\")\n",
    "\n",
    "# point prompt\n",
    "# points default [[0,0]] [[x1,y1],[x2,y2]]\n",
    "# point_label default [0] [1,0] 0:background, 1:foreground\n",
    "# ann = prompt_process.point_prompt(points=[[620, 360]], pointlabel=[1])\n",
    "\n",
    "prompt_process.plot(\n",
    "    annotations=ann,\n",
    "    output_path=\"./output/dog.jpg\",\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.12"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
fastsam/__init__.py
ADDED
@@ -0,0 +1,9 @@
# Ultralytics YOLO 🚀, AGPL-3.0 license

from .model import FastSAM
from .predict import FastSAMPredictor
from .prompt import FastSAMPrompt
# from .val import FastSAMValidator
from .decoder import FastSAMDecoder

__all__ = 'FastSAMPredictor', 'FastSAM', 'FastSAMPrompt', 'FastSAMDecoder'
fastsam/decoder.py
ADDED
@@ -0,0 +1,131 @@
from .model import FastSAM
import numpy as np
from PIL import Image
import clip
from typing import Optional, List, Tuple, Union


class FastSAMDecoder:
    def __init__(
        self,
        model: FastSAM,
        device: str='cpu',
        conf: float=0.4,
        iou: float=0.9,
        imgsz: int=1024,
        retina_masks: bool=True,
    ):
        self.model = model
        self.device = device
        self.retina_masks = retina_masks
        self.imgsz = imgsz
        self.conf = conf
        self.iou = iou
        self.image = None
        self.image_embedding = None

    def run_encoder(self, image):
        if isinstance(image, str):
            image = np.array(Image.open(image))
        self.image = image
        image_embedding = self.model(
            self.image,
            device=self.device,
            retina_masks=self.retina_masks,
            imgsz=self.imgsz,
            conf=self.conf,
            iou=self.iou
        )
        return image_embedding[0].numpy()

    def run_decoder(
        self,
        image_embedding,
        point_prompt: Optional[np.ndarray]=None,
        point_label: Optional[np.ndarray]=None,
        box_prompt: Optional[np.ndarray]=None,
        text_prompt: Optional[str]=None,
    ) -> np.ndarray:
        self.image_embedding = image_embedding
        if point_prompt is not None:
            ann = self.point_prompt(points=point_prompt, pointlabel=point_label)
            return ann
        elif box_prompt is not None:
            ann = self.box_prompt(bbox=box_prompt)
            return ann
        elif text_prompt is not None:
            ann = self.text_prompt(text=text_prompt)
            return ann
        else:
            return None

    def box_prompt(self, bbox):
        assert (bbox[2] != 0 and bbox[3] != 0)
        masks = self.image_embedding.masks.data
        target_height = self.image.shape[0]
        target_width = self.image.shape[1]
        h = masks.shape[1]
        w = masks.shape[2]
        if h != target_height or w != target_width:
            bbox = [
                int(bbox[0] * w / target_width),
                int(bbox[1] * h / target_height),
                int(bbox[2] * w / target_width),
                int(bbox[3] * h / target_height), ]
        bbox[0] = round(bbox[0]) if round(bbox[0]) > 0 else 0
        bbox[1] = round(bbox[1]) if round(bbox[1]) > 0 else 0
        bbox[2] = round(bbox[2]) if round(bbox[2]) < w else w
        bbox[3] = round(bbox[3]) if round(bbox[3]) < h else h

        # IoUs = torch.zeros(len(masks), dtype=torch.float32)
        bbox_area = (bbox[3] - bbox[1]) * (bbox[2] - bbox[0])

        masks_area = np.sum(masks[:, bbox[1]:bbox[3], bbox[0]:bbox[2]], axis=(1, 2))
        orig_masks_area = np.sum(masks, axis=(1, 2))

        union = bbox_area + orig_masks_area - masks_area
        IoUs = masks_area / union
        max_iou_index = np.argmax(IoUs)

        return np.array([masks[max_iou_index].cpu().numpy()])

    def point_prompt(self, points, pointlabel):  # numpy

        masks = self._format_results(self.image_embedding[0], 0)
        target_height = self.image.shape[0]
        target_width = self.image.shape[1]
        h = masks[0]['segmentation'].shape[0]
        w = masks[0]['segmentation'].shape[1]
        if h != target_height or w != target_width:
            points = [[int(point[0] * w / target_width), int(point[1] * h / target_height)] for point in points]
        onemask = np.zeros((h, w))
        masks = sorted(masks, key=lambda x: x['area'], reverse=True)
        for i, annotation in enumerate(masks):
            if type(annotation) == dict:
                mask = annotation['segmentation']
            else:
                mask = annotation
            for i, point in enumerate(points):
                if mask[point[1], point[0]] == 1 and pointlabel[i] == 1:
                    onemask[mask] = 1
                if mask[point[1], point[0]] == 1 and pointlabel[i] == 0:
                    onemask[mask] = 0
        onemask = onemask >= 1
        return np.array([onemask])

    def _format_results(self, result, filter=0):
        annotations = []
        n = len(result.masks.data)
        for i in range(n):
            annotation = {}
            mask = result.masks.data[i] == 1.0

            if np.sum(mask) < filter:
                continue
            annotation['id'] = i
            annotation['segmentation'] = mask
            annotation['bbox'] = result.boxes.data[i]
            annotation['score'] = result.boxes.conf[i]
            annotation['area'] = annotation['segmentation'].sum()
            annotations.append(annotation)
        return annotations
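
FastSAMDecoder splits inference into an encode step (run the model once per image) and cheap, prompt-specific decode steps over the cached result. A hedged sketch of the intended call pattern follows; the image path and box are illustrative, and this class is not actually used by app.py in this commit:

from fastsam import FastSAM, FastSAMDecoder

model = FastSAM("FastSAM-x.pt")
decoder = FastSAMDecoder(model, device="cpu", conf=0.4, iou=0.9, imgsz=1024)

embedding = decoder.run_encoder("sample_images/0.jpg")                 # runs FastSAM once
mask = decoder.run_decoder(embedding, box_prompt=[50, 50, 300, 300])   # [x1, y1, x2, y2]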
fastsam/model.py
ADDED
@@ -0,0 +1,106 @@
# Ultralytics YOLO 🚀, AGPL-3.0 license
"""
FastSAM model interface.

Usage - Predict:
    from ultralytics import FastSAM

    model = FastSAM('last.pt')
    results = model.predict('ultralytics/assets/bus.jpg')
"""

from ultralytics.yolo.cfg import get_cfg
from ultralytics.yolo.engine.exporter import Exporter
from ultralytics.yolo.engine.model import YOLO
from ultralytics.yolo.utils import DEFAULT_CFG, LOGGER, ROOT, is_git_dir
from ultralytics.yolo.utils.checks import check_imgsz

from ultralytics.yolo.utils.torch_utils import model_info, smart_inference_mode
from .predict import FastSAMPredictor


class FastSAM(YOLO):

    @smart_inference_mode()
    def predict(self, source=None, stream=False, **kwargs):
        """
        Perform prediction using the YOLO model.

        Args:
            source (str | int | PIL | np.ndarray): The source of the image to make predictions on.
                Accepts all source types accepted by the YOLO model.
            stream (bool): Whether to stream the predictions or not. Defaults to False.
            **kwargs : Additional keyword arguments passed to the predictor.
                Check the 'configuration' section in the documentation for all available options.

        Returns:
            (List[ultralytics.yolo.engine.results.Results]): The prediction results.
        """
        if source is None:
            source = ROOT / 'assets' if is_git_dir() else 'https://ultralytics.com/images/bus.jpg'
            LOGGER.warning(f"WARNING ⚠️ 'source' is missing. Using 'source={source}'.")
        overrides = self.overrides.copy()
        overrides['conf'] = 0.25
        overrides.update(kwargs)  # prefer kwargs
        overrides['mode'] = kwargs.get('mode', 'predict')
        assert overrides['mode'] in ['track', 'predict']
        overrides['save'] = kwargs.get('save', False)  # do not save by default if called in Python
        self.predictor = FastSAMPredictor(overrides=overrides)
        self.predictor.setup_model(model=self.model, verbose=False)
        try:
            return self.predictor(source, stream=stream)
        except Exception as e:
            return None

    def train(self, **kwargs):
        """Function trains models but raises an error as FastSAM models do not support training."""
        raise NotImplementedError("Currently, the training codes are on the way.")

    def val(self, **kwargs):
        """Run validation given dataset."""
        overrides = dict(task='segment', mode='val')
        overrides.update(kwargs)  # prefer kwargs
        args = get_cfg(cfg=DEFAULT_CFG, overrides=overrides)
        args.imgsz = check_imgsz(args.imgsz, max_dim=1)
        validator = FastSAM(args=args)
        validator(model=self.model)
        self.metrics = validator.metrics
        return validator.metrics

    @smart_inference_mode()
    def export(self, **kwargs):
        """
        Export model.

        Args:
            **kwargs : Any other args accepted by the predictors. To see all args check 'configuration' section in docs
        """
        overrides = dict(task='detect')
        overrides.update(kwargs)
        overrides['mode'] = 'export'
        args = get_cfg(cfg=DEFAULT_CFG, overrides=overrides)
        args.task = self.task
        if args.imgsz == DEFAULT_CFG.imgsz:
            args.imgsz = self.model.args['imgsz']  # use trained imgsz unless custom value is passed
        if args.batch == DEFAULT_CFG.batch:
            args.batch = 1  # default to 1 if not modified
        return Exporter(overrides=args)(model=self.model)

    def info(self, detailed=False, verbose=True):
        """
        Logs model info.

        Args:
            detailed (bool): Show detailed information about model.
            verbose (bool): Controls verbosity.
        """
        return model_info(self.model, detailed=detailed, verbose=verbose, imgsz=640)

    def __call__(self, source=None, stream=False, **kwargs):
        """Calls the 'predict' function with given arguments to perform object detection."""
        return self.predict(source, stream, **kwargs)

    def __getattr__(self, attr):
        """Raises error if object has no requested attribute."""
        name = self.__class__.__name__
        raise AttributeError(f"'{name}' object has no attribute '{attr}'. See valid attributes below.\n{self.__doc__}")
fastsam/predict.py
ADDED
@@ -0,0 +1,56 @@
import torch

from ultralytics.yolo.engine.results import Results
from ultralytics.yolo.utils import DEFAULT_CFG, ops
from ultralytics.yolo.v8.detect.predict import DetectionPredictor
from .utils import bbox_iou

class FastSAMPredictor(DetectionPredictor):

    def __init__(self, cfg=DEFAULT_CFG, overrides=None, _callbacks=None):
        super().__init__(cfg, overrides, _callbacks)
        self.args.task = 'segment'

    def postprocess(self, preds, img, orig_imgs):
        """TODO: filter by classes."""
        p = ops.non_max_suppression(preds[0],
                                    self.args.conf,
                                    self.args.iou,
                                    agnostic=self.args.agnostic_nms,
                                    max_det=self.args.max_det,
                                    nc=len(self.model.names),
                                    classes=self.args.classes)

        results = []
        if len(p) == 0 or len(p[0]) == 0:
            print("No object detected.")
            return results

        full_box = torch.zeros_like(p[0][0])
        full_box[2], full_box[3], full_box[4], full_box[6:] = img.shape[3], img.shape[2], 1.0, 1.0
        full_box = full_box.view(1, -1)
        critical_iou_index = bbox_iou(full_box[0][:4], p[0][:, :4], iou_thres=0.9, image_shape=img.shape[2:])
        if critical_iou_index.numel() != 0:
            full_box[0][4] = p[0][critical_iou_index][:, 4]
            full_box[0][6:] = p[0][critical_iou_index][:, 6:]
            p[0][critical_iou_index] = full_box

        proto = preds[1][-1] if len(preds[1]) == 3 else preds[1]  # second output is len 3 if pt, but only 1 if exported
        for i, pred in enumerate(p):
            orig_img = orig_imgs[i] if isinstance(orig_imgs, list) else orig_imgs
            path = self.batch[0]
            img_path = path[i] if isinstance(path, list) else path
            if not len(pred):  # save empty boxes
                results.append(Results(orig_img=orig_img, path=img_path, names=self.model.names, boxes=pred[:, :6]))
                continue
            if self.args.retina_masks:
                if not isinstance(orig_imgs, torch.Tensor):
                    pred[:, :4] = ops.scale_boxes(img.shape[2:], pred[:, :4], orig_img.shape)
                masks = ops.process_mask_native(proto[i], pred[:, 6:], pred[:, :4], orig_img.shape[:2])  # HWC
            else:
                masks = ops.process_mask(proto[i], pred[:, 6:], pred[:, :4], img.shape[2:], upsample=True)  # HWC
                if not isinstance(orig_imgs, torch.Tensor):
                    pred[:, :4] = ops.scale_boxes(img.shape[2:], pred[:, :4], orig_img.shape)
            results.append(
                Results(orig_img=orig_img, path=img_path, names=self.model.names, boxes=pred[:, :6], masks=masks))
        return results
fastsam/prompt.py
ADDED
@@ -0,0 +1,455 @@
import os
import sys
import cv2
import matplotlib.pyplot as plt
import numpy as np
import torch
from .utils import image_to_np_ndarray
from PIL import Image

try:
    import clip  # for linear_assignment

except (ImportError, AssertionError, AttributeError):
    from ultralytics.yolo.utils.checks import check_requirements

    check_requirements('git+https://github.com/openai/CLIP.git')  # required before installing lap from source
    import clip


class FastSAMPrompt:

    def __init__(self, image, results, device='cuda'):
        if isinstance(image, str) or isinstance(image, Image.Image):
            image = image_to_np_ndarray(image)
        self.device = device
        self.results = results
        self.img = image

    def _segment_image(self, image, bbox):
        if isinstance(image, Image.Image):
            image_array = np.array(image)
        else:
            image_array = image
        segmented_image_array = np.zeros_like(image_array)
        x1, y1, x2, y2 = bbox
        segmented_image_array[y1:y2, x1:x2] = image_array[y1:y2, x1:x2]
        segmented_image = Image.fromarray(segmented_image_array)
        black_image = Image.new('RGB', image.size, (255, 255, 255))
        # transparency_mask = np.zeros_like((), dtype=np.uint8)
        transparency_mask = np.zeros((image_array.shape[0], image_array.shape[1]), dtype=np.uint8)
        transparency_mask[y1:y2, x1:x2] = 255
        transparency_mask_image = Image.fromarray(transparency_mask, mode='L')
        black_image.paste(segmented_image, mask=transparency_mask_image)
        return black_image

    def _format_results(self, result, filter=0):
        annotations = []
        n = len(result.masks.data)
        for i in range(n):
            annotation = {}
            mask = result.masks.data[i] == 1.0

            if torch.sum(mask) < filter:
                continue
            annotation['id'] = i
            annotation['segmentation'] = mask.cpu().numpy()
            annotation['bbox'] = result.boxes.data[i]
            annotation['score'] = result.boxes.conf[i]
            annotation['area'] = annotation['segmentation'].sum()
            annotations.append(annotation)
        return annotations

    def filter_masks(annotations):  # filter overlapping masks
        annotations.sort(key=lambda x: x['area'], reverse=True)
        to_remove = set()
        for i in range(0, len(annotations)):
            a = annotations[i]
            for j in range(i + 1, len(annotations)):
                b = annotations[j]
                if i != j and j not in to_remove:
                    # check if
                    if b['area'] < a['area']:
                        if (a['segmentation'] & b['segmentation']).sum() / b['segmentation'].sum() > 0.8:
                            to_remove.add(j)

        return [a for i, a in enumerate(annotations) if i not in to_remove], to_remove

    def _get_bbox_from_mask(self, mask):
        mask = mask.astype(np.uint8)
        contours, hierarchy = cv2.findContours(mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
        x1, y1, w, h = cv2.boundingRect(contours[0])
        x2, y2 = x1 + w, y1 + h
        if len(contours) > 1:
            for b in contours:
                x_t, y_t, w_t, h_t = cv2.boundingRect(b)
                # Merge multiple bounding boxes into one.
                x1 = min(x1, x_t)
                y1 = min(y1, y_t)
                x2 = max(x2, x_t + w_t)
                y2 = max(y2, y_t + h_t)
            h = y2 - y1
            w = x2 - x1
        return [x1, y1, x2, y2]

    def plot_to_result(self,
                       annotations,
                       bboxes=None,
                       points=None,
                       point_label=None,
                       mask_random_color=True,
                       better_quality=True,
                       retina=False,
                       withContours=True) -> np.ndarray:
        if isinstance(annotations[0], dict):
            annotations = [annotation['segmentation'] for annotation in annotations]
        image = self.img
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
        original_h = image.shape[0]
        original_w = image.shape[1]
        if sys.platform == "darwin":
            plt.switch_backend("TkAgg")
        plt.figure(figsize=(original_w / 100, original_h / 100))
        # Add subplot with no margin.
        plt.subplots_adjust(top=1, bottom=0, right=1, left=0, hspace=0, wspace=0)
        plt.margins(0, 0)
        plt.gca().xaxis.set_major_locator(plt.NullLocator())
        plt.gca().yaxis.set_major_locator(plt.NullLocator())

        plt.imshow(image)
        if better_quality:
            if isinstance(annotations[0], torch.Tensor):
                annotations = np.array(annotations.cpu())
            for i, mask in enumerate(annotations):
                mask = cv2.morphologyEx(mask.astype(np.uint8), cv2.MORPH_CLOSE, np.ones((3, 3), np.uint8))
                annotations[i] = cv2.morphologyEx(mask.astype(np.uint8), cv2.MORPH_OPEN, np.ones((8, 8), np.uint8))
        if self.device == 'cpu':
            annotations = np.array(annotations)
            self.fast_show_mask(
                annotations,
                plt.gca(),
                random_color=mask_random_color,
                bboxes=bboxes,
                points=points,
                pointlabel=point_label,
                retinamask=retina,
                target_height=original_h,
                target_width=original_w,
            )
        else:
            if isinstance(annotations[0], np.ndarray):
                annotations = torch.from_numpy(annotations)
            self.fast_show_mask_gpu(
                annotations,
                plt.gca(),
                random_color=mask_random_color,
                bboxes=bboxes,
                points=points,
                pointlabel=point_label,
                retinamask=retina,
                target_height=original_h,
                target_width=original_w,
            )
        if isinstance(annotations, torch.Tensor):
            annotations = annotations.cpu().numpy()
        if withContours:
            contour_all = []
            temp = np.zeros((original_h, original_w, 1))
            for i, mask in enumerate(annotations):
                if type(mask) == dict:
                    mask = mask['segmentation']
                annotation = mask.astype(np.uint8)
                if not retina:
                    annotation = cv2.resize(
                        annotation,
                        (original_w, original_h),
                        interpolation=cv2.INTER_NEAREST,
                    )
                contours, hierarchy = cv2.findContours(annotation, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE)
                for contour in contours:
                    contour_all.append(contour)
            cv2.drawContours(temp, contour_all, -1, (255, 255, 255), 2)
            color = np.array([0 / 255, 0 / 255, 255 / 255, 0.8])
            contour_mask = temp / 255 * color.reshape(1, 1, -1)
            plt.imshow(contour_mask)

        plt.axis('off')
        fig = plt.gcf()
        plt.draw()

        try:
            buf = fig.canvas.tostring_rgb()
        except AttributeError:
            fig.canvas.draw()
            buf = fig.canvas.tostring_rgb()
        cols, rows = fig.canvas.get_width_height()
        img_array = np.frombuffer(buf, dtype=np.uint8).reshape(rows, cols, 3)
        result = cv2.cvtColor(img_array, cv2.COLOR_RGB2BGR)
        plt.close()
        return result

    # Remark for refactoring: IMO a function should do one thing only, storing the image and plotting should be separated and do not necessarily need to be class functions but standalone utility functions that the user can chain in his scripts to have more fine-grained control.
    def plot(self,
             annotations,
             output_path,
             bboxes=None,
             points=None,
             point_label=None,
             mask_random_color=True,
             better_quality=True,
             retina=False,
             withContours=True):
        if len(annotations) == 0:
            return None
        result = self.plot_to_result(
            annotations,
            bboxes,
            points,
            point_label,
            mask_random_color,
            better_quality,
            retina,
            withContours,
        )

        path = os.path.dirname(os.path.abspath(output_path))
        if not os.path.exists(path):
            os.makedirs(path)
        result = result[:, :, ::-1]
        cv2.imwrite(output_path, result)

    # CPU post process
    def fast_show_mask(
        self,
        annotation,
        ax,
        random_color=False,
        bboxes=None,
        points=None,
        pointlabel=None,
        retinamask=True,
        target_height=960,
        target_width=960,
    ):
        msak_sum = annotation.shape[0]
        height = annotation.shape[1]
        weight = annotation.shape[2]
        # Sort annotations based on area.
        areas = np.sum(annotation, axis=(1, 2))
        sorted_indices = np.argsort(areas)
        annotation = annotation[sorted_indices]

        index = (annotation != 0).argmax(axis=0)
        if random_color:
            color = np.random.random((msak_sum, 1, 1, 3))
        else:
            color = np.ones((msak_sum, 1, 1, 3)) * np.array([30 / 255, 144 / 255, 255 / 255])
        transparency = np.ones((msak_sum, 1, 1, 1)) * 0.6
        visual = np.concatenate([color, transparency], axis=-1)
        mask_image = np.expand_dims(annotation, -1) * visual

        show = np.zeros((height, weight, 4))
        h_indices, w_indices = np.meshgrid(np.arange(height), np.arange(weight), indexing='ij')
        indices = (index[h_indices, w_indices], h_indices, w_indices, slice(None))
        # Use vectorized indexing to update the values of 'show'.
        show[h_indices, w_indices, :] = mask_image[indices]
        if bboxes is not None:
            for bbox in bboxes:
                x1, y1, x2, y2 = bbox
                ax.add_patch(plt.Rectangle((x1, y1), x2 - x1, y2 - y1, fill=False, edgecolor='b', linewidth=1))
        # draw point
        if points is not None:
            plt.scatter(
                [point[0] for i, point in enumerate(points) if pointlabel[i] == 1],
                [point[1] for i, point in enumerate(points) if pointlabel[i] == 1],
                s=20,
                c='y',
            )
            plt.scatter(
                [point[0] for i, point in enumerate(points) if pointlabel[i] == 0],
                [point[1] for i, point in enumerate(points) if pointlabel[i] == 0],
                s=20,
                c='m',
            )

        if not retinamask:
            show = cv2.resize(show, (target_width, target_height), interpolation=cv2.INTER_NEAREST)
        ax.imshow(show)

    def fast_show_mask_gpu(
        self,
        annotation,
        ax,
        random_color=False,
        bboxes=None,
        points=None,
        pointlabel=None,
        retinamask=True,
        target_height=960,
        target_width=960,
    ):
        msak_sum = annotation.shape[0]
        height = annotation.shape[1]
        weight = annotation.shape[2]
        areas = torch.sum(annotation, dim=(1, 2))
        sorted_indices = torch.argsort(areas, descending=False)
        annotation = annotation[sorted_indices]
        # Find the index of the first non-zero value at each position.
        index = (annotation != 0).to(torch.long).argmax(dim=0)
        if random_color:
            color = torch.rand((msak_sum, 1, 1, 3)).to(annotation.device)
        else:
            color = torch.ones((msak_sum, 1, 1, 3)).to(annotation.device) * torch.tensor([
                30 / 255, 144 / 255, 255 / 255]).to(annotation.device)
        transparency = torch.ones((msak_sum, 1, 1, 1)).to(annotation.device) * 0.6
        visual = torch.cat([color, transparency], dim=-1)
        mask_image = torch.unsqueeze(annotation, -1) * visual
        # Select data according to the index. The index indicates which batch's data to choose at each position, converting the mask_image into a single batch form.
        show = torch.zeros((height, weight, 4)).to(annotation.device)
        try:
            h_indices, w_indices = torch.meshgrid(torch.arange(height), torch.arange(weight), indexing='ij')
        except:
            h_indices, w_indices = torch.meshgrid(torch.arange(height), torch.arange(weight))
        indices = (index[h_indices, w_indices], h_indices, w_indices, slice(None))
        # Use vectorized indexing to update the values of 'show'.
        show[h_indices, w_indices, :] = mask_image[indices]
        show_cpu = show.cpu().numpy()
        if bboxes is not None:
            for bbox in bboxes:
                x1, y1, x2, y2 = bbox
                ax.add_patch(plt.Rectangle((x1, y1), x2 - x1, y2 - y1, fill=False, edgecolor='b', linewidth=1))
        # draw point
        if points is not None:
            plt.scatter(
                [point[0] for i, point in enumerate(points) if pointlabel[i] == 1],
                [point[1] for i, point in enumerate(points) if pointlabel[i] == 1],
                s=20,
                c='y',
            )
            plt.scatter(
                [point[0] for i, point in enumerate(points) if pointlabel[i] == 0],
                [point[1] for i, point in enumerate(points) if pointlabel[i] == 0],
                s=20,
                c='m',
            )
        if not retinamask:
            show_cpu = cv2.resize(show_cpu, (target_width, target_height), interpolation=cv2.INTER_NEAREST)
        ax.imshow(show_cpu)

    # clip
    @torch.no_grad()
    def retrieve(self, model, preprocess, elements, search_text: str, device) -> int:
        preprocessed_images = [preprocess(image).to(device) for image in elements]
        tokenized_text = clip.tokenize([search_text]).to(device)
        stacked_images = torch.stack(preprocessed_images)
        image_features = model.encode_image(stacked_images)
        text_features = model.encode_text(tokenized_text)
        image_features /= image_features.norm(dim=-1, keepdim=True)
        text_features /= text_features.norm(dim=-1, keepdim=True)
        probs = 100.0 * image_features @ text_features.T
        return probs[:, 0].softmax(dim=0)

    def _crop_image(self, format_results):

        image = Image.fromarray(cv2.cvtColor(self.img, cv2.COLOR_BGR2RGB))
        ori_w, ori_h = image.size
        annotations = format_results
        mask_h, mask_w = annotations[0]['segmentation'].shape
        if ori_w != mask_w or ori_h != mask_h:
            image = image.resize((mask_w, mask_h))
        cropped_boxes = []
        cropped_images = []
        not_crop = []
        filter_id = []
        # annotations, _ = filter_masks(annotations)
        # filter_id = list(_)
        for _, mask in enumerate(annotations):
            if np.sum(mask['segmentation']) <= 100:
                filter_id.append(_)
                continue
            bbox = self._get_bbox_from_mask(mask['segmentation'])  # bbox of the mask
            cropped_boxes.append(self._segment_image(image, bbox))
            # cropped_boxes.append(segment_image(image,mask["segmentation"]))
            cropped_images.append(bbox)  # Save the bounding box of the cropped image.

        return cropped_boxes, cropped_images, not_crop, filter_id, annotations

    def box_prompt(self, bbox=None, bboxes=None):
        if self.results == None:
            return []
        assert bbox or bboxes
        if bboxes is None:
            bboxes = [bbox]
        max_iou_index = []
        for bbox in bboxes:
            assert (bbox[2] != 0 and bbox[3] != 0)
            masks = self.results[0].masks.data
            target_height = self.img.shape[0]
            target_width = self.img.shape[1]
            h = masks.shape[1]
            w = masks.shape[2]
            if h != target_height or w != target_width:
                bbox = [
                    int(bbox[0] * w / target_width),
                    int(bbox[1] * h / target_height),
                    int(bbox[2] * w / target_width),
                    int(bbox[3] * h / target_height), ]
            bbox[0] = round(bbox[0]) if round(bbox[0]) > 0 else 0
            bbox[1] = round(bbox[1]) if round(bbox[1]) > 0 else 0
            bbox[2] = round(bbox[2]) if round(bbox[2]) < w else w
            bbox[3] = round(bbox[3]) if round(bbox[3]) < h else h

            # IoUs = torch.zeros(len(masks), dtype=torch.float32)
            bbox_area = (bbox[3] - bbox[1]) * (bbox[2] - bbox[0])

            masks_area = torch.sum(masks[:, bbox[1]:bbox[3], bbox[0]:bbox[2]], dim=(1, 2))
            orig_masks_area = torch.sum(masks, dim=(1, 2))

            union = bbox_area + orig_masks_area - masks_area
            IoUs = masks_area / union
            max_iou_index.append(int(torch.argmax(IoUs)))
        max_iou_index = list(set(max_iou_index))
        return np.array(masks[max_iou_index].cpu().numpy())

    def point_prompt(self, points, pointlabel):  # numpy
        if self.results == None:
            return []
        masks = self._format_results(self.results[0], 0)
        target_height = self.img.shape[0]
        target_width = self.img.shape[1]
        h = masks[0]['segmentation'].shape[0]
        w = masks[0]['segmentation'].shape[1]
        if h != target_height or w != target_width:
            points = [[int(point[0] * w / target_width), int(point[1] * h / target_height)] for point in points]
        onemask = np.zeros((h, w))
        masks = sorted(masks, key=lambda x: x['area'], reverse=True)
        for i, annotation in enumerate(masks):
            if type(annotation) == dict:
                mask = annotation['segmentation']
            else:
                mask = annotation
            for i, point in enumerate(points):
                if mask[point[1], point[0]] == 1 and pointlabel[i] == 1:
                    onemask[mask] = 1
                if mask[point[1], point[0]] == 1 and pointlabel[i] == 0:
                    onemask[mask] = 0
        onemask = onemask >= 1
        return np.array([onemask])

    def text_prompt(self, text):
        if self.results == None:
            return []
        format_results = self._format_results(self.results[0], 0)
        cropped_boxes, cropped_images, not_crop, filter_id, annotations = self._crop_image(format_results)
        clip_model, preprocess = clip.load('ViT-B/32', device=self.device)
        scores = self.retrieve(clip_model, preprocess, cropped_boxes, text, device=self.device)
        max_idx = scores.argsort()
        max_idx = max_idx[-1]
        max_idx += sum(np.array(filter_id) <= int(max_idx))
        return np.array([annotations[max_idx]['segmentation']])

    def everything_prompt(self):
        if self.results == None:
            return []
        return self.results[0].masks.data
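
app.py only uses everything_prompt and text_prompt, but the class also supports box and point prompts. A short sketch, assuming pp is a FastSAMPrompt constructed as in the notebook above (coordinates are illustrative):

# pp = FastSAMPrompt(IMAGE_PATH, everything_results, device="cpu")   # as built in sam.ipynb
ann = pp.box_prompt(bbox=[200, 200, 300, 300])              # best mask by IoU with [x1, y1, x2, y2]
ann = pp.point_prompt(points=[[620, 360]], pointlabel=[1])  # 1 = foreground, 0 = background
pp.plot(annotations=ann, output_path="output/prompted.jpg")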
fastsam/utils.py
ADDED
@@ -0,0 +1,86 @@
import numpy as np
import torch
from PIL import Image


def adjust_bboxes_to_image_border(boxes, image_shape, threshold=20):
    '''Adjust bounding boxes to stick to image border if they are within a certain threshold.
    Args:
        boxes: (n, 4)
        image_shape: (height, width)
        threshold: pixel threshold
    Returns:
        adjusted_boxes: adjusted bounding boxes
    '''

    # Image dimensions
    h, w = image_shape

    # Adjust boxes
    boxes[:, 0] = torch.where(boxes[:, 0] < threshold, torch.tensor(
        0, dtype=torch.float, device=boxes.device), boxes[:, 0])  # x1
    boxes[:, 1] = torch.where(boxes[:, 1] < threshold, torch.tensor(
        0, dtype=torch.float, device=boxes.device), boxes[:, 1])  # y1
    boxes[:, 2] = torch.where(boxes[:, 2] > w - threshold, torch.tensor(
        w, dtype=torch.float, device=boxes.device), boxes[:, 2])  # x2
    boxes[:, 3] = torch.where(boxes[:, 3] > h - threshold, torch.tensor(
        h, dtype=torch.float, device=boxes.device), boxes[:, 3])  # y2

    return boxes


def convert_box_xywh_to_xyxy(box):
    x1 = box[0]
    y1 = box[1]
    x2 = box[0] + box[2]
    y2 = box[1] + box[3]
    return [x1, y1, x2, y2]


def bbox_iou(box1, boxes, iou_thres=0.9, image_shape=(640, 640), raw_output=False):
    '''Compute the Intersection-Over-Union of a bounding box with respect to an array of other bounding boxes.
    Args:
        box1: (4, )
        boxes: (n, 4)
    Returns:
        high_iou_indices: Indices of boxes with IoU > thres
    '''
    boxes = adjust_bboxes_to_image_border(boxes, image_shape)
    # obtain coordinates for intersections
    x1 = torch.max(box1[0], boxes[:, 0])
    y1 = torch.max(box1[1], boxes[:, 1])
    x2 = torch.min(box1[2], boxes[:, 2])
    y2 = torch.min(box1[3], boxes[:, 3])

    # compute the area of intersection
    intersection = (x2 - x1).clamp(0) * (y2 - y1).clamp(0)

    # compute the area of both individual boxes
    box1_area = (box1[2] - box1[0]) * (box1[3] - box1[1])
    box2_area = (boxes[:, 2] - boxes[:, 0]) * (boxes[:, 3] - boxes[:, 1])

    # compute the area of union
    union = box1_area + box2_area - intersection

    # compute the IoU
    iou = intersection / union  # Should be shape (n, )
    if raw_output:
        if iou.numel() == 0:
            return 0
        return iou

    # get indices of boxes with IoU > thres
    high_iou_indices = torch.nonzero(iou > iou_thres).flatten()

    return high_iou_indices


def image_to_np_ndarray(image):
    if type(image) is str:
        return np.array(Image.open(image))
    elif issubclass(type(image), Image.Image):
        return np.array(image)
    elif type(image) is np.ndarray:
        return image
    return None
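
bbox_iou snaps near-border boxes to the image edge (via adjust_bboxes_to_image_border) and, by default, returns the indices of boxes whose IoU exceeds the threshold; this is how FastSAMPredictor.postprocess spots detections that effectively cover the whole frame. A small worked example:

import torch
from fastsam.utils import bbox_iou, convert_box_xywh_to_xyxy

full_frame = torch.tensor([0.0, 0.0, 640.0, 640.0])
candidates = torch.tensor([
    [5.0, 8.0, 636.0, 630.0],      # within 20 px of every border -> snapped to the frame
    [100.0, 100.0, 200.0, 220.0],  # small object, low overlap with the full frame
])
idx = bbox_iou(full_frame, candidates.clone(), iou_thres=0.9, image_shape=(640, 640))
print(idx)  # tensor([0]) -- only the near-full box passes the 0.9 IoU threshold

print(convert_box_xywh_to_xyxy([10, 20, 30, 40]))  # [10, 20, 40, 60]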
requirements.txt
CHANGED
@@ -1,4 +1,21 @@
 transformers
+# Base-----------------------------------
+matplotlib>=3.2.2
+opencv-python>=4.6.0
+Pillow>=7.1.2
+PyYAML>=5.3.1
+requests>=2.23.0
+scipy>=1.4.1
+torch>=1.7.0
+torchvision>=0.8.1
+tqdm>=4.64.0
+
+pandas>=1.1.4
+seaborn>=0.11.0
+
+gradio
+
+# Ultralytics-----------------------------------
+ultralytics == 8.0.120
+
+clip @ git+https://github.com/openai/CLIP.git
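
The pin on ultralytics == 8.0.120 matters here: the vendored fastsam package imports the pre-8.1 ultralytics.yolo.* module layout, which newer releases reorganize. A quick environment check (sketch):

import ultralytics

assert ultralytics.__version__ == "8.0.120", "fastsam/ expects the older ultralytics.yolo layout"
from ultralytics.yolo.engine.model import YOLO  # same import used by fastsam/model.py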