File size: 10,929 Bytes
8683813
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
# Copyright (c) Facebook, Inc. and its affiliates.
import itertools
import logging
import numpy as np
from collections import OrderedDict
from collections.abc import Mapping
from typing import Dict, List, Optional, Tuple, Union
import torch
from omegaconf import DictConfig, OmegaConf
from torch import Tensor, nn

from annotator.oneformer.detectron2.layers import ShapeSpec
from annotator.oneformer.detectron2.structures import BitMasks, Boxes, ImageList, Instances
from annotator.oneformer.detectron2.utils.events import get_event_storage

from .backbone import Backbone

logger = logging.getLogger(__name__)


def _to_container(cfg):
    """
    Wrap a config in mmcv's ``ConfigDict``.

    mmdet will assert the type of dict/list, so an omegaconf ``DictConfig``
    is first turned into plain containers (with interpolations resolved).
    Any other input is handed to ``ConfigDict`` unchanged.
    """
    plain_cfg = OmegaConf.to_container(cfg, resolve=True) if isinstance(cfg, DictConfig) else cfg

    # Imported lazily so that mmcv is only required when this helper is used.
    from mmcv.utils import ConfigDict as _MMConfigDict

    return _MMConfigDict(plain_cfg)


class MMDetBackbone(Backbone):
    """
    Wrapper of mmdetection backbones to use in detectron2.

    mmdet backbones produce list/tuple of tensors, while detectron2 backbones
    produce a dict of tensors. This class wraps the given backbone (and the
    optional neck) to produce output in detectron2's convention, so it can be
    used in place of detectron2 backbones.
    """

    def __init__(
        self,
        backbone: Union[nn.Module, Mapping],
        neck: Union[nn.Module, Mapping, None] = None,
        *,
        output_shapes: List[ShapeSpec],
        output_names: Optional[List[str]] = None,
    ):
        """
        Args:
            backbone: either a backbone module or a mmdet config dict that defines a
                backbone. The backbone takes a 4D image tensor and returns a
                sequence of tensors.
            neck: either a backbone module or a mmdet config dict that defines a
                neck. The neck takes outputs of backbone and returns a
                sequence of tensors. If None, no neck is used.
            output_shapes: shape for every output of the backbone (or neck, if given).
                stride and channels are often needed.
            output_names: names for every output of the backbone (or neck, if given).
                By default, will use "out0", "out1", ...

        Raises:
            ValueError: if `output_names` is given and its length differs from
                the length of `output_shapes`.
        """
        super().__init__()

        # forward() and output_shape() pair names with outputs/shapes through
        # zip(), which silently truncates to the shorter sequence. Reject a
        # mismatch up front (before any weights are built or initialized)
        # instead of silently dropping or mislabeling feature maps later.
        if output_names and len(output_names) != len(output_shapes):
            raise ValueError(
                "Length of output_names does not match output_shapes: "
                f"{len(output_names)} != {len(output_shapes)}"
            )

        if isinstance(backbone, Mapping):
            from mmdet.models import build_backbone

            backbone = build_backbone(_to_container(backbone))
        self.backbone = backbone

        if isinstance(neck, Mapping):
            from mmdet.models import build_neck

            neck = build_neck(_to_container(neck))
        self.neck = neck

        # "Neck" weights, if any, are part of neck itself. This is the interface
        # of mmdet so we follow it. Reference:
        # https://github.com/open-mmlab/mmdetection/blob/master/mmdet/models/detectors/two_stage.py
        logger.info("Initializing mmdet backbone weights...")
        self.backbone.init_weights()
        # train() in mmdet modules is non-trivial, and has to be explicitly
        # called. Reference:
        # https://github.com/open-mmlab/mmdetection/blob/master/mmdet/models/backbones/resnet.py
        self.backbone.train()
        if self.neck is not None:
            logger.info("Initializing mmdet neck weights ...")
            if isinstance(self.neck, nn.Sequential):
                # A sequential neck is initialized stage by stage.
                for stage in self.neck:
                    stage.init_weights()
            else:
                self.neck.init_weights()
            self.neck.train()

        self._output_shapes = output_shapes
        if not output_names:
            output_names = [f"out{i}" for i in range(len(output_shapes))]
        self._output_names = output_names

    def forward(self, x) -> Dict[str, Tensor]:
        """
        Run the backbone (then the neck, if any) on `x`.

        Returns:
            dict mapping each configured output name to the corresponding
            tensor produced by the wrapped module(s).

        Raises:
            ValueError: if the number of produced outputs differs from the
                number of configured output shapes.
        """
        outs = self.backbone(x)
        if self.neck is not None:
            outs = self.neck(outs)
        assert isinstance(
            outs, (list, tuple)
        ), "mmdet backbone should return a list/tuple of tensors!"
        if len(outs) != len(self._output_shapes):
            raise ValueError(
                "Length of output_shapes does not match outputs from the mmdet backbone: "
                f"{len(outs)} != {len(self._output_shapes)}"
            )
        return dict(zip(self._output_names, outs))

    def output_shape(self) -> Dict[str, ShapeSpec]:
        """Return a dict mapping each output name to its configured ShapeSpec."""
        return dict(zip(self._output_names, self._output_shapes))


class MMDetDetector(nn.Module):
    """
    Adapter around a mmdetection detector, for detection and instance segmentation.

    Inputs and outputs of this class follow detectron2's convention, so a
    mmdetection model can be trained and evaluated in detectron2.
    """

    def __init__(
        self,
        detector: Union[nn.Module, Mapping],
        *,
        # Default is 32 regardless of model:
        # https://github.com/open-mmlab/mmdetection/tree/master/configs/_base_/datasets
        size_divisibility=32,
        pixel_mean: Tuple[float],
        pixel_std: Tuple[float],
    ):
        """
        Args:
            detector: a mmdet detector, or a mmdet config dict that defines a detector.
            size_divisibility: pad input images to multiple of this number
            pixel_mean: per-channel mean to normalize input image
            pixel_std: per-channel stddev to normalize input image
        """
        super().__init__()
        if isinstance(detector, Mapping):
            from mmdet.models import build_detector

            detector = build_detector(_to_container(detector))
        self.detector = detector
        self.detector.init_weights()
        self.size_divisibility = size_divisibility

        # Stored as non-persistent (C, 1, 1) buffers: they follow the module
        # across devices but are not written to the state dict.
        mean = torch.tensor(pixel_mean).view(-1, 1, 1)
        std = torch.tensor(pixel_std).view(-1, 1, 1)
        self.register_buffer("pixel_mean", mean, persistent=False)
        self.register_buffer("pixel_std", std, persistent=False)
        assert (
            self.pixel_mean.shape == self.pixel_std.shape
        ), f"{self.pixel_mean} and {self.pixel_std} have different shapes!"

    def forward(self, batched_inputs: List[Dict[str, torch.Tensor]]):
        """
        Run the wrapped detector on a batch of detectron2-style inputs.

        Each element of `batched_inputs` must have an "image" entry (C, H, W).
        Either all or none of the elements carry "height"/"width"; when present
        they are used as the output resolution. In training mode every element
        must also have "instances".

        Returns:
            In training mode, the dict produced by `_parse_losses`.
            Otherwise a list with one ``{"instances": Instances}`` dict per input.

        Raises:
            ValueError: if only some of the inputs carry the original height.
        """
        normalized = [
            (sample["image"].to(self.device) - self.pixel_mean) / self.pixel_std
            for sample in batched_inputs
        ]
        images = ImageList.from_tensors(
            normalized, size_divisibility=self.size_divisibility
        ).tensor

        has_orig_size = {"height" in sample for sample in batched_inputs}
        if len(has_orig_size) != 1:
            raise ValueError("Some inputs have original height/width, but some don't!")
        (rescale,) = has_orig_size

        # The padded batch shape is shared by every image in the batch.
        pad_h, pad_w = images.shape[-2:]
        metas = []
        output_shapes = []
        for sample in batched_inputs:
            c, h, w = sample["image"].shape
            if rescale:
                scale_factor = np.array(
                    [w / sample["width"], h / sample["height"]] * 2, dtype="float32"
                )
                out_hw = (sample["height"], sample["width"])
                ori_shape = out_hw + (c,)
            else:
                scale_factor = 1.0
                out_hw = (h, w)
                ori_shape = (h, w, c)
            output_shapes.append(out_hw)
            metas.append(
                {
                    "img_shape": (h, w, c),
                    "ori_shape": ori_shape,
                    "scale_factor": scale_factor,
                    "flip": False,
                    "pad_shape": (pad_h, pad_w, c),
                }
            )

        if not self.training:
            predictions = self.detector.simple_test(images, metas, rescale=rescale)
            return [
                {"instances": _convert_mmdet_result(pred, out_hw)}
                for pred, out_hw in zip(predictions, output_shapes)
            ]

        gt_instances = [sample["instances"].to(self.device) for sample in batched_inputs]
        mask_kwargs = {}
        # Whether masks are passed along is decided by the first image's annotations.
        if gt_instances[0].has("gt_masks"):
            from mmdet.core import PolygonMasks as mm_PolygonMasks, BitmapMasks as mm_BitMasks

            def convert_mask(m, shape):
                # mmdet mask format
                if isinstance(m, BitMasks):
                    return mm_BitMasks(m.tensor.cpu().numpy(), shape[0], shape[1])
                return mm_PolygonMasks(m.polygons, shape[0], shape[1])

            mask_kwargs["gt_masks"] = [
                convert_mask(inst.gt_masks, inst.image_size) for inst in gt_instances
            ]

        losses_and_metrics = self.detector.forward_train(
            images,
            metas,
            [inst.gt_boxes.tensor for inst in gt_instances],
            [inst.gt_classes for inst in gt_instances],
            **mask_kwargs,
        )
        return _parse_losses(losses_and_metrics)

    @property
    def device(self):
        """Device of the normalization buffers; inputs are moved to it in forward()."""
        return self.pixel_mean.device


# Reference: show_result() in
# https://github.com/open-mmlab/mmdetection/blob/master/mmdet/models/detectors/base.py
def _convert_mmdet_result(result, shape: Tuple[int, int]) -> Instances:
    """
    Convert the mmdet result of one image into a detectron2 ``Instances``.

    Args:
        result: either the bbox result alone, or a ``(bbox_result, segm_result)``
            tuple. If ``segm_result`` is itself a tuple, only its first element
            is used. Both are indexed by class: entry ``i`` holds the detections
            of class ``i``.
        shape: image size passed to ``Instances``.

    Returns:
        Instances with ``pred_boxes``, ``scores`` and ``pred_classes``;
        ``pred_masks`` is added only when a segm result is present and there
        is at least one detection.
    """
    bbox_result, segm_result = result, None
    if isinstance(result, tuple):
        bbox_result, segm_result = result
        if isinstance(segm_result, tuple):
            segm_result = segm_result[0]

    # Stack the per-class arrays into one Nx5 tensor: 4 box columns + score.
    stacked = torch.from_numpy(np.vstack(bbox_result))
    # The class id of a detection is the position of its array in bbox_result.
    per_class_labels = [
        torch.full((cls_dets.shape[0],), cls_idx, dtype=torch.int32)
        for cls_idx, cls_dets in enumerate(bbox_result)
    ]
    labels = torch.cat(per_class_labels)

    inst = Instances(shape)
    inst.pred_boxes = Boxes(stacked[:, :4])
    inst.scores = stacked[:, -1]
    inst.pred_classes = labels

    if segm_result is None or len(labels) == 0:
        return inst

    # Flatten the per-class mask lists in the same class order as the boxes.
    masks = [
        torch.from_numpy(mask) if isinstance(mask, np.ndarray) else mask
        for mask in itertools.chain.from_iterable(segm_result)
    ]
    inst.pred_masks = torch.stack(masks, dim=0)
    return inst


# reference: https://github.com/open-mmlab/mmdetection/blob/master/mmdet/models/detectors/base.py
def _parse_losses(losses: Dict[str, Tensor]) -> Dict[str, Tensor]:
    log_vars = OrderedDict()
    for loss_name, loss_value in losses.items():
        if isinstance(loss_value, torch.Tensor):
            log_vars[loss_name] = loss_value.mean()
        elif isinstance(loss_value, list):
            log_vars[loss_name] = sum(_loss.mean() for _loss in loss_value)
        else:
            raise TypeError(f"{loss_name} is not a tensor or list of tensors")

        if "loss" not in loss_name:
            # put metrics to storage; don't return them
            storage = get_event_storage()
            value = log_vars.pop(loss_name).cpu().item()
            storage.put_scalar(loss_name, value)
    return log_vars