File size: 10,929 Bytes
8683813 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 |
# Copyright (c) Facebook, Inc. and its affiliates.
import itertools
import logging
import numpy as np
from collections import OrderedDict
from collections.abc import Mapping
from typing import Dict, List, Optional, Tuple, Union
import torch
from omegaconf import DictConfig, OmegaConf
from torch import Tensor, nn
from annotator.oneformer.detectron2.layers import ShapeSpec
from annotator.oneformer.detectron2.structures import BitMasks, Boxes, ImageList, Instances
from annotator.oneformer.detectron2.utils.events import get_event_storage
from .backbone import Backbone
logger = logging.getLogger(__name__)
def _to_container(cfg):
"""
mmdet will assert the type of dict/list.
So convert omegaconf objects to dict/list.
"""
if isinstance(cfg, DictConfig):
cfg = OmegaConf.to_container(cfg, resolve=True)
from mmcv.utils import ConfigDict
return ConfigDict(cfg)
class MMDetBackbone(Backbone):
"""
Wrapper of mmdetection backbones to use in detectron2.
mmdet backbones produce list/tuple of tensors, while detectron2 backbones
produce a dict of tensors. This class wraps the given backbone to produce
output in detectron2's convention, so it can be used in place of detectron2
backbones.
"""
def __init__(
self,
backbone: Union[nn.Module, Mapping],
neck: Union[nn.Module, Mapping, None] = None,
*,
output_shapes: List[ShapeSpec],
output_names: Optional[List[str]] = None,
):
"""
Args:
backbone: either a backbone module or a mmdet config dict that defines a
backbone. The backbone takes a 4D image tensor and returns a
sequence of tensors.
neck: either a backbone module or a mmdet config dict that defines a
neck. The neck takes outputs of backbone and returns a
sequence of tensors. If None, no neck is used.
output_shapes: shape for every output of the backbone (or neck, if given).
stride and channels are often needed.
output_names: names for every output of the backbone (or neck, if given).
By default, will use "out0", "out1", ...
"""
super().__init__()
if isinstance(backbone, Mapping):
from mmdet.models import build_backbone
backbone = build_backbone(_to_container(backbone))
self.backbone = backbone
if isinstance(neck, Mapping):
from mmdet.models import build_neck
neck = build_neck(_to_container(neck))
self.neck = neck
# "Neck" weights, if any, are part of neck itself. This is the interface
# of mmdet so we follow it. Reference:
# https://github.com/open-mmlab/mmdetection/blob/master/mmdet/models/detectors/two_stage.py
logger.info("Initializing mmdet backbone weights...")
self.backbone.init_weights()
# train() in mmdet modules is non-trivial, and has to be explicitly
# called. Reference:
# https://github.com/open-mmlab/mmdetection/blob/master/mmdet/models/backbones/resnet.py
self.backbone.train()
if self.neck is not None:
logger.info("Initializing mmdet neck weights ...")
if isinstance(self.neck, nn.Sequential):
for m in self.neck:
m.init_weights()
else:
self.neck.init_weights()
self.neck.train()
self._output_shapes = output_shapes
if not output_names:
output_names = [f"out{i}" for i in range(len(output_shapes))]
self._output_names = output_names
def forward(self, x) -> Dict[str, Tensor]:
outs = self.backbone(x)
if self.neck is not None:
outs = self.neck(outs)
assert isinstance(
outs, (list, tuple)
), "mmdet backbone should return a list/tuple of tensors!"
if len(outs) != len(self._output_shapes):
raise ValueError(
"Length of output_shapes does not match outputs from the mmdet backbone: "
f"{len(outs)} != {len(self._output_shapes)}"
)
return {k: v for k, v in zip(self._output_names, outs)}
def output_shape(self) -> Dict[str, ShapeSpec]:
return {k: v for k, v in zip(self._output_names, self._output_shapes)}
class MMDetDetector(nn.Module):
"""
Wrapper of a mmdetection detector model, for detection and instance segmentation.
Input/output formats of this class follow detectron2's convention, so a
mmdetection model can be trained and evaluated in detectron2.
"""
def __init__(
self,
detector: Union[nn.Module, Mapping],
*,
# Default is 32 regardless of model:
# https://github.com/open-mmlab/mmdetection/tree/master/configs/_base_/datasets
size_divisibility=32,
pixel_mean: Tuple[float],
pixel_std: Tuple[float],
):
"""
Args:
detector: a mmdet detector, or a mmdet config dict that defines a detector.
size_divisibility: pad input images to multiple of this number
pixel_mean: per-channel mean to normalize input image
pixel_std: per-channel stddev to normalize input image
"""
super().__init__()
if isinstance(detector, Mapping):
from mmdet.models import build_detector
detector = build_detector(_to_container(detector))
self.detector = detector
self.detector.init_weights()
self.size_divisibility = size_divisibility
self.register_buffer("pixel_mean", torch.tensor(pixel_mean).view(-1, 1, 1), False)
self.register_buffer("pixel_std", torch.tensor(pixel_std).view(-1, 1, 1), False)
assert (
self.pixel_mean.shape == self.pixel_std.shape
), f"{self.pixel_mean} and {self.pixel_std} have different shapes!"
def forward(self, batched_inputs: List[Dict[str, torch.Tensor]]):
images = [x["image"].to(self.device) for x in batched_inputs]
images = [(x - self.pixel_mean) / self.pixel_std for x in images]
images = ImageList.from_tensors(images, size_divisibility=self.size_divisibility).tensor
metas = []
rescale = {"height" in x for x in batched_inputs}
if len(rescale) != 1:
raise ValueError("Some inputs have original height/width, but some don't!")
rescale = list(rescale)[0]
output_shapes = []
for input in batched_inputs:
meta = {}
c, h, w = input["image"].shape
meta["img_shape"] = meta["ori_shape"] = (h, w, c)
if rescale:
scale_factor = np.array(
[w / input["width"], h / input["height"]] * 2, dtype="float32"
)
ori_shape = (input["height"], input["width"])
output_shapes.append(ori_shape)
meta["ori_shape"] = ori_shape + (c,)
else:
scale_factor = 1.0
output_shapes.append((h, w))
meta["scale_factor"] = scale_factor
meta["flip"] = False
padh, padw = images.shape[-2:]
meta["pad_shape"] = (padh, padw, c)
metas.append(meta)
if self.training:
gt_instances = [x["instances"].to(self.device) for x in batched_inputs]
if gt_instances[0].has("gt_masks"):
from mmdet.core import PolygonMasks as mm_PolygonMasks, BitmapMasks as mm_BitMasks
def convert_mask(m, shape):
# mmdet mask format
if isinstance(m, BitMasks):
return mm_BitMasks(m.tensor.cpu().numpy(), shape[0], shape[1])
else:
return mm_PolygonMasks(m.polygons, shape[0], shape[1])
gt_masks = [convert_mask(x.gt_masks, x.image_size) for x in gt_instances]
losses_and_metrics = self.detector.forward_train(
images,
metas,
[x.gt_boxes.tensor for x in gt_instances],
[x.gt_classes for x in gt_instances],
gt_masks=gt_masks,
)
else:
losses_and_metrics = self.detector.forward_train(
images,
metas,
[x.gt_boxes.tensor for x in gt_instances],
[x.gt_classes for x in gt_instances],
)
return _parse_losses(losses_and_metrics)
else:
results = self.detector.simple_test(images, metas, rescale=rescale)
results = [
{"instances": _convert_mmdet_result(r, shape)}
for r, shape in zip(results, output_shapes)
]
return results
@property
def device(self):
return self.pixel_mean.device
# Reference: show_result() in
# https://github.com/open-mmlab/mmdetection/blob/master/mmdet/models/detectors/base.py
def _convert_mmdet_result(result, shape: Tuple[int, int]) -> Instances:
if isinstance(result, tuple):
bbox_result, segm_result = result
if isinstance(segm_result, tuple):
segm_result = segm_result[0]
else:
bbox_result, segm_result = result, None
bboxes = torch.from_numpy(np.vstack(bbox_result)) # Nx5
bboxes, scores = bboxes[:, :4], bboxes[:, -1]
labels = [
torch.full((bbox.shape[0],), i, dtype=torch.int32) for i, bbox in enumerate(bbox_result)
]
labels = torch.cat(labels)
inst = Instances(shape)
inst.pred_boxes = Boxes(bboxes)
inst.scores = scores
inst.pred_classes = labels
if segm_result is not None and len(labels) > 0:
segm_result = list(itertools.chain(*segm_result))
segm_result = [torch.from_numpy(x) if isinstance(x, np.ndarray) else x for x in segm_result]
segm_result = torch.stack(segm_result, dim=0)
inst.pred_masks = segm_result
return inst
# reference: https://github.com/open-mmlab/mmdetection/blob/master/mmdet/models/detectors/base.py
def _parse_losses(losses: Dict[str, Tensor]) -> Dict[str, Tensor]:
log_vars = OrderedDict()
for loss_name, loss_value in losses.items():
if isinstance(loss_value, torch.Tensor):
log_vars[loss_name] = loss_value.mean()
elif isinstance(loss_value, list):
log_vars[loss_name] = sum(_loss.mean() for _loss in loss_value)
else:
raise TypeError(f"{loss_name} is not a tensor or list of tensors")
if "loss" not in loss_name:
# put metrics to storage; don't return them
storage = get_event_storage()
value = log_vars.pop(loss_name).cpu().item()
storage.put_scalar(loss_name, value)
return log_vars
|