diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000000000000000000000000000000000000..6d4eb2c8a1abeaba24a7fa2180cf5dd6db196906 --- /dev/null +++ b/LICENSE @@ -0,0 +1,25 @@ +FCOS for non-commercial purposes + +Copyright (c) 2019 the authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + +* Redistributions of source code must retain the above copyright notice, this + list of conditions and the following disclaimer. + +* Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. diff --git a/configs/ctw/r50_baseline.yaml b/configs/ctw/r50_baseline.yaml new file mode 100644 index 0000000000000000000000000000000000000000..29c27d35c825154636745bbe5a85f581ffe0dfd8 --- /dev/null +++ b/configs/ctw/r50_baseline.yaml @@ -0,0 +1,70 @@ +OUTPUT_DIR: "./output/ctw" +MODEL: + META_ARCHITECTURE: "GeneralizedRCNN" + WEIGHT: "catalog://ImageNetPretrained/MSRA/R-50" + BACKBONE: + CONV_BODY: "R-50-FPN" + RESNETS: + BACKBONE_OUT_CHANNELS: 256 + RPN: + USE_FPN: True + ANCHOR_STRIDE: (4, 8, 16, 32, 64) + ASPECT_RATIOS: (0.25, 0.5, 1.0, 2.0, 4.0) + ROI_HEADS: + USE_FPN: True + SCORE_THRESH: 0.85 + NMS: 0.3 + ROI_BOX_HEAD: + DEFORMABLE_POOLING: False + POOLER_RESOLUTION: 7 + POOLER_SCALES: (0.25, 0.125, 0.0625, 0.03125) + POOLER_SAMPLING_RATIO: 2 + FEATURE_EXTRACTOR: "FPN2MLPFeatureExtractor" + PREDICTOR: "FPNPredictor" + NUM_CLASSES: 2 + CLASS_WEIGHT: 1.0 + ## Boundary + BOUNDARY_ON: True + ROI_BOUNDARY_HEAD: + DEFORMABLE_POOLING: True + FEATURE_EXTRACTOR: "BoundaryRCNNFPNFeatureExtractor" + POOLER_RESOLUTION: 14 + POOLER_SCALES: (0.25, 0.125, 0.0625, 0.03125) + POOLER_SAMPLING_RATIO: 2 + PREDICTOR: "BoundaryRCNNC4Predictor" + RESOLUTION: 48 + SHARE_BOX_FEATURE_EXTRACTOR: False + BO_WEIGHT: 0.1 + Loss_balance: 1.1 + +PROCESS: + PNMS: True + NMS_THRESH: 0.25 +DATASETS: + TRAIN: ("CTW1500_train",) + TEST: ("CTW1500_test",) + Test_Visual: True +DATALOADER: + SIZE_DIVISIBILITY: 32 +SOLVER: + BASE_LR: 0.0025 + BIAS_LR_FACTOR: 2 + WEIGHT_DECAY: 0.0001 + STEPS: (30000, 40000) + MAX_ITER: 45000 + IMS_PER_BATCH: 1 + CHECKPOINT_PERIOD: 1000 +INPUT: + + MIN_SIZE_TRAIN: (400,600,720,1000,1200) + MAX_SIZE_TRAIN: 2000 + MIN_SIZE_TEST: 720 + MAX_SIZE_TEST: 1280 + CROP_PROB_TRAIN: 1.0 + ROTATE_PROB_TRAIN: 0.0 + ROTATE_DEGREE: (0,30,60,90,210,150,180,210,240,270,300,330,360) + +TEST: + IMS_PER_BATCH: 1 + + diff --git a/configs/ic/r50_baseline.yaml b/configs/ic/r50_baseline.yaml new file mode 100644 index 0000000000000000000000000000000000000000..056a3fc7a39ffc40e13da3a0d10af9166cc200da --- 
/dev/null +++ b/configs/ic/r50_baseline.yaml @@ -0,0 +1,75 @@ +OUTPUT_DIR: "./output/ic15" +MODEL: + META_ARCHITECTURE: "GeneralizedRCNN" + WEIGHT: catalog://ImageNetPretrained/MSRA/R-50 + BACKBONE: + CONV_BODY: "R-50-FPN" + RESNETS: + BACKBONE_OUT_CHANNELS: 256 + RPN: + USE_FPN: True + ANCHOR_STRIDE: (4, 8, 16, 32, 64) + ASPECT_RATIOS: (0.25, 0.5, 1.0, 2.0, 4.0) + ROI_HEADS: + USE_FPN: True + SCORE_THRESH: 0.52 # ic15 + NMS: 0.89 + ROI_BOX_HEAD: + DEFORMABLE_POOLING: False + POOLER_RESOLUTION: 7 + POOLER_SCALES: (0.25, 0.125, 0.0625, 0.03125) + POOLER_SAMPLING_RATIO: 2 + FEATURE_EXTRACTOR: "FPN2MLPFeatureExtractor" + PREDICTOR: "FPNPredictor" + NUM_CLASSES: 2 + CLASS_WEIGHT: 1.0 + ## Boundary + BOUNDARY_ON: True + ROI_BOUNDARY_HEAD: + DEFORMABLE_POOLING: False + FEATURE_EXTRACTOR: "BoundaryRCNNFPNFeatureExtractor" + POOLER_RESOLUTION: 14 + POOLER_SCALES: (0.25, 0.125, 0.0625, 0.03125) + POOLER_SAMPLING_RATIO: 2 + PREDICTOR: "BoundaryRCNNC4Predictor" + RESOLUTION: 48 + SHARE_BOX_FEATURE_EXTRACTOR: False + BO_WEIGHT: 0.1 + Loss_balance: 1.0 + +PROCESS: + PNMS: True + NMS_THRESH: 0.25 +DATASETS: + TRAIN: ("ic15_train",) + TEST: ("ic15_test",) + Test_Visual: True +DATALOADER: + SIZE_DIVISIBILITY: 32 +SOLVER: + BASE_LR: 0.00025 + BIAS_LR_FACTOR: 2 + WEIGHT_DECAY: 0.0001 +# STEPS: (120000, 160000) + STEPS: (5000, 10000) # fine-tune +# MAX_ITER: 180000 + MAX_ITER: 190500 # fine-tune + IMS_PER_BATCH: 1 + CHECKPOINT_PERIOD: 5000 +INPUT: + + MIN_SIZE_TRAIN: (400,600,720,1000,1200) + MAX_SIZE_TRAIN: 2000 + MIN_SIZE_TEST: 1200 + MAX_SIZE_TEST: 2000 + + CROP_PROB_TRAIN: 1.0 + ROTATE_PROB_TRAIN: 0.3 # fine-tune +# ROTATE_PROB_TRAIN: 1.0 +# ROTATE_DEGREE: (0,30,60,90,210,150,180,210,240,270,300,330,360) + ROTATE_DEGREE: (10,) # fine-tune + +TEST: + IMS_PER_BATCH: 1 + + diff --git a/demo/1.jpg b/demo/1.jpg new file mode 100644 index 0000000000000000000000000000000000000000..d0be654dbc9fbb13e9e6a4b9453caa3562daba47 Binary files /dev/null and b/demo/1.jpg differ diff --git a/demo/2.jpg b/demo/2.jpg new file mode 100644 index 0000000000000000000000000000000000000000..507950bc42f98aacbce40b33a0118becec43d93f Binary files /dev/null and b/demo/2.jpg differ diff --git a/demo/example1.jpg b/demo/example1.jpg new file mode 100644 index 0000000000000000000000000000000000000000..fb8063a68c4a7422810a9a83d323a78cd0faf67c Binary files /dev/null and b/demo/example1.jpg differ diff --git a/demo/example_results.jpg b/demo/example_results.jpg new file mode 100644 index 0000000000000000000000000000000000000000..b8b7c8e2d1a68952152703111be1d9f478619257 Binary files /dev/null and b/demo/example_results.jpg differ diff --git a/maskrcnn_benchmark/__init__.py b/maskrcnn_benchmark/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..5c7f19c6c00a4ac3f2f2bc66f892e44bcbd72612 --- /dev/null +++ b/maskrcnn_benchmark/__init__.py @@ -0,0 +1 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 
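The two YAML files above override the defaults declared in maskrcnn_benchmark/config/defaults.py later in this patch. A minimal sketch of how such a config is typically consumed, assuming the standard yacs workflow (the actual train/test entry-point script is not part of this diff):

    # Sketch only: the entry-point script is assumed, not included in this patch.
    from maskrcnn_benchmark.config import cfg   # exposes the _C node from defaults.py

    cfg.merge_from_file("configs/ctw/r50_baseline.yaml")   # YAML values override the defaults
    cfg.merge_from_list(["SOLVER.IMS_PER_BATCH", 1])        # optional command-line style overrides
    cfg.freeze()                                            # lock the config before building the model

    print(cfg.MODEL.ROI_HEADS.SCORE_THRESH)                 # 0.85 for CTW-1500, 0.52 for IC15

Tuple-valued entries such as ANCHOR_STRIDE: (4, 8, 16, 32, 64) are read as strings by YAML and parsed back into Python tuples by yacs during the merge, so they must match the type of the corresponding default.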
diff --git a/maskrcnn_benchmark/__pycache__/__init__.cpython-37.pyc b/maskrcnn_benchmark/__pycache__/__init__.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..239b08be4b4cfdf3d70754418e36b0ebe8b19083 Binary files /dev/null and b/maskrcnn_benchmark/__pycache__/__init__.cpython-37.pyc differ diff --git a/maskrcnn_benchmark/config/__init__.py b/maskrcnn_benchmark/config/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..22a15023b1b06dad1f8c36924cdbb96bf1f5dc8d --- /dev/null +++ b/maskrcnn_benchmark/config/__init__.py @@ -0,0 +1,2 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +from .defaults import _C as cfg diff --git a/maskrcnn_benchmark/config/__pycache__/__init__.cpython-37.pyc b/maskrcnn_benchmark/config/__pycache__/__init__.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b4dcd8c5e9719cf492218a3621e7b022ebbcab6f Binary files /dev/null and b/maskrcnn_benchmark/config/__pycache__/__init__.cpython-37.pyc differ diff --git a/maskrcnn_benchmark/config/__pycache__/defaults.cpython-37.pyc b/maskrcnn_benchmark/config/__pycache__/defaults.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..51b32a43281423b8a8127c3a035e20516eb30225 Binary files /dev/null and b/maskrcnn_benchmark/config/__pycache__/defaults.cpython-37.pyc differ diff --git a/maskrcnn_benchmark/config/__pycache__/paths_catalog.cpython-37.pyc b/maskrcnn_benchmark/config/__pycache__/paths_catalog.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..18400dd1d95f9ca56e88aea30264e2f28cb2199c Binary files /dev/null and b/maskrcnn_benchmark/config/__pycache__/paths_catalog.cpython-37.pyc differ diff --git a/maskrcnn_benchmark/config/defaults.py b/maskrcnn_benchmark/config/defaults.py new file mode 100644 index 0000000000000000000000000000000000000000..aa35ac474b5d42a99361d1ac5ba2d8e164ae0a2c --- /dev/null +++ b/maskrcnn_benchmark/config/defaults.py @@ -0,0 +1,471 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +import os + +from yacs.config import CfgNode as CN + + +# ----------------------------------------------------------------------------- +# Convention about Training / Test specific parameters +# ----------------------------------------------------------------------------- +# Whenever an argument can be either used for training or for testing, the +# corresponding name will be post-fixed by a _TRAIN for a training parameter, +# or _TEST for a test-specific parameter. +# For example, the number of images during training will be +# IMAGES_PER_BATCH_TRAIN, while the number of images for testing will be +# IMAGES_PER_BATCH_TEST + +# ----------------------------------------------------------------------------- +# Config definition +# ----------------------------------------------------------------------------- + +_C = CN() + +_C.MODEL = CN() +_C.MODEL.RPN_ONLY = False +_C.MODEL.MASK_ON = False +_C.MODEL.FCOS_ON = False +_C.MODEL.KE_ON = False +_C.MODEL.BOUNDARY_ON = False +_C.MODEL.MSR_ON = False +_C.MODEL.RETINANET_ON = False +_C.MODEL.KEYPOINT_ON = False +_C.MODEL.DEVICE = "cuda" +_C.MODEL.META_ARCHITECTURE = "GeneralizedRCNN" +_C.MODEL.CLS_AGNOSTIC_BBOX_REG = False + +# If the WEIGHT starts with a catalog://, like :R-50, the code will look for +# the path in paths_catalog. 
Else, it will use it as the specified absolute +# path +_C.MODEL.WEIGHT = "" + + +# ----------------------------------------------------------------------------- +# INPUT +# ----------------------------------------------------------------------------- +_C.INPUT = CN() +# Size of the smallest side of the image during training +_C.INPUT.MIN_SIZE_TRAIN = (800,) # (800,) +# The range of the smallest side for multi-scale training +_C.INPUT.MIN_SIZE_RANGE_TRAIN = (-1, -1) # -1 means disabled and it will use MIN_SIZE_TRAIN +# Maximum size of the side of the image during training +_C.INPUT.MAX_SIZE_TRAIN = 1333 +# Size of the smallest side of the image during testing +_C.INPUT.MIN_SIZE_TEST = 1000 +# Maximum size of the side of the image during testing +_C.INPUT.MAX_SIZE_TEST = 1333 +# Values to be used for image normalization +_C.INPUT.PIXEL_MEAN = [102.9801, 115.9465, 122.7717] +# Values to be used for image normalization +_C.INPUT.PIXEL_STD = [1., 1., 1.] +# Convert image to BGR format (for Caffe2 models), in range 0-255 +_C.INPUT.TO_BGR255 = True +_C.INPUT.CROP_PROB_TRAIN = 1.0 +_C.INPUT.ROTATE_PROB_TRAIN = 0.3 +_C.INPUT.ROTATE_DEGREE = (0,15,-15,45,-45,90,-90) +# _C.INPUT.ROTATE_DEGREE = 15 + + + + +# ----------------------------------------------------------------------------- +# Dataset +# ----------------------------------------------------------------------------- +_C.DATASETS = CN() +# List of the dataset names for training, as present in paths_catalog.py +_C.DATASETS.TRAIN = () +# List of the dataset names for testing, as present in paths_catalog.py +_C.DATASETS.TEST = () +_C.DATASETS.Test_Visual = False +# ----------------------------------------------------------------------------- +# DataLoader +# ----------------------------------------------------------------------------- +_C.DATALOADER = CN() +# Number of data loading threads +_C.DATALOADER.NUM_WORKERS = 4 +# If > 0, this enforces that each collated batch should have a size divisible +# by SIZE_DIVISIBILITY +_C.DATALOADER.SIZE_DIVISIBILITY = 0 +# If True, each batch should contain only images for which the aspect ratio +# is compatible. This groups portrait images together, and landscape images +# are not batched with portrait images. 
+_C.DATALOADER.ASPECT_RATIO_GROUPING = True + + +# ---------------------------------------------------------------------------- # +# Backbone options +# ---------------------------------------------------------------------------- # +_C.MODEL.BACKBONE = CN() + +# The backbone conv body to use +# The string must match a function that is imported in modeling.model_builder +# (e.g., 'FPN.add_fpn_ResNet101_conv5_body' to specify a ResNet-101-FPN +# backbone) +_C.MODEL.BACKBONE.CONV_BODY = "R-50-C4" + +# Add StopGrad at a specified stage so the bottom layers are frozen +_C.MODEL.BACKBONE.FREEZE_CONV_BODY_AT = 2 +# GN for backbone + +##123123123 +_C.MODEL.BACKBONE.USE_GN = False + + +# ---------------------------------------------------------------------------- # +# FPN options +# ---------------------------------------------------------------------------- # +_C.MODEL.FPN = CN() + +# 123123123 +_C.MODEL.FPN.USE_GN = False +_C.MODEL.FPN.USE_RELU = False + +#############123123123 +_C.MODEL.FPN.USE_DEFORMABLE = False + + +# ---------------------------------------------------------------------------- # +# Group Norm options +# ---------------------------------------------------------------------------- # +_C.MODEL.GROUP_NORM = CN() +# Number of dimensions per group in GroupNorm (-1 if using NUM_GROUPS) +_C.MODEL.GROUP_NORM.DIM_PER_GP = -1 +# Number of groups in GroupNorm (-1 if using DIM_PER_GP) +_C.MODEL.GROUP_NORM.NUM_GROUPS = 32 +# GroupNorm's small constant in the denominator +_C.MODEL.GROUP_NORM.EPSILON = 1e-5 + + +# ---------------------------------------------------------------------------- # +# RPN options +# ---------------------------------------------------------------------------- # +_C.MODEL.RPN = CN() +_C.MODEL.RPN.USE_FPN = False +# Base RPN anchor sizes given in absolute pixels w.r.t. the scaled network input +_C.MODEL.RPN.ANCHOR_SIZES = (32, 64, 128, 256, 512) +# Stride of the feature map that RPN is attached. +# For FPN, number of strides should match number of scales +_C.MODEL.RPN.ANCHOR_STRIDE = (16,) +# RPN anchor aspect ratios +_C.MODEL.RPN.ASPECT_RATIOS = (0.5, 1.0, 2.0) +# Remove RPN anchors that go outside the image by RPN_STRADDLE_THRESH pixels +# Set to -1 or a large value, e.g. 
100000, to disable pruning anchors +_C.MODEL.RPN.STRADDLE_THRESH = 0 +# Minimum overlap required between an anchor and ground-truth box for the +# (anchor, gt box) pair to be a positive example (IoU >= FG_IOU_THRESHOLD +# ==> positive RPN example) +_C.MODEL.RPN.FG_IOU_THRESHOLD = 0.7 +# Maximum overlap allowed between an anchor and ground-truth box for the +# (anchor, gt box) pair to be a negative example (IoU < BG_IOU_THRESHOLD +# ==> negative RPN example) +_C.MODEL.RPN.BG_IOU_THRESHOLD = 0.3 +# Total number of RPN examples per image +_C.MODEL.RPN.BATCH_SIZE_PER_IMAGE = 256 +# Target fraction of foreground (positive) examples per RPN minibatch +_C.MODEL.RPN.POSITIVE_FRACTION = 0.5 +# Number of top scoring RPN proposals to keep before applying NMS +# When FPN is used, this is *per FPN level* (not total) +_C.MODEL.RPN.PRE_NMS_TOP_N_TRAIN = 12000 + +_C.MODEL.RPN.PRE_NMS_TOP_N_TEST = 6000 +# Number of top scoring RPN proposals to keep after applying NMS +_C.MODEL.RPN.POST_NMS_TOP_N_TRAIN = 2000 +_C.MODEL.RPN.POST_NMS_TOP_N_TEST = 1000 +# NMS threshold used on RPN proposals +_C.MODEL.RPN.NMS_THRESH = 0.7 +# Proposal height and width both need to be greater than RPN_MIN_SIZE +# (at the scale used during training or inference) +_C.MODEL.RPN.MIN_SIZE = 0 +# Number of top scoring RPN proposals to keep after combining proposals from +# all FPN levels +_C.MODEL.RPN.FPN_POST_NMS_TOP_N_TRAIN = 2000 +_C.MODEL.RPN.FPN_POST_NMS_TOP_N_TEST = 2000 +# Custom rpn head, empty to use default conv or separable conv +_C.MODEL.RPN.RPN_HEAD = "SingleConvRPNHead_1" + + +# ---------------------------------------------------------------------------- # +# ROI HEADS options +# ---------------------------------------------------------------------------- # +_C.MODEL.ROI_HEADS = CN() +_C.MODEL.ROI_HEADS.USE_FPN = False +# Overlap threshold for an RoI to be considered foreground (if >= FG_IOU_THRESHOLD) +_C.MODEL.ROI_HEADS.FG_IOU_THRESHOLD = 0.5 +# Overlap threshold for an RoI to be considered background +# (class = 0 if overlap in [0, BG_IOU_THRESHOLD)) +_C.MODEL.ROI_HEADS.BG_IOU_THRESHOLD = 0.5 +# Default weights on (dx, dy, dw, dh) for normalizing bbox regression targets +# These are empirically chosen to approximately lead to unit variance targets +_C.MODEL.ROI_HEADS.BBOX_REG_WEIGHTS = (10., 10., 5., 5.) +# RoI minibatch size *per image* (number of regions of interest [ROIs]) +# Total number of RoIs per training minibatch = +# TRAIN.BATCH_SIZE_PER_IM * TRAIN.IMS_PER_BATCH +# E.g., a common configuration is: 512 * 2 * 8 = 8192 +_C.MODEL.ROI_HEADS.BATCH_SIZE_PER_IMAGE = 512 +# Target fraction of RoI minibatch that is labeled foreground (i.e.
class > 0) +_C.MODEL.ROI_HEADS.POSITIVE_FRACTION = 0.25 + +# Only used on test mode + +# Minimum score threshold (assuming scores in a [0, 1] range); a value chosen to +# balance obtaining high recall with not having too many low precision +# detections that will slow down inference post processing steps (like NMS) +_C.MODEL.ROI_HEADS.SCORE_THRESH = 0.05 +# Overlap threshold used for non-maximum suppression (suppress boxes with +# IoU >= this threshold) +_C.MODEL.ROI_HEADS.NMS = 0.5 +# Maximum number of detections to return per image (100 is based on the limit established for the COCO dataset) +_C.MODEL.ROI_HEADS.DETECTIONS_PER_IMG = 100 + + +_C.MODEL.ROI_BOX_HEAD = CN() +_C.MODEL.ROI_BOX_HEAD.FEATURE_EXTRACTOR = "ResNet50Conv5ROIFeatureExtractor" +_C.MODEL.ROI_BOX_HEAD.PREDICTOR = "FastRCNNPredictor" +_C.MODEL.ROI_BOX_HEAD.POOLER_RESOLUTION = 14 +_C.MODEL.ROI_BOX_HEAD.POOLER_SAMPLING_RATIO = 0 +_C.MODEL.ROI_BOX_HEAD.POOLER_SCALES = (1.0 / 16,) +_C.MODEL.ROI_BOX_HEAD.NUM_CLASSES = 81 +# Hidden layer dimension when using an MLP for the RoI box head +_C.MODEL.ROI_BOX_HEAD.MLP_HEAD_DIM = 1024 +# GN +#####123123123 +_C.MODEL.ROI_BOX_HEAD.USE_GN = False +# Dilation +_C.MODEL.ROI_BOX_HEAD.DILATION = 1 +_C.MODEL.ROI_BOX_HEAD.CONV_HEAD_DIM = 256 + +#### 123123 +_C.MODEL.ROI_BOX_HEAD.NUM_STACKED_CONVS = 4 +_C.MODEL.ROI_BOX_HEAD.CLASS_WEIGHT = 0.1 +_C.MODEL.ROI_BOX_HEAD.DEFORMABLE_POOLING = False + +_C.MODEL.ROI_MASK_HEAD = CN() +# Whether or not resize and translate masks to the input image. +_C.MODEL.ROI_MASK_HEAD.POSTPROCESS_MASKS = False +_C.MODEL.ROI_MASK_HEAD.POSTPROCESS_MASKS_THRESHOLD = 0.5 +_C.MODEL.ROI_MASK_HEAD.DILATION = 1 +_C.MODEL.ROI_MASK_HEAD.USE_GN = False + +# Boundary edge +_C.MODEL.ROI_BOUNDARY_HEAD = CN() +_C.MODEL.ROI_BOUNDARY_HEAD.DEFORMABLE_POOLING = False + +_C.MODEL.ROI_BOUNDARY_HEAD.FEATURE_EXTRACTOR = "ResNet50Conv5ROIFeatureExtractor" +_C.MODEL.ROI_BOUNDARY_HEAD.POOLER_RESOLUTION = 14 +_C.MODEL.ROI_BOUNDARY_HEAD.POOLER_SCALES = (1.0 / 16,) +_C.MODEL.ROI_BOUNDARY_HEAD.POOLER_SAMPLING_RATIO = 0 +_C.MODEL.ROI_BOUNDARY_HEAD.CONV_LAYERS = (256, 256, 256, 256) + +_C.MODEL.ROI_BOUNDARY_HEAD.PREDICTOR = "KERCNNC4Predictor" +_C.MODEL.ROI_BOUNDARY_HEAD.RESOLUTION = 14 +_C.MODEL.ROI_BOUNDARY_HEAD.SHARE_BOX_FEATURE_EXTRACTOR = True +_C.MODEL.ROI_BOUNDARY_HEAD.BO_WEIGHT = 1.0 +_C.MODEL.ROI_BOUNDARY_HEAD.Loss_balance = 1.2 + +# ---------------------------------------------------------------------------- # +# ResNe[X]t options (ResNets = {ResNet, ResNeXt} +# Note that parts of a resnet may be used for both the backbone and the head +# These options apply to both +# ---------------------------------------------------------------------------- # +_C.MODEL.RESNETS = CN() + +# Number of groups to use; 1 ==> ResNet; > 1 ==> ResNeXt +_C.MODEL.RESNETS.NUM_GROUPS = 1 + +# Baseline width of each group +_C.MODEL.RESNETS.WIDTH_PER_GROUP = 64 + +# Place the stride 2 conv on the 1x1 filter +# Use True only for the original MSRA ResNet; use False for C2 and Torch models +_C.MODEL.RESNETS.STRIDE_IN_1X1 = True + +# Residual transformation function +_C.MODEL.RESNETS.TRANS_FUNC = "BottleneckWithFixedBatchNorm" +_C.MODEL.RESNETS.DEF_FUNC = "DeformableConvWithFixedBatchNorm" +# ResNet's stem function (conv1 and pool1) +_C.MODEL.RESNETS.STEM_FUNC = "StemWithFixedBatchNorm" +_C.MODEL.RESNETS.DEF_START_MODULE = "NA" + +#########123123123 +_C.MODEL.RESNETS.DEFORM_POOLING = False + +# Apply dilation in stage "res5" +_C.MODEL.RESNETS.RES5_DILATION = 1 + +_C.MODEL.RESNETS.BACKBONE_OUT_CHANNELS = 256 * 4 
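The ROI_HEADS comments above describe the two test-time knobs: SCORE_THRESH first drops low-confidence detections, then NMS greedily suppresses overlapping boxes. A minimal PyTorch reference of that post-processing, assuming the same +1 box-area convention as the nms_cpu kernel added later in this patch (an illustrative sketch, not code from this repository):

    import torch

    def postprocess_reference(boxes, scores, score_thresh=0.05, nms_thresh=0.5):
        # Step 1: ROI_HEADS.SCORE_THRESH -- discard low-scoring detections.
        keep = scores > score_thresh
        boxes, scores = boxes[keep], scores[keep]
        # Step 2: ROI_HEADS.NMS -- greedy suppression of boxes with IoU >= nms_thresh.
        x1, y1, x2, y2 = boxes.unbind(dim=1)
        areas = (x2 - x1 + 1) * (y2 - y1 + 1)          # +1 convention, as in nms_cpu_kernel
        order = scores.argsort(descending=True)
        selected = []
        while order.numel() > 0:
            i = order[0]
            selected.append(i.item())
            rest = order[1:]
            if rest.numel() == 0:
                break
            xx1 = torch.maximum(x1[rest], x1[i])
            yy1 = torch.maximum(y1[rest], y1[i])
            xx2 = torch.minimum(x2[rest], x2[i])
            yy2 = torch.minimum(y2[rest], y2[i])
            inter = (xx2 - xx1 + 1).clamp(min=0) * (yy2 - yy1 + 1).clamp(min=0)
            iou = inter / (areas[i] + areas[rest] - inter)
            order = rest[iou < nms_thresh]
        return boxes[selected], scores[selected]

With the CTW-1500 config above this corresponds to score_thresh=0.85 and nms_thresh=0.3; the IC15 config uses 0.52 and 0.89.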
+_C.MODEL.RESNETS.RES2_OUT_CHANNELS = 256 +_C.MODEL.RESNETS.STEM_OUT_CHANNELS = 64 + +# ---------------------------------------------------------------------------- # +# FCOS Options +# ---------------------------------------------------------------------------- # +_C.MODEL.FCOS = CN() +_C.MODEL.FCOS.NUM_CLASSES = 81 # the number of classes including background +_C.MODEL.FCOS.FPN_STRIDES = [8, 16, 32, 64, 128] +_C.MODEL.FCOS.PRIOR_PROB = 0.01 +_C.MODEL.FCOS.INFERENCE_TH = 0.05 +_C.MODEL.FCOS.NMS_TH = 0.4 +_C.MODEL.FCOS.PRE_NMS_TOP_N = 1000 + +# Focal loss parameter: alpha +_C.MODEL.FCOS.LOSS_ALPHA = 0.25 +# Focal loss parameter: gamma +_C.MODEL.FCOS.LOSS_GAMMA = 2.0 +_C.MODEL.FCOS.SIZES_OF_INTEREST = [64, 128, 256, 512] + +# the number of convolutions used in the cls and bbox tower +_C.MODEL.FCOS.NUM_CONVS = 4 + +# ---------------------------------------------------------------------------- # +# RetinaNet Options (Follow the Detectron version) +# ---------------------------------------------------------------------------- # +_C.MODEL.RETINANET = CN() + +# This is the number of foreground classes and background. +_C.MODEL.RETINANET.NUM_CLASSES = 81 + +# Anchor aspect ratios to use +_C.MODEL.RETINANET.ANCHOR_SIZES = (32, 64, 128, 256, 512) +_C.MODEL.RETINANET.ASPECT_RATIOS = (0.5, 1.0, 2.0) +_C.MODEL.RETINANET.ANCHOR_STRIDES = (8, 16, 32, 64, 128) +_C.MODEL.RETINANET.STRADDLE_THRESH = 0 + +# Anchor scales per octave +_C.MODEL.RETINANET.OCTAVE = 2.0 +_C.MODEL.RETINANET.SCALES_PER_OCTAVE = 3 + +# Use C5 or P5 to generate P6 +_C.MODEL.RETINANET.USE_C5 = True + +# Convolutions to use in the cls and bbox tower +# NOTE: this doesn't include the last conv for logits +_C.MODEL.RETINANET.NUM_CONVS = 4 + +# Weight for bbox_regression loss +_C.MODEL.RETINANET.BBOX_REG_WEIGHT = 4.0 + +# Smooth L1 loss beta for bbox regression +_C.MODEL.RETINANET.BBOX_REG_BETA = 0.11 + +# During inference, #locs to select based on cls score before NMS is performed +# per FPN level +_C.MODEL.RETINANET.PRE_NMS_TOP_N = 1000 + +# IoU overlap ratio for labeling an anchor as positive +# Anchors with >= iou overlap are labeled positive +_C.MODEL.RETINANET.FG_IOU_THRESHOLD = 0.5 + +# IoU overlap ratio for labeling an anchor as negative +# Anchors with < iou overlap are labeled negative +_C.MODEL.RETINANET.BG_IOU_THRESHOLD = 0.4 + +# Focal loss parameter: alpha +_C.MODEL.RETINANET.LOSS_ALPHA = 0.25 + +# Focal loss parameter: gamma +_C.MODEL.RETINANET.LOSS_GAMMA = 2.0 + +# Prior prob for the positives at the beginning of training. 
This is used to set +# the bias init for the logits layer +_C.MODEL.RETINANET.PRIOR_PROB = 0.01 + +# Inference cls score threshold, anchors with score > INFERENCE_TH are +# considered for inference +_C.MODEL.RETINANET.INFERENCE_TH = 0.05 + +# NMS threshold used in RetinaNet +_C.MODEL.RETINANET.NMS_TH = 0.4 + + +# ---------------------------------------------------------------------------- # +# FBNet options +# ---------------------------------------------------------------------------- # +_C.MODEL.FBNET = CN() +_C.MODEL.FBNET.ARCH = "default" +# custom arch +_C.MODEL.FBNET.ARCH_DEF = "" +_C.MODEL.FBNET.BN_TYPE = "bn" +_C.MODEL.FBNET.SCALE_FACTOR = 1.0 +# the output channels will be divisible by WIDTH_DIVISOR +_C.MODEL.FBNET.WIDTH_DIVISOR = 1 +_C.MODEL.FBNET.DW_CONV_SKIP_BN = True +_C.MODEL.FBNET.DW_CONV_SKIP_RELU = True + +# > 0 scale, == 0 skip, < 0 same dimension +_C.MODEL.FBNET.DET_HEAD_LAST_SCALE = 1.0 +_C.MODEL.FBNET.DET_HEAD_BLOCKS = [] +# overwrite the stride for the head, 0 to use original value +_C.MODEL.FBNET.DET_HEAD_STRIDE = 0 + +# > 0 scale, == 0 skip, < 0 same dimension +_C.MODEL.FBNET.KPTS_HEAD_LAST_SCALE = 0.0 +_C.MODEL.FBNET.KPTS_HEAD_BLOCKS = [] +# overwrite the stride for the head, 0 to use original value +_C.MODEL.FBNET.KPTS_HEAD_STRIDE = 0 + +# > 0 scale, == 0 skip, < 0 same dimension +_C.MODEL.FBNET.MASK_HEAD_LAST_SCALE = 0.0 +_C.MODEL.FBNET.MASK_HEAD_BLOCKS = [] +# overwrite the stride for the head, 0 to use original value +_C.MODEL.FBNET.MASK_HEAD_STRIDE = 0 + +# 0 to use all blocks defined in arch_def +_C.MODEL.FBNET.RPN_HEAD_BLOCKS = 0 +_C.MODEL.FBNET.RPN_BN_TYPE = "" + + +# ---------------------------------------------------------------------------- # +# Solver +# ---------------------------------------------------------------------------- # +_C.SOLVER = CN() +_C.SOLVER.MAX_ITER = 40000 + +_C.SOLVER.BASE_LR = 0.001 +_C.SOLVER.BIAS_LR_FACTOR = 2 + +_C.SOLVER.MOMENTUM = 0.9 + +_C.SOLVER.WEIGHT_DECAY = 0.0005 +_C.SOLVER.WEIGHT_DECAY_BIAS = 0 + +_C.SOLVER.GAMMA = 0.1 +_C.SOLVER.STEPS = (30000,) + +_C.SOLVER.WARMUP_FACTOR = 1.0 / 3 +_C.SOLVER.WARMUP_ITERS = 500 +_C.SOLVER.WARMUP_METHOD = "linear" + +_C.SOLVER.CHECKPOINT_PERIOD = 2500 + +# Number of images per batch +# This is global, so if we have 8 GPUs and IMS_PER_BATCH = 16, each GPU will +# see 2 images per batch +_C.SOLVER.IMS_PER_BATCH = 4 + +# ---------------------------------------------------------------------------- # +# Specific test options +# ---------------------------------------------------------------------------- # +_C.TEST = CN() +_C.TEST.EXPECTED_RESULTS = [] +_C.TEST.EXPECTED_RESULTS_SIGMA_TOL = 4 +# Number of images per batch +# This is global, so if we have 8 GPUs and IMS_PER_BATCH = 16, each GPU will +# see 2 images per batch +_C.TEST.IMS_PER_BATCH = 16 +# Number of detections per image +_C.TEST.DETECTIONS_PER_IMG = 100 + + +# ---------------------------------------------------------------------------- # +# Misc options +# ---------------------------------------------------------------------------- # +_C.OUTPUT_DIR = "./1" +_C.IS_LOAD_OPTIMIZER = True +_C.IS_LOAD_SCHEDULER = True +_C.PROCESS = CN() + +#####123123123 +_C.PROCESS.PNMS = False +_C.PROCESS.NMS_THRESH = 0.4 + +_C.PATHS_CATALOG = os.path.join(os.path.dirname(__file__), "paths_catalog.py") diff --git a/maskrcnn_benchmark/config/paths_catalog.py b/maskrcnn_benchmark/config/paths_catalog.py new file mode 100644 index 0000000000000000000000000000000000000000..0ad2540e07c484eff858363003f3015a5f99713d --- /dev/null +++ 
b/maskrcnn_benchmark/config/paths_catalog.py @@ -0,0 +1,120 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +"""Centralized catalog of paths.""" + +import os + +class DatasetCatalog(object): + DATA_DIR = "/home/zhangbq/ws/ct/dataset/" + DATASETS = { + "ic15_train": ( + "ic15/ic15_train_images", + "ic15/annotations/ic15_train.json" + ), + "ic15_test": ( + "ic15/ic15_test_images", + "ic15/annotations/ic15_test.json" + ), + "CTW1500_train": ( + "ctw/ctw_train_images", + "ctw/annotations/ctw_train.json" + ), + "CTW1500_test": ( + "ctw/ctw_test_images", + "ctw/annotations/ctw_test.json" + ) + +} + + @staticmethod + def get(name): + data_dir = DatasetCatalog.DATA_DIR + attrs = DatasetCatalog.DATASETS[name] + if "coco" in name: + args = dict( + root=os.path.join(data_dir, attrs["img_dir"]), + ann_file=os.path.join(data_dir, attrs["ann_file"]), + ) + return dict( + factory="COCODataset", + args=args, + ) + elif "voc" in name: + args = dict( + data_dir=os.path.join(data_dir, attrs["data_dir"]), + split=attrs["split"], + ) + return dict( + factory="PascalVOCDataset", + args=args, + ) + elif True: + args = dict( + root=os.path.join(data_dir, attrs[0]), + ann_file=os.path.join(data_dir, attrs[1]), + ) + return dict( + factory="WordDataset", + args=args, + ) + raise RuntimeError("Dataset not available: {}".format(name)) + + +class ModelCatalog(object): + S3_C2_DETECTRON_URL = "https://dl.fbaipublicfiles.com/detectron" + C2_IMAGENET_MODELS = { + "MSRA/R-50": "ImageNetPretrained/MSRA/R-50.pkl", + "MSRA/R-50-GN": "ImageNetPretrained/47261647/R-50-GN.pkl", + "MSRA/R-101": "ImageNetPretrained/MSRA/R-101.pkl", + "MSRA/R-101-GN": "ImageNetPretrained/47592356/R-101-GN.pkl", + "FAIR/20171220/X-101-32x8d": "ImageNetPretrained/20171220/X-101-32x8d.pkl", + } + + C2_DETECTRON_SUFFIX = "output/train/{}coco_2014_train%3A{}coco_2014_valminusminival/generalized_rcnn/model_final.pkl" + C2_DETECTRON_MODELS = { + "35857197/e2e_faster_rcnn_R-50-C4_1x": "01_33_49.iAX0mXvW", + "35857345/e2e_faster_rcnn_R-50-FPN_1x": "01_36_30.cUF7QR7I", + "35857890/e2e_faster_rcnn_R-101-FPN_1x": "01_38_50.sNxI7sX7", + "36761737/e2e_faster_rcnn_X-101-32x8d-FPN_1x": "06_31_39.5MIHi1fZ", + "35858791/e2e_mask_rcnn_R-50-C4_1x": "01_45_57.ZgkA7hPB", + "35858933/e2e_mask_rcnn_R-50-FPN_1x": "01_48_14.DzEQe4wC", + "35861795/e2e_mask_rcnn_R-101-FPN_1x": "02_31_37.KqyEK4tT", + "36761843/e2e_mask_rcnn_X-101-32x8d-FPN_1x": "06_35_59.RZotkLKI", + "37129812/e2e_mask_rcnn_X-152-32x8d-FPN-IN5k_1.44x": "09_35_36.8pzTQKYK", + # keypoints + "37697547/e2e_keypoint_rcnn_R-50-FPN_1x": "08_42_54.kdzV35ao" + } + + @staticmethod + def get(name): + if name.startswith("Caffe2Detectron/COCO"): + return ModelCatalog.get_c2_detectron_12_2017_baselines(name) + if name.startswith("ImageNetPretrained"): + return ModelCatalog.get_c2_imagenet_pretrained(name) + raise RuntimeError("model not present in the catalog {}".format(name)) + + @staticmethod + def get_c2_imagenet_pretrained(name): + prefix = ModelCatalog.S3_C2_DETECTRON_URL + name = name[len("ImageNetPretrained/"):] + name = ModelCatalog.C2_IMAGENET_MODELS[name] + url = "/".join([prefix, name]) + return url + + @staticmethod + def get_c2_detectron_12_2017_baselines(name): + # Detectron C2 models are stored following the structure + # prefix//2012_2017_baselines/.yaml./suffix + # we use as identifiers in the catalog Caffe2Detectron/COCO// + prefix = ModelCatalog.S3_C2_DETECTRON_URL + dataset_tag = "keypoints_" if "keypoint" in name else "" + suffix = 
ModelCatalog.C2_DETECTRON_SUFFIX.format(dataset_tag, dataset_tag) + # remove identification prefix + name = name[len("Caffe2Detectron/COCO/"):] + # split in and + model_id, model_name = name.split("/") + # parsing to make it match the url address from the Caffe2 models + model_name = "{}.yaml".format(model_name) + signature = ModelCatalog.C2_DETECTRON_MODELS[name] + unique_name = ".".join([model_name, signature]) + url = "/".join([prefix, model_id, "12_2017_baselines", unique_name, suffix]) + return url diff --git a/maskrcnn_benchmark/csrc/ROIAlign.h b/maskrcnn_benchmark/csrc/ROIAlign.h new file mode 100644 index 0000000000000000000000000000000000000000..3907deab2a750a9f83f0f3ef38fee279c1445c61 --- /dev/null +++ b/maskrcnn_benchmark/csrc/ROIAlign.h @@ -0,0 +1,46 @@ +// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +#pragma once + +#include "cpu/vision.h" + +#ifdef WITH_CUDA +#include "cuda/vision.h" +#endif + +// Interface for Python +at::Tensor ROIAlign_forward(const at::Tensor& input, + const at::Tensor& rois, + const float spatial_scale, + const int pooled_height, + const int pooled_width, + const int sampling_ratio) { + if (input.type().is_cuda()) { +#ifdef WITH_CUDA + return ROIAlign_forward_cuda(input, rois, spatial_scale, pooled_height, pooled_width, sampling_ratio); +#else + AT_ERROR("Not compiled with GPU support"); +#endif + } + return ROIAlign_forward_cpu(input, rois, spatial_scale, pooled_height, pooled_width, sampling_ratio); +} + +at::Tensor ROIAlign_backward(const at::Tensor& grad, + const at::Tensor& rois, + const float spatial_scale, + const int pooled_height, + const int pooled_width, + const int batch_size, + const int channels, + const int height, + const int width, + const int sampling_ratio) { + if (grad.type().is_cuda()) { +#ifdef WITH_CUDA + return ROIAlign_backward_cuda(grad, rois, spatial_scale, pooled_height, pooled_width, batch_size, channels, height, width, sampling_ratio); +#else + AT_ERROR("Not compiled with GPU support"); +#endif + } + AT_ERROR("Not implemented on the CPU"); +} + diff --git a/maskrcnn_benchmark/csrc/ROIPool.h b/maskrcnn_benchmark/csrc/ROIPool.h new file mode 100644 index 0000000000000000000000000000000000000000..200fd7390b4629747f0ea9e16c0823ac5f099ac1 --- /dev/null +++ b/maskrcnn_benchmark/csrc/ROIPool.h @@ -0,0 +1,48 @@ +// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 
+#pragma once + +#include "cpu/vision.h" + +#ifdef WITH_CUDA +#include "cuda/vision.h" +#endif + + +std::tuple ROIPool_forward(const at::Tensor& input, + const at::Tensor& rois, + const float spatial_scale, + const int pooled_height, + const int pooled_width) { + if (input.type().is_cuda()) { +#ifdef WITH_CUDA + return ROIPool_forward_cuda(input, rois, spatial_scale, pooled_height, pooled_width); +#else + AT_ERROR("Not compiled with GPU support"); +#endif + } + AT_ERROR("Not implemented on the CPU"); +} + +at::Tensor ROIPool_backward(const at::Tensor& grad, + const at::Tensor& input, + const at::Tensor& rois, + const at::Tensor& argmax, + const float spatial_scale, + const int pooled_height, + const int pooled_width, + const int batch_size, + const int channels, + const int height, + const int width) { + if (grad.type().is_cuda()) { +#ifdef WITH_CUDA + return ROIPool_backward_cuda(grad, input, rois, argmax, spatial_scale, pooled_height, pooled_width, batch_size, channels, height, width); +#else + AT_ERROR("Not compiled with GPU support"); +#endif + } + AT_ERROR("Not implemented on the CPU"); +} + + + diff --git a/maskrcnn_benchmark/csrc/SigmoidFocalLoss.h b/maskrcnn_benchmark/csrc/SigmoidFocalLoss.h new file mode 100644 index 0000000000000000000000000000000000000000..308861e44774dffd89b3f5ebff7cc6c5491fe3a5 --- /dev/null +++ b/maskrcnn_benchmark/csrc/SigmoidFocalLoss.h @@ -0,0 +1,41 @@ +#pragma once + +#include "cpu/vision.h" + +#ifdef WITH_CUDA +#include "cuda/vision.h" +#endif + +// Interface for Python +at::Tensor SigmoidFocalLoss_forward( + const at::Tensor& logits, + const at::Tensor& targets, + const int num_classes, + const float gamma, + const float alpha) { + if (logits.type().is_cuda()) { +#ifdef WITH_CUDA + return SigmoidFocalLoss_forward_cuda(logits, targets, num_classes, gamma, alpha); +#else + AT_ERROR("Not compiled with GPU support"); +#endif + } + AT_ERROR("Not implemented on the CPU"); +} + +at::Tensor SigmoidFocalLoss_backward( + const at::Tensor& logits, + const at::Tensor& targets, + const at::Tensor& d_losses, + const int num_classes, + const float gamma, + const float alpha) { + if (logits.type().is_cuda()) { +#ifdef WITH_CUDA + return SigmoidFocalLoss_backward_cuda(logits, targets, d_losses, num_classes, gamma, alpha); +#else + AT_ERROR("Not compiled with GPU support"); +#endif + } + AT_ERROR("Not implemented on the CPU"); +} diff --git a/maskrcnn_benchmark/csrc/cpu/ROIAlign_cpu.cpp b/maskrcnn_benchmark/csrc/cpu/ROIAlign_cpu.cpp new file mode 100644 index 0000000000000000000000000000000000000000..d35aedf27ea581b9241d44b87dcca2e901b5064e --- /dev/null +++ b/maskrcnn_benchmark/csrc/cpu/ROIAlign_cpu.cpp @@ -0,0 +1,257 @@ +// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 
+#include "cpu/vision.h" + +// implementation taken from Caffe2 +template +struct PreCalc { + int pos1; + int pos2; + int pos3; + int pos4; + T w1; + T w2; + T w3; + T w4; +}; + +template +void pre_calc_for_bilinear_interpolate( + const int height, + const int width, + const int pooled_height, + const int pooled_width, + const int iy_upper, + const int ix_upper, + T roi_start_h, + T roi_start_w, + T bin_size_h, + T bin_size_w, + int roi_bin_grid_h, + int roi_bin_grid_w, + std::vector>& pre_calc) { + int pre_calc_index = 0; + for (int ph = 0; ph < pooled_height; ph++) { + for (int pw = 0; pw < pooled_width; pw++) { + for (int iy = 0; iy < iy_upper; iy++) { + const T yy = roi_start_h + ph * bin_size_h + + static_cast(iy + .5f) * bin_size_h / + static_cast(roi_bin_grid_h); // e.g., 0.5, 1.5 + for (int ix = 0; ix < ix_upper; ix++) { + const T xx = roi_start_w + pw * bin_size_w + + static_cast(ix + .5f) * bin_size_w / + static_cast(roi_bin_grid_w); + + T x = xx; + T y = yy; + // deal with: inverse elements are out of feature map boundary + if (y < -1.0 || y > height || x < -1.0 || x > width) { + // empty + PreCalc pc; + pc.pos1 = 0; + pc.pos2 = 0; + pc.pos3 = 0; + pc.pos4 = 0; + pc.w1 = 0; + pc.w2 = 0; + pc.w3 = 0; + pc.w4 = 0; + pre_calc[pre_calc_index] = pc; + pre_calc_index += 1; + continue; + } + + if (y <= 0) { + y = 0; + } + if (x <= 0) { + x = 0; + } + + int y_low = (int)y; + int x_low = (int)x; + int y_high; + int x_high; + + if (y_low >= height - 1) { + y_high = y_low = height - 1; + y = (T)y_low; + } else { + y_high = y_low + 1; + } + + if (x_low >= width - 1) { + x_high = x_low = width - 1; + x = (T)x_low; + } else { + x_high = x_low + 1; + } + + T ly = y - y_low; + T lx = x - x_low; + T hy = 1. - ly, hx = 1. - lx; + T w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx; + + // save weights and indeces + PreCalc pc; + pc.pos1 = y_low * width + x_low; + pc.pos2 = y_low * width + x_high; + pc.pos3 = y_high * width + x_low; + pc.pos4 = y_high * width + x_high; + pc.w1 = w1; + pc.w2 = w2; + pc.w3 = w3; + pc.w4 = w4; + pre_calc[pre_calc_index] = pc; + + pre_calc_index += 1; + } + } + } + } +} + +template +void ROIAlignForward_cpu_kernel( + const int nthreads, + const T* bottom_data, + const T& spatial_scale, + const int channels, + const int height, + const int width, + const int pooled_height, + const int pooled_width, + const int sampling_ratio, + const T* bottom_rois, + //int roi_cols, + T* top_data) { + //AT_ASSERT(roi_cols == 4 || roi_cols == 5); + int roi_cols = 5; + + int n_rois = nthreads / channels / pooled_width / pooled_height; + // (n, c, ph, pw) is an element in the pooled output + // can be parallelized using omp + // #pragma omp parallel for num_threads(32) + for (int n = 0; n < n_rois; n++) { + int index_n = n * channels * pooled_width * pooled_height; + + // roi could have 4 or 5 columns + const T* offset_bottom_rois = bottom_rois + n * roi_cols; + int roi_batch_ind = 0; + if (roi_cols == 5) { + roi_batch_ind = offset_bottom_rois[0]; + offset_bottom_rois++; + } + + // Do not using rounding; this implementation detail is critical + T roi_start_w = offset_bottom_rois[0] * spatial_scale; + T roi_start_h = offset_bottom_rois[1] * spatial_scale; + T roi_end_w = offset_bottom_rois[2] * spatial_scale; + T roi_end_h = offset_bottom_rois[3] * spatial_scale; + // T roi_start_w = round(offset_bottom_rois[0] * spatial_scale); + // T roi_start_h = round(offset_bottom_rois[1] * spatial_scale); + // T roi_end_w = round(offset_bottom_rois[2] * spatial_scale); + // T roi_end_h = 
round(offset_bottom_rois[3] * spatial_scale); + + // Force malformed ROIs to be 1x1 + T roi_width = std::max(roi_end_w - roi_start_w, (T)1.); + T roi_height = std::max(roi_end_h - roi_start_h, (T)1.); + T bin_size_h = static_cast(roi_height) / static_cast(pooled_height); + T bin_size_w = static_cast(roi_width) / static_cast(pooled_width); + + // We use roi_bin_grid to sample the grid and mimic integral + int roi_bin_grid_h = (sampling_ratio > 0) + ? sampling_ratio + : ceil(roi_height / pooled_height); // e.g., = 2 + int roi_bin_grid_w = + (sampling_ratio > 0) ? sampling_ratio : ceil(roi_width / pooled_width); + + // We do average (integral) pooling inside a bin + const T count = roi_bin_grid_h * roi_bin_grid_w; // e.g. = 4 + + // we want to precalculate indeces and weights shared by all chanels, + // this is the key point of optimiation + std::vector> pre_calc( + roi_bin_grid_h * roi_bin_grid_w * pooled_width * pooled_height); + pre_calc_for_bilinear_interpolate( + height, + width, + pooled_height, + pooled_width, + roi_bin_grid_h, + roi_bin_grid_w, + roi_start_h, + roi_start_w, + bin_size_h, + bin_size_w, + roi_bin_grid_h, + roi_bin_grid_w, + pre_calc); + + for (int c = 0; c < channels; c++) { + int index_n_c = index_n + c * pooled_width * pooled_height; + const T* offset_bottom_data = + bottom_data + (roi_batch_ind * channels + c) * height * width; + int pre_calc_index = 0; + + for (int ph = 0; ph < pooled_height; ph++) { + for (int pw = 0; pw < pooled_width; pw++) { + int index = index_n_c + ph * pooled_width + pw; + + T output_val = 0.; + for (int iy = 0; iy < roi_bin_grid_h; iy++) { + for (int ix = 0; ix < roi_bin_grid_w; ix++) { + PreCalc pc = pre_calc[pre_calc_index]; + output_val += pc.w1 * offset_bottom_data[pc.pos1] + + pc.w2 * offset_bottom_data[pc.pos2] + + pc.w3 * offset_bottom_data[pc.pos3] + + pc.w4 * offset_bottom_data[pc.pos4]; + + pre_calc_index += 1; + } + } + output_val /= count; + + top_data[index] = output_val; + } // for pw + } // for ph + } // for c + } // for n +} + +at::Tensor ROIAlign_forward_cpu(const at::Tensor& input, + const at::Tensor& rois, + const float spatial_scale, + const int pooled_height, + const int pooled_width, + const int sampling_ratio) { + AT_ASSERTM(!input.type().is_cuda(), "input must be a CPU tensor"); + AT_ASSERTM(!rois.type().is_cuda(), "rois must be a CPU tensor"); + + auto num_rois = rois.size(0); + auto channels = input.size(1); + auto height = input.size(2); + auto width = input.size(3); + + auto output = at::empty({num_rois, channels, pooled_height, pooled_width}, input.options()); + auto output_size = num_rois * pooled_height * pooled_width * channels; + + if (output.numel() == 0) { + return output; + } + + AT_DISPATCH_FLOATING_TYPES(input.type(), "ROIAlign_forward", [&] { + ROIAlignForward_cpu_kernel( + output_size, + input.data(), + spatial_scale, + channels, + height, + width, + pooled_height, + pooled_width, + sampling_ratio, + rois.data(), + output.data()); + }); + return output; +} diff --git a/maskrcnn_benchmark/csrc/cpu/dcn_v2_cpu.cpp b/maskrcnn_benchmark/csrc/cpu/dcn_v2_cpu.cpp new file mode 100644 index 0000000000000000000000000000000000000000..fc8a2a13c4dab47fbbc5dfc83dfc269a9ff68ecf --- /dev/null +++ b/maskrcnn_benchmark/csrc/cpu/dcn_v2_cpu.cpp @@ -0,0 +1,74 @@ +#include + +#include +#include + + +at::Tensor +dcn_v2_cpu_forward(const at::Tensor &input, + const at::Tensor &weight, + const at::Tensor &bias, + const at::Tensor &offset, + const at::Tensor &mask, + const int kernel_h, + const int kernel_w, + const int 
stride_h, + const int stride_w, + const int pad_h, + const int pad_w, + const int dilation_h, + const int dilation_w, + const int deformable_group) +{ + AT_ERROR("Not implemented on the CPU"); +} + +std::vector<at::Tensor> +dcn_v2_cpu_backward(const at::Tensor &input, + const at::Tensor &weight, + const at::Tensor &bias, + const at::Tensor &offset, + const at::Tensor &mask, + const at::Tensor &grad_output, + int kernel_h, int kernel_w, + int stride_h, int stride_w, + int pad_h, int pad_w, + int dilation_h, int dilation_w, + int deformable_group) +{ + AT_ERROR("Not implemented on the CPU"); +} + +std::tuple<at::Tensor, at::Tensor> +dcn_v2_psroi_pooling_cpu_forward(const at::Tensor &input, + const at::Tensor &bbox, + const at::Tensor &trans, + const int no_trans, + const float spatial_scale, + const int output_dim, + const int group_size, + const int pooled_size, + const int part_size, + const int sample_per_part, + const float trans_std) +{ + AT_ERROR("Not implemented on the CPU"); +} + +std::tuple<at::Tensor, at::Tensor> +dcn_v2_psroi_pooling_cpu_backward(const at::Tensor &out_grad, + const at::Tensor &input, + const at::Tensor &bbox, + const at::Tensor &trans, + const at::Tensor &top_count, + const int no_trans, + const float spatial_scale, + const int output_dim, + const int group_size, + const int pooled_size, + const int part_size, + const int sample_per_part, + const float trans_std) +{ + AT_ERROR("Not implemented on the CPU"); +} diff --git a/maskrcnn_benchmark/csrc/cpu/nms_cpu.cpp b/maskrcnn_benchmark/csrc/cpu/nms_cpu.cpp new file mode 100644 index 0000000000000000000000000000000000000000..1153dea04f032c67c41bd0d2a285376a72c5a595 --- /dev/null +++ b/maskrcnn_benchmark/csrc/cpu/nms_cpu.cpp @@ -0,0 +1,75 @@ +// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +#include "cpu/vision.h" + + +template <typename scalar_t> +at::Tensor nms_cpu_kernel(const at::Tensor& dets, + const at::Tensor& scores, + const float threshold) { + AT_ASSERTM(!dets.type().is_cuda(), "dets must be a CPU tensor"); + AT_ASSERTM(!scores.type().is_cuda(), "scores must be a CPU tensor"); + AT_ASSERTM(dets.type() == scores.type(), "dets should have the same type as scores"); + + if (dets.numel() == 0) { + return at::empty({0}, dets.options().dtype(at::kLong).device(at::kCPU)); + } + + auto x1_t = dets.select(1, 0).contiguous(); + auto y1_t = dets.select(1, 1).contiguous(); + auto x2_t = dets.select(1, 2).contiguous(); + auto y2_t = dets.select(1, 3).contiguous(); + + at::Tensor areas_t = (x2_t - x1_t + 1) * (y2_t - y1_t + 1); + + auto order_t = std::get<1>(scores.sort(0, /* descending=*/true)); + + auto ndets = dets.size(0); + at::Tensor suppressed_t = at::zeros({ndets}, dets.options().dtype(at::kByte).device(at::kCPU)); + + auto suppressed = suppressed_t.data<uint8_t>(); + auto order = order_t.data<int64_t>(); + auto x1 = x1_t.data<scalar_t>(); + auto y1 = y1_t.data<scalar_t>(); + auto x2 = x2_t.data<scalar_t>(); + auto y2 = y2_t.data<scalar_t>(); + auto areas = areas_t.data<scalar_t>(); + + for (int64_t _i = 0; _i < ndets; _i++) { + auto i = order[_i]; + if (suppressed[i] == 1) + continue; + auto ix1 = x1[i]; + auto iy1 = y1[i]; + auto ix2 = x2[i]; + auto iy2 = y2[i]; + auto iarea = areas[i]; + + for (int64_t _j = _i + 1; _j < ndets; _j++) { + auto j = order[_j]; + if (suppressed[j] == 1) + continue; + auto xx1 = std::max(ix1, x1[j]); + auto yy1 = std::max(iy1, y1[j]); + auto xx2 = std::min(ix2, x2[j]); + auto yy2 = std::min(iy2, y2[j]); + + auto w = std::max(static_cast<scalar_t>(0), xx2 - xx1 + 1); + auto h = std::max(static_cast<scalar_t>(0), yy2 - yy1 + 1); + auto inter = w * h; + auto ovr = inter / (iarea + areas[j] - inter); + if (ovr >= threshold) + suppressed[j] = 1; + } + }
+ return at::nonzero(suppressed_t == 0).squeeze(1); +} + +at::Tensor nms_cpu(const at::Tensor& dets, + const at::Tensor& scores, + const float threshold) { + at::Tensor result; + AT_DISPATCH_FLOATING_TYPES(dets.type(), "nms", [&] { + result = nms_cpu_kernel(dets, scores, threshold); + }); + return result; +} diff --git a/maskrcnn_benchmark/csrc/cpu/vision.h b/maskrcnn_benchmark/csrc/cpu/vision.h new file mode 100644 index 0000000000000000000000000000000000000000..19539caf9c5aa8b8025f786c3e54e23de300cf5e --- /dev/null +++ b/maskrcnn_benchmark/csrc/cpu/vision.h @@ -0,0 +1,73 @@ +// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +#pragma once +#include + + +at::Tensor ROIAlign_forward_cpu(const at::Tensor& input, + const at::Tensor& rois, + const float spatial_scale, + const int pooled_height, + const int pooled_width, + const int sampling_ratio); + + +at::Tensor nms_cpu(const at::Tensor& dets, + const at::Tensor& scores, + const float threshold); +at::Tensor +dcn_v2_cpu_forward(const at::Tensor &input, + const at::Tensor &weight, + const at::Tensor &bias, + const at::Tensor &offset, + const at::Tensor &mask, + const int kernel_h, + const int kernel_w, + const int stride_h, + const int stride_w, + const int pad_h, + const int pad_w, + const int dilation_h, + const int dilation_w, + const int deformable_group); + +std::vector +dcn_v2_cpu_backward(const at::Tensor &input, + const at::Tensor &weight, + const at::Tensor &bias, + const at::Tensor &offset, + const at::Tensor &mask, + const at::Tensor &grad_output, + int kernel_h, int kernel_w, + int stride_h, int stride_w, + int pad_h, int pad_w, + int dilation_h, int dilation_w, + int deformable_group); + + +std::tuple +dcn_v2_psroi_pooling_cpu_forward(const at::Tensor &input, + const at::Tensor &bbox, + const at::Tensor &trans, + const int no_trans, + const float spatial_scale, + const int output_dim, + const int group_size, + const int pooled_size, + const int part_size, + const int sample_per_part, + const float trans_std); + +std::tuple +dcn_v2_psroi_pooling_cpu_backward(const at::Tensor &out_grad, + const at::Tensor &input, + const at::Tensor &bbox, + const at::Tensor &trans, + const at::Tensor &top_count, + const int no_trans, + const float spatial_scale, + const int output_dim, + const int group_size, + const int pooled_size, + const int part_size, + const int sample_per_part, + const float trans_std); diff --git a/maskrcnn_benchmark/csrc/cuda/ROIAlign_cuda.cu b/maskrcnn_benchmark/csrc/cuda/ROIAlign_cuda.cu new file mode 100644 index 0000000000000000000000000000000000000000..1142fb37597141122ee63161d0abd7beac510a74 --- /dev/null +++ b/maskrcnn_benchmark/csrc/cuda/ROIAlign_cuda.cu @@ -0,0 +1,346 @@ +// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 
+#include +#include + +#include +#include +#include + +// TODO make it in a common file +#define CUDA_1D_KERNEL_LOOP(i, n) \ + for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n; \ + i += blockDim.x * gridDim.x) + + +template +__device__ T bilinear_interpolate(const T* bottom_data, + const int height, const int width, + T y, T x, + const int index /* index for debug only*/) { + + // deal with cases that inverse elements are out of feature map boundary + if (y < -1.0 || y > height || x < -1.0 || x > width) { + //empty + return 0; + } + + if (y <= 0) y = 0; + if (x <= 0) x = 0; + + int y_low = (int) y; + int x_low = (int) x; + int y_high; + int x_high; + + if (y_low >= height - 1) { + y_high = y_low = height - 1; + y = (T) y_low; + } else { + y_high = y_low + 1; + } + + if (x_low >= width - 1) { + x_high = x_low = width - 1; + x = (T) x_low; + } else { + x_high = x_low + 1; + } + + T ly = y - y_low; + T lx = x - x_low; + T hy = 1. - ly, hx = 1. - lx; + // do bilinear interpolation + T v1 = bottom_data[y_low * width + x_low]; + T v2 = bottom_data[y_low * width + x_high]; + T v3 = bottom_data[y_high * width + x_low]; + T v4 = bottom_data[y_high * width + x_high]; + T w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx; + + T val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4); + + return val; +} + +template +__global__ void RoIAlignForward(const int nthreads, const T* bottom_data, + const T spatial_scale, const int channels, + const int height, const int width, + const int pooled_height, const int pooled_width, + const int sampling_ratio, + const T* bottom_rois, T* top_data) { + CUDA_1D_KERNEL_LOOP(index, nthreads) { + // (n, c, ph, pw) is an element in the pooled output + int pw = index % pooled_width; + int ph = (index / pooled_width) % pooled_height; + int c = (index / pooled_width / pooled_height) % channels; + int n = index / pooled_width / pooled_height / channels; + + const T* offset_bottom_rois = bottom_rois + n * 5; + int roi_batch_ind = offset_bottom_rois[0]; + + // Do not using rounding; this implementation detail is critical + T roi_start_w = offset_bottom_rois[1] * spatial_scale; + T roi_start_h = offset_bottom_rois[2] * spatial_scale; + T roi_end_w = offset_bottom_rois[3] * spatial_scale; + T roi_end_h = offset_bottom_rois[4] * spatial_scale; + // T roi_start_w = round(offset_bottom_rois[1] * spatial_scale); + // T roi_start_h = round(offset_bottom_rois[2] * spatial_scale); + // T roi_end_w = round(offset_bottom_rois[3] * spatial_scale); + // T roi_end_h = round(offset_bottom_rois[4] * spatial_scale); + + // Force malformed ROIs to be 1x1 + T roi_width = max(roi_end_w - roi_start_w, (T)1.); + T roi_height = max(roi_end_h - roi_start_h, (T)1.); + T bin_size_h = static_cast(roi_height) / static_cast(pooled_height); + T bin_size_w = static_cast(roi_width) / static_cast(pooled_width); + + const T* offset_bottom_data = bottom_data + (roi_batch_ind * channels + c) * height * width; + + // We use roi_bin_grid to sample the grid and mimic integral + int roi_bin_grid_h = (sampling_ratio > 0) ? sampling_ratio : ceil(roi_height / pooled_height); // e.g., = 2 + int roi_bin_grid_w = (sampling_ratio > 0) ? sampling_ratio : ceil(roi_width / pooled_width); + + // We do average (integral) pooling inside a bin + const T count = roi_bin_grid_h * roi_bin_grid_w; // e.g. 
= 4 + + T output_val = 0.; + for (int iy = 0; iy < roi_bin_grid_h; iy ++) // e.g., iy = 0, 1 + { + const T y = roi_start_h + ph * bin_size_h + static_cast(iy + .5f) * bin_size_h / static_cast(roi_bin_grid_h); // e.g., 0.5, 1.5 + for (int ix = 0; ix < roi_bin_grid_w; ix ++) + { + const T x = roi_start_w + pw * bin_size_w + static_cast(ix + .5f) * bin_size_w / static_cast(roi_bin_grid_w); + + T val = bilinear_interpolate(offset_bottom_data, height, width, y, x, index); + output_val += val; + } + } + output_val /= count; + + top_data[index] = output_val; + } +} + + +template +__device__ void bilinear_interpolate_gradient( + const int height, const int width, + T y, T x, + T & w1, T & w2, T & w3, T & w4, + int & x_low, int & x_high, int & y_low, int & y_high, + const int index /* index for debug only*/) { + + // deal with cases that inverse elements are out of feature map boundary + if (y < -1.0 || y > height || x < -1.0 || x > width) { + //empty + w1 = w2 = w3 = w4 = 0.; + x_low = x_high = y_low = y_high = -1; + return; + } + + if (y <= 0) y = 0; + if (x <= 0) x = 0; + + y_low = (int) y; + x_low = (int) x; + + if (y_low >= height - 1) { + y_high = y_low = height - 1; + y = (T) y_low; + } else { + y_high = y_low + 1; + } + + if (x_low >= width - 1) { + x_high = x_low = width - 1; + x = (T) x_low; + } else { + x_high = x_low + 1; + } + + T ly = y - y_low; + T lx = x - x_low; + T hy = 1. - ly, hx = 1. - lx; + + // reference in forward + // T v1 = bottom_data[y_low * width + x_low]; + // T v2 = bottom_data[y_low * width + x_high]; + // T v3 = bottom_data[y_high * width + x_low]; + // T v4 = bottom_data[y_high * width + x_high]; + // T val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4); + + w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx; + + return; +} + +template +__global__ void RoIAlignBackwardFeature(const int nthreads, const T* top_diff, + const int num_rois, const T spatial_scale, + const int channels, const int height, const int width, + const int pooled_height, const int pooled_width, + const int sampling_ratio, + T* bottom_diff, + const T* bottom_rois) { + CUDA_1D_KERNEL_LOOP(index, nthreads) { + // (n, c, ph, pw) is an element in the pooled output + int pw = index % pooled_width; + int ph = (index / pooled_width) % pooled_height; + int c = (index / pooled_width / pooled_height) % channels; + int n = index / pooled_width / pooled_height / channels; + + const T* offset_bottom_rois = bottom_rois + n * 5; + int roi_batch_ind = offset_bottom_rois[0]; + + // Do not using rounding; this implementation detail is critical + T roi_start_w = offset_bottom_rois[1] * spatial_scale; + T roi_start_h = offset_bottom_rois[2] * spatial_scale; + T roi_end_w = offset_bottom_rois[3] * spatial_scale; + T roi_end_h = offset_bottom_rois[4] * spatial_scale; + // T roi_start_w = round(offset_bottom_rois[1] * spatial_scale); + // T roi_start_h = round(offset_bottom_rois[2] * spatial_scale); + // T roi_end_w = round(offset_bottom_rois[3] * spatial_scale); + // T roi_end_h = round(offset_bottom_rois[4] * spatial_scale); + + // Force malformed ROIs to be 1x1 + T roi_width = max(roi_end_w - roi_start_w, (T)1.); + T roi_height = max(roi_end_h - roi_start_h, (T)1.); + T bin_size_h = static_cast(roi_height) / static_cast(pooled_height); + T bin_size_w = static_cast(roi_width) / static_cast(pooled_width); + + T* offset_bottom_diff = bottom_diff + (roi_batch_ind * channels + c) * height * width; + + int top_offset = (n * channels + c) * pooled_height * pooled_width; + const T* offset_top_diff = top_diff + top_offset; + 
const T top_diff_this_bin = offset_top_diff[ph * pooled_width + pw]; + + // We use roi_bin_grid to sample the grid and mimic integral + int roi_bin_grid_h = (sampling_ratio > 0) ? sampling_ratio : ceil(roi_height / pooled_height); // e.g., = 2 + int roi_bin_grid_w = (sampling_ratio > 0) ? sampling_ratio : ceil(roi_width / pooled_width); + + // We do average (integral) pooling inside a bin + const T count = roi_bin_grid_h * roi_bin_grid_w; // e.g. = 4 + + for (int iy = 0; iy < roi_bin_grid_h; iy ++) // e.g., iy = 0, 1 + { + const T y = roi_start_h + ph * bin_size_h + static_cast(iy + .5f) * bin_size_h / static_cast(roi_bin_grid_h); // e.g., 0.5, 1.5 + for (int ix = 0; ix < roi_bin_grid_w; ix ++) + { + const T x = roi_start_w + pw * bin_size_w + static_cast(ix + .5f) * bin_size_w / static_cast(roi_bin_grid_w); + + T w1, w2, w3, w4; + int x_low, x_high, y_low, y_high; + + bilinear_interpolate_gradient(height, width, y, x, + w1, w2, w3, w4, + x_low, x_high, y_low, y_high, + index); + + T g1 = top_diff_this_bin * w1 / count; + T g2 = top_diff_this_bin * w2 / count; + T g3 = top_diff_this_bin * w3 / count; + T g4 = top_diff_this_bin * w4 / count; + + if (x_low >= 0 && x_high >= 0 && y_low >= 0 && y_high >= 0) + { + atomicAdd(offset_bottom_diff + y_low * width + x_low, static_cast(g1)); + atomicAdd(offset_bottom_diff + y_low * width + x_high, static_cast(g2)); + atomicAdd(offset_bottom_diff + y_high * width + x_low, static_cast(g3)); + atomicAdd(offset_bottom_diff + y_high * width + x_high, static_cast(g4)); + } // if + } // ix + } // iy + } // CUDA_1D_KERNEL_LOOP +} // RoIAlignBackward + + +at::Tensor ROIAlign_forward_cuda(const at::Tensor& input, + const at::Tensor& rois, + const float spatial_scale, + const int pooled_height, + const int pooled_width, + const int sampling_ratio) { + AT_ASSERTM(input.type().is_cuda(), "input must be a CUDA tensor"); + AT_ASSERTM(rois.type().is_cuda(), "rois must be a CUDA tensor"); + + auto num_rois = rois.size(0); + auto channels = input.size(1); + auto height = input.size(2); + auto width = input.size(3); + + auto output = at::empty({num_rois, channels, pooled_height, pooled_width}, input.options()); + auto output_size = num_rois * pooled_height * pooled_width * channels; + cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + + dim3 grid(std::min(THCCeilDiv((long)output_size, 512L), 4096L)); + dim3 block(512); + + if (output.numel() == 0) { + THCudaCheck(cudaGetLastError()); + return output; + } + + AT_DISPATCH_FLOATING_TYPES(input.type(), "ROIAlign_forward", [&] { + RoIAlignForward<<>>( + output_size, + input.contiguous().data(), + spatial_scale, + channels, + height, + width, + pooled_height, + pooled_width, + sampling_ratio, + rois.contiguous().data(), + output.data()); + }); + THCudaCheck(cudaGetLastError()); + return output; +} + +// TODO remove the dependency on input and use instead its sizes -> save memory +at::Tensor ROIAlign_backward_cuda(const at::Tensor& grad, + const at::Tensor& rois, + const float spatial_scale, + const int pooled_height, + const int pooled_width, + const int batch_size, + const int channels, + const int height, + const int width, + const int sampling_ratio) { + AT_ASSERTM(grad.type().is_cuda(), "grad must be a CUDA tensor"); + AT_ASSERTM(rois.type().is_cuda(), "rois must be a CUDA tensor"); + + auto num_rois = rois.size(0); + auto grad_input = at::zeros({batch_size, channels, height, width}, grad.options()); + + cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + + dim3 grid(std::min(THCCeilDiv((long)grad.numel(), 
512L), 4096L)); + dim3 block(512); + + // handle possibly empty gradients + if (grad.numel() == 0) { + THCudaCheck(cudaGetLastError()); + return grad_input; + } + + AT_DISPATCH_FLOATING_TYPES(grad.type(), "ROIAlign_backward", [&] { + RoIAlignBackwardFeature<<>>( + grad.numel(), + grad.contiguous().data(), + num_rois, + spatial_scale, + channels, + height, + width, + pooled_height, + pooled_width, + sampling_ratio, + grad_input.data(), + rois.contiguous().data()); + }); + THCudaCheck(cudaGetLastError()); + return grad_input; +} diff --git a/maskrcnn_benchmark/csrc/cuda/ROIPool_cuda.cu b/maskrcnn_benchmark/csrc/cuda/ROIPool_cuda.cu new file mode 100644 index 0000000000000000000000000000000000000000..8f072ffc2bd6de310f0d92c8c513dd9cfcc80dbc --- /dev/null +++ b/maskrcnn_benchmark/csrc/cuda/ROIPool_cuda.cu @@ -0,0 +1,202 @@ +// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +#include +#include + +#include +#include +#include + + +// TODO make it in a common file +#define CUDA_1D_KERNEL_LOOP(i, n) \ + for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n; \ + i += blockDim.x * gridDim.x) + + +template +__global__ void RoIPoolFForward(const int nthreads, const T* bottom_data, + const T spatial_scale, const int channels, const int height, + const int width, const int pooled_height, const int pooled_width, + const T* bottom_rois, T* top_data, int* argmax_data) { + CUDA_1D_KERNEL_LOOP(index, nthreads) { + // (n, c, ph, pw) is an element in the pooled output + int pw = index % pooled_width; + int ph = (index / pooled_width) % pooled_height; + int c = (index / pooled_width / pooled_height) % channels; + int n = index / pooled_width / pooled_height / channels; + + const T* offset_bottom_rois = bottom_rois + n * 5; + int roi_batch_ind = offset_bottom_rois[0]; + int roi_start_w = round(offset_bottom_rois[1] * spatial_scale); + int roi_start_h = round(offset_bottom_rois[2] * spatial_scale); + int roi_end_w = round(offset_bottom_rois[3] * spatial_scale); + int roi_end_h = round(offset_bottom_rois[4] * spatial_scale); + + // Force malformed ROIs to be 1x1 + int roi_width = max(roi_end_w - roi_start_w + 1, 1); + int roi_height = max(roi_end_h - roi_start_h + 1, 1); + T bin_size_h = static_cast(roi_height) + / static_cast(pooled_height); + T bin_size_w = static_cast(roi_width) + / static_cast(pooled_width); + + int hstart = static_cast(floor(static_cast(ph) + * bin_size_h)); + int wstart = static_cast(floor(static_cast(pw) + * bin_size_w)); + int hend = static_cast(ceil(static_cast(ph + 1) + * bin_size_h)); + int wend = static_cast(ceil(static_cast(pw + 1) + * bin_size_w)); + + // Add roi offsets and clip to input boundaries + hstart = min(max(hstart + roi_start_h, 0), height); + hend = min(max(hend + roi_start_h, 0), height); + wstart = min(max(wstart + roi_start_w, 0), width); + wend = min(max(wend + roi_start_w, 0), width); + bool is_empty = (hend <= hstart) || (wend <= wstart); + + // Define an empty pooling region to be zero + T maxval = is_empty ? 
0 : -FLT_MAX; + // If nothing is pooled, argmax = -1 causes nothing to be backprop'd + int maxidx = -1; + const T* offset_bottom_data = + bottom_data + (roi_batch_ind * channels + c) * height * width; + for (int h = hstart; h < hend; ++h) { + for (int w = wstart; w < wend; ++w) { + int bottom_index = h * width + w; + if (offset_bottom_data[bottom_index] > maxval) { + maxval = offset_bottom_data[bottom_index]; + maxidx = bottom_index; + } + } + } + top_data[index] = maxval; + argmax_data[index] = maxidx; + } +} + +template +__global__ void RoIPoolFBackward(const int nthreads, const T* top_diff, + const int* argmax_data, const int num_rois, const T spatial_scale, + const int channels, const int height, const int width, + const int pooled_height, const int pooled_width, T* bottom_diff, + const T* bottom_rois) { + CUDA_1D_KERNEL_LOOP(index, nthreads) { + // (n, c, ph, pw) is an element in the pooled output + int pw = index % pooled_width; + int ph = (index / pooled_width) % pooled_height; + int c = (index / pooled_width / pooled_height) % channels; + int n = index / pooled_width / pooled_height / channels; + + const T* offset_bottom_rois = bottom_rois + n * 5; + int roi_batch_ind = offset_bottom_rois[0]; + int bottom_offset = (roi_batch_ind * channels + c) * height * width; + int top_offset = (n * channels + c) * pooled_height * pooled_width; + const T* offset_top_diff = top_diff + top_offset; + T* offset_bottom_diff = bottom_diff + bottom_offset; + const int* offset_argmax_data = argmax_data + top_offset; + + int argmax = offset_argmax_data[ph * pooled_width + pw]; + if (argmax != -1) { + atomicAdd( + offset_bottom_diff + argmax, + static_cast(offset_top_diff[ph * pooled_width + pw])); + + } + } +} + +std::tuple ROIPool_forward_cuda(const at::Tensor& input, + const at::Tensor& rois, + const float spatial_scale, + const int pooled_height, + const int pooled_width) { + AT_ASSERTM(input.type().is_cuda(), "input must be a CUDA tensor"); + AT_ASSERTM(rois.type().is_cuda(), "rois must be a CUDA tensor"); + + auto num_rois = rois.size(0); + auto channels = input.size(1); + auto height = input.size(2); + auto width = input.size(3); + + auto output = at::empty({num_rois, channels, pooled_height, pooled_width}, input.options()); + auto output_size = num_rois * pooled_height * pooled_width * channels; + auto argmax = at::zeros({num_rois, channels, pooled_height, pooled_width}, input.options().dtype(at::kInt)); + + cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + + dim3 grid(std::min(THCCeilDiv((long)output_size, 512L), 4096L)); + dim3 block(512); + + if (output.numel() == 0) { + THCudaCheck(cudaGetLastError()); + return std::make_tuple(output, argmax); + } + + AT_DISPATCH_FLOATING_TYPES(input.type(), "ROIPool_forward", [&] { + RoIPoolFForward<<>>( + output_size, + input.contiguous().data(), + spatial_scale, + channels, + height, + width, + pooled_height, + pooled_width, + rois.contiguous().data(), + output.data(), + argmax.data()); + }); + THCudaCheck(cudaGetLastError()); + return std::make_tuple(output, argmax); +} + +// TODO remove the dependency on input and use instead its sizes -> save memory +at::Tensor ROIPool_backward_cuda(const at::Tensor& grad, + const at::Tensor& input, + const at::Tensor& rois, + const at::Tensor& argmax, + const float spatial_scale, + const int pooled_height, + const int pooled_width, + const int batch_size, + const int channels, + const int height, + const int width) { + AT_ASSERTM(grad.type().is_cuda(), "grad must be a CUDA tensor"); + 
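+  // Editor's note (added comment): unlike RoIAlign, RoIPool's forward pass stored an
+  // argmax index per output cell, so RoIPoolFBackward above routes each upstream gradient
+  // to exactly one input location with a single atomicAdd; argmax == -1 (an empty bin)
+  // propagates nothing.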
AT_ASSERTM(rois.type().is_cuda(), "rois must be a CUDA tensor"); + // TODO add more checks + + auto num_rois = rois.size(0); + auto grad_input = at::zeros({batch_size, channels, height, width}, grad.options()); + + cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + + dim3 grid(std::min(THCCeilDiv((long)grad.numel(), 512L), 4096L)); + dim3 block(512); + + // handle possibly empty gradients + if (grad.numel() == 0) { + THCudaCheck(cudaGetLastError()); + return grad_input; + } + + AT_DISPATCH_FLOATING_TYPES(grad.type(), "ROIPool_backward", [&] { + RoIPoolFBackward<<>>( + grad.numel(), + grad.contiguous().data(), + argmax.data(), + num_rois, + spatial_scale, + channels, + height, + width, + pooled_height, + pooled_width, + grad_input.data(), + rois.contiguous().data()); + }); + THCudaCheck(cudaGetLastError()); + return grad_input; +} diff --git a/maskrcnn_benchmark/csrc/cuda/SigmoidFocalLoss_cuda.cu b/maskrcnn_benchmark/csrc/cuda/SigmoidFocalLoss_cuda.cu new file mode 100644 index 0000000000000000000000000000000000000000..7d40767bbb690eb8e55397bca83af636c7e0531c --- /dev/null +++ b/maskrcnn_benchmark/csrc/cuda/SigmoidFocalLoss_cuda.cu @@ -0,0 +1,188 @@ +// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +// This file is modified from https://github.com/pytorch/pytorch/blob/master/modules/detectron/sigmoid_focal_loss_op.cu +// Cheng-Yang Fu +// cyfu@cs.unc.edu +#include +#include + +#include +#include +#include + +#include + +// TODO make it in a common file +#define CUDA_1D_KERNEL_LOOP(i, n) \ + for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n; \ + i += blockDim.x * gridDim.x) + + +template +__global__ void SigmoidFocalLossForward(const int nthreads, + const T* logits, + const int* targets, + const int num_classes, + const float gamma, + const float alpha, + const int num, + T* losses) { + CUDA_1D_KERNEL_LOOP(i, nthreads) { + + int n = i / num_classes; + int d = i % num_classes; // current class[0~79]; + int t = targets[n]; // target class [1~80]; + + // Decide it is positive or negative case. + T c1 = (t == (d+1)); + T c2 = (t>=0 & t != (d+1)); + + T zn = (1.0 - alpha); + T zp = (alpha); + + // p = 1. / 1. + expf(-x); p = sigmoid(x) + T p = 1. / (1. + expf(-logits[i])); + + // (1-p)**gamma * log(p) where + T term1 = powf((1. - p), gamma) * logf(max(p, FLT_MIN)); + + // p**gamma * log(1-p) + T term2 = powf(p, gamma) * + (-1. * logits[i] * (logits[i] >= 0) - + logf(1. + expf(logits[i] - 2. * logits[i] * (logits[i] >= 0)))); + + losses[i] = 0.0; + losses[i] += -c1 * term1 * zp; + losses[i] += -c2 * term2 * zn; + + } // CUDA_1D_KERNEL_LOOP +} // SigmoidFocalLossForward + + +template +__global__ void SigmoidFocalLossBackward(const int nthreads, + const T* logits, + const int* targets, + const T* d_losses, + const int num_classes, + const float gamma, + const float alpha, + const int num, + T* d_logits) { + CUDA_1D_KERNEL_LOOP(i, nthreads) { + + int n = i / num_classes; + int d = i % num_classes; // current class[0~79]; + int t = targets[n]; // target class [1~80], 0 is background; + + // Decide it is positive or negative case. + T c1 = (t == (d+1)); + T c2 = (t>=0 & t != (d+1)); + + T zn = (1.0 - alpha); + T zp = (alpha); + // p = 1. / 1. + expf(-x); p = sigmoid(x) + T p = 1. / (1. + expf(-logits[i])); + + // (1-p)**g * (1 - p - g*p*log(p) + T term1 = powf((1. - p), gamma) * + (1. - p - (p * gamma * logf(max(p, FLT_MIN)))); + + // (p**g) * (g*(1-p)*log(1-p) - p) + T term2 = powf(p, gamma) * + ((-1. * logits[i] * (logits[i] >= 0) - + logf(1. 
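+    // Editor's note (added comment): together with the -x * (x >= 0) term above, the
+    // logf(1 + expf(x - 2 * x * (x >= 0))) factor is the numerically stable form of
+    // log(1 - p) for p = sigmoid(x): it equals -max(x, 0) - log(1 + exp(-|x|)) and avoids
+    // overflow in expf for large |x|.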
+ expf(logits[i] - 2. * logits[i] * (logits[i] >= 0)))) * + (1. - p) * gamma - p); + d_logits[i] = 0.0; + d_logits[i] += -c1 * term1 * zp; + d_logits[i] += -c2 * term2 * zn; + d_logits[i] = d_logits[i] * d_losses[i]; + + } // CUDA_1D_KERNEL_LOOP +} // SigmoidFocalLossBackward + + +at::Tensor SigmoidFocalLoss_forward_cuda( + const at::Tensor& logits, + const at::Tensor& targets, + const int num_classes, + const float gamma, + const float alpha) { + AT_ASSERTM(logits.type().is_cuda(), "logits must be a CUDA tensor"); + AT_ASSERTM(targets.type().is_cuda(), "targets must be a CUDA tensor"); + AT_ASSERTM(logits.dim() == 2, "logits should be NxClass"); + + const int num_samples = logits.size(0); + + auto losses = at::empty({num_samples, logits.size(1)}, logits.options()); + auto losses_size = num_samples * logits.size(1); + cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + + dim3 grid(std::min(THCCeilDiv(losses_size, 512L), 4096L)); + dim3 block(512); + + if (losses.numel() == 0) { + THCudaCheck(cudaGetLastError()); + return losses; + } + + AT_DISPATCH_FLOATING_TYPES(logits.type(), "SigmoidFocalLoss_forward", [&] { + SigmoidFocalLossForward<<>>( + losses_size, + logits.contiguous().data(), + targets.contiguous().data(), + num_classes, + gamma, + alpha, + num_samples, + losses.data()); + }); + THCudaCheck(cudaGetLastError()); + return losses; +} + + +at::Tensor SigmoidFocalLoss_backward_cuda( + const at::Tensor& logits, + const at::Tensor& targets, + const at::Tensor& d_losses, + const int num_classes, + const float gamma, + const float alpha) { + AT_ASSERTM(logits.type().is_cuda(), "logits must be a CUDA tensor"); + AT_ASSERTM(targets.type().is_cuda(), "targets must be a CUDA tensor"); + AT_ASSERTM(d_losses.type().is_cuda(), "d_losses must be a CUDA tensor"); + + AT_ASSERTM(logits.dim() == 2, "logits should be NxClass"); + + const int num_samples = logits.size(0); + AT_ASSERTM(logits.size(1) == num_classes, "logits.size(1) should be num_classes"); + + auto d_logits = at::zeros({num_samples, num_classes}, logits.options()); + auto d_logits_size = num_samples * logits.size(1); + cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + + dim3 grid(std::min(THCCeilDiv(d_logits_size, 512L), 4096L)); + dim3 block(512); + + if (d_logits.numel() == 0) { + THCudaCheck(cudaGetLastError()); + return d_logits; + } + + AT_DISPATCH_FLOATING_TYPES(logits.type(), "SigmoidFocalLoss_backward", [&] { + SigmoidFocalLossBackward<<>>( + d_logits_size, + logits.contiguous().data(), + targets.contiguous().data(), + d_losses.contiguous().data(), + num_classes, + gamma, + alpha, + num_samples, + d_logits.data()); + }); + + THCudaCheck(cudaGetLastError()); + return d_logits; +} + diff --git a/maskrcnn_benchmark/csrc/cuda/dcn_v2_cuda.cu b/maskrcnn_benchmark/csrc/cuda/dcn_v2_cuda.cu new file mode 100644 index 0000000000000000000000000000000000000000..767ed8fb12b6218223b8331342de91b0f13ea3d4 --- /dev/null +++ b/maskrcnn_benchmark/csrc/cuda/dcn_v2_cuda.cu @@ -0,0 +1,335 @@ +#include +#include "cuda/dcn_v2_im2col_cuda.h" + +#include +#include + +#include +#include +#include + +extern THCState *state; + +// author: Charles Shang +// https://github.com/torch/cunn/blob/master/lib/THCUNN/generic/SpatialConvolutionMM.cu + +// [batch gemm] +// https://github.com/pytorch/pytorch/blob/master/aten/src/THC/generic/THCTensorMathBlas.cu + +__global__ void createBatchGemmBuffer(const float **input_b, float **output_b, + float **columns_b, const float **ones_b, + const float **weight_b, const float **bias_b, + float *input, float 
*output, + float *columns, float *ones, + float *weight, float *bias, + const int input_stride, const int output_stride, + const int columns_stride, const int ones_stride, + const int num_batches) +{ + const int idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx < num_batches) + { + input_b[idx] = input + idx * input_stride; + output_b[idx] = output + idx * output_stride; + columns_b[idx] = columns + idx * columns_stride; + ones_b[idx] = ones + idx * ones_stride; + // share weights and bias within a Mini-Batch + weight_b[idx] = weight; + bias_b[idx] = bias; + } +} + +at::Tensor +dcn_v2_cuda_forward(const at::Tensor &input, + const at::Tensor &weight, + const at::Tensor &bias, + const at::Tensor &offset, + const at::Tensor &mask, + const int kernel_h, + const int kernel_w, + const int stride_h, + const int stride_w, + const int pad_h, + const int pad_w, + const int dilation_h, + const int dilation_w, + const int deformable_group) +{ + using scalar_t = float; + // THCAssertSameGPU(THCudaTensor_checkGPU(state, 5, input, weight, bias, offset, mask)); + AT_ASSERTM(input.type().is_cuda(), "input must be a CUDA tensor"); + AT_ASSERTM(weight.type().is_cuda(), "weight must be a CUDA tensor"); + AT_ASSERTM(bias.type().is_cuda(), "bias must be a CUDA tensor"); + AT_ASSERTM(offset.type().is_cuda(), "offset must be a CUDA tensor"); + AT_ASSERTM(mask.type().is_cuda(), "mask must be a CUDA tensor"); + + const int batch = input.size(0); + const int channels = input.size(1); + const int height = input.size(2); + const int width = input.size(3); + + const int channels_out = weight.size(0); + const int channels_kernel = weight.size(1); + const int kernel_h_ = weight.size(2); + const int kernel_w_ = weight.size(3); + + // printf("Kernels: %d %d %d %d\n", kernel_h_, kernel_w_, kernel_w, kernel_h); + // printf("Channels: %d %d\n", channels, channels_kernel); + // printf("Channels: %d %d\n", channels_out, channels_kernel); + + AT_ASSERTM(kernel_h_ == kernel_h && kernel_w_ == kernel_w, + "Input shape and kernel shape wont match: (%d x %d vs %d x %d).", kernel_h_, kernel_w, kernel_h_, kernel_w_); + + AT_ASSERTM(channels == channels_kernel, + "Input shape and kernel channels wont match: (%d vs %d).", channels, channels_kernel); + + const int height_out = (height + 2 * pad_h - (dilation_h * (kernel_h - 1) + 1)) / stride_h + 1; + const int width_out = (width + 2 * pad_w - (dilation_w * (kernel_w - 1) + 1)) / stride_w + 1; + + auto ones = at::ones({batch, height_out, width_out}, input.options()); + auto columns = at::empty({batch, channels * kernel_h * kernel_w, 1 * height_out * width_out}, input.options()); + auto output = at::empty({batch, channels_out, height_out, width_out}, input.options()); + + // prepare for batch-wise computing, which is significantly faster than instance-wise computing + // when batch size is large. 
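+    // Editor's note, a sketch of the batching trick (names follow the code below): one
+    // device pointer array per operand is filled so that entry b points at sample b's slice, e.g.
+    //   input_b[b]  = input  + b * channels * height * width;
+    //   output_b[b] = output + b * channels_out * height_out * width_out;
+    // while weight_b[b] and bias_b[b] all alias the shared weight and bias. A first
+    // THCudaBlas_SgemmBatched (ones * bias, beta = 0) broadcasts the bias into output, and
+    // a second batched GEMM over the im2col columns accumulates weight * columns on top of
+    // it with beta = 1.0f, replacing a per-sample GEMM loop.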
+ // launch batch threads + int matrices_size = batch * sizeof(float *); + auto input_b = static_cast(THCudaMalloc(state, matrices_size)); + auto output_b = static_cast(THCudaMalloc(state, matrices_size)); + auto columns_b = static_cast(THCudaMalloc(state, matrices_size)); + auto ones_b = static_cast(THCudaMalloc(state, matrices_size)); + auto weight_b = static_cast(THCudaMalloc(state, matrices_size)); + auto bias_b = static_cast(THCudaMalloc(state, matrices_size)); + + const int block = 128; + const int grid = (batch + block - 1) / block; + + createBatchGemmBuffer<<>>( + input_b, output_b, + columns_b, ones_b, + weight_b, bias_b, + input.data(), + output.data(), + columns.data(), + ones.data(), + weight.data(), + bias.data(), + channels * width * height, + channels_out * width_out * height_out, + channels * kernel_h * kernel_w * height_out * width_out, + height_out * width_out, + batch); + + long m_ = channels_out; + long n_ = height_out * width_out; + long k_ = 1; + THCudaBlas_SgemmBatched(state, + 't', + 'n', + n_, + m_, + k_, + 1.0f, + ones_b, k_, + bias_b, k_, + 0.0f, + output_b, n_, + batch); + + modulated_deformable_im2col_cuda(THCState_getCurrentStream(state), + input.data(), + offset.data(), + mask.data(), + batch, channels, height, width, + height_out, width_out, kernel_h, kernel_w, + pad_h, pad_w, stride_h, stride_w, dilation_h, dilation_w, + deformable_group, + columns.data()); + + long m = channels_out; + long n = height_out * width_out; + long k = channels * kernel_h * kernel_w; + THCudaBlas_SgemmBatched(state, + 'n', + 'n', + n, + m, + k, + 1.0f, + (const float **)columns_b, n, + weight_b, k, + 1.0f, + output_b, n, + batch); + + THCudaFree(state, input_b); + THCudaFree(state, output_b); + THCudaFree(state, columns_b); + THCudaFree(state, ones_b); + THCudaFree(state, weight_b); + THCudaFree(state, bias_b); + return output; +} + +__global__ void createBatchGemmBufferBackward( + float **grad_output_b, + float **columns_b, + float **ones_b, + float **weight_b, + float **grad_weight_b, + float **grad_bias_b, + float *grad_output, + float *columns, + float *ones, + float *weight, + float *grad_weight, + float *grad_bias, + const int grad_output_stride, + const int columns_stride, + const int ones_stride, + const int num_batches) +{ + const int idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx < num_batches) + { + grad_output_b[idx] = grad_output + idx * grad_output_stride; + columns_b[idx] = columns + idx * columns_stride; + ones_b[idx] = ones + idx * ones_stride; + + // share weights and bias within a Mini-Batch + weight_b[idx] = weight; + grad_weight_b[idx] = grad_weight; + grad_bias_b[idx] = grad_bias; + } +} + +std::vector dcn_v2_cuda_backward(const at::Tensor &input, + const at::Tensor &weight, + const at::Tensor &bias, + const at::Tensor &offset, + const at::Tensor &mask, + const at::Tensor &grad_output, + int kernel_h, int kernel_w, + int stride_h, int stride_w, + int pad_h, int pad_w, + int dilation_h, int dilation_w, + int deformable_group) +{ + + THArgCheck(input.is_contiguous(), 1, "input tensor has to be contiguous"); + THArgCheck(weight.is_contiguous(), 2, "weight tensor has to be contiguous"); + + AT_ASSERTM(input.type().is_cuda(), "input must be a CUDA tensor"); + AT_ASSERTM(weight.type().is_cuda(), "weight must be a CUDA tensor"); + AT_ASSERTM(bias.type().is_cuda(), "bias must be a CUDA tensor"); + AT_ASSERTM(offset.type().is_cuda(), "offset must be a CUDA tensor"); + AT_ASSERTM(mask.type().is_cuda(), "mask must be a CUDA tensor"); + + const int batch = 
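+    // Editor's note (added comment): the backward pass processes one sample at a time:
+    // grad_output is multiplied by the shared weight to recover column gradients,
+    // col2im_coord scatters those into grad_offset / grad_mask, col2im scatters them into
+    // grad_input, and an im2col of the input feeds the GEMM / GEMV that accumulate
+    // grad_weight and grad_bias across the batch.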
input.size(0); + const int channels = input.size(1); + const int height = input.size(2); + const int width = input.size(3); + + const int channels_out = weight.size(0); + const int channels_kernel = weight.size(1); + const int kernel_h_ = weight.size(2); + const int kernel_w_ = weight.size(3); + + AT_ASSERTM(kernel_h_ == kernel_h && kernel_w_ == kernel_w, + "Input shape and kernel shape wont match: (%d x %d vs %d x %d).", kernel_h_, kernel_w, kernel_h_, kernel_w_); + + AT_ASSERTM(channels == channels_kernel, + "Input shape and kernel channels wont match: (%d vs %d).", channels, channels_kernel); + + const int height_out = (height + 2 * pad_h - (dilation_h * (kernel_h - 1) + 1)) / stride_h + 1; + const int width_out = (width + 2 * pad_w - (dilation_w * (kernel_w - 1) + 1)) / stride_w + 1; + + auto ones = at::ones({height_out, width_out}, input.options()); + auto columns = at::empty({channels * kernel_h * kernel_w, 1 * height_out * width_out}, input.options()); + auto output = at::empty({batch, channels_out, height_out, width_out}, input.options()); + + auto grad_input = at::zeros_like(input); + auto grad_weight = at::zeros_like(weight); + auto grad_bias = at::zeros_like(bias); + auto grad_offset = at::zeros_like(offset); + auto grad_mask = at::zeros_like(mask); + + using scalar_t = float; + + for (int b = 0; b < batch; b++) + { + auto input_n = input.select(0, b); + auto offset_n = offset.select(0, b); + auto mask_n = mask.select(0, b); + auto grad_output_n = grad_output.select(0, b); + auto grad_input_n = grad_input.select(0, b); + auto grad_offset_n = grad_offset.select(0, b); + auto grad_mask_n = grad_mask.select(0, b); + + long m = channels * kernel_h * kernel_w; + long n = height_out * width_out; + long k = channels_out; + + THCudaBlas_Sgemm(state, 'n', 't', n, m, k, 1.0f, + grad_output_n.data(), n, + weight.data(), m, 0.0f, + columns.data(), n); + + // gradient w.r.t. input coordinate data + modulated_deformable_col2im_coord_cuda(THCState_getCurrentStream(state), + columns.data(), + input_n.data(), + offset_n.data(), + mask_n.data(), + 1, channels, height, width, + height_out, width_out, kernel_h, kernel_w, + pad_h, pad_w, stride_h, stride_w, + dilation_h, dilation_w, deformable_group, + grad_offset_n.data(), + grad_mask_n.data()); + // gradient w.r.t. input data + modulated_deformable_col2im_cuda(THCState_getCurrentStream(state), + columns.data(), + offset_n.data(), + mask_n.data(), + 1, channels, height, width, + height_out, width_out, kernel_h, kernel_w, + pad_h, pad_w, stride_h, stride_w, + dilation_h, dilation_w, deformable_group, + grad_input_n.data()); + + // gradient w.r.t. weight, dWeight should accumulate across the batch and group + modulated_deformable_im2col_cuda(THCState_getCurrentStream(state), + input_n.data(), + offset_n.data(), + mask_n.data(), + 1, channels, height, width, + height_out, width_out, kernel_h, kernel_w, + pad_h, pad_w, stride_h, stride_w, + dilation_h, dilation_w, deformable_group, + columns.data()); + + long m_ = channels_out; + long n_ = channels * kernel_h * kernel_w; + long k_ = height_out * width_out; + + THCudaBlas_Sgemm(state, 't', 'n', n_, m_, k_, 1.0f, + columns.data(), k_, + grad_output_n.data(), k_, 1.0f, + grad_weight.data(), n_); + + // gradient w.r.t. 
bias + // long m_ = channels_out; + // long k__ = height_out * width_out; + THCudaBlas_Sgemv(state, + 't', + k_, m_, 1.0f, + grad_output_n.data(), k_, + ones.data(), 1, 1.0f, + grad_bias.data(), 1); + } + + return { + grad_input, grad_offset, grad_mask, grad_weight, grad_bias + }; +} \ No newline at end of file diff --git a/maskrcnn_benchmark/csrc/cuda/dcn_v2_im2col_cuda.cu b/maskrcnn_benchmark/csrc/cuda/dcn_v2_im2col_cuda.cu new file mode 100644 index 0000000000000000000000000000000000000000..4183793ba0b9230d5c78af72b3050c070dad5268 --- /dev/null +++ b/maskrcnn_benchmark/csrc/cuda/dcn_v2_im2col_cuda.cu @@ -0,0 +1,402 @@ +#include "dcn_v2_im2col_cuda.h" +#include +#include +#include + +#include +#include + +#include +#include +#include + +#define CUDA_KERNEL_LOOP(i, n) \ + for (int i = blockIdx.x * blockDim.x + threadIdx.x; \ + i < (n); \ + i += blockDim.x * gridDim.x) + +const int CUDA_NUM_THREADS = 1024; +inline int GET_BLOCKS(const int N) +{ + return (N + CUDA_NUM_THREADS - 1) / CUDA_NUM_THREADS; +} + + +__device__ float dmcn_im2col_bilinear(const float *bottom_data, const int data_width, + const int height, const int width, float h, float w) +{ + int h_low = floor(h); + int w_low = floor(w); + int h_high = h_low + 1; + int w_high = w_low + 1; + + float lh = h - h_low; + float lw = w - w_low; + float hh = 1 - lh, hw = 1 - lw; + + float v1 = 0; + if (h_low >= 0 && w_low >= 0) + v1 = bottom_data[h_low * data_width + w_low]; + float v2 = 0; + if (h_low >= 0 && w_high <= width - 1) + v2 = bottom_data[h_low * data_width + w_high]; + float v3 = 0; + if (h_high <= height - 1 && w_low >= 0) + v3 = bottom_data[h_high * data_width + w_low]; + float v4 = 0; + if (h_high <= height - 1 && w_high <= width - 1) + v4 = bottom_data[h_high * data_width + w_high]; + + float w1 = hh * hw, w2 = hh * lw, w3 = lh * hw, w4 = lh * lw; + + float val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4); + return val; +} + +__device__ float dmcn_get_gradient_weight(float argmax_h, float argmax_w, + const int h, const int w, const int height, const int width) +{ + if (argmax_h <= -1 || argmax_h >= height || argmax_w <= -1 || argmax_w >= width) + { + //empty + return 0; + } + + int argmax_h_low = floor(argmax_h); + int argmax_w_low = floor(argmax_w); + int argmax_h_high = argmax_h_low + 1; + int argmax_w_high = argmax_w_low + 1; + + float weight = 0; + if (h == argmax_h_low && w == argmax_w_low) + weight = (h + 1 - argmax_h) * (w + 1 - argmax_w); + if (h == argmax_h_low && w == argmax_w_high) + weight = (h + 1 - argmax_h) * (argmax_w + 1 - w); + if (h == argmax_h_high && w == argmax_w_low) + weight = (argmax_h + 1 - h) * (w + 1 - argmax_w); + if (h == argmax_h_high && w == argmax_w_high) + weight = (argmax_h + 1 - h) * (argmax_w + 1 - w); + return weight; +} + +__device__ float dmcn_get_coordinate_weight(float argmax_h, float argmax_w, + const int height, const int width, const float *im_data, + const int data_width, const int bp_dir) +{ + if (argmax_h <= -1 || argmax_h >= height || argmax_w <= -1 || argmax_w >= width) + { + //empty + return 0; + } + + int argmax_h_low = floor(argmax_h); + int argmax_w_low = floor(argmax_w); + int argmax_h_high = argmax_h_low + 1; + int argmax_w_high = argmax_w_low + 1; + + float weight = 0; + + if (bp_dir == 0) + { + if (argmax_h_low >= 0 && argmax_w_low >= 0) + weight += -1 * (argmax_w_low + 1 - argmax_w) * im_data[argmax_h_low * data_width + argmax_w_low]; + if (argmax_h_low >= 0 && argmax_w_high <= width - 1) + weight += -1 * (argmax_w - argmax_w_low) * im_data[argmax_h_low * data_width + 
argmax_w_high]; + if (argmax_h_high <= height - 1 && argmax_w_low >= 0) + weight += (argmax_w_low + 1 - argmax_w) * im_data[argmax_h_high * data_width + argmax_w_low]; + if (argmax_h_high <= height - 1 && argmax_w_high <= width - 1) + weight += (argmax_w - argmax_w_low) * im_data[argmax_h_high * data_width + argmax_w_high]; + } + else if (bp_dir == 1) + { + if (argmax_h_low >= 0 && argmax_w_low >= 0) + weight += -1 * (argmax_h_low + 1 - argmax_h) * im_data[argmax_h_low * data_width + argmax_w_low]; + if (argmax_h_low >= 0 && argmax_w_high <= width - 1) + weight += (argmax_h_low + 1 - argmax_h) * im_data[argmax_h_low * data_width + argmax_w_high]; + if (argmax_h_high <= height - 1 && argmax_w_low >= 0) + weight += -1 * (argmax_h - argmax_h_low) * im_data[argmax_h_high * data_width + argmax_w_low]; + if (argmax_h_high <= height - 1 && argmax_w_high <= width - 1) + weight += (argmax_h - argmax_h_low) * im_data[argmax_h_high * data_width + argmax_w_high]; + } + + return weight; +} + +__global__ void modulated_deformable_im2col_gpu_kernel(const int n, + const float *data_im, const float *data_offset, const float *data_mask, + const int height, const int width, const int kernel_h, const int kernel_w, + const int pad_h, const int pad_w, + const int stride_h, const int stride_w, + const int dilation_h, const int dilation_w, + const int channel_per_deformable_group, + const int batch_size, const int num_channels, const int deformable_group, + const int height_col, const int width_col, + float *data_col) +{ + // launch channels * batch_size * height_col * width_col cores + CUDA_KERNEL_LOOP(index, n) + { + // NOTE(CharlesShang): different from Dai Jifeng's MXNet implementation, col_buffer is of shape (c*kw*kh, N, oh, ow) + // here columns is of shape (N, c*kw*kh, oh * ow), need to adapt axis + + // index index of output matrix + const int w_col = index % width_col; + const int h_col = (index / width_col) % height_col; + // const int b_col = (index / width_col / height_col) % batch_size; + const int b_col = (index / width_col / height_col / num_channels) % batch_size; + // const int c_im = (index / width_col / height_col) / batch_size; + const int c_im = (index / width_col / height_col) % num_channels; + // const int c_col = c_im * kernel_h * kernel_w; + const int c_col = c_im * kernel_h * kernel_w; + + // compute deformable group index + const int deformable_group_index = c_im / channel_per_deformable_group; + + const int h_in = h_col * stride_h - pad_h; + const int w_in = w_col * stride_w - pad_w; + + // float *data_col_ptr = data_col + ((c_col * batch_size + b_col) * height_col + h_col) * width_col + w_col; + float *data_col_ptr = data_col + ((b_col * num_channels * kernel_w * kernel_h + c_col) * height_col + h_col) * width_col + w_col; + //const float* data_im_ptr = data_im + ((b_col * num_channels + c_im) * height + h_in) * width + w_in; + const float *data_im_ptr = data_im + (b_col * num_channels + c_im) * height * width; + const float *data_offset_ptr = data_offset + (b_col * deformable_group + deformable_group_index) * 2 * kernel_h * kernel_w * height_col * width_col; + + const float *data_mask_ptr = data_mask + (b_col * deformable_group + deformable_group_index) * kernel_h * kernel_w * height_col * width_col; + + for (int i = 0; i < kernel_h; ++i) + { + for (int j = 0; j < kernel_w; ++j) + { + const int data_offset_h_ptr = ((2 * (i * kernel_w + j)) * height_col + h_col) * width_col + w_col; + const int data_offset_w_ptr = ((2 * (i * kernel_w + j) + 1) * height_col + h_col) * width_col + 
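+        // Editor's note (added comment): this inner loop is the core of modulated deformable
+        // convolution: each kernel tap (i, j) reads a learned (offset_h, offset_w) pair and a
+        // scalar mask, samples the input bilinearly at the fractionally shifted location, and
+        // writes mask * value into the column buffer later consumed by the batched GEMM in
+        // dcn_v2_cuda_forward.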
w_col; + const int data_mask_hw_ptr = ((i * kernel_w + j) * height_col + h_col) * width_col + w_col; + const float offset_h = data_offset_ptr[data_offset_h_ptr]; + const float offset_w = data_offset_ptr[data_offset_w_ptr]; + const float mask = data_mask_ptr[data_mask_hw_ptr]; + float val = static_cast(0); + const float h_im = h_in + i * dilation_h + offset_h; + const float w_im = w_in + j * dilation_w + offset_w; + //if (h_im >= 0 && w_im >= 0 && h_im < height && w_im < width) { + if (h_im > -1 && w_im > -1 && h_im < height && w_im < width) + { + //const float map_h = i * dilation_h + offset_h; + //const float map_w = j * dilation_w + offset_w; + //const int cur_height = height - h_in; + //const int cur_width = width - w_in; + //val = dmcn_im2col_bilinear(data_im_ptr, width, cur_height, cur_width, map_h, map_w); + val = dmcn_im2col_bilinear(data_im_ptr, width, height, width, h_im, w_im); + } + *data_col_ptr = val * mask; + // data_col_ptr += batch_size * height_col * width_col; + data_col_ptr += height_col * width_col; + } + } + } +} + +__global__ void modulated_deformable_col2im_gpu_kernel(const int n, + const float *data_col, const float *data_offset, const float *data_mask, + const int channels, const int height, const int width, + const int kernel_h, const int kernel_w, + const int pad_h, const int pad_w, + const int stride_h, const int stride_w, + const int dilation_h, const int dilation_w, + const int channel_per_deformable_group, + const int batch_size, const int deformable_group, + const int height_col, const int width_col, + float *grad_im) +{ + CUDA_KERNEL_LOOP(index, n) + { + const int j = (index / width_col / height_col / batch_size) % kernel_w; + const int i = (index / width_col / height_col / batch_size / kernel_w) % kernel_h; + const int c = index / width_col / height_col / batch_size / kernel_w / kernel_h; + // compute the start and end of the output + + const int deformable_group_index = c / channel_per_deformable_group; + + int w_out = index % width_col; + int h_out = (index / width_col) % height_col; + int b = (index / width_col / height_col) % batch_size; + int w_in = w_out * stride_w - pad_w; + int h_in = h_out * stride_h - pad_h; + + const float *data_offset_ptr = data_offset + (b * deformable_group + deformable_group_index) * 2 * kernel_h * kernel_w * height_col * width_col; + const float *data_mask_ptr = data_mask + (b * deformable_group + deformable_group_index) * kernel_h * kernel_w * height_col * width_col; + const int data_offset_h_ptr = ((2 * (i * kernel_w + j)) * height_col + h_out) * width_col + w_out; + const int data_offset_w_ptr = ((2 * (i * kernel_w + j) + 1) * height_col + h_out) * width_col + w_out; + const int data_mask_hw_ptr = ((i * kernel_w + j) * height_col + h_out) * width_col + w_out; + const float offset_h = data_offset_ptr[data_offset_h_ptr]; + const float offset_w = data_offset_ptr[data_offset_w_ptr]; + const float mask = data_mask_ptr[data_mask_hw_ptr]; + const float cur_inv_h_data = h_in + i * dilation_h + offset_h; + const float cur_inv_w_data = w_in + j * dilation_w + offset_w; + + const float cur_top_grad = data_col[index] * mask; + const int cur_h = (int)cur_inv_h_data; + const int cur_w = (int)cur_inv_w_data; + for (int dy = -2; dy <= 2; dy++) + { + for (int dx = -2; dx <= 2; dx++) + { + if (cur_h + dy >= 0 && cur_h + dy < height && + cur_w + dx >= 0 && cur_w + dx < width && + abs(cur_inv_h_data - (cur_h + dy)) < 1 && + abs(cur_inv_w_data - (cur_w + dx)) < 1) + { + int cur_bottom_grad_pos = ((b * channels + c) * height + cur_h + dy) * 
width + cur_w + dx; + float weight = dmcn_get_gradient_weight(cur_inv_h_data, cur_inv_w_data, cur_h + dy, cur_w + dx, height, width); + atomicAdd(grad_im + cur_bottom_grad_pos, weight * cur_top_grad); + } + } + } + } +} + +__global__ void modulated_deformable_col2im_coord_gpu_kernel(const int n, + const float *data_col, const float *data_im, + const float *data_offset, const float *data_mask, + const int channels, const int height, const int width, + const int kernel_h, const int kernel_w, + const int pad_h, const int pad_w, + const int stride_h, const int stride_w, + const int dilation_h, const int dilation_w, + const int channel_per_deformable_group, + const int batch_size, const int offset_channels, const int deformable_group, + const int height_col, const int width_col, + float *grad_offset, float *grad_mask) +{ + CUDA_KERNEL_LOOP(index, n) + { + float val = 0, mval = 0; + int w = index % width_col; + int h = (index / width_col) % height_col; + int c = (index / width_col / height_col) % offset_channels; + int b = (index / width_col / height_col) / offset_channels; + // compute the start and end of the output + + const int deformable_group_index = c / (2 * kernel_h * kernel_w); + const int col_step = kernel_h * kernel_w; + int cnt = 0; + const float *data_col_ptr = data_col + deformable_group_index * channel_per_deformable_group * batch_size * width_col * height_col; + const float *data_im_ptr = data_im + (b * deformable_group + deformable_group_index) * channel_per_deformable_group / kernel_h / kernel_w * height * width; + const float *data_offset_ptr = data_offset + (b * deformable_group + deformable_group_index) * 2 * kernel_h * kernel_w * height_col * width_col; + const float *data_mask_ptr = data_mask + (b * deformable_group + deformable_group_index) * kernel_h * kernel_w * height_col * width_col; + + const int offset_c = c - deformable_group_index * 2 * kernel_h * kernel_w; + + for (int col_c = (offset_c / 2); col_c < channel_per_deformable_group; col_c += col_step) + { + const int col_pos = (((col_c * batch_size + b) * height_col) + h) * width_col + w; + const int bp_dir = offset_c % 2; + + int j = (col_pos / width_col / height_col / batch_size) % kernel_w; + int i = (col_pos / width_col / height_col / batch_size / kernel_w) % kernel_h; + int w_out = col_pos % width_col; + int h_out = (col_pos / width_col) % height_col; + int w_in = w_out * stride_w - pad_w; + int h_in = h_out * stride_h - pad_h; + const int data_offset_h_ptr = (((2 * (i * kernel_w + j)) * height_col + h_out) * width_col + w_out); + const int data_offset_w_ptr = (((2 * (i * kernel_w + j) + 1) * height_col + h_out) * width_col + w_out); + const int data_mask_hw_ptr = (((i * kernel_w + j) * height_col + h_out) * width_col + w_out); + const float offset_h = data_offset_ptr[data_offset_h_ptr]; + const float offset_w = data_offset_ptr[data_offset_w_ptr]; + const float mask = data_mask_ptr[data_mask_hw_ptr]; + float inv_h = h_in + i * dilation_h + offset_h; + float inv_w = w_in + j * dilation_w + offset_w; + if (inv_h <= -1 || inv_w <= -1 || inv_h >= height || inv_w >= width) + { + inv_h = inv_w = -2; + } + else + { + mval += data_col_ptr[col_pos] * dmcn_im2col_bilinear(data_im_ptr + cnt * height * width, width, height, width, inv_h, inv_w); + } + const float weight = dmcn_get_coordinate_weight( + inv_h, inv_w, + height, width, data_im_ptr + cnt * height * width, width, bp_dir); + val += weight * data_col_ptr[col_pos] * mask; + cnt += 1; + } + // KERNEL_ASSIGN(grad_offset[index], offset_req, val); + grad_offset[index] 
= val; + if (offset_c % 2 == 0) + // KERNEL_ASSIGN(grad_mask[(((b * deformable_group + deformable_group_index) * kernel_h * kernel_w + offset_c / 2) * height_col + h) * width_col + w], mask_req, mval); + grad_mask[(((b * deformable_group + deformable_group_index) * kernel_h * kernel_w + offset_c / 2) * height_col + h) * width_col + w] = mval; + } +} + +void modulated_deformable_im2col_cuda(cudaStream_t stream, + const float* data_im, const float* data_offset, const float* data_mask, + const int batch_size, const int channels, const int height_im, const int width_im, + const int height_col, const int width_col, const int kernel_h, const int kernel_w, + const int pad_h, const int pad_w, const int stride_h, const int stride_w, + const int dilation_h, const int dilation_w, + const int deformable_group, float* data_col) { + // num_axes should be smaller than block size + const int channel_per_deformable_group = channels / deformable_group; + const int num_kernels = channels * batch_size * height_col * width_col; + modulated_deformable_im2col_gpu_kernel + <<>>( + num_kernels, data_im, data_offset, data_mask, height_im, width_im, kernel_h, kernel_w, + pad_h, pad_w, stride_h, stride_w, dilation_h, dilation_w, channel_per_deformable_group, + batch_size, channels, deformable_group, height_col, width_col, data_col); + + cudaError_t err = cudaGetLastError(); + if (err != cudaSuccess) + { + printf("error in modulated_deformable_im2col_cuda: %s\n", cudaGetErrorString(err)); + } + +} + +void modulated_deformable_col2im_cuda(cudaStream_t stream, + const float* data_col, const float* data_offset, const float* data_mask, + const int batch_size, const int channels, const int height_im, const int width_im, + const int height_col, const int width_col, const int kernel_h, const int kernel_w, + const int pad_h, const int pad_w, const int stride_h, const int stride_w, + const int dilation_h, const int dilation_w, + const int deformable_group, float* grad_im){ + + const int channel_per_deformable_group = channels / deformable_group; + const int num_kernels = channels * kernel_h * kernel_w * batch_size * height_col * width_col; + modulated_deformable_col2im_gpu_kernel + <<>>( + num_kernels, data_col, data_offset, data_mask, channels, height_im, width_im, + kernel_h, kernel_w, pad_h, pad_h, stride_h, stride_w, + dilation_h, dilation_w, channel_per_deformable_group, + batch_size, deformable_group, height_col, width_col, grad_im); + cudaError_t err = cudaGetLastError(); + if (err != cudaSuccess) + { + printf("error in modulated_deformable_col2im_cuda: %s\n", cudaGetErrorString(err)); + } + +} + +void modulated_deformable_col2im_coord_cuda(cudaStream_t stream, + const float* data_col, const float* data_im, const float* data_offset, const float* data_mask, + const int batch_size, const int channels, const int height_im, const int width_im, + const int height_col, const int width_col, const int kernel_h, const int kernel_w, + const int pad_h, const int pad_w, const int stride_h, const int stride_w, + const int dilation_h, const int dilation_w, + const int deformable_group, + float* grad_offset, float* grad_mask) { + const int num_kernels = batch_size * height_col * width_col * 2 * kernel_h * kernel_w * deformable_group; + const int channel_per_deformable_group = channels * kernel_h * kernel_w / deformable_group; + modulated_deformable_col2im_coord_gpu_kernel + <<>>( + num_kernels, data_col, data_im, data_offset, data_mask, channels, height_im, width_im, + kernel_h, kernel_w, pad_h, pad_w, stride_h, stride_w, + 
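+      // Editor's note: in modulated_deformable_col2im_cuda above, the kernel launch passes
+      // "pad_h, pad_h"; given the kernel's (pad_h, pad_w) parameter order, the second
+      // argument should presumably be pad_w (this only matters for non-square padding).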
dilation_h, dilation_w, channel_per_deformable_group, + batch_size, 2 * kernel_h * kernel_w * deformable_group, deformable_group, height_col, width_col, + grad_offset, grad_mask); + cudaError_t err = cudaGetLastError(); + if (err != cudaSuccess) + { + printf("error in modulated_deformable_col2im_coord_cuda: %s\n", cudaGetErrorString(err)); + } +} \ No newline at end of file diff --git a/maskrcnn_benchmark/csrc/cuda/dcn_v2_im2col_cuda.h b/maskrcnn_benchmark/csrc/cuda/dcn_v2_im2col_cuda.h new file mode 100644 index 0000000000000000000000000000000000000000..c85683198e0f6f908c294aef45314d79d9de8451 --- /dev/null +++ b/maskrcnn_benchmark/csrc/cuda/dcn_v2_im2col_cuda.h @@ -0,0 +1,101 @@ + +/*! + ******************* BEGIN Caffe Copyright Notice and Disclaimer **************** + * + * COPYRIGHT + * + * All contributions by the University of California: + * Copyright (c) 2014-2017 The Regents of the University of California (Regents) + * All rights reserved. + * + * All other contributions: + * Copyright (c) 2014-2017, the respective contributors + * All rights reserved. + * + * Caffe uses a shared copyright model: each contributor holds copyright over + * their contributions to Caffe. The project versioning records all such + * contribution and copyright details. If a contributor wants to further mark + * their specific copyright on a particular contribution, they should indicate + * their copyright solely in the commit message of the change when it is + * committed. + * + * LICENSE + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * CONTRIBUTION AGREEMENT + * + * By contributing to the BVLC/caffe repository through pull-request, comment, + * or otherwise, the contributor releases their content to the + * license and copyright terms herein. + * + ***************** END Caffe Copyright Notice and Disclaimer ******************** + * + * Copyright (c) 2018 Microsoft + * Licensed under The MIT License [see LICENSE for details] + * \file modulated_deformable_im2col.h + * \brief Function definitions of converting an image to + * column matrix based on kernel, padding, dilation, and offset. + * These functions are mainly used in deformable convolution operators. 
+ * \ref: https://arxiv.org/abs/1811.11168 + * \author Yuwen Xiong, Haozhi Qi, Jifeng Dai, Xizhou Zhu, Han Hu + */ + +/***************** Adapted by Charles Shang *********************/ + +#ifndef DCN_V2_IM2COL_CUDA +#define DCN_V2_IM2COL_CUDA + +#ifdef __cplusplus +extern "C" +{ +#endif + + void modulated_deformable_im2col_cuda(cudaStream_t stream, + const float *data_im, const float *data_offset, const float *data_mask, + const int batch_size, const int channels, const int height_im, const int width_im, + const int height_col, const int width_col, const int kernel_h, const int kenerl_w, + const int pad_h, const int pad_w, const int stride_h, const int stride_w, + const int dilation_h, const int dilation_w, + const int deformable_group, float *data_col); + + void modulated_deformable_col2im_cuda(cudaStream_t stream, + const float *data_col, const float *data_offset, const float *data_mask, + const int batch_size, const int channels, const int height_im, const int width_im, + const int height_col, const int width_col, const int kernel_h, const int kenerl_w, + const int pad_h, const int pad_w, const int stride_h, const int stride_w, + const int dilation_h, const int dilation_w, + const int deformable_group, float *grad_im); + + void modulated_deformable_col2im_coord_cuda(cudaStream_t stream, + const float *data_col, const float *data_im, const float *data_offset, const float *data_mask, + const int batch_size, const int channels, const int height_im, const int width_im, + const int height_col, const int width_col, const int kernel_h, const int kenerl_w, + const int pad_h, const int pad_w, const int stride_h, const int stride_w, + const int dilation_h, const int dilation_w, + const int deformable_group, + float *grad_offset, float *grad_mask); + +#ifdef __cplusplus +} +#endif + +#endif \ No newline at end of file diff --git a/maskrcnn_benchmark/csrc/cuda/dcn_v2_psroi_pooling_cuda.cu b/maskrcnn_benchmark/csrc/cuda/dcn_v2_psroi_pooling_cuda.cu new file mode 100644 index 0000000000000000000000000000000000000000..07b438e1957b8190e842e6873cd7feee805535e5 --- /dev/null +++ b/maskrcnn_benchmark/csrc/cuda/dcn_v2_psroi_pooling_cuda.cu @@ -0,0 +1,419 @@ +/*! 
+ * Copyright (c) 2017 Microsoft + * Licensed under The MIT License [see LICENSE for details] + * \file deformable_psroi_pooling.cu + * \brief + * \author Yi Li, Guodong Zhang, Jifeng Dai +*/ +/***************** Adapted by Charles Shang *********************/ + +#include +#include +#include +#include + +#include +#include + +#include +#include +#include + +#define CUDA_KERNEL_LOOP(i, n) \ + for (int i = blockIdx.x * blockDim.x + threadIdx.x; \ + i < (n); \ + i += blockDim.x * gridDim.x) + +const int CUDA_NUM_THREADS = 1024; +inline int GET_BLOCKS(const int N) +{ + return (N + CUDA_NUM_THREADS - 1) / CUDA_NUM_THREADS; +} + +template +__device__ T bilinear_interp( + const T *data, + const T x, + const T y, + const int width, + const int height) +{ + int x1 = floor(x); + int x2 = ceil(x); + int y1 = floor(y); + int y2 = ceil(y); + T dist_x = static_cast(x - x1); + T dist_y = static_cast(y - y1); + T value11 = data[y1 * width + x1]; + T value12 = data[y2 * width + x1]; + T value21 = data[y1 * width + x2]; + T value22 = data[y2 * width + x2]; + T value = (1 - dist_x) * (1 - dist_y) * value11 + + (1 - dist_x) * dist_y * value12 + + dist_x * (1 - dist_y) * value21 + + dist_x * dist_y * value22; + return value; +} + +template +__global__ void DeformablePSROIPoolForwardKernel( + const int count, + const T *bottom_data, + const T spatial_scale, + const int channels, + const int height, const int width, + const int pooled_height, const int pooled_width, + const T *bottom_rois, const T *bottom_trans, + const int no_trans, + const T trans_std, + const int sample_per_part, + const int output_dim, + const int group_size, + const int part_size, + const int num_classes, + const int channels_each_class, + T *top_data, + T *top_count) +{ + CUDA_KERNEL_LOOP(index, count) + { + // The output is in order (n, ctop, ph, pw) + int pw = index % pooled_width; + int ph = (index / pooled_width) % pooled_height; + int ctop = (index / pooled_width / pooled_height) % output_dim; + int n = index / pooled_width / pooled_height / output_dim; + + // [start, end) interval for spatial sampling + const T *offset_bottom_rois = bottom_rois + n * 5; + int roi_batch_ind = offset_bottom_rois[0]; + T roi_start_w = static_cast(round(offset_bottom_rois[1])) * spatial_scale - 0.5; + T roi_start_h = static_cast(round(offset_bottom_rois[2])) * spatial_scale - 0.5; + T roi_end_w = static_cast(round(offset_bottom_rois[3]) + 1.) * spatial_scale - 0.5; + T roi_end_h = static_cast(round(offset_bottom_rois[4]) + 1.) * spatial_scale - 0.5; + + // Force too small ROIs to be 1x1 + T roi_width = max(roi_end_w - roi_start_w, 0.1); //avoid 0 + T roi_height = max(roi_end_h - roi_start_h, 0.1); + + // Compute w and h at bottom + T bin_size_h = roi_height / static_cast(pooled_height); + T bin_size_w = roi_width / static_cast(pooled_width); + + T sub_bin_size_h = bin_size_h / static_cast(sample_per_part); + T sub_bin_size_w = bin_size_w / static_cast(sample_per_part); + + int part_h = floor(static_cast(ph) / pooled_height * part_size); + int part_w = floor(static_cast(pw) / pooled_width * part_size); + int class_id = ctop / channels_each_class; + T trans_x = no_trans ? static_cast(0) : bottom_trans[(((n * num_classes + class_id) * 2) * part_size + part_h) * part_size + part_w] * trans_std; + T trans_y = no_trans ? 
static_cast(0) : bottom_trans[(((n * num_classes + class_id) * 2 + 1) * part_size + part_h) * part_size + part_w] * trans_std; + + T wstart = static_cast(pw) * bin_size_w + roi_start_w; + wstart += trans_x * roi_width; + T hstart = static_cast(ph) * bin_size_h + roi_start_h; + hstart += trans_y * roi_height; + + T sum = 0; + int count = 0; + int gw = floor(static_cast(pw) * group_size / pooled_width); + int gh = floor(static_cast(ph) * group_size / pooled_height); + gw = min(max(gw, 0), group_size - 1); + gh = min(max(gh, 0), group_size - 1); + + const T *offset_bottom_data = bottom_data + (roi_batch_ind * channels) * height * width; + for (int ih = 0; ih < sample_per_part; ih++) + { + for (int iw = 0; iw < sample_per_part; iw++) + { + T w = wstart + iw * sub_bin_size_w; + T h = hstart + ih * sub_bin_size_h; + // bilinear interpolation + if (w < -0.5 || w > width - 0.5 || h < -0.5 || h > height - 0.5) + { + continue; + } + w = min(max(w, 0.), width - 1.); + h = min(max(h, 0.), height - 1.); + int c = (ctop * group_size + gh) * group_size + gw; + T val = bilinear_interp(offset_bottom_data + c * height * width, w, h, width, height); + sum += val; + count++; + } + } + top_data[index] = count == 0 ? static_cast(0) : sum / count; + top_count[index] = count; + } +} + +template +__global__ void DeformablePSROIPoolBackwardAccKernel( + const int count, + const T *top_diff, + const T *top_count, + const int num_rois, + const T spatial_scale, + const int channels, + const int height, const int width, + const int pooled_height, const int pooled_width, + const int output_dim, + T *bottom_data_diff, T *bottom_trans_diff, + const T *bottom_data, + const T *bottom_rois, + const T *bottom_trans, + const int no_trans, + const T trans_std, + const int sample_per_part, + const int group_size, + const int part_size, + const int num_classes, + const int channels_each_class) +{ + CUDA_KERNEL_LOOP(index, count) + { + // The output is in order (n, ctop, ph, pw) + int pw = index % pooled_width; + int ph = (index / pooled_width) % pooled_height; + int ctop = (index / pooled_width / pooled_height) % output_dim; + int n = index / pooled_width / pooled_height / output_dim; + + // [start, end) interval for spatial sampling + const T *offset_bottom_rois = bottom_rois + n * 5; + int roi_batch_ind = offset_bottom_rois[0]; + T roi_start_w = static_cast(round(offset_bottom_rois[1])) * spatial_scale - 0.5; + T roi_start_h = static_cast(round(offset_bottom_rois[2])) * spatial_scale - 0.5; + T roi_end_w = static_cast(round(offset_bottom_rois[3]) + 1.) * spatial_scale - 0.5; + T roi_end_h = static_cast(round(offset_bottom_rois[4]) + 1.) * spatial_scale - 0.5; + + // Force too small ROIs to be 1x1 + T roi_width = max(roi_end_w - roi_start_w, 0.1); //avoid 0 + T roi_height = max(roi_end_h - roi_start_h, 0.1); + + // Compute w and h at bottom + T bin_size_h = roi_height / static_cast(pooled_height); + T bin_size_w = roi_width / static_cast(pooled_width); + + T sub_bin_size_h = bin_size_h / static_cast(sample_per_part); + T sub_bin_size_w = bin_size_w / static_cast(sample_per_part); + + int part_h = floor(static_cast(ph) / pooled_height * part_size); + int part_w = floor(static_cast(pw) / pooled_width * part_size); + int class_id = ctop / channels_each_class; + T trans_x = no_trans ? static_cast(0) : bottom_trans[(((n * num_classes + class_id) * 2) * part_size + part_h) * part_size + part_w] * trans_std; + T trans_y = no_trans ? 
static_cast(0) : bottom_trans[(((n * num_classes + class_id) * 2 + 1) * part_size + part_h) * part_size + part_w] * trans_std; + + T wstart = static_cast(pw) * bin_size_w + roi_start_w; + wstart += trans_x * roi_width; + T hstart = static_cast(ph) * bin_size_h + roi_start_h; + hstart += trans_y * roi_height; + + if (top_count[index] <= 0) + { + continue; + } + T diff_val = top_diff[index] / top_count[index]; + const T *offset_bottom_data = bottom_data + roi_batch_ind * channels * height * width; + T *offset_bottom_data_diff = bottom_data_diff + roi_batch_ind * channels * height * width; + int gw = floor(static_cast(pw) * group_size / pooled_width); + int gh = floor(static_cast(ph) * group_size / pooled_height); + gw = min(max(gw, 0), group_size - 1); + gh = min(max(gh, 0), group_size - 1); + + for (int ih = 0; ih < sample_per_part; ih++) + { + for (int iw = 0; iw < sample_per_part; iw++) + { + T w = wstart + iw * sub_bin_size_w; + T h = hstart + ih * sub_bin_size_h; + // bilinear interpolation + if (w < -0.5 || w > width - 0.5 || h < -0.5 || h > height - 0.5) + { + continue; + } + w = min(max(w, 0.), width - 1.); + h = min(max(h, 0.), height - 1.); + int c = (ctop * group_size + gh) * group_size + gw; + // backward on feature + int x0 = floor(w); + int x1 = ceil(w); + int y0 = floor(h); + int y1 = ceil(h); + T dist_x = w - x0, dist_y = h - y0; + T q00 = (1 - dist_x) * (1 - dist_y); + T q01 = (1 - dist_x) * dist_y; + T q10 = dist_x * (1 - dist_y); + T q11 = dist_x * dist_y; + int bottom_index_base = c * height * width; + atomicAdd(offset_bottom_data_diff + bottom_index_base + y0 * width + x0, q00 * diff_val); + atomicAdd(offset_bottom_data_diff + bottom_index_base + y1 * width + x0, q01 * diff_val); + atomicAdd(offset_bottom_data_diff + bottom_index_base + y0 * width + x1, q10 * diff_val); + atomicAdd(offset_bottom_data_diff + bottom_index_base + y1 * width + x1, q11 * diff_val); + + if (no_trans) + { + continue; + } + T U00 = offset_bottom_data[bottom_index_base + y0 * width + x0]; + T U01 = offset_bottom_data[bottom_index_base + y1 * width + x0]; + T U10 = offset_bottom_data[bottom_index_base + y0 * width + x1]; + T U11 = offset_bottom_data[bottom_index_base + y1 * width + x1]; + T diff_x = (U11 * dist_y + U10 * (1 - dist_y) - U01 * dist_y - U00 * (1 - dist_y)) * trans_std * diff_val; + diff_x *= roi_width; + T diff_y = (U11 * dist_x + U01 * (1 - dist_x) - U10 * dist_x - U00 * (1 - dist_x)) * trans_std * diff_val; + diff_y *= roi_height; + + atomicAdd(bottom_trans_diff + (((n * num_classes + class_id) * 2) * part_size + part_h) * part_size + part_w, diff_x); + atomicAdd(bottom_trans_diff + (((n * num_classes + class_id) * 2 + 1) * part_size + part_h) * part_size + part_w, diff_y); + } + } + } +} + +std::tuple +dcn_v2_psroi_pooling_cuda_forward(const at::Tensor &input, + const at::Tensor &bbox, + const at::Tensor &trans, + const int no_trans, + const float spatial_scale, + const int output_dim, + const int group_size, + const int pooled_size, + const int part_size, + const int sample_per_part, + const float trans_std) +{ + AT_ASSERTM(input.type().is_cuda(), "input must be a CUDA tensor"); + AT_ASSERTM(bbox.type().is_cuda(), "rois must be a CUDA tensor"); + AT_ASSERTM(trans.type().is_cuda(), "trans must be a CUDA tensor"); + + const int batch = input.size(0); + const int channels = input.size(1); + const int height = input.size(2); + const int width = input.size(3); + const int channels_trans = no_trans ? 
2 : trans.size(1); + const int num_bbox = bbox.size(0); + + AT_ASSERTM(channels == output_dim, "input channels and output channels must equal"); + auto pooled_height = pooled_size; + auto pooled_width = pooled_size; + + auto out = at::empty({num_bbox, output_dim, pooled_height, pooled_width}, input.options()); + long out_size = num_bbox * output_dim * pooled_height * pooled_width; + auto top_count = at::zeros({num_bbox, output_dim, pooled_height, pooled_width}, input.options()); + + const int num_classes = no_trans ? 1 : channels_trans / 2; + const int channels_each_class = no_trans ? output_dim : output_dim / num_classes; + + cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + + if (out.numel() == 0) + { + THCudaCheck(cudaGetLastError()); + return std::make_tuple(out, top_count); + } + + dim3 grid(std::min(THCCeilDiv(out_size, 512L), 4096L)); + dim3 block(512); + + AT_DISPATCH_FLOATING_TYPES(input.type(), "dcn_v2_psroi_pooling_cuda_forward", [&] { + DeformablePSROIPoolForwardKernel<<>>( + out_size, + input.contiguous().data(), + spatial_scale, + channels, + height, width, + pooled_height, + pooled_width, + bbox.contiguous().data(), + trans.contiguous().data(), + no_trans, + trans_std, + sample_per_part, + output_dim, + group_size, + part_size, + num_classes, + channels_each_class, + out.data(), + top_count.data()); + }); + THCudaCheck(cudaGetLastError()); + return std::make_tuple(out, top_count); +} + +std::tuple +dcn_v2_psroi_pooling_cuda_backward(const at::Tensor &out_grad, + const at::Tensor &input, + const at::Tensor &bbox, + const at::Tensor &trans, + const at::Tensor &top_count, + const int no_trans, + const float spatial_scale, + const int output_dim, + const int group_size, + const int pooled_size, + const int part_size, + const int sample_per_part, + const float trans_std) +{ + AT_ASSERTM(out_grad.type().is_cuda(), "out_grad must be a CUDA tensor"); + AT_ASSERTM(input.type().is_cuda(), "input must be a CUDA tensor"); + AT_ASSERTM(bbox.type().is_cuda(), "bbox must be a CUDA tensor"); + AT_ASSERTM(trans.type().is_cuda(), "trans must be a CUDA tensor"); + AT_ASSERTM(top_count.type().is_cuda(), "top_count must be a CUDA tensor"); + + const int batch = input.size(0); + const int channels = input.size(1); + const int height = input.size(2); + const int width = input.size(3); + const int channels_trans = no_trans ? 2 : trans.size(1); + const int num_bbox = bbox.size(0); + + AT_ASSERTM(channels == output_dim, "input channels and output channels must equal"); + auto pooled_height = pooled_size; + auto pooled_width = pooled_size; + long out_size = num_bbox * output_dim * pooled_height * pooled_width; + const int num_classes = no_trans ? 1 : channels_trans / 2; + const int channels_each_class = no_trans ? 
output_dim : output_dim / num_classes; + + auto input_grad = at::zeros({batch, channels, height, width}, out_grad.options()); + auto trans_grad = at::zeros_like(trans); + + if (input_grad.numel() == 0) + { + THCudaCheck(cudaGetLastError()); + return std::make_tuple(input_grad, trans_grad); + } + + dim3 grid(std::min(THCCeilDiv(out_size, 512L), 4096L)); + dim3 block(512); + cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + + AT_DISPATCH_FLOATING_TYPES(out_grad.type(), "dcn_v2_psroi_pooling_cuda_backward", [&] { + DeformablePSROIPoolBackwardAccKernel<<>>( + out_size, + out_grad.contiguous().data(), + top_count.contiguous().data(), + num_bbox, + spatial_scale, + channels, + height, + width, + pooled_height, + pooled_width, + output_dim, + input_grad.contiguous().data(), + trans_grad.contiguous().data(), + input.contiguous().data(), + bbox.contiguous().data(), + trans.contiguous().data(), + no_trans, + trans_std, + sample_per_part, + group_size, + part_size, + num_classes, + channels_each_class); + }); + THCudaCheck(cudaGetLastError()); + return std::make_tuple(input_grad, trans_grad); +} \ No newline at end of file diff --git a/maskrcnn_benchmark/csrc/cuda/nms.cu b/maskrcnn_benchmark/csrc/cuda/nms.cu new file mode 100644 index 0000000000000000000000000000000000000000..833d8523a5809d99a1078a144a384c864a9d8df9 --- /dev/null +++ b/maskrcnn_benchmark/csrc/cuda/nms.cu @@ -0,0 +1,131 @@ +// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +#include +#include + +#include +#include + +#include +#include + +int const threadsPerBlock = sizeof(unsigned long long) * 8; + +__device__ inline float devIoU(float const * const a, float const * const b) { + float left = max(a[0], b[0]), right = min(a[2], b[2]); + float top = max(a[1], b[1]), bottom = min(a[3], b[3]); + float width = max(right - left + 1, 0.f), height = max(bottom - top + 1, 0.f); + float interS = width * height; + float Sa = (a[2] - a[0] + 1) * (a[3] - a[1] + 1); + float Sb = (b[2] - b[0] + 1) * (b[3] - b[1] + 1); + return interS / (Sa + Sb - interS); +} + +__global__ void nms_kernel(const int n_boxes, const float nms_overlap_thresh, + const float *dev_boxes, unsigned long long *dev_mask) { + const int row_start = blockIdx.y; + const int col_start = blockIdx.x; + + // if (row_start > col_start) return; + + const int row_size = + min(n_boxes - row_start * threadsPerBlock, threadsPerBlock); + const int col_size = + min(n_boxes - col_start * threadsPerBlock, threadsPerBlock); + + __shared__ float block_boxes[threadsPerBlock * 5]; + if (threadIdx.x < col_size) { + block_boxes[threadIdx.x * 5 + 0] = + dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 0]; + block_boxes[threadIdx.x * 5 + 1] = + dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 1]; + block_boxes[threadIdx.x * 5 + 2] = + dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 2]; + block_boxes[threadIdx.x * 5 + 3] = + dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 3]; + block_boxes[threadIdx.x * 5 + 4] = + dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 4]; + } + __syncthreads(); + + if (threadIdx.x < row_size) { + const int cur_box_idx = threadsPerBlock * row_start + threadIdx.x; + const float *cur_box = dev_boxes + cur_box_idx * 5; + int i = 0; + unsigned long long t = 0; + int start = 0; + if (row_start == col_start) { + start = threadIdx.x + 1; + } + for (i = start; i < col_size; i++) { + if (devIoU(cur_box, block_boxes + i * 5) > nms_overlap_thresh) { + t |= 1ULL << i; + } + } + const int col_blocks = 
THCCeilDiv(n_boxes, threadsPerBlock); + dev_mask[cur_box_idx * col_blocks + col_start] = t; + } +} + +// boxes is a N x 5 tensor +at::Tensor nms_cuda(const at::Tensor boxes, float nms_overlap_thresh) { + using scalar_t = float; + AT_ASSERTM(boxes.type().is_cuda(), "boxes must be a CUDA tensor"); + auto scores = boxes.select(1, 4); + auto order_t = std::get<1>(scores.sort(0, /* descending=*/true)); + auto boxes_sorted = boxes.index_select(0, order_t); + + int boxes_num = boxes.size(0); + + const int col_blocks = THCCeilDiv(boxes_num, threadsPerBlock); + + scalar_t* boxes_dev = boxes_sorted.data(); + + THCState *state = at::globalContext().lazyInitCUDA(); // TODO replace with getTHCState + + unsigned long long* mask_dev = NULL; + //THCudaCheck(THCudaMalloc(state, (void**) &mask_dev, + // boxes_num * col_blocks * sizeof(unsigned long long))); + + mask_dev = (unsigned long long*) THCudaMalloc(state, boxes_num * col_blocks * sizeof(unsigned long long)); + + dim3 blocks(THCCeilDiv(boxes_num, threadsPerBlock), + THCCeilDiv(boxes_num, threadsPerBlock)); + dim3 threads(threadsPerBlock); + nms_kernel<<>>(boxes_num, + nms_overlap_thresh, + boxes_dev, + mask_dev); + + std::vector mask_host(boxes_num * col_blocks); + THCudaCheck(cudaMemcpy(&mask_host[0], + mask_dev, + sizeof(unsigned long long) * boxes_num * col_blocks, + cudaMemcpyDeviceToHost)); + + std::vector remv(col_blocks); + memset(&remv[0], 0, sizeof(unsigned long long) * col_blocks); + + at::Tensor keep = at::empty({boxes_num}, boxes.options().dtype(at::kLong).device(at::kCPU)); + int64_t* keep_out = keep.data(); + + int num_to_keep = 0; + for (int i = 0; i < boxes_num; i++) { + int nblock = i / threadsPerBlock; + int inblock = i % threadsPerBlock; + + if (!(remv[nblock] & (1ULL << inblock))) { + keep_out[num_to_keep++] = i; + unsigned long long *p = &mask_host[0] + i * col_blocks; + for (int j = nblock; j < col_blocks; j++) { + remv[j] |= p[j]; + } + } + } + + THCudaFree(state, mask_dev); + // TODO improve this part + return std::get<0>(order_t.index({ + keep.narrow(/*dim=*/0, /*start=*/0, /*length=*/num_to_keep).to( + order_t.device(), keep.scalar_type()) + }).sort(0, false)); +} diff --git a/maskrcnn_benchmark/csrc/cuda/vision.h b/maskrcnn_benchmark/csrc/cuda/vision.h new file mode 100644 index 0000000000000000000000000000000000000000..ff02d612304120f86dfc0940a745250594adb267 --- /dev/null +++ b/maskrcnn_benchmark/csrc/cuda/vision.h @@ -0,0 +1,121 @@ +// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 
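The `nms_cuda` routine above takes an N x 5 tensor (x1, y1, x2, y2, score), sorts the boxes by score on the GPU, and returns the indices to keep. From Python it is reached through the `nms` dispatcher in `csrc/nms.h` and the binding registered in `csrc/vision.cpp` later in this diff. A minimal usage sketch, assuming the extension is built under the name `maskrcnn_benchmark._C` (that module name comes from the project's build setup, not from this file):

```
import torch
from maskrcnn_benchmark import _C  # assumed name of the compiled extension

# two heavily overlapping boxes plus one disjoint box, in (x1, y1, x2, y2) format
boxes = torch.tensor([[10., 10., 50., 50.],
                      [12., 12., 52., 52.],
                      [100., 100., 150., 150.]], device="cuda")
scores = torch.tensor([0.9, 0.8, 0.7], device="cuda")

# the wrapper appends the scores as a fifth column and calls nms_cuda on the GPU path
keep = _C.nms(boxes, scores, 0.5)
print(keep)  # expected: tensor([0, 2]) -- the second box is suppressed
```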
+#pragma once +#include + + +at::Tensor SigmoidFocalLoss_forward_cuda( + const at::Tensor& logits, + const at::Tensor& targets, + const int num_classes, + const float gamma, + const float alpha); + +at::Tensor SigmoidFocalLoss_backward_cuda( + const at::Tensor& logits, + const at::Tensor& targets, + const at::Tensor& d_losses, + const int num_classes, + const float gamma, + const float alpha); + +at::Tensor ROIAlign_forward_cuda(const at::Tensor& input, + const at::Tensor& rois, + const float spatial_scale, + const int pooled_height, + const int pooled_width, + const int sampling_ratio); + +at::Tensor ROIAlign_backward_cuda(const at::Tensor& grad, + const at::Tensor& rois, + const float spatial_scale, + const int pooled_height, + const int pooled_width, + const int batch_size, + const int channels, + const int height, + const int width, + const int sampling_ratio); + + +std::tuple ROIPool_forward_cuda(const at::Tensor& input, + const at::Tensor& rois, + const float spatial_scale, + const int pooled_height, + const int pooled_width); + +at::Tensor ROIPool_backward_cuda(const at::Tensor& grad, + const at::Tensor& input, + const at::Tensor& rois, + const at::Tensor& argmax, + const float spatial_scale, + const int pooled_height, + const int pooled_width, + const int batch_size, + const int channels, + const int height, + const int width); + +at::Tensor nms_cuda(const at::Tensor boxes, float nms_overlap_thresh); + + +at::Tensor compute_flow_cuda(const at::Tensor& boxes, + const int height, + const int width); + +at::Tensor +dcn_v2_cuda_forward(const at::Tensor &input, + const at::Tensor &weight, + const at::Tensor &bias, + const at::Tensor &offset, + const at::Tensor &mask, + const int kernel_h, + const int kernel_w, + const int stride_h, + const int stride_w, + const int pad_h, + const int pad_w, + const int dilation_h, + const int dilation_w, + const int deformable_group); + +std::vector +dcn_v2_cuda_backward(const at::Tensor &input, + const at::Tensor &weight, + const at::Tensor &bias, + const at::Tensor &offset, + const at::Tensor &mask, + const at::Tensor &grad_output, + int kernel_h, int kernel_w, + int stride_h, int stride_w, + int pad_h, int pad_w, + int dilation_h, int dilation_w, + int deformable_group); + + +std::tuple +dcn_v2_psroi_pooling_cuda_forward(const at::Tensor &input, + const at::Tensor &bbox, + const at::Tensor &trans, + const int no_trans, + const float spatial_scale, + const int output_dim, + const int group_size, + const int pooled_size, + const int part_size, + const int sample_per_part, + const float trans_std); + +std::tuple +dcn_v2_psroi_pooling_cuda_backward(const at::Tensor &out_grad, + const at::Tensor &input, + const at::Tensor &bbox, + const at::Tensor &trans, + const at::Tensor &top_count, + const int no_trans, + const float spatial_scale, + const int output_dim, + const int group_size, + const int pooled_size, + const int part_size, + const int sample_per_part, + const float trans_std); diff --git a/maskrcnn_benchmark/csrc/dcn_v2.h b/maskrcnn_benchmark/csrc/dcn_v2.h new file mode 100644 index 0000000000000000000000000000000000000000..9c718a4969e26b7fb04358db10b71a0fa953c20c --- /dev/null +++ b/maskrcnn_benchmark/csrc/dcn_v2.h @@ -0,0 +1,145 @@ +#pragma once + +#include "cpu/vision.h" + +#ifdef WITH_CUDA +#include "cuda/vision.h" +#endif + +at::Tensor +dcn_v2_forward(const at::Tensor &input, + const at::Tensor &weight, + const at::Tensor &bias, + const at::Tensor &offset, + const at::Tensor &mask, + const int kernel_h, + const int kernel_w, + const int 
stride_h, + const int stride_w, + const int pad_h, + const int pad_w, + const int dilation_h, + const int dilation_w, + const int deformable_group) +{ + if (input.type().is_cuda()) + { +#ifdef WITH_CUDA + return dcn_v2_cuda_forward(input, weight, bias, offset, mask, + kernel_h, kernel_w, + stride_h, stride_w, + pad_h, pad_w, + dilation_h, dilation_w, + deformable_group); +#else + AT_ERROR("Not compiled with GPU support"); +#endif + } + AT_ERROR("Not implemented on the CPU"); +} + +std::vector +dcn_v2_backward(const at::Tensor &input, + const at::Tensor &weight, + const at::Tensor &bias, + const at::Tensor &offset, + const at::Tensor &mask, + const at::Tensor &grad_output, + int kernel_h, int kernel_w, + int stride_h, int stride_w, + int pad_h, int pad_w, + int dilation_h, int dilation_w, + int deformable_group) +{ + if (input.type().is_cuda()) + { +#ifdef WITH_CUDA + return dcn_v2_cuda_backward(input, + weight, + bias, + offset, + mask, + grad_output, + kernel_h, kernel_w, + stride_h, stride_w, + pad_h, pad_w, + dilation_h, dilation_w, + deformable_group); +#else + AT_ERROR("Not compiled with GPU support"); +#endif + } + AT_ERROR("Not implemented on the CPU"); +} + +std::tuple +dcn_v2_psroi_pooling_forward(const at::Tensor &input, + const at::Tensor &bbox, + const at::Tensor &trans, + const int no_trans, + const float spatial_scale, + const int output_dim, + const int group_size, + const int pooled_size, + const int part_size, + const int sample_per_part, + const float trans_std) +{ + if (input.type().is_cuda()) + { +#ifdef WITH_CUDA + return dcn_v2_psroi_pooling_cuda_forward(input, + bbox, + trans, + no_trans, + spatial_scale, + output_dim, + group_size, + pooled_size, + part_size, + sample_per_part, + trans_std); +#else + AT_ERROR("Not compiled with GPU support"); +#endif + } + AT_ERROR("Not implemented on the CPU"); +} + +std::tuple +dcn_v2_psroi_pooling_backward(const at::Tensor &out_grad, + const at::Tensor &input, + const at::Tensor &bbox, + const at::Tensor &trans, + const at::Tensor &top_count, + const int no_trans, + const float spatial_scale, + const int output_dim, + const int group_size, + const int pooled_size, + const int part_size, + const int sample_per_part, + const float trans_std) +{ + if (input.type().is_cuda()) + { +#ifdef WITH_CUDA + return dcn_v2_psroi_pooling_cuda_backward(out_grad, + input, + bbox, + trans, + top_count, + no_trans, + spatial_scale, + output_dim, + group_size, + pooled_size, + part_size, + sample_per_part, + trans_std); +#else + AT_ERROR("Not compiled with GPU support"); +#endif + } + AT_ERROR("Not implemented on the CPU"); +} diff --git a/maskrcnn_benchmark/csrc/nms.h b/maskrcnn_benchmark/csrc/nms.h new file mode 100644 index 0000000000000000000000000000000000000000..312fed4a7cb7c1bc6c2345b5e5d678cc6c1a7141 --- /dev/null +++ b/maskrcnn_benchmark/csrc/nms.h @@ -0,0 +1,28 @@ +// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 
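The deformable PS-RoI pooling wrappers above forward to the CUDA kernels shown earlier and are exported to Python in `csrc/vision.cpp`. A rough calling sketch, assuming the compiled extension is importable as `maskrcnn_benchmark._C`; the tensor shapes follow the kernel code (RoIs are N x 5 with the batch index in column 0, and `trans` holds per-part x/y offsets):

```
import torch
from maskrcnn_benchmark import _C  # assumed extension module name

feat = torch.randn(1, 256, 64, 64, device="cuda")                # (N, C, H, W) feature map
rois = torch.tensor([[0., 8., 8., 120., 120.]], device="cuda")   # (batch_idx, x1, y1, x2, y2)
trans = torch.zeros(1, 2, 7, 7, device="cuda")                   # per-part (x, y) offsets

out, top_count = _C.dcn_v2_psroi_pooling_forward(
    feat, rois, trans,
    0,       # no_trans: 0 -> apply the offsets in `trans`
    0.0625,  # spatial_scale (1/16 for a stride-16 feature map)
    256,     # output_dim (asserted to equal the input channels)
    1,       # group_size
    7,       # pooled_size
    7,       # part_size
    4,       # sample_per_part
    0.1)     # trans_std
print(out.shape)  # -> torch.Size([1, 256, 7, 7])
```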
+#pragma once +#include "cpu/vision.h" + +#ifdef WITH_CUDA +#include "cuda/vision.h" +#endif + + +at::Tensor nms(const at::Tensor& dets, + const at::Tensor& scores, + const float threshold) { + + if (dets.type().is_cuda()) { +#ifdef WITH_CUDA + // TODO raise error if not compiled with CUDA + if (dets.numel() == 0) + return at::empty({0}, dets.options().dtype(at::kLong).device(at::kCPU)); + auto b = at::cat({dets, scores.unsqueeze(1)}, 1); + return nms_cuda(b, threshold); +#else + AT_ERROR("Not compiled with GPU support"); +#endif + } + + at::Tensor result = nms_cpu(dets, scores, threshold); + return result; +} diff --git a/maskrcnn_benchmark/csrc/vision.cpp b/maskrcnn_benchmark/csrc/vision.cpp new file mode 100644 index 0000000000000000000000000000000000000000..5d5fbfb63e035dd1efd01ca3fa226c88cc1f2409 --- /dev/null +++ b/maskrcnn_benchmark/csrc/vision.cpp @@ -0,0 +1,21 @@ +// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +#include "nms.h" +#include "ROIAlign.h" +#include "ROIPool.h" +#include "SigmoidFocalLoss.h" +#include "dcn_v2.h" + + +PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { + m.def("nms", &nms, "non-maximum suppression"); + m.def("roi_align_forward", &ROIAlign_forward, "ROIAlign_forward"); + m.def("roi_align_backward", &ROIAlign_backward, "ROIAlign_backward"); + m.def("roi_pool_forward", &ROIPool_forward, "ROIPool_forward"); + m.def("roi_pool_backward", &ROIPool_backward, "ROIPool_backward"); + m.def("sigmoid_focalloss_forward", &SigmoidFocalLoss_forward, "SigmoidFocalLoss_forward"); + m.def("sigmoid_focalloss_backward", &SigmoidFocalLoss_backward, "SigmoidFocalLoss_backward"); + m.def("dcn_v2_forward", &dcn_v2_forward, "dcn_v2_forward"); + m.def("dcn_v2_backward", &dcn_v2_backward, "dcn_v2_backward"); + m.def("dcn_v2_psroi_pooling_forward", &dcn_v2_psroi_pooling_forward, "dcn_v2_psroi_pooling_forward"); + m.def("dcn_v2_psroi_pooling_backward", &dcn_v2_psroi_pooling_backward, "dcn_v2_psroi_pooling_backward"); +} diff --git a/maskrcnn_benchmark/data/README.md b/maskrcnn_benchmark/data/README.md new file mode 100644 index 0000000000000000000000000000000000000000..8ae85e0567cbe71ef1f1df4137cbf549240065d2 --- /dev/null +++ b/maskrcnn_benchmark/data/README.md @@ -0,0 +1,90 @@ +# Setting Up Datasets +This file describes how to perform training on other datasets. + +Only Pascal VOC dataset can be loaded from its original format and be outputted to Pascal style results currently. + +We expect the annotations from other datasets be converted to COCO json format, and +the output will be in COCO-style. (i.e. AP, AP50, AP75, APs, APm, APl for bbox and segm) + +## Creating Symlinks for PASCAL VOC + +We assume that your symlinked `datasets/voc/VOC` directory has the following structure: + +``` +VOC +|_ JPEGImages +| |_ .jpg +| |_ ... +| |_ .jpg +|_ Annotations +| |_ pascal_train.json (optional) +| |_ pascal_val.json (optional) +| |_ pascal_test.json (optional) +| |_ .xml +| |_ ... +| |_ .xml +|_ VOCdevkit +``` + +Create symlinks for `voc/VOC`: + +``` +cd ~/github/maskrcnn-benchmark +mkdir -p datasets/voc/VOC +ln -s /path/to/VOC /datasets/voc/VOC +``` +Example configuration files for PASCAL VOC could be found [here](https://github.com/facebookresearch/maskrcnn-benchmark/blob/master/configs/pascal_voc/). 
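If the COCO-style json files listed in the layout above are in place, a quick sanity check is to open one of them with `pycocotools` (the path below follows the symlinked layout shown above; see the next section for where to obtain the json files):

```
from pycocotools.coco import COCO

coco = COCO("datasets/voc/VOC/Annotations/pascal_train.json")
print(len(coco.getImgIds()), "images,", len(coco.getAnnIds()), "annotations")
```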
+ +### PASCAL VOC Annotations in COCO Format +To output COCO-style evaluation result, PASCAL VOC annotations in COCO json format is required and could be downloaded from [here](https://storage.googleapis.com/coco-dataset/external/PASCAL_VOC.zip) +via http://cocodataset.org/#external. + +## Creating Symlinks for Cityscapes: + +We assume that your symlinked `datasets/cityscapes` directory has the following structure: + +``` +cityscapes +|_ images +| |_ .jpg +| |_ ... +| |_ .jpg +|_ annotations +| |_ instanceonly_gtFile_train.json +| |_ ... +|_ raw + |_ gtFine + |_ ... + |_ README.md +``` + +Create symlinks for `cityscapes`: + +``` +cd ~/github/maskrcnn-benchmark +mkdir -p datasets/cityscapes +ln -s /path/to/cityscapes datasets/data/cityscapes +``` + +### Steps to convert Cityscapes Annotations to COCO Format +1. Download gtFine_trainvaltest.zip from https://www.cityscapes-dataset.com/downloads/ (login required) +2. Extract it to /path/to/gtFine_trainvaltest +``` +cityscapes +|_ gtFine_trainvaltest.zip +|_ gtFine_trainvaltest + |_ gtFine +``` +3. Run the below commands to convert the annotations + +``` +cd ~/github +git clone https://github.com/mcordts/cityscapesScripts.git +cd cityscapesScripts +cp ~/github/maskrcnn-benchmark/tools/cityscapes/instances2dict_with_polygons.py cityscapesscripts/evaluation +python setup.py install +cd ~/github/maskrcnn-benchmark +python tools/cityscapes/convert_cityscapes_to_coco.py --datadir /path/to/cityscapes --outdir /path/to/cityscapes/annotations +``` + +Example configuration files for Cityscapes could be found [here](https://github.com/facebookresearch/maskrcnn-benchmark/blob/master/configs/cityscapes/). diff --git a/maskrcnn_benchmark/data/__init__.py b/maskrcnn_benchmark/data/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..2ba1e52473f97615cc41f82aef279fff4d194527 --- /dev/null +++ b/maskrcnn_benchmark/data/__init__.py @@ -0,0 +1,2 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +from .build import make_data_loader diff --git a/maskrcnn_benchmark/data/__pycache__/__init__.cpython-37.pyc b/maskrcnn_benchmark/data/__pycache__/__init__.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..a3364ff853c4e75938313a943503d2d7dadeead1 Binary files /dev/null and b/maskrcnn_benchmark/data/__pycache__/__init__.cpython-37.pyc differ diff --git a/maskrcnn_benchmark/data/__pycache__/build.cpython-37.pyc b/maskrcnn_benchmark/data/__pycache__/build.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b84186aeb391fafd4fc23962880ff0174c230fa1 Binary files /dev/null and b/maskrcnn_benchmark/data/__pycache__/build.cpython-37.pyc differ diff --git a/maskrcnn_benchmark/data/__pycache__/collate_batch.cpython-37.pyc b/maskrcnn_benchmark/data/__pycache__/collate_batch.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..150b3a691c5813b5c1cd13cba77da8a0d47f8ebc Binary files /dev/null and b/maskrcnn_benchmark/data/__pycache__/collate_batch.cpython-37.pyc differ diff --git a/maskrcnn_benchmark/data/build.py b/maskrcnn_benchmark/data/build.py new file mode 100644 index 0000000000000000000000000000000000000000..24fbc5c1f4897b40cb13c204767315e549c18d28 --- /dev/null +++ b/maskrcnn_benchmark/data/build.py @@ -0,0 +1,176 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 
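For orientation, `build_dataset` below looks each dataset name up in the catalog and expects a factory name plus keyword arguments back. A hedged sketch of such an entry; the keys inside `args` are illustrative and mirror the `COCODataset` constructor defined later in this package:

```
# What dataset_catalog.get("coco_2014_train") is expected to return,
# as consumed by build_dataset() below:
entry = {
    "factory": "COCODataset",
    "args": {
        "root": "datasets/coco/train2014",                                  # illustrative path
        "ann_file": "datasets/coco/annotations/instances_train2014.json",  # illustrative path
    },
}
# build_dataset() then injects e.g. args["remove_images_without_annotations"]
# and args["transforms"] before instantiating the factory.
```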
+import bisect +import copy +import logging + +import torch.utils.data +from maskrcnn_benchmark.utils.comm import get_world_size +from maskrcnn_benchmark.utils.imports import import_file + +from . import datasets as D +from . import samplers + +from .collate_batch import BatchCollator +from .transforms import build_transforms + + +def build_dataset(dataset_list, transforms, dataset_catalog, is_train=True): + """ + Arguments: + dataset_list (list[str]): Contains the names of the datasets, i.e., + coco_2014_trian, coco_2014_val, etc + transforms (callable): transforms to apply to each (image, target) sample + dataset_catalog (DatasetCatalog): contains the information on how to + construct a dataset. + is_train (bool): whether to setup the dataset for training or testing + """ + if not isinstance(dataset_list, (list, tuple)): + raise RuntimeError( + "dataset_list should be a list of strings, got {}".format(dataset_list) + ) + datasets = [] + for dataset_name in dataset_list: + data = dataset_catalog.get(dataset_name) + factory = getattr(D, data["factory"]) + args = data["args"] + # for COCODataset, we want to remove images without annotations + # during training + if data["factory"] in ["COCODataset", + "WordDataset"]: + args["remove_images_without_annotations"] = is_train + if data["factory"] == "PascalVOCDataset": + args["use_difficult"] = not is_train + args["transforms"] = transforms + # make dataset from factory + dataset = factory(**args) + datasets.append(dataset) + + # for testing, return a list of datasets + if not is_train: + return datasets + + # for training, concatenate all datasets into a single one + dataset = datasets[0] + if len(datasets) > 1: + dataset = D.ConcatDataset(datasets) + + return [dataset] + + +def make_data_sampler(dataset, shuffle, distributed): + if distributed: + return samplers.DistributedSampler(dataset, shuffle=shuffle) + if shuffle: + sampler = torch.utils.data.sampler.RandomSampler(dataset) + else: + sampler = torch.utils.data.sampler.SequentialSampler(dataset) + return sampler + + +def _quantize(x, bins): + bins = copy.copy(bins) + bins = sorted(bins) + quantized = list(map(lambda y: bisect.bisect_right(bins, y), x)) + return quantized + + +def _compute_aspect_ratios(dataset): + aspect_ratios = [] + for i in range(len(dataset)): + img_info = dataset.get_img_info(i) + aspect_ratio = float(img_info["height"]) / float(img_info["width"]) + aspect_ratios.append(aspect_ratio) + return aspect_ratios + + +def make_batch_data_sampler( + dataset, sampler, aspect_grouping, images_per_batch, num_iters=None, start_iter=0 +): + if aspect_grouping: + if not isinstance(aspect_grouping, (list, tuple)): + aspect_grouping = [aspect_grouping] + aspect_ratios = _compute_aspect_ratios(dataset) + group_ids = _quantize(aspect_ratios, aspect_grouping) + batch_sampler = samplers.GroupedBatchSampler( + sampler, group_ids, images_per_batch, drop_uneven=False + ) + else: + batch_sampler = torch.utils.data.sampler.BatchSampler( + sampler, images_per_batch, drop_last=False + ) + if num_iters is not None: + batch_sampler = samplers.IterationBasedBatchSampler( + batch_sampler, num_iters, start_iter + ) + return batch_sampler + + +def make_data_loader(cfg, is_train=True, is_distributed=False, start_iter=0): + num_gpus = get_world_size() + if is_train: + images_per_batch = cfg.SOLVER.IMS_PER_BATCH + assert ( + images_per_batch % num_gpus == 0 + ), "SOLVER.IMS_PER_BATCH ({}) must be divisible by the number " + "of GPUs ({}) used.".format(images_per_batch, num_gpus) + images_per_gpu = 
images_per_batch // num_gpus + shuffle = True + num_iters = cfg.SOLVER.MAX_ITER + else: + images_per_batch = cfg.TEST.IMS_PER_BATCH + assert ( + images_per_batch % num_gpus == 0 + ), "TEST.IMS_PER_BATCH ({}) must be divisible by the number " + "of GPUs ({}) used.".format(images_per_batch, num_gpus) + images_per_gpu = images_per_batch // num_gpus + shuffle = False if not is_distributed else True + num_iters = None + start_iter = 0 + + if images_per_gpu > 1: + logger = logging.getLogger(__name__) + logger.warning( + "When using more than one image per GPU you may encounter " + "an out-of-memory (OOM) error if your GPU does not have " + "sufficient memory. If this happens, you can reduce " + "SOLVER.IMS_PER_BATCH (for training) or " + "TEST.IMS_PER_BATCH (for inference). For training, you must " + "also adjust the learning rate and schedule length according " + "to the linear scaling rule. See for example: " + "https://github.com/facebookresearch/Detectron/blob/master/configs/getting_started/tutorial_1gpu_e2e_faster_rcnn_R-50-FPN.yaml#L14" + ) + + # group images which have similar aspect ratio. In this case, we only + # group in two cases: those with width / height > 1, and the other way around, + # but the code supports more general grouping strategy + aspect_grouping = [1] if cfg.DATALOADER.ASPECT_RATIO_GROUPING else [] + + paths_catalog = import_file( + "maskrcnn_benchmark.config.paths_catalog", cfg.PATHS_CATALOG, True + ) + DatasetCatalog = paths_catalog.DatasetCatalog + dataset_list = cfg.DATASETS.TRAIN if is_train else cfg.DATASETS.TEST + + transforms = build_transforms(cfg, is_train) + datasets = build_dataset(dataset_list, transforms, DatasetCatalog, is_train) + + data_loaders = [] + for dataset in datasets: + sampler = make_data_sampler(dataset, shuffle, is_distributed) + batch_sampler = make_batch_data_sampler( + dataset, sampler, aspect_grouping, images_per_gpu, num_iters, start_iter + ) + collator = BatchCollator(cfg.DATALOADER.SIZE_DIVISIBILITY) + num_workers = cfg.DATALOADER.NUM_WORKERS + data_loader = torch.utils.data.DataLoader( + dataset, + num_workers=num_workers, + batch_sampler=batch_sampler, + collate_fn=collator, + ) + data_loaders.append(data_loader) + if is_train: + # during training, a single (possibly concatenated) data_loader is returned + assert len(data_loaders) == 1 + return data_loaders[0] + return data_loaders diff --git a/maskrcnn_benchmark/data/collate_batch.py b/maskrcnn_benchmark/data/collate_batch.py new file mode 100644 index 0000000000000000000000000000000000000000..a7f03416741cfb4c04de613f7d2c8f2050258d73 --- /dev/null +++ b/maskrcnn_benchmark/data/collate_batch.py @@ -0,0 +1,20 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +from maskrcnn_benchmark.structures.image_list import to_image_list + + +class BatchCollator(object): + """ + From a list of samples from the dataset, + returns the batched images and targets. 
+ This should be passed to the DataLoader + """ + + def __init__(self, size_divisible=0): + self.size_divisible = size_divisible + + def __call__(self, batch): + transposed_batch = list(zip(*batch)) + images = to_image_list(transposed_batch[0], self.size_divisible) + targets = transposed_batch[1] + img_ids = transposed_batch[2] + return images, targets, img_ids diff --git a/maskrcnn_benchmark/data/datasets/__init__.py b/maskrcnn_benchmark/data/datasets/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e2ab8384e78842d06b639ac631511368b93bf01a --- /dev/null +++ b/maskrcnn_benchmark/data/datasets/__init__.py @@ -0,0 +1,8 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +from .coco import COCODataset +from .voc import PascalVOCDataset +from .concat_dataset import ConcatDataset +from .word_dataset import WordDataset + +__all__ = ["COCODataset", "ConcatDataset", "PascalVOCDataset", + "WordDataset"] diff --git a/maskrcnn_benchmark/data/datasets/__pycache__/__init__.cpython-37.pyc b/maskrcnn_benchmark/data/datasets/__pycache__/__init__.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b3c932e33972edd8cd5b473b425a126458dcca33 Binary files /dev/null and b/maskrcnn_benchmark/data/datasets/__pycache__/__init__.cpython-37.pyc differ diff --git a/maskrcnn_benchmark/data/datasets/__pycache__/char_dataset.cpython-37.pyc b/maskrcnn_benchmark/data/datasets/__pycache__/char_dataset.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..9dae9b30ea4a38b29ae1c3a23511426b69f10078 Binary files /dev/null and b/maskrcnn_benchmark/data/datasets/__pycache__/char_dataset.cpython-37.pyc differ diff --git a/maskrcnn_benchmark/data/datasets/__pycache__/coco.cpython-37.pyc b/maskrcnn_benchmark/data/datasets/__pycache__/coco.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..7842be5c123e881e9dd3c6663be0e0d4e4ea6dab Binary files /dev/null and b/maskrcnn_benchmark/data/datasets/__pycache__/coco.cpython-37.pyc differ diff --git a/maskrcnn_benchmark/data/datasets/__pycache__/concat_dataset.cpython-37.pyc b/maskrcnn_benchmark/data/datasets/__pycache__/concat_dataset.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..5ff6ac314c989677abab6f6dc52748dd6900000d Binary files /dev/null and b/maskrcnn_benchmark/data/datasets/__pycache__/concat_dataset.cpython-37.pyc differ diff --git a/maskrcnn_benchmark/data/datasets/__pycache__/voc.cpython-37.pyc b/maskrcnn_benchmark/data/datasets/__pycache__/voc.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c2a07fed0e252f636ada7fcdaf9cdda93e0216d2 Binary files /dev/null and b/maskrcnn_benchmark/data/datasets/__pycache__/voc.cpython-37.pyc differ diff --git a/maskrcnn_benchmark/data/datasets/__pycache__/word_dataset.cpython-37.pyc b/maskrcnn_benchmark/data/datasets/__pycache__/word_dataset.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ff7695f750c19697b1333daaf47df21f3addd769 Binary files /dev/null and b/maskrcnn_benchmark/data/datasets/__pycache__/word_dataset.cpython-37.pyc differ diff --git a/maskrcnn_benchmark/data/datasets/coco.py b/maskrcnn_benchmark/data/datasets/coco.py new file mode 100644 index 0000000000000000000000000000000000000000..d0e42b437db2fab29d4fab59a813c932c9355516 --- /dev/null +++ b/maskrcnn_benchmark/data/datasets/coco.py @@ -0,0 +1,101 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 
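The `BatchCollator` just defined is what `make_data_loader` (further above) hands to `torch.utils.data.DataLoader` as `collate_fn`: it pads all images in a batch into a single `ImageList` whose sides are multiples of `size_divisible`. A minimal wiring sketch, with `dataset` standing in for any dataset from this package whose `__getitem__` returns an `(image, target, index)` triple:

```
import torch
from maskrcnn_benchmark.data.collate_batch import BatchCollator

collator = BatchCollator(size_divisible=32)  # matches DATALOADER.SIZE_DIVISIBILITY in the configs
loader = torch.utils.data.DataLoader(
    dataset,              # assumed: yields (image, target, index) per sample
    batch_size=2,
    collate_fn=collator,
)
images, targets, img_ids = next(iter(loader))
# `images` is an ImageList; `targets` is a tuple of BoxList objects
```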
+import torch +import torchvision + +from maskrcnn_benchmark.structures.bounding_box import BoxList +from maskrcnn_benchmark.structures.segmentation_mask import SegmentationMask +from maskrcnn_benchmark.structures.keypoint import PersonKeypoints + + +min_keypoints_per_image = 10 + + +def _count_visible_keypoints(anno): + return sum(sum(1 for v in ann["keypoints"][2::3] if v > 0) for ann in anno) + + +def _has_only_empty_bbox(anno): + return all(any(o <= 1 for o in obj["bbox"][2:]) for obj in anno) + + +def has_valid_annotation(anno): + # if it's empty, there is no annotation + if len(anno) == 0: + return False + # if all boxes have close to zero area, there is no annotation + if _has_only_empty_bbox(anno): + return False + # keypoints task have a slight different critera for considering + # if an annotation is valid + if "keypoints" not in anno[0]: + return True + # for keypoint detection tasks, only consider valid images those + # containing at least min_keypoints_per_image + if _count_visible_keypoints(anno) >= min_keypoints_per_image: + return True + return False + + +class COCODataset(torchvision.datasets.coco.CocoDetection): + def __init__( + self, ann_file, root, remove_images_without_annotations, transforms=None + ): + super(COCODataset, self).__init__(root, ann_file) + # sort indices for reproducible results + self.ids = sorted(self.ids) + + # filter images without detection annotations + if remove_images_without_annotations: + ids = [] + for img_id in self.ids: + ann_ids = self.coco.getAnnIds(imgIds=img_id, iscrowd=None) + anno = self.coco.loadAnns(ann_ids) + if has_valid_annotation(anno): + ids.append(img_id) + self.ids = ids + + self.json_category_id_to_contiguous_id = { + v: i + 1 for i, v in enumerate(self.coco.getCatIds()) + } + self.contiguous_category_id_to_json_id = { + v: k for k, v in self.json_category_id_to_contiguous_id.items() + } + self.id_to_img_map = {k: v for k, v in enumerate(self.ids)} + self.transforms = transforms + + def __getitem__(self, idx): + img, anno = super(COCODataset, self).__getitem__(idx) + + # filter crowd annotations + # TODO might be better to add an extra field + anno = [obj for obj in anno if obj["iscrowd"] == 0] + + boxes = [obj["bbox"] for obj in anno] + boxes = torch.as_tensor(boxes).reshape(-1, 4) # guard against no boxes + target = BoxList(boxes, img.size, mode="xywh").convert("xyxy") + + classes = [obj["category_id"] for obj in anno] + classes = [self.json_category_id_to_contiguous_id[c] for c in classes] + classes = torch.tensor(classes) + target.add_field("labels", classes) + + masks = [obj["segmentation"] for obj in anno] + masks = SegmentationMask(masks, img.size, mode='poly') + target.add_field("masks", masks) + + if anno and "keypoints" in anno[0]: + keypoints = [obj["keypoints"] for obj in anno] + keypoints = PersonKeypoints(keypoints, img.size) + target.add_field("keypoints", keypoints) + + target = target.clip_to_image(remove_empty=True) + + if self.transforms is not None: + img, target = self.transforms(img, target) + + return img, target, idx + + def get_img_info(self, index): + img_id = self.id_to_img_map[index] + img_data = self.coco.imgs[img_id] + return img_data diff --git a/maskrcnn_benchmark/data/datasets/concat_dataset.py b/maskrcnn_benchmark/data/datasets/concat_dataset.py new file mode 100644 index 0000000000000000000000000000000000000000..e5e087c42036f27132ca2c6e1d5252af5fee4a97 --- /dev/null +++ b/maskrcnn_benchmark/data/datasets/concat_dataset.py @@ -0,0 +1,23 @@ +# Copyright (c) Facebook, Inc. and its affiliates. 
All Rights Reserved. +import bisect + +from torch.utils.data.dataset import ConcatDataset as _ConcatDataset + + +class ConcatDataset(_ConcatDataset): + """ + Same as torch.utils.data.dataset.ConcatDataset, but exposes an extra + method for querying the sizes of the image + """ + + def get_idxs(self, idx): + dataset_idx = bisect.bisect_right(self.cumulative_sizes, idx) + if dataset_idx == 0: + sample_idx = idx + else: + sample_idx = idx - self.cumulative_sizes[dataset_idx - 1] + return dataset_idx, sample_idx + + def get_img_info(self, idx): + dataset_idx, sample_idx = self.get_idxs(idx) + return self.datasets[dataset_idx].get_img_info(sample_idx) diff --git a/maskrcnn_benchmark/data/datasets/evaluation/__init__.py b/maskrcnn_benchmark/data/datasets/evaluation/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..d5687182c7268d7b2fcc5fcbf6a35dc27341281d --- /dev/null +++ b/maskrcnn_benchmark/data/datasets/evaluation/__init__.py @@ -0,0 +1,25 @@ +from maskrcnn_benchmark.data import datasets + + +from .word import word_evaluation + + +def evaluate(dataset, predictions, output_folder, **kwargs): + """evaluate dataset using different methods based on dataset type. + Args: + dataset: Dataset object + predictions(list[BoxList]): each item in the list represents the + prediction results for one image. + output_folder: output folder, to save evaluation files or results. + **kwargs: other args. + Returns: + evaluation result + """ + args = dict( + dataset=dataset, predictions=predictions, output_folder=output_folder, **kwargs + ) + if isinstance(dataset, datasets.WordDataset): + return word_evaluation(**args) + else: + dataset_name = dataset.__class__.__name__ + raise NotImplementedError("Unsupported dataset type {}.".format(dataset_name)) diff --git a/maskrcnn_benchmark/data/datasets/evaluation/__pycache__/__init__.cpython-37.pyc b/maskrcnn_benchmark/data/datasets/evaluation/__pycache__/__init__.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c72a8f32ac3a03d8df8a710bc0b3f7debfe1ca8f Binary files /dev/null and b/maskrcnn_benchmark/data/datasets/evaluation/__pycache__/__init__.cpython-37.pyc differ diff --git a/maskrcnn_benchmark/data/datasets/evaluation/word/__init__.py b/maskrcnn_benchmark/data/datasets/evaluation/word/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..efcf8ce034944e58a34592ed22e82adaa266808b --- /dev/null +++ b/maskrcnn_benchmark/data/datasets/evaluation/word/__init__.py @@ -0,0 +1,21 @@ +from .word_eval import do_coco_evaluation +# from util import io_ + +def word_evaluation( + dataset, + predictions, + output_folder, + box_only, + iou_types, + expected_results, + expected_results_sigma_tol, +): + return do_coco_evaluation( + dataset=dataset, + predictions=predictions, + box_only=box_only, + output_folder=output_folder, + iou_types=iou_types, + expected_results=expected_results, + expected_results_sigma_tol=expected_results_sigma_tol, + ) diff --git a/maskrcnn_benchmark/data/datasets/evaluation/word/__pycache__/__init__.cpython-37.pyc b/maskrcnn_benchmark/data/datasets/evaluation/word/__pycache__/__init__.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c84d02958ed66b0ab303f0cdd523e8c0b289ea06 Binary files /dev/null and b/maskrcnn_benchmark/data/datasets/evaluation/word/__pycache__/__init__.cpython-37.pyc differ diff --git a/maskrcnn_benchmark/data/datasets/evaluation/word/__pycache__/alfashape.cpython-37.pyc 
b/maskrcnn_benchmark/data/datasets/evaluation/word/__pycache__/alfashape.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c1d3a519c736f089b17eda3404babf46f1cc061a Binary files /dev/null and b/maskrcnn_benchmark/data/datasets/evaluation/word/__pycache__/alfashape.cpython-37.pyc differ diff --git a/maskrcnn_benchmark/data/datasets/evaluation/word/__pycache__/area_of_polygon.cpython-37.pyc b/maskrcnn_benchmark/data/datasets/evaluation/word/__pycache__/area_of_polygon.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..a25b55023c40456c0f70a8d8bc9f2da077122cfb Binary files /dev/null and b/maskrcnn_benchmark/data/datasets/evaluation/word/__pycache__/area_of_polygon.cpython-37.pyc differ diff --git a/maskrcnn_benchmark/data/datasets/evaluation/word/__pycache__/io_.cpython-37.pyc b/maskrcnn_benchmark/data/datasets/evaluation/word/__pycache__/io_.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..2b05afd5505226acb78fbb81e3048366fbfde324 Binary files /dev/null and b/maskrcnn_benchmark/data/datasets/evaluation/word/__pycache__/io_.cpython-37.pyc differ diff --git a/maskrcnn_benchmark/data/datasets/evaluation/word/__pycache__/word_eval.cpython-37.pyc b/maskrcnn_benchmark/data/datasets/evaluation/word/__pycache__/word_eval.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..fa07d9c7c8c9998cbf7fc6d7d82a84447cc0a487 Binary files /dev/null and b/maskrcnn_benchmark/data/datasets/evaluation/word/__pycache__/word_eval.cpython-37.pyc differ diff --git a/maskrcnn_benchmark/data/datasets/evaluation/word/alfashape.py b/maskrcnn_benchmark/data/datasets/evaluation/word/alfashape.py new file mode 100644 index 0000000000000000000000000000000000000000..9043c54b2cc8a27a37702649c8acff865f741790 --- /dev/null +++ b/maskrcnn_benchmark/data/datasets/evaluation/word/alfashape.py @@ -0,0 +1,89 @@ +import numpy as np +from scipy.spatial import Delaunay +from .area_of_polygon import area_of_polygon_crd +import networkx as nx + +def sqrt_sum(a, b): + x = (a[0]-b[0]) + y = (a[1]-b[1]) + return np.sqrt(x*x+y*y) + +def shapeToSomePolygons(shape): + G = nx.Graph() + allnodes = set() + for line in shape: + G.add_nodes_from(line) + G.add_edge(line[0], line[1]) + allnodes.add(line[0]) + allnodes.add(line[1]) + + result = [] + + while allnodes: + node = allnodes.pop() + new_node = next(iter(G[node]), None) + if not new_node: continue + + G.remove_edge(node, new_node) + temp = nx.shortest_path(G, node, new_node) + for j,t in enumerate(temp): + if t in allnodes: + allnodes.remove(t) + result.append(temp) + return result + +def getAlfaShapes(pts,alfas=1): + tri_ind = [(0,1),(1,2),(2,0)] + tri = Delaunay(pts) + lenghts={} + for s in tri.simplices: + for ind in tri_ind: + a = pts[s[ind[0]]] + b = pts[s[ind[1]]] + # print('a---', a) + # print('b---', b) + line = (a, b) + # line = ((a[0], a[1]), (b[0], b[1])) + lenghts[line] = sqrt_sum(a, b) + + ls = sorted(lenghts.values()) + + mean_length = np.mean(ls) + mean_length_index = ls.index(next(filter(lambda x: x>=mean_length, ls))) + magic_numbers = [ls[i] for i in range(mean_length_index, len(ls))] + magic_numbers[0] = 0 + sum_magic = np.sum(magic_numbers) + for i in range(2, len(magic_numbers)): + magic_numbers[i] += magic_numbers[i-1] + magic_numbers = [m /sum_magic for m in magic_numbers] + + rez = [] + for alfa in alfas: + i = magic_numbers.index(next(filter(lambda z: z > alfa, magic_numbers), magic_numbers[-1])) + av_length = ls[mean_length_index+i] + + lines 
= {} + + for s in tri.simplices: + used = True + for ind in tri_ind: + if lenghts[(pts[s[ind[0]]], pts[s[ind[1]]])] > av_length: + used = False + break + if used == False: continue + + for ind in tri_ind: + i,j= s[ind[0]],s[ind[1]] + line = (pts[min(i,j)], pts[max(i,j)]) + lines[line] = line in lines + + good_lines = [] + for v in lines: + if not lines[v]: + good_lines.append(v) + + result = shapeToSomePolygons(good_lines) + result.sort(key=area_of_polygon_crd, reverse=True) + rez.append(result) + return rez + diff --git a/maskrcnn_benchmark/data/datasets/evaluation/word/area_of_polygon.py b/maskrcnn_benchmark/data/datasets/evaluation/word/area_of_polygon.py new file mode 100644 index 0000000000000000000000000000000000000000..73694a0f91b56b9bff08bfea02e89c8d106624ae --- /dev/null +++ b/maskrcnn_benchmark/data/datasets/evaluation/word/area_of_polygon.py @@ -0,0 +1,38 @@ +import numpy as np + +def area_of_polygon_xy(x, y): + """Calculates the area of an arbitrary polygon given its verticies""" + area = 0.0 + for i in range(-1, len(x)-1): + area += x[i] * (y[i+1] - y[i-1]) + return abs(area) / 2.0 + +def area_of_polygon_crd(cordinates): + """Calculates the area of an arbitrary polygon given its verticies""" + x = [v[0] for v in cordinates] + y = [v[1] for v in cordinates] + return area_of_polygon_xy(x,y) + +def area_of_polygon(**kwargs): + if 'x' in kwargs and 'y' in kwargs: + x = kwargs['x'] + y = kwargs['y'] + return area_of_polygon_xy(x, y) + + if 'coordinates' in kwargs: + cordinates = kwargs['coordinates'] + return area_of_polygon_crd(cordinates) + + print("Wrong parameters") + return None + +def length_of_way(cordinates): + """Length of the way""" + if len(cordinates)<2: + return 0 + leng = 0 + for i in range(1,len(cordinates)): + crd = cordinates + dist = distance(crd[i-1],crd[i-1]) + leng = leng + dist + return leng \ No newline at end of file diff --git a/maskrcnn_benchmark/data/datasets/evaluation/word/io_.py b/maskrcnn_benchmark/data/datasets/evaluation/word/io_.py new file mode 100644 index 0000000000000000000000000000000000000000..0976223422731574789f5ed7fc30c167a2db03fc --- /dev/null +++ b/maskrcnn_benchmark/data/datasets/evaluation/word/io_.py @@ -0,0 +1,216 @@ +#coding=utf-8 +''' +Created on 2016年9月27日 + +@author: dengdan + +Tool functions for file system operation and I/O. +In the style of linux shell commands +''' +import os +import pickle as pkl +# import commands +import logging + +# import util + +def mkdir(path): + """ + If the target directory does not exists, it and its parent directories will created. + """ + path = get_absolute_path(path) + if not exists(path): + os.makedirs(path) + return path + +def make_parent_dir(path): + """make the parent directories for a file.""" + parent_dir = get_dir(path) + mkdir(parent_dir) + + +def pwd(): + return os.getcwd() + +def dump(path, obj): + path = get_absolute_path(path) + parent_path = get_dir(path) + mkdir(parent_path) + with open(path, 'w') as f: + logging.info('dumping file:' + path); + pkl.dump(obj, f) + +def load(path): + path = get_absolute_path(path) + with open(path, 'r') as f: + data = pkl.load(f) + return data + +def join_path(a, *p): + return os.path.join(a, *p) + +def is_dir(path): + path = get_absolute_path(path) + return os.path.isdir(path) + + +def is_path(path): + path = get_absolute_path(path) + return os.path.ispath(path) + +def get_dir(path): + ''' + return the directory it belongs to. 
+ if path is a directory itself, itself will be return + ''' + path = get_absolute_path(path) + if is_dir(path): + return path; + return os.path.split(path)[0] + +def get_filename(path): + return os.path.split(path)[1] + +def get_absolute_path(p): + if p.startswith('~'): + p = os.path.expanduser(p) + return os.path.abspath(p) + +def cd(p): + p = get_absolute_path(p) + os.chdir(p) + +# def ls(path = '.', suffix = None): +# """ +# list files in a directory. +# return file names in a list +# """ +# path = get_absolute_path(path) +# files = os.listdir(path) +# +# if suffix is None: +# return files +# +# filtered = [] +# for f in files: +# if util.str.ends_with(f, suffix, ignore_case = True): +# filtered.append(f) +# +# return filtered + +def find_files(pattern): + import glob + return glob.glob(pattern) + +def read_lines(p): + """return the text in a file in lines as a list """ + p = get_absolute_path(p) + f = open(p,'r') + return f.readlines() + +def write_lines(p, lines): + p = get_absolute_path(p) + make_parent_dir(p) + with open(p, 'w') as f: + for line in lines: + f.write(line) + + +# def cat(p): +# """return the text in a file as a whole""" +# cmd = 'cat ' + p +# return commands.getoutput(cmd) + +def exists(path): + path = get_absolute_path(path) + return os.path.exists(path) + +def load_mat(path): + import scipy.io as sio + path = get_absolute_path(path) + return sio.loadmat(path) + +def dump_mat(path, dict_obj, append = True): + import scipy.io as sio + path = get_absolute_path(path) + make_parent_dir(path) + sio.savemat(file_name = path, mdict = dict_obj, appendmat = append) + +def dir_mat(path): + ''' + list the variables in mat file. + return a list: [(name, shape, dtype), ...] + ''' + import scipy.io as sio + path = get_absolute_path(path) + return sio.whosmat(path) + +SIZE_UNIT_K = 1024 +SIZE_UNIT_M = SIZE_UNIT_K ** 2 +SIZE_UNIT_G = SIZE_UNIT_K ** 3 +def get_file_size(path, unit = SIZE_UNIT_K): + size = os.path.getsize(get_absolute_path(path)) + return size * 1.0 / unit + + +def create_h5(path): + import h5py + path = get_absolute_path(path) + make_parent_dir(path) + return h5py.File(path, 'w'); + +def open_h5(path, mode = 'r'): + import h5py + path = get_absolute_path(path) + return h5py.File(path, mode); + +def read_h5(h5, key): + return h5[key][:] +def read_h5_attrs(h5, key, attrs): + return h5[key].attrs[attrs] + +def copy(src, dest): + import shutil + shutil.copy(get_absolute_path(src), get_absolute_path(dest)) + +cp = copy + +def remove(p): + import os + os.remove(get_absolute_path(p)) +rm = remove + +# def search(pattern, path, file_only = True): +# """ +# Search files whose name matches the give pattern. The search scope +# is the directory and sub-directories of 'path'. 
+# """ +# path = get_absolute_path(path) +# pattern_here = util.io.join_path(path, pattern) +# targets = [] +# +# # find matchings in current directory +# candidates = find_files(pattern_here) +# for can in candidates: +# if util.io.is_dir(can) and file_only: +# continue +# else: +# targets.append(can) +# +# # find matching in sub-dirs +# files = ls(path) +# for f in files: +# fpath = util.io.join_path(path, f) +# if is_dir(fpath): +# targets_in_sub_dir = search(pattern, fpath, file_only) +# targets.extend(targets_in_sub_dir) +# return targets + +def dump_json(path, data): + import json + path = get_absolute_path(path) + make_parent_dir(path) + + with open(path, 'w') as f: + json.dump(data, f) + return path \ No newline at end of file diff --git a/maskrcnn_benchmark/data/datasets/evaluation/word/util/__init__.py b/maskrcnn_benchmark/data/datasets/evaluation/word/util/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..89b681378328a3f30ec9de2d9743de066d5c7632 --- /dev/null +++ b/maskrcnn_benchmark/data/datasets/evaluation/word/util/__init__.py @@ -0,0 +1,62 @@ +# import log +# import dtype +# # import plt +# import np +# import img +# _img = img +# import dec +# import rand +# import mod +# import proc +# import test +# import neighbour as nb +# #import mask +# import str_ as str +# import io as sys_io +# import io_ as io +# import feature +# import thread_ as thread +# import caffe_ as caffe +# # import tf +# import cmd +# import ml +# import sys +# import url +# from .misc import * +# from .logger import * +# # log.init_logger('~/temp/log/log_' + get_date_str() + '.log') +# +# def exit(code = 0): +# sys.exit(0) +# +# is_main = mod.is_main +# init_logger = log.init_logger +# +# def sit(img, path = None, name = ""): +# if path is None: +# _count = get_count(); +# path = '~/temp/no-use/images/%s_%d_%s.jpg'%(log.get_date_str(), _count, name) +# +# if type(img) == list: +# plt.show_images(images = img, path = path, show = False, axis_off = True, save = True) +# else: +# plt.imwrite(path, img) +# +# return path +# _count = 0; +# +# def get_count(): +# global _count; +# _count += 1; +# return _count +# +# def cit(img, path = None, rgb = True, name = ""): +# _count = get_count(); +# if path is None: +# img = np.np.asarray(img, dtype = np.np.uint8) +# path = '~/temp/no-use/%s_%d_%s.jpg'%(log.get_date_str(), _count, name) +# _img.imwrite(path, img, rgb = rgb) +# return path +# +# def argv(index): +# return sys.argv[index] diff --git a/maskrcnn_benchmark/data/datasets/evaluation/word/util/__pycache__/__init__.cpython-37.pyc b/maskrcnn_benchmark/data/datasets/evaluation/word/util/__pycache__/__init__.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..7e881f4c7d31e4210c752fe810650653d798f370 Binary files /dev/null and b/maskrcnn_benchmark/data/datasets/evaluation/word/util/__pycache__/__init__.cpython-37.pyc differ diff --git a/maskrcnn_benchmark/data/datasets/evaluation/word/util/__pycache__/io_.cpython-37.pyc b/maskrcnn_benchmark/data/datasets/evaluation/word/util/__pycache__/io_.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..84ab18e8447d7e9f4d154922c07cc4cdc0dd7531 Binary files /dev/null and b/maskrcnn_benchmark/data/datasets/evaluation/word/util/__pycache__/io_.cpython-37.pyc differ diff --git a/maskrcnn_benchmark/data/datasets/evaluation/word/util/caffe_.py b/maskrcnn_benchmark/data/datasets/evaluation/word/util/caffe_.py new file mode 100644 index 
0000000000000000000000000000000000000000..cc34c8368e81b687d949eaca7bdcc46dede5e561 --- /dev/null +++ b/maskrcnn_benchmark/data/datasets/evaluation/word/util/caffe_.py @@ -0,0 +1,70 @@ +# encoding=utf-8 + +import util +def get_data(net, name): + import caffe + if isinstance(net, caffe._caffe.Solver): + net = net.net + return net.blobs[name].data[...] + +def get_params(net, name = None): + import caffe + if isinstance(net, caffe._caffe.Solver): + net = net.net + params = net.params[name] + p = [] + for param in params: + p.append(param.data[...]) + return p + +def draw_log(log_path, output_names, show = False, save_path = None, from_to = None, smooth = False): + pattern = "Train net output: word_bbox_loc_loss = " + log_path = util.io.get_absolute_path(log_path) + f = open(log_path,'r') + iterations = [] + outputs = {} + plt = util.plt.plt + for line in f.readlines(): + if util.str.contains(line, 'Iteration') and util.str.contains(line, 'loss = '): + print line + s = line.split('Iteration')[-1] + iter_num = util.str.find_all(s, '\d+')[0] + iter_num = int(iter_num) + iterations.append(iter_num) + + if util.str.contains(line, "Train net output #"): + s = util.str.split(line, 'Train net output #\d+\:')[-1] + s = s.split('(')[0] + output = util.str.find_all(s, '\d*\.*\d+e*\-*\d*\.*\d*')[-1] + output = eval(output) + output = float(output) + for name in output_names: + ptr = ' '+ name + ' =' + if util.str.contains(line, ptr): + if name not in outputs: + outputs[name] = [] + print line + print '\t', iter_num, name, output + outputs[name].append(output) + if len(outputs)==0: + print 'No output named:', output_names + return + for name in outputs: + output = outputs[name] + if smooth: + output = util.np.smooth(output) + start = 0 + end = len(output) + + if from_to is not None: + start = from_to[0] + end = from_to[1] + line_style = util.plt.get_random_line_style() + plt.plot(iterations[start: end], output[start: end], line_style, label = name) + + plt.legend() + + if save_path is not None: + util.plt.save_image(save_path) + if show: + util.plt.show() diff --git a/maskrcnn_benchmark/data/datasets/evaluation/word/util/caffe_.py~ b/maskrcnn_benchmark/data/datasets/evaluation/word/util/caffe_.py~ new file mode 100644 index 0000000000000000000000000000000000000000..fced85251199da6de7b51392a65b3c75794995d7 --- /dev/null +++ b/maskrcnn_benchmark/data/datasets/evaluation/word/util/caffe_.py~ @@ -0,0 +1,72 @@ +# encoding=utf-8 + +import util +def get_data(net, name): + import caffe + if isinstance(net, caffe._caffe.Solver): + net = net.net + return net.blobs[name].data[...] 
+ +def get_params(net, name = None): + import caffe + if isinstance(net, caffe._caffe.Solver): + net = net.net + params = net.params[name] + p = [] + for param in params: + p.append(param.data[...]) + return p + +def draw_log(log_path, output_names, show = False, save_path = None, from_to = None, smooth = False): + pattern = "Train net output: word_bbox_loc_loss = " + log_path = util.io.get_absolute_path(log_path) + f = open(log_path,'r') + iterations = [] + outputs = {} + plt = util.plt.plt + for line in f.readlines(): + if util.str.contains(line, 'Iteration') and util.str.contains(line, 'loss = '): + print line + s = line.split('Iteration')[-1] + iter_num = util.str.find_all(s, '\d+')[0] + iter_num = int(iter_num) + iterations.append(iter_num) + + if util.str.contains(line, "Train net output #"): + s = util.str.split(line, 'Train net output #\d+\:')[-1] + s = s.split('(')[0] + output = util.str.find_all(s, '\d*\.*\d+e*\-*\d*\.*\d*')[-1] + output = eval(output) + output = float(output) + for name in output_names: + ptr = ' '+ name + ' =' + if util.str.contains(line, ptr): + if name not in outputs: + outputs[name] = [] + print line + print '\t', iter_num, name, output + outputs[name].append(output) + if len(outputs)==0: + print 'No output named:', output_names + return + for name in outputs: + output = outputs[name] + if smooth: + output = util.np.smooth(output) + start = 0 + end = len(output) + + import pdb + pdb.set_trace() + if from_to is not None: + start = from_to[0] + end = from_to[1] + line_style = util.plt.get_random_line_style() + plt.plot(iterations[start: end], output[start: end], line_style, label = name) + + plt.legend() + + if save_path is not None: + util.plt.save_image(save_path) + if show: + util.plt.show() diff --git a/maskrcnn_benchmark/data/datasets/evaluation/word/util/cmd.py b/maskrcnn_benchmark/data/datasets/evaluation/word/util/cmd.py new file mode 100644 index 0000000000000000000000000000000000000000..0003c2805772bd9f68c705c8f759e4a76e5b2ca8 --- /dev/null +++ b/maskrcnn_benchmark/data/datasets/evaluation/word/util/cmd.py @@ -0,0 +1,6 @@ +#encoding = utf-8 + +def cmd(cmd): + import commands + return commands.getoutput(cmd) + diff --git a/maskrcnn_benchmark/data/datasets/evaluation/word/util/dec.py b/maskrcnn_benchmark/data/datasets/evaluation/word/util/dec.py new file mode 100644 index 0000000000000000000000000000000000000000..dd80e90be1c610d2c46bc8b8b02fd6070d94ee6d --- /dev/null +++ b/maskrcnn_benchmark/data/datasets/evaluation/word/util/dec.py @@ -0,0 +1,78 @@ +#encoding=utf-8 +import logging +import time +def print_calling(fn): + def wrapper(*args1, ** args2): + s = "calling function %s"%(fn.__name__) + logging.info(s) + start = time.time() + ret = fn(*args1, **args2) + end = time.time() +# s = "%s. 
time used = %f seconds"%(s, (end - start)) + s = "function [%s] has been called, taking %f seconds"%(fn.__name__, (end - start)) + logging.debug(s) + return ret + return wrapper + + +def print_test(fn): + def wrapper(*args1, ** args2): + s = "running test: %s..."%(fn.__name__) + logging.info(s) + ret = fn(*args1, **args2) + s = "running test: %s...succeed"%(fn.__name__) + logging.debug(s) + return ret + return wrapper + +def print_calling_in_short(fn): + def wrapper(*args1, ** args2): + start = time.time() + ret = fn(*args1, **args2) + end = time.time() + s = "function [%s] has been called, taking %f seconds"%(fn.__name__, (end - start)) + logging.debug(s) + return ret + return wrapper + +import collections +counter = collections.defaultdict(int) +count_times =collections.defaultdict(int) +def print_calling_in_short_for_tf(fn): + import tensorflow as tf + import util + def wrapper(*args1, ** args2): + start = time.time() + thread_name = util.thread.get_current_thread_name() + ret = fn(*args1, **args2) + end = time.time() + counter[fn.__name__] = counter[fn.__name__] + (end - start) + count_times[fn.__name__] += 1 + all_time = sum([counter[name] for name in counter]) * 1.0 + for name in counter: +# tf.logging.info('\t %s: %f, %f seconds'%(name, counter[name] / all_time, counter[name])) + tf.logging.info('\t %s: %d callings, %fsper calling'%(name, count_times[name], counter[name] * 1.0 / count_times[name])) + s = "Thread [%s]:function [%s] has been called, taking %f seconds"%(thread_name, fn.__name__, (end - start)) + tf.logging.info(s) + return ret + return wrapper + +def timeit(fn): + import util + def wrapper(*args1, ** args2): + start = time.time() + thread_name = util.thread.get_current_thread_name() + ret = fn(*args1, **args2) + end = time.time() + counter[fn.__name__] = counter[fn.__name__] + (end - start) + count_times[fn.__name__] += 1 + all_time = sum([counter[name] for name in counter]) * 1.0 + for name in counter: + logging.info('\t %s: %f, %f seconds'%(name, counter[name] / all_time, counter[name])) + logging.info('\t %s: %d callings, %f seconds per calling'%(name, count_times[name], counter[name] * 1.0 / count_times[name])) + s = "Thread [%s]:function [%s] has been called, taking %f seconds"%(thread_name, fn.__name__, (end - start)) +# logging.info(s) + return ret + return wrapper + + diff --git a/maskrcnn_benchmark/data/datasets/evaluation/word/util/dtype.py b/maskrcnn_benchmark/data/datasets/evaluation/word/util/dtype.py new file mode 100644 index 0000000000000000000000000000000000000000..baedb192be4bbddd52bc0105a344a0484c890fe1 --- /dev/null +++ b/maskrcnn_benchmark/data/datasets/evaluation/word/util/dtype.py @@ -0,0 +1,39 @@ +#coding=utf-8 +''' +Created on 2016年9月27日 +@author: dengdan +''' +import numpy as np + +float32 = 'float32' +floatX = float32 +int32 = 'int32' +uint8 = 'uint8' +string = 'str' + +def cast(obj, dtype): + if isinstance(obj, list): + return np.asarray(obj, dtype = floatX) + return np.cast[dtype](obj) + +def int(obj): + return cast(obj, 'int') + +def double(obj): + return cast(obj, 'double') + +def is_number(obj): + try: + obj + 1 + except: + return False + return True + +def is_str(s): + return type(s) == str + +def is_list(s): + return type(s) == list + +def is_tuple(s): + return type(s) == tuple diff --git a/maskrcnn_benchmark/data/datasets/evaluation/word/util/event.py b/maskrcnn_benchmark/data/datasets/evaluation/word/util/event.py new file mode 100644 index 0000000000000000000000000000000000000000..5612e818f66fa1fa633b8995b97701700c560b62 --- 
/dev/null +++ b/maskrcnn_benchmark/data/datasets/evaluation/word/util/event.py @@ -0,0 +1,12 @@ +import cv2 +import logging +def wait_key(target = None): + key = cv2.waitKey()& 0xFF + if target == None: + return key + if type(target) == str: + target = ord(target) + while key != target: + key = cv2.waitKey()& 0xFF + + logging.debug('Key Pression caught:%s'%(target)) diff --git a/maskrcnn_benchmark/data/datasets/evaluation/word/util/feature.py b/maskrcnn_benchmark/data/datasets/evaluation/word/util/feature.py new file mode 100644 index 0000000000000000000000000000000000000000..6dd24e0a24459b16e6032bf33d013a1654fc9f41 --- /dev/null +++ b/maskrcnn_benchmark/data/datasets/evaluation/word/util/feature.py @@ -0,0 +1,14 @@ +# encoding utf-8 +def hog(img, bins =9, pixels_per_cell=(8, 8), cells_per_block=(2, 2), transform_sqrt=False, feature_vector=True): + """ + Extract hog feature from image. + See detail at https://github.com/scikit-image/scikit-image/blob/master/skimage/feature/_hog.py + """ + from skimage.feature import hog + return hog(img, + orientations = bins, + pixels_per_cell = pixels_per_cell, + cells_per_block = cells_per_block, + visualise = False, + transform_sqrt=False, + feature_vector=True) diff --git a/maskrcnn_benchmark/data/datasets/evaluation/word/util/img.py b/maskrcnn_benchmark/data/datasets/evaluation/word/util/img.py new file mode 100644 index 0000000000000000000000000000000000000000..59db386776210cad4abe6dd85b0e0f8821f06a3e --- /dev/null +++ b/maskrcnn_benchmark/data/datasets/evaluation/word/util/img.py @@ -0,0 +1,521 @@ +#coding=utf-8 +''' +@author: dengdan +''' +import cv2 +import numpy as np +import logging +import math +import event +import util + +IMREAD_GRAY = 0 +IMREAD_COLOR = 1 +IMREAD_UNCHANGED = -1 + + + +COLOR_WHITE =(255, 255, 255) +COLOR_BLACK = (0, 0, 0) +COLOR_GREEN = (0, 255, 0) + +COLOR_RGB_RED = (255, 0, 0) +COLOR_BGR_RED = (0, 0, 255) + +COLOR_RGB_BLUE = (0, 0, 255) +COLOR_BGR_BLUE = (255, 0, 0) + +COLOR_RGB_YELLOW = (255, 255, 0) +COLOR_BGR_YELLOW = (0, 255, 255) + + +COLOR_RGB_GRAY = (47, 79, 79) + +COLOR_RGB_PINK = (255, 192, 203) +def imread(path, rgb = False, mode = cv2.IMREAD_COLOR): + path = util.io.get_absolute_path(path) + img = cv2.imread(path, mode) + if img is None: + raise IOError('File not found:%s'%(path)) + + if rgb: + img = bgr2rgb(img) + return img + +def imshow(winname, img, block = True, position = None, maximized = False, rgb = False): + if isinstance(img, str): + img = imread(path = img) + + cv2.namedWindow(winname, cv2.WINDOW_NORMAL) + if rgb: + img = rgb2bgr(img) + cv2.imshow(winname, img) + if position is not None: +# cv2.moveWindow(winname, position[0], position[1]) + move_win(winname, position) + + if maximized: + maximize_win(winname) + + + if block: +# cv2.waitKey(0) + event.wait_key(" ") + cv2.destroyAllWindows() + + +def imwrite(path, img, rgb = False): + if rgb: + img = rgb2bgr(img) + path = util.io.get_absolute_path(path) + util.io.make_parent_dir(path) + cv2.imwrite(path, img) + +def move_win(winname, position = (0, 0)): + """ + move pyplot window + """ + cv2.moveWindow(winname, position[0], position[1]) + +def maximize_win(winname): + cv2.setWindowProperty(winname, cv2.WND_PROP_FULLSCREEN, True); + +def eq_color(target, color): + for i, c in enumerate(color): + if target[i] != color[i]: + return False + return True + +def is_white(color): + for c in color: + if c < 255: + return False + return True + +def black(shape): + if len(np.shape(shape)) >= 2: + shape = get_shape(shape) + shape = [int(v) for v in shape] + return 
np.zeros(shape, np.uint8) + +def white(shape, value = 255): + if len(np.shape(shape)) >= 2: + shape = get_shape(shape) + return np.ones(shape, np.uint8) * np.uint8(value) + +def bgr2rgb(img): + return cv2.cvtColor(img, cv2.COLOR_BGR2RGB) + +def rgb2bgr(img): + return cv2.cvtColor(img, cv2.COLOR_RGB2BGR) + + +def rgb2gray(img): + return cv2.cvtColor(img, cv2.COLOR_RGB2GRAY) + +def bgr2gray(img): + return cv2.cvtColor(img, cv2.COLOR_BGR2GRAY) + + +def ds_size(image_size, kernel_size, stride): + """calculate the size of downsampling result""" + image_x, image_y = image_size + + + kernel_x, kernel_y = kernel_size + stride_x, stride_y = stride + + def f(iw, kw, sw): + return int(np.floor((iw - kw) / sw) + 1) + + output_size = (f(image_x, kernel_x, stride_x), f(image_y, kernel_y, stride_y)) + return output_size + + + +def get_roi(img, p1, p2): + """ + extract region of interest from an image. + p1, p2: two tuples standing for two opposite corners of the rectangle bounding the roi. + Their order is arbitrary. + """ + x1, y1 = p1 + x2, y2 = p2 + + x_min = min([x1, x2]) + y_min = min([y1, y2]) + x_max = max([x1, x2]) + 1 + y_max = max([y1, y2]) + 1 + + return img[y_min: y_max, x_min: x_max] + +def rectangle(img, left_up, right_bottom, color, border_width = 1): + left_up = (int(left_up[0]), int(left_up[1])) + right_bottom = (int(right_bottom[0]), int(right_bottom[1])) + cv2.rectangle(img, left_up, right_bottom, color, border_width) + + +def circle(img, center, r, color, border_width = 1): + center = (int(center[0]), int(center[1])) + cv2.circle(img, center, r, color, border_width) + +def render_points(img, points, color): + for p in points: + x, y = p + img[y][x] = color + + +def draw_contours(img, contours, idx = -1, color = 1, border_width = 1): +# img = img.copy() + cv2.drawContours(img, contours, idx, color, border_width) + return img + +def get_contour_rect_box(contour): + x,y,w,h = cv2.boundingRect(contour) + return x, y, w, h + +def get_contour_region_in_rect(img, contour): + x, y, w, h = get_contour_rect_box(contour) + lu, rb = (x, y), (x + w, y + h) + return get_roi(img, lu, rb) + +def get_contour_min_area_box(contour): + rect = cv2.minAreaRect(contour) + box = cv2.cv.BoxPoints(rect) + box = np.int0(box) + return box + +def get_contour_region_in_min_area_rect(img, cnt): + # find the min area rect of contour + rect = cv2.minAreaRect(cnt) + angle = rect[-1] + box = cv2.cv.BoxPoints(rect) + box_cnt = points_to_contour(box) + + # find the rectangle containing box_cnt, and set it as ROI + outer_rect = get_contour_rect_box(box_cnt) + x, y, w, h = outer_rect + img = get_roi(img, (x, y), (x + w, y + h)) + box = [(ox - x, oy - y) for (ox, oy) in box] + + # rotate ROI and corner points + rows, cols = get_shape(img) + M = cv2.getRotationMatrix2D((cols/2,rows/2), angle, scale = 1) + dst = cv2.warpAffine(img,M,(cols,rows)) + bar_xy = np.hstack((box, np.ones((4, 1)))) + new_corners = np.dot(M, np.transpose(bar_xy)) + new_corners = util.dtype.int(np.transpose(new_corners)) +# cnt = points_to_contour(new_corners) + + xs = new_corners[:, 0] + ys = new_corners[:, 1] + lu = (min(xs), min(ys)) + rb = (max(xs), max(ys)) + return get_roi(dst, lu, rb) + + +def contour_to_points(contour): + return np.asarray([c[0] for c in contour]) + + +def points_to_contour(points): + contours = [[list(p)]for p in points] + return np.asarray(contours, dtype = np.int32) + +def points_to_contours(points): + return np.asarray([points_to_contour(points)]) + +def get_contour_region_iou(I, cnt1, cnt2): + """ + calculate the iou of two 
contours + """ + mask1 = util.img.black(I) + draw_contours(mask1, [cnt1], color = 1, border_width = -1) + + mask2 = util.img.black(I) + draw_contours(mask2, [cnt2], color = 1, border_width = -1) + + union_mask = ((mask1 + mask2) >=1) * 1 + intersect_mask = (mask1 * mask2 >= 1) * 1 + + return np.sum(intersect_mask) * 1.0 / np.sum(union_mask) + + +def fill_bbox(img, box, color = 1): + """ + filling a bounding box with color. + box: a list of 4 points, in clockwise order, as the four vertice of a bounding box + """ + util.test.assert_equal(np.shape(box), (4, 2)) + cnt = to_contours(box) + draw_contours(img, cnt, color = color, border_width = -1) + +def get_rect_points(left_up, right_bottom): + """ + given the left up and right bottom points of a rectangle, return its four points + """ + right_bottom, left_up = np.asarray(right_bottom), np.asarray(left_up) + w, h = right_bottom - left_up + x, y = left_up + points = [(x, y), (x + w, y), (x + w, y + h), (x, y + h)] + return points + +def rect_perimeter(left_up, right_bottom): + """ + calculate the perimeter of the rectangle described by its left-up and right-bottom point. + """ + return sum(np.asarray(right_bottom) - np.asarray(left_up)) * 2 + +def rect_area(left_up, right_bottom): + wh = np.asarray(right_bottom) - np.asarray(left_up) + 1 + return np.prod(wh) + +def apply_mask(img, mask): + """ + the img will be masked in place. + """ + c = np.shape(img)[-1] + for i in range(c): + img[:, :, i] = img[:, :, i] * mask + return img + +def get_shape(img): + """ + return the height and width of an image + """ + return np.shape(img)[0:2] + +def get_wh(img): + return np.shape(img)[0:2][::-1] + +def get_value(img, x, y = None): + if y == None: + y = x[1] + x = x[0] + + return img[y][x] + +def set_value(img, xy, val): + x, y = xy + img[y][x] = val + + +def filter2D(img, kernel): + dst = cv2.filter2D(img, -1, kernel) + return dst + +def average_blur(img, shape = (5, 5)): + return cv2.blur(img, shape) + +def gaussian_blur(img, shape = (5, 5), sigma = 0): + # sigma --> sigmaX, sigmaY + blur = cv2.GaussianBlur(img,shape, sigma) + return blur + +def bilateral_blur(img, d = 9, sigmaColor = 75, sigmaSpace = 75): + dst = cv2.bilateralFilter(img, d, sigmaColor, sigmaSpace) + return dst + +BLUR_AVERAGE = 'average' +BLUR_GAUSSIAN = 'gaussian' +BLUR_BILATERAL = 'bilateral' + + +_blur_dict = { + BLUR_AVERAGE: average_blur, + BLUR_GAUSSIAN: gaussian_blur, + BLUR_BILATERAL: bilateral_blur +} + +def blur(img, blur_type): + fn = _blur_dict[blur_type] + return fn(img) + +def put_text(img, text, pos, scale = 1, color = COLOR_WHITE, thickness = 1): + pos = np.int32(pos) + font = cv2.FONT_HERSHEY_SIMPLEX + cv2.putText(img = img, text = text, org = tuple(pos), fontFace = font, fontScale = scale, color = color, thickness = thickness) + +def resize(img, f = None, fx = None, fy = None, size = None, interpolation = cv2.INTER_LINEAR): + """ + size: (w, h) + """ + h, w = get_shape(img) + if fx != None and fy != None: + return cv2.resize(img, None, fx = fx, fy = fy, interpolation = interpolation) + + if size != None: + size = util.dtype.int(size) +# size = (size[1], size[0]) + size = tuple(size) + return cv2.resize(img, size, interpolation = interpolation) + + return cv2.resize(img, None, fx = f, fy = f, interpolation = interpolation) + +def translate(img, delta_x, delta_y, size = None): + M = np.float32([[1,0, delta_x],[0,1, delta_y]]) + if size == None: + size = get_wh(img) + + dst = cv2.warpAffine(img,M, size) + return dst + + +def rotate_about_center(src, angle, scale=1.): + 
"""https://www.oschina.net/translate/opencv-rotation""" + w = src.shape[1] + h = src.shape[0] + rangle = np.deg2rad(angle) # angle in radians + # now calculate new image width and height + nw = (abs(np.sin(rangle)*h) + abs(np.cos(rangle)*w))*scale + nh = (abs(np.cos(rangle)*h) + abs(np.sin(rangle)*w))*scale + # ask OpenCV for the rotation matrix + rot_mat = cv2.getRotationMatrix2D((nw*0.5, nh*0.5), angle, scale) + # calculate the move from the old center to the new center combined + # with the rotation + rot_move = np.dot(rot_mat, np.array([(nw-w)*0.5, (nh-h)*0.5,0])) + # the move only affects the translation, so update the translation + # part of the transform + rot_mat[0,2] += rot_move[0] + rot_mat[1,2] += rot_move[1] + return cv2.warpAffine(src, rot_mat, (int(math.ceil(nw)), int(math.ceil(nh))), flags=cv2.INTER_LANCZOS4), rot_mat + + +def get_rect_iou(rects1, rects2): + """ + calculate the iou between rects1 and rects2 + each rect consists of four points:[min_x, min_y, max_x, max_y] + return: a iou matrix, len(rects1) * len(rects2) + """ + rects1, rects2 = np.asarray(rects1), np.asarray(rects2) + + def _to_matrix(p, ps): + p = np.ones((len(ps), 1)) * p + ps = np.reshape(ps, (len(ps), 1)) + temp =np.hstack([p, ps]) + return temp + + def _get_max(p, ps): + return np.max(_to_matrix(p, ps), axis = 1) + + def _get_min(p, ps): + return np.min(_to_matrix(p, ps), axis = 1) + + + def _get_area(rect): + w, h = rect[:, 2] - rect[:, 0] + 1.0 , rect[:, 3] - rect[:, 1] + 1.0 + return w * h + + def _get_inter(rect1, rects2): + x1 = _get_max(rect1[0], rects2[:, 0]) + y1 = _get_max(rect1[1], rects2[:, 1]) + + x2 = _get_min(rect1[2], rects2[:, 2]) + y2 = _get_min(rect1[3], rects2[:, 3]) + + w,h = x2-x1 +1, y2 - y1 + 1 + areas = w * h + areas[np.where(w < 0)] = 0 + areas[np.where(h < 0)] = 0 + return areas + + area2 = _get_area(rects2) + area1 = _get_area(rects1) + iou = np.zeros((len(rects1), len(rects2))) + for ri in range(len(rects1)): + inter = _get_inter(rects1[ri, :], rects2) + union = area1[ri] + area2 - inter + iou[ri, :] = np.transpose( inter / union) + return iou + +def find_contours(mask): + mask = np.asarray(mask, dtype = np.uint8) + mask = mask.copy() + contours, _ = cv2.findContours(mask, mode = cv2.RETR_CCOMP, + method = cv2.CHAIN_APPROX_SIMPLE) + return contours + +def find_two_level_contours(mask): + mask = mask.copy() + contours, tree = cv2.findContours(mask, mode = cv2.RETR_CCOMP, + method = cv2.CHAIN_APPROX_SIMPLE) + return contours, tree + + +def is_in_contour(point, cnt): + """tell whether a point is in contour or not. + In-contour here includes both the 'in contour' and 'on contour' cases. + point:(x, y) + cnt: a cv2 contour + """ + # doc of pointPolygonTest: http://docs.opencv.org/2.4/modules/imgproc/doc/structural_analysis_and_shape_descriptors.html?highlight=pointpolygontest#cv.PointPolygonTest + # the last argument means only tell if in or not, without calculating the shortest distance + in_cnt = cv2.pointPolygonTest(cnt, point, False) + return in_cnt >= 0; + +def convex_hull(contour): + hull = cv2.convexHull(contour, returnPoints=1) + return hull + +def random_color_3(): + c = util.rand.randint(low = 0, high = 255, shape = (3, )) +# c = np.uint8(c) + return c + +def get_contour_area(cnt): + return cv2.contourArea(cnt) + +def is_valid_jpg(jpg_file): + with open(jpg_file, 'rb') as f: + f.seek(-2, 2) + return f.read() == '\xff\xd9' + + + +def rotate_point_by_90(x, y, k, w = 1.0, h = 1.0): + """ + Rotate a point xy on an image by k * 90 + degrees. + Params: + x, y: a point, (x, y). 
If not normalized within 0 and 1, the + width and height of the image should be specified clearly. + w, h: the width and height of image + k: k * 90 degrees will be rotated + """ + k = k % 4 + + if k == 0: + return x, y + elif k == 1: + return y, w - x + elif k == 2: + return w - x, h - y + elif k == 3: + return h - y, x + + +def min_area_rect(xs, ys): + """ + Args: + xs: numpy ndarray with shape=(N,4). N is the number of oriented bboxes. 4 contains [x1, x2, x3, x4] + ys: numpy ndarray with shape=(N,4), [y1, y2, y3, y4] + Note that [(x1, y1), (x2, y2), (x3, y3), (x4, y4)] can represent an oriented bbox. + Return: + the oriented rects sorrounding the box, in the format:[cx, cy, w, h, theta]. + """ + xs = np.asarray(xs, dtype = np.float32) + ys = np.asarray(ys, dtype = np.float32) + + num_rects = xs.shape[0] + box = np.empty((num_rects, 5))#cx, cy, w, h, theta + for idx in xrange(num_rects): + points = zip(xs[idx, :], ys[idx, :]) + cnt = points_to_contour(points) + rect = cv2.minAreaRect(cnt) + cx, cy = rect[0] + w, h = rect[1] + theta = rect[2] + box[idx, :] = [cx, cy, w, h, theta] + + box = np.asarray(box, dtype = xs.dtype) + return box diff --git a/maskrcnn_benchmark/data/datasets/evaluation/word/util/io_.py b/maskrcnn_benchmark/data/datasets/evaluation/word/util/io_.py new file mode 100644 index 0000000000000000000000000000000000000000..0976223422731574789f5ed7fc30c167a2db03fc --- /dev/null +++ b/maskrcnn_benchmark/data/datasets/evaluation/word/util/io_.py @@ -0,0 +1,216 @@ +#coding=utf-8 +''' +Created on 2016年9月27日 + +@author: dengdan + +Tool functions for file system operation and I/O. +In the style of linux shell commands +''' +import os +import pickle as pkl +# import commands +import logging + +# import util + +def mkdir(path): + """ + If the target directory does not exists, it and its parent directories will created. + """ + path = get_absolute_path(path) + if not exists(path): + os.makedirs(path) + return path + +def make_parent_dir(path): + """make the parent directories for a file.""" + parent_dir = get_dir(path) + mkdir(parent_dir) + + +def pwd(): + return os.getcwd() + +def dump(path, obj): + path = get_absolute_path(path) + parent_path = get_dir(path) + mkdir(parent_path) + with open(path, 'w') as f: + logging.info('dumping file:' + path); + pkl.dump(obj, f) + +def load(path): + path = get_absolute_path(path) + with open(path, 'r') as f: + data = pkl.load(f) + return data + +def join_path(a, *p): + return os.path.join(a, *p) + +def is_dir(path): + path = get_absolute_path(path) + return os.path.isdir(path) + + +def is_path(path): + path = get_absolute_path(path) + return os.path.ispath(path) + +def get_dir(path): + ''' + return the directory it belongs to. + if path is a directory itself, itself will be return + ''' + path = get_absolute_path(path) + if is_dir(path): + return path; + return os.path.split(path)[0] + +def get_filename(path): + return os.path.split(path)[1] + +def get_absolute_path(p): + if p.startswith('~'): + p = os.path.expanduser(p) + return os.path.abspath(p) + +def cd(p): + p = get_absolute_path(p) + os.chdir(p) + +# def ls(path = '.', suffix = None): +# """ +# list files in a directory. 
+# return file names in a list +# """ +# path = get_absolute_path(path) +# files = os.listdir(path) +# +# if suffix is None: +# return files +# +# filtered = [] +# for f in files: +# if util.str.ends_with(f, suffix, ignore_case = True): +# filtered.append(f) +# +# return filtered + +def find_files(pattern): + import glob + return glob.glob(pattern) + +def read_lines(p): + """return the text in a file in lines as a list """ + p = get_absolute_path(p) + f = open(p,'r') + return f.readlines() + +def write_lines(p, lines): + p = get_absolute_path(p) + make_parent_dir(p) + with open(p, 'w') as f: + for line in lines: + f.write(line) + + +# def cat(p): +# """return the text in a file as a whole""" +# cmd = 'cat ' + p +# return commands.getoutput(cmd) + +def exists(path): + path = get_absolute_path(path) + return os.path.exists(path) + +def load_mat(path): + import scipy.io as sio + path = get_absolute_path(path) + return sio.loadmat(path) + +def dump_mat(path, dict_obj, append = True): + import scipy.io as sio + path = get_absolute_path(path) + make_parent_dir(path) + sio.savemat(file_name = path, mdict = dict_obj, appendmat = append) + +def dir_mat(path): + ''' + list the variables in mat file. + return a list: [(name, shape, dtype), ...] + ''' + import scipy.io as sio + path = get_absolute_path(path) + return sio.whosmat(path) + +SIZE_UNIT_K = 1024 +SIZE_UNIT_M = SIZE_UNIT_K ** 2 +SIZE_UNIT_G = SIZE_UNIT_K ** 3 +def get_file_size(path, unit = SIZE_UNIT_K): + size = os.path.getsize(get_absolute_path(path)) + return size * 1.0 / unit + + +def create_h5(path): + import h5py + path = get_absolute_path(path) + make_parent_dir(path) + return h5py.File(path, 'w'); + +def open_h5(path, mode = 'r'): + import h5py + path = get_absolute_path(path) + return h5py.File(path, mode); + +def read_h5(h5, key): + return h5[key][:] +def read_h5_attrs(h5, key, attrs): + return h5[key].attrs[attrs] + +def copy(src, dest): + import shutil + shutil.copy(get_absolute_path(src), get_absolute_path(dest)) + +cp = copy + +def remove(p): + import os + os.remove(get_absolute_path(p)) +rm = remove + +# def search(pattern, path, file_only = True): +# """ +# Search files whose name matches the give pattern. The search scope +# is the directory and sub-directories of 'path'. 
+# """ +# path = get_absolute_path(path) +# pattern_here = util.io.join_path(path, pattern) +# targets = [] +# +# # find matchings in current directory +# candidates = find_files(pattern_here) +# for can in candidates: +# if util.io.is_dir(can) and file_only: +# continue +# else: +# targets.append(can) +# +# # find matching in sub-dirs +# files = ls(path) +# for f in files: +# fpath = util.io.join_path(path, f) +# if is_dir(fpath): +# targets_in_sub_dir = search(pattern, fpath, file_only) +# targets.extend(targets_in_sub_dir) +# return targets + +def dump_json(path, data): + import json + path = get_absolute_path(path) + make_parent_dir(path) + + with open(path, 'w') as f: + json.dump(data, f) + return path \ No newline at end of file diff --git a/maskrcnn_benchmark/data/datasets/evaluation/word/util/log.py b/maskrcnn_benchmark/data/datasets/evaluation/word/util/log.py new file mode 100644 index 0000000000000000000000000000000000000000..c1fdfaac6d3564c5b59ad7ca51f02da00f355438 --- /dev/null +++ b/maskrcnn_benchmark/data/datasets/evaluation/word/util/log.py @@ -0,0 +1,47 @@ +#coding=utf-8 +''' +Created on 2016年10月12日 + +@author: dengdan +''' +import datetime +import logging +import util +import sys + +def get_date_str(): + now = datetime.datetime.now() + return now.strftime('%Y-%m-%d %H:%M:%S') + +def init_logger(log_file = None, log_path = None, log_level = logging.DEBUG, mode = 'w', stdout = True): + """ + log_path: 日志文件的文件夹路径 + mode: 'a', append; 'w', 覆盖原文件写入. + """ + fmt = '%(asctime)s %(filename)s[line:%(lineno)d] %(levelname)s: %(message)s' + if log_path is None: + log_path = '~/temp/log/' + if log_file is None: + log_file = 'log_' + get_date_str() + '.log' + log_file = util.io.join_path(log_path, log_file) + # 此处不能使用logging输出 + print('log file path:' + log_file); + util.io.make_parent_dir(log_file) + logging.basicConfig(level = log_level, + format= fmt, + filename= util.io.get_absolute_path(log_file), + filemode=mode) + + if stdout: + console = logging.StreamHandler(stream = sys.stdout) + console.setLevel(log_level) + formatter = logging.Formatter(fmt) + console.setFormatter(formatter) + logging.getLogger('').addHandler(console) + +# console = logging.StreamHandler(stream = sys.stderr) +# console.setLevel(log_level) +# formatter = logging.Formatter(fmt) +# console.setFormatter(formatter) +# logging.getLogger('').addHandler(console) + diff --git a/maskrcnn_benchmark/data/datasets/evaluation/word/util/logger.py b/maskrcnn_benchmark/data/datasets/evaluation/word/util/logger.py new file mode 100644 index 0000000000000000000000000000000000000000..0dab12dc305b88e880d1babde3ba3c7825132802 --- /dev/null +++ b/maskrcnn_benchmark/data/datasets/evaluation/word/util/logger.py @@ -0,0 +1,133 @@ +# A simple torch style logger +# (C) Wei YANG 2017 +from __future__ import absolute_import +# import matplotlib.pyplot as plt +import matplotlib +matplotlib.use('pdf') +import matplotlib.pyplot as plt +import os +import sys +import numpy as np + +__all__ = ['Logger', 'LoggerMonitor', 'savefig'] + +def savefig(fname, dpi=None): + dpi = 150 if dpi == None else dpi + plt.savefig(fname, dpi=dpi) + +def plot_overlap(logger, names=None): + names = logger.names if names == None else names + numbers = logger.numbers + for _, name in enumerate(names): + x = np.arange(len(numbers[name])) + plt.plot(x, np.asarray(numbers[name])) + return [logger.title + '(' + name + ')' for name in names] + +class Logger(object): + '''Save training process to log file with simple plot function.''' + def __init__(self, fpath, 
title=None, resume=False): + self.file = None + self.resume = resume + self.title = '' if title == None else title + if fpath is not None: + if resume: + self.file = open(fpath, 'r') + name = self.file.readline() + self.names = name.rstrip().split('\t') + self.numbers = {} + for _, name in enumerate(self.names): + self.numbers[name] = [] + + for numbers in self.file: + numbers = numbers.rstrip().split('\t') + for i in range(0, len(numbers)): + self.numbers[self.names[i]].append(numbers[i]) + self.file.close() + self.file = open(fpath, 'a') + else: + self.file = open(fpath, 'w') + + def set_names(self, names): + if self.resume: + pass + # initialize numbers as empty list + self.numbers = {} + self.names = names + for _, name in enumerate(self.names): + self.file.write(name) + self.file.write('\t') + self.numbers[name] = [] + self.file.write('\n') + self.file.flush() + + + def append(self, numbers): + assert len(self.names) == len(numbers), 'Numbers do not match names' + for index, num in enumerate(numbers): + self.file.write("{0:.6f}".format(num)) + self.file.write('\t') + self.numbers[self.names[index]].append(num) + self.file.write('\n') + self.file.flush() + + def plot(self, names=None): + print 'plot' + ''' + names = self.names if names == None else names + numbers = self.numbers + for _, name in enumerate(names): + x = np.arange(len(numbers[name])) + plt.plot(x, np.asarray(numbers[name])) + plt.legend([self.title + '(' + name + ')' for name in names]) + plt.grid(True) + ''' + + def close(self): + if self.file is not None: + self.file.close() + +class LoggerMonitor(object): + '''Load and visualize multiple logs.''' + def __init__ (self, paths): + '''paths is a distionary with {name:filepath} pair''' + self.loggers = [] + for title, path in paths.items(): + logger = Logger(path, title=title, resume=True) + self.loggers.append(logger) + + def plot(self, names=None): + plt.figure() + plt.subplot(121) + legend_text = [] + for logger in self.loggers: + legend_text += plot_overlap(logger, names) + plt.legend(legend_text, bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.) 
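# The legend collects one entry per logger/metric pair built by plot_overlap above and is
# anchored just outside the top-right corner of the axes (bbox_to_anchor=(1.05, 1), loc=2).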
+ plt.grid(True) + +if __name__ == '__main__': + # # Example + # logger = Logger('test.txt') + # logger.set_names(['Train loss', 'Valid loss','Test loss']) + + # length = 100 + # t = np.arange(length) + # train_loss = np.exp(-t / 10.0) + np.random.rand(length) * 0.1 + # valid_loss = np.exp(-t / 10.0) + np.random.rand(length) * 0.1 + # test_loss = np.exp(-t / 10.0) + np.random.rand(length) * 0.1 + + # for i in range(0, length): + # logger.append([train_loss[i], valid_loss[i], test_loss[i]]) + # logger.plot() + + # Example: logger monitor + paths = { + 'resadvnet20':'/home/wyang/code/pytorch-classification/checkpoint/cifar10/resadvnet20/log.txt', + 'resadvnet32':'/home/wyang/code/pytorch-classification/checkpoint/cifar10/resadvnet32/log.txt', + 'resadvnet44':'/home/wyang/code/pytorch-classification/checkpoint/cifar10/resadvnet44/log.txt', + } + + field = ['Valid Acc.'] + + monitor = LoggerMonitor(paths) + monitor.plot(names=field) + savefig('test.eps') \ No newline at end of file diff --git a/maskrcnn_benchmark/data/datasets/evaluation/word/util/mask.py b/maskrcnn_benchmark/data/datasets/evaluation/word/util/mask.py new file mode 100644 index 0000000000000000000000000000000000000000..d660607b1a798c38ed0495ec4acb3b14de735d35 --- /dev/null +++ b/maskrcnn_benchmark/data/datasets/evaluation/word/util/mask.py @@ -0,0 +1,82 @@ +import cv2 +import numpy as np + +import util +from util import nb as neighbour + + +def find_white_components(mask, min_area = 0): + mask = (mask == 0) * 1 + return find_black_components(mask, min_area); + +def find_black_components(mask, min_area = 0): + """ + find components of zeros. + mask is a 0-1 matrix, ndarray. + """ + neighbour_type = neighbour.N4 + visited = mask.copy() + c_mask = util.img.black(mask) + + root_idx = [1] + def get_new_root(): + root_idx[0] += 1 + return root_idx[0] + + def is_visited(xy): + x, y = xy + return visited[y][x] + + def set_visited(xy): + x, y = xy + visited[y][x] = 255 + + def set_root(xy, root): + x, y = xy + c_mask[y][x] = root + + def get_root(xy): + x, y = xy + return c_mask[y][x] + + rows, cols = np.shape(mask) + q = [] + for y in xrange(rows): + for x in xrange(cols): + xy = (x, y) + if is_visited(xy): + continue + + q.append(xy) + new_root = get_new_root() + while len(q) > 0: + cp = q.pop() + set_root(cp, new_root) + set_visited(cp) + nbs = neighbour.get_neighbours(cp[0], cp[1], cols, rows, neighbour_type) + for nb in nbs: + if not is_visited(nb) and nb not in q: +# q.append(nb) + q.insert(0, nb) + + components = {} + for y in xrange(rows): + for x in xrange(cols): + root = get_root((x, y)) + if root == 0: + continue + + if root not in components: + components[root] = [] + + components[root].append((x,y)) + + ret = [] + + for root in components: + if len(components[root]) >= min_area: + ret.append(components[root]) + + return ret + + diff --git a/maskrcnn_benchmark/data/datasets/evaluation/word/util/misc.py b/maskrcnn_benchmark/data/datasets/evaluation/word/util/misc.py new file mode 100644 index 0000000000000000000000000000000000000000..324309c3f9b7a3f5e3430fd53575779c394f283f --- /dev/null +++ b/maskrcnn_benchmark/data/datasets/evaluation/word/util/misc.py @@ -0,0 +1,74 @@ +'''Some helper functions for PyTorch, including: + - get_mean_and_std: calculate the mean and std value of dataset. + - msr_init: net parameter initialization. + - progress_bar: progress bar mimic xlua.progress. 
+''' +import errno +import os +import sys +import time +import math + +import torch.nn as nn +import torch.nn.init as init +from torch.autograd import Variable + +__all__ = ['get_mean_and_std', 'init_params', 'mkdir_p', 'AverageMeter'] + + +def get_mean_and_std(dataset): + '''Compute the mean and std value of dataset.''' + dataloader = trainloader = torch.utils.data.DataLoader(dataset, batch_size=1, shuffle=True, num_workers=2) + + mean = torch.zeros(3) + std = torch.zeros(3) + print('==> Computing mean and std..') + for inputs, targets in dataloader: + for i in range(3): + mean[i] += inputs[:,i,:,:].mean() + std[i] += inputs[:,i,:,:].std() + mean.div_(len(dataset)) + std.div_(len(dataset)) + return mean, std + +def init_params(net): + '''Init layer parameters.''' + for m in net.modules(): + if isinstance(m, nn.Conv2d): + init.kaiming_normal(m.weight, mode='fan_out') + if m.bias: + init.constant(m.bias, 0) + elif isinstance(m, nn.BatchNorm2d): + init.constant(m.weight, 1) + init.constant(m.bias, 0) + elif isinstance(m, nn.Linear): + init.normal(m.weight, std=1e-3) + if m.bias: + init.constant(m.bias, 0) + +def mkdir_p(path): + '''make dir if not exist''' + try: + os.makedirs(path) + except OSError as exc: # Python >2.5 + if exc.errno == errno.EEXIST and os.path.isdir(path): + pass + else: + raise + +class AverageMeter(object): + """Computes and stores the average and current value""" + def __init__(self): + self.reset() + + def reset(self): + self.val = 0 + self.avg = 0 + self.sum = 0 + self.count = 0 + + def update(self, val, n=1): + self.val = val + self.sum += val * n + self.count += n + self.avg = self.sum / self.count \ No newline at end of file diff --git a/maskrcnn_benchmark/data/datasets/evaluation/word/util/ml.py b/maskrcnn_benchmark/data/datasets/evaluation/word/util/ml.py new file mode 100644 index 0000000000000000000000000000000000000000..12bee2ba8caefb32d4337071c6b45889693b8f62 --- /dev/null +++ b/maskrcnn_benchmark/data/datasets/evaluation/word/util/ml.py @@ -0,0 +1,25 @@ +import logging +import cv2 +import numpy as np +import util.dec +import util.np + +@util.dec.print_calling +def kmeans(samples, k, criteria = None, attempts = 3, flags = cv2.KMEANS_RANDOM_CENTERS): + if criteria == None: + criteria = (cv2.TERM_CRITERIA_EPS + cv2.TERM_CRITERIA_MAX_ITER, 10, 1.0) + samples = np.asarray(samples, dtype = np.float32) + _,labels,centers = cv2.kmeans(samples, k, criteria, attempts, flags) + labels = util.np.flatten(labels) + clusters = [None]*k + for idx, label in enumerate(labels): + if clusters[label] is None: + clusters[label] = [] + clusters[label].append(idx) + + for idx, cluster in enumerate(clusters): + if cluster == None: + logging.warn('Empty cluster appeared.') + clusters[idx] = [] + + return labels, clusters, centers diff --git a/maskrcnn_benchmark/data/datasets/evaluation/word/util/mod.py b/maskrcnn_benchmark/data/datasets/evaluation/word/util/mod.py new file mode 100644 index 0000000000000000000000000000000000000000..033d664457cf14d8e116f28e01192f8de23c1a82 --- /dev/null +++ b/maskrcnn_benchmark/data/datasets/evaluation/word/util/mod.py @@ -0,0 +1,57 @@ +#coding=utf-8 + +import logging + +def add_to_path(path): + ''' + add path to sys.path. + ''' + import sys; + sys.path.insert(0, path); + +def add_ancester_dir_to_path(fp, p): + ''' + add ancester directory to sys.path. + fp: usually __file__ + p : the relative path to be added. 
+ ''' + import util + parent_path = util.io.get_dir(fp) + path = util.io.join_path(parent_path, p) + add_to_path(path) + +def is_main(mod_name): + return mod_name == '__main__' + +def import_by_name(mod_name): + __import__(mod_name) + return get_mod_by_name(mod_name) + +def try_import_by_name(mod_name, error_path): + try: + import_by_name(mod_name) + except ImportError: + logging.info('adding %s to sys.path'%(error_path)) + add_to_path(error_path) + import_by_name(mod_name) + + return get_mod_by_name(mod_name) + +def get_mod_by_name(mod_name): + import sys + return sys.modules[mod_name] + +def load_mod_from_path(path, keep_name = True): + """" + Params: + path + keep_name: if True, the filename will be used as module name. + """ + import util + import imp + path = util.io.get_absolute_path(path) + file_name = util.io.get_filename(path) + module_name = file_name.split('.')[0] + if not keep_name: + module_name = '%s_%d'%(module_name, util.get_count()) + return imp.load_source(module_name, path) diff --git a/maskrcnn_benchmark/data/datasets/evaluation/word/util/neighbour.py b/maskrcnn_benchmark/data/datasets/evaluation/word/util/neighbour.py new file mode 100644 index 0000000000000000000000000000000000000000..1f1826b88d55ccee198e77ad6874ff7976f1d0d5 --- /dev/null +++ b/maskrcnn_benchmark/data/datasets/evaluation/word/util/neighbour.py @@ -0,0 +1,86 @@ +#encoding=utf-8 + +import numpy as np + +N1 = 'n1' +N2 = 'n2' +N4 = 'n4' +N8 = 'n8' + +def _in_image(c, w, h): + cx, cy = c + return cx >=0 and cx < w and cy >= 0 and cy < h + +def n1(x, y, w, h): + """down and right""" + neighbours = [] + candidates = [(x, y + 1), (x + 1, y)]; + + for c in candidates: + if _in_image(c, w, h): + neighbours.append(c) + + return neighbours + + +def n2(x, y, w, h): + neighbours = [] + candidates = [(x, y + 1), (x + 1, y), (x + 1, y + 1), (x - 1, y + 1)]; + for c in candidates: + if _in_image(c, w, h): + neighbours.append(c) + + return neighbours; + +def n4(x, y, w, h): + neighbours = [] + candidates = [(x, y - 1),(x, y + 1), (x + 1, y), (x - 1, y)]; + for c in candidates: + if _in_image(c, w, h): + neighbours.append(c) + return neighbours + + +def n8(x, y, w, h): + neighbours = [] + candidates = [(x + 1, y - 1),(x, y - 1),(x - 1, y - 1), (x - 1, y),(x, y + 1), (x + 1, y), (x + 1, y + 1), (x - 1, y + 1)]; + for c in candidates: + if _in_image(c, w, h): + neighbours.append(c) + + return neighbours; + + +def n1_count(w, h): + return 2 * w * h - w - h + +def n2_count(w, h): + return 4 * w * h - 3 * w - 3 * h + 2 + + +_dict1 = {N1:n1, N2:n2, N4:n4, N8:n8}; +_dict2 = {N1:n1_count, N2:n2_count}; + +def get_neighbours(x, y, w, h, neighbour_type): + if neighbour_type in _dict1: + fn = _dict1[neighbour_type] + return fn(x, y, w, h) + raise NotImplementedError("unknown neighbour type '%s'" % (neighbour_type)) + +def count_neighbours(w, h, neighbour_type): + if neighbour_type in _dict2: + fn = _dict2[neighbour_type] + return fn(w, h) + raise NotImplementedError("unknown neighbour type '%s'" % (neighbour_type)) + + +if __name__ == "__main__": + w, h = 10, 10 + np.testing.assert_equal(len(n4(0, 0, w, h)), 2) + np.testing.assert_equal(len(n8(0, 0, w, h)), 3) + + np.testing.assert_equal(len(n4(0, 2, w, h)), 3) + np.testing.assert_equal(len(n8(0, 2, w, h)), 5) + + np.testing.assert_equal(len(n4(3, 3, w, h)), 4) + np.testing.assert_equal(len(n8(3, 3, w, h)), 8) diff --git a/maskrcnn_benchmark/data/datasets/evaluation/word/util/np.py b/maskrcnn_benchmark/data/datasets/evaluation/word/util/np.py new file mode 100644 index 
0000000000000000000000000000000000000000..0faf6d0107e9aab0981f0eaf8d218eb706cb81f9 --- /dev/null +++ b/maskrcnn_benchmark/data/datasets/evaluation/word/util/np.py @@ -0,0 +1,171 @@ +import numpy as np +import copy + +TINY = np.exp(-100) +concat = np.concatenate +def is_2D(m): + ''' + judge if a matrix is 2-D or not + ''' + return len(np.shape(m)) == 2 + +def norm1(v): + return np.sum(np.abs(v)) + +def norm2(v): + return np.sqrt(np.sum(v ** 2)) + +def norm2_squared(v): + return np.sum(v ** 2) + + +def cos_dist(v1, v2): + length1 = norm2(v1) + length2 = norm2(v2) + return np.dot(v1, v2) / (length1 * length2) + +def eu_dist(v1, v2): + v = v1 - v2 + return norm2(v) + +def chi_squared_dist(f1, f2): + dist = 0 + for ff1, ff2 in zip(f1, f2): + if ff1 + ff2 == 0:# color feature values are supposed to be non-negative. If this case happened, it means both ne and de are 0s + continue; + dist += (ff1 - ff2) ** 2 * 1.0/ (ff1 + ff2) + return np.sqrt(dist) + +def flatten(arr, ndim = 1): + """ + flatten an multi-dimensional array to a certain degree. + ndim: the number of dimensions after flatten + """ + arr = np.asarray(arr) + dims = len(arr.shape) + shape = [np.prod(arr.shape[0: dims + 1 - ndim])] + shape.extend(arr.shape[dims + 1 - ndim: dims]) + return np.reshape(arr, shape) + +def arcsin(sins, xs = None): + """ + cal arcsin. + xs: if this parameter is provided, the returned arcsins will be within [0, 2*pi) + otherwise the default [-pi/2, pi/2] + """ + arcs = np.arcsin(sins); + if xs != None: + xs = np.asarray(xs) + sins = np.asarray(sins) + # if x > 0, then the corresponding mask value is -1. The resulting angle unchanged: v = 0 - (-v) = v. else, v = pi - v + add_pi = xs < 0 + pi_mask = add_pi * np.pi + # 0 --> 1, 1 --> -1 + arc_mask = 2 * add_pi - 1 + arcs = pi_mask - arcs * arc_mask + + # if x >= 0 and sin < 0, v = 2*pi + v + add_2_pi = (xs >= 0) * (sins < 0) + pi_mask = add_2_pi * 2 * np.pi + arcs = pi_mask + arcs + return arcs + +def sin(ys = None, lengths = None, xs = None, angles = None): + """ + calculate sin with multiple kinds of parameters + """ + if not angles is None: + return np.sin(angles) + + if ys is None: + raise ValueError('ys must be provided when "angles" is None ') + + if lengths is None: + if xs is None: + raise ValueError('xs must be provided when "lengths" is None ') + lengths = np.sqrt(xs ** 2 + ys ** 2) + + if not np.iterable(lengths): + sins = ys / lengths if lengths > 0 else 0 + else: + lengths = np.asarray(lengths) + shape = lengths.shape + ys = flatten(ys) + lengths = flatten(lengths) + sins = [y / length if length > 0 else 0 for (y, length) in zip(ys, lengths)] + sins = np.reshape(sins, shape) + return sins + +def sum_all(m): + """ + sum up all the elements in a multi-dimension array + """ + return np.sum(m) + + +def clone(obj, deep = False): + if not deep: + return copy.copy(obj) + return copy.deepcopy(obj) + +def empty_list(length, etype): + empty_list = [None] * length + for i in xrange(length): + if etype == list: + empty_list[i] = [] + else: + raise NotImplementedError + + return empty_list + +def shuffle(arr): + import random + random.shuffle(arr) + +def is_empty(a): + ''' + tell whether an array is empty. + If a is multidimensional, it is empty when it contains no entry in the last dimension. 
+ ''' + if a is None: + return True + + shape = np.shape(a) + if np.prod(shape) == 0: + return True + + return False + +def angle_with_x(x, y): + """ + return the arctan x/y, in range [-pi, pi] + """ + return np.arctan2(y, x) + +def has_infty(x): + test = x == np.infty + return np.sum(test) > 0 + +def has_nan(x): + x = np.asarray(x) + test = x != x + return np.sum(test) > 0 + +def has_nan_or_infty(x): + if has_nan(x): + return True + + if has_infty(x): + return True + + +def iterable(x): + return np.iterable(x) + +def smooth(arr): + result = [0] * len(arr) + s = 0 + for idx, n in enumerate(arr): + s += n + result[idx] = s * 1.0 / (idx + 1) + return result diff --git a/maskrcnn_benchmark/data/datasets/evaluation/word/util/np.py~ b/maskrcnn_benchmark/data/datasets/evaluation/word/util/np.py~ new file mode 100644 index 0000000000000000000000000000000000000000..614d195ea3e6b001e6df3b9ffc4075efad8a4a63 --- /dev/null +++ b/maskrcnn_benchmark/data/datasets/evaluation/word/util/np.py~ @@ -0,0 +1,171 @@ +import numpy as np +import copy + +TINY = np.exp(-100) +concat = np.concatenate +def is_2D(m): + ''' + judge if a matrix is 2-D or not + ''' + return len(np.shape(m)) == 2 + +def norm1(v): + return np.sum(np.abs(v)) + +def norm2(v): + return np.sqrt(np.sum(v ** 2)) + +def norm2_squared(v): + return np.sum(v ** 2) + + +def cos_dist(v1, v2): + length1 = norm2(v1) + length2 = norm2(v2) + return np.dot(v1, v2) / (length1 * length2) + +def eu_dist(v1, v2): + v = v1 - v2 + return norm2(v) + +def chi_squared_dist(f1, f2): + dist = 0 + for ff1, ff2 in zip(f1, f2): + if ff1 + ff2 == 0:# color feature values are supposed to be non-negative. If this case happened, it means both ne and de are 0s + continue; + dist += (ff1 - ff2) ** 2 * 1.0/ (ff1 + ff2) + return np.sqrt(dist) + +def flatten(arr, ndim = 1): + """ + flatten an multi-dimensional array to a certain degree. + ndim: the number of dimensions after flatten + """ + arr = np.asarray(arr) + dims = len(arr.shape) + shape = [np.prod(arr.shape[0: dims + 1 - ndim])] + shape.extend(arr.shape[dims + 1 - ndim: dims]) + return np.reshape(arr, shape) + +def arcsin(sins, xs = None): + """ + cal arcsin. + xs: if this parameter is provided, the returned arcsins will be within [0, 2*pi) + otherwise the default [-pi/2, pi/2] + """ + arcs = np.arcsin(sins); + if xs != None: + xs = np.asarray(xs) + sins = np.asarray(sins) + # if x > 0, then the corresponding mask value is -1. The resulting angle unchanged: v = 0 - (-v) = v. 
else, v = pi - v + add_pi = xs < 0 + pi_mask = add_pi * np.pi + # 0 --> 1, 1 --> -1 + arc_mask = 2 * add_pi - 1 + arcs = pi_mask - arcs * arc_mask + + # if x >= 0 and sin < 0, v = 2*pi + v + add_2_pi = (xs >= 0) * (sins < 0) + pi_mask = add_2_pi * 2 * np.pi + arcs = pi_mask + arcs + return arcs + +def sin(ys = None, lengths = None, xs = None, angles = None): + """ + calculate sin with multiple kinds of parameters + """ + if not angles is None: + return np.sin(angles) + + if ys is None: + raise ValueError('ys must be provided when "angles" is None ') + + if lengths is None: + if xs is None: + raise ValueError('xs must be provided when "lengths" is None ') + lengths = np.sqrt(xs ** 2 + ys ** 2) + + if not np.iterable(lengths): + sins = ys / lengths if lengths > 0 else 0 + else: + lengths = np.asarray(lengths) + shape = lengths.shape + ys = flatten(ys) + lengths = flatten(lengths) + sins = [y / length if length > 0 else 0 for (y, length) in zip(ys, lengths)] + sins = np.reshape(sins, shape) + return sins + +def sum_all(m): + """ + sum up all the elements in a multi-dimension array + """ + return np.sum(m) + + +def clone(obj, deep = False): + if not deep: + return copy.copy(obj) + return copy.deepcopy(obj) + +def empty_list(length, etype): + empty_list = [None] * length + for i in xrange(length): + if etype == list: + empty_list[i] = [] + else: + raise NotImplementedError + + return empty_list + +def shuffle(arr): + import random + random.shuffle(arr) + +def is_empty(a): + ''' + tell whether an array is empty. + If a is multidimensional, it is empty when it contains no entry in the last dimension. + ''' + if a is None: + return True + + shape = np.shape(a) + if np.prod(shape) == 0: + return True + + return False + +def angle_with_x(x, y): + """ + return the arctan x/y, in range [-pi, pi] + """ + return np.arctan2(y, x) + +def has_infty(x): + test = x == np.infty + return np.sum(test) > 0 + +def has_nan(x): + x = np.asarray(x) + test = x != x + return np.sum(test) > 0 + +def has_nan_or_infty(x): + if has_nan(x): + return True + + if has_infty(x): + return True + + +def iterable(x): + return np.iterable(x) + +def smooth(arr): + result = [0] * len(arr) + s = 0 + for idx, n in arr: + s += n + result[idx] = s * 1.0 / (idx + 1) + return result diff --git a/maskrcnn_benchmark/data/datasets/evaluation/word/util/plt.py b/maskrcnn_benchmark/data/datasets/evaluation/word/util/plt.py new file mode 100644 index 0000000000000000000000000000000000000000..6eb61833ff09c6db1308fedb1082eeaea194334b --- /dev/null +++ b/maskrcnn_benchmark/data/datasets/evaluation/word/util/plt.py @@ -0,0 +1,191 @@ +#coding=utf-8 +''' +Created on 2016-9-27 + +@author: dengdan +''' +import matplotlib.pyplot as plt +import numpy as np +import util + +def hist(x, title = None, normed = False, show = True, save = False, save_path = None, bin_count = 100, bins = None): + x = np.asarray(x) + if len(np.shape(x)) > 1: +# x = np.reshape(x, np.prod(x.shape)) + x = util.np.flatten(x) + if bins == None: + bins = np.linspace(start = min(x), stop = max(x), num = bin_count, endpoint = True, retstep = False) + plt.figure(num = title) + plt.hist(x, bins, normed = normed) + if save: + if save_path is None: + raise ValueError + path = util.io.join_path(save_path, title + '.png') + save_image(path) + if show: + plt.show() + #util.img.imshow(title, path, block = block) + +def plot_solver_data(solver_path): + data = util.io.load(solver_path) + training_losses = data.training_losses + training_accuracies = data.training_accuracies + val_losses = 
data.val_losses + val_accuracies = data.val_accuracies + plt.figure(solver_path) + + n = len(training_losses) + x = range(n) + + plt.plot(x, training_losses, 'r-', label = 'Training Loss') + + if len(training_accuracies) > 0: + plt.plot(x, training_accuracies, 'r--', label = 'Training Accuracy') + + if len(val_losses) > 0: + n = len(val_losses) + x = range(n) + plt.plot(x, val_losses, 'g-', label = 'Validation Loss') + + if len(val_accuracies) > 0: + plt.plot(x, val_accuracies, 'g--', label = 'Validation Accuracy') + plt.legend() + plt.show() + + +def rectangle(xy, width, height, color = 'red', linewidth = 1, fill = False, alpha = None, axis = None): + """ + draw a rectangle on plt axis + """ + import matplotlib.patches as patches + rect = patches.Rectangle( + xy = xy, + width = width, + height = height, + alpha = alpha, + color = color, + fill = fill, + linewidth = linewidth + ) + if axis is not None: + axis.add_patch(rect) + return rect + +rect = rectangle + +def maximize_figure(): + mng = plt.get_current_fig_manager() + mng.full_screen_toggle() + +def line(xy_start, xy_end, color = 'red', linewidth = 1, alpha = None, axis = None): + """ + draw a line on plt axis + """ + from matplotlib.lines import Line2D + num = 100 + xdata = np.linspace(xy_start[0], xy_end[0], num = num) + ydata = np.linspace(xy_start[1], xy_end[1], num = num) + line = Line2D( + alpha = alpha, + color = color, + linewidth = linewidth, + xdata = xdata, + ydata = ydata + ) + if axis is not None: + axis.add_line(line) + return line + +def imshow(title = None, img = None, gray = False): + show_images([img], [title], gray = gray) + +def show_images(images, titles = None, shape = None, share_axis = False, + bgr2rgb = False, maximized = False, + show = True, gray = False, save = False, colorbar = False, + path = None, axis_off = False, vertical = False, subtitle = None): + + if shape == None: + if vertical: + shape = (len(images), 1) + else: + shape = (1, len(images)) + + ret_axes = [] + ax0 = None + for idx, img in enumerate(images): + if bgr2rgb: + img = util.img.bgr2rgb(img) + loc = (idx / shape[1], idx % shape[1]) + if idx == 0: + ax = plt.subplot2grid(shape, loc) + ax0 = ax + else: + if share_axis: + ax = plt.subplot2grid(shape, loc, sharex = ax0, sharey = ax0) + else: + ax = plt.subplot2grid(shape, loc) + if len(np.shape(img)) == 2 and gray: + img_ax = ax.imshow(img, cmap = 'gray') + else: + img_ax = ax.imshow(img) + + if len(np.shape(img)) == 2 and colorbar: + plt.colorbar(img_ax, ax = ax) + if titles != None: + ax.set_title(titles[idx]) + + if axis_off: + plt.axis('off') +# plt.xticks([]), plt.yticks([]) + ret_axes.append(ax) + + if subtitle is not None: + set_subtitle(subtitle) + if maximized: + maximize_figure() + + if save: + if path is None: + raise ValueError('path can not be None when save is True') + save_image(path) + if show: + plt.show() + return ret_axes + +def save_image(path, img = None, dpi = 150): + path = util.io.get_absolute_path(path) + util.io.make_parent_dir(path) + if img is None: + plt.gcf().savefig(path, dpi = dpi) + else: + plt.imsave(path, img) + +imwrite = save_image + +def to_ROI(ax, ROI): + xy1, xy2 = ROI + xmin, ymin = xy1 + xmax, ymax = xy2 + ax.set_xlim(xmin, xmax) + #ax.extent + ax.set_ylim(ymax, ymin) + +def set_subtitle(title, fontsize = 12): + plt.gcf().suptitle(title, fontsize=fontsize) + +def show(maximized = False): + if maximized: + maximize_figure() + plt.show() + +def draw(): + plt.gcf().canvas.draw() + +def get_random_line_style(): + colors = ['r', 'g', 'b'] + line_types = 
['-']#, '--', '-.', ':'] + idx = util.rand.randint(len(colors)) + color = colors[idx] + idx = util.rand.randint(len(line_types)) + line_type = line_types[idx] + return color + line_type diff --git a/maskrcnn_benchmark/data/datasets/evaluation/word/util/plt.py~ b/maskrcnn_benchmark/data/datasets/evaluation/word/util/plt.py~ new file mode 100644 index 0000000000000000000000000000000000000000..36129f23c783dc9324cadef5e94f1d4ef4bbfcca --- /dev/null +++ b/maskrcnn_benchmark/data/datasets/evaluation/word/util/plt.py~ @@ -0,0 +1,188 @@ +#coding=utf-8 +''' +Created on 2016-9-27 + +@author: dengdan +''' +import matplotlib.pyplot as plt +import numpy as np +import util + +def hist(x, title, normed = False, show = True, save = False, save_path = None, bin_count = 100, bins = None): + x = np.asarray(x) + if len(np.shape(x)) > 1: +# x = np.reshape(x, np.prod(x.shape)) + x = util.np.flatten(x) + if bins == None: + bins = np.linspace(start = min(x), stop = max(x), num = bin_count, endpoint = True, retstep = False) + plt.figure(num = title) + plt.hist(x, bins, normed = normed) + if save: + if save_path is None: + raise ValueError + path = util.io.join_path(save_path, title + '.png') + save_image(path) + if show: + plt.show() + #util.img.imshow(title, path, block = block) + +def plot_solver_data(solver_path): + data = util.io.load(solver_path) + training_losses = data.training_losses + training_accuracies = data.training_accuracies + val_losses = data.val_losses + val_accuracies = data.val_accuracies + plt.figure(solver_path) + + n = len(training_losses) + x = range(n) + + plt.plot(x, training_losses, 'r-', label = 'Training Loss') + + if len(training_accuracies) > 0: + plt.plot(x, training_accuracies, 'r--', label = 'Training Accuracy') + + if len(val_losses) > 0: + n = len(val_losses) + x = range(n) + plt.plot(x, val_losses, 'g-', label = 'Validation Loss') + + if len(val_accuracies) > 0: + plt.plot(x, val_accuracies, 'g--', label = 'Validation Accuracy') + plt.legend() + plt.show() + + +def rectangle(xy, width, height, color = 'red', linewidth = 1, fill = False, alpha = None, axis = None): + """ + draw a rectangle on plt axis + """ + import matplotlib.patches as patches + rect = patches.Rectangle( + xy = xy, + width = width, + height = height, + alpha = alpha, + color = color, + fill = fill, + linewidth = linewidth + ) + if axis is not None: + axis.add_patch(rect) + return rect + +rect = rectangle + +def maximize_figure(): + mng = plt.get_current_fig_manager() + mng.full_screen_toggle() + +def line(xy_start, xy_end, color = 'red', linewidth = 1, alpha = None, axis = None): + """ + draw a line on plt axis + """ + from matplotlib.lines import Line2D + num = 100 + xdata = np.linspace(xy_start[0], xy_end[0], num = num) + ydata = np.linspace(xy_start[1], xy_end[1], num = num) + line = Line2D( + alpha = alpha, + color = color, + linewidth = linewidth, + xdata = xdata, + ydata = ydata + ) + if axis is not None: + axis.add_line(line) + return line + +def imshow(title = None, img = None, gray = False): + show_images([img], [title], gray = gray) + +def show_images(images, titles = None, shape = None, share_axis = False, + bgr2rgb = False, maximized = False, + show = True, gray = False, save = False, + path = None, axis_off = False, vertical = False, subtitle = None): + + if shape == None: + if vertical: + shape = (len(images), 1) + else: + shape = (1, len(images)) + + ret_axes = [] + ax0 = None + for idx, img in enumerate(images): + if bgr2rgb: + img = util.img.bgr2rgb(img) + loc = (idx / shape[1], idx % 
shape[1]) + if idx == 0: + ax = plt.subplot2grid(shape, loc) + ax0 = ax + else: + if share_axis: + ax = plt.subplot2grid(shape, loc, sharex = ax0, sharey = ax0) + else: + ax = plt.subplot2grid(shape, loc) + if len(np.shape(img)) == 2 and gray: + ax.imshow(img, cmap = 'gray') + else: + ax.imshow(img) + if titles != None: + ax.set_title(titles[idx]) + + if axis_off: + plt.axis('off') +# plt.xticks([]), plt.yticks([]) + ret_axes.append(ax) + + if subtitle is not None: + set_subtitle(subtitle) + if maximized: + maximize_figure() + + if save: + if path is None: + raise ValueError('path can not be None when save is True') + save_image(path) + if show: + plt.show() + return ret_axes + +def save_image(path, img = None, dpi = 150): + path = util.io.get_absolute_path(path) + util.io.make_parent_dir(path) + if img is None: + plt.gcf().savefig(path, dpi = dpi) + else: + plt.imsave(path, img) + +imwrite = save_image + +def to_ROI(ax, ROI): + xy1, xy2 = ROI + xmin, ymin = xy1 + xmax, ymax = xy2 + ax.set_xlim(xmin, xmax) + #ax.extent + ax.set_ylim(ymax, ymin) + +def set_subtitle(title, fontsize = 12): + plt.gcf().suptitle(title, fontsize=fontsize) + +def show(maximized = False): + if maximized: + maximize_figure() + plt.show() + +def draw(): + plt.gcf().canvas.draw() + +def get_random_line_style(): + colors = ['r', 'g', 'b', 'k'] + line_types = ['-']#, '--', '-.', ':'] + idx = util.rand.randint(len(colors)) + color = colors[idx] + idx = util.rand.randint(len(line_types)) + line_type = line_types[idx] + return color + line_type diff --git a/maskrcnn_benchmark/data/datasets/evaluation/word/util/proc.py b/maskrcnn_benchmark/data/datasets/evaluation/word/util/proc.py new file mode 100644 index 0000000000000000000000000000000000000000..a6621c00b1cc3f4efa60b1dbaac72d8717f565b3 --- /dev/null +++ b/maskrcnn_benchmark/data/datasets/evaluation/word/util/proc.py @@ -0,0 +1,51 @@ +import multiprocessing + +def cpu_count(): + return multiprocessing.cpu_count() + +def get_pool(processes): + pool = multiprocessing.Pool(processes = processes) + return pool + +def wait_for_pool(pool): + pool.close() + pool.join() + +def set_proc_name(name): + import setproctitle + setproctitle.setproctitle(name) + +def kill(pid): + import util + if type(pid) == list: + for p in pid: + kill(p) + elif type(pid) == int: + cmd = 'kill -9 %d'%(pid) + print cmd + print util.cmd.cmd(cmd) + elif type(pid) == str: + pids = get_pid(pid) + kill(pids) + else: + raise ValueError, 'Not supported parameter type:', type(pid) + +def ps_aux_grep(pattern): + import util + cmd = 'ps aux|grep %s'%(pattern) + return util.cmd.cmd(cmd) + + +def get_pid(pattern): + import util + cmd = 'ps aux|grep %s'%(pattern) + results = util.cmd.cmd(cmd) + results = util.str.split(results, '\n') + pids = [] + for result in results: + info = result.split() + if len(info) > 0: + pid = int(info[1]) + pids.append(pid) + return pids + diff --git a/maskrcnn_benchmark/data/datasets/evaluation/word/util/rand.py b/maskrcnn_benchmark/data/datasets/evaluation/word/util/rand.py new file mode 100644 index 0000000000000000000000000000000000000000..34ffa45c2fa36805d16ac5543bb453221b67a175 --- /dev/null +++ b/maskrcnn_benchmark/data/datasets/evaluation/word/util/rand.py @@ -0,0 +1,37 @@ +#coding=utf-8 +''' +Created on 2016年9月27日 + +@author: dengdan +''' +import numpy as np +import time +import random + +rng = np.random.RandomState(int(time.time())) + +rand = np.random.rand +""" +Create an array of the given shape and populate it with random samples from a uniform distribution over [0, 1) 
+""" + + +def normal(shape, mu = 0, sigma_square = 1): + return rng.normal(mu, np.sqrt(sigma_square), shape) + +def randint(low = 2 ** 30, high = None, shape = None): + """ + low: the higher bound except when high is not None. + high: when it is not none, low must be smaller than it + shape: if not provided, a scalar will be returned + """ + return rng.randint(low = low, high = high, size = shape) + +def shuffle(lst): + random.shuffle(lst) + +def sample(lst, n): + return random.sample(lst, n) + + + \ No newline at end of file diff --git a/maskrcnn_benchmark/data/datasets/evaluation/word/util/statistic.py b/maskrcnn_benchmark/data/datasets/evaluation/word/util/statistic.py new file mode 100644 index 0000000000000000000000000000000000000000..69dab91c46cd93c0e666dca9aa067a7cbe384ac5 --- /dev/null +++ b/maskrcnn_benchmark/data/datasets/evaluation/word/util/statistic.py @@ -0,0 +1,16 @@ +#coding=utf-8 +''' +Created on 2016年10月8日 + +@author: dengdan +''' +import numpy as np +import util.np + +def D(x): + x = util.np.flatten(x) + return np.var(x) + +def E(x): + x = util.np.flatten(x) + return np.average(x) diff --git a/maskrcnn_benchmark/data/datasets/evaluation/word/util/str_.py b/maskrcnn_benchmark/data/datasets/evaluation/word/util/str_.py new file mode 100644 index 0000000000000000000000000000000000000000..3fda36820bc711e2548df52d70472adc8c20b99b --- /dev/null +++ b/maskrcnn_benchmark/data/datasets/evaluation/word/util/str_.py @@ -0,0 +1,94 @@ +# encoding = utf-8 +def int_array_to_str(arr): + """turn an int array to a str""" + return "".join(map(chr, arr)) + +def join(arr, splitter=','): + temp = [] + for e in arr: + temp.append(e) + temp.append(splitter) + temp.pop() + return "".join(temp) + +def is_str(s): + return type(s) == str + +def to_lowercase(s): + return str.lower(s) + +def to_uppercase(s): + return str.upper(s) + +def ends_with(s, suffix, ignore_case = False): + """ + suffix: str, list, or tuple + """ + if is_str(suffix): + suffix = [suffix] + suffix = list(suffix) + if ignore_case: + for idx, suf in enumerate(suffix): + suffix[idx] = to_lowercase(suf) + s = to_lowercase(s) + suffix = tuple(suffix) + return s.endswith(suffix) + +def starts_with(s, prefix, ignore_case = False): + """ + prefix: str, list, or tuple + """ + if is_str(prefix): + prefix = [prefix] + prefix = list(prefix) + if ignore_case: + for idx, pre in enumerate(prefix): + prefix[idx] = to_lowercase(pre) + s = to_lowercase(s) + prefix = tuple(prefix) + return s.startswith(prefix) + + +def contains(s, target, ignore_case = False): + if ignore_case: + s = to_lowercase(s) + target = to_lowercase(target) + return s.find(target) >= 0 + +def index_of(s, target): + return s.find(target) + +def replace_all(s, old, new, reg = False): + if reg: + import re + targets = re.findall(old, s) + for t in targets: + s = s.replace(t, new) + else: + s = s.replace(old, new) + return s + +def remove_all(s, sub): + return replace_all(s, sub, '') + +def split(s, splitter, reg = False): + if not reg: + return s.split(splitter) + import re + return re.split(splitter, s) + +def remove_invisible(s): + s = replace_all(s, ' ', '') + s = replace_all(s, '\n', '') + s = replace_all(s, '\t', '') + s = replace_all(s, '\r', '') + return s + +def find_all(s, pattern): + import re + return re.findall(pattern, s) + +def is_none_or_empty(s): + if s is None: + return True + return len(s)==0; diff --git a/maskrcnn_benchmark/data/datasets/evaluation/word/util/str_.py~ b/maskrcnn_benchmark/data/datasets/evaluation/word/util/str_.py~ new file mode 100644 index 
0000000000000000000000000000000000000000..7b88e8f7e1e09d6d3e73b0d40f2abccf5c43e8a3 --- /dev/null +++ b/maskrcnn_benchmark/data/datasets/evaluation/word/util/str_.py~ @@ -0,0 +1,75 @@ +# encoding = utf-8 +def int_array_to_str(arr): + """turn an int array to a str""" + return "".join(map(chr, arr)) + +def join(arr, splitter=','): + temp = [] + for e in arr: + temp.append(e) + temp.append(splitter) + temp.pop() + return "".join(temp) + +def is_str(s): + return type(s) == str + +def to_lowercase(s): + return str.lower(s) + +def ends_with(s, suffix, ignore_case = False): + """ + suffix: str, list, or tuple + """ + if is_str(suffix): + suffix = [suffix] + suffix = list(suffix) + if ignore_case: + for idx, suf in enumerate(suffix): + suffix[idx] = to_lowercase(suf) + s = to_lowercase(s) + suffix = tuple(suffix) + return s.endswith(suffix) + +def starts_with(s, prefix, ignore_case = False): + """ + prefix: str, list, or tuple + """ + if is_str(prefix): + prefix = [prefix] + prefix = list(prefix) + if ignore_case: + for idx, pre in enumerate(prefix): + prefix[idx] = to_lowercase(pre) + s = to_lowercase(s) + prefix = tuple(prefix) + return s.startswith(prefix) + + +def contains(s, target, ignore_case = False): + if ignore_case: + s = to_lowercase(s) + target = to_lowercase(target) + return s.find(target) >= 0 + +def replace_all(s, old, new): + return s.replace(old, new) + +def remove_all(s, sub): + return replace_all(s, sub, '') + +def split(s, splitter): +# return s.split(splitter) + import re + return re.split(splitter, s) +def remove_invisible(s): + s = replace_all(s, ' ', '') + s = replace_all(s, '\n', '') + s = replace_all(s, '\t', '') + s = replace_all(s, '\r', '') + return s + +def find_all(s, pattern): + import re + return re.findall(pattern, s) + diff --git a/maskrcnn_benchmark/data/datasets/evaluation/word/util/t.py b/maskrcnn_benchmark/data/datasets/evaluation/word/util/t.py new file mode 100644 index 0000000000000000000000000000000000000000..72ee2de7dadfe782f0f9092e2dc0233d93e468a5 --- /dev/null +++ b/maskrcnn_benchmark/data/datasets/evaluation/word/util/t.py @@ -0,0 +1,25 @@ +#encoding=utf-8 +""" +for theano shortcuts +""" +import theano +import theano.tensor as T +import util.rand + +trng = T.shared_randomstreams.RandomStreams(util.rand.randint()) +scan_until = theano.scan_module.until + +def add_noise(input, noise_level): + noise = trng.binomial(size = input.shape, n = 1, p = 1 - noise_level) + return noise * input + +def crop_into(large, small): + """ + center crop large image into small. 
+ both 'large' and 'small' are 4D: (batch_size, channels, h, w) + """ + + h1, w1 = large.shape[2:] + h2, w2 = small.shape[2:] + y, x = (h1 - h2) / 2, (w1 - h2)/2 + return large[:, :, y: y + h2, x: x + w2 ] \ No newline at end of file diff --git a/maskrcnn_benchmark/data/datasets/evaluation/word/util/test.py b/maskrcnn_benchmark/data/datasets/evaluation/word/util/test.py new file mode 100644 index 0000000000000000000000000000000000000000..ae99b2778f346df88890a0f3e2c1d0b730a5309d --- /dev/null +++ b/maskrcnn_benchmark/data/datasets/evaluation/word/util/test.py @@ -0,0 +1,7 @@ +#encoding = utf-8 +import numpy as np + +assert_true = np.testing.assert_ +assert_equal = np.testing.assert_equal +assert_array_equal = np.testing.assert_array_equal +assert_almost_equal = np.testing.assert_almost_equal diff --git a/maskrcnn_benchmark/data/datasets/evaluation/word/util/tf.py b/maskrcnn_benchmark/data/datasets/evaluation/word/util/tf.py new file mode 100644 index 0000000000000000000000000000000000000000..5db3b39e69a20717c7d840e537027ce0d833306c --- /dev/null +++ b/maskrcnn_benchmark/data/datasets/evaluation/word/util/tf.py @@ -0,0 +1,269 @@ +from __future__ import print_function + + +try: + import tensorflow as tf + from tensorflow.python.ops import nn + relu = nn.relu + slim = tf.contrib.slim + sigmoid = nn.sigmoid + softmax = nn.softmax +except: + print("tensorflow is not installed, util.tf can not be used.") + +def is_gpu_available(cuda_only=True): + """ + code from https://github.com/tensorflow/tensorflow/blob/master/tensorflow/python/platform/test.py + Returns whether TensorFlow can access a GPU. + Args: + cuda_only: limit the search to CUDA gpus. + Returns: + True iff a gpu device of the requested kind is available. + """ + from tensorflow.python.client import device_lib as _device_lib + + if cuda_only: + return any((x.device_type == 'GPU') + for x in _device_lib.list_local_devices()) + else: + return any((x.device_type == 'GPU' or x.device_type == 'SYCL') + for x in _device_lib.list_local_devices()) + + + +def get_available_gpus(num_gpus = None): + """ + Modified on http://stackoverflow.com/questions/38559755/how-to-get-current-available-gpus-in-tensorflow + However, the original code will occupy all available gpu memory. + The modified code need a parameter: num_gpus. It does nothing but return the device handler name + It will work well on single-maching-training, but I don't know whether it will work well on a cluster. 
+ """ + if num_gpus == None: + from tensorflow.python.client import device_lib as _device_lib + local_device_protos = _device_lib.list_local_devices() + return [x.name for x in local_device_protos if x.device_type == 'GPU'] + else: + return ['/gpu:%d'%(idx) for idx in xrange(num_gpus)] + +def get_latest_ckpt(path): +# tf.train.latest_checkpoint + import util + path = util.io.get_absolute_path(path) + if util.io.is_dir(path): + ckpt = tf.train.get_checkpoint_state(path) + if ckpt is not None: + ckpt_path = ckpt.model_checkpoint_path + else: + ckpt_path = None + else: + ckpt_path = path; + return ckpt_path + +def get_all_ckpts(path): + ckpt = tf.train.get_checkpoint_state(path) + all_ckpts = ckpt.all_model_checkpoint_paths + ckpts = [str(c) for c in all_ckpts] + return ckpts + +def get_iter(ckpt): + import util + iter_ = int(util.str.find_all(ckpt, '.ckpt-\d+')[0].split('-')[-1]) + return iter_ + +def get_init_fn(checkpoint_path, train_dir, ignore_missing_vars = False, + checkpoint_exclude_scopes = None, model_name = None, checkpoint_model_scope = None): + """ + code from github/SSD-tensorflow/tf_utils.py + Returns a function run by the chief worker to warm-start the training. + Note that the init_fn is only run when initializing the model during the very + first global step. + + checkpoint_path: the checkpoint to be restored + train_dir: the directory where checkpoints are stored during training. + ignore_missing_vars: if False and there are variables in the model but not in the checkpoint, an error will be raised. + checkpoint_model_scope and model_name: if the root scope of checkpoints and the model in session is different, + (but the sub-scopes are all the same), specify them clearly + checkpoint_exclude_scopes: variables to be excluded when restoring from checkpoint_path. + Returns: + An init function run by the supervisor. + """ + import util + if util.str.is_none_or_empty(checkpoint_path): + return None + # Warn the user if a checkpoint exists in the train_dir. Then ignore. + if tf.train.latest_checkpoint(train_dir): + tf.logging.info( + 'Ignoring --checkpoint_path because a checkpoint already exists in %s' + % train_dir) + return None + + exclusions = [] + if checkpoint_exclude_scopes: + exclusions = [scope.strip() + for scope in checkpoint_exclude_scopes.split(',')] + + # TODO(sguada) variables.filter_variables() + variables_to_restore = [] + for var in slim.get_model_variables(): + excluded = False + for exclusion in exclusions: + if var.op.name.startswith(exclusion): + excluded = True + break + if not excluded: + variables_to_restore.append(var) + # Change model scope if necessary. + if checkpoint_model_scope is not None: + variables_to_restore = {checkpoint_model_scope + '/' + var.op.name : var for var in variables_to_restore} + tf.logging.info('variables_to_restore: %r'%(variables_to_restore)) + checkpoint_path = get_latest_ckpt(checkpoint_path) + tf.logging.info('Fine-tuning from %s. Ignoring missing vars: %s' % (checkpoint_path, ignore_missing_vars)) + print ('checkpoint_path', checkpoint_path) + return slim.assign_from_checkpoint_fn( + checkpoint_path, + variables_to_restore, + ignore_missing_vars=ignore_missing_vars) + + +def get_variables_to_train(flags = None): + """code from github/SSD-tensorflow/tf_utils.py + Returns a list of variables to train. + + Returns: + A list of variables to train by the optimizer. 
+ """ + if flags is None or flags.trainable_scopes is None: + return tf.trainable_variables() + else: + scopes = [scope.strip() for scope in flags.trainable_scopes.split(',')] + + variables_to_train = [] + for scope in scopes: + variables = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope) + variables_to_train.extend(variables) + return variables_to_train + +def Print(tensor, data, msg = '', file = None, mode = 'w'): + from tensorflow.python.ops import control_flow_ops + import util + def np_print(*args): + if util.str.contains(msg, '%'): + message = msg%tuple(args) + else: + message = msg + ' %'*len(args)%tuple(args) + if file is not None: + file_path = util.io.get_absolute_path(file) + print('writting message to file(%s):'%(file_path), message) + with open(file_path, mode) as f: + print(message, file = f) + else: + print(message) + return control_flow_ops.with_dependencies([tf.py_func(np_print, data, [])], tensor) + +def get_variable_names_in_checkpoint(path, return_shapes = False, return_reader = False): + """ + Args: + path: the path to training directory containing checkpoints, + or path to checkpoint + Return: + a list of variable names in the checkpoint + """ + import util + ckpt = get_latest_ckpt(path) + ckpt_reader = tf.train.NewCheckpointReader(ckpt) + ckpt_vars = ckpt_reader.get_variable_to_shape_map() + names = [var for var in ckpt_vars] + if return_shapes: + return names, ckpt_vars + def get(name): + return ckpt_reader.get_tensor(name) + if return_reader: + return names, get + return names + + + +def min_area_rect(xs, ys): + import util + rects = tf.py_func(util.img.min_area_rect, [xs, ys], xs.dtype) + rects.set_shape([None, 5]) + return rects + + +def gpu_config(config = None, allow_growth = None, gpu_memory_fraction = None): + if config is None: + config = tf.ConfigProto() + + if allow_growth is not None: + config.gpu_options.allow_growth = allow_growth + + if gpu_memory_fraction is not None: + config.gpu_options.per_process_gpu_memory_fraction = gpu_memory_fraction + + return config + +def wait_for_checkpoint(path): + from tensorflow.contrib.training.python.training import evaluation + return evaluation.checkpoints_iterator(path) + +def focal_loss(labels, logits, gamma = 2.0, alpha = 0.75, normalize = True): + labels = tf.where(labels > 0, tf.ones_like(labels), tf.zeros_like(labels)) + labels = tf.cast(labels, tf.float32) + probs = tf.sigmoid(logits) + CE = tf.nn.sigmoid_cross_entropy_with_logits(labels = labels, logits = logits) + + alpha_t = tf.ones_like(logits) * alpha + alpha_t = tf.where(labels > 0, alpha_t, 1.0 - alpha_t) + probs_t = tf.where(labels > 0, probs, 1.0 - probs) + + focal_matrix = alpha_t * tf.pow((1.0 - probs_t), gamma) + fl = focal_matrix * CE + + fl = tf.reduce_sum(fl) + if normalize: + #n_pos = tf.reduce_sum(labels) + #fl = fl / tf.cast(n_pos, tf.float32) + total_weights = tf.stop_gradient(tf.reduce_sum(focal_matrix)) + fl = fl / total_weights + return fl + + +def focal_loss_layer_initializer(sigma = 0.01, pi = 0.01): + import numpy as np + b0 = - np.log((1 - pi) / pi) + return tf.random_normal_initializer(stddev = sigma), \ + tf.constant_initializer(b0) + + +def sum_gradients(clone_grads, do_summary = False): + averaged_grads = [] + for grad_and_vars in zip(*clone_grads): + grads = [] + var = grad_and_vars[0][1] + try: + for g, v in grad_and_vars: + assert v == var + grads.append(g) + grad = tf.add_n(grads, name = v.op.name + '_summed_gradients') + except: + import pdb + pdb.set_trace() + + averaged_grads.append((grad, v)) + + if do_summary: 
+ tf.summary.histogram("variables_and_gradients_" + grad.op.name, grad) + tf.summary.histogram("variables_and_gradients_" + v.op.name, v) + tf.summary.scalar("variables_and_gradients_" + grad.op.name+\ + '_mean/var_mean', tf.reduce_mean(grad)/tf.reduce_mean(var)) + tf.summary.scalar("variables_and_gradients_" + v.op.name+'_mean',tf.reduce_mean(var)) + return averaged_grads + +def get_update_op(): + """ + Extremely important for BatchNorm + """ + update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS) + if update_ops: + return tf.group(*update_ops) + return None diff --git a/maskrcnn_benchmark/data/datasets/evaluation/word/util/thread_.py b/maskrcnn_benchmark/data/datasets/evaluation/word/util/thread_.py new file mode 100644 index 0000000000000000000000000000000000000000..907e01f922fecf85a68072df7c9dd5557f8be13f --- /dev/null +++ b/maskrcnn_benchmark/data/datasets/evaluation/word/util/thread_.py @@ -0,0 +1,62 @@ +import threading +from threading import Thread + +def get_current_thread(): + return threading.current_thread() + +def get_current_thread_name(): + return get_current_thread().getName() + +def is_alive(t): + return t.is_alive() + +def create_and_start(name, target, daemon = True): + t = Thread(target= target) + t.daemon = True + t.setName(name) + t.start() + return t + + + +class ThreadPool(object): + def __init__(self, capacity = 10): + import threadpool + self.num_threads = capacity + self.pool = threadpool.ThreadPool(10) + + def add(self, fn, args): + import threadpool + if type(args) == list: + args = [(args, None)] + elif type(args) == dict: + args = [(None, args)] + else: + raise ValueError, "Unsuported args,", type(args) + request = threadpool.makeRequests(fn, args)[0] + self.pool.putRequest(request, block = False) + self.pool.poll() + + def join(self): + self.pool.wait() + +class ProcessPool(object): + """ + Remember that function in function is not supported by multiprocessing. 
+ """ + def __init__(self, capacity = 8): + from multiprocessing import Pool + + self.capacity = capacity + self.pool = Pool(capacity) + + def add(self, fn, args): + self.pool.apply_async(fn, args) +# self.pool.poll() +# self.pool.poll + + def join(self): + self.pool.close() + self.pool.join() + + diff --git a/maskrcnn_benchmark/data/datasets/evaluation/word/util/url.py b/maskrcnn_benchmark/data/datasets/evaluation/word/util/url.py new file mode 100644 index 0000000000000000000000000000000000000000..c08e6cac66aaa0092dc8ffa4945b653d0015f818 --- /dev/null +++ b/maskrcnn_benchmark/data/datasets/evaluation/word/util/url.py @@ -0,0 +1,17 @@ +import sys +import os +from six.moves import urllib + +import util +def download(url, path): + filename = path.split('/')[-1] + if not util.io.exists(path): + def _progress(count, block_size, total_size): + sys.stdout.write('\r-----Downloading %s %.1f%%' % (filename, + float(count * block_size) / float(total_size) * 100.0)) + sys.stdout.flush() + path, _ = urllib.request.urlretrieve(url, path, _progress) + print() + statinfo = os.stat(path) + print('Successfully downloaded', filename, statinfo.st_size, 'bytes.') + \ No newline at end of file diff --git a/maskrcnn_benchmark/data/datasets/evaluation/word/word_eval.py b/maskrcnn_benchmark/data/datasets/evaluation/word/word_eval.py new file mode 100644 index 0000000000000000000000000000000000000000..462875e4d54adf792039b2eac4b83da0c0d07423 --- /dev/null +++ b/maskrcnn_benchmark/data/datasets/evaluation/word/word_eval.py @@ -0,0 +1,782 @@ +import logging +import tempfile +import os +import torch +from collections import OrderedDict +import itertools +from tqdm import tqdm +from .util import io_ +from maskrcnn_benchmark.modeling.roi_heads.boundary_head.inference import Masker +from maskrcnn_benchmark.structures.bounding_box import BoxList +from maskrcnn_benchmark.structures.boxlist_ops import boxlist_iou + +from maskrcnn_benchmark.config import cfg +from shapely.geometry import * +import cv2 +import numpy as np +import csv +from .alfashape import getAlfaShapes +import torch.nn as nn + + +def do_coco_evaluation( + dataset, + predictions, + box_only, # False + output_folder, + iou_types, # 'segm' + expected_results, # [] + expected_results_sigma_tol, # 4 +): + logger = logging.getLogger("maskrcnn_benchmark.inference") + + if box_only: + logger.info("Evaluating bbox proposals") + areas = {"all": "", "small": "s", "medium": "m", "large": "l"} + res = COCOResults("box_proposal") + for limit in [100, 1000]: + for area, suffix in areas.items(): + stats = evaluate_box_proposals( + predictions, dataset, area=area, limit=limit + ) + key = "AR{}@{:d}".format(suffix, limit) + res.results["box_proposal"][key] = stats["ar"].item() + logger.info(res) + check_expected_results(res, expected_results, expected_results_sigma_tol) + if output_folder: + torch.save(res, os.path.join(output_folder, "box_proposals.pth")) + return + logger.info("Preparing results for COCO format") + coco_results = {} + if "bbox" in iou_types: + logger.info("Preparing bbox results") + coco_results["bbox"] = prepare_for_coco_detection(predictions, dataset) + if "bo" in iou_types: + logger.info("Preparing bo results") + coco_results["bo"] = prepare_for_boundary_segmentation(predictions, dataset) + logger.info("Do not apply evaluating predictions") + for iou_type in iou_types: + with tempfile.NamedTemporaryFile() as f: + file_path = f.name + if output_folder: + if not os.path.isdir(output_folder): + print('creating dir: ' + output_folder) + 
os.mkdir(output_folder) + file_path = os.path.join(output_folder, iou_type + ".json") + res = evaluate_predictions_on_coco( + dataset.coco, coco_results[iou_type], file_path, iou_type + ) + + return None + + +def prepare_for_coco_detection(predictions, dataset): + # assert isinstance(dataset, COCODataset) + coco_results = [] + for image_id, prediction in enumerate(predictions): + original_id = dataset.id_to_img_map[image_id] + if len(prediction) == 0: + continue + + # TODO replace with get_img_info? + image_width = dataset.coco.imgs[original_id]["width"] + image_height = dataset.coco.imgs[original_id]["height"] + prediction = prediction.resize((image_width, image_height)) + prediction = prediction.convert("xywh") + + boxes = prediction.bbox.tolist() + scores = prediction.get_field("scores").tolist() + labels = prediction.get_field("labels").tolist() + + mapped_labels = [dataset.contiguous_category_id_to_json_id[i] for i in labels] + + coco_results.extend( + [ + { + "image_id": original_id, + "category_id": mapped_labels[k], + "bbox": box, + "score": scores[k], + } + for k, box in enumerate(boxes) + ] + ) + return coco_results + + +def contour_to_xys(cnt, image_shape): + """Convert rect to xys, i.e., eight points + The `image_shape` is used to to make sure all points return are valid, i.e., within image area + """ + rect = cv2.minAreaRect(cnt) + h, w = image_shape[0:2] + + def get_valid_x(x): + if x < 0: + return 0 + if x >= w: + return w - 1 + return x + + def get_valid_y(y): + if y < 0: + return 0 + if y >= h: + return h - 1 + return y + + points = cv2.boxPoints(rect) + points = np.int0(points) + for i_xy, (x, y) in enumerate(points): + x = get_valid_x(x) + y = get_valid_y(y) + points[i_xy, :] = [x, y] + points = np.reshape(points, -1) + return points + + +def contour_to_valid(cnt, image_shape): + """Convert rect to xys, i.e., eight points + The `image_shape` is used to to make sure all points return are valid, i.e., within image area + """ + # rect = cv2.minAreaRect(cnt) + if len(cnt.shape) != 3: + assert 1 < 0 + rect = cnt.reshape([cnt.shape[0], cnt.shape[2]]) + h, w = image_shape[0:2] + + def get_valid_x(x): + if x < 0: + return 0 + if x >= w: + return w - 1 + return x + + def get_valid_y(y): + if y < 0: + return 0 + if y >= h: + return h - 1 + return y + for i_xy, (x, y) in enumerate(rect): + x = get_valid_x(x) + y = get_valid_y(y) + rect[i_xy, :] = [x, y] + + points = np.reshape(rect, -1) + return points + + +def _nms_y(heat, kernel=3): + pad = (kernel - 1) // 2 + hmax = nn.functional.max_pool2d( + heat, (1, kernel), stride=1, padding=(0, pad)) + keep = (hmax == heat).float() + return heat * keep + + +def _nms_x(heat, kernel=3): + pad = (kernel - 1) // 2 + hmax = nn.functional.max_pool2d( + heat, (kernel, 1), stride=1, padding=(pad, 0)) + keep = (hmax == heat).float() + return heat * keep + +def CTW_order_lr(map_in): + + line_out_l2r = [] + line_out_r2l = [] + + map_in = torch.tensor(map_in) + value, top = torch.topk(map_in, 2, dim=0) + value = value.numpy() + top = top.numpy() + top_th = np.where(value[1] > 0.1)[0] # L + # print(top_th) + if len(top_th) == 0: + return [] + top1 = np.sort(top, axis=0) + for i in range(len(top_th)): + line_out_l2r.append([top_th[i], top1[0][top_th[i]]]) + line_out_r2l.append([top_th[i], top1[1][top_th[i]]]) + line_out = line_out_l2r+line_out_r2l[::-1] + # print(line_out) + return line_out + +def CTW_order_bt(map_in): + + line_out_t2b = [] + line_out_b2t = [] + + map_in = torch.tensor(map_in) + value, top = torch.topk(map_in, 2, dim=1) + value = 
value.numpy() + top = top.numpy() + top_th = np.where(value[:, 1] > 0.1)[0] # H + if len(top_th) == 0: + return [] + top1 = np.sort(top, axis=1) + for i in range(len(top_th)): + line_out_b2t.append([top1[top_th[i]][0], top_th[i]]) + line_out_t2b.append([top1[top_th[i]][1], top_th[i]]) + line_out = line_out_b2t[::-1] + line_out_t2b + # print(line_out) + return line_out + +def boundary_to_mask_ic(bo_x, bo_y, name, num): + + # NMS Hmap and Vmap + Vmap = _nms_x(bo_x, kernel=5) + Hmap = _nms_y(bo_y, kernel=3) + Vmap = Vmap[0] + Hmap = Hmap[0] + ploys_Alfa_x = Vmap.clone().numpy() + ploys_Alfa_y = Hmap.clone().numpy() + + # Threshold Hmap and Vmap + thresh = 0.5 + ploys_Alfa_x[ploys_Alfa_x < thresh] = 0 + ploys_Alfa_x[ploys_Alfa_x >= thresh] = 1 + ploys_Alfa_y[ploys_Alfa_y < thresh] = 0 + ploys_Alfa_y[ploys_Alfa_y >= thresh] = 1 + # Output points with strong texture inforamtion in both maps + ploys_Alfa = ploys_Alfa_x + ploys_Alfa_y + ploys_Alfa[ploys_Alfa < 2] = 0 + ploys_Alfa[ploys_Alfa == 2] = 1 + img_draw = np.zeros([ploys_Alfa_y.shape[-1], ploys_Alfa_y.shape[-1]], dtype=np.uint8) + + # calculate polygon by Alpha-Shape Algorithm + if ploys_Alfa.sum() == 0: + return img_draw + ploys_Alfa_inds = np.argwhere(ploys_Alfa == 1) + zero_detect_x = ploys_Alfa_inds[:, 0] - ploys_Alfa_inds[0, 0] + zero_detect_y = ploys_Alfa_inds[:, 1] - ploys_Alfa_inds[0, 1] + if np.where(zero_detect_x != 0)[0].shape[0] == 0 or np.where(zero_detect_y != 0)[0].shape[0] == 0 or \ + ploys_Alfa_inds.shape[0] < 4: + draw_line = ploys_Alfa_inds[np.newaxis, np.newaxis, :, :] + cv2.fillPoly(img_draw, draw_line, 1) + return img_draw + ploys_Alfa_inds = ploys_Alfa_inds.tolist() + ploys_Alfa_inds = [tuple(ploys_Alfa_ind) for ploys_Alfa_ind in ploys_Alfa_inds] + lines = getAlfaShapes(ploys_Alfa_inds, alfas=[1]) + draw_line = np.array(lines) + if len(draw_line.shape) == 4: + if draw_line.shape[1] == 1: + draw_line[0, 0, :, :] = draw_line[0, 0, :, ::-1] + cv2.fillPoly(img_draw, draw_line, 1) + else: + i_draw = 0 + for draw_l in draw_line[0]: + img_draw_new = np.zeros([28, 28], dtype=np.uint8) + draw_l = draw_l[np.newaxis, np.newaxis, :, :] + cv2.fillPoly(img_draw, np.int32(draw_l), 1) + cv2.fillPoly(img_draw_new, np.int32(draw_l), 1) + i_draw += 1 + + else: + for i, line in enumerate(lines[0]): + draw_line = np.array(line) + draw_line = draw_line[np.newaxis, np.newaxis, :, :] + draw_line[0, 0, :, :] = draw_line[0, 0, :, ::-1] + cv2.fillPoly(img_draw, draw_line, 1) + return img_draw + +def boundary_to_mask_ctw(bo_x,bo_y, name, num, image_name_name,p_temp_box): + w_half = (p_temp_box[2] - p_temp_box[0]) * .5 + h_half = (p_temp_box[3] - p_temp_box[1]) * .5 + thresh_total = 0.5 + + if w_half >= h_half: + # point re-scoring + bo_x = _nms_x(bo_x, kernel=9) + bo_x = bo_x[0] + bo_y = bo_y[0] + ploys_Alfa_x = bo_x.clone().numpy() + ploys_Alfa_y = bo_y.clone().numpy() + thresh_x = thresh_total + thresh_y = thresh_total + ploys_Alfa_x_1 = bo_x.clone().numpy() + ploys_Alfa_y_1 = bo_y.clone().numpy() + ploys_Alfa__1 = ploys_Alfa_x_1 + ploys_Alfa_y_1 + ploys_Alfa_x[ploys_Alfa_x < thresh_x] = 0 + ploys_Alfa_x[ploys_Alfa_x >= thresh_x] = 1 + ploys_Alfa_y[ploys_Alfa_y < thresh_y] = 0 + ploys_Alfa_y[ploys_Alfa_y >= thresh_y] = 1 + ploys_Alfa = ploys_Alfa_x + ploys_Alfa_y + ploys_Alfa[ploys_Alfa < 2] = 0 + ploys_Alfa[ploys_Alfa == 2] = 1 + ploys_Alfa *= ploys_Alfa__1 + # rebuild text region from contour points + img_draw = np.zeros([ploys_Alfa_y.shape[-1], ploys_Alfa_y.shape[-1]], dtype=np.uint8) + if ploys_Alfa.sum() == 0: + return img_draw + lines = 
CTW_order_lr(ploys_Alfa) + else: + bo_y = _nms_y(bo_y,kernel=9) + bo_x = bo_x[0] + bo_y = bo_y[0] + ploys_Alfa_x = bo_x.clone().numpy() + ploys_Alfa_y = bo_y.clone().numpy() + thresh_x = thresh_total + thresh_y = thresh_total + ploys_Alfa_x_1 = bo_x.clone().numpy() + ploys_Alfa_y_1 = bo_y.clone().numpy() + ploys_Alfa__1 = ploys_Alfa_x_1 + ploys_Alfa_y_1 + ploys_Alfa_x[ploys_Alfa_x < thresh_x] = 0 + ploys_Alfa_x[ploys_Alfa_x >= thresh_x] = 1 + ploys_Alfa_y[ploys_Alfa_y < thresh_y] = 0 + ploys_Alfa_y[ploys_Alfa_y >= thresh_y] = 1 + ploys_Alfa = ploys_Alfa_x + ploys_Alfa_y + ploys_Alfa[ploys_Alfa < 2] = 0 + ploys_Alfa[ploys_Alfa == 2] = 1 + ploys_Alfa *= ploys_Alfa__1 + img_draw = np.zeros([ploys_Alfa_y.shape[-1], ploys_Alfa_y.shape[-1]], dtype=np.uint8) + if ploys_Alfa.sum() == 0: + return img_draw + lines = CTW_order_bt(ploys_Alfa) + if len(lines) <=10: + return img_draw + draw_line = np.array(lines) + draw_line = draw_line[np.newaxis, np.newaxis, :, :] + cv2.fillPoly(img_draw, draw_line, 1) + img_draw = img_draw.astype(np.uint8) + kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (5, 5)) + img_draw = cv2.morphologyEx(img_draw, cv2.MORPH_CLOSE, kernel) + return img_draw + +def mask_to_roRect(mask, img_shape): + ## convert mask into rotated rect + e = mask[0, :, :] + _, countours, hier = cv2.findContours(e.clone().numpy(), cv2.RETR_CCOMP, cv2.CHAIN_APPROX_NONE) # Aarlog + if len(countours) == 0: + return np.zeros((1, 8)) + t_c = countours[0].copy() + quad = contour_to_xys(t_c, img_shape) + return quad + + +def mask_to_contours(mask, img_shape): + e = mask[0, :, :] + + _, countours, hier = cv2.findContours(e.clone().numpy(), cv2.RETR_CCOMP, cv2.CHAIN_APPROX_NONE) # Aarlog + + if len(countours) == 0: + return np.zeros((1, 8)) + t_c = countours[0].copy() + quad = contour_to_valid(t_c, img_shape) + return quad + + +def write_result_as_txt(image_name, bboxes, path): + if not os.path.exists(path): + os.makedirs(path) + + filename = io_.join_path(path, '%s.txt' % (image_name)) + lines = [] + for b_idx, bbox in enumerate(bboxes): + if len(bbox) < 6: + continue + values = [int(v) for v in bbox] + # line = "%d, %d, %d, %d, %d, %d, %d, %d\n"%tuple(values) + line = "%d" % values[0] + for v_id in range(1, len(values)): + line += ", %d" % values[v_id] + line += '\n' + lines.append(line) + io_.write_lines(filename, lines) + + +def prepare_for_boundary_segmentation(predictions, dataset): + import pycocotools.mask as mask_util + import numpy as np + + masker = Masker(threshold=0.5, padding=1) + coco_results = [] + + for image_id, prediction in tqdm(enumerate(predictions)): + original_id = dataset.id_to_img_map[image_id] + image_name = dataset.coco.imgs[original_id]["file_name"].split('.')[0] + im_w_name = dataset.coco.imgs[original_id]["file_name"] + if len(prediction) == 0: + continue + + # TODO replace with get_img_info? 
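+        # rescale the prediction to the original image resolution, then decode the
+        # predicted x/y boundary maps (mask_x / mask_y) into per-instance text masks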
+ image_width = dataset.coco.imgs[original_id]["width"] + image_height = dataset.coco.imgs[original_id]["height"] + prediction = prediction.resize((image_width, image_height)) + masks_x = prediction.get_field("mask_x") + masks_y = prediction.get_field("mask_y") + + if 'ic15' in cfg.DATASETS.TEST[0]: + masks = [boundary_to_mask_ic(mask_x, mask_y, dataset.coco.imgs[original_id]["file_name"], number) for + mask_x, mask_y, number in zip(masks_x, masks_y,list(range(masks_x.shape[0])))] + elif 'CTW' in cfg.DATASETS.TEST[0]: + masks = [boundary_to_mask_ctw(mask_x, mask_y, dataset.coco.imgs[original_id]["file_name"], number, image_name, + p_temp) for + mask_x, mask_y, number, p_temp in zip(masks_x, masks_y, + list(range(masks_x.shape[0])), prediction.bbox)] + else: + print('Please add your own construction code!') + assert 1<0 + + masks = torch.from_numpy(np.array(masks)[:, np.newaxis, :, :]) + # Masker is necessary only if masks haven't been already resized. + if list(masks.shape[-2:]) != [image_height, image_width]: + masks = masker(masks.expand(1, -1, -1, -1, -1), prediction) + masks = masks[0] + + scores = prediction.get_field("scores").tolist() + labels = prediction.get_field("labels").tolist() + if 'ic15' in cfg.DATASETS.TEST[0]: + rects = [mask_to_roRect(mask, [image_height, image_width]) for mask in masks] + if 'CTW' in cfg.DATASETS.TEST[0]: + contours = [mask_to_contours(mask, [image_height, image_width]) for mask in masks] + # output for evaluation + write_result_as_txt(image_name, contours, './output/ctw/results.txt') + # visualization + if cfg.DATASETS.Test_Visual: + im_write = cv2.imread( + '../ct/dataset/ctw/ctw_test_images/' + im_w_name)[:, :,::-1] + for box in contours: + box = np.array(box) + box = np.around(box).astype(np.int32) + cv2.polylines(im_write[:, :, ::-1], [box.astype(np.int32).reshape((-1, 1, 2))], True, color=(0, 255, 0), thickness=2) # 0,255,255 y 0,255,0 g + cv2.imwrite('./det_visual/' + im_w_name,im_write[:, :, ::-1]) + + if 'ic15' in cfg.DATASETS.TEST[0]: + mapped_labels = [dataset.contiguous_category_id_to_json_id[i] for i in labels] + esd = [] + for k, rect in enumerate(rects): + if rect.all() == 0: + continue + else: + esd.append( + { + "image_id": original_id, + "category_id": mapped_labels[k], + "seg_rorect": rect.tolist(), + "score": scores[k], + } + ) + if cfg.PROCESS.PNMS: + pnms_thresh = cfg.PROCESS.NMS_THRESH + keep = esd_pnms(esd, pnms_thresh) + new_esd = [] + for i in keep: + new_esd.append(esd[i]) + coco_results.extend(new_esd) + # visualization + if cfg.DATASETS.Test_Visual: + im_write = cv2.imread( + '../ct/dataset/ic15/ic15_test_images/' + im_w_name)[ + :, :, ::-1] + for i in keep: + box = esd[i] + # print(box) + # assert 1<0 + box = np.array(box['seg_rorect']) + box = np.around(box).astype(np.int32) + cv2.polylines(im_write[:, :, ::-1], [box.astype(np.int32).reshape((-1, 1, 2))], True, + color=(0, 255, 0), thickness=2) # 0,255,255 y 0,255,0 g + cv2.imwrite('./det_visual/' + im_w_name, im_write[:, :, ::-1]) + else: + coco_results.extend(esd) + + + return coco_results + +def ke_to_quad(ke, mty, img_shape): + mt = mty[:].argmax() + quad = paraToQuad_v3(ke, mt) + return quad + + +# polynms +def py_cpu_pnms(dets, scores, thresh): + pts = [] + for det in dets: + pts.append([[det[i][0], det[i][1]] for i in range(len(det))]) + order = scores.argsort()[::-1] + areas = np.zeros(scores.shape) + order = scores.argsort()[::-1] + inter_areas = np.zeros((scores.shape[0], scores.shape[0])) + for il in range(len(pts)): + poly = Polygon(pts[il]) + areas[il] = 
poly.area + for jl in range(il, len(pts)): + polyj = Polygon(pts[jl]) + try: + inS = poly.intersection(polyj) + except: + print(poly, polyj) + inter_areas[il][jl] = inS.area + inter_areas[jl][il] = inS.area + + keep = [] + while order.size > 0: + i = order[0] + keep.append(i) + ovr = inter_areas[i][order[1:]] / (areas[i] + areas[order[1:]] - inter_areas[i][order[1:]]) + inds = np.where(ovr <= thresh)[0] + order = order[inds + 1] + return keep + + +def esd_pnms(esd, pnms_thresh): + scores = [] + dets = [] + for ele in esd: + score = ele['score'] + quad = ele['seg_rorect'] + # det = np.array([[quad[0][0], quad[0][1]], [quad[1][0], quad[1][1]],[quad[2][0], quad[2][1]],[quad[3][0], quad[3][1]]]) + det = np.array([[quad[0], quad[1]], [quad[2], quad[3]], [quad[4], quad[5]], [quad[6], quad[7]]]) + scores.append(score) + dets.append(det) + scores = np.array(scores) + dets = np.array(dets) + keep = py_cpu_pnms(dets, scores, pnms_thresh) + return keep + + +# inspired from Detectron +def evaluate_box_proposals( + predictions, dataset, thresholds=None, area="all", limit=None +): + """Evaluate detection proposal recall metrics. This function is a much + faster alternative to the official COCO API recall evaluation code. However, + it produces slightly different results. + """ + # Record max overlap value for each gt box + # Return vector of overlap values + areas = { + "all": 0, + "small": 1, + "medium": 2, + "large": 3, + "96-128": 4, + "128-256": 5, + "256-512": 6, + "512-inf": 7, + } + area_ranges = [ + [0 ** 2, 1e5 ** 2], # all + [0 ** 2, 32 ** 2], # small + [32 ** 2, 96 ** 2], # medium + [96 ** 2, 1e5 ** 2], # large + [96 ** 2, 128 ** 2], # 96-128 + [128 ** 2, 256 ** 2], # 128-256 + [256 ** 2, 512 ** 2], # 256-512 + [512 ** 2, 1e5 ** 2], + ] # 512-inf + assert area in areas, "Unknown area range: {}".format(area) + area_range = area_ranges[areas[area]] + gt_overlaps = [] + num_pos = 0 + + for image_id, prediction in enumerate(predictions): + original_id = dataset.id_to_img_map[image_id] + + # TODO replace with get_img_info? 
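+        # rescale proposals to the original image size so IoU against the COCO
+        # ground-truth boxes is computed in the same coordinate frame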
+ image_width = dataset.coco.imgs[original_id]["width"] + image_height = dataset.coco.imgs[original_id]["height"] + prediction = prediction.resize((image_width, image_height)) + + # sort predictions in descending order + # TODO maybe remove this and make it explicit in the documentation + inds = prediction.get_field("objectness").sort(descending=True)[1] + prediction = prediction[inds] + + ann_ids = dataset.coco.getAnnIds(imgIds=original_id) + anno = dataset.coco.loadAnns(ann_ids) + gt_boxes = [obj["bbox"] for obj in anno if obj["iscrowd"] == 0] + gt_boxes = torch.as_tensor(gt_boxes).reshape(-1, 4) # guard against no boxes + gt_boxes = BoxList(gt_boxes, (image_width, image_height), mode="xywh").convert( + "xyxy" + ) + gt_areas = torch.as_tensor([obj["area"] for obj in anno if obj["iscrowd"] == 0]) + + if len(gt_boxes) == 0: + continue + + valid_gt_inds = (gt_areas >= area_range[0]) & (gt_areas <= area_range[1]) + gt_boxes = gt_boxes[valid_gt_inds] + + num_pos += len(gt_boxes) + + if len(gt_boxes) == 0: + continue + + if len(prediction) == 0: + continue + + if limit is not None and len(prediction) > limit: + prediction = prediction[:limit] + + overlaps = boxlist_iou(prediction, gt_boxes) + + _gt_overlaps = torch.zeros(len(gt_boxes)) + for j in range(min(len(prediction), len(gt_boxes))): + # find which proposal box maximally covers each gt box + # and get the iou amount of coverage for each gt box + max_overlaps, argmax_overlaps = overlaps.max(dim=0) + + # find which gt box is 'best' covered (i.e. 'best' = most iou) + gt_ovr, gt_ind = max_overlaps.max(dim=0) + assert gt_ovr >= 0 + # find the proposal box that covers the best covered gt box + box_ind = argmax_overlaps[gt_ind] + # record the iou coverage of this gt box + _gt_overlaps[j] = overlaps[box_ind, gt_ind] + assert _gt_overlaps[j] == gt_ovr + # mark the proposal box and the gt box as used + overlaps[box_ind, :] = -1 + overlaps[:, gt_ind] = -1 + + # append recorded iou coverage level + gt_overlaps.append(_gt_overlaps) + gt_overlaps = torch.cat(gt_overlaps, dim=0) + gt_overlaps, _ = torch.sort(gt_overlaps) + + if thresholds is None: + step = 0.05 + thresholds = torch.arange(0.5, 0.95 + 1e-5, step, dtype=torch.float32) + recalls = torch.zeros_like(thresholds) + # compute recall for each iou threshold + for i, t in enumerate(thresholds): + recalls[i] = (gt_overlaps >= t).float().sum() / float(num_pos) + # ar = 2 * np.trapz(recalls, thresholds) + ar = recalls.mean() + return { + "ar": ar, + "recalls": recalls, + "thresholds": thresholds, + "gt_overlaps": gt_overlaps, + "num_pos": num_pos, + } + + +def evaluate_predictions_on_coco( + coco_gt, coco_results, json_result_file, iou_type="bbox" +): + import json + + print('writing results to ' + json_result_file) + with open(json_result_file, "w") as f: + json.dump(coco_results, f) + + # from pycocotools.cocoeval import COCOeval + + # coco_dt = coco_gt.loadRes(str(json_result_file)) + # # coco_dt = coco_gt.loadRes(coco_results) + # coco_eval = COCOeval(coco_gt, coco_dt, iou_type) + # coco_eval.evaluate() + # coco_eval.accumulate() + # coco_eval.summarize() + # return coco_eval + return None + + +class COCOResults(object): + METRICS = { + "bbox": ["AP", "AP50", "AP75", "APs", "APm", "APl"], + "segm": ["AP", "AP50", "AP75", "APs", "APm", "APl"], + "box_proposal": [ + "AR@100", + "ARs@100", + "ARm@100", + "ARl@100", + "AR@1000", + "ARs@1000", + "ARm@1000", + "ARl@1000", + ], + "keypoint": ["AP", "AP50", "AP75", "APm", "APl"], + } + + def __init__(self, *iou_types): + allowed_types = ("box_proposal", 
"bbox", "segm") + assert all(iou_type in allowed_types for iou_type in iou_types) + results = OrderedDict() + for iou_type in iou_types: + results[iou_type] = OrderedDict( + [(metric, -1) for metric in COCOResults.METRICS[iou_type]] + ) + self.results = results + + def update(self, coco_eval): + if coco_eval is None: + return + from pycocotools.cocoeval import COCOeval + + assert isinstance(coco_eval, COCOeval) + s = coco_eval.stats + iou_type = coco_eval.params.iouType + res = self.results[iou_type] + metrics = COCOResults.METRICS[iou_type] + for idx, metric in enumerate(metrics): + res[metric] = s[idx] + + def __repr__(self): + # TODO make it pretty + return repr(self.results) + + +def check_expected_results(results, expected_results, sigma_tol): + if not expected_results: + return + + logger = logging.getLogger("maskrcnn_benchmark.inference") + for task, metric, (mean, std) in expected_results: + actual_val = results.results[task][metric] + lo = mean - sigma_tol * std + hi = mean + sigma_tol * std + ok = (lo < actual_val) and (actual_val < hi) + msg = ( + "{} > {} sanity check (actual vs. expected): " + "{:.3f} vs. mean={:.4f}, std={:.4}, range=({:.4f}, {:.4f})" + ).format(task, metric, actual_val, mean, std, lo, hi) + if not ok: + msg = "FAIL: " + msg + logger.error(msg) + else: + msg = "PASS: " + msg + logger.info(msg) + + +def paraToQuad_v3(kes, mt): + ms = (kes[0, 0], kes[6, 0]) + xs = [kes[i, 0] for i in range(1, 5)] # 1 2 3 4 + ys = [kes[i, 0] for i in range(7, 11)] # 7 8 9 10 + crs = (kes[5, 0], kes[11, 0]) + ms = Point(ms) + crs = Point(crs) + vp = [] + all_types = [[1, 2, 3, 4], [1, 2, 4, 3], [1, 3, 2, 4], [1, 3, 4, 2], [1, 4, 2, 3], [1, 4, 3, 2], \ + [2, 1, 3, 4], [2, 1, 4, 3], [2, 3, 1, 4], [2, 3, 4, 1], [2, 4, 1, 3], [2, 4, 3, 1], \ + [3, 1, 2, 4], [3, 1, 4, 2], [3, 2, 1, 4], [3, 2, 4, 1], [3, 4, 1, 2], [3, 4, 2, 1], \ + [4, 1, 2, 3], [4, 1, 3, 2], [4, 2, 1, 3], [4, 2, 3, 1], [4, 3, 1, 2], [4, 3, 2, 1]] + all_types = [[all_types[iat][0] - 1, all_types[iat][1] - 1, all_types[iat][2] - 1, all_types[iat][3] - 1] for iat in + range(24)] + + tpe = all_types[mt] + p1 = Point((xs[0], ys[tpe[0]])) + p2 = Point((xs[1], ys[tpe[1]])) + p3 = Point((xs[2], ys[tpe[2]])) + p4 = Point((xs[3], ys[tpe[3]])) + pts = [p1, p2, p3, p4] + scs = [0, 1, 2, 3] + for it in itertools.permutations(scs, 4): + poly = Polygon([(pts[it[0]].x, pts[it[0]].y), (pts[it[1]].x, pts[it[1]].y),(pts[it[2]].x, pts[it[2]].y), (pts[it[3]].x, pts[it[3]].y)]) + if poly.is_valid and ms.within(poly) and crs.within(poly): + quad = [(pts[it[0]].x, pts[it[0]].y), (pts[it[1]].x, pts[it[1]].y),(pts[it[2]].x, pts[it[2]].y), (pts[it[3]].x, pts[it[3]].y)] + lr = LinearRing(quad) + if lr.is_ccw: + return [(int(iq[0]), int(iq[1])) for iq in quad] + else: + quad = [quad[0], quad[3], quad[2], quad[1]] + return [(int(iq[0]), int(iq[1])) for iq in quad] + + return [(int(iq[0]), int(iq[1])) for iq in quad] + + return None + diff --git a/maskrcnn_benchmark/data/datasets/list_dataset.py b/maskrcnn_benchmark/data/datasets/list_dataset.py new file mode 100644 index 0000000000000000000000000000000000000000..9058d35b3d4279048732074f4a8dbb6edd4c9ed0 --- /dev/null +++ b/maskrcnn_benchmark/data/datasets/list_dataset.py @@ -0,0 +1,36 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 
+""" +Simple dataset class that wraps a list of path names +""" + +from PIL import Image + +from maskrcnn_benchmark.structures.bounding_box import BoxList + + +class ListDataset(object): + def __init__(self, image_lists, transforms=None): + self.image_lists = image_lists + self.transforms = transforms + + def __getitem__(self, item): + img = Image.open(self.image_lists[item]).convert("RGB") + + # dummy target + w, h = img.size + target = BoxList([[0, 0, w, h]], img.size, mode="xyxy") + + if self.transforms is not None: + img, target = self.transforms(img, target) + + return img, target + + def __len__(self): + return len(self.image_lists) + + def get_img_info(self, item): + """ + Return the image dimensions for the image, without + loading and pre-processing it + """ + pass diff --git a/maskrcnn_benchmark/data/datasets/voc.py b/maskrcnn_benchmark/data/datasets/voc.py new file mode 100644 index 0000000000000000000000000000000000000000..459985bd12a47ffe5a246cbf8e00b7930b991a1c --- /dev/null +++ b/maskrcnn_benchmark/data/datasets/voc.py @@ -0,0 +1,134 @@ +import os + +import torch +import torch.utils.data +from PIL import Image +import sys + +if sys.version_info[0] == 2: + import xml.etree.cElementTree as ET +else: + import xml.etree.ElementTree as ET + + +from maskrcnn_benchmark.structures.bounding_box import BoxList + + +class PascalVOCDataset(torch.utils.data.Dataset): + + CLASSES = ( + "__background__ ", + "aeroplane", + "bicycle", + "bird", + "boat", + "bottle", + "bus", + "car", + "cat", + "chair", + "cow", + "diningtable", + "dog", + "horse", + "motorbike", + "person", + "pottedplant", + "sheep", + "sofa", + "train", + "tvmonitor", + ) + + def __init__(self, data_dir, split, use_difficult=False, transforms=None): + self.root = data_dir + self.image_set = split + self.keep_difficult = use_difficult + self.transforms = transforms + + self._annopath = os.path.join(self.root, "Annotations", "%s.xml") + self._imgpath = os.path.join(self.root, "JPEGImages", "%s.jpg") + self._imgsetpath = os.path.join(self.root, "ImageSets", "Main", "%s.txt") + + with open(self._imgsetpath % self.image_set) as f: + self.ids = f.readlines() + self.ids = [x.strip("\n") for x in self.ids] + self.id_to_img_map = {k: v for k, v in enumerate(self.ids)} + + cls = PascalVOCDataset.CLASSES + self.class_to_ind = dict(zip(cls, range(len(cls)))) + + def __getitem__(self, index): + img_id = self.ids[index] + img = Image.open(self._imgpath % img_id).convert("RGB") + + target = self.get_groundtruth(index) + target = target.clip_to_image(remove_empty=True) + + if self.transforms is not None: + img, target = self.transforms(img, target) + + return img, target, index + + def __len__(self): + return len(self.ids) + + def get_groundtruth(self, index): + img_id = self.ids[index] + anno = ET.parse(self._annopath % img_id).getroot() + anno = self._preprocess_annotation(anno) + + height, width = anno["im_info"] + target = BoxList(anno["boxes"], (width, height), mode="xyxy") + target.add_field("labels", anno["labels"]) + target.add_field("difficult", anno["difficult"]) + return target + + def _preprocess_annotation(self, target): + boxes = [] + gt_classes = [] + difficult_boxes = [] + TO_REMOVE = 1 + + for obj in target.iter("object"): + difficult = int(obj.find("difficult").text) == 1 + if not self.keep_difficult and difficult: + continue + name = obj.find("name").text.lower().strip() + bb = obj.find("bndbox") + # Make pixel indexes 0-based + # Refer to 
"https://github.com/rbgirshick/py-faster-rcnn/blob/master/lib/datasets/pascal_voc.py#L208-L211" + box = [ + bb.find("xmin").text, + bb.find("ymin").text, + bb.find("xmax").text, + bb.find("ymax").text, + ] + bndbox = tuple( + map(lambda x: x - TO_REMOVE, list(map(int, box))) + ) + + boxes.append(bndbox) + gt_classes.append(self.class_to_ind[name]) + difficult_boxes.append(difficult) + + size = target.find("size") + im_info = tuple(map(int, (size.find("height").text, size.find("width").text))) + + res = { + "boxes": torch.tensor(boxes, dtype=torch.float32), + "labels": torch.tensor(gt_classes), + "difficult": torch.tensor(difficult_boxes), + "im_info": im_info, + } + return res + + def get_img_info(self, index): + img_id = self.ids[index] + anno = ET.parse(self._annopath % img_id).getroot() + size = anno.find("size") + im_info = tuple(map(int, (size.find("height").text, size.find("width").text))) + return {"height": im_info[0], "width": im_info[1]} + + def map_class_id_to_class_name(self, class_id): + return PascalVOCDataset.CLASSES[class_id] diff --git a/maskrcnn_benchmark/data/datasets/word_dataset.py b/maskrcnn_benchmark/data/datasets/word_dataset.py new file mode 100644 index 0000000000000000000000000000000000000000..bc1653472826506811e09aad785f58443ea584af --- /dev/null +++ b/maskrcnn_benchmark/data/datasets/word_dataset.py @@ -0,0 +1,107 @@ +import torch +import torchvision + +from maskrcnn_benchmark.structures.bounding_box import BoxList +from maskrcnn_benchmark.structures.segmentation_mask import SegmentationMask + +from maskrcnn_benchmark.structures.ke import textKES +from maskrcnn_benchmark.structures.mty import MTY + +DEBUG = 0 + +class WordDataset(torchvision.datasets.coco.CocoDetection): + def __init__( + self, ann_file, root, remove_images_without_annotations, transforms=None + ): + super(WordDataset, self).__init__(root, ann_file) + # sort indices for reproducible results + self.ids = sorted(self.ids) + + # filter images without detection annotations + if remove_images_without_annotations: + self.ids = [ + img_id + for img_id in self.ids + if len(self.coco.getAnnIds(imgIds=img_id, iscrowd=None)) > 0 + ] + + self.json_category_id_to_contiguous_id = { + v: i + 1 for i, v in enumerate(self.coco.getCatIds()) + } + self.contiguous_category_id_to_json_id = { + v: k for k, v in self.json_category_id_to_contiguous_id.items() + } + self.id_to_img_map = {k: v for k, v in enumerate(self.ids)} + self.transforms = transforms + + def kes_encode(self, kes): + kes_encode = [] + for i in kes: + mnx = i[0] + mny = i[1] + assert(len(i)%3 == 0) + npts = int(len(i)/3-2) + for index in range(npts): + i[3+index*3] = (i[3+index*3]+mnx)/2 + i[4+index*3] = (i[4+index*3]+mny)/2 + kes_encode.append(i) + return kes_encode + + def kes_gen(self, kes): + kes_gen_out = [] + for i in kes: + mnx = i[0] + mny = i[1] + cx= i[27] + cy= i[28] + assert(len(i)%3 == 0) + ot = [mnx, i[3],i[6],i[9],i[12], cx,\ + mny, i[16],i[19],i[22],i[25], cy] + kes_gen_out.append(ot) + return kes_gen_out + + def __getitem__(self, idx): + img, anno = super(WordDataset, self).__getitem__(idx) + # filter crowd annotations + # TODO might be better to add an extra field + anno = [obj for obj in anno if obj["iscrowd"] == 0] + + boxes = [obj["bbox"] for obj in anno] + if DEBUG: print('len(boxes)', len(boxes), boxes[0]) + boxes = torch.as_tensor(boxes).reshape(-1, 4) # guard against no boxes + target = BoxList(boxes, img.size, mode="xywh").convert("xyxy") + + classes = [obj["category_id"] for obj in anno] + if DEBUG: print('len(classes)', 
len(classes), classes[0]) + classes = [self.json_category_id_to_contiguous_id[c] for c in classes] + classes = torch.tensor(classes) + target.add_field("labels", classes) + + masks = [obj["segmentation"] for obj in anno] + if DEBUG: print('len(masks)', len(masks), masks[0]) + masks = SegmentationMask(masks, img.size) + target.add_field("masks", masks) + + if anno and 'keypoints' in anno[0]: + kes = [obj["keypoints"] for obj in anno] + kes = self.kes_gen(kes) + if DEBUG: print('len(kes)', len(kes), kes[0]) + kes = textKES(kes, img.size) + target.add_field("kes", kes) + + if anno and 'match_type' in anno[0]: + mty = [obj["match_type"] for obj in anno] + mty = MTY(mty, img.size) + target.add_field("mty", mty) + + target = target.clip_to_image(remove_empty=True) + + if self.transforms is not None: + img, target = self.transforms(img, target) + + return img, target, idx + + def get_img_info(self, index): + img_id = self.id_to_img_map[index] + img_data = self.coco.imgs[img_id] + return img_data diff --git a/maskrcnn_benchmark/data/samplers/__init__.py b/maskrcnn_benchmark/data/samplers/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..27982cbe68c6173a911e700273f25973acbf04bd --- /dev/null +++ b/maskrcnn_benchmark/data/samplers/__init__.py @@ -0,0 +1,6 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +from .distributed import DistributedSampler +from .grouped_batch_sampler import GroupedBatchSampler +from .iteration_based_batch_sampler import IterationBasedBatchSampler + +__all__ = ["DistributedSampler", "GroupedBatchSampler", "IterationBasedBatchSampler"] diff --git a/maskrcnn_benchmark/data/samplers/__pycache__/__init__.cpython-37.pyc b/maskrcnn_benchmark/data/samplers/__pycache__/__init__.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..7467061ce7570f7d5ec8e73311376709ac572853 Binary files /dev/null and b/maskrcnn_benchmark/data/samplers/__pycache__/__init__.cpython-37.pyc differ diff --git a/maskrcnn_benchmark/data/samplers/__pycache__/distributed.cpython-37.pyc b/maskrcnn_benchmark/data/samplers/__pycache__/distributed.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..76a9e61b1ed44a3ee95e336ce85ef62429b884a2 Binary files /dev/null and b/maskrcnn_benchmark/data/samplers/__pycache__/distributed.cpython-37.pyc differ diff --git a/maskrcnn_benchmark/data/samplers/__pycache__/grouped_batch_sampler.cpython-37.pyc b/maskrcnn_benchmark/data/samplers/__pycache__/grouped_batch_sampler.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d87ffbb208e8739735c8648a23454c3cfc9033a4 Binary files /dev/null and b/maskrcnn_benchmark/data/samplers/__pycache__/grouped_batch_sampler.cpython-37.pyc differ diff --git a/maskrcnn_benchmark/data/samplers/__pycache__/iteration_based_batch_sampler.cpython-37.pyc b/maskrcnn_benchmark/data/samplers/__pycache__/iteration_based_batch_sampler.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..1a679ada15a0288db36dcb2181e1dfbc537f4f37 Binary files /dev/null and b/maskrcnn_benchmark/data/samplers/__pycache__/iteration_based_batch_sampler.cpython-37.pyc differ diff --git a/maskrcnn_benchmark/data/samplers/distributed.py b/maskrcnn_benchmark/data/samplers/distributed.py new file mode 100644 index 0000000000000000000000000000000000000000..27a280f9ac767e299f996c8c0e1ba4c37a4f2759 --- /dev/null +++ b/maskrcnn_benchmark/data/samplers/distributed.py @@ -0,0 +1,66 @@ +# Copyright (c) Facebook, Inc. 
and its affiliates. All Rights Reserved. +# Code is copy-pasted exactly as in torch.utils.data.distributed. +# FIXME remove this once c10d fixes the bug it has +import math +import torch +import torch.distributed as dist +from torch.utils.data.sampler import Sampler + + +class DistributedSampler(Sampler): + """Sampler that restricts data loading to a subset of the dataset. + It is especially useful in conjunction with + :class:`torch.nn.parallel.DistributedDataParallel`. In such case, each + process can pass a DistributedSampler instance as a DataLoader sampler, + and load a subset of the original dataset that is exclusive to it. + .. note:: + Dataset is assumed to be of constant size. + Arguments: + dataset: Dataset used for sampling. + num_replicas (optional): Number of processes participating in + distributed training. + rank (optional): Rank of the current process within num_replicas. + """ + + def __init__(self, dataset, num_replicas=None, rank=None, shuffle=True): + if num_replicas is None: + if not dist.is_available(): + raise RuntimeError("Requires distributed package to be available") + num_replicas = dist.get_world_size() + if rank is None: + if not dist.is_available(): + raise RuntimeError("Requires distributed package to be available") + rank = dist.get_rank() + self.dataset = dataset + self.num_replicas = num_replicas + self.rank = rank + self.epoch = 0 + self.num_samples = int(math.ceil(len(self.dataset) * 1.0 / self.num_replicas)) + self.total_size = self.num_samples * self.num_replicas + self.shuffle = shuffle + + def __iter__(self): + if self.shuffle: + # deterministically shuffle based on epoch + g = torch.Generator() + g.manual_seed(self.epoch) + indices = torch.randperm(len(self.dataset), generator=g).tolist() + else: + indices = torch.arange(len(self.dataset)).tolist() + + # add extra samples to make it evenly divisible + indices += indices[: (self.total_size - len(indices))] + assert len(indices) == self.total_size + + # subsample + offset = self.num_samples * self.rank + indices = indices[offset : offset + self.num_samples] + assert len(indices) == self.num_samples + + return iter(indices) + + def __len__(self): + return self.num_samples + + def set_epoch(self, epoch): + self.epoch = epoch diff --git a/maskrcnn_benchmark/data/samplers/grouped_batch_sampler.py b/maskrcnn_benchmark/data/samplers/grouped_batch_sampler.py new file mode 100644 index 0000000000000000000000000000000000000000..0960cd1f49ec7fb11bc586235653380f4b0fd02f --- /dev/null +++ b/maskrcnn_benchmark/data/samplers/grouped_batch_sampler.py @@ -0,0 +1,115 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +import itertools + +import torch +from torch.utils.data.sampler import BatchSampler +from torch.utils.data.sampler import Sampler + + +class GroupedBatchSampler(BatchSampler): + """ + Wraps another sampler to yield a mini-batch of indices. + It enforces that elements from the same group should appear in groups of batch_size. + It also tries to provide mini-batches which follows an ordering which is + as close as possible to the ordering from the original sampler. + + Arguments: + sampler (Sampler): Base sampler. + batch_size (int): Size of mini-batch. 
+ drop_uneven (bool): If ``True``, the sampler will drop the batches whose + size is less than ``batch_size`` + + """ + + def __init__(self, sampler, group_ids, batch_size, drop_uneven=False): + if not isinstance(sampler, Sampler): + raise ValueError( + "sampler should be an instance of " + "torch.utils.data.Sampler, but got sampler={}".format(sampler) + ) + self.sampler = sampler + self.group_ids = torch.as_tensor(group_ids) + assert self.group_ids.dim() == 1 + self.batch_size = batch_size + self.drop_uneven = drop_uneven + + self.groups = torch.unique(self.group_ids).sort(0)[0] + + self._can_reuse_batches = False + + def _prepare_batches(self): + dataset_size = len(self.group_ids) + # get the sampled indices from the sampler + sampled_ids = torch.as_tensor(list(self.sampler)) + # potentially not all elements of the dataset were sampled + # by the sampler (e.g., DistributedSampler). + # construct a tensor which contains -1 if the element was + # not sampled, and a non-negative number indicating the + # order where the element was sampled. + # for example. if sampled_ids = [3, 1] and dataset_size = 5, + # the order is [-1, 1, -1, 0, -1] + order = torch.full((dataset_size,), -1, dtype=torch.int64) + order[sampled_ids] = torch.arange(len(sampled_ids)) + + # get a mask with the elements that were sampled + mask = order >= 0 + + # find the elements that belong to each individual cluster + clusters = [(self.group_ids == i) & mask for i in self.groups] + # get relative order of the elements inside each cluster + # that follows the order from the sampler + relative_order = [order[cluster] for cluster in clusters] + # with the relative order, find the absolute order in the + # sampled space + permutation_ids = [s[s.sort()[1]] for s in relative_order] + # permute each cluster so that they follow the order from + # the sampler + permuted_clusters = [sampled_ids[idx] for idx in permutation_ids] + + # splits each cluster in batch_size, and merge as a list of tensors + splits = [c.split(self.batch_size) for c in permuted_clusters] + merged = tuple(itertools.chain.from_iterable(splits)) + + # now each batch internally has the right order, but + # they are grouped by clusters. Find the permutation between + # different batches that brings them as close as possible to + # the order that we have in the sampler. 
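+        # (illustrative example: with batch_size = 2, sampled_ids = [3, 1, 0, 4, 2]
+        # and clusters {0, 1, 2} / {3, 4}, merged is ([1, 0], [2], [3, 4]) and the
+        # reordering below yields [[3, 4], [1, 0], [2]], because 3 was sampled first.)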
For that, we will consider the + # ordering as coming from the first element of each batch, and sort + # correspondingly + first_element_of_batch = [t[0].item() for t in merged if t.numel() > 0] + # get and inverse mapping from sampled indices and the position where + # they occur (as returned by the sampler) + inv_sampled_ids_map = {v: k for k, v in enumerate(sampled_ids.tolist())} + # from the first element in each batch, get a relative ordering + first_index_of_batch = torch.as_tensor( + [inv_sampled_ids_map[s] for s in first_element_of_batch] + ) + + # permute the batches so that they approximately follow the order + # from the sampler + permutation_order = first_index_of_batch.sort(0)[1].tolist() + # finally, permute the batches + batches = [merged[i].tolist() for i in permutation_order] + + if self.drop_uneven: + kept = [] + for batch in batches: + if len(batch) == self.batch_size: + kept.append(batch) + batches = kept + return batches + + def __iter__(self): + if self._can_reuse_batches: + batches = self._batches + self._can_reuse_batches = False + else: + batches = self._prepare_batches() + self._batches = batches + return iter(batches) + + def __len__(self): + if not hasattr(self, "_batches"): + self._batches = self._prepare_batches() + self._can_reuse_batches = True + return len(self._batches) diff --git a/maskrcnn_benchmark/data/samplers/iteration_based_batch_sampler.py b/maskrcnn_benchmark/data/samplers/iteration_based_batch_sampler.py new file mode 100644 index 0000000000000000000000000000000000000000..93452b64696dc9b2cd2a347b8051729864bf9510 --- /dev/null +++ b/maskrcnn_benchmark/data/samplers/iteration_based_batch_sampler.py @@ -0,0 +1,31 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +from torch.utils.data.sampler import BatchSampler + + +class IterationBasedBatchSampler(BatchSampler): + """ + Wraps a BatchSampler, resampling from it until + a specified number of iterations have been sampled + """ + + def __init__(self, batch_sampler, num_iterations, start_iter=0): + self.batch_sampler = batch_sampler + self.num_iterations = num_iterations + self.start_iter = start_iter + + def __iter__(self): + iteration = self.start_iter + while iteration <= self.num_iterations: + # if the underlying sampler has a set_epoch method, like + # DistributedSampler, used for making each process see + # a different split of the dataset, then set it + if hasattr(self.batch_sampler.sampler, "set_epoch"): + self.batch_sampler.sampler.set_epoch(iteration) + for batch in self.batch_sampler: + iteration += 1 + if iteration > self.num_iterations: + break + yield batch + + def __len__(self): + return self.num_iterations diff --git a/maskrcnn_benchmark/data/transforms/__init__.py b/maskrcnn_benchmark/data/transforms/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..892b9cec0c2bc59162196ef9243e9aedcdcbaee6 --- /dev/null +++ b/maskrcnn_benchmark/data/transforms/__init__.py @@ -0,0 +1,9 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 
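+# Usage sketch (illustrative only; assumes a merged yacs cfg from
+# maskrcnn_benchmark.config and a hypothetical config path):
+#
+#   from maskrcnn_benchmark.config import cfg
+#   from maskrcnn_benchmark.data.transforms import build_transforms
+#
+#   cfg.merge_from_file("path/to/config.yaml")
+#   train_transforms = build_transforms(cfg, is_train=True)
+#   image, target = train_transforms(image, target)  # PIL image and BoxList-style target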
+from .transforms import Compose +from .transforms import Resize +from .transforms import RandomHorizontalFlip +from .transforms import ToTensor +from .transforms import Normalize +from .transforms import RandomCrop + +from .build import build_transforms diff --git a/maskrcnn_benchmark/data/transforms/__pycache__/__init__.cpython-37.pyc b/maskrcnn_benchmark/data/transforms/__pycache__/__init__.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..7de1b84ba522fa114d24097cc0476ab15a7f9608 Binary files /dev/null and b/maskrcnn_benchmark/data/transforms/__pycache__/__init__.cpython-37.pyc differ diff --git a/maskrcnn_benchmark/data/transforms/__pycache__/build.cpython-37.pyc b/maskrcnn_benchmark/data/transforms/__pycache__/build.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b6ac82a547e423c0cd236419d376f87559fe2f63 Binary files /dev/null and b/maskrcnn_benchmark/data/transforms/__pycache__/build.cpython-37.pyc differ diff --git a/maskrcnn_benchmark/data/transforms/__pycache__/transforms.cpython-37.pyc b/maskrcnn_benchmark/data/transforms/__pycache__/transforms.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..fc854cb82ddc2351287695ad3028b547b10e9d98 Binary files /dev/null and b/maskrcnn_benchmark/data/transforms/__pycache__/transforms.cpython-37.pyc differ diff --git a/maskrcnn_benchmark/data/transforms/build.py b/maskrcnn_benchmark/data/transforms/build.py new file mode 100644 index 0000000000000000000000000000000000000000..09b09a92985d23cfef9e78284584e10b66d5fde0 --- /dev/null +++ b/maskrcnn_benchmark/data/transforms/build.py @@ -0,0 +1,54 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +from . import transforms as T + + +def build_transforms(cfg, is_train=True): + if is_train: + if cfg.INPUT.MIN_SIZE_RANGE_TRAIN[0] == -1: + min_size = cfg.INPUT.MIN_SIZE_TRAIN + else: + assert len(cfg.INPUT.MIN_SIZE_RANGE_TRAIN) == 2, \ + "MIN_SIZE_RANGE_TRAIN must have two elements (lower bound, upper bound)" + min_size = range( + cfg.INPUT.MIN_SIZE_RANGE_TRAIN[0], + cfg.INPUT.MIN_SIZE_RANGE_TRAIN[1] + 1 + ) + max_size = cfg.INPUT.MAX_SIZE_TRAIN + # max_size = None + + flip_prob = 0.5 # cfg.INPUT.FLIP_PROB_TRAIN + rotate_prob = cfg.INPUT.ROTATE_PROB_TRAIN + rotate_degree = cfg.INPUT.ROTATE_DEGREE + crop_prob = cfg.INPUT.CROP_PROB_TRAIN + else: + min_size = cfg.INPUT.MIN_SIZE_TEST + max_size = cfg.INPUT.MAX_SIZE_TEST + # max_size = None + + + flip_prob = 0 + rotate_prob = 0 + rotate_degree = 0 + crop_prob = 0 + + to_bgr255 = cfg.INPUT.TO_BGR255 + normalize_transform = T.Normalize( + mean=cfg.INPUT.PIXEL_MEAN, std=cfg.INPUT.PIXEL_STD, to_bgr255=to_bgr255 + ) + + transform = T.Compose( + [ + T.RandomCrop(crop_prob), + T.RandomBrightness(crop_prob), + T.RandomContrast(crop_prob), + T.RandomHue(crop_prob), + T.RandomSaturation(crop_prob), + T.RandomGamma(crop_prob), + T.Resize(min_size, max_size), + T.RandomHorizontalFlip(flip_prob), + T.RandomRotation(rotate_prob, rotate_degree), + T.ToTensor(), + normalize_transform, + ] + ) + return transform diff --git a/maskrcnn_benchmark/data/transforms/transforms.py b/maskrcnn_benchmark/data/transforms/transforms.py new file mode 100644 index 0000000000000000000000000000000000000000..c5549a3de99e16cfb3562cece52482a111b7f506 --- /dev/null +++ b/maskrcnn_benchmark/data/transforms/transforms.py @@ -0,0 +1,468 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 
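+# All transforms in this module follow a two-argument convention: they are called as
+# transform(image, target) and return the updated pair, so geometric ops keep the
+# BoxList / SegmentationMask annotations in sync with the image. Minimal sketch
+# (illustrative sizes only):
+#
+#   resize = Resize(min_size=(800,), max_size=1333)
+#   image, target = resize(image, target)  # target is resized to match the new image size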
+import random + +import torch +import torchvision +from torchvision.transforms import functional as F + +from maskrcnn_benchmark.structures.bounding_box import BoxList +from maskrcnn_benchmark.structures.segmentation_mask import SegmentationMask + +from maskrcnn_benchmark.structures.ke import textKES +from maskrcnn_benchmark.structures.mty import MTY +import numpy as np +from PIL import Image +from shapely.geometry import * +import cv2 +from maskrcnn_benchmark.config import cfg + + +class Compose(object): + def __init__(self, transforms): + self.transforms = transforms + + def __call__(self, image, target): + for t in self.transforms: + image, target = t(image, target) + return image, target + + def __repr__(self): + format_string = self.__class__.__name__ + "(" + for t in self.transforms: + format_string += "\n" + format_string += " {0}".format(t) + format_string += "\n)" + return format_string + + +class Resize(object): + def __init__(self, min_size, max_size): + if not isinstance(min_size, (list, tuple)): + min_size = (min_size,) + self.min_size = min_size + self.max_size = max_size + + # modified from torchvision to add support for max size + def get_size(self, image_size): + + # if test ic15 + + #oh = 1200 + #ow = 2000 + #return (oh, ow) + + w, h = image_size + size = random.choice(self.min_size) + max_size = self.max_size + if max_size is not None: + min_original_size = float(min((w, h))) + max_original_size = float(max((w, h))) + if max_original_size / min_original_size * size > max_size: + size = int(round(max_size * min_original_size / max_original_size)) + + if (w <= h and w == size) or (h <= w and h == size): + return (h, w) + + if w < h: + ow = size + oh = int(size * h / w) + else: + oh = size + ow = int(size * w / h) + + return (oh, ow) + + def __call__(self, image, target): + size = self.get_size(image.size) + image = F.resize(image, size) + if isinstance(target, list): + target = [t.resize(image.size) for t in target] + else: + target = target.resize(image.size) + return image, target + + +class RandomHorizontalFlip(object): + def __init__(self, prob=0.5): + self.prob = prob + + def __call__(self, image, target): + if random.random() < self.prob: + image = F.hflip(image) + if isinstance(target, list): + target = [t.transpose(0) for t in target] + else: + target = target.transpose(0) + return image, target + + +class ToTensor(object): + def __call__(self, image, target): + return F.to_tensor(image), target + + +class Normalize(object): + def __init__(self, mean, std, to_bgr255=True): + self.mean = mean + self.std = std + self.to_bgr255 = to_bgr255 + + def __call__(self, image, target): + if self.to_bgr255: + image = image[[2, 1, 0]] * 255 + image = F.normalize(image, mean=self.mean, std=self.std) + return image, target + + +class RandomCrop(object): + """Random crop with repeatedly expanding the range to included box borders.""" + def __init__(self, prob, init_crop_size=(0.5, 1.0)): + + if (not isinstance(init_crop_size, list)) and (not isinstance(init_crop_size, tuple)): + raise ValueError('Paremeter init_crop_size should be a list or tuple!') + elif len(init_crop_size) != 2: + raise ValueError('Length of init_crop_size should be 2!') + elif not (init_crop_size[0] <= 1 and init_crop_size[0] >= 0 and init_crop_size[1] <= 1 and init_crop_size[1] >= 0): + raise ValueError('Elements of init_crop_size should be within [0, 1]!') + self.prob = prob + self.init_crop_size = init_crop_size + + def __call__(self, image, target): + if random.random() >= self.prob: + return image, 
target + + if isinstance(target, list): + target0 = target[0] + else: + target0 = target + while True: + # Initial Crop Region + crop_region = self.initial_crop_region(image) + + # Adjust Crop Region + crop_region, keep_target = self.adjust_crop_region(crop_region, target0) + if crop_region is None and keep_target is None: + continue + + if isinstance(target, list): + # check empty char + new_t1 = target[1].crop(crop_region) + if len(new_t1) < 1: return image, target + + image = image.crop(crop_region.numpy()) + if isinstance(target, list): + target0 = keep_target.crop(crop_region) + others = [t.crop(crop_region, remove_empty=True) for t in target[1:]] + target = [target0] + others + else: + target = keep_target.crop(crop_region) + + return image, target + + def initial_crop_region(self, image): + width, height = image.size + ratio_w, ratio_h = torch.empty(2).uniform_(self.init_crop_size[0], self.init_crop_size[1]) + crop_width, crop_height = int(width*ratio_w), int(height*ratio_h) + crop_xmin = torch.randint(width-crop_width, (1,)) + crop_ymin = torch.randint(height-crop_height, (1,)) + crop_xmax = crop_xmin + crop_width + crop_ymax = crop_ymin + crop_height + crop_region = torch.Tensor([crop_xmin, crop_ymin, crop_xmax, crop_ymax]) + return crop_region + + def intersect_area(self, bbox, bboxes): + inter_xmin = torch.max(bbox[0], bboxes[:, 0]) + inter_ymin = torch.max(bbox[1], bboxes[:, 1]) + inter_xmax = torch.min(bbox[2], bboxes[:, 2]) + inter_ymax = torch.min(bbox[3], bboxes[:, 3]) + inter_width = torch.max(torch.Tensor([0]), inter_xmax-inter_xmin) + inter_height = torch.max(torch.Tensor([0]), inter_ymax-inter_ymin) + return inter_width*inter_height + + def adjust_crop_region(self, crop_region, target): + keep_indies_ = torch.zeros((len(target)), dtype=torch.uint8) + while True: + inter_area = self.intersect_area(crop_region, target.bbox) + keep_indies = (inter_area > 0) + if torch.sum(keep_indies) == 0: + return None, None + keep_target = target[keep_indies] + if keep_indies.equal(keep_indies_): + return crop_region, keep_target + keep_bbox = keep_target.bbox + crop_xmin = torch.min(crop_region[0], torch.min(keep_bbox[:, 0])) + crop_ymin = torch.min(crop_region[1], torch.min(keep_bbox[:, 1])) + crop_xmax = torch.max(crop_region[2], torch.max(keep_bbox[:, 2])) + crop_ymax = torch.max(crop_region[3], torch.max(keep_bbox[:, 3])) + crop_region = torch.Tensor([crop_xmin, crop_ymin, crop_xmax, crop_ymax]) + keep_indies_ = keep_indies + +class RandomBrightness(object): + def __init__(self, prob=0.5): + self.prob = prob + + def __call__(self, image, target): + if random.random() < self.prob: + brightness_factor = random.uniform(0.5, 2) + image = F.adjust_brightness(image, brightness_factor) + return image, target + +class RandomContrast(object): + def __init__(self, prob=0.5): + self.prob = prob + + def __call__(self, image, target): + if random.random() < self.prob: + contrast_factor = random.uniform(0.5, 2) + image = F.adjust_contrast(image, contrast_factor) + return image, target + +class RandomHue(object): + def __init__(self, prob=0.5): + self.prob = prob + + def __call__(self, image, target): + if random.random() < self.prob: + hue_factor = random.uniform(-0.25, 0.25) + image = F.adjust_hue(image, hue_factor) + return image, target + +class RandomSaturation(object): + def __init__(self, prob=0.5): + self.prob = prob + + def __call__(self, image, target): + if random.random() < self.prob: + saturation_factor = random.uniform(0.5, 2) + image = F.adjust_saturation(image, saturation_factor) 
+ return image, target + +class RandomGamma(object): + def __init__(self, prob=0.5): + self.prob = prob + + def __call__(self, image, target): + if random.random() < self.prob: + gamma_factor = random.uniform(0.5, 2) + image = F.adjust_gamma(image, gamma_factor) + return image, target + + +class RandomRotation(object): + def __init__(self, prob = 0.3, degree = 5): + self.prob = prob + self.degree = degree + + def kes_encode(self, kes): + kes_encode = [] + for i in kes: + mnx = i[0] + mny = i[1] + assert(len(i)%3 == 0) + npts = int(len(i)/3-2) + for index in range(npts): + i[3+index*3] = (i[3+index*3]+mnx)/2 + i[4+index*3] = (i[4+index*3]+mny)/2 + kes_encode.append(i) + return kes_encode + + def kes_gen(self, kes): + kes_gen_out = [] + for i in kes: + mnx = i[0] + mny = i[1] + cx= i[27] + cy= i[28] + assert(len(i)%3 == 0) + ot = [mnx, i[3],i[6],i[9],i[12], cx,\ + mny, i[16],i[19],i[22],i[25], cy] + kes_gen_out.append(ot) + return kes_gen_out + + def __call__(self, image, target): + if random.random() < self.prob: + image1 = image + target1 = target + img = np.array(image) + w = image.size[0] + h = image.size[1] + pri_points = [] + for i in range(len(target.extra_fields['masks'].instances)): + assert(len(target.extra_fields['masks'].instances[i].polygons)==1), 'one text instance should have only one polygon.' + tensor_box = target.extra_fields['masks'].instances[i].polygons[0].polygons + + points_x = np.array([tensor_box[0][0],tensor_box[0][2],tensor_box[0][4],tensor_box[0][6]]) + points_y = np.array([tensor_box[0][1],tensor_box[0][3],tensor_box[0][5],tensor_box[0][7]]) + smaller_x = np.where(points_x <= 0) + larger_x = np.where(points_x >= w) + smaller_y = np.where(points_y <= 0) + larger_y = np.where(points_y >= h) + points_x[smaller_x] = 1 + points_x[larger_x] = w - 1 + points_y[smaller_y] = 1 + points_y[larger_y] = h -1 + pri_points.append((int(points_x[0]),int(points_y[0]))) + pri_points.append((int(points_x[1]),int(points_y[1]))) + pri_points.append((int(points_x[2]),int(points_y[2]))) + pri_points.append((int(points_x[3]),int(points_y[3]))) + + #get the transform image and points + height, width = img.shape[:2] + + # if ROTATE_DEGREE = (0,30,60,90,210,150,180,210,240,270,300,330,360) + #de_ro = random.choice(self.degree) + #matrix = cv2.getRotationMatrix2D((width / 2, height / 2) ,de_ro, 1.0) + + # if ROTATE_DEGREE = 10 + matrix = cv2.getRotationMatrix2D((width / 2, height / 2), random.uniform(-self.degree[0],self.degree[0]), 1.0) + + cos = np.abs(matrix[0,0]) + sin = np.abs(matrix[0,1]) + new_W = int((height * sin) + (width * cos)) + new_H = int((height * cos) + (width * sin)) + matrix[0,2] += (new_W/2) - width/2 + matrix[1,2] += ((new_H/2)) - height/2 + img = cv2.warpAffine(img, matrix, (new_W,new_H)) + + change_points = [] + for i in range(int(len(pri_points))): + x_r,y_r = cv2.transform(np.array([[pri_points[i]]]),matrix).squeeze() + change_points.append([x_r,y_r]) + + image = Image.fromarray(img) + + keypoints_len = len(change_points) + tran_boxes = [] + n = keypoints_len/4 + + for i in range(int(n)): + tran_boxes.append(change_points[0 + i*4: 4 + i*4]) + + tran_boxes = np.array(tran_boxes).reshape(-1,2) + tran_x = [] + tran_y = [] + for k in range(len(tran_boxes)): + tran_x.append(int(tran_boxes[k][0])) + tran_y.append(int(tran_boxes[k][1])) + max_x = max(tran_x) + min_x = min(tran_x) + max_y = max(tran_y) + min_y = min(tran_x) + ctr_x = new_W / 2 + ctr_y = new_H / 2 + origin_xmin = ctr_x - width / 2 + origin_xmax = ctr_x + width / 2 + origin_ymin = ctr_y - height / 2 + 
origin_ymax = ctr_y + height / 2 + cut_xmax = origin_xmax + cut_xmin = origin_xmin + cut_ymax = origin_ymax + cut_ymin = origin_ymin + if max_x >= origin_xmax: + cut_xmax = max_x + if min_x <= origin_xmin: + cut_xmin = min_x + if max_y >= origin_ymax: + cut_ymax = max_y + if min_y <= origin_ymin: + cut_ymin = min_y + for i in range(len(tran_boxes)): + tran_x[i] = tran_x[i] - cut_xmin + tran_y[i] = tran_y[i] - cut_ymin + image = image.crop((cut_xmin,cut_ymin,cut_xmax,cut_ymax)) + tran_x = np.array(tran_x) + tran_y = np.array(tran_y) + + boxes = [] + masks = [] + mty = [] + kes = [] + #GET FORMAT OF BOXES,MASKS + for idx in range(int(tran_x.size/4)): + x_points = [tran_x[4 * idx], tran_x[4*idx+1],tran_x[4*idx+2],tran_x[4*idx+3]] + y_points = [tran_y[4 * idx], tran_y[4*idx+1],tran_y[4*idx+2],tran_y[4*idx+3]] + + l1 = LineString([(x_points[0], y_points[0]), (x_points[2], y_points[2])]) + l2 = LineString([(x_points[1], y_points[1]), (x_points[3], y_points[3])]) + p_l1l2 = l1.intersection(l2) + poly1 = Polygon([(x_points[0], y_points[0]), (x_points[1], y_points[1]), + (x_points[2], y_points[2]), (x_points[3], y_points[3])]) + if not poly1.is_valid: + continue + if not p_l1l2.within(poly1): + continue + if poly1.area <= 10: + continue + x_min = min(x_points) + x_max = max(x_points) + y_min = min(y_points) + y_max = max(y_points) + width = max(0, x_max - x_min + 1) + height = max(0, y_max - y_min + 1) + if width == 0 or height == 0: + continue + boxes.append([x_min,y_min,width,height]) + + #get mask format + one_point = [[tran_x[4*idx],tran_y[4*idx],tran_x[4*idx+1],tran_y[4*idx+1],tran_x[4*idx+2],tran_y[4*idx+2],tran_x[4*idx+3],tran_y[4*idx+3]]] + masks.append(one_point) + + #get matchtype format + mean_x = np.mean(x_points) + mean_y = np.mean(y_points) + xt_sort = np.sort(x_points) + yt_sort = np.sort(y_points) + xt_argsort = list(np.argsort(x_points)) + yt_argsort = list(np.argsort(y_points)) + ldx = [] + for ildx in range(4): + ldx.append(yt_argsort.index(xt_argsort[ildx])) + all_types = [[1,2,3,4],[1,2,4,3],[1,3,2,4],[1,3,4,2],[1,4,2,3],[1,4,3,2],\ + [2,1,3,4],[2,1,4,3],[2,3,1,4],[2,3,4,1],[2,4,1,3],[2,4,3,1],\ + [3,1,2,4],[3,1,4,2],[3,2,1,4],[3,2,4,1],[3,4,1,2],[3,4,2,1],\ + [4,1,2,3],[4,1,3,2],[4,2,1,3],[4,2,3,1],[4,3,1,2],[4,3,2,1]] + all_types = [[all_types[iat][0]-1,all_types[iat][1]-1,all_types[iat][2]-1,all_types[iat][3]-1] for iat in range(24)] + match_type = all_types.index(ldx) + mty.append(match_type) + + half_x = (xt_sort + mean_x) / 2 + half_y = (yt_sort + mean_y) / 2 + + keypoints = [] + keypoints.append(mean_x) + keypoints.append(mean_y) + keypoints.append(2) + for i in range(4): + keypoints.append(half_x[i]) + keypoints.append(mean_y) + keypoints.append(2) + for i in range(4): + keypoints.append(mean_x) + keypoints.append(half_y[i]) + keypoints.append(2) + try: + keypoints.append(int(p_l1l2.x)) + keypoints.append(int(p_l1l2.y)) + keypoints.append(2) + except Exception as e: + continue + kes.append(keypoints) + #IF ENCOUNTER THAT NO BOX IN A TRANSFORMED IMAGE, RETURN PRIMARY IMAGE AND TARGET + if kes == []: + image = image1 + target = target1 + return image,target + classes = [] + for i in range(len(boxes)): + classes.append(1) + classes = torch.tensor(classes) + #GET NEW TARGET + boxes = torch.as_tensor(boxes).reshape(-1, 4) + target = BoxList(boxes, image.size, mode="xywh").convert("xyxy") + + target.add_field("labels",classes) + + masks = SegmentationMask(masks, image.size) + target.add_field("masks", masks) + + return image,target diff --git 
a/maskrcnn_benchmark/engine/__init__.py b/maskrcnn_benchmark/engine/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..5c7f19c6c00a4ac3f2f2bc66f892e44bcbd72612 --- /dev/null +++ b/maskrcnn_benchmark/engine/__init__.py @@ -0,0 +1 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. diff --git a/maskrcnn_benchmark/engine/__pycache__/__init__.cpython-37.pyc b/maskrcnn_benchmark/engine/__pycache__/__init__.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e163be9e859a22628af58e88c1d81aef428bbf03 Binary files /dev/null and b/maskrcnn_benchmark/engine/__pycache__/__init__.cpython-37.pyc differ diff --git a/maskrcnn_benchmark/engine/__pycache__/inference.cpython-37.pyc b/maskrcnn_benchmark/engine/__pycache__/inference.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..fd6e6aeda2ccfb22b503e31b714ce6d6c8de9157 Binary files /dev/null and b/maskrcnn_benchmark/engine/__pycache__/inference.cpython-37.pyc differ diff --git a/maskrcnn_benchmark/engine/__pycache__/trainer.cpython-37.pyc b/maskrcnn_benchmark/engine/__pycache__/trainer.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..db00539f6e5b6cde7325020046e6126958d8fb76 Binary files /dev/null and b/maskrcnn_benchmark/engine/__pycache__/trainer.cpython-37.pyc differ diff --git a/maskrcnn_benchmark/engine/inference.py b/maskrcnn_benchmark/engine/inference.py new file mode 100644 index 0000000000000000000000000000000000000000..77e7396d1e68f77301daee9af1c14707237bf5a9 --- /dev/null +++ b/maskrcnn_benchmark/engine/inference.py @@ -0,0 +1,129 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +import logging +import time +import os + +import torch +from tqdm import tqdm + +from maskrcnn_benchmark.data.datasets.evaluation import evaluate +from ..utils.comm import is_main_process, get_world_size +from ..utils.comm import all_gather +from ..utils.comm import synchronize +from ..utils.timer import Timer, get_time_str + + +def compute_on_dataset(model, data_loader, device, timer=None): + model.eval() + results_dict = {} + cpu_device = torch.device("cpu") + for _, batch in enumerate(tqdm(data_loader)): + images, targets, image_ids = batch + images = images.to(device) + with torch.no_grad(): + if timer: + timer.tic() + output = model(images) + if timer: + torch.cuda.synchronize() + timer.toc() + output = [o.to(cpu_device) for o in output] + results_dict.update( + {img_id: result for img_id, result in zip(image_ids, output)} + ) + return results_dict + + +def _accumulate_predictions_from_multiple_gpus(predictions_per_gpu): + all_predictions = all_gather(predictions_per_gpu) + if not is_main_process(): + return + # merge the list of dicts + predictions = {} + for p in all_predictions: + predictions.update(p) + # convert a dict where the key is the index in a list + image_ids = list(sorted(predictions.keys())) + if len(image_ids) != image_ids[-1] + 1: + logger = logging.getLogger("maskrcnn_benchmark.inference") + logger.warning( + "Number of images that were gathered from multiple processes is not " + "a contiguous set. 
Some images might be missing from the evaluation" + ) + + # convert to a list + predictions = [predictions[i] for i in image_ids] + return predictions + + +def inference( + model, + data_loader, + dataset_name, + iou_types=("bbox",), + box_only=False, + device="cuda", + expected_results=(), + expected_results_sigma_tol=4, + output_folder=None, +): + + logger = logging.getLogger("maskrcnn_benchmark.inference") + dataset = data_loader.dataset + logger.info("Start evaluation on {} dataset({} images).".format(dataset_name, len(dataset))) + + extra_args = dict( + box_only=box_only, + iou_types=iou_types, + expected_results=expected_results, + expected_results_sigma_tol=expected_results_sigma_tol, + ) + + # load predictions if exists + prediction_file = os.path.join(output_folder, 'predictions.pth') + if os.path.isfile(prediction_file): + predictions = torch.load(prediction_file) + logger.info("Found prediction results at {}".format(prediction_file)) + + return evaluate(dataset=dataset, + predictions=predictions, + output_folder=output_folder, + **extra_args) + + # convert to a torch.device for efficiency + device = torch.device(device) + num_devices = get_world_size() + total_timer = Timer() + inference_timer = Timer() + total_timer.tic() + predictions = compute_on_dataset(model, data_loader, device, inference_timer) + # wait for all processes to complete before measuring the time + synchronize() + total_time = total_timer.toc() + total_time_str = get_time_str(total_time) + logger.info( + "Total run time: {} ({} s / img per device, on {} devices)".format( + total_time_str, total_time * num_devices / len(dataset), num_devices + ) + ) + total_infer_time = get_time_str(inference_timer.total_time) + logger.info( + "Model inference time: {} ({} s / img per device, on {} devices)".format( + total_infer_time, + inference_timer.total_time * num_devices / len(dataset), + num_devices, + ) + ) + + predictions = _accumulate_predictions_from_multiple_gpus(predictions) + if not is_main_process(): + return + + if output_folder: + torch.save(predictions, os.path.join(output_folder, "predictions.pth")) + + + return evaluate(dataset=dataset, + predictions=predictions, + output_folder=output_folder, + **extra_args) diff --git a/maskrcnn_benchmark/engine/trainer.py b/maskrcnn_benchmark/engine/trainer.py new file mode 100644 index 0000000000000000000000000000000000000000..a13fda4b7597e94a0e0bdea4008574de655d3fee --- /dev/null +++ b/maskrcnn_benchmark/engine/trainer.py @@ -0,0 +1,130 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +import datetime +import logging +import time + +import torch +import torch.distributed as dist + +from maskrcnn_benchmark.utils.comm import get_world_size +from maskrcnn_benchmark.utils.metric_logger import MetricLogger + + +def reduce_loss_dict(loss_dict): + """ + Reduce the loss dictionary from all processes so that process with rank + 0 has the averaged results. Returns a dict with the same fields as + loss_dict, after reduction. 
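+    Note that ``dist.reduce`` targets rank 0 only, so on the other ranks the
+    returned values are not guaranteed to be the averaged losses.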
+ """ + world_size = get_world_size() + if world_size < 2: + return loss_dict + with torch.no_grad(): + loss_names = [] + all_losses = [] + for k in sorted(loss_dict.keys()): + loss_names.append(k) + all_losses.append(loss_dict[k]) + all_losses = torch.stack(all_losses, dim=0) + dist.reduce(all_losses, dst=0) + if dist.get_rank() == 0: + # only main process gets accumulated, so only divide by + # world_size in this case + all_losses /= world_size + reduced_losses = {k: v for k, v in zip(loss_names, all_losses)} + return reduced_losses + + +def do_train( + model, + data_loader, + optimizer, + scheduler, + checkpointer, + device, + checkpoint_period, + arguments, +): + logger = logging.getLogger("maskrcnn_benchmark.trainer") + logger.info("Start training") + meters = MetricLogger(delimiter=" ") + max_iter = len(data_loader) + start_iter = arguments["iteration"] + + model.train() + start_training_time = time.time() + end = time.time() + for iteration, (images, targets, _) in enumerate(data_loader, start_iter): + data_time = time.time() - end + iteration = iteration + 1 + arguments["iteration"] = iteration + + scheduler.step() + + images = images.to(device) + if isinstance(targets[0], list): + targets = [[target[0].to(device) for target in targets], + [target[1].to(device) for target in targets]] + else: + targets = [target.to(device) for target in targets] + + loss_dict = model(images, targets) + + del targets + + losses = sum(loss for loss in loss_dict.values()) + + # reduce losses over all GPUs for logging purposes + loss_dict_reduced = reduce_loss_dict(loss_dict) + losses_reduced = sum(loss for loss in loss_dict_reduced.values()) + meters.update(loss=losses_reduced, **loss_dict_reduced) + + optimizer.zero_grad() + losses.backward() + optimizer.step() + + batch_time = time.time() - end + end = time.time() + meters.update(time=batch_time, data=data_time) + + eta_seconds = meters.time.global_avg * (max_iter - iteration) + eta_string = str(datetime.timedelta(seconds=int(eta_seconds))) + + losses = losses.float() + del losses, loss_dict, loss_dict_reduced, losses_reduced + + if iteration % 20 == 0 or iteration == max_iter: + logger.info( + meters.delimiter.join( + [ + "eta: {eta}", + "iter: {iter}", + "{meters}", + "lr: {lr:.6f}", + "max mem: {memory:.0f}", + ] + ).format( + eta=eta_string, + iter=iteration, + meters=str(meters), + lr=optimizer.param_groups[0]["lr"], + memory=torch.cuda.max_memory_allocated() / 1024.0 / 1024.0, + ) + ) + + del meters + meters = MetricLogger(delimiter=" ") + + if iteration % checkpoint_period == 0: + checkpointer.save("model_{:07d}".format(iteration), **arguments) + + if iteration == max_iter: + checkpointer.save("model_final", **arguments) + + total_training_time = time.time() - start_training_time + total_time_str = str(datetime.timedelta(seconds=total_training_time)) + logger.info( + "Total training time: {} ({:.4f} s / it)".format( + total_time_str, total_training_time / (max_iter) + ) + ) diff --git a/maskrcnn_benchmark/layers/__init__.py b/maskrcnn_benchmark/layers/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..00c2d2b57abf56a8749329f6bf2092ffee021dca --- /dev/null +++ b/maskrcnn_benchmark/layers/__init__.py @@ -0,0 +1,26 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 
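+# The nms / ROIAlign / ROIPool / DCN ops re-exported below bind to the compiled
+# maskrcnn_benchmark._C extension and require it to be built. The pure-Python losses
+# can be used directly; an illustrative sketch for IOULoss, whose inputs are
+# per-location (left, top, right, bottom) distances:
+#
+#   criterion = IOULoss()
+#   loss = criterion(pred_ltrb, target_ltrb)  # both N x 4 tensors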
+import torch + +from .batch_norm import FrozenBatchNorm2d +from .misc import Conv2d +from .misc import ConvTranspose2d +from .misc import BatchNorm2d +from .misc import interpolate +from .nms import nms +from .roi_align import ROIAlign +from .roi_align import roi_align +from .roi_pool import ROIPool +from .roi_pool import roi_pool +from .smooth_l1_loss import smooth_l1_loss +from .sigmoid_focal_loss import SigmoidFocalLoss +from .iou_loss import IOULoss +from .scale import Scale +from .deform_conv_v2 import DCN, DCNPooling +from .iou import iou_regress +from .focal_loss import Focal_Loss + +__all__ = ["nms", "roi_align", "ROIAlign", "roi_pool", "ROIPool", + "smooth_l1_loss", "Conv2d", "ConvTranspose2d", "interpolate", + "BatchNorm2d", "FrozenBatchNorm2d", "SigmoidFocalLoss", "IOULoss", + "Scale", "DCN", "DCNPooling", "iou_regress","Focal_Loss"] + diff --git a/maskrcnn_benchmark/layers/__pycache__/__init__.cpython-37.pyc b/maskrcnn_benchmark/layers/__pycache__/__init__.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..2de2e6df60b22e4d34b270bfa1d5123303159a5f Binary files /dev/null and b/maskrcnn_benchmark/layers/__pycache__/__init__.cpython-37.pyc differ diff --git a/maskrcnn_benchmark/layers/__pycache__/batch_norm.cpython-37.pyc b/maskrcnn_benchmark/layers/__pycache__/batch_norm.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d6ddb28b907c036c3359dac2c62ac9435754d8fc Binary files /dev/null and b/maskrcnn_benchmark/layers/__pycache__/batch_norm.cpython-37.pyc differ diff --git a/maskrcnn_benchmark/layers/__pycache__/deform_conv_v2.cpython-37.pyc b/maskrcnn_benchmark/layers/__pycache__/deform_conv_v2.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..84cd84e702e4930348b1fabacf33bc5b27072a8d Binary files /dev/null and b/maskrcnn_benchmark/layers/__pycache__/deform_conv_v2.cpython-37.pyc differ diff --git a/maskrcnn_benchmark/layers/__pycache__/focal_loss.cpython-37.pyc b/maskrcnn_benchmark/layers/__pycache__/focal_loss.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..1fb8a09da5180a3a452ee3ce7a0b37f137645030 Binary files /dev/null and b/maskrcnn_benchmark/layers/__pycache__/focal_loss.cpython-37.pyc differ diff --git a/maskrcnn_benchmark/layers/__pycache__/iou.cpython-37.pyc b/maskrcnn_benchmark/layers/__pycache__/iou.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..029e65110ae616b3271b9eeeb034470d8dc83550 Binary files /dev/null and b/maskrcnn_benchmark/layers/__pycache__/iou.cpython-37.pyc differ diff --git a/maskrcnn_benchmark/layers/__pycache__/iou_loss.cpython-37.pyc b/maskrcnn_benchmark/layers/__pycache__/iou_loss.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..7bab390fd58b575424f6a94aca79e6232948339a Binary files /dev/null and b/maskrcnn_benchmark/layers/__pycache__/iou_loss.cpython-37.pyc differ diff --git a/maskrcnn_benchmark/layers/__pycache__/misc.cpython-37.pyc b/maskrcnn_benchmark/layers/__pycache__/misc.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..48f7ee3beb974425895be0f0c9dd3e7259561c1b Binary files /dev/null and b/maskrcnn_benchmark/layers/__pycache__/misc.cpython-37.pyc differ diff --git a/maskrcnn_benchmark/layers/__pycache__/nms.cpython-37.pyc b/maskrcnn_benchmark/layers/__pycache__/nms.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..290eb866c51b4acaf55b20db2db0aad52121412c Binary files /dev/null 
and b/maskrcnn_benchmark/layers/__pycache__/nms.cpython-37.pyc differ diff --git a/maskrcnn_benchmark/layers/__pycache__/roi_align.cpython-37.pyc b/maskrcnn_benchmark/layers/__pycache__/roi_align.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..5cdcc07a13bdd2e2dc1486ab7c3b1cb7224ac7aa Binary files /dev/null and b/maskrcnn_benchmark/layers/__pycache__/roi_align.cpython-37.pyc differ diff --git a/maskrcnn_benchmark/layers/__pycache__/roi_pool.cpython-37.pyc b/maskrcnn_benchmark/layers/__pycache__/roi_pool.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..7f41a2990f4c1e590312d810b45a8ddee667f222 Binary files /dev/null and b/maskrcnn_benchmark/layers/__pycache__/roi_pool.cpython-37.pyc differ diff --git a/maskrcnn_benchmark/layers/__pycache__/scale.cpython-37.pyc b/maskrcnn_benchmark/layers/__pycache__/scale.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..fd3459342f2a1fa08b8d454e3375fac678b78524 Binary files /dev/null and b/maskrcnn_benchmark/layers/__pycache__/scale.cpython-37.pyc differ diff --git a/maskrcnn_benchmark/layers/__pycache__/sigmoid_focal_loss.cpython-37.pyc b/maskrcnn_benchmark/layers/__pycache__/sigmoid_focal_loss.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d5305f02112bafb171e1ca04f7b16bb445fbd7e4 Binary files /dev/null and b/maskrcnn_benchmark/layers/__pycache__/sigmoid_focal_loss.cpython-37.pyc differ diff --git a/maskrcnn_benchmark/layers/__pycache__/smooth_l1_loss.cpython-37.pyc b/maskrcnn_benchmark/layers/__pycache__/smooth_l1_loss.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..6318911f3715dd9c18e614f5d7e75c428a564b99 Binary files /dev/null and b/maskrcnn_benchmark/layers/__pycache__/smooth_l1_loss.cpython-37.pyc differ diff --git a/maskrcnn_benchmark/layers/_utils.py b/maskrcnn_benchmark/layers/_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..3dabc127b221d67eae7587ab4905416fa5fcf121 --- /dev/null +++ b/maskrcnn_benchmark/layers/_utils.py @@ -0,0 +1,39 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +import glob +import os.path + +import torch + +try: + from torch.utils.cpp_extension import load as load_ext + from torch.utils.cpp_extension import CUDA_HOME +except ImportError: + raise ImportError("The cpp layer extensions requires PyTorch 0.4 or higher") + + +def _load_C_extensions(): + this_dir = os.path.dirname(os.path.abspath(__file__)) + this_dir = os.path.dirname(this_dir) + this_dir = os.path.join(this_dir, "csrc") + + main_file = glob.glob(os.path.join(this_dir, "*.cpp")) + source_cpu = glob.glob(os.path.join(this_dir, "cpu", "*.cpp")) + source_cuda = glob.glob(os.path.join(this_dir, "cuda", "*.cu")) + + source = main_file + source_cpu + + extra_cflags = [] + if torch.cuda.is_available() and CUDA_HOME is not None: + source.extend(source_cuda) + extra_cflags = ["-DWITH_CUDA"] + source = [os.path.join(this_dir, s) for s in source] + extra_include_paths = [this_dir] + return load_ext( + "torchvision", + source, + extra_cflags=extra_cflags, + extra_include_paths=extra_include_paths, + ) + + +_C = _load_C_extensions() diff --git a/maskrcnn_benchmark/layers/batch_norm.py b/maskrcnn_benchmark/layers/batch_norm.py new file mode 100644 index 0000000000000000000000000000000000000000..903607ac3895947d1aa6d6c4766624af0e97bc71 --- /dev/null +++ b/maskrcnn_benchmark/layers/batch_norm.py @@ -0,0 +1,24 @@ +# Copyright (c) Facebook, Inc. 
and its affiliates. All Rights Reserved. +import torch +from torch import nn + + +class FrozenBatchNorm2d(nn.Module): + """ + BatchNorm2d where the batch statistics and the affine parameters + are fixed + """ + + def __init__(self, n): + super(FrozenBatchNorm2d, self).__init__() + self.register_buffer("weight", torch.ones(n)) + self.register_buffer("bias", torch.zeros(n)) + self.register_buffer("running_mean", torch.zeros(n)) + self.register_buffer("running_var", torch.ones(n)) + + def forward(self, x): + scale = self.weight * self.running_var.rsqrt() + bias = self.bias - self.running_mean * scale + scale = scale.reshape(1, -1, 1, 1) + bias = bias.reshape(1, -1, 1, 1) + return x * scale + bias diff --git a/maskrcnn_benchmark/layers/deform_conv_v2.py b/maskrcnn_benchmark/layers/deform_conv_v2.py new file mode 100644 index 0000000000000000000000000000000000000000..ce309848025dad326f9c891f3e6865095a571624 --- /dev/null +++ b/maskrcnn_benchmark/layers/deform_conv_v2.py @@ -0,0 +1,308 @@ +from __future__ import absolute_import +from __future__ import print_function +from __future__ import division + +import math +import torch +from torch import nn +from torch.autograd import Function +from torch.nn.modules.utils import _pair +from torch.autograd.function import once_differentiable + +import maskrcnn_benchmark._C as _backend + + +class _DCNv2(Function): + @staticmethod + def forward(ctx, input, offset, mask, weight, bias, + stride, padding, dilation, deformable_groups): + ctx.stride = _pair(stride) + ctx.padding = _pair(padding) + ctx.dilation = _pair(dilation) + ctx.kernel_size = _pair(weight.shape[2:4]) + ctx.deformable_groups = deformable_groups + output = _backend.dcn_v2_forward(input, weight, bias, + offset, mask, + ctx.kernel_size[0], ctx.kernel_size[1], + ctx.stride[0], ctx.stride[1], + ctx.padding[0], ctx.padding[1], + ctx.dilation[0], ctx.dilation[1], + ctx.deformable_groups) + ctx.save_for_backward(input, offset, mask, weight, bias) + return output + + @staticmethod + @once_differentiable + def backward(ctx, grad_output): + input, offset, mask, weight, bias = ctx.saved_tensors + grad_input, grad_offset, grad_mask, grad_weight, grad_bias = \ + _backend.dcn_v2_backward(input, weight, + bias, + offset, mask, + grad_output, + ctx.kernel_size[0], ctx.kernel_size[1], + ctx.stride[0], ctx.stride[1], + ctx.padding[0], ctx.padding[1], + ctx.dilation[0], ctx.dilation[1], + ctx.deformable_groups) + + return grad_input, grad_offset, grad_mask, grad_weight, grad_bias,\ + None, None, None, None, + + +dcn_v2_conv = _DCNv2.apply + + +class DCNv2(nn.Module): + + def __init__(self, in_channels, out_channels, + kernel_size, stride, padding, dilation=1, deformable_groups=1): + super(DCNv2, self).__init__() + self.in_channels = in_channels + self.out_channels = out_channels + self.kernel_size = _pair(kernel_size) + self.stride = _pair(stride) + self.padding = _pair(padding) + self.dilation = _pair(dilation) + self.deformable_groups = deformable_groups + + self.weight = nn.Parameter(torch.Tensor( + out_channels, in_channels, *self.kernel_size)) + self.bias = nn.Parameter(torch.Tensor(out_channels)) + self.reset_parameters() + + def reset_parameters(self): + n = self.in_channels + for k in self.kernel_size: + n *= k + stdv = 1. 
/ math.sqrt(n) + self.weight.data.uniform_(-stdv, stdv) + self.bias.data.zero_() + + def forward(self, input, offset, mask): + assert 2 * self.deformable_groups * self.kernel_size[0] * self.kernel_size[1] == \ + offset.shape[1] + assert self.deformable_groups * self.kernel_size[0] * self.kernel_size[1] == \ + mask.shape[1] + return dcn_v2_conv(input, offset, mask, + self.weight, + self.bias, + self.stride, + self.padding, + self.dilation, + self.deformable_groups) + + +class DCN(DCNv2): + + def __init__(self, in_channels, out_channels, + kernel_size, stride, padding=0, + dilation=1, deformable_groups=2, + groups=None, bias=True): + """ + groups and bias are two dummy args which have no effect + """ + super(DCN, self).__init__(in_channels, out_channels, + kernel_size, stride, padding, dilation, deformable_groups) + + channels_ = self.deformable_groups * 3 * self.kernel_size[0] * self.kernel_size[1] + self.conv_offset_mask = nn.Conv2d(self.in_channels, + channels_, + kernel_size=self.kernel_size, + stride=self.stride, + padding=self.padding, + bias=True) + self.init_offset() + + def init_offset(self): + self.conv_offset_mask.weight.data.zero_() + self.conv_offset_mask.bias.data.zero_() + + def forward(self, input): + out = self.conv_offset_mask(input) + o1, o2, mask = torch.chunk(out, 3, dim=1) + offset = torch.cat((o1, o2), dim=1) + mask = torch.sigmoid(mask) + return dcn_v2_conv(input, offset, mask, + self.weight, self.bias, + self.stride, + self.padding, + self.dilation, + self.deformable_groups) + + + +class _DCNv2Pooling(Function): + @staticmethod + def forward(ctx, input, rois, offset, + spatial_scale, + pooled_size, + output_dim, + no_trans, + group_size=1, + part_size=None, + sample_per_part=4, + trans_std=.0): + ctx.spatial_scale = spatial_scale + ctx.no_trans = int(no_trans) + ctx.output_dim = output_dim + ctx.group_size = group_size + ctx.pooled_size = pooled_size + ctx.part_size = pooled_size if part_size is None else part_size + ctx.sample_per_part = sample_per_part + ctx.trans_std = trans_std + + output, output_count = \ + _backend.dcn_v2_psroi_pooling_forward(input, rois, offset, + ctx.no_trans, ctx.spatial_scale, + ctx.output_dim, ctx.group_size, + ctx.pooled_size, ctx.part_size, + ctx.sample_per_part, ctx.trans_std) + ctx.save_for_backward(input, rois, offset, output_count) + return output + + @staticmethod + @once_differentiable + def backward(ctx, grad_output): + input, rois, offset, output_count = ctx.saved_tensors + grad_input, grad_offset = \ + _backend.dcn_v2_psroi_pooling_backward(grad_output, + input, + rois, + offset, + output_count, + ctx.no_trans, + ctx.spatial_scale, + ctx.output_dim, + ctx.group_size, + ctx.pooled_size, + ctx.part_size, + ctx.sample_per_part, + ctx.trans_std) + + return grad_input, None, grad_offset, \ + None, None, None, None, None, None, None, None + + +dcn_v2_pooling = _DCNv2Pooling.apply + + +class DCNv2Pooling(nn.Module): + + def __init__(self, + spatial_scale, + pooled_size, + output_dim, + no_trans, + group_size=1, + part_size=None, + sample_per_part=4, + trans_std=.0): + super(DCNv2Pooling, self).__init__() + self.spatial_scale = spatial_scale + self.pooled_size = pooled_size + self.output_dim = output_dim + self.no_trans = no_trans + self.group_size = group_size + self.part_size = pooled_size if part_size is None else part_size + self.sample_per_part = sample_per_part + self.trans_std = trans_std + + def forward(self, input, rois, offset): + assert input.shape[1] == self.output_dim + if self.no_trans: + offset = input.new() + return 
dcn_v2_pooling(input, rois, offset, + self.spatial_scale, + self.pooled_size, + self.output_dim, + self.no_trans, + self.group_size, + self.part_size, + self.sample_per_part, + self.trans_std) + + +class DCNPooling(DCNv2Pooling): + + def __init__(self, + spatial_scale, + pooled_size, + output_dim, + no_trans, + group_size=1, + part_size=None, + sample_per_part=4, + trans_std=.0, + deform_fc_dim=1024): + # don't support non square pooling + pooled_size = pooled_size[0] + super(DCNPooling, self).__init__(spatial_scale, + pooled_size, + output_dim, + no_trans, + group_size, + part_size, + sample_per_part, + trans_std) + + self.deform_fc_dim = deform_fc_dim + + if not no_trans: + self.offset_mask_fc = nn.Sequential( + nn.Linear(self.pooled_size * self.pooled_size * + self.output_dim, self.deform_fc_dim), + nn.ReLU(inplace=True), + nn.Linear(self.deform_fc_dim, self.deform_fc_dim), + nn.ReLU(inplace=True), + nn.Linear(self.deform_fc_dim, self.pooled_size * + self.pooled_size * 3) + ) + self.offset_mask_fc[4].weight.data.zero_() + self.offset_mask_fc[4].bias.data.zero_() + + def forward(self, input, rois, debug=False): + offset = input.new() + + if not self.no_trans: + + # do roi_align first + n = rois.shape[0] + roi = dcn_v2_pooling(input, rois, offset, + self.spatial_scale, + self.pooled_size, + self.output_dim, + True, # no trans + self.group_size, + self.part_size, + self.sample_per_part, + self.trans_std) + + # build mask and offset + offset_mask = self.offset_mask_fc(roi.view(n, -1)) + offset_mask = offset_mask.view( + n, 3, self.pooled_size, self.pooled_size) + o1, o2, mask = torch.chunk(offset_mask, 3, dim=1) + offset = torch.cat((o1, o2), dim=1) + mask = torch.sigmoid(mask) + + # do pooling with offset and mask + return dcn_v2_pooling(input, rois, offset, + self.spatial_scale, + self.pooled_size, + self.output_dim, + self.no_trans, + self.group_size, + self.part_size, + self.sample_per_part, + self.trans_std) * mask + # only roi_align + return dcn_v2_pooling(input, rois, offset, + self.spatial_scale, + self.pooled_size, + self.output_dim, + self.no_trans, + self.group_size, + self.part_size, + self.sample_per_part, + self.trans_std) diff --git a/maskrcnn_benchmark/layers/focal_loss.py b/maskrcnn_benchmark/layers/focal_loss.py new file mode 100644 index 0000000000000000000000000000000000000000..d29ca1b85ccb2584fb4e09cc13cba558544998ab --- /dev/null +++ b/maskrcnn_benchmark/layers/focal_loss.py @@ -0,0 +1,61 @@ +import torch +from torch import nn +from torch.nn import functional as F + +def Focal_Loss(pred, gt): + # print('yes!!') + + + + ce = nn.CrossEntropyLoss() + alpha = 0.25 + gamma = 2 + # logp = ce(input, target) + p = torch.sigmoid(pred) + + loss = -alpha * (1 - p) ** gamma * (gt * torch.log(p)) - \ + (1 - alpha) * p ** gamma * ((1 - gt) * torch.log(1 - p)) + + return loss.mean() + + + + + + + + + + + + # pred =torch.sigmoid(pred) + # pos_inds = gt.eq(1).float() + # neg_inds = gt.lt(1).float() + # + # loss = 0 + # + # pos_loss = torch.log(pred + 1e-10) * torch.pow(pred, 2) * pos_inds + # # neg_loss = torch.log(1 - pred) * torch.pow(pred, 2) * neg_weights * neg_inds + # neg_loss = torch.log(1 - pred) * torch.pow(1 - pred, 2) * neg_inds + # + # num_pos = pos_inds.float().sum() + # num_neg = neg_inds.float().sum() + # + # pos_loss = pos_loss.sum() + # neg_loss = neg_loss.sum() + # + # if num_pos == 0: + # loss = loss - neg_loss + # else: + # # loss = loss - (pos_loss + neg_loss) / (num_pos) + # loss = loss - (pos_loss + neg_loss ) + # return loss * 5 + + + + + # if weight is not None 
and weight.sum() > 0: + # return (losses * weight).sum() / weight.sum() + # else: + # assert losses.numel() != 0 + # return losses.mean() \ No newline at end of file diff --git a/maskrcnn_benchmark/layers/iou.py b/maskrcnn_benchmark/layers/iou.py new file mode 100644 index 0000000000000000000000000000000000000000..8f47d270958c76c6328fdcec3cea72fbd7967ca5 --- /dev/null +++ b/maskrcnn_benchmark/layers/iou.py @@ -0,0 +1,59 @@ +import torch +import numpy as np + + +def iou_regress(input, target, beta=1. / 9, size_average=True): + """ + very similar to the smooth_l1_loss from pytorch, but with + the extra beta parameter + """ + + + if len(input)==0: + return input.sum() * 0 + + width_i = input[:, 2] - input[:, 0] + height_i = input[:, 3] - input[:, 1] + width_t = target[:, 2] - target[:, 0] + height_t = target[:, 3] - target[:, 1] + + wh_if = torch.zeros_like(width_i) + wh_if[width_i > 0] += 1 + wh_if[height_i > 0] += 1 + + uion_i = width_i * height_i + uion_t = width_t * height_t + + x_1_max = torch.stack([input[:,0],target[:, 0]], 0) + y_1_max = torch.stack([input[:,1],target[:, 1]], 0) + x_2_min = torch.stack([input[:, 2], target[:, 2]], 0) + y_2_min = torch.stack([input[:, 3], target[:, 3]], 0) + + x_1_max = torch.max(x_1_max, 0, keepdim=True) + y_1_max = torch.max(y_1_max, 0, keepdim=True) + x_2_min = torch.min(x_2_min, 0, keepdim=True) + y_2_min = torch.min(y_2_min, 0, keepdim=True) + + width_inter = x_2_min[0] - x_1_max[0] + height_inter = y_2_min[0] - y_1_max[0] + N1, N2 = height_inter.shape + width_inter = width_inter.view([N2]) + + height_inter = height_inter.view([N2]) + + inter_area = width_inter * height_inter + area_union = uion_i + uion_t - inter_area + + wh_if[width_inter > 0] += 1 + wh_if[height_inter > 0] += 1 + wh_if [wh_if != 4] = 0 + wh_if [wh_if > 1] = 1 + + inter_area *= wh_if + area_union *= wh_if + + iou_loss_map = -torch.log((inter_area + 1.0) / (area_union + 1.0)) + iou_loss_map = iou_loss_map * wh_if + + del wh_if + return iou_loss_map.sum() \ No newline at end of file diff --git a/maskrcnn_benchmark/layers/iou_loss.py b/maskrcnn_benchmark/layers/iou_loss.py new file mode 100644 index 0000000000000000000000000000000000000000..af398dd63877ec05b3fbd1ce45dd576e2e7d722a --- /dev/null +++ b/maskrcnn_benchmark/layers/iou_loss.py @@ -0,0 +1,36 @@ +import torch +from torch import nn + + +class IOULoss(nn.Module): + def forward(self, pred, target, weight=None): + pred_left = pred[:, 0] + pred_top = pred[:, 1] + pred_right = pred[:, 2] + pred_bottom = pred[:, 3] + + target_left = target[:, 0] + target_top = target[:, 1] + target_right = target[:, 2] + target_bottom = target[:, 3] + + target_aera = (target_left + target_right) * \ + (target_top + target_bottom) + pred_aera = (pred_left + pred_right) * \ + (pred_top + pred_bottom) + + w_intersect = torch.min(pred_left, target_left) + \ + torch.min(pred_right, target_right) + h_intersect = torch.min(pred_bottom, target_bottom) + \ + torch.min(pred_top, target_top) + + area_intersect = w_intersect * h_intersect + area_union = target_aera + pred_aera - area_intersect + + losses = -torch.log((area_intersect + 1.0) / (area_union + 1.0)) + + if weight is not None and weight.sum() > 0: + return (losses * weight).sum() / weight.sum() + else: + assert losses.numel() != 0 + return losses.mean() diff --git a/maskrcnn_benchmark/layers/misc.py b/maskrcnn_benchmark/layers/misc.py new file mode 100644 index 0000000000000000000000000000000000000000..a8cf1c680c06b57412bfdf7a1c4a9c53f4acdbbd --- /dev/null +++ b/maskrcnn_benchmark/layers/misc.py 
@@ -0,0 +1,110 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +""" +helper class that supports empty tensors on some nn functions. + +Ideally, add support directly in PyTorch to empty tensors in +those functions. + +This can be removed once https://github.com/pytorch/pytorch/issues/12013 +is implemented +""" + +import math +import torch +from torch.nn.modules.utils import _ntuple + + +class _NewEmptyTensorOp(torch.autograd.Function): + @staticmethod + def forward(ctx, x, new_shape): + ctx.shape = x.shape + return x.new_empty(new_shape) + + @staticmethod + def backward(ctx, grad): + shape = ctx.shape + return _NewEmptyTensorOp.apply(grad, shape), None + + +class Conv2d(torch.nn.Conv2d): + def forward(self, x): + if x.numel() > 0: + return super(Conv2d, self).forward(x) + # get output shape + + output_shape = [ + (i + 2 * p - (di * (k - 1) + 1)) // d + 1 + for i, p, di, k, d in zip( + x.shape[-2:], self.padding, self.dilation, self.kernel_size, self.stride + ) + ] + output_shape = [x.shape[0], self.weight.shape[0]] + output_shape + return _NewEmptyTensorOp.apply(x, output_shape) + + +class ConvTranspose2d(torch.nn.ConvTranspose2d): + def forward(self, x): + if x.numel() > 0: + return super(ConvTranspose2d, self).forward(x) + # get output shape + + output_shape = [ + (i - 1) * d - 2 * p + (di * (k - 1) + 1) + op + for i, p, di, k, d, op in zip( + x.shape[-2:], + self.padding, + self.dilation, + self.kernel_size, + self.stride, + self.output_padding, + ) + ] + output_shape = [x.shape[0], self.bias.shape[0]] + output_shape + return _NewEmptyTensorOp.apply(x, output_shape) + + +class BatchNorm2d(torch.nn.BatchNorm2d): + def forward(self, x): + if x.numel() > 0: + return super(BatchNorm2d, self).forward(x) + # get output shape + output_shape = x.shape + return _NewEmptyTensorOp.apply(x, output_shape) + + +def interpolate( + input, size=None, scale_factor=None, mode="nearest", align_corners=None +): + if input.numel() > 0: + return torch.nn.functional.interpolate( + input, size, scale_factor, mode, align_corners + ) + + def _check_size_scale_factor(dim): + if size is None and scale_factor is None: + raise ValueError("either size or scale_factor should be defined") + if size is not None and scale_factor is not None: + raise ValueError("only one of size or scale_factor should be defined") + if ( + scale_factor is not None + and isinstance(scale_factor, tuple) + and len(scale_factor) != dim + ): + raise ValueError( + "scale_factor shape must match input shape. " + "Input is {}D, scale_factor size is {}".format(dim, len(scale_factor)) + ) + + def _output_size(dim): + _check_size_scale_factor(dim) + if size is not None: + return size + scale_factors = _ntuple(dim)(scale_factor) + # math.floor might return float in py2.7 + return [ + int(math.floor(input.size(i + 2) * scale_factors[i])) for i in range(dim) + ] + + output_shape = tuple(_output_size(2)) + output_shape = input.shape[:-2] + output_shape + return _NewEmptyTensorOp.apply(input, output_shape) diff --git a/maskrcnn_benchmark/layers/nms.py b/maskrcnn_benchmark/layers/nms.py new file mode 100644 index 0000000000000000000000000000000000000000..1e80b555045d85e509c917f940ee9bc62738fee7 --- /dev/null +++ b/maskrcnn_benchmark/layers/nms.py @@ -0,0 +1,7 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 
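+# Usage sketch (illustrative; assumes the usual maskrcnn-benchmark binding where
+# `boxes` is an N x 4 tensor of (x1, y1, x2, y2), `scores` is an N tensor, and the
+# call returns the indices of the boxes kept after suppression):
+#
+#   keep = nms(boxes, scores, 0.5)
+#   boxes, scores = boxes[keep], scores[keep]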
+# from ._utils import _C +from maskrcnn_benchmark import _C + +nms = _C.nms +# nms.__doc__ = """ +# This function performs Non-maximum suppresion""" diff --git a/maskrcnn_benchmark/layers/roi_align.py b/maskrcnn_benchmark/layers/roi_align.py new file mode 100644 index 0000000000000000000000000000000000000000..170c8f18696aed19c4b9533a51933264530a1530 --- /dev/null +++ b/maskrcnn_benchmark/layers/roi_align.py @@ -0,0 +1,68 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +import torch +from torch import nn +from torch.autograd import Function +from torch.autograd.function import once_differentiable +from torch.nn.modules.utils import _pair + +from maskrcnn_benchmark import _C + + +class _ROIAlign(Function): + @staticmethod + def forward(ctx, input, roi, output_size, spatial_scale, sampling_ratio): + ctx.save_for_backward(roi) + ctx.output_size = _pair(output_size) + ctx.spatial_scale = spatial_scale + ctx.sampling_ratio = sampling_ratio + ctx.input_shape = input.size() + output = _C.roi_align_forward( + input, roi, spatial_scale, output_size[0], output_size[1], sampling_ratio + ) + return output + + @staticmethod + @once_differentiable + def backward(ctx, grad_output): + rois, = ctx.saved_tensors + output_size = ctx.output_size + spatial_scale = ctx.spatial_scale + sampling_ratio = ctx.sampling_ratio + bs, ch, h, w = ctx.input_shape + grad_input = _C.roi_align_backward( + grad_output, + rois, + spatial_scale, + output_size[0], + output_size[1], + bs, + ch, + h, + w, + sampling_ratio, + ) + return grad_input, None, None, None, None + + +roi_align = _ROIAlign.apply + + +class ROIAlign(nn.Module): + def __init__(self, output_size, spatial_scale, sampling_ratio): + super(ROIAlign, self).__init__() + self.output_size = output_size + self.spatial_scale = spatial_scale + self.sampling_ratio = sampling_ratio + + def forward(self, input, rois): + return roi_align( + input, rois, self.output_size, self.spatial_scale, self.sampling_ratio + ) + + def __repr__(self): + tmpstr = self.__class__.__name__ + "(" + tmpstr += "output_size=" + str(self.output_size) + tmpstr += ", spatial_scale=" + str(self.spatial_scale) + tmpstr += ", sampling_ratio=" + str(self.sampling_ratio) + tmpstr += ")" + return tmpstr diff --git a/maskrcnn_benchmark/layers/roi_pool.py b/maskrcnn_benchmark/layers/roi_pool.py new file mode 100644 index 0000000000000000000000000000000000000000..c0e42756ee6fcd779387255391a30079a28f5e60 --- /dev/null +++ b/maskrcnn_benchmark/layers/roi_pool.py @@ -0,0 +1,63 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 
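+# RoIPool: each region of interest is mapped into feature-map coordinates via
+# spatial_scale, divided into an output_size x output_size grid, and
+# max-pooled per cell, producing a fixed-size feature for every proposal.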
+import torch +from torch import nn +from torch.autograd import Function +from torch.autograd.function import once_differentiable +from torch.nn.modules.utils import _pair + +from maskrcnn_benchmark import _C + + +class _ROIPool(Function): + @staticmethod + def forward(ctx, input, roi, output_size, spatial_scale): + ctx.output_size = _pair(output_size) + ctx.spatial_scale = spatial_scale + ctx.input_shape = input.size() + output, argmax = _C.roi_pool_forward( + input, roi, spatial_scale, output_size[0], output_size[1] + ) + ctx.save_for_backward(input, roi, argmax) + return output + + @staticmethod + @once_differentiable + def backward(ctx, grad_output): + input, rois, argmax = ctx.saved_tensors + output_size = ctx.output_size + spatial_scale = ctx.spatial_scale + bs, ch, h, w = ctx.input_shape + grad_input = _C.roi_pool_backward( + grad_output, + input, + rois, + argmax, + spatial_scale, + output_size[0], + output_size[1], + bs, + ch, + h, + w, + ) + return grad_input, None, None, None + + +roi_pool = _ROIPool.apply + + +class ROIPool(nn.Module): + def __init__(self, output_size, spatial_scale): + super(ROIPool, self).__init__() + self.output_size = output_size + self.spatial_scale = spatial_scale + + def forward(self, input, rois): + return roi_pool(input, rois, self.output_size, self.spatial_scale) + + def __repr__(self): + tmpstr = self.__class__.__name__ + "(" + tmpstr += "output_size=" + str(self.output_size) + tmpstr += ", spatial_scale=" + str(self.spatial_scale) + tmpstr += ")" + return tmpstr diff --git a/maskrcnn_benchmark/layers/scale.py b/maskrcnn_benchmark/layers/scale.py new file mode 100644 index 0000000000000000000000000000000000000000..2c25622e939b6cc19e07c485a6910c1a5ff8da3c --- /dev/null +++ b/maskrcnn_benchmark/layers/scale.py @@ -0,0 +1,11 @@ +import torch +from torch import nn + + +class Scale(nn.Module): + def __init__(self, init_value=1.0): + super(Scale, self).__init__() + self.scale = nn.Parameter(torch.FloatTensor([init_value])) + + def forward(self, input): + return input * self.scale diff --git a/maskrcnn_benchmark/layers/sigmoid_focal_loss.py b/maskrcnn_benchmark/layers/sigmoid_focal_loss.py new file mode 100644 index 0000000000000000000000000000000000000000..c42b4d69900e6222d972ee1296648eae97fec511 --- /dev/null +++ b/maskrcnn_benchmark/layers/sigmoid_focal_loss.py @@ -0,0 +1,76 @@ +import torch +from torch import nn +from torch.autograd import Function +from torch.autograd.function import once_differentiable + +from maskrcnn_benchmark import _C + +# TODO: Use JIT to replace CUDA implementation in the future. 
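+# Sigmoid focal loss (Lin et al., "Focal Loss for Dense Object Detection"):
+#   FL(p_t) = -alpha_t * (1 - p_t)^gamma * log(p_t)
+# computed per class with a sigmoid, so well-classified examples (p_t close
+# to 1) are down-weighted by the (1 - p_t)^gamma factor; the CPU fallback
+# below implements the same formula as the CUDA kernel.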
+class _SigmoidFocalLoss(Function): + @staticmethod + def forward(ctx, logits, targets, gamma, alpha): + ctx.save_for_backward(logits, targets) + num_classes = logits.shape[1] + ctx.num_classes = num_classes + ctx.gamma = gamma + ctx.alpha = alpha + + losses = _C.sigmoid_focalloss_forward( + logits, targets, num_classes, gamma, alpha + ) + return losses + + @staticmethod + @once_differentiable + def backward(ctx, d_loss): + logits, targets = ctx.saved_tensors + num_classes = ctx.num_classes + gamma = ctx.gamma + alpha = ctx.alpha + d_loss = d_loss.contiguous() + d_logits = _C.sigmoid_focalloss_backward( + logits, targets, d_loss, num_classes, gamma, alpha + ) + return d_logits, None, None, None, None + + +sigmoid_focal_loss_cuda = _SigmoidFocalLoss.apply + + +def sigmoid_focal_loss_cpu(logits, targets, gamma, alpha): + num_classes = logits.shape[1] + gamma = gamma[0] + alpha = alpha[0] + dtype = targets.dtype + device = targets.device + class_range = torch.arange(1, num_classes+1, dtype=dtype, device=device).unsqueeze(0) + + t = targets.unsqueeze(1) + p = torch.sigmoid(logits) + term1 = (1 - p) ** gamma * torch.log(p) + term2 = p ** gamma * torch.log(1 - p) + return -(t == class_range).float() * term1 * alpha - ((t != class_range) * (t >= 0)).float() * term2 * (1 - alpha) + + +class SigmoidFocalLoss(nn.Module): + def __init__(self, gamma, alpha): + super(SigmoidFocalLoss, self).__init__() + self.gamma = gamma + self.alpha = alpha + + def forward(self, logits, targets): + device = logits.device + if logits.is_cuda: + loss_func = sigmoid_focal_loss_cuda + else: + loss_func = sigmoid_focal_loss_cpu + + loss = loss_func(logits, targets, self.gamma, self.alpha) + return loss.sum() + + def __repr__(self): + tmpstr = self.__class__.__name__ + "(" + tmpstr += "gamma=" + str(self.gamma) + tmpstr += ", alpha=" + str(self.alpha) + tmpstr += ")" + return tmpstr diff --git a/maskrcnn_benchmark/layers/smooth_l1_loss.py b/maskrcnn_benchmark/layers/smooth_l1_loss.py new file mode 100644 index 0000000000000000000000000000000000000000..408a06e6a2c229c549e9ad2143826e3f7212e909 --- /dev/null +++ b/maskrcnn_benchmark/layers/smooth_l1_loss.py @@ -0,0 +1,18 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +import torch +import numpy as np + +# TODO maybe push this to nn? +def smooth_l1_loss(input, target, beta=1. 
/ 9, size_average=True): + """ + very similar to the smooth_l1_loss from pytorch, but with + the extra beta parameter + """ + n = torch.abs(input - target) + cond = n < beta + loss = torch.where(cond, 0.5 * n ** 2 / beta, n - 0.5 * beta) + if size_average: + return loss.mean() + return loss.sum() + + diff --git a/maskrcnn_benchmark/modeling/__init__.py b/maskrcnn_benchmark/modeling/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/maskrcnn_benchmark/modeling/__pycache__/__init__.cpython-37.pyc b/maskrcnn_benchmark/modeling/__pycache__/__init__.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..2ab781901b2f3a1ed9bc6142ffe9ef422cf4be65 Binary files /dev/null and b/maskrcnn_benchmark/modeling/__pycache__/__init__.cpython-37.pyc differ diff --git a/maskrcnn_benchmark/modeling/__pycache__/balanced_positive_negative_sampler.cpython-37.pyc b/maskrcnn_benchmark/modeling/__pycache__/balanced_positive_negative_sampler.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c463f1dc7267420042ba6eca641cc669c9882e05 Binary files /dev/null and b/maskrcnn_benchmark/modeling/__pycache__/balanced_positive_negative_sampler.cpython-37.pyc differ diff --git a/maskrcnn_benchmark/modeling/__pycache__/box_coder.cpython-37.pyc b/maskrcnn_benchmark/modeling/__pycache__/box_coder.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..73f36b72d46c5df237708f116322d2d8f5d30307 Binary files /dev/null and b/maskrcnn_benchmark/modeling/__pycache__/box_coder.cpython-37.pyc differ diff --git a/maskrcnn_benchmark/modeling/__pycache__/make_layers.cpython-37.pyc b/maskrcnn_benchmark/modeling/__pycache__/make_layers.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..49885f2475ca254faf872c9aae3e4b5ac88a7062 Binary files /dev/null and b/maskrcnn_benchmark/modeling/__pycache__/make_layers.cpython-37.pyc differ diff --git a/maskrcnn_benchmark/modeling/__pycache__/matcher.cpython-37.pyc b/maskrcnn_benchmark/modeling/__pycache__/matcher.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..71a7bccbdb73f65666d37ec0d6ddf30d61c2abfb Binary files /dev/null and b/maskrcnn_benchmark/modeling/__pycache__/matcher.cpython-37.pyc differ diff --git a/maskrcnn_benchmark/modeling/__pycache__/poolers.cpython-37.pyc b/maskrcnn_benchmark/modeling/__pycache__/poolers.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..eb65166368223e96fe2f5976828acf0c8cbd2f39 Binary files /dev/null and b/maskrcnn_benchmark/modeling/__pycache__/poolers.cpython-37.pyc differ diff --git a/maskrcnn_benchmark/modeling/__pycache__/registry.cpython-37.pyc b/maskrcnn_benchmark/modeling/__pycache__/registry.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..2fdf723eb51a74ddb0e88a38c48d930aa07469e2 Binary files /dev/null and b/maskrcnn_benchmark/modeling/__pycache__/registry.cpython-37.pyc differ diff --git a/maskrcnn_benchmark/modeling/__pycache__/utils.cpython-37.pyc b/maskrcnn_benchmark/modeling/__pycache__/utils.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..dd77f347e8904a48f0a85f60e3b67228067067f7 Binary files /dev/null and b/maskrcnn_benchmark/modeling/__pycache__/utils.cpython-37.pyc differ diff --git a/maskrcnn_benchmark/modeling/backbone/__init__.py b/maskrcnn_benchmark/modeling/backbone/__init__.py new file mode 100644 index 
0000000000000000000000000000000000000000..537ebe56e683f4c665bb9b60fed9a1811645d8e5 --- /dev/null +++ b/maskrcnn_benchmark/modeling/backbone/__init__.py @@ -0,0 +1,3 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +from .backbone import build_backbone +from . import fbnet diff --git a/maskrcnn_benchmark/modeling/backbone/__pycache__/__init__.cpython-37.pyc b/maskrcnn_benchmark/modeling/backbone/__pycache__/__init__.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..3c1acb7b7c716b8cae13cb8a0788ef4100f8d005 Binary files /dev/null and b/maskrcnn_benchmark/modeling/backbone/__pycache__/__init__.cpython-37.pyc differ diff --git a/maskrcnn_benchmark/modeling/backbone/__pycache__/backbone.cpython-37.pyc b/maskrcnn_benchmark/modeling/backbone/__pycache__/backbone.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d30113b53867c399ad4c68d8346e59d2028bdb73 Binary files /dev/null and b/maskrcnn_benchmark/modeling/backbone/__pycache__/backbone.cpython-37.pyc differ diff --git a/maskrcnn_benchmark/modeling/backbone/__pycache__/fbnet.cpython-37.pyc b/maskrcnn_benchmark/modeling/backbone/__pycache__/fbnet.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..a9e29322bb280acb1f3dc5c2e9d4bd86d13d1a31 Binary files /dev/null and b/maskrcnn_benchmark/modeling/backbone/__pycache__/fbnet.cpython-37.pyc differ diff --git a/maskrcnn_benchmark/modeling/backbone/__pycache__/fbnet_builder.cpython-37.pyc b/maskrcnn_benchmark/modeling/backbone/__pycache__/fbnet_builder.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..0558ee4cfbf387774e878d7f68741835dc235434 Binary files /dev/null and b/maskrcnn_benchmark/modeling/backbone/__pycache__/fbnet_builder.cpython-37.pyc differ diff --git a/maskrcnn_benchmark/modeling/backbone/__pycache__/fbnet_modeldef.cpython-37.pyc b/maskrcnn_benchmark/modeling/backbone/__pycache__/fbnet_modeldef.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..1a908e013e2e49b86c7b958b36384a409ff74144 Binary files /dev/null and b/maskrcnn_benchmark/modeling/backbone/__pycache__/fbnet_modeldef.cpython-37.pyc differ diff --git a/maskrcnn_benchmark/modeling/backbone/__pycache__/fpn.cpython-37.pyc b/maskrcnn_benchmark/modeling/backbone/__pycache__/fpn.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..0f4e225cbc5c8ea55f42c87760258133bc66d39f Binary files /dev/null and b/maskrcnn_benchmark/modeling/backbone/__pycache__/fpn.cpython-37.pyc differ diff --git a/maskrcnn_benchmark/modeling/backbone/__pycache__/msr.cpython-37.pyc b/maskrcnn_benchmark/modeling/backbone/__pycache__/msr.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..9872875bf5c3f7fc2b1103b3f447f15b6952df1d Binary files /dev/null and b/maskrcnn_benchmark/modeling/backbone/__pycache__/msr.cpython-37.pyc differ diff --git a/maskrcnn_benchmark/modeling/backbone/__pycache__/pan.cpython-37.pyc b/maskrcnn_benchmark/modeling/backbone/__pycache__/pan.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..8fc79c0279cec443d54aa1225f818a8fe413d29c Binary files /dev/null and b/maskrcnn_benchmark/modeling/backbone/__pycache__/pan.cpython-37.pyc differ diff --git a/maskrcnn_benchmark/modeling/backbone/__pycache__/resnet.cpython-37.pyc b/maskrcnn_benchmark/modeling/backbone/__pycache__/resnet.cpython-37.pyc new file mode 100644 index 
0000000000000000000000000000000000000000..68a221cd9b27243a66508dd51c81fc199a617f15 Binary files /dev/null and b/maskrcnn_benchmark/modeling/backbone/__pycache__/resnet.cpython-37.pyc differ diff --git a/maskrcnn_benchmark/modeling/backbone/backbone.py b/maskrcnn_benchmark/modeling/backbone/backbone.py new file mode 100644 index 0000000000000000000000000000000000000000..105d6dc54c888e8a25482c95be7b27d12abad47c --- /dev/null +++ b/maskrcnn_benchmark/modeling/backbone/backbone.py @@ -0,0 +1,119 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +from collections import OrderedDict + +from torch import nn + +from maskrcnn_benchmark.modeling import registry +from maskrcnn_benchmark.modeling.make_layers import conv_with_kaiming_uniform +from . import fpn as fpn_module +from .pan import PAN +from .msr import MSR +from . import resnet + + +@registry.BACKBONES.register("R-50-C4") +@registry.BACKBONES.register("R-50-C5") +@registry.BACKBONES.register("R-101-C4") +@registry.BACKBONES.register("R-101-C5") +def build_resnet_backbone(cfg): + body = resnet.ResNet(cfg) + model = nn.Sequential(OrderedDict([("body", body)])) + model.out_channels = cfg.MODEL.RESNETS.BACKBONE_OUT_CHANNELS + return model + + +@registry.BACKBONES.register("R-50-FPN") +@registry.BACKBONES.register("R-101-FPN") +@registry.BACKBONES.register("R-152-FPN") +def build_resnet_fpn_backbone(cfg): + in_channels_stage2 = cfg.MODEL.RESNETS.RES2_OUT_CHANNELS # 256 + in_channels_list = [ + in_channels_stage2, # 256 + in_channels_stage2 * 2, # 512 + in_channels_stage2 * 4, # 1024 + in_channels_stage2 * 8, # 2048 + ] + body = resnet.ResNet(cfg) + out_channels = cfg.MODEL.RESNETS.BACKBONE_OUT_CHANNELS # 256 + fpn = fpn_module.FPN( + in_channels_list=in_channels_list, + out_channels=out_channels, + conv_block=conv_with_kaiming_uniform( + cfg.MODEL.FPN.USE_GN, cfg.MODEL.FPN.USE_RELU, + cfg.MODEL.FPN.USE_DEFORMABLE + ), + top_blocks=fpn_module.LastLevelMaxPool(), + ) + if cfg.MODEL.MSR_ON: + model = MSR(body, in_channels_list, fpn=fpn) + else: + model = nn.Sequential(OrderedDict([("body", body), ("fpn", fpn)])) + model.out_channels = out_channels + return model + + +@registry.BACKBONES.register("R-50-PAN") +@registry.BACKBONES.register("R-101-PAN") +@registry.BACKBONES.register("R-152-PAN") +def build_resnet_fpn_backbone(cfg): + in_channels_stage2 = cfg.MODEL.RESNETS.RES2_OUT_CHANNELS + in_channels_list = [ + in_channels_stage2, + in_channels_stage2 * 2, + in_channels_stage2 * 4, + in_channels_stage2 * 8, + ] + body = resnet.ResNet(cfg) + out_channels = cfg.MODEL.RESNETS.BACKBONE_OUT_CHANNELS + fpn = fpn_module.FPN( + in_channels_list=in_channels_list, + out_channels=out_channels, + conv_block=conv_with_kaiming_uniform( + cfg.MODEL.FPN.USE_GN, cfg.MODEL.FPN.USE_RELU, + cfg.MODEL.FPN.USE_DEFORMABLE + ), + top_blocks=fpn_module.LastLevelMaxPool(), + ) + pan = PAN() + if cfg.MODEL.MSR_ON: + model = MSR(body, in_channels_list, fpn=fpn, pan=pan) + else: + model = nn.Sequential(OrderedDict([("body", body), + ("pan", pan), + ("fpn", fpn)])) + model.out_channels = out_channels + return model + + +@registry.BACKBONES.register("R-50-FPN-RETINANET") +@registry.BACKBONES.register("R-101-FPN-RETINANET") +def build_resnet_fpn_p3p7_backbone(cfg): + body = resnet.ResNet(cfg) + in_channels_stage2 = cfg.MODEL.RESNETS.RES2_OUT_CHANNELS + out_channels = cfg.MODEL.RESNETS.BACKBONE_OUT_CHANNELS + in_channels_p6p7 = in_channels_stage2 * 8 if cfg.MODEL.RETINANET.USE_C5 \ + else out_channels + fpn = fpn_module.FPN( + in_channels_list=[ + 0, + 
in_channels_stage2 * 2, + in_channels_stage2 * 4, + in_channels_stage2 * 8, + ], + out_channels=out_channels, + conv_block=conv_with_kaiming_uniform( + cfg.MODEL.FPN.USE_GN, cfg.MODEL.FPN.USE_RELU + ), + top_blocks=fpn_module.LastLevelP6P7(in_channels_p6p7, out_channels), + ) + model = nn.Sequential(OrderedDict([("body", body), ("fpn", fpn)])) + model.out_channels = out_channels + return model + + +def build_backbone(cfg): + assert cfg.MODEL.BACKBONE.CONV_BODY in registry.BACKBONES, \ + "cfg.MODEL.BACKBONE.CONV_BODY: {} are not registered in registry".format( + cfg.MODEL.BACKBONE.CONV_BODY + ) + return registry.BACKBONES[cfg.MODEL.BACKBONE.CONV_BODY](cfg) diff --git a/maskrcnn_benchmark/modeling/backbone/fbnet.py b/maskrcnn_benchmark/modeling/backbone/fbnet.py new file mode 100644 index 0000000000000000000000000000000000000000..0d8cf1522f61dd77c4c8617a1555a004509e4352 --- /dev/null +++ b/maskrcnn_benchmark/modeling/backbone/fbnet.py @@ -0,0 +1,252 @@ +from __future__ import absolute_import, division, print_function, unicode_literals + +import copy +import json +import logging +from collections import OrderedDict + +from . import ( + fbnet_builder as mbuilder, + fbnet_modeldef as modeldef, +) +import torch.nn as nn +from maskrcnn_benchmark.modeling import registry +from maskrcnn_benchmark.modeling.rpn import rpn +from maskrcnn_benchmark.modeling import poolers + + +logger = logging.getLogger(__name__) + + +def create_builder(cfg): + bn_type = cfg.MODEL.FBNET.BN_TYPE + if bn_type == "gn": + bn_type = (bn_type, cfg.GROUP_NORM.NUM_GROUPS) + factor = cfg.MODEL.FBNET.SCALE_FACTOR + + arch = cfg.MODEL.FBNET.ARCH + arch_def = cfg.MODEL.FBNET.ARCH_DEF + if len(arch_def) > 0: + arch_def = json.loads(arch_def) + if arch in modeldef.MODEL_ARCH: + if len(arch_def) > 0: + assert ( + arch_def == modeldef.MODEL_ARCH[arch] + ), "Two architectures with the same name {},\n{},\n{}".format( + arch, arch_def, modeldef.MODEL_ARCH[arch] + ) + arch_def = modeldef.MODEL_ARCH[arch] + else: + assert arch_def is not None and len(arch_def) > 0 + arch_def = mbuilder.unify_arch_def(arch_def) + + rpn_stride = arch_def.get("rpn_stride", None) + if rpn_stride is not None: + assert ( + cfg.MODEL.RPN.ANCHOR_STRIDE[0] == rpn_stride + ), "Needs to set cfg.MODEL.RPN.ANCHOR_STRIDE to {}, got {}".format( + rpn_stride, cfg.MODEL.RPN.ANCHOR_STRIDE + ) + width_divisor = cfg.MODEL.FBNET.WIDTH_DIVISOR + dw_skip_bn = cfg.MODEL.FBNET.DW_CONV_SKIP_BN + dw_skip_relu = cfg.MODEL.FBNET.DW_CONV_SKIP_RELU + + logger.info( + "Building fbnet model with arch {} (without scaling):\n{}".format( + arch, arch_def + ) + ) + + builder = mbuilder.FBNetBuilder( + width_ratio=factor, + bn_type=bn_type, + width_divisor=width_divisor, + dw_skip_bn=dw_skip_bn, + dw_skip_relu=dw_skip_relu, + ) + + return builder, arch_def + + +def _get_trunk_cfg(arch_def): + """ Get all stages except the last one """ + num_stages = mbuilder.get_num_stages(arch_def) + trunk_stages = arch_def.get("backbone", range(num_stages - 1)) + ret = mbuilder.get_blocks(arch_def, stage_indices=trunk_stages) + return ret + + +class FBNetTrunk(nn.Module): + def __init__( + self, builder, arch_def, dim_in, + ): + super(FBNetTrunk, self).__init__() + self.first = builder.add_first(arch_def["first"], dim_in=dim_in) + trunk_cfg = _get_trunk_cfg(arch_def) + self.stages = builder.add_blocks(trunk_cfg["stages"]) + + # return features for each stage + def forward(self, x): + y = self.first(x) + y = self.stages(y) + ret = [y] + return ret + + +@registry.BACKBONES.register("FBNet") +def 
add_conv_body(cfg, dim_in=3): + builder, arch_def = create_builder(cfg) + + body = FBNetTrunk(builder, arch_def, dim_in) + model = nn.Sequential(OrderedDict([("body", body)])) + model.out_channels = builder.last_depth + + return model + + +def _get_rpn_stage(arch_def, num_blocks): + rpn_stage = arch_def.get("rpn") + ret = mbuilder.get_blocks(arch_def, stage_indices=rpn_stage) + if num_blocks > 0: + logger.warn('Use last {} blocks in {} as rpn'.format(num_blocks, ret)) + block_count = len(ret["stages"]) + assert num_blocks <= block_count, "use block {}, block count {}".format( + num_blocks, block_count + ) + blocks = range(block_count - num_blocks, block_count) + ret = mbuilder.get_blocks(ret, block_indices=blocks) + return ret["stages"] + + +class FBNetRPNHead(nn.Module): + def __init__( + self, cfg, in_channels, builder, arch_def, + ): + super(FBNetRPNHead, self).__init__() + assert in_channels == builder.last_depth + + rpn_bn_type = cfg.MODEL.FBNET.RPN_BN_TYPE + if len(rpn_bn_type) > 0: + builder.bn_type = rpn_bn_type + + use_blocks = cfg.MODEL.FBNET.RPN_HEAD_BLOCKS + stages = _get_rpn_stage(arch_def, use_blocks) + + self.head = builder.add_blocks(stages) + self.out_channels = builder.last_depth + + def forward(self, x): + x = [self.head(y) for y in x] + return x + + +@registry.RPN_HEADS.register("FBNet.rpn_head") +def add_rpn_head(cfg, in_channels, num_anchors): + builder, model_arch = create_builder(cfg) + builder.last_depth = in_channels + + assert in_channels == builder.last_depth + # builder.name_prefix = "[rpn]" + + rpn_feature = FBNetRPNHead(cfg, in_channels, builder, model_arch) + rpn_regressor = rpn.RPNHeadConvRegressor( + cfg, rpn_feature.out_channels, num_anchors) + return nn.Sequential(rpn_feature, rpn_regressor) + + +def _get_head_stage(arch, head_name, blocks): + # use default name 'head' if the specific name 'head_name' does not existed + if head_name not in arch: + head_name = "head" + head_stage = arch.get(head_name) + ret = mbuilder.get_blocks(arch, stage_indices=head_stage, block_indices=blocks) + return ret["stages"] + + +# name mapping for head names in arch def and cfg +ARCH_CFG_NAME_MAPPING = { + "bbox": "ROI_BOX_HEAD", + "kpts": "ROI_KEYPOINT_HEAD", + "mask": "ROI_MASK_HEAD", +} + + +class FBNetROIHead(nn.Module): + def __init__( + self, cfg, in_channels, builder, arch_def, + head_name, use_blocks, stride_init, last_layer_scale, + ): + super(FBNetROIHead, self).__init__() + assert in_channels == builder.last_depth + assert isinstance(use_blocks, list) + + head_cfg_name = ARCH_CFG_NAME_MAPPING[head_name] + self.pooler = poolers.make_pooler(cfg, head_cfg_name) + + stage = _get_head_stage(arch_def, head_name, use_blocks) + + assert stride_init in [0, 1, 2] + if stride_init != 0: + stage[0]["block"][3] = stride_init + blocks = builder.add_blocks(stage) + + last_info = copy.deepcopy(arch_def["last"]) + last_info[1] = last_layer_scale + last = builder.add_last(last_info) + + self.head = nn.Sequential(OrderedDict([ + ("blocks", blocks), + ("last", last) + ])) + + self.out_channels = builder.last_depth + + def forward(self, x, proposals): + x = self.pooler(x, proposals) + x = self.head(x) + return x + + +@registry.ROI_BOX_FEATURE_EXTRACTORS.register("FBNet.roi_head") +def add_roi_head(cfg, in_channels): + builder, model_arch = create_builder(cfg) + builder.last_depth = in_channels + # builder.name_prefix = "_[bbox]_" + + return FBNetROIHead( + cfg, in_channels, builder, model_arch, + head_name="bbox", + use_blocks=cfg.MODEL.FBNET.DET_HEAD_BLOCKS, + 
stride_init=cfg.MODEL.FBNET.DET_HEAD_STRIDE, + last_layer_scale=cfg.MODEL.FBNET.DET_HEAD_LAST_SCALE, + ) + + +@registry.ROI_KEYPOINT_FEATURE_EXTRACTORS.register("FBNet.roi_head_keypoints") +def add_roi_head_keypoints(cfg, in_channels): + builder, model_arch = create_builder(cfg) + builder.last_depth = in_channels + # builder.name_prefix = "_[kpts]_" + + return FBNetROIHead( + cfg, in_channels, builder, model_arch, + head_name="kpts", + use_blocks=cfg.MODEL.FBNET.KPTS_HEAD_BLOCKS, + stride_init=cfg.MODEL.FBNET.KPTS_HEAD_STRIDE, + last_layer_scale=cfg.MODEL.FBNET.KPTS_HEAD_LAST_SCALE, + ) + + +@registry.ROI_MASK_FEATURE_EXTRACTORS.register("FBNet.roi_head_mask") +def add_roi_head_mask(cfg, in_channels): + builder, model_arch = create_builder(cfg) + builder.last_depth = in_channels + # builder.name_prefix = "_[mask]_" + + return FBNetROIHead( + cfg, in_channels, builder, model_arch, + head_name="mask", + use_blocks=cfg.MODEL.FBNET.MASK_HEAD_BLOCKS, + stride_init=cfg.MODEL.FBNET.MASK_HEAD_STRIDE, + last_layer_scale=cfg.MODEL.FBNET.MASK_HEAD_LAST_SCALE, + ) diff --git a/maskrcnn_benchmark/modeling/backbone/fbnet_builder.py b/maskrcnn_benchmark/modeling/backbone/fbnet_builder.py new file mode 100644 index 0000000000000000000000000000000000000000..112a04074c31307d9080e0bf61115f79d4a9e0d4 --- /dev/null +++ b/maskrcnn_benchmark/modeling/backbone/fbnet_builder.py @@ -0,0 +1,829 @@ +""" +FBNet model builder +""" + +from __future__ import absolute_import, division, print_function, unicode_literals + +import copy +import logging +import math +from collections import OrderedDict + +import torch +import torch.nn as nn +from maskrcnn_benchmark.layers import ( + BatchNorm2d, + Conv2d, + FrozenBatchNorm2d, + interpolate, +) +from maskrcnn_benchmark.layers.misc import _NewEmptyTensorOp + + +logger = logging.getLogger(__name__) + + +def _py2_round(x): + return math.floor(x + 0.5) if x >= 0.0 else math.ceil(x - 0.5) + + +def _get_divisible_by(num, divisible_by, min_val): + ret = int(num) + if divisible_by > 0 and num % divisible_by != 0: + ret = int((_py2_round(num / divisible_by) or min_val) * divisible_by) + return ret + + +PRIMITIVES = { + "skip": lambda C_in, C_out, expansion, stride, **kwargs: Identity( + C_in, C_out, stride + ), + "ir_k3": lambda C_in, C_out, expansion, stride, **kwargs: IRFBlock( + C_in, C_out, expansion, stride, **kwargs + ), + "ir_k5": lambda C_in, C_out, expansion, stride, **kwargs: IRFBlock( + C_in, C_out, expansion, stride, kernel=5, **kwargs + ), + "ir_k7": lambda C_in, C_out, expansion, stride, **kwargs: IRFBlock( + C_in, C_out, expansion, stride, kernel=7, **kwargs + ), + "ir_k1": lambda C_in, C_out, expansion, stride, **kwargs: IRFBlock( + C_in, C_out, expansion, stride, kernel=1, **kwargs + ), + "shuffle": lambda C_in, C_out, expansion, stride, **kwargs: IRFBlock( + C_in, C_out, expansion, stride, shuffle_type="mid", pw_group=4, **kwargs + ), + "basic_block": lambda C_in, C_out, expansion, stride, **kwargs: CascadeConv3x3( + C_in, C_out, stride + ), + "shift_5x5": lambda C_in, C_out, expansion, stride, **kwargs: ShiftBlock5x5( + C_in, C_out, expansion, stride + ), + # layer search 2 + "ir_k3_e1": lambda C_in, C_out, expansion, stride, **kwargs: IRFBlock( + C_in, C_out, 1, stride, kernel=3, **kwargs + ), + "ir_k3_e3": lambda C_in, C_out, expansion, stride, **kwargs: IRFBlock( + C_in, C_out, 3, stride, kernel=3, **kwargs + ), + "ir_k3_e6": lambda C_in, C_out, expansion, stride, **kwargs: IRFBlock( + C_in, C_out, 6, stride, kernel=3, **kwargs + ), + "ir_k3_s4": lambda C_in, 
C_out, expansion, stride, **kwargs: IRFBlock( + C_in, C_out, 4, stride, kernel=3, shuffle_type="mid", pw_group=4, **kwargs + ), + "ir_k5_e1": lambda C_in, C_out, expansion, stride, **kwargs: IRFBlock( + C_in, C_out, 1, stride, kernel=5, **kwargs + ), + "ir_k5_e3": lambda C_in, C_out, expansion, stride, **kwargs: IRFBlock( + C_in, C_out, 3, stride, kernel=5, **kwargs + ), + "ir_k5_e6": lambda C_in, C_out, expansion, stride, **kwargs: IRFBlock( + C_in, C_out, 6, stride, kernel=5, **kwargs + ), + "ir_k5_s4": lambda C_in, C_out, expansion, stride, **kwargs: IRFBlock( + C_in, C_out, 4, stride, kernel=5, shuffle_type="mid", pw_group=4, **kwargs + ), + # layer search se + "ir_k3_e1_se": lambda C_in, C_out, expansion, stride, **kwargs: IRFBlock( + C_in, C_out, 1, stride, kernel=3, se=True, **kwargs + ), + "ir_k3_e3_se": lambda C_in, C_out, expansion, stride, **kwargs: IRFBlock( + C_in, C_out, 3, stride, kernel=3, se=True, **kwargs + ), + "ir_k3_e6_se": lambda C_in, C_out, expansion, stride, **kwargs: IRFBlock( + C_in, C_out, 6, stride, kernel=3, se=True, **kwargs + ), + "ir_k3_s4_se": lambda C_in, C_out, expansion, stride, **kwargs: IRFBlock( + C_in, + C_out, + 4, + stride, + kernel=3, + shuffle_type="mid", + pw_group=4, + se=True, + **kwargs + ), + "ir_k5_e1_se": lambda C_in, C_out, expansion, stride, **kwargs: IRFBlock( + C_in, C_out, 1, stride, kernel=5, se=True, **kwargs + ), + "ir_k5_e3_se": lambda C_in, C_out, expansion, stride, **kwargs: IRFBlock( + C_in, C_out, 3, stride, kernel=5, se=True, **kwargs + ), + "ir_k5_e6_se": lambda C_in, C_out, expansion, stride, **kwargs: IRFBlock( + C_in, C_out, 6, stride, kernel=5, se=True, **kwargs + ), + "ir_k5_s4_se": lambda C_in, C_out, expansion, stride, **kwargs: IRFBlock( + C_in, + C_out, + 4, + stride, + kernel=5, + shuffle_type="mid", + pw_group=4, + se=True, + **kwargs + ), + # layer search 3 (in addition to layer search 2) + "ir_k3_s2": lambda C_in, C_out, expansion, stride, **kwargs: IRFBlock( + C_in, C_out, 1, stride, kernel=3, shuffle_type="mid", pw_group=2, **kwargs + ), + "ir_k5_s2": lambda C_in, C_out, expansion, stride, **kwargs: IRFBlock( + C_in, C_out, 1, stride, kernel=5, shuffle_type="mid", pw_group=2, **kwargs + ), + "ir_k3_s2_se": lambda C_in, C_out, expansion, stride, **kwargs: IRFBlock( + C_in, + C_out, + 1, + stride, + kernel=3, + shuffle_type="mid", + pw_group=2, + se=True, + **kwargs + ), + "ir_k5_s2_se": lambda C_in, C_out, expansion, stride, **kwargs: IRFBlock( + C_in, + C_out, + 1, + stride, + kernel=5, + shuffle_type="mid", + pw_group=2, + se=True, + **kwargs + ), + # layer search 4 (in addition to layer search 3) + "ir_k3_sep": lambda C_in, C_out, expansion, stride, **kwargs: IRFBlock( + C_in, C_out, expansion, stride, kernel=3, cdw=True, **kwargs + ), + "ir_k33_e1": lambda C_in, C_out, expansion, stride, **kwargs: IRFBlock( + C_in, C_out, 1, stride, kernel=3, cdw=True, **kwargs + ), + "ir_k33_e3": lambda C_in, C_out, expansion, stride, **kwargs: IRFBlock( + C_in, C_out, 3, stride, kernel=3, cdw=True, **kwargs + ), + "ir_k33_e6": lambda C_in, C_out, expansion, stride, **kwargs: IRFBlock( + C_in, C_out, 6, stride, kernel=3, cdw=True, **kwargs + ), + # layer search 5 (in addition to layer search 4) + "ir_k7_e1": lambda C_in, C_out, expansion, stride, **kwargs: IRFBlock( + C_in, C_out, 1, stride, kernel=7, **kwargs + ), + "ir_k7_e3": lambda C_in, C_out, expansion, stride, **kwargs: IRFBlock( + C_in, C_out, 3, stride, kernel=7, **kwargs + ), + "ir_k7_e6": lambda C_in, C_out, expansion, stride, **kwargs: IRFBlock( + C_in, C_out, 
6, stride, kernel=7, **kwargs + ), + "ir_k7_sep": lambda C_in, C_out, expansion, stride, **kwargs: IRFBlock( + C_in, C_out, expansion, stride, kernel=7, cdw=True, **kwargs + ), + "ir_k7_sep_e1": lambda C_in, C_out, expansion, stride, **kwargs: IRFBlock( + C_in, C_out, 1, stride, kernel=7, cdw=True, **kwargs + ), + "ir_k7_sep_e3": lambda C_in, C_out, expansion, stride, **kwargs: IRFBlock( + C_in, C_out, 3, stride, kernel=7, cdw=True, **kwargs + ), + "ir_k7_sep_e6": lambda C_in, C_out, expansion, stride, **kwargs: IRFBlock( + C_in, C_out, 6, stride, kernel=7, cdw=True, **kwargs + ), +} + + +class Identity(nn.Module): + def __init__(self, C_in, C_out, stride): + super(Identity, self).__init__() + self.conv = ( + ConvBNRelu( + C_in, + C_out, + kernel=1, + stride=stride, + pad=0, + no_bias=1, + use_relu="relu", + bn_type="bn", + ) + if C_in != C_out or stride != 1 + else None + ) + + def forward(self, x): + if self.conv: + out = self.conv(x) + else: + out = x + return out + + +class CascadeConv3x3(nn.Sequential): + def __init__(self, C_in, C_out, stride): + assert stride in [1, 2] + ops = [ + Conv2d(C_in, C_in, 3, stride, 1, bias=False), + BatchNorm2d(C_in), + nn.ReLU(inplace=True), + Conv2d(C_in, C_out, 3, 1, 1, bias=False), + BatchNorm2d(C_out), + ] + super(CascadeConv3x3, self).__init__(*ops) + self.res_connect = (stride == 1) and (C_in == C_out) + + def forward(self, x): + y = super(CascadeConv3x3, self).forward(x) + if self.res_connect: + y += x + return y + + +class Shift(nn.Module): + def __init__(self, C, kernel_size, stride, padding): + super(Shift, self).__init__() + self.C = C + kernel = torch.zeros((C, 1, kernel_size, kernel_size), dtype=torch.float32) + ch_idx = 0 + + assert stride in [1, 2] + self.stride = stride + self.padding = padding + self.kernel_size = kernel_size + self.dilation = 1 + + hks = kernel_size // 2 + ksq = kernel_size ** 2 + + for i in range(kernel_size): + for j in range(kernel_size): + if i == hks and j == hks: + num_ch = C // ksq + C % ksq + else: + num_ch = C // ksq + kernel[ch_idx : ch_idx + num_ch, 0, i, j] = 1 + ch_idx += num_ch + + self.register_parameter("bias", None) + self.kernel = nn.Parameter(kernel, requires_grad=False) + + def forward(self, x): + if x.numel() > 0: + return nn.functional.conv2d( + x, + self.kernel, + self.bias, + (self.stride, self.stride), + (self.padding, self.padding), + self.dilation, + self.C, # groups + ) + + output_shape = [ + (i + 2 * p - (di * (k - 1) + 1)) // d + 1 + for i, p, di, k, d in zip( + x.shape[-2:], + (self.padding, self.dilation), + (self.dilation, self.dilation), + (self.kernel_size, self.kernel_size), + (self.stride, self.stride), + ) + ] + output_shape = [x.shape[0], self.C] + output_shape + return _NewEmptyTensorOp.apply(x, output_shape) + + +class ShiftBlock5x5(nn.Sequential): + def __init__(self, C_in, C_out, expansion, stride): + assert stride in [1, 2] + self.res_connect = (stride == 1) and (C_in == C_out) + + C_mid = _get_divisible_by(C_in * expansion, 8, 8) + + ops = [ + # pw + Conv2d(C_in, C_mid, 1, 1, 0, bias=False), + BatchNorm2d(C_mid), + nn.ReLU(inplace=True), + # shift + Shift(C_mid, 5, stride, 2), + # pw-linear + Conv2d(C_mid, C_out, 1, 1, 0, bias=False), + BatchNorm2d(C_out), + ] + super(ShiftBlock5x5, self).__init__(*ops) + + def forward(self, x): + y = super(ShiftBlock5x5, self).forward(x) + if self.res_connect: + y += x + return y + + +class ChannelShuffle(nn.Module): + def __init__(self, groups): + super(ChannelShuffle, self).__init__() + self.groups = groups + + def forward(self, x): + 
"""Channel shuffle: [N,C,H,W] -> [N,g,C/g,H,W] -> [N,C/g,g,H,w] -> [N,C,H,W]""" + N, C, H, W = x.size() + g = self.groups + assert C % g == 0, "Incompatible group size {} for input channel {}".format( + g, C + ) + return ( + x.view(N, g, int(C / g), H, W) + .permute(0, 2, 1, 3, 4) + .contiguous() + .view(N, C, H, W) + ) + + +class ConvBNRelu(nn.Sequential): + def __init__( + self, + input_depth, + output_depth, + kernel, + stride, + pad, + no_bias, + use_relu, + bn_type, + group=1, + *args, + **kwargs + ): + super(ConvBNRelu, self).__init__() + + assert use_relu in ["relu", None] + if isinstance(bn_type, (list, tuple)): + assert len(bn_type) == 2 + assert bn_type[0] == "gn" + gn_group = bn_type[1] + bn_type = bn_type[0] + assert bn_type in ["bn", "af", "gn", None] + assert stride in [1, 2, 4] + + op = Conv2d( + input_depth, + output_depth, + kernel_size=kernel, + stride=stride, + padding=pad, + bias=not no_bias, + groups=group, + *args, + **kwargs + ) + nn.init.kaiming_normal_(op.weight, mode="fan_out", nonlinearity="relu") + if op.bias is not None: + nn.init.constant_(op.bias, 0.0) + self.add_module("conv", op) + + if bn_type == "bn": + bn_op = BatchNorm2d(output_depth) + elif bn_type == "gn": + bn_op = nn.GroupNorm(num_groups=gn_group, num_channels=output_depth) + elif bn_type == "af": + bn_op = FrozenBatchNorm2d(output_depth) + if bn_type is not None: + self.add_module("bn", bn_op) + + if use_relu == "relu": + self.add_module("relu", nn.ReLU(inplace=True)) + + +class SEModule(nn.Module): + reduction = 4 + + def __init__(self, C): + super(SEModule, self).__init__() + mid = max(C // self.reduction, 8) + conv1 = Conv2d(C, mid, 1, 1, 0) + conv2 = Conv2d(mid, C, 1, 1, 0) + + self.op = nn.Sequential( + nn.AdaptiveAvgPool2d(1), conv1, nn.ReLU(inplace=True), conv2, nn.Sigmoid() + ) + + def forward(self, x): + return x * self.op(x) + + +class Upsample(nn.Module): + def __init__(self, scale_factor, mode, align_corners=None): + super(Upsample, self).__init__() + self.scale = scale_factor + self.mode = mode + self.align_corners = align_corners + + def forward(self, x): + return interpolate( + x, scale_factor=self.scale, mode=self.mode, + align_corners=self.align_corners + ) + + +def _get_upsample_op(stride): + assert ( + stride in [1, 2, 4] + or stride in [-1, -2, -4] + or (isinstance(stride, tuple) and all(x in [-1, -2, -4] for x in stride)) + ) + + scales = stride + ret = None + if isinstance(stride, tuple) or stride < 0: + scales = [-x for x in stride] if isinstance(stride, tuple) else -stride + stride = 1 + ret = Upsample(scale_factor=scales, mode="nearest", align_corners=None) + + return ret, stride + + +class IRFBlock(nn.Module): + def __init__( + self, + input_depth, + output_depth, + expansion, + stride, + bn_type="bn", + kernel=3, + width_divisor=1, + shuffle_type=None, + pw_group=1, + se=False, + cdw=False, + dw_skip_bn=False, + dw_skip_relu=False, + ): + super(IRFBlock, self).__init__() + + assert kernel in [1, 3, 5, 7], kernel + + self.use_res_connect = stride == 1 and input_depth == output_depth + self.output_depth = output_depth + + mid_depth = int(input_depth * expansion) + mid_depth = _get_divisible_by(mid_depth, width_divisor, width_divisor) + + # pw + self.pw = ConvBNRelu( + input_depth, + mid_depth, + kernel=1, + stride=1, + pad=0, + no_bias=1, + use_relu="relu", + bn_type=bn_type, + group=pw_group, + ) + + # negative stride to do upsampling + self.upscale, stride = _get_upsample_op(stride) + + # dw + if kernel == 1: + self.dw = nn.Sequential() + elif cdw: + dw1 = ConvBNRelu( + 
mid_depth, + mid_depth, + kernel=kernel, + stride=stride, + pad=(kernel // 2), + group=mid_depth, + no_bias=1, + use_relu="relu", + bn_type=bn_type, + ) + dw2 = ConvBNRelu( + mid_depth, + mid_depth, + kernel=kernel, + stride=1, + pad=(kernel // 2), + group=mid_depth, + no_bias=1, + use_relu="relu" if not dw_skip_relu else None, + bn_type=bn_type if not dw_skip_bn else None, + ) + self.dw = nn.Sequential(OrderedDict([("dw1", dw1), ("dw2", dw2)])) + else: + self.dw = ConvBNRelu( + mid_depth, + mid_depth, + kernel=kernel, + stride=stride, + pad=(kernel // 2), + group=mid_depth, + no_bias=1, + use_relu="relu" if not dw_skip_relu else None, + bn_type=bn_type if not dw_skip_bn else None, + ) + + # pw-linear + self.pwl = ConvBNRelu( + mid_depth, + output_depth, + kernel=1, + stride=1, + pad=0, + no_bias=1, + use_relu=None, + bn_type=bn_type, + group=pw_group, + ) + + self.shuffle_type = shuffle_type + if shuffle_type is not None: + self.shuffle = ChannelShuffle(pw_group) + + self.se4 = SEModule(output_depth) if se else nn.Sequential() + + self.output_depth = output_depth + + def forward(self, x): + y = self.pw(x) + if self.shuffle_type == "mid": + y = self.shuffle(y) + if self.upscale is not None: + y = self.upscale(y) + y = self.dw(y) + y = self.pwl(y) + if self.use_res_connect: + y += x + y = self.se4(y) + return y + + +def _expand_block_cfg(block_cfg): + assert isinstance(block_cfg, list) + ret = [] + for idx in range(block_cfg[2]): + cur = copy.deepcopy(block_cfg) + cur[2] = 1 + cur[3] = 1 if idx >= 1 else cur[3] + ret.append(cur) + return ret + + +def expand_stage_cfg(stage_cfg): + """ For a single stage """ + assert isinstance(stage_cfg, list) + ret = [] + for x in stage_cfg: + ret += _expand_block_cfg(x) + return ret + + +def expand_stages_cfg(stage_cfgs): + """ For a list of stages """ + assert isinstance(stage_cfgs, list) + ret = [] + for x in stage_cfgs: + ret.append(expand_stage_cfg(x)) + return ret + + +def _block_cfgs_to_list(block_cfgs): + assert isinstance(block_cfgs, list) + ret = [] + for stage_idx, stage in enumerate(block_cfgs): + stage = expand_stage_cfg(stage) + for block_idx, block in enumerate(stage): + cur = {"stage_idx": stage_idx, "block_idx": block_idx, "block": block} + ret.append(cur) + return ret + + +def _add_to_arch(arch, info, name): + """ arch = [{block_0}, {block_1}, ...] + info = [ + # stage 0 + [ + block0_info, + block1_info, + ... + ], ... + ] + convert to: + arch = [ + { + block_0, + name: block0_info, + }, + { + block_1, + name: block1_info, + }, ... + ] + """ + assert isinstance(arch, list) and all(isinstance(x, dict) for x in arch) + assert isinstance(info, list) and all(isinstance(x, list) for x in info) + idx = 0 + for stage_idx, stage in enumerate(info): + for block_idx, block in enumerate(stage): + assert ( + arch[idx]["stage_idx"] == stage_idx + and arch[idx]["block_idx"] == block_idx + ), "Index ({}, {}) does not match for block {}".format( + stage_idx, block_idx, arch[idx] + ) + assert name not in arch[idx] + arch[idx][name] = block + idx += 1 + + +def unify_arch_def(arch_def): + """ unify the arch_def to: + { + ..., + "arch": [ + { + "stage_idx": idx, + "block_idx": idx, + ... + }, + {}, ... + ] + } + """ + ret = copy.deepcopy(arch_def) + + assert "block_cfg" in arch_def and "stages" in arch_def["block_cfg"] + assert "stages" not in ret + # copy 'first', 'last' etc. 
inside arch_def['block_cfg'] to ret + ret.update({x: arch_def["block_cfg"][x] for x in arch_def["block_cfg"]}) + ret["stages"] = _block_cfgs_to_list(arch_def["block_cfg"]["stages"]) + del ret["block_cfg"] + + assert "block_op_type" in arch_def + _add_to_arch(ret["stages"], arch_def["block_op_type"], "block_op_type") + del ret["block_op_type"] + + return ret + + +def get_num_stages(arch_def): + ret = 0 + for x in arch_def["stages"]: + ret = max(x["stage_idx"], ret) + ret = ret + 1 + return ret + + +def get_blocks(arch_def, stage_indices=None, block_indices=None): + ret = copy.deepcopy(arch_def) + ret["stages"] = [] + for block in arch_def["stages"]: + keep = True + if stage_indices not in (None, []) and block["stage_idx"] not in stage_indices: + keep = False + if block_indices not in (None, []) and block["block_idx"] not in block_indices: + keep = False + if keep: + ret["stages"].append(block) + return ret + + +class FBNetBuilder(object): + def __init__( + self, + width_ratio, + bn_type="bn", + width_divisor=1, + dw_skip_bn=False, + dw_skip_relu=False, + ): + self.width_ratio = width_ratio + self.last_depth = -1 + self.bn_type = bn_type + self.width_divisor = width_divisor + self.dw_skip_bn = dw_skip_bn + self.dw_skip_relu = dw_skip_relu + + def add_first(self, stage_info, dim_in=3, pad=True): + # stage_info: [c, s, kernel] + assert len(stage_info) >= 2 + channel = stage_info[0] + stride = stage_info[1] + out_depth = self._get_divisible_width(int(channel * self.width_ratio)) + kernel = 3 + if len(stage_info) > 2: + kernel = stage_info[2] + + out = ConvBNRelu( + dim_in, + out_depth, + kernel=kernel, + stride=stride, + pad=kernel // 2 if pad else 0, + no_bias=1, + use_relu="relu", + bn_type=self.bn_type, + ) + self.last_depth = out_depth + return out + + def add_blocks(self, blocks): + """ blocks: [{}, {}, ...] 
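+            Each dict is expected to provide 'stage_idx', 'block_idx',
+            'block_op_type' and a 'block' entry of the form [t, c, n, s]
+            with n == 1, as produced by unify_arch_def.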
+ """ + assert isinstance(blocks, list) and all( + isinstance(x, dict) for x in blocks + ), blocks + + modules = OrderedDict() + for block in blocks: + stage_idx = block["stage_idx"] + block_idx = block["block_idx"] + block_op_type = block["block_op_type"] + tcns = block["block"] + n = tcns[2] + assert n == 1 + nnblock = self.add_ir_block(tcns, [block_op_type]) + nn_name = "xif{}_{}".format(stage_idx, block_idx) + assert nn_name not in modules + modules[nn_name] = nnblock + ret = nn.Sequential(modules) + return ret + + def add_last(self, stage_info): + """ skip last layer if channel_scale == 0 + use the same output channel if channel_scale < 0 + """ + assert len(stage_info) == 2 + channels = stage_info[0] + channel_scale = stage_info[1] + + if channel_scale == 0.0: + return nn.Sequential() + + if channel_scale > 0: + last_channel = ( + int(channels * self.width_ratio) if self.width_ratio > 1.0 else channels + ) + last_channel = int(last_channel * channel_scale) + else: + last_channel = int(self.last_depth * (-channel_scale)) + last_channel = self._get_divisible_width(last_channel) + + if last_channel == 0: + return nn.Sequential() + + dim_in = self.last_depth + ret = ConvBNRelu( + dim_in, + last_channel, + kernel=1, + stride=1, + pad=0, + no_bias=1, + use_relu="relu", + bn_type=self.bn_type, + ) + self.last_depth = last_channel + return ret + + # def add_final_pool(self, model, blob_in, kernel_size): + # ret = model.AveragePool(blob_in, "final_avg", kernel=kernel_size, stride=1) + # return ret + + def _add_ir_block( + self, dim_in, dim_out, stride, expand_ratio, block_op_type, **kwargs + ): + ret = PRIMITIVES[block_op_type]( + dim_in, + dim_out, + expansion=expand_ratio, + stride=stride, + bn_type=self.bn_type, + width_divisor=self.width_divisor, + dw_skip_bn=self.dw_skip_bn, + dw_skip_relu=self.dw_skip_relu, + **kwargs + ) + return ret, ret.output_depth + + def add_ir_block(self, tcns, block_op_types, **kwargs): + t, c, n, s = tcns + assert n == 1 + out_depth = self._get_divisible_width(int(c * self.width_ratio)) + dim_in = self.last_depth + op, ret_depth = self._add_ir_block( + dim_in, + out_depth, + stride=s, + expand_ratio=t, + block_op_type=block_op_types[0], + **kwargs + ) + self.last_depth = ret_depth + return op + + def _get_divisible_width(self, width): + ret = _get_divisible_by(int(width), self.width_divisor, self.width_divisor) + return ret diff --git a/maskrcnn_benchmark/modeling/backbone/fbnet_modeldef.py b/maskrcnn_benchmark/modeling/backbone/fbnet_modeldef.py new file mode 100644 index 0000000000000000000000000000000000000000..fb1c96b3a4dbe735682ae81361ee0efed75cbb25 --- /dev/null +++ b/maskrcnn_benchmark/modeling/backbone/fbnet_modeldef.py @@ -0,0 +1,218 @@ +from __future__ import absolute_import, division, print_function, unicode_literals + + +def add_archs(archs): + global MODEL_ARCH + for x in archs: + assert x not in MODEL_ARCH, "Duplicated model name {} existed".format(x) + MODEL_ARCH[x] = archs[x] + + +MODEL_ARCH = { + "default": { + "block_op_type": [ + # stage 0 + ["ir_k3"], + # stage 1 + ["ir_k3"] * 2, + # stage 2 + ["ir_k3"] * 3, + # stage 3 + ["ir_k3"] * 7, + # stage 4, bbox head + ["ir_k3"] * 4, + # stage 5, rpn + ["ir_k3"] * 3, + # stage 5, mask head + ["ir_k3"] * 5, + ], + "block_cfg": { + "first": [32, 2], + "stages": [ + # [t, c, n, s] + # stage 0 + [[1, 16, 1, 1]], + # stage 1 + [[6, 24, 2, 2]], + # stage 2 + [[6, 32, 3, 2]], + # stage 3 + [[6, 64, 4, 2], [6, 96, 3, 1]], + # stage 4, bbox head + [[4, 160, 1, 2], [6, 160, 2, 1], [6, 240, 1, 1]], + # [[6, 
160, 3, 2], [6, 320, 1, 1]], + # stage 5, rpn head + [[6, 96, 3, 1]], + # stage 6, mask head + [[4, 160, 1, 1], [6, 160, 3, 1], [3, 80, 1, -2]], + ], + # [c, channel_scale] + "last": [0, 0.0], + "backbone": [0, 1, 2, 3], + "rpn": [5], + "bbox": [4], + "mask": [6], + }, + }, + "xirb16d_dsmask": { + "block_op_type": [ + # stage 0 + ["ir_k3"], + # stage 1 + ["ir_k3"] * 2, + # stage 2 + ["ir_k3"] * 3, + # stage 3 + ["ir_k3"] * 7, + # stage 4, bbox head + ["ir_k3"] * 4, + # stage 5, mask head + ["ir_k3"] * 5, + # stage 6, rpn + ["ir_k3"] * 3, + ], + "block_cfg": { + "first": [16, 2], + "stages": [ + # [t, c, n, s] + # stage 0 + [[1, 16, 1, 1]], + # stage 1 + [[6, 32, 2, 2]], + # stage 2 + [[6, 48, 3, 2]], + # stage 3 + [[6, 96, 4, 2], [6, 128, 3, 1]], + # stage 4, bbox head + [[4, 128, 1, 2], [6, 128, 2, 1], [6, 160, 1, 1]], + # stage 5, mask head + [[4, 128, 1, 2], [6, 128, 2, 1], [6, 128, 1, -2], [3, 64, 1, -2]], + # stage 6, rpn head + [[6, 128, 3, 1]], + ], + # [c, channel_scale] + "last": [0, 0.0], + "backbone": [0, 1, 2, 3], + "rpn": [6], + "bbox": [4], + "mask": [5], + }, + }, + "mobilenet_v2": { + "block_op_type": [ + # stage 0 + ["ir_k3"], + # stage 1 + ["ir_k3"] * 2, + # stage 2 + ["ir_k3"] * 3, + # stage 3 + ["ir_k3"] * 7, + # stage 4 + ["ir_k3"] * 4, + ], + "block_cfg": { + "first": [32, 2], + "stages": [ + # [t, c, n, s] + # stage 0 + [[1, 16, 1, 1]], + # stage 1 + [[6, 24, 2, 2]], + # stage 2 + [[6, 32, 3, 2]], + # stage 3 + [[6, 64, 4, 2], [6, 96, 3, 1]], + # stage 4 + [[6, 160, 3, 1], [6, 320, 1, 1]], + ], + # [c, channel_scale] + "last": [0, 0.0], + "backbone": [0, 1, 2, 3], + "bbox": [4], + }, + }, +} + + +MODEL_ARCH_CHAM = { + "cham_v1a": { + "block_op_type": [ + # stage 0 + ["ir_k3"], + # stage 1 + ["ir_k7"] * 2, + # stage 2 + ["ir_k3"] * 5, + # stage 3 + ["ir_k5"] * 7 + ["ir_k3"] * 5, + # stage 4, bbox head + ["ir_k3"] * 5, + # stage 5, rpn + ["ir_k3"] * 3, + ], + "block_cfg": { + "first": [32, 2], + "stages": [ + # [t, c, n, s] + # stage 0 + [[1, 24, 1, 1]], + # stage 1 + [[4, 48, 2, 2]], + # stage 2 + [[7, 64, 5, 2]], + # stage 3 + [[12, 56, 7, 2], [8, 88, 5, 1]], + # stage 4, bbox head + [[7, 152, 4, 2], [10, 104, 1, 1]], + # stage 5, rpn head + [[8, 88, 3, 1]], + ], + # [c, channel_scale] + "last": [0, 0.0], + "backbone": [0, 1, 2, 3], + "rpn": [5], + "bbox": [4], + }, + }, + "cham_v2": { + "block_op_type": [ + # stage 0 + ["ir_k3"], + # stage 1 + ["ir_k5"] * 4, + # stage 2 + ["ir_k7"] * 6, + # stage 3 + ["ir_k5"] * 3 + ["ir_k3"] * 6, + # stage 4, bbox head + ["ir_k3"] * 7, + # stage 5, rpn + ["ir_k3"] * 1, + ], + "block_cfg": { + "first": [32, 2], + "stages": [ + # [t, c, n, s] + # stage 0 + [[1, 24, 1, 1]], + # stage 1 + [[8, 32, 4, 2]], + # stage 2 + [[5, 48, 6, 2]], + # stage 3 + [[9, 56, 3, 2], [6, 56, 6, 1]], + # stage 4, bbox head + [[2, 160, 6, 2], [6, 112, 1, 1]], + # stage 5, rpn head + [[6, 56, 1, 1]], + ], + # [c, channel_scale] + "last": [0, 0.0], + "backbone": [0, 1, 2, 3], + "rpn": [5], + "bbox": [4], + }, + }, +} +add_archs(MODEL_ARCH_CHAM) diff --git a/maskrcnn_benchmark/modeling/backbone/fpn.py b/maskrcnn_benchmark/modeling/backbone/fpn.py new file mode 100644 index 0000000000000000000000000000000000000000..abd171776d8e10f4ac657303f5d1bfad624569dd --- /dev/null +++ b/maskrcnn_benchmark/modeling/backbone/fpn.py @@ -0,0 +1,98 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 
+import torch +import torch.nn.functional as F +from torch import nn + + +class FPN(nn.Module): + """ + Module that adds FPN on top of a list of feature maps. + The feature maps are currently supposed to be in increasing depth order, and must be consecutive + """ + + def __init__( + self, in_channels_list, out_channels, conv_block, top_blocks=None + ): + """ + Arguments: + in_channels_list (list[int]): number of channels for each feature map that + will be fed + out_channels (int): number of channels of the FPN representation + top_blocks (nn.Module or None): if provided, an extra operation will + be performed on the output of the last (smallest resolution) + FPN output, and the result will extend the result list + """ + super(FPN, self).__init__() + self.inner_blocks = [] + self.layer_blocks = [] + for idx, in_channels in enumerate(in_channels_list, 1): + inner_block = "fpn_inner{}".format(idx) + layer_block = "fpn_layer{}".format(idx) + + if in_channels == 0: + continue + inner_block_module = conv_block(in_channels, out_channels, 1) + layer_block_module = conv_block(out_channels, out_channels, 3, 1) + self.add_module(inner_block, inner_block_module) + self.add_module(layer_block, layer_block_module) + self.inner_blocks.append(inner_block) + self.layer_blocks.append(layer_block) + self.top_blocks = top_blocks + + def forward(self, x): + """ + Arguments: + x (list[Tensor]): feature maps for each feature level. + Returns: + results (tuple[Tensor]): feature maps after FPN layers. + They are ordered from highest resolution first. + """ + last_inner = getattr(self, self.inner_blocks[-1])(x[-1]) + results = [] + results.append(getattr(self, self.layer_blocks[-1])(last_inner)) + for feature, inner_block, layer_block in zip( + x[:-1][::-1], self.inner_blocks[:-1][::-1], self.layer_blocks[:-1][::-1] + ): + if not inner_block: + continue + inner_top_down = F.interpolate(last_inner, scale_factor=2, mode="nearest") + inner_lateral = getattr(self, inner_block)(feature) + # TODO use size instead of scale to make it robust to different sizes + # inner_top_down = F.upsample(last_inner, size=inner_lateral.shape[-2:], + # mode='bilinear', align_corners=False) + last_inner = inner_lateral + inner_top_down + results.insert(0, getattr(self, layer_block)(last_inner)) + + if isinstance(self.top_blocks, LastLevelP6P7): + last_results = self.top_blocks(x[-1], results[-1]) + results.extend(last_results) + elif isinstance(self.top_blocks, LastLevelMaxPool): + last_results = self.top_blocks(results[-1]) + results.extend(last_results) + + return tuple(results) + + +class LastLevelMaxPool(nn.Module): + def forward(self, x): + return [F.max_pool2d(x, 1, 2, 0)] + + +class LastLevelP6P7(nn.Module): + """ + This module is used in RetinaNet to generate extra layers, P6 and P7. 
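+    P6 is produced from C5 (or from P5 when in_channels == out_channels) by a
+    stride-2 3x3 conv, and P7 from ReLU(P6) by another stride-2 3x3 conv.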
+ """ + def __init__(self, in_channels, out_channels): + super(LastLevelP6P7, self).__init__() + self.p6 = nn.Conv2d(in_channels, out_channels, 3, 2, 1) + self.p7 = nn.Conv2d(out_channels, out_channels, 3, 2, 1) + for module in [self.p6, self.p7]: + nn.init.kaiming_uniform_(module.weight, a=1) + nn.init.constant_(module.bias, 0) + self.use_P5 = in_channels == out_channels + + def forward(self, c5, p5): + x = p5 if self.use_P5 else c5 + p6 = self.p6(x) + p7 = self.p7(F.relu(p6)) + return [p6, p7] diff --git a/maskrcnn_benchmark/modeling/backbone/msr.py b/maskrcnn_benchmark/modeling/backbone/msr.py new file mode 100644 index 0000000000000000000000000000000000000000..3ed5a66cd1a7a926d6554482c47296dd617d9e2f --- /dev/null +++ b/maskrcnn_benchmark/modeling/backbone/msr.py @@ -0,0 +1,65 @@ +import torch +from torch import nn +from torch.nn import functional as F + + +class ConcatUpConv(nn.Module): + def __init__(self, inplanes, outplanes, upsample=True): + super(ConcatUpConv, self).__init__() + out_channels = outplanes + self.upsample = upsample + self.con_1x1 = nn.Conv2d(inplanes, outplanes, 1, bias=False) + nn.init.kaiming_uniform_(self.con_1x1.weight, a=1) + self.nor_1 = nn.BatchNorm2d(out_channels) + self.leakyrelu_1 = nn.ReLU() + if self.upsample: + self.con_3x3 = nn.Conv2d(outplanes, out_channels // 2, + kernel_size=3, stride=1, padding=1, bias=False) + nn.init.kaiming_uniform_(self.con_3x3.weight, a=1) + self.nor_3 = nn.BatchNorm2d(out_channels // 2) + self.leakyrelu_3 = nn.ReLU() + + def forward(self, x1, x2): + fusion = torch.cat([x1, x2], dim=1) + out_1 = self.leakyrelu_1(self.nor_1(self.con_1x1(fusion))) + out = None + if self.upsample: + out = self.leakyrelu_3(self.nor_3(self.con_3x3(out_1))) + out = F.interpolate(out, scale_factor=2, mode='bilinear', align_corners=False) + return out, out_1 + + +class MSR(nn.Module): + def __init__(self, body, channels, fpn=None, pan=None): + super(MSR, self).__init__() + self.body = body + cucs = nn.ModuleList() + channel = channels[0] + cucs.append(ConcatUpConv(channel * 2, channel, upsample=False)) + for i, channel in enumerate(channels[1:]): + cucs.append(ConcatUpConv(channel * 2, channel)) + self.cucs = cucs + if fpn is not None: + self.fpn = fpn + if pan is not None: + self.pan = pan + + def forward(self, x): + outputs = self.body(x) + + re_x = F.interpolate(x, scale_factor=0.5, + mode='bilinear', align_corners=False) + output_re = self.body(re_x)[-1] + low = F.interpolate(output_re, + size=outputs[-1].shape[2:], + mode='bilinear', align_corners=False) + new_outputs = [] + for cuc, high in zip(self.cucs[::-1], outputs[::-1]): + low, out = cuc(high, low) + new_outputs.append(out) + outs = new_outputs[::-1] + if hasattr(self, 'pan'): + outs = self.pan(outs) + if hasattr(self, 'fpn'): + outs = self.fpn(outs) + return outs diff --git a/maskrcnn_benchmark/modeling/backbone/pan.py b/maskrcnn_benchmark/modeling/backbone/pan.py new file mode 100644 index 0000000000000000000000000000000000000000..e9703e271b3987ff380e5222232592678cafef61 --- /dev/null +++ b/maskrcnn_benchmark/modeling/backbone/pan.py @@ -0,0 +1,177 @@ +import torch.nn as nn +import torch.nn.functional as F + + +class FPA(nn.Module): + def __init__(self, channels=2048): + """ + Feature Pyramid Attention + :type channels: int + """ + super(FPA, self).__init__() + channels_mid = int(channels / 4) + + self.channels_cond = channels + + # Master branch + self.conv_master = nn.Conv2d(self.channels_cond, channels, kernel_size=1, bias=False) + self.bn_master = nn.BatchNorm2d(channels) + + # 
Global pooling branch + self.conv_gpb = nn.Conv2d(self.channels_cond, channels, kernel_size=1, bias=False) + #self.bn_gpb = nn.BatchNorm2d(channels) + + # C333 because of the shape of last feature maps is (16, 16). + self.conv7x7_1 = nn.Conv2d(self.channels_cond, channels_mid, kernel_size=(7, 7), stride=2, padding=3, bias=False) + self.bn1_1 = nn.BatchNorm2d(channels_mid) + self.conv5x5_1 = nn.Conv2d(channels_mid, channels_mid, kernel_size=(5, 5), stride=2, padding=2, bias=False) + self.bn2_1 = nn.BatchNorm2d(channels_mid) + self.conv3x3_1 = nn.Conv2d(channels_mid, channels_mid, kernel_size=(3, 3), stride=2, padding=1, bias=False) + self.bn3_1 = nn.BatchNorm2d(channels_mid) + + self.conv7x7_2 = nn.Conv2d(channels_mid, channels_mid, kernel_size=(7, 7), stride=1, padding=3, bias=False) + self.bn1_2 = nn.BatchNorm2d(channels_mid) + self.conv5x5_2 = nn.Conv2d(channels_mid, channels_mid, kernel_size=(5, 5), stride=1, padding=2, bias=False) + self.bn2_2 = nn.BatchNorm2d(channels_mid) + self.conv3x3_2 = nn.Conv2d(channels_mid, channels_mid, kernel_size=(3, 3), stride=1, padding=1, bias=False) + self.bn3_2 = nn.BatchNorm2d(channels_mid) + + self.bn_upsample_1 = nn.BatchNorm2d(channels) + self.conv1x1_up1 = nn.Conv2d(channels_mid, channels, kernel_size=(1, 1), stride=1, padding=0, bias=False) + + self.relu = nn.ReLU(inplace=True) + + def forward(self, x): + """ + :param x: Shape: [b, 2048, h, w] + :return: out: Feature maps. Shape: [b, 2048, h, w] + """ + # Master branch + x_master = self.conv_master(x) + x_master = self.bn_master(x_master) + + # Global pooling branch + x_gpb = nn.AvgPool2d(x.shape[2:])(x).view(x.shape[0], self.channels_cond, 1, 1) + x_gpb = self.conv_gpb(x_gpb) + #x_gpb = self.bn_gpb(x_gpb) + + # Branch 1 + x1_1 = self.conv7x7_1(x) + x1_1 = self.bn1_1(x1_1) + x1_1 = self.relu(x1_1) + x1_2 = self.conv7x7_2(x1_1) + x1_2 = self.bn1_2(x1_2) + + # Branch 2 + x2_1 = self.conv5x5_1(x1_1) + x2_1 = self.bn2_1(x2_1) + x2_1 = self.relu(x2_1) + x2_2 = self.conv5x5_2(x2_1) + x2_2 = self.bn2_2(x2_2) + + # Branch 3 + x3_1 = self.conv3x3_1(x2_1) + x3_1 = self.bn3_1(x3_1) + x3_1 = self.relu(x3_1) + x3_2 = self.conv3x3_2(x3_1) + x3_2 = self.bn3_2(x3_2) + + # Merge branch 1 and 2 + x3_upsample = F.upsample(x3_2, size=x2_2.shape[-2:], + mode='bilinear', align_corners=False) + + x2_merge = self.relu(x2_2 + x3_upsample) + + x2_upsample = F.upsample(x2_merge, size=x1_2.shape[-2:], + mode='bilinear', align_corners=False) + x1_merge = self.relu(x1_2 + x2_upsample) + + x1_merge_upsample = F.upsample(x1_merge, size=x_master.shape[-2:], + mode='bilinear', align_corners=False) + x1_merge_upsample_ch = self.relu(self.bn_upsample_1(self.conv1x1_up1(x1_merge_upsample))) + x_master = x_master * x1_merge_upsample_ch + # + out = self.relu(x_master + x_gpb) + + return out + + +class GAU(nn.Module): + def __init__(self, channels_high, channels_low, upsample=True): + super(GAU, self).__init__() + # Global Attention Upsample + self.upsample = upsample + self.conv3x3 = nn.Conv2d(channels_low, channels_low, kernel_size=3, padding=1, bias=False) + self.bn_low = nn.BatchNorm2d(channels_low) + + self.conv1x1 = nn.Conv2d(channels_high, channels_low, kernel_size=1, padding=0, bias=False) + #self.bn_high = nn.BatchNorm2d(channels_low) + + if upsample: + self.conv_upsample = nn.ConvTranspose2d(channels_high, channels_low, kernel_size=4, stride=2, padding=1, bias=False) + self.bn_upsample = nn.BatchNorm2d(channels_low) + else: + self.conv_reduction = nn.Conv2d(channels_high, channels_low, kernel_size=1, padding=0, bias=False) + 
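+            # no upsampling: a 1x1 conv (followed by BN) only reduces the high-level
+            # features from channels_high to channels_low before the fusion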
self.bn_reduction = nn.BatchNorm2d(channels_low) + self.relu = nn.ReLU(inplace=True) + + def forward(self, fms_high, fms_low, fm_mask=None): + """ + Use the high level features with abundant catagory information to weight the low level features with pixel + localization information. In the meantime, we further use mask feature maps with catagory-specific information + to localize the mask position. + :param fms_high: Features of high level. Tensor. + :param fms_low: Features of low level. Tensor. + :param fm_mask: + :return: fms_att_upsample + """ + b, c, h, w = fms_high.shape + + fms_high_gp = nn.AvgPool2d(fms_high.shape[2:])(fms_high).view(len(fms_high), c, 1, 1) + fms_high_gp = self.conv1x1(fms_high_gp) + # fms_high_gp = self.bn_high(fms_high_gp)# arlog, when the spatial size HxW = 1x1, the BN cannot be used. + fms_high_gp = self.relu(fms_high_gp) + + # fms_low_mask = torch.cat([fms_low, fm_mask], dim=1) + fms_low_mask = self.conv3x3(fms_low) + fms_low_mask = self.bn_low(fms_low_mask) + + fms_att = fms_low_mask * fms_high_gp + if self.upsample: + out = self.relu( + self.bn_upsample(self.conv_upsample(fms_high)) + fms_att) + else: + out = self.relu( + self.bn_reduction(self.conv_reduction(fms_high)) + fms_att) + return out + + +class PAN(nn.Module): + def __init__(self): + """ + :param blocks: Blocks of the network with reverse sequential. + """ + super(PAN, self).__init__() + channels_blocks = [2048, 1024, 512, 256] + + self.fpa = FPA(channels=channels_blocks[0]) + + self.gau_block1 = GAU(channels_blocks[0], channels_blocks[1]) + self.gau_block2 = GAU(channels_blocks[1], channels_blocks[2]) + self.gau_block3 = GAU(channels_blocks[2], channels_blocks[3]) + self.gau = [self.gau_block1, self.gau_block2, self.gau_block3] + + def forward(self, fms): + """ + :param fms: Feature maps of forward propagation in the network with reverse sequential. shape:[b, c, h, w] + :return: fm_high. [b, 256, h, w] + """ + feats = [] + for i, fm_low in enumerate(fms[::-1]): + if i == 0: + fm_high = self.fpa(fm_low) + else: + fm_high = self.gau[int(i-1)](fm_high, fm_low) + feats.append(fm_high) + feats.reverse() + return tuple(feats) diff --git a/maskrcnn_benchmark/modeling/backbone/resnet.py b/maskrcnn_benchmark/modeling/backbone/resnet.py new file mode 100644 index 0000000000000000000000000000000000000000..39adf1520463abcf5778a674c7e4d5fb3dc0163d --- /dev/null +++ b/maskrcnn_benchmark/modeling/backbone/resnet.py @@ -0,0 +1,498 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +""" +Variant of the resnet module that takes cfg as an argument. +Example usage. Strings may be specified in the config file. + model = ResNet( + "StemWithFixedBatchNorm", + "BottleneckWithFixedBatchNorm", + "ResNet50StagesTo4", + ) +OR: + model = ResNet( + "StemWithGN", + "BottleneckWithGN", + "ResNet50StagesTo4", + ) +Custom implementations may be written in user code and hooked in via the +`register_*` functions. +""" +from collections import namedtuple + +import torch +import torch.nn.functional as F +from torch import nn + +from maskrcnn_benchmark.layers import FrozenBatchNorm2d +from maskrcnn_benchmark.layers import Conv2d +from maskrcnn_benchmark.modeling.make_layers import group_norm +from maskrcnn_benchmark.layers import DCN +from maskrcnn_benchmark.utils.registry import Registry + + +# ResNet stage specification +StageSpec = namedtuple( + "StageSpec", + [ + "index", # Index of the stage, eg 1, 2, ..,. 
5 + "block_count", # Number of residual blocks in the stage + "return_features", # True => return the last feature map from this stage + ], +) + +# ----------------------------------------------------------------------------- +# Standard ResNet models +# ----------------------------------------------------------------------------- +# ResNet-50 (including all stages) +ResNet50StagesTo5 = tuple( + StageSpec(index=i, block_count=c, return_features=r) + for (i, c, r) in ((1, 3, False), (2, 4, False), (3, 6, False), (4, 3, True)) +) +# ResNet-50 up to stage 4 (excludes stage 5) +ResNet50StagesTo4 = tuple( + StageSpec(index=i, block_count=c, return_features=r) + for (i, c, r) in ((1, 3, False), (2, 4, False), (3, 6, True)) +) +# ResNet-101 (including all stages) +ResNet101StagesTo5 = tuple( + StageSpec(index=i, block_count=c, return_features=r) + for (i, c, r) in ((1, 3, False), (2, 4, False), (3, 23, False), (4, 3, True)) +) +# ResNet-101 up to stage 4 (excludes stage 5) +ResNet101StagesTo4 = tuple( + StageSpec(index=i, block_count=c, return_features=r) + for (i, c, r) in ((1, 3, False), (2, 4, False), (3, 23, True)) +) +# ResNet-50-FPN (including all stages) +ResNet50FPNStagesTo5 = tuple( + StageSpec(index=i, block_count=c, return_features=r) + for (i, c, r) in ((1, 3, True), (2, 4, True), (3, 6, True), (4, 3, True)) +) +# ResNet-101-FPN (including all stages) +ResNet101FPNStagesTo5 = tuple( + StageSpec(index=i, block_count=c, return_features=r) + for (i, c, r) in ((1, 3, True), (2, 4, True), (3, 23, True), (4, 3, True)) +) +# ResNet-152-FPN (including all stages) +ResNet152FPNStagesTo5 = tuple( + StageSpec(index=i, block_count=c, return_features=r) + for (i, c, r) in ((1, 3, True), (2, 8, True), (3, 36, True), (4, 3, True)) +) + +class ResNet(nn.Module): + def __init__(self, cfg): + super(ResNet, self).__init__() + + # If we want to use the cfg in forward(), then we should make a copy + # of it and store it for later use: + # self.cfg = cfg.clone() + + # Translate string names to implementations + stem_module = _STEM_MODULES[cfg.MODEL.RESNETS.STEM_FUNC] + stage_specs = _STAGE_SPECS[cfg.MODEL.BACKBONE.CONV_BODY] + transformation_module = _TRANSFORMATION_MODULES[cfg.MODEL.RESNETS.TRANS_FUNC] + deformable_module = _TRANSFORMATION_MODULES[cfg.MODEL.RESNETS.DEF_FUNC] + start_module = cfg.MODEL.RESNETS.DEF_START_MODULE + _DEF_IDX = {"C3": 1, "C4": 2, "C5": 3} + if start_module in _DEF_IDX: + start_idx = _DEF_IDX[start_module] + else: + start_idx = 65535 + + # Construct the stem module + self.stem = stem_module(cfg) + + # Constuct the specified ResNet stages + num_groups = cfg.MODEL.RESNETS.NUM_GROUPS + width_per_group = cfg.MODEL.RESNETS.WIDTH_PER_GROUP + in_channels = cfg.MODEL.RESNETS.STEM_OUT_CHANNELS + stage2_bottleneck_channels = num_groups * width_per_group + stage2_out_channels = cfg.MODEL.RESNETS.RES2_OUT_CHANNELS + self.stages = [] + self.return_features = {} + for i, stage_spec in enumerate(stage_specs): + name = "layer" + str(stage_spec.index) + stage2_relative_factor = 2 ** (stage_spec.index - 1) + bottleneck_channels = stage2_bottleneck_channels * stage2_relative_factor + out_channels = stage2_out_channels * stage2_relative_factor + if i >= start_idx: + trans_mod = deformable_module + else: + trans_mod = transformation_module + module = _make_stage( + trans_mod, + in_channels, + bottleneck_channels, + out_channels, + stage_spec.block_count, + num_groups, + cfg.MODEL.RESNETS.STRIDE_IN_1X1, + first_stride=int(stage_spec.index > 1) + 1, + ) + in_channels = out_channels + 
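+            # e.g. with the usual R-50 defaults (WIDTH_PER_GROUP=64, RES2_OUT_CHANNELS=256),
+            # stage index 3 gives a relative factor of 2**(3-1)=4, i.e. bottleneck_channels=256 and out_channels=1024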
self.add_module(name, module) + self.stages.append(name) + self.return_features[name] = stage_spec.return_features + + # Optionally freeze (requires_grad=False) parts of the backbone + self._freeze_backbone(cfg.MODEL.BACKBONE.FREEZE_CONV_BODY_AT) + + def _freeze_backbone(self, freeze_at): + if freeze_at < 0: + return + for stage_index in range(freeze_at): + if stage_index == 0: + m = self.stem # stage 0 is the stem + else: + m = getattr(self, "layer" + str(stage_index)) + for p in m.parameters(): + p.requires_grad = False + + def forward(self, x): + outputs = [] + x = self.stem(x) + for stage_name in self.stages: + x = getattr(self, stage_name)(x) + if self.return_features[stage_name]: + outputs.append(x) + return outputs + + +class ResNetHead(nn.Module): + def __init__( + self, + block_module, + stages, + num_groups=1, + width_per_group=64, + stride_in_1x1=True, + stride_init=None, + res2_out_channels=256, + dilation=1 + ): + super(ResNetHead, self).__init__() + + stage2_relative_factor = 2 ** (stages[0].index - 1) + # print('stage2_relative_factor---',stage2_relative_factor) + + stage2_bottleneck_channels = num_groups * width_per_group + # print('stage2_bottleneck_channels---',stage2_bottleneck_channels) + + out_channels = res2_out_channels * stage2_relative_factor + # print('out_channels---',out_channels) + + in_channels = out_channels // 2 + # print('in_channels---',in_channels) + # + bottleneck_channels = stage2_bottleneck_channels * stage2_relative_factor + # print('bottleneck_channels---',bottleneck_channels) + + block_module = _TRANSFORMATION_MODULES[block_module] + # print('block_module---',block_module) + + + self.stages = [] + stride = stride_init + for stage in stages: + name = "layer" + str(stage.index) + if not stride: + stride = int(stage.index > 1) + 1 + # print('stride---', stride) + print('stage.block_count---', stage.block_count) + module = _make_stage( + block_module, + in_channels, + bottleneck_channels, + out_channels, + stage.block_count, + num_groups, + stride_in_1x1, + first_stride=stride, + dilation=dilation + ) + stride = None + self.add_module(name, module) + self.stages.append(name) + self.out_channels = out_channels + + def forward(self, x): + for stage in self.stages: + x = getattr(self, stage)(x) + print('x-----------',x.shape) + return x + + +def _make_stage( + transformation_module, + in_channels, + bottleneck_channels, + out_channels, + block_count, + num_groups, + stride_in_1x1, + first_stride, + dilation=1 +): + blocks = [] + stride = first_stride + for _ in range(block_count): + blocks.append( + transformation_module( + in_channels, + bottleneck_channels, + out_channels, + num_groups, + stride_in_1x1, + stride, + dilation=dilation + ) + ) + stride = 1 + in_channels = out_channels + return nn.Sequential(*blocks) + + +class Bottleneck(nn.Module): + def __init__( + self, + in_channels, + bottleneck_channels, + out_channels, + num_groups, + stride_in_1x1, + stride, + dilation, + norm_func, + conv_func=Conv2d + ): + super(Bottleneck, self).__init__() + + self.downsample = None + if in_channels != out_channels: + down_stride = stride if dilation == 1 else 1 + self.downsample = nn.Sequential( + conv_func( + in_channels, out_channels, + kernel_size=1, stride=down_stride, bias=False + ), + norm_func(out_channels), + ) + for modules in [self.downsample,]: + for l in modules.modules(): + if isinstance(l, Conv2d): + nn.init.kaiming_uniform_(l.weight, a=1) + + if dilation > 1: + stride = 1 # reset to be 1 + + # The original MSRA ResNet models have stride in the 
first 1x1 conv + # The subsequent fb.torch.resnet and Caffe2 ResNe[X]t implementations have + # stride in the 3x3 conv + stride_1x1, stride_3x3 = (stride, 1) if stride_in_1x1 else (1, stride) + + self.conv1 = conv_func( + in_channels, + bottleneck_channels, + kernel_size=1, + stride=stride_1x1, + bias=False, + ) + self.bn1 = norm_func(bottleneck_channels) + # TODO: specify init for the above + + self.conv2 = conv_func( + bottleneck_channels, + bottleneck_channels, + kernel_size=3, + stride=stride_3x3, + padding=dilation, + bias=False, + groups=num_groups, + dilation=dilation + ) + self.bn2 = norm_func(bottleneck_channels) + + self.conv3 = Conv2d( + bottleneck_channels, out_channels, kernel_size=1, bias=False + ) + self.bn3 = norm_func(out_channels) + + for l in [self.conv1, self.conv2, self.conv3,]: + nn.init.kaiming_uniform_(l.weight, a=1) + + def forward(self, x): + identity = x + + out = self.conv1(x) + out = self.bn1(out) + out = F.relu_(out) + + out = self.conv2(out) + out = self.bn2(out) + out = F.relu_(out) + + out0 = self.conv3(out) + out = self.bn3(out0) + + if self.downsample is not None: + identity = self.downsample(x) + + out += identity + out = F.relu_(out) + + return out + + +class BaseStem(nn.Module): + def __init__(self, cfg, norm_func): + super(BaseStem, self).__init__() + + out_channels = cfg.MODEL.RESNETS.STEM_OUT_CHANNELS + + self.conv1 = Conv2d( + 3, out_channels, kernel_size=7, stride=2, padding=3, bias=False + ) + self.bn1 = norm_func(out_channels) + + for l in [self.conv1,]: + nn.init.kaiming_uniform_(l.weight, a=1) + + def forward(self, x): + x = self.conv1(x) + x = self.bn1(x) + x = F.relu_(x) + x = F.max_pool2d(x, kernel_size=3, stride=2, padding=1) + return x + +############################################# + +class BottleneckWithFixedBatchNorm(Bottleneck): + def __init__( + self, + in_channels, + bottleneck_channels, + out_channels, + num_groups=1, + stride_in_1x1=True, + stride=1, + dilation=1 + ): + super(BottleneckWithFixedBatchNorm, self).__init__( + in_channels=in_channels, + bottleneck_channels=bottleneck_channels, + out_channels=out_channels, + num_groups=num_groups, + stride_in_1x1=stride_in_1x1, + stride=stride, + dilation=dilation, + norm_func=FrozenBatchNorm2d + ) + + +class DeformableConvWithFixedBatchNorm(Bottleneck): + def __init__( + self, + in_channels, + bottleneck_channels, + out_channels, + num_groups=1, + stride_in_1x1=True, + stride=1, + dilation=1 + ): + super(DeformableConvWithFixedBatchNorm, self).__init__( + in_channels=in_channels, + bottleneck_channels=bottleneck_channels, + out_channels=out_channels, + num_groups=num_groups, + stride_in_1x1=stride_in_1x1, + stride=stride, + dilation=dilation, + norm_func=FrozenBatchNorm2d, + conv_func=DCN + ) + + +class StemWithFixedBatchNorm(BaseStem): + def __init__(self, cfg): + super(StemWithFixedBatchNorm, self).__init__( + cfg, norm_func=FrozenBatchNorm2d + ) + + +class BottleneckWithGN(Bottleneck): + def __init__( + self, + in_channels, + bottleneck_channels, + out_channels, + num_groups=1, + stride_in_1x1=True, + stride=1, + dilation=1 + ): + super(BottleneckWithGN, self).__init__( + in_channels=in_channels, + bottleneck_channels=bottleneck_channels, + out_channels=out_channels, + num_groups=num_groups, + stride_in_1x1=stride_in_1x1, + stride=stride, + dilation=dilation, + norm_func=group_norm + ) + + +class DeformableConvWithGN(Bottleneck): + def __init__( + self, + in_channels, + bottleneck_channels, + out_channels, + num_groups=1, + stride_in_1x1=True, + stride=1, + dilation=1 + ): + 
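+        # same bottleneck as BottleneckWithGN, but conv_func=DCN makes the block's convolutions deformable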
super(DeformableConvWithGN, self).__init__( + in_channels=in_channels, + bottleneck_channels=bottleneck_channels, + out_channels=out_channels, + num_groups=num_groups, + stride_in_1x1=stride_in_1x1, + stride=stride, + dilation=dilation, + norm_func=group_norm, + conv_func=DCN + ) + + +class StemWithGN(BaseStem): + def __init__(self, cfg): + super(StemWithGN, self).__init__(cfg, norm_func=group_norm) + + +_TRANSFORMATION_MODULES = Registry({ + "BottleneckWithFixedBatchNorm": BottleneckWithFixedBatchNorm, + "BottleneckWithGN": BottleneckWithGN, + "DeformableConvWithFixedBatchNorm": DeformableConvWithFixedBatchNorm, + "DeformableConvWithGN": DeformableConvWithGN, +}) + +_STEM_MODULES = Registry({ + "StemWithFixedBatchNorm": StemWithFixedBatchNorm, + "StemWithGN": StemWithGN, +}) + +_STAGE_SPECS = Registry({ + "R-50-C4": ResNet50StagesTo4, + "R-50-C5": ResNet50StagesTo5, + "R-101-C4": ResNet101StagesTo4, + "R-101-C5": ResNet101StagesTo5, + "R-50-FPN": ResNet50FPNStagesTo5, + "R-50-FPN-RETINANET": ResNet50FPNStagesTo5, + "R-101-FPN": ResNet101FPNStagesTo5, + "R-101-PAN": ResNet101FPNStagesTo5, + "R-101-FPN-RETINANET": ResNet101FPNStagesTo5, + "R-152-FPN": ResNet152FPNStagesTo5, + "R-152-PAN": ResNet152FPNStagesTo5, +}) diff --git a/maskrcnn_benchmark/modeling/balanced_positive_negative_sampler.py b/maskrcnn_benchmark/modeling/balanced_positive_negative_sampler.py new file mode 100644 index 0000000000000000000000000000000000000000..c0bd00444d3b1bdefa1a4015e8e6af72166817cf --- /dev/null +++ b/maskrcnn_benchmark/modeling/balanced_positive_negative_sampler.py @@ -0,0 +1,68 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +import torch + + +class BalancedPositiveNegativeSampler(object): + """ + This class samples batches, ensuring that they contain a fixed proportion of positives + """ + + def __init__(self, batch_size_per_image, positive_fraction): + """ + Arguments: + batch_size_per_image (int): number of elements to be selected per image + positive_fraction (float): percentace of positive elements per batch + """ + self.batch_size_per_image = batch_size_per_image + self.positive_fraction = positive_fraction + + def __call__(self, matched_idxs): + """ + Arguments: + matched idxs: list of tensors containing -1, 0 or positive values. + Each tensor corresponds to a specific image. + -1 values are ignored, 0 are considered as negatives and > 0 as + positives. + + Returns: + pos_idx (list[tensor]) + neg_idx (list[tensor]) + + Returns two lists of binary masks for each image. + The first list contains the positive elements that were selected, + and the second list the negative example. 
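+            For example, with batch_size_per_image=512 and positive_fraction=0.25,
+            at most 128 positives are kept per image and the remaining slots are
+            filled with negatives.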
+ """ + pos_idx = [] + neg_idx = [] + for matched_idxs_per_image in matched_idxs: + positive = torch.nonzero(matched_idxs_per_image >= 1).squeeze(1) + negative = torch.nonzero(matched_idxs_per_image == 0).squeeze(1) + + num_pos = int(self.batch_size_per_image * self.positive_fraction) + # protect against not enough positive examples + num_pos = min(positive.numel(), num_pos) + num_neg = self.batch_size_per_image - num_pos + # protect against not enough negative examples + num_neg = min(negative.numel(), num_neg) + + # randomly select positive and negative examples + perm1 = torch.randperm(positive.numel(), device=positive.device)[:num_pos] + perm2 = torch.randperm(negative.numel(), device=negative.device)[:num_neg] + + pos_idx_per_image = positive[perm1] + neg_idx_per_image = negative[perm2] + + # create binary mask from indices + pos_idx_per_image_mask = torch.zeros_like( + matched_idxs_per_image, dtype=torch.uint8 + ) + neg_idx_per_image_mask = torch.zeros_like( + matched_idxs_per_image, dtype=torch.uint8 + ) + pos_idx_per_image_mask[pos_idx_per_image] = 1 + neg_idx_per_image_mask[neg_idx_per_image] = 1 + + pos_idx.append(pos_idx_per_image_mask) + neg_idx.append(neg_idx_per_image_mask) + + return pos_idx, neg_idx diff --git a/maskrcnn_benchmark/modeling/box_coder.py b/maskrcnn_benchmark/modeling/box_coder.py new file mode 100644 index 0000000000000000000000000000000000000000..5579503fa55c92b82690fe55dd9715447ab8f081 --- /dev/null +++ b/maskrcnn_benchmark/modeling/box_coder.py @@ -0,0 +1,193 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +import math + +import torch +import pandas as pd +from maskrcnn_benchmark.data.datasets.evaluation.word import io_ +class BoxCoder(object): + """ + This class encodes and decodes a set of bounding boxes into the representation used for training the regressors. + """ + + def __init__(self, weights, bbox_xform_clip=math.log(1000. 
/ 16)): + """ + Arguments: + weights (4-element tuple) + bbox_xform_clip (float) + """ + self.weights = weights + self.bbox_xform_clip = bbox_xform_clip + + def encode(self, reference_boxes, proposals): + """ + Encode a set of proposals with respect to some + reference boxes + + Arguments: + reference_boxes (Tensor): reference boxes + proposals (Tensor): boxes to be encoded + """ + TO_REMOVE = 1 # TODO remove + ex_widths = proposals[:, 2] - proposals[:, 0] + TO_REMOVE + ex_heights = proposals[:, 3] - proposals[:, 1] + TO_REMOVE + ex_ctr_x = proposals[:, 0] + 0.5 * ex_widths + ex_ctr_y = proposals[:, 1] + 0.5 * ex_heights + + gt_widths = reference_boxes[:, 2] - reference_boxes[:, 0] + TO_REMOVE + gt_heights = reference_boxes[:, 3] - reference_boxes[:, 1] + TO_REMOVE + gt_ctr_x = reference_boxes[:, 0] + 0.5 * gt_widths + gt_ctr_y = reference_boxes[:, 1] + 0.5 * gt_heights + + wx, wy, ww, wh = self.weights + targets_dx = wx * (gt_ctr_x - ex_ctr_x) / ex_widths + targets_dy = wy * (gt_ctr_y - ex_ctr_y) / ex_heights + targets_dw = ww * torch.log(gt_widths / ex_widths) + targets_dh = wh * torch.log(gt_heights / ex_heights) + + targets = torch.stack((targets_dx, targets_dy, targets_dw, targets_dh), dim=1) + return targets + + def encode_iou(self, reference_boxes, proposals): + """ + Encode a set of proposals with respect to some + reference boxes + + Arguments: + reference_boxes (Tensor): reference boxes + proposals (Tensor): boxes to be encoded + """ + TO_REMOVE = 1 # TODO remove + ex_widths = proposals[:, 2] - proposals[:, 0] + TO_REMOVE + ex_heights = proposals[:, 3] - proposals[:, 1] + TO_REMOVE + ex_ctr_x = proposals[:, 0] + 0.5 * ex_widths + ex_ctr_y = proposals[:, 1] + 0.5 * ex_heights + + gt_widths = reference_boxes[:, 2] - reference_boxes[:, 0] + TO_REMOVE + gt_heights = reference_boxes[:, 3] - reference_boxes[:, 1] + TO_REMOVE + gt_ctr_x = reference_boxes[:, 0] + 0.5 * gt_widths + gt_ctr_y = reference_boxes[:, 1] + 0.5 * gt_heights + + wx, wy, ww, wh = self.weights + targets_dx = wx * (gt_ctr_x - ex_ctr_x) / ex_widths + targets_dy = wy * (gt_ctr_y - ex_ctr_y) / ex_heights + targets_dw = ww * torch.log(gt_widths / ex_widths) + targets_dh = wh * torch.log(gt_heights / ex_heights) + + targets = torch.stack((targets_dx, targets_dy, targets_dw, targets_dh), dim=1) + return targets + + + def decode(self, rel_codes, boxes): + """ + From a set of original boxes and encoded relative box offsets, + get the decoded boxes. + + Arguments: + rel_codes (Tensor): encoded boxes # predict [2, 12000, 4] + boxes (Tensor): reference boxes. 
# anchor [2, 12000, 4] xmin0 ymin1 xmax2 ymax3 + """ + boxes = boxes.to(rel_codes.dtype) + + + TO_REMOVE = 1 # TODO remove + widths = boxes[:, 2] - boxes[:, 0] + TO_REMOVE + heights = boxes[:, 3] - boxes[:, 1] + TO_REMOVE + ctr_x = boxes[:, 0] + 0.5 * widths + ctr_y = boxes[:, 1] + 0.5 * heights + + wx, wy, ww, wh = self.weights + dx = rel_codes[:, 0::4] / wx + dy = rel_codes[:, 1::4] / wy + dw = rel_codes[:, 2::4] / ww + dh = rel_codes[:, 3::4] / wh + + dw = torch.clamp(dw, max=self.bbox_xform_clip) + dh = torch.clamp(dh, max=self.bbox_xform_clip) + + pred_ctr_x = dx * widths[:, None] + ctr_x[:, None] + pred_ctr_y = dy * heights[:, None] + ctr_y[:, None] + pred_w = torch.exp(dw) * widths[:, None] + pred_h = torch.exp(dh) * heights[:, None] + + ############################## + + pred_boxes = torch.zeros_like(rel_codes) + pred_boxes[:, 0::4] = pred_ctr_x - 0.5 * pred_w + pred_boxes[:, 1::4] = pred_ctr_y - 0.5 * pred_h + pred_boxes[:, 2::4] = pred_ctr_x + 0.5 * pred_w - 1 + pred_boxes[:, 3::4] = pred_ctr_y + 0.5 * pred_h - 1 + + return pred_boxes + + + def decode_iou(self, rel_codes, boxes, num_p = 8): + """ + From a set of original boxes and encoded relative box offsets, + get the decoded boxes. + + Arguments: + rel_codes (Tensor): encoded boxes # predict [2, 12000, 4] + boxes (Tensor): reference boxes. # anchor [2, 12000, 4] xmin0 ymin1 xmax2 ymax3 + """ + boxes = boxes.to(rel_codes.dtype) + + TO_REMOVE = 1 # TODO remove + widths = boxes[:, 2] - boxes[:, 0] + TO_REMOVE + heights = boxes[:, 3] - boxes[:, 1] + TO_REMOVE + + ctr_x = boxes[:, 0] + 0.5 * widths + ctr_y = boxes[:, 1] + 0.5 * heights + # 123 + # 8#4 + # 765 + if num_p == 8: # 8 boundary points + x_1 = boxes[:, 0] + widths * rel_codes[:, 0] + y_1 = boxes[:, 1] + heights * rel_codes[:, 1] + x_2 = ctr_x + widths * rel_codes[:, 2] + y_2 = boxes[:, 1] + heights * rel_codes[:, 3] + x_3 = boxes[:, 2] + widths * rel_codes[:, 4] + y_3 = boxes[:, 1] + heights * rel_codes[:, 5] + x_4 = boxes[:, 2] + widths * rel_codes[:, 6] + y_4 = ctr_y + heights * rel_codes[:, 7] + x_5 = boxes[:, 2] + widths * rel_codes[:, 8] + y_5 = boxes[:, 3] + heights * rel_codes[:, 9] + x_6 = ctr_x + widths * rel_codes[:, 10] + y_6 = boxes[:, 3] + heights * rel_codes[:, 11] + x_7 = boxes[:, 0] + widths * rel_codes[:, 12] + y_7 = boxes[:, 3] + heights * rel_codes[:, 13] + x_8 = boxes[:, 0] + widths * rel_codes[:, 14] + y_8 = ctr_y + heights * rel_codes[:, 15] + x_total = torch.stack([x_1, x_2, x_3, x_4, x_5, x_6, x_7, x_8], 0) + y_total = torch.stack([y_1, y_2, y_3, y_4, y_5, y_6, y_7, y_8], 0) + + x_min = torch.min(x_total, 0, keepdim=True) # [1, N] + x_max = torch.max(x_total, 0, keepdim=True) + + y_min = torch.min(y_total, 0, keepdim=True) + y_max = torch.max(y_total, 0, keepdim=True) + + N1, N2 = x_min[0].shape + x_min = x_min[0].view([N2]) + x_max = x_max[0].view([N2]) + y_min = y_min[0].view([N2]) + y_max = y_max[0].view([N2]) + + x_min = torch.stack([x_min, ctr_x], 0) + x_max = torch.stack([x_max, ctr_x], 0) + y_min = torch.stack([y_min, ctr_y], 0) + y_max = torch.stack([y_max, ctr_y], 0) + + x_min = torch.min(x_min, 0, keepdim=True) # [1, N] + x_max = torch.max(x_max, 0, keepdim=True) + y_min = torch.min(y_min, 0, keepdim=True) + y_max = torch.max(y_max, 0, keepdim=True) + + pred_boxes = torch.zeros_like(boxes) + + pred_boxes[:, 0] = x_min[0][0, :] + pred_boxes[:, 1] = y_min[0][0, :] + pred_boxes[:, 2] = x_max[0][0, :] + pred_boxes[:, 3] = y_max[0][0, :] + + + return pred_boxes diff --git a/maskrcnn_benchmark/modeling/detector/__init__.py 
b/maskrcnn_benchmark/modeling/detector/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..ff421e281e16e6623bab2551b242ea003d1f2166 --- /dev/null +++ b/maskrcnn_benchmark/modeling/detector/__init__.py @@ -0,0 +1,2 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +from .detectors import build_detection_model diff --git a/maskrcnn_benchmark/modeling/detector/__pycache__/__init__.cpython-37.pyc b/maskrcnn_benchmark/modeling/detector/__pycache__/__init__.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..674c0fcc2a09067ad812c6e7f1f2c295bb13d495 Binary files /dev/null and b/maskrcnn_benchmark/modeling/detector/__pycache__/__init__.cpython-37.pyc differ diff --git a/maskrcnn_benchmark/modeling/detector/__pycache__/detectors.cpython-37.pyc b/maskrcnn_benchmark/modeling/detector/__pycache__/detectors.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..525398659531677234501163fa548df7e21deb7e Binary files /dev/null and b/maskrcnn_benchmark/modeling/detector/__pycache__/detectors.cpython-37.pyc differ diff --git a/maskrcnn_benchmark/modeling/detector/__pycache__/generalized_rcnn.cpython-37.pyc b/maskrcnn_benchmark/modeling/detector/__pycache__/generalized_rcnn.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..5fa1e99e5a9a2c671b6d3b09474c1a3be7bc370a Binary files /dev/null and b/maskrcnn_benchmark/modeling/detector/__pycache__/generalized_rcnn.cpython-37.pyc differ diff --git a/maskrcnn_benchmark/modeling/detector/detectors.py b/maskrcnn_benchmark/modeling/detector/detectors.py new file mode 100644 index 0000000000000000000000000000000000000000..af2100cac15830cd60be5911aa15d0d7c9309a17 --- /dev/null +++ b/maskrcnn_benchmark/modeling/detector/detectors.py @@ -0,0 +1,10 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +from .generalized_rcnn import GeneralizedRCNN + + +_DETECTION_META_ARCHITECTURES = {"GeneralizedRCNN": GeneralizedRCNN} + + +def build_detection_model(cfg): + meta_arch = _DETECTION_META_ARCHITECTURES[cfg.MODEL.META_ARCHITECTURE] + return meta_arch(cfg) diff --git a/maskrcnn_benchmark/modeling/detector/generalized_rcnn.py b/maskrcnn_benchmark/modeling/detector/generalized_rcnn.py new file mode 100644 index 0000000000000000000000000000000000000000..6dfab58deee63d3483927a50f1a8b3a548119ee6 --- /dev/null +++ b/maskrcnn_benchmark/modeling/detector/generalized_rcnn.py @@ -0,0 +1,73 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +""" +Implements the Generalized R-CNN framework +""" + +import torch +from torch import nn + +from maskrcnn_benchmark.structures.image_list import to_image_list + +from ..backbone import build_backbone +from ..rpn.rpn import build_rpn +from ..roi_heads.roi_heads import build_roi_heads +import numpy as np +import cv2 + +class GeneralizedRCNN(nn.Module): + """ + Main class for Generalized R-CNN. Currently supports boxes and masks. + It consists of three main parts: + - backbone + - rpn + - heads: takes the features + the proposals from the RPN and computes + detections / masks from it. 
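+    During training, forward() returns a dict of losses from the RPN and the
+    ROI heads; at test time it returns the predicted BoxLists (or the raw RPN
+    proposals when no ROI heads are configured).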
+ """ + + def __init__(self, cfg): + super(GeneralizedRCNN, self).__init__() + + self.cfg = cfg.clone() + self.backbone = build_backbone(cfg) + self.rpn = build_rpn(cfg, self.backbone.out_channels) + self.roi_heads = build_roi_heads(cfg, self.backbone.out_channels) + + + def forward(self, images, targets=None): + """ + Arguments: + images (list[Tensor] or ImageList): images to be processed + targets (list[BoxList]): ground-truth boxes present in the image (optional) + + Returns: + result (list[BoxList] or dict[Tensor]): the output from the model. + During training, it returns a dict[Tensor] which contains the losses. + During testing, it returns list[BoxList] contains additional fields + like `scores`, `labels` and `mask` (for Mask R-CNN models). + + """ + if self.training and targets is None: + raise ValueError("In training mode, targets should be passed") + + + images = to_image_list(images) + + features = self.backbone(images.tensors) + proposals, proposal_losses = self.rpn(images, features, targets) + if self.roi_heads: + x, result, detector_losses = self.roi_heads(features, proposals, targets) + else: + #self.warm_start -= 1 + # RPN-only models don't have roi_heads + x = features + result = proposals + detector_losses = {} + + if self.training: + losses = {} + losses.update(detector_losses) + losses.update(proposal_losses) + + return losses + else: + return result diff --git a/maskrcnn_benchmark/modeling/make_layers.py b/maskrcnn_benchmark/modeling/make_layers.py new file mode 100644 index 0000000000000000000000000000000000000000..1656fb4f0ea4aeb65260f46beb80e8bd14fcc091 --- /dev/null +++ b/maskrcnn_benchmark/modeling/make_layers.py @@ -0,0 +1,126 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +""" +Miscellaneous utility functions +""" + +import torch +from torch import nn +from torch.nn import functional as F +from maskrcnn_benchmark.config import cfg +from maskrcnn_benchmark.layers import Conv2d, DCN +from maskrcnn_benchmark.modeling.poolers import Pooler + + +def get_group_gn(dim, dim_per_gp, num_groups): + """get number of groups used by GroupNorm, based on number of channels.""" + assert dim_per_gp == -1 or num_groups == -1, \ + "GroupNorm: can only specify G or C/G." 
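+    # e.g. dim=256 with num_groups=32 (dim_per_gp=-1) -> 32 groups;
+    #      dim=256 with dim_per_gp=8 (num_groups=-1)  -> 256 // 8 = 32 groups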
+ + if dim_per_gp > 0: + assert dim % dim_per_gp == 0, \ + "dim: {}, dim_per_gp: {}".format(dim, dim_per_gp) + group_gn = dim // dim_per_gp + else: + assert dim % num_groups == 0, \ + "dim: {}, num_groups: {}".format(dim, num_groups) + group_gn = num_groups + + return group_gn + + +def group_norm(out_channels, affine=True, divisor=1): + out_channels = out_channels // divisor + dim_per_gp = cfg.MODEL.GROUP_NORM.DIM_PER_GP // divisor + num_groups = cfg.MODEL.GROUP_NORM.NUM_GROUPS // divisor + eps = cfg.MODEL.GROUP_NORM.EPSILON # default: 1e-5 + return torch.nn.GroupNorm( + get_group_gn(out_channels, dim_per_gp, num_groups), + out_channels, + eps, + affine + ) + + +def make_conv3x3( + in_channels, + out_channels, + dilation=1, + stride=1, + use_gn=False, + use_relu=False, + kaiming_init=True +): + conv = Conv2d( + in_channels, + out_channels, + kernel_size=3, + stride=stride, + padding=dilation, + dilation=dilation, + bias=False if use_gn else True + ) + if kaiming_init: + nn.init.kaiming_normal_( + conv.weight, mode="fan_out", nonlinearity="relu" + ) + else: + torch.nn.init.normal_(conv.weight, std=0.01) + if not use_gn: + nn.init.constant_(conv.bias, 0) + module = [conv,] + if use_gn: + module.append(group_norm(out_channels)) + if use_relu: + module.append(nn.ReLU(inplace=True)) + if len(module) > 1: + return nn.Sequential(*module) + return conv + + +def make_fc(dim_in, hidden_dim, use_gn=False): + ''' + Caffe2 implementation uses XavierFill, which in fact + corresponds to kaiming_uniform_ in PyTorch + ''' + if use_gn: + fc = nn.Linear(dim_in, hidden_dim, bias=False) + nn.init.kaiming_uniform_(fc.weight, a=1) + return nn.Sequential(fc, group_norm(hidden_dim)) + fc = nn.Linear(dim_in, hidden_dim) + nn.init.kaiming_uniform_(fc.weight, a=1) + nn.init.constant_(fc.bias, 0) + return fc + + +def conv_with_kaiming_uniform(use_gn=False, use_relu=False, use_deformable=False): + def make_conv( + in_channels, out_channels, kernel_size, stride=1, dilation=1 + ): + if use_deformable: + conv_func = DCN + else: + conv_func = Conv2d + conv = conv_func( + in_channels, + out_channels, + kernel_size=kernel_size, + stride=stride, + padding=dilation * (kernel_size - 1) // 2, + dilation=dilation, + bias=False if use_gn else True + ) + # Caffe2 implementation uses XavierFill, which in fact + # corresponds to kaiming_uniform_ in PyTorch + nn.init.kaiming_uniform_(conv.weight, a=1) + if not use_gn: + nn.init.constant_(conv.bias, 0) + module = [conv,] + if use_gn: + module.append(group_norm(out_channels)) + if use_relu: + module.append(nn.ReLU(inplace=True)) + if len(module) > 1: + return nn.Sequential(*module) + return conv + + return make_conv diff --git a/maskrcnn_benchmark/modeling/matcher.py b/maskrcnn_benchmark/modeling/matcher.py new file mode 100644 index 0000000000000000000000000000000000000000..35ec5f1fe819526055c10607f05d47ac88277de6 --- /dev/null +++ b/maskrcnn_benchmark/modeling/matcher.py @@ -0,0 +1,112 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +import torch + + +class Matcher(object): + """ + This class assigns to each predicted "element" (e.g., a box) a ground-truth + element. Each predicted element will have exactly zero or one matches; each + ground-truth element may be assigned to zero or more predicted elements. + + Matching is based on the MxN match_quality_matrix, that characterizes how well + each (ground-truth, predicted)-pair match. For example, if the elements are + boxes, the matrix may contain box IoU overlap values. 
+ + The matcher returns a tensor of size N containing the index of the ground-truth + element m that matches to prediction n. If there is no match, a negative value + is returned. + """ + + BELOW_LOW_THRESHOLD = -1 + BETWEEN_THRESHOLDS = -2 + + def __init__(self, high_threshold, low_threshold, allow_low_quality_matches=False): + """ + Args: + high_threshold (float): quality values greater than or equal to + this value are candidate matches. + low_threshold (float): a lower quality threshold used to stratify + matches into three levels: + 1) matches >= high_threshold + 2) BETWEEN_THRESHOLDS matches in [low_threshold, high_threshold) + 3) BELOW_LOW_THRESHOLD matches in [0, low_threshold) + allow_low_quality_matches (bool): if True, produce additional matches + for predictions that have only low-quality match candidates. See + set_low_quality_matches_ for more details. + """ + assert low_threshold <= high_threshold + self.high_threshold = high_threshold + self.low_threshold = low_threshold + self.allow_low_quality_matches = allow_low_quality_matches + + def __call__(self, match_quality_matrix): + """ + Args: + match_quality_matrix (Tensor[float]): an MxN tensor, containing the + pairwise quality between M ground-truth elements and N predicted elements. + + Returns: + matches (Tensor[int64]): an N tensor where N[i] is a matched gt in + [0, M - 1] or a negative value indicating that prediction i could not + be matched. + """ + if match_quality_matrix.numel() == 0: + # empty targets or proposals not supported during training + if match_quality_matrix.shape[0] == 0: + raise ValueError( + "No ground-truth boxes available for one of the images " + "during training") + else: + raise ValueError( + "No proposal boxes available for one of the images " + "during training") + + # match_quality_matrix is M (gt) x N (predicted) + # Max over gt elements (dim 0) to find best gt candidate for each prediction + matched_vals, matches = match_quality_matrix.max(dim=0) + if self.allow_low_quality_matches: + all_matches = matches.clone() + + # Assign candidate matches with low quality to negative (unassigned) values + below_low_threshold = matched_vals < self.low_threshold + between_thresholds = (matched_vals >= self.low_threshold) & ( + matched_vals < self.high_threshold + ) + matches[below_low_threshold] = Matcher.BELOW_LOW_THRESHOLD + matches[between_thresholds] = Matcher.BETWEEN_THRESHOLDS + + if self.allow_low_quality_matches: + self.set_low_quality_matches_(matches, all_matches, match_quality_matrix) + + return matches + + def set_low_quality_matches_(self, matches, all_matches, match_quality_matrix): + """ + Produce additional matches for predictions that have only low-quality matches. + Specifically, for each ground-truth find the set of predictions that have + maximum overlap with it (including ties); for each prediction in that set, if + it is unmatched, then match it to the ground-truth with which it has the highest + quality value. 
+ """ + # For each gt, find the prediction with which it has highest quality + highest_quality_foreach_gt, _ = match_quality_matrix.max(dim=1) + # Find highest quality match available, even if it is low, including ties + gt_pred_pairs_of_highest_quality = torch.nonzero( + match_quality_matrix == highest_quality_foreach_gt[:, None] + ) + # Example gt_pred_pairs_of_highest_quality: + # tensor([[ 0, 39796], + # [ 1, 32055], + # [ 1, 32070], + # [ 2, 39190], + # [ 2, 40255], + # [ 3, 40390], + # [ 3, 41455], + # [ 4, 45470], + # [ 5, 45325], + # [ 5, 46390]]) + # Each row is a (gt index, prediction index) + # Note how gt items 1, 2, 3, and 5 each have two ties + + pred_inds_to_update = gt_pred_pairs_of_highest_quality[:, 1] + matches[pred_inds_to_update] = all_matches[pred_inds_to_update] diff --git a/maskrcnn_benchmark/modeling/poolers.py b/maskrcnn_benchmark/modeling/poolers.py new file mode 100644 index 0000000000000000000000000000000000000000..0164f439b8668fb136611249eb8301a2d90e7d1d --- /dev/null +++ b/maskrcnn_benchmark/modeling/poolers.py @@ -0,0 +1,151 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +import torch +import torch.nn.functional as F +from torch import nn + +from maskrcnn_benchmark.layers import ROIAlign +from maskrcnn_benchmark.layers import DCNPooling + +from .utils import cat + + +class LevelMapper(object): + """Determine which FPN level each RoI in a set of RoIs should map to based + on the heuristic in the FPN paper. + """ + + def __init__(self, k_min, k_max, canonical_scale=224, canonical_level=4, eps=1e-6): + """ + Arguments: + k_min (int) + k_max (int) + canonical_scale (int) + canonical_level (int) + eps (float) + """ + self.k_min = k_min + self.k_max = k_max + self.s0 = canonical_scale + self.lvl0 = canonical_level + self.eps = eps + + def __call__(self, boxlists): + """ + Arguments: + boxlists (list[BoxList]) + """ + # Compute level ids + s = torch.sqrt(cat([boxlist.area() for boxlist in boxlists])) + + # Eqn.(1) in FPN paper + target_lvls = torch.floor(self.lvl0 + torch.log2(s / self.s0 + self.eps)) + target_lvls = torch.clamp(target_lvls, min=self.k_min, max=self.k_max) + return target_lvls.to(torch.int64) - self.k_min + + def get_random(self, level): + """ Generate a random roi for target level + """ + xmin, ymin, xmax, ymax = torch.tensor + + +class Pooler(nn.Module): + """ + Pooler for Detection with or without FPN. + It currently hard-code ROIAlign in the implementation, + but that can be made more generic later on. + Also, the requirement of passing the scales is not strictly necessary, as they + can be inferred from the size of the feature map / size of original image, + which is available thanks to the BoxList. 
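+    RoIs are distributed over the pooling levels by the LevelMapper above,
+    which follows Eqn.(1) of the FPN paper:
+    target_level = floor(canonical_level + log2(sqrt(area) / canonical_scale + eps)),
+    clamped to [k_min, k_max].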
+ """ + + def __init__(self, output_size, scales, sampling_ratio, + deformable=False, output_channel=256): + """ + Arguments: + output_size (list[tuple[int]] or list[int]): output size for the pooled region + scales (list[float]): scales for each Pooler + sampling_ratio (int): sampling ratio for ROIAlign + """ + super(Pooler, self).__init__() + poolers = [] + for scale in scales: + poolers.append( + ROIAlign( + output_size, spatial_scale=scale, sampling_ratio=sampling_ratio + ) if not deformable else + DCNPooling(spatial_scale=scale, pooled_size=output_size, no_trans=False, + group_size=1, trans_std=0.1, output_dim=output_channel) + ) + self.poolers = nn.ModuleList(poolers) + self.output_size = output_size + # get the levels in the feature map by leveraging the fact that the network always + # downsamples by a factor of 2 at each level. + lvl_min = -torch.log2(torch.tensor(scales[0], dtype=torch.float32)).item() + lvl_max = -torch.log2(torch.tensor(scales[-1], dtype=torch.float32)).item() + self.map_levels = LevelMapper(lvl_min, lvl_max, canonical_scale=160) + + def convert_to_roi_format(self, boxes): + concat_boxes = cat([b.bbox for b in boxes], dim=0) + device, dtype = concat_boxes.device, concat_boxes.dtype + ids = cat( + [ + torch.full((len(b), 1), i, dtype=dtype, device=device) + for i, b in enumerate(boxes) + ], + dim=0, + ) + rois = torch.cat([ids, concat_boxes], dim=1) + return rois + + def forward(self, x, boxes): + """ + Arguments: + x (list[Tensor]): feature maps for each level + boxes (list[BoxList]): boxes to be used to perform the pooling operation. + Returns: + result (Tensor) + """ + num_levels = len(self.poolers) + rois = self.convert_to_roi_format(boxes) + if num_levels == 1: + return self.poolers[0](x[0], rois) + + levels = self.map_levels(boxes) + + num_rois = len(rois) + num_channels = x[0].shape[1] + output_size = self.output_size[0] + + dtype, device = x[0].dtype, x[0].device + result = torch.zeros( + (num_rois, num_channels, output_size, output_size), + dtype=dtype, + device=device, + ) + for level, (per_level_feature, pooler) in enumerate(zip(x, self.poolers)): + idx_in_level = torch.nonzero(levels == level).squeeze(1) + rois_per_level = rois[idx_in_level] + if idx_in_level.numel() == 0: + if num_rois == 0: + continue + # create a roi and do one empty forward pass + new_level = idx_in_level.new_tensor((0,)) + new_rois = rois[new_level] + result[new_level] = result[new_level] \ + + pooler(per_level_feature, new_rois) * 0.0 + else: + result[idx_in_level] = pooler(per_level_feature, rois_per_level) + + return result + + +def make_pooler(cfg, head_name): + resolution = cfg.MODEL[head_name].POOLER_RESOLUTION + scales = cfg.MODEL[head_name].POOLER_SCALES + sampling_ratio = cfg.MODEL[head_name].POOLER_SAMPLING_RATIO + pooler = Pooler( + output_size=(resolution, resolution), + scales=scales, + sampling_ratio=sampling_ratio, + ) + return pooler diff --git a/maskrcnn_benchmark/modeling/registry.py b/maskrcnn_benchmark/modeling/registry.py new file mode 100644 index 0000000000000000000000000000000000000000..e14fb118c458d0ba97d2a699be3004c6bdd3913c --- /dev/null +++ b/maskrcnn_benchmark/modeling/registry.py @@ -0,0 +1,12 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 
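+# Each registry maps a config string to an implementation. A typical registration
+# (sketch, assuming the decorator form of utils.registry.Registry) looks like:
+#
+#   @registry.ROI_BOX_FEATURE_EXTRACTORS.register("FPN2MLPFeatureExtractor")
+#   class FPN2MLPFeatureExtractor(nn.Module):
+#       ...
+#
+# so config values such as MODEL.ROI_BOX_HEAD.FEATURE_EXTRACTOR select a class by name.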
+ +from maskrcnn_benchmark.utils.registry import Registry + +BACKBONES = Registry() +RPN_HEADS = Registry() +ROI_BOX_FEATURE_EXTRACTORS = Registry() +ROI_BOX_PREDICTOR = Registry() +ROI_KEYPOINT_FEATURE_EXTRACTORS = Registry() +ROI_KEYPOINT_PREDICTOR = Registry() +ROI_MASK_FEATURE_EXTRACTORS = Registry() +ROI_MASK_PREDICTOR = Registry() diff --git a/maskrcnn_benchmark/modeling/roi_heads/__init__.py b/maskrcnn_benchmark/modeling/roi_heads/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/maskrcnn_benchmark/modeling/roi_heads/__pycache__/__init__.cpython-37.pyc b/maskrcnn_benchmark/modeling/roi_heads/__pycache__/__init__.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..200603ec4fc014adada1cc6d180f84c22c92d9a9 Binary files /dev/null and b/maskrcnn_benchmark/modeling/roi_heads/__pycache__/__init__.cpython-37.pyc differ diff --git a/maskrcnn_benchmark/modeling/roi_heads/__pycache__/roi_heads.cpython-37.pyc b/maskrcnn_benchmark/modeling/roi_heads/__pycache__/roi_heads.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..50e17ea8bccbccce7553e5d955ed608e15878c22 Binary files /dev/null and b/maskrcnn_benchmark/modeling/roi_heads/__pycache__/roi_heads.cpython-37.pyc differ diff --git a/maskrcnn_benchmark/modeling/roi_heads/boundary_head/__init__.py b/maskrcnn_benchmark/modeling/roi_heads/boundary_head/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/maskrcnn_benchmark/modeling/roi_heads/boundary_head/__pycache__/__init__.cpython-37.pyc b/maskrcnn_benchmark/modeling/roi_heads/boundary_head/__pycache__/__init__.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..671bf7a8e888f033a1b68d3a691bd4024518a426 Binary files /dev/null and b/maskrcnn_benchmark/modeling/roi_heads/boundary_head/__pycache__/__init__.cpython-37.pyc differ diff --git a/maskrcnn_benchmark/modeling/roi_heads/boundary_head/__pycache__/boundary_head.cpython-37.pyc b/maskrcnn_benchmark/modeling/roi_heads/boundary_head/__pycache__/boundary_head.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..83bbc12a984d6e8dc18152757b28c65a55ad0345 Binary files /dev/null and b/maskrcnn_benchmark/modeling/roi_heads/boundary_head/__pycache__/boundary_head.cpython-37.pyc differ diff --git a/maskrcnn_benchmark/modeling/roi_heads/boundary_head/__pycache__/inference.cpython-37.pyc b/maskrcnn_benchmark/modeling/roi_heads/boundary_head/__pycache__/inference.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..adaf00725f22784d78ced491b33369f9540a3bfe Binary files /dev/null and b/maskrcnn_benchmark/modeling/roi_heads/boundary_head/__pycache__/inference.cpython-37.pyc differ diff --git a/maskrcnn_benchmark/modeling/roi_heads/boundary_head/__pycache__/ke_head.cpython-37.pyc b/maskrcnn_benchmark/modeling/roi_heads/boundary_head/__pycache__/ke_head.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..194d3a457ba237a4916d16b7609302cc6d18f608 Binary files /dev/null and b/maskrcnn_benchmark/modeling/roi_heads/boundary_head/__pycache__/ke_head.cpython-37.pyc differ diff --git a/maskrcnn_benchmark/modeling/roi_heads/boundary_head/__pycache__/loss.cpython-37.pyc b/maskrcnn_benchmark/modeling/roi_heads/boundary_head/__pycache__/loss.cpython-37.pyc new file mode 100644 index 
0000000000000000000000000000000000000000..07f33dcf36472ecf3c90e17717fb50cfcb034e28 Binary files /dev/null and b/maskrcnn_benchmark/modeling/roi_heads/boundary_head/__pycache__/loss.cpython-37.pyc differ diff --git a/maskrcnn_benchmark/modeling/roi_heads/boundary_head/__pycache__/roi_boundary_feature_extractors.cpython-37.pyc b/maskrcnn_benchmark/modeling/roi_heads/boundary_head/__pycache__/roi_boundary_feature_extractors.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..aaeaa85fe3187a5b97ec7d912f56ea518ff0e7f0 Binary files /dev/null and b/maskrcnn_benchmark/modeling/roi_heads/boundary_head/__pycache__/roi_boundary_feature_extractors.cpython-37.pyc differ diff --git a/maskrcnn_benchmark/modeling/roi_heads/boundary_head/__pycache__/roi_boundary_predictors.cpython-37.pyc b/maskrcnn_benchmark/modeling/roi_heads/boundary_head/__pycache__/roi_boundary_predictors.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..8899f61e1f2d33be72ea139133231f50c0b45342 Binary files /dev/null and b/maskrcnn_benchmark/modeling/roi_heads/boundary_head/__pycache__/roi_boundary_predictors.cpython-37.pyc differ diff --git a/maskrcnn_benchmark/modeling/roi_heads/boundary_head/__pycache__/roi_ke_feature_extractors.cpython-37.pyc b/maskrcnn_benchmark/modeling/roi_heads/boundary_head/__pycache__/roi_ke_feature_extractors.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..34d4eb611ed7e32740b31dfb5f5733410128c1e4 Binary files /dev/null and b/maskrcnn_benchmark/modeling/roi_heads/boundary_head/__pycache__/roi_ke_feature_extractors.cpython-37.pyc differ diff --git a/maskrcnn_benchmark/modeling/roi_heads/boundary_head/__pycache__/roi_ke_predictors.cpython-37.pyc b/maskrcnn_benchmark/modeling/roi_heads/boundary_head/__pycache__/roi_ke_predictors.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..7777e886dcfa288907e56652e6ba98dfd4de29c4 Binary files /dev/null and b/maskrcnn_benchmark/modeling/roi_heads/boundary_head/__pycache__/roi_ke_predictors.cpython-37.pyc differ diff --git a/maskrcnn_benchmark/modeling/roi_heads/boundary_head/boundary_head.py b/maskrcnn_benchmark/modeling/roi_heads/boundary_head/boundary_head.py new file mode 100644 index 0000000000000000000000000000000000000000..643e58b01e04cab324420ce9a09f0310f2a97d91 --- /dev/null +++ b/maskrcnn_benchmark/modeling/roi_heads/boundary_head/boundary_head.py @@ -0,0 +1,104 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +import torch +from torch import nn + +from maskrcnn_benchmark.structures.bounding_box import BoxList + +from .roi_boundary_feature_extractors import make_roi_boundary_feature_extractor +from .roi_boundary_predictors import make_roi_boundary_predictor +from .inference import make_roi_boundary_post_processor +from .loss import make_roi_boundary_loss_evaluator + +def keep_only_positive_boxes(boxes): + """ + Given a set of BoxList containing the `labels` field, + return a set of BoxList for which `labels > 0`. 
+ + Arguments: + boxes (list of BoxList) + """ + assert isinstance(boxes, (list, tuple)) + assert isinstance(boxes[0], BoxList) + assert boxes[0].has_field("labels") + positive_boxes = [] + positive_inds = [] + num_boxes = 0 + for boxes_per_image in boxes: + labels = boxes_per_image.get_field("labels") + inds_mask = labels > 0 + inds = inds_mask.nonzero().squeeze(1) + positive_boxes.append(boxes_per_image[inds]) + positive_inds.append(inds_mask) + return positive_boxes, positive_inds + + +def keep_only_positive_boxes(boxes): + """ + Given a set of BoxList containing the `labels` field, + return a set of BoxList for which `labels > 0`. + + Arguments: + boxes (list of BoxList) + """ + assert isinstance(boxes, (list, tuple)) + assert isinstance(boxes[0], BoxList) + assert boxes[0].has_field("labels") + positive_boxes = [] + positive_inds = [] + num_boxes = 0 + for boxes_per_image in boxes: + labels = boxes_per_image.get_field("labels") + inds_mask = labels > 0 + inds = inds_mask.nonzero().squeeze(1) + positive_boxes.append(boxes_per_image[inds]) + positive_inds.append(inds_mask) + return positive_boxes, positive_inds + + +class ROIBOHead(torch.nn.Module): + def __init__(self, cfg, in_channels): + super(ROIBOHead, self).__init__() + self.cfg = cfg.clone() + self.feature_extractor = make_roi_boundary_feature_extractor(cfg, in_channels) + self.predictor = make_roi_boundary_predictor(cfg) + self.post_processor = make_roi_boundary_post_processor(cfg) + self.loss_evaluator = make_roi_boundary_loss_evaluator(cfg) + + def forward(self, features, proposals, targets=None): + """ + Arguments: + features (list[Tensor]): feature-maps from possibly several levels + proposals (list[BoxList]): proposal boxes + targets (list[BoxList], optional): the ground-truth targets. + + Returns: + x (Tensor): the result of the feature extractor + proposals (list[BoxList]): during training, the original proposals + are returned. During testing, the predicted boxlists are returned + with the `mask` field set + losses (dict[Tensor]): During training, returns the losses for the + head. During testing, returns an empty dict. + """ + + if self.training: + # during training, only focus on positive boxes + with torch.no_grad(): + # proposals = self.loss_evaluator.subsample(proposals, targets) + all_proposals = proposals + proposals, positive_inds = keep_only_positive_boxes(proposals) + + x = self.feature_extractor(features, proposals) + outputs_x, outputs_y= self.predictor(x) + + if not self.training: + result = self.post_processor(outputs_x, outputs_y, proposals) + + return x, result, {}, {}, {} + + loss_bo, loss_x, loss_y = self.loss_evaluator(proposals, outputs_x, outputs_y, targets) + + return x, proposals, dict(loss_bo=loss_bo), dict(loss_bo_x=loss_x), dict(loss_bo_y=loss_y) + + +def build_roi_boundary_head(cfg, in_channels): + return ROIBOHead(cfg, in_channels) diff --git a/maskrcnn_benchmark/modeling/roi_heads/boundary_head/inference.py b/maskrcnn_benchmark/modeling/roi_heads/boundary_head/inference.py new file mode 100644 index 0000000000000000000000000000000000000000..e734da2b274434d001fecaec37d4437e890edfda --- /dev/null +++ b/maskrcnn_benchmark/modeling/roi_heads/boundary_head/inference.py @@ -0,0 +1,207 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 
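+# Post-processing for the boundary head: the per-RoI x/y boundary logits are
+# passed through a sigmoid and attached to each predicted box as the extra
+# fields "mask_x" and "mask_y" (see MaskPostProcessor below).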
+import numpy as np +import torch +from torch import nn +from maskrcnn_benchmark.layers.misc import interpolate + +from maskrcnn_benchmark.structures.bounding_box import BoxList + + +# TODO check if want to return a single BoxList or a composite +# object +class MaskPostProcessor(nn.Module): + """ + From the results of the CNN, post process the masks + by taking the mask corresponding to the class with max + probability (which are of fixed size and directly output + by the CNN) and return the masks in the mask field of the BoxList. + + If a masker object is passed, it will additionally + project the masks in the image according to the locations in boxes, + """ + + def __init__(self, masker=None): + super(MaskPostProcessor, self).__init__() + self.masker = masker + + def forward(self, x, y, boxes): + """ + Arguments: + x (Tensor): the mask logits + boxes (list[BoxList]): bounding boxes that are used as + reference, one for ech image + + Returns: + results (list[BoxList]): one BoxList for each image, containing + the extra field mask + """ + mask_prob_x = x.sigmoid() + mask_prob_y = y.sigmoid() + # select masks coresponding to the predicted classes + num_masks = x.shape[0] # 286 + labels = [bbox.get_field("labels") for bbox in boxes] + labels = torch.cat(labels) + index = torch.arange(num_masks, device=labels.device) + mask_prob_x = mask_prob_x[index, 0][:, None] + mask_prob_y = mask_prob_y[index, 0][:, None] + + boxes_per_image = [len(box) for box in boxes] # boxes for one image + mask_prob_x = mask_prob_x.split(boxes_per_image, dim=0) + mask_prob_y = mask_prob_y.split(boxes_per_image, dim=0) + + if self.masker: + print('yes!!!') + mask_prob_x = self.masker(mask_prob_x, boxes) + mask_prob_y = self.masker(mask_prob_y, boxes) + + results = [] + for prob_x, prob_y, box in zip(mask_prob_x, mask_prob_y, boxes): + bbox = BoxList(box.bbox, box.size, mode="xyxy") + for field in box.fields(): + bbox.add_field(field, box.get_field(field)) + bbox.add_field("mask_x", prob_x) + bbox.add_field("mask_y", prob_y) + results.append(bbox) + return results + + +class MaskPostProcessorCOCOFormat(MaskPostProcessor): + """ + From the results of the CNN, post process the results + so that the masks are pasted in the image, and + additionally convert the results to COCO format. 
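+    Each mask is RLE-encoded with pycocotools.mask.encode and the encoded
+    dictionaries are stored back in the "mask" field of the returned results.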
+ """ + + def forward(self, x, boxes): + import pycocotools.mask as mask_util + import numpy as np + + results = super(MaskPostProcessorCOCOFormat, self).forward(x, boxes) + for result in results: + masks = result.get_field("mask").cpu() + rles = [ + mask_util.encode(np.array(mask[0, :, :, np.newaxis], order="F"))[0] + for mask in masks + ] + for rle in rles: + rle["counts"] = rle["counts"].decode("utf-8") + result.add_field("mask", rles) + return results + + +# the next two functions should be merged inside Masker +# but are kept here for the moment while we need them +# temporarily gor paste_mask_in_image +def expand_boxes(boxes, scale): + w_half = (boxes[:, 2] - boxes[:, 0]) * .5 + h_half = (boxes[:, 3] - boxes[:, 1]) * .5 + x_c = (boxes[:, 2] + boxes[:, 0]) * .5 + y_c = (boxes[:, 3] + boxes[:, 1]) * .5 + + w_half *= scale + h_half *= scale + + boxes_exp = torch.zeros_like(boxes) + boxes_exp[:, 0] = x_c - w_half + boxes_exp[:, 2] = x_c + w_half + boxes_exp[:, 1] = y_c - h_half + boxes_exp[:, 3] = y_c + h_half + return boxes_exp + + +def expand_masks(mask, padding): + N = mask.shape[0] + M = mask.shape[-1] + pad2 = 2 * padding + scale = float(M + pad2) / M + padded_mask = mask.new_zeros((N, 1, M + pad2, M + pad2)) + + padded_mask[:, :, padding:-padding, padding:-padding] = mask + return padded_mask, scale + + +def paste_mask_in_image(mask, box, im_h, im_w, thresh=0.5, padding=1): + padded_mask, scale = expand_masks(mask[None], padding=padding) + mask = padded_mask[0, 0] + box = expand_boxes(box[None], scale)[0] + box = box.to(dtype=torch.int32) + TO_REMOVE = 1 + w = int(box[2] - box[0] + TO_REMOVE) + h = int(box[3] - box[1] + TO_REMOVE) + w = max(w, 1) + h = max(h, 1) + + # Set shape to [batchxCxHxW] + mask = mask.expand((1, 1, -1, -1)) + + # Resize mask + mask = mask.to(torch.float32) + mask = interpolate(mask, size=(h, w), mode='bilinear', align_corners=False) + mask = mask[0][0] + + if thresh >= 0: + mask = mask > thresh + else: + # for visualization and debugging, we also + # allow it to return an unmodified mask + mask = (mask * 255).to(torch.uint8) + + im_mask = torch.zeros((im_h, im_w), dtype=torch.uint8) + x_0 = max(box[0], 0) + x_1 = min(box[2] + 1, im_w) + y_0 = max(box[1], 0) + y_1 = min(box[3] + 1, im_h) + + im_mask[y_0:y_1, x_0:x_1] = mask[ + (y_0 - box[1]) : (y_1 - box[1]), (x_0 - box[0]) : (x_1 - box[0]) + ] + return im_mask + + +class Masker(object): + """ + Projects a set of masks in an image on the locations specified by the bounding boxes + """ + + def __init__(self, threshold=0.5, padding=1): + self.threshold = threshold + self.padding = padding + + def forward_single_image(self, masks, boxes): + boxes = boxes.convert("xyxy") + im_w, im_h = boxes.size + res = [ + paste_mask_in_image(mask[0], box, im_h, im_w, self.threshold, self.padding) + for mask, box in zip(masks, boxes.bbox) + ] + if len(res) > 0: + res = torch.stack(res, dim=0)[:, None] + else: + res = masks.new_empty((0, 1, masks.shape[-2], masks.shape[-1])) + return res + + def __call__(self, masks, boxes): + if isinstance(boxes, BoxList): + boxes = [boxes] + + # Make some sanity check + assert len(boxes) == len(masks), "Masks and boxes should have the same length." + + # TODO: Is this JIT compatible? + # If not we should make it compatible. + results = [] + for mask, box in zip(masks, boxes): + assert mask.shape[0] == len(box), "Number of objects should be the same." 
+ result = self.forward_single_image(mask, box) + results.append(result) + return results + + +def make_roi_boundary_post_processor(cfg): + if cfg.MODEL.ROI_MASK_HEAD.POSTPROCESS_MASKS: + mask_threshold = cfg.MODEL.ROI_MASK_HEAD.POSTPROCESS_MASKS_THRESHOLD ## 0.5 + masker = Masker(threshold=mask_threshold, padding=1) + else: + masker = None + mask_post_processor = MaskPostProcessor(masker) + return mask_post_processor diff --git a/maskrcnn_benchmark/modeling/roi_heads/boundary_head/inference.pybk b/maskrcnn_benchmark/modeling/roi_heads/boundary_head/inference.pybk new file mode 100644 index 0000000000000000000000000000000000000000..60a9d4cba3aba34fd33366890571fb8a88fd8030 --- /dev/null +++ b/maskrcnn_benchmark/modeling/roi_heads/boundary_head/inference.pybk @@ -0,0 +1,293 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +import numpy as np +import torch +from torch import nn +import torch.nn.functional as F + +from maskrcnn_benchmark.structures.bounding_box import BoxList + +import cv2 + +# TODO check if want to return a single BoxList or a composite +# object +class KEPostProcessor(nn.Module): + """ + From the results of the CNN, post process the kes + by taking the ke corresponding to the class with max + probability (which are of fixed size and directly output + by the CNN) and return the kes in the ke field of the BoxList. + + If a keer object is passed, it will additionally + project the kes in the image according to the locations in boxes, + """ + + def __init__(self, keer=None): + super(KEPostProcessor, self).__init__() + self.keer = keer + + def forward(self, x, boxes): + """ + Arguments: + x (Tensor): the ke logits + boxes (list[BoxList]): bounding boxes that are used as + reference, one for ech image + + Returns: + results (list[BoxList]): one BoxList for each image, containing + the extra field ke + """ + # ke_prob = x.sigmoid() + + # select kes coresponding to the predicted classes + num_proposals = x.shape[0] + labels = [bbox.get_field("labels") for bbox in boxes] + labels = torch.cat(labels) + index = torch.arange(num_proposals, device=labels.device) + ####### outputs + + ke_prob = x[index] + # print("labels", labels) + # print("x",x.size()) + # print("ke_",ke_prob.size()) + # assert(0) + + boxes_per_image = [len(box) for box in boxes] + ke_prob = ke_prob.split(boxes_per_image, dim=0) + + if self.keer: + ke_prob = self.keer(ke_prob, boxes) + + results = [] + for prob, box in zip(ke_prob, boxes): + bbox = BoxList(box.bbox, box.size, mode="xyxy") + for field in box.fields(): + bbox.add_field(field, box.get_field(field)) + bbox.add_field("ke", prob) + results.append(bbox) + + return results + + +class KEPostProcessorCOCOFormat(KEPostProcessor): + """ + From the results of the CNN, post process the results + so that the kes are pasted in the image, and + additionally convert the results to COCO format. 
+ """ + + def forward(self, x, boxes): + # import pycocotools.mask as mask_util + import numpy as np + + results = super(KEPostProcessorCOCOFormat, self).forward(x, boxes) + for result in results: + kes = result.get_field("ke").cpu() + rles = [ + ke_util.encode(np.array(ke[0, :, :, np.newaxis], order="F"))[0] + for ke in kes + ] + for rle in rles: + rle["counts"] = rle["counts"].decode("utf-8") + result.add_field("ke", rles) + return results + + +# the next two functions should be merged inside keer +# but are kept here for the moment while we need them +# temporarily gor paste_ke_in_image +def expand_boxes(boxes, scale): + w_half = (boxes[:, 2] - boxes[:, 0]) * .5 + h_half = (boxes[:, 3] - boxes[:, 1]) * .5 + x_c = (boxes[:, 2] + boxes[:, 0]) * .5 + y_c = (boxes[:, 3] + boxes[:, 1]) * .5 + + w_half *= scale + h_half *= scale + + boxes_exp = torch.zeros_like(boxes) + boxes_exp[:, 0] = x_c - w_half + boxes_exp[:, 2] = x_c + w_half + boxes_exp[:, 1] = y_c - h_half + boxes_exp[:, 3] = y_c + h_half + return boxes_exp + + +def expand_kes(ke, padding): + N = ke.shape[0] + M = ke.shape[-1] + # print("NM ", N ,M) + pad2 = 2 * padding + scale = float(M + pad2) / M + padded_ke = ke.new_zeros((N, 1, M + pad2, M + pad2)) + padded_ke[:, :, padding:-padding, padding:-padding] = ke + # print("padded_ke ", padded_ke.size()) + return padded_ke, scale + + +def paste_ke_in_image(ke, box, im_h, im_w, thresh=0.5, padding=1): + # print("ke ", ke.size(), ke[None].size()) + padded_ke, scale = expand_kes(ke[None], padding=padding) + ke = padded_ke[0, 0] + box = expand_boxes(box[None], scale)[0] + box = box.to(dtype=torch.int32) + + TO_REMOVE = 1 + w = int(box[2] - box[0] + TO_REMOVE) + h = int(box[3] - box[1] + TO_REMOVE) + w = max(w, 1) + h = max(h, 1) + + # Set shape to [batchxCxHxW] + ke = ke.expand((1, 1, -1, -1)) + + # print("ke 2", ke.size()) + # Resize ke + ke = ke.to(torch.float32) + ke = F.interpolate(ke, size=(h, w), mode='bilinear', align_corners=False) + ke = ke[0][0] + + # print("ke3 ", ke.size()) + + if thresh >= 0: + ke = ke > thresh + else: + # for visualization and debugging, we also + # allow it to return an unmodified ke + ke = (ke * 255).to(torch.uint8) + + im_ke = torch.zeros((im_h, im_w), dtype=torch.uint8) + x_0 = max(box[0], 0) + x_1 = min(box[2] + 1, im_w) + y_0 = max(box[1], 0) + y_1 = min(box[3] + 1, im_h) + + im_ke[y_0:y_1, x_0:x_1] = ke[ + (y_0 - box[1]) : (y_1 - box[1]), (x_0 - box[0]) : (x_1 - box[0]) + ] + # print("im_ke ", im_ke.size()) + return im_ke + +def scores_to_probs(scores): + """Transforms CxHxW of scores to probabilities spatially.""" + channels = scores.shape[0] + for c in range(channels): + temp = scores[c, :, :] + max_score = temp.max() + temp = np.exp(temp - max_score) / np.sum(np.exp(temp - max_score)) + scores[c, :, :] = temp + return scores + +def heatmaps_to_kes(maps, rois): + # This function converts a discrete image coordinate in a HEATMAP_SIZE x + # HEATMAP_SIZE image to a continuous ke coordinate. We maintain + # consistency with ke_to_heatmap_labels by using the conversion from + # Heckbert 1990: c = d + 0.5, where d is a discrete coordinate and c is a + # continuous coordinate. 
+ rois =rois.numpy() + maps = maps.numpy() + offset_x = rois[:, 0] + offset_y = rois[:, 1] + + widths = rois[:, 2] - rois[:, 0] + heights = rois[:, 3] - rois[:, 1] + widths = np.maximum(widths, 1) + heights = np.maximum(heights, 1) + widths_ceil = np.ceil(widths) + heights_ceil = np.ceil(heights) + + # NCHW to NHWC for use with OpenCV + maps = np.transpose(maps, [0, 2, 3, 1]) + # min_size = cfg.KRCNN.INFERENCE_MIN_SIZE + + num_kes = 10 + xy_preds = np.zeros( + (len(rois), 4, num_kes), dtype=np.float32) + for i in range(len(rois)): + # if min_size > 0: + # roi_map_width = int(np.maximum(widths_ceil[i], min_size)) + # roi_map_height = int(np.maximum(heights_ceil[i], min_size)) + # else: + # roi_map_width = widths_ceil[i] + # roi_map_height = heights_ceil[i] + roi_map_width = int(widths_ceil[i]) + roi_map_height = int(heights_ceil[i]) + + width_correction = widths[i] / roi_map_width + height_correction = heights[i] / roi_map_height + roi_map = cv2.resize( + maps[i], (roi_map_width, roi_map_height), + interpolation=cv2.INTER_CUBIC) + # Bring back to CHW + roi_map = np.transpose(roi_map, [2, 0, 1]) + roi_map_probs = scores_to_probs(roi_map.copy()) + w = roi_map.shape[2] + for k in range(num_kes): + pos = roi_map[k, :, :].argmax() + x_int = pos % w + y_int = (pos - x_int) // w + assert (roi_map_probs[k, y_int, x_int] == + roi_map_probs[k, :, :].max()) + x = (x_int + 0.5) * width_correction + y = (y_int + 0.5) * height_correction + xy_preds[i, 0, k] = x + offset_x[i] + xy_preds[i, 1, k] = y + offset_y[i] + xy_preds[i, 2, k] = roi_map[k, y_int, x_int] + xy_preds[i, 3, k] = roi_map_probs[k, y_int, x_int] + + return xy_preds + +class KEer(object): + """ + Projects a set of kes in an image on the locations + specified by the bounding boxes + """ + + def __init__(self, threshold=0.5, padding=1): + self.threshold = threshold + self.padding = padding + + def forward_single_image(self, kes, boxes): + boxes = boxes.convert("xyxy") + im_w, im_h = boxes.size + # print("KEer kes.size()", kes.size(), kes[0].size(), kes[0][0].size()) + # assert(0) + # res = [ + # paste_ke_in_image(ke[0], box, im_h, im_w, self.threshold, self.padding) + # for ke, box in zip(kes, boxes.bbox) + # ] + res = heatmaps_to_kes(kes, boxes.bbox) + + if len(res) > 0: + # res = torch.stack(res, dim=0)[:, None] + res = torch.from_numpy(res) + else: + res = kes.new_empty((0, 1, kes.shape[-2], kes.shape[-1])) + print("res inference.py", res.size()) + return res + + def __call__(self, kes, boxes): + if isinstance(boxes, BoxList): + boxes = [boxes] + + # Make some sanity check + assert len(boxes) == len(kes), "kes and boxes should have the same length." + + # TODO: Is this JIT compatible? + # If not we should make it compatible. + results = [] + for ke, box in zip(kes, boxes): + assert ke.shape[0] == len(box), "Number of objects should be the same." 
+ # print("ke inference.py", ke.size()) + result = self.forward_single_image(ke, box) + results.append(result) + return results + + +def make_roi_ke_post_processor(cfg): + if cfg.MODEL.ROI_KE_HEAD.POSTPROCESS_KES: + ke_threshold = cfg.MODEL.ROI_KE_HEAD.POSTPROCESS_KES_THRESHOLD + keer = KEer(threshold=ke_threshold, padding=1) + else: + keer = None + ke_post_processor = KEPostProcessor(keer) + return ke_post_processor diff --git a/maskrcnn_benchmark/modeling/roi_heads/boundary_head/loss.py b/maskrcnn_benchmark/modeling/roi_heads/boundary_head/loss.py new file mode 100644 index 0000000000000000000000000000000000000000..00b659e1abde19746ef13aae30fa3bb2f298a57c --- /dev/null +++ b/maskrcnn_benchmark/modeling/roi_heads/boundary_head/loss.py @@ -0,0 +1,259 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +import torch +from torch.nn import functional as F + +from maskrcnn_benchmark.layers import smooth_l1_loss +from maskrcnn_benchmark.modeling.matcher import Matcher +from maskrcnn_benchmark.structures.boxlist_ops import boxlist_iou +from maskrcnn_benchmark.modeling.utils import cat + +from maskrcnn_benchmark.modeling.balanced_positive_negative_sampler import ( + BalancedPositiveNegativeSampler +) +# import torch import torch.nn as nn +from maskrcnn_benchmark.structures.ke import kes_to_heat_map +import numpy as np +import os, time +import cv2 +DEBUG = 0 + +from scipy.ndimage.morphology import distance_transform_edt + + +def onehot_to_binary_edges(mask, radius): + """ + Converts a segmentation mask (K,H,W) to a binary edgemap (1,H,W) + """ + if radius < 0: + return mask + + # We need to pad the borders for boundary conditions + + mask = np.pad(mask, ((1, 1), (1, 1)), mode='constant', constant_values=0) + mask = distance_transform_edt(mask) + mask = mask[1:-1, 1:-1] + mask[mask > radius] = 0 + mask = (mask > 0).astype(np.uint8) + return mask + + +def project_masks_on_boxes(segmentation_masks, proposals, discretization_size): + """ + Given segmentation masks and the bounding boxes corresponding + to the location of the masks in the image, this function + crops and resizes the masks in the position defined by the + boxes. This prepares the masks for them to be fed to the + loss computation as the targets. + + Arguments: + segmentation_masks: an instance of SegmentationMask + proposals: an instance of BoxList + """ + masks = [] + M = discretization_size + device = proposals.bbox.device + proposals = proposals.convert("xyxy") + assert segmentation_masks.size == proposals.size, "{}, {}".format( + segmentation_masks, proposals + ) + + # FIXME: CPU computation bottleneck, this should be parallelized + proposals = proposals.bbox.to(torch.device("cpu")) + for segmentation_mask, proposal in zip(segmentation_masks, proposals): + # crop the masks, resize them to the desired resolution and + # then convert them to the tensor representation. 
+ cropped_mask = segmentation_mask.crop(proposal) + scaled_mask = cropped_mask.resize((M, M)) + mask = scaled_mask.get_mask_tensor() + mask = mask.numpy().astype(np.uint8) + mask = onehot_to_binary_edges(mask, 2) + mask = torch.from_numpy(mask) + masks.append(mask) + if len(masks) == 0: + return torch.empty(0, dtype=torch.float32, device=device) + return torch.stack(masks, dim=0).to(device, dtype=torch.float32) + + +def project_kes_to_heatmap(kes, mty, proposals, discretization_size): + proposals = proposals.convert('xyxy') + out_x, out_y, valid_x, valid_y, out_mty, valid_mty = kes_to_heat_map(kes.kes_x, kes.kes_y, mty.mty, proposals.bbox, discretization_size) + return out_x, out_y, valid_x, valid_y, out_mty, valid_mty + +def _within_box(points_x, points_y, boxes): + """Validate which kes are contained inside a given box. + points: NxKx2 + boxes: Nx4 + output: NxK + """ + x_within = (points_x[..., :, 0] >= boxes[:, 0, None]) & (points_x[..., :, 0] <= boxes[:, 2, None]) + y_within = (points_y[..., :, 0] >= boxes[:, 1, None]) & (points_y[..., :, 0] <= boxes[:, 3, None]) + return x_within & y_within + +_TOTAL_SKIPPED = 0 + +def balance_ce_loss(pre_mk, target_mk): + pre_mk = torch.sigmoid(pre_mk) + + pos_inds = target_mk.eq(1) + pos_num = torch.sum(pos_inds).float() + neg_num = torch.sum(1 - pos_inds).float() + loss = -(target_mk * torch.log(pre_mk + 1e-4)) / pos_num - ((1 - target_mk) * torch.log(1 - pre_mk + 1e-4)) / neg_num + return loss.sum() + + +def edge_loss(input, target): + n, c, h, w = input.size() + + log_p = input.transpose(1, 2).transpose(2, 3).contiguous().view(1, -1) + target_t = target.transpose(1, 2).transpose(2, 3).contiguous().view(1, -1) + pos_index = (target_t == 1) + neg_index = (target_t == 0) + pos_index = pos_index.data.cpu().numpy().astype(bool) + neg_index = neg_index.data.cpu().numpy().astype(bool) + weight = torch.Tensor(log_p.size()).fill_(0) + weight = weight.numpy() + pos_num = pos_index.sum() + neg_num = neg_index.sum() + sum_num = pos_num + neg_num + weight[pos_index] = neg_num * 1.0 / sum_num + weight[neg_index] = pos_num * 1.0 / sum_num + weight = torch.from_numpy(weight) + weight = weight.cuda() + loss = F.binary_cross_entropy_with_logits(log_p, target_t, weight, size_average=True) + # del pos_index, neg_index + # del weight + return loss + +class BORCNNLossComputation(object): + def __init__(self, proposal_matcher, fg_bg_sampler, discretization_size, cfg): + """ + Arguments: + proposal_matcher (Matcher) + discretization_size (int) + """ + self.proposal_matcher = proposal_matcher + self.fg_bg_sampler = fg_bg_sampler + self.discretization_size = discretization_size + self.cfg = cfg.clone() + + def match_targets_to_proposals(self, proposal, target): + match_quality_matrix = boxlist_iou(target, proposal) + matched_idxs = self.proposal_matcher(match_quality_matrix) + target = target.copy_with_fields(["labels", "masks"]) + matched_targets = target[matched_idxs.clamp(min=0)] + matched_targets.add_field("matched_idxs", matched_idxs) + return matched_targets + + def prepare_targets(self, proposals, targets): + labels = [] + masks = [] + for proposals_per_image, targets_per_image in zip(proposals, targets): + matched_targets = self.match_targets_to_proposals( + proposals_per_image, targets_per_image + ) + matched_idxs = matched_targets.get_field("matched_idxs") + + labels_per_image = matched_targets.get_field("labels") + labels_per_image = labels_per_image.to(dtype=torch.int64) + + # this can probably be removed, but is left here for clarity + # and completeness + 
neg_inds = matched_idxs == Matcher.BELOW_LOW_THRESHOLD + labels_per_image[neg_inds] = 0 + + # mask scores are only computed on positive samples + positive_inds = torch.nonzero(labels_per_image > 0).squeeze(1) + + segmentation_masks = matched_targets.get_field("masks") + segmentation_masks = segmentation_masks[positive_inds] + + positive_proposals = proposals_per_image[positive_inds] + + masks_per_image = project_masks_on_boxes( + segmentation_masks, positive_proposals, self.discretization_size + ) + + labels.append(labels_per_image) + masks.append(masks_per_image) + + return labels, masks + + def subsample(self, proposals, targets): + """ + This method performs the positive/negative sampling, and return + the sampled proposals. + Note: this function keeps a state. + + Arguments: + proposals (list[BoxList]) + targets (list[BoxList]) + """ + + labels, kes, mty = self.prepare_targets(proposals, targets) + sampled_pos_inds, sampled_neg_inds = self.fg_bg_sampler(labels) + + proposals = list(proposals) + # add corresponding label and regression_targets information to the bounding boxes + for labels_per_image, kes_per_image, mty_per_image, proposals_per_image in zip( + labels, kes, mty, proposals + ): + proposals_per_image.add_field("labels", labels_per_image) + proposals_per_image.add_field("kes", kes_per_image) + proposals_per_image.add_field("mty", mty_per_image) + + # distributed sampled proposals, that were obtained on all feature maps + # concatenated via the fg_bg_sampler, into individual feature map levels + for img_idx, (pos_inds_img, neg_inds_img) in enumerate( + zip(sampled_pos_inds, sampled_neg_inds) + ): + # img_sampled_inds = torch.nonzero(pos_inds_img | neg_inds_img).squeeze(1) + img_sampled_inds = torch.nonzero(pos_inds_img).squeeze(1) + proposals_per_image = proposals[img_idx][img_sampled_inds] + proposals[img_idx] = proposals_per_image + + self._proposals = proposals + return proposals + + def __call__(self, proposals, ke_logits_x, ke_logits_y, targets): + """ + Arguments: + proposals (list[BoxList]) + mask_logits (Tensor) + targets (list[BoxList]) + + Return: + mask_loss (Tensor): scalar tensor containing the loss + """ + labels, mask_targets = self.prepare_targets(proposals, targets) + + labels = cat(labels, dim=0) + mask_targets = cat(mask_targets, dim=0) + positive_inds = torch.nonzero(labels > 0).squeeze(1) + + if mask_targets.numel() == 0: + return 0 + + sb, sh, sw = mask_targets.shape + mask_loss_x = edge_loss( ke_logits_x[positive_inds, 0].view([sb, 1, sh, sw]), mask_targets.view([sb, 1, sh, sw])) + mask_loss_y = edge_loss( ke_logits_y[positive_inds, 0].view([sb, 1, sh, sw]), mask_targets.view([sb, 1, sh, sw])) + + mask_loss = mask_loss_x + mask_loss_y + + return mask_loss , mask_loss_x, mask_loss_y + +def make_roi_boundary_loss_evaluator(cfg): + matcher = Matcher( + cfg.MODEL.ROI_HEADS.FG_IOU_THRESHOLD, + cfg.MODEL.ROI_HEADS.BG_IOU_THRESHOLD, + allow_low_quality_matches=False, + ) + + fg_bg_sampler = BalancedPositiveNegativeSampler( + cfg.MODEL.ROI_HEADS.BATCH_SIZE_PER_IMAGE, cfg.MODEL.ROI_HEADS.POSITIVE_FRACTION + ) + + loss_evaluator = BORCNNLossComputation( + matcher, fg_bg_sampler, cfg.MODEL.ROI_BOUNDARY_HEAD.RESOLUTION, cfg + ) + + return loss_evaluator diff --git a/maskrcnn_benchmark/modeling/roi_heads/boundary_head/roi_boundary_feature_extractors.py b/maskrcnn_benchmark/modeling/roi_heads/boundary_head/roi_boundary_feature_extractors.py new file mode 100644 index 0000000000000000000000000000000000000000..96fe5b019a54ae06799065cf39adea7ba452442d --- /dev/null +++ 
b/maskrcnn_benchmark/modeling/roi_heads/boundary_head/roi_boundary_feature_extractors.py @@ -0,0 +1,69 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +from torch import nn +from torch.nn import functional as F + +# from ..box_head.roi_box_feature_extractors import ResNet50Conv5ROIFeatureExtractor +from maskrcnn_benchmark.modeling.poolers import Pooler +from maskrcnn_benchmark.modeling.make_layers import make_conv3x3 + + +class BOUNDARYRCNNFPNFeatureExtractor(nn.Module): + """ + Heads for FPN for classification + """ + + def __init__(self, cfg, in_channels): + """ + Arguments: + num_classes (int): number of output classes + input_size (int): number of channels of the input once it's flattened + representation_size (int): size of the intermediate representation + """ + super(BOUNDARYRCNNFPNFeatureExtractor, self).__init__() + + resolution = cfg.MODEL.ROI_BOUNDARY_HEAD.POOLER_RESOLUTION + scales = cfg.MODEL.ROI_BOUNDARY_HEAD.POOLER_SCALES + sampling_ratio = cfg.MODEL.ROI_BOUNDARY_HEAD.POOLER_SAMPLING_RATIO + pooler = Pooler( + output_size=(resolution, resolution), + scales=scales, + sampling_ratio=sampling_ratio, + deformable=cfg.MODEL.ROI_BOUNDARY_HEAD.DEFORMABLE_POOLING + # deformable = True + ) + input_size = in_channels + self.pooler = pooler + + layers = cfg.MODEL.ROI_BOUNDARY_HEAD.CONV_LAYERS + use_gn = cfg.MODEL.ROI_MASK_HEAD.USE_GN + dilation = cfg.MODEL.ROI_MASK_HEAD.DILATION + + next_feature = input_size + self.blocks = [] + for layer_idx, layer_features in enumerate(layers, 1): + layer_name = "boundary_fcn{}".format(layer_idx) + module = make_conv3x3( + next_feature, layer_features, + dilation=dilation, stride=1, use_gn=use_gn + ) + self.add_module(layer_name, module) + next_feature = layer_features + self.blocks.append(layer_name) + + def forward(self, x, proposals): + x = self.pooler(x, proposals) + + for layer_name in self.blocks: + x = F.relu(getattr(self, layer_name)(x)) + + return x + + +_ROI_KE_FEATURE_EXTRACTORS = { + "BoundaryRCNNFPNFeatureExtractor": BOUNDARYRCNNFPNFeatureExtractor, +} + + +def make_roi_boundary_feature_extractor(cfg, in_channels): + func = _ROI_KE_FEATURE_EXTRACTORS[cfg.MODEL.ROI_BOUNDARY_HEAD.FEATURE_EXTRACTOR] + return func(cfg, in_channels) diff --git a/maskrcnn_benchmark/modeling/roi_heads/boundary_head/roi_boundary_predictors.py b/maskrcnn_benchmark/modeling/roi_heads/boundary_head/roi_boundary_predictors.py new file mode 100644 index 0000000000000000000000000000000000000000..9727592b5ca4d6280a4c017d5501f40f6a0d16d5 --- /dev/null +++ b/maskrcnn_benchmark/modeling/roi_heads/boundary_head/roi_boundary_predictors.py @@ -0,0 +1,78 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 
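The boundary feature extractor above is an FPN ROI pooler followed by a stack of 3x3 convolutions whose widths come from ROI_BOUNDARY_HEAD.CONV_LAYERS. A rough stand-alone sketch of that conv tower follows, assuming four 256-channel layers and a pre-pooled 14x14 input; the real code builds its layers through make_conv3x3 and feeds them from the Pooler.

# Rough stand-alone sketch of the boundary-head conv tower (assumed widths
# (256, 256, 256, 256); the Pooler is replaced by a dummy pooled tensor).
import torch
from torch import nn
import torch.nn.functional as F

class BoundaryConvTower(nn.Module):
    def __init__(self, in_channels=256, layers=(256, 256, 256, 256)):
        super().__init__()
        blocks = []
        next_feature = in_channels
        for width in layers:
            blocks.append(nn.Conv2d(next_feature, width, 3, stride=1, padding=1))
            next_feature = width
        self.blocks = nn.ModuleList(blocks)

    def forward(self, x):
        for conv in self.blocks:
            x = F.relu(conv(x))
        return x

rois = torch.randn(8, 256, 14, 14)       # 8 pooled proposals from the FPN pooler
print(BoundaryConvTower()(rois).shape)   # torch.Size([8, 256, 14, 14])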
+import torch +from torch import nn +from torch.nn import functional as F + +from maskrcnn_benchmark.layers import Conv2d +from maskrcnn_benchmark.layers import ConvTranspose2d + +from maskrcnn_benchmark import layers + +class BOUNDARYRCNNC4Predictor(nn.Module): + def __init__(self, cfg): + super(BOUNDARYRCNNC4Predictor, self).__init__() + dim_reduced = cfg.MODEL.ROI_BOUNDARY_HEAD.CONV_LAYERS[-1] + self.resol = cfg.MODEL.ROI_BOUNDARY_HEAD.RESOLUTION # 56 + + if cfg.MODEL.ROI_HEADS.USE_FPN: + num_inputs = dim_reduced + else: + stage_index = 4 + stage2_relative_factor = 2 ** (stage_index - 1) + res2_out_channels = cfg.MODEL.RESNETS.RES2_OUT_CHANNELS #256 + num_inputs = res2_out_channels * stage2_relative_factor + + self.bo_input_xy = Conv2d(num_inputs, num_inputs, 1, 1, 0) + nn.init.kaiming_normal_(self.bo_input_xy.weight, + mode='fan_out', nonlinearity='relu') + nn.init.constant_(self.bo_input_xy.bias, 0) + + self.conv5_bo_xy = ConvTranspose2d(num_inputs, dim_reduced, 2, 2, 0) + nn.init.kaiming_normal_(self.conv5_bo_xy.weight, + mode='fan_out', nonlinearity='relu') + nn.init.constant_(self.conv5_bo_xy.bias, 0) + + self.bo_input_1_1 = Conv2d(dim_reduced, dim_reduced, 1, 1, 0) + nn.init.kaiming_normal_(self.bo_input_1_1.weight, + mode='fan_out', nonlinearity='relu') + nn.init.constant_(self.bo_input_1_1.bias, 0) + + self.bo_input_2_1 = Conv2d(dim_reduced, dim_reduced, 1, 1, 0) + nn.init.kaiming_normal_(self.bo_input_2_1.weight, + mode='fan_out', nonlinearity='relu') + nn.init.constant_(self.bo_input_2_1.bias, 0) + + self.conv5_bo_x = Conv2d(dim_reduced, 1, (3, 1), 1, (1,0)) # H W + nn.init.kaiming_normal_(self.conv5_bo_x.weight, + mode='fan_out', nonlinearity='relu') # 'relu' + nn.init.constant_(self.conv5_bo_x.bias, 0) + + self.conv5_bo_y = Conv2d(dim_reduced, 1, (1, 3), 1, (0,1)) # H W + nn.init.kaiming_normal_(self.conv5_bo_y.weight, + mode='fan_out', nonlinearity='relu') + nn.init.constant_(self.conv5_bo_y.bias, 0) + self.up_scale=2 + + + def forward(self, ft): + ft = self.bo_input_xy(ft) + ft_2x = self.conv5_bo_xy(ft) + + ft_2x = layers.interpolate(ft_2x, size = (48,48), mode='bilinear', align_corners=True) + + x = self.bo_input_1_1(ft_2x) + y = self.bo_input_2_1(ft_2x) + + x = self.conv5_bo_x(x) + y = self.conv5_bo_y(y) + + return x, y + + + +_ROI_KE_PREDICTOR = {"BoundaryRCNNC4Predictor": BOUNDARYRCNNC4Predictor} + + +def make_roi_boundary_predictor(cfg): + func = _ROI_KE_PREDICTOR[cfg.MODEL.ROI_BOUNDARY_HEAD.PREDICTOR] + return func(cfg) diff --git a/maskrcnn_benchmark/modeling/roi_heads/box_head/__init__.py b/maskrcnn_benchmark/modeling/roi_heads/box_head/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/maskrcnn_benchmark/modeling/roi_heads/box_head/__pycache__/__init__.cpython-37.pyc b/maskrcnn_benchmark/modeling/roi_heads/box_head/__pycache__/__init__.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..1c47b2586ccedadd8b3fe5c31aca37cc36bfb1ac Binary files /dev/null and b/maskrcnn_benchmark/modeling/roi_heads/box_head/__pycache__/__init__.cpython-37.pyc differ diff --git a/maskrcnn_benchmark/modeling/roi_heads/box_head/__pycache__/box_head.cpython-37.pyc b/maskrcnn_benchmark/modeling/roi_heads/box_head/__pycache__/box_head.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..99992e37d12c3eacec8dcc131c10dd2635f55332 Binary files /dev/null and b/maskrcnn_benchmark/modeling/roi_heads/box_head/__pycache__/box_head.cpython-37.pyc differ diff 
--git a/maskrcnn_benchmark/modeling/roi_heads/box_head/__pycache__/inference.cpython-37.pyc b/maskrcnn_benchmark/modeling/roi_heads/box_head/__pycache__/inference.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b93e667ee972fbdad5d2a4587e140a72d4cd8b11 Binary files /dev/null and b/maskrcnn_benchmark/modeling/roi_heads/box_head/__pycache__/inference.cpython-37.pyc differ diff --git a/maskrcnn_benchmark/modeling/roi_heads/box_head/__pycache__/loss.cpython-37.pyc b/maskrcnn_benchmark/modeling/roi_heads/box_head/__pycache__/loss.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ebc6843988b68dad375ba351116c84c74d6ef36c Binary files /dev/null and b/maskrcnn_benchmark/modeling/roi_heads/box_head/__pycache__/loss.cpython-37.pyc differ diff --git a/maskrcnn_benchmark/modeling/roi_heads/box_head/__pycache__/roi_box_feature_extractors.cpython-37.pyc b/maskrcnn_benchmark/modeling/roi_heads/box_head/__pycache__/roi_box_feature_extractors.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..3094234af1a6183f20b5395a5b3d2b013c072d57 Binary files /dev/null and b/maskrcnn_benchmark/modeling/roi_heads/box_head/__pycache__/roi_box_feature_extractors.cpython-37.pyc differ diff --git a/maskrcnn_benchmark/modeling/roi_heads/box_head/__pycache__/roi_box_predictors.cpython-37.pyc b/maskrcnn_benchmark/modeling/roi_heads/box_head/__pycache__/roi_box_predictors.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e60df9f2aca4ce51866f378e65e6709d420a1b37 Binary files /dev/null and b/maskrcnn_benchmark/modeling/roi_heads/box_head/__pycache__/roi_box_predictors.cpython-37.pyc differ diff --git a/maskrcnn_benchmark/modeling/roi_heads/box_head/box_head.py b/maskrcnn_benchmark/modeling/roi_heads/box_head/box_head.py new file mode 100644 index 0000000000000000000000000000000000000000..482081b8de7431282c8a017cd34d965c8f355bb0 --- /dev/null +++ b/maskrcnn_benchmark/modeling/roi_heads/box_head/box_head.py @@ -0,0 +1,71 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +import torch +from torch import nn + +from .roi_box_feature_extractors import make_roi_box_feature_extractor +from .roi_box_predictors import make_roi_box_predictor +from .inference import make_roi_box_post_processor +from .loss import make_roi_box_loss_evaluator + + +class ROIBoxHead(torch.nn.Module): + """ + Generic Box Head class. + """ + + def __init__(self, cfg, in_channels): + super(ROIBoxHead, self).__init__() + self.feature_extractor = make_roi_box_feature_extractor(cfg, in_channels) + self.predictor = make_roi_box_predictor( + cfg, self.feature_extractor.out_channels) + self.post_processor = make_roi_box_post_processor(cfg) + self.loss_evaluator = make_roi_box_loss_evaluator(cfg) + + def forward(self, features, proposals, targets=None): + """ + Arguments: + features (list[Tensor]): feature-maps from possibly several levels + proposals (list[BoxList]): proposal boxes + targets (list[BoxList], optional): the ground-truth targets. + + Returns: + x (Tensor): the result of the feature extractor + proposals (list[BoxList]): during training, the subsampled proposals + are returned. During testing, the predicted boxlists are returned + losses (dict[Tensor]): During training, returns the losses for the + head. During testing, returns an empty dict. 
+ """ + + if self.training: + # Faster R-CNN subsamples during training the proposals with a fixed + # positive / negative ratio + with torch.no_grad(): + proposals = self.loss_evaluator.subsample(proposals, targets) + + # extract features that will be fed to the final classifier. The + # feature_extractor generally corresponds to the pooler + heads + x = self.feature_extractor(features, proposals) + # final classifier that converts the features into predictions + class_logits, box_regression = self.predictor(x) + + if not self.training: + result = self.post_processor((class_logits, box_regression), proposals) + return x, result, {} + + loss_classifier, loss_box_reg = self.loss_evaluator( + [class_logits], [box_regression] + ) + return ( + x, + proposals, + dict(loss_classifier=loss_classifier, loss_box_reg=loss_box_reg), + ) + + +def build_roi_box_head(cfg, in_channels): + """ + Constructs a new box head. + By default, uses ROIBoxHead, but if it turns out not to be enough, just register a new class + and make it a parameter in the config + """ + return ROIBoxHead(cfg, in_channels) diff --git a/maskrcnn_benchmark/modeling/roi_heads/box_head/inference.py b/maskrcnn_benchmark/modeling/roi_heads/box_head/inference.py new file mode 100644 index 0000000000000000000000000000000000000000..595a2e61620fbd345bc36060c43191792fc010ea --- /dev/null +++ b/maskrcnn_benchmark/modeling/roi_heads/box_head/inference.py @@ -0,0 +1,167 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +import torch +import torch.nn.functional as F +from torch import nn + +from maskrcnn_benchmark.structures.bounding_box import BoxList +from maskrcnn_benchmark.structures.boxlist_ops import boxlist_nms +from maskrcnn_benchmark.structures.boxlist_ops import cat_boxlist +from maskrcnn_benchmark.modeling.box_coder import BoxCoder + + +class PostProcessor(nn.Module): + """ + From a set of classification scores, box regression and proposals, + computes the post-processed boxes, and applies NMS to obtain the + final results + """ + + def __init__( + self, + score_thresh=0.05, + nms=0.5, + detections_per_img=100, + box_coder=None, + cls_agnostic_bbox_reg=False + ): + """ + Arguments: + score_thresh (float) + nms (float) + detections_per_img (int) + box_coder (BoxCoder) + """ + super(PostProcessor, self).__init__() + self.score_thresh = score_thresh + self.nms = nms + self.detections_per_img = detections_per_img + if box_coder is None: + box_coder = BoxCoder(weights=(10., 10., 5., 5.)) + self.box_coder = box_coder + self.cls_agnostic_bbox_reg = cls_agnostic_bbox_reg + + def forward(self, x, boxes): + """ + Arguments: + x (tuple[tensor, tensor]): x contains the class logits + and the box_regression from the model. 
+ boxes (list[BoxList]): bounding boxes that are used as + reference, one for ech image + + Returns: + results (list[BoxList]): one BoxList for each image, containing + the extra fields labels and scores + """ + class_logits, box_regression = x + class_prob = F.softmax(class_logits, -1) + + # TODO think about a representation of batch of boxes + image_shapes = [box.size for box in boxes] + boxes_per_image = [len(box) for box in boxes] + concat_boxes = torch.cat([a.bbox for a in boxes], dim=0) + + if self.cls_agnostic_bbox_reg: + box_regression = box_regression[:, -4:] + proposals = self.box_coder.decode( + box_regression.view(sum(boxes_per_image), -1), concat_boxes + ) + if self.cls_agnostic_bbox_reg: + proposals = proposals.repeat(1, class_prob.shape[1]) + + num_classes = class_prob.shape[1] + + proposals = proposals.split(boxes_per_image, dim=0) + class_prob = class_prob.split(boxes_per_image, dim=0) + + results = [] + for prob, boxes_per_img, image_shape in zip( + class_prob, proposals, image_shapes + ): + boxlist = self.prepare_boxlist(boxes_per_img, prob, image_shape) + boxlist = boxlist.clip_to_image(remove_empty=False) + boxlist = self.filter_results(boxlist, num_classes) + results.append(boxlist) + return results + + def prepare_boxlist(self, boxes, scores, image_shape): + """ + Returns BoxList from `boxes` and adds probability scores information + as an extra field + `boxes` has shape (#detections, 4 * #classes), where each row represents + a list of predicted bounding boxes for each of the object classes in the + dataset (including the background class). The detections in each row + originate from the same object proposal. + `scores` has shape (#detection, #classes), where each row represents a list + of object detection confidence scores for each of the object classes in the + dataset (including the background class). `scores[i, j]`` corresponds to the + box at `boxes[i, j * 4:(j + 1) * 4]`. + """ + boxes = boxes.reshape(-1, 4) + scores = scores.reshape(-1) + boxlist = BoxList(boxes, image_shape, mode="xyxy") + boxlist.add_field("scores", scores) + return boxlist + + def filter_results(self, boxlist, num_classes): + """Returns bounding-box detection results by thresholding on scores and + applying non-maximum suppression (NMS). + """ + # unwrap the boxlist to avoid additional overhead. 
+ # if we had multi-class NMS, we could perform this directly on the boxlist + boxes = boxlist.bbox.reshape(-1, num_classes * 4) + scores = boxlist.get_field("scores").reshape(-1, num_classes) + + device = scores.device + result = [] + # Apply threshold on detection probabilities and apply NMS + # Skip j = 0, because it's the background class + inds_all = scores > self.score_thresh + for j in range(1, num_classes): + inds = inds_all[:, j].nonzero().squeeze(1) + scores_j = scores[inds, j] + boxes_j = boxes[inds, j * 4 : (j + 1) * 4] + boxlist_for_class = BoxList(boxes_j, boxlist.size, mode="xyxy") + boxlist_for_class.add_field("scores", scores_j) + boxlist_for_class = boxlist_nms( + boxlist_for_class, self.nms + ) + num_labels = len(boxlist_for_class) + boxlist_for_class.add_field( + "labels", torch.full((num_labels,), j, dtype=torch.int64, device=device) + ) + result.append(boxlist_for_class) + + result = cat_boxlist(result) + number_of_detections = len(result) + + # Limit to max_per_image detections **over all classes** + if number_of_detections > self.detections_per_img > 0: + cls_scores = result.get_field("scores") + image_thresh, _ = torch.kthvalue( + cls_scores.cpu(), number_of_detections - self.detections_per_img + 1 + ) + keep = cls_scores >= image_thresh.item() + keep = torch.nonzero(keep).squeeze(1) + result = result[keep] + return result + + +def make_roi_box_post_processor(cfg): + use_fpn = cfg.MODEL.ROI_HEADS.USE_FPN + + bbox_reg_weights = cfg.MODEL.ROI_HEADS.BBOX_REG_WEIGHTS + box_coder = BoxCoder(weights=bbox_reg_weights) + + score_thresh = cfg.MODEL.ROI_HEADS.SCORE_THRESH + nms_thresh = cfg.MODEL.ROI_HEADS.NMS + detections_per_img = cfg.MODEL.ROI_HEADS.DETECTIONS_PER_IMG + cls_agnostic_bbox_reg = cfg.MODEL.CLS_AGNOSTIC_BBOX_REG + + postprocessor = PostProcessor( + score_thresh, + nms_thresh, + detections_per_img, + box_coder, + cls_agnostic_bbox_reg + ) + return postprocessor diff --git a/maskrcnn_benchmark/modeling/roi_heads/box_head/loss.py b/maskrcnn_benchmark/modeling/roi_heads/box_head/loss.py new file mode 100644 index 0000000000000000000000000000000000000000..9f2771d029e6d027b29e60b83d268f03628d3a14 --- /dev/null +++ b/maskrcnn_benchmark/modeling/roi_heads/box_head/loss.py @@ -0,0 +1,193 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +import torch +from torch.nn import functional as F + +from maskrcnn_benchmark.layers import smooth_l1_loss +from maskrcnn_benchmark.modeling.box_coder import BoxCoder +from maskrcnn_benchmark.modeling.matcher import Matcher +from maskrcnn_benchmark.structures.boxlist_ops import boxlist_iou +from maskrcnn_benchmark.modeling.balanced_positive_negative_sampler import ( + BalancedPositiveNegativeSampler +) +from maskrcnn_benchmark.modeling.utils import cat + + +class FastRCNNLossComputation(object): + """ + Computes the loss for Faster R-CNN. 
+ Also supports FPN + """ + + def __init__( + self, + proposal_matcher, + fg_bg_sampler, + box_coder, + cls_agnostic_bbox_reg=False + ): + """ + Arguments: + proposal_matcher (Matcher) + fg_bg_sampler (BalancedPositiveNegativeSampler) + box_coder (BoxCoder) + """ + self.proposal_matcher = proposal_matcher + self.fg_bg_sampler = fg_bg_sampler + self.box_coder = box_coder + self.cls_agnostic_bbox_reg = cls_agnostic_bbox_reg + + def match_targets_to_proposals(self, proposal, target): + match_quality_matrix = boxlist_iou(target, proposal) + matched_idxs = self.proposal_matcher(match_quality_matrix) + # Fast RCNN only need "labels" field for selecting the targets + target = target.copy_with_fields("labels") + # get the targets corresponding GT for each proposal + # NB: need to clamp the indices because we can have a single + # GT in the image, and matched_idxs can be -2, which goes + # out of bounds + matched_targets = target[matched_idxs.clamp(min=0)] + matched_targets.add_field("matched_idxs", matched_idxs) + return matched_targets + + def prepare_targets(self, proposals, targets): + labels = [] + regression_targets = [] + for proposals_per_image, targets_per_image in zip(proposals, targets): + matched_targets = self.match_targets_to_proposals( + proposals_per_image, targets_per_image + ) + matched_idxs = matched_targets.get_field("matched_idxs") + + labels_per_image = matched_targets.get_field("labels") + labels_per_image = labels_per_image.to(dtype=torch.int64) + + # Label background (below the low threshold) + bg_inds = matched_idxs == Matcher.BELOW_LOW_THRESHOLD + labels_per_image[bg_inds] = 0 + + # Label ignore proposals (between low and high thresholds) + ignore_inds = matched_idxs == Matcher.BETWEEN_THRESHOLDS + labels_per_image[ignore_inds] = -1 # -1 is ignored by sampler + + # compute regression targets + regression_targets_per_image = self.box_coder.encode( + matched_targets.bbox, proposals_per_image.bbox + ) + + labels.append(labels_per_image) + regression_targets.append(regression_targets_per_image) + + return labels, regression_targets + + def subsample(self, proposals, targets): + """ + This method performs the positive/negative sampling, and return + the sampled proposals. + Note: this function keeps a state. + + Arguments: + proposals (list[BoxList]) + targets (list[BoxList]) + """ + + labels, regression_targets = self.prepare_targets(proposals, targets) + sampled_pos_inds, sampled_neg_inds = self.fg_bg_sampler(labels) + + proposals = list(proposals) + # add corresponding label and regression_targets information to the bounding boxes + for labels_per_image, regression_targets_per_image, proposals_per_image in zip( + labels, regression_targets, proposals + ): + proposals_per_image.add_field("labels", labels_per_image) + proposals_per_image.add_field( + "regression_targets", regression_targets_per_image + ) + + # distributed sampled proposals, that were obtained on all feature maps + # concatenated via the fg_bg_sampler, into individual feature map levels + for img_idx, (pos_inds_img, neg_inds_img) in enumerate( + zip(sampled_pos_inds, sampled_neg_inds) + ): + img_sampled_inds = torch.nonzero(pos_inds_img | neg_inds_img).squeeze(1) + proposals_per_image = proposals[img_idx][img_sampled_inds] + proposals[img_idx] = proposals_per_image + + self._proposals = proposals + return proposals + + def __call__(self, class_logits, box_regression): + """ + Computes the loss for Faster R-CNN. + This requires that the subsample method has been called beforehand. 
+ + Arguments: + class_logits (list[Tensor]) + box_regression (list[Tensor]) + + Returns: + classification_loss (Tensor) + box_loss (Tensor) + """ + + class_logits = cat(class_logits, dim=0) + box_regression = cat(box_regression, dim=0) + device = class_logits.device + + if not hasattr(self, "_proposals"): + raise RuntimeError("subsample needs to be called before") + + proposals = self._proposals + + labels = cat([proposal.get_field("labels") for proposal in proposals], dim=0) + regression_targets = cat( + [proposal.get_field("regression_targets") for proposal in proposals], dim=0 + ) + + classification_loss = F.cross_entropy(class_logits, labels) + + # get indices that correspond to the regression targets for + # the corresponding ground truth labels, to be used with + # advanced indexing + sampled_pos_inds_subset = torch.nonzero(labels > 0).squeeze(1) + labels_pos = labels[sampled_pos_inds_subset] + if self.cls_agnostic_bbox_reg: + map_inds = torch.tensor([4, 5, 6, 7], device=device) + else: + map_inds = 4 * labels_pos[:, None] + torch.tensor( + [0, 1, 2, 3], device=device) + + box_loss = smooth_l1_loss( + box_regression[sampled_pos_inds_subset[:, None], map_inds], + regression_targets[sampled_pos_inds_subset], + size_average=False, + beta=1, + ) + box_loss = box_loss / labels.numel() + + return classification_loss, box_loss + + +def make_roi_box_loss_evaluator(cfg): + matcher = Matcher( + cfg.MODEL.ROI_HEADS.FG_IOU_THRESHOLD, + cfg.MODEL.ROI_HEADS.BG_IOU_THRESHOLD, + allow_low_quality_matches=False, + ) + + bbox_reg_weights = cfg.MODEL.ROI_HEADS.BBOX_REG_WEIGHTS + box_coder = BoxCoder(weights=bbox_reg_weights) + + fg_bg_sampler = BalancedPositiveNegativeSampler( + cfg.MODEL.ROI_HEADS.BATCH_SIZE_PER_IMAGE, cfg.MODEL.ROI_HEADS.POSITIVE_FRACTION + ) + + cls_agnostic_bbox_reg = cfg.MODEL.CLS_AGNOSTIC_BBOX_REG + + loss_evaluator = FastRCNNLossComputation( + matcher, + fg_bg_sampler, + box_coder, + cls_agnostic_bbox_reg + ) + + return loss_evaluator diff --git a/maskrcnn_benchmark/modeling/roi_heads/box_head/roi_box_feature_extractors.py b/maskrcnn_benchmark/modeling/roi_heads/box_head/roi_box_feature_extractors.py new file mode 100644 index 0000000000000000000000000000000000000000..e4406deedc2ce5430bf54d75868ea1a438b7bc57 --- /dev/null +++ b/maskrcnn_benchmark/modeling/roi_heads/box_head/roi_box_feature_extractors.py @@ -0,0 +1,153 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 
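In the box-head loss above, only positive samples contribute to the smooth L1 term, and the regression deltas for a sample with label j live in columns 4*j .. 4*j+3 of box_regression. A small worked illustration of that indexing, with toy values rather than real model outputs:

# Illustration of the class-specific regression indexing in
# FastRCNNLossComputation.__call__: label j selects columns 4*j .. 4*j+3.
import torch

num_classes = 3                          # background + 2 foreground classes
box_regression = torch.arange(2 * num_classes * 4, dtype=torch.float32).view(2, -1)
labels_pos = torch.tensor([1, 2])        # labels of the two positive samples

map_inds = 4 * labels_pos[:, None] + torch.tensor([0, 1, 2, 3])
rows = torch.arange(len(labels_pos))[:, None]
print(map_inds)
# tensor([[ 4,  5,  6,  7],
#         [ 8,  9, 10, 11]])
print(box_regression[rows, map_inds])    # the per-class deltas fed to smooth_l1_loss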
+import torch +from torch import nn +from torch.nn import functional as F + +from maskrcnn_benchmark.modeling import registry +from maskrcnn_benchmark.modeling.backbone import resnet +from maskrcnn_benchmark.modeling.poolers import Pooler +from maskrcnn_benchmark.modeling.make_layers import group_norm +from maskrcnn_benchmark.modeling.make_layers import make_fc + + +@registry.ROI_BOX_FEATURE_EXTRACTORS.register("ResNet50Conv5ROIFeatureExtractor") +class ResNet50Conv5ROIFeatureExtractor(nn.Module): + def __init__(self, config, in_channels): + super(ResNet50Conv5ROIFeatureExtractor, self).__init__() + + resolution = config.MODEL.ROI_BOX_HEAD.POOLER_RESOLUTION + scales = config.MODEL.ROI_BOX_HEAD.POOLER_SCALES + sampling_ratio = config.MODEL.ROI_BOX_HEAD.POOLER_SAMPLING_RATIO + pooler = Pooler( + output_size=(resolution, resolution), + scales=scales, + sampling_ratio=sampling_ratio, + deformable=_C.MODEL.ROI_BOX_HEAD.DEFORMABLE_POOLING + ) + + stage = resnet.StageSpec(index=4, block_count=3, return_features=False) + head = resnet.ResNetHead( + block_module=config.MODEL.RESNETS.TRANS_FUNC, + stages=(stage,), + num_groups=config.MODEL.RESNETS.NUM_GROUPS, + width_per_group=config.MODEL.RESNETS.WIDTH_PER_GROUP, + stride_in_1x1=config.MODEL.RESNETS.STRIDE_IN_1X1, + stride_init=None, + res2_out_channels=config.MODEL.RESNETS.RES2_OUT_CHANNELS, + dilation=config.MODEL.RESNETS.RES5_DILATION + ) + + self.pooler = pooler + self.head = head + self.out_channels = head.out_channels + + def forward(self, x, proposals): + x = self.pooler(x, proposals) + x = self.head(x) + return x + + +@registry.ROI_BOX_FEATURE_EXTRACTORS.register("FPN2MLPFeatureExtractor") +class FPN2MLPFeatureExtractor(nn.Module): + """ + Heads for FPN for classification + """ + + def __init__(self, cfg, in_channels): + super(FPN2MLPFeatureExtractor, self).__init__() + + resolution = cfg.MODEL.ROI_BOX_HEAD.POOLER_RESOLUTION + scales = cfg.MODEL.ROI_BOX_HEAD.POOLER_SCALES + sampling_ratio = cfg.MODEL.ROI_BOX_HEAD.POOLER_SAMPLING_RATIO + pooler = Pooler( + output_size=(resolution, resolution), + scales=scales, + sampling_ratio=sampling_ratio, + deformable=cfg.MODEL.RESNETS.DEFORM_POOLING + ) + input_size = in_channels * resolution ** 2 + representation_size = cfg.MODEL.ROI_BOX_HEAD.MLP_HEAD_DIM + use_gn = cfg.MODEL.ROI_BOX_HEAD.USE_GN + self.pooler = pooler + self.fc6 = make_fc(input_size, representation_size, use_gn) + self.fc7 = make_fc(representation_size, representation_size, use_gn) + self.out_channels = representation_size + + def forward(self, x, proposals): + x = self.pooler(x, proposals) + x = x.view(x.size(0), -1) + + x = F.relu(self.fc6(x)) + x = F.relu(self.fc7(x)) + + return x + + +@registry.ROI_BOX_FEATURE_EXTRACTORS.register("FPNXconv1fcFeatureExtractor") +class FPNXconv1fcFeatureExtractor(nn.Module): + """ + Heads for FPN for classification + """ + + def __init__(self, cfg, in_channels): + super(FPNXconv1fcFeatureExtractor, self).__init__() + + resolution = cfg.MODEL.ROI_BOX_HEAD.POOLER_RESOLUTION + scales = cfg.MODEL.ROI_BOX_HEAD.POOLER_SCALES + sampling_ratio = cfg.MODEL.ROI_BOX_HEAD.POOLER_SAMPLING_RATIO + pooler = Pooler( + output_size=(resolution, resolution), + scales=scales, + sampling_ratio=sampling_ratio, + ) + self.pooler = pooler + + use_gn = cfg.MODEL.ROI_BOX_HEAD.USE_GN + conv_head_dim = cfg.MODEL.ROI_BOX_HEAD.CONV_HEAD_DIM + num_stacked_convs = cfg.MODEL.ROI_BOX_HEAD.NUM_STACKED_CONVS + dilation = cfg.MODEL.ROI_BOX_HEAD.DILATION + + xconvs = [] + for ix in range(num_stacked_convs): + xconvs.append( + nn.Conv2d( 
+ in_channels, + conv_head_dim, + kernel_size=3, + stride=1, + padding=dilation, + dilation=dilation, + bias=False if use_gn else True + ) + ) + in_channels = conv_head_dim + if use_gn: + xconvs.append(group_norm(in_channels)) + xconvs.append(nn.ReLU(inplace=True)) + + self.add_module("xconvs", nn.Sequential(*xconvs)) + for modules in [self.xconvs,]: + for l in modules.modules(): + if isinstance(l, nn.Conv2d): + torch.nn.init.normal_(l.weight, std=0.01) + if not use_gn: + torch.nn.init.constant_(l.bias, 0) + + input_size = conv_head_dim * resolution ** 2 + representation_size = cfg.MODEL.ROI_BOX_HEAD.MLP_HEAD_DIM + self.fc6 = make_fc(input_size, representation_size, use_gn=False) + self.out_channels = representation_size + + def forward(self, x, proposals): + x = self.pooler(x, proposals) + x = self.xconvs(x) + x = x.view(x.size(0), -1) + x = F.relu(self.fc6(x)) + return x + + +def make_roi_box_feature_extractor(cfg, in_channels): + func = registry.ROI_BOX_FEATURE_EXTRACTORS[ + cfg.MODEL.ROI_BOX_HEAD.FEATURE_EXTRACTOR + ] + return func(cfg, in_channels) diff --git a/maskrcnn_benchmark/modeling/roi_heads/box_head/roi_box_predictors.py b/maskrcnn_benchmark/modeling/roi_heads/box_head/roi_box_predictors.py new file mode 100644 index 0000000000000000000000000000000000000000..66ee4ace585cff5ea2933553d3e800f03757eba9 --- /dev/null +++ b/maskrcnn_benchmark/modeling/roi_heads/box_head/roi_box_predictors.py @@ -0,0 +1,62 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +from maskrcnn_benchmark.modeling import registry +from torch import nn + + +@registry.ROI_BOX_PREDICTOR.register("FastRCNNPredictor") +class FastRCNNPredictor(nn.Module): + def __init__(self, config, in_channels): + super(FastRCNNPredictor, self).__init__() + assert in_channels is not None + + num_inputs = in_channels + + num_classes = config.MODEL.ROI_BOX_HEAD.NUM_CLASSES + self.avgpool = nn.AdaptiveAvgPool2d(1) + self.cls_score = nn.Linear(num_inputs, num_classes) + num_bbox_reg_classes = 2 if config.MODEL.CLS_AGNOSTIC_BBOX_REG else num_classes + self.bbox_pred = nn.Linear(num_inputs, num_bbox_reg_classes * 4) + + nn.init.normal_(self.cls_score.weight, mean=0, std=0.01) + nn.init.constant_(self.cls_score.bias, 0) + + nn.init.normal_(self.bbox_pred.weight, mean=0, std=0.001) + nn.init.constant_(self.bbox_pred.bias, 0) + + def forward(self, x): + x = self.avgpool(x) + x = x.view(x.size(0), -1) + cls_logit = self.cls_score(x) + bbox_pred = self.bbox_pred(x) + return cls_logit, bbox_pred + + +@registry.ROI_BOX_PREDICTOR.register("FPNPredictor") +class FPNPredictor(nn.Module): + def __init__(self, cfg, in_channels): + super(FPNPredictor, self).__init__() + num_classes = cfg.MODEL.ROI_BOX_HEAD.NUM_CLASSES + representation_size = in_channels + + self.cls_score = nn.Linear(representation_size, num_classes) + num_bbox_reg_classes = 2 if cfg.MODEL.CLS_AGNOSTIC_BBOX_REG else num_classes + self.bbox_pred = nn.Linear(representation_size, num_bbox_reg_classes * 4) + + nn.init.normal_(self.cls_score.weight, std=0.01) + nn.init.normal_(self.bbox_pred.weight, std=0.001) + for l in [self.cls_score, self.bbox_pred]: + nn.init.constant_(l.bias, 0) + + def forward(self, x): + if x.ndimension() == 4: + assert list(x.shape[2:]) == [1, 1] + x = x.view(x.size(0), -1) + scores = self.cls_score(x) + bbox_deltas = self.bbox_pred(x) + + return scores, bbox_deltas + + +def make_roi_box_predictor(cfg, in_channels): + func = registry.ROI_BOX_PREDICTOR[cfg.MODEL.ROI_BOX_HEAD.PREDICTOR] + return func(cfg, in_channels) diff --git 
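make_roi_box_feature_extractor above resolves which extractor class to build from a config string through a registry. The following is a toy version of that lookup pattern; the names here are hypothetical and simplified, not the real maskrcnn_benchmark registry API.

# Toy registry lookup: map a config string to a class and instantiate it.
ROI_BOX_FEATURE_EXTRACTORS = {}

def register(name):
    def wrapper(cls):
        ROI_BOX_FEATURE_EXTRACTORS[name] = cls
        return cls
    return wrapper

@register("FPN2MLPFeatureExtractor")
class ToyFPN2MLPFeatureExtractor:
    def __init__(self, cfg, in_channels):
        self.in_channels = in_channels

def make_extractor(name, cfg=None, in_channels=256):
    return ROI_BOX_FEATURE_EXTRACTORS[name](cfg, in_channels)

print(type(make_extractor("FPN2MLPFeatureExtractor")).__name__)
# ToyFPN2MLPFeatureExtractor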
a/maskrcnn_benchmark/modeling/roi_heads/roi_heads.py b/maskrcnn_benchmark/modeling/roi_heads/roi_heads.py new file mode 100644 index 0000000000000000000000000000000000000000..614a36203c95ffc5d01373d9bdf50b1c11c9790d --- /dev/null +++ b/maskrcnn_benchmark/modeling/roi_heads/roi_heads.py @@ -0,0 +1,78 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +import torch + +from .box_head.box_head import build_roi_box_head +from .boundary_head.boundary_head import build_roi_boundary_head +class CombinedROIHeads(torch.nn.ModuleDict): + """ + Combines a set of individual heads (for box prediction or masks) into a single + head. + """ + + def __init__(self, cfg, heads): + super(CombinedROIHeads, self).__init__(heads) + self.cfg = cfg.clone() + if cfg.MODEL.BOUNDARY_ON and cfg.MODEL.ROI_BOUNDARY_HEAD.SHARE_BOX_FEATURE_EXTRACTOR: + self.mask.feature_extractor = self.box.feature_extractor + self.bo.feature_extractor = self.bo.feature_extractor + + def forward(self, features, proposals, targets=None, prefix=''): + """ + prefix (str): Some model may use auxiliary heads which don't share rpn, + use this to separate the loss names + """ + losses = {} + # TODO rename x to roi_box_features, if it doesn't increase memory consumption + x, detections, loss_box = self.box(features, proposals, targets) + losses.update(loss_box) + if self.cfg.MODEL.MASK_ON: + mask_features = features + # optimization: during training, if we share the feature extractor between + # the box and the mask heads, then we can reuse the features already computed + if ( + self.training + and self.cfg.MODEL.ROI_MASK_HEAD.SHARE_BOX_FEATURE_EXTRACTOR + ): + mask_features = x + # During training, self.box() will return the unaltered proposals as "detections" + # this makes the API consistent during training and testing + + x, detections, loss_mask = self.mask(mask_features, detections, targets) + + losses.update(loss_mask) + + if self.cfg.MODEL.BOUNDARY_ON: + bo_features = features + if ( + self.training + and self.cfg.MODEL.ROI_BOUNDARY_HEAD.SHARE_BOX_FEATURE_EXTRACTOR + ): + bo_features = x + x, detections, loss_bo, loss_bo_x, loss_bo_y = self.bound(bo_features, detections, targets) + losses.update(loss_bo) + losses.update(loss_bo_x) + losses.update(loss_bo_y) + + losses = {prefix + k: losses[k] for k in losses} + + + + return x, detections, losses + + +def build_roi_heads(cfg, in_channels): + # individually create the heads, that will be combined together + # afterwards + roi_heads = [] + if cfg.MODEL.RETINANET_ON: + return [] + + if not cfg.MODEL.RPN_ONLY: + roi_heads.append(("box", build_roi_box_head(cfg, in_channels))) + if cfg.MODEL.BOUNDARY_ON: + roi_heads.append(("bound", build_roi_boundary_head(cfg, in_channels))) + # combine individual heads in a single module + if roi_heads: + roi_heads = CombinedROIHeads(cfg, roi_heads) + + return roi_heads diff --git a/maskrcnn_benchmark/modeling/rpn/__init__.py b/maskrcnn_benchmark/modeling/rpn/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..b01f30cfddd8ed97d5a39f55641fbc929297d885 --- /dev/null +++ b/maskrcnn_benchmark/modeling/rpn/__init__.py @@ -0,0 +1,2 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 
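roi_heads.py above chains the heads: the box head runs first, its detections feed the boundary head, and every loss dict is merged under an optional prefix. A schematic of that forward flow, with toy callables standing in for the real heads (not the actual module code):

# Schematic of CombinedROIHeads.forward: box head -> boundary head -> merged losses.
def box_head(features, proposals, targets=None):
    return "box_feats", proposals, {"loss_classifier": 0.7, "loss_box_reg": 0.3}

def boundary_head(features, detections, targets=None):
    return "bo_feats", detections, {"loss_bo": 0.2}, {"loss_bo_x": 0.1}, {"loss_bo_y": 0.1}

def combined_forward(features, proposals, targets=None, prefix=""):
    losses = {}
    x, detections, loss_box = box_head(features, proposals, targets)
    losses.update(loss_box)
    x, detections, loss_bo, loss_bo_x, loss_bo_y = boundary_head(features, detections, targets)
    losses.update(loss_bo)
    losses.update(loss_bo_x)
    losses.update(loss_bo_y)
    return x, detections, {prefix + k: v for k, v in losses.items()}

print(combined_forward(None, ["p0", "p1"], prefix="aux_")[2])
# {'aux_loss_classifier': 0.7, 'aux_loss_box_reg': 0.3, 'aux_loss_bo': 0.2, ...}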
+# from .rpn import build_rpn diff --git a/maskrcnn_benchmark/modeling/rpn/__pycache__/__init__.cpython-37.pyc b/maskrcnn_benchmark/modeling/rpn/__pycache__/__init__.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..1f8adc17c922e0e094f9d6993ec5ad10f71dad47 Binary files /dev/null and b/maskrcnn_benchmark/modeling/rpn/__pycache__/__init__.cpython-37.pyc differ diff --git a/maskrcnn_benchmark/modeling/rpn/__pycache__/anchor_generator.cpython-37.pyc b/maskrcnn_benchmark/modeling/rpn/__pycache__/anchor_generator.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ec46ba8427ecaa1225303687066c180588f7dd2a Binary files /dev/null and b/maskrcnn_benchmark/modeling/rpn/__pycache__/anchor_generator.cpython-37.pyc differ diff --git a/maskrcnn_benchmark/modeling/rpn/__pycache__/inference.cpython-37.pyc b/maskrcnn_benchmark/modeling/rpn/__pycache__/inference.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..dd3fb12fe494f0e7164921545b8c7cfffb1dde18 Binary files /dev/null and b/maskrcnn_benchmark/modeling/rpn/__pycache__/inference.cpython-37.pyc differ diff --git a/maskrcnn_benchmark/modeling/rpn/__pycache__/loss.cpython-37.pyc b/maskrcnn_benchmark/modeling/rpn/__pycache__/loss.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..7fb46be527bd179c65b6fc9a07877d941c2854c8 Binary files /dev/null and b/maskrcnn_benchmark/modeling/rpn/__pycache__/loss.cpython-37.pyc differ diff --git a/maskrcnn_benchmark/modeling/rpn/__pycache__/rpn.cpython-37.pyc b/maskrcnn_benchmark/modeling/rpn/__pycache__/rpn.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..40623a7cc1a203c6aefac289709d3a40dec55115 Binary files /dev/null and b/maskrcnn_benchmark/modeling/rpn/__pycache__/rpn.cpython-37.pyc differ diff --git a/maskrcnn_benchmark/modeling/rpn/__pycache__/utils.cpython-37.pyc b/maskrcnn_benchmark/modeling/rpn/__pycache__/utils.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..16d829a945683875f785401af5537d39b697662c Binary files /dev/null and b/maskrcnn_benchmark/modeling/rpn/__pycache__/utils.cpython-37.pyc differ diff --git a/maskrcnn_benchmark/modeling/rpn/anchor_generator.py b/maskrcnn_benchmark/modeling/rpn/anchor_generator.py new file mode 100644 index 0000000000000000000000000000000000000000..5314d0c9d0bf449cb8ee1ca3eea0385cb7c2a8e5 --- /dev/null +++ b/maskrcnn_benchmark/modeling/rpn/anchor_generator.py @@ -0,0 +1,292 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 
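The anchor generator that follows tiles a small set of base "cell" anchors over every feature-map location by adding per-cell (x, y) shifts spaced one stride apart. A minimal sketch of that shifting step for a single FPN level, with toy base anchors (illustrative only):

# Minimal sketch of grid_anchors for one level: shift base anchors to every cell.
import torch

def shift_anchors(base_anchors, grid_h, grid_w, stride):
    shifts_x = torch.arange(0, grid_w * stride, step=stride, dtype=torch.float32)
    shifts_y = torch.arange(0, grid_h * stride, step=stride, dtype=torch.float32)
    shift_y, shift_x = torch.meshgrid(shifts_y, shifts_x)
    shifts = torch.stack((shift_x.reshape(-1), shift_y.reshape(-1),
                          shift_x.reshape(-1), shift_y.reshape(-1)), dim=1)
    # (H*W, 1, 4) + (1, A, 4) -> (H*W*A, 4) anchors in xyxy format
    return (shifts.view(-1, 1, 4) + base_anchors.view(1, -1, 4)).reshape(-1, 4)

base = torch.tensor([[-8., -8., 8., 8.], [-16., -8., 16., 8.]])  # 2 toy cell anchors
print(shift_anchors(base, grid_h=2, grid_w=3, stride=16).shape)   # torch.Size([12, 4])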
+import math + +import numpy as np +import torch +from torch import nn + +from maskrcnn_benchmark.structures.bounding_box import BoxList + + +class BufferList(nn.Module): + """ + Similar to nn.ParameterList, but for buffers + """ + + def __init__(self, buffers=None): + super(BufferList, self).__init__() + if buffers is not None: + self.extend(buffers) + + def extend(self, buffers): + offset = len(self) + for i, buffer in enumerate(buffers): + self.register_buffer(str(offset + i), buffer) + return self + + def __len__(self): + return len(self._buffers) + + def __iter__(self): + return iter(self._buffers.values()) + + +class AnchorGenerator(nn.Module): + """ + For a set of image sizes and feature maps, computes a set + of anchors + """ + def __init__( + self, + sizes=(128, 256, 512), # 32, 64, 128, 256, 512 + aspect_ratios=(0.5, 1.0, 2.0), # 0.25, 0.5, 1.0, 2.0, 4.0 + anchor_strides=(8, 16, 32), # 4, 8, 16, 32, 64 + straddle_thresh=0, # 0 + ): + super(AnchorGenerator, self).__init__() + + if len(anchor_strides) == 1: + anchor_stride = anchor_strides[0] + cell_anchors = [ + generate_anchors(anchor_stride, sizes, aspect_ratios).float() + ] + else: + + # This step is done + + if len(anchor_strides) != len(sizes): + raise RuntimeError("FPN should have #anchor_strides == #sizes") + + cell_anchors = [ + generate_anchors( + anchor_stride, + size if isinstance(size, (tuple, list)) else (size,), + aspect_ratios + ).float() + for anchor_stride, size in zip(anchor_strides, sizes) + ] + self.strides = anchor_strides + self.cell_anchors = BufferList(cell_anchors) + self.straddle_thresh = straddle_thresh + + def num_anchors_per_location(self): + return [len(cell_anchors) for cell_anchors in self.cell_anchors] + + def grid_anchors(self, grid_sizes): + anchors = [] + for size, stride, base_anchors in zip( + grid_sizes, self.strides, self.cell_anchors + ): + grid_height, grid_width = size + device = base_anchors.device + shifts_x = torch.arange( + 0, grid_width * stride, step=stride, dtype=torch.float32, device=device + ) + + shifts_y = torch.arange( + 0, grid_height * stride, step=stride, dtype=torch.float32, device=device + ) + shift_y, shift_x = torch.meshgrid(shifts_y, shifts_x) + shift_x = shift_x.reshape(-1) + shift_y = shift_y.reshape(-1) + shifts = torch.stack((shift_x, shift_y, shift_x, shift_y), dim=1) + + anchors.append( + (shifts.view(-1, 1, 4) + base_anchors.view(1, -1, 4)).reshape(-1, 4) + ) + + return anchors + + def add_visibility_to(self, boxlist): + image_width, image_height = boxlist.size + anchors = boxlist.bbox + if self.straddle_thresh >= 0: + inds_inside = ( + (anchors[..., 0] >= -self.straddle_thresh) + & (anchors[..., 1] >= -self.straddle_thresh) + & (anchors[..., 2] < image_width + self.straddle_thresh) + & (anchors[..., 3] < image_height + self.straddle_thresh) + ) + else: + device = anchors.device + inds_inside = torch.ones(anchors.shape[0], dtype=torch.uint8, device=device) + boxlist.add_field("visibility", inds_inside) + + def forward(self, image_list, feature_maps): + grid_sizes = [feature_map.shape[-2:] for feature_map in feature_maps] # size of features + anchors_over_all_feature_maps = self.grid_anchors(grid_sizes) + anchors = [] + for i, (image_height, image_width) in enumerate(image_list.image_sizes): + anchors_in_image = [] + for anchors_per_feature_map in anchors_over_all_feature_maps: + boxlist = BoxList( + anchors_per_feature_map, (image_width, image_height), mode="xyxy" + ) + self.add_visibility_to(boxlist) + anchors_in_image.append(boxlist) + 
anchors.append(anchors_in_image) + return anchors # [image,number,[n,4]] + + +def make_anchor_generator(config): + anchor_sizes = config.MODEL.RPN.ANCHOR_SIZES # 32, 64, 128, 256, 512 + aspect_ratios = config.MODEL.RPN.ASPECT_RATIOS # 0.25, 0.5, 1.0, 2.0, 4.0 + anchor_stride = config.MODEL.RPN.ANCHOR_STRIDE # 4, 8, 16, 32, 64 + straddle_thresh = config.MODEL.RPN.STRADDLE_THRESH #0 + + if config.MODEL.RPN.USE_FPN: # This step is done + assert len(anchor_stride) == len( + anchor_sizes + ), "FPN should have len(ANCHOR_STRIDE) == len(ANCHOR_SIZES)" + else: + assert len(anchor_stride) == 1, "Non-FPN should have a single ANCHOR_STRIDE" + anchor_generator = AnchorGenerator( + anchor_sizes, aspect_ratios, anchor_stride, straddle_thresh + ) + return anchor_generator + + +def make_anchor_generator_retinanet(config): + anchor_sizes = config.MODEL.RETINANET.ANCHOR_SIZES + aspect_ratios = config.MODEL.RETINANET.ASPECT_RATIOS + anchor_strides = config.MODEL.RETINANET.ANCHOR_STRIDES + straddle_thresh = config.MODEL.RETINANET.STRADDLE_THRESH + octave = config.MODEL.RETINANET.OCTAVE + scales_per_octave = config.MODEL.RETINANET.SCALES_PER_OCTAVE + + assert len(anchor_strides) == len(anchor_sizes), "Only support FPN now" + new_anchor_sizes = [] + for size in anchor_sizes: + per_layer_anchor_sizes = [] + for scale_per_octave in range(scales_per_octave): + octave_scale = octave ** (scale_per_octave / float(scales_per_octave)) + per_layer_anchor_sizes.append(octave_scale * size) + new_anchor_sizes.append(tuple(per_layer_anchor_sizes)) + + anchor_generator = AnchorGenerator( + tuple(new_anchor_sizes), aspect_ratios, anchor_strides, straddle_thresh + ) + return anchor_generator + +# Copyright (c) 2017-present, Facebook, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +############################################################################## +# +# Based on: +# -------------------------------------------------------- +# Faster R-CNN +# Copyright (c) 2015 Microsoft +# Licensed under The MIT License [see LICENSE for details] +# Written by Ross Girshick and Sean Bell +# -------------------------------------------------------- + + +# Verify that we compute the same anchors as Shaoqing's matlab implementation: +# +# >> load output/rpn_cachedir/faster_rcnn_VOC2007_ZF_stage1_rpn/anchors.mat +# >> anchors +# +# anchors = +# +# -83 -39 100 56 +# -175 -87 192 104 +# -359 -183 376 200 +# -55 -55 72 72 +# -119 -119 136 136 +# -247 -247 264 264 +# -35 -79 52 96 +# -79 -167 96 184 +# -167 -343 184 360 + +# array([[ -83., -39., 100., 56.], +# [-175., -87., 192., 104.], +# [-359., -183., 376., 200.], +# [ -55., -55., 72., 72.], +# [-119., -119., 136., 136.], +# [-247., -247., 264., 264.], +# [ -35., -79., 52., 96.], +# [ -79., -167., 96., 184.], +# [-167., -343., 184., 360.]]) + + +def generate_anchors( + stride=16, sizes=(32, 64, 128, 256, 512), aspect_ratios=(0.5, 1, 2) +): + """Generates a matrix of anchor boxes in (x1, y1, x2, y2) format. 
Anchors + are centered on stride / 2, have (approximate) sqrt areas of the specified + sizes, and aspect ratios as given. + """ + return _generate_anchors( + stride, + np.array(sizes, dtype=np.float) / stride, + np.array(aspect_ratios, dtype=np.float), + ) + + +def _generate_anchors(base_size, scales, aspect_ratios): + """Generate anchor (reference) windows by enumerating aspect ratios X + scales wrt a reference (0, 0, base_size - 1, base_size - 1) window. + """ + anchor = np.array([1, 1, base_size, base_size], dtype=np.float) - 1 + anchors = _ratio_enum(anchor, aspect_ratios) + anchors = np.vstack( + [_scale_enum(anchors[i, :], scales) for i in range(anchors.shape[0])] + ) + return torch.from_numpy(anchors) + + +def _whctrs(anchor): + """Return width, height, x center, and y center for an anchor (window).""" + w = anchor[2] - anchor[0] + 1 + h = anchor[3] - anchor[1] + 1 + x_ctr = anchor[0] + 0.5 * (w - 1) + y_ctr = anchor[1] + 0.5 * (h - 1) + return w, h, x_ctr, y_ctr + + +def _mkanchors(ws, hs, x_ctr, y_ctr): + """Given a vector of widths (ws) and heights (hs) around a center + (x_ctr, y_ctr), output a set of anchors (windows). + """ + ws = ws[:, np.newaxis] + hs = hs[:, np.newaxis] + anchors = np.hstack( + ( + x_ctr - 0.5 * (ws - 1), + y_ctr - 0.5 * (hs - 1), + x_ctr + 0.5 * (ws - 1), + y_ctr + 0.5 * (hs - 1), + ) + ) + return anchors + + +def _ratio_enum(anchor, ratios): + """Enumerate a set of anchors for each aspect ratio wrt an anchor.""" + w, h, x_ctr, y_ctr = _whctrs(anchor) + size = w * h + size_ratios = size / ratios + ws = np.round(np.sqrt(size_ratios)) + hs = np.round(ws * ratios) + anchors = _mkanchors(ws, hs, x_ctr, y_ctr) + return anchors + + +def _scale_enum(anchor, scales): + """Enumerate a set of anchors for each scale wrt an anchor.""" + w, h, x_ctr, y_ctr = _whctrs(anchor) + ws = w * scales + hs = h * scales + anchors = _mkanchors(ws, hs, x_ctr, y_ctr) + return anchors diff --git a/maskrcnn_benchmark/modeling/rpn/fcos/__init__.py b/maskrcnn_benchmark/modeling/rpn/fcos/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/maskrcnn_benchmark/modeling/rpn/fcos/__pycache__/__init__.cpython-37.pyc b/maskrcnn_benchmark/modeling/rpn/fcos/__pycache__/__init__.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..4ac106b0dc0a110a88c4c5bb96c6b2a32b01ebe7 Binary files /dev/null and b/maskrcnn_benchmark/modeling/rpn/fcos/__pycache__/__init__.cpython-37.pyc differ diff --git a/maskrcnn_benchmark/modeling/rpn/fcos/__pycache__/fcos.cpython-37.pyc b/maskrcnn_benchmark/modeling/rpn/fcos/__pycache__/fcos.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..6d1a79212b4451708988487c84fafada2b80b9ae Binary files /dev/null and b/maskrcnn_benchmark/modeling/rpn/fcos/__pycache__/fcos.cpython-37.pyc differ diff --git a/maskrcnn_benchmark/modeling/rpn/fcos/__pycache__/inference.cpython-37.pyc b/maskrcnn_benchmark/modeling/rpn/fcos/__pycache__/inference.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d28c4630201020e2365ff585ea67db29907af0e2 Binary files /dev/null and b/maskrcnn_benchmark/modeling/rpn/fcos/__pycache__/inference.cpython-37.pyc differ diff --git a/maskrcnn_benchmark/modeling/rpn/fcos/__pycache__/loss.cpython-37.pyc b/maskrcnn_benchmark/modeling/rpn/fcos/__pycache__/loss.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b39c77d13f4b149176a594c16cfb431c5ef4ae1a Binary files 
/dev/null and b/maskrcnn_benchmark/modeling/rpn/fcos/__pycache__/loss.cpython-37.pyc differ diff --git a/maskrcnn_benchmark/modeling/rpn/fcos/fcos.py b/maskrcnn_benchmark/modeling/rpn/fcos/fcos.py new file mode 100644 index 0000000000000000000000000000000000000000..0492218ece4a437392a049fcb389f3aaf38ef11f --- /dev/null +++ b/maskrcnn_benchmark/modeling/rpn/fcos/fcos.py @@ -0,0 +1,199 @@ +import math +import torch +import torch.nn.functional as F +from torch import nn + +from .inference import make_fcos_postprocessor +from .loss import make_fcos_loss_evaluator + +from maskrcnn_benchmark.layers import Scale + + +class FCOSHead(torch.nn.Module): + def __init__(self, cfg, in_channels): + """ + Arguments: + in_channels (int): number of channels of the input feature + """ + super(FCOSHead, self).__init__() + # TODO: Implement the sigmoid version first. + num_classes = cfg.MODEL.FCOS.NUM_CLASSES - 1 + + cls_tower = [] + bbox_tower = [] + for i in range(cfg.MODEL.FCOS.NUM_CONVS): + cls_tower.append( + nn.Conv2d( + in_channels, + in_channels, + kernel_size=3, + stride=1, + padding=1 + ) + ) + cls_tower.append(nn.GroupNorm(32, in_channels)) + cls_tower.append(nn.ReLU()) + bbox_tower.append( + nn.Conv2d( + in_channels, + in_channels, + kernel_size=3, + stride=1, + padding=1 + ) + ) + bbox_tower.append(nn.GroupNorm(32, in_channels)) + bbox_tower.append(nn.ReLU()) + + self.add_module('cls_tower', nn.Sequential(*cls_tower)) + self.add_module('bbox_tower', nn.Sequential(*bbox_tower)) + self.cls_logits = nn.Conv2d( + in_channels, num_classes, kernel_size=3, stride=1, + padding=1 + ) + self.bbox_pred = nn.Conv2d( + in_channels, 4, kernel_size=3, stride=1, + padding=1 + ) + self.centerness = nn.Conv2d( + in_channels, 1, kernel_size=3, stride=1, + padding=1 + ) + + # initialization + for modules in [self.cls_tower, self.bbox_tower, + self.cls_logits, self.bbox_pred, + self.centerness]: + for l in modules.modules(): + if isinstance(l, nn.Conv2d): + torch.nn.init.normal_(l.weight, std=0.01) + torch.nn.init.constant_(l.bias, 0) + + # initialize the bias for focal loss + prior_prob = cfg.MODEL.FCOS.PRIOR_PROB + bias_value = -math.log((1 - prior_prob) / prior_prob) + torch.nn.init.constant_(self.cls_logits.bias, bias_value) + + self.scales = nn.ModuleList([Scale(init_value=1.0) for _ in range(5)]) + + def forward(self, x): + logits = [] + bbox_reg = [] + centerness = [] + for l, feature in enumerate(x): + cls_tower = self.cls_tower(feature) + logits.append(self.cls_logits(cls_tower)) + centerness.append(self.centerness(cls_tower)) + bbox_reg.append(torch.exp(self.scales[l]( + self.bbox_pred(self.bbox_tower(feature)) + ))) + return logits, bbox_reg, centerness + + +class FCOSModule(torch.nn.Module): + """ + Module for FCOS computation. Takes feature maps from the backbone and + FCOS outputs and losses. Only Test on FPN now. 
+ """ + + def __init__(self, cfg, in_channels): + super(FCOSModule, self).__init__() + + self.cfg = cfg.clone() + + head = FCOSHead(cfg, in_channels) + + box_selector_train = make_fcos_postprocessor(cfg, is_train=True) + box_selector_test = make_fcos_postprocessor(cfg) + + loss_evaluator = make_fcos_loss_evaluator(cfg) + self.head = head + self.box_selector_train = box_selector_train + self.box_selector_test = box_selector_test + self.loss_evaluator = loss_evaluator + self.fpn_strides = cfg.MODEL.FCOS.FPN_STRIDES + + def forward(self, images, features, targets=None): + """ + Arguments: + images (ImageList): images for which we want to compute the predictions + features (list[Tensor]): features computed from the images that are + used for computing the predictions. Each tensor in the list + correspond to different feature levels + targets (list[BoxList): ground-truth boxes present in the image (optional) + + Returns: + boxes (list[BoxList]): the predicted boxes from the RPN, one BoxList per + image. + losses (dict[Tensor]): the losses for the model during training. During + testing, it is an empty dict. + """ + box_cls, box_regression, centerness = self.head(features) + locations = self.compute_locations(features) + + if self.training: + return self._forward_train( + locations, box_cls, + box_regression, + centerness, targets, images.image_sizes + ) + else: + return self._forward_test( + locations, box_cls, box_regression, + centerness, images.image_sizes + ) + + def _forward_train(self, locations, box_cls, box_regression, + centerness, targets, image_sizes): + loss_box_cls, loss_box_reg, loss_centerness = self.loss_evaluator( + locations, box_cls, box_regression, centerness, targets + ) + if self.cfg.MODEL.RPN_ONLY: + boxes = None + else: + with torch.no_grad(): + boxes = self.box_selector_train( + locations, box_cls, box_regression, + centerness, image_sizes) + losses = { + "loss_cls": loss_box_cls, + "loss_reg": loss_box_reg, + "loss_centerness": loss_centerness + } + return boxes, losses + + def _forward_test(self, locations, box_cls, box_regression, centerness, image_sizes): + boxes = self.box_selector_test( + locations, box_cls, box_regression, + centerness, image_sizes + ) + return boxes, {} + + def compute_locations(self, features): + locations = [] + for level, feature in enumerate(features): + h, w = feature.size()[-2:] + locations_per_level = self.compute_locations_per_level( + h, w, self.fpn_strides[level], + feature.device + ) + locations.append(locations_per_level) + return locations + + def compute_locations_per_level(self, h, w, stride, device): + shifts_x = torch.arange( + 0, w * stride, step=stride, + dtype=torch.float32, device=device + ) + shifts_y = torch.arange( + 0, h * stride, step=stride, + dtype=torch.float32, device=device + ) + shift_y, shift_x = torch.meshgrid(shifts_y, shifts_x) + shift_x = shift_x.reshape(-1) + shift_y = shift_y.reshape(-1) + locations = torch.stack((shift_x, shift_y), dim=1) + stride // 2 + return locations + +def build_fcos(cfg, in_channels): + return FCOSModule(cfg, in_channels) diff --git a/maskrcnn_benchmark/modeling/rpn/fcos/inference.py b/maskrcnn_benchmark/modeling/rpn/fcos/inference.py new file mode 100644 index 0000000000000000000000000000000000000000..72e5bf911142367d03fcb2ea4dc6d8fd17004dc8 --- /dev/null +++ b/maskrcnn_benchmark/modeling/rpn/fcos/inference.py @@ -0,0 +1,209 @@ +import torch + +from ..inference import RPNPostProcessor +from ..utils import permute_and_flatten + +from maskrcnn_benchmark.modeling.box_coder import 
BoxCoder +from maskrcnn_benchmark.modeling.utils import cat +from maskrcnn_benchmark.structures.bounding_box import BoxList +from maskrcnn_benchmark.structures.boxlist_ops import cat_boxlist +from maskrcnn_benchmark.structures.boxlist_ops import boxlist_nms +from maskrcnn_benchmark.structures.boxlist_ops import remove_small_boxes + + +class FCOSPostProcessor(torch.nn.Module): + """ + Performs post-processing on the outputs of the RetinaNet boxes. + This is only used in the testing. + """ + def __init__( + self, + pre_nms_thresh, + pre_nms_top_n, + nms_thresh, + fpn_post_nms_top_n, + min_size, + num_classes, + ): + """ + Arguments: + pre_nms_thresh (float) + pre_nms_top_n (int) + nms_thresh (float) + fpn_post_nms_top_n (int) + min_size (int) + num_classes (int) + box_coder (BoxCoder) + """ + super(FCOSPostProcessor, self).__init__() + self.pre_nms_thresh = pre_nms_thresh + self.pre_nms_top_n = pre_nms_top_n + self.nms_thresh = nms_thresh + self.fpn_post_nms_top_n = fpn_post_nms_top_n + self.min_size = min_size + self.num_classes = num_classes + + def forward_for_single_feature_map( + self, locations, box_cls, + box_regression, centerness, + image_sizes): + """ + Arguments: + anchors: list[BoxList] + box_cls: tensor of size N, A * C, H, W + box_regression: tensor of size N, A * 4, H, W + """ + N, C, H, W = box_cls.shape + + # put in the same format as locations + box_cls = box_cls.view(N, C, H, W).permute(0, 2, 3, 1) + box_cls = box_cls.reshape(N, -1, C).sigmoid() + box_regression = box_regression.view(N, 4, H, W).permute(0, 2, 3, 1) + box_regression = box_regression.reshape(N, -1, 4) + centerness = centerness.view(N, 1, H, W).permute(0, 2, 3, 1) + centerness = centerness.reshape(N, -1).sigmoid() + + candidate_inds = box_cls > self.pre_nms_thresh + pre_nms_top_n = candidate_inds.view(N, -1).sum(1) + pre_nms_top_n = pre_nms_top_n.clamp(max=self.pre_nms_top_n) + #print('pre_nms_top_n: ', pre_nms_top_n) + + # multiply the classification scores with centerness scores + box_cls = box_cls * centerness[:, :, None] + + results = [] + for i in range(N): + per_box_cls = box_cls[i] + per_candidate_inds = candidate_inds[i] + per_box_cls = per_box_cls[per_candidate_inds] + + per_candidate_nonzeros = per_candidate_inds.nonzero() + per_box_loc = per_candidate_nonzeros[:, 0] + per_class = per_candidate_nonzeros[:, 1] + 1 + + per_box_regression = box_regression[i] + per_box_regression = per_box_regression[per_box_loc] + per_locations = locations[per_box_loc] + + per_pre_nms_top_n = pre_nms_top_n[i] + + if per_candidate_inds.sum().item() > per_pre_nms_top_n.item(): + per_box_cls, top_k_indices = \ + per_box_cls.topk(per_pre_nms_top_n, sorted=False) + per_class = per_class[top_k_indices] + per_box_regression = per_box_regression[top_k_indices] + per_locations = per_locations[top_k_indices] + + detections = torch.stack([ + per_locations[:, 0] - per_box_regression[:, 0], + per_locations[:, 1] - per_box_regression[:, 1], + per_locations[:, 0] + per_box_regression[:, 2], + per_locations[:, 1] + per_box_regression[:, 3], + ], dim=1) + + h, w = image_sizes[i] + boxlist = BoxList(detections, (int(w), int(h)), mode="xyxy") + boxlist.add_field("labels", per_class) + boxlist.add_field("scores", per_box_cls) + boxlist = boxlist.clip_to_image(remove_empty=False) + boxlist = remove_small_boxes(boxlist, self.min_size) + results.append(boxlist) + + return results + + def forward(self, locations, box_cls, box_regression, centerness, image_sizes): + """ + Arguments: + anchors: list[list[BoxList]] + box_cls: list[tensor] + 
box_regression: list[tensor] + image_sizes: list[(h, w)] + Returns: + boxlists (list[BoxList]): the post-processed anchors, after + applying box decoding and NMS + """ + sampled_boxes = [] + for _, (l, o, b, c) in enumerate(zip(locations, box_cls, box_regression, centerness)): + sampled_boxes.append( + self.forward_for_single_feature_map( + l, o, b, c, image_sizes + ) + ) + + boxlists = list(zip(*sampled_boxes)) + boxlists = [cat_boxlist(boxlist) for boxlist in boxlists] + boxlists = self.select_over_all_levels(boxlists) + + return boxlists + + # TODO very similar to filter_results from PostProcessor + # but filter_results is per image + # TODO Yang: solve this issue in the future. No good solution + # right now. + def select_over_all_levels(self, boxlists): + num_images = len(boxlists) + results = [] + for i in range(num_images): + scores = boxlists[i].get_field("scores") + labels = boxlists[i].get_field("labels") + boxes = boxlists[i].bbox + boxlist = boxlists[i] + result = [] + # skip the background + for j in range(1, self.num_classes): + inds = (labels == j).nonzero().view(-1) + + scores_j = scores[inds] + boxes_j = boxes[inds, :].view(-1, 4) + boxlist_for_class = BoxList(boxes_j, boxlist.size, mode="xyxy") + boxlist_for_class.add_field("scores", scores_j) + boxlist_for_class = boxlist_nms( + boxlist_for_class, self.nms_thresh, + score_field="scores" + ) + num_labels = len(boxlist_for_class) + boxlist_for_class.add_field( + "labels", torch.full((num_labels,), j, + dtype=torch.int64, + device=scores.device) + ) + result.append(boxlist_for_class) + + result = cat_boxlist(result) + number_of_detections = len(result) + + # Limit to max_per_image detections **over all classes** + if number_of_detections > self.fpn_post_nms_top_n > 0: + cls_scores = result.get_field("scores") + image_thresh, _ = torch.kthvalue( + cls_scores.cpu(), + number_of_detections - self.fpn_post_nms_top_n + 1 + ) + keep = cls_scores >= image_thresh.item() + keep = torch.nonzero(keep).squeeze(1) + result = result[keep] + results.append(result) + return results + + +def make_fcos_postprocessor(config, is_train=False): + pre_nms_thresh = config.MODEL.FCOS.INFERENCE_TH + pre_nms_top_n = config.MODEL.FCOS.PRE_NMS_TOP_N + nms_thresh = config.MODEL.FCOS.NMS_TH + fpn_post_nms_top_n = config.TEST.DETECTIONS_PER_IMG + + if is_train: + fpn_post_nms_top_n = config.MODEL.RPN.FPN_POST_NMS_TOP_N_TRAIN + pre_nms_top_n = config.MODEL.RPN.PRE_NMS_TOP_N_TRAIN + pre_nms_thresh = 0.01 + + box_selector = FCOSPostProcessor( + pre_nms_thresh=pre_nms_thresh, + pre_nms_top_n=pre_nms_top_n, + nms_thresh=nms_thresh, + fpn_post_nms_top_n=fpn_post_nms_top_n, + min_size=0, + num_classes=config.MODEL.FCOS.NUM_CLASSES + ) + + return box_selector diff --git a/maskrcnn_benchmark/modeling/rpn/fcos/loss.py b/maskrcnn_benchmark/modeling/rpn/fcos/loss.py new file mode 100644 index 0000000000000000000000000000000000000000..3ae915c416d5d4182b01b36738e2ee0485c5f7eb --- /dev/null +++ b/maskrcnn_benchmark/modeling/rpn/fcos/loss.py @@ -0,0 +1,194 @@ +""" +This file contains specific functions for computing losses of FCOS +file +""" + +import torch +from torch.nn import functional as F +from torch import nn + +from ..utils import concat_box_prediction_layers +from maskrcnn_benchmark.layers import IOULoss +from maskrcnn_benchmark.layers import SigmoidFocalLoss +from maskrcnn_benchmark.modeling.matcher import Matcher +from maskrcnn_benchmark.modeling.utils import cat +from maskrcnn_benchmark.structures.boxlist_ops import boxlist_iou +from 
maskrcnn_benchmark.structures.boxlist_ops import cat_boxlist + + +INF = 100000000 + + +class FCOSLossComputation(object): + """ + This class computes the FCOS losses. + """ + + def __init__(self, cfg): + self.cls_loss_func = SigmoidFocalLoss( + cfg.MODEL.FCOS.LOSS_GAMMA, + cfg.MODEL.FCOS.LOSS_ALPHA + ) + # we make use of IOU Loss for bounding boxes regression, + # but we found that L1 in log scale can yield a similar performance + self.box_reg_loss_func = IOULoss() + self.centerness_loss_func = nn.BCEWithLogitsLoss() + # generate sizes of interest + soi = [] + prev_size = -1 + for s in cfg.MODEL.FCOS.SIZES_OF_INTEREST: + soi.append([prev_size, s]) + prev_size = s + soi.append([prev_size, INF]) + self.object_sizes_of_interest = soi + + def prepare_targets(self, points, targets): + object_sizes_of_interest = self.object_sizes_of_interest + expanded_object_sizes_of_interest = [] + for l, points_per_level in enumerate(points): + object_sizes_of_interest_per_level = \ + points_per_level.new_tensor(object_sizes_of_interest[l]) + expanded_object_sizes_of_interest.append( + object_sizes_of_interest_per_level[None].expand(len(points_per_level), -1) + ) + + expanded_object_sizes_of_interest = torch.cat(expanded_object_sizes_of_interest, dim=0) + num_points_per_level = [len(points_per_level) for points_per_level in points] + points_all_level = torch.cat(points, dim=0) + labels, reg_targets = self.compute_targets_for_locations( + points_all_level, targets, expanded_object_sizes_of_interest + ) + + for i in range(len(labels)): + labels[i] = torch.split(labels[i], num_points_per_level, dim=0) + reg_targets[i] = torch.split(reg_targets[i], num_points_per_level, dim=0) + + labels_level_first = [] + reg_targets_level_first = [] + for level in range(len(points)): + labels_level_first.append( + torch.cat([labels_per_im[level] for labels_per_im in labels], dim=0) + ) + reg_targets_level_first.append( + torch.cat([reg_targets_per_im[level] for reg_targets_per_im in reg_targets], dim=0) + ) + + return labels_level_first, reg_targets_level_first + + def compute_targets_for_locations(self, locations, targets, object_sizes_of_interest): + labels = [] + reg_targets = [] + xs, ys = locations[:, 0], locations[:, 1] + + for im_i in range(len(targets)): + targets_per_im = targets[im_i] + assert targets_per_im.mode == "xyxy" + bboxes = targets_per_im.bbox + labels_per_im = targets_per_im.get_field("labels") + area = targets_per_im.area() + + l = xs[:, None] - bboxes[:, 0][None] + t = ys[:, None] - bboxes[:, 1][None] + r = bboxes[:, 2][None] - xs[:, None] + b = bboxes[:, 3][None] - ys[:, None] + reg_targets_per_im = torch.stack([l, t, r, b], dim=2) + + is_in_boxes = reg_targets_per_im.min(dim=2)[0] > 0 + + max_reg_targets_per_im = reg_targets_per_im.max(dim=2)[0] + # limit the regression range for each location + is_cared_in_the_level = \ + (max_reg_targets_per_im >= object_sizes_of_interest[:, [0]]) & \ + (max_reg_targets_per_im <= object_sizes_of_interest[:, [1]]) + + locations_to_gt_area = area[None].repeat(len(locations), 1) + locations_to_gt_area[is_in_boxes == 0] = INF + locations_to_gt_area[is_cared_in_the_level == 0] = INF + + # if there are still more than one objects for a location, + # we choose the one with minimal area + locations_to_min_aera, locations_to_gt_inds = locations_to_gt_area.min(dim=1) + + reg_targets_per_im = reg_targets_per_im[range(len(locations)), locations_to_gt_inds] + labels_per_im = labels_per_im[locations_to_gt_inds] + labels_per_im[locations_to_min_aera == INF] = 0 + + 
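+        # Locations whose best match stayed at INF were either outside every ground-truth
+        # box or outside the size-of-interest range of their FPN level, so the assignment
+        # above marks them as background (label 0) before the per-image lists are collected.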
labels.append(labels_per_im) + reg_targets.append(reg_targets_per_im) + + return labels, reg_targets + + def compute_centerness_targets(self, reg_targets): + left_right = reg_targets[:, [0, 2]] + top_bottom = reg_targets[:, [1, 3]] + centerness = (left_right.min(dim=-1)[0] / left_right.max(dim=-1)[0]) * \ + (top_bottom.min(dim=-1)[0] / top_bottom.max(dim=-1)[0]) + return torch.sqrt(centerness) + + def __call__(self, locations, box_cls, box_regression, centerness, targets): + """ + Arguments: + locations (list[BoxList]) + box_cls (list[Tensor]) + box_regression (list[Tensor]) + centerness (list[Tensor]) + targets (list[BoxList]) + + Returns: + cls_loss (Tensor) + reg_loss (Tensor) + centerness_loss (Tensor) + """ + N = box_cls[0].size(0) + num_classes = box_cls[0].size(1) + labels, reg_targets = self.prepare_targets(locations, targets) + + box_cls_flatten = [] + box_regression_flatten = [] + centerness_flatten = [] + labels_flatten = [] + reg_targets_flatten = [] + for l in range(len(labels)): + box_cls_flatten.append(box_cls[l].permute(0, 2, 3, 1).reshape(-1, num_classes)) + box_regression_flatten.append(box_regression[l].permute(0, 2, 3, 1).reshape(-1, 4)) + labels_flatten.append(labels[l].reshape(-1)) + reg_targets_flatten.append(reg_targets[l].reshape(-1, 4)) + centerness_flatten.append(centerness[l].reshape(-1)) + + box_cls_flatten = torch.cat(box_cls_flatten, dim=0) + box_regression_flatten = torch.cat(box_regression_flatten, dim=0) + centerness_flatten = torch.cat(centerness_flatten, dim=0) + labels_flatten = torch.cat(labels_flatten, dim=0) + reg_targets_flatten = torch.cat(reg_targets_flatten, dim=0) + + pos_inds = torch.nonzero(labels_flatten > 0).squeeze(1) + cls_loss = self.cls_loss_func( + box_cls_flatten, + labels_flatten.int() + ) / (pos_inds.numel() + N) # add N to avoid dividing by a zero + + box_regression_flatten = box_regression_flatten[pos_inds] + reg_targets_flatten = reg_targets_flatten[pos_inds] + centerness_flatten = centerness_flatten[pos_inds] + + if pos_inds.numel() > 0: + centerness_targets = self.compute_centerness_targets(reg_targets_flatten) + reg_loss = self.box_reg_loss_func( + box_regression_flatten, + reg_targets_flatten, + centerness_targets + ) + centerness_loss = self.centerness_loss_func( + centerness_flatten, + centerness_targets + ) + else: + reg_loss = box_regression_flatten.sum() + centerness_loss = centerness_flatten.sum() + + return cls_loss, reg_loss, centerness_loss + + +def make_fcos_loss_evaluator(cfg): + loss_evaluator = FCOSLossComputation(cfg) + return loss_evaluator diff --git a/maskrcnn_benchmark/modeling/rpn/inference.py b/maskrcnn_benchmark/modeling/rpn/inference.py new file mode 100644 index 0000000000000000000000000000000000000000..9a2e3871e42fac9fcef3db00da626ec0386d68b2 --- /dev/null +++ b/maskrcnn_benchmark/modeling/rpn/inference.py @@ -0,0 +1,199 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 
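As a quick, self-contained check of the centerness target defined in compute_centerness_targets above (the square root of min(l,r)/max(l,r) * min(t,b)/max(t,b)), the snippet below evaluates the same formula on two made-up (l, t, r, b) regression targets; the numbers are illustrative only and are not taken from the repository.

import torch

def centerness_targets(reg_targets):
    # reg_targets: (N, 4) distances (l, t, r, b) from each location to the box sides
    left_right = reg_targets[:, [0, 2]]
    top_bottom = reg_targets[:, [1, 3]]
    c = (left_right.min(dim=-1)[0] / left_right.max(dim=-1)[0]) * \
        (top_bottom.min(dim=-1)[0] / top_bottom.max(dim=-1)[0])
    return torch.sqrt(c)

demo = torch.tensor([[50., 20., 50., 20.],    # location at the box centre -> 1.0
                     [ 5., 10., 95., 30.]])   # location near a corner   -> ~0.13
print(centerness_targets(demo))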
+import torch + +from maskrcnn_benchmark.modeling.box_coder import BoxCoder +from maskrcnn_benchmark.structures.bounding_box import BoxList +from maskrcnn_benchmark.structures.boxlist_ops import cat_boxlist +from maskrcnn_benchmark.structures.boxlist_ops import boxlist_nms +from maskrcnn_benchmark.structures.boxlist_ops import remove_small_boxes + +from ..utils import cat +from .utils import permute_and_flatten + +class RPNPostProcessor(torch.nn.Module): + """ + Performs post-processing on the outputs of the RPN boxes, before feeding the + proposals to the heads + """ + + def __init__( + self, + pre_nms_top_n, + post_nms_top_n, + nms_thresh, + min_size, + box_coder=None, + fpn_post_nms_top_n=None, + ): + """ + Arguments: + pre_nms_top_n (int) + post_nms_top_n (int) + nms_thresh (float) + min_size (int) + box_coder (BoxCoder) + fpn_post_nms_top_n (int) + """ + super(RPNPostProcessor, self).__init__() + self.pre_nms_top_n = pre_nms_top_n # 12000 + self.post_nms_top_n = post_nms_top_n # 2000 + self.nms_thresh = nms_thresh # 0.7 + self.min_size = min_size # 0 + + if box_coder is None: + box_coder = BoxCoder(weights=(1.0, 1.0, 1.0, 1.0)) + self.box_coder = box_coder + + if fpn_post_nms_top_n is None: + fpn_post_nms_top_n = post_nms_top_n + self.fpn_post_nms_top_n = fpn_post_nms_top_n # 2000 + + def add_gt_proposals(self, proposals, targets): + """ + Arguments: + proposals: list[BoxList] + targets: list[BoxList] + """ + # Get the device we're operating on + device = proposals[0].bbox.device + + gt_boxes = [target.copy_with_fields([]) for target in targets] + + # later cat of bbox requires all fields to be present for all bbox + # so we need to add a dummy for objectness that's missing + for gt_box in gt_boxes: + gt_box.add_field("objectness", torch.ones(len(gt_box), device=device)) + + proposals = [ + cat_boxlist((proposal, gt_box)) + for proposal, gt_box in zip(proposals, gt_boxes) + ] + + return proposals + + def forward_for_single_feature_map(self, anchors, objectness, box_regression): + """ + Arguments: + anchors: list[BoxList] # [image,number,[n,4]] + objectness: tensor of size N, A, H, W + box_regression: tensor of size N, A * 4, H, W + """ + device = objectness.device + N, A, H, W = objectness.shape + # put in the same format as anchors + objectness = permute_and_flatten(objectness, N, A, 1, H, W).view(N, -1) # N H*W*A*1 + objectness = objectness.sigmoid() + box_regression = permute_and_flatten(box_regression, N, A, 18, H, W) # N H*W*A 4 + num_anchors = A * H * W # 391040 97760 + + pre_nms_top_n = min(self.pre_nms_top_n, num_anchors) #12000 + objectness, topk_idx = objectness.topk(pre_nms_top_n, dim=1, sorted=True) + # objectness = objectness.cpu() + batch_idx = torch.arange(N, device=device)[:, None] + box_regression = box_regression[batch_idx, topk_idx] + image_shapes = [box.size for box in anchors] + concat_anchors = torch.cat([a.bbox for a in anchors], dim=0) + concat_anchors = concat_anchors.reshape(N, -1, 4)[batch_idx, topk_idx] + proposals = self.box_coder.decode_iou( + box_regression.view(-1, 18), concat_anchors.view(-1, 4) + ) + + proposals = proposals.view(N, -1, 4) + + result = [] + for proposal, score, im_shape in zip(proposals, objectness, image_shapes): + boxlist = BoxList(proposal, im_shape, mode="xyxy") + boxlist.add_field("objectness", score) + boxlist = boxlist.clip_to_image(remove_empty=False) + boxlist = remove_small_boxes(boxlist, self.min_size) + boxlist = boxlist_nms( + boxlist, + self.nms_thresh, + max_proposals=self.post_nms_top_n, + score_field="objectness", + ) 
+ result.append(boxlist) + return result + + def forward(self, anchors, objectness, box_regression, targets=None): + """ + Arguments: + anchors: list[list[BoxList]] + objectness: list[tensor] + box_regression: list[tensor] + + Returns: + boxlists (list[BoxList]): the post-processed anchors, after + applying box decoding and NMS + """ + sampled_boxes = [] + num_levels = len(objectness) # classification + anchors = list(zip(*anchors)) # [image,number,[n,4]] + # i =-1 + for a, o, b in zip(anchors, objectness, box_regression): + sampled_boxes.append(self.forward_for_single_feature_map(a, o, b)) + + + boxlists = list(zip(*sampled_boxes)) + boxlists = [cat_boxlist(boxlist) for boxlist in boxlists] + + if num_levels > 1: + boxlists = self.select_over_all_levels(boxlists) + + # append ground-truth bboxes to proposals + if self.training and targets is not None: + boxlists = self.add_gt_proposals(boxlists, targets) + + return boxlists + + def select_over_all_levels(self, boxlists): + num_images = len(boxlists) + # different behavior during training and during testing: + # during training, post_nms_top_n is over *all* the proposals combined, while + # during testing, it is over the proposals for each image + # TODO resolve this difference and make it consistent. It should be per image, + # and not per batch + if self.training: + objectness = torch.cat( + [boxlist.get_field("objectness") for boxlist in boxlists], dim=0 + ) + box_sizes = [len(boxlist) for boxlist in boxlists] + post_nms_top_n = min(self.fpn_post_nms_top_n, len(objectness)) + _, inds_sorted = torch.topk(objectness, post_nms_top_n, dim=0, sorted=True) + inds_mask = torch.zeros_like(objectness, dtype=torch.uint8) + inds_mask[inds_sorted] = 1 + inds_mask = inds_mask.split(box_sizes) + for i in range(num_images): + boxlists[i] = boxlists[i][inds_mask[i]] + else: + for i in range(num_images): + objectness = boxlists[i].get_field("objectness") + post_nms_top_n = min(self.fpn_post_nms_top_n, len(objectness)) + _, inds_sorted = torch.topk( + objectness, post_nms_top_n, dim=0, sorted=True + ) + boxlists[i] = boxlists[i][inds_sorted] + return boxlists + + +def make_rpn_postprocessor(config, rpn_box_coder, is_train): + fpn_post_nms_top_n = config.MODEL.RPN.FPN_POST_NMS_TOP_N_TRAIN # 2000 + if not is_train: + fpn_post_nms_top_n = config.MODEL.RPN.FPN_POST_NMS_TOP_N_TEST + + pre_nms_top_n = config.MODEL.RPN.PRE_NMS_TOP_N_TRAIN # 12000 + post_nms_top_n = config.MODEL.RPN.POST_NMS_TOP_N_TRAIN # 2000 + if not is_train: + pre_nms_top_n = config.MODEL.RPN.PRE_NMS_TOP_N_TEST + post_nms_top_n = config.MODEL.RPN.POST_NMS_TOP_N_TEST + nms_thresh = config.MODEL.RPN.NMS_THRESH # 0.7 + min_size = config.MODEL.RPN.MIN_SIZE # 0 + box_selector = RPNPostProcessor( + pre_nms_top_n=pre_nms_top_n, #12000 + post_nms_top_n=post_nms_top_n, #2000 + nms_thresh=nms_thresh, # 0.7 + min_size=min_size, # 0 + box_coder=rpn_box_coder, + fpn_post_nms_top_n=fpn_post_nms_top_n, #2000 + ) + return box_selector diff --git a/maskrcnn_benchmark/modeling/rpn/loss.py b/maskrcnn_benchmark/modeling/rpn/loss.py new file mode 100644 index 0000000000000000000000000000000000000000..d39d03ce96eeca40d81df45eaa262674c21dbccc --- /dev/null +++ b/maskrcnn_benchmark/modeling/rpn/loss.py @@ -0,0 +1,153 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 
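The RPN post-processor defined in inference.py above is built from the config via make_rpn_postprocessor. The sketch below shows the usual construction and call pattern; it assumes a loaded cfg plus RPN outputs (anchors, objectness, box_regression, targets) shaped as described in forward_for_single_feature_map, and is an illustration rather than repository code.

from maskrcnn_benchmark.modeling.box_coder import BoxCoder
from maskrcnn_benchmark.modeling.rpn.inference import make_rpn_postprocessor

rpn_box_coder = BoxCoder(weights=(1.0, 1.0, 1.0, 1.0))
box_selector_train = make_rpn_postprocessor(cfg, rpn_box_coder, is_train=True)
box_selector_test = make_rpn_postprocessor(cfg, rpn_box_coder, is_train=False)

# Training: ground-truth boxes are appended to the proposals and the
# FPN_POST_NMS_TOP_N_TRAIN cut is applied over the whole batch.
proposals = box_selector_train(anchors, objectness, box_regression, targets)

# Testing: the FPN_POST_NMS_TOP_N_TEST cut is applied per image instead.
proposals = box_selector_test(anchors, objectness, box_regression)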
+""" +This file contains specific functions for computing losses on the RPN +file +""" + +import torch +from torch.nn import functional as F +from maskrcnn_benchmark.config import cfg + +from .utils import concat_box_prediction_layers + +from ..balanced_positive_negative_sampler import BalancedPositiveNegativeSampler +from ..utils import cat + +from maskrcnn_benchmark.layers import smooth_l1_loss +from maskrcnn_benchmark.layers import iou_regress +from maskrcnn_benchmark.modeling.matcher import Matcher +from maskrcnn_benchmark.structures.boxlist_ops import boxlist_iou +from maskrcnn_benchmark.structures.boxlist_ops import cat_boxlist + + +class RPNLossComputation(object): + """ + This class computes the RPN loss. + """ + + def __init__(self, proposal_matcher, fg_bg_sampler, box_coder, + generate_labels_func): + """ + Arguments: + proposal_matcher (Matcher) + fg_bg_sampler (BalancedPositiveNegativeSampler) + box_coder (BoxCoder) + """ + self.proposal_matcher = proposal_matcher + self.fg_bg_sampler = fg_bg_sampler + self.box_coder = box_coder + self.copied_fields = [] + self.generate_labels_func = generate_labels_func + self.discard_cases = ['not_visibility', 'between_thresholds'] + + def match_targets_to_anchors(self, anchor, target, copied_fields=[]): + + match_quality_matrix = boxlist_iou(target, anchor) + matched_idxs = self.proposal_matcher(match_quality_matrix) + # RPN doesn't need any fields from target for creating the labels, so clear them all + target = target.copy_with_fields(copied_fields) + # get the targets corresponding GT for each anchor + # NB: need to clamp the indices because we can have a single + # GT in the image, and matched_idxs can be -2, which goes + # out of bounds + matched_targets = target[matched_idxs.clamp(min=0)] + matched_targets.add_field("matched_idxs", matched_idxs) + return matched_targets + + def prepare_targets(self, anchors, targets): + labels = [] + regression_targets = [] + + for anchors_per_image, targets_per_image in zip(anchors, targets): + matched_targets = self.match_targets_to_anchors( + anchors_per_image, targets_per_image, self.copied_fields + ) + + matched_idxs = matched_targets.get_field("matched_idxs") + labels_per_image = self.generate_labels_func(matched_targets) + labels_per_image = labels_per_image.to(dtype=torch.float32) + + # Background (negative examples) + bg_indices = matched_idxs == Matcher.BELOW_LOW_THRESHOLD + labels_per_image[bg_indices] = 0 + + # discard anchors that go out of the boundaries of the image + if "not_visibility" in self.discard_cases: + labels_per_image[~anchors_per_image.get_field("visibility")] = -1 + + # discard indices that are between thresholds + if "between_thresholds" in self.discard_cases: + inds_to_discard = matched_idxs == Matcher.BETWEEN_THRESHOLDS + labels_per_image[inds_to_discard] = -1 + regression_targets_per_image = matched_targets.bbox + labels.append(labels_per_image) + regression_targets.append(regression_targets_per_image) + + return labels, regression_targets + + + def __call__(self, anchors, objectness, box_regression, targets): + """ + Arguments: + anchors (list[BoxList]) + objectness (list[Tensor]) + box_regression (list[Tensor]) + targets (list[BoxList]) + + Returns: + objectness_loss (Tensor) + box_loss (Tensor + """ + anchors = [cat_boxlist(anchors_per_image) for anchors_per_image in anchors] + + labels, regression_targets = self.prepare_targets(anchors, targets) + sampled_pos_inds, sampled_neg_inds = self.fg_bg_sampler(labels) + sampled_pos_inds = 
torch.nonzero(torch.cat(sampled_pos_inds, dim=0)).squeeze(1) + sampled_neg_inds = torch.nonzero(torch.cat(sampled_neg_inds, dim=0)).squeeze(1) + + sampled_inds = torch.cat([sampled_pos_inds, sampled_neg_inds], dim=0) + objectness, box_regression = \ + concat_box_prediction_layers(objectness, box_regression) + objectness = objectness.squeeze() # [1041820] + labels = torch.cat(labels, dim=0) + regression_targets = torch.cat(regression_targets, dim=0) + + box_loss = iou_regress( + box_regression[sampled_pos_inds], + regression_targets[sampled_pos_inds], + beta=1.0 / 9, + size_average=False, + ) / (sampled_inds.numel()) + + box_loss *= cfg.MODEL.ROI_BOUNDARY_HEAD.Loss_balance + + objectness_loss = F.binary_cross_entropy_with_logits( + objectness[sampled_inds], labels[sampled_inds] + ) + return objectness_loss, box_loss + +# This function should be overwritten in RetinaNet 11 +def generate_rpn_labels(matched_targets): + matched_idxs = matched_targets.get_field("matched_idxs") + labels_per_image = matched_idxs >= 0 + return labels_per_image + + +def make_rpn_loss_evaluator(cfg, box_coder): + matcher = Matcher( + cfg.MODEL.RPN.FG_IOU_THRESHOLD, + cfg.MODEL.RPN.BG_IOU_THRESHOLD, + allow_low_quality_matches=True, + ) + + fg_bg_sampler = BalancedPositiveNegativeSampler( + cfg.MODEL.RPN.BATCH_SIZE_PER_IMAGE, cfg.MODEL.RPN.POSITIVE_FRACTION + ) + + loss_evaluator = RPNLossComputation( + matcher, + fg_bg_sampler, + box_coder, + generate_rpn_labels + ) + return loss_evaluator diff --git a/maskrcnn_benchmark/modeling/rpn/retinanet/__init__.py b/maskrcnn_benchmark/modeling/rpn/retinanet/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/maskrcnn_benchmark/modeling/rpn/retinanet/__pycache__/__init__.cpython-37.pyc b/maskrcnn_benchmark/modeling/rpn/retinanet/__pycache__/__init__.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..cd36d5fbbe62db89fc5c32cc62146d1808c9459a Binary files /dev/null and b/maskrcnn_benchmark/modeling/rpn/retinanet/__pycache__/__init__.cpython-37.pyc differ diff --git a/maskrcnn_benchmark/modeling/rpn/retinanet/__pycache__/inference.cpython-37.pyc b/maskrcnn_benchmark/modeling/rpn/retinanet/__pycache__/inference.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..57e0916dfc6fd334502ba62f3293cfc1338c10b0 Binary files /dev/null and b/maskrcnn_benchmark/modeling/rpn/retinanet/__pycache__/inference.cpython-37.pyc differ diff --git a/maskrcnn_benchmark/modeling/rpn/retinanet/__pycache__/loss.cpython-37.pyc b/maskrcnn_benchmark/modeling/rpn/retinanet/__pycache__/loss.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..4f498585c7958def9ad05cf056265d2e790624aa Binary files /dev/null and b/maskrcnn_benchmark/modeling/rpn/retinanet/__pycache__/loss.cpython-37.pyc differ diff --git a/maskrcnn_benchmark/modeling/rpn/retinanet/__pycache__/retinanet.cpython-37.pyc b/maskrcnn_benchmark/modeling/rpn/retinanet/__pycache__/retinanet.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..559289d998ca6b6e6a6290d7d6933534544b9e26 Binary files /dev/null and b/maskrcnn_benchmark/modeling/rpn/retinanet/__pycache__/retinanet.cpython-37.pyc differ diff --git a/maskrcnn_benchmark/modeling/rpn/retinanet/inference.py b/maskrcnn_benchmark/modeling/rpn/retinanet/inference.py new file mode 100644 index 0000000000000000000000000000000000000000..09c17adfc2565871632cffecd51738a2cbd9acb2 --- /dev/null +++ 
b/maskrcnn_benchmark/modeling/rpn/retinanet/inference.py @@ -0,0 +1,194 @@ +import torch + +from ..inference import RPNPostProcessor +from ..utils import permute_and_flatten + +from maskrcnn_benchmark.modeling.box_coder import BoxCoder +from maskrcnn_benchmark.modeling.utils import cat +from maskrcnn_benchmark.structures.bounding_box import BoxList +from maskrcnn_benchmark.structures.boxlist_ops import cat_boxlist +from maskrcnn_benchmark.structures.boxlist_ops import boxlist_nms +from maskrcnn_benchmark.structures.boxlist_ops import remove_small_boxes + + +class RetinaNetPostProcessor(RPNPostProcessor): + """ + Performs post-processing on the outputs of the RetinaNet boxes. + This is only used in the testing. + """ + def __init__( + self, + pre_nms_thresh, + pre_nms_top_n, + nms_thresh, + fpn_post_nms_top_n, + min_size, + num_classes, + box_coder=None, + ): + """ + Arguments: + pre_nms_thresh (float) + pre_nms_top_n (int) + nms_thresh (float) + fpn_post_nms_top_n (int) + min_size (int) + num_classes (int) + box_coder (BoxCoder) + """ + super(RetinaNetPostProcessor, self).__init__( + pre_nms_thresh, 0, nms_thresh, min_size + ) + self.pre_nms_thresh = pre_nms_thresh + self.pre_nms_top_n = pre_nms_top_n + self.nms_thresh = nms_thresh + self.fpn_post_nms_top_n = fpn_post_nms_top_n + self.min_size = min_size + self.num_classes = num_classes + + if box_coder is None: + box_coder = BoxCoder(weights=(10., 10., 5., 5.)) + self.box_coder = box_coder + + def add_gt_proposals(self, proposals, targets): + """ + This function is not used in RetinaNet + """ + pass + + def forward_for_single_feature_map( + self, anchors, box_cls, box_regression): + """ + Arguments: + anchors: list[BoxList] + box_cls: tensor of size N, A * C, H, W + box_regression: tensor of size N, A * 4, H, W + """ + device = box_cls.device + N, _, H, W = box_cls.shape + A = box_regression.size(1) // 4 + C = box_cls.size(1) // A + + # put in the same format as anchors + box_cls = permute_and_flatten(box_cls, N, A, C, H, W) + box_cls = box_cls.sigmoid() + + box_regression = permute_and_flatten(box_regression, N, A, 4, H, W) + box_regression = box_regression.reshape(N, -1, 4) + + num_anchors = A * H * W + + candidate_inds = box_cls > self.pre_nms_thresh + + pre_nms_top_n = candidate_inds.view(N, -1).sum(1) + pre_nms_top_n = pre_nms_top_n.clamp(max=self.pre_nms_top_n) + + results = [] + for per_box_cls, per_box_regression, per_pre_nms_top_n, \ + per_candidate_inds, per_anchors in zip( + box_cls, + box_regression, + pre_nms_top_n, + candidate_inds, + anchors): + + # Sort and select TopN + # TODO most of this can be made out of the loop for + # all images. + # TODO:Yang: Not easy to do. Because the numbers of detections are + # different in each image. Therefore, this part needs to be done + # per image. 
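+            # Per image: drop scores below pre_nms_thresh, keep the per_pre_nms_top_n best,
+            # then map the surviving indices back to their anchor location and class id
+            # before decoding the corresponding regression outputs into boxes.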
+ per_box_cls = per_box_cls[per_candidate_inds] + + per_box_cls, top_k_indices = \ + per_box_cls.topk(per_pre_nms_top_n, sorted=False) + + per_candidate_nonzeros = \ + per_candidate_inds.nonzero()[top_k_indices, :] + + per_box_loc = per_candidate_nonzeros[:, 0] + per_class = per_candidate_nonzeros[:, 1] + per_class += 1 + + detections = self.box_coder.decode( + per_box_regression[per_box_loc, :].view(-1, 4), + per_anchors.bbox[per_box_loc, :].view(-1, 4) + ) + + boxlist = BoxList(detections, per_anchors.size, mode="xyxy") + boxlist.add_field("labels", per_class) + boxlist.add_field("scores", per_box_cls) + boxlist = boxlist.clip_to_image(remove_empty=False) + boxlist = remove_small_boxes(boxlist, self.min_size) + results.append(boxlist) + + return results + + # TODO very similar to filter_results from PostProcessor + # but filter_results is per image + # TODO Yang: solve this issue in the future. No good solution + # right now. + def select_over_all_levels(self, boxlists): + num_images = len(boxlists) + results = [] + for i in range(num_images): + scores = boxlists[i].get_field("scores") + labels = boxlists[i].get_field("labels") + boxes = boxlists[i].bbox + boxlist = boxlists[i] + result = [] + # skip the background + for j in range(1, self.num_classes): + inds = (labels == j).nonzero().view(-1) + + scores_j = scores[inds] + boxes_j = boxes[inds, :].view(-1, 4) + boxlist_for_class = BoxList(boxes_j, boxlist.size, mode="xyxy") + boxlist_for_class.add_field("scores", scores_j) + boxlist_for_class = boxlist_nms( + boxlist_for_class, self.nms_thresh, + score_field="scores" + ) + num_labels = len(boxlist_for_class) + boxlist_for_class.add_field( + "labels", torch.full((num_labels,), j, + dtype=torch.int64, + device=scores.device) + ) + result.append(boxlist_for_class) + + result = cat_boxlist(result) + number_of_detections = len(result) + + # Limit to max_per_image detections **over all classes** + if number_of_detections > self.fpn_post_nms_top_n > 0: + cls_scores = result.get_field("scores") + image_thresh, _ = torch.kthvalue( + cls_scores.cpu(), + number_of_detections - self.fpn_post_nms_top_n + 1 + ) + keep = cls_scores >= image_thresh.item() + keep = torch.nonzero(keep).squeeze(1) + result = result[keep] + results.append(result) + return results + + +def make_retinanet_postprocessor(config, rpn_box_coder, is_train): + pre_nms_thresh = config.MODEL.RETINANET.INFERENCE_TH + pre_nms_top_n = config.MODEL.RETINANET.PRE_NMS_TOP_N + nms_thresh = config.MODEL.RETINANET.NMS_TH + fpn_post_nms_top_n = config.TEST.DETECTIONS_PER_IMG + min_size = 0 + + box_selector = RetinaNetPostProcessor( + pre_nms_thresh=pre_nms_thresh, + pre_nms_top_n=pre_nms_top_n, + nms_thresh=nms_thresh, + fpn_post_nms_top_n=fpn_post_nms_top_n, + min_size=min_size, + num_classes=config.MODEL.RETINANET.NUM_CLASSES, + box_coder=rpn_box_coder, + ) + + return box_selector diff --git a/maskrcnn_benchmark/modeling/rpn/retinanet/loss.py b/maskrcnn_benchmark/modeling/rpn/retinanet/loss.py new file mode 100644 index 0000000000000000000000000000000000000000..080e2153ba59e90e620f30a5adc5426a1551e4e8 --- /dev/null +++ b/maskrcnn_benchmark/modeling/rpn/retinanet/loss.py @@ -0,0 +1,107 @@ +""" +This file contains specific functions for computing losses on the RetinaNet +file +""" + +import torch +from torch.nn import functional as F + +from ..utils import concat_box_prediction_layers + +from maskrcnn_benchmark.layers import smooth_l1_loss +from maskrcnn_benchmark.layers import SigmoidFocalLoss +from maskrcnn_benchmark.modeling.matcher import 
Matcher +from maskrcnn_benchmark.modeling.utils import cat +from maskrcnn_benchmark.structures.boxlist_ops import boxlist_iou +from maskrcnn_benchmark.structures.boxlist_ops import cat_boxlist +from maskrcnn_benchmark.modeling.rpn.loss import RPNLossComputation + +class RetinaNetLossComputation(RPNLossComputation): + """ + This class computes the RetinaNet loss. + """ + + def __init__(self, proposal_matcher, box_coder, + generate_labels_func, + sigmoid_focal_loss, + bbox_reg_beta=0.11, + regress_norm=1.0): + """ + Arguments: + proposal_matcher (Matcher) + box_coder (BoxCoder) + """ + self.proposal_matcher = proposal_matcher + self.box_coder = box_coder + self.box_cls_loss_func = sigmoid_focal_loss + self.bbox_reg_beta = bbox_reg_beta + self.copied_fields = ['labels'] + self.generate_labels_func = generate_labels_func + self.discard_cases = ['between_thresholds'] + self.regress_norm = regress_norm + + def __call__(self, anchors, box_cls, box_regression, targets): + """ + Arguments: + anchors (list[BoxList]) + box_cls (list[Tensor]) + box_regression (list[Tensor]) + targets (list[BoxList]) + + Returns: + retinanet_cls_loss (Tensor) + retinanet_regression_loss (Tensor + """ + anchors = [cat_boxlist(anchors_per_image) for anchors_per_image in anchors] + labels, regression_targets = self.prepare_targets(anchors, targets) + + N = len(labels) + box_cls, box_regression = \ + concat_box_prediction_layers(box_cls, box_regression) + + labels = torch.cat(labels, dim=0) + regression_targets = torch.cat(regression_targets, dim=0) + pos_inds = torch.nonzero(labels > 0).squeeze(1) + + retinanet_regression_loss = smooth_l1_loss( + box_regression[pos_inds], + regression_targets[pos_inds], + beta=self.bbox_reg_beta, + size_average=False, + ) / (max(1, pos_inds.numel() * self.regress_norm)) + + labels = labels.int() + + retinanet_cls_loss = self.box_cls_loss_func( + box_cls, + labels + ) / (pos_inds.numel() + N) + + return retinanet_cls_loss, retinanet_regression_loss + + +def generate_retinanet_labels(matched_targets): + labels_per_image = matched_targets.get_field("labels") + return labels_per_image + + +def make_retinanet_loss_evaluator(cfg, box_coder): + matcher = Matcher( + cfg.MODEL.RETINANET.FG_IOU_THRESHOLD, + cfg.MODEL.RETINANET.BG_IOU_THRESHOLD, + allow_low_quality_matches=True, + ) + sigmoid_focal_loss = SigmoidFocalLoss( + cfg.MODEL.RETINANET.LOSS_GAMMA, + cfg.MODEL.RETINANET.LOSS_ALPHA + ) + + loss_evaluator = RetinaNetLossComputation( + matcher, + box_coder, + generate_retinanet_labels, + sigmoid_focal_loss, + bbox_reg_beta = cfg.MODEL.RETINANET.BBOX_REG_BETA, + regress_norm = cfg.MODEL.RETINANET.BBOX_REG_WEIGHT, + ) + return loss_evaluator diff --git a/maskrcnn_benchmark/modeling/rpn/retinanet/retinanet.py b/maskrcnn_benchmark/modeling/rpn/retinanet/retinanet.py new file mode 100644 index 0000000000000000000000000000000000000000..1599b29b2e9bbb626b31d652022fbbd034bf5e30 --- /dev/null +++ b/maskrcnn_benchmark/modeling/rpn/retinanet/retinanet.py @@ -0,0 +1,152 @@ +import math +import torch +import torch.nn.functional as F +from torch import nn + +from .inference import make_retinanet_postprocessor +from .loss import make_retinanet_loss_evaluator +from ..anchor_generator import make_anchor_generator_retinanet + +from maskrcnn_benchmark.modeling.box_coder import BoxCoder + + +class RetinaNetHead(torch.nn.Module): + """ + Adds a RetinNet head with classification and regression heads + """ + + def __init__(self, cfg, in_channels): + """ + Arguments: + in_channels (int): number of channels of the 
input feature + num_anchors (int): number of anchors to be predicted + """ + super(RetinaNetHead, self).__init__() + # TODO: Implement the sigmoid version first. + num_classes = cfg.MODEL.RETINANET.NUM_CLASSES - 1 + num_anchors = len(cfg.MODEL.RETINANET.ASPECT_RATIOS) \ + * cfg.MODEL.RETINANET.SCALES_PER_OCTAVE + + cls_tower = [] + bbox_tower = [] + for i in range(cfg.MODEL.RETINANET.NUM_CONVS): + cls_tower.append( + nn.Conv2d( + in_channels, + in_channels, + kernel_size=3, + stride=1, + padding=1 + ) + ) + cls_tower.append(nn.ReLU()) + bbox_tower.append( + nn.Conv2d( + in_channels, + in_channels, + kernel_size=3, + stride=1, + padding=1 + ) + ) + bbox_tower.append(nn.ReLU()) + + self.add_module('cls_tower', nn.Sequential(*cls_tower)) + self.add_module('bbox_tower', nn.Sequential(*bbox_tower)) + self.cls_logits = nn.Conv2d( + in_channels, num_anchors * num_classes, kernel_size=3, stride=1, + padding=1 + ) + self.bbox_pred = nn.Conv2d( + in_channels, num_anchors * 4, kernel_size=3, stride=1, + padding=1 + ) + + # Initialization + for modules in [self.cls_tower, self.bbox_tower, self.cls_logits, + self.bbox_pred]: + for l in modules.modules(): + if isinstance(l, nn.Conv2d): + torch.nn.init.normal_(l.weight, std=0.01) + torch.nn.init.constant_(l.bias, 0) + + + # retinanet_bias_init + prior_prob = cfg.MODEL.RETINANET.PRIOR_PROB + bias_value = -math.log((1 - prior_prob) / prior_prob) + torch.nn.init.constant_(self.cls_logits.bias, bias_value) + + def forward(self, x): + logits = [] + bbox_reg = [] + for feature in x: + logits.append(self.cls_logits(self.cls_tower(feature))) + bbox_reg.append(self.bbox_pred(self.bbox_tower(feature))) + return logits, bbox_reg + + +class RetinaNetModule(torch.nn.Module): + """ + Module for RetinaNet computation. Takes feature maps from the backbone and + RetinaNet outputs and losses. Only Test on FPN now. + """ + + def __init__(self, cfg, in_channels): + super(RetinaNetModule, self).__init__() + + self.cfg = cfg.clone() + + anchor_generator = make_anchor_generator_retinanet(cfg) + head = RetinaNetHead(cfg, in_channels) + box_coder = BoxCoder(weights=(10., 10., 5., 5.)) + + box_selector_test = make_retinanet_postprocessor(cfg, box_coder, is_train=False) + + loss_evaluator = make_retinanet_loss_evaluator(cfg, box_coder) + + self.anchor_generator = anchor_generator + self.head = head + self.box_selector_test = box_selector_test + self.loss_evaluator = loss_evaluator + + def forward(self, images, features, targets=None): + """ + Arguments: + images (ImageList): images for which we want to compute the predictions + features (list[Tensor]): features computed from the images that are + used for computing the predictions. Each tensor in the list + correspond to different feature levels + targets (list[BoxList): ground-truth boxes present in the image (optional) + + Returns: + boxes (list[BoxList]): the predicted boxes from the RPN, one BoxList per + image. + losses (dict[Tensor]): the losses for the model during training. During + testing, it is an empty dict. 
+ """ + box_cls, box_regression = self.head(features) + anchors = self.anchor_generator(images, features) + + if self.training: + return self._forward_train(anchors, box_cls, box_regression, targets) + else: + return self._forward_test(anchors, box_cls, box_regression) + + def _forward_train(self, anchors, box_cls, box_regression, targets): + + loss_box_cls, loss_box_reg = self.loss_evaluator( + anchors, box_cls, box_regression, targets + ) + losses = { + "loss_retina_cls": loss_box_cls, + "loss_retina_reg": loss_box_reg, + } + return anchors, losses + + def _forward_test(self, anchors, box_cls, box_regression): + boxes = self.box_selector_test(anchors, box_cls, box_regression) + return boxes, {} + + +def build_retinanet(cfg, in_channels): + return RetinaNetModule(cfg, in_channels) diff --git a/maskrcnn_benchmark/modeling/rpn/rpn.py b/maskrcnn_benchmark/modeling/rpn/rpn.py new file mode 100644 index 0000000000000000000000000000000000000000..8b027855cf114594c437f7d867a187b496b3bc80 --- /dev/null +++ b/maskrcnn_benchmark/modeling/rpn/rpn.py @@ -0,0 +1,321 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +import torch +import torch.nn.functional as F +from torch import nn +import math +from maskrcnn_benchmark.modeling import registry +from maskrcnn_benchmark.modeling.box_coder import BoxCoder +from maskrcnn_benchmark.modeling.rpn.retinanet.retinanet import build_retinanet +from maskrcnn_benchmark.modeling.rpn.fcos.fcos import build_fcos +from .loss import make_rpn_loss_evaluator +from .anchor_generator import make_anchor_generator +from .inference import make_rpn_postprocessor + + +class RPNHeadConvRegressor(nn.Module): + """ + A simple RPN Head for classification and bbox regression + """ + + def __init__(self, cfg, in_channels, num_anchors): + """ + Arguments: + cfg : config + in_channels (int): number of channels of the input feature + num_anchors (int): number of anchors to be predicted + """ + super(RPNHeadConvRegressor, self).__init__() + self.cls_logits = nn.Conv2d(in_channels, num_anchors, kernel_size=1, stride=1) + self.bbox_pred = nn.Conv2d( + in_channels, num_anchors * 4, kernel_size=1, stride=1 + ) + + for l in [self.cls_logits, self.bbox_pred]: + torch.nn.init.normal_(l.weight, std=0.01) + torch.nn.init.constant_(l.bias, 0) + + def forward(self, x): + assert isinstance(x, (list, tuple)) + logits = [self.cls_logits(y) for y in x] + bbox_reg = [self.bbox_pred(y) for y in x] + + return logits, bbox_reg + + +class RPNHeadFeatureSingleConv(nn.Module): + """ + Adds a simple RPN Head with one conv to extract the feature + """ + + def __init__(self, cfg, in_channels): + """ + Arguments: + cfg : config + in_channels (int): number of channels of the input feature + """ + super(RPNHeadFeatureSingleConv, self).__init__() + self.conv = nn.Conv2d( + in_channels, in_channels, kernel_size=3, stride=1, padding=1 + ) + + for l in [self.conv]: + torch.nn.init.normal_(l.weight, std=0.01) + torch.nn.init.constant_(l.bias, 0) + + self.out_channels = in_channels + + def forward(self, x): + assert isinstance(x, (list, tuple)) + x = [F.relu(self.conv(z)) for z in x] + + return x + + +@registry.RPN_HEADS.register("SingleConvRPNHead_1") +class RPNHead(nn.Module): + """ + Adds a simple RPN Head with classification and regression heads + """ + + def __init__(self, cfg, in_channels, num_anchors): + """ + Arguments: + cfg : config + in_channels (int): number of channels of the input feature + num_anchors (int): number of anchors to be predicted + """ + super(RPNHead, self).__init__() 
+ self.conv = nn.Conv2d( + in_channels, in_channels, kernel_size=3, stride=1, padding=1 + ) + self.cls_logits = nn.Conv2d(in_channels, num_anchors, kernel_size=1, stride=1) + self.bbox_pred_new = nn.Conv2d( + in_channels, num_anchors * 18, kernel_size=1, stride=1 + ) + + for l in [self.conv, self.cls_logits, self.bbox_pred_new]: + torch.nn.init.normal_(l.weight, std=0.01) + torch.nn.init.constant_(l.bias, 0) + + def forward(self, x): + + logits = [] + bbox_reg = [] + for feature in x: + t = F.relu(self.conv(feature)) + logits.append(self.cls_logits(t)) + bbox_reg.append(self.bbox_pred_new(t)) + return logits, bbox_reg + + +class RPNModule(torch.nn.Module): + """ + Module for RPN computation. Takes feature maps from the backbone and RPN + proposals and losses. Works for both FPN and non-FPN. + """ + + def __init__(self, cfg, in_channels): + super(RPNModule, self).__init__() + + self.cfg = cfg.clone() + + anchor_generator = make_anchor_generator(cfg) + + rpn_head = registry.RPN_HEADS[cfg.MODEL.RPN.RPN_HEAD] + head = rpn_head( + cfg, in_channels, anchor_generator.num_anchors_per_location()[0] + ) + + rpn_box_coder = BoxCoder(weights=(1.0, 1.0, 1.0, 1.0)) + + box_selector_train = make_rpn_postprocessor(cfg, rpn_box_coder, is_train=True) + box_selector_test = make_rpn_postprocessor(cfg, rpn_box_coder, is_train=False) + + loss_evaluator = make_rpn_loss_evaluator(cfg, rpn_box_coder) + + self.anchor_generator = anchor_generator + self.head = head + self.box_selector_train = box_selector_train + self.box_selector_test = box_selector_test + self.loss_evaluator = loss_evaluator + + def forward(self, images, features, targets=None, prefix=''): + """ + Arguments: + images (ImageList): images for which we want to compute the predictions + features (list[Tensor]): features computed from the images that are + used for computing the predictions. Each tensor in the list + correspond to different feature levels + targets (list[BoxList): ground-truth boxes present in the image (optional) + + Returns: + boxes (list[BoxList]): the predicted boxes from the RPN, one BoxList per + image. + losses (dict[Tensor]): the losses for the model during training. During + testing, it is an empty dict. + """ + objectness, rpn_box_regression = self.head(features) # len = 5 + anchors = self.anchor_generator(images, features) + + if self.training: + return self._forward_train(anchors, objectness, + rpn_box_regression, targets, prefix) + else: + return self._forward_test(anchors, objectness, rpn_box_regression) + + def _forward_train(self, anchors, objectness, rpn_box_regression, # [image,number,[n,4]] + targets, prefix): + if self.cfg.MODEL.RPN_ONLY: + # When training an RPN-only model, the loss is determined by the + # predicted objectness and rpn_box_regression values and there is + # no need to transform the anchors into predicted boxes; this is an + # optimization that avoids the unnecessary transformation. + boxes = anchors + else: + # print('\n---end-to-end model---\n') + # For end-to-end models, anchors must be transformed into boxes and + # sampled into a training batch. 
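For orientation: unlike a stock RPN head, bbox_pred_new above emits num_anchors * 18 channels per location rather than num_anchors * 4. Judging from decode_iou further down in this file, the 18 values per anchor appear to be laid out as follows (a reading of the code, not an official spec):

# rel_codes[0:16]  -> (x_1, y_1, ..., x_8, y_8): offsets of eight boundary points,
#                     x offsets scaled by the anchor width, y offsets by its height
# rel_codes[16:18] -> (dx, dy): offset of the box centre, scaled by half the anchor
#                     width / height
# decode_iou then takes the min/max over the eight points (together with the shifted
# centre) to recover an axis-aligned proposal box.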
+ with torch.no_grad(): + boxes = self.box_selector_train( + anchors, objectness, rpn_box_regression, targets + ) + anchors_new = list(zip(*anchors)) + regress_new = regress_to_box(anchors_new, rpn_box_regression) + + loss_objectness, loss_rpn_box_reg = self.loss_evaluator( + anchors, objectness, regress_new, targets + ) + losses = { + prefix + "loss_objectness": loss_objectness, + prefix + "loss_rpn_box_reg": loss_rpn_box_reg, + } + return boxes, losses + + def _forward_test(self, anchors, objectness, rpn_box_regression): + boxes = self.box_selector_test(anchors, objectness, rpn_box_regression) + if self.cfg.MODEL.RPN_ONLY: + # For end-to-end models, the RPN proposals are an intermediate state + # and don't bother to sort them in decreasing score order. For RPN-only + # models, the proposals are the final output and we return them in + # high-to-low confidence order. + inds = [ + box.get_field("objectness").sort(descending=True)[1] for box in boxes + ] + boxes = [box[ind] for box, ind in zip(boxes, inds)] + return boxes, {} + + +def build_rpn(cfg, in_channels): + """ + This gives the gist of it. Not super important because it doesn't change as much + """ + if cfg.MODEL.FCOS_ON: + return build_fcos(cfg, in_channels) + if cfg.MODEL.RETINANET_ON: + return build_retinanet(cfg, in_channels) + + return RPNModule(cfg, in_channels) + + +def regress_to_box(anchor_define,regress_pre): + + boxes_total = [] + num_f = 0 + for a, b in zip(anchor_define, regress_pre): + boxes_total.append(forward_feature_map(a, b)) + num_f += 1 + return boxes_total + +def forward_feature_map(anchors_define, boxes_regression): + N, A, H, W = boxes_regression.shape + + boxes_regression = faltten(boxes_regression, N, A, 18, H, W) # + + # image_shapes = [box.size for box in anchors_define] + concat_anchors = torch.cat([a.bbox for a in anchors_define], dim=0) + concat_anchors = concat_anchors.reshape(N, -1, 4) + proposals = decode_iou(boxes_regression.view(-1, 18), concat_anchors.view(-1, 4)) + box_temp_post = proposals.view(N, -1, 4) + + return box_temp_post + +def faltten(layer, N, A, C, H, W): + layer = layer.view(N, -1, C, H, W) + layer = layer.permute(0, 3, 4, 1, 2) #N H W A C + layer = layer.reshape(N, -1, C) # N H*W*A C + return layer + +def decode_iou( rel_codes, boxes, num_p = 8): + """ + From a set of original boxes and encoded relative box offsets, + get the decoded boxes. + + Arguments: + rel_codes (Tensor): encoded boxes # predict [2, 12000, 4] + boxes (Tensor): reference boxes. 
# anchor [2, 12000, 4] xmin0 ymin1 xmax2 ymax3 + """ + boxes = boxes.to(rel_codes.dtype) + + TO_REMOVE = 1 # TODO remove + widths = boxes[:, 2] - boxes[:, 0] + TO_REMOVE + heights = boxes[:, 3] - boxes[:, 1] + TO_REMOVE + dx = rel_codes[:, 16] + dy = rel_codes[:, 17] + + ctr_x = boxes[:, 0] + 0.5 * widths + ctr_y = boxes[:, 1] + 0.5 * heights + + ctr_x_new = dx * widths * 0.5 + ctr_x + ctr_y_new = dy * heights * 0.5 + ctr_y + # 123 + # 8#4 + # 765 + if num_p == 8: # 8 boundary points + x_1 = boxes[:, 0] + widths * rel_codes[:, 0] + y_1 = boxes[:, 1] + heights * rel_codes[:, 1] + x_2 = ctr_x + widths * rel_codes[:, 2] + y_2 = boxes[:, 1] + heights * rel_codes[:, 3] + x_3 = boxes[:, 2] + widths * rel_codes[:, 4] + y_3 = boxes[:, 1] + heights * rel_codes[:, 5] + x_4 = boxes[:, 2] + widths * rel_codes[:, 6] + y_4 = ctr_y + heights * rel_codes[:, 7] + x_5 = boxes[:, 2] + widths * rel_codes[:, 8] + y_5 = boxes[:, 3] + heights * rel_codes[:, 9] + x_6 = ctr_x + widths * rel_codes[:, 10] + y_6 = boxes[:, 3] + heights * rel_codes[:, 11] + x_7 = boxes[:, 0] + widths * rel_codes[:, 12] + y_7 = boxes[:, 3] + heights * rel_codes[:, 13] + x_8 = boxes[:, 0] + widths * rel_codes[:, 14] + y_8 = ctr_y + heights * rel_codes[:, 15] + x_total = torch.stack([x_1, x_2, x_3, x_4, x_5, x_6, x_7, x_8], 0) # [8, N] + y_total = torch.stack([y_1, y_2, y_3, y_4, y_5, y_6, y_7, y_8], 0) + + x_min = torch.min(x_total, 0, keepdim=True) # [1, N] + x_max = torch.max(x_total, 0, keepdim=True) # [1, N] + y_min = torch.min(y_total, 0, keepdim=True) # [1, N] + y_max = torch.max(y_total, 0, keepdim=True) # [1, N] + + N1, N2 = x_min[0].shape + x_min = x_min[0].view([N2]) + x_max = x_max[0].view([N2]) + y_min = y_min[0].view([N2]) + y_max = y_max[0].view([N2]) + + x_min = torch.stack([x_min, ctr_x_new], 0) + x_max = torch.stack([x_max, ctr_x_new], 0) + y_min = torch.stack([y_min, ctr_y_new], 0) + y_max = torch.stack([y_max, ctr_y_new], 0) + + x_min = torch.min(x_min, 0, keepdim=True) # [1, N] + x_max = torch.max(x_max, 0, keepdim=True) # [1, N] + y_min = torch.min(y_min, 0, keepdim=True) # [1, N] + y_max = torch.max(y_max, 0, keepdim=True) # [1, N] + + pred_boxes = torch.zeros_like(boxes) + + pred_boxes[:, 0] = x_min[0][0, :] + pred_boxes[:, 1] = y_min[0][0, :] + pred_boxes[:, 2] = x_max[0][0, :] + pred_boxes[:, 3] = y_max[0][0, :] + + return pred_boxes \ No newline at end of file diff --git a/maskrcnn_benchmark/modeling/rpn/utils.py b/maskrcnn_benchmark/modeling/rpn/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..d29a5a7d97c56bc2ce60af3f562d40e5ed98125b --- /dev/null +++ b/maskrcnn_benchmark/modeling/rpn/utils.py @@ -0,0 +1,41 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +""" +Utility functions minipulating the prediction layers +""" + +from ..utils import cat + +import torch + +def permute_and_flatten(layer, N, A, C, H, W): + layer = layer.view(N, -1, C, H, W) + layer = layer.permute(0, 3, 4, 1, 2) #N H W A C + layer = layer.reshape(N, -1, C) # N H*W*A C + return layer + + +def concat_box_prediction_layers(box_cls, box_regression): + box_cls_flattened = [] + box_regression_flattened = [] + # for each feature level, permute the outputs to make them be in the + # same format as the labels. 
Note that the labels are computed for + # all feature levels concatenated, so we keep the same representation + # for the objectness and the box_regression + for box_cls_per_level, box_regression_per_level in zip( + box_cls, box_regression + ): + N, AxC, H, W = box_cls_per_level.shape + Ax4 = box_regression_per_level.shape[1] + A = 5 + C = AxC // A # 1 + + box_cls_per_level = permute_and_flatten( box_cls_per_level, N, A, C, H, W) + box_cls_flattened.append(box_cls_per_level) + box_regression_flattened.append(box_regression_per_level) + # concatenate on the first dimension (representing the feature levels), to + # take into account the way the labels were generated (with all feature maps + # being concatenated as well) + box_cls = cat(box_cls_flattened, dim=1).reshape(-1, C) + box_regression = cat(box_regression_flattened, dim=1).reshape(-1, 4) + + return box_cls, box_regression diff --git a/maskrcnn_benchmark/modeling/utils.py b/maskrcnn_benchmark/modeling/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..5b1d79a812ab3db034cf817583281c006b11b90a --- /dev/null +++ b/maskrcnn_benchmark/modeling/utils.py @@ -0,0 +1,16 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +""" +Miscellaneous utility functions +""" + +import torch + + +def cat(tensors, dim=0): + """ + Efficient version of torch.cat that avoids a copy if there is only a single element in a list + """ + assert isinstance(tensors, (list, tuple)) + if len(tensors) == 1: + return tensors[0] + return torch.cat(tensors, dim) diff --git a/maskrcnn_benchmark/solver/__init__.py b/maskrcnn_benchmark/solver/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..75f40530cccb6b989d33193de92a6c26a07cf751 --- /dev/null +++ b/maskrcnn_benchmark/solver/__init__.py @@ -0,0 +1,4 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +from .build import make_optimizer +from .build import make_lr_scheduler +from .lr_scheduler import WarmupMultiStepLR diff --git a/maskrcnn_benchmark/solver/__pycache__/__init__.cpython-37.pyc b/maskrcnn_benchmark/solver/__pycache__/__init__.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d989ebb97bd46ebece2fc664b09b180c85b10090 Binary files /dev/null and b/maskrcnn_benchmark/solver/__pycache__/__init__.cpython-37.pyc differ diff --git a/maskrcnn_benchmark/solver/__pycache__/build.cpython-37.pyc b/maskrcnn_benchmark/solver/__pycache__/build.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..dfad300d379e1097ee9470260ff9f55ef465b583 Binary files /dev/null and b/maskrcnn_benchmark/solver/__pycache__/build.cpython-37.pyc differ diff --git a/maskrcnn_benchmark/solver/__pycache__/lr_scheduler.cpython-37.pyc b/maskrcnn_benchmark/solver/__pycache__/lr_scheduler.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..fd084ad3466d7e25a3aac716d11bb65aed11e00f Binary files /dev/null and b/maskrcnn_benchmark/solver/__pycache__/lr_scheduler.cpython-37.pyc differ diff --git a/maskrcnn_benchmark/solver/build.py b/maskrcnn_benchmark/solver/build.py new file mode 100644 index 0000000000000000000000000000000000000000..865a4ec8d1b3d996b0618e3b2b77bd1b44acfa96 --- /dev/null +++ b/maskrcnn_benchmark/solver/build.py @@ -0,0 +1,31 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 
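The solver package that starts here builds the optimizer with one parameter group per tensor, so bias terms can receive their own learning rate (BASE_LR * BIAS_LR_FACTOR) and weight decay (WEIGHT_DECAY_BIAS). A minimal usage sketch, assuming a populated cfg and model and a hypothetical data_loader:

from maskrcnn_benchmark.solver import make_optimizer, make_lr_scheduler

optimizer = make_optimizer(cfg, model)         # per-parameter lr / weight decay
scheduler = make_lr_scheduler(cfg, optimizer)  # WarmupMultiStepLR, defined below

for iteration, batch in enumerate(data_loader):
    ...                                        # forward / backward pass elided
    optimizer.step()
    scheduler.step()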
+import torch + +from .lr_scheduler import WarmupMultiStepLR + + +def make_optimizer(cfg, model): + params = [] + for key, value in model.named_parameters(): + if not value.requires_grad: + continue + lr = cfg.SOLVER.BASE_LR + weight_decay = cfg.SOLVER.WEIGHT_DECAY + if "bias" in key: + lr = cfg.SOLVER.BASE_LR * cfg.SOLVER.BIAS_LR_FACTOR + weight_decay = cfg.SOLVER.WEIGHT_DECAY_BIAS + params += [{"params": [value], "lr": lr, "weight_decay": weight_decay}] + + optimizer = torch.optim.SGD(params, lr, momentum=cfg.SOLVER.MOMENTUM) + return optimizer + + +def make_lr_scheduler(cfg, optimizer): + return WarmupMultiStepLR( + optimizer, + cfg.SOLVER.STEPS, + cfg.SOLVER.GAMMA, + warmup_factor=cfg.SOLVER.WARMUP_FACTOR, + warmup_iters=cfg.SOLVER.WARMUP_ITERS, + warmup_method=cfg.SOLVER.WARMUP_METHOD, + ) diff --git a/maskrcnn_benchmark/solver/lr_scheduler.py b/maskrcnn_benchmark/solver/lr_scheduler.py new file mode 100644 index 0000000000000000000000000000000000000000..d7d45b6c6f98e66a5da5b8b84a50258a517bb1e4 --- /dev/null +++ b/maskrcnn_benchmark/solver/lr_scheduler.py @@ -0,0 +1,52 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +from bisect import bisect_right + +import torch + + +# FIXME ideally this would be achieved with a CombinedLRScheduler, +# separating MultiStepLR with WarmupLR +# but the current LRScheduler design doesn't allow it +class WarmupMultiStepLR(torch.optim.lr_scheduler._LRScheduler): + def __init__( + self, + optimizer, + milestones, + gamma=0.1, + warmup_factor=1.0 / 3, + warmup_iters=500, + warmup_method="linear", + last_epoch=-1, + ): + if not list(milestones) == sorted(milestones): + raise ValueError( + "Milestones should be a list of" " increasing integers. Got {}", + milestones, + ) + + if warmup_method not in ("constant", "linear"): + raise ValueError( + "Only 'constant' or 'linear' warmup_method accepted" + "got {}".format(warmup_method) + ) + self.milestones = milestones + self.gamma = gamma + self.warmup_factor = warmup_factor + self.warmup_iters = warmup_iters + self.warmup_method = warmup_method + super(WarmupMultiStepLR, self).__init__(optimizer, last_epoch) + + def get_lr(self): + warmup_factor = 1 + if self.last_epoch < self.warmup_iters: + if self.warmup_method == "constant": + warmup_factor = self.warmup_factor + elif self.warmup_method == "linear": + alpha = float(self.last_epoch) / self.warmup_iters + warmup_factor = self.warmup_factor * (1 - alpha) + alpha + return [ + base_lr + * warmup_factor + * self.gamma ** bisect_right(self.milestones, self.last_epoch) + for base_lr in self.base_lrs + ] diff --git a/maskrcnn_benchmark/structures/__init__.py b/maskrcnn_benchmark/structures/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/maskrcnn_benchmark/structures/__pycache__/__init__.cpython-37.pyc b/maskrcnn_benchmark/structures/__pycache__/__init__.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c3e9bf87ce7d981f3d0ac69bc1875c9aab3a82a7 Binary files /dev/null and b/maskrcnn_benchmark/structures/__pycache__/__init__.cpython-37.pyc differ diff --git a/maskrcnn_benchmark/structures/__pycache__/bounding_box.cpython-37.pyc b/maskrcnn_benchmark/structures/__pycache__/bounding_box.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..9fd7c3e37ffc1920db7a269caaaf0fed05e1ccb1 Binary files /dev/null and b/maskrcnn_benchmark/structures/__pycache__/bounding_box.cpython-37.pyc differ diff --git 
a/maskrcnn_benchmark/structures/__pycache__/boxlist_ops.cpython-37.pyc b/maskrcnn_benchmark/structures/__pycache__/boxlist_ops.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..20ccdf81f21989509a19735dcae158249fce70df Binary files /dev/null and b/maskrcnn_benchmark/structures/__pycache__/boxlist_ops.cpython-37.pyc differ diff --git a/maskrcnn_benchmark/structures/__pycache__/image_list.cpython-37.pyc b/maskrcnn_benchmark/structures/__pycache__/image_list.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ae2762abe21312d6e2dea08a90f2653734245050 Binary files /dev/null and b/maskrcnn_benchmark/structures/__pycache__/image_list.cpython-37.pyc differ diff --git a/maskrcnn_benchmark/structures/__pycache__/ke.cpython-37.pyc b/maskrcnn_benchmark/structures/__pycache__/ke.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c0901e970575ca48573ada6619d22555f8cbfd7e Binary files /dev/null and b/maskrcnn_benchmark/structures/__pycache__/ke.cpython-37.pyc differ diff --git a/maskrcnn_benchmark/structures/__pycache__/keypoint.cpython-37.pyc b/maskrcnn_benchmark/structures/__pycache__/keypoint.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..8761c854ca0519614cb9b6391718ea5214d8d7e2 Binary files /dev/null and b/maskrcnn_benchmark/structures/__pycache__/keypoint.cpython-37.pyc differ diff --git a/maskrcnn_benchmark/structures/__pycache__/mty.cpython-37.pyc b/maskrcnn_benchmark/structures/__pycache__/mty.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..4f2faaf1273900153149166e8a36f3e0293a5c67 Binary files /dev/null and b/maskrcnn_benchmark/structures/__pycache__/mty.cpython-37.pyc differ diff --git a/maskrcnn_benchmark/structures/__pycache__/segmentation_mask.cpython-37.pyc b/maskrcnn_benchmark/structures/__pycache__/segmentation_mask.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..1cbae8444905986fa9fcc3cc4bb1609ee6eeb6bb Binary files /dev/null and b/maskrcnn_benchmark/structures/__pycache__/segmentation_mask.cpython-37.pyc differ diff --git a/maskrcnn_benchmark/structures/bounding_box.py b/maskrcnn_benchmark/structures/bounding_box.py new file mode 100644 index 0000000000000000000000000000000000000000..5a1ecf746c1c6183d83d0613f0a13686ecb2a04b --- /dev/null +++ b/maskrcnn_benchmark/structures/bounding_box.py @@ -0,0 +1,271 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +import torch + +# transpose +FLIP_LEFT_RIGHT = 0 +FLIP_TOP_BOTTOM = 1 + + +class BoxList(object): + """ + This class represents a set of bounding boxes. + The bounding boxes are represented as a Nx4 Tensor. + In order to uniquely determine the bounding boxes with respect + to an image, we also store the corresponding image dimensions. + They can contain extra information that is specific to each bounding box, such as + labels. 
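A minimal usage sketch of the BoxList container whose methods follow below (coordinates and labels are illustrative):

import torch
from maskrcnn_benchmark.structures.bounding_box import BoxList

boxes = BoxList(torch.tensor([[0., 0., 10., 10.]]), image_size=(32, 32), mode="xyxy")
boxes.add_field("labels", torch.tensor([1]))
boxes_xywh = boxes.convert("xywh")   # widths/heights are pixel-inclusive (TO_REMOVE = 1), so w = h = 11
resized = boxes.resize((64, 64))     # coordinates scale by 2x; extra fields are carried along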
+ """ + + def __init__(self, bbox, image_size, mode="xyxy"): + device = bbox.device if isinstance(bbox, torch.Tensor) else torch.device("cpu") + bbox = torch.as_tensor(bbox, dtype=torch.float32, device=device) + if bbox.ndimension() != 2: + raise ValueError( + "bbox should have 2 dimensions, got {}".format(bbox.ndimension()) + ) + if bbox.size(-1) != 4: + raise ValueError( + "last dimension of bbox should have a " + "size of 4, got {}".format(bbox.size(-1)) + ) + if mode not in ("xyxy", "xywh"): + raise ValueError("mode should be 'xyxy' or 'xywh'") + + self.bbox = bbox + self.size = image_size # (image_width, image_height) + self.mode = mode + self.extra_fields = {} + + def add_field(self, field, field_data): + self.extra_fields[field] = field_data + + def get_field(self, field): + return self.extra_fields[field] + + def has_field(self, field): + return field in self.extra_fields + + def fields(self): + return list(self.extra_fields.keys()) + + def _copy_extra_fields(self, bbox): + for k, v in bbox.extra_fields.items(): + self.extra_fields[k] = v + + def convert(self, mode): + if mode not in ("xyxy", "xywh"): + raise ValueError("mode should be 'xyxy' or 'xywh'") + if mode == self.mode: + return self + # we only have two modes, so don't need to check + # self.mode + xmin, ymin, xmax, ymax = self._split_into_xyxy() + if mode == "xyxy": + bbox = torch.cat((xmin, ymin, xmax, ymax), dim=-1) + bbox = BoxList(bbox, self.size, mode=mode) + else: + TO_REMOVE = 1 + bbox = torch.cat( + (xmin, ymin, xmax - xmin + TO_REMOVE, ymax - ymin + TO_REMOVE), dim=-1 + ) + bbox = BoxList(bbox, self.size, mode=mode) + bbox._copy_extra_fields(self) + return bbox + + def _split_into_xyxy(self): + if self.mode == "xyxy": + xmin, ymin, xmax, ymax = self.bbox.split(1, dim=-1) + return xmin, ymin, xmax, ymax + elif self.mode == "xywh": + TO_REMOVE = 1 + xmin, ymin, w, h = self.bbox.split(1, dim=-1) + return ( + xmin, + ymin, + xmin + (w - TO_REMOVE).clamp(min=0), + ymin + (h - TO_REMOVE).clamp(min=0), + ) + else: + raise RuntimeError("Should not be here") + + def resize(self, size, *args, **kwargs): + """ + Returns a resized copy of this bounding box + + :param size: The requested size in pixels, as a 2-tuple: + (width, height). 
+ """ + + ratios = tuple(float(s) / float(s_orig) for s, s_orig in zip(size, self.size)) + if ratios[0] == ratios[1]: + ratio = ratios[0] + scaled_box = self.bbox * ratio + bbox = BoxList(scaled_box, size, mode=self.mode) + # bbox._copy_extra_fields(self) + for k, v in self.extra_fields.items(): + if not isinstance(v, torch.Tensor): + v = v.resize(size, *args, **kwargs) + bbox.add_field(k, v) + return bbox + + ratio_width, ratio_height = ratios + xmin, ymin, xmax, ymax = self._split_into_xyxy() + scaled_xmin = xmin * ratio_width + scaled_xmax = xmax * ratio_width + scaled_ymin = ymin * ratio_height + scaled_ymax = ymax * ratio_height + scaled_box = torch.cat( + (scaled_xmin, scaled_ymin, scaled_xmax, scaled_ymax), dim=-1 + ) + bbox = BoxList(scaled_box, size, mode="xyxy") + # bbox._copy_extra_fields(self) + for k, v in self.extra_fields.items(): + if not isinstance(v, torch.Tensor): + v = v.resize(size, *args, **kwargs) + bbox.add_field(k, v) + + return bbox.convert(self.mode) + + def transpose(self, method): + """ + Transpose bounding box (flip or rotate in 90 degree steps) + :param method: One of :py:attr:`PIL.Image.FLIP_LEFT_RIGHT`, + :py:attr:`PIL.Image.FLIP_TOP_BOTTOM`, :py:attr:`PIL.Image.ROTATE_90`, + :py:attr:`PIL.Image.ROTATE_180`, :py:attr:`PIL.Image.ROTATE_270`, + :py:attr:`PIL.Image.TRANSPOSE` or :py:attr:`PIL.Image.TRANSVERSE`. + """ + if method not in (FLIP_LEFT_RIGHT, FLIP_TOP_BOTTOM): + raise NotImplementedError( + "Only FLIP_LEFT_RIGHT and FLIP_TOP_BOTTOM implemented" + ) + + image_width, image_height = self.size + xmin, ymin, xmax, ymax = self._split_into_xyxy() + if method == FLIP_LEFT_RIGHT: + TO_REMOVE = 1 + transposed_xmin = image_width - xmax - TO_REMOVE + transposed_xmax = image_width - xmin - TO_REMOVE + transposed_ymin = ymin + transposed_ymax = ymax + elif method == FLIP_TOP_BOTTOM: + transposed_xmin = xmin + transposed_xmax = xmax + transposed_ymin = image_height - ymax + transposed_ymax = image_height - ymin + + transposed_boxes = torch.cat( + (transposed_xmin, transposed_ymin, transposed_xmax, transposed_ymax), dim=-1 + ) + bbox = BoxList(transposed_boxes, self.size, mode="xyxy") + # bbox._copy_extra_fields(self) + for k, v in self.extra_fields.items(): + if not isinstance(v, torch.Tensor): + v = v.transpose(method) + bbox.add_field(k, v) + return bbox.convert(self.mode) + + def crop(self, box, remove_empty=False): + """ + Cropss a rectangular region from this bounding box. The box is a + 4-tuple defining the left, upper, right, and lower pixel + coordinate. + """ + xmin, ymin, xmax, ymax = self._split_into_xyxy() + w, h = box[2] - box[0], box[3] - box[1] + cropped_xmin = (xmin - box[0]).clamp(min=0, max=w) + cropped_ymin = (ymin - box[1]).clamp(min=0, max=h) + cropped_xmax = (xmax - box[0]).clamp(min=0, max=w) + cropped_ymax = (ymax - box[1]).clamp(min=0, max=h) + + # TODO should I filter empty boxes here? 
+ if False: + is_empty = (cropped_xmin == cropped_xmax) | (cropped_ymin == cropped_ymax) + + cropped_box = torch.cat( + (cropped_xmin, cropped_ymin, cropped_xmax, cropped_ymax), dim=-1 + ) + bbox = BoxList(cropped_box, (w, h), mode="xyxy") + # bbox._copy_extra_fields(self) + for k, v in self.extra_fields.items(): + if not isinstance(v, torch.Tensor): + v = v.crop(box) + bbox.add_field(k, v) + + if remove_empty: + box = bbox.bbox + keep = (box[:, 3] > box[:, 1]) & (box[:, 2] > box[:, 0]) + bbox = bbox[keep] + return bbox.convert(self.mode) + + # Tensor-like methods + + def to(self, device): + bbox = BoxList(self.bbox.to(device), self.size, self.mode) + for k, v in self.extra_fields.items(): + if hasattr(v, "to"): + v = v.to(device) + bbox.add_field(k, v) + return bbox + + def __getitem__(self, item): + bbox = BoxList(self.bbox[item], self.size, self.mode) + for k, v in self.extra_fields.items(): + bbox.add_field(k, v[item]) + return bbox + + def __len__(self): + return self.bbox.shape[0] + + def clip_to_image(self, remove_empty=True): + TO_REMOVE = 1 + self.bbox[:, 0].clamp_(min=0, max=self.size[0] - TO_REMOVE) + self.bbox[:, 1].clamp_(min=0, max=self.size[1] - TO_REMOVE) + self.bbox[:, 2].clamp_(min=0, max=self.size[0] - TO_REMOVE) + self.bbox[:, 3].clamp_(min=0, max=self.size[1] - TO_REMOVE) + if remove_empty: + box = self.bbox + keep = (box[:, 3] > box[:, 1]) & (box[:, 2] > box[:, 0]) + return self[keep] + return self + + def area(self): + box = self.bbox + if self.mode == "xyxy": + TO_REMOVE = 1 + area = (box[:, 2] - box[:, 0] + TO_REMOVE) * (box[:, 3] - box[:, 1] + TO_REMOVE) + elif self.mode == "xywh": + area = box[:, 2] * box[:, 3] + else: + raise RuntimeError("Should not be here") + + return area + + def copy_with_fields(self, fields, skip_missing=False): + bbox = BoxList(self.bbox, self.size, self.mode) + if not isinstance(fields, (list, tuple)): + fields = [fields] + for field in fields: + if self.has_field(field): + bbox.add_field(field, self.get_field(field)) + elif not skip_missing: + raise KeyError("Field '{}' not found in {}".format(field, self)) + return bbox + + def __repr__(self): + s = self.__class__.__name__ + "(" + s += "num_boxes={}, ".format(len(self)) + s += "image_width={}, ".format(self.size[0]) + s += "image_height={}, ".format(self.size[1]) + s += "mode={})".format(self.mode) + return s + + +if __name__ == "__main__": + bbox = BoxList([[0, 0, 10, 10], [0, 0, 5, 5]], (10, 10)) + s_bbox = bbox.resize((5, 5)) + print(s_bbox) + print(s_bbox.bbox) + + t_bbox = bbox.transpose(0) + print(t_bbox) + print(t_bbox.bbox) diff --git a/maskrcnn_benchmark/structures/boxlist_ops.py b/maskrcnn_benchmark/structures/boxlist_ops.py new file mode 100644 index 0000000000000000000000000000000000000000..dc51212f4ff7abc6d978df75d3de44f956f38f67 --- /dev/null +++ b/maskrcnn_benchmark/structures/boxlist_ops.py @@ -0,0 +1,128 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +import torch + +from .bounding_box import BoxList + +from maskrcnn_benchmark.layers import nms as _box_nms + + +def boxlist_nms(boxlist, nms_thresh, max_proposals=-1, score_field="scores"): + """ + Performs non-maximum suppression on a boxlist, with scores specified + in a boxlist field via score_field. 
+ + Arguments: + boxlist(BoxList) + nms_thresh (float) + max_proposals (int): if > 0, then only the top max_proposals are kept + after non-maximum suppression + score_field (str) + """ + if nms_thresh <= 0: + return boxlist + mode = boxlist.mode + boxlist = boxlist.convert("xyxy") + boxes = boxlist.bbox + score = boxlist.get_field(score_field) + keep = _box_nms(boxes, score, nms_thresh) + if max_proposals > 0: + keep = keep[: max_proposals] + boxlist = boxlist[keep] + return boxlist.convert(mode) + + +def remove_small_boxes(boxlist, min_size): + """ + Only keep boxes with both sides >= min_size + + Arguments: + boxlist (Boxlist) + min_size (int) + """ + # TODO maybe add an API for querying the ws / hs + xywh_boxes = boxlist.convert("xywh").bbox + _, _, ws, hs = xywh_boxes.unbind(dim=1) + keep = ( + (ws >= min_size) & (hs >= min_size) + ).nonzero().squeeze(1) + return boxlist[keep] + + +# implementation from https://github.com/kuangliu/torchcv/blob/master/torchcv/utils/box.py +# with slight modifications +def boxlist_iou(boxlist1, boxlist2): + """Compute the intersection over union of two set of boxes. + The box order must be (xmin, ymin, xmax, ymax). + + Arguments: + box1: (BoxList) bounding boxes, sized [N,4]. + box2: (BoxList) bounding boxes, sized [M,4]. + + Returns: + (tensor) iou, sized [N,M]. + + Reference: + https://github.com/chainer/chainercv/blob/master/chainercv/utils/bbox/bbox_iou.py + """ + if boxlist1.size != boxlist2.size: + raise RuntimeError( + "boxlists should have same image size, got {}, {}".format(boxlist1, boxlist2)) + + N = len(boxlist1) + M = len(boxlist2) + + area1 = boxlist1.area() + area2 = boxlist2.area() + + box1, box2 = boxlist1.bbox, boxlist2.bbox + + lt = torch.max(box1[:, None, :2], box2[:, :2]) # [N,M,2] + rb = torch.min(box1[:, None, 2:], box2[:, 2:]) # [N,M,2] + + TO_REMOVE = 1 + + wh = (rb - lt + TO_REMOVE).clamp(min=0) # [N,M,2] + inter = wh[:, :, 0] * wh[:, :, 1] # [N,M] + + iou = inter / (area1[:, None] + area2 - inter) + return iou + + +# TODO redundant, remove +def _cat(tensors, dim=0): + """ + Efficient version of torch.cat that avoids a copy if there is only a single element in a list + """ + assert isinstance(tensors, (list, tuple)) + if len(tensors) == 1: + return tensors[0] + return torch.cat(tensors, dim) + + +def cat_boxlist(bboxes): + """ + Concatenates a list of BoxList (having the same image size) into a + single BoxList + + Arguments: + bboxes (list[BoxList]) + """ + assert isinstance(bboxes, (list, tuple)) + assert all(isinstance(bbox, BoxList) for bbox in bboxes) + + size = bboxes[0].size + assert all(bbox.size == size for bbox in bboxes) + + mode = bboxes[0].mode + assert all(bbox.mode == mode for bbox in bboxes) + + fields = set(bboxes[0].fields()) + assert all(set(bbox.fields()) == fields for bbox in bboxes) + + cat_boxes = BoxList(_cat([bbox.bbox for bbox in bboxes], dim=0), size, mode) + + for field in fields: + data = _cat([bbox.get_field(field) for bbox in bboxes], dim=0) + cat_boxes.add_field(field, data) + + return cat_boxes diff --git a/maskrcnn_benchmark/structures/image_list.py b/maskrcnn_benchmark/structures/image_list.py new file mode 100644 index 0000000000000000000000000000000000000000..590b87a65a23aa94234022bcc530cb00e1e25b47 --- /dev/null +++ b/maskrcnn_benchmark/structures/image_list.py @@ -0,0 +1,72 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 
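A small usage sketch of the boxlist_ops helpers defined above (box coordinates are illustrative):

import torch
from maskrcnn_benchmark.structures.bounding_box import BoxList
from maskrcnn_benchmark.structures.boxlist_ops import boxlist_iou, cat_boxlist

a = BoxList(torch.tensor([[0., 0., 10., 10.]]), (32, 32), mode="xyxy")
b = BoxList(torch.tensor([[5., 5., 15., 15.]]), (32, 32), mode="xyxy")
iou = boxlist_iou(a, b)        # [1, 1] tensor; both boxlists must share the same image size
merged = cat_boxlist([a, b])   # single BoxList with two boxes (fields must match across inputs)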
+from __future__ import division + +import torch + + +class ImageList(object): + """ + Structure that holds a list of images (of possibly + varying sizes) as a single tensor. + This works by padding the images to the same size, + and storing in a field the original sizes of each image + """ + + def __init__(self, tensors, image_sizes): + """ + Arguments: + tensors (tensor) + image_sizes (list[tuple[int, int]]) + """ + self.tensors = tensors + self.image_sizes = image_sizes + + def to(self, *args, **kwargs): + cast_tensor = self.tensors.to(*args, **kwargs) + return ImageList(cast_tensor, self.image_sizes) + + +def to_image_list(tensors, size_divisible=0): + """ + tensors can be an ImageList, a torch.Tensor or + an iterable of Tensors. It can't be a numpy array. + When tensors is an iterable of Tensors, it pads + the Tensors with zeros so that they have the same + shape + """ + if isinstance(tensors, torch.Tensor) and size_divisible > 0: + tensors = [tensors] + + if isinstance(tensors, ImageList): + return tensors + elif isinstance(tensors, torch.Tensor): + # single tensor shape can be inferred + if tensors.dim() == 3: + tensors = tensors[None] + assert tensors.dim() == 4 + image_sizes = [tensor.shape[-2:] for tensor in tensors] + return ImageList(tensors, image_sizes) + elif isinstance(tensors, (tuple, list)): + max_size = tuple(max(s) for s in zip(*[img.shape for img in tensors])) + + # TODO Ideally, just remove this and let me model handle arbitrary + # input sizs + if size_divisible > 0: + import math + + stride = size_divisible + max_size = list(max_size) + max_size[1] = int(math.ceil(max_size[1] / stride) * stride) + max_size[2] = int(math.ceil(max_size[2] / stride) * stride) + max_size = tuple(max_size) + + batch_shape = (len(tensors),) + max_size + batched_imgs = tensors[0].new(*batch_shape).zero_() + for img, pad_img in zip(tensors, batched_imgs): + pad_img[: img.shape[0], : img.shape[1], : img.shape[2]].copy_(img) + + image_sizes = [im.shape[-2:] for im in tensors] + + return ImageList(batched_imgs, image_sizes) + else: + raise TypeError("Unsupported type for to_image_list: {}".format(type(tensors))) diff --git a/maskrcnn_benchmark/structures/ke.py b/maskrcnn_benchmark/structures/ke.py new file mode 100644 index 0000000000000000000000000000000000000000..41e777e803f0ad206bb7042ddd35395c107a84eb --- /dev/null +++ b/maskrcnn_benchmark/structures/ke.py @@ -0,0 +1,164 @@ +import torch + + +# transpose +FLIP_LEFT_RIGHT = 0 +FLIP_TOP_BOTTOM = 1 + +class KES(object): + def __init__(self, kes, size, mode=None): + # FIXME remove check once we have better integration with device + # in my version this would consistently return a CPU tensor + device = kes.device if isinstance(kes, torch.Tensor) else torch.device('cpu') + kes = torch.as_tensor(kes, dtype=torch.float32, device=device) + if len(kes.size()) == 2: + kes = kes.unsqueeze(2) + if not kes.size()[0] ==0: + assert(kes.size()[-2] == 12), str(kes.size()) # 12kes + + num_kes = kes.shape[0] + kes_x = kes[:, :6, 0] # 4+2=6 + kes_y = kes[:, 6:, 0] + # TODO remove once support or zero in dim is in + if not kes.size()[0] ==0: + assert(kes_x.size() == kes_y.size()), str(kes_x.size())+' '+str(kes_y.size()) + + if num_kes > 0: + kes = kes.view(num_kes, -1, 1) + kes_x = kes_x.view(num_kes, -1, 1) + kes_y = kes_y.view(num_kes, -1, 1) + + # TODO should I split them? 
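# Each instance carries 12 values: the first six are x-entries (meanx, xmin, x2,
# x3, xmax, cx -- see textKES.NAMES below) and the last six are the corresponding
# y-entries; kes_x / kes_y keep the two halves apart for resizing and flipping.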
+ self.kes = kes + self.kes_x = kes_x + self.kes_y = kes_y + + self.size = size + self.mode = mode + + def crop(self, box): + w, h = box[2] - box[0], box[3] - box[1] + k = self.kes.clone() + k[:, :6, 0] -= box[0] + k[:, 6:, 0] -= box[1] + return type(self)(k, (w, h), self.mode) + + def resize(self, size, *args, **kwargs): + ratios = tuple(float(s) / float(s_orig) for s, s_orig in zip(size, self.size)) + ratio_w, ratio_h = ratios + resized_data_x = self.kes_x.clone() + resized_data_x[..., :] *= ratio_w + + resized_data_y = self.kes_y.clone() + resized_data_y[..., :] *= ratio_h + + resized_data = torch.cat((resized_data_x, resized_data_y), dim=-2) + return type(self)(resized_data, size, self.mode) + + def transpose(self, method): + if method not in (FLIP_LEFT_RIGHT,): + raise NotImplementedError( + "Only FLIP_LEFT_RIGHT implemented") + + flip_inds = type(self).FLIP_INDS + flipped_data_x = self.kes_x[:, flip_inds] + width = self.size[0] + TO_REMOVE = 1 + # Flip x coordinates + flipped_data_x[..., :] = width - flipped_data_x[..., :] - TO_REMOVE + + flipped_data_y = self.kes_y.clone() + flipped_data = torch.cat((flipped_data_x, flipped_data_y), dim=-2) + return type(self)(flipped_data, self.size, self.mode) + + def to(self, *args, **kwargs): + return type(self)(self.kes.to(*args, **kwargs), self.size, self.mode) + + def __getitem__(self, item): + return type(self)(self.kes[item], self.size, self.mode) + + def __repr__(self): + s = self.__class__.__name__ + '(' + s += 'num_instances_x={}, '.format(len(self.kes_x)) + s += 'num_instances_y={}, '.format(len(self.kes_y)) + s += 'image_width={}, '.format(self.size[0]) + s += 'image_height={})'.format(self.size[1]) + return s + + +def _create_flip_indices(names, flip_map): + full_flip_map = flip_map.copy() + full_flip_map.update({v: k for k, v in flip_map.items()}) + flipped_names = [i if i not in full_flip_map else full_flip_map[i] for i in names] + flip_indices = [names.index(i) for i in flipped_names] + return torch.tensor(flip_indices) + + +class textKES(KES): + NAMES = [ # x and y + 'meanx', + 'xmin', + 'x2', + 'x3', + 'xmax', + 'cx' + # 'meany', + # 'ymin', + # 'y2', + # 'y3', + # 'ymax', + # 'cy' + ] + FLIP_MAP = { + 'xmin': 'xmax', + 'x2': 'x3', + } + + +# TODO this doesn't look great +textKES.FLIP_INDS = _create_flip_indices(textKES.NAMES, textKES.FLIP_MAP) + + +# TODO make this nicer, this is a direct translation from C2 (but removing the inner loop) +def kes_to_heat_map(kes_x, kes_y, mty, rois, heatmap_size): + if rois.numel() == 0: + return rois.new().long(), rois.new().long() + offset_x = rois[:, 0] + offset_y = rois[:, 1] + scale_x = heatmap_size / (rois[:, 2] - rois[:, 0]) + scale_y = heatmap_size / (rois[:, 3] - rois[:, 1]) + + offset_x = offset_x[:, None] + offset_y = offset_y[:, None] + scale_x = scale_x[:, None] + scale_y = scale_y[:, None] + + x = kes_x[..., 0] + y = kes_y[..., 0] + + x_boundary_inds = x == rois[:, 2][:, None] + y_boundary_inds = y == rois[:, 3][:, None] + + x = (x - offset_x) * scale_x + x = x.floor().long() + y = (y - offset_y) * scale_y + y = y.floor().long() + + x[x_boundary_inds] = heatmap_size - 1 + y[y_boundary_inds] = heatmap_size - 1 + + valid_loc_x = (x >= 0) & (x < heatmap_size) + valid_x = (valid_loc_x).long() + + valid_loc_y = (y >= 0) & (y < heatmap_size) + valid_y = (valid_loc_y).long() + + valid_mty = ((x >= 0) & (x < heatmap_size)) & ((y >= 0) & (y < heatmap_size)) + valid_mty = valid_mty.sum(dim=1)>0 + valid_mty = (valid_mty).long() + + heatmap_x = x + heatmap_y = y + + mty = mty + return 
heatmap_x, heatmap_y, valid_x, valid_y, mty, valid_mty diff --git a/maskrcnn_benchmark/structures/keypoint.py b/maskrcnn_benchmark/structures/keypoint.py new file mode 100644 index 0000000000000000000000000000000000000000..a6881f72f4f757855105638f2f7a9fca81760bb7 --- /dev/null +++ b/maskrcnn_benchmark/structures/keypoint.py @@ -0,0 +1,188 @@ +import torch + + +# transpose +FLIP_LEFT_RIGHT = 0 +FLIP_TOP_BOTTOM = 1 + +class Keypoints(object): + def __init__(self, keypoints, size, mode=None): + # FIXME remove check once we have better integration with device + # in my version this would consistently return a CPU tensor + device = keypoints.device if isinstance(keypoints, torch.Tensor) else torch.device('cpu') + keypoints = torch.as_tensor(keypoints, dtype=torch.float32, device=device) + num_keypoints = keypoints.shape[0] + if num_keypoints: + keypoints = keypoints.view(num_keypoints, -1, 3) + + # TODO should I split them? + # self.visibility = keypoints[..., 2] + self.keypoints = keypoints# [..., :2] + + self.size = size + self.mode = mode + self.extra_fields = {} + + def crop(self, box): + raise NotImplementedError() + + def resize(self, size, *args, **kwargs): + ratios = tuple(float(s) / float(s_orig) for s, s_orig in zip(size, self.size)) + ratio_w, ratio_h = ratios + resized_data = self.keypoints.clone() + resized_data[..., 0] *= ratio_w + resized_data[..., 1] *= ratio_h + keypoints = type(self)(resized_data, size, self.mode) + for k, v in self.extra_fields.items(): + keypoints.add_field(k, v) + return keypoints + + def transpose(self, method): + if method not in (FLIP_LEFT_RIGHT,): + raise NotImplementedError( + "Only FLIP_LEFT_RIGHT implemented") + + flip_inds = type(self).FLIP_INDS + flipped_data = self.keypoints[:, flip_inds] + width = self.size[0] + TO_REMOVE = 1 + # Flip x coordinates + flipped_data[..., 0] = width - flipped_data[..., 0] - TO_REMOVE + + # Maintain COCO convention that if visibility == 0, then x, y = 0 + inds = flipped_data[..., 2] == 0 + flipped_data[inds] = 0 + + keypoints = type(self)(flipped_data, self.size, self.mode) + for k, v in self.extra_fields.items(): + keypoints.add_field(k, v) + return keypoints + + def to(self, *args, **kwargs): + keypoints = type(self)(self.keypoints.to(*args, **kwargs), self.size, self.mode) + for k, v in self.extra_fields.items(): + if hasattr(v, "to"): + v = v.to(*args, **kwargs) + keypoints.add_field(k, v) + return keypoints + + def __getitem__(self, item): + keypoints = type(self)(self.keypoints[item], self.size, self.mode) + for k, v in self.extra_fields.items(): + keypoints.add_field(k, v[item]) + return keypoints + + def add_field(self, field, field_data): + self.extra_fields[field] = field_data + + def get_field(self, field): + return self.extra_fields[field] + + def __repr__(self): + s = self.__class__.__name__ + '(' + s += 'num_instances={}, '.format(len(self.keypoints)) + s += 'image_width={}, '.format(self.size[0]) + s += 'image_height={})'.format(self.size[1]) + return s + + +def _create_flip_indices(names, flip_map): + full_flip_map = flip_map.copy() + full_flip_map.update({v: k for k, v in flip_map.items()}) + flipped_names = [i if i not in full_flip_map else full_flip_map[i] for i in names] + flip_indices = [names.index(i) for i in flipped_names] + return torch.tensor(flip_indices) + + +class PersonKeypoints(Keypoints): + NAMES = [ + 'nose', + 'left_eye', + 'right_eye', + 'left_ear', + 'right_ear', + 'left_shoulder', + 'right_shoulder', + 'left_elbow', + 'right_elbow', + 'left_wrist', + 'right_wrist', + 'left_hip', + 
'right_hip', + 'left_knee', + 'right_knee', + 'left_ankle', + 'right_ankle' + ] + FLIP_MAP = { + 'left_eye': 'right_eye', + 'left_ear': 'right_ear', + 'left_shoulder': 'right_shoulder', + 'left_elbow': 'right_elbow', + 'left_wrist': 'right_wrist', + 'left_hip': 'right_hip', + 'left_knee': 'right_knee', + 'left_ankle': 'right_ankle' + } + + +# TODO this doesn't look great +PersonKeypoints.FLIP_INDS = _create_flip_indices(PersonKeypoints.NAMES, PersonKeypoints.FLIP_MAP) +def kp_connections(keypoints): + kp_lines = [ + [keypoints.index('left_eye'), keypoints.index('right_eye')], + [keypoints.index('left_eye'), keypoints.index('nose')], + [keypoints.index('right_eye'), keypoints.index('nose')], + [keypoints.index('right_eye'), keypoints.index('right_ear')], + [keypoints.index('left_eye'), keypoints.index('left_ear')], + [keypoints.index('right_shoulder'), keypoints.index('right_elbow')], + [keypoints.index('right_elbow'), keypoints.index('right_wrist')], + [keypoints.index('left_shoulder'), keypoints.index('left_elbow')], + [keypoints.index('left_elbow'), keypoints.index('left_wrist')], + [keypoints.index('right_hip'), keypoints.index('right_knee')], + [keypoints.index('right_knee'), keypoints.index('right_ankle')], + [keypoints.index('left_hip'), keypoints.index('left_knee')], + [keypoints.index('left_knee'), keypoints.index('left_ankle')], + [keypoints.index('right_shoulder'), keypoints.index('left_shoulder')], + [keypoints.index('right_hip'), keypoints.index('left_hip')], + ] + return kp_lines +PersonKeypoints.CONNECTIONS = kp_connections(PersonKeypoints.NAMES) + + +# TODO make this nicer, this is a direct translation from C2 (but removing the inner loop) +def keypoints_to_heat_map(keypoints, rois, heatmap_size): + if rois.numel() == 0: + return rois.new().long(), rois.new().long() + offset_x = rois[:, 0] + offset_y = rois[:, 1] + scale_x = heatmap_size / (rois[:, 2] - rois[:, 0]) + scale_y = heatmap_size / (rois[:, 3] - rois[:, 1]) + + offset_x = offset_x[:, None] + offset_y = offset_y[:, None] + scale_x = scale_x[:, None] + scale_y = scale_y[:, None] + + x = keypoints[..., 0] + y = keypoints[..., 1] + + x_boundary_inds = x == rois[:, 2][:, None] + y_boundary_inds = y == rois[:, 3][:, None] + + x = (x - offset_x) * scale_x + x = x.floor().long() + y = (y - offset_y) * scale_y + y = y.floor().long() + + x[x_boundary_inds] = heatmap_size - 1 + y[y_boundary_inds] = heatmap_size - 1 + + valid_loc = (x >= 0) & (y >= 0) & (x < heatmap_size) & (y < heatmap_size) + vis = keypoints[..., 2] > 0 + valid = (valid_loc & vis).long() + + lin_ind = y * heatmap_size + x + heatmaps = lin_ind * valid + + return heatmaps, valid diff --git a/maskrcnn_benchmark/structures/mty.py b/maskrcnn_benchmark/structures/mty.py new file mode 100644 index 0000000000000000000000000000000000000000..f5758683d1958a8b02fe7b8ffff0193d1236169f --- /dev/null +++ b/maskrcnn_benchmark/structures/mty.py @@ -0,0 +1,59 @@ +import torch + +# transpose +FLIP_LEFT_RIGHT = 0 +FLIP_TOP_BOTTOM = 1 + +all_types = [[1,2,3,4],[1,2,4,3],[1,3,2,4],[1,3,4,2],[1,4,2,3],[1,4,3,2],\ + [2,1,3,4],[2,1,4,3],[2,3,1,4],[2,3,4,1],[2,4,1,3],[2,4,3,1],\ + [3,1,2,4],[3,1,4,2],[3,2,1,4],[3,2,4,1],[3,4,1,2],[3,4,2,1],\ + [4,1,2,3],[4,1,3,2],[4,2,1,3],[4,2,3,1],[4,3,1,2],[4,3,2,1]] +aty= [[all_types[iat][0]-1,all_types[iat][1]-1,all_types[iat][2]-1,all_types[iat][3]-1] for iat in range(24)] + +class MTY(object): + def __init__(self, mty, size, mode=None): + # FIXME remove check once we have better integration with device + # in my version this would consistently 
return a CPU tensor + device = mty.device if isinstance(mty, torch.Tensor) else torch.device('cpu') + mty = torch.as_tensor(mty, dtype=torch.int64, device=device) + + # TODO should I split them? + assert(len(mty.size()) == 1), str(mty.size()) + self.mty = mty + + self.size = size + self.mode = mode + + def crop(self, box): + w, h = box[2] - box[0], box[3] - box[1] + return type(self)(self.mty, (w, h), self.mode) + + def resize(self, size, *args, **kwargs): + return type(self)(self.mty, size, self.mode) + + def transpose(self, method): + if method not in (FLIP_LEFT_RIGHT,): + raise NotImplementedError( + "Only FLIP_LEFT_RIGHT implemented") + + flipped_data = self.mty.clone() + for i in range(self.mty.size()[0]): + revs = [it for it in aty[self.mty[i]]] + revs.reverse() + flip_type = aty.index(revs) + flipped_data[i] = flip_type + + return type(self)(flipped_data, self.size, self.mode) + + def to(self, *args, **kwargs): + return type(self)(self.mty.to(*args, **kwargs), self.size, self.mode) + + def __getitem__(self, item): + return type(self)(self.mty[item], self.size, self.mode) + + def __repr__(self): + s = self.__class__.__name__ + '(' + s += 'num_instances={}, '.format(len(self.mty)) + s += 'image_width={}, '.format(self.size[0]) + s += 'image_height={})'.format(self.size[1]) + return s diff --git a/maskrcnn_benchmark/structures/segmentation_mask.py b/maskrcnn_benchmark/structures/segmentation_mask.py new file mode 100644 index 0000000000000000000000000000000000000000..5e1ba07767df487c9b4cccca4a87540a4bce3b99 --- /dev/null +++ b/maskrcnn_benchmark/structures/segmentation_mask.py @@ -0,0 +1,535 @@ +import cv2 +import copy +import torch +import numpy as np +from maskrcnn_benchmark.layers.misc import interpolate + +import pycocotools.mask as mask_utils + +# transpose +FLIP_LEFT_RIGHT = 0 +FLIP_TOP_BOTTOM = 1 + + +""" ABSTRACT +Segmentations come in either: +1) Binary masks +2) Polygons + +Binary masks can be represented in a contiguous array +and operations can be carried out more efficiently, +therefore BinaryMaskList handles them together. + +Polygons are handled separately for each instance, +by PolygonInstance and instances are handled by +PolygonList. + +SegmentationList is supposed to represent both, +therefore it wraps the functions of BinaryMaskList +and PolygonList to make it transparent. +""" + + +class BinaryMaskList(object): + """ + This class handles binary masks for all objects in the image + """ + + def __init__(self, masks, size): + """ + Arguments: + masks: Either torch.tensor of [num_instances, H, W] + or list of torch.tensors of [H, W] with num_instances elems, + or RLE (Run Length Encoding) - interpreted as list of dicts, + or BinaryMaskList. + size: absolute image size, width first + + After initialization, a hard copy will be made, to leave the + initializing source data intact. 
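A minimal usage sketch of the segmentation structures introduced in the module docstring above (SegmentationMask itself is defined at the end of this file; the polygon values are illustrative):

import torch
from maskrcnn_benchmark.structures.segmentation_mask import SegmentationMask, FLIP_LEFT_RIGHT

polys = [[[0., 0., 10., 0., 10., 10., 0., 10.]]]   # one instance made of one 4-point polygon
segm = SegmentationMask(polys, size=(32, 32), mode="poly")
rasterized = segm.convert("mask")                  # wraps a BinaryMaskList internally
flipped = segm.transpose(FLIP_LEFT_RIGHT)
cropped = segm.crop((0, 0, 16, 16))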
+ """ + + if isinstance(masks, torch.Tensor): + # The raw data representation is passed as argument + masks = masks.clone() + elif isinstance(masks, (list, tuple)): + if isinstance(masks[0], torch.Tensor): + masks = torch.stack(masks, dim=2).clone() + elif isinstance(masks[0], dict) and "count" in masks[0]: + # RLE interpretation + + masks = mask_utils + else: + RuntimeError( + "Type of `masks[0]` could not be interpreted: %s" % type(masks) + ) + elif isinstance(masks, BinaryMaskList): + # just hard copy the BinaryMaskList instance's underlying data + masks = masks.masks.clone() + else: + RuntimeError( + "Type of `masks` argument could not be interpreted:%s" % type(masks) + ) + + if len(masks.shape) == 2: + # if only a single instance mask is passed + masks = masks[None] + + assert len(masks.shape) == 3 + assert masks.shape[1] == size[1], "%s != %s" % (masks.shape[1], size[1]) + assert masks.shape[2] == size[0], "%s != %s" % (masks.shape[2], size[0]) + + self.masks = masks + self.size = tuple(size) + + def transpose(self, method): + dim = 1 if method == FLIP_TOP_BOTTOM else 2 + flipped_masks = self.masks.flip(dim) + return BinaryMaskList(flipped_masks, self.size) + + def crop(self, box): + assert isinstance(box, (list, tuple, torch.Tensor)), str(type(box)) + # box is assumed to be xyxy + current_width, current_height = self.size + xmin, ymin, xmax, ymax = [round(float(b)) for b in box] + + assert xmin <= xmax and ymin <= ymax, str(box) + xmin = min(max(xmin, 0), current_width - 1) + ymin = min(max(ymin, 0), current_height - 1) + + xmax = min(max(xmax, 0), current_width) + ymax = min(max(ymax, 0), current_height) + + xmax = max(xmax, xmin + 1) + ymax = max(ymax, ymin + 1) + + width, height = xmax - xmin, ymax - ymin + cropped_masks = self.masks[:, ymin:ymax, xmin:xmax] + cropped_size = width, height + return BinaryMaskList(cropped_masks, cropped_size) + + def resize(self, size): + try: + iter(size) + except TypeError: + assert isinstance(size, (int, float)) + size = size, size + width, height = map(int, size) + + assert width > 0 + assert height > 0 + + # Height comes first here! 
+ resized_masks = torch.nn.functional.interpolate( + input=self.masks[None].float(), + size=(height, width), + mode="bilinear", + align_corners=False, + )[0].type_as(self.masks) + resized_size = width, height + return BinaryMaskList(resized_masks, resized_size) + + def convert_to_polygon(self): + contours = self._findContours() + return PolygonList(contours, self.size) + + def to(self, *args, **kwargs): + return self + + def _findContours(self): + contours = [] + masks = self.masks.detach().numpy() + for mask in masks: + mask = cv2.UMat(mask) + contour, hierarchy = cv2.findContours( + mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_TC89_L1 + ) + + reshaped_contour = [] + for entity in contour: + assert len(entity.shape) == 3 + assert entity.shape[1] == 1, "Hierarchical contours are not allowed" + reshaped_contour.append(entity.reshape(-1).tolist()) + contours.append(reshaped_contour) + return contours + + def __len__(self): + return len(self.masks) + + def __getitem__(self, index): + # Probably it can cause some overhead + # but preserves consistency + masks = self.masks[index].clone() + return BinaryMaskList(masks, self.size) + + def __iter__(self): + return iter(self.masks) + + def __repr__(self): + s = self.__class__.__name__ + "(" + s += "num_instances={}, ".format(len(self.masks)) + s += "image_width={}, ".format(self.size[0]) + s += "image_height={})".format(self.size[1]) + return s + + +class PolygonInstance(object): + """ + This class holds a set of polygons that represents a single instance + of an object mask. The object can be represented as a set of + polygons + """ + + def __init__(self, polygons, size): + """ + Arguments: + a list of lists of numbers. + The first level refers to all the polygons that compose the + object, and the second level to the polygon coordinates. + """ + if isinstance(polygons, (list, tuple)): + valid_polygons = [] + for p in polygons: + p = torch.as_tensor(p, dtype=torch.float32) + if len(p) >= 6: # 3 * 2 coordinates + valid_polygons.append(p) + polygons = valid_polygons + + elif isinstance(polygons, PolygonInstance): + polygons = copy.copy(polygons.polygons) + else: + RuntimeError( + "Type of argument `polygons` is not allowed:%s" % (type(polygons)) + ) + + """ This crashes the training way too many times... 
+ for p in polygons: + assert p[::2].min() >= 0 + assert p[::2].max() < size[0] + assert p[1::2].min() >= 0 + assert p[1::2].max() , size[1] + """ + + self.polygons = polygons + self.size = tuple(size) + + def transpose(self, method): + if method not in (FLIP_LEFT_RIGHT, FLIP_TOP_BOTTOM): + raise NotImplementedError( + "Only FLIP_LEFT_RIGHT and FLIP_TOP_BOTTOM implemented" + ) + + flipped_polygons = [] + width, height = self.size + if method == FLIP_LEFT_RIGHT: + dim = width + idx = 0 + elif method == FLIP_TOP_BOTTOM: + dim = height + idx = 1 + + for poly in self.polygons: + p = poly.clone() + TO_REMOVE = 1 + p[idx::2] = dim - poly[idx::2] - TO_REMOVE + flipped_polygons.append(p) + + return PolygonInstance(flipped_polygons, size=self.size) + + def crop(self, box): + assert isinstance(box, (list, tuple, torch.Tensor)), str(type(box)) + + # box is assumed to be xyxy + current_width, current_height = self.size + xmin, ymin, xmax, ymax = map(float, box) + + assert xmin <= xmax and ymin <= ymax, str(box) + xmin = min(max(xmin, 0), current_width - 1) + ymin = min(max(ymin, 0), current_height - 1) + + xmax = min(max(xmax, 0), current_width) + ymax = min(max(ymax, 0), current_height) + + xmax = max(xmax, xmin + 1) + ymax = max(ymax, ymin + 1) + + w, h = xmax - xmin, ymax - ymin + + cropped_polygons = [] + for poly in self.polygons: + p = poly.clone() + p[0::2] = p[0::2] - xmin # .clamp(min=0, max=w) + p[1::2] = p[1::2] - ymin # .clamp(min=0, max=h) + cropped_polygons.append(p) + + return PolygonInstance(cropped_polygons, size=(w, h)) + + def resize(self, size): + try: + iter(size) + except TypeError: + assert isinstance(size, (int, float)) + size = size, size + + ratios = tuple(float(s) / float(s_orig) for s, s_orig in zip(size, self.size)) + + if ratios[0] == ratios[1]: + ratio = ratios[0] + scaled_polys = [p * ratio for p in self.polygons] + return PolygonInstance(scaled_polys, size) + + ratio_w, ratio_h = ratios + scaled_polygons = [] + for poly in self.polygons: + p = poly.clone() + p[0::2] *= ratio_w + p[1::2] *= ratio_h + scaled_polygons.append(p) + + return PolygonInstance(scaled_polygons, size=size) + + def convert_to_binarymask(self): + width, height = self.size + # formatting for COCO PythonAPI + polygons = [p.numpy() for p in self.polygons] + rles = mask_utils.frPyObjects(polygons, height, width) + rle = mask_utils.merge(rles) + mask = mask_utils.decode(rle) + mask = torch.from_numpy(mask) + return mask + + def __len__(self): + return len(self.polygons) + + def __repr__(self): + s = self.__class__.__name__ + "(" + s += "num_groups={}, ".format(len(self.polygons)) + s += "image_width={}, ".format(self.size[0]) + s += "image_height={}, ".format(self.size[1]) + return s + + +class PolygonList(object): + """ + This class handles PolygonInstances for all objects in the image + """ + + def __init__(self, polygons, size): + """ + Arguments: + polygons: + a list of list of lists of numbers. The first + level of the list correspond to individual instances, + the second level to all the polygons that compose the + object, and the third level to the polygon coordinates. + + OR + + a list of PolygonInstances. 
+ + OR + + a PolygonList + + size: absolute image size + + """ + if isinstance(polygons, (list, tuple)): + if len(polygons) == 0: + polygons = [[[]]] + if isinstance(polygons[0], (list, tuple)): + assert isinstance(polygons[0][0], (list, tuple)), str( + type(polygons[0][0]) + ) + else: + assert isinstance(polygons[0], PolygonInstance), str(type(polygons[0])) + + elif isinstance(polygons, PolygonList): + size = polygons.size + polygons = polygons.polygons + + else: + RuntimeError( + "Type of argument `polygons` is not allowed:%s" % (type(polygons)) + ) + + assert isinstance(size, (list, tuple)), str(type(size)) + + self.polygons = [] + for p in polygons: + p = PolygonInstance(p, size) + if len(p) > 0: + self.polygons.append(p) + + self.size = tuple(size) + + def transpose(self, method): + if method not in (FLIP_LEFT_RIGHT, FLIP_TOP_BOTTOM): + raise NotImplementedError( + "Only FLIP_LEFT_RIGHT and FLIP_TOP_BOTTOM implemented" + ) + + flipped_polygons = [] + for polygon in self.polygons: + flipped_polygons.append(polygon.transpose(method)) + + return PolygonList(flipped_polygons, size=self.size) + + def crop(self, box): + w, h = box[2] - box[0], box[3] - box[1] + cropped_polygons = [] + for polygon in self.polygons: + cropped_polygons.append(polygon.crop(box)) + + cropped_size = w, h + return PolygonList(cropped_polygons, cropped_size) + + def resize(self, size): + resized_polygons = [] + for polygon in self.polygons: + resized_polygons.append(polygon.resize(size)) + + resized_size = size + return PolygonList(resized_polygons, resized_size) + + def to(self, *args, **kwargs): + return self + + def convert_to_binarymask(self): + if len(self) > 0: + masks = torch.stack([p.convert_to_binarymask() for p in self.polygons]) + else: + size = self.size + masks = torch.empty([0, size[1], size[0]], dtype=torch.uint8) + + return BinaryMaskList(masks, size=self.size) + + def __len__(self): + return len(self.polygons) + + def __getitem__(self, item): + if isinstance(item, int): + selected_polygons = [self.polygons[item]] + elif isinstance(item, slice): + selected_polygons = self.polygons[item] + else: + # advanced indexing on a single dimension + selected_polygons = [] + if isinstance(item, torch.Tensor) and item.dtype == torch.uint8: + item = item.nonzero() + item = item.squeeze(1) if item.numel() > 0 else item + item = item.tolist() + for i in item: + selected_polygons.append(self.polygons[i]) + return PolygonList(selected_polygons, size=self.size) + + def __iter__(self): + return iter(self.polygons) + + def __repr__(self): + s = self.__class__.__name__ + "(" + s += "num_instances={}, ".format(len(self.polygons)) + s += "image_width={}, ".format(self.size[0]) + s += "image_height={})".format(self.size[1]) + return s + + +class SegmentationMask(object): + + """ + This class stores the segmentations for all objects in the image. + It wraps BinaryMaskList and PolygonList conveniently. + """ + + def __init__(self, instances, size, mode="poly"): + """ + Arguments: + instances: two types + (1) polygon + (2) binary mask + size: (width, height) + mode: 'poly', 'mask'. 
if mode is 'mask', convert mask of any format to binary mask + """ + + assert isinstance(size, (list, tuple)) + assert len(size) == 2 + if isinstance(size[0], torch.Tensor): + assert isinstance(size[1], torch.Tensor) + size = size[0].item(), size[1].item() + + assert isinstance(size[0], (int, float)) + assert isinstance(size[1], (int, float)) + + if mode == "poly": + self.instances = PolygonList(instances, size) + elif mode == "mask": + self.instances = BinaryMaskList(instances, size) + else: + raise NotImplementedError("Unknown mode: %s" % str(mode)) + + self.mode = mode + self.size = tuple(size) + + def transpose(self, method): + flipped_instances = self.instances.transpose(method) + return SegmentationMask(flipped_instances, self.size, self.mode) + + def crop(self, box): + cropped_instances = self.instances.crop(box) + cropped_size = cropped_instances.size + return SegmentationMask(cropped_instances, cropped_size, self.mode) + + def resize(self, size, *args, **kwargs): + resized_instances = self.instances.resize(size) + resized_size = size + return SegmentationMask(resized_instances, resized_size, self.mode) + + def to(self, *args, **kwargs): + return self + + def convert(self, mode): + if mode == self.mode: + return self + + if mode == "poly": + converted_instances = self.instances.convert_to_polygon() + elif mode == "mask": + converted_instances = self.instances.convert_to_binarymask() + else: + raise NotImplementedError("Unknown mode: %s" % str(mode)) + + return SegmentationMask(converted_instances, self.size, mode) + + def get_mask_tensor(self): + instances = self.instances + if self.mode == "poly": + instances = instances.convert_to_binarymask() + # If there is only 1 instance + return instances.masks.squeeze(0) + + def __len__(self): + return len(self.instances) + + def __getitem__(self, item): + selected_instances = self.instances.__getitem__(item) + return SegmentationMask(selected_instances, self.size, self.mode) + + def __iter__(self): + self.iter_idx = 0 + return self + + def __next__(self): + if self.iter_idx < self.__len__(): + next_segmentation = self.__getitem__(self.iter_idx) + self.iter_idx += 1 + return next_segmentation + raise StopIteration() + + next = __next__ # Python 2 compatibility + + def __repr__(self): + s = self.__class__.__name__ + "(" + s += "num_instances={}, ".format(len(self.instances)) + s += "image_width={}, ".format(self.size[0]) + s += "image_height={}, ".format(self.size[1]) + s += "mode={})".format(self.mode) + return s diff --git a/maskrcnn_benchmark/utils/README.md b/maskrcnn_benchmark/utils/README.md new file mode 100644 index 0000000000000000000000000000000000000000..9765b24a730b77556104187ac3ef5439ab0859fd --- /dev/null +++ b/maskrcnn_benchmark/utils/README.md @@ -0,0 +1,5 @@ +# Utility functions + +This folder contain utility functions that are not used in the +core library, but are useful for building models or training +code using the config system. 
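For orientation, here is a minimal usage sketch of the SegmentationMask / PolygonList / BinaryMaskList structures defined above. This is a hand-written example, not code from the repository; it assumes the file is importable as maskrcnn_benchmark.structures.segmentation_mask (as in upstream maskrcnn-benchmark) and that pycocotools is installed for the polygon-to-mask conversion:

from maskrcnn_benchmark.structures.segmentation_mask import SegmentationMask

# one instance made of a single polygon, given as flat [x1, y1, x2, y2, ...] coordinates
polygons = [[[10, 10, 60, 10, 60, 40, 10, 40]]]

masks = SegmentationMask(polygons, size=(100, 80), mode="poly")  # size is (width, height)
masks = masks.resize((50, 40))      # scales every polygon by the width/height ratios
masks = masks.crop([0, 0, 30, 30])  # xyxy box; coordinates are shifted by (xmin, ymin)

binary = masks.convert("mask")      # PolygonList -> BinaryMaskList via COCO RLE decoding
print(binary.get_mask_tensor().shape)  # torch.Size([30, 30]) for this single instance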
diff --git a/maskrcnn_benchmark/utils/__init__.py b/maskrcnn_benchmark/utils/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/maskrcnn_benchmark/utils/__pycache__/__init__.cpython-37.pyc b/maskrcnn_benchmark/utils/__pycache__/__init__.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..478558ef880084e6a2e3a227739eb8f598b9e6e8 Binary files /dev/null and b/maskrcnn_benchmark/utils/__pycache__/__init__.cpython-37.pyc differ diff --git a/maskrcnn_benchmark/utils/__pycache__/c2_model_loading.cpython-37.pyc b/maskrcnn_benchmark/utils/__pycache__/c2_model_loading.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..80db9f7391ecfe0185dcc70d1dce27efb7b28214 Binary files /dev/null and b/maskrcnn_benchmark/utils/__pycache__/c2_model_loading.cpython-37.pyc differ diff --git a/maskrcnn_benchmark/utils/__pycache__/chars.cpython-37.pyc b/maskrcnn_benchmark/utils/__pycache__/chars.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..5b079ba502c033f66ce8bc3ee26b7804ccb00a47 Binary files /dev/null and b/maskrcnn_benchmark/utils/__pycache__/chars.cpython-37.pyc differ diff --git a/maskrcnn_benchmark/utils/__pycache__/checkpoint.cpython-37.pyc b/maskrcnn_benchmark/utils/__pycache__/checkpoint.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..218c09ad70e4057624442578f4bef3548f0a703a Binary files /dev/null and b/maskrcnn_benchmark/utils/__pycache__/checkpoint.cpython-37.pyc differ diff --git a/maskrcnn_benchmark/utils/__pycache__/collect_env.cpython-37.pyc b/maskrcnn_benchmark/utils/__pycache__/collect_env.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..9dcea6d95dba11c3ee8021015b932da57e73c8b9 Binary files /dev/null and b/maskrcnn_benchmark/utils/__pycache__/collect_env.cpython-37.pyc differ diff --git a/maskrcnn_benchmark/utils/__pycache__/comm.cpython-37.pyc b/maskrcnn_benchmark/utils/__pycache__/comm.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..a379af37b9249d18c5f4ab59c2d669af5f6dd4f9 Binary files /dev/null and b/maskrcnn_benchmark/utils/__pycache__/comm.cpython-37.pyc differ diff --git a/maskrcnn_benchmark/utils/__pycache__/env.cpython-37.pyc b/maskrcnn_benchmark/utils/__pycache__/env.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b6efe3e59d8490caad73ba355b0739695a35485f Binary files /dev/null and b/maskrcnn_benchmark/utils/__pycache__/env.cpython-37.pyc differ diff --git a/maskrcnn_benchmark/utils/__pycache__/imports.cpython-37.pyc b/maskrcnn_benchmark/utils/__pycache__/imports.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..0e660149ec763a317cae0e30aab436585ed72b27 Binary files /dev/null and b/maskrcnn_benchmark/utils/__pycache__/imports.cpython-37.pyc differ diff --git a/maskrcnn_benchmark/utils/__pycache__/logger.cpython-37.pyc b/maskrcnn_benchmark/utils/__pycache__/logger.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..155a0c02ea5abb57c2d0ca70e532f4b0e3221a31 Binary files /dev/null and b/maskrcnn_benchmark/utils/__pycache__/logger.cpython-37.pyc differ diff --git a/maskrcnn_benchmark/utils/__pycache__/miscellaneous.cpython-37.pyc b/maskrcnn_benchmark/utils/__pycache__/miscellaneous.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..7d6b49b716c5b20cba1aca496a1526b811e9428e Binary 
files /dev/null and b/maskrcnn_benchmark/utils/__pycache__/miscellaneous.cpython-37.pyc differ diff --git a/maskrcnn_benchmark/utils/__pycache__/model_serialization.cpython-37.pyc b/maskrcnn_benchmark/utils/__pycache__/model_serialization.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..8df189759e6c901fa822c6f4f1000455765e75ad Binary files /dev/null and b/maskrcnn_benchmark/utils/__pycache__/model_serialization.cpython-37.pyc differ diff --git a/maskrcnn_benchmark/utils/__pycache__/model_zoo.cpython-37.pyc b/maskrcnn_benchmark/utils/__pycache__/model_zoo.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..3211d8a0a435d4cadcf85fc556a4ce4b66eead56 Binary files /dev/null and b/maskrcnn_benchmark/utils/__pycache__/model_zoo.cpython-37.pyc differ diff --git a/maskrcnn_benchmark/utils/__pycache__/registry.cpython-37.pyc b/maskrcnn_benchmark/utils/__pycache__/registry.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c40cfa8baab3c427083602ebb6a9a8dacb17f248 Binary files /dev/null and b/maskrcnn_benchmark/utils/__pycache__/registry.cpython-37.pyc differ diff --git a/maskrcnn_benchmark/utils/__pycache__/timer.cpython-37.pyc b/maskrcnn_benchmark/utils/__pycache__/timer.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..25c633890986b8b6647c0b0bd8a5b6213d62174f Binary files /dev/null and b/maskrcnn_benchmark/utils/__pycache__/timer.cpython-37.pyc differ diff --git a/maskrcnn_benchmark/utils/c2_model_loading.py b/maskrcnn_benchmark/utils/c2_model_loading.py new file mode 100644 index 0000000000000000000000000000000000000000..041d7e0141d52c2b6390d13a437062477b493fd5 --- /dev/null +++ b/maskrcnn_benchmark/utils/c2_model_loading.py @@ -0,0 +1,177 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 
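+# Helpers for loading Detectron/Caffe2 pickled weights (.pkl) and remapping their layer
+# names onto the PyTorch module naming used by this codebase.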
+import logging +import pickle +from collections import OrderedDict + +import torch + +from maskrcnn_benchmark.utils.model_serialization import load_state_dict +from maskrcnn_benchmark.utils.registry import Registry + + +def _rename_basic_resnet_weights(layer_keys): + layer_keys = [k.replace("_", ".") for k in layer_keys] + layer_keys = [k.replace(".w", ".weight") for k in layer_keys] + layer_keys = [k.replace(".bn", "_bn") for k in layer_keys] + layer_keys = [k.replace(".b", ".bias") for k in layer_keys] + layer_keys = [k.replace("_bn.s", "_bn.scale") for k in layer_keys] + layer_keys = [k.replace(".biasranch", ".branch") for k in layer_keys] + layer_keys = [k.replace("bbox.pred", "bbox_pred") for k in layer_keys] + layer_keys = [k.replace("cls.score", "cls_score") for k in layer_keys] + layer_keys = [k.replace("res.conv1_", "conv1_") for k in layer_keys] + + # RPN / Faster RCNN + layer_keys = [k.replace(".biasbox", ".bbox") for k in layer_keys] + layer_keys = [k.replace("conv.rpn", "rpn.conv") for k in layer_keys] + layer_keys = [k.replace("rpn.bbox.pred", "rpn.bbox_pred") for k in layer_keys] + layer_keys = [k.replace("rpn.cls.logits", "rpn.cls_logits") for k in layer_keys] + + # Affine-Channel -> BatchNorm enaming + layer_keys = [k.replace("_bn.scale", "_bn.weight") for k in layer_keys] + + # Make torchvision-compatible + layer_keys = [k.replace("conv1_bn.", "bn1.") for k in layer_keys] + + layer_keys = [k.replace("res2.", "layer1.") for k in layer_keys] + layer_keys = [k.replace("res3.", "layer2.") for k in layer_keys] + layer_keys = [k.replace("res4.", "layer3.") for k in layer_keys] + layer_keys = [k.replace("res5.", "layer4.") for k in layer_keys] + + layer_keys = [k.replace(".branch2a.", ".conv1.") for k in layer_keys] + layer_keys = [k.replace(".branch2a_bn.", ".bn1.") for k in layer_keys] + layer_keys = [k.replace(".branch2b.", ".conv2.") for k in layer_keys] + layer_keys = [k.replace(".branch2b_bn.", ".bn2.") for k in layer_keys] + layer_keys = [k.replace(".branch2c.", ".conv3.") for k in layer_keys] + layer_keys = [k.replace(".branch2c_bn.", ".bn3.") for k in layer_keys] + + layer_keys = [k.replace(".branch1.", ".downsample.0.") for k in layer_keys] + layer_keys = [k.replace(".branch1_bn.", ".downsample.1.") for k in layer_keys] + + # GroupNorm + layer_keys = [k.replace("conv1.gn.s", "bn1.weight") for k in layer_keys] + layer_keys = [k.replace("conv1.gn.bias", "bn1.bias") for k in layer_keys] + layer_keys = [k.replace("conv2.gn.s", "bn2.weight") for k in layer_keys] + layer_keys = [k.replace("conv2.gn.bias", "bn2.bias") for k in layer_keys] + layer_keys = [k.replace("conv3.gn.s", "bn3.weight") for k in layer_keys] + layer_keys = [k.replace("conv3.gn.bias", "bn3.bias") for k in layer_keys] + layer_keys = [k.replace("downsample.0.gn.s", "downsample.1.weight") \ + for k in layer_keys] + layer_keys = [k.replace("downsample.0.gn.bias", "downsample.1.bias") \ + for k in layer_keys] + + return layer_keys + +def _rename_fpn_weights(layer_keys, stage_names): + for mapped_idx, stage_name in enumerate(stage_names, 1): + suffix = "" + if mapped_idx < 4: + suffix = ".lateral" + layer_keys = [ + k.replace("fpn.inner.layer{}.sum{}".format(stage_name, suffix), "fpn_inner{}".format(mapped_idx)) for k in layer_keys + ] + layer_keys = [k.replace("fpn.layer{}.sum".format(stage_name), "fpn_layer{}".format(mapped_idx)) for k in layer_keys] + + + layer_keys = [k.replace("rpn.conv.fpn2", "rpn.conv") for k in layer_keys] + layer_keys = [k.replace("rpn.bbox_pred.fpn2", "rpn.bbox_pred") for k in 
layer_keys] + layer_keys = [ + k.replace("rpn.cls_logits.fpn2", "rpn.cls_logits") for k in layer_keys + ] + + return layer_keys + + +def _rename_weights_for_resnet(weights, stage_names): + original_keys = sorted(weights.keys()) + layer_keys = sorted(weights.keys()) + + # for X-101, rename output to fc1000 to avoid conflicts afterwards + layer_keys = [k if k != "pred_b" else "fc1000_b" for k in layer_keys] + layer_keys = [k if k != "pred_w" else "fc1000_w" for k in layer_keys] + + # performs basic renaming: _ -> . , etc + layer_keys = _rename_basic_resnet_weights(layer_keys) + + # FPN + layer_keys = _rename_fpn_weights(layer_keys, stage_names) + + # Mask R-CNN + layer_keys = [k.replace("mask.fcn.logits", "mask_fcn_logits") for k in layer_keys] + layer_keys = [k.replace(".[mask].fcn", "mask_fcn") for k in layer_keys] + layer_keys = [k.replace("conv5.mask", "conv5_mask") for k in layer_keys] + + # Keypoint R-CNN + layer_keys = [k.replace("kps.score.lowres", "kps_score_lowres") for k in layer_keys] + layer_keys = [k.replace("kps.score", "kps_score") for k in layer_keys] + layer_keys = [k.replace("conv.fcn", "conv_fcn") for k in layer_keys] + + # Rename for our RPN structure + layer_keys = [k.replace("rpn.", "rpn.head.") for k in layer_keys] + + key_map = {k: v for k, v in zip(original_keys, layer_keys)} + + logger = logging.getLogger(__name__) + logger.info("Remapping C2 weights") + max_c2_key_size = max([len(k) for k in original_keys if "_momentum" not in k]) + + new_weights = OrderedDict() + for k in original_keys: + v = weights[k] + if "_momentum" in k: + continue + # if 'fc1000' in k: + # continue + w = torch.from_numpy(v) + # if "bn" in k: + # w = w.view(1, -1, 1, 1) + logger.info("C2 name: {: <{}} mapped name: {}".format(k, max_c2_key_size, key_map[k])) + new_weights[key_map[k]] = w + + return new_weights + + +def _load_c2_pickled_weights(file_path): + with open(file_path, "rb") as f: + if torch._six.PY3: + data = pickle.load(f, encoding="latin1") + else: + data = pickle.load(f) + if "blobs" in data: + weights = data["blobs"] + else: + weights = data + return weights + + +_C2_STAGE_NAMES = { + "R-50": ["1.2", "2.3", "3.5", "4.2"], + "R-101": ["1.2", "2.3", "3.22", "4.2"], + "R-152": ["1.2", "2.7", "3.35", "4.2"], +} + +C2_FORMAT_LOADER = Registry() + + +@C2_FORMAT_LOADER.register("R-50-C4") +@C2_FORMAT_LOADER.register("R-50-C5") +@C2_FORMAT_LOADER.register("R-101-C4") +@C2_FORMAT_LOADER.register("R-101-C5") +@C2_FORMAT_LOADER.register("R-50-FPN") +@C2_FORMAT_LOADER.register("R-50-FPN-RETINANET") +@C2_FORMAT_LOADER.register("R-101-FPN") +@C2_FORMAT_LOADER.register("R-101-PAN") +@C2_FORMAT_LOADER.register("R-101-FPN-RETINANET") +@C2_FORMAT_LOADER.register("R-152-FPN") +@C2_FORMAT_LOADER.register("R-152-PAN") +def load_resnet_c2_format(cfg, f): + state_dict = _load_c2_pickled_weights(f) + conv_body = cfg.MODEL.BACKBONE.CONV_BODY + arch = conv_body.replace("-C4", "").replace("-C5", "").replace("-FPN", "") + arch = arch.replace("-RETINANET", "").replace("-PAN", "") + stages = _C2_STAGE_NAMES[arch] + state_dict = _rename_weights_for_resnet(state_dict, stages) + return dict(model=state_dict) + + +def load_c2_format(cfg, f): + return C2_FORMAT_LOADER[cfg.MODEL.BACKBONE.CONV_BODY](cfg, f) diff --git a/maskrcnn_benchmark/utils/chars.py b/maskrcnn_benchmark/utils/chars.py new file mode 100644 index 0000000000000000000000000000000000000000..71772ab85dec2b42458e25593b611e5f24e465d2 --- /dev/null +++ b/maskrcnn_benchmark/utils/chars.py @@ -0,0 +1,199 @@ +import os + +import cv2 +import numpy as np + + 
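As a quick illustration of the Caffe2-to-PyTorch key remapping implemented in c2_model_loading.py above, a small hand-traced sketch; it calls the private helper directly and is only meant as a sanity check, assuming the package is importable:

from maskrcnn_benchmark.utils.c2_model_loading import _rename_basic_resnet_weights

# Caffe2/Detectron blob names -> torchvision-style ResNet parameter names
print(_rename_basic_resnet_weights(["res2_0_branch2a_w"]))     # ['layer1.0.conv1.weight']
print(_rename_basic_resnet_weights(["res2_0_branch2a_bn_s"]))  # ['layer1.0.bn1.weight']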
+def char2num(char): + if char in "0123456789": + num = ord(char) - ord("0") + 1 + elif char in "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ": + num = ord(char.lower()) - ord("a") + 11 + else: + num = 0 + return num + + +def num2char(num): + chars = "_0123456789abcdefghijklmnopqrstuvwxyz" + char = chars[num] + # if num >=1 and num <=10: + # char = chr(ord('0') + num - 1) + # elif num > 10 and num <= 36: + # char = chr(ord('a') + num - 11) + # else: + # print('error number:%d'%(num)) + # exit() + return char + + +def getstr_grid(seg, box, threshold=192): + pos = 255 - (seg[0] * 255).astype(np.uint8) + mask_index = np.argmax(seg, axis=0) + mask_index = mask_index.astype(np.uint8) + pos = pos.astype(np.uint8) + string, score, rec_scores, char_polygons = seg2text( + pos, mask_index, seg, box, threshold=threshold + ) + return string, score, rec_scores, char_polygons + + +def seg2text(gray, mask, seg, box, threshold=192): + ## input numpy + img_h, img_w = gray.shape + box_w = box[2] - box[0] + box_h = box[3] - box[1] + ratio_h = float(box_h) / img_h + ratio_w = float(box_w) / img_w + # SE1=cv2.getStructuringElement(cv2.MORPH_RECT,(3,3)) + # gray = cv2.erode(gray,SE1) + # gray = cv2.dilate(gray,SE1) + # gray = cv2.morphologyEx(gray,cv2.MORPH_CLOSE,SE1) + ret, thresh = cv2.threshold(gray, threshold, 255, cv2.THRESH_BINARY) + try: + _, contours, _ = cv2.findContours(thresh, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE) + except: + contours, _ = cv2.findContours(thresh, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE) + chars = [] + scores = [] + char_polygons = [] + for i in range(len(contours)): + char = {} + temp = np.zeros((img_h, img_w)).astype(np.uint8) + cv2.drawContours(temp, [contours[i]], 0, (255), -1) + x, y, w, h = cv2.boundingRect(contours[i]) + c_x, c_y = x + w / 2, y + h / 2 + perimeter = cv2.arcLength(contours[i], True) + epsilon = 0.01 * cv2.arcLength(contours[i], True) + approx = cv2.approxPolyDP(contours[i], epsilon, True) + pts = approx.reshape((-1, 2)) + pts[:, 0] = pts[:, 0] * ratio_w + box[0] + pts[:, 1] = pts[:, 1] * ratio_h + box[1] + polygon = list(pts.reshape((-1,))) + polygon = list(map(int, polygon)) + if len(polygon) >= 6: + char_polygons.append(polygon) + # x1 = x * ratio_w + box[0] + # y1 = y * ratio_h + box[1] + # x3 = (x + w) * ratio_w + box[0] + # y3 = (y + h) * ratio_h + box[1] + # polygon = [x1, y1, x3, y1, x3, y3, x1, y3] + regions = seg[1:, temp == 255].reshape((36, -1)) + cs = np.mean(regions, axis=1) + sym = num2char(np.argmax(cs.reshape((-1))) + 1) + char["x"] = c_x + char["y"] = c_y + char["s"] = sym + char["cs"] = cs.reshape((-1, 1)) + scores.append(np.max(char["cs"], axis=0)[0]) + + chars.append(char) + chars = sorted(chars, key=lambda x: x["x"]) + string = "" + css = [] + for char in chars: + string = string + char["s"] + css.append(char["cs"]) + if len(scores) > 0: + score = sum(scores) / len(scores) + else: + score = 0.00 + if not css: + css = [0.0] + return string, score, np.hstack(css), char_polygons + + +# def get_tight_rect(points, start_x, start_y, image_height, image_width, scale): +# points = list(points) +# ps = sorted(points, key=lambda x: x[0]) +# +# if ps[1][1] > ps[0][1]: +# px1 = ps[0][0] * scale + start_x +# py1 = ps[0][1] * scale + start_y +# px4 = ps[1][0] * scale + start_x +# py4 = ps[1][1] * scale + start_y +# else: +# px1 = ps[1][0] * scale + start_x +# py1 = ps[1][1] * scale + start_y +# px4 = ps[0][0] * scale + start_x +# py4 = ps[0][1] * scale + start_y +# if ps[3][1] > ps[2][1]: +# px2 = ps[2][0] * scale + start_x +# py2 = ps[2][1] * 
scale + start_y +# px3 = ps[3][0] * scale + start_x +# py3 = ps[3][1] * scale + start_y +# else: +# px2 = ps[3][0] * scale + start_x +# py2 = ps[3][1] * scale + start_y +# px3 = ps[2][0] * scale + start_x +# py3 = ps[2][1] * scale + start_y +# +# if px1 < 0: +# px1 = 1 +# if px1 > image_width: +# px1 = image_width - 1 +# if px2 < 0: +# px2 = 1 +# if px2 > image_width: +# px2 = image_width - 1 +# if px3 < 0: +# px3 = 1 +# if px3 > image_width: +# px3 = image_width - 1 +# if px4 < 0: +# px4 = 1 +# if px4 > image_width: +# px4 = image_width - 1 +# +# if py1 < 0: +# py1 = 1 +# if py1 > image_height: +# py1 = image_height - 1 +# if py2 < 0: +# py2 = 1 +# if py2 > image_height: +# py2 = image_height - 1 +# if py3 < 0: +# py3 = 1 +# if py3 > image_height: +# py3 = image_height - 1 +# if py4 < 0: +# py4 = 1 +# if py4 > image_height: +# py4 = image_height - 1 +# return [px1, py1, px2, py2, px3, py3, px4, py4] + +def get_tight_rect(points, start_x, start_y, image_height, image_width, scale): + points = list(points) + ps = sorted(points, key=lambda x: x[0]) + + if ps[1][1] > ps[0][1]: + px1 = ps[0][0] * scale + start_x + py1 = ps[0][1] * scale + start_y + px4 = ps[1][0] * scale + start_x + py4 = ps[1][1] * scale + start_y + else: + px1 = ps[1][0] * scale + start_x + py1 = ps[1][1] * scale + start_y + px4 = ps[0][0] * scale + start_x + py4 = ps[0][1] * scale + start_y + if ps[3][1] > ps[2][1]: + px2 = ps[2][0] * scale + start_x + py2 = ps[2][1] * scale + start_y + px3 = ps[3][0] * scale + start_x + py3 = ps[3][1] * scale + start_y + else: + px2 = ps[3][0] * scale + start_x + py2 = ps[3][1] * scale + start_y + px3 = ps[2][0] * scale + start_x + py3 = ps[2][1] * scale + start_y + + px1 = min(max(px1, 1), image_width - 1) + px2 = min(max(px2, 1), image_width - 1) + px3 = min(max(px3, 1), image_width - 1) + px4 = min(max(px4, 1), image_width - 1) + py1 = min(max(py1, 1), image_height - 1) + py2 = min(max(py2, 1), image_height - 1) + py3 = min(max(py3, 1), image_height - 1) + py4 = min(max(py4, 1), image_height - 1) + return [px1, py1, px2, py2, px3, py3, px4, py4] diff --git a/maskrcnn_benchmark/utils/checkpoint.py b/maskrcnn_benchmark/utils/checkpoint.py new file mode 100644 index 0000000000000000000000000000000000000000..fdb2293cd99cb78ce97e58ed3493dddf49716033 --- /dev/null +++ b/maskrcnn_benchmark/utils/checkpoint.py @@ -0,0 +1,141 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 
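+# Checkpointer: saves/loads model, optimizer and scheduler state and tracks the latest checkpoint on disk.
+# DetectronCheckpointer: additionally resolves catalog:// entries, caches http(s) downloads,
+# and converts Caffe2 .pkl weights via load_c2_format.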
+import logging +import os + +import torch + +from maskrcnn_benchmark.utils.model_serialization import load_state_dict +from maskrcnn_benchmark.utils.c2_model_loading import load_c2_format +from maskrcnn_benchmark.utils.imports import import_file +from maskrcnn_benchmark.utils.model_zoo import cache_url + + +class Checkpointer(object): + def __init__( + self, + model, + optimizer=None, + scheduler=None, + save_dir="", + save_to_disk=None, + logger=None, + ): + self.model = model + self.optimizer = optimizer + self.scheduler = scheduler + self.save_dir = save_dir + self.save_to_disk = save_to_disk + if logger is None: + logger = logging.getLogger(__name__) + self.logger = logger + + def save(self, name, **kwargs): + if not self.save_dir: + return + + if not self.save_to_disk: + return + + data = {} + data["model"] = self.model.state_dict() + if self.optimizer is not None: + data["optimizer"] = self.optimizer.state_dict() + if self.scheduler is not None: + data["scheduler"] = self.scheduler.state_dict() + data.update(kwargs) + + save_file = os.path.join(self.save_dir, "{}.pth".format(name)) + self.logger.info("Saving checkpoint to {}".format(save_file)) + torch.save(data, save_file) + self.tag_last_checkpoint(save_file) + + def load(self, f=None): + if self.has_checkpoint(): + # override argument with existing checkpoint + f = self.get_checkpoint_file() + if not f: + # no checkpoint could be found + self.logger.info("No checkpoint found. Initializing model from scratch") + return {} + + self.logger.info("Loading checkpoint from {}".format(f)) + + checkpoint = self._load_file(f) + self._load_model(checkpoint) + if "optimizer" in checkpoint and self.optimizer: + self.logger.info("Loading optimizer from {}".format(f)) + self.optimizer.load_state_dict(checkpoint.pop("optimizer")) + if "scheduler" in checkpoint and self.scheduler: + self.logger.info("Loading scheduler from {}".format(f)) + self.scheduler.load_state_dict(checkpoint.pop("scheduler")) + + # return any further checkpoint data + return checkpoint + + def has_checkpoint(self): + save_file = os.path.join(self.save_dir, "last_checkpoint") + return os.path.exists(save_file) + + def get_checkpoint_file(self): + save_file = os.path.join(self.save_dir, "last_checkpoint") + try: + with open(save_file, "r") as f: + last_saved = f.read() + last_saved = last_saved.strip() + except IOError: + # if file doesn't exist, maybe because it has just been + # deleted by a separate process + last_saved = "" + return last_saved + + def tag_last_checkpoint(self, last_filename): + save_file = os.path.join(self.save_dir, "last_checkpoint") + with open(save_file, "w") as f: + f.write(last_filename) + + def _load_file(self, f): + return torch.load(f, map_location=torch.device("cpu")) + + def _load_model(self, checkpoint): + load_state_dict(self.model, checkpoint.pop("model")) + + +class DetectronCheckpointer(Checkpointer): + def __init__( + self, + cfg, + model, + optimizer=None, + scheduler=None, + save_dir="", + save_to_disk=None, + logger=None, + ): + super(DetectronCheckpointer, self).__init__( + model, optimizer, scheduler, save_dir, save_to_disk, logger + ) + self.cfg = cfg.clone() + + def _load_file(self, f): + # catalog lookup + if f.startswith("catalog://"): + paths_catalog = import_file( + "maskrcnn_benchmark.config.paths_catalog", self.cfg.PATHS_CATALOG, True + ) + catalog_f = paths_catalog.ModelCatalog.get(f[len("catalog://") :]) + # self.logger.info("{} points to {}".format(f, catalog_f)) + f = catalog_f + # download url files + if 
f.startswith("http"): + # if the file is a url path, download it and cache it + cached_f = cache_url(f) + # self.logger.info("url {} cached in {}".format(f, cached_f)) + f = cached_f + # convert Caffe2 checkpoint from pkl + if f.endswith(".pkl"): + return load_c2_format(self.cfg, f) + # load native detectron.pytorch checkpoint + loaded = super(DetectronCheckpointer, self)._load_file(f) + if "model" not in loaded: + loaded = dict(model=loaded) + return loaded diff --git a/maskrcnn_benchmark/utils/collect_env.py b/maskrcnn_benchmark/utils/collect_env.py new file mode 100644 index 0000000000000000000000000000000000000000..2d0641dda61c9950cb54d0552106246248e571ef --- /dev/null +++ b/maskrcnn_benchmark/utils/collect_env.py @@ -0,0 +1,14 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +import PIL + +from torch.utils.collect_env import get_pretty_env_info + + +def get_pil_version(): + return "\n Pillow ({})".format(PIL.__version__) + + +def collect_env_info(): + env_str = get_pretty_env_info() + env_str += get_pil_version() + return env_str diff --git a/maskrcnn_benchmark/utils/comm.py b/maskrcnn_benchmark/utils/comm.py new file mode 100644 index 0000000000000000000000000000000000000000..46d7c55ce04b4180def3909cd0989c21e544085f --- /dev/null +++ b/maskrcnn_benchmark/utils/comm.py @@ -0,0 +1,117 @@ +""" +This file contains primitives for multi-gpu communication. +This is useful when doing distributed training. +""" + +import pickle +import time + +import torch +import torch.distributed as dist + + +def get_world_size(): + if not dist.is_available(): + return 1 + if not dist.is_initialized(): + return 1 + return dist.get_world_size() + + +def get_rank(): + if not dist.is_available(): + return 0 + if not dist.is_initialized(): + return 0 + return dist.get_rank() + + +def is_main_process(): + return get_rank() == 0 + + +def synchronize(): + """ + Helper function to synchronize (barrier) among all processes when + using distributed training + """ + if not dist.is_available(): + return + if not dist.is_initialized(): + return + world_size = dist.get_world_size() + if world_size == 1: + return + dist.barrier() + + +def all_gather(data): + """ + Run all_gather on arbitrary picklable data (not necessarily tensors) + Args: + data: any picklable object + Returns: + list[data]: list of data gathered from each rank + """ + world_size = get_world_size() + if world_size == 1: + return [data] + + # serialized to a Tensor + buffer = pickle.dumps(data) + storage = torch.ByteStorage.from_buffer(buffer) + tensor = torch.ByteTensor(storage).to("cuda") + + # obtain Tensor size of each rank + local_size = torch.IntTensor([tensor.numel()]).to("cuda") + size_list = [torch.IntTensor([0]).to("cuda") for _ in range(world_size)] + dist.all_gather(size_list, local_size) + size_list = [int(size.item()) for size in size_list] + max_size = max(size_list) + + # receiving Tensor from all ranks + # we pad the tensor because torch all_gather does not support + # gathering tensors of different shapes + tensor_list = [] + for _ in size_list: + tensor_list.append(torch.ByteTensor(size=(max_size,)).to("cuda")) + if local_size != max_size: + padding = torch.ByteTensor(size=(max_size - local_size,)).to("cuda") + tensor = torch.cat((tensor, padding), dim=0) + dist.all_gather(tensor_list, tensor) + + data_list = [] + for size, tensor in zip(size_list, tensor_list): + buffer = tensor.cpu().numpy().tobytes()[:size] + data_list.append(pickle.loads(buffer)) + + return data_list + + +def reduce_dict(input_dict, 
average=True): + """ + Args: + input_dict (dict): all the values will be reduced + average (bool): whether to do average or sum + Reduce the values in the dictionary from all processes so that process with rank + 0 has the averaged results. Returns a dict with the same fields as + input_dict, after reduction. + """ + world_size = get_world_size() + if world_size < 2: + return input_dict + with torch.no_grad(): + names = [] + values = [] + # sort the keys so that they are consistent across processes + for k in sorted(input_dict.keys()): + names.append(k) + values.append(input_dict[k]) + values = torch.stack(values, dim=0) + dist.reduce(values, dst=0) + if dist.get_rank() == 0 and average: + # only main process gets accumulated, so only divide by + # world_size in this case + values /= world_size + reduced_dict = {k: v for k, v in zip(names, values)} + return reduced_dict diff --git a/maskrcnn_benchmark/utils/cv2_util.py b/maskrcnn_benchmark/utils/cv2_util.py new file mode 100644 index 0000000000000000000000000000000000000000..0bbc0fb2d08337bfd8242cbedd514a41d8d7353f --- /dev/null +++ b/maskrcnn_benchmark/utils/cv2_util.py @@ -0,0 +1,24 @@ +""" +Module for cv2 utility functions and maintaining version compatibility +between 3.x and 4.x +""" +import cv2 + + +def findContours(*args, **kwargs): + """ + Wraps cv2.findContours to maintain compatiblity between versions + 3 and 4 + + Returns: + contours, hierarchy + """ + if cv2.__version__.startswith('4'): + contours, hierarchy = cv2.findContours(*args, **kwargs) + elif cv2.__version__.startswith('3'): + _, contours, hierarchy = cv2.findContours(*args, **kwargs) + else: + raise AssertionError( + 'cv2 must be either version 3 or 4 to call this method') + + return contours, hierarchy diff --git a/maskrcnn_benchmark/utils/env.py b/maskrcnn_benchmark/utils/env.py new file mode 100644 index 0000000000000000000000000000000000000000..1c7db32e41ec266ead9734f90d0173b4feff61ef --- /dev/null +++ b/maskrcnn_benchmark/utils/env.py @@ -0,0 +1,37 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +import os + +from maskrcnn_benchmark.utils.imports import import_file + + +def setup_environment(): + """Perform environment setup work. The default setup is a no-op, but this + function allows the user to specify a Python source file that performs + custom setup work that may be necessary to their computing environment. + """ + custom_module_path = os.environ.get("TORCH_DETECTRON_ENV_MODULE") + if custom_module_path: + setup_custom_environment(custom_module_path) + else: + # The default setup is a no-op + pass + + +def setup_custom_environment(custom_module_path): + """Load custom environment setup from a Python source file and run the setup + function. + """ + module = import_file("maskrcnn_benchmark.utils.env.custom_module", custom_module_path) + assert hasattr(module, "setup_environment") and callable( + module.setup_environment + ), ( + "Custom environment module defined in {} does not have the " + "required callable attribute 'setup_environment'." + ).format( + custom_module_path + ) + module.setup_environment() + + +# Force environment setup when this module is imported +setup_environment() diff --git a/maskrcnn_benchmark/utils/imports.py b/maskrcnn_benchmark/utils/imports.py new file mode 100644 index 0000000000000000000000000000000000000000..53e27e2bcfd6d9dd57579f48d42811072daf0df5 --- /dev/null +++ b/maskrcnn_benchmark/utils/imports.py @@ -0,0 +1,23 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 
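+# Python 2/3 compatible import of a module from an explicit file path
+# (used e.g. to load the paths_catalog and custom environment modules).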
+import torch + +if torch._six.PY3: + import importlib + import importlib.util + import sys + + + # from https://stackoverflow.com/questions/67631/how-to-import-a-module-given-the-full-path?utm_medium=organic&utm_source=google_rich_qa&utm_campaign=google_rich_qa + def import_file(module_name, file_path, make_importable=False): + spec = importlib.util.spec_from_file_location(module_name, file_path) + module = importlib.util.module_from_spec(spec) + spec.loader.exec_module(module) + if make_importable: + sys.modules[module_name] = module + return module +else: + import imp + + def import_file(module_name, file_path, make_importable=None): + module = imp.load_source(module_name, file_path) + return module diff --git a/maskrcnn_benchmark/utils/logger.py b/maskrcnn_benchmark/utils/logger.py new file mode 100644 index 0000000000000000000000000000000000000000..13847a3a76b481e132190ee0757b3539fb8981ae --- /dev/null +++ b/maskrcnn_benchmark/utils/logger.py @@ -0,0 +1,25 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +import logging +import os +import sys + + +def setup_logger(name, save_dir, distributed_rank, filename="log.txt"): + logger = logging.getLogger(name) + logger.setLevel(logging.DEBUG) + # don't log results for the non-master process + if distributed_rank > 0: + return logger + ch = logging.StreamHandler(stream=sys.stdout) + ch.setLevel(logging.DEBUG) + formatter = logging.Formatter("%(asctime)s %(name)s %(levelname)s: %(message)s") + ch.setFormatter(formatter) + logger.addHandler(ch) + + if save_dir: + fh = logging.FileHandler(os.path.join(save_dir, filename)) + fh.setLevel(logging.DEBUG) + fh.setFormatter(formatter) + logger.addHandler(fh) + + return logger diff --git a/maskrcnn_benchmark/utils/metric_logger.py b/maskrcnn_benchmark/utils/metric_logger.py new file mode 100644 index 0000000000000000000000000000000000000000..5e37a72ea4b4c85546de98210295a6adb134a297 --- /dev/null +++ b/maskrcnn_benchmark/utils/metric_logger.py @@ -0,0 +1,66 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +from collections import defaultdict +from collections import deque + +import torch + + +class SmoothedValue(object): + """Track a series of values and provide access to smoothed values over a + window or the global series average. 
+ """ + + def __init__(self, window_size=20): + self.deque = deque(maxlen=window_size) + self.series = [] + self.total = 0.0 + self.count = 0 + + def update(self, value): + self.deque.append(value) + self.series.append(value) + self.count += 1 + self.total += value + + @property + def median(self): + d = torch.tensor(list(self.deque)) + return d.median().item() + + @property + def avg(self): + d = torch.tensor(list(self.deque)) + return d.mean().item() + + @property + def global_avg(self): + return self.total / self.count + + +class MetricLogger(object): + def __init__(self, delimiter="\t"): + self.meters = defaultdict(SmoothedValue) + self.delimiter = delimiter + + def update(self, **kwargs): + for k, v in kwargs.items(): + if isinstance(v, torch.Tensor): + v = v.item() + assert isinstance(v, (float, int)) + self.meters[k].update(v) + + def __getattr__(self, attr): + if attr in self.meters: + return self.meters[attr] + if attr in self.__dict__: + return self.__dict__[attr] + raise AttributeError("'{}' object has no attribute '{}'".format( + type(self).__name__, attr)) + + def __str__(self): + loss_str = [] + for name, meter in self.meters.items(): + loss_str.append( + "{}: {:.4f} ({:.4f})".format(name, meter.median, meter.global_avg) + ) + return self.delimiter.join(loss_str) diff --git a/maskrcnn_benchmark/utils/miscellaneous.py b/maskrcnn_benchmark/utils/miscellaneous.py new file mode 100644 index 0000000000000000000000000000000000000000..db9a8b3679ceea2a5cd2b807421793bbbd3d3677 --- /dev/null +++ b/maskrcnn_benchmark/utils/miscellaneous.py @@ -0,0 +1,11 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +import errno +import os + + +def mkdir(path): + try: + os.makedirs(path) + except OSError as e: + if e.errno != errno.EEXIST: + raise diff --git a/maskrcnn_benchmark/utils/model_serialization.py b/maskrcnn_benchmark/utils/model_serialization.py new file mode 100644 index 0000000000000000000000000000000000000000..a95ad8b2a7a787d62dc3ea580b2dfd30e358da28 --- /dev/null +++ b/maskrcnn_benchmark/utils/model_serialization.py @@ -0,0 +1,80 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +from collections import OrderedDict +import logging + +import torch + +from maskrcnn_benchmark.utils.imports import import_file + + +def align_and_update_state_dicts(model_state_dict, loaded_state_dict): + """ + Strategy: suppose that the models that we will create will have prefixes appended + to each of its keys, for example due to an extra level of nesting that the original + pre-trained weights from ImageNet won't contain. For example, model.state_dict() + might return backbone[0].body.res2.conv1.weight, while the pre-trained model contains + res2.conv1.weight. We thus want to match both parameters together. + For that, we look for each model weight, look among all loaded keys if there is one + that is a suffix of the current weight name, and use it if that's the case. + If multiple matches exist, take the one with longest size + of the corresponding name. For example, for the same model as before, the pretrained + weight file can contain both res2.conv1.weight, as well as conv1.weight. In this case, + we want to match backbone[0].body.conv1.weight to conv1.weight, and + backbone[0].body.res2.conv1.weight to res2.conv1.weight. 
+ """ + current_keys = sorted(list(model_state_dict.keys())) + loaded_keys = sorted(list(loaded_state_dict.keys())) + # get a matrix of string matches, where each (i, j) entry correspond to the size of the + # loaded_key string, if it matches + match_matrix = [ + len(j) if i.endswith(j) else 0 for i in current_keys for j in loaded_keys + ] + match_matrix = torch.as_tensor(match_matrix).view( + len(current_keys), len(loaded_keys) + ) + max_match_size, idxs = match_matrix.max(1) + # remove indices that correspond to no-match + idxs[max_match_size == 0] = -1 + + # used for logging + max_size = max([len(key) for key in current_keys]) if current_keys else 1 + max_size_loaded = max([len(key) for key in loaded_keys]) if loaded_keys else 1 + log_str_template = "{: <{}} loaded from {: <{}} of shape {}" + logger = logging.getLogger(__name__) + for idx_new, idx_old in enumerate(idxs.tolist()): + if idx_old == -1: + continue + key = current_keys[idx_new] + key_old = loaded_keys[idx_old] + model_state_dict[key] = loaded_state_dict[key_old] + logger.info( + log_str_template.format( + key, + max_size, + key_old, + max_size_loaded, + tuple(loaded_state_dict[key_old].shape), + ) + ) + + +def strip_prefix_if_present(state_dict, prefix): + keys = sorted(state_dict.keys()) + if not all(key.startswith(prefix) for key in keys): + return state_dict + stripped_state_dict = OrderedDict() + for key, value in state_dict.items(): + stripped_state_dict[key.replace(prefix, "")] = value + return stripped_state_dict + + +def load_state_dict(model, loaded_state_dict): + model_state_dict = model.state_dict() + # if the state_dict comes from a model that was wrapped in a + # DataParallel or DistributedDataParallel during serialization, + # remove the "module" prefix before performing the matching + loaded_state_dict = strip_prefix_if_present(loaded_state_dict, prefix="module.") + align_and_update_state_dicts(model_state_dict, loaded_state_dict) + + # use strict loading + model.load_state_dict(model_state_dict) diff --git a/maskrcnn_benchmark/utils/model_zoo.py b/maskrcnn_benchmark/utils/model_zoo.py new file mode 100644 index 0000000000000000000000000000000000000000..92c1ed7e5dab54bd9fa3358185c71f9d5fcf26a8 --- /dev/null +++ b/maskrcnn_benchmark/utils/model_zoo.py @@ -0,0 +1,58 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +import os +import sys + +try: + from torch.utils.model_zoo import _download_url_to_file, urlparse, HASH_REGEX +except ImportError: + # support for pytorch 1.1.0dev + from torch.hub import _download_url_to_file, urlparse, HASH_REGEX + +from maskrcnn_benchmark.utils.comm import is_main_process +from maskrcnn_benchmark.utils.comm import synchronize + + +# very similar to https://github.com/pytorch/pytorch/blob/master/torch/utils/model_zoo.py +# but with a few improvements and modifications +def cache_url(url, model_dir=None, progress=True): + r"""Loads the Torch serialized object at the given URL. + If the object is already present in `model_dir`, it's deserialized and + returned. The filename part of the URL should follow the naming convention + ``filename-.ext`` where ```` is the first eight or more + digits of the SHA256 hash of the contents of the file. The hash is used to + ensure unique names and to verify the contents of the file. + The default value of `model_dir` is ``$TORCH_HOME/models`` where + ``$TORCH_HOME`` defaults to ``~/.torch``. The default directory can be + overridden with the ``$TORCH_MODEL_ZOO`` environment variable. 
+ Args: + url (string): URL of the object to download + model_dir (string, optional): directory in which to save the object + progress (bool, optional): whether or not to display a progress bar to stderr + Example: + >>> cached_file = maskrcnn_benchmark.utils.model_zoo.cache_url('https://s3.amazonaws.com/pytorch/models/resnet18-5c106cde.pth') + """ + if model_dir is None: + torch_home = os.path.expanduser(os.getenv('TORCH_HOME', '~/.torch')) + model_dir = os.getenv('TORCH_MODEL_ZOO', os.path.join(torch_home, 'models')) + if not os.path.exists(model_dir): + os.makedirs(model_dir) + parts = urlparse(url) + filename = os.path.basename(parts.path) + if filename == "model_final.pkl": + # workaround as pre-trained Caffe2 models from Detectron have all the same filename + # so make the full path the filename by replacing / with _ + filename = parts.path.replace("/", "_") + cached_file = os.path.join(model_dir, filename) + if not os.path.exists(cached_file) and is_main_process(): + sys.stderr.write('Downloading: "{}" to {}\n'.format(url, cached_file)) + hash_prefix = HASH_REGEX.search(filename) + if hash_prefix is not None: + hash_prefix = hash_prefix.group(1) + # workaround: Caffe2 models don't have a hash, but follow the R-50 convention, + # which matches the hash PyTorch uses. So we skip the hash matching + # if the hash_prefix is less than 6 characters + if len(hash_prefix) < 6: + hash_prefix = None + _download_url_to_file(url, cached_file, hash_prefix, progress=progress) + synchronize() + return cached_file diff --git a/maskrcnn_benchmark/utils/registry.py b/maskrcnn_benchmark/utils/registry.py new file mode 100644 index 0000000000000000000000000000000000000000..c3204e14148fe3341307c5d24ba9154c07449511 --- /dev/null +++ b/maskrcnn_benchmark/utils/registry.py @@ -0,0 +1,45 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. + + +def _register_generic(module_dict, module_name, module): + assert module_name not in module_dict + module_dict[module_name] = module + + +class Registry(dict): + ''' + A helper class for managing registering modules, it extends a dictionary + and provides a register functions. + + Eg. creeting a registry: + some_registry = Registry({"default": default_module}) + + There're two ways of registering new modules: + 1): normal way is just calling register function: + def foo(): + ... + some_registry.register("foo_module", foo) + 2): used as decorator when declaring the module: + @some_registry.register("foo_module") + @some_registry.register("foo_modeul_nickname") + def foo(): + ... + + Access of module is just like using a dictionary, eg: + f = some_registry["foo_modeul"] + ''' + def __init__(self, *args, **kwargs): + super(Registry, self).__init__(*args, **kwargs) + + def register(self, module_name, module=None): + # used as function call + if module is not None: + _register_generic(self, module_name, module) + return + + # used as decorator + def register_fn(fn): + _register_generic(self, module_name, fn) + return fn + + return register_fn diff --git a/maskrcnn_benchmark/utils/timer.py b/maskrcnn_benchmark/utils/timer.py new file mode 100644 index 0000000000000000000000000000000000000000..935af1a30811abd81de29afd2cfec6cf6880cc5e --- /dev/null +++ b/maskrcnn_benchmark/utils/timer.py @@ -0,0 +1,46 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 
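+# Minimal wall-clock timer with tic/toc, running totals and average-time formatting.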
+ + +import time +import datetime + + +class Timer(object): + def __init__(self): + self.reset() + + @property + def average_time(self): + return self.total_time / self.calls if self.calls > 0 else 0.0 + + def tic(self): + # using time.time instead of time.clock because time time.clock + # does not normalize for multithreading + self.start_time = time.time() + + def toc(self, average=True): + self.add(time.time() - self.start_time) + if average: + return self.average_time + else: + return self.diff + + def add(self, time_diff): + self.diff = time_diff + self.total_time += self.diff + self.calls += 1 + + def reset(self): + self.total_time = 0.0 + self.calls = 0 + self.start_time = 0.0 + self.diff = 0.0 + + def avg_time_str(self): + time_str = str(datetime.timedelta(seconds=self.average_time)) + return time_str + + +def get_time_str(time_diff): + time_str = str(datetime.timedelta(seconds=time_diff)) + return time_str diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..a67b697bd543bc0648f92a63535180d18e870985 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,5 @@ +ninja +yacs +cython +matplotlib +tqdm diff --git a/setup.py b/setup.py new file mode 100644 index 0000000000000000000000000000000000000000..837c2cd15f4624f630540ef6993dcb9123adb39b --- /dev/null +++ b/setup.py @@ -0,0 +1,69 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +#!/usr/bin/env python + +import glob +import os + +import torch +from setuptools import find_packages +from setuptools import setup +from torch.utils.cpp_extension import CUDA_HOME +from torch.utils.cpp_extension import CppExtension +from torch.utils.cpp_extension import CUDAExtension + +requirements = ["torch", "torchvision"] + + +def get_extensions(): + this_dir = os.path.dirname(os.path.abspath(__file__)) + extensions_dir = os.path.join(this_dir, "maskrcnn_benchmark", "csrc") + + main_file = glob.glob(os.path.join(extensions_dir, "*.cpp")) + source_cpu = glob.glob(os.path.join(extensions_dir, "cpu", "*.cpp")) + source_cuda = glob.glob(os.path.join(extensions_dir, "cuda", "*.cu")) + + sources = main_file + source_cpu + extension = CppExtension + + extra_compile_args = {"cxx": []} + define_macros = [] + + if (torch.cuda.is_available() and CUDA_HOME is not None) or os.getenv("FORCE_CUDA", "0") == "1": + extension = CUDAExtension + sources += source_cuda + define_macros += [("WITH_CUDA", None)] + extra_compile_args["nvcc"] = [ + "-DCUDA_HAS_FP16=1", + "-D__CUDA_NO_HALF_OPERATORS__", + "-D__CUDA_NO_HALF_CONVERSIONS__", + "-D__CUDA_NO_HALF2_OPERATORS__", + ] + + sources = [os.path.join(extensions_dir, s) for s in sources] + + include_dirs = [extensions_dir] + + ext_modules = [ + extension( + "maskrcnn_benchmark._C", + sources, + include_dirs=include_dirs, + define_macros=define_macros, + extra_compile_args=extra_compile_args, + ) + ] + + return ext_modules + + +setup( + name="maskrcnn_benchmark", + version="0.1", + author="fmassa", + url="https://github.com/facebookresearch/maskrcnn-benchmark", + description="object detection in pytorch", + packages=find_packages(exclude=("configs", "tests",)), + # install_requires=requirements, + ext_modules=get_extensions(), + cmdclass={"build_ext": torch.utils.cpp_extension.BuildExtension}, +) diff --git a/test_contour.sh b/test_contour.sh new file mode 100644 index 0000000000000000000000000000000000000000..b6cb1d6f0a2b5aab192ac88725b537c26d3fd813 --- /dev/null +++ b/test_contour.sh @@ -0,0 +1,3 @@ +export NGPUS=1 +CUDA_VISIBLE_DEVICES=0 
python -m torch.distributed.launch --nproc_per_node=$NGPUS tools/test_net.py \ + --config-file "configs/ctw/r50_baseline.yaml" diff --git a/tools/demo.py b/tools/demo.py new file mode 100644 index 0000000000000000000000000000000000000000..7f4b2b4da1c7ebf6593da0dd07fce157c889ddd6 --- /dev/null +++ b/tools/demo.py @@ -0,0 +1,620 @@ +import os +import cv2 +import torch +from torchvision import transforms as T +import torch.nn as nn + +from maskrcnn_benchmark.modeling.detector import build_detection_model +from maskrcnn_benchmark.utils.checkpoint import DetectronCheckpointer +from maskrcnn_benchmark.structures.image_list import to_image_list +from maskrcnn_benchmark.config import cfg +from maskrcnn_benchmark.utils.chars import getstr_grid, get_tight_rect +from maskrcnn_benchmark.data.datasets.evaluation.word.alfashape import getAlfaShapes +from maskrcnn_benchmark.modeling.roi_heads.boundary_head.inference import Masker +from shapely.geometry import * +import random +from torchvision.transforms import functional as F + +from PIL import Image +import numpy as np +import argparse + +class Resize(object): + def __init__(self, min_size, max_size): + if not isinstance(min_size, (list, tuple)): + min_size = (min_size,) + self.min_size = min_size + self.max_size = max_size + + # modified from torchvision to add support for max size + def get_size(self, image_size): + w, h = image_size + size = random.choice(self.min_size) + max_size = self.max_size + if max_size is not None: + min_original_size = float(min((w, h))) + max_original_size = float(max((w, h))) + if max_original_size / min_original_size * size > max_size: + size = int(round(max_size * min_original_size / max_original_size)) + + if (w <= h and w == size) or (h <= w and h == size): + return (h, w) + + if w < h: + ow = size + oh = int(size * h / w) + else: + oh = size + ow = int(size * w / h) + + return (oh, ow) + + def __call__(self, image): + size = self.get_size(image.size) + image = F.resize(image, size) + return image + +class TextDemo(object): + def __init__( + self, + cfg, + confidence_threshold=0.7, + min_image_size=(1200,2000), + output_polygon=True + ): + self.cfg = cfg.clone() + self.model = build_detection_model(cfg) + self.model.eval() + self.device = torch.device(cfg.MODEL.DEVICE) + self.model.to(self.device) + self.min_image_size = min_image_size + + checkpointer = DetectronCheckpointer(cfg, self.model, save_dir=cfg.OUTPUT_DIR) + _ = checkpointer.load(cfg.MODEL.WEIGHT) + + self.transforms = self.build_transform() + self.cpu_device = torch.device("cpu") + self.confidence_threshold = confidence_threshold + self.output_polygon = output_polygon + + def build_transform(self): + """ + Creates a basic transformation that was used to train the models + """ + cfg = self.cfg + # we are loading images with OpenCV, so we don't need to convert them + # to BGR, they are already! So all we need to do is to normalize + # by 255 if we want to convert to BGR255 format, or flip the channels + # if we want it to be in RGB in [0-1] range. 
+ if cfg.INPUT.TO_BGR255: + to_bgr_transform = T.Lambda(lambda x: x * 255) + else: + to_bgr_transform = T.Lambda(lambda x: x[[2, 1, 0]]) + + normalize_transform = T.Normalize( + mean=cfg.INPUT.PIXEL_MEAN, std=cfg.INPUT.PIXEL_STD + ) + min_size = cfg.INPUT.MIN_SIZE_TEST + max_size = cfg.INPUT.MAX_SIZE_TEST + + transform = T.Compose( + [ + T.ToPILImage(), + Resize(min_size, max_size), + T.ToTensor(), + to_bgr_transform, + normalize_transform, + ] + ) + return transform + + def run_on_opencv_image(self, image): + """ + Arguments: + image (np.ndarray): an image as returned by OpenCV + Returns: + result_polygons (list): detection results + result_words (list): recognition results + """ + result_polygons = self.compute_prediction(image) + return result_polygons + + def contour_to_valid(self, cnt, image_shape): + """Convert rect to xys, i.e., eight points + The `image_shape` is used to to make sure all points return are valid, i.e., within image area + """ + # rect = cv2.minAreaRect(cnt) + if len(cnt.shape) != 3: + assert 1 < 0 + rect = cnt.reshape([cnt.shape[0], cnt.shape[2]]) + h, w = image_shape[0:2] + + def get_valid_x(x): + if x < 0: + return 0 + if x >= w: + return w - 1 + return x + + def get_valid_y(y): + if y < 0: + return 0 + if y >= h: + return h - 1 + return y + for i_xy, (x, y) in enumerate(rect): + x = get_valid_x(x) + y = get_valid_y(y) + rect[i_xy, :] = [x, y] + + points = np.reshape(rect, -1) + return points + + def _nms_y(self, heat, kernel=3): + pad = (kernel - 1) // 2 + hmax = nn.functional.max_pool2d( + heat, (1, kernel), stride=1, padding=(0, pad)) + keep = (hmax == heat).float() + return heat * keep + + def _nms_x(self, heat, kernel=3): + pad = (kernel - 1) // 2 + hmax = nn.functional.max_pool2d( + heat, (kernel, 1), stride=1, padding=(pad, 0)) + keep = (hmax == heat).float() + return heat * keep + + def CTW_order_lr(self, map_in): + line_out_l2r = [] + line_out_r2l = [] + + map_in = torch.tensor(map_in) + value, top = torch.topk(map_in, 2, dim=0) + value = value.numpy() + top = top.numpy() + top_th = np.where(value[1] > 0.1)[0] # L + # print(top_th) + if len(top_th) == 0: + return [] + top1 = np.sort(top, axis=0) + for i in range(len(top_th)): + line_out_l2r.append([top_th[i], top1[0][top_th[i]]]) + line_out_r2l.append([top_th[i], top1[1][top_th[i]]]) + line_out = line_out_l2r+line_out_r2l[::-1] + # print(line_out) + return line_out + + def CTW_order_bt(self, map_in): + line_out_t2b = [] + line_out_b2t = [] + + map_in = torch.tensor(map_in) + value, top = torch.topk(map_in, 2, dim=1) + value = value.numpy() + top = top.numpy() + top_th = np.where(value[:, 1] > 0.1)[0] # H + if len(top_th) == 0: + return [] + top1 = np.sort(top, axis=1) + for i in range(len(top_th)): + line_out_b2t.append([top1[top_th[i]][0], top_th[i]]) + line_out_t2b.append([top1[top_th[i]][1], top_th[i]]) + line_out = line_out_b2t[::-1] + line_out_t2b + # print(line_out) + return line_out + + def boundary_to_mask_ic(self, bo_x, bo_y): + + # NMS Hmap and Vmap + Vmap = self._nms_x(bo_x, kernel=5) + Hmap = self._nms_y(bo_y, kernel=3) + Vmap = Vmap[0] + Hmap = Hmap[0] + ploys_Alfa_x = Vmap.clone().numpy() + ploys_Alfa_y = Hmap.clone().numpy() + + # Threshold Hmap and Vmap + thresh = 0.5 + ploys_Alfa_x[ploys_Alfa_x < thresh] = 0 + ploys_Alfa_x[ploys_Alfa_x >= thresh] = 1 + ploys_Alfa_y[ploys_Alfa_y < thresh] = 0 + ploys_Alfa_y[ploys_Alfa_y >= thresh] = 1 + # Output points with strong texture inforamtion in both maps + ploys_Alfa = ploys_Alfa_x + ploys_Alfa_y + ploys_Alfa[ploys_Alfa < 2] = 0 + 
ploys_Alfa[ploys_Alfa == 2] = 1 + img_draw = np.zeros([ploys_Alfa_y.shape[-1], ploys_Alfa_y.shape[-1]], dtype=np.uint8) + + # calculate polygon by Alpha-Shape Algorithm + if ploys_Alfa.sum() == 0: + return img_draw + ploys_Alfa_inds = np.argwhere(ploys_Alfa == 1) + zero_detect_x = ploys_Alfa_inds[:, 0] - ploys_Alfa_inds[0, 0] + zero_detect_y = ploys_Alfa_inds[:, 1] - ploys_Alfa_inds[0, 1] + if np.where(zero_detect_x != 0)[0].shape[0] == 0 or np.where(zero_detect_y != 0)[0].shape[0] == 0 or \ + ploys_Alfa_inds.shape[0] < 4: + draw_line = ploys_Alfa_inds[np.newaxis, np.newaxis, :, :] + cv2.fillPoly(img_draw, draw_line, 1) + return img_draw + ploys_Alfa_inds = ploys_Alfa_inds.tolist() + ploys_Alfa_inds = [tuple(ploys_Alfa_ind) for ploys_Alfa_ind in ploys_Alfa_inds] + lines = getAlfaShapes(ploys_Alfa_inds, alfas=[1]) + draw_line = np.array(lines) + if len(draw_line.shape) == 4: + if draw_line.shape[1] == 1: + draw_line[0, 0, :, :] = draw_line[0, 0, :, ::-1] + cv2.fillPoly(img_draw, draw_line, 1) + else: + i_draw = 0 + for draw_l in draw_line[0]: + img_draw_new = np.zeros([28, 28], dtype=np.uint8) + draw_l = draw_l[np.newaxis, np.newaxis, :, :] + cv2.fillPoly(img_draw, np.int32(draw_l), 1) + cv2.fillPoly(img_draw_new, np.int32(draw_l), 1) + i_draw += 1 + + else: + for i, line in enumerate(lines[0]): + draw_line = np.array(line) + draw_line = draw_line[np.newaxis, np.newaxis, :, :] + draw_line[0, 0, :, :] = draw_line[0, 0, :, ::-1] + cv2.fillPoly(img_draw, draw_line, 1) + return img_draw + + def boundary_to_mask_ctw(self, bo_x, bo_y, p_temp_box): + w_half = (p_temp_box[2] - p_temp_box[0]) * .5 + h_half = (p_temp_box[3] - p_temp_box[1]) * .5 + thresh_total = 0.5 + + if w_half >= h_half: + # point re-scoring + bo_x = self._nms_x(bo_x, kernel=9) + bo_x = bo_x[0] + bo_y = bo_y[0] + ploys_Alfa_x = bo_x.clone().numpy() + ploys_Alfa_y = bo_y.clone().numpy() + thresh_x = thresh_total + thresh_y = thresh_total + ploys_Alfa_x_1 = bo_x.clone().numpy() + ploys_Alfa_y_1 = bo_y.clone().numpy() + ploys_Alfa__1 = ploys_Alfa_x_1 + ploys_Alfa_y_1 + ploys_Alfa_x[ploys_Alfa_x < thresh_x] = 0 + ploys_Alfa_x[ploys_Alfa_x >= thresh_x] = 1 + ploys_Alfa_y[ploys_Alfa_y < thresh_y] = 0 + ploys_Alfa_y[ploys_Alfa_y >= thresh_y] = 1 + ploys_Alfa = ploys_Alfa_x + ploys_Alfa_y + ploys_Alfa[ploys_Alfa < 2] = 0 + ploys_Alfa[ploys_Alfa == 2] = 1 + ploys_Alfa *= ploys_Alfa__1 + # rebuild text region from contour points + img_draw = np.zeros([ploys_Alfa_y.shape[-1], ploys_Alfa_y.shape[-1]], dtype=np.uint8) + if ploys_Alfa.sum() == 0: + return img_draw + lines = self.CTW_order_lr(ploys_Alfa) + else: + bo_y = self._nms_y(bo_y,kernel=9) + bo_x = bo_x[0] + bo_y = bo_y[0] + ploys_Alfa_x = bo_x.clone().numpy() + ploys_Alfa_y = bo_y.clone().numpy() + thresh_x = thresh_total + thresh_y = thresh_total + ploys_Alfa_x_1 = bo_x.clone().numpy() + ploys_Alfa_y_1 = bo_y.clone().numpy() + ploys_Alfa__1 = ploys_Alfa_x_1 + ploys_Alfa_y_1 + ploys_Alfa_x[ploys_Alfa_x < thresh_x] = 0 + ploys_Alfa_x[ploys_Alfa_x >= thresh_x] = 1 + ploys_Alfa_y[ploys_Alfa_y < thresh_y] = 0 + ploys_Alfa_y[ploys_Alfa_y >= thresh_y] = 1 + ploys_Alfa = ploys_Alfa_x + ploys_Alfa_y + ploys_Alfa[ploys_Alfa < 2] = 0 + ploys_Alfa[ploys_Alfa == 2] = 1 + ploys_Alfa *= ploys_Alfa__1 + img_draw = np.zeros([ploys_Alfa_y.shape[-1], ploys_Alfa_y.shape[-1]], dtype=np.uint8) + if ploys_Alfa.sum() == 0: + return img_draw + lines = self.CTW_order_bt(ploys_Alfa) + if len(lines) <=10: + return img_draw + draw_line = np.array(lines) + draw_line = draw_line[np.newaxis, np.newaxis, :, :] + 
cv2.fillPoly(img_draw, draw_line, 1) + img_draw = img_draw.astype(np.uint8) + kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (5, 5)) + img_draw = cv2.morphologyEx(img_draw, cv2.MORPH_CLOSE, kernel) + return img_draw + + def contour_to_xys(self, cnt, image_shape): + """Convert rect to xys, i.e., eight points. + The `image_shape` is used to make sure all returned points are valid, i.e., within the image area + """ + rect = cv2.minAreaRect(cnt) + h, w = image_shape[0:2] + + def get_valid_x(x): + if x < 0: + return 0 + if x >= w: + return w - 1 + return x + + def get_valid_y(y): + if y < 0: + return 0 + if y >= h: + return h - 1 + return y + + points = cv2.boxPoints(rect) + points = np.int0(points) + for i_xy, (x, y) in enumerate(points): + x = get_valid_x(x) + y = get_valid_y(y) + points[i_xy, :] = [x, y] + points = np.reshape(points, -1) + return points + + def mask_to_roRect(self, mask, img_shape): + ## convert mask into rotated rect + e = mask[0, :, :] + _, countours, hier = cv2.findContours(e.clone().numpy(), cv2.RETR_CCOMP, cv2.CHAIN_APPROX_NONE) # Aarlog + if len(countours) == 0: + return np.zeros((1, 8)) + t_c = countours[0].copy() + quad = self.contour_to_xys(t_c, img_shape) + return quad + + def mask_to_contours(self, mask, img_shape): + e = mask[0, :, :] + + _, countours, hier = cv2.findContours(e.clone().numpy(), cv2.RETR_CCOMP, cv2.CHAIN_APPROX_NONE) # Aarlog + + if len(countours) == 0: + return np.zeros((1, 8)) + t_c = countours[0].copy() + quad = self.contour_to_valid(t_c, img_shape) + return quad + + def py_cpu_pnms(self, dets, scores, thresh): + pts = [] + for det in dets: + pts.append([[det[i][0], det[i][1]] for i in range(len(det))]) + order = scores.argsort()[::-1] + areas = np.zeros(scores.shape) + inter_areas = np.zeros((scores.shape[0], scores.shape[0])) + for il in range(len(pts)): + poly = Polygon(pts[il]) + areas[il] = poly.area + for jl in range(il, len(pts)): + polyj = Polygon(pts[jl]) + try: + # intersection of invalid polygons can fail; treat such pairs as non-overlapping + inter_area = poly.intersection(polyj).area + except Exception: + print(poly, polyj) + inter_area = 0.0 + inter_areas[il][jl] = inter_area + inter_areas[jl][il] = inter_area + + keep = [] + while order.size > 0: + i = order[0] + keep.append(i) + ovr = inter_areas[i][order[1:]] / (areas[i] + areas[order[1:]] - inter_areas[i][order[1:]]) + inds = np.where(ovr <= thresh)[0] + order = order[inds + 1] + return keep + + def esd_pnms(self, esd, pnms_thresh): + scores = [] + dets = [] + for ele in esd: + score = ele['score'] + quad = ele['seg_rorect'] + # det = np.array([[quad[0][0], quad[0][1]], [quad[1][0], quad[1][1]],[quad[2][0], quad[2][1]],[quad[3][0], quad[3][1]]]) + det = np.array([[quad[0], quad[1]], [quad[2], quad[3]], [quad[4], quad[5]], [quad[6], quad[7]]]) + scores.append(score) + dets.append(det) + scores = np.array(scores) + dets = np.array(dets) + keep = self.py_cpu_pnms(dets, scores, pnms_thresh) + return keep + + def compute_prediction(self, original_image): + # apply pre-processing to image + image = self.transforms(original_image) + # convert to an ImageList, padded so that it is divisible by + # cfg.DATALOADER.SIZE_DIVISIBILITY + image_list = to_image_list(image, self.cfg.DATALOADER.SIZE_DIVISIBILITY) + image_list = image_list.to(self.device) + # compute predictions + with torch.no_grad(): + output = self.model(image_list) + prediction = [o.to(self.cpu_device) for o in output][0] + #global_predictions = predictions[0] + #char_predictions = predictions[1] + #char_mask = char_predictions['char_mask'] + #char_boxes = char_predictions['boxes'] + #words, rec_scores =
self.process_char_mask(char_mask, char_boxes) + #seq_words = char_predictions['seq_outputs'] + #seq_scores = char_predictions['seq_scores'] + + # reshape prediction (a BoxList) into the original image size + image_height, image_width = original_image.shape[:-1] + prediction = prediction.resize((image_width, image_height)) + if len(prediction) == 0: + return + prediction = prediction.convert("xywh") + boxes = prediction.bbox.tolist() + scores = prediction.get_field("scores").tolist() + masks_x = prediction.get_field("mask_x") + masks_y = prediction.get_field("mask_y") + #masks = [self.boundary_to_mask_ic(mask_x, mask_y) for + # mask_x, mask_y in zip(masks_x, masks_y)] + masks = [self.boundary_to_mask_ctw(mask_x, mask_y, p_temp) for + mask_x, mask_y, p_temp in zip(masks_x, masks_y, prediction.bbox)] + masks = torch.from_numpy(np.array(masks)[:, np.newaxis, :, :]) + # Masker is necessary only if masks haven't been already resized. + masker = Masker(threshold=0.5, padding=1) + if list(masks.shape[-2:]) != [image_height, image_width]: + masks = masker(masks.expand(1, -1, -1, -1, -1), prediction) + masks = masks[0] + + ''' + rects = [self.mask_to_roRect(mask, [image_height, image_width]) for mask in masks] + + esd = [] + for k, rect in enumerate(rects): + if rect.all() == 0: + continue + else: + esd.append( + { + "seg_rorect": rect.tolist(), + "score": scores[k], + } + ) + + if cfg.PROCESS.PNMS: + pnms_thresh = cfg.PROCESS.NMS_THRESH + keep = self.esd_pnms(esd, pnms_thresh) + im_write = cv2.imread('./demo/1.jpg')[:, :, ::-1] + for i in keep: + box = esd[i] + # print(box) + # assert 1<0 + box = np.array(box['seg_rorect']) + box = np.around(box).astype(np.int32) + cv2.polylines(im_write[:, :, ::-1], [box.astype(np.int32).reshape((-1, 1, 2))], True, + color=(0, 255, 0), thickness=2) # 0,255,255 y 0,255,0 g + cv2.imwrite('./demo/example_results.jpg', im_write[:, :, ::-1]) + + ''' + contours = [self.mask_to_contours(mask, [image_height, image_width]) for mask in masks] + ''' + im_write = original_image[:, :, ::-1] + for box in contours: + box = np.array(box) + box = np.around(box).astype(np.int32) + cv2.polylines(im_write[:, :, ::-1], [box.astype(np.int32).reshape((-1, 1, 2))], True, color=(0, 255, 0), thickness=2) # 0,255,255 y 0,255,0 g + cv2.imwrite('./demo/example_results.jpg', im_write[:, :, ::-1]) + ''' + + return contours, np.array(masks.repeat(1,3,1,1)).astype(np.bool_).transpose(0,2,3,1) + + def process_char_mask(self, char_masks, boxes, threshold=192): + texts, rec_scores = [], [] + for index in range(char_masks.shape[0]): + box = list(boxes[index]) + box = list(map(int, box)) + text, rec_score, _, _ = getstr_grid(char_masks[index,:,:,:].copy(), box, threshold=threshold) + texts.append(text) + rec_scores.append(rec_score) + return texts, rec_scores + + def mask2polygon(self, mask, box, im_size, threshold=0.5, output_polygon=True): + # mask 32*128 + image_width, image_height = im_size[1], im_size[0] + box_h = box[3] - box[1] + box_w = box[2] - box[0] + cls_polys = (mask*255).astype(np.uint8) + poly_map = np.array(Image.fromarray(cls_polys).resize((box_w, box_h))) + poly_map = poly_map.astype(np.float32) / 255 + poly_map=cv2.GaussianBlur(poly_map,(3,3),sigmaX=3) + ret, poly_map = cv2.threshold(poly_map,0.5,1,cv2.THRESH_BINARY) + if output_polygon: + SE1=cv2.getStructuringElement(cv2.MORPH_RECT,(3,3)) + poly_map = cv2.erode(poly_map,SE1) + poly_map = cv2.dilate(poly_map,SE1); + poly_map = cv2.morphologyEx(poly_map,cv2.MORPH_CLOSE,SE1) + try: + _, contours, _ = cv2.findContours((poly_map * 
255).astype(np.uint8), cv2.RETR_LIST, cv2.CHAIN_APPROX_NONE) + except: + contours, _ = cv2.findContours((poly_map * 255).astype(np.uint8), cv2.RETR_LIST, cv2.CHAIN_APPROX_NONE) + if len(contours)==0: + print(contours) + print(len(contours)) + return None + max_area=0 + max_cnt = contours[0] + for cnt in contours: + area=cv2.contourArea(cnt) + if area > max_area: + max_area = area + max_cnt = cnt + perimeter = cv2.arcLength(max_cnt,True) + epsilon = 0.01*cv2.arcLength(max_cnt,True) + approx = cv2.approxPolyDP(max_cnt,epsilon,True) + pts = approx.reshape((-1,2)) + pts[:,0] = pts[:,0] + box[0] + pts[:,1] = pts[:,1] + box[1] + polygon = list(pts.reshape((-1,))) + polygon = list(map(int, polygon)) + if len(polygon)<6: + return None + else: + SE1=cv2.getStructuringElement(cv2.MORPH_RECT,(3,3)) + poly_map = cv2.erode(poly_map,SE1) + poly_map = cv2.dilate(poly_map,SE1); + poly_map = cv2.morphologyEx(poly_map,cv2.MORPH_CLOSE,SE1) + idy,idx=np.where(poly_map == 1) + xy=np.vstack((idx,idy)) + xy=np.transpose(xy) + hull = cv2.convexHull(xy, clockwise=True) + #reverse order of points. + if hull is None: + return None + hull=hull[::-1] + #find minimum area bounding box. + rect = cv2.minAreaRect(hull) + corners = cv2.boxPoints(rect) + corners = np.array(corners, dtype="int") + pts = get_tight_rect(corners, box[0], box[1], image_height, image_width, 1) + polygon = [x * 1.0 for x in pts] + polygon = list(map(int, polygon)) + return polygon + + def visualization(self, image, polygons, masks): + green = np.ones(image.shape).astype(np.uint8) + green[...,0] = 0 + green[...,1] = 255 + green[...,2] = 0 + for mask in masks: + image[mask] = image[mask] * 0.5 + green[mask] * 0.5 + ''' + for polygon in polygons: + pts = np.array(polygon, np.int32) + pts = pts.reshape((-1,1,2)) + xmin = min(pts[:,0,0]) + ymin = min(pts[:,0,1]) + cv2.polylines(image,[pts],True,(0,0,255)) + #cv2.putText(image, word, (xmin, ymin), cv2.FONT_HERSHEY_COMPLEX, 1, (0,0,255), 2) + ''' + return image + + +def main(args): + # update the config options with the config file + cfg.merge_from_file(args.config_file) + # manual override some options + # cfg.merge_from_list(["MODEL.DEVICE", "cpu"]) + + text_demo = TextDemo( + cfg, + min_image_size=(1200,2000), + confidence_threshold=0.85, + output_polygon=True + ) + # load image and then run prediction + + image = cv2.imread(args.image_path) + result_polygons, result_masks = text_demo.run_on_opencv_image(image) + image = text_demo.visualization(image, result_polygons, result_masks) + cv2.imwrite(args.visu_path, image) + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description='parameters for demo') + parser.add_argument("--config-file", type=str, default='./configs/ctw/r50_baseline.yaml') + parser.add_argument("--image_path", type=str, default='./det_visual/1223.jpg') + parser.add_argument("--visu_path", type=str, default='./demo/example_results.jpg') + args = parser.parse_args() + main(args) diff --git a/tools/test_net.py b/tools/test_net.py new file mode 100644 index 0000000000000000000000000000000000000000..e96ab47825679ab40718dc390a1688a91a5eb629 --- /dev/null +++ b/tools/test_net.py @@ -0,0 +1,95 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 
+# Set up custom environment before nearly anything else is imported +# NOTE: this should be the first import (no not reorder) +from maskrcnn_benchmark.utils.env import setup_environment # noqa F401 isort:skip + +import argparse +import os + +import torch +from maskrcnn_benchmark.config import cfg +from maskrcnn_benchmark.data import make_data_loader +from maskrcnn_benchmark.engine.inference import inference +from maskrcnn_benchmark.modeling.detector import build_detection_model +from maskrcnn_benchmark.utils.checkpoint import DetectronCheckpointer +from maskrcnn_benchmark.utils.collect_env import collect_env_info +from maskrcnn_benchmark.utils.comm import synchronize, get_rank +from maskrcnn_benchmark.utils.logger import setup_logger +from maskrcnn_benchmark.utils.miscellaneous import mkdir + + +def main(): + parser = argparse.ArgumentParser(description="PyTorch Object Detection Inference") + parser.add_argument( + "--config-file", + default="/private/home/fmassa/github/detectron.pytorch_v2/configs/e2e_faster_rcnn_R_50_C4_1x_caffe2.yaml", + metavar="FILE", + help="path to config file", + ) + parser.add_argument("--local_rank", type=int, default=0) + parser.add_argument( + "opts", + help="Modify config options using the command-line", + default=None, + nargs=argparse.REMAINDER, + ) + + args = parser.parse_args() + + num_gpus = int(os.environ["WORLD_SIZE"]) if "WORLD_SIZE" in os.environ else 1 + distributed = num_gpus > 1 + + if distributed: + torch.cuda.set_device(args.local_rank) + torch.distributed.init_process_group( + backend="nccl", init_method="env://" + ) + synchronize() + + cfg.merge_from_file(args.config_file) + cfg.merge_from_list(args.opts) + cfg.freeze() + + save_dir = "" + logger = setup_logger("maskrcnn_benchmark", save_dir, get_rank()) + logger.info("Using {} GPUs".format(num_gpus)) + logger.info(cfg) + + logger.info("Collecting env info (might take some time)") + logger.info("\n" + collect_env_info()) + + model = build_detection_model(cfg) + model.to(cfg.MODEL.DEVICE) + + output_dir = cfg.OUTPUT_DIR + checkpointer = DetectronCheckpointer(cfg, model, save_dir=output_dir) + _ = checkpointer.load(cfg.MODEL.WEIGHT) + + iou_types = ("bbox",) + if cfg.MODEL.BOUNDARY_ON: + iou_types = iou_types + ("bo",) + output_folders = [None] * len(cfg.DATASETS.TEST) + dataset_names = cfg.DATASETS.TEST + if cfg.OUTPUT_DIR: + for idx, dataset_name in enumerate(dataset_names): + output_folder = os.path.join(cfg.OUTPUT_DIR, "inference", dataset_name) + mkdir(output_folder) + output_folders[idx] = output_folder + data_loaders_val = make_data_loader(cfg, is_train=False, is_distributed=distributed) + for output_folder, dataset_name, data_loader_val in zip(output_folders, dataset_names, data_loaders_val): + inference( + model, + data_loader_val, + dataset_name=dataset_name, + iou_types=iou_types, + box_only=False if cfg.MODEL.FCOS_ON or cfg.MODEL.RETINANET_ON else cfg.MODEL.RPN_ONLY, + device=cfg.MODEL.DEVICE, + expected_results=cfg.TEST.EXPECTED_RESULTS, + expected_results_sigma_tol=cfg.TEST.EXPECTED_RESULTS_SIGMA_TOL, + output_folder=output_folder, + ) + synchronize() + + +if __name__ == "__main__": + main() diff --git a/tools/train_net.py b/tools/train_net.py new file mode 100644 index 0000000000000000000000000000000000000000..9b62001757b1d18b8cb74581af27dbded97be731 --- /dev/null +++ b/tools/train_net.py @@ -0,0 +1,174 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 
+r""" +Basic training script for PyTorch +""" + +# Set up custom environment before nearly anything else is imported +# NOTE: this should be the first import (no not reorder) +from maskrcnn_benchmark.utils.env import setup_environment # noqa F401 isort:skip + +import argparse +import os + +import torch +from maskrcnn_benchmark.config import cfg +from maskrcnn_benchmark.data import make_data_loader +from maskrcnn_benchmark.solver import make_lr_scheduler +from maskrcnn_benchmark.solver import make_optimizer +from maskrcnn_benchmark.engine.inference import inference +from maskrcnn_benchmark.engine.trainer import do_train +from maskrcnn_benchmark.modeling.detector import build_detection_model +from maskrcnn_benchmark.utils.checkpoint import DetectronCheckpointer +from maskrcnn_benchmark.utils.collect_env import collect_env_info +from maskrcnn_benchmark.utils.comm import synchronize, get_rank +from maskrcnn_benchmark.utils.imports import import_file +from maskrcnn_benchmark.utils.logger import setup_logger +from maskrcnn_benchmark.utils.miscellaneous import mkdir + + +def train(cfg, local_rank, distributed): + model = build_detection_model(cfg) + device = torch.device(cfg.MODEL.DEVICE) + model.to(device) + + optimizer = make_optimizer(cfg, model) + scheduler = make_lr_scheduler(cfg, optimizer) + + if distributed: + model = torch.nn.parallel.DistributedDataParallel( + model, device_ids=[local_rank], output_device=local_rank, + # this should be removed if we update BatchNorm stats + broadcast_buffers=False, + ) + + arguments = {} + arguments["iteration"] = 0 + + output_dir = cfg.OUTPUT_DIR + + save_to_disk = get_rank() == 0 + checkpointer = DetectronCheckpointer( + cfg, model, optimizer, scheduler, output_dir, save_to_disk + ) + extra_checkpoint_data = checkpointer.load(cfg.MODEL.WEIGHT) + arguments.update(extra_checkpoint_data) + + data_loader = make_data_loader( + cfg, + is_train=True, + is_distributed=distributed, + start_iter=arguments["iteration"], + ) + + checkpoint_period = cfg.SOLVER.CHECKPOINT_PERIOD + + do_train( + model, + data_loader, + optimizer, + scheduler, + checkpointer, + device, + checkpoint_period, + arguments, + ) + + return model + + +def run_test(cfg, model, distributed): + if distributed: + model = model.module + torch.cuda.empty_cache() # TODO check if it helps + iou_types = ("bbox",) + if cfg.MODEL.MASK_ON: + iou_types = iou_types + ("segm",) + if cfg.MODEL.KEYPOINT_ON: + iou_types = iou_types + ("keypoints",) + output_folders = [None] * len(cfg.DATASETS.TEST) + dataset_names = cfg.DATASETS.TEST + if cfg.OUTPUT_DIR: + for idx, dataset_name in enumerate(dataset_names): + output_folder = os.path.join(cfg.OUTPUT_DIR, "inference", dataset_name) + mkdir(output_folder) + output_folders[idx] = output_folder + data_loaders_val = make_data_loader(cfg, is_train=False, is_distributed=distributed) + for output_folder, dataset_name, data_loader_val in zip(output_folders, dataset_names, data_loaders_val): + inference( + model, + data_loader_val, + dataset_name=dataset_name, + iou_types=iou_types, + box_only=False if cfg.MODEL.FCOS_ON or cfg.MODEL.RETINANET_ON else cfg.MODEL.RPN_ONLY, + device=cfg.MODEL.DEVICE, + expected_results=cfg.TEST.EXPECTED_RESULTS, + expected_results_sigma_tol=cfg.TEST.EXPECTED_RESULTS_SIGMA_TOL, + output_folder=output_folder, + ) + synchronize() + + +def main(): + parser = argparse.ArgumentParser(description="PyTorch Object Detection Training") + parser.add_argument( + "--config-file", + default="", + metavar="FILE", + help="path to config file", + type=str, 
+ ) + parser.add_argument("--local_rank", type=int, default=0) + parser.add_argument( + "--skip-test", + dest="skip_test", + help="Do not test the final model", + action="store_true", + ) + parser.add_argument( + "opts", + help="Modify config options using the command-line", + default=None, + nargs=argparse.REMAINDER, + ) + + args = parser.parse_args() + + num_gpus = int(os.environ["WORLD_SIZE"]) if "WORLD_SIZE" in os.environ else 1 + args.distributed = num_gpus > 1 + + if args.distributed: + torch.cuda.set_device(args.local_rank) + torch.distributed.init_process_group( + backend="nccl", init_method="env://" + ) + synchronize() + + cfg.merge_from_file(args.config_file) + cfg.merge_from_list(args.opts) + cfg.freeze() + + output_dir = cfg.OUTPUT_DIR + if output_dir: + mkdir(output_dir) + + logger = setup_logger("maskrcnn_benchmark", output_dir, get_rank()) + logger.info("Using {} GPUs".format(num_gpus)) + logger.info(args) + + logger.info("Collecting env info (might take some time)") + logger.info("\n" + collect_env_info()) + + logger.info("Loaded configuration file {}".format(args.config_file)) + with open(args.config_file, "r") as cf: + config_str = "\n" + cf.read() + logger.info(config_str) + logger.info("Running with config:\n{}".format(cfg)) + + model = train(cfg, args.local_rank, args.distributed) + + if not args.skip_test: + run_test(cfg, model, args.distributed) + + +if __name__ == "__main__": + main() diff --git a/train_contour.sh b/train_contour.sh new file mode 100644 index 0000000000000000000000000000000000000000..c96e18b37e8ed6ddc38c1df123f9ed436f554278 --- /dev/null +++ b/train_contour.sh @@ -0,0 +1,6 @@ +# export NCCL_P2P_DISABLE=1 +export NGPUS=1 +CUDA_VISIBLE_DEVICES=1 python -m torch.distributed.launch --nproc_per_node=$NGPUS tools/train_net.py \ + --config-file "configs/ic/r50_baseline.yaml" \ + --skip-test +
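Usage note: a minimal single-image inference sketch, assuming a trained checkpoint is available (either as the latest checkpoint under OUTPUT_DIR or referenced by MODEL.WEIGHT in the config). tools/demo.py loads the config, runs detection on one image, and writes the visualized polygons to the path given by --visu_path; the sample image and output path below are the ones bundled under demo/:

python tools/demo.py \
    --config-file configs/ctw/r50_baseline.yaml \
    --image_path demo/1.jpg \
    --visu_path demo/example_results.jpg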