Cyril666 commited on
Commit
6250360
·
1 Parent(s): 6a7a411

First model version

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. LICENSE +25 -0
  2. configs/ctw/r50_baseline.yaml +70 -0
  3. configs/ic/r50_baseline.yaml +75 -0
  4. demo/1.jpg +0 -0
  5. demo/2.jpg +0 -0
  6. demo/example1.jpg +0 -0
  7. demo/example_results.jpg +0 -0
  8. maskrcnn_benchmark/__init__.py +1 -0
  9. maskrcnn_benchmark/__pycache__/__init__.cpython-37.pyc +0 -0
  10. maskrcnn_benchmark/config/__init__.py +2 -0
  11. maskrcnn_benchmark/config/__pycache__/__init__.cpython-37.pyc +0 -0
  12. maskrcnn_benchmark/config/__pycache__/defaults.cpython-37.pyc +0 -0
  13. maskrcnn_benchmark/config/__pycache__/paths_catalog.cpython-37.pyc +0 -0
  14. maskrcnn_benchmark/config/defaults.py +471 -0
  15. maskrcnn_benchmark/config/paths_catalog.py +120 -0
  16. maskrcnn_benchmark/csrc/ROIAlign.h +46 -0
  17. maskrcnn_benchmark/csrc/ROIPool.h +48 -0
  18. maskrcnn_benchmark/csrc/SigmoidFocalLoss.h +41 -0
  19. maskrcnn_benchmark/csrc/cpu/ROIAlign_cpu.cpp +257 -0
  20. maskrcnn_benchmark/csrc/cpu/dcn_v2_cpu.cpp +74 -0
  21. maskrcnn_benchmark/csrc/cpu/nms_cpu.cpp +75 -0
  22. maskrcnn_benchmark/csrc/cpu/vision.h +73 -0
  23. maskrcnn_benchmark/csrc/cuda/ROIAlign_cuda.cu +346 -0
  24. maskrcnn_benchmark/csrc/cuda/ROIPool_cuda.cu +202 -0
  25. maskrcnn_benchmark/csrc/cuda/SigmoidFocalLoss_cuda.cu +188 -0
  26. maskrcnn_benchmark/csrc/cuda/dcn_v2_cuda.cu +335 -0
  27. maskrcnn_benchmark/csrc/cuda/dcn_v2_im2col_cuda.cu +402 -0
  28. maskrcnn_benchmark/csrc/cuda/dcn_v2_im2col_cuda.h +101 -0
  29. maskrcnn_benchmark/csrc/cuda/dcn_v2_psroi_pooling_cuda.cu +419 -0
  30. maskrcnn_benchmark/csrc/cuda/nms.cu +131 -0
  31. maskrcnn_benchmark/csrc/cuda/vision.h +121 -0
  32. maskrcnn_benchmark/csrc/dcn_v2.h +145 -0
  33. maskrcnn_benchmark/csrc/nms.h +28 -0
  34. maskrcnn_benchmark/csrc/vision.cpp +21 -0
  35. maskrcnn_benchmark/data/README.md +90 -0
  36. maskrcnn_benchmark/data/__init__.py +2 -0
  37. maskrcnn_benchmark/data/__pycache__/__init__.cpython-37.pyc +0 -0
  38. maskrcnn_benchmark/data/__pycache__/build.cpython-37.pyc +0 -0
  39. maskrcnn_benchmark/data/__pycache__/collate_batch.cpython-37.pyc +0 -0
  40. maskrcnn_benchmark/data/build.py +176 -0
  41. maskrcnn_benchmark/data/collate_batch.py +20 -0
  42. maskrcnn_benchmark/data/datasets/__init__.py +8 -0
  43. maskrcnn_benchmark/data/datasets/__pycache__/__init__.cpython-37.pyc +0 -0
  44. maskrcnn_benchmark/data/datasets/__pycache__/char_dataset.cpython-37.pyc +0 -0
  45. maskrcnn_benchmark/data/datasets/__pycache__/coco.cpython-37.pyc +0 -0
  46. maskrcnn_benchmark/data/datasets/__pycache__/concat_dataset.cpython-37.pyc +0 -0
  47. maskrcnn_benchmark/data/datasets/__pycache__/voc.cpython-37.pyc +0 -0
  48. maskrcnn_benchmark/data/datasets/__pycache__/word_dataset.cpython-37.pyc +0 -0
  49. maskrcnn_benchmark/data/datasets/coco.py +101 -0
  50. maskrcnn_benchmark/data/datasets/concat_dataset.py +23 -0
LICENSE ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ FCOS for non-commercial purposes
2
+
3
+ Copyright (c) 2019 the authors
4
+ All rights reserved.
5
+
6
+ Redistribution and use in source and binary forms, with or without
7
+ modification, are permitted provided that the following conditions are met:
8
+
9
+ * Redistributions of source code must retain the above copyright notice, this
10
+ list of conditions and the following disclaimer.
11
+
12
+ * Redistributions in binary form must reproduce the above copyright notice,
13
+ this list of conditions and the following disclaimer in the documentation
14
+ and/or other materials provided with the distribution.
15
+
16
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17
+ AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18
+ IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
19
+ DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
20
+ FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21
+ DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
22
+ SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
23
+ CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
24
+ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
25
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
configs/ctw/r50_baseline.yaml ADDED
@@ -0,0 +1,70 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ OUTPUT_DIR: "./output/ctw"
2
+ MODEL:
3
+ META_ARCHITECTURE: "GeneralizedRCNN"
4
+ WEIGHT: "catalog://ImageNetPretrained/MSRA/R-50"
5
+ BACKBONE:
6
+ CONV_BODY: "R-50-FPN"
7
+ RESNETS:
8
+ BACKBONE_OUT_CHANNELS: 256
9
+ RPN:
10
+ USE_FPN: True
11
+ ANCHOR_STRIDE: (4, 8, 16, 32, 64)
12
+ ASPECT_RATIOS: (0.25, 0.5, 1.0, 2.0, 4.0)
13
+ ROI_HEADS:
14
+ USE_FPN: True
15
+ SCORE_THRESH: 0.85
16
+ NMS: 0.3
17
+ ROI_BOX_HEAD:
18
+ DEFORMABLE_POOLING: False
19
+ POOLER_RESOLUTION: 7
20
+ POOLER_SCALES: (0.25, 0.125, 0.0625, 0.03125)
21
+ POOLER_SAMPLING_RATIO: 2
22
+ FEATURE_EXTRACTOR: "FPN2MLPFeatureExtractor"
23
+ PREDICTOR: "FPNPredictor"
24
+ NUM_CLASSES: 2
25
+ CLASS_WEIGHT: 1.0
26
+ ## Boundary
27
+ BOUNDARY_ON: True
28
+ ROI_BOUNDARY_HEAD:
29
+ DEFORMABLE_POOLING: True
30
+ FEATURE_EXTRACTOR: "BoundaryRCNNFPNFeatureExtractor"
31
+ POOLER_RESOLUTION: 14
32
+ POOLER_SCALES: (0.25, 0.125, 0.0625, 0.03125)
33
+ POOLER_SAMPLING_RATIO: 2
34
+ PREDICTOR: "BoundaryRCNNC4Predictor"
35
+ RESOLUTION: 48
36
+ SHARE_BOX_FEATURE_EXTRACTOR: False
37
+ BO_WEIGHT: 0.1
38
+ Loss_balance: 1.1
39
+
40
+ PROCESS:
41
+ PNMS: True
42
+ NMS_THRESH: 0.25
43
+ DATASETS:
44
+ TRAIN: ("CTW1500_train",)
45
+ TEST: ("CTW1500_test",)
46
+ Test_Visual: True
47
+ DATALOADER:
48
+ SIZE_DIVISIBILITY: 32
49
+ SOLVER:
50
+ BASE_LR: 0.0025
51
+ BIAS_LR_FACTOR: 2
52
+ WEIGHT_DECAY: 0.0001
53
+ STEPS: (30000, 40000)
54
+ MAX_ITER: 45000
55
+ IMS_PER_BATCH: 1
56
+ CHECKPOINT_PERIOD: 1000
57
+ INPUT:
58
+
59
+ MIN_SIZE_TRAIN: (400,600,720,1000,1200)
60
+ MAX_SIZE_TRAIN: 2000
61
+ MIN_SIZE_TEST: 720
62
+ MAX_SIZE_TEST: 1280
63
+ CROP_PROB_TRAIN: 1.0
64
+ ROTATE_PROB_TRAIN: 0.0
65
+ ROTATE_DEGREE: (0,30,60,90,210,150,180,210,240,270,300,330,360)
66
+
67
+ TEST:
68
+ IMS_PER_BATCH: 1
69
+
70
+
configs/ic/r50_baseline.yaml ADDED
@@ -0,0 +1,75 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ OUTPUT_DIR: "./output/ic15"
2
+ MODEL:
3
+ META_ARCHITECTURE: "GeneralizedRCNN"
4
+ WEIGHT: catalog://ImageNetPretrained/MSRA/R-50
5
+ BACKBONE:
6
+ CONV_BODY: "R-50-FPN"
7
+ RESNETS:
8
+ BACKBONE_OUT_CHANNELS: 256
9
+ RPN:
10
+ USE_FPN: True
11
+ ANCHOR_STRIDE: (4, 8, 16, 32, 64)
12
+ ASPECT_RATIOS: (0.25, 0.5, 1.0, 2.0, 4.0)
13
+ ROI_HEADS:
14
+ USE_FPN: True
15
+ SCORE_THRESH: 0.52 # ic15
16
+ NMS: 0.89
17
+ ROI_BOX_HEAD:
18
+ DEFORMABLE_POOLING: False
19
+ POOLER_RESOLUTION: 7
20
+ POOLER_SCALES: (0.25, 0.125, 0.0625, 0.03125)
21
+ POOLER_SAMPLING_RATIO: 2
22
+ FEATURE_EXTRACTOR: "FPN2MLPFeatureExtractor"
23
+ PREDICTOR: "FPNPredictor"
24
+ NUM_CLASSES: 2
25
+ CLASS_WEIGHT: 1.0
26
+ ## Boundary
27
+ BOUNDARY_ON: True
28
+ ROI_BOUNDARY_HEAD:
29
+ DEFORMABLE_POOLING: False
30
+ FEATURE_EXTRACTOR: "BoundaryRCNNFPNFeatureExtractor"
31
+ POOLER_RESOLUTION: 14
32
+ POOLER_SCALES: (0.25, 0.125, 0.0625, 0.03125)
33
+ POOLER_SAMPLING_RATIO: 2
34
+ PREDICTOR: "BoundaryRCNNC4Predictor"
35
+ RESOLUTION: 48
36
+ SHARE_BOX_FEATURE_EXTRACTOR: False
37
+ BO_WEIGHT: 0.1
38
+ Loss_balance: 1.0
39
+
40
+ PROCESS:
41
+ PNMS: True
42
+ NMS_THRESH: 0.25
43
+ DATASETS:
44
+ TRAIN: ("ic15_train",)
45
+ TEST: ("ic15_test",)
46
+ Test_Visual: True
47
+ DATALOADER:
48
+ SIZE_DIVISIBILITY: 32
49
+ SOLVER:
50
+ BASE_LR: 0.00025
51
+ BIAS_LR_FACTOR: 2
52
+ WEIGHT_DECAY: 0.0001
53
+ # STEPS: (120000, 160000)
54
+ STEPS: (5000, 10000) # fine-tune
55
+ # MAX_ITER: 180000
56
+ MAX_ITER: 190500 # fine-tune
57
+ IMS_PER_BATCH: 1
58
+ CHECKPOINT_PERIOD: 5000
59
+ INPUT:
60
+
61
+ MIN_SIZE_TRAIN: (400,600,720,1000,1200)
62
+ MAX_SIZE_TRAIN: 2000
63
+ MIN_SIZE_TEST: 1200
64
+ MAX_SIZE_TEST: 2000
65
+
66
+ CROP_PROB_TRAIN: 1.0
67
+ ROTATE_PROB_TRAIN: 0.3 # fine-tune
68
+ # ROTATE_PROB_TRAIN: 1.0
69
+ # ROTATE_DEGREE: (0,30,60,90,210,150,180,210,240,270,300,330,360)
70
+ ROTATE_DEGREE: (10,) # fine-tune
71
+
72
+ TEST:
73
+ IMS_PER_BATCH: 1
74
+
75
+
demo/1.jpg ADDED
demo/2.jpg ADDED
demo/example1.jpg ADDED
demo/example_results.jpg ADDED
maskrcnn_benchmark/__init__.py ADDED
@@ -0,0 +1 @@
 
 
1
+ # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
maskrcnn_benchmark/__pycache__/__init__.cpython-37.pyc ADDED
Binary file (145 Bytes). View file
 
maskrcnn_benchmark/config/__init__.py ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
2
+ from .defaults import _C as cfg
maskrcnn_benchmark/config/__pycache__/__init__.cpython-37.pyc ADDED
Binary file (192 Bytes). View file
 
maskrcnn_benchmark/config/__pycache__/defaults.cpython-37.pyc ADDED
Binary file (5.64 kB). View file
 
maskrcnn_benchmark/config/__pycache__/paths_catalog.cpython-37.pyc ADDED
Binary file (3.74 kB). View file
 
maskrcnn_benchmark/config/defaults.py ADDED
@@ -0,0 +1,471 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
2
+ import os
3
+
4
+ from yacs.config import CfgNode as CN
5
+
6
+
7
+ # -----------------------------------------------------------------------------
8
+ # Convention about Training / Test specific parameters
9
+ # -----------------------------------------------------------------------------
10
+ # Whenever an argument can be either used for training or for testing, the
11
+ # corresponding name will be post-fixed by a _TRAIN for a training parameter,
12
+ # or _TEST for a test-specific parameter.
13
+ # For example, the number of images during training will be
14
+ # IMAGES_PER_BATCH_TRAIN, while the number of images for testing will be
15
+ # IMAGES_PER_BATCH_TEST
16
+
17
+ # -----------------------------------------------------------------------------
18
+ # Config definition
19
+ # -----------------------------------------------------------------------------
20
+
21
+ _C = CN()
22
+
23
+ _C.MODEL = CN()
24
+ _C.MODEL.RPN_ONLY = False
25
+ _C.MODEL.MASK_ON = False
26
+ _C.MODEL.FCOS_ON = False
27
+ _C.MODEL.KE_ON = False
28
+ _C.MODEL.BOUNDARY_ON = False
29
+ _C.MODEL.MSR_ON = False
30
+ _C.MODEL.RETINANET_ON = False
31
+ _C.MODEL.KEYPOINT_ON = False
32
+ _C.MODEL.DEVICE = "cuda"
33
+ _C.MODEL.META_ARCHITECTURE = "GeneralizedRCNN"
34
+ _C.MODEL.CLS_AGNOSTIC_BBOX_REG = False
35
+
36
+ # If the WEIGHT starts with a catalog://, like :R-50, the code will look for
37
+ # the path in paths_catalog. Else, it will use it as the specified absolute
38
+ # path
39
+ _C.MODEL.WEIGHT = ""
40
+
41
+
42
+ # -----------------------------------------------------------------------------
43
+ # INPUT
44
+ # -----------------------------------------------------------------------------
45
+ _C.INPUT = CN()
46
+ # Size of the smallest side of the image during training
47
+ _C.INPUT.MIN_SIZE_TRAIN = (800,) # (800,)
48
+ # The range of the smallest side for multi-scale training
49
+ _C.INPUT.MIN_SIZE_RANGE_TRAIN = (-1, -1) # -1 means disabled and it will use MIN_SIZE_TRAIN
50
+ # Maximum size of the side of the image during training
51
+ _C.INPUT.MAX_SIZE_TRAIN = 1333
52
+ # Size of the smallest side of the image during testing
53
+ _C.INPUT.MIN_SIZE_TEST = 1000
54
+ # Maximum size of the side of the image during testing
55
+ _C.INPUT.MAX_SIZE_TEST = 1333
56
+ # Values to be used for image normalization
57
+ _C.INPUT.PIXEL_MEAN = [102.9801, 115.9465, 122.7717]
58
+ # Values to be used for image normalization
59
+ _C.INPUT.PIXEL_STD = [1., 1., 1.]
60
+ # Convert image to BGR format (for Caffe2 models), in range 0-255
61
+ _C.INPUT.TO_BGR255 = True
62
+ _C.INPUT.CROP_PROB_TRAIN = 1.0
63
+ _C.INPUT.ROTATE_PROB_TRAIN = 0.3
64
+ _C.INPUT.ROTATE_DEGREE = (0,15,-15,45,-45,90,-90)
65
+ # _C.INPUT.ROTATE_DEGREE = 15
66
+
67
+
68
+
69
+
70
+ # -----------------------------------------------------------------------------
71
+ # Dataset
72
+ # -----------------------------------------------------------------------------
73
+ _C.DATASETS = CN()
74
+ # List of the dataset names for training, as present in paths_catalog.py
75
+ _C.DATASETS.TRAIN = ()
76
+ # List of the dataset names for testing, as present in paths_catalog.py
77
+ _C.DATASETS.TEST = ()
78
+ _C.DATASETS.Test_Visual = False
79
+ # -----------------------------------------------------------------------------
80
+ # DataLoader
81
+ # -----------------------------------------------------------------------------
82
+ _C.DATALOADER = CN()
83
+ # Number of data loading threads
84
+ _C.DATALOADER.NUM_WORKERS = 4
85
+ # If > 0, this enforces that each collated batch should have a size divisible
86
+ # by SIZE_DIVISIBILITY
87
+ _C.DATALOADER.SIZE_DIVISIBILITY = 0
88
+ # If True, each batch should contain only images for which the aspect ratio
89
+ # is compatible. This groups portrait images together, and landscape images
90
+ # are not batched with portrait images.
91
+ _C.DATALOADER.ASPECT_RATIO_GROUPING = True
92
+
93
+
94
+ # ---------------------------------------------------------------------------- #
95
+ # Backbone options
96
+ # ---------------------------------------------------------------------------- #
97
+ _C.MODEL.BACKBONE = CN()
98
+
99
+ # The backbone conv body to use
100
+ # The string must match a function that is imported in modeling.model_builder
101
+ # (e.g., 'FPN.add_fpn_ResNet101_conv5_body' to specify a ResNet-101-FPN
102
+ # backbone)
103
+ _C.MODEL.BACKBONE.CONV_BODY = "R-50-C4"
104
+
105
+ # Add StopGrad at a specified stage so the bottom layers are frozen
106
+ _C.MODEL.BACKBONE.FREEZE_CONV_BODY_AT = 2
107
+ # GN for backbone
108
+
109
+ ##123123123
110
+ _C.MODEL.BACKBONE.USE_GN = False
111
+
112
+
113
+ # ---------------------------------------------------------------------------- #
114
+ # FPN options
115
+ # ---------------------------------------------------------------------------- #
116
+ _C.MODEL.FPN = CN()
117
+
118
+ # 123123123
119
+ _C.MODEL.FPN.USE_GN = False
120
+ _C.MODEL.FPN.USE_RELU = False
121
+
122
+ #############123123123
123
+ _C.MODEL.FPN.USE_DEFORMABLE = False
124
+
125
+
126
+ # ---------------------------------------------------------------------------- #
127
+ # Group Norm options
128
+ # ---------------------------------------------------------------------------- #
129
+ _C.MODEL.GROUP_NORM = CN()
130
+ # Number of dimensions per group in GroupNorm (-1 if using NUM_GROUPS)
131
+ _C.MODEL.GROUP_NORM.DIM_PER_GP = -1
132
+ # Number of groups in GroupNorm (-1 if using DIM_PER_GP)
133
+ _C.MODEL.GROUP_NORM.NUM_GROUPS = 32
134
+ # GroupNorm's small constant in the denominator
135
+ _C.MODEL.GROUP_NORM.EPSILON = 1e-5
136
+
137
+
138
+ # ---------------------------------------------------------------------------- #
139
+ # RPN options
140
+ # ---------------------------------------------------------------------------- #
141
+ _C.MODEL.RPN = CN()
142
+ _C.MODEL.RPN.USE_FPN = False
143
+ # Base RPN anchor sizes given in absolute pixels w.r.t. the scaled network input
144
+ _C.MODEL.RPN.ANCHOR_SIZES = (32, 64, 128, 256, 512)
145
+ # Stride of the feature map that RPN is attached.
146
+ # For FPN, number of strides should match number of scales
147
+ _C.MODEL.RPN.ANCHOR_STRIDE = (16,)
148
+ # RPN anchor aspect ratios
149
+ _C.MODEL.RPN.ASPECT_RATIOS = (0.5, 1.0, 2.0)
150
+ # Remove RPN anchors that go outside the image by RPN_STRADDLE_THRESH pixels
151
+ # Set to -1 or a large value, e.g. 100000, to disable pruning anchors
152
+ _C.MODEL.RPN.STRADDLE_THRESH = 0
153
+ # Minimum overlap required between an anchor and ground-truth box for the
154
+ # (anchor, gt box) pair to be a positive example (IoU >= FG_IOU_THRESHOLD
155
+ # ==> positive RPN example)
156
+ _C.MODEL.RPN.FG_IOU_THRESHOLD = 0.7
157
+ # Maximum overlap allowed between an anchor and ground-truth box for the
158
+ # (anchor, gt box) pair to be a negative examples (IoU < BG_IOU_THRESHOLD
159
+ # ==> negative RPN example)
160
+ _C.MODEL.RPN.BG_IOU_THRESHOLD = 0.3
161
+ # Total number of RPN examples per image
162
+ _C.MODEL.RPN.BATCH_SIZE_PER_IMAGE = 256
163
+ # Target fraction of foreground (positive) examples per RPN minibatch
164
+ _C.MODEL.RPN.POSITIVE_FRACTION = 0.5
165
+ # Number of top scoring RPN proposals to keep before applying NMS
166
+ # When FPN is used, this is *per FPN level* (not total)
167
+ _C.MODEL.RPN.PRE_NMS_TOP_N_TRAIN = 12000
168
+
169
+ _C.MODEL.RPN.PRE_NMS_TOP_N_TEST = 6000
170
+ # Number of top scoring RPN proposals to keep after applying NMS
171
+ _C.MODEL.RPN.POST_NMS_TOP_N_TRAIN = 2000
172
+ _C.MODEL.RPN.POST_NMS_TOP_N_TEST = 1000
173
+ # NMS threshold used on RPN proposals
174
+ _C.MODEL.RPN.NMS_THRESH = 0.7
175
+ # Proposal height and width both need to be greater than RPN_MIN_SIZE
176
+ # (a the scale used during training or inference)
177
+ _C.MODEL.RPN.MIN_SIZE = 0
178
+ # Number of top scoring RPN proposals to keep after combining proposals from
179
+ # all FPN levels
180
+ _C.MODEL.RPN.FPN_POST_NMS_TOP_N_TRAIN = 2000
181
+ _C.MODEL.RPN.FPN_POST_NMS_TOP_N_TEST = 2000
182
+ # Custom rpn head, empty to use default conv or separable conv
183
+ _C.MODEL.RPN.RPN_HEAD = "SingleConvRPNHead_1"
184
+
185
+
186
+ # ---------------------------------------------------------------------------- #
187
+ # ROI HEADS options
188
+ # ---------------------------------------------------------------------------- #
189
+ _C.MODEL.ROI_HEADS = CN()
190
+ _C.MODEL.ROI_HEADS.USE_FPN = False
191
+ _C.MODEL.ROI_HEADS.USE_FPN = False
192
+ # Overlap threshold for an RoI to be considered foreground (if >= FG_IOU_THRESHOLD)
193
+ _C.MODEL.ROI_HEADS.FG_IOU_THRESHOLD = 0.5
194
+ # Overlap threshold for an RoI to be considered background
195
+ # (class = 0 if overlap in [0, BG_IOU_THRESHOLD))
196
+ _C.MODEL.ROI_HEADS.BG_IOU_THRESHOLD = 0.5
197
+ # Default weights on (dx, dy, dw, dh) for normalizing bbox regression targets
198
+ # These are empirically chosen to approximately lead to unit variance targets
199
+ _C.MODEL.ROI_HEADS.BBOX_REG_WEIGHTS = (10., 10., 5., 5.)
200
+ # RoI minibatch size *per image* (number of regions of interest [ROIs])
201
+ # Total number of RoIs per training minibatch =
202
+ # TRAIN.BATCH_SIZE_PER_IM * TRAIN.IMS_PER_BATCH
203
+ # E.g., a common configuration is: 512 * 2 * 8 = 8192
204
+ _C.MODEL.ROI_HEADS.BATCH_SIZE_PER_IMAGE = 512
205
+ # Target fraction of RoI minibatch that is labeled foreground (i.e. class > 0)
206
+ _C.MODEL.ROI_HEADS.POSITIVE_FRACTION = 0.25
207
+
208
+ # Only used on test mode
209
+
210
+ # Minimum score threshold (assuming scores in a [0, 1] range); a value chosen to
211
+ # balance obtaining high recall with not having too many low precision
212
+ # detections that will slow down inference post processing steps (like NMS)
213
+ _C.MODEL.ROI_HEADS.SCORE_THRESH = 0.05
214
+ # Overlap threshold used for non-maximum suppression (suppress boxes with
215
+ # IoU >= this threshold)
216
+ _C.MODEL.ROI_HEADS.NMS = 0.5
217
+ # Maximum number of detections to return per image (100 is based on the limit established for the COCO dataset)
218
+ _C.MODEL.ROI_HEADS.DETECTIONS_PER_IMG = 100
219
+
220
+
221
+ _C.MODEL.ROI_BOX_HEAD = CN()
222
+ _C.MODEL.ROI_BOX_HEAD.FEATURE_EXTRACTOR = "ResNet50Conv5ROIFeatureExtractor"
223
+ _C.MODEL.ROI_BOX_HEAD.PREDICTOR = "FastRCNNPredictor"
224
+ _C.MODEL.ROI_BOX_HEAD.POOLER_RESOLUTION = 14
225
+ _C.MODEL.ROI_BOX_HEAD.POOLER_SAMPLING_RATIO = 0
226
+ _C.MODEL.ROI_BOX_HEAD.POOLER_SCALES = (1.0 / 16,)
227
+ _C.MODEL.ROI_BOX_HEAD.NUM_CLASSES = 81
228
+ # Hidden layer dimension when using an MLP for the RoI box head
229
+ _C.MODEL.ROI_BOX_HEAD.MLP_HEAD_DIM = 1024
230
+ # GN
231
+ #####123123123
232
+ _C.MODEL.ROI_BOX_HEAD.USE_GN = False
233
+ # Dilation
234
+ _C.MODEL.ROI_BOX_HEAD.DILATION = 1
235
+ _C.MODEL.ROI_BOX_HEAD.CONV_HEAD_DIM = 256
236
+
237
+ #### 123123
238
+ _C.MODEL.ROI_BOX_HEAD.NUM_STACKED_CONVS = 4
239
+ _C.MODEL.ROI_BOX_HEAD.CLASS_WEIGHT = 0.1
240
+ _C.MODEL.ROI_BOX_HEAD.DEFORMABLE_POOLING = False
241
+
242
+ _C.MODEL.ROI_MASK_HEAD = CN()
243
+ # Whether or not resize and translate masks to the input image.
244
+ _C.MODEL.ROI_MASK_HEAD.POSTPROCESS_MASKS = False
245
+ _C.MODEL.ROI_MASK_HEAD.POSTPROCESS_MASKS_THRESHOLD = 0.5
246
+ _C.MODEL.ROI_MASK_HEAD.DILATION = 1
247
+ _C.MODEL.ROI_MASK_HEAD.USE_GN = False
248
+
249
+ # Boundary edge
250
+ _C.MODEL.ROI_BOUNDARY_HEAD = CN()
251
+ _C.MODEL.ROI_BOUNDARY_HEAD.DEFORMABLE_POOLING = False
252
+
253
+ _C.MODEL.ROI_BOUNDARY_HEAD.FEATURE_EXTRACTOR = "ResNet50Conv5ROIFeatureExtractor"
254
+ _C.MODEL.ROI_BOUNDARY_HEAD.POOLER_RESOLUTION = 14
255
+ _C.MODEL.ROI_BOUNDARY_HEAD.POOLER_SCALES = (1.0 / 16,)
256
+ _C.MODEL.ROI_BOUNDARY_HEAD.POOLER_SAMPLING_RATIO = 0
257
+ _C.MODEL.ROI_BOUNDARY_HEAD.CONV_LAYERS = (256, 256, 256, 256)
258
+
259
+ _C.MODEL.ROI_BOUNDARY_HEAD.PREDICTOR = "KERCNNC4Predictor"
260
+ _C.MODEL.ROI_BOUNDARY_HEAD.RESOLUTION = 14
261
+ _C.MODEL.ROI_BOUNDARY_HEAD.SHARE_BOX_FEATURE_EXTRACTOR = True
262
+ _C.MODEL.ROI_BOUNDARY_HEAD.BO_WEIGHT = 1.0
263
+ _C.MODEL.ROI_BOUNDARY_HEAD.Loss_balance = 1.2
264
+
265
+ # ---------------------------------------------------------------------------- #
266
+ # ResNe[X]t options (ResNets = {ResNet, ResNeXt}
267
+ # Note that parts of a resnet may be used for both the backbone and the head
268
+ # These options apply to both
269
+ # ---------------------------------------------------------------------------- #
270
+ _C.MODEL.RESNETS = CN()
271
+
272
+ # Number of groups to use; 1 ==> ResNet; > 1 ==> ResNeXt
273
+ _C.MODEL.RESNETS.NUM_GROUPS = 1
274
+
275
+ # Baseline width of each group
276
+ _C.MODEL.RESNETS.WIDTH_PER_GROUP = 64
277
+
278
+ # Place the stride 2 conv on the 1x1 filter
279
+ # Use True only for the original MSRA ResNet; use False for C2 and Torch models
280
+ _C.MODEL.RESNETS.STRIDE_IN_1X1 = True
281
+
282
+ # Residual transformation function
283
+ _C.MODEL.RESNETS.TRANS_FUNC = "BottleneckWithFixedBatchNorm"
284
+ _C.MODEL.RESNETS.DEF_FUNC = "DeformableConvWithFixedBatchNorm"
285
+ # ResNet's stem function (conv1 and pool1)
286
+ _C.MODEL.RESNETS.STEM_FUNC = "StemWithFixedBatchNorm"
287
+ _C.MODEL.RESNETS.DEF_START_MODULE = "NA"
288
+
289
+ #########123123123
290
+ _C.MODEL.RESNETS.DEFORM_POOLING = False
291
+
292
+ # Apply dilation in stage "res5"
293
+ _C.MODEL.RESNETS.RES5_DILATION = 1
294
+
295
+ _C.MODEL.RESNETS.BACKBONE_OUT_CHANNELS = 256 * 4
296
+ _C.MODEL.RESNETS.RES2_OUT_CHANNELS = 256
297
+ _C.MODEL.RESNETS.STEM_OUT_CHANNELS = 64
298
+
299
+ # ---------------------------------------------------------------------------- #
300
+ # FCOS Options
301
+ # ---------------------------------------------------------------------------- #
302
+ _C.MODEL.FCOS = CN()
303
+ _C.MODEL.FCOS.NUM_CLASSES = 81 # the number of classes including background
304
+ _C.MODEL.FCOS.FPN_STRIDES = [8, 16, 32, 64, 128]
305
+ _C.MODEL.FCOS.PRIOR_PROB = 0.01
306
+ _C.MODEL.FCOS.INFERENCE_TH = 0.05
307
+ _C.MODEL.FCOS.NMS_TH = 0.4
308
+ _C.MODEL.FCOS.PRE_NMS_TOP_N = 1000
309
+
310
+ # Focal loss parameter: alpha
311
+ _C.MODEL.FCOS.LOSS_ALPHA = 0.25
312
+ # Focal loss parameter: gamma
313
+ _C.MODEL.FCOS.LOSS_GAMMA = 2.0
314
+ _C.MODEL.FCOS.SIZES_OF_INTEREST = [64, 128, 256, 512]
315
+
316
+ # the number of convolutions used in the cls and bbox tower
317
+ _C.MODEL.FCOS.NUM_CONVS = 4
318
+
319
+ # ---------------------------------------------------------------------------- #
320
+ # RetinaNet Options (Follow the Detectron version)
321
+ # ---------------------------------------------------------------------------- #
322
+ _C.MODEL.RETINANET = CN()
323
+
324
+ # This is the number of foreground classes and background.
325
+ _C.MODEL.RETINANET.NUM_CLASSES = 81
326
+
327
+ # Anchor aspect ratios to use
328
+ _C.MODEL.RETINANET.ANCHOR_SIZES = (32, 64, 128, 256, 512)
329
+ _C.MODEL.RETINANET.ASPECT_RATIOS = (0.5, 1.0, 2.0)
330
+ _C.MODEL.RETINANET.ANCHOR_STRIDES = (8, 16, 32, 64, 128)
331
+ _C.MODEL.RETINANET.STRADDLE_THRESH = 0
332
+
333
+ # Anchor scales per octave
334
+ _C.MODEL.RETINANET.OCTAVE = 2.0
335
+ _C.MODEL.RETINANET.SCALES_PER_OCTAVE = 3
336
+
337
+ # Use C5 or P5 to generate P6
338
+ _C.MODEL.RETINANET.USE_C5 = True
339
+
340
+ # Convolutions to use in the cls and bbox tower
341
+ # NOTE: this doesn't include the last conv for logits
342
+ _C.MODEL.RETINANET.NUM_CONVS = 4
343
+
344
+ # Weight for bbox_regression loss
345
+ _C.MODEL.RETINANET.BBOX_REG_WEIGHT = 4.0
346
+
347
+ # Smooth L1 loss beta for bbox regression
348
+ _C.MODEL.RETINANET.BBOX_REG_BETA = 0.11
349
+
350
+ # During inference, #locs to select based on cls score before NMS is performed
351
+ # per FPN level
352
+ _C.MODEL.RETINANET.PRE_NMS_TOP_N = 1000
353
+
354
+ # IoU overlap ratio for labeling an anchor as positive
355
+ # Anchors with >= iou overlap are labeled positive
356
+ _C.MODEL.RETINANET.FG_IOU_THRESHOLD = 0.5
357
+
358
+ # IoU overlap ratio for labeling an anchor as negative
359
+ # Anchors with < iou overlap are labeled negative
360
+ _C.MODEL.RETINANET.BG_IOU_THRESHOLD = 0.4
361
+
362
+ # Focal loss parameter: alpha
363
+ _C.MODEL.RETINANET.LOSS_ALPHA = 0.25
364
+
365
+ # Focal loss parameter: gamma
366
+ _C.MODEL.RETINANET.LOSS_GAMMA = 2.0
367
+
368
+ # Prior prob for the positives at the beginning of training. This is used to set
369
+ # the bias init for the logits layer
370
+ _C.MODEL.RETINANET.PRIOR_PROB = 0.01
371
+
372
+ # Inference cls score threshold, anchors with score > INFERENCE_TH are
373
+ # considered for inference
374
+ _C.MODEL.RETINANET.INFERENCE_TH = 0.05
375
+
376
+ # NMS threshold used in RetinaNet
377
+ _C.MODEL.RETINANET.NMS_TH = 0.4
378
+
379
+
380
+ # ---------------------------------------------------------------------------- #
381
+ # FBNet options
382
+ # ---------------------------------------------------------------------------- #
383
+ _C.MODEL.FBNET = CN()
384
+ _C.MODEL.FBNET.ARCH = "default"
385
+ # custom arch
386
+ _C.MODEL.FBNET.ARCH_DEF = ""
387
+ _C.MODEL.FBNET.BN_TYPE = "bn"
388
+ _C.MODEL.FBNET.SCALE_FACTOR = 1.0
389
+ # the output channels will be divisible by WIDTH_DIVISOR
390
+ _C.MODEL.FBNET.WIDTH_DIVISOR = 1
391
+ _C.MODEL.FBNET.DW_CONV_SKIP_BN = True
392
+ _C.MODEL.FBNET.DW_CONV_SKIP_RELU = True
393
+
394
+ # > 0 scale, == 0 skip, < 0 same dimension
395
+ _C.MODEL.FBNET.DET_HEAD_LAST_SCALE = 1.0
396
+ _C.MODEL.FBNET.DET_HEAD_BLOCKS = []
397
+ # overwrite the stride for the head, 0 to use original value
398
+ _C.MODEL.FBNET.DET_HEAD_STRIDE = 0
399
+
400
+ # > 0 scale, == 0 skip, < 0 same dimension
401
+ _C.MODEL.FBNET.KPTS_HEAD_LAST_SCALE = 0.0
402
+ _C.MODEL.FBNET.KPTS_HEAD_BLOCKS = []
403
+ # overwrite the stride for the head, 0 to use original value
404
+ _C.MODEL.FBNET.KPTS_HEAD_STRIDE = 0
405
+
406
+ # > 0 scale, == 0 skip, < 0 same dimension
407
+ _C.MODEL.FBNET.MASK_HEAD_LAST_SCALE = 0.0
408
+ _C.MODEL.FBNET.MASK_HEAD_BLOCKS = []
409
+ # overwrite the stride for the head, 0 to use original value
410
+ _C.MODEL.FBNET.MASK_HEAD_STRIDE = 0
411
+
412
+ # 0 to use all blocks defined in arch_def
413
+ _C.MODEL.FBNET.RPN_HEAD_BLOCKS = 0
414
+ _C.MODEL.FBNET.RPN_BN_TYPE = ""
415
+
416
+
417
+ # ---------------------------------------------------------------------------- #
418
+ # Solver
419
+ # ---------------------------------------------------------------------------- #
420
+ _C.SOLVER = CN()
421
+ _C.SOLVER.MAX_ITER = 40000
422
+
423
+ _C.SOLVER.BASE_LR = 0.001
424
+ _C.SOLVER.BIAS_LR_FACTOR = 2
425
+
426
+ _C.SOLVER.MOMENTUM = 0.9
427
+
428
+ _C.SOLVER.WEIGHT_DECAY = 0.0005
429
+ _C.SOLVER.WEIGHT_DECAY_BIAS = 0
430
+
431
+ _C.SOLVER.GAMMA = 0.1
432
+ _C.SOLVER.STEPS = (30000,)
433
+
434
+ _C.SOLVER.WARMUP_FACTOR = 1.0 / 3
435
+ _C.SOLVER.WARMUP_ITERS = 500
436
+ _C.SOLVER.WARMUP_METHOD = "linear"
437
+
438
+ _C.SOLVER.CHECKPOINT_PERIOD = 2500
439
+
440
+ # Number of images per batch
441
+ # This is global, so if we have 8 GPUs and IMS_PER_BATCH = 16, each GPU will
442
+ # see 2 images per batch
443
+ _C.SOLVER.IMS_PER_BATCH = 4
444
+
445
+ # ---------------------------------------------------------------------------- #
446
+ # Specific test options
447
+ # ---------------------------------------------------------------------------- #
448
+ _C.TEST = CN()
449
+ _C.TEST.EXPECTED_RESULTS = []
450
+ _C.TEST.EXPECTED_RESULTS_SIGMA_TOL = 4
451
+ # Number of images per batch
452
+ # This is global, so if we have 8 GPUs and IMS_PER_BATCH = 16, each GPU will
453
+ # see 2 images per batch
454
+ _C.TEST.IMS_PER_BATCH = 16
455
+ # Number of detections per image
456
+ _C.TEST.DETECTIONS_PER_IMG = 100
457
+
458
+
459
+ # ---------------------------------------------------------------------------- #
460
+ # Misc options
461
+ # ---------------------------------------------------------------------------- #
462
+ _C.OUTPUT_DIR = "./1"
463
+ _C.IS_LOAD_OPTIMIZER = True
464
+ _C.IS_LOAD_SCHEDULER = True
465
+ _C.PROCESS = CN()
466
+
467
+ #####123123123
468
+ _C.PROCESS.PNMS = False
469
+ _C.PROCESS.NMS_THRESH = 0.4
470
+
471
+ _C.PATHS_CATALOG = os.path.join(os.path.dirname(__file__), "paths_catalog.py")
maskrcnn_benchmark/config/paths_catalog.py ADDED
@@ -0,0 +1,120 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
2
+ """Centralized catalog of paths."""
3
+
4
+ import os
5
+
6
+ class DatasetCatalog(object):
7
+ DATA_DIR = "/home/zhangbq/ws/ct/dataset/"
8
+ DATASETS = {
9
+ "ic15_train": (
10
+ "ic15/ic15_train_images",
11
+ "ic15/annotations/ic15_train.json"
12
+ ),
13
+ "ic15_test": (
14
+ "ic15/ic15_test_images",
15
+ "ic15/annotations/ic15_test.json"
16
+ ),
17
+ "CTW1500_train": (
18
+ "ctw/ctw_train_images",
19
+ "ctw/annotations/ctw_train.json"
20
+ ),
21
+ "CTW1500_test": (
22
+ "ctw/ctw_test_images",
23
+ "ctw/annotations/ctw_test.json"
24
+ )
25
+
26
+ }
27
+
28
+ @staticmethod
29
+ def get(name):
30
+ data_dir = DatasetCatalog.DATA_DIR
31
+ attrs = DatasetCatalog.DATASETS[name]
32
+ if "coco" in name:
33
+ args = dict(
34
+ root=os.path.join(data_dir, attrs["img_dir"]),
35
+ ann_file=os.path.join(data_dir, attrs["ann_file"]),
36
+ )
37
+ return dict(
38
+ factory="COCODataset",
39
+ args=args,
40
+ )
41
+ elif "voc" in name:
42
+ args = dict(
43
+ data_dir=os.path.join(data_dir, attrs["data_dir"]),
44
+ split=attrs["split"],
45
+ )
46
+ return dict(
47
+ factory="PascalVOCDataset",
48
+ args=args,
49
+ )
50
+ elif True:
51
+ args = dict(
52
+ root=os.path.join(data_dir, attrs[0]),
53
+ ann_file=os.path.join(data_dir, attrs[1]),
54
+ )
55
+ return dict(
56
+ factory="WordDataset",
57
+ args=args,
58
+ )
59
+ raise RuntimeError("Dataset not available: {}".format(name))
60
+
61
+
62
+ class ModelCatalog(object):
63
+ S3_C2_DETECTRON_URL = "https://dl.fbaipublicfiles.com/detectron"
64
+ C2_IMAGENET_MODELS = {
65
+ "MSRA/R-50": "ImageNetPretrained/MSRA/R-50.pkl",
66
+ "MSRA/R-50-GN": "ImageNetPretrained/47261647/R-50-GN.pkl",
67
+ "MSRA/R-101": "ImageNetPretrained/MSRA/R-101.pkl",
68
+ "MSRA/R-101-GN": "ImageNetPretrained/47592356/R-101-GN.pkl",
69
+ "FAIR/20171220/X-101-32x8d": "ImageNetPretrained/20171220/X-101-32x8d.pkl",
70
+ }
71
+
72
+ C2_DETECTRON_SUFFIX = "output/train/{}coco_2014_train%3A{}coco_2014_valminusminival/generalized_rcnn/model_final.pkl"
73
+ C2_DETECTRON_MODELS = {
74
+ "35857197/e2e_faster_rcnn_R-50-C4_1x": "01_33_49.iAX0mXvW",
75
+ "35857345/e2e_faster_rcnn_R-50-FPN_1x": "01_36_30.cUF7QR7I",
76
+ "35857890/e2e_faster_rcnn_R-101-FPN_1x": "01_38_50.sNxI7sX7",
77
+ "36761737/e2e_faster_rcnn_X-101-32x8d-FPN_1x": "06_31_39.5MIHi1fZ",
78
+ "35858791/e2e_mask_rcnn_R-50-C4_1x": "01_45_57.ZgkA7hPB",
79
+ "35858933/e2e_mask_rcnn_R-50-FPN_1x": "01_48_14.DzEQe4wC",
80
+ "35861795/e2e_mask_rcnn_R-101-FPN_1x": "02_31_37.KqyEK4tT",
81
+ "36761843/e2e_mask_rcnn_X-101-32x8d-FPN_1x": "06_35_59.RZotkLKI",
82
+ "37129812/e2e_mask_rcnn_X-152-32x8d-FPN-IN5k_1.44x": "09_35_36.8pzTQKYK",
83
+ # keypoints
84
+ "37697547/e2e_keypoint_rcnn_R-50-FPN_1x": "08_42_54.kdzV35ao"
85
+ }
86
+
87
+ @staticmethod
88
+ def get(name):
89
+ if name.startswith("Caffe2Detectron/COCO"):
90
+ return ModelCatalog.get_c2_detectron_12_2017_baselines(name)
91
+ if name.startswith("ImageNetPretrained"):
92
+ return ModelCatalog.get_c2_imagenet_pretrained(name)
93
+ raise RuntimeError("model not present in the catalog {}".format(name))
94
+
95
+ @staticmethod
96
+ def get_c2_imagenet_pretrained(name):
97
+ prefix = ModelCatalog.S3_C2_DETECTRON_URL
98
+ name = name[len("ImageNetPretrained/"):]
99
+ name = ModelCatalog.C2_IMAGENET_MODELS[name]
100
+ url = "/".join([prefix, name])
101
+ return url
102
+
103
+ @staticmethod
104
+ def get_c2_detectron_12_2017_baselines(name):
105
+ # Detectron C2 models are stored following the structure
106
+ # prefix/<model_id>/2012_2017_baselines/<model_name>.yaml.<signature>/suffix
107
+ # we use as identifiers in the catalog Caffe2Detectron/COCO/<model_id>/<model_name>
108
+ prefix = ModelCatalog.S3_C2_DETECTRON_URL
109
+ dataset_tag = "keypoints_" if "keypoint" in name else ""
110
+ suffix = ModelCatalog.C2_DETECTRON_SUFFIX.format(dataset_tag, dataset_tag)
111
+ # remove identification prefix
112
+ name = name[len("Caffe2Detectron/COCO/"):]
113
+ # split in <model_id> and <model_name>
114
+ model_id, model_name = name.split("/")
115
+ # parsing to make it match the url address from the Caffe2 models
116
+ model_name = "{}.yaml".format(model_name)
117
+ signature = ModelCatalog.C2_DETECTRON_MODELS[name]
118
+ unique_name = ".".join([model_name, signature])
119
+ url = "/".join([prefix, model_id, "12_2017_baselines", unique_name, suffix])
120
+ return url
maskrcnn_benchmark/csrc/ROIAlign.h ADDED
@@ -0,0 +1,46 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ // Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
2
+ #pragma once
3
+
4
+ #include "cpu/vision.h"
5
+
6
+ #ifdef WITH_CUDA
7
+ #include "cuda/vision.h"
8
+ #endif
9
+
10
+ // Interface for Python
11
+ at::Tensor ROIAlign_forward(const at::Tensor& input,
12
+ const at::Tensor& rois,
13
+ const float spatial_scale,
14
+ const int pooled_height,
15
+ const int pooled_width,
16
+ const int sampling_ratio) {
17
+ if (input.type().is_cuda()) {
18
+ #ifdef WITH_CUDA
19
+ return ROIAlign_forward_cuda(input, rois, spatial_scale, pooled_height, pooled_width, sampling_ratio);
20
+ #else
21
+ AT_ERROR("Not compiled with GPU support");
22
+ #endif
23
+ }
24
+ return ROIAlign_forward_cpu(input, rois, spatial_scale, pooled_height, pooled_width, sampling_ratio);
25
+ }
26
+
27
+ at::Tensor ROIAlign_backward(const at::Tensor& grad,
28
+ const at::Tensor& rois,
29
+ const float spatial_scale,
30
+ const int pooled_height,
31
+ const int pooled_width,
32
+ const int batch_size,
33
+ const int channels,
34
+ const int height,
35
+ const int width,
36
+ const int sampling_ratio) {
37
+ if (grad.type().is_cuda()) {
38
+ #ifdef WITH_CUDA
39
+ return ROIAlign_backward_cuda(grad, rois, spatial_scale, pooled_height, pooled_width, batch_size, channels, height, width, sampling_ratio);
40
+ #else
41
+ AT_ERROR("Not compiled with GPU support");
42
+ #endif
43
+ }
44
+ AT_ERROR("Not implemented on the CPU");
45
+ }
46
+
maskrcnn_benchmark/csrc/ROIPool.h ADDED
@@ -0,0 +1,48 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ // Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
2
+ #pragma once
3
+
4
+ #include "cpu/vision.h"
5
+
6
+ #ifdef WITH_CUDA
7
+ #include "cuda/vision.h"
8
+ #endif
9
+
10
+
11
+ std::tuple<at::Tensor, at::Tensor> ROIPool_forward(const at::Tensor& input,
12
+ const at::Tensor& rois,
13
+ const float spatial_scale,
14
+ const int pooled_height,
15
+ const int pooled_width) {
16
+ if (input.type().is_cuda()) {
17
+ #ifdef WITH_CUDA
18
+ return ROIPool_forward_cuda(input, rois, spatial_scale, pooled_height, pooled_width);
19
+ #else
20
+ AT_ERROR("Not compiled with GPU support");
21
+ #endif
22
+ }
23
+ AT_ERROR("Not implemented on the CPU");
24
+ }
25
+
26
+ at::Tensor ROIPool_backward(const at::Tensor& grad,
27
+ const at::Tensor& input,
28
+ const at::Tensor& rois,
29
+ const at::Tensor& argmax,
30
+ const float spatial_scale,
31
+ const int pooled_height,
32
+ const int pooled_width,
33
+ const int batch_size,
34
+ const int channels,
35
+ const int height,
36
+ const int width) {
37
+ if (grad.type().is_cuda()) {
38
+ #ifdef WITH_CUDA
39
+ return ROIPool_backward_cuda(grad, input, rois, argmax, spatial_scale, pooled_height, pooled_width, batch_size, channels, height, width);
40
+ #else
41
+ AT_ERROR("Not compiled with GPU support");
42
+ #endif
43
+ }
44
+ AT_ERROR("Not implemented on the CPU");
45
+ }
46
+
47
+
48
+
maskrcnn_benchmark/csrc/SigmoidFocalLoss.h ADDED
@@ -0,0 +1,41 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #pragma once
2
+
3
+ #include "cpu/vision.h"
4
+
5
+ #ifdef WITH_CUDA
6
+ #include "cuda/vision.h"
7
+ #endif
8
+
9
+ // Interface for Python
10
+ at::Tensor SigmoidFocalLoss_forward(
11
+ const at::Tensor& logits,
12
+ const at::Tensor& targets,
13
+ const int num_classes,
14
+ const float gamma,
15
+ const float alpha) {
16
+ if (logits.type().is_cuda()) {
17
+ #ifdef WITH_CUDA
18
+ return SigmoidFocalLoss_forward_cuda(logits, targets, num_classes, gamma, alpha);
19
+ #else
20
+ AT_ERROR("Not compiled with GPU support");
21
+ #endif
22
+ }
23
+ AT_ERROR("Not implemented on the CPU");
24
+ }
25
+
26
+ at::Tensor SigmoidFocalLoss_backward(
27
+ const at::Tensor& logits,
28
+ const at::Tensor& targets,
29
+ const at::Tensor& d_losses,
30
+ const int num_classes,
31
+ const float gamma,
32
+ const float alpha) {
33
+ if (logits.type().is_cuda()) {
34
+ #ifdef WITH_CUDA
35
+ return SigmoidFocalLoss_backward_cuda(logits, targets, d_losses, num_classes, gamma, alpha);
36
+ #else
37
+ AT_ERROR("Not compiled with GPU support");
38
+ #endif
39
+ }
40
+ AT_ERROR("Not implemented on the CPU");
41
+ }
maskrcnn_benchmark/csrc/cpu/ROIAlign_cpu.cpp ADDED
@@ -0,0 +1,257 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ // Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
2
+ #include "cpu/vision.h"
3
+
4
+ // implementation taken from Caffe2
5
+ template <typename T>
6
+ struct PreCalc {
7
+ int pos1;
8
+ int pos2;
9
+ int pos3;
10
+ int pos4;
11
+ T w1;
12
+ T w2;
13
+ T w3;
14
+ T w4;
15
+ };
16
+
17
+ template <typename T>
18
+ void pre_calc_for_bilinear_interpolate(
19
+ const int height,
20
+ const int width,
21
+ const int pooled_height,
22
+ const int pooled_width,
23
+ const int iy_upper,
24
+ const int ix_upper,
25
+ T roi_start_h,
26
+ T roi_start_w,
27
+ T bin_size_h,
28
+ T bin_size_w,
29
+ int roi_bin_grid_h,
30
+ int roi_bin_grid_w,
31
+ std::vector<PreCalc<T>>& pre_calc) {
32
+ int pre_calc_index = 0;
33
+ for (int ph = 0; ph < pooled_height; ph++) {
34
+ for (int pw = 0; pw < pooled_width; pw++) {
35
+ for (int iy = 0; iy < iy_upper; iy++) {
36
+ const T yy = roi_start_h + ph * bin_size_h +
37
+ static_cast<T>(iy + .5f) * bin_size_h /
38
+ static_cast<T>(roi_bin_grid_h); // e.g., 0.5, 1.5
39
+ for (int ix = 0; ix < ix_upper; ix++) {
40
+ const T xx = roi_start_w + pw * bin_size_w +
41
+ static_cast<T>(ix + .5f) * bin_size_w /
42
+ static_cast<T>(roi_bin_grid_w);
43
+
44
+ T x = xx;
45
+ T y = yy;
46
+ // deal with: inverse elements are out of feature map boundary
47
+ if (y < -1.0 || y > height || x < -1.0 || x > width) {
48
+ // empty
49
+ PreCalc<T> pc;
50
+ pc.pos1 = 0;
51
+ pc.pos2 = 0;
52
+ pc.pos3 = 0;
53
+ pc.pos4 = 0;
54
+ pc.w1 = 0;
55
+ pc.w2 = 0;
56
+ pc.w3 = 0;
57
+ pc.w4 = 0;
58
+ pre_calc[pre_calc_index] = pc;
59
+ pre_calc_index += 1;
60
+ continue;
61
+ }
62
+
63
+ if (y <= 0) {
64
+ y = 0;
65
+ }
66
+ if (x <= 0) {
67
+ x = 0;
68
+ }
69
+
70
+ int y_low = (int)y;
71
+ int x_low = (int)x;
72
+ int y_high;
73
+ int x_high;
74
+
75
+ if (y_low >= height - 1) {
76
+ y_high = y_low = height - 1;
77
+ y = (T)y_low;
78
+ } else {
79
+ y_high = y_low + 1;
80
+ }
81
+
82
+ if (x_low >= width - 1) {
83
+ x_high = x_low = width - 1;
84
+ x = (T)x_low;
85
+ } else {
86
+ x_high = x_low + 1;
87
+ }
88
+
89
+ T ly = y - y_low;
90
+ T lx = x - x_low;
91
+ T hy = 1. - ly, hx = 1. - lx;
92
+ T w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx;
93
+
94
+ // save weights and indeces
95
+ PreCalc<T> pc;
96
+ pc.pos1 = y_low * width + x_low;
97
+ pc.pos2 = y_low * width + x_high;
98
+ pc.pos3 = y_high * width + x_low;
99
+ pc.pos4 = y_high * width + x_high;
100
+ pc.w1 = w1;
101
+ pc.w2 = w2;
102
+ pc.w3 = w3;
103
+ pc.w4 = w4;
104
+ pre_calc[pre_calc_index] = pc;
105
+
106
+ pre_calc_index += 1;
107
+ }
108
+ }
109
+ }
110
+ }
111
+ }
112
+
113
+ template <typename T>
114
+ void ROIAlignForward_cpu_kernel(
115
+ const int nthreads,
116
+ const T* bottom_data,
117
+ const T& spatial_scale,
118
+ const int channels,
119
+ const int height,
120
+ const int width,
121
+ const int pooled_height,
122
+ const int pooled_width,
123
+ const int sampling_ratio,
124
+ const T* bottom_rois,
125
+ //int roi_cols,
126
+ T* top_data) {
127
+ //AT_ASSERT(roi_cols == 4 || roi_cols == 5);
128
+ int roi_cols = 5;
129
+
130
+ int n_rois = nthreads / channels / pooled_width / pooled_height;
131
+ // (n, c, ph, pw) is an element in the pooled output
132
+ // can be parallelized using omp
133
+ // #pragma omp parallel for num_threads(32)
134
+ for (int n = 0; n < n_rois; n++) {
135
+ int index_n = n * channels * pooled_width * pooled_height;
136
+
137
+ // roi could have 4 or 5 columns
138
+ const T* offset_bottom_rois = bottom_rois + n * roi_cols;
139
+ int roi_batch_ind = 0;
140
+ if (roi_cols == 5) {
141
+ roi_batch_ind = offset_bottom_rois[0];
142
+ offset_bottom_rois++;
143
+ }
144
+
145
+ // Do not using rounding; this implementation detail is critical
146
+ T roi_start_w = offset_bottom_rois[0] * spatial_scale;
147
+ T roi_start_h = offset_bottom_rois[1] * spatial_scale;
148
+ T roi_end_w = offset_bottom_rois[2] * spatial_scale;
149
+ T roi_end_h = offset_bottom_rois[3] * spatial_scale;
150
+ // T roi_start_w = round(offset_bottom_rois[0] * spatial_scale);
151
+ // T roi_start_h = round(offset_bottom_rois[1] * spatial_scale);
152
+ // T roi_end_w = round(offset_bottom_rois[2] * spatial_scale);
153
+ // T roi_end_h = round(offset_bottom_rois[3] * spatial_scale);
154
+
155
+ // Force malformed ROIs to be 1x1
156
+ T roi_width = std::max(roi_end_w - roi_start_w, (T)1.);
157
+ T roi_height = std::max(roi_end_h - roi_start_h, (T)1.);
158
+ T bin_size_h = static_cast<T>(roi_height) / static_cast<T>(pooled_height);
159
+ T bin_size_w = static_cast<T>(roi_width) / static_cast<T>(pooled_width);
160
+
161
+ // We use roi_bin_grid to sample the grid and mimic integral
162
+ int roi_bin_grid_h = (sampling_ratio > 0)
163
+ ? sampling_ratio
164
+ : ceil(roi_height / pooled_height); // e.g., = 2
165
+ int roi_bin_grid_w =
166
+ (sampling_ratio > 0) ? sampling_ratio : ceil(roi_width / pooled_width);
167
+
168
+ // We do average (integral) pooling inside a bin
169
+ const T count = roi_bin_grid_h * roi_bin_grid_w; // e.g. = 4
170
+
171
+ // we want to precalculate indeces and weights shared by all chanels,
172
+ // this is the key point of optimiation
173
+ std::vector<PreCalc<T>> pre_calc(
174
+ roi_bin_grid_h * roi_bin_grid_w * pooled_width * pooled_height);
175
+ pre_calc_for_bilinear_interpolate(
176
+ height,
177
+ width,
178
+ pooled_height,
179
+ pooled_width,
180
+ roi_bin_grid_h,
181
+ roi_bin_grid_w,
182
+ roi_start_h,
183
+ roi_start_w,
184
+ bin_size_h,
185
+ bin_size_w,
186
+ roi_bin_grid_h,
187
+ roi_bin_grid_w,
188
+ pre_calc);
189
+
190
+ for (int c = 0; c < channels; c++) {
191
+ int index_n_c = index_n + c * pooled_width * pooled_height;
192
+ const T* offset_bottom_data =
193
+ bottom_data + (roi_batch_ind * channels + c) * height * width;
194
+ int pre_calc_index = 0;
195
+
196
+ for (int ph = 0; ph < pooled_height; ph++) {
197
+ for (int pw = 0; pw < pooled_width; pw++) {
198
+ int index = index_n_c + ph * pooled_width + pw;
199
+
200
+ T output_val = 0.;
201
+ for (int iy = 0; iy < roi_bin_grid_h; iy++) {
202
+ for (int ix = 0; ix < roi_bin_grid_w; ix++) {
203
+ PreCalc<T> pc = pre_calc[pre_calc_index];
204
+ output_val += pc.w1 * offset_bottom_data[pc.pos1] +
205
+ pc.w2 * offset_bottom_data[pc.pos2] +
206
+ pc.w3 * offset_bottom_data[pc.pos3] +
207
+ pc.w4 * offset_bottom_data[pc.pos4];
208
+
209
+ pre_calc_index += 1;
210
+ }
211
+ }
212
+ output_val /= count;
213
+
214
+ top_data[index] = output_val;
215
+ } // for pw
216
+ } // for ph
217
+ } // for c
218
+ } // for n
219
+ }
220
+
221
+ at::Tensor ROIAlign_forward_cpu(const at::Tensor& input,
222
+ const at::Tensor& rois,
223
+ const float spatial_scale,
224
+ const int pooled_height,
225
+ const int pooled_width,
226
+ const int sampling_ratio) {
227
+ AT_ASSERTM(!input.type().is_cuda(), "input must be a CPU tensor");
228
+ AT_ASSERTM(!rois.type().is_cuda(), "rois must be a CPU tensor");
229
+
230
+ auto num_rois = rois.size(0);
231
+ auto channels = input.size(1);
232
+ auto height = input.size(2);
233
+ auto width = input.size(3);
234
+
235
+ auto output = at::empty({num_rois, channels, pooled_height, pooled_width}, input.options());
236
+ auto output_size = num_rois * pooled_height * pooled_width * channels;
237
+
238
+ if (output.numel() == 0) {
239
+ return output;
240
+ }
241
+
242
+ AT_DISPATCH_FLOATING_TYPES(input.type(), "ROIAlign_forward", [&] {
243
+ ROIAlignForward_cpu_kernel<scalar_t>(
244
+ output_size,
245
+ input.data<scalar_t>(),
246
+ spatial_scale,
247
+ channels,
248
+ height,
249
+ width,
250
+ pooled_height,
251
+ pooled_width,
252
+ sampling_ratio,
253
+ rois.data<scalar_t>(),
254
+ output.data<scalar_t>());
255
+ });
256
+ return output;
257
+ }
maskrcnn_benchmark/csrc/cpu/dcn_v2_cpu.cpp ADDED
@@ -0,0 +1,74 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #include <vector>
2
+
3
+ #include <ATen/ATen.h>
4
+ #include <ATen/cuda/CUDAContext.h>
5
+
6
+
7
+ at::Tensor
8
+ dcn_v2_cpu_forward(const at::Tensor &input,
9
+ const at::Tensor &weight,
10
+ const at::Tensor &bias,
11
+ const at::Tensor &offset,
12
+ const at::Tensor &mask,
13
+ const int kernel_h,
14
+ const int kernel_w,
15
+ const int stride_h,
16
+ const int stride_w,
17
+ const int pad_h,
18
+ const int pad_w,
19
+ const int dilation_h,
20
+ const int dilation_w,
21
+ const int deformable_group)
22
+ {
23
+ AT_ERROR("Not implement on cpu");
24
+ }
25
+
26
+ std::vector<at::Tensor>
27
+ dcn_v2_cpu_backward(const at::Tensor &input,
28
+ const at::Tensor &weight,
29
+ const at::Tensor &bias,
30
+ const at::Tensor &offset,
31
+ const at::Tensor &mask,
32
+ const at::Tensor &grad_output,
33
+ int kernel_h, int kernel_w,
34
+ int stride_h, int stride_w,
35
+ int pad_h, int pad_w,
36
+ int dilation_h, int dilation_w,
37
+ int deformable_group)
38
+ {
39
+ AT_ERROR("Not implement on cpu");
40
+ }
41
+
42
+ std::tuple<at::Tensor, at::Tensor>
43
+ dcn_v2_psroi_pooling_cpu_forward(const at::Tensor &input,
44
+ const at::Tensor &bbox,
45
+ const at::Tensor &trans,
46
+ const int no_trans,
47
+ const float spatial_scale,
48
+ const int output_dim,
49
+ const int group_size,
50
+ const int pooled_size,
51
+ const int part_size,
52
+ const int sample_per_part,
53
+ const float trans_std)
54
+ {
55
+ AT_ERROR("Not implement on cpu");
56
+ }
57
+
58
+ std::tuple<at::Tensor, at::Tensor>
59
+ dcn_v2_psroi_pooling_cpu_backward(const at::Tensor &out_grad,
60
+ const at::Tensor &input,
61
+ const at::Tensor &bbox,
62
+ const at::Tensor &trans,
63
+ const at::Tensor &top_count,
64
+ const int no_trans,
65
+ const float spatial_scale,
66
+ const int output_dim,
67
+ const int group_size,
68
+ const int pooled_size,
69
+ const int part_size,
70
+ const int sample_per_part,
71
+ const float trans_std)
72
+ {
73
+ AT_ERROR("Not implement on cpu");
74
+ }
maskrcnn_benchmark/csrc/cpu/nms_cpu.cpp ADDED
@@ -0,0 +1,75 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ // Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
2
+ #include "cpu/vision.h"
3
+
4
+
5
+ template <typename scalar_t>
6
+ at::Tensor nms_cpu_kernel(const at::Tensor& dets,
7
+ const at::Tensor& scores,
8
+ const float threshold) {
9
+ AT_ASSERTM(!dets.type().is_cuda(), "dets must be a CPU tensor");
10
+ AT_ASSERTM(!scores.type().is_cuda(), "scores must be a CPU tensor");
11
+ AT_ASSERTM(dets.type() == scores.type(), "dets should have the same type as scores");
12
+
13
+ if (dets.numel() == 0) {
14
+ return at::empty({0}, dets.options().dtype(at::kLong).device(at::kCPU));
15
+ }
16
+
17
+ auto x1_t = dets.select(1, 0).contiguous();
18
+ auto y1_t = dets.select(1, 1).contiguous();
19
+ auto x2_t = dets.select(1, 2).contiguous();
20
+ auto y2_t = dets.select(1, 3).contiguous();
21
+
22
+ at::Tensor areas_t = (x2_t - x1_t + 1) * (y2_t - y1_t + 1);
23
+
24
+ auto order_t = std::get<1>(scores.sort(0, /* descending=*/true));
25
+
26
+ auto ndets = dets.size(0);
27
+ at::Tensor suppressed_t = at::zeros({ndets}, dets.options().dtype(at::kByte).device(at::kCPU));
28
+
29
+ auto suppressed = suppressed_t.data<uint8_t>();
30
+ auto order = order_t.data<int64_t>();
31
+ auto x1 = x1_t.data<scalar_t>();
32
+ auto y1 = y1_t.data<scalar_t>();
33
+ auto x2 = x2_t.data<scalar_t>();
34
+ auto y2 = y2_t.data<scalar_t>();
35
+ auto areas = areas_t.data<scalar_t>();
36
+
37
+ for (int64_t _i = 0; _i < ndets; _i++) {
38
+ auto i = order[_i];
39
+ if (suppressed[i] == 1)
40
+ continue;
41
+ auto ix1 = x1[i];
42
+ auto iy1 = y1[i];
43
+ auto ix2 = x2[i];
44
+ auto iy2 = y2[i];
45
+ auto iarea = areas[i];
46
+
47
+ for (int64_t _j = _i + 1; _j < ndets; _j++) {
48
+ auto j = order[_j];
49
+ if (suppressed[j] == 1)
50
+ continue;
51
+ auto xx1 = std::max(ix1, x1[j]);
52
+ auto yy1 = std::max(iy1, y1[j]);
53
+ auto xx2 = std::min(ix2, x2[j]);
54
+ auto yy2 = std::min(iy2, y2[j]);
55
+
56
+ auto w = std::max(static_cast<scalar_t>(0), xx2 - xx1 + 1);
57
+ auto h = std::max(static_cast<scalar_t>(0), yy2 - yy1 + 1);
58
+ auto inter = w * h;
59
+ auto ovr = inter / (iarea + areas[j] - inter);
60
+ if (ovr >= threshold)
61
+ suppressed[j] = 1;
62
+ }
63
+ }
64
+ return at::nonzero(suppressed_t == 0).squeeze(1);
65
+ }
66
+
67
+ at::Tensor nms_cpu(const at::Tensor& dets,
68
+ const at::Tensor& scores,
69
+ const float threshold) {
70
+ at::Tensor result;
71
+ AT_DISPATCH_FLOATING_TYPES(dets.type(), "nms", [&] {
72
+ result = nms_cpu_kernel<scalar_t>(dets, scores, threshold);
73
+ });
74
+ return result;
75
+ }
maskrcnn_benchmark/csrc/cpu/vision.h ADDED
@@ -0,0 +1,73 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ // Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
2
+ #pragma once
3
+ #include <torch/extension.h>
4
+
5
+
6
+ at::Tensor ROIAlign_forward_cpu(const at::Tensor& input,
7
+ const at::Tensor& rois,
8
+ const float spatial_scale,
9
+ const int pooled_height,
10
+ const int pooled_width,
11
+ const int sampling_ratio);
12
+
13
+
14
+ at::Tensor nms_cpu(const at::Tensor& dets,
15
+ const at::Tensor& scores,
16
+ const float threshold);
17
+ at::Tensor
18
+ dcn_v2_cpu_forward(const at::Tensor &input,
19
+ const at::Tensor &weight,
20
+ const at::Tensor &bias,
21
+ const at::Tensor &offset,
22
+ const at::Tensor &mask,
23
+ const int kernel_h,
24
+ const int kernel_w,
25
+ const int stride_h,
26
+ const int stride_w,
27
+ const int pad_h,
28
+ const int pad_w,
29
+ const int dilation_h,
30
+ const int dilation_w,
31
+ const int deformable_group);
32
+
33
+ std::vector<at::Tensor>
34
+ dcn_v2_cpu_backward(const at::Tensor &input,
35
+ const at::Tensor &weight,
36
+ const at::Tensor &bias,
37
+ const at::Tensor &offset,
38
+ const at::Tensor &mask,
39
+ const at::Tensor &grad_output,
40
+ int kernel_h, int kernel_w,
41
+ int stride_h, int stride_w,
42
+ int pad_h, int pad_w,
43
+ int dilation_h, int dilation_w,
44
+ int deformable_group);
45
+
46
+
47
+ std::tuple<at::Tensor, at::Tensor>
48
+ dcn_v2_psroi_pooling_cpu_forward(const at::Tensor &input,
49
+ const at::Tensor &bbox,
50
+ const at::Tensor &trans,
51
+ const int no_trans,
52
+ const float spatial_scale,
53
+ const int output_dim,
54
+ const int group_size,
55
+ const int pooled_size,
56
+ const int part_size,
57
+ const int sample_per_part,
58
+ const float trans_std);
59
+
60
+ std::tuple<at::Tensor, at::Tensor>
61
+ dcn_v2_psroi_pooling_cpu_backward(const at::Tensor &out_grad,
62
+ const at::Tensor &input,
63
+ const at::Tensor &bbox,
64
+ const at::Tensor &trans,
65
+ const at::Tensor &top_count,
66
+ const int no_trans,
67
+ const float spatial_scale,
68
+ const int output_dim,
69
+ const int group_size,
70
+ const int pooled_size,
71
+ const int part_size,
72
+ const int sample_per_part,
73
+ const float trans_std);
maskrcnn_benchmark/csrc/cuda/ROIAlign_cuda.cu ADDED
@@ -0,0 +1,346 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ // Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
2
+ #include <ATen/ATen.h>
3
+ #include <ATen/cuda/CUDAContext.h>
4
+
5
+ #include <THC/THC.h>
6
+ #include <THC/THCAtomics.cuh>
7
+ #include <THC/THCDeviceUtils.cuh>
8
+
9
+ // TODO make it in a common file
10
+ #define CUDA_1D_KERNEL_LOOP(i, n) \
11
+ for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n; \
12
+ i += blockDim.x * gridDim.x)
13
+
14
+
15
+ template <typename T>
16
+ __device__ T bilinear_interpolate(const T* bottom_data,
17
+ const int height, const int width,
18
+ T y, T x,
19
+ const int index /* index for debug only*/) {
20
+
21
+ // deal with cases that inverse elements are out of feature map boundary
22
+ if (y < -1.0 || y > height || x < -1.0 || x > width) {
23
+ //empty
24
+ return 0;
25
+ }
26
+
27
+ if (y <= 0) y = 0;
28
+ if (x <= 0) x = 0;
29
+
30
+ int y_low = (int) y;
31
+ int x_low = (int) x;
32
+ int y_high;
33
+ int x_high;
34
+
35
+ if (y_low >= height - 1) {
36
+ y_high = y_low = height - 1;
37
+ y = (T) y_low;
38
+ } else {
39
+ y_high = y_low + 1;
40
+ }
41
+
42
+ if (x_low >= width - 1) {
43
+ x_high = x_low = width - 1;
44
+ x = (T) x_low;
45
+ } else {
46
+ x_high = x_low + 1;
47
+ }
48
+
49
+ T ly = y - y_low;
50
+ T lx = x - x_low;
51
+ T hy = 1. - ly, hx = 1. - lx;
52
+ // do bilinear interpolation
53
+ T v1 = bottom_data[y_low * width + x_low];
54
+ T v2 = bottom_data[y_low * width + x_high];
55
+ T v3 = bottom_data[y_high * width + x_low];
56
+ T v4 = bottom_data[y_high * width + x_high];
57
+ T w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx;
58
+
59
+ T val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4);
60
+
61
+ return val;
62
+ }
63
+
64
+ template <typename T>
65
+ __global__ void RoIAlignForward(const int nthreads, const T* bottom_data,
66
+ const T spatial_scale, const int channels,
67
+ const int height, const int width,
68
+ const int pooled_height, const int pooled_width,
69
+ const int sampling_ratio,
70
+ const T* bottom_rois, T* top_data) {
71
+ CUDA_1D_KERNEL_LOOP(index, nthreads) {
72
+ // (n, c, ph, pw) is an element in the pooled output
73
+ int pw = index % pooled_width;
74
+ int ph = (index / pooled_width) % pooled_height;
75
+ int c = (index / pooled_width / pooled_height) % channels;
76
+ int n = index / pooled_width / pooled_height / channels;
77
+
78
+ const T* offset_bottom_rois = bottom_rois + n * 5;
79
+ int roi_batch_ind = offset_bottom_rois[0];
80
+
81
+ // Do not using rounding; this implementation detail is critical
82
+ T roi_start_w = offset_bottom_rois[1] * spatial_scale;
83
+ T roi_start_h = offset_bottom_rois[2] * spatial_scale;
84
+ T roi_end_w = offset_bottom_rois[3] * spatial_scale;
85
+ T roi_end_h = offset_bottom_rois[4] * spatial_scale;
86
+ // T roi_start_w = round(offset_bottom_rois[1] * spatial_scale);
87
+ // T roi_start_h = round(offset_bottom_rois[2] * spatial_scale);
88
+ // T roi_end_w = round(offset_bottom_rois[3] * spatial_scale);
89
+ // T roi_end_h = round(offset_bottom_rois[4] * spatial_scale);
90
+
91
+ // Force malformed ROIs to be 1x1
92
+ T roi_width = max(roi_end_w - roi_start_w, (T)1.);
93
+ T roi_height = max(roi_end_h - roi_start_h, (T)1.);
94
+ T bin_size_h = static_cast<T>(roi_height) / static_cast<T>(pooled_height);
95
+ T bin_size_w = static_cast<T>(roi_width) / static_cast<T>(pooled_width);
96
+
97
+ const T* offset_bottom_data = bottom_data + (roi_batch_ind * channels + c) * height * width;
98
+
99
+ // We use roi_bin_grid to sample the grid and mimic integral
100
+ int roi_bin_grid_h = (sampling_ratio > 0) ? sampling_ratio : ceil(roi_height / pooled_height); // e.g., = 2
101
+ int roi_bin_grid_w = (sampling_ratio > 0) ? sampling_ratio : ceil(roi_width / pooled_width);
102
+
103
+ // We do average (integral) pooling inside a bin
104
+ const T count = roi_bin_grid_h * roi_bin_grid_w; // e.g. = 4
105
+
106
+ T output_val = 0.;
107
+ for (int iy = 0; iy < roi_bin_grid_h; iy ++) // e.g., iy = 0, 1
108
+ {
109
+ const T y = roi_start_h + ph * bin_size_h + static_cast<T>(iy + .5f) * bin_size_h / static_cast<T>(roi_bin_grid_h); // e.g., 0.5, 1.5
110
+ for (int ix = 0; ix < roi_bin_grid_w; ix ++)
111
+ {
112
+ const T x = roi_start_w + pw * bin_size_w + static_cast<T>(ix + .5f) * bin_size_w / static_cast<T>(roi_bin_grid_w);
113
+
114
+ T val = bilinear_interpolate(offset_bottom_data, height, width, y, x, index);
115
+ output_val += val;
116
+ }
117
+ }
118
+ output_val /= count;
119
+
120
+ top_data[index] = output_val;
121
+ }
122
+ }
123
+
124
+
125
+ template <typename T>
126
+ __device__ void bilinear_interpolate_gradient(
127
+ const int height, const int width,
128
+ T y, T x,
129
+ T & w1, T & w2, T & w3, T & w4,
130
+ int & x_low, int & x_high, int & y_low, int & y_high,
131
+ const int index /* index for debug only*/) {
132
+
133
+ // deal with cases that inverse elements are out of feature map boundary
134
+ if (y < -1.0 || y > height || x < -1.0 || x > width) {
135
+ //empty
136
+ w1 = w2 = w3 = w4 = 0.;
137
+ x_low = x_high = y_low = y_high = -1;
138
+ return;
139
+ }
140
+
141
+ if (y <= 0) y = 0;
142
+ if (x <= 0) x = 0;
143
+
144
+ y_low = (int) y;
145
+ x_low = (int) x;
146
+
147
+ if (y_low >= height - 1) {
148
+ y_high = y_low = height - 1;
149
+ y = (T) y_low;
150
+ } else {
151
+ y_high = y_low + 1;
152
+ }
153
+
154
+ if (x_low >= width - 1) {
155
+ x_high = x_low = width - 1;
156
+ x = (T) x_low;
157
+ } else {
158
+ x_high = x_low + 1;
159
+ }
160
+
161
+ T ly = y - y_low;
162
+ T lx = x - x_low;
163
+ T hy = 1. - ly, hx = 1. - lx;
164
+
165
+ // reference in forward
166
+ // T v1 = bottom_data[y_low * width + x_low];
167
+ // T v2 = bottom_data[y_low * width + x_high];
168
+ // T v3 = bottom_data[y_high * width + x_low];
169
+ // T v4 = bottom_data[y_high * width + x_high];
170
+ // T val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4);
171
+
172
+ w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx;
173
+
174
+ return;
175
+ }
176
+
177
+ template <typename T>
178
+ __global__ void RoIAlignBackwardFeature(const int nthreads, const T* top_diff,
179
+ const int num_rois, const T spatial_scale,
180
+ const int channels, const int height, const int width,
181
+ const int pooled_height, const int pooled_width,
182
+ const int sampling_ratio,
183
+ T* bottom_diff,
184
+ const T* bottom_rois) {
185
+ CUDA_1D_KERNEL_LOOP(index, nthreads) {
186
+ // (n, c, ph, pw) is an element in the pooled output
187
+ int pw = index % pooled_width;
188
+ int ph = (index / pooled_width) % pooled_height;
189
+ int c = (index / pooled_width / pooled_height) % channels;
190
+ int n = index / pooled_width / pooled_height / channels;
191
+
192
+ const T* offset_bottom_rois = bottom_rois + n * 5;
193
+ int roi_batch_ind = offset_bottom_rois[0];
194
+
195
+ // Do not using rounding; this implementation detail is critical
196
+ T roi_start_w = offset_bottom_rois[1] * spatial_scale;
197
+ T roi_start_h = offset_bottom_rois[2] * spatial_scale;
198
+ T roi_end_w = offset_bottom_rois[3] * spatial_scale;
199
+ T roi_end_h = offset_bottom_rois[4] * spatial_scale;
200
+ // T roi_start_w = round(offset_bottom_rois[1] * spatial_scale);
201
+ // T roi_start_h = round(offset_bottom_rois[2] * spatial_scale);
202
+ // T roi_end_w = round(offset_bottom_rois[3] * spatial_scale);
203
+ // T roi_end_h = round(offset_bottom_rois[4] * spatial_scale);
204
+
205
+ // Force malformed ROIs to be 1x1
206
+ T roi_width = max(roi_end_w - roi_start_w, (T)1.);
207
+ T roi_height = max(roi_end_h - roi_start_h, (T)1.);
208
+ T bin_size_h = static_cast<T>(roi_height) / static_cast<T>(pooled_height);
209
+ T bin_size_w = static_cast<T>(roi_width) / static_cast<T>(pooled_width);
210
+
211
+ T* offset_bottom_diff = bottom_diff + (roi_batch_ind * channels + c) * height * width;
212
+
213
+ int top_offset = (n * channels + c) * pooled_height * pooled_width;
214
+ const T* offset_top_diff = top_diff + top_offset;
215
+ const T top_diff_this_bin = offset_top_diff[ph * pooled_width + pw];
216
+
217
+ // We use roi_bin_grid to sample the grid and mimic integral
218
+ int roi_bin_grid_h = (sampling_ratio > 0) ? sampling_ratio : ceil(roi_height / pooled_height); // e.g., = 2
219
+ int roi_bin_grid_w = (sampling_ratio > 0) ? sampling_ratio : ceil(roi_width / pooled_width);
220
+
221
+ // We do average (integral) pooling inside a bin
222
+ const T count = roi_bin_grid_h * roi_bin_grid_w; // e.g. = 4
223
+
224
+ for (int iy = 0; iy < roi_bin_grid_h; iy ++) // e.g., iy = 0, 1
225
+ {
226
+ const T y = roi_start_h + ph * bin_size_h + static_cast<T>(iy + .5f) * bin_size_h / static_cast<T>(roi_bin_grid_h); // e.g., 0.5, 1.5
227
+ for (int ix = 0; ix < roi_bin_grid_w; ix ++)
228
+ {
229
+ const T x = roi_start_w + pw * bin_size_w + static_cast<T>(ix + .5f) * bin_size_w / static_cast<T>(roi_bin_grid_w);
230
+
231
+ T w1, w2, w3, w4;
232
+ int x_low, x_high, y_low, y_high;
233
+
234
+ bilinear_interpolate_gradient(height, width, y, x,
235
+ w1, w2, w3, w4,
236
+ x_low, x_high, y_low, y_high,
237
+ index);
238
+
239
+ T g1 = top_diff_this_bin * w1 / count;
240
+ T g2 = top_diff_this_bin * w2 / count;
241
+ T g3 = top_diff_this_bin * w3 / count;
242
+ T g4 = top_diff_this_bin * w4 / count;
243
+
244
+ if (x_low >= 0 && x_high >= 0 && y_low >= 0 && y_high >= 0)
245
+ {
246
+ atomicAdd(offset_bottom_diff + y_low * width + x_low, static_cast<T>(g1));
247
+ atomicAdd(offset_bottom_diff + y_low * width + x_high, static_cast<T>(g2));
248
+ atomicAdd(offset_bottom_diff + y_high * width + x_low, static_cast<T>(g3));
249
+ atomicAdd(offset_bottom_diff + y_high * width + x_high, static_cast<T>(g4));
250
+ } // if
251
+ } // ix
252
+ } // iy
253
+ } // CUDA_1D_KERNEL_LOOP
254
+ } // RoIAlignBackwardFeature
255
+
256
+
257
+ at::Tensor ROIAlign_forward_cuda(const at::Tensor& input,
258
+ const at::Tensor& rois,
259
+ const float spatial_scale,
260
+ const int pooled_height,
261
+ const int pooled_width,
262
+ const int sampling_ratio) {
263
+ AT_ASSERTM(input.type().is_cuda(), "input must be a CUDA tensor");
264
+ AT_ASSERTM(rois.type().is_cuda(), "rois must be a CUDA tensor");
265
+
266
+ auto num_rois = rois.size(0);
267
+ auto channels = input.size(1);
268
+ auto height = input.size(2);
269
+ auto width = input.size(3);
270
+
271
+ auto output = at::empty({num_rois, channels, pooled_height, pooled_width}, input.options());
272
+ auto output_size = num_rois * pooled_height * pooled_width * channels;
273
+ cudaStream_t stream = at::cuda::getCurrentCUDAStream();
274
+
275
+ dim3 grid(std::min(THCCeilDiv((long)output_size, 512L), 4096L));
276
+ dim3 block(512);
277
+
278
+ if (output.numel() == 0) {
279
+ THCudaCheck(cudaGetLastError());
280
+ return output;
281
+ }
282
+
283
+ AT_DISPATCH_FLOATING_TYPES(input.type(), "ROIAlign_forward", [&] {
284
+ RoIAlignForward<scalar_t><<<grid, block, 0, stream>>>(
285
+ output_size,
286
+ input.contiguous().data<scalar_t>(),
287
+ spatial_scale,
288
+ channels,
289
+ height,
290
+ width,
291
+ pooled_height,
292
+ pooled_width,
293
+ sampling_ratio,
294
+ rois.contiguous().data<scalar_t>(),
295
+ output.data<scalar_t>());
296
+ });
297
+ THCudaCheck(cudaGetLastError());
298
+ return output;
299
+ }
300
+
301
+ // TODO remove the dependency on input and use instead its sizes -> save memory
302
+ at::Tensor ROIAlign_backward_cuda(const at::Tensor& grad,
303
+ const at::Tensor& rois,
304
+ const float spatial_scale,
305
+ const int pooled_height,
306
+ const int pooled_width,
307
+ const int batch_size,
308
+ const int channels,
309
+ const int height,
310
+ const int width,
311
+ const int sampling_ratio) {
312
+ AT_ASSERTM(grad.type().is_cuda(), "grad must be a CUDA tensor");
313
+ AT_ASSERTM(rois.type().is_cuda(), "rois must be a CUDA tensor");
314
+
315
+ auto num_rois = rois.size(0);
316
+ auto grad_input = at::zeros({batch_size, channels, height, width}, grad.options());
317
+
318
+ cudaStream_t stream = at::cuda::getCurrentCUDAStream();
319
+
320
+ dim3 grid(std::min(THCCeilDiv((long)grad.numel(), 512L), 4096L));
321
+ dim3 block(512);
322
+
323
+ // handle possibly empty gradients
324
+ if (grad.numel() == 0) {
325
+ THCudaCheck(cudaGetLastError());
326
+ return grad_input;
327
+ }
328
+
329
+ AT_DISPATCH_FLOATING_TYPES(grad.type(), "ROIAlign_backward", [&] {
330
+ RoIAlignBackwardFeature<scalar_t><<<grid, block, 0, stream>>>(
331
+ grad.numel(),
332
+ grad.contiguous().data<scalar_t>(),
333
+ num_rois,
334
+ spatial_scale,
335
+ channels,
336
+ height,
337
+ width,
338
+ pooled_height,
339
+ pooled_width,
340
+ sampling_ratio,
341
+ grad_input.data<scalar_t>(),
342
+ rois.contiguous().data<scalar_t>());
343
+ });
344
+ THCudaCheck(cudaGetLastError());
345
+ return grad_input;
346
+ }
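RoIAlignForward above averages bilinearly interpolated samples over a roi_bin_grid_h x roi_bin_grid_w grid inside each output bin. As a rough host-side illustration of that sampling math (a single-channel C++ sketch with illustrative helper names, not the CUDA implementation itself):

#include <cmath>
#include <cstdio>
#include <vector>

// Bilinear lookup into a row-major height x width map, with the same boundary
// handling as bilinear_interpolate() in the forward kernel above.
float bilinear(const std::vector<float>& data, int height, int width, float y, float x) {
  if (y < -1.f || y > height || x < -1.f || x > width) return 0.f;
  if (y <= 0) y = 0;
  if (x <= 0) x = 0;
  int y_low = (int)y, x_low = (int)x, y_high, x_high;
  if (y_low >= height - 1) { y_high = y_low = height - 1; y = (float)y_low; } else { y_high = y_low + 1; }
  if (x_low >= width - 1)  { x_high = x_low = width - 1;  x = (float)x_low; } else { x_high = x_low + 1; }
  float ly = y - y_low, lx = x - x_low, hy = 1.f - ly, hx = 1.f - lx;
  return hy * hx * data[y_low * width + x_low] + hy * lx * data[y_low * width + x_high]
       + ly * hx * data[y_high * width + x_low] + ly * lx * data[y_high * width + x_high];
}

// Average-pool one output bin (ph, pw) of an ROI over a regular sampling grid,
// i.e. the per-thread work of RoIAlignForward for a single channel.
float roi_align_bin(const std::vector<float>& feat, int height, int width,
                    float roi_start_h, float roi_start_w, float roi_h, float roi_w,
                    int pooled_h, int pooled_w, int ph, int pw, int sampling_ratio) {
  float bin_h = roi_h / pooled_h, bin_w = roi_w / pooled_w;
  int grid_h = sampling_ratio > 0 ? sampling_ratio : (int)std::ceil(roi_h / pooled_h);
  int grid_w = sampling_ratio > 0 ? sampling_ratio : (int)std::ceil(roi_w / pooled_w);
  float sum = 0.f;
  for (int iy = 0; iy < grid_h; ++iy) {
    float y = roi_start_h + ph * bin_h + (iy + 0.5f) * bin_h / grid_h;
    for (int ix = 0; ix < grid_w; ++ix) {
      float x = roi_start_w + pw * bin_w + (ix + 0.5f) * bin_w / grid_w;
      sum += bilinear(feat, height, width, y, x);
    }
  }
  return sum / (grid_h * grid_w);   // average over all samples in the bin
}

int main() {
  std::vector<float> feat(8 * 8);
  for (int i = 0; i < 64; ++i) feat[i] = (float)i;   // simple ramp feature map
  // one bin of a 2x2-pooled ROI starting at (1, 1) with size 4x4, 2 samples per axis
  std::printf("%f\n", roi_align_bin(feat, 8, 8, 1.f, 1.f, 4.f, 4.f, 2, 2, 0, 0, 2));
}

The backward kernel distributes each bin's gradient back to the same four corner pixels with the same bilinear weights, divided by the sample count.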
maskrcnn_benchmark/csrc/cuda/ROIPool_cuda.cu ADDED
@@ -0,0 +1,202 @@
1
+ // Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
2
+ #include <ATen/ATen.h>
3
+ #include <ATen/cuda/CUDAContext.h>
4
+
5
+ #include <THC/THC.h>
6
+ #include <THC/THCAtomics.cuh>
7
+ #include <THC/THCDeviceUtils.cuh>
8
+
9
+
10
+ // TODO make it in a common file
11
+ #define CUDA_1D_KERNEL_LOOP(i, n) \
12
+ for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n; \
13
+ i += blockDim.x * gridDim.x)
14
+
15
+
16
+ template <typename T>
17
+ __global__ void RoIPoolFForward(const int nthreads, const T* bottom_data,
18
+ const T spatial_scale, const int channels, const int height,
19
+ const int width, const int pooled_height, const int pooled_width,
20
+ const T* bottom_rois, T* top_data, int* argmax_data) {
21
+ CUDA_1D_KERNEL_LOOP(index, nthreads) {
22
+ // (n, c, ph, pw) is an element in the pooled output
23
+ int pw = index % pooled_width;
24
+ int ph = (index / pooled_width) % pooled_height;
25
+ int c = (index / pooled_width / pooled_height) % channels;
26
+ int n = index / pooled_width / pooled_height / channels;
27
+
28
+ const T* offset_bottom_rois = bottom_rois + n * 5;
29
+ int roi_batch_ind = offset_bottom_rois[0];
30
+ int roi_start_w = round(offset_bottom_rois[1] * spatial_scale);
31
+ int roi_start_h = round(offset_bottom_rois[2] * spatial_scale);
32
+ int roi_end_w = round(offset_bottom_rois[3] * spatial_scale);
33
+ int roi_end_h = round(offset_bottom_rois[4] * spatial_scale);
34
+
35
+ // Force malformed ROIs to be 1x1
36
+ int roi_width = max(roi_end_w - roi_start_w + 1, 1);
37
+ int roi_height = max(roi_end_h - roi_start_h + 1, 1);
38
+ T bin_size_h = static_cast<T>(roi_height)
39
+ / static_cast<T>(pooled_height);
40
+ T bin_size_w = static_cast<T>(roi_width)
41
+ / static_cast<T>(pooled_width);
42
+
43
+ int hstart = static_cast<int>(floor(static_cast<T>(ph)
44
+ * bin_size_h));
45
+ int wstart = static_cast<int>(floor(static_cast<T>(pw)
46
+ * bin_size_w));
47
+ int hend = static_cast<int>(ceil(static_cast<T>(ph + 1)
48
+ * bin_size_h));
49
+ int wend = static_cast<int>(ceil(static_cast<T>(pw + 1)
50
+ * bin_size_w));
51
+
52
+ // Add roi offsets and clip to input boundaries
53
+ hstart = min(max(hstart + roi_start_h, 0), height);
54
+ hend = min(max(hend + roi_start_h, 0), height);
55
+ wstart = min(max(wstart + roi_start_w, 0), width);
56
+ wend = min(max(wend + roi_start_w, 0), width);
57
+ bool is_empty = (hend <= hstart) || (wend <= wstart);
58
+
59
+ // Define an empty pooling region to be zero
60
+ T maxval = is_empty ? 0 : -FLT_MAX;
61
+ // If nothing is pooled, argmax = -1 causes nothing to be backprop'd
62
+ int maxidx = -1;
63
+ const T* offset_bottom_data =
64
+ bottom_data + (roi_batch_ind * channels + c) * height * width;
65
+ for (int h = hstart; h < hend; ++h) {
66
+ for (int w = wstart; w < wend; ++w) {
67
+ int bottom_index = h * width + w;
68
+ if (offset_bottom_data[bottom_index] > maxval) {
69
+ maxval = offset_bottom_data[bottom_index];
70
+ maxidx = bottom_index;
71
+ }
72
+ }
73
+ }
74
+ top_data[index] = maxval;
75
+ argmax_data[index] = maxidx;
76
+ }
77
+ }
78
+
79
+ template <typename T>
80
+ __global__ void RoIPoolFBackward(const int nthreads, const T* top_diff,
81
+ const int* argmax_data, const int num_rois, const T spatial_scale,
82
+ const int channels, const int height, const int width,
83
+ const int pooled_height, const int pooled_width, T* bottom_diff,
84
+ const T* bottom_rois) {
85
+ CUDA_1D_KERNEL_LOOP(index, nthreads) {
86
+ // (n, c, ph, pw) is an element in the pooled output
87
+ int pw = index % pooled_width;
88
+ int ph = (index / pooled_width) % pooled_height;
89
+ int c = (index / pooled_width / pooled_height) % channels;
90
+ int n = index / pooled_width / pooled_height / channels;
91
+
92
+ const T* offset_bottom_rois = bottom_rois + n * 5;
93
+ int roi_batch_ind = offset_bottom_rois[0];
94
+ int bottom_offset = (roi_batch_ind * channels + c) * height * width;
95
+ int top_offset = (n * channels + c) * pooled_height * pooled_width;
96
+ const T* offset_top_diff = top_diff + top_offset;
97
+ T* offset_bottom_diff = bottom_diff + bottom_offset;
98
+ const int* offset_argmax_data = argmax_data + top_offset;
99
+
100
+ int argmax = offset_argmax_data[ph * pooled_width + pw];
101
+ if (argmax != -1) {
102
+ atomicAdd(
103
+ offset_bottom_diff + argmax,
104
+ static_cast<T>(offset_top_diff[ph * pooled_width + pw]));
105
+
106
+ }
107
+ }
108
+ }
109
+
110
+ std::tuple<at::Tensor, at::Tensor> ROIPool_forward_cuda(const at::Tensor& input,
111
+ const at::Tensor& rois,
112
+ const float spatial_scale,
113
+ const int pooled_height,
114
+ const int pooled_width) {
115
+ AT_ASSERTM(input.type().is_cuda(), "input must be a CUDA tensor");
116
+ AT_ASSERTM(rois.type().is_cuda(), "rois must be a CUDA tensor");
117
+
118
+ auto num_rois = rois.size(0);
119
+ auto channels = input.size(1);
120
+ auto height = input.size(2);
121
+ auto width = input.size(3);
122
+
123
+ auto output = at::empty({num_rois, channels, pooled_height, pooled_width}, input.options());
124
+ auto output_size = num_rois * pooled_height * pooled_width * channels;
125
+ auto argmax = at::zeros({num_rois, channels, pooled_height, pooled_width}, input.options().dtype(at::kInt));
126
+
127
+ cudaStream_t stream = at::cuda::getCurrentCUDAStream();
128
+
129
+ dim3 grid(std::min(THCCeilDiv((long)output_size, 512L), 4096L));
130
+ dim3 block(512);
131
+
132
+ if (output.numel() == 0) {
133
+ THCudaCheck(cudaGetLastError());
134
+ return std::make_tuple(output, argmax);
135
+ }
136
+
137
+ AT_DISPATCH_FLOATING_TYPES(input.type(), "ROIPool_forward", [&] {
138
+ RoIPoolFForward<scalar_t><<<grid, block, 0, stream>>>(
139
+ output_size,
140
+ input.contiguous().data<scalar_t>(),
141
+ spatial_scale,
142
+ channels,
143
+ height,
144
+ width,
145
+ pooled_height,
146
+ pooled_width,
147
+ rois.contiguous().data<scalar_t>(),
148
+ output.data<scalar_t>(),
149
+ argmax.data<int>());
150
+ });
151
+ THCudaCheck(cudaGetLastError());
152
+ return std::make_tuple(output, argmax);
153
+ }
154
+
155
+ // TODO remove the dependency on input and use instead its sizes -> save memory
156
+ at::Tensor ROIPool_backward_cuda(const at::Tensor& grad,
157
+ const at::Tensor& input,
158
+ const at::Tensor& rois,
159
+ const at::Tensor& argmax,
160
+ const float spatial_scale,
161
+ const int pooled_height,
162
+ const int pooled_width,
163
+ const int batch_size,
164
+ const int channels,
165
+ const int height,
166
+ const int width) {
167
+ AT_ASSERTM(grad.type().is_cuda(), "grad must be a CUDA tensor");
168
+ AT_ASSERTM(rois.type().is_cuda(), "rois must be a CUDA tensor");
169
+ // TODO add more checks
170
+
171
+ auto num_rois = rois.size(0);
172
+ auto grad_input = at::zeros({batch_size, channels, height, width}, grad.options());
173
+
174
+ cudaStream_t stream = at::cuda::getCurrentCUDAStream();
175
+
176
+ dim3 grid(std::min(THCCeilDiv((long)grad.numel(), 512L), 4096L));
177
+ dim3 block(512);
178
+
179
+ // handle possibly empty gradients
180
+ if (grad.numel() == 0) {
181
+ THCudaCheck(cudaGetLastError());
182
+ return grad_input;
183
+ }
184
+
185
+ AT_DISPATCH_FLOATING_TYPES(grad.type(), "ROIPool_backward", [&] {
186
+ RoIPoolFBackward<scalar_t><<<grid, block, 0, stream>>>(
187
+ grad.numel(),
188
+ grad.contiguous().data<scalar_t>(),
189
+ argmax.data<int>(),
190
+ num_rois,
191
+ spatial_scale,
192
+ channels,
193
+ height,
194
+ width,
195
+ pooled_height,
196
+ pooled_width,
197
+ grad_input.data<scalar_t>(),
198
+ rois.contiguous().data<scalar_t>());
199
+ });
200
+ THCudaCheck(cudaGetLastError());
201
+ return grad_input;
202
+ }
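RoIPool, in contrast to RoIAlign, takes the max over each bin and records the argmax so the backward pass can route each output gradient to exactly one input cell. A hedged single-channel C++ sketch of that forward/backward pairing (illustrative names only, not the repository's code):

#include <cfloat>
#include <cstdio>
#include <vector>

// Max-pool one bin of a single-channel feature map and record the argmax,
// mirroring the per-thread work of RoIPoolFForward.
void roi_pool_bin(const std::vector<float>& feat, int width,
                  int hstart, int hend, int wstart, int wend,
                  float* out_val, int* out_argmax) {
  bool is_empty = (hend <= hstart) || (wend <= wstart);
  float maxval = is_empty ? 0.f : -FLT_MAX;   // empty bins pool to zero
  int maxidx = -1;                            // -1 means no gradient flows back
  for (int h = hstart; h < hend; ++h)
    for (int w = wstart; w < wend; ++w) {
      int idx = h * width + w;
      if (feat[idx] > maxval) { maxval = feat[idx]; maxidx = idx; }
    }
  *out_val = maxval;
  *out_argmax = maxidx;
}

// Backward of one bin: the whole incoming gradient goes to the recorded argmax,
// which is what RoIPoolFBackward does with atomicAdd on the GPU.
void roi_pool_bin_backward(float top_grad, int argmax, std::vector<float>& grad_feat) {
  if (argmax != -1) grad_feat[argmax] += top_grad;
}

int main() {
  std::vector<float> feat = {1, 5, 2, 0, 3, 4, 7, 6, 8};   // 3x3 map
  float val; int argmax;
  roi_pool_bin(feat, 3, 0, 3, 0, 3, &val, &argmax);
  std::vector<float> grad(9, 0.f);
  roi_pool_bin_backward(1.f, argmax, grad);
  std::printf("max %f at %d, grad[%d] = %f\n", val, argmax, argmax, grad[argmax]);
}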
maskrcnn_benchmark/csrc/cuda/SigmoidFocalLoss_cuda.cu ADDED
@@ -0,0 +1,188 @@
1
+ // Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
2
+ // This file is modified from https://github.com/pytorch/pytorch/blob/master/modules/detectron/sigmoid_focal_loss_op.cu
3
+ // Cheng-Yang Fu
4
5
+ #include <ATen/ATen.h>
6
+ #include <ATen/cuda/CUDAContext.h>
7
+
8
+ #include <THC/THC.h>
9
+ #include <THC/THCAtomics.cuh>
10
+ #include <THC/THCDeviceUtils.cuh>
11
+
12
+ #include <cfloat>
13
+
14
+ // TODO make it in a common file
15
+ #define CUDA_1D_KERNEL_LOOP(i, n) \
16
+ for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n; \
17
+ i += blockDim.x * gridDim.x)
18
+
19
+
20
+ template <typename T>
21
+ __global__ void SigmoidFocalLossForward(const int nthreads,
22
+ const T* logits,
23
+ const int* targets,
24
+ const int num_classes,
25
+ const float gamma,
26
+ const float alpha,
27
+ const int num,
28
+ T* losses) {
29
+ CUDA_1D_KERNEL_LOOP(i, nthreads) {
30
+
31
+ int n = i / num_classes;
32
+ int d = i % num_classes; // current class[0~79];
33
+ int t = targets[n]; // target class [1~80];
34
+
35
+ // Decide it is positive or negative case.
36
+ T c1 = (t == (d+1));
37
+ T c2 = (t>=0 & t != (d+1));
38
+
39
+ T zn = (1.0 - alpha);
40
+ T zp = (alpha);
41
+
42
+ // p = 1. / 1. + expf(-x); p = sigmoid(x)
43
+ T p = 1. / (1. + expf(-logits[i]));
44
+
45
+ // (1-p)**gamma * log(p) where
46
+ T term1 = powf((1. - p), gamma) * logf(max(p, FLT_MIN));
47
+
48
+ // p**gamma * log(1-p)
49
+ T term2 = powf(p, gamma) *
50
+ (-1. * logits[i] * (logits[i] >= 0) -
51
+ logf(1. + expf(logits[i] - 2. * logits[i] * (logits[i] >= 0))));
52
+
53
+ losses[i] = 0.0;
54
+ losses[i] += -c1 * term1 * zp;
55
+ losses[i] += -c2 * term2 * zn;
56
+
57
+ } // CUDA_1D_KERNEL_LOOP
58
+ } // SigmoidFocalLossForward
59
+
60
+
61
+ template <typename T>
62
+ __global__ void SigmoidFocalLossBackward(const int nthreads,
63
+ const T* logits,
64
+ const int* targets,
65
+ const T* d_losses,
66
+ const int num_classes,
67
+ const float gamma,
68
+ const float alpha,
69
+ const int num,
70
+ T* d_logits) {
71
+ CUDA_1D_KERNEL_LOOP(i, nthreads) {
72
+
73
+ int n = i / num_classes;
74
+ int d = i % num_classes; // current class[0~79];
75
+ int t = targets[n]; // target class [1~80], 0 is background;
76
+
77
+ // Decide it is positive or negative case.
78
+ T c1 = (t == (d+1));
79
+ T c2 = (t>=0 & t != (d+1));
80
+
81
+ T zn = (1.0 - alpha);
82
+ T zp = (alpha);
83
+ // p = 1. / 1. + expf(-x); p = sigmoid(x)
84
+ T p = 1. / (1. + expf(-logits[i]));
85
+
86
+ // (1-p)**g * (1 - p - g*p*log(p)
87
+ T term1 = powf((1. - p), gamma) *
88
+ (1. - p - (p * gamma * logf(max(p, FLT_MIN))));
89
+
90
+ // (p**g) * (g*(1-p)*log(1-p) - p)
91
+ T term2 = powf(p, gamma) *
92
+ ((-1. * logits[i] * (logits[i] >= 0) -
93
+ logf(1. + expf(logits[i] - 2. * logits[i] * (logits[i] >= 0)))) *
94
+ (1. - p) * gamma - p);
95
+ d_logits[i] = 0.0;
96
+ d_logits[i] += -c1 * term1 * zp;
97
+ d_logits[i] += -c2 * term2 * zn;
98
+ d_logits[i] = d_logits[i] * d_losses[i];
99
+
100
+ } // CUDA_1D_KERNEL_LOOP
101
+ } // SigmoidFocalLossBackward
102
+
103
+
104
+ at::Tensor SigmoidFocalLoss_forward_cuda(
105
+ const at::Tensor& logits,
106
+ const at::Tensor& targets,
107
+ const int num_classes,
108
+ const float gamma,
109
+ const float alpha) {
110
+ AT_ASSERTM(logits.type().is_cuda(), "logits must be a CUDA tensor");
111
+ AT_ASSERTM(targets.type().is_cuda(), "targets must be a CUDA tensor");
112
+ AT_ASSERTM(logits.dim() == 2, "logits should be NxClass");
113
+
114
+ const int num_samples = logits.size(0);
115
+
116
+ auto losses = at::empty({num_samples, logits.size(1)}, logits.options());
117
+ auto losses_size = num_samples * logits.size(1);
118
+ cudaStream_t stream = at::cuda::getCurrentCUDAStream();
119
+
120
+ dim3 grid(std::min(THCCeilDiv(losses_size, 512L), 4096L));
121
+ dim3 block(512);
122
+
123
+ if (losses.numel() == 0) {
124
+ THCudaCheck(cudaGetLastError());
125
+ return losses;
126
+ }
127
+
128
+ AT_DISPATCH_FLOATING_TYPES(logits.type(), "SigmoidFocalLoss_forward", [&] {
129
+ SigmoidFocalLossForward<scalar_t><<<grid, block, 0, stream>>>(
130
+ losses_size,
131
+ logits.contiguous().data<scalar_t>(),
132
+ targets.contiguous().data<int>(),
133
+ num_classes,
134
+ gamma,
135
+ alpha,
136
+ num_samples,
137
+ losses.data<scalar_t>());
138
+ });
139
+ THCudaCheck(cudaGetLastError());
140
+ return losses;
141
+ }
142
+
143
+
144
+ at::Tensor SigmoidFocalLoss_backward_cuda(
145
+ const at::Tensor& logits,
146
+ const at::Tensor& targets,
147
+ const at::Tensor& d_losses,
148
+ const int num_classes,
149
+ const float gamma,
150
+ const float alpha) {
151
+ AT_ASSERTM(logits.type().is_cuda(), "logits must be a CUDA tensor");
152
+ AT_ASSERTM(targets.type().is_cuda(), "targets must be a CUDA tensor");
153
+ AT_ASSERTM(d_losses.type().is_cuda(), "d_losses must be a CUDA tensor");
154
+
155
+ AT_ASSERTM(logits.dim() == 2, "logits should be NxClass");
156
+
157
+ const int num_samples = logits.size(0);
158
+ AT_ASSERTM(logits.size(1) == num_classes, "logits.size(1) should be num_classes");
159
+
160
+ auto d_logits = at::zeros({num_samples, num_classes}, logits.options());
161
+ auto d_logits_size = num_samples * logits.size(1);
162
+ cudaStream_t stream = at::cuda::getCurrentCUDAStream();
163
+
164
+ dim3 grid(std::min(THCCeilDiv(d_logits_size, 512L), 4096L));
165
+ dim3 block(512);
166
+
167
+ if (d_logits.numel() == 0) {
168
+ THCudaCheck(cudaGetLastError());
169
+ return d_logits;
170
+ }
171
+
172
+ AT_DISPATCH_FLOATING_TYPES(logits.type(), "SigmoidFocalLoss_backward", [&] {
173
+ SigmoidFocalLossBackward<scalar_t><<<grid, block, 0, stream>>>(
174
+ d_logits_size,
175
+ logits.contiguous().data<scalar_t>(),
176
+ targets.contiguous().data<int>(),
177
+ d_losses.contiguous().data<scalar_t>(),
178
+ num_classes,
179
+ gamma,
180
+ alpha,
181
+ num_samples,
182
+ d_logits.data<scalar_t>());
183
+ });
184
+
185
+ THCudaCheck(cudaGetLastError());
186
+ return d_logits;
187
+ }
188
+
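The forward kernel evaluates the two focal-loss branches with an overflow-safe log-sigmoid identity. A hedged per-element C++ sketch of the same formula (logit x, 0-based class slot d, 1-based target t with 0 = background; function name is illustrative):

#include <algorithm>
#include <cfloat>
#include <cmath>
#include <cstdio>

// Per-element sigmoid focal loss for class slot d given target t,
// mirroring SigmoidFocalLossForward above.
float sigmoid_focal_loss(float x, int t, int d, float gamma, float alpha) {
  bool pos = (t == d + 1);            // this slot is the target class
  bool neg = (t >= 0 && t != d + 1);  // labeled sample, some other class
  float p = 1.f / (1.f + std::exp(-x));

  // (1-p)^gamma * log(p), clamped away from log(0)
  float term1 = std::pow(1.f - p, gamma) * std::log(std::max(p, FLT_MIN));
  // p^gamma * log(1-p), written with the overflow-safe log-sigmoid identity
  float term2 = std::pow(p, gamma) *
                (-x * (x >= 0) - std::log(1.f + std::exp(x - 2.f * x * (x >= 0))));

  return -(pos ? alpha * term1 : 0.f) - (neg ? (1.f - alpha) * term2 : 0.f);
}

int main() {
  std::printf("pos loss: %f\n", sigmoid_focal_loss(2.0f, 3, 2, 2.0f, 0.25f));
  std::printf("neg loss: %f\n", sigmoid_focal_loss(2.0f, 5, 2, 2.0f, 0.25f));
}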
maskrcnn_benchmark/csrc/cuda/dcn_v2_cuda.cu ADDED
@@ -0,0 +1,335 @@
1
+ #include <vector>
2
+ #include "cuda/dcn_v2_im2col_cuda.h"
3
+
4
+ #include <ATen/ATen.h>
5
+ #include <ATen/cuda/CUDAContext.h>
6
+
7
+ #include <THC/THC.h>
8
+ #include <THC/THCAtomics.cuh>
9
+ #include <THC/THCDeviceUtils.cuh>
10
+
11
+ extern THCState *state;
12
+
13
+ // author: Charles Shang
14
+ // https://github.com/torch/cunn/blob/master/lib/THCUNN/generic/SpatialConvolutionMM.cu
15
+
16
+ // [batch gemm]
17
+ // https://github.com/pytorch/pytorch/blob/master/aten/src/THC/generic/THCTensorMathBlas.cu
18
+
19
+ __global__ void createBatchGemmBuffer(const float **input_b, float **output_b,
20
+ float **columns_b, const float **ones_b,
21
+ const float **weight_b, const float **bias_b,
22
+ float *input, float *output,
23
+ float *columns, float *ones,
24
+ float *weight, float *bias,
25
+ const int input_stride, const int output_stride,
26
+ const int columns_stride, const int ones_stride,
27
+ const int num_batches)
28
+ {
29
+ const int idx = blockIdx.x * blockDim.x + threadIdx.x;
30
+ if (idx < num_batches)
31
+ {
32
+ input_b[idx] = input + idx * input_stride;
33
+ output_b[idx] = output + idx * output_stride;
34
+ columns_b[idx] = columns + idx * columns_stride;
35
+ ones_b[idx] = ones + idx * ones_stride;
36
+ // share weights and bias within a Mini-Batch
37
+ weight_b[idx] = weight;
38
+ bias_b[idx] = bias;
39
+ }
40
+ }
41
+
42
+ at::Tensor
43
+ dcn_v2_cuda_forward(const at::Tensor &input,
44
+ const at::Tensor &weight,
45
+ const at::Tensor &bias,
46
+ const at::Tensor &offset,
47
+ const at::Tensor &mask,
48
+ const int kernel_h,
49
+ const int kernel_w,
50
+ const int stride_h,
51
+ const int stride_w,
52
+ const int pad_h,
53
+ const int pad_w,
54
+ const int dilation_h,
55
+ const int dilation_w,
56
+ const int deformable_group)
57
+ {
58
+ using scalar_t = float;
59
+ // THCAssertSameGPU(THCudaTensor_checkGPU(state, 5, input, weight, bias, offset, mask));
60
+ AT_ASSERTM(input.type().is_cuda(), "input must be a CUDA tensor");
61
+ AT_ASSERTM(weight.type().is_cuda(), "weight must be a CUDA tensor");
62
+ AT_ASSERTM(bias.type().is_cuda(), "bias must be a CUDA tensor");
63
+ AT_ASSERTM(offset.type().is_cuda(), "offset must be a CUDA tensor");
64
+ AT_ASSERTM(mask.type().is_cuda(), "mask must be a CUDA tensor");
65
+
66
+ const int batch = input.size(0);
67
+ const int channels = input.size(1);
68
+ const int height = input.size(2);
69
+ const int width = input.size(3);
70
+
71
+ const int channels_out = weight.size(0);
72
+ const int channels_kernel = weight.size(1);
73
+ const int kernel_h_ = weight.size(2);
74
+ const int kernel_w_ = weight.size(3);
75
+
76
+ // printf("Kernels: %d %d %d %d\n", kernel_h_, kernel_w_, kernel_w, kernel_h);
77
+ // printf("Channels: %d %d\n", channels, channels_kernel);
78
+ // printf("Channels: %d %d\n", channels_out, channels_kernel);
79
+
80
+ AT_ASSERTM(kernel_h_ == kernel_h && kernel_w_ == kernel_w,
81
+ "Input shape and kernel shape wont match: (%d x %d vs %d x %d).", kernel_h_, kernel_w, kernel_h_, kernel_w_);
82
+
83
+ AT_ASSERTM(channels == channels_kernel,
84
+ "Input shape and kernel channels wont match: (%d vs %d).", channels, channels_kernel);
85
+
86
+ const int height_out = (height + 2 * pad_h - (dilation_h * (kernel_h - 1) + 1)) / stride_h + 1;
87
+ const int width_out = (width + 2 * pad_w - (dilation_w * (kernel_w - 1) + 1)) / stride_w + 1;
88
+
89
+ auto ones = at::ones({batch, height_out, width_out}, input.options());
90
+ auto columns = at::empty({batch, channels * kernel_h * kernel_w, 1 * height_out * width_out}, input.options());
91
+ auto output = at::empty({batch, channels_out, height_out, width_out}, input.options());
92
+
93
+ // prepare for batch-wise computing, which is significantly faster than instance-wise computing
94
+ // when batch size is large.
95
+ // launch batch threads
96
+ int matrices_size = batch * sizeof(float *);
97
+ auto input_b = static_cast<const float **>(THCudaMalloc(state, matrices_size));
98
+ auto output_b = static_cast<float **>(THCudaMalloc(state, matrices_size));
99
+ auto columns_b = static_cast<float **>(THCudaMalloc(state, matrices_size));
100
+ auto ones_b = static_cast<const float **>(THCudaMalloc(state, matrices_size));
101
+ auto weight_b = static_cast<const float **>(THCudaMalloc(state, matrices_size));
102
+ auto bias_b = static_cast<const float **>(THCudaMalloc(state, matrices_size));
103
+
104
+ const int block = 128;
105
+ const int grid = (batch + block - 1) / block;
106
+
107
+ createBatchGemmBuffer<<<grid, block, 0, THCState_getCurrentStream(state)>>>(
108
+ input_b, output_b,
109
+ columns_b, ones_b,
110
+ weight_b, bias_b,
111
+ input.data<scalar_t>(),
112
+ output.data<scalar_t>(),
113
+ columns.data<scalar_t>(),
114
+ ones.data<scalar_t>(),
115
+ weight.data<scalar_t>(),
116
+ bias.data<scalar_t>(),
117
+ channels * width * height,
118
+ channels_out * width_out * height_out,
119
+ channels * kernel_h * kernel_w * height_out * width_out,
120
+ height_out * width_out,
121
+ batch);
122
+
123
+ long m_ = channels_out;
124
+ long n_ = height_out * width_out;
125
+ long k_ = 1;
126
+ THCudaBlas_SgemmBatched(state,
127
+ 't',
128
+ 'n',
129
+ n_,
130
+ m_,
131
+ k_,
132
+ 1.0f,
133
+ ones_b, k_,
134
+ bias_b, k_,
135
+ 0.0f,
136
+ output_b, n_,
137
+ batch);
138
+
139
+ modulated_deformable_im2col_cuda(THCState_getCurrentStream(state),
140
+ input.data<scalar_t>(),
141
+ offset.data<scalar_t>(),
142
+ mask.data<scalar_t>(),
143
+ batch, channels, height, width,
144
+ height_out, width_out, kernel_h, kernel_w,
145
+ pad_h, pad_w, stride_h, stride_w, dilation_h, dilation_w,
146
+ deformable_group,
147
+ columns.data<scalar_t>());
148
+
149
+ long m = channels_out;
150
+ long n = height_out * width_out;
151
+ long k = channels * kernel_h * kernel_w;
152
+ THCudaBlas_SgemmBatched(state,
153
+ 'n',
154
+ 'n',
155
+ n,
156
+ m,
157
+ k,
158
+ 1.0f,
159
+ (const float **)columns_b, n,
160
+ weight_b, k,
161
+ 1.0f,
162
+ output_b, n,
163
+ batch);
164
+
165
+ THCudaFree(state, input_b);
166
+ THCudaFree(state, output_b);
167
+ THCudaFree(state, columns_b);
168
+ THCudaFree(state, ones_b);
169
+ THCudaFree(state, weight_b);
170
+ THCudaFree(state, bias_b);
171
+ return output;
172
+ }
173
+
174
+ __global__ void createBatchGemmBufferBackward(
175
+ float **grad_output_b,
176
+ float **columns_b,
177
+ float **ones_b,
178
+ float **weight_b,
179
+ float **grad_weight_b,
180
+ float **grad_bias_b,
181
+ float *grad_output,
182
+ float *columns,
183
+ float *ones,
184
+ float *weight,
185
+ float *grad_weight,
186
+ float *grad_bias,
187
+ const int grad_output_stride,
188
+ const int columns_stride,
189
+ const int ones_stride,
190
+ const int num_batches)
191
+ {
192
+ const int idx = blockIdx.x * blockDim.x + threadIdx.x;
193
+ if (idx < num_batches)
194
+ {
195
+ grad_output_b[idx] = grad_output + idx * grad_output_stride;
196
+ columns_b[idx] = columns + idx * columns_stride;
197
+ ones_b[idx] = ones + idx * ones_stride;
198
+
199
+ // share weights and bias within a Mini-Batch
200
+ weight_b[idx] = weight;
201
+ grad_weight_b[idx] = grad_weight;
202
+ grad_bias_b[idx] = grad_bias;
203
+ }
204
+ }
205
+
206
+ std::vector<at::Tensor> dcn_v2_cuda_backward(const at::Tensor &input,
207
+ const at::Tensor &weight,
208
+ const at::Tensor &bias,
209
+ const at::Tensor &offset,
210
+ const at::Tensor &mask,
211
+ const at::Tensor &grad_output,
212
+ int kernel_h, int kernel_w,
213
+ int stride_h, int stride_w,
214
+ int pad_h, int pad_w,
215
+ int dilation_h, int dilation_w,
216
+ int deformable_group)
217
+ {
218
+
219
+ THArgCheck(input.is_contiguous(), 1, "input tensor has to be contiguous");
220
+ THArgCheck(weight.is_contiguous(), 2, "weight tensor has to be contiguous");
221
+
222
+ AT_ASSERTM(input.type().is_cuda(), "input must be a CUDA tensor");
223
+ AT_ASSERTM(weight.type().is_cuda(), "weight must be a CUDA tensor");
224
+ AT_ASSERTM(bias.type().is_cuda(), "bias must be a CUDA tensor");
225
+ AT_ASSERTM(offset.type().is_cuda(), "offset must be a CUDA tensor");
226
+ AT_ASSERTM(mask.type().is_cuda(), "mask must be a CUDA tensor");
227
+
228
+ const int batch = input.size(0);
229
+ const int channels = input.size(1);
230
+ const int height = input.size(2);
231
+ const int width = input.size(3);
232
+
233
+ const int channels_out = weight.size(0);
234
+ const int channels_kernel = weight.size(1);
235
+ const int kernel_h_ = weight.size(2);
236
+ const int kernel_w_ = weight.size(3);
237
+
238
+ AT_ASSERTM(kernel_h_ == kernel_h && kernel_w_ == kernel_w,
239
+ "Input shape and kernel shape wont match: (%d x %d vs %d x %d).", kernel_h_, kernel_w, kernel_h_, kernel_w_);
240
+
241
+ AT_ASSERTM(channels == channels_kernel,
242
+ "Input shape and kernel channels wont match: (%d vs %d).", channels, channels_kernel);
243
+
244
+ const int height_out = (height + 2 * pad_h - (dilation_h * (kernel_h - 1) + 1)) / stride_h + 1;
245
+ const int width_out = (width + 2 * pad_w - (dilation_w * (kernel_w - 1) + 1)) / stride_w + 1;
246
+
247
+ auto ones = at::ones({height_out, width_out}, input.options());
248
+ auto columns = at::empty({channels * kernel_h * kernel_w, 1 * height_out * width_out}, input.options());
249
+ auto output = at::empty({batch, channels_out, height_out, width_out}, input.options());
250
+
251
+ auto grad_input = at::zeros_like(input);
252
+ auto grad_weight = at::zeros_like(weight);
253
+ auto grad_bias = at::zeros_like(bias);
254
+ auto grad_offset = at::zeros_like(offset);
255
+ auto grad_mask = at::zeros_like(mask);
256
+
257
+ using scalar_t = float;
258
+
259
+ for (int b = 0; b < batch; b++)
260
+ {
261
+ auto input_n = input.select(0, b);
262
+ auto offset_n = offset.select(0, b);
263
+ auto mask_n = mask.select(0, b);
264
+ auto grad_output_n = grad_output.select(0, b);
265
+ auto grad_input_n = grad_input.select(0, b);
266
+ auto grad_offset_n = grad_offset.select(0, b);
267
+ auto grad_mask_n = grad_mask.select(0, b);
268
+
269
+ long m = channels * kernel_h * kernel_w;
270
+ long n = height_out * width_out;
271
+ long k = channels_out;
272
+
273
+ THCudaBlas_Sgemm(state, 'n', 't', n, m, k, 1.0f,
274
+ grad_output_n.data<scalar_t>(), n,
275
+ weight.data<scalar_t>(), m, 0.0f,
276
+ columns.data<scalar_t>(), n);
277
+
278
+ // gradient w.r.t. input coordinate data
279
+ modulated_deformable_col2im_coord_cuda(THCState_getCurrentStream(state),
280
+ columns.data<scalar_t>(),
281
+ input_n.data<scalar_t>(),
282
+ offset_n.data<scalar_t>(),
283
+ mask_n.data<scalar_t>(),
284
+ 1, channels, height, width,
285
+ height_out, width_out, kernel_h, kernel_w,
286
+ pad_h, pad_w, stride_h, stride_w,
287
+ dilation_h, dilation_w, deformable_group,
288
+ grad_offset_n.data<scalar_t>(),
289
+ grad_mask_n.data<scalar_t>());
290
+ // gradient w.r.t. input data
291
+ modulated_deformable_col2im_cuda(THCState_getCurrentStream(state),
292
+ columns.data<scalar_t>(),
293
+ offset_n.data<scalar_t>(),
294
+ mask_n.data<scalar_t>(),
295
+ 1, channels, height, width,
296
+ height_out, width_out, kernel_h, kernel_w,
297
+ pad_h, pad_w, stride_h, stride_w,
298
+ dilation_h, dilation_w, deformable_group,
299
+ grad_input_n.data<scalar_t>());
300
+
301
+ // gradient w.r.t. weight, dWeight should accumulate across the batch and group
302
+ modulated_deformable_im2col_cuda(THCState_getCurrentStream(state),
303
+ input_n.data<scalar_t>(),
304
+ offset_n.data<scalar_t>(),
305
+ mask_n.data<scalar_t>(),
306
+ 1, channels, height, width,
307
+ height_out, width_out, kernel_h, kernel_w,
308
+ pad_h, pad_w, stride_h, stride_w,
309
+ dilation_h, dilation_w, deformable_group,
310
+ columns.data<scalar_t>());
311
+
312
+ long m_ = channels_out;
313
+ long n_ = channels * kernel_h * kernel_w;
314
+ long k_ = height_out * width_out;
315
+
316
+ THCudaBlas_Sgemm(state, 't', 'n', n_, m_, k_, 1.0f,
317
+ columns.data<scalar_t>(), k_,
318
+ grad_output_n.data<scalar_t>(), k_, 1.0f,
319
+ grad_weight.data<scalar_t>(), n_);
320
+
321
+ // gradient w.r.t. bias
322
+ // long m_ = channels_out;
323
+ // long k__ = height_out * width_out;
324
+ THCudaBlas_Sgemv(state,
325
+ 't',
326
+ k_, m_, 1.0f,
327
+ grad_output_n.data<scalar_t>(), k_,
328
+ ones.data<scalar_t>(), 1, 1.0f,
329
+ grad_bias.data<scalar_t>(), 1);
330
+ }
331
+
332
+ return {
333
+ grad_input, grad_offset, grad_mask, grad_weight, grad_bias
334
+ };
335
+ }
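Both the forward and backward paths above size their buffers with the standard dilated-convolution output formula before launching the batched GEMMs. A minimal sketch of just that computation, with a worked example (helper name is illustrative):

#include <cstdio>

// Output spatial size of a (possibly dilated) convolution, as used above for
// height_out / width_out in dcn_v2_cuda_forward and dcn_v2_cuda_backward.
int conv_out_size(int in, int pad, int dilation, int kernel, int stride) {
  return (in + 2 * pad - (dilation * (kernel - 1) + 1)) / stride + 1;
}

int main() {
  // a 64x64 input, 3x3 kernel, stride 1, pad 1, dilation 1 -> 64x64 output
  std::printf("height_out = %d\n", conv_out_size(64, 1, 1, 3, 1));
  // stride 2 halves the resolution -> 32
  std::printf("height_out = %d\n", conv_out_size(64, 1, 1, 3, 2));
}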
maskrcnn_benchmark/csrc/cuda/dcn_v2_im2col_cuda.cu ADDED
@@ -0,0 +1,402 @@
1
+ #include "dcn_v2_im2col_cuda.h"
2
+ #include <cstdio>
3
+ #include <algorithm>
4
+ #include <cstring>
5
+
6
+ #include <ATen/ATen.h>
7
+ #include <ATen/cuda/CUDAContext.h>
8
+
9
+ #include <THC/THC.h>
10
+ #include <THC/THCAtomics.cuh>
11
+ #include <THC/THCDeviceUtils.cuh>
12
+
13
+ #define CUDA_KERNEL_LOOP(i, n) \
14
+ for (int i = blockIdx.x * blockDim.x + threadIdx.x; \
15
+ i < (n); \
16
+ i += blockDim.x * gridDim.x)
17
+
18
+ const int CUDA_NUM_THREADS = 1024;
19
+ inline int GET_BLOCKS(const int N)
20
+ {
21
+ return (N + CUDA_NUM_THREADS - 1) / CUDA_NUM_THREADS;
22
+ }
23
+
24
+
25
+ __device__ float dmcn_im2col_bilinear(const float *bottom_data, const int data_width,
26
+ const int height, const int width, float h, float w)
27
+ {
28
+ int h_low = floor(h);
29
+ int w_low = floor(w);
30
+ int h_high = h_low + 1;
31
+ int w_high = w_low + 1;
32
+
33
+ float lh = h - h_low;
34
+ float lw = w - w_low;
35
+ float hh = 1 - lh, hw = 1 - lw;
36
+
37
+ float v1 = 0;
38
+ if (h_low >= 0 && w_low >= 0)
39
+ v1 = bottom_data[h_low * data_width + w_low];
40
+ float v2 = 0;
41
+ if (h_low >= 0 && w_high <= width - 1)
42
+ v2 = bottom_data[h_low * data_width + w_high];
43
+ float v3 = 0;
44
+ if (h_high <= height - 1 && w_low >= 0)
45
+ v3 = bottom_data[h_high * data_width + w_low];
46
+ float v4 = 0;
47
+ if (h_high <= height - 1 && w_high <= width - 1)
48
+ v4 = bottom_data[h_high * data_width + w_high];
49
+
50
+ float w1 = hh * hw, w2 = hh * lw, w3 = lh * hw, w4 = lh * lw;
51
+
52
+ float val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4);
53
+ return val;
54
+ }
55
+
56
+ __device__ float dmcn_get_gradient_weight(float argmax_h, float argmax_w,
57
+ const int h, const int w, const int height, const int width)
58
+ {
59
+ if (argmax_h <= -1 || argmax_h >= height || argmax_w <= -1 || argmax_w >= width)
60
+ {
61
+ //empty
62
+ return 0;
63
+ }
64
+
65
+ int argmax_h_low = floor(argmax_h);
66
+ int argmax_w_low = floor(argmax_w);
67
+ int argmax_h_high = argmax_h_low + 1;
68
+ int argmax_w_high = argmax_w_low + 1;
69
+
70
+ float weight = 0;
71
+ if (h == argmax_h_low && w == argmax_w_low)
72
+ weight = (h + 1 - argmax_h) * (w + 1 - argmax_w);
73
+ if (h == argmax_h_low && w == argmax_w_high)
74
+ weight = (h + 1 - argmax_h) * (argmax_w + 1 - w);
75
+ if (h == argmax_h_high && w == argmax_w_low)
76
+ weight = (argmax_h + 1 - h) * (w + 1 - argmax_w);
77
+ if (h == argmax_h_high && w == argmax_w_high)
78
+ weight = (argmax_h + 1 - h) * (argmax_w + 1 - w);
79
+ return weight;
80
+ }
81
+
82
+ __device__ float dmcn_get_coordinate_weight(float argmax_h, float argmax_w,
83
+ const int height, const int width, const float *im_data,
84
+ const int data_width, const int bp_dir)
85
+ {
86
+ if (argmax_h <= -1 || argmax_h >= height || argmax_w <= -1 || argmax_w >= width)
87
+ {
88
+ //empty
89
+ return 0;
90
+ }
91
+
92
+ int argmax_h_low = floor(argmax_h);
93
+ int argmax_w_low = floor(argmax_w);
94
+ int argmax_h_high = argmax_h_low + 1;
95
+ int argmax_w_high = argmax_w_low + 1;
96
+
97
+ float weight = 0;
98
+
99
+ if (bp_dir == 0)
100
+ {
101
+ if (argmax_h_low >= 0 && argmax_w_low >= 0)
102
+ weight += -1 * (argmax_w_low + 1 - argmax_w) * im_data[argmax_h_low * data_width + argmax_w_low];
103
+ if (argmax_h_low >= 0 && argmax_w_high <= width - 1)
104
+ weight += -1 * (argmax_w - argmax_w_low) * im_data[argmax_h_low * data_width + argmax_w_high];
105
+ if (argmax_h_high <= height - 1 && argmax_w_low >= 0)
106
+ weight += (argmax_w_low + 1 - argmax_w) * im_data[argmax_h_high * data_width + argmax_w_low];
107
+ if (argmax_h_high <= height - 1 && argmax_w_high <= width - 1)
108
+ weight += (argmax_w - argmax_w_low) * im_data[argmax_h_high * data_width + argmax_w_high];
109
+ }
110
+ else if (bp_dir == 1)
111
+ {
112
+ if (argmax_h_low >= 0 && argmax_w_low >= 0)
113
+ weight += -1 * (argmax_h_low + 1 - argmax_h) * im_data[argmax_h_low * data_width + argmax_w_low];
114
+ if (argmax_h_low >= 0 && argmax_w_high <= width - 1)
115
+ weight += (argmax_h_low + 1 - argmax_h) * im_data[argmax_h_low * data_width + argmax_w_high];
116
+ if (argmax_h_high <= height - 1 && argmax_w_low >= 0)
117
+ weight += -1 * (argmax_h - argmax_h_low) * im_data[argmax_h_high * data_width + argmax_w_low];
118
+ if (argmax_h_high <= height - 1 && argmax_w_high <= width - 1)
119
+ weight += (argmax_h - argmax_h_low) * im_data[argmax_h_high * data_width + argmax_w_high];
120
+ }
121
+
122
+ return weight;
123
+ }
124
+
125
+ __global__ void modulated_deformable_im2col_gpu_kernel(const int n,
126
+ const float *data_im, const float *data_offset, const float *data_mask,
127
+ const int height, const int width, const int kernel_h, const int kernel_w,
128
+ const int pad_h, const int pad_w,
129
+ const int stride_h, const int stride_w,
130
+ const int dilation_h, const int dilation_w,
131
+ const int channel_per_deformable_group,
132
+ const int batch_size, const int num_channels, const int deformable_group,
133
+ const int height_col, const int width_col,
134
+ float *data_col)
135
+ {
136
+ // launch channels * batch_size * height_col * width_col cores
137
+ CUDA_KERNEL_LOOP(index, n)
138
+ {
139
+ // NOTE(CharlesShang): different from Dai Jifeng's MXNet implementation, col_buffer is of shape (c*kw*kh, N, oh, ow)
140
+ // here columns is of shape (N, c*kw*kh, oh * ow), need to adapt axis
141
+
142
+ // index index of output matrix
143
+ const int w_col = index % width_col;
144
+ const int h_col = (index / width_col) % height_col;
145
+ // const int b_col = (index / width_col / height_col) % batch_size;
146
+ const int b_col = (index / width_col / height_col / num_channels) % batch_size;
147
+ // const int c_im = (index / width_col / height_col) / batch_size;
148
+ const int c_im = (index / width_col / height_col) % num_channels;
149
+ // const int c_col = c_im * kernel_h * kernel_w;
150
+ const int c_col = c_im * kernel_h * kernel_w;
151
+
152
+ // compute deformable group index
153
+ const int deformable_group_index = c_im / channel_per_deformable_group;
154
+
155
+ const int h_in = h_col * stride_h - pad_h;
156
+ const int w_in = w_col * stride_w - pad_w;
157
+
158
+ // float *data_col_ptr = data_col + ((c_col * batch_size + b_col) * height_col + h_col) * width_col + w_col;
159
+ float *data_col_ptr = data_col + ((b_col * num_channels * kernel_w * kernel_h + c_col) * height_col + h_col) * width_col + w_col;
160
+ //const float* data_im_ptr = data_im + ((b_col * num_channels + c_im) * height + h_in) * width + w_in;
161
+ const float *data_im_ptr = data_im + (b_col * num_channels + c_im) * height * width;
162
+ const float *data_offset_ptr = data_offset + (b_col * deformable_group + deformable_group_index) * 2 * kernel_h * kernel_w * height_col * width_col;
163
+
164
+ const float *data_mask_ptr = data_mask + (b_col * deformable_group + deformable_group_index) * kernel_h * kernel_w * height_col * width_col;
165
+
166
+ for (int i = 0; i < kernel_h; ++i)
167
+ {
168
+ for (int j = 0; j < kernel_w; ++j)
169
+ {
170
+ const int data_offset_h_ptr = ((2 * (i * kernel_w + j)) * height_col + h_col) * width_col + w_col;
171
+ const int data_offset_w_ptr = ((2 * (i * kernel_w + j) + 1) * height_col + h_col) * width_col + w_col;
172
+ const int data_mask_hw_ptr = ((i * kernel_w + j) * height_col + h_col) * width_col + w_col;
173
+ const float offset_h = data_offset_ptr[data_offset_h_ptr];
174
+ const float offset_w = data_offset_ptr[data_offset_w_ptr];
175
+ const float mask = data_mask_ptr[data_mask_hw_ptr];
176
+ float val = static_cast<float>(0);
177
+ const float h_im = h_in + i * dilation_h + offset_h;
178
+ const float w_im = w_in + j * dilation_w + offset_w;
179
+ //if (h_im >= 0 && w_im >= 0 && h_im < height && w_im < width) {
180
+ if (h_im > -1 && w_im > -1 && h_im < height && w_im < width)
181
+ {
182
+ //const float map_h = i * dilation_h + offset_h;
183
+ //const float map_w = j * dilation_w + offset_w;
184
+ //const int cur_height = height - h_in;
185
+ //const int cur_width = width - w_in;
186
+ //val = dmcn_im2col_bilinear(data_im_ptr, width, cur_height, cur_width, map_h, map_w);
187
+ val = dmcn_im2col_bilinear(data_im_ptr, width, height, width, h_im, w_im);
188
+ }
189
+ *data_col_ptr = val * mask;
190
+ // data_col_ptr += batch_size * height_col * width_col;
191
+ data_col_ptr += height_col * width_col;
192
+ }
193
+ }
194
+ }
195
+ }
196
+
197
+ __global__ void modulated_deformable_col2im_gpu_kernel(const int n,
198
+ const float *data_col, const float *data_offset, const float *data_mask,
199
+ const int channels, const int height, const int width,
200
+ const int kernel_h, const int kernel_w,
201
+ const int pad_h, const int pad_w,
202
+ const int stride_h, const int stride_w,
203
+ const int dilation_h, const int dilation_w,
204
+ const int channel_per_deformable_group,
205
+ const int batch_size, const int deformable_group,
206
+ const int height_col, const int width_col,
207
+ float *grad_im)
208
+ {
209
+ CUDA_KERNEL_LOOP(index, n)
210
+ {
211
+ const int j = (index / width_col / height_col / batch_size) % kernel_w;
212
+ const int i = (index / width_col / height_col / batch_size / kernel_w) % kernel_h;
213
+ const int c = index / width_col / height_col / batch_size / kernel_w / kernel_h;
214
+ // compute the start and end of the output
215
+
216
+ const int deformable_group_index = c / channel_per_deformable_group;
217
+
218
+ int w_out = index % width_col;
219
+ int h_out = (index / width_col) % height_col;
220
+ int b = (index / width_col / height_col) % batch_size;
221
+ int w_in = w_out * stride_w - pad_w;
222
+ int h_in = h_out * stride_h - pad_h;
223
+
224
+ const float *data_offset_ptr = data_offset + (b * deformable_group + deformable_group_index) * 2 * kernel_h * kernel_w * height_col * width_col;
225
+ const float *data_mask_ptr = data_mask + (b * deformable_group + deformable_group_index) * kernel_h * kernel_w * height_col * width_col;
226
+ const int data_offset_h_ptr = ((2 * (i * kernel_w + j)) * height_col + h_out) * width_col + w_out;
227
+ const int data_offset_w_ptr = ((2 * (i * kernel_w + j) + 1) * height_col + h_out) * width_col + w_out;
228
+ const int data_mask_hw_ptr = ((i * kernel_w + j) * height_col + h_out) * width_col + w_out;
229
+ const float offset_h = data_offset_ptr[data_offset_h_ptr];
230
+ const float offset_w = data_offset_ptr[data_offset_w_ptr];
231
+ const float mask = data_mask_ptr[data_mask_hw_ptr];
232
+ const float cur_inv_h_data = h_in + i * dilation_h + offset_h;
233
+ const float cur_inv_w_data = w_in + j * dilation_w + offset_w;
234
+
235
+ const float cur_top_grad = data_col[index] * mask;
236
+ const int cur_h = (int)cur_inv_h_data;
237
+ const int cur_w = (int)cur_inv_w_data;
238
+ for (int dy = -2; dy <= 2; dy++)
239
+ {
240
+ for (int dx = -2; dx <= 2; dx++)
241
+ {
242
+ if (cur_h + dy >= 0 && cur_h + dy < height &&
243
+ cur_w + dx >= 0 && cur_w + dx < width &&
244
+ abs(cur_inv_h_data - (cur_h + dy)) < 1 &&
245
+ abs(cur_inv_w_data - (cur_w + dx)) < 1)
246
+ {
247
+ int cur_bottom_grad_pos = ((b * channels + c) * height + cur_h + dy) * width + cur_w + dx;
248
+ float weight = dmcn_get_gradient_weight(cur_inv_h_data, cur_inv_w_data, cur_h + dy, cur_w + dx, height, width);
249
+ atomicAdd(grad_im + cur_bottom_grad_pos, weight * cur_top_grad);
250
+ }
251
+ }
252
+ }
253
+ }
254
+ }
255
+
256
+ __global__ void modulated_deformable_col2im_coord_gpu_kernel(const int n,
257
+ const float *data_col, const float *data_im,
258
+ const float *data_offset, const float *data_mask,
259
+ const int channels, const int height, const int width,
260
+ const int kernel_h, const int kernel_w,
261
+ const int pad_h, const int pad_w,
262
+ const int stride_h, const int stride_w,
263
+ const int dilation_h, const int dilation_w,
264
+ const int channel_per_deformable_group,
265
+ const int batch_size, const int offset_channels, const int deformable_group,
266
+ const int height_col, const int width_col,
267
+ float *grad_offset, float *grad_mask)
268
+ {
269
+ CUDA_KERNEL_LOOP(index, n)
270
+ {
271
+ float val = 0, mval = 0;
272
+ int w = index % width_col;
273
+ int h = (index / width_col) % height_col;
274
+ int c = (index / width_col / height_col) % offset_channels;
275
+ int b = (index / width_col / height_col) / offset_channels;
276
+ // compute the start and end of the output
277
+
278
+ const int deformable_group_index = c / (2 * kernel_h * kernel_w);
279
+ const int col_step = kernel_h * kernel_w;
280
+ int cnt = 0;
281
+ const float *data_col_ptr = data_col + deformable_group_index * channel_per_deformable_group * batch_size * width_col * height_col;
282
+ const float *data_im_ptr = data_im + (b * deformable_group + deformable_group_index) * channel_per_deformable_group / kernel_h / kernel_w * height * width;
283
+ const float *data_offset_ptr = data_offset + (b * deformable_group + deformable_group_index) * 2 * kernel_h * kernel_w * height_col * width_col;
284
+ const float *data_mask_ptr = data_mask + (b * deformable_group + deformable_group_index) * kernel_h * kernel_w * height_col * width_col;
285
+
286
+ const int offset_c = c - deformable_group_index * 2 * kernel_h * kernel_w;
287
+
288
+ for (int col_c = (offset_c / 2); col_c < channel_per_deformable_group; col_c += col_step)
289
+ {
290
+ const int col_pos = (((col_c * batch_size + b) * height_col) + h) * width_col + w;
291
+ const int bp_dir = offset_c % 2;
292
+
293
+ int j = (col_pos / width_col / height_col / batch_size) % kernel_w;
294
+ int i = (col_pos / width_col / height_col / batch_size / kernel_w) % kernel_h;
295
+ int w_out = col_pos % width_col;
296
+ int h_out = (col_pos / width_col) % height_col;
297
+ int w_in = w_out * stride_w - pad_w;
298
+ int h_in = h_out * stride_h - pad_h;
299
+ const int data_offset_h_ptr = (((2 * (i * kernel_w + j)) * height_col + h_out) * width_col + w_out);
300
+ const int data_offset_w_ptr = (((2 * (i * kernel_w + j) + 1) * height_col + h_out) * width_col + w_out);
301
+ const int data_mask_hw_ptr = (((i * kernel_w + j) * height_col + h_out) * width_col + w_out);
302
+ const float offset_h = data_offset_ptr[data_offset_h_ptr];
303
+ const float offset_w = data_offset_ptr[data_offset_w_ptr];
304
+ const float mask = data_mask_ptr[data_mask_hw_ptr];
305
+ float inv_h = h_in + i * dilation_h + offset_h;
306
+ float inv_w = w_in + j * dilation_w + offset_w;
307
+ if (inv_h <= -1 || inv_w <= -1 || inv_h >= height || inv_w >= width)
308
+ {
309
+ inv_h = inv_w = -2;
310
+ }
311
+ else
312
+ {
313
+ mval += data_col_ptr[col_pos] * dmcn_im2col_bilinear(data_im_ptr + cnt * height * width, width, height, width, inv_h, inv_w);
314
+ }
315
+ const float weight = dmcn_get_coordinate_weight(
316
+ inv_h, inv_w,
317
+ height, width, data_im_ptr + cnt * height * width, width, bp_dir);
318
+ val += weight * data_col_ptr[col_pos] * mask;
319
+ cnt += 1;
320
+ }
321
+ // KERNEL_ASSIGN(grad_offset[index], offset_req, val);
322
+ grad_offset[index] = val;
323
+ if (offset_c % 2 == 0)
324
+ // KERNEL_ASSIGN(grad_mask[(((b * deformable_group + deformable_group_index) * kernel_h * kernel_w + offset_c / 2) * height_col + h) * width_col + w], mask_req, mval);
325
+ grad_mask[(((b * deformable_group + deformable_group_index) * kernel_h * kernel_w + offset_c / 2) * height_col + h) * width_col + w] = mval;
326
+ }
327
+ }
328
+
329
+ void modulated_deformable_im2col_cuda(cudaStream_t stream,
330
+ const float* data_im, const float* data_offset, const float* data_mask,
331
+ const int batch_size, const int channels, const int height_im, const int width_im,
332
+ const int height_col, const int width_col, const int kernel_h, const int kernel_w,
333
+ const int pad_h, const int pad_w, const int stride_h, const int stride_w,
334
+ const int dilation_h, const int dilation_w,
335
+ const int deformable_group, float* data_col) {
336
+ // num_axes should be smaller than block size
337
+ const int channel_per_deformable_group = channels / deformable_group;
338
+ const int num_kernels = channels * batch_size * height_col * width_col;
339
+ modulated_deformable_im2col_gpu_kernel
340
+ <<<GET_BLOCKS(num_kernels), CUDA_NUM_THREADS,
341
+ 0, stream>>>(
342
+ num_kernels, data_im, data_offset, data_mask, height_im, width_im, kernel_h, kernel_w,
343
+ pad_h, pad_w, stride_h, stride_w, dilation_h, dilation_w, channel_per_deformable_group,
344
+ batch_size, channels, deformable_group, height_col, width_col, data_col);
345
+
346
+ cudaError_t err = cudaGetLastError();
347
+ if (err != cudaSuccess)
348
+ {
349
+ printf("error in modulated_deformable_im2col_cuda: %s\n", cudaGetErrorString(err));
350
+ }
351
+
352
+ }
353
+
354
+ void modulated_deformable_col2im_cuda(cudaStream_t stream,
355
+ const float* data_col, const float* data_offset, const float* data_mask,
356
+ const int batch_size, const int channels, const int height_im, const int width_im,
357
+ const int height_col, const int width_col, const int kernel_h, const int kernel_w,
358
+ const int pad_h, const int pad_w, const int stride_h, const int stride_w,
359
+ const int dilation_h, const int dilation_w,
360
+ const int deformable_group, float* grad_im){
361
+
362
+ const int channel_per_deformable_group = channels / deformable_group;
363
+ const int num_kernels = channels * kernel_h * kernel_w * batch_size * height_col * width_col;
364
+ modulated_deformable_col2im_gpu_kernel
365
+ <<<GET_BLOCKS(num_kernels), CUDA_NUM_THREADS,
366
+ 0, stream>>>(
367
+ num_kernels, data_col, data_offset, data_mask, channels, height_im, width_im,
368
+ kernel_h, kernel_w, pad_h, pad_w, stride_h, stride_w,
369
+ dilation_h, dilation_w, channel_per_deformable_group,
370
+ batch_size, deformable_group, height_col, width_col, grad_im);
371
+ cudaError_t err = cudaGetLastError();
372
+ if (err != cudaSuccess)
373
+ {
374
+ printf("error in modulated_deformable_col2im_cuda: %s\n", cudaGetErrorString(err));
375
+ }
376
+
377
+ }
378
+
379
+ void modulated_deformable_col2im_coord_cuda(cudaStream_t stream,
380
+ const float* data_col, const float* data_im, const float* data_offset, const float* data_mask,
381
+ const int batch_size, const int channels, const int height_im, const int width_im,
382
+ const int height_col, const int width_col, const int kernel_h, const int kernel_w,
383
+ const int pad_h, const int pad_w, const int stride_h, const int stride_w,
384
+ const int dilation_h, const int dilation_w,
385
+ const int deformable_group,
386
+ float* grad_offset, float* grad_mask) {
387
+ const int num_kernels = batch_size * height_col * width_col * 2 * kernel_h * kernel_w * deformable_group;
388
+ const int channel_per_deformable_group = channels * kernel_h * kernel_w / deformable_group;
389
+ modulated_deformable_col2im_coord_gpu_kernel
390
+ <<<GET_BLOCKS(num_kernels), CUDA_NUM_THREADS,
391
+ 0, stream>>>(
392
+ num_kernels, data_col, data_im, data_offset, data_mask, channels, height_im, width_im,
393
+ kernel_h, kernel_w, pad_h, pad_w, stride_h, stride_w,
394
+ dilation_h, dilation_w, channel_per_deformable_group,
395
+ batch_size, 2 * kernel_h * kernel_w * deformable_group, deformable_group, height_col, width_col,
396
+ grad_offset, grad_mask);
397
+ cudaError_t err = cudaGetLastError();
398
+ if (err != cudaSuccess)
399
+ {
400
+ printf("error in modulated_deformable_col2im_coord_cuda: %s\n", cudaGetErrorString(err));
401
+ }
402
+ }
maskrcnn_benchmark/csrc/cuda/dcn_v2_im2col_cuda.h ADDED
@@ -0,0 +1,101 @@
1
+
2
+ /*!
3
+ ******************* BEGIN Caffe Copyright Notice and Disclaimer ****************
4
+ *
5
+ * COPYRIGHT
6
+ *
7
+ * All contributions by the University of California:
8
+ * Copyright (c) 2014-2017 The Regents of the University of California (Regents)
9
+ * All rights reserved.
10
+ *
11
+ * All other contributions:
12
+ * Copyright (c) 2014-2017, the respective contributors
13
+ * All rights reserved.
14
+ *
15
+ * Caffe uses a shared copyright model: each contributor holds copyright over
16
+ * their contributions to Caffe. The project versioning records all such
17
+ * contribution and copyright details. If a contributor wants to further mark
18
+ * their specific copyright on a particular contribution, they should indicate
19
+ * their copyright solely in the commit message of the change when it is
20
+ * committed.
21
+ *
22
+ * LICENSE
23
+ *
24
+ * Redistribution and use in source and binary forms, with or without
25
+ * modification, are permitted provided that the following conditions are met:
26
+ *
27
+ * 1. Redistributions of source code must retain the above copyright notice, this
28
+ * list of conditions and the following disclaimer.
29
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
30
+ * this list of conditions and the following disclaimer in the documentation
31
+ * and/or other materials provided with the distribution.
32
+ *
33
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
34
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
35
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
36
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
37
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
38
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
39
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
40
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
41
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
42
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
43
+ *
44
+ * CONTRIBUTION AGREEMENT
45
+ *
46
+ * By contributing to the BVLC/caffe repository through pull-request, comment,
47
+ * or otherwise, the contributor releases their content to the
48
+ * license and copyright terms herein.
49
+ *
50
+ ***************** END Caffe Copyright Notice and Disclaimer ********************
51
+ *
52
+ * Copyright (c) 2018 Microsoft
53
+ * Licensed under The MIT License [see LICENSE for details]
54
+ * \file modulated_deformable_im2col.h
55
+ * \brief Function definitions of converting an image to
56
+ * column matrix based on kernel, padding, dilation, and offset.
57
+ * These functions are mainly used in deformable convolution operators.
58
+ * \ref: https://arxiv.org/abs/1811.11168
59
+ * \author Yuwen Xiong, Haozhi Qi, Jifeng Dai, Xizhou Zhu, Han Hu
60
+ */
61
+
62
+ /***************** Adapted by Charles Shang *********************/
63
+
64
+ #ifndef DCN_V2_IM2COL_CUDA
65
+ #define DCN_V2_IM2COL_CUDA
66
+
67
+ #ifdef __cplusplus
68
+ extern "C"
69
+ {
70
+ #endif
71
+
72
+ void modulated_deformable_im2col_cuda(cudaStream_t stream,
73
+ const float *data_im, const float *data_offset, const float *data_mask,
74
+ const int batch_size, const int channels, const int height_im, const int width_im,
75
+ const int height_col, const int width_col, const int kernel_h, const int kernel_w,
76
+ const int pad_h, const int pad_w, const int stride_h, const int stride_w,
77
+ const int dilation_h, const int dilation_w,
78
+ const int deformable_group, float *data_col);
79
+
80
+ void modulated_deformable_col2im_cuda(cudaStream_t stream,
81
+ const float *data_col, const float *data_offset, const float *data_mask,
82
+ const int batch_size, const int channels, const int height_im, const int width_im,
83
+ const int height_col, const int width_col, const int kernel_h, const int kernel_w,
84
+ const int pad_h, const int pad_w, const int stride_h, const int stride_w,
85
+ const int dilation_h, const int dilation_w,
86
+ const int deformable_group, float *grad_im);
87
+
88
+ void modulated_deformable_col2im_coord_cuda(cudaStream_t stream,
89
+ const float *data_col, const float *data_im, const float *data_offset, const float *data_mask,
90
+ const int batch_size, const int channels, const int height_im, const int width_im,
91
+ const int height_col, const int width_col, const int kernel_h, const int kernel_w,
92
+ const int pad_h, const int pad_w, const int stride_h, const int stride_w,
93
+ const int dilation_h, const int dilation_w,
94
+ const int deformable_group,
95
+ float *grad_offset, float *grad_mask);
96
+
97
+ #ifdef __cplusplus
98
+ }
99
+ #endif
100
+
101
+ #endif
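The three launchers declared here implement the DCNv2 im2col/col2im machinery that the higher-level `dcn_v2_*` wrappers call. From the kernel indexing above, the offset tensor carries `2 * kernel_h * kernel_w * deformable_group` channels (a dy/dx pair per kernel tap per group) and the mask tensor carries `kernel_h * kernel_w * deformable_group` channels. A minimal shape sketch in Python, assuming the extension is eventually built as `maskrcnn_benchmark._C` (upstream naming); the call itself is commented out because it needs the compiled CUDA module:

```python
import torch

# Hedged sketch: tensor shapes the DCNv2 routines above expect.
# Assumptions: extension built as maskrcnn_benchmark._C; all sizes are illustrative.
N, C_in, H, W = 2, 64, 56, 56
kh, kw, dg = 3, 3, 1        # kernel size and deformable groups
C_out = 64

x = torch.randn(N, C_in, H, W)
weight = torch.randn(C_out, C_in, kh, kw)
bias = torch.zeros(C_out)
offset = torch.zeros(N, 2 * kh * kw * dg, H, W)  # (dy, dx) per kernel tap per group
mask = torch.ones(N, kh * kw * dg, H, W)         # modulation scalar per kernel tap per group

# from maskrcnn_benchmark import _C   # assumed name of the built extension
# out = _C.dcn_v2_forward(x.cuda(), weight.cuda(), bias.cuda(), offset.cuda(), mask.cuda(),
#                         kh, kw,    # kernel_h, kernel_w
#                         1, 1,      # stride_h, stride_w
#                         1, 1,      # pad_h, pad_w
#                         1, 1,      # dilation_h, dilation_w
#                         dg)        # deformable_group
```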
maskrcnn_benchmark/csrc/cuda/dcn_v2_psroi_pooling_cuda.cu ADDED
@@ -0,0 +1,419 @@
1
+ /*!
2
+ * Copyright (c) 2017 Microsoft
3
+ * Licensed under The MIT License [see LICENSE for details]
4
+ * \file deformable_psroi_pooling.cu
5
+ * \brief
6
+ * \author Yi Li, Guodong Zhang, Jifeng Dai
7
+ */
8
+ /***************** Adapted by Charles Shang *********************/
9
+
10
+ #include <cstdio>
11
+ #include <algorithm>
12
+ #include <cstring>
13
+ #include <iostream>
14
+
15
+ #include <ATen/ATen.h>
16
+ #include <ATen/cuda/CUDAContext.h>
17
+
18
+ #include <THC/THC.h>
19
+ #include <THC/THCAtomics.cuh>
20
+ #include <THC/THCDeviceUtils.cuh>
21
+
22
+ #define CUDA_KERNEL_LOOP(i, n) \
23
+ for (int i = blockIdx.x * blockDim.x + threadIdx.x; \
24
+ i < (n); \
25
+ i += blockDim.x * gridDim.x)
26
+
27
+ const int CUDA_NUM_THREADS = 1024;
28
+ inline int GET_BLOCKS(const int N)
29
+ {
30
+ return (N + CUDA_NUM_THREADS - 1) / CUDA_NUM_THREADS;
31
+ }
32
+
33
+ template <typename T>
34
+ __device__ T bilinear_interp(
35
+ const T *data,
36
+ const T x,
37
+ const T y,
38
+ const int width,
39
+ const int height)
40
+ {
41
+ int x1 = floor(x);
42
+ int x2 = ceil(x);
43
+ int y1 = floor(y);
44
+ int y2 = ceil(y);
45
+ T dist_x = static_cast<T>(x - x1);
46
+ T dist_y = static_cast<T>(y - y1);
47
+ T value11 = data[y1 * width + x1];
48
+ T value12 = data[y2 * width + x1];
49
+ T value21 = data[y1 * width + x2];
50
+ T value22 = data[y2 * width + x2];
51
+ T value = (1 - dist_x) * (1 - dist_y) * value11 +
52
+ (1 - dist_x) * dist_y * value12 +
53
+ dist_x * (1 - dist_y) * value21 +
54
+ dist_x * dist_y * value22;
55
+ return value;
56
+ }
57
+
58
+ template <typename T>
59
+ __global__ void DeformablePSROIPoolForwardKernel(
60
+ const int count,
61
+ const T *bottom_data,
62
+ const T spatial_scale,
63
+ const int channels,
64
+ const int height, const int width,
65
+ const int pooled_height, const int pooled_width,
66
+ const T *bottom_rois, const T *bottom_trans,
67
+ const int no_trans,
68
+ const T trans_std,
69
+ const int sample_per_part,
70
+ const int output_dim,
71
+ const int group_size,
72
+ const int part_size,
73
+ const int num_classes,
74
+ const int channels_each_class,
75
+ T *top_data,
76
+ T *top_count)
77
+ {
78
+ CUDA_KERNEL_LOOP(index, count)
79
+ {
80
+ // The output is in order (n, ctop, ph, pw)
81
+ int pw = index % pooled_width;
82
+ int ph = (index / pooled_width) % pooled_height;
83
+ int ctop = (index / pooled_width / pooled_height) % output_dim;
84
+ int n = index / pooled_width / pooled_height / output_dim;
85
+
86
+ // [start, end) interval for spatial sampling
87
+ const T *offset_bottom_rois = bottom_rois + n * 5;
88
+ int roi_batch_ind = offset_bottom_rois[0];
89
+ T roi_start_w = static_cast<T>(round(offset_bottom_rois[1])) * spatial_scale - 0.5;
90
+ T roi_start_h = static_cast<T>(round(offset_bottom_rois[2])) * spatial_scale - 0.5;
91
+ T roi_end_w = static_cast<T>(round(offset_bottom_rois[3]) + 1.) * spatial_scale - 0.5;
92
+ T roi_end_h = static_cast<T>(round(offset_bottom_rois[4]) + 1.) * spatial_scale - 0.5;
93
+
94
+ // Force too small ROIs to be 1x1
95
+ T roi_width = max(roi_end_w - roi_start_w, 0.1); //avoid 0
96
+ T roi_height = max(roi_end_h - roi_start_h, 0.1);
97
+
98
+ // Compute w and h at bottom
99
+ T bin_size_h = roi_height / static_cast<T>(pooled_height);
100
+ T bin_size_w = roi_width / static_cast<T>(pooled_width);
101
+
102
+ T sub_bin_size_h = bin_size_h / static_cast<T>(sample_per_part);
103
+ T sub_bin_size_w = bin_size_w / static_cast<T>(sample_per_part);
104
+
105
+ int part_h = floor(static_cast<T>(ph) / pooled_height * part_size);
106
+ int part_w = floor(static_cast<T>(pw) / pooled_width * part_size);
107
+ int class_id = ctop / channels_each_class;
108
+ T trans_x = no_trans ? static_cast<T>(0) : bottom_trans[(((n * num_classes + class_id) * 2) * part_size + part_h) * part_size + part_w] * trans_std;
109
+ T trans_y = no_trans ? static_cast<T>(0) : bottom_trans[(((n * num_classes + class_id) * 2 + 1) * part_size + part_h) * part_size + part_w] * trans_std;
110
+
111
+ T wstart = static_cast<T>(pw) * bin_size_w + roi_start_w;
112
+ wstart += trans_x * roi_width;
113
+ T hstart = static_cast<T>(ph) * bin_size_h + roi_start_h;
114
+ hstart += trans_y * roi_height;
115
+
116
+ T sum = 0;
117
+ int count = 0;
118
+ int gw = floor(static_cast<T>(pw) * group_size / pooled_width);
119
+ int gh = floor(static_cast<T>(ph) * group_size / pooled_height);
120
+ gw = min(max(gw, 0), group_size - 1);
121
+ gh = min(max(gh, 0), group_size - 1);
122
+
123
+ const T *offset_bottom_data = bottom_data + (roi_batch_ind * channels) * height * width;
124
+ for (int ih = 0; ih < sample_per_part; ih++)
125
+ {
126
+ for (int iw = 0; iw < sample_per_part; iw++)
127
+ {
128
+ T w = wstart + iw * sub_bin_size_w;
129
+ T h = hstart + ih * sub_bin_size_h;
130
+ // bilinear interpolation
131
+ if (w < -0.5 || w > width - 0.5 || h < -0.5 || h > height - 0.5)
132
+ {
133
+ continue;
134
+ }
135
+ w = min(max(w, 0.), width - 1.);
136
+ h = min(max(h, 0.), height - 1.);
137
+ int c = (ctop * group_size + gh) * group_size + gw;
138
+ T val = bilinear_interp(offset_bottom_data + c * height * width, w, h, width, height);
139
+ sum += val;
140
+ count++;
141
+ }
142
+ }
143
+ top_data[index] = count == 0 ? static_cast<T>(0) : sum / count;
144
+ top_count[index] = count;
145
+ }
146
+ }
147
+
148
+ template <typename T>
149
+ __global__ void DeformablePSROIPoolBackwardAccKernel(
150
+ const int count,
151
+ const T *top_diff,
152
+ const T *top_count,
153
+ const int num_rois,
154
+ const T spatial_scale,
155
+ const int channels,
156
+ const int height, const int width,
157
+ const int pooled_height, const int pooled_width,
158
+ const int output_dim,
159
+ T *bottom_data_diff, T *bottom_trans_diff,
160
+ const T *bottom_data,
161
+ const T *bottom_rois,
162
+ const T *bottom_trans,
163
+ const int no_trans,
164
+ const T trans_std,
165
+ const int sample_per_part,
166
+ const int group_size,
167
+ const int part_size,
168
+ const int num_classes,
169
+ const int channels_each_class)
170
+ {
171
+ CUDA_KERNEL_LOOP(index, count)
172
+ {
173
+ // The output is in order (n, ctop, ph, pw)
174
+ int pw = index % pooled_width;
175
+ int ph = (index / pooled_width) % pooled_height;
176
+ int ctop = (index / pooled_width / pooled_height) % output_dim;
177
+ int n = index / pooled_width / pooled_height / output_dim;
178
+
179
+ // [start, end) interval for spatial sampling
180
+ const T *offset_bottom_rois = bottom_rois + n * 5;
181
+ int roi_batch_ind = offset_bottom_rois[0];
182
+ T roi_start_w = static_cast<T>(round(offset_bottom_rois[1])) * spatial_scale - 0.5;
183
+ T roi_start_h = static_cast<T>(round(offset_bottom_rois[2])) * spatial_scale - 0.5;
184
+ T roi_end_w = static_cast<T>(round(offset_bottom_rois[3]) + 1.) * spatial_scale - 0.5;
185
+ T roi_end_h = static_cast<T>(round(offset_bottom_rois[4]) + 1.) * spatial_scale - 0.5;
186
+
187
+ // Force too small ROIs to be 1x1
188
+ T roi_width = max(roi_end_w - roi_start_w, 0.1); //avoid 0
189
+ T roi_height = max(roi_end_h - roi_start_h, 0.1);
190
+
191
+ // Compute w and h at bottom
192
+ T bin_size_h = roi_height / static_cast<T>(pooled_height);
193
+ T bin_size_w = roi_width / static_cast<T>(pooled_width);
194
+
195
+ T sub_bin_size_h = bin_size_h / static_cast<T>(sample_per_part);
196
+ T sub_bin_size_w = bin_size_w / static_cast<T>(sample_per_part);
197
+
198
+ int part_h = floor(static_cast<T>(ph) / pooled_height * part_size);
199
+ int part_w = floor(static_cast<T>(pw) / pooled_width * part_size);
200
+ int class_id = ctop / channels_each_class;
201
+ T trans_x = no_trans ? static_cast<T>(0) : bottom_trans[(((n * num_classes + class_id) * 2) * part_size + part_h) * part_size + part_w] * trans_std;
202
+ T trans_y = no_trans ? static_cast<T>(0) : bottom_trans[(((n * num_classes + class_id) * 2 + 1) * part_size + part_h) * part_size + part_w] * trans_std;
203
+
204
+ T wstart = static_cast<T>(pw) * bin_size_w + roi_start_w;
205
+ wstart += trans_x * roi_width;
206
+ T hstart = static_cast<T>(ph) * bin_size_h + roi_start_h;
207
+ hstart += trans_y * roi_height;
208
+
209
+ if (top_count[index] <= 0)
210
+ {
211
+ continue;
212
+ }
213
+ T diff_val = top_diff[index] / top_count[index];
214
+ const T *offset_bottom_data = bottom_data + roi_batch_ind * channels * height * width;
215
+ T *offset_bottom_data_diff = bottom_data_diff + roi_batch_ind * channels * height * width;
216
+ int gw = floor(static_cast<T>(pw) * group_size / pooled_width);
217
+ int gh = floor(static_cast<T>(ph) * group_size / pooled_height);
218
+ gw = min(max(gw, 0), group_size - 1);
219
+ gh = min(max(gh, 0), group_size - 1);
220
+
221
+ for (int ih = 0; ih < sample_per_part; ih++)
222
+ {
223
+ for (int iw = 0; iw < sample_per_part; iw++)
224
+ {
225
+ T w = wstart + iw * sub_bin_size_w;
226
+ T h = hstart + ih * sub_bin_size_h;
227
+ // bilinear interpolation
228
+ if (w < -0.5 || w > width - 0.5 || h < -0.5 || h > height - 0.5)
229
+ {
230
+ continue;
231
+ }
232
+ w = min(max(w, 0.), width - 1.);
233
+ h = min(max(h, 0.), height - 1.);
234
+ int c = (ctop * group_size + gh) * group_size + gw;
235
+ // backward on feature
236
+ int x0 = floor(w);
237
+ int x1 = ceil(w);
238
+ int y0 = floor(h);
239
+ int y1 = ceil(h);
240
+ T dist_x = w - x0, dist_y = h - y0;
241
+ T q00 = (1 - dist_x) * (1 - dist_y);
242
+ T q01 = (1 - dist_x) * dist_y;
243
+ T q10 = dist_x * (1 - dist_y);
244
+ T q11 = dist_x * dist_y;
245
+ int bottom_index_base = c * height * width;
246
+ atomicAdd(offset_bottom_data_diff + bottom_index_base + y0 * width + x0, q00 * diff_val);
247
+ atomicAdd(offset_bottom_data_diff + bottom_index_base + y1 * width + x0, q01 * diff_val);
248
+ atomicAdd(offset_bottom_data_diff + bottom_index_base + y0 * width + x1, q10 * diff_val);
249
+ atomicAdd(offset_bottom_data_diff + bottom_index_base + y1 * width + x1, q11 * diff_val);
250
+
251
+ if (no_trans)
252
+ {
253
+ continue;
254
+ }
255
+ T U00 = offset_bottom_data[bottom_index_base + y0 * width + x0];
256
+ T U01 = offset_bottom_data[bottom_index_base + y1 * width + x0];
257
+ T U10 = offset_bottom_data[bottom_index_base + y0 * width + x1];
258
+ T U11 = offset_bottom_data[bottom_index_base + y1 * width + x1];
259
+ T diff_x = (U11 * dist_y + U10 * (1 - dist_y) - U01 * dist_y - U00 * (1 - dist_y)) * trans_std * diff_val;
260
+ diff_x *= roi_width;
261
+ T diff_y = (U11 * dist_x + U01 * (1 - dist_x) - U10 * dist_x - U00 * (1 - dist_x)) * trans_std * diff_val;
262
+ diff_y *= roi_height;
263
+
264
+ atomicAdd(bottom_trans_diff + (((n * num_classes + class_id) * 2) * part_size + part_h) * part_size + part_w, diff_x);
265
+ atomicAdd(bottom_trans_diff + (((n * num_classes + class_id) * 2 + 1) * part_size + part_h) * part_size + part_w, diff_y);
266
+ }
267
+ }
268
+ }
269
+ }
270
+
271
+ std::tuple<at::Tensor, at::Tensor>
272
+ dcn_v2_psroi_pooling_cuda_forward(const at::Tensor &input,
273
+ const at::Tensor &bbox,
274
+ const at::Tensor &trans,
275
+ const int no_trans,
276
+ const float spatial_scale,
277
+ const int output_dim,
278
+ const int group_size,
279
+ const int pooled_size,
280
+ const int part_size,
281
+ const int sample_per_part,
282
+ const float trans_std)
283
+ {
284
+ AT_ASSERTM(input.type().is_cuda(), "input must be a CUDA tensor");
285
+ AT_ASSERTM(bbox.type().is_cuda(), "rois must be a CUDA tensor");
286
+ AT_ASSERTM(trans.type().is_cuda(), "trans must be a CUDA tensor");
287
+
288
+ const int batch = input.size(0);
289
+ const int channels = input.size(1);
290
+ const int height = input.size(2);
291
+ const int width = input.size(3);
292
+ const int channels_trans = no_trans ? 2 : trans.size(1);
293
+ const int num_bbox = bbox.size(0);
294
+
295
+ AT_ASSERTM(channels == output_dim, "input channels and output channels must equal");
296
+ auto pooled_height = pooled_size;
297
+ auto pooled_width = pooled_size;
298
+
299
+ auto out = at::empty({num_bbox, output_dim, pooled_height, pooled_width}, input.options());
300
+ long out_size = num_bbox * output_dim * pooled_height * pooled_width;
301
+ auto top_count = at::zeros({num_bbox, output_dim, pooled_height, pooled_width}, input.options());
302
+
303
+ const int num_classes = no_trans ? 1 : channels_trans / 2;
304
+ const int channels_each_class = no_trans ? output_dim : output_dim / num_classes;
305
+
306
+ cudaStream_t stream = at::cuda::getCurrentCUDAStream();
307
+
308
+ if (out.numel() == 0)
309
+ {
310
+ THCudaCheck(cudaGetLastError());
311
+ return std::make_tuple(out, top_count);
312
+ }
313
+
314
+ dim3 grid(std::min(THCCeilDiv(out_size, 512L), 4096L));
315
+ dim3 block(512);
316
+
317
+ AT_DISPATCH_FLOATING_TYPES(input.type(), "dcn_v2_psroi_pooling_cuda_forward", [&] {
318
+ DeformablePSROIPoolForwardKernel<scalar_t><<<grid, block, 0, stream>>>(
319
+ out_size,
320
+ input.contiguous().data<scalar_t>(),
321
+ spatial_scale,
322
+ channels,
323
+ height, width,
324
+ pooled_height,
325
+ pooled_width,
326
+ bbox.contiguous().data<scalar_t>(),
327
+ trans.contiguous().data<scalar_t>(),
328
+ no_trans,
329
+ trans_std,
330
+ sample_per_part,
331
+ output_dim,
332
+ group_size,
333
+ part_size,
334
+ num_classes,
335
+ channels_each_class,
336
+ out.data<scalar_t>(),
337
+ top_count.data<scalar_t>());
338
+ });
339
+ THCudaCheck(cudaGetLastError());
340
+ return std::make_tuple(out, top_count);
341
+ }
342
+
343
+ std::tuple<at::Tensor, at::Tensor>
344
+ dcn_v2_psroi_pooling_cuda_backward(const at::Tensor &out_grad,
345
+ const at::Tensor &input,
346
+ const at::Tensor &bbox,
347
+ const at::Tensor &trans,
348
+ const at::Tensor &top_count,
349
+ const int no_trans,
350
+ const float spatial_scale,
351
+ const int output_dim,
352
+ const int group_size,
353
+ const int pooled_size,
354
+ const int part_size,
355
+ const int sample_per_part,
356
+ const float trans_std)
357
+ {
358
+ AT_ASSERTM(out_grad.type().is_cuda(), "out_grad must be a CUDA tensor");
359
+ AT_ASSERTM(input.type().is_cuda(), "input must be a CUDA tensor");
360
+ AT_ASSERTM(bbox.type().is_cuda(), "bbox must be a CUDA tensor");
361
+ AT_ASSERTM(trans.type().is_cuda(), "trans must be a CUDA tensor");
362
+ AT_ASSERTM(top_count.type().is_cuda(), "top_count must be a CUDA tensor");
363
+
364
+ const int batch = input.size(0);
365
+ const int channels = input.size(1);
366
+ const int height = input.size(2);
367
+ const int width = input.size(3);
368
+ const int channels_trans = no_trans ? 2 : trans.size(1);
369
+ const int num_bbox = bbox.size(0);
370
+
371
+ AT_ASSERTM(channels == output_dim, "input channels and output channels must equal");
372
+ auto pooled_height = pooled_size;
373
+ auto pooled_width = pooled_size;
374
+ long out_size = num_bbox * output_dim * pooled_height * pooled_width;
375
+ const int num_classes = no_trans ? 1 : channels_trans / 2;
376
+ const int channels_each_class = no_trans ? output_dim : output_dim / num_classes;
377
+
378
+ auto input_grad = at::zeros({batch, channels, height, width}, out_grad.options());
379
+ auto trans_grad = at::zeros_like(trans);
380
+
381
+ if (input_grad.numel() == 0)
382
+ {
383
+ THCudaCheck(cudaGetLastError());
384
+ return std::make_tuple(input_grad, trans_grad);
385
+ }
386
+
387
+ dim3 grid(std::min(THCCeilDiv(out_size, 512L), 4096L));
388
+ dim3 block(512);
389
+ cudaStream_t stream = at::cuda::getCurrentCUDAStream();
390
+
391
+ AT_DISPATCH_FLOATING_TYPES(out_grad.type(), "dcn_v2_psroi_pooling_cuda_backward", [&] {
392
+ DeformablePSROIPoolBackwardAccKernel<scalar_t><<<grid, block, 0, stream>>>(
393
+ out_size,
394
+ out_grad.contiguous().data<scalar_t>(),
395
+ top_count.contiguous().data<scalar_t>(),
396
+ num_bbox,
397
+ spatial_scale,
398
+ channels,
399
+ height,
400
+ width,
401
+ pooled_height,
402
+ pooled_width,
403
+ output_dim,
404
+ input_grad.contiguous().data<scalar_t>(),
405
+ trans_grad.contiguous().data<scalar_t>(),
406
+ input.contiguous().data<scalar_t>(),
407
+ bbox.contiguous().data<scalar_t>(),
408
+ trans.contiguous().data<scalar_t>(),
409
+ no_trans,
410
+ trans_std,
411
+ sample_per_part,
412
+ group_size,
413
+ part_size,
414
+ num_classes,
415
+ channels_each_class);
416
+ });
417
+ THCudaCheck(cudaGetLastError());
418
+ return std::make_tuple(input_grad, trans_grad);
419
+ }
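From the way `DeformablePSROIPoolForwardKernel` indexes its inputs, `bbox` is read as rows of `(batch_index, x1, y1, x2, y2)` and `trans` as a per-ROI tensor with `2 * num_classes` channels on a `part_size x part_size` grid; the output and `top_count` are both `(num_rois, output_dim, pooled_size, pooled_size)`. A hedged shape sketch, with the actual call commented out since it requires the compiled extension (assumed to be `maskrcnn_benchmark._C`):

```python
import torch

# Hedged shape sketch for dcn_v2_psroi_pooling_cuda_forward, derived from the
# kernel indexing above; sizes and coordinates are illustrative only.
N, C, H, W = 2, 256, 32, 32            # feature map; the forward asserts C == output_dim
num_rois, pooled, part, num_classes = 4, 7, 7, 1

features = torch.randn(N, C, H, W)
# each ROI row is (batch_index, x1, y1, x2, y2), read via bottom_rois + n * 5
rois = torch.tensor([[0, 4, 4, 20, 20],
                     [0, 0, 0, 31, 31],
                     [1, 8, 2, 28, 14],
                     [1, 1, 1, 10, 10]], dtype=torch.float32)
# per-ROI offsets: 2 * num_classes channels on a part_size x part_size grid
trans = torch.zeros(num_rois, 2 * num_classes, part, part)

# from maskrcnn_benchmark import _C     # assumed name of the built extension
# out, top_count = _C.dcn_v2_psroi_pooling_forward(
#     features.cuda(), rois.cuda(), trans.cuda(),
#     0,          # no_trans
#     1.0 / 16,   # spatial_scale
#     C,          # output_dim (must equal input channels)
#     7,          # group_size
#     pooled,     # pooled_size
#     part,       # part_size
#     4,          # sample_per_part
#     0.1)        # trans_std
# out and top_count are both (num_rois, output_dim, pooled_size, pooled_size)
```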
maskrcnn_benchmark/csrc/cuda/nms.cu ADDED
@@ -0,0 +1,131 @@
1
+ // Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
2
+ #include <ATen/ATen.h>
3
+ #include <ATen/cuda/CUDAContext.h>
4
+
5
+ #include <THC/THC.h>
6
+ #include <THC/THCDeviceUtils.cuh>
7
+
8
+ #include <vector>
9
+ #include <iostream>
10
+
11
+ int const threadsPerBlock = sizeof(unsigned long long) * 8;
12
+
13
+ __device__ inline float devIoU(float const * const a, float const * const b) {
14
+ float left = max(a[0], b[0]), right = min(a[2], b[2]);
15
+ float top = max(a[1], b[1]), bottom = min(a[3], b[3]);
16
+ float width = max(right - left + 1, 0.f), height = max(bottom - top + 1, 0.f);
17
+ float interS = width * height;
18
+ float Sa = (a[2] - a[0] + 1) * (a[3] - a[1] + 1);
19
+ float Sb = (b[2] - b[0] + 1) * (b[3] - b[1] + 1);
20
+ return interS / (Sa + Sb - interS);
21
+ }
22
+
23
+ __global__ void nms_kernel(const int n_boxes, const float nms_overlap_thresh,
24
+ const float *dev_boxes, unsigned long long *dev_mask) {
25
+ const int row_start = blockIdx.y;
26
+ const int col_start = blockIdx.x;
27
+
28
+ // if (row_start > col_start) return;
29
+
30
+ const int row_size =
31
+ min(n_boxes - row_start * threadsPerBlock, threadsPerBlock);
32
+ const int col_size =
33
+ min(n_boxes - col_start * threadsPerBlock, threadsPerBlock);
34
+
35
+ __shared__ float block_boxes[threadsPerBlock * 5];
36
+ if (threadIdx.x < col_size) {
37
+ block_boxes[threadIdx.x * 5 + 0] =
38
+ dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 0];
39
+ block_boxes[threadIdx.x * 5 + 1] =
40
+ dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 1];
41
+ block_boxes[threadIdx.x * 5 + 2] =
42
+ dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 2];
43
+ block_boxes[threadIdx.x * 5 + 3] =
44
+ dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 3];
45
+ block_boxes[threadIdx.x * 5 + 4] =
46
+ dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 4];
47
+ }
48
+ __syncthreads();
49
+
50
+ if (threadIdx.x < row_size) {
51
+ const int cur_box_idx = threadsPerBlock * row_start + threadIdx.x;
52
+ const float *cur_box = dev_boxes + cur_box_idx * 5;
53
+ int i = 0;
54
+ unsigned long long t = 0;
55
+ int start = 0;
56
+ if (row_start == col_start) {
57
+ start = threadIdx.x + 1;
58
+ }
59
+ for (i = start; i < col_size; i++) {
60
+ if (devIoU(cur_box, block_boxes + i * 5) > nms_overlap_thresh) {
61
+ t |= 1ULL << i;
62
+ }
63
+ }
64
+ const int col_blocks = THCCeilDiv(n_boxes, threadsPerBlock);
65
+ dev_mask[cur_box_idx * col_blocks + col_start] = t;
66
+ }
67
+ }
68
+
69
+ // boxes is a N x 5 tensor
70
+ at::Tensor nms_cuda(const at::Tensor boxes, float nms_overlap_thresh) {
71
+ using scalar_t = float;
72
+ AT_ASSERTM(boxes.type().is_cuda(), "boxes must be a CUDA tensor");
73
+ auto scores = boxes.select(1, 4);
74
+ auto order_t = std::get<1>(scores.sort(0, /* descending=*/true));
75
+ auto boxes_sorted = boxes.index_select(0, order_t);
76
+
77
+ int boxes_num = boxes.size(0);
78
+
79
+ const int col_blocks = THCCeilDiv(boxes_num, threadsPerBlock);
80
+
81
+ scalar_t* boxes_dev = boxes_sorted.data<scalar_t>();
82
+
83
+ THCState *state = at::globalContext().lazyInitCUDA(); // TODO replace with getTHCState
84
+
85
+ unsigned long long* mask_dev = NULL;
86
+ //THCudaCheck(THCudaMalloc(state, (void**) &mask_dev,
87
+ // boxes_num * col_blocks * sizeof(unsigned long long)));
88
+
89
+ mask_dev = (unsigned long long*) THCudaMalloc(state, boxes_num * col_blocks * sizeof(unsigned long long));
90
+
91
+ dim3 blocks(THCCeilDiv(boxes_num, threadsPerBlock),
92
+ THCCeilDiv(boxes_num, threadsPerBlock));
93
+ dim3 threads(threadsPerBlock);
94
+ nms_kernel<<<blocks, threads>>>(boxes_num,
95
+ nms_overlap_thresh,
96
+ boxes_dev,
97
+ mask_dev);
98
+
99
+ std::vector<unsigned long long> mask_host(boxes_num * col_blocks);
100
+ THCudaCheck(cudaMemcpy(&mask_host[0],
101
+ mask_dev,
102
+ sizeof(unsigned long long) * boxes_num * col_blocks,
103
+ cudaMemcpyDeviceToHost));
104
+
105
+ std::vector<unsigned long long> remv(col_blocks);
106
+ memset(&remv[0], 0, sizeof(unsigned long long) * col_blocks);
107
+
108
+ at::Tensor keep = at::empty({boxes_num}, boxes.options().dtype(at::kLong).device(at::kCPU));
109
+ int64_t* keep_out = keep.data<int64_t>();
110
+
111
+ int num_to_keep = 0;
112
+ for (int i = 0; i < boxes_num; i++) {
113
+ int nblock = i / threadsPerBlock;
114
+ int inblock = i % threadsPerBlock;
115
+
116
+ if (!(remv[nblock] & (1ULL << inblock))) {
117
+ keep_out[num_to_keep++] = i;
118
+ unsigned long long *p = &mask_host[0] + i * col_blocks;
119
+ for (int j = nblock; j < col_blocks; j++) {
120
+ remv[j] |= p[j];
121
+ }
122
+ }
123
+ }
124
+
125
+ THCudaFree(state, mask_dev);
126
+ // TODO improve this part
127
+ return std::get<0>(order_t.index({
128
+ keep.narrow(/*dim=*/0, /*start=*/0, /*length=*/num_to_keep).to(
129
+ order_t.device(), keep.scalar_type())
130
+ }).sort(0, false));
131
+ }
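`nms_kernel` computes a 64-bit suppression mask per block pair using `devIoU`, which follows the legacy `+1` box-area convention, and the host loop then greedily keeps boxes in descending score order. For sanity-checking the CUDA path, here is a plain PyTorch reference of the same greedy scheme (a sketch, not the exact index post-processing done at the end of `nms_cuda`):

```python
import torch

def nms_reference(boxes: torch.Tensor, scores: torch.Tensor, thresh: float) -> torch.Tensor:
    """CPU reference for the greedy NMS implemented by nms_kernel above.

    Uses the same +1 box-area convention as devIoU. Returns indices into the
    original `boxes`, in descending-score order, for the kept boxes.
    """
    order = scores.sort(descending=True).indices
    suppressed = torch.zeros(boxes.size(0), dtype=torch.bool)
    keep = []
    for idx in order.tolist():
        if suppressed[idx]:
            continue
        keep.append(idx)
        b = boxes[idx]
        # vectorized devIoU of the kept box against every box
        lt = torch.max(boxes[:, :2], b[:2])
        rb = torch.min(boxes[:, 2:], b[2:])
        wh = (rb - lt + 1).clamp(min=0)
        inter = wh[:, 0] * wh[:, 1]
        area = (boxes[:, 2] - boxes[:, 0] + 1) * (boxes[:, 3] - boxes[:, 1] + 1)
        area_b = (b[2] - b[0] + 1) * (b[3] - b[1] + 1)
        iou = inter / (area + area_b - inter)
        suppressed |= iou > thresh  # the kept box suppresses everything it overlaps too much
    return torch.tensor(keep, dtype=torch.long)
```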
maskrcnn_benchmark/csrc/cuda/vision.h ADDED
@@ -0,0 +1,121 @@
1
+ // Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
2
+ #pragma once
3
+ #include <torch/extension.h>
4
+
5
+
6
+ at::Tensor SigmoidFocalLoss_forward_cuda(
7
+ const at::Tensor& logits,
8
+ const at::Tensor& targets,
9
+ const int num_classes,
10
+ const float gamma,
11
+ const float alpha);
12
+
13
+ at::Tensor SigmoidFocalLoss_backward_cuda(
14
+ const at::Tensor& logits,
15
+ const at::Tensor& targets,
16
+ const at::Tensor& d_losses,
17
+ const int num_classes,
18
+ const float gamma,
19
+ const float alpha);
20
+
21
+ at::Tensor ROIAlign_forward_cuda(const at::Tensor& input,
22
+ const at::Tensor& rois,
23
+ const float spatial_scale,
24
+ const int pooled_height,
25
+ const int pooled_width,
26
+ const int sampling_ratio);
27
+
28
+ at::Tensor ROIAlign_backward_cuda(const at::Tensor& grad,
29
+ const at::Tensor& rois,
30
+ const float spatial_scale,
31
+ const int pooled_height,
32
+ const int pooled_width,
33
+ const int batch_size,
34
+ const int channels,
35
+ const int height,
36
+ const int width,
37
+ const int sampling_ratio);
38
+
39
+
40
+ std::tuple<at::Tensor, at::Tensor> ROIPool_forward_cuda(const at::Tensor& input,
41
+ const at::Tensor& rois,
42
+ const float spatial_scale,
43
+ const int pooled_height,
44
+ const int pooled_width);
45
+
46
+ at::Tensor ROIPool_backward_cuda(const at::Tensor& grad,
47
+ const at::Tensor& input,
48
+ const at::Tensor& rois,
49
+ const at::Tensor& argmax,
50
+ const float spatial_scale,
51
+ const int pooled_height,
52
+ const int pooled_width,
53
+ const int batch_size,
54
+ const int channels,
55
+ const int height,
56
+ const int width);
57
+
58
+ at::Tensor nms_cuda(const at::Tensor boxes, float nms_overlap_thresh);
59
+
60
+
61
+ at::Tensor compute_flow_cuda(const at::Tensor& boxes,
62
+ const int height,
63
+ const int width);
64
+
65
+ at::Tensor
66
+ dcn_v2_cuda_forward(const at::Tensor &input,
67
+ const at::Tensor &weight,
68
+ const at::Tensor &bias,
69
+ const at::Tensor &offset,
70
+ const at::Tensor &mask,
71
+ const int kernel_h,
72
+ const int kernel_w,
73
+ const int stride_h,
74
+ const int stride_w,
75
+ const int pad_h,
76
+ const int pad_w,
77
+ const int dilation_h,
78
+ const int dilation_w,
79
+ const int deformable_group);
80
+
81
+ std::vector<at::Tensor>
82
+ dcn_v2_cuda_backward(const at::Tensor &input,
83
+ const at::Tensor &weight,
84
+ const at::Tensor &bias,
85
+ const at::Tensor &offset,
86
+ const at::Tensor &mask,
87
+ const at::Tensor &grad_output,
88
+ int kernel_h, int kernel_w,
89
+ int stride_h, int stride_w,
90
+ int pad_h, int pad_w,
91
+ int dilation_h, int dilation_w,
92
+ int deformable_group);
93
+
94
+
95
+ std::tuple<at::Tensor, at::Tensor>
96
+ dcn_v2_psroi_pooling_cuda_forward(const at::Tensor &input,
97
+ const at::Tensor &bbox,
98
+ const at::Tensor &trans,
99
+ const int no_trans,
100
+ const float spatial_scale,
101
+ const int output_dim,
102
+ const int group_size,
103
+ const int pooled_size,
104
+ const int part_size,
105
+ const int sample_per_part,
106
+ const float trans_std);
107
+
108
+ std::tuple<at::Tensor, at::Tensor>
109
+ dcn_v2_psroi_pooling_cuda_backward(const at::Tensor &out_grad,
110
+ const at::Tensor &input,
111
+ const at::Tensor &bbox,
112
+ const at::Tensor &trans,
113
+ const at::Tensor &top_count,
114
+ const int no_trans,
115
+ const float spatial_scale,
116
+ const int output_dim,
117
+ const int group_size,
118
+ const int pooled_size,
119
+ const int part_size,
120
+ const int sample_per_part,
121
+ const float trans_std);
maskrcnn_benchmark/csrc/dcn_v2.h ADDED
@@ -0,0 +1,145 @@
1
+ #pragma once
2
+
3
+ #include "cpu/vision.h"
4
+
5
+ #ifdef WITH_CUDA
6
+ #include "cuda/vision.h"
7
+ #endif
8
+
9
+ at::Tensor
10
+ dcn_v2_forward(const at::Tensor &input,
11
+ const at::Tensor &weight,
12
+ const at::Tensor &bias,
13
+ const at::Tensor &offset,
14
+ const at::Tensor &mask,
15
+ const int kernel_h,
16
+ const int kernel_w,
17
+ const int stride_h,
18
+ const int stride_w,
19
+ const int pad_h,
20
+ const int pad_w,
21
+ const int dilation_h,
22
+ const int dilation_w,
23
+ const int deformable_group)
24
+ {
25
+ if (input.type().is_cuda())
26
+ {
27
+ #ifdef WITH_CUDA
28
+ return dcn_v2_cuda_forward(input, weight, bias, offset, mask,
29
+ kernel_h, kernel_w,
30
+ stride_h, stride_w,
31
+ pad_h, pad_w,
32
+ dilation_h, dilation_w,
33
+ deformable_group);
34
+ #else
35
+ AT_ERROR("Not compiled with GPU support");
36
+ #endif
37
+ }
38
+ AT_ERROR("Not implemented on the CPU");
39
+ }
40
+
41
+ std::vector<at::Tensor>
42
+ dcn_v2_backward(const at::Tensor &input,
43
+ const at::Tensor &weight,
44
+ const at::Tensor &bias,
45
+ const at::Tensor &offset,
46
+ const at::Tensor &mask,
47
+ const at::Tensor &grad_output,
48
+ int kernel_h, int kernel_w,
49
+ int stride_h, int stride_w,
50
+ int pad_h, int pad_w,
51
+ int dilation_h, int dilation_w,
52
+ int deformable_group)
53
+ {
54
+ if (input.type().is_cuda())
55
+ {
56
+ #ifdef WITH_CUDA
57
+ return dcn_v2_cuda_backward(input,
58
+ weight,
59
+ bias,
60
+ offset,
61
+ mask,
62
+ grad_output,
63
+ kernel_h, kernel_w,
64
+ stride_h, stride_w,
65
+ pad_h, pad_w,
66
+ dilation_h, dilation_w,
67
+ deformable_group);
68
+ #else
69
+ AT_ERROR("Not compiled with GPU support");
70
+ #endif
71
+ }
72
+ AT_ERROR("Not implemented on the CPU");
73
+ }
74
+
75
+ std::tuple<at::Tensor, at::Tensor>
76
+ dcn_v2_psroi_pooling_forward(const at::Tensor &input,
77
+ const at::Tensor &bbox,
78
+ const at::Tensor &trans,
79
+ const int no_trans,
80
+ const float spatial_scale,
81
+ const int output_dim,
82
+ const int group_size,
83
+ const int pooled_size,
84
+ const int part_size,
85
+ const int sample_per_part,
86
+ const float trans_std)
87
+ {
88
+ if (input.type().is_cuda())
89
+ {
90
+ #ifdef WITH_CUDA
91
+ return dcn_v2_psroi_pooling_cuda_forward(input,
92
+ bbox,
93
+ trans,
94
+ no_trans,
95
+ spatial_scale,
96
+ output_dim,
97
+ group_size,
98
+ pooled_size,
99
+ part_size,
100
+ sample_per_part,
101
+ trans_std);
102
+ #else
103
+ AT_ERROR("Not compiled with GPU support");
104
+ #endif
105
+ }
106
+ AT_ERROR("Not implemented on the CPU");
107
+ }
108
+
109
+ std::tuple<at::Tensor, at::Tensor>
110
+ dcn_v2_psroi_pooling_backward(const at::Tensor &out_grad,
111
+ const at::Tensor &input,
112
+ const at::Tensor &bbox,
113
+ const at::Tensor &trans,
114
+ const at::Tensor &top_count,
115
+ const int no_trans,
116
+ const float spatial_scale,
117
+ const int output_dim,
118
+ const int group_size,
119
+ const int pooled_size,
120
+ const int part_size,
121
+ const int sample_per_part,
122
+ const float trans_std)
123
+ {
124
+ if (input.type().is_cuda())
125
+ {
126
+ #ifdef WITH_CUDA
127
+ return dcn_v2_psroi_pooling_cuda_backward(out_grad,
128
+ input,
129
+ bbox,
130
+ trans,
131
+ top_count,
132
+ no_trans,
133
+ spatial_scale,
134
+ output_dim,
135
+ group_size,
136
+ pooled_size,
137
+ part_size,
138
+ sample_per_part,
139
+ trans_std);
140
+ #else
141
+ AT_ERROR("Not compiled with GPU support");
142
+ #endif
143
+ }
144
+ AT_ERROR("Not implemented on the CPU");
145
+ }
maskrcnn_benchmark/csrc/nms.h ADDED
@@ -0,0 +1,28 @@
1
+ // Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
2
+ #pragma once
3
+ #include "cpu/vision.h"
4
+
5
+ #ifdef WITH_CUDA
6
+ #include "cuda/vision.h"
7
+ #endif
8
+
9
+
10
+ at::Tensor nms(const at::Tensor& dets,
11
+ const at::Tensor& scores,
12
+ const float threshold) {
13
+
14
+ if (dets.type().is_cuda()) {
15
+ #ifdef WITH_CUDA
16
+ // TODO raise error if not compiled with CUDA
17
+ if (dets.numel() == 0)
18
+ return at::empty({0}, dets.options().dtype(at::kLong).device(at::kCPU));
19
+ auto b = at::cat({dets, scores.unsqueeze(1)}, 1);
20
+ return nms_cuda(b, threshold);
21
+ #else
22
+ AT_ERROR("Not compiled with GPU support");
23
+ #endif
24
+ }
25
+
26
+ at::Tensor result = nms_cpu(dets, scores, threshold);
27
+ return result;
28
+ }
maskrcnn_benchmark/csrc/vision.cpp ADDED
@@ -0,0 +1,21 @@
1
+ // Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
2
+ #include "nms.h"
3
+ #include "ROIAlign.h"
4
+ #include "ROIPool.h"
5
+ #include "SigmoidFocalLoss.h"
6
+ #include "dcn_v2.h"
7
+
8
+
9
+ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
10
+ m.def("nms", &nms, "non-maximum suppression");
11
+ m.def("roi_align_forward", &ROIAlign_forward, "ROIAlign_forward");
12
+ m.def("roi_align_backward", &ROIAlign_backward, "ROIAlign_backward");
13
+ m.def("roi_pool_forward", &ROIPool_forward, "ROIPool_forward");
14
+ m.def("roi_pool_backward", &ROIPool_backward, "ROIPool_backward");
15
+ m.def("sigmoid_focalloss_forward", &SigmoidFocalLoss_forward, "SigmoidFocalLoss_forward");
16
+ m.def("sigmoid_focalloss_backward", &SigmoidFocalLoss_backward, "SigmoidFocalLoss_backward");
17
+ m.def("dcn_v2_forward", &dcn_v2_forward, "dcn_v2_forward");
18
+ m.def("dcn_v2_backward", &dcn_v2_backward, "dcn_v2_backward");
19
+ m.def("dcn_v2_psroi_pooling_forward", &dcn_v2_psroi_pooling_forward, "dcn_v2_psroi_pooling_forward");
20
+ m.def("dcn_v2_psroi_pooling_backward", &dcn_v2_psroi_pooling_backward, "dcn_v2_psroi_pooling_backward");
21
+ }
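These `m.def` calls are the only Python-facing surface of the C++/CUDA code above. A hedged usage sketch, assuming the extension is compiled under the upstream name `maskrcnn_benchmark._C`; on CPU tensors the `nms` binding falls through to `nms_cpu`:

```python
import torch
from maskrcnn_benchmark import _C  # assumption: name of the compiled extension

# Three boxes in (x1, y1, x2, y2) form; the first two overlap heavily.
boxes = torch.tensor([[0.0, 0.0, 10.0, 10.0],
                      [1.0, 1.0, 11.0, 11.0],
                      [50.0, 50.0, 60.0, 60.0]])
scores = torch.tensor([0.9, 0.8, 0.7])

keep = _C.nms(boxes, scores, 0.5)  # indices of kept boxes (the second box is suppressed)
print(keep)
```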
maskrcnn_benchmark/data/README.md ADDED
@@ -0,0 +1,90 @@
1
+ # Setting Up Datasets
2
+ This file describes how to perform training on other datasets.
3
+
4
+ Currently, only the Pascal VOC dataset can be loaded from its original format and evaluated with Pascal-style results.
5
+
6
+ We expect the annotations from other datasets to be converted to COCO json format, and
7
+ the output will be in COCO style (i.e. AP, AP50, AP75, APs, APm, APl for bbox and segm).
8
+
9
+ ## Creating Symlinks for PASCAL VOC
10
+
11
+ We assume that your symlinked `datasets/voc/VOC<year>` directory has the following structure:
12
+
13
+ ```
14
+ VOC<year>
15
+ |_ JPEGImages
16
+ | |_ <im-1-name>.jpg
17
+ | |_ ...
18
+ | |_ <im-N-name>.jpg
19
+ |_ Annotations
20
+ | |_ pascal_train<year>.json (optional)
21
+ | |_ pascal_val<year>.json (optional)
22
+ | |_ pascal_test<year>.json (optional)
23
+ | |_ <im-1-name>.xml
24
+ | |_ ...
25
+ | |_ <im-N-name>.xml
26
+ |_ VOCdevkit<year>
27
+ ```
28
+
29
+ Create symlinks for `voc/VOC<year>`:
30
+
31
+ ```
32
+ cd ~/github/maskrcnn-benchmark
33
+ mkdir -p datasets/voc/VOC<year>
34
+ ln -s /path/to/VOC<year> datasets/voc/VOC<year>
35
+ ```
36
+ Example configuration files for PASCAL VOC can be found [here](https://github.com/facebookresearch/maskrcnn-benchmark/blob/master/configs/pascal_voc/).
37
+
38
+ ### PASCAL VOC Annotations in COCO Format
39
+ To output COCO-style evaluation results, PASCAL VOC annotations in COCO json format are required and can be downloaded from [here](https://storage.googleapis.com/coco-dataset/external/PASCAL_VOC.zip)
40
+ via http://cocodataset.org/#external.
41
+
42
+ ## Creating Symlinks for Cityscapes:
43
+
44
+ We assume that your symlinked `datasets/cityscapes` directory has the following structure:
45
+
46
+ ```
47
+ cityscapes
48
+ |_ images
49
+ | |_ <im-1-name>.jpg
50
+ | |_ ...
51
+ | |_ <im-N-name>.jpg
52
+ |_ annotations
53
+ | |_ instanceonly_gtFile_train.json
54
+ | |_ ...
55
+ |_ raw
56
+ |_ gtFine
57
+ |_ ...
58
+ |_ README.md
59
+ ```
60
+
61
+ Create symlinks for `cityscapes`:
62
+
63
+ ```
64
+ cd ~/github/maskrcnn-benchmark
65
+ mkdir -p datasets/cityscapes
66
+ ln -s /path/to/cityscapes datasets/cityscapes
67
+ ```
68
+
69
+ ### Steps to convert Cityscapes Annotations to COCO Format
70
+ 1. Download gtFine_trainvaltest.zip from https://www.cityscapes-dataset.com/downloads/ (login required)
71
+ 2. Extract it to /path/to/gtFine_trainvaltest
72
+ ```
73
+ cityscapes
74
+ |_ gtFine_trainvaltest.zip
75
+ |_ gtFine_trainvaltest
76
+ |_ gtFine
77
+ ```
78
+ 3. Run the below commands to convert the annotations
79
+
80
+ ```
81
+ cd ~/github
82
+ git clone https://github.com/mcordts/cityscapesScripts.git
83
+ cd cityscapesScripts
84
+ cp ~/github/maskrcnn-benchmark/tools/cityscapes/instances2dict_with_polygons.py cityscapesscripts/evaluation
85
+ python setup.py install
86
+ cd ~/github/maskrcnn-benchmark
87
+ python tools/cityscapes/convert_cityscapes_to_coco.py --datadir /path/to/cityscapes --outdir /path/to/cityscapes/annotations
88
+ ```
89
+
90
+ Example configuration files for Cityscapes can be found [here](https://github.com/facebookresearch/maskrcnn-benchmark/blob/master/configs/cityscapes/).
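Once annotations have been converted to COCO json format as described above, the dataset still has to be registered so that `DATASETS.TRAIN`/`DATASETS.TEST` can refer to it by name. The sketch below shows the kind of entry `DatasetCatalog` (in `maskrcnn_benchmark/config/paths_catalog.py`) resolves in upstream maskrcnn-benchmark; the dataset name and paths are placeholders, and the exact keys should be checked against that file:

```python
import os

# Hypothetical catalog entry for a COCO-format dataset (placeholder name and paths).
# build_dataset() in maskrcnn_benchmark/data/build.py consumes the dict returned by
# get(): "factory" is looked up in data/datasets and called with **args.
MY_DATASETS = {
    "my_text_train": {
        "img_dir": "my_dataset/train_images",
        "ann_file": "my_dataset/annotations/train_coco_format.json",
    },
}


def get(name, data_dir="datasets"):
    """Minimal stand-in for DatasetCatalog.get for the entry above."""
    attrs = MY_DATASETS[name]
    args = dict(
        root=os.path.join(data_dir, attrs["img_dir"]),
        ann_file=os.path.join(data_dir, attrs["ann_file"]),
    )
    return dict(factory="COCODataset", args=args)
```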
maskrcnn_benchmark/data/__init__.py ADDED
@@ -0,0 +1,2 @@
1
+ # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
2
+ from .build import make_data_loader
maskrcnn_benchmark/data/__pycache__/__init__.cpython-37.pyc ADDED
Binary file (196 Bytes).
 
maskrcnn_benchmark/data/__pycache__/build.cpython-37.pyc ADDED
Binary file (4.81 kB).
 
maskrcnn_benchmark/data/__pycache__/collate_batch.cpython-37.pyc ADDED
Binary file (973 Bytes).
 
maskrcnn_benchmark/data/build.py ADDED
@@ -0,0 +1,176 @@
1
+ # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
2
+ import bisect
3
+ import copy
4
+ import logging
5
+
6
+ import torch.utils.data
7
+ from maskrcnn_benchmark.utils.comm import get_world_size
8
+ from maskrcnn_benchmark.utils.imports import import_file
9
+
10
+ from . import datasets as D
11
+ from . import samplers
12
+
13
+ from .collate_batch import BatchCollator
14
+ from .transforms import build_transforms
15
+
16
+
17
+ def build_dataset(dataset_list, transforms, dataset_catalog, is_train=True):
18
+ """
19
+ Arguments:
20
+ dataset_list (list[str]): Contains the names of the datasets, i.e.,
21
+ coco_2014_train, coco_2014_val, etc.
22
+ transforms (callable): transforms to apply to each (image, target) sample
23
+ dataset_catalog (DatasetCatalog): contains the information on how to
24
+ construct a dataset.
25
+ is_train (bool): whether to setup the dataset for training or testing
26
+ """
27
+ if not isinstance(dataset_list, (list, tuple)):
28
+ raise RuntimeError(
29
+ "dataset_list should be a list of strings, got {}".format(dataset_list)
30
+ )
31
+ datasets = []
32
+ for dataset_name in dataset_list:
33
+ data = dataset_catalog.get(dataset_name)
34
+ factory = getattr(D, data["factory"])
35
+ args = data["args"]
36
+ # for COCODataset, we want to remove images without annotations
37
+ # during training
38
+ if data["factory"] in ["COCODataset",
39
+ "WordDataset"]:
40
+ args["remove_images_without_annotations"] = is_train
41
+ if data["factory"] == "PascalVOCDataset":
42
+ args["use_difficult"] = not is_train
43
+ args["transforms"] = transforms
44
+ # make dataset from factory
45
+ dataset = factory(**args)
46
+ datasets.append(dataset)
47
+
48
+ # for testing, return a list of datasets
49
+ if not is_train:
50
+ return datasets
51
+
52
+ # for training, concatenate all datasets into a single one
53
+ dataset = datasets[0]
54
+ if len(datasets) > 1:
55
+ dataset = D.ConcatDataset(datasets)
56
+
57
+ return [dataset]
58
+
59
+
60
+ def make_data_sampler(dataset, shuffle, distributed):
61
+ if distributed:
62
+ return samplers.DistributedSampler(dataset, shuffle=shuffle)
63
+ if shuffle:
64
+ sampler = torch.utils.data.sampler.RandomSampler(dataset)
65
+ else:
66
+ sampler = torch.utils.data.sampler.SequentialSampler(dataset)
67
+ return sampler
68
+
69
+
70
+ def _quantize(x, bins):
71
+ bins = copy.copy(bins)
72
+ bins = sorted(bins)
73
+ quantized = list(map(lambda y: bisect.bisect_right(bins, y), x))
74
+ return quantized
75
+
76
+
77
+ def _compute_aspect_ratios(dataset):
78
+ aspect_ratios = []
79
+ for i in range(len(dataset)):
80
+ img_info = dataset.get_img_info(i)
81
+ aspect_ratio = float(img_info["height"]) / float(img_info["width"])
82
+ aspect_ratios.append(aspect_ratio)
83
+ return aspect_ratios
84
+
85
+
86
+ def make_batch_data_sampler(
87
+ dataset, sampler, aspect_grouping, images_per_batch, num_iters=None, start_iter=0
88
+ ):
89
+ if aspect_grouping:
90
+ if not isinstance(aspect_grouping, (list, tuple)):
91
+ aspect_grouping = [aspect_grouping]
92
+ aspect_ratios = _compute_aspect_ratios(dataset)
93
+ group_ids = _quantize(aspect_ratios, aspect_grouping)
94
+ batch_sampler = samplers.GroupedBatchSampler(
95
+ sampler, group_ids, images_per_batch, drop_uneven=False
96
+ )
97
+ else:
98
+ batch_sampler = torch.utils.data.sampler.BatchSampler(
99
+ sampler, images_per_batch, drop_last=False
100
+ )
101
+ if num_iters is not None:
102
+ batch_sampler = samplers.IterationBasedBatchSampler(
103
+ batch_sampler, num_iters, start_iter
104
+ )
105
+ return batch_sampler
106
+
107
+
108
+ def make_data_loader(cfg, is_train=True, is_distributed=False, start_iter=0):
109
+ num_gpus = get_world_size()
110
+ if is_train:
111
+ images_per_batch = cfg.SOLVER.IMS_PER_BATCH
112
+ assert (
113
+ images_per_batch % num_gpus == 0
114
+ ), "SOLVER.IMS_PER_BATCH ({}) must be divisible by the number "
115
+ "of GPUs ({}) used.".format(images_per_batch, num_gpus)
116
+ images_per_gpu = images_per_batch // num_gpus
117
+ shuffle = True
118
+ num_iters = cfg.SOLVER.MAX_ITER
119
+ else:
120
+ images_per_batch = cfg.TEST.IMS_PER_BATCH
121
+ assert (
122
+ images_per_batch % num_gpus == 0
123
+ ), "TEST.IMS_PER_BATCH ({}) must be divisible by the number "
124
+ "of GPUs ({}) used.".format(images_per_batch, num_gpus)
125
+ images_per_gpu = images_per_batch // num_gpus
126
+ shuffle = False if not is_distributed else True
127
+ num_iters = None
128
+ start_iter = 0
129
+
130
+ if images_per_gpu > 1:
131
+ logger = logging.getLogger(__name__)
132
+ logger.warning(
133
+ "When using more than one image per GPU you may encounter "
134
+ "an out-of-memory (OOM) error if your GPU does not have "
135
+ "sufficient memory. If this happens, you can reduce "
136
+ "SOLVER.IMS_PER_BATCH (for training) or "
137
+ "TEST.IMS_PER_BATCH (for inference). For training, you must "
138
+ "also adjust the learning rate and schedule length according "
139
+ "to the linear scaling rule. See for example: "
140
+ "https://github.com/facebookresearch/Detectron/blob/master/configs/getting_started/tutorial_1gpu_e2e_faster_rcnn_R-50-FPN.yaml#L14"
141
+ )
142
+
143
+ # group images which have similar aspect ratio. In this case, we only
144
+ # group in two cases: those with width / height > 1, and the other way around,
145
+ # but the code supports more general grouping strategy
146
+ aspect_grouping = [1] if cfg.DATALOADER.ASPECT_RATIO_GROUPING else []
147
+
148
+ paths_catalog = import_file(
149
+ "maskrcnn_benchmark.config.paths_catalog", cfg.PATHS_CATALOG, True
150
+ )
151
+ DatasetCatalog = paths_catalog.DatasetCatalog
152
+ dataset_list = cfg.DATASETS.TRAIN if is_train else cfg.DATASETS.TEST
153
+
154
+ transforms = build_transforms(cfg, is_train)
155
+ datasets = build_dataset(dataset_list, transforms, DatasetCatalog, is_train)
156
+
157
+ data_loaders = []
158
+ for dataset in datasets:
159
+ sampler = make_data_sampler(dataset, shuffle, is_distributed)
160
+ batch_sampler = make_batch_data_sampler(
161
+ dataset, sampler, aspect_grouping, images_per_gpu, num_iters, start_iter
162
+ )
163
+ collator = BatchCollator(cfg.DATALOADER.SIZE_DIVISIBILITY)
164
+ num_workers = cfg.DATALOADER.NUM_WORKERS
165
+ data_loader = torch.utils.data.DataLoader(
166
+ dataset,
167
+ num_workers=num_workers,
168
+ batch_sampler=batch_sampler,
169
+ collate_fn=collator,
170
+ )
171
+ data_loaders.append(data_loader)
172
+ if is_train:
173
+ # during training, a single (possibly concatenated) data_loader is returned
174
+ assert len(data_loaders) == 1
175
+ return data_loaders[0]
176
+ return data_loaders
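A hedged end-to-end sketch of how `make_data_loader` is typically driven. It assumes the config package exposes the usual yacs `cfg` object (as in upstream maskrcnn-benchmark) and that `ImageList.tensors` holds the padded batch; the YAML path is a placeholder:

```python
# Hedged usage sketch for make_data_loader; the cfg import and config path are assumptions.
from maskrcnn_benchmark.config import cfg          # assumed yacs config object
from maskrcnn_benchmark.data import make_data_loader

cfg.merge_from_file("configs/my_experiment.yaml")  # placeholder config file
cfg.freeze()

train_loader = make_data_loader(cfg, is_train=True, is_distributed=False, start_iter=0)
images, targets, img_ids = next(iter(train_loader))  # batch produced by BatchCollator
print(images.tensors.shape, len(targets), img_ids)
```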
maskrcnn_benchmark/data/collate_batch.py ADDED
@@ -0,0 +1,20 @@
1
+ # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
2
+ from maskrcnn_benchmark.structures.image_list import to_image_list
3
+
4
+
5
+ class BatchCollator(object):
6
+ """
7
+ From a list of samples from the dataset,
8
+ returns the batched images and targets.
9
+ This should be passed to the DataLoader
10
+ """
11
+
12
+ def __init__(self, size_divisible=0):
13
+ self.size_divisible = size_divisible
14
+
15
+ def __call__(self, batch):
16
+ transposed_batch = list(zip(*batch))
17
+ images = to_image_list(transposed_batch[0], self.size_divisible)
18
+ targets = transposed_batch[1]
19
+ img_ids = transposed_batch[2]
20
+ return images, targets, img_ids
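`BatchCollator` delegates the work to `to_image_list`, which pads every image in the batch to the largest height/width, rounded up to `size_divisible`; targets and image ids pass through untouched. A small sketch of that behaviour (strings stand in for `BoxList` targets, and the padded shape assumes the upstream `to_image_list` semantics):

```python
import torch
from maskrcnn_benchmark.data.collate_batch import BatchCollator

# Two samples of different sizes; plain strings stand in for BoxList targets.
batch = [
    (torch.zeros(3, 480, 600), "target_a", 0),
    (torch.zeros(3, 512, 640), "target_b", 1),
]
collator = BatchCollator(size_divisible=32)
images, targets, img_ids = collator(batch)
print(images.tensors.shape)  # expected (2, 3, 512, 640): padded to the max size, /32 aligned
print(targets, img_ids)      # ('target_a', 'target_b') (0, 1)
```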
maskrcnn_benchmark/data/datasets/__init__.py ADDED
@@ -0,0 +1,8 @@
1
+ # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
2
+ from .coco import COCODataset
3
+ from .voc import PascalVOCDataset
4
+ from .concat_dataset import ConcatDataset
5
+ from .word_dataset import WordDataset
6
+
7
+ __all__ = ["COCODataset", "ConcatDataset", "PascalVOCDataset",
8
+ "WordDataset"]
maskrcnn_benchmark/data/datasets/__pycache__/__init__.cpython-37.pyc ADDED
Binary file (388 Bytes).
 
maskrcnn_benchmark/data/datasets/__pycache__/char_dataset.cpython-37.pyc ADDED
Binary file (7.64 kB).
 
maskrcnn_benchmark/data/datasets/__pycache__/coco.cpython-37.pyc ADDED
Binary file (4.43 kB).
 
maskrcnn_benchmark/data/datasets/__pycache__/concat_dataset.cpython-37.pyc ADDED
Binary file (985 Bytes).
 
maskrcnn_benchmark/data/datasets/__pycache__/voc.cpython-37.pyc ADDED
Binary file (4.13 kB).
 
maskrcnn_benchmark/data/datasets/__pycache__/word_dataset.cpython-37.pyc ADDED
Binary file (4.56 kB).
 
maskrcnn_benchmark/data/datasets/coco.py ADDED
@@ -0,0 +1,101 @@
1
+ # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
2
+ import torch
3
+ import torchvision
4
+
5
+ from maskrcnn_benchmark.structures.bounding_box import BoxList
6
+ from maskrcnn_benchmark.structures.segmentation_mask import SegmentationMask
7
+ from maskrcnn_benchmark.structures.keypoint import PersonKeypoints
8
+
9
+
10
+ min_keypoints_per_image = 10
11
+
12
+
13
+ def _count_visible_keypoints(anno):
14
+ return sum(sum(1 for v in ann["keypoints"][2::3] if v > 0) for ann in anno)
15
+
16
+
17
+ def _has_only_empty_bbox(anno):
18
+ return all(any(o <= 1 for o in obj["bbox"][2:]) for obj in anno)
19
+
20
+
21
+ def has_valid_annotation(anno):
22
+ # if it's empty, there is no annotation
23
+ if len(anno) == 0:
24
+ return False
25
+ # if all boxes have close to zero area, there is no annotation
26
+ if _has_only_empty_bbox(anno):
27
+ return False
28
+ # keypoint tasks have a slightly different criterion for considering
29
+ # if an annotation is valid
30
+ if "keypoints" not in anno[0]:
31
+ return True
32
+ # for keypoint detection tasks, only consider valid images those
33
+ # containing at least min_keypoints_per_image
34
+ if _count_visible_keypoints(anno) >= min_keypoints_per_image:
35
+ return True
36
+ return False
37
+
38
+
39
+ class COCODataset(torchvision.datasets.coco.CocoDetection):
40
+ def __init__(
41
+ self, ann_file, root, remove_images_without_annotations, transforms=None
42
+ ):
43
+ super(COCODataset, self).__init__(root, ann_file)
44
+ # sort indices for reproducible results
45
+ self.ids = sorted(self.ids)
46
+
47
+ # filter images without detection annotations
48
+ if remove_images_without_annotations:
49
+ ids = []
50
+ for img_id in self.ids:
51
+ ann_ids = self.coco.getAnnIds(imgIds=img_id, iscrowd=None)
52
+ anno = self.coco.loadAnns(ann_ids)
53
+ if has_valid_annotation(anno):
54
+ ids.append(img_id)
55
+ self.ids = ids
56
+
57
+ self.json_category_id_to_contiguous_id = {
58
+ v: i + 1 for i, v in enumerate(self.coco.getCatIds())
59
+ }
60
+ self.contiguous_category_id_to_json_id = {
61
+ v: k for k, v in self.json_category_id_to_contiguous_id.items()
62
+ }
63
+ self.id_to_img_map = {k: v for k, v in enumerate(self.ids)}
64
+ self.transforms = transforms
65
+
66
+ def __getitem__(self, idx):
67
+ img, anno = super(COCODataset, self).__getitem__(idx)
68
+
69
+ # filter crowd annotations
70
+ # TODO might be better to add an extra field
71
+ anno = [obj for obj in anno if obj["iscrowd"] == 0]
72
+
73
+ boxes = [obj["bbox"] for obj in anno]
74
+ boxes = torch.as_tensor(boxes).reshape(-1, 4) # guard against no boxes
75
+ target = BoxList(boxes, img.size, mode="xywh").convert("xyxy")
76
+
77
+ classes = [obj["category_id"] for obj in anno]
78
+ classes = [self.json_category_id_to_contiguous_id[c] for c in classes]
79
+ classes = torch.tensor(classes)
80
+ target.add_field("labels", classes)
81
+
82
+ masks = [obj["segmentation"] for obj in anno]
83
+ masks = SegmentationMask(masks, img.size, mode='poly')
84
+ target.add_field("masks", masks)
85
+
86
+ if anno and "keypoints" in anno[0]:
87
+ keypoints = [obj["keypoints"] for obj in anno]
88
+ keypoints = PersonKeypoints(keypoints, img.size)
89
+ target.add_field("keypoints", keypoints)
90
+
91
+ target = target.clip_to_image(remove_empty=True)
92
+
93
+ if self.transforms is not None:
94
+ img, target = self.transforms(img, target)
95
+
96
+ return img, target, idx
97
+
98
+ def get_img_info(self, index):
99
+ img_id = self.id_to_img_map[index]
100
+ img_data = self.coco.imgs[img_id]
101
+ return img_data
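`COCODataset` can also be constructed directly, outside of `build_dataset`, which is handy for inspecting targets. A hedged sketch; the annotation and image paths are placeholders, and the printed fields assume the usual `BoxList` interface (`.bbox`, `get_field`):

```python
from maskrcnn_benchmark.data.datasets import COCODataset

# Placeholder paths: point them at any dataset in COCO json format.
dataset = COCODataset(
    ann_file="datasets/my_dataset/annotations/train.json",
    root="datasets/my_dataset/train_images",
    remove_images_without_annotations=True,
    transforms=None,
)
img, target, idx = dataset[0]           # PIL image, BoxList target, dataset index
print(len(dataset), target.bbox.shape)  # boxes are in xyxy mode after convert()
print(target.get_field("labels")[:5])   # contiguous category ids starting at 1
```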
maskrcnn_benchmark/data/datasets/concat_dataset.py ADDED
@@ -0,0 +1,23 @@
1
+ # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
2
+ import bisect
3
+
4
+ from torch.utils.data.dataset import ConcatDataset as _ConcatDataset
5
+
6
+
7
+ class ConcatDataset(_ConcatDataset):
8
+ """
9
+ Same as torch.utils.data.dataset.ConcatDataset, but exposes an extra
10
+ method for querying the sizes of the images
11
+ """
12
+
13
+ def get_idxs(self, idx):
14
+ dataset_idx = bisect.bisect_right(self.cumulative_sizes, idx)
15
+ if dataset_idx == 0:
16
+ sample_idx = idx
17
+ else:
18
+ sample_idx = idx - self.cumulative_sizes[dataset_idx - 1]
19
+ return dataset_idx, sample_idx
20
+
21
+ def get_img_info(self, idx):
22
+ dataset_idx, sample_idx = self.get_idxs(idx)
23
+ return self.datasets[dataset_idx].get_img_info(sample_idx)
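`get_img_info` is the reason this subclass exists: the aspect-ratio grouping in `build.py` needs per-image sizes even when several datasets are concatenated. A self-contained sketch with dummy datasets (placeholders for real ones such as `COCODataset`):

```python
import torch.utils.data
from maskrcnn_benchmark.data.datasets import ConcatDataset


class _DummyDataset(torch.utils.data.Dataset):
    """Stand-in dataset exposing the get_img_info protocol used above."""

    def __init__(self, n, height, width):
        self.n, self.height, self.width = n, height, width

    def __len__(self):
        return self.n

    def __getitem__(self, i):
        return i

    def get_img_info(self, i):
        return {"height": self.height, "width": self.width}


combined = ConcatDataset([_DummyDataset(10, 480, 640), _DummyDataset(5, 600, 800)])
print(combined.get_img_info(3))   # index 3  -> first dataset            -> {'height': 480, 'width': 640}
print(combined.get_img_info(12))  # index 12 -> second dataset, sample 2 -> {'height': 600, 'width': 800}
```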