First model version

This view is limited to 50 files because it contains too many changes.
- LICENSE +25 -0
- configs/ctw/r50_baseline.yaml +70 -0
- configs/ic/r50_baseline.yaml +75 -0
- demo/1.jpg +0 -0
- demo/2.jpg +0 -0
- demo/example1.jpg +0 -0
- demo/example_results.jpg +0 -0
- maskrcnn_benchmark/__init__.py +1 -0
- maskrcnn_benchmark/__pycache__/__init__.cpython-37.pyc +0 -0
- maskrcnn_benchmark/config/__init__.py +2 -0
- maskrcnn_benchmark/config/__pycache__/__init__.cpython-37.pyc +0 -0
- maskrcnn_benchmark/config/__pycache__/defaults.cpython-37.pyc +0 -0
- maskrcnn_benchmark/config/__pycache__/paths_catalog.cpython-37.pyc +0 -0
- maskrcnn_benchmark/config/defaults.py +471 -0
- maskrcnn_benchmark/config/paths_catalog.py +120 -0
- maskrcnn_benchmark/csrc/ROIAlign.h +46 -0
- maskrcnn_benchmark/csrc/ROIPool.h +48 -0
- maskrcnn_benchmark/csrc/SigmoidFocalLoss.h +41 -0
- maskrcnn_benchmark/csrc/cpu/ROIAlign_cpu.cpp +257 -0
- maskrcnn_benchmark/csrc/cpu/dcn_v2_cpu.cpp +74 -0
- maskrcnn_benchmark/csrc/cpu/nms_cpu.cpp +75 -0
- maskrcnn_benchmark/csrc/cpu/vision.h +73 -0
- maskrcnn_benchmark/csrc/cuda/ROIAlign_cuda.cu +346 -0
- maskrcnn_benchmark/csrc/cuda/ROIPool_cuda.cu +202 -0
- maskrcnn_benchmark/csrc/cuda/SigmoidFocalLoss_cuda.cu +188 -0
- maskrcnn_benchmark/csrc/cuda/dcn_v2_cuda.cu +335 -0
- maskrcnn_benchmark/csrc/cuda/dcn_v2_im2col_cuda.cu +402 -0
- maskrcnn_benchmark/csrc/cuda/dcn_v2_im2col_cuda.h +101 -0
- maskrcnn_benchmark/csrc/cuda/dcn_v2_psroi_pooling_cuda.cu +419 -0
- maskrcnn_benchmark/csrc/cuda/nms.cu +131 -0
- maskrcnn_benchmark/csrc/cuda/vision.h +121 -0
- maskrcnn_benchmark/csrc/dcn_v2.h +145 -0
- maskrcnn_benchmark/csrc/nms.h +28 -0
- maskrcnn_benchmark/csrc/vision.cpp +21 -0
- maskrcnn_benchmark/data/README.md +90 -0
- maskrcnn_benchmark/data/__init__.py +2 -0
- maskrcnn_benchmark/data/__pycache__/__init__.cpython-37.pyc +0 -0
- maskrcnn_benchmark/data/__pycache__/build.cpython-37.pyc +0 -0
- maskrcnn_benchmark/data/__pycache__/collate_batch.cpython-37.pyc +0 -0
- maskrcnn_benchmark/data/build.py +176 -0
- maskrcnn_benchmark/data/collate_batch.py +20 -0
- maskrcnn_benchmark/data/datasets/__init__.py +8 -0
- maskrcnn_benchmark/data/datasets/__pycache__/__init__.cpython-37.pyc +0 -0
- maskrcnn_benchmark/data/datasets/__pycache__/char_dataset.cpython-37.pyc +0 -0
- maskrcnn_benchmark/data/datasets/__pycache__/coco.cpython-37.pyc +0 -0
- maskrcnn_benchmark/data/datasets/__pycache__/concat_dataset.cpython-37.pyc +0 -0
- maskrcnn_benchmark/data/datasets/__pycache__/voc.cpython-37.pyc +0 -0
- maskrcnn_benchmark/data/datasets/__pycache__/word_dataset.cpython-37.pyc +0 -0
- maskrcnn_benchmark/data/datasets/coco.py +101 -0
- maskrcnn_benchmark/data/datasets/concat_dataset.py +23 -0
LICENSE
ADDED
@@ -0,0 +1,25 @@
FCOS for non-commercial purposes

Copyright (c) 2019 the authors
All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:

* Redistributions of source code must retain the above copyright notice, this
  list of conditions and the following disclaimer.

* Redistributions in binary form must reproduce the above copyright notice,
  this list of conditions and the following disclaimer in the documentation
  and/or other materials provided with the distribution.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
configs/ctw/r50_baseline.yaml
ADDED
@@ -0,0 +1,70 @@
OUTPUT_DIR: "./output/ctw"
MODEL:
  META_ARCHITECTURE: "GeneralizedRCNN"
  WEIGHT: "catalog://ImageNetPretrained/MSRA/R-50"
  BACKBONE:
    CONV_BODY: "R-50-FPN"
  RESNETS:
    BACKBONE_OUT_CHANNELS: 256
  RPN:
    USE_FPN: True
    ANCHOR_STRIDE: (4, 8, 16, 32, 64)
    ASPECT_RATIOS: (0.25, 0.5, 1.0, 2.0, 4.0)
  ROI_HEADS:
    USE_FPN: True
    SCORE_THRESH: 0.85
    NMS: 0.3
  ROI_BOX_HEAD:
    DEFORMABLE_POOLING: False
    POOLER_RESOLUTION: 7
    POOLER_SCALES: (0.25, 0.125, 0.0625, 0.03125)
    POOLER_SAMPLING_RATIO: 2
    FEATURE_EXTRACTOR: "FPN2MLPFeatureExtractor"
    PREDICTOR: "FPNPredictor"
    NUM_CLASSES: 2
    CLASS_WEIGHT: 1.0
  ## Boundary
  BOUNDARY_ON: True
  ROI_BOUNDARY_HEAD:
    DEFORMABLE_POOLING: True
    FEATURE_EXTRACTOR: "BoundaryRCNNFPNFeatureExtractor"
    POOLER_RESOLUTION: 14
    POOLER_SCALES: (0.25, 0.125, 0.0625, 0.03125)
    POOLER_SAMPLING_RATIO: 2
    PREDICTOR: "BoundaryRCNNC4Predictor"
    RESOLUTION: 48
    SHARE_BOX_FEATURE_EXTRACTOR: False
    BO_WEIGHT: 0.1
    Loss_balance: 1.1

PROCESS:
  PNMS: True
  NMS_THRESH: 0.25
DATASETS:
  TRAIN: ("CTW1500_train",)
  TEST: ("CTW1500_test",)
  Test_Visual: True
DATALOADER:
  SIZE_DIVISIBILITY: 32
SOLVER:
  BASE_LR: 0.0025
  BIAS_LR_FACTOR: 2
  WEIGHT_DECAY: 0.0001
  STEPS: (30000, 40000)
  MAX_ITER: 45000
  IMS_PER_BATCH: 1
  CHECKPOINT_PERIOD: 1000
INPUT:
  MIN_SIZE_TRAIN: (400,600,720,1000,1200)
  MAX_SIZE_TRAIN: 2000
  MIN_SIZE_TEST: 720
  MAX_SIZE_TEST: 1280
  CROP_PROB_TRAIN: 1.0
  ROTATE_PROB_TRAIN: 0.0
  ROTATE_DEGREE: (0,30,60,90,210,150,180,210,240,270,300,330,360)

TEST:
  IMS_PER_BATCH: 1
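This YAML only overrides fields that are declared in maskrcnn_benchmark/config/defaults.py, via the yacs CfgNode exported as cfg in maskrcnn_benchmark/config/__init__.py (both later in this diff). A minimal sketch of how such a config is typically consumed; the driver script itself is not part of this diff, so treat the entry point as an assumption while merge_from_file/merge_from_list/freeze are standard yacs calls:

    # sketch only: load the defaults, apply the YAML overrides, then lock the config
    from maskrcnn_benchmark.config import cfg

    cfg.merge_from_file("configs/ctw/r50_baseline.yaml")   # YAML values override defaults.py
    cfg.merge_from_list(["SOLVER.IMS_PER_BATCH", 2])       # optional command-line style overrides
    cfg.freeze()                                           # make the config immutable for the run
    print(cfg.MODEL.ROI_HEADS.SCORE_THRESH)                # 0.85, from the YAML above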
configs/ic/r50_baseline.yaml
ADDED
@@ -0,0 +1,75 @@
OUTPUT_DIR: "./output/ic15"
MODEL:
  META_ARCHITECTURE: "GeneralizedRCNN"
  WEIGHT: catalog://ImageNetPretrained/MSRA/R-50
  BACKBONE:
    CONV_BODY: "R-50-FPN"
  RESNETS:
    BACKBONE_OUT_CHANNELS: 256
  RPN:
    USE_FPN: True
    ANCHOR_STRIDE: (4, 8, 16, 32, 64)
    ASPECT_RATIOS: (0.25, 0.5, 1.0, 2.0, 4.0)
  ROI_HEADS:
    USE_FPN: True
    SCORE_THRESH: 0.52 # ic15
    NMS: 0.89
  ROI_BOX_HEAD:
    DEFORMABLE_POOLING: False
    POOLER_RESOLUTION: 7
    POOLER_SCALES: (0.25, 0.125, 0.0625, 0.03125)
    POOLER_SAMPLING_RATIO: 2
    FEATURE_EXTRACTOR: "FPN2MLPFeatureExtractor"
    PREDICTOR: "FPNPredictor"
    NUM_CLASSES: 2
    CLASS_WEIGHT: 1.0
  ## Boundary
  BOUNDARY_ON: True
  ROI_BOUNDARY_HEAD:
    DEFORMABLE_POOLING: False
    FEATURE_EXTRACTOR: "BoundaryRCNNFPNFeatureExtractor"
    POOLER_RESOLUTION: 14
    POOLER_SCALES: (0.25, 0.125, 0.0625, 0.03125)
    POOLER_SAMPLING_RATIO: 2
    PREDICTOR: "BoundaryRCNNC4Predictor"
    RESOLUTION: 48
    SHARE_BOX_FEATURE_EXTRACTOR: False
    BO_WEIGHT: 0.1
    Loss_balance: 1.0

PROCESS:
  PNMS: True
  NMS_THRESH: 0.25
DATASETS:
  TRAIN: ("ic15_train",)
  TEST: ("ic15_test",)
  Test_Visual: True
DATALOADER:
  SIZE_DIVISIBILITY: 32
SOLVER:
  BASE_LR: 0.00025
  BIAS_LR_FACTOR: 2
  WEIGHT_DECAY: 0.0001
  # STEPS: (120000, 160000)
  STEPS: (5000, 10000) # fine-tune
  # MAX_ITER: 180000
  MAX_ITER: 190500 # fine-tune
  IMS_PER_BATCH: 1
  CHECKPOINT_PERIOD: 5000
INPUT:
  MIN_SIZE_TRAIN: (400,600,720,1000,1200)
  MAX_SIZE_TRAIN: 2000
  MIN_SIZE_TEST: 1200
  MAX_SIZE_TEST: 2000

  CROP_PROB_TRAIN: 1.0
  ROTATE_PROB_TRAIN: 0.3 # fine-tune
  # ROTATE_PROB_TRAIN: 1.0
  # ROTATE_DEGREE: (0,30,60,90,210,150,180,210,240,270,300,330,360)
  ROTATE_DEGREE: (10,) # fine-tune

TEST:
  IMS_PER_BATCH: 1
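The SOLVER block above describes a stepped learning-rate schedule: the rate starts at BASE_LR and is multiplied by GAMMA (0.1 in defaults.py below) at each milestone in STEPS. A small sketch of the resulting rate, ignoring the warmup phase that the defaults also configure:

    # sketch of the stepped LR schedule implied by SOLVER (warmup ignored);
    # GAMMA comes from defaults.py, BASE_LR/STEPS from the fine-tune YAML above
    def lr_at(iteration, base_lr=0.00025, gamma=0.1, steps=(5000, 10000)):
        """Return the learning rate in effect after `iteration` updates."""
        num_decays = sum(iteration >= s for s in steps)
        return base_lr * gamma ** num_decays

    print(lr_at(0))      # 0.00025
    print(lr_at(7000))   # 2.5e-05, after the first milestone
    print(lr_at(12000))  # 2.5e-06, after the second milestone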
demo/1.jpg
ADDED
demo/2.jpg
ADDED
demo/example1.jpg
ADDED
demo/example_results.jpg
ADDED
maskrcnn_benchmark/__init__.py
ADDED
@@ -0,0 +1 @@
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
maskrcnn_benchmark/__pycache__/__init__.cpython-37.pyc
ADDED
Binary file (145 Bytes).
maskrcnn_benchmark/config/__init__.py
ADDED
@@ -0,0 +1,2 @@
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
from .defaults import _C as cfg
maskrcnn_benchmark/config/__pycache__/__init__.cpython-37.pyc
ADDED
Binary file (192 Bytes).
maskrcnn_benchmark/config/__pycache__/defaults.cpython-37.pyc
ADDED
Binary file (5.64 kB).
maskrcnn_benchmark/config/__pycache__/paths_catalog.cpython-37.pyc
ADDED
Binary file (3.74 kB).
maskrcnn_benchmark/config/defaults.py
ADDED
@@ -0,0 +1,471 @@
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
import os

from yacs.config import CfgNode as CN

# -----------------------------------------------------------------------------
# Convention about Training / Test specific parameters
# -----------------------------------------------------------------------------
# Whenever an argument can be either used for training or for testing, the
# corresponding name will be post-fixed by a _TRAIN for a training parameter,
# or _TEST for a test-specific parameter.
# For example, the number of images during training will be
# IMAGES_PER_BATCH_TRAIN, while the number of images for testing will be
# IMAGES_PER_BATCH_TEST

# -----------------------------------------------------------------------------
# Config definition
# -----------------------------------------------------------------------------

_C = CN()

_C.MODEL = CN()
_C.MODEL.RPN_ONLY = False
_C.MODEL.MASK_ON = False
_C.MODEL.FCOS_ON = False
_C.MODEL.KE_ON = False
_C.MODEL.BOUNDARY_ON = False
_C.MODEL.MSR_ON = False
_C.MODEL.RETINANET_ON = False
_C.MODEL.KEYPOINT_ON = False
_C.MODEL.DEVICE = "cuda"
_C.MODEL.META_ARCHITECTURE = "GeneralizedRCNN"
_C.MODEL.CLS_AGNOSTIC_BBOX_REG = False

# If the WEIGHT starts with a catalog://, like :R-50, the code will look for
# the path in paths_catalog. Else, it will use it as the specified absolute
# path
_C.MODEL.WEIGHT = ""

# -----------------------------------------------------------------------------
# INPUT
# -----------------------------------------------------------------------------
_C.INPUT = CN()
# Size of the smallest side of the image during training
_C.INPUT.MIN_SIZE_TRAIN = (800,)  # (800,)
# The range of the smallest side for multi-scale training
_C.INPUT.MIN_SIZE_RANGE_TRAIN = (-1, -1)  # -1 means disabled and it will use MIN_SIZE_TRAIN
# Maximum size of the side of the image during training
_C.INPUT.MAX_SIZE_TRAIN = 1333
# Size of the smallest side of the image during testing
_C.INPUT.MIN_SIZE_TEST = 1000
# Maximum size of the side of the image during testing
_C.INPUT.MAX_SIZE_TEST = 1333
# Values to be used for image normalization
_C.INPUT.PIXEL_MEAN = [102.9801, 115.9465, 122.7717]
# Values to be used for image normalization
_C.INPUT.PIXEL_STD = [1., 1., 1.]
# Convert image to BGR format (for Caffe2 models), in range 0-255
_C.INPUT.TO_BGR255 = True
_C.INPUT.CROP_PROB_TRAIN = 1.0
_C.INPUT.ROTATE_PROB_TRAIN = 0.3
_C.INPUT.ROTATE_DEGREE = (0, 15, -15, 45, -45, 90, -90)
# _C.INPUT.ROTATE_DEGREE = 15

# -----------------------------------------------------------------------------
# Dataset
# -----------------------------------------------------------------------------
_C.DATASETS = CN()
# List of the dataset names for training, as present in paths_catalog.py
_C.DATASETS.TRAIN = ()
# List of the dataset names for testing, as present in paths_catalog.py
_C.DATASETS.TEST = ()
_C.DATASETS.Test_Visual = False

# -----------------------------------------------------------------------------
# DataLoader
# -----------------------------------------------------------------------------
_C.DATALOADER = CN()
# Number of data loading threads
_C.DATALOADER.NUM_WORKERS = 4
# If > 0, this enforces that each collated batch should have a size divisible
# by SIZE_DIVISIBILITY
_C.DATALOADER.SIZE_DIVISIBILITY = 0
# If True, each batch should contain only images for which the aspect ratio
# is compatible. This groups portrait images together, and landscape images
# are not batched with portrait images.
_C.DATALOADER.ASPECT_RATIO_GROUPING = True

# ---------------------------------------------------------------------------- #
# Backbone options
# ---------------------------------------------------------------------------- #
_C.MODEL.BACKBONE = CN()

# The backbone conv body to use
# The string must match a function that is imported in modeling.model_builder
# (e.g., 'FPN.add_fpn_ResNet101_conv5_body' to specify a ResNet-101-FPN
# backbone)
_C.MODEL.BACKBONE.CONV_BODY = "R-50-C4"

# Add StopGrad at a specified stage so the bottom layers are frozen
_C.MODEL.BACKBONE.FREEZE_CONV_BODY_AT = 2
# GN for backbone
_C.MODEL.BACKBONE.USE_GN = False

# ---------------------------------------------------------------------------- #
# FPN options
# ---------------------------------------------------------------------------- #
_C.MODEL.FPN = CN()
_C.MODEL.FPN.USE_GN = False
_C.MODEL.FPN.USE_RELU = False
_C.MODEL.FPN.USE_DEFORMABLE = False

# ---------------------------------------------------------------------------- #
# Group Norm options
# ---------------------------------------------------------------------------- #
_C.MODEL.GROUP_NORM = CN()
# Number of dimensions per group in GroupNorm (-1 if using NUM_GROUPS)
_C.MODEL.GROUP_NORM.DIM_PER_GP = -1
# Number of groups in GroupNorm (-1 if using DIM_PER_GP)
_C.MODEL.GROUP_NORM.NUM_GROUPS = 32
# GroupNorm's small constant in the denominator
_C.MODEL.GROUP_NORM.EPSILON = 1e-5

# ---------------------------------------------------------------------------- #
# RPN options
# ---------------------------------------------------------------------------- #
_C.MODEL.RPN = CN()
_C.MODEL.RPN.USE_FPN = False
# Base RPN anchor sizes given in absolute pixels w.r.t. the scaled network input
_C.MODEL.RPN.ANCHOR_SIZES = (32, 64, 128, 256, 512)
# Stride of the feature map that RPN is attached.
# For FPN, number of strides should match number of scales
_C.MODEL.RPN.ANCHOR_STRIDE = (16,)
# RPN anchor aspect ratios
_C.MODEL.RPN.ASPECT_RATIOS = (0.5, 1.0, 2.0)
# Remove RPN anchors that go outside the image by RPN_STRADDLE_THRESH pixels
# Set to -1 or a large value, e.g. 100000, to disable pruning anchors
_C.MODEL.RPN.STRADDLE_THRESH = 0
# Minimum overlap required between an anchor and ground-truth box for the
# (anchor, gt box) pair to be a positive example (IoU >= FG_IOU_THRESHOLD
# ==> positive RPN example)
_C.MODEL.RPN.FG_IOU_THRESHOLD = 0.7
# Maximum overlap allowed between an anchor and ground-truth box for the
# (anchor, gt box) pair to be a negative example (IoU < BG_IOU_THRESHOLD
# ==> negative RPN example)
_C.MODEL.RPN.BG_IOU_THRESHOLD = 0.3
# Total number of RPN examples per image
_C.MODEL.RPN.BATCH_SIZE_PER_IMAGE = 256
# Target fraction of foreground (positive) examples per RPN minibatch
_C.MODEL.RPN.POSITIVE_FRACTION = 0.5
# Number of top scoring RPN proposals to keep before applying NMS
# When FPN is used, this is *per FPN level* (not total)
_C.MODEL.RPN.PRE_NMS_TOP_N_TRAIN = 12000
_C.MODEL.RPN.PRE_NMS_TOP_N_TEST = 6000
# Number of top scoring RPN proposals to keep after applying NMS
_C.MODEL.RPN.POST_NMS_TOP_N_TRAIN = 2000
_C.MODEL.RPN.POST_NMS_TOP_N_TEST = 1000
# NMS threshold used on RPN proposals
_C.MODEL.RPN.NMS_THRESH = 0.7
# Proposal height and width both need to be greater than RPN_MIN_SIZE
# (at the scale used during training or inference)
_C.MODEL.RPN.MIN_SIZE = 0
# Number of top scoring RPN proposals to keep after combining proposals from
# all FPN levels
_C.MODEL.RPN.FPN_POST_NMS_TOP_N_TRAIN = 2000
_C.MODEL.RPN.FPN_POST_NMS_TOP_N_TEST = 2000
# Custom rpn head, empty to use default conv or separable conv
_C.MODEL.RPN.RPN_HEAD = "SingleConvRPNHead_1"

# ---------------------------------------------------------------------------- #
# ROI HEADS options
# ---------------------------------------------------------------------------- #
_C.MODEL.ROI_HEADS = CN()
_C.MODEL.ROI_HEADS.USE_FPN = False
# Overlap threshold for an RoI to be considered foreground (if >= FG_IOU_THRESHOLD)
_C.MODEL.ROI_HEADS.FG_IOU_THRESHOLD = 0.5
# Overlap threshold for an RoI to be considered background
# (class = 0 if overlap in [0, BG_IOU_THRESHOLD))
_C.MODEL.ROI_HEADS.BG_IOU_THRESHOLD = 0.5
# Default weights on (dx, dy, dw, dh) for normalizing bbox regression targets
# These are empirically chosen to approximately lead to unit variance targets
_C.MODEL.ROI_HEADS.BBOX_REG_WEIGHTS = (10., 10., 5., 5.)
# RoI minibatch size *per image* (number of regions of interest [ROIs])
# Total number of RoIs per training minibatch =
# TRAIN.BATCH_SIZE_PER_IM * TRAIN.IMS_PER_BATCH
# E.g., a common configuration is: 512 * 2 * 8 = 8192
_C.MODEL.ROI_HEADS.BATCH_SIZE_PER_IMAGE = 512
# Target fraction of RoI minibatch that is labeled foreground (i.e. class > 0)
_C.MODEL.ROI_HEADS.POSITIVE_FRACTION = 0.25

# Only used on test mode

# Minimum score threshold (assuming scores in a [0, 1] range); a value chosen to
# balance obtaining high recall with not having too many low precision
# detections that will slow down inference post processing steps (like NMS)
_C.MODEL.ROI_HEADS.SCORE_THRESH = 0.05
# Overlap threshold used for non-maximum suppression (suppress boxes with
# IoU >= this threshold)
_C.MODEL.ROI_HEADS.NMS = 0.5
# Maximum number of detections to return per image (100 is based on the limit established for the COCO dataset)
_C.MODEL.ROI_HEADS.DETECTIONS_PER_IMG = 100

_C.MODEL.ROI_BOX_HEAD = CN()
_C.MODEL.ROI_BOX_HEAD.FEATURE_EXTRACTOR = "ResNet50Conv5ROIFeatureExtractor"
_C.MODEL.ROI_BOX_HEAD.PREDICTOR = "FastRCNNPredictor"
_C.MODEL.ROI_BOX_HEAD.POOLER_RESOLUTION = 14
_C.MODEL.ROI_BOX_HEAD.POOLER_SAMPLING_RATIO = 0
_C.MODEL.ROI_BOX_HEAD.POOLER_SCALES = (1.0 / 16,)
_C.MODEL.ROI_BOX_HEAD.NUM_CLASSES = 81
# Hidden layer dimension when using an MLP for the RoI box head
_C.MODEL.ROI_BOX_HEAD.MLP_HEAD_DIM = 1024
# GN
_C.MODEL.ROI_BOX_HEAD.USE_GN = False
# Dilation
_C.MODEL.ROI_BOX_HEAD.DILATION = 1
_C.MODEL.ROI_BOX_HEAD.CONV_HEAD_DIM = 256
_C.MODEL.ROI_BOX_HEAD.NUM_STACKED_CONVS = 4
_C.MODEL.ROI_BOX_HEAD.CLASS_WEIGHT = 0.1
_C.MODEL.ROI_BOX_HEAD.DEFORMABLE_POOLING = False

_C.MODEL.ROI_MASK_HEAD = CN()
# Whether or not resize and translate masks to the input image.
_C.MODEL.ROI_MASK_HEAD.POSTPROCESS_MASKS = False
_C.MODEL.ROI_MASK_HEAD.POSTPROCESS_MASKS_THRESHOLD = 0.5
_C.MODEL.ROI_MASK_HEAD.DILATION = 1
_C.MODEL.ROI_MASK_HEAD.USE_GN = False

# Boundary edge
_C.MODEL.ROI_BOUNDARY_HEAD = CN()
_C.MODEL.ROI_BOUNDARY_HEAD.DEFORMABLE_POOLING = False
_C.MODEL.ROI_BOUNDARY_HEAD.FEATURE_EXTRACTOR = "ResNet50Conv5ROIFeatureExtractor"
_C.MODEL.ROI_BOUNDARY_HEAD.POOLER_RESOLUTION = 14
_C.MODEL.ROI_BOUNDARY_HEAD.POOLER_SCALES = (1.0 / 16,)
_C.MODEL.ROI_BOUNDARY_HEAD.POOLER_SAMPLING_RATIO = 0
_C.MODEL.ROI_BOUNDARY_HEAD.CONV_LAYERS = (256, 256, 256, 256)
_C.MODEL.ROI_BOUNDARY_HEAD.PREDICTOR = "KERCNNC4Predictor"
_C.MODEL.ROI_BOUNDARY_HEAD.RESOLUTION = 14
_C.MODEL.ROI_BOUNDARY_HEAD.SHARE_BOX_FEATURE_EXTRACTOR = True
_C.MODEL.ROI_BOUNDARY_HEAD.BO_WEIGHT = 1.0
_C.MODEL.ROI_BOUNDARY_HEAD.Loss_balance = 1.2

# ---------------------------------------------------------------------------- #
# ResNe[X]t options (ResNets = {ResNet, ResNeXt})
# Note that parts of a resnet may be used for both the backbone and the head
# These options apply to both
# ---------------------------------------------------------------------------- #
_C.MODEL.RESNETS = CN()

# Number of groups to use; 1 ==> ResNet; > 1 ==> ResNeXt
_C.MODEL.RESNETS.NUM_GROUPS = 1

# Baseline width of each group
_C.MODEL.RESNETS.WIDTH_PER_GROUP = 64

# Place the stride 2 conv on the 1x1 filter
# Use True only for the original MSRA ResNet; use False for C2 and Torch models
_C.MODEL.RESNETS.STRIDE_IN_1X1 = True

# Residual transformation function
_C.MODEL.RESNETS.TRANS_FUNC = "BottleneckWithFixedBatchNorm"
_C.MODEL.RESNETS.DEF_FUNC = "DeformableConvWithFixedBatchNorm"
# ResNet's stem function (conv1 and pool1)
_C.MODEL.RESNETS.STEM_FUNC = "StemWithFixedBatchNorm"
_C.MODEL.RESNETS.DEF_START_MODULE = "NA"
_C.MODEL.RESNETS.DEFORM_POOLING = False

# Apply dilation in stage "res5"
_C.MODEL.RESNETS.RES5_DILATION = 1

_C.MODEL.RESNETS.BACKBONE_OUT_CHANNELS = 256 * 4
_C.MODEL.RESNETS.RES2_OUT_CHANNELS = 256
_C.MODEL.RESNETS.STEM_OUT_CHANNELS = 64

# ---------------------------------------------------------------------------- #
# FCOS Options
# ---------------------------------------------------------------------------- #
_C.MODEL.FCOS = CN()
_C.MODEL.FCOS.NUM_CLASSES = 81  # the number of classes including background
_C.MODEL.FCOS.FPN_STRIDES = [8, 16, 32, 64, 128]
_C.MODEL.FCOS.PRIOR_PROB = 0.01
_C.MODEL.FCOS.INFERENCE_TH = 0.05
_C.MODEL.FCOS.NMS_TH = 0.4
_C.MODEL.FCOS.PRE_NMS_TOP_N = 1000

# Focal loss parameter: alpha
_C.MODEL.FCOS.LOSS_ALPHA = 0.25
# Focal loss parameter: gamma
_C.MODEL.FCOS.LOSS_GAMMA = 2.0
_C.MODEL.FCOS.SIZES_OF_INTEREST = [64, 128, 256, 512]

# the number of convolutions used in the cls and bbox tower
_C.MODEL.FCOS.NUM_CONVS = 4

# ---------------------------------------------------------------------------- #
# RetinaNet Options (Follow the Detectron version)
# ---------------------------------------------------------------------------- #
_C.MODEL.RETINANET = CN()

# This is the number of foreground classes and background.
_C.MODEL.RETINANET.NUM_CLASSES = 81

# Anchor aspect ratios to use
_C.MODEL.RETINANET.ANCHOR_SIZES = (32, 64, 128, 256, 512)
_C.MODEL.RETINANET.ASPECT_RATIOS = (0.5, 1.0, 2.0)
_C.MODEL.RETINANET.ANCHOR_STRIDES = (8, 16, 32, 64, 128)
_C.MODEL.RETINANET.STRADDLE_THRESH = 0

# Anchor scales per octave
_C.MODEL.RETINANET.OCTAVE = 2.0
_C.MODEL.RETINANET.SCALES_PER_OCTAVE = 3

# Use C5 or P5 to generate P6
_C.MODEL.RETINANET.USE_C5 = True

# Convolutions to use in the cls and bbox tower
# NOTE: this doesn't include the last conv for logits
_C.MODEL.RETINANET.NUM_CONVS = 4

# Weight for bbox_regression loss
_C.MODEL.RETINANET.BBOX_REG_WEIGHT = 4.0

# Smooth L1 loss beta for bbox regression
_C.MODEL.RETINANET.BBOX_REG_BETA = 0.11

# During inference, #locs to select based on cls score before NMS is performed
# per FPN level
_C.MODEL.RETINANET.PRE_NMS_TOP_N = 1000

# IoU overlap ratio for labeling an anchor as positive
# Anchors with >= iou overlap are labeled positive
_C.MODEL.RETINANET.FG_IOU_THRESHOLD = 0.5

# IoU overlap ratio for labeling an anchor as negative
# Anchors with < iou overlap are labeled negative
_C.MODEL.RETINANET.BG_IOU_THRESHOLD = 0.4

# Focal loss parameter: alpha
_C.MODEL.RETINANET.LOSS_ALPHA = 0.25

# Focal loss parameter: gamma
_C.MODEL.RETINANET.LOSS_GAMMA = 2.0

# Prior prob for the positives at the beginning of training. This is used to set
# the bias init for the logits layer
_C.MODEL.RETINANET.PRIOR_PROB = 0.01

# Inference cls score threshold, anchors with score > INFERENCE_TH are
# considered for inference
_C.MODEL.RETINANET.INFERENCE_TH = 0.05

# NMS threshold used in RetinaNet
_C.MODEL.RETINANET.NMS_TH = 0.4

# ---------------------------------------------------------------------------- #
# FBNet options
# ---------------------------------------------------------------------------- #
_C.MODEL.FBNET = CN()
_C.MODEL.FBNET.ARCH = "default"
# custom arch
_C.MODEL.FBNET.ARCH_DEF = ""
_C.MODEL.FBNET.BN_TYPE = "bn"
_C.MODEL.FBNET.SCALE_FACTOR = 1.0
# the output channels will be divisible by WIDTH_DIVISOR
_C.MODEL.FBNET.WIDTH_DIVISOR = 1
_C.MODEL.FBNET.DW_CONV_SKIP_BN = True
_C.MODEL.FBNET.DW_CONV_SKIP_RELU = True

# > 0 scale, == 0 skip, < 0 same dimension
_C.MODEL.FBNET.DET_HEAD_LAST_SCALE = 1.0
_C.MODEL.FBNET.DET_HEAD_BLOCKS = []
# overwrite the stride for the head, 0 to use original value
_C.MODEL.FBNET.DET_HEAD_STRIDE = 0

# > 0 scale, == 0 skip, < 0 same dimension
_C.MODEL.FBNET.KPTS_HEAD_LAST_SCALE = 0.0
_C.MODEL.FBNET.KPTS_HEAD_BLOCKS = []
# overwrite the stride for the head, 0 to use original value
_C.MODEL.FBNET.KPTS_HEAD_STRIDE = 0

# > 0 scale, == 0 skip, < 0 same dimension
_C.MODEL.FBNET.MASK_HEAD_LAST_SCALE = 0.0
_C.MODEL.FBNET.MASK_HEAD_BLOCKS = []
# overwrite the stride for the head, 0 to use original value
_C.MODEL.FBNET.MASK_HEAD_STRIDE = 0

# 0 to use all blocks defined in arch_def
_C.MODEL.FBNET.RPN_HEAD_BLOCKS = 0
_C.MODEL.FBNET.RPN_BN_TYPE = ""

# ---------------------------------------------------------------------------- #
# Solver
# ---------------------------------------------------------------------------- #
_C.SOLVER = CN()
_C.SOLVER.MAX_ITER = 40000

_C.SOLVER.BASE_LR = 0.001
_C.SOLVER.BIAS_LR_FACTOR = 2

_C.SOLVER.MOMENTUM = 0.9

_C.SOLVER.WEIGHT_DECAY = 0.0005
_C.SOLVER.WEIGHT_DECAY_BIAS = 0

_C.SOLVER.GAMMA = 0.1
_C.SOLVER.STEPS = (30000,)

_C.SOLVER.WARMUP_FACTOR = 1.0 / 3
_C.SOLVER.WARMUP_ITERS = 500
_C.SOLVER.WARMUP_METHOD = "linear"

_C.SOLVER.CHECKPOINT_PERIOD = 2500

# Number of images per batch
# This is global, so if we have 8 GPUs and IMS_PER_BATCH = 16, each GPU will
# see 2 images per batch
_C.SOLVER.IMS_PER_BATCH = 4

# ---------------------------------------------------------------------------- #
# Specific test options
# ---------------------------------------------------------------------------- #
_C.TEST = CN()
_C.TEST.EXPECTED_RESULTS = []
_C.TEST.EXPECTED_RESULTS_SIGMA_TOL = 4
# Number of images per batch
# This is global, so if we have 8 GPUs and IMS_PER_BATCH = 16, each GPU will
# see 2 images per batch
_C.TEST.IMS_PER_BATCH = 16
# Number of detections per image
_C.TEST.DETECTIONS_PER_IMG = 100

# ---------------------------------------------------------------------------- #
# Misc options
# ---------------------------------------------------------------------------- #
_C.OUTPUT_DIR = "./1"
_C.IS_LOAD_OPTIMIZER = True
_C.IS_LOAD_SCHEDULER = True
_C.PROCESS = CN()
_C.PROCESS.PNMS = False
_C.PROCESS.NMS_THRESH = 0.4

_C.PATHS_CATALOG = os.path.join(os.path.dirname(__file__), "paths_catalog.py")
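Any key a YAML config can override must first be declared on _C above with a default value; yacs rejects unknown keys at merge time. A minimal sketch of extending and reading this schema (NEW_OPTION is a hypothetical key used purely for illustration):

    # sketch only: declare a new option, then work on an independent copy
    from maskrcnn_benchmark.config.defaults import _C

    _C.MODEL.ROI_BOUNDARY_HEAD.NEW_OPTION = 0.5   # hypothetical key with its default
    cfg = _C.clone()                              # independent copy to experiment with
    cfg.merge_from_list(["MODEL.BOUNDARY_ON", True, "SOLVER.BASE_LR", 0.0025])
    assert cfg.MODEL.ROI_BOUNDARY_HEAD.BO_WEIGHT == 1.0   # default declared above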
maskrcnn_benchmark/config/paths_catalog.py
ADDED
@@ -0,0 +1,120 @@
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
"""Centralized catalog of paths."""

import os

class DatasetCatalog(object):
    DATA_DIR = "/home/zhangbq/ws/ct/dataset/"
    DATASETS = {
        "ic15_train": (
            "ic15/ic15_train_images",
            "ic15/annotations/ic15_train.json"
        ),
        "ic15_test": (
            "ic15/ic15_test_images",
            "ic15/annotations/ic15_test.json"
        ),
        "CTW1500_train": (
            "ctw/ctw_train_images",
            "ctw/annotations/ctw_train.json"
        ),
        "CTW1500_test": (
            "ctw/ctw_test_images",
            "ctw/annotations/ctw_test.json"
        )
    }

    @staticmethod
    def get(name):
        data_dir = DatasetCatalog.DATA_DIR
        attrs = DatasetCatalog.DATASETS[name]
        if "coco" in name:
            args = dict(
                root=os.path.join(data_dir, attrs["img_dir"]),
                ann_file=os.path.join(data_dir, attrs["ann_file"]),
            )
            return dict(
                factory="COCODataset",
                args=args,
            )
        elif "voc" in name:
            args = dict(
                data_dir=os.path.join(data_dir, attrs["data_dir"]),
                split=attrs["split"],
            )
            return dict(
                factory="PascalVOCDataset",
                args=args,
            )
        elif True:
            args = dict(
                root=os.path.join(data_dir, attrs[0]),
                ann_file=os.path.join(data_dir, attrs[1]),
            )
            return dict(
                factory="WordDataset",
                args=args,
            )
        raise RuntimeError("Dataset not available: {}".format(name))


class ModelCatalog(object):
    S3_C2_DETECTRON_URL = "https://dl.fbaipublicfiles.com/detectron"
    C2_IMAGENET_MODELS = {
        "MSRA/R-50": "ImageNetPretrained/MSRA/R-50.pkl",
        "MSRA/R-50-GN": "ImageNetPretrained/47261647/R-50-GN.pkl",
        "MSRA/R-101": "ImageNetPretrained/MSRA/R-101.pkl",
        "MSRA/R-101-GN": "ImageNetPretrained/47592356/R-101-GN.pkl",
        "FAIR/20171220/X-101-32x8d": "ImageNetPretrained/20171220/X-101-32x8d.pkl",
    }

    C2_DETECTRON_SUFFIX = "output/train/{}coco_2014_train%3A{}coco_2014_valminusminival/generalized_rcnn/model_final.pkl"
    C2_DETECTRON_MODELS = {
        "35857197/e2e_faster_rcnn_R-50-C4_1x": "01_33_49.iAX0mXvW",
        "35857345/e2e_faster_rcnn_R-50-FPN_1x": "01_36_30.cUF7QR7I",
        "35857890/e2e_faster_rcnn_R-101-FPN_1x": "01_38_50.sNxI7sX7",
        "36761737/e2e_faster_rcnn_X-101-32x8d-FPN_1x": "06_31_39.5MIHi1fZ",
        "35858791/e2e_mask_rcnn_R-50-C4_1x": "01_45_57.ZgkA7hPB",
        "35858933/e2e_mask_rcnn_R-50-FPN_1x": "01_48_14.DzEQe4wC",
        "35861795/e2e_mask_rcnn_R-101-FPN_1x": "02_31_37.KqyEK4tT",
        "36761843/e2e_mask_rcnn_X-101-32x8d-FPN_1x": "06_35_59.RZotkLKI",
        "37129812/e2e_mask_rcnn_X-152-32x8d-FPN-IN5k_1.44x": "09_35_36.8pzTQKYK",
        # keypoints
        "37697547/e2e_keypoint_rcnn_R-50-FPN_1x": "08_42_54.kdzV35ao"
    }

    @staticmethod
    def get(name):
        if name.startswith("Caffe2Detectron/COCO"):
            return ModelCatalog.get_c2_detectron_12_2017_baselines(name)
        if name.startswith("ImageNetPretrained"):
            return ModelCatalog.get_c2_imagenet_pretrained(name)
        raise RuntimeError("model not present in the catalog {}".format(name))

    @staticmethod
    def get_c2_imagenet_pretrained(name):
        prefix = ModelCatalog.S3_C2_DETECTRON_URL
        name = name[len("ImageNetPretrained/"):]
        name = ModelCatalog.C2_IMAGENET_MODELS[name]
        url = "/".join([prefix, name])
        return url

    @staticmethod
    def get_c2_detectron_12_2017_baselines(name):
        # Detectron C2 models are stored following the structure
        # prefix/<model_id>/2012_2017_baselines/<model_name>.yaml.<signature>/suffix
        # we use as identifiers in the catalog Caffe2Detectron/COCO/<model_id>/<model_name>
        prefix = ModelCatalog.S3_C2_DETECTRON_URL
        dataset_tag = "keypoints_" if "keypoint" in name else ""
        suffix = ModelCatalog.C2_DETECTRON_SUFFIX.format(dataset_tag, dataset_tag)
        # remove identification prefix
        name = name[len("Caffe2Detectron/COCO/"):]
        # split in <model_id> and <model_name>
        model_id, model_name = name.split("/")
        # parsing to make it match the url address from the Caffe2 models
        model_name = "{}.yaml".format(model_name)
        signature = ModelCatalog.C2_DETECTRON_MODELS[name]
        unique_name = ".".join([model_name, signature])
        url = "/".join([prefix, model_id, "12_2017_baselines", unique_name, suffix])
        return url
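Note that DATA_DIR above is a hard-coded absolute path, so it normally has to be edited for a new machine. A short sketch of how the two catalogs resolve names, using only the code shown above:

    from maskrcnn_benchmark.config.paths_catalog import DatasetCatalog, ModelCatalog

    data = DatasetCatalog.get("CTW1500_train")
    print(data["factory"])            # "WordDataset" (the non-coco/voc branch above)
    print(data["args"]["ann_file"])   # <DATA_DIR>/ctw/annotations/ctw_train.json

    # ModelCatalog turns the catalog:// WEIGHT used in the YAML configs into a URL.
    print(ModelCatalog.get("ImageNetPretrained/MSRA/R-50"))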
maskrcnn_benchmark/csrc/ROIAlign.h
ADDED
@@ -0,0 +1,46 @@
// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
#pragma once

#include "cpu/vision.h"

#ifdef WITH_CUDA
#include "cuda/vision.h"
#endif

// Interface for Python
at::Tensor ROIAlign_forward(const at::Tensor& input,
                            const at::Tensor& rois,
                            const float spatial_scale,
                            const int pooled_height,
                            const int pooled_width,
                            const int sampling_ratio) {
  if (input.type().is_cuda()) {
#ifdef WITH_CUDA
    return ROIAlign_forward_cuda(input, rois, spatial_scale, pooled_height, pooled_width, sampling_ratio);
#else
    AT_ERROR("Not compiled with GPU support");
#endif
  }
  return ROIAlign_forward_cpu(input, rois, spatial_scale, pooled_height, pooled_width, sampling_ratio);
}

at::Tensor ROIAlign_backward(const at::Tensor& grad,
                             const at::Tensor& rois,
                             const float spatial_scale,
                             const int pooled_height,
                             const int pooled_width,
                             const int batch_size,
                             const int channels,
                             const int height,
                             const int width,
                             const int sampling_ratio) {
  if (grad.type().is_cuda()) {
#ifdef WITH_CUDA
    return ROIAlign_backward_cuda(grad, rois, spatial_scale, pooled_height, pooled_width, batch_size, channels, height, width, sampling_ratio);
#else
    AT_ERROR("Not compiled with GPU support");
#endif
  }
  AT_ERROR("Not implemented on the CPU");
}
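The header above only dispatches between the CPU and CUDA kernels; on the Python side such ops are normally wrapped in a torch.autograd.Function. The following is a sketch only: the compiled extension module name (_C) and the exposed function names are assumptions, since vision.cpp is listed in this diff but its body is not shown here.

    import torch
    from torch.autograd import Function

    class _ROIAlign(Function):
        @staticmethod
        def forward(ctx, input, rois, spatial_scale, pooled_h, pooled_w, sampling_ratio):
            ctx.save_for_backward(rois)
            ctx.params = (spatial_scale, pooled_h, pooled_w, sampling_ratio, input.shape)
            from maskrcnn_benchmark import _C  # hypothetical compiled extension
            return _C.roi_align_forward(input, rois, spatial_scale, pooled_h, pooled_w, sampling_ratio)

        @staticmethod
        def backward(ctx, grad_output):
            rois, = ctx.saved_tensors
            spatial_scale, pooled_h, pooled_w, sampling_ratio, (bs, ch, h, w) = ctx.params
            from maskrcnn_benchmark import _C  # hypothetical compiled extension
            grad_input = _C.roi_align_backward(
                grad_output, rois, spatial_scale, pooled_h, pooled_w,
                bs, ch, h, w, sampling_ratio)
            # one gradient per forward argument; only the feature map needs one
            return grad_input, None, None, None, None, None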
maskrcnn_benchmark/csrc/ROIPool.h
ADDED
@@ -0,0 +1,48 @@
// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
#pragma once

#include "cpu/vision.h"

#ifdef WITH_CUDA
#include "cuda/vision.h"
#endif


std::tuple<at::Tensor, at::Tensor> ROIPool_forward(const at::Tensor& input,
                                                   const at::Tensor& rois,
                                                   const float spatial_scale,
                                                   const int pooled_height,
                                                   const int pooled_width) {
  if (input.type().is_cuda()) {
#ifdef WITH_CUDA
    return ROIPool_forward_cuda(input, rois, spatial_scale, pooled_height, pooled_width);
#else
    AT_ERROR("Not compiled with GPU support");
#endif
  }
  AT_ERROR("Not implemented on the CPU");
}

at::Tensor ROIPool_backward(const at::Tensor& grad,
                            const at::Tensor& input,
                            const at::Tensor& rois,
                            const at::Tensor& argmax,
                            const float spatial_scale,
                            const int pooled_height,
                            const int pooled_width,
                            const int batch_size,
                            const int channels,
                            const int height,
                            const int width) {
  if (grad.type().is_cuda()) {
#ifdef WITH_CUDA
    return ROIPool_backward_cuda(grad, input, rois, argmax, spatial_scale, pooled_height, pooled_width, batch_size, channels, height, width);
#else
    AT_ERROR("Not compiled with GPU support");
#endif
  }
  AT_ERROR("Not implemented on the CPU");
}
maskrcnn_benchmark/csrc/SigmoidFocalLoss.h
ADDED
@@ -0,0 +1,41 @@
#pragma once

#include "cpu/vision.h"

#ifdef WITH_CUDA
#include "cuda/vision.h"
#endif

// Interface for Python
at::Tensor SigmoidFocalLoss_forward(
    const at::Tensor& logits,
    const at::Tensor& targets,
    const int num_classes,
    const float gamma,
    const float alpha) {
  if (logits.type().is_cuda()) {
#ifdef WITH_CUDA
    return SigmoidFocalLoss_forward_cuda(logits, targets, num_classes, gamma, alpha);
#else
    AT_ERROR("Not compiled with GPU support");
#endif
  }
  AT_ERROR("Not implemented on the CPU");
}

at::Tensor SigmoidFocalLoss_backward(
    const at::Tensor& logits,
    const at::Tensor& targets,
    const at::Tensor& d_losses,
    const int num_classes,
    const float gamma,
    const float alpha) {
  if (logits.type().is_cuda()) {
#ifdef WITH_CUDA
    return SigmoidFocalLoss_backward_cuda(logits, targets, d_losses, num_classes, gamma, alpha);
#else
    AT_ERROR("Not compiled with GPU support");
#endif
  }
  AT_ERROR("Not implemented on the CPU");
}
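The gamma/alpha arguments above are the usual focal-loss parameters (the defaults.py values are 2.0 and 0.25): FL(p_t) = -alpha_t * (1 - p_t)^gamma * log(p_t). An independent plain-PyTorch sketch of the same loss, written for clarity rather than as the kernel's exact code (it takes one-hot targets, whereas the CUDA kernel works from integer class labels):

    import torch
    import torch.nn.functional as F

    def sigmoid_focal_loss(logits, targets, gamma=2.0, alpha=0.25):
        """logits: (N, C); targets: (N, C) one-hot tensor with values in {0, 1}."""
        p = torch.sigmoid(logits)
        ce = F.binary_cross_entropy_with_logits(logits, targets, reduction="none")
        p_t = p * targets + (1 - p) * (1 - targets)              # prob of the true label
        alpha_t = alpha * targets + (1 - alpha) * (1 - targets)  # class-balance weight
        return (alpha_t * (1 - p_t) ** gamma * ce).sum()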
maskrcnn_benchmark/csrc/cpu/ROIAlign_cpu.cpp
ADDED
@@ -0,0 +1,257 @@
// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
#include "cpu/vision.h"

// implementation taken from Caffe2
template <typename T>
struct PreCalc {
  int pos1;
  int pos2;
  int pos3;
  int pos4;
  T w1;
  T w2;
  T w3;
  T w4;
};

template <typename T>
void pre_calc_for_bilinear_interpolate(
    const int height,
    const int width,
    const int pooled_height,
    const int pooled_width,
    const int iy_upper,
    const int ix_upper,
    T roi_start_h,
    T roi_start_w,
    T bin_size_h,
    T bin_size_w,
    int roi_bin_grid_h,
    int roi_bin_grid_w,
    std::vector<PreCalc<T>>& pre_calc) {
  int pre_calc_index = 0;
  for (int ph = 0; ph < pooled_height; ph++) {
    for (int pw = 0; pw < pooled_width; pw++) {
      for (int iy = 0; iy < iy_upper; iy++) {
        const T yy = roi_start_h + ph * bin_size_h +
            static_cast<T>(iy + .5f) * bin_size_h /
                static_cast<T>(roi_bin_grid_h); // e.g., 0.5, 1.5
        for (int ix = 0; ix < ix_upper; ix++) {
          const T xx = roi_start_w + pw * bin_size_w +
              static_cast<T>(ix + .5f) * bin_size_w /
                  static_cast<T>(roi_bin_grid_w);

          T x = xx;
          T y = yy;
          // deal with: inverse elements are out of feature map boundary
          if (y < -1.0 || y > height || x < -1.0 || x > width) {
            // empty
            PreCalc<T> pc;
            pc.pos1 = 0;
            pc.pos2 = 0;
            pc.pos3 = 0;
            pc.pos4 = 0;
            pc.w1 = 0;
            pc.w2 = 0;
            pc.w3 = 0;
            pc.w4 = 0;
            pre_calc[pre_calc_index] = pc;
            pre_calc_index += 1;
            continue;
          }

          if (y <= 0) {
            y = 0;
          }
          if (x <= 0) {
            x = 0;
          }

          int y_low = (int)y;
          int x_low = (int)x;
          int y_high;
          int x_high;

          if (y_low >= height - 1) {
            y_high = y_low = height - 1;
            y = (T)y_low;
          } else {
            y_high = y_low + 1;
          }

          if (x_low >= width - 1) {
            x_high = x_low = width - 1;
            x = (T)x_low;
          } else {
            x_high = x_low + 1;
          }

          T ly = y - y_low;
          T lx = x - x_low;
          T hy = 1. - ly, hx = 1. - lx;
          T w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx;

          // save weights and indices
          PreCalc<T> pc;
          pc.pos1 = y_low * width + x_low;
          pc.pos2 = y_low * width + x_high;
          pc.pos3 = y_high * width + x_low;
          pc.pos4 = y_high * width + x_high;
          pc.w1 = w1;
          pc.w2 = w2;
          pc.w3 = w3;
          pc.w4 = w4;
          pre_calc[pre_calc_index] = pc;

          pre_calc_index += 1;
        }
      }
    }
  }
}

template <typename T>
void ROIAlignForward_cpu_kernel(
    const int nthreads,
    const T* bottom_data,
    const T& spatial_scale,
    const int channels,
    const int height,
    const int width,
    const int pooled_height,
    const int pooled_width,
    const int sampling_ratio,
    const T* bottom_rois,
    //int roi_cols,
    T* top_data) {
  //AT_ASSERT(roi_cols == 4 || roi_cols == 5);
  int roi_cols = 5;

  int n_rois = nthreads / channels / pooled_width / pooled_height;
  // (n, c, ph, pw) is an element in the pooled output
  // can be parallelized using omp
  // #pragma omp parallel for num_threads(32)
  for (int n = 0; n < n_rois; n++) {
    int index_n = n * channels * pooled_width * pooled_height;

    // roi could have 4 or 5 columns
    const T* offset_bottom_rois = bottom_rois + n * roi_cols;
    int roi_batch_ind = 0;
    if (roi_cols == 5) {
      roi_batch_ind = offset_bottom_rois[0];
      offset_bottom_rois++;
    }

    // Do not use rounding; this implementation detail is critical
    T roi_start_w = offset_bottom_rois[0] * spatial_scale;
    T roi_start_h = offset_bottom_rois[1] * spatial_scale;
    T roi_end_w = offset_bottom_rois[2] * spatial_scale;
    T roi_end_h = offset_bottom_rois[3] * spatial_scale;
    // T roi_start_w = round(offset_bottom_rois[0] * spatial_scale);
    // T roi_start_h = round(offset_bottom_rois[1] * spatial_scale);
    // T roi_end_w = round(offset_bottom_rois[2] * spatial_scale);
    // T roi_end_h = round(offset_bottom_rois[3] * spatial_scale);

    // Force malformed ROIs to be 1x1
    T roi_width = std::max(roi_end_w - roi_start_w, (T)1.);
    T roi_height = std::max(roi_end_h - roi_start_h, (T)1.);
    T bin_size_h = static_cast<T>(roi_height) / static_cast<T>(pooled_height);
    T bin_size_w = static_cast<T>(roi_width) / static_cast<T>(pooled_width);

    // We use roi_bin_grid to sample the grid and mimic integral
    int roi_bin_grid_h = (sampling_ratio > 0)
        ? sampling_ratio
        : ceil(roi_height / pooled_height); // e.g., = 2
    int roi_bin_grid_w =
        (sampling_ratio > 0) ? sampling_ratio : ceil(roi_width / pooled_width);

    // We do average (integral) pooling inside a bin
    const T count = roi_bin_grid_h * roi_bin_grid_w; // e.g. = 4

    // we want to precalculate indices and weights shared by all channels,
    // this is the key point of optimization
    std::vector<PreCalc<T>> pre_calc(
        roi_bin_grid_h * roi_bin_grid_w * pooled_width * pooled_height);
    pre_calc_for_bilinear_interpolate(
        height,
        width,
        pooled_height,
        pooled_width,
        roi_bin_grid_h,
        roi_bin_grid_w,
        roi_start_h,
        roi_start_w,
        bin_size_h,
        bin_size_w,
        roi_bin_grid_h,
        roi_bin_grid_w,
        pre_calc);

    for (int c = 0; c < channels; c++) {
      int index_n_c = index_n + c * pooled_width * pooled_height;
      const T* offset_bottom_data =
          bottom_data + (roi_batch_ind * channels + c) * height * width;
      int pre_calc_index = 0;

      for (int ph = 0; ph < pooled_height; ph++) {
        for (int pw = 0; pw < pooled_width; pw++) {
          int index = index_n_c + ph * pooled_width + pw;

          T output_val = 0.;
          for (int iy = 0; iy < roi_bin_grid_h; iy++) {
            for (int ix = 0; ix < roi_bin_grid_w; ix++) {
              PreCalc<T> pc = pre_calc[pre_calc_index];
              output_val += pc.w1 * offset_bottom_data[pc.pos1] +
                  pc.w2 * offset_bottom_data[pc.pos2] +
                  pc.w3 * offset_bottom_data[pc.pos3] +
                  pc.w4 * offset_bottom_data[pc.pos4];

              pre_calc_index += 1;
            }
          }
          output_val /= count;

          top_data[index] = output_val;
        } // for pw
      } // for ph
    } // for c
  } // for n
}

at::Tensor ROIAlign_forward_cpu(const at::Tensor& input,
                                const at::Tensor& rois,
                                const float spatial_scale,
                                const int pooled_height,
                                const int pooled_width,
                                const int sampling_ratio) {
  AT_ASSERTM(!input.type().is_cuda(), "input must be a CPU tensor");
  AT_ASSERTM(!rois.type().is_cuda(), "rois must be a CPU tensor");

  auto num_rois = rois.size(0);
  auto channels = input.size(1);
  auto height = input.size(2);
  auto width = input.size(3);

  auto output = at::empty({num_rois, channels, pooled_height, pooled_width}, input.options());
  auto output_size = num_rois * pooled_height * pooled_width * channels;

  if (output.numel() == 0) {
    return output;
  }

  AT_DISPATCH_FLOATING_TYPES(input.type(), "ROIAlign_forward", [&] {
    ROIAlignForward_cpu_kernel<scalar_t>(
        output_size,
        input.data<scalar_t>(),
        spatial_scale,
        channels,
        height,
        width,
        pooled_height,
        pooled_width,
        sampling_ratio,
        rois.data<scalar_t>(),
        output.data<scalar_t>());
  });
  return output;
}
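The heart of the kernel above is plain bilinear interpolation: each sample point (x, y) inside an RoI bin is written as a weighted sum of its four integer neighbours, and pre_calc_for_bilinear_interpolate caches those positions and weights once per RoI so every channel can reuse them. A standalone Python sketch of the weight computation, mirroring the clamping logic of the C++ code:

    def bilinear_weights(x, y, height, width):
        """Return [( (row, col), weight ), ...] in the same order as pos1..pos4 / w1..w4."""
        x, y = max(x, 0.0), max(y, 0.0)
        x_low, y_low = int(x), int(y)
        if x_low >= width - 1:
            x_low = x_high = width - 1
            x = float(x_low)
        else:
            x_high = x_low + 1
        if y_low >= height - 1:
            y_low = y_high = height - 1
            y = float(y_low)
        else:
            y_high = y_low + 1
        lx, ly = x - x_low, y - y_low
        hx, hy = 1.0 - lx, 1.0 - ly
        return [((y_low, x_low), hy * hx), ((y_low, x_high), hy * lx),
                ((y_high, x_low), ly * hx), ((y_high, x_high), ly * lx)]

    print(bilinear_weights(2.3, 4.6, height=8, width=8))  # four weights summing to 1.0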
maskrcnn_benchmark/csrc/cpu/dcn_v2_cpu.cpp
ADDED
@@ -0,0 +1,74 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
#include <vector>

#include <ATen/ATen.h>
#include <ATen/cuda/CUDAContext.h>


at::Tensor
dcn_v2_cpu_forward(const at::Tensor &input,
                   const at::Tensor &weight,
                   const at::Tensor &bias,
                   const at::Tensor &offset,
                   const at::Tensor &mask,
                   const int kernel_h,
                   const int kernel_w,
                   const int stride_h,
                   const int stride_w,
                   const int pad_h,
                   const int pad_w,
                   const int dilation_h,
                   const int dilation_w,
                   const int deformable_group)
{
    AT_ERROR("Not implemented on CPU");
}

std::vector<at::Tensor>
dcn_v2_cpu_backward(const at::Tensor &input,
                    const at::Tensor &weight,
                    const at::Tensor &bias,
                    const at::Tensor &offset,
                    const at::Tensor &mask,
                    const at::Tensor &grad_output,
                    int kernel_h, int kernel_w,
                    int stride_h, int stride_w,
                    int pad_h, int pad_w,
                    int dilation_h, int dilation_w,
                    int deformable_group)
{
    AT_ERROR("Not implemented on CPU");
}

std::tuple<at::Tensor, at::Tensor>
dcn_v2_psroi_pooling_cpu_forward(const at::Tensor &input,
                                 const at::Tensor &bbox,
                                 const at::Tensor &trans,
                                 const int no_trans,
                                 const float spatial_scale,
                                 const int output_dim,
                                 const int group_size,
                                 const int pooled_size,
                                 const int part_size,
                                 const int sample_per_part,
                                 const float trans_std)
{
    AT_ERROR("Not implemented on CPU");
}

std::tuple<at::Tensor, at::Tensor>
dcn_v2_psroi_pooling_cpu_backward(const at::Tensor &out_grad,
                                  const at::Tensor &input,
                                  const at::Tensor &bbox,
                                  const at::Tensor &trans,
                                  const at::Tensor &top_count,
                                  const int no_trans,
                                  const float spatial_scale,
                                  const int output_dim,
                                  const int group_size,
                                  const int pooled_size,
                                  const int part_size,
                                  const int sample_per_part,
                                  const float trans_std)
{
    AT_ERROR("Not implemented on CPU");
}
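All four CPU entry points above are stubs that only raise an error: in this tree DCNv2 is implemented for CUDA tensors only. A caller therefore has to make sure the tensors live on the GPU before dispatching. The sketch below shows one way to guard that dispatch; dcn_v2_forward_checked is a hypothetical wrapper name, not a function from the original sources.

// --- illustrative sketch, not part of the original sources ---
#include <ATen/ATen.h>

// CUDA implementation declared elsewhere in this extension.
at::Tensor dcn_v2_cuda_forward(const at::Tensor&, const at::Tensor&, const at::Tensor&,
                               const at::Tensor&, const at::Tensor&,
                               int, int, int, int, int, int, int, int, int);

at::Tensor dcn_v2_forward_checked(const at::Tensor& input, const at::Tensor& weight,
                                  const at::Tensor& bias, const at::Tensor& offset,
                                  const at::Tensor& mask,
                                  int kernel_h, int kernel_w,
                                  int stride_h, int stride_w,
                                  int pad_h, int pad_w,
                                  int dilation_h, int dilation_w,
                                  int deformable_group) {
  if (!input.type().is_cuda()) {
    // The CPU stubs above would only raise; fail early with a clear message instead.
    AT_ERROR("dcn_v2 is only implemented for CUDA tensors");
  }
  return dcn_v2_cuda_forward(input, weight, bias, offset, mask,
                             kernel_h, kernel_w, stride_h, stride_w,
                             pad_h, pad_w, dilation_h, dilation_w, deformable_group);
}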
maskrcnn_benchmark/csrc/cpu/nms_cpu.cpp
ADDED
@@ -0,0 +1,75 @@
// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
#include "cpu/vision.h"


template <typename scalar_t>
at::Tensor nms_cpu_kernel(const at::Tensor& dets,
                          const at::Tensor& scores,
                          const float threshold) {
  AT_ASSERTM(!dets.type().is_cuda(), "dets must be a CPU tensor");
  AT_ASSERTM(!scores.type().is_cuda(), "scores must be a CPU tensor");
  AT_ASSERTM(dets.type() == scores.type(), "dets should have the same type as scores");

  if (dets.numel() == 0) {
    return at::empty({0}, dets.options().dtype(at::kLong).device(at::kCPU));
  }

  auto x1_t = dets.select(1, 0).contiguous();
  auto y1_t = dets.select(1, 1).contiguous();
  auto x2_t = dets.select(1, 2).contiguous();
  auto y2_t = dets.select(1, 3).contiguous();

  at::Tensor areas_t = (x2_t - x1_t + 1) * (y2_t - y1_t + 1);

  auto order_t = std::get<1>(scores.sort(0, /* descending=*/true));

  auto ndets = dets.size(0);
  at::Tensor suppressed_t = at::zeros({ndets}, dets.options().dtype(at::kByte).device(at::kCPU));

  auto suppressed = suppressed_t.data<uint8_t>();
  auto order = order_t.data<int64_t>();
  auto x1 = x1_t.data<scalar_t>();
  auto y1 = y1_t.data<scalar_t>();
  auto x2 = x2_t.data<scalar_t>();
  auto y2 = y2_t.data<scalar_t>();
  auto areas = areas_t.data<scalar_t>();

  for (int64_t _i = 0; _i < ndets; _i++) {
    auto i = order[_i];
    if (suppressed[i] == 1)
      continue;
    auto ix1 = x1[i];
    auto iy1 = y1[i];
    auto ix2 = x2[i];
    auto iy2 = y2[i];
    auto iarea = areas[i];

    for (int64_t _j = _i + 1; _j < ndets; _j++) {
      auto j = order[_j];
      if (suppressed[j] == 1)
        continue;
      auto xx1 = std::max(ix1, x1[j]);
      auto yy1 = std::max(iy1, y1[j]);
      auto xx2 = std::min(ix2, x2[j]);
      auto yy2 = std::min(iy2, y2[j]);

      auto w = std::max(static_cast<scalar_t>(0), xx2 - xx1 + 1);
      auto h = std::max(static_cast<scalar_t>(0), yy2 - yy1 + 1);
      auto inter = w * h;
      auto ovr = inter / (iarea + areas[j] - inter);
      if (ovr >= threshold)
        suppressed[j] = 1;
    }
  }
  return at::nonzero(suppressed_t == 0).squeeze(1);
}

at::Tensor nms_cpu(const at::Tensor& dets,
                   const at::Tensor& scores,
                   const float threshold) {
  at::Tensor result;
  AT_DISPATCH_FLOATING_TYPES(dets.type(), "nms", [&] {
    result = nms_cpu_kernel<scalar_t>(dets, scores, threshold);
  });
  return result;
}
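Note that the kernel computes overlaps with the "+1" integer-box convention, so a box's area is (x2 - x1 + 1) * (y2 - y1 + 1). The scalar sketch below restates that IoU test outside the tensor machinery, with made-up boxes, to make the suppression condition concrete; it is illustrative only and not part of the original sources.

// --- illustrative sketch, not part of the original sources ---
#include <algorithm>
#include <cstdio>

// IoU with the same +1 convention used by nms_cpu_kernel.
float iou_plus_one(const float a[4], const float b[4]) {
  float area_a = (a[2] - a[0] + 1) * (a[3] - a[1] + 1);
  float area_b = (b[2] - b[0] + 1) * (b[3] - b[1] + 1);
  float w = std::max(0.f, std::min(a[2], b[2]) - std::max(a[0], b[0]) + 1);
  float h = std::max(0.f, std::min(a[3], b[3]) - std::max(a[1], b[1]) + 1);
  float inter = w * h;
  return inter / (area_a + area_b - inter);
}

int main() {
  float box_i[4] = {0, 0, 9, 9};    // kept box (higher score)
  float box_j[4] = {2, 2, 11, 11};  // candidate box
  // With threshold 0.5 this candidate is NOT suppressed (IoU is about 0.47).
  std::printf("IoU = %.3f\n", iou_plus_one(box_i, box_j));
  return 0;
}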
maskrcnn_benchmark/csrc/cpu/vision.h
ADDED
@@ -0,0 +1,73 @@
// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
#pragma once
#include <torch/extension.h>


at::Tensor ROIAlign_forward_cpu(const at::Tensor& input,
                                const at::Tensor& rois,
                                const float spatial_scale,
                                const int pooled_height,
                                const int pooled_width,
                                const int sampling_ratio);


at::Tensor nms_cpu(const at::Tensor& dets,
                   const at::Tensor& scores,
                   const float threshold);

at::Tensor
dcn_v2_cpu_forward(const at::Tensor &input,
                   const at::Tensor &weight,
                   const at::Tensor &bias,
                   const at::Tensor &offset,
                   const at::Tensor &mask,
                   const int kernel_h,
                   const int kernel_w,
                   const int stride_h,
                   const int stride_w,
                   const int pad_h,
                   const int pad_w,
                   const int dilation_h,
                   const int dilation_w,
                   const int deformable_group);

std::vector<at::Tensor>
dcn_v2_cpu_backward(const at::Tensor &input,
                    const at::Tensor &weight,
                    const at::Tensor &bias,
                    const at::Tensor &offset,
                    const at::Tensor &mask,
                    const at::Tensor &grad_output,
                    int kernel_h, int kernel_w,
                    int stride_h, int stride_w,
                    int pad_h, int pad_w,
                    int dilation_h, int dilation_w,
                    int deformable_group);


std::tuple<at::Tensor, at::Tensor>
dcn_v2_psroi_pooling_cpu_forward(const at::Tensor &input,
                                 const at::Tensor &bbox,
                                 const at::Tensor &trans,
                                 const int no_trans,
                                 const float spatial_scale,
                                 const int output_dim,
                                 const int group_size,
                                 const int pooled_size,
                                 const int part_size,
                                 const int sample_per_part,
                                 const float trans_std);

std::tuple<at::Tensor, at::Tensor>
dcn_v2_psroi_pooling_cpu_backward(const at::Tensor &out_grad,
                                  const at::Tensor &input,
                                  const at::Tensor &bbox,
                                  const at::Tensor &trans,
                                  const at::Tensor &top_count,
                                  const int no_trans,
                                  const float spatial_scale,
                                  const int output_dim,
                                  const int group_size,
                                  const int pooled_size,
                                  const int part_size,
                                  const int sample_per_part,
                                  const float trans_std);
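This header only declares the CPU entry points. Elsewhere in the codebase, device-agnostic wrappers choose between the CPU and CUDA implementations at call time. The sketch below shows what such a dispatcher for NMS can look like; the wrapper name, the error text, and the existence and signature of a nms_cuda counterpart are assumptions for illustration, not copied from the original headers.

// --- illustrative sketch, not part of the original sources ---
#include <ATen/ATen.h>

at::Tensor nms_cpu(const at::Tensor& dets, const at::Tensor& scores, const float threshold);
at::Tensor nms_cuda(const at::Tensor& dets, const at::Tensor& scores, const float threshold);  // assumed CUDA counterpart

at::Tensor nms_dispatch(const at::Tensor& dets, const at::Tensor& scores, const float threshold) {
  if (dets.type().is_cuda()) {
#ifdef WITH_CUDA
    return nms_cuda(dets, scores, threshold);
#else
    AT_ERROR("Not compiled with GPU support");
#endif
  }
  return nms_cpu(dets, scores, threshold);
}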
maskrcnn_benchmark/csrc/cuda/ROIAlign_cuda.cu
ADDED
@@ -0,0 +1,346 @@
1 |
+
// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
|
2 |
+
#include <ATen/ATen.h>
|
3 |
+
#include <ATen/cuda/CUDAContext.h>
|
4 |
+
|
5 |
+
#include <THC/THC.h>
|
6 |
+
#include <THC/THCAtomics.cuh>
|
7 |
+
#include <THC/THCDeviceUtils.cuh>
|
8 |
+
|
9 |
+
// TODO make it in a common file
|
10 |
+
#define CUDA_1D_KERNEL_LOOP(i, n) \
|
11 |
+
for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n; \
|
12 |
+
i += blockDim.x * gridDim.x)
|
13 |
+
|
14 |
+
|
15 |
+
template <typename T>
|
16 |
+
__device__ T bilinear_interpolate(const T* bottom_data,
|
17 |
+
const int height, const int width,
|
18 |
+
T y, T x,
|
19 |
+
const int index /* index for debug only*/) {
|
20 |
+
|
21 |
+
// deal with cases that inverse elements are out of feature map boundary
|
22 |
+
if (y < -1.0 || y > height || x < -1.0 || x > width) {
|
23 |
+
//empty
|
24 |
+
return 0;
|
25 |
+
}
|
26 |
+
|
27 |
+
if (y <= 0) y = 0;
|
28 |
+
if (x <= 0) x = 0;
|
29 |
+
|
30 |
+
int y_low = (int) y;
|
31 |
+
int x_low = (int) x;
|
32 |
+
int y_high;
|
33 |
+
int x_high;
|
34 |
+
|
35 |
+
if (y_low >= height - 1) {
|
36 |
+
y_high = y_low = height - 1;
|
37 |
+
y = (T) y_low;
|
38 |
+
} else {
|
39 |
+
y_high = y_low + 1;
|
40 |
+
}
|
41 |
+
|
42 |
+
if (x_low >= width - 1) {
|
43 |
+
x_high = x_low = width - 1;
|
44 |
+
x = (T) x_low;
|
45 |
+
} else {
|
46 |
+
x_high = x_low + 1;
|
47 |
+
}
|
48 |
+
|
49 |
+
T ly = y - y_low;
|
50 |
+
T lx = x - x_low;
|
51 |
+
T hy = 1. - ly, hx = 1. - lx;
|
52 |
+
// do bilinear interpolation
|
53 |
+
T v1 = bottom_data[y_low * width + x_low];
|
54 |
+
T v2 = bottom_data[y_low * width + x_high];
|
55 |
+
T v3 = bottom_data[y_high * width + x_low];
|
56 |
+
T v4 = bottom_data[y_high * width + x_high];
|
57 |
+
T w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx;
|
58 |
+
|
59 |
+
T val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4);
|
60 |
+
|
61 |
+
return val;
|
62 |
+
}
|
63 |
+
|
64 |
+
template <typename T>
|
65 |
+
__global__ void RoIAlignForward(const int nthreads, const T* bottom_data,
|
66 |
+
const T spatial_scale, const int channels,
|
67 |
+
const int height, const int width,
|
68 |
+
const int pooled_height, const int pooled_width,
|
69 |
+
const int sampling_ratio,
|
70 |
+
const T* bottom_rois, T* top_data) {
|
71 |
+
CUDA_1D_KERNEL_LOOP(index, nthreads) {
|
72 |
+
// (n, c, ph, pw) is an element in the pooled output
|
73 |
+
int pw = index % pooled_width;
|
74 |
+
int ph = (index / pooled_width) % pooled_height;
|
75 |
+
int c = (index / pooled_width / pooled_height) % channels;
|
76 |
+
int n = index / pooled_width / pooled_height / channels;
|
77 |
+
|
78 |
+
const T* offset_bottom_rois = bottom_rois + n * 5;
|
79 |
+
int roi_batch_ind = offset_bottom_rois[0];
|
80 |
+
|
81 |
+
// Do not using rounding; this implementation detail is critical
|
82 |
+
T roi_start_w = offset_bottom_rois[1] * spatial_scale;
|
83 |
+
T roi_start_h = offset_bottom_rois[2] * spatial_scale;
|
84 |
+
T roi_end_w = offset_bottom_rois[3] * spatial_scale;
|
85 |
+
T roi_end_h = offset_bottom_rois[4] * spatial_scale;
|
86 |
+
// T roi_start_w = round(offset_bottom_rois[1] * spatial_scale);
|
87 |
+
// T roi_start_h = round(offset_bottom_rois[2] * spatial_scale);
|
88 |
+
// T roi_end_w = round(offset_bottom_rois[3] * spatial_scale);
|
89 |
+
// T roi_end_h = round(offset_bottom_rois[4] * spatial_scale);
|
90 |
+
|
91 |
+
// Force malformed ROIs to be 1x1
|
92 |
+
T roi_width = max(roi_end_w - roi_start_w, (T)1.);
|
93 |
+
T roi_height = max(roi_end_h - roi_start_h, (T)1.);
|
94 |
+
T bin_size_h = static_cast<T>(roi_height) / static_cast<T>(pooled_height);
|
95 |
+
T bin_size_w = static_cast<T>(roi_width) / static_cast<T>(pooled_width);
|
96 |
+
|
97 |
+
const T* offset_bottom_data = bottom_data + (roi_batch_ind * channels + c) * height * width;
|
98 |
+
|
99 |
+
// We use roi_bin_grid to sample the grid and mimic integral
|
100 |
+
int roi_bin_grid_h = (sampling_ratio > 0) ? sampling_ratio : ceil(roi_height / pooled_height); // e.g., = 2
|
101 |
+
int roi_bin_grid_w = (sampling_ratio > 0) ? sampling_ratio : ceil(roi_width / pooled_width);
|
102 |
+
|
103 |
+
// We do average (integral) pooling inside a bin
|
104 |
+
const T count = roi_bin_grid_h * roi_bin_grid_w; // e.g. = 4
|
105 |
+
|
106 |
+
T output_val = 0.;
|
107 |
+
for (int iy = 0; iy < roi_bin_grid_h; iy ++) // e.g., iy = 0, 1
|
108 |
+
{
|
109 |
+
const T y = roi_start_h + ph * bin_size_h + static_cast<T>(iy + .5f) * bin_size_h / static_cast<T>(roi_bin_grid_h); // e.g., 0.5, 1.5
|
110 |
+
for (int ix = 0; ix < roi_bin_grid_w; ix ++)
|
111 |
+
{
|
112 |
+
const T x = roi_start_w + pw * bin_size_w + static_cast<T>(ix + .5f) * bin_size_w / static_cast<T>(roi_bin_grid_w);
|
113 |
+
|
114 |
+
T val = bilinear_interpolate(offset_bottom_data, height, width, y, x, index);
|
115 |
+
output_val += val;
|
116 |
+
}
|
117 |
+
}
|
118 |
+
output_val /= count;
|
119 |
+
|
120 |
+
top_data[index] = output_val;
|
121 |
+
}
|
122 |
+
}
|
123 |
+
|
124 |
+
|
125 |
+
template <typename T>
|
126 |
+
__device__ void bilinear_interpolate_gradient(
|
127 |
+
const int height, const int width,
|
128 |
+
T y, T x,
|
129 |
+
T & w1, T & w2, T & w3, T & w4,
|
130 |
+
int & x_low, int & x_high, int & y_low, int & y_high,
|
131 |
+
const int index /* index for debug only*/) {
|
132 |
+
|
133 |
+
// deal with cases that inverse elements are out of feature map boundary
|
134 |
+
if (y < -1.0 || y > height || x < -1.0 || x > width) {
|
135 |
+
//empty
|
136 |
+
w1 = w2 = w3 = w4 = 0.;
|
137 |
+
x_low = x_high = y_low = y_high = -1;
|
138 |
+
return;
|
139 |
+
}
|
140 |
+
|
141 |
+
if (y <= 0) y = 0;
|
142 |
+
if (x <= 0) x = 0;
|
143 |
+
|
144 |
+
y_low = (int) y;
|
145 |
+
x_low = (int) x;
|
146 |
+
|
147 |
+
if (y_low >= height - 1) {
|
148 |
+
y_high = y_low = height - 1;
|
149 |
+
y = (T) y_low;
|
150 |
+
} else {
|
151 |
+
y_high = y_low + 1;
|
152 |
+
}
|
153 |
+
|
154 |
+
if (x_low >= width - 1) {
|
155 |
+
x_high = x_low = width - 1;
|
156 |
+
x = (T) x_low;
|
157 |
+
} else {
|
158 |
+
x_high = x_low + 1;
|
159 |
+
}
|
160 |
+
|
161 |
+
T ly = y - y_low;
|
162 |
+
T lx = x - x_low;
|
163 |
+
T hy = 1. - ly, hx = 1. - lx;
|
164 |
+
|
165 |
+
// reference in forward
|
166 |
+
// T v1 = bottom_data[y_low * width + x_low];
|
167 |
+
// T v2 = bottom_data[y_low * width + x_high];
|
168 |
+
// T v3 = bottom_data[y_high * width + x_low];
|
169 |
+
// T v4 = bottom_data[y_high * width + x_high];
|
170 |
+
// T val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4);
|
171 |
+
|
172 |
+
w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx;
|
173 |
+
|
174 |
+
return;
|
175 |
+
}
|
176 |
+
|
177 |
+
template <typename T>
|
178 |
+
__global__ void RoIAlignBackwardFeature(const int nthreads, const T* top_diff,
|
179 |
+
const int num_rois, const T spatial_scale,
|
180 |
+
const int channels, const int height, const int width,
|
181 |
+
const int pooled_height, const int pooled_width,
|
182 |
+
const int sampling_ratio,
|
183 |
+
T* bottom_diff,
|
184 |
+
const T* bottom_rois) {
|
185 |
+
CUDA_1D_KERNEL_LOOP(index, nthreads) {
|
186 |
+
// (n, c, ph, pw) is an element in the pooled output
|
187 |
+
int pw = index % pooled_width;
|
188 |
+
int ph = (index / pooled_width) % pooled_height;
|
189 |
+
int c = (index / pooled_width / pooled_height) % channels;
|
190 |
+
int n = index / pooled_width / pooled_height / channels;
|
191 |
+
|
192 |
+
const T* offset_bottom_rois = bottom_rois + n * 5;
|
193 |
+
int roi_batch_ind = offset_bottom_rois[0];
|
194 |
+
|
195 |
+
// Do not using rounding; this implementation detail is critical
|
196 |
+
T roi_start_w = offset_bottom_rois[1] * spatial_scale;
|
197 |
+
T roi_start_h = offset_bottom_rois[2] * spatial_scale;
|
198 |
+
T roi_end_w = offset_bottom_rois[3] * spatial_scale;
|
199 |
+
T roi_end_h = offset_bottom_rois[4] * spatial_scale;
|
200 |
+
// T roi_start_w = round(offset_bottom_rois[1] * spatial_scale);
|
201 |
+
// T roi_start_h = round(offset_bottom_rois[2] * spatial_scale);
|
202 |
+
// T roi_end_w = round(offset_bottom_rois[3] * spatial_scale);
|
203 |
+
// T roi_end_h = round(offset_bottom_rois[4] * spatial_scale);
|
204 |
+
|
205 |
+
// Force malformed ROIs to be 1x1
|
206 |
+
T roi_width = max(roi_end_w - roi_start_w, (T)1.);
|
207 |
+
T roi_height = max(roi_end_h - roi_start_h, (T)1.);
|
208 |
+
T bin_size_h = static_cast<T>(roi_height) / static_cast<T>(pooled_height);
|
209 |
+
T bin_size_w = static_cast<T>(roi_width) / static_cast<T>(pooled_width);
|
210 |
+
|
211 |
+
T* offset_bottom_diff = bottom_diff + (roi_batch_ind * channels + c) * height * width;
|
212 |
+
|
213 |
+
int top_offset = (n * channels + c) * pooled_height * pooled_width;
|
214 |
+
const T* offset_top_diff = top_diff + top_offset;
|
215 |
+
const T top_diff_this_bin = offset_top_diff[ph * pooled_width + pw];
|
216 |
+
|
217 |
+
// We use roi_bin_grid to sample the grid and mimic integral
|
218 |
+
int roi_bin_grid_h = (sampling_ratio > 0) ? sampling_ratio : ceil(roi_height / pooled_height); // e.g., = 2
|
219 |
+
int roi_bin_grid_w = (sampling_ratio > 0) ? sampling_ratio : ceil(roi_width / pooled_width);
|
220 |
+
|
221 |
+
// We do average (integral) pooling inside a bin
|
222 |
+
const T count = roi_bin_grid_h * roi_bin_grid_w; // e.g. = 4
|
223 |
+
|
224 |
+
for (int iy = 0; iy < roi_bin_grid_h; iy ++) // e.g., iy = 0, 1
|
225 |
+
{
|
226 |
+
const T y = roi_start_h + ph * bin_size_h + static_cast<T>(iy + .5f) * bin_size_h / static_cast<T>(roi_bin_grid_h); // e.g., 0.5, 1.5
|
227 |
+
for (int ix = 0; ix < roi_bin_grid_w; ix ++)
|
228 |
+
{
|
229 |
+
const T x = roi_start_w + pw * bin_size_w + static_cast<T>(ix + .5f) * bin_size_w / static_cast<T>(roi_bin_grid_w);
|
230 |
+
|
231 |
+
T w1, w2, w3, w4;
|
232 |
+
int x_low, x_high, y_low, y_high;
|
233 |
+
|
234 |
+
bilinear_interpolate_gradient(height, width, y, x,
|
235 |
+
w1, w2, w3, w4,
|
236 |
+
x_low, x_high, y_low, y_high,
|
237 |
+
index);
|
238 |
+
|
239 |
+
T g1 = top_diff_this_bin * w1 / count;
|
240 |
+
T g2 = top_diff_this_bin * w2 / count;
|
241 |
+
T g3 = top_diff_this_bin * w3 / count;
|
242 |
+
T g4 = top_diff_this_bin * w4 / count;
|
243 |
+
|
244 |
+
if (x_low >= 0 && x_high >= 0 && y_low >= 0 && y_high >= 0)
|
245 |
+
{
|
246 |
+
atomicAdd(offset_bottom_diff + y_low * width + x_low, static_cast<T>(g1));
|
247 |
+
atomicAdd(offset_bottom_diff + y_low * width + x_high, static_cast<T>(g2));
|
248 |
+
atomicAdd(offset_bottom_diff + y_high * width + x_low, static_cast<T>(g3));
|
249 |
+
atomicAdd(offset_bottom_diff + y_high * width + x_high, static_cast<T>(g4));
|
250 |
+
} // if
|
251 |
+
} // ix
|
252 |
+
} // iy
|
253 |
+
} // CUDA_1D_KERNEL_LOOP
|
254 |
+
} // RoIAlignBackward
|
255 |
+
|
256 |
+
|
257 |
+
at::Tensor ROIAlign_forward_cuda(const at::Tensor& input,
|
258 |
+
const at::Tensor& rois,
|
259 |
+
const float spatial_scale,
|
260 |
+
const int pooled_height,
|
261 |
+
const int pooled_width,
|
262 |
+
const int sampling_ratio) {
|
263 |
+
AT_ASSERTM(input.type().is_cuda(), "input must be a CUDA tensor");
|
264 |
+
AT_ASSERTM(rois.type().is_cuda(), "rois must be a CUDA tensor");
|
265 |
+
|
266 |
+
auto num_rois = rois.size(0);
|
267 |
+
auto channels = input.size(1);
|
268 |
+
auto height = input.size(2);
|
269 |
+
auto width = input.size(3);
|
270 |
+
|
271 |
+
auto output = at::empty({num_rois, channels, pooled_height, pooled_width}, input.options());
|
272 |
+
auto output_size = num_rois * pooled_height * pooled_width * channels;
|
273 |
+
cudaStream_t stream = at::cuda::getCurrentCUDAStream();
|
274 |
+
|
275 |
+
dim3 grid(std::min(THCCeilDiv((long)output_size, 512L), 4096L));
|
276 |
+
dim3 block(512);
|
277 |
+
|
278 |
+
if (output.numel() == 0) {
|
279 |
+
THCudaCheck(cudaGetLastError());
|
280 |
+
return output;
|
281 |
+
}
|
282 |
+
|
283 |
+
AT_DISPATCH_FLOATING_TYPES(input.type(), "ROIAlign_forward", [&] {
|
284 |
+
RoIAlignForward<scalar_t><<<grid, block, 0, stream>>>(
|
285 |
+
output_size,
|
286 |
+
input.contiguous().data<scalar_t>(),
|
287 |
+
spatial_scale,
|
288 |
+
channels,
|
289 |
+
height,
|
290 |
+
width,
|
291 |
+
pooled_height,
|
292 |
+
pooled_width,
|
293 |
+
sampling_ratio,
|
294 |
+
rois.contiguous().data<scalar_t>(),
|
295 |
+
output.data<scalar_t>());
|
296 |
+
});
|
297 |
+
THCudaCheck(cudaGetLastError());
|
298 |
+
return output;
|
299 |
+
}
|
300 |
+
|
301 |
+
// TODO remove the dependency on input and use instead its sizes -> save memory
|
302 |
+
at::Tensor ROIAlign_backward_cuda(const at::Tensor& grad,
|
303 |
+
const at::Tensor& rois,
|
304 |
+
const float spatial_scale,
|
305 |
+
const int pooled_height,
|
306 |
+
const int pooled_width,
|
307 |
+
const int batch_size,
|
308 |
+
const int channels,
|
309 |
+
const int height,
|
310 |
+
const int width,
|
311 |
+
const int sampling_ratio) {
|
312 |
+
AT_ASSERTM(grad.type().is_cuda(), "grad must be a CUDA tensor");
|
313 |
+
AT_ASSERTM(rois.type().is_cuda(), "rois must be a CUDA tensor");
|
314 |
+
|
315 |
+
auto num_rois = rois.size(0);
|
316 |
+
auto grad_input = at::zeros({batch_size, channels, height, width}, grad.options());
|
317 |
+
|
318 |
+
cudaStream_t stream = at::cuda::getCurrentCUDAStream();
|
319 |
+
|
320 |
+
dim3 grid(std::min(THCCeilDiv((long)grad.numel(), 512L), 4096L));
|
321 |
+
dim3 block(512);
|
322 |
+
|
323 |
+
// handle possibly empty gradients
|
324 |
+
if (grad.numel() == 0) {
|
325 |
+
THCudaCheck(cudaGetLastError());
|
326 |
+
return grad_input;
|
327 |
+
}
|
328 |
+
|
329 |
+
AT_DISPATCH_FLOATING_TYPES(grad.type(), "ROIAlign_backward", [&] {
|
330 |
+
RoIAlignBackwardFeature<scalar_t><<<grid, block, 0, stream>>>(
|
331 |
+
grad.numel(),
|
332 |
+
grad.contiguous().data<scalar_t>(),
|
333 |
+
num_rois,
|
334 |
+
spatial_scale,
|
335 |
+
channels,
|
336 |
+
height,
|
337 |
+
width,
|
338 |
+
pooled_height,
|
339 |
+
pooled_width,
|
340 |
+
sampling_ratio,
|
341 |
+
grad_input.data<scalar_t>(),
|
342 |
+
rois.contiguous().data<scalar_t>());
|
343 |
+
});
|
344 |
+
THCudaCheck(cudaGetLastError());
|
345 |
+
return grad_input;
|
346 |
+
}
|
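Both ROIAlign launches above cap the grid at 4096 blocks of 512 threads and rely on CUDA_1D_KERNEL_LOOP to stride over any remaining elements. The sketch below restates the host-side grid arithmetic (the THCCeilDiv / std::min line) in plain C++ with example sizes, so the launch configuration can be checked without a GPU; it is illustrative only and not part of the original sources.

// --- illustrative sketch, not part of the original sources ---
#include <algorithm>
#include <cstdio>

long ceil_div(long a, long b) { return (a + b - 1) / b; }  // same idea as THCCeilDiv

int main() {
  long num_rois = 1000, channels = 256, pooled_h = 7, pooled_w = 7;
  long output_size = num_rois * channels * pooled_h * pooled_w;  // 12,544,000 outputs
  long threads_per_block = 512;
  long blocks = std::min(ceil_div(output_size, threads_per_block), 4096L);
  // With the 4096-block cap, each block's loop strides by blockDim.x * gridDim.x,
  // so every output element is still visited exactly once.
  std::printf("output_size=%ld blocks=%ld\n", output_size, blocks);
  return 0;
}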
maskrcnn_benchmark/csrc/cuda/ROIPool_cuda.cu
ADDED
@@ -0,0 +1,202 @@
1 |
+
// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
|
2 |
+
#include <ATen/ATen.h>
|
3 |
+
#include <ATen/cuda/CUDAContext.h>
|
4 |
+
|
5 |
+
#include <THC/THC.h>
|
6 |
+
#include <THC/THCAtomics.cuh>
|
7 |
+
#include <THC/THCDeviceUtils.cuh>
|
8 |
+
|
9 |
+
|
10 |
+
// TODO make it in a common file
|
11 |
+
#define CUDA_1D_KERNEL_LOOP(i, n) \
|
12 |
+
for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n; \
|
13 |
+
i += blockDim.x * gridDim.x)
|
14 |
+
|
15 |
+
|
16 |
+
template <typename T>
|
17 |
+
__global__ void RoIPoolFForward(const int nthreads, const T* bottom_data,
|
18 |
+
const T spatial_scale, const int channels, const int height,
|
19 |
+
const int width, const int pooled_height, const int pooled_width,
|
20 |
+
const T* bottom_rois, T* top_data, int* argmax_data) {
|
21 |
+
CUDA_1D_KERNEL_LOOP(index, nthreads) {
|
22 |
+
// (n, c, ph, pw) is an element in the pooled output
|
23 |
+
int pw = index % pooled_width;
|
24 |
+
int ph = (index / pooled_width) % pooled_height;
|
25 |
+
int c = (index / pooled_width / pooled_height) % channels;
|
26 |
+
int n = index / pooled_width / pooled_height / channels;
|
27 |
+
|
28 |
+
const T* offset_bottom_rois = bottom_rois + n * 5;
|
29 |
+
int roi_batch_ind = offset_bottom_rois[0];
|
30 |
+
int roi_start_w = round(offset_bottom_rois[1] * spatial_scale);
|
31 |
+
int roi_start_h = round(offset_bottom_rois[2] * spatial_scale);
|
32 |
+
int roi_end_w = round(offset_bottom_rois[3] * spatial_scale);
|
33 |
+
int roi_end_h = round(offset_bottom_rois[4] * spatial_scale);
|
34 |
+
|
35 |
+
// Force malformed ROIs to be 1x1
|
36 |
+
int roi_width = max(roi_end_w - roi_start_w + 1, 1);
|
37 |
+
int roi_height = max(roi_end_h - roi_start_h + 1, 1);
|
38 |
+
T bin_size_h = static_cast<T>(roi_height)
|
39 |
+
/ static_cast<T>(pooled_height);
|
40 |
+
T bin_size_w = static_cast<T>(roi_width)
|
41 |
+
/ static_cast<T>(pooled_width);
|
42 |
+
|
43 |
+
int hstart = static_cast<int>(floor(static_cast<T>(ph)
|
44 |
+
* bin_size_h));
|
45 |
+
int wstart = static_cast<int>(floor(static_cast<T>(pw)
|
46 |
+
* bin_size_w));
|
47 |
+
int hend = static_cast<int>(ceil(static_cast<T>(ph + 1)
|
48 |
+
* bin_size_h));
|
49 |
+
int wend = static_cast<int>(ceil(static_cast<T>(pw + 1)
|
50 |
+
* bin_size_w));
|
51 |
+
|
52 |
+
// Add roi offsets and clip to input boundaries
|
53 |
+
hstart = min(max(hstart + roi_start_h, 0), height);
|
54 |
+
hend = min(max(hend + roi_start_h, 0), height);
|
55 |
+
wstart = min(max(wstart + roi_start_w, 0), width);
|
56 |
+
wend = min(max(wend + roi_start_w, 0), width);
|
57 |
+
bool is_empty = (hend <= hstart) || (wend <= wstart);
|
58 |
+
|
59 |
+
// Define an empty pooling region to be zero
|
60 |
+
T maxval = is_empty ? 0 : -FLT_MAX;
|
61 |
+
// If nothing is pooled, argmax = -1 causes nothing to be backprop'd
|
62 |
+
int maxidx = -1;
|
63 |
+
const T* offset_bottom_data =
|
64 |
+
bottom_data + (roi_batch_ind * channels + c) * height * width;
|
65 |
+
for (int h = hstart; h < hend; ++h) {
|
66 |
+
for (int w = wstart; w < wend; ++w) {
|
67 |
+
int bottom_index = h * width + w;
|
68 |
+
if (offset_bottom_data[bottom_index] > maxval) {
|
69 |
+
maxval = offset_bottom_data[bottom_index];
|
70 |
+
maxidx = bottom_index;
|
71 |
+
}
|
72 |
+
}
|
73 |
+
}
|
74 |
+
top_data[index] = maxval;
|
75 |
+
argmax_data[index] = maxidx;
|
76 |
+
}
|
77 |
+
}
|
78 |
+
|
79 |
+
template <typename T>
|
80 |
+
__global__ void RoIPoolFBackward(const int nthreads, const T* top_diff,
|
81 |
+
const int* argmax_data, const int num_rois, const T spatial_scale,
|
82 |
+
const int channels, const int height, const int width,
|
83 |
+
const int pooled_height, const int pooled_width, T* bottom_diff,
|
84 |
+
const T* bottom_rois) {
|
85 |
+
CUDA_1D_KERNEL_LOOP(index, nthreads) {
|
86 |
+
// (n, c, ph, pw) is an element in the pooled output
|
87 |
+
int pw = index % pooled_width;
|
88 |
+
int ph = (index / pooled_width) % pooled_height;
|
89 |
+
int c = (index / pooled_width / pooled_height) % channels;
|
90 |
+
int n = index / pooled_width / pooled_height / channels;
|
91 |
+
|
92 |
+
const T* offset_bottom_rois = bottom_rois + n * 5;
|
93 |
+
int roi_batch_ind = offset_bottom_rois[0];
|
94 |
+
int bottom_offset = (roi_batch_ind * channels + c) * height * width;
|
95 |
+
int top_offset = (n * channels + c) * pooled_height * pooled_width;
|
96 |
+
const T* offset_top_diff = top_diff + top_offset;
|
97 |
+
T* offset_bottom_diff = bottom_diff + bottom_offset;
|
98 |
+
const int* offset_argmax_data = argmax_data + top_offset;
|
99 |
+
|
100 |
+
int argmax = offset_argmax_data[ph * pooled_width + pw];
|
101 |
+
if (argmax != -1) {
|
102 |
+
atomicAdd(
|
103 |
+
offset_bottom_diff + argmax,
|
104 |
+
static_cast<T>(offset_top_diff[ph * pooled_width + pw]));
|
105 |
+
|
106 |
+
}
|
107 |
+
}
|
108 |
+
}
|
109 |
+
|
110 |
+
std::tuple<at::Tensor, at::Tensor> ROIPool_forward_cuda(const at::Tensor& input,
|
111 |
+
const at::Tensor& rois,
|
112 |
+
const float spatial_scale,
|
113 |
+
const int pooled_height,
|
114 |
+
const int pooled_width) {
|
115 |
+
AT_ASSERTM(input.type().is_cuda(), "input must be a CUDA tensor");
|
116 |
+
AT_ASSERTM(rois.type().is_cuda(), "rois must be a CUDA tensor");
|
117 |
+
|
118 |
+
auto num_rois = rois.size(0);
|
119 |
+
auto channels = input.size(1);
|
120 |
+
auto height = input.size(2);
|
121 |
+
auto width = input.size(3);
|
122 |
+
|
123 |
+
auto output = at::empty({num_rois, channels, pooled_height, pooled_width}, input.options());
|
124 |
+
auto output_size = num_rois * pooled_height * pooled_width * channels;
|
125 |
+
auto argmax = at::zeros({num_rois, channels, pooled_height, pooled_width}, input.options().dtype(at::kInt));
|
126 |
+
|
127 |
+
cudaStream_t stream = at::cuda::getCurrentCUDAStream();
|
128 |
+
|
129 |
+
dim3 grid(std::min(THCCeilDiv((long)output_size, 512L), 4096L));
|
130 |
+
dim3 block(512);
|
131 |
+
|
132 |
+
if (output.numel() == 0) {
|
133 |
+
THCudaCheck(cudaGetLastError());
|
134 |
+
return std::make_tuple(output, argmax);
|
135 |
+
}
|
136 |
+
|
137 |
+
AT_DISPATCH_FLOATING_TYPES(input.type(), "ROIPool_forward", [&] {
|
138 |
+
RoIPoolFForward<scalar_t><<<grid, block, 0, stream>>>(
|
139 |
+
output_size,
|
140 |
+
input.contiguous().data<scalar_t>(),
|
141 |
+
spatial_scale,
|
142 |
+
channels,
|
143 |
+
height,
|
144 |
+
width,
|
145 |
+
pooled_height,
|
146 |
+
pooled_width,
|
147 |
+
rois.contiguous().data<scalar_t>(),
|
148 |
+
output.data<scalar_t>(),
|
149 |
+
argmax.data<int>());
|
150 |
+
});
|
151 |
+
THCudaCheck(cudaGetLastError());
|
152 |
+
return std::make_tuple(output, argmax);
|
153 |
+
}
|
154 |
+
|
155 |
+
// TODO remove the dependency on input and use instead its sizes -> save memory
|
156 |
+
at::Tensor ROIPool_backward_cuda(const at::Tensor& grad,
|
157 |
+
const at::Tensor& input,
|
158 |
+
const at::Tensor& rois,
|
159 |
+
const at::Tensor& argmax,
|
160 |
+
const float spatial_scale,
|
161 |
+
const int pooled_height,
|
162 |
+
const int pooled_width,
|
163 |
+
const int batch_size,
|
164 |
+
const int channels,
|
165 |
+
const int height,
|
166 |
+
const int width) {
|
167 |
+
AT_ASSERTM(grad.type().is_cuda(), "grad must be a CUDA tensor");
|
168 |
+
AT_ASSERTM(rois.type().is_cuda(), "rois must be a CUDA tensor");
|
169 |
+
// TODO add more checks
|
170 |
+
|
171 |
+
auto num_rois = rois.size(0);
|
172 |
+
auto grad_input = at::zeros({batch_size, channels, height, width}, grad.options());
|
173 |
+
|
174 |
+
cudaStream_t stream = at::cuda::getCurrentCUDAStream();
|
175 |
+
|
176 |
+
dim3 grid(std::min(THCCeilDiv((long)grad.numel(), 512L), 4096L));
|
177 |
+
dim3 block(512);
|
178 |
+
|
179 |
+
// handle possibly empty gradients
|
180 |
+
if (grad.numel() == 0) {
|
181 |
+
THCudaCheck(cudaGetLastError());
|
182 |
+
return grad_input;
|
183 |
+
}
|
184 |
+
|
185 |
+
AT_DISPATCH_FLOATING_TYPES(grad.type(), "ROIPool_backward", [&] {
|
186 |
+
RoIPoolFBackward<scalar_t><<<grid, block, 0, stream>>>(
|
187 |
+
grad.numel(),
|
188 |
+
grad.contiguous().data<scalar_t>(),
|
189 |
+
argmax.data<int>(),
|
190 |
+
num_rois,
|
191 |
+
spatial_scale,
|
192 |
+
channels,
|
193 |
+
height,
|
194 |
+
width,
|
195 |
+
pooled_height,
|
196 |
+
pooled_width,
|
197 |
+
grad_input.data<scalar_t>(),
|
198 |
+
rois.contiguous().data<scalar_t>());
|
199 |
+
});
|
200 |
+
THCudaCheck(cudaGetLastError());
|
201 |
+
return grad_input;
|
202 |
+
}
|
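Unlike ROIAlign, ROIPool rounds the ROI to integer coordinates and max-pools whole input cells; the hstart/hend/wstart/wend arithmetic decides which pixels fall into each output bin. The sketch below extracts that bin-boundary computation from RoIPoolFForward into plain C++ for a single row of bins, with made-up ROI values; it is illustrative only and not part of the original sources.

// --- illustrative sketch, not part of the original sources ---
#include <algorithm>
#include <cmath>
#include <cstdio>

int main() {
  int roi_start_h = 3, roi_end_h = 12;  // rounded ROI rows, inclusive
  int pooled_height = 7, height = 32;
  int roi_height = std::max(roi_end_h - roi_start_h + 1, 1);  // 10 rows
  float bin_size_h = static_cast<float>(roi_height) / pooled_height;

  for (int ph = 0; ph < pooled_height; ++ph) {
    int hstart = static_cast<int>(std::floor(ph * bin_size_h));
    int hend   = static_cast<int>(std::ceil((ph + 1) * bin_size_h));
    // Shift into image coordinates and clip, exactly as the kernel does;
    // adjacent bins may overlap because of the floor/ceil pairing.
    hstart = std::min(std::max(hstart + roi_start_h, 0), height);
    hend   = std::min(std::max(hend + roi_start_h, 0), height);
    std::printf("ph=%d pools rows [%d, %d)\n", ph, hstart, hend);
  }
  return 0;
}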
maskrcnn_benchmark/csrc/cuda/SigmoidFocalLoss_cuda.cu
ADDED
@@ -0,0 +1,188 @@
1 |
+
// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
|
2 |
+
// This file is modified from https://github.com/pytorch/pytorch/blob/master/modules/detectron/sigmoid_focal_loss_op.cu
|
3 |
+
// Cheng-Yang Fu
|
4 |
+
// [email protected]
|
5 |
+
#include <ATen/ATen.h>
|
6 |
+
#include <ATen/cuda/CUDAContext.h>
|
7 |
+
|
8 |
+
#include <THC/THC.h>
|
9 |
+
#include <THC/THCAtomics.cuh>
|
10 |
+
#include <THC/THCDeviceUtils.cuh>
|
11 |
+
|
12 |
+
#include <cfloat>
|
13 |
+
|
14 |
+
// TODO make it in a common file
|
15 |
+
#define CUDA_1D_KERNEL_LOOP(i, n) \
|
16 |
+
for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n; \
|
17 |
+
i += blockDim.x * gridDim.x)
|
18 |
+
|
19 |
+
|
20 |
+
template <typename T>
|
21 |
+
__global__ void SigmoidFocalLossForward(const int nthreads,
|
22 |
+
const T* logits,
|
23 |
+
const int* targets,
|
24 |
+
const int num_classes,
|
25 |
+
const float gamma,
|
26 |
+
const float alpha,
|
27 |
+
const int num,
|
28 |
+
T* losses) {
|
29 |
+
CUDA_1D_KERNEL_LOOP(i, nthreads) {
|
30 |
+
|
31 |
+
int n = i / num_classes;
|
32 |
+
int d = i % num_classes; // current class[0~79];
|
33 |
+
int t = targets[n]; // target class [1~80];
|
34 |
+
|
35 |
+
// Decide it is positive or negative case.
|
36 |
+
T c1 = (t == (d+1));
|
37 |
+
T c2 = (t>=0 & t != (d+1));
|
38 |
+
|
39 |
+
T zn = (1.0 - alpha);
|
40 |
+
T zp = (alpha);
|
41 |
+
|
42 |
+
// p = 1. / 1. + expf(-x); p = sigmoid(x)
|
43 |
+
T p = 1. / (1. + expf(-logits[i]));
|
44 |
+
|
45 |
+
// (1-p)**gamma * log(p) where
|
46 |
+
T term1 = powf((1. - p), gamma) * logf(max(p, FLT_MIN));
|
47 |
+
|
48 |
+
// p**gamma * log(1-p)
|
49 |
+
T term2 = powf(p, gamma) *
|
50 |
+
(-1. * logits[i] * (logits[i] >= 0) -
|
51 |
+
logf(1. + expf(logits[i] - 2. * logits[i] * (logits[i] >= 0))));
|
52 |
+
|
53 |
+
losses[i] = 0.0;
|
54 |
+
losses[i] += -c1 * term1 * zp;
|
55 |
+
losses[i] += -c2 * term2 * zn;
|
56 |
+
|
57 |
+
} // CUDA_1D_KERNEL_LOOP
|
58 |
+
} // SigmoidFocalLossForward
|
59 |
+
|
60 |
+
|
61 |
+
template <typename T>
|
62 |
+
__global__ void SigmoidFocalLossBackward(const int nthreads,
|
63 |
+
const T* logits,
|
64 |
+
const int* targets,
|
65 |
+
const T* d_losses,
|
66 |
+
const int num_classes,
|
67 |
+
const float gamma,
|
68 |
+
const float alpha,
|
69 |
+
const int num,
|
70 |
+
T* d_logits) {
|
71 |
+
CUDA_1D_KERNEL_LOOP(i, nthreads) {
|
72 |
+
|
73 |
+
int n = i / num_classes;
|
74 |
+
int d = i % num_classes; // current class[0~79];
|
75 |
+
int t = targets[n]; // target class [1~80], 0 is background;
|
76 |
+
|
77 |
+
// Decide it is positive or negative case.
|
78 |
+
T c1 = (t == (d+1));
|
79 |
+
T c2 = (t>=0 & t != (d+1));
|
80 |
+
|
81 |
+
T zn = (1.0 - alpha);
|
82 |
+
T zp = (alpha);
|
83 |
+
// p = 1. / 1. + expf(-x); p = sigmoid(x)
|
84 |
+
T p = 1. / (1. + expf(-logits[i]));
|
85 |
+
|
86 |
+
// (1-p)**g * (1 - p - g*p*log(p)
|
87 |
+
T term1 = powf((1. - p), gamma) *
|
88 |
+
(1. - p - (p * gamma * logf(max(p, FLT_MIN))));
|
89 |
+
|
90 |
+
// (p**g) * (g*(1-p)*log(1-p) - p)
|
91 |
+
T term2 = powf(p, gamma) *
|
92 |
+
((-1. * logits[i] * (logits[i] >= 0) -
|
93 |
+
logf(1. + expf(logits[i] - 2. * logits[i] * (logits[i] >= 0)))) *
|
94 |
+
(1. - p) * gamma - p);
|
95 |
+
d_logits[i] = 0.0;
|
96 |
+
d_logits[i] += -c1 * term1 * zp;
|
97 |
+
d_logits[i] += -c2 * term2 * zn;
|
98 |
+
d_logits[i] = d_logits[i] * d_losses[i];
|
99 |
+
|
100 |
+
} // CUDA_1D_KERNEL_LOOP
|
101 |
+
} // SigmoidFocalLossBackward
|
102 |
+
|
103 |
+
|
104 |
+
at::Tensor SigmoidFocalLoss_forward_cuda(
|
105 |
+
const at::Tensor& logits,
|
106 |
+
const at::Tensor& targets,
|
107 |
+
const int num_classes,
|
108 |
+
const float gamma,
|
109 |
+
const float alpha) {
|
110 |
+
AT_ASSERTM(logits.type().is_cuda(), "logits must be a CUDA tensor");
|
111 |
+
AT_ASSERTM(targets.type().is_cuda(), "targets must be a CUDA tensor");
|
112 |
+
AT_ASSERTM(logits.dim() == 2, "logits should be NxClass");
|
113 |
+
|
114 |
+
const int num_samples = logits.size(0);
|
115 |
+
|
116 |
+
auto losses = at::empty({num_samples, logits.size(1)}, logits.options());
|
117 |
+
auto losses_size = num_samples * logits.size(1);
|
118 |
+
cudaStream_t stream = at::cuda::getCurrentCUDAStream();
|
119 |
+
|
120 |
+
dim3 grid(std::min(THCCeilDiv(losses_size, 512L), 4096L));
|
121 |
+
dim3 block(512);
|
122 |
+
|
123 |
+
if (losses.numel() == 0) {
|
124 |
+
THCudaCheck(cudaGetLastError());
|
125 |
+
return losses;
|
126 |
+
}
|
127 |
+
|
128 |
+
AT_DISPATCH_FLOATING_TYPES(logits.type(), "SigmoidFocalLoss_forward", [&] {
|
129 |
+
SigmoidFocalLossForward<scalar_t><<<grid, block, 0, stream>>>(
|
130 |
+
losses_size,
|
131 |
+
logits.contiguous().data<scalar_t>(),
|
132 |
+
targets.contiguous().data<int>(),
|
133 |
+
num_classes,
|
134 |
+
gamma,
|
135 |
+
alpha,
|
136 |
+
num_samples,
|
137 |
+
losses.data<scalar_t>());
|
138 |
+
});
|
139 |
+
THCudaCheck(cudaGetLastError());
|
140 |
+
return losses;
|
141 |
+
}
|
142 |
+
|
143 |
+
|
144 |
+
at::Tensor SigmoidFocalLoss_backward_cuda(
|
145 |
+
const at::Tensor& logits,
|
146 |
+
const at::Tensor& targets,
|
147 |
+
const at::Tensor& d_losses,
|
148 |
+
const int num_classes,
|
149 |
+
const float gamma,
|
150 |
+
const float alpha) {
|
151 |
+
AT_ASSERTM(logits.type().is_cuda(), "logits must be a CUDA tensor");
|
152 |
+
AT_ASSERTM(targets.type().is_cuda(), "targets must be a CUDA tensor");
|
153 |
+
AT_ASSERTM(d_losses.type().is_cuda(), "d_losses must be a CUDA tensor");
|
154 |
+
|
155 |
+
AT_ASSERTM(logits.dim() == 2, "logits should be NxClass");
|
156 |
+
|
157 |
+
const int num_samples = logits.size(0);
|
158 |
+
AT_ASSERTM(logits.size(1) == num_classes, "logits.size(1) should be num_classes");
|
159 |
+
|
160 |
+
auto d_logits = at::zeros({num_samples, num_classes}, logits.options());
|
161 |
+
auto d_logits_size = num_samples * logits.size(1);
|
162 |
+
cudaStream_t stream = at::cuda::getCurrentCUDAStream();
|
163 |
+
|
164 |
+
dim3 grid(std::min(THCCeilDiv(d_logits_size, 512L), 4096L));
|
165 |
+
dim3 block(512);
|
166 |
+
|
167 |
+
if (d_logits.numel() == 0) {
|
168 |
+
THCudaCheck(cudaGetLastError());
|
169 |
+
return d_logits;
|
170 |
+
}
|
171 |
+
|
172 |
+
AT_DISPATCH_FLOATING_TYPES(logits.type(), "SigmoidFocalLoss_backward", [&] {
|
173 |
+
SigmoidFocalLossBackward<scalar_t><<<grid, block, 0, stream>>>(
|
174 |
+
d_logits_size,
|
175 |
+
logits.contiguous().data<scalar_t>(),
|
176 |
+
targets.contiguous().data<int>(),
|
177 |
+
d_losses.contiguous().data<scalar_t>(),
|
178 |
+
num_classes,
|
179 |
+
gamma,
|
180 |
+
alpha,
|
181 |
+
num_samples,
|
182 |
+
d_logits.data<scalar_t>());
|
183 |
+
});
|
184 |
+
|
185 |
+
THCudaCheck(cudaGetLastError());
|
186 |
+
return d_logits;
|
187 |
+
}
|
188 |
+
|
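The forward kernel above evaluates the focal loss independently for every (sample, class) cell; term2 uses the numerically stable identity log(1 - sigmoid(x)) = -x * [x >= 0] - log(1 + exp(x - 2x * [x >= 0])). The sketch below restates that per-cell computation as a scalar C++ function with example inputs so the weighting can be inspected directly; it is illustrative only and not part of the original sources.

// --- illustrative sketch, not part of the original sources ---
#include <algorithm>
#include <cfloat>
#include <cmath>
#include <cstdio>

// Targets follow the kernel's convention: t in [1, num_classes], 0 = background,
// d = class index in [0, num_classes - 1].
float sigmoid_focal_loss_cell(float logit, int target, int d, float gamma, float alpha) {
  float c1 = (target == d + 1) ? 1.f : 0.f;                  // positive for this class
  float c2 = (target >= 0 && target != d + 1) ? 1.f : 0.f;   // negative (incl. background)
  float p = 1.f / (1.f + std::exp(-logit));
  float term1 = std::pow(1.f - p, gamma) * std::log(std::max(p, FLT_MIN));
  float term2 = std::pow(p, gamma) *
                (-logit * (logit >= 0) -
                 std::log(1.f + std::exp(logit - 2.f * logit * (logit >= 0))));
  return -c1 * term1 * alpha - c2 * term2 * (1.f - alpha);
}

int main() {
  // A confident wrong prediction (logit 3.0 on a background sample) keeps a
  // sizeable loss, while easy negatives are strongly down-weighted by gamma.
  std::printf("loss = %f\n", sigmoid_focal_loss_cell(3.0f, 0, 0, 2.0f, 0.25f));
  return 0;
}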
maskrcnn_benchmark/csrc/cuda/dcn_v2_cuda.cu
ADDED
@@ -0,0 +1,335 @@
1 |
+
#include <vector>
|
2 |
+
#include "cuda/dcn_v2_im2col_cuda.h"
|
3 |
+
|
4 |
+
#include <ATen/ATen.h>
|
5 |
+
#include <ATen/cuda/CUDAContext.h>
|
6 |
+
|
7 |
+
#include <THC/THC.h>
|
8 |
+
#include <THC/THCAtomics.cuh>
|
9 |
+
#include <THC/THCDeviceUtils.cuh>
|
10 |
+
|
11 |
+
extern THCState *state;
|
12 |
+
|
13 |
+
// author: Charles Shang
|
14 |
+
// https://github.com/torch/cunn/blob/master/lib/THCUNN/generic/SpatialConvolutionMM.cu
|
15 |
+
|
16 |
+
// [batch gemm]
|
17 |
+
// https://github.com/pytorch/pytorch/blob/master/aten/src/THC/generic/THCTensorMathBlas.cu
|
18 |
+
|
19 |
+
__global__ void createBatchGemmBuffer(const float **input_b, float **output_b,
|
20 |
+
float **columns_b, const float **ones_b,
|
21 |
+
const float **weight_b, const float **bias_b,
|
22 |
+
float *input, float *output,
|
23 |
+
float *columns, float *ones,
|
24 |
+
float *weight, float *bias,
|
25 |
+
const int input_stride, const int output_stride,
|
26 |
+
const int columns_stride, const int ones_stride,
|
27 |
+
const int num_batches)
|
28 |
+
{
|
29 |
+
const int idx = blockIdx.x * blockDim.x + threadIdx.x;
|
30 |
+
if (idx < num_batches)
|
31 |
+
{
|
32 |
+
input_b[idx] = input + idx * input_stride;
|
33 |
+
output_b[idx] = output + idx * output_stride;
|
34 |
+
columns_b[idx] = columns + idx * columns_stride;
|
35 |
+
ones_b[idx] = ones + idx * ones_stride;
|
36 |
+
// share weights and bias within a Mini-Batch
|
37 |
+
weight_b[idx] = weight;
|
38 |
+
bias_b[idx] = bias;
|
39 |
+
}
|
40 |
+
}
|
41 |
+
|
42 |
+
at::Tensor
|
43 |
+
dcn_v2_cuda_forward(const at::Tensor &input,
|
44 |
+
const at::Tensor &weight,
|
45 |
+
const at::Tensor &bias,
|
46 |
+
const at::Tensor &offset,
|
47 |
+
const at::Tensor &mask,
|
48 |
+
const int kernel_h,
|
49 |
+
const int kernel_w,
|
50 |
+
const int stride_h,
|
51 |
+
const int stride_w,
|
52 |
+
const int pad_h,
|
53 |
+
const int pad_w,
|
54 |
+
const int dilation_h,
|
55 |
+
const int dilation_w,
|
56 |
+
const int deformable_group)
|
57 |
+
{
|
58 |
+
using scalar_t = float;
|
59 |
+
// THCAssertSameGPU(THCudaTensor_checkGPU(state, 5, input, weight, bias, offset, mask));
|
60 |
+
AT_ASSERTM(input.type().is_cuda(), "input must be a CUDA tensor");
|
61 |
+
AT_ASSERTM(weight.type().is_cuda(), "weight must be a CUDA tensor");
|
62 |
+
AT_ASSERTM(bias.type().is_cuda(), "bias must be a CUDA tensor");
|
63 |
+
AT_ASSERTM(offset.type().is_cuda(), "offset must be a CUDA tensor");
|
64 |
+
AT_ASSERTM(mask.type().is_cuda(), "mask must be a CUDA tensor");
|
65 |
+
|
66 |
+
const int batch = input.size(0);
|
67 |
+
const int channels = input.size(1);
|
68 |
+
const int height = input.size(2);
|
69 |
+
const int width = input.size(3);
|
70 |
+
|
71 |
+
const int channels_out = weight.size(0);
|
72 |
+
const int channels_kernel = weight.size(1);
|
73 |
+
const int kernel_h_ = weight.size(2);
|
74 |
+
const int kernel_w_ = weight.size(3);
|
75 |
+
|
76 |
+
// printf("Kernels: %d %d %d %d\n", kernel_h_, kernel_w_, kernel_w, kernel_h);
|
77 |
+
// printf("Channels: %d %d\n", channels, channels_kernel);
|
78 |
+
// printf("Channels: %d %d\n", channels_out, channels_kernel);
|
79 |
+
|
80 |
+
AT_ASSERTM(kernel_h_ == kernel_h && kernel_w_ == kernel_w,
|
81 |
+
"Input shape and kernel shape wont match: (%d x %d vs %d x %d).", kernel_h_, kernel_w, kernel_h_, kernel_w_);
|
82 |
+
|
83 |
+
AT_ASSERTM(channels == channels_kernel,
|
84 |
+
"Input shape and kernel channels wont match: (%d vs %d).", channels, channels_kernel);
|
85 |
+
|
86 |
+
const int height_out = (height + 2 * pad_h - (dilation_h * (kernel_h - 1) + 1)) / stride_h + 1;
|
87 |
+
const int width_out = (width + 2 * pad_w - (dilation_w * (kernel_w - 1) + 1)) / stride_w + 1;
|
88 |
+
|
89 |
+
auto ones = at::ones({batch, height_out, width_out}, input.options());
|
90 |
+
auto columns = at::empty({batch, channels * kernel_h * kernel_w, 1 * height_out * width_out}, input.options());
|
91 |
+
auto output = at::empty({batch, channels_out, height_out, width_out}, input.options());
|
92 |
+
|
93 |
+
// prepare for batch-wise computing, which is significantly faster than instance-wise computing
|
94 |
+
// when batch size is large.
|
95 |
+
// launch batch threads
|
96 |
+
int matrices_size = batch * sizeof(float *);
|
97 |
+
auto input_b = static_cast<const float **>(THCudaMalloc(state, matrices_size));
|
98 |
+
auto output_b = static_cast<float **>(THCudaMalloc(state, matrices_size));
|
99 |
+
auto columns_b = static_cast<float **>(THCudaMalloc(state, matrices_size));
|
100 |
+
auto ones_b = static_cast<const float **>(THCudaMalloc(state, matrices_size));
|
101 |
+
auto weight_b = static_cast<const float **>(THCudaMalloc(state, matrices_size));
|
102 |
+
auto bias_b = static_cast<const float **>(THCudaMalloc(state, matrices_size));
|
103 |
+
|
104 |
+
const int block = 128;
|
105 |
+
const int grid = (batch + block - 1) / block;
|
106 |
+
|
107 |
+
createBatchGemmBuffer<<<grid, block, 0, THCState_getCurrentStream(state)>>>(
|
108 |
+
input_b, output_b,
|
109 |
+
columns_b, ones_b,
|
110 |
+
weight_b, bias_b,
|
111 |
+
input.data<scalar_t>(),
|
112 |
+
output.data<scalar_t>(),
|
113 |
+
columns.data<scalar_t>(),
|
114 |
+
ones.data<scalar_t>(),
|
115 |
+
weight.data<scalar_t>(),
|
116 |
+
bias.data<scalar_t>(),
|
117 |
+
channels * width * height,
|
118 |
+
channels_out * width_out * height_out,
|
119 |
+
channels * kernel_h * kernel_w * height_out * width_out,
|
120 |
+
height_out * width_out,
|
121 |
+
batch);
|
122 |
+
|
123 |
+
long m_ = channels_out;
|
124 |
+
long n_ = height_out * width_out;
|
125 |
+
long k_ = 1;
|
126 |
+
THCudaBlas_SgemmBatched(state,
|
127 |
+
't',
|
128 |
+
'n',
|
129 |
+
n_,
|
130 |
+
m_,
|
131 |
+
k_,
|
132 |
+
1.0f,
|
133 |
+
ones_b, k_,
|
134 |
+
bias_b, k_,
|
135 |
+
0.0f,
|
136 |
+
output_b, n_,
|
137 |
+
batch);
|
138 |
+
|
139 |
+
modulated_deformable_im2col_cuda(THCState_getCurrentStream(state),
|
140 |
+
input.data<scalar_t>(),
|
141 |
+
offset.data<scalar_t>(),
|
142 |
+
mask.data<scalar_t>(),
|
143 |
+
batch, channels, height, width,
|
144 |
+
height_out, width_out, kernel_h, kernel_w,
|
145 |
+
pad_h, pad_w, stride_h, stride_w, dilation_h, dilation_w,
|
146 |
+
deformable_group,
|
147 |
+
columns.data<scalar_t>());
|
148 |
+
|
149 |
+
long m = channels_out;
|
150 |
+
long n = height_out * width_out;
|
151 |
+
long k = channels * kernel_h * kernel_w;
|
152 |
+
THCudaBlas_SgemmBatched(state,
|
153 |
+
'n',
|
154 |
+
'n',
|
155 |
+
n,
|
156 |
+
m,
|
157 |
+
k,
|
158 |
+
1.0f,
|
159 |
+
(const float **)columns_b, n,
|
160 |
+
weight_b, k,
|
161 |
+
1.0f,
|
162 |
+
output_b, n,
|
163 |
+
batch);
|
164 |
+
|
165 |
+
THCudaFree(state, input_b);
|
166 |
+
THCudaFree(state, output_b);
|
167 |
+
THCudaFree(state, columns_b);
|
168 |
+
THCudaFree(state, ones_b);
|
169 |
+
THCudaFree(state, weight_b);
|
170 |
+
THCudaFree(state, bias_b);
|
171 |
+
return output;
|
172 |
+
}
|
173 |
+
|
174 |
+
__global__ void createBatchGemmBufferBackward(
|
175 |
+
float **grad_output_b,
|
176 |
+
float **columns_b,
|
177 |
+
float **ones_b,
|
178 |
+
float **weight_b,
|
179 |
+
float **grad_weight_b,
|
180 |
+
float **grad_bias_b,
|
181 |
+
float *grad_output,
|
182 |
+
float *columns,
|
183 |
+
float *ones,
|
184 |
+
float *weight,
|
185 |
+
float *grad_weight,
|
186 |
+
float *grad_bias,
|
187 |
+
const int grad_output_stride,
|
188 |
+
const int columns_stride,
|
189 |
+
const int ones_stride,
|
190 |
+
const int num_batches)
|
191 |
+
{
|
192 |
+
const int idx = blockIdx.x * blockDim.x + threadIdx.x;
|
193 |
+
if (idx < num_batches)
|
194 |
+
{
|
195 |
+
grad_output_b[idx] = grad_output + idx * grad_output_stride;
|
196 |
+
columns_b[idx] = columns + idx * columns_stride;
|
197 |
+
ones_b[idx] = ones + idx * ones_stride;
|
198 |
+
|
199 |
+
// share weights and bias within a Mini-Batch
|
200 |
+
weight_b[idx] = weight;
|
201 |
+
grad_weight_b[idx] = grad_weight;
|
202 |
+
grad_bias_b[idx] = grad_bias;
|
203 |
+
}
|
204 |
+
}
|
205 |
+
|
206 |
+
std::vector<at::Tensor> dcn_v2_cuda_backward(const at::Tensor &input,
|
207 |
+
const at::Tensor &weight,
|
208 |
+
const at::Tensor &bias,
|
209 |
+
const at::Tensor &offset,
|
210 |
+
const at::Tensor &mask,
|
211 |
+
const at::Tensor &grad_output,
|
212 |
+
int kernel_h, int kernel_w,
|
213 |
+
int stride_h, int stride_w,
|
214 |
+
int pad_h, int pad_w,
|
215 |
+
int dilation_h, int dilation_w,
|
216 |
+
int deformable_group)
|
217 |
+
{
|
218 |
+
|
219 |
+
THArgCheck(input.is_contiguous(), 1, "input tensor has to be contiguous");
|
220 |
+
THArgCheck(weight.is_contiguous(), 2, "weight tensor has to be contiguous");
|
221 |
+
|
222 |
+
AT_ASSERTM(input.type().is_cuda(), "input must be a CUDA tensor");
|
223 |
+
AT_ASSERTM(weight.type().is_cuda(), "weight must be a CUDA tensor");
|
224 |
+
AT_ASSERTM(bias.type().is_cuda(), "bias must be a CUDA tensor");
|
225 |
+
AT_ASSERTM(offset.type().is_cuda(), "offset must be a CUDA tensor");
|
226 |
+
AT_ASSERTM(mask.type().is_cuda(), "mask must be a CUDA tensor");
|
227 |
+
|
228 |
+
const int batch = input.size(0);
|
229 |
+
const int channels = input.size(1);
|
230 |
+
const int height = input.size(2);
|
231 |
+
const int width = input.size(3);
|
232 |
+
|
233 |
+
const int channels_out = weight.size(0);
|
234 |
+
const int channels_kernel = weight.size(1);
|
235 |
+
const int kernel_h_ = weight.size(2);
|
236 |
+
const int kernel_w_ = weight.size(3);
|
237 |
+
|
238 |
+
AT_ASSERTM(kernel_h_ == kernel_h && kernel_w_ == kernel_w,
|
239 |
+
"Input shape and kernel shape wont match: (%d x %d vs %d x %d).", kernel_h_, kernel_w, kernel_h_, kernel_w_);
|
240 |
+
|
241 |
+
AT_ASSERTM(channels == channels_kernel,
|
242 |
+
"Input shape and kernel channels wont match: (%d vs %d).", channels, channels_kernel);
|
243 |
+
|
244 |
+
const int height_out = (height + 2 * pad_h - (dilation_h * (kernel_h - 1) + 1)) / stride_h + 1;
|
245 |
+
const int width_out = (width + 2 * pad_w - (dilation_w * (kernel_w - 1) + 1)) / stride_w + 1;
|
246 |
+
|
247 |
+
auto ones = at::ones({height_out, width_out}, input.options());
|
248 |
+
auto columns = at::empty({channels * kernel_h * kernel_w, 1 * height_out * width_out}, input.options());
|
249 |
+
auto output = at::empty({batch, channels_out, height_out, width_out}, input.options());
|
250 |
+
|
251 |
+
auto grad_input = at::zeros_like(input);
|
252 |
+
auto grad_weight = at::zeros_like(weight);
|
253 |
+
auto grad_bias = at::zeros_like(bias);
|
254 |
+
auto grad_offset = at::zeros_like(offset);
|
255 |
+
auto grad_mask = at::zeros_like(mask);
|
256 |
+
|
257 |
+
using scalar_t = float;
|
258 |
+
|
259 |
+
for (int b = 0; b < batch; b++)
|
260 |
+
{
|
261 |
+
auto input_n = input.select(0, b);
|
262 |
+
auto offset_n = offset.select(0, b);
|
263 |
+
auto mask_n = mask.select(0, b);
|
264 |
+
auto grad_output_n = grad_output.select(0, b);
|
265 |
+
auto grad_input_n = grad_input.select(0, b);
|
266 |
+
auto grad_offset_n = grad_offset.select(0, b);
|
267 |
+
auto grad_mask_n = grad_mask.select(0, b);
|
268 |
+
|
269 |
+
long m = channels * kernel_h * kernel_w;
|
270 |
+
long n = height_out * width_out;
|
271 |
+
long k = channels_out;
|
272 |
+
|
273 |
+
THCudaBlas_Sgemm(state, 'n', 't', n, m, k, 1.0f,
|
274 |
+
grad_output_n.data<scalar_t>(), n,
|
275 |
+
weight.data<scalar_t>(), m, 0.0f,
|
276 |
+
columns.data<scalar_t>(), n);
|
277 |
+
|
278 |
+
// gradient w.r.t. input coordinate data
|
279 |
+
modulated_deformable_col2im_coord_cuda(THCState_getCurrentStream(state),
|
280 |
+
columns.data<scalar_t>(),
|
281 |
+
input_n.data<scalar_t>(),
|
282 |
+
offset_n.data<scalar_t>(),
|
283 |
+
mask_n.data<scalar_t>(),
|
284 |
+
1, channels, height, width,
|
285 |
+
height_out, width_out, kernel_h, kernel_w,
|
286 |
+
pad_h, pad_w, stride_h, stride_w,
|
287 |
+
dilation_h, dilation_w, deformable_group,
|
288 |
+
grad_offset_n.data<scalar_t>(),
|
289 |
+
grad_mask_n.data<scalar_t>());
|
290 |
+
// gradient w.r.t. input data
|
291 |
+
modulated_deformable_col2im_cuda(THCState_getCurrentStream(state),
|
292 |
+
columns.data<scalar_t>(),
|
293 |
+
offset_n.data<scalar_t>(),
|
294 |
+
mask_n.data<scalar_t>(),
|
295 |
+
1, channels, height, width,
|
296 |
+
height_out, width_out, kernel_h, kernel_w,
|
297 |
+
pad_h, pad_w, stride_h, stride_w,
|
298 |
+
dilation_h, dilation_w, deformable_group,
|
299 |
+
grad_input_n.data<scalar_t>());
|
300 |
+
|
301 |
+
// gradient w.r.t. weight, dWeight should accumulate across the batch and group
|
302 |
+
modulated_deformable_im2col_cuda(THCState_getCurrentStream(state),
|
303 |
+
input_n.data<scalar_t>(),
|
304 |
+
offset_n.data<scalar_t>(),
|
305 |
+
mask_n.data<scalar_t>(),
|
306 |
+
1, channels, height, width,
|
307 |
+
height_out, width_out, kernel_h, kernel_w,
|
308 |
+
pad_h, pad_w, stride_h, stride_w,
|
309 |
+
dilation_h, dilation_w, deformable_group,
|
310 |
+
columns.data<scalar_t>());
|
311 |
+
|
312 |
+
long m_ = channels_out;
|
313 |
+
long n_ = channels * kernel_h * kernel_w;
|
314 |
+
long k_ = height_out * width_out;
|
315 |
+
|
316 |
+
THCudaBlas_Sgemm(state, 't', 'n', n_, m_, k_, 1.0f,
|
317 |
+
columns.data<scalar_t>(), k_,
|
318 |
+
grad_output_n.data<scalar_t>(), k_, 1.0f,
|
319 |
+
grad_weight.data<scalar_t>(), n_);
|
320 |
+
|
321 |
+
// gradient w.r.t. bias
|
322 |
+
// long m_ = channels_out;
|
323 |
+
// long k__ = height_out * width_out;
|
324 |
+
THCudaBlas_Sgemv(state,
|
325 |
+
't',
|
326 |
+
k_, m_, 1.0f,
|
327 |
+
grad_output_n.data<scalar_t>(), k_,
|
328 |
+
ones.data<scalar_t>(), 1, 1.0f,
|
329 |
+
grad_bias.data<scalar_t>(), 1);
|
330 |
+
}
|
331 |
+
|
332 |
+
return {
|
333 |
+
grad_input, grad_offset, grad_mask, grad_weight, grad_bias
|
334 |
+
};
|
335 |
+
}
|
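The forward pass above first computes the output spatial size with the standard convolution formula, then runs modulated deformable im2col followed by a batched GEMM against the weights. The sketch below isolates that output-size formula in plain C++ with example values; it is illustrative only and not part of the original sources.

// --- illustrative sketch, not part of the original sources ---
#include <cstdio>

// Same formula as height_out/width_out in dcn_v2_cuda_forward.
int conv_out_size(int in, int kernel, int stride, int pad, int dilation) {
  return (in + 2 * pad - (dilation * (kernel - 1) + 1)) / stride + 1;
}

int main() {
  int height = 64, width = 80;
  // A 3x3 kernel with stride 1, pad 1, dilation 1 preserves the spatial size.
  int height_out = conv_out_size(height, 3, 1, 1, 1);  // 64
  int width_out  = conv_out_size(width, 3, 1, 1, 1);   // 80
  // The im2col buffer is then (channels * kernel_h * kernel_w) x (height_out * width_out)
  // per image, and the weight GEMM maps it to channels_out x (height_out * width_out).
  std::printf("output: %d x %d\n", height_out, width_out);
  return 0;
}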
maskrcnn_benchmark/csrc/cuda/dcn_v2_im2col_cuda.cu
ADDED
@@ -0,0 +1,402 @@
1 |
+
#include "dcn_v2_im2col_cuda.h"
|
2 |
+
#include <cstdio>
|
3 |
+
#include <algorithm>
|
4 |
+
#include <cstring>
|
5 |
+
|
6 |
+
#include <ATen/ATen.h>
|
7 |
+
#include <ATen/cuda/CUDAContext.h>
|
8 |
+
|
9 |
+
#include <THC/THC.h>
|
10 |
+
#include <THC/THCAtomics.cuh>
|
11 |
+
#include <THC/THCDeviceUtils.cuh>
|
12 |
+
|
13 |
+
#define CUDA_KERNEL_LOOP(i, n) \
|
14 |
+
for (int i = blockIdx.x * blockDim.x + threadIdx.x; \
|
15 |
+
i < (n); \
|
16 |
+
i += blockDim.x * gridDim.x)
|
17 |
+
|
18 |
+
const int CUDA_NUM_THREADS = 1024;
|
19 |
+
inline int GET_BLOCKS(const int N)
|
20 |
+
{
|
21 |
+
return (N + CUDA_NUM_THREADS - 1) / CUDA_NUM_THREADS;
|
22 |
+
}
|
23 |
+
|
24 |
+
|
25 |
+
__device__ float dmcn_im2col_bilinear(const float *bottom_data, const int data_width,
|
26 |
+
const int height, const int width, float h, float w)
|
27 |
+
{
|
28 |
+
int h_low = floor(h);
|
29 |
+
int w_low = floor(w);
|
30 |
+
int h_high = h_low + 1;
|
31 |
+
int w_high = w_low + 1;
|
32 |
+
|
33 |
+
float lh = h - h_low;
|
34 |
+
float lw = w - w_low;
|
35 |
+
float hh = 1 - lh, hw = 1 - lw;
|
36 |
+
|
37 |
+
float v1 = 0;
|
38 |
+
if (h_low >= 0 && w_low >= 0)
|
39 |
+
v1 = bottom_data[h_low * data_width + w_low];
|
40 |
+
float v2 = 0;
|
41 |
+
if (h_low >= 0 && w_high <= width - 1)
|
42 |
+
v2 = bottom_data[h_low * data_width + w_high];
|
43 |
+
float v3 = 0;
|
44 |
+
if (h_high <= height - 1 && w_low >= 0)
|
45 |
+
v3 = bottom_data[h_high * data_width + w_low];
|
46 |
+
float v4 = 0;
|
47 |
+
if (h_high <= height - 1 && w_high <= width - 1)
|
48 |
+
v4 = bottom_data[h_high * data_width + w_high];
|
49 |
+
|
50 |
+
float w1 = hh * hw, w2 = hh * lw, w3 = lh * hw, w4 = lh * lw;
|
51 |
+
|
52 |
+
float val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4);
|
53 |
+
return val;
|
54 |
+
}
|
55 |
+
|
56 |
+
__device__ float dmcn_get_gradient_weight(float argmax_h, float argmax_w,
|
57 |
+
const int h, const int w, const int height, const int width)
|
58 |
+
{
|
59 |
+
if (argmax_h <= -1 || argmax_h >= height || argmax_w <= -1 || argmax_w >= width)
|
60 |
+
{
|
61 |
+
//empty
|
62 |
+
return 0;
|
63 |
+
}
|
64 |
+
|
65 |
+
int argmax_h_low = floor(argmax_h);
|
66 |
+
int argmax_w_low = floor(argmax_w);
|
67 |
+
int argmax_h_high = argmax_h_low + 1;
|
68 |
+
int argmax_w_high = argmax_w_low + 1;
|
69 |
+
|
70 |
+
float weight = 0;
|
71 |
+
if (h == argmax_h_low && w == argmax_w_low)
|
72 |
+
weight = (h + 1 - argmax_h) * (w + 1 - argmax_w);
|
73 |
+
if (h == argmax_h_low && w == argmax_w_high)
|
74 |
+
weight = (h + 1 - argmax_h) * (argmax_w + 1 - w);
|
75 |
+
if (h == argmax_h_high && w == argmax_w_low)
|
76 |
+
weight = (argmax_h + 1 - h) * (w + 1 - argmax_w);
|
77 |
+
if (h == argmax_h_high && w == argmax_w_high)
|
78 |
+
weight = (argmax_h + 1 - h) * (argmax_w + 1 - w);
|
79 |
+
return weight;
|
80 |
+
}
|
81 |
+
|
82 |
+
__device__ float dmcn_get_coordinate_weight(float argmax_h, float argmax_w,
|
83 |
+
const int height, const int width, const float *im_data,
|
84 |
+
const int data_width, const int bp_dir)
|
85 |
+
{
|
86 |
+
if (argmax_h <= -1 || argmax_h >= height || argmax_w <= -1 || argmax_w >= width)
|
87 |
+
{
|
88 |
+
//empty
|
89 |
+
return 0;
|
90 |
+
}
|
91 |
+
|
92 |
+
int argmax_h_low = floor(argmax_h);
|
93 |
+
int argmax_w_low = floor(argmax_w);
|
94 |
+
int argmax_h_high = argmax_h_low + 1;
|
95 |
+
int argmax_w_high = argmax_w_low + 1;
|
96 |
+
|
97 |
+
float weight = 0;
|
98 |
+
|
99 |
+
if (bp_dir == 0)
|
100 |
+
{
|
101 |
+
if (argmax_h_low >= 0 && argmax_w_low >= 0)
|
102 |
+
weight += -1 * (argmax_w_low + 1 - argmax_w) * im_data[argmax_h_low * data_width + argmax_w_low];
|
103 |
+
if (argmax_h_low >= 0 && argmax_w_high <= width - 1)
|
104 |
+
weight += -1 * (argmax_w - argmax_w_low) * im_data[argmax_h_low * data_width + argmax_w_high];
|
105 |
+
if (argmax_h_high <= height - 1 && argmax_w_low >= 0)
|
106 |
+
weight += (argmax_w_low + 1 - argmax_w) * im_data[argmax_h_high * data_width + argmax_w_low];
|
107 |
+
if (argmax_h_high <= height - 1 && argmax_w_high <= width - 1)
|
108 |
+
weight += (argmax_w - argmax_w_low) * im_data[argmax_h_high * data_width + argmax_w_high];
|
109 |
+
}
|
110 |
+
else if (bp_dir == 1)
|
111 |
+
{
|
112 |
+
if (argmax_h_low >= 0 && argmax_w_low >= 0)
|
113 |
+
weight += -1 * (argmax_h_low + 1 - argmax_h) * im_data[argmax_h_low * data_width + argmax_w_low];
|
114 |
+
if (argmax_h_low >= 0 && argmax_w_high <= width - 1)
|
115 |
+
weight += (argmax_h_low + 1 - argmax_h) * im_data[argmax_h_low * data_width + argmax_w_high];
|
116 |
+
if (argmax_h_high <= height - 1 && argmax_w_low >= 0)
|
117 |
+
weight += -1 * (argmax_h - argmax_h_low) * im_data[argmax_h_high * data_width + argmax_w_low];
|
118 |
+
if (argmax_h_high <= height - 1 && argmax_w_high <= width - 1)
|
119 |
+
weight += (argmax_h - argmax_h_low) * im_data[argmax_h_high * data_width + argmax_w_high];
|
120 |
+
}
|
121 |
+
|
122 |
+
return weight;
|
123 |
+
}
|
124 |
+
|
125 |
+
__global__ void modulated_deformable_im2col_gpu_kernel(const int n,
|
126 |
+
const float *data_im, const float *data_offset, const float *data_mask,
|
127 |
+
const int height, const int width, const int kernel_h, const int kernel_w,
|
128 |
+
const int pad_h, const int pad_w,
|
129 |
+
const int stride_h, const int stride_w,
|
130 |
+
const int dilation_h, const int dilation_w,
|
131 |
+
const int channel_per_deformable_group,
|
132 |
+
const int batch_size, const int num_channels, const int deformable_group,
|
133 |
+
const int height_col, const int width_col,
|
134 |
+
float *data_col)
|
135 |
+
{
|
136 |
+
// launch channels * batch_size * height_col * width_col cores
|
137 |
+
CUDA_KERNEL_LOOP(index, n)
|
138 |
+
{
|
139 |
+
// NOTE(CharlesShang): different from Dai Jifeng's MXNet implementation, col_buffer is of shape (c*kw*kh, N, oh, ow)
|
140 |
+
// here columns is of shape (N, c*kw*kh, oh * ow), need to adapt axis
|
141 |
+
|
142 |
+
// index index of output matrix
|
143 |
+
const int w_col = index % width_col;
|
144 |
+
const int h_col = (index / width_col) % height_col;
|
145 |
+
// const int b_col = (index / width_col / height_col) % batch_size;
|
146 |
+
const int b_col = (index / width_col / height_col / num_channels) % batch_size;
|
147 |
+
// const int c_im = (index / width_col / height_col) / batch_size;
|
148 |
+
const int c_im = (index / width_col / height_col) % num_channels;
|
149 |
+
// const int c_col = c_im * kernel_h * kernel_w;
|
150 |
+
const int c_col = c_im * kernel_h * kernel_w;
|
151 |
+
|
152 |
+
// compute deformable group index
|
153 |
+
const int deformable_group_index = c_im / channel_per_deformable_group;
|
154 |
+
|
155 |
+
const int h_in = h_col * stride_h - pad_h;
|
156 |
+
const int w_in = w_col * stride_w - pad_w;
|
157 |
+
|
158 |
+
// float *data_col_ptr = data_col + ((c_col * batch_size + b_col) * height_col + h_col) * width_col + w_col;
|
159 |
+
float *data_col_ptr = data_col + ((b_col * num_channels * kernel_w * kernel_h + c_col) * height_col + h_col) * width_col + w_col;
|
160 |
+
//const float* data_im_ptr = data_im + ((b_col * num_channels + c_im) * height + h_in) * width + w_in;
|
161 |
+
const float *data_im_ptr = data_im + (b_col * num_channels + c_im) * height * width;
|
162 |
+
const float *data_offset_ptr = data_offset + (b_col * deformable_group + deformable_group_index) * 2 * kernel_h * kernel_w * height_col * width_col;
|
163 |
+
|
164 |
+
const float *data_mask_ptr = data_mask + (b_col * deformable_group + deformable_group_index) * kernel_h * kernel_w * height_col * width_col;
|
165 |
+
|
166 |
+
for (int i = 0; i < kernel_h; ++i)
|
167 |
+
{
|
168 |
+
for (int j = 0; j < kernel_w; ++j)
|
169 |
+
{
|
170 |
+
const int data_offset_h_ptr = ((2 * (i * kernel_w + j)) * height_col + h_col) * width_col + w_col;
|
171 |
+
const int data_offset_w_ptr = ((2 * (i * kernel_w + j) + 1) * height_col + h_col) * width_col + w_col;
|
172 |
+
const int data_mask_hw_ptr = ((i * kernel_w + j) * height_col + h_col) * width_col + w_col;
|
173 |
+
const float offset_h = data_offset_ptr[data_offset_h_ptr];
|
174 |
+
const float offset_w = data_offset_ptr[data_offset_w_ptr];
|
175 |
+
const float mask = data_mask_ptr[data_mask_hw_ptr];
|
176 |
+
float val = static_cast<float>(0);
|
177 |
+
const float h_im = h_in + i * dilation_h + offset_h;
|
178 |
+
const float w_im = w_in + j * dilation_w + offset_w;
|
179 |
+
//if (h_im >= 0 && w_im >= 0 && h_im < height && w_im < width) {
|
180 |
+
if (h_im > -1 && w_im > -1 && h_im < height && w_im < width)
|
181 |
+
{
|
182 |
+
//const float map_h = i * dilation_h + offset_h;
|
183 |
+
//const float map_w = j * dilation_w + offset_w;
|
184 |
+
//const int cur_height = height - h_in;
|
185 |
+
//const int cur_width = width - w_in;
|
186 |
+
//val = dmcn_im2col_bilinear(data_im_ptr, width, cur_height, cur_width, map_h, map_w);
|
187 |
+
val = dmcn_im2col_bilinear(data_im_ptr, width, height, width, h_im, w_im);
|
188 |
+
}
|
189 |
+
*data_col_ptr = val * mask;
|
190 |
+
// data_col_ptr += batch_size * height_col * width_col;
|
191 |
+
data_col_ptr += height_col * width_col;
|
192 |
+
}
|
193 |
+
}
|
194 |
+
}
|
195 |
+
}
|
196 |
+
|
197 |
+
__global__ void modulated_deformable_col2im_gpu_kernel(const int n,
|
198 |
+
const float *data_col, const float *data_offset, const float *data_mask,
|
199 |
+
const int channels, const int height, const int width,
|
200 |
+
const int kernel_h, const int kernel_w,
|
201 |
+
const int pad_h, const int pad_w,
|
202 |
+
const int stride_h, const int stride_w,
|
203 |
+
const int dilation_h, const int dilation_w,
|
204 |
+
const int channel_per_deformable_group,
|
205 |
+
const int batch_size, const int deformable_group,
|
206 |
+
const int height_col, const int width_col,
|
207 |
+
float *grad_im)
|
208 |
+
{
|
209 |
+
CUDA_KERNEL_LOOP(index, n)
|
210 |
+
{
|
211 |
+
const int j = (index / width_col / height_col / batch_size) % kernel_w;
|
212 |
+
const int i = (index / width_col / height_col / batch_size / kernel_w) % kernel_h;
|
213 |
+
const int c = index / width_col / height_col / batch_size / kernel_w / kernel_h;
|
214 |
+
// compute the start and end of the output
|
215 |
+
|
216 |
+
const int deformable_group_index = c / channel_per_deformable_group;
|
217 |
+
|
218 |
+
int w_out = index % width_col;
|
219 |
+
int h_out = (index / width_col) % height_col;
|
220 |
+
int b = (index / width_col / height_col) % batch_size;
|
221 |
+
int w_in = w_out * stride_w - pad_w;
|
222 |
+
int h_in = h_out * stride_h - pad_h;
|
223 |
+
|
224 |
+
const float *data_offset_ptr = data_offset + (b * deformable_group + deformable_group_index) * 2 * kernel_h * kernel_w * height_col * width_col;
|
225 |
+
const float *data_mask_ptr = data_mask + (b * deformable_group + deformable_group_index) * kernel_h * kernel_w * height_col * width_col;
|
226 |
+
const int data_offset_h_ptr = ((2 * (i * kernel_w + j)) * height_col + h_out) * width_col + w_out;
|
227 |
+
const int data_offset_w_ptr = ((2 * (i * kernel_w + j) + 1) * height_col + h_out) * width_col + w_out;
|
228 |
+
const int data_mask_hw_ptr = ((i * kernel_w + j) * height_col + h_out) * width_col + w_out;
|
229 |
+
const float offset_h = data_offset_ptr[data_offset_h_ptr];
|
230 |
+
const float offset_w = data_offset_ptr[data_offset_w_ptr];
|
231 |
+
const float mask = data_mask_ptr[data_mask_hw_ptr];
|
232 |
+
const float cur_inv_h_data = h_in + i * dilation_h + offset_h;
|
233 |
+
const float cur_inv_w_data = w_in + j * dilation_w + offset_w;
|
234 |
+
|
235 |
+
const float cur_top_grad = data_col[index] * mask;
|
236 |
+
const int cur_h = (int)cur_inv_h_data;
|
237 |
+
const int cur_w = (int)cur_inv_w_data;
|
238 |
+
for (int dy = -2; dy <= 2; dy++)
|
239 |
+
{
|
240 |
+
for (int dx = -2; dx <= 2; dx++)
|
241 |
+
{
|
242 |
+
if (cur_h + dy >= 0 && cur_h + dy < height &&
|
243 |
+
cur_w + dx >= 0 && cur_w + dx < width &&
|
244 |
+
abs(cur_inv_h_data - (cur_h + dy)) < 1 &&
|
245 |
+
abs(cur_inv_w_data - (cur_w + dx)) < 1)
|
246 |
+
{
|
247 |
+
int cur_bottom_grad_pos = ((b * channels + c) * height + cur_h + dy) * width + cur_w + dx;
|
248 |
+
float weight = dmcn_get_gradient_weight(cur_inv_h_data, cur_inv_w_data, cur_h + dy, cur_w + dx, height, width);
|
249 |
+
atomicAdd(grad_im + cur_bottom_grad_pos, weight * cur_top_grad);
|
250 |
+
}
|
251 |
+
}
|
252 |
+
}
|
253 |
+
}
|
254 |
+
}
|
255 |
+
|
256 |
+
__global__ void modulated_deformable_col2im_coord_gpu_kernel(const int n,
|
257 |
+
const float *data_col, const float *data_im,
|
258 |
+
const float *data_offset, const float *data_mask,
|
259 |
+
const int channels, const int height, const int width,
|
260 |
+
const int kernel_h, const int kernel_w,
|
261 |
+
const int pad_h, const int pad_w,
|
262 |
+
const int stride_h, const int stride_w,
|
263 |
+
const int dilation_h, const int dilation_w,
|
264 |
+
const int channel_per_deformable_group,
|
265 |
+
const int batch_size, const int offset_channels, const int deformable_group,
|
266 |
+
const int height_col, const int width_col,
|
267 |
+
float *grad_offset, float *grad_mask)
|
268 |
+
{
|
269 |
+
CUDA_KERNEL_LOOP(index, n)
|
270 |
+
{
|
271 |
+
float val = 0, mval = 0;
|
272 |
+
int w = index % width_col;
|
273 |
+
int h = (index / width_col) % height_col;
|
274 |
+
int c = (index / width_col / height_col) % offset_channels;
|
275 |
+
int b = (index / width_col / height_col) / offset_channels;
|
276 |
+
// compute the start and end of the output
|
277 |
+
|
278 |
+
const int deformable_group_index = c / (2 * kernel_h * kernel_w);
|
279 |
+
const int col_step = kernel_h * kernel_w;
|
280 |
+
int cnt = 0;
|
281 |
+
const float *data_col_ptr = data_col + deformable_group_index * channel_per_deformable_group * batch_size * width_col * height_col;
|
282 |
+
const float *data_im_ptr = data_im + (b * deformable_group + deformable_group_index) * channel_per_deformable_group / kernel_h / kernel_w * height * width;
|
283 |
+
const float *data_offset_ptr = data_offset + (b * deformable_group + deformable_group_index) * 2 * kernel_h * kernel_w * height_col * width_col;
|
284 |
+
const float *data_mask_ptr = data_mask + (b * deformable_group + deformable_group_index) * kernel_h * kernel_w * height_col * width_col;
|
285 |
+
|
286 |
+
const int offset_c = c - deformable_group_index * 2 * kernel_h * kernel_w;
|
287 |
+
|
288 |
+
for (int col_c = (offset_c / 2); col_c < channel_per_deformable_group; col_c += col_step)
|
289 |
+
{
|
290 |
+
const int col_pos = (((col_c * batch_size + b) * height_col) + h) * width_col + w;
|
291 |
+
const int bp_dir = offset_c % 2;
|
292 |
+
|
293 |
+
int j = (col_pos / width_col / height_col / batch_size) % kernel_w;
|
294 |
+
int i = (col_pos / width_col / height_col / batch_size / kernel_w) % kernel_h;
|
295 |
+
int w_out = col_pos % width_col;
|
296 |
+
int h_out = (col_pos / width_col) % height_col;
|
297 |
+
int w_in = w_out * stride_w - pad_w;
|
298 |
+
int h_in = h_out * stride_h - pad_h;
|
299 |
+
const int data_offset_h_ptr = (((2 * (i * kernel_w + j)) * height_col + h_out) * width_col + w_out);
|
300 |
+
const int data_offset_w_ptr = (((2 * (i * kernel_w + j) + 1) * height_col + h_out) * width_col + w_out);
|
301 |
+
const int data_mask_hw_ptr = (((i * kernel_w + j) * height_col + h_out) * width_col + w_out);
|
302 |
+
const float offset_h = data_offset_ptr[data_offset_h_ptr];
|
303 |
+
const float offset_w = data_offset_ptr[data_offset_w_ptr];
|
304 |
+
const float mask = data_mask_ptr[data_mask_hw_ptr];
|
305 |
+
float inv_h = h_in + i * dilation_h + offset_h;
|
306 |
+
float inv_w = w_in + j * dilation_w + offset_w;
|
307 |
+
if (inv_h <= -1 || inv_w <= -1 || inv_h >= height || inv_w >= width)
|
308 |
+
{
|
309 |
+
inv_h = inv_w = -2;
|
310 |
+
}
|
311 |
+
else
|
312 |
+
{
|
313 |
+
mval += data_col_ptr[col_pos] * dmcn_im2col_bilinear(data_im_ptr + cnt * height * width, width, height, width, inv_h, inv_w);
|
314 |
+
}
|
315 |
+
const float weight = dmcn_get_coordinate_weight(
|
316 |
+
inv_h, inv_w,
|
317 |
+
height, width, data_im_ptr + cnt * height * width, width, bp_dir);
|
318 |
+
val += weight * data_col_ptr[col_pos] * mask;
|
319 |
+
cnt += 1;
|
320 |
+
}
|
321 |
+
// KERNEL_ASSIGN(grad_offset[index], offset_req, val);
|
322 |
+
grad_offset[index] = val;
|
323 |
+
if (offset_c % 2 == 0)
|
324 |
+
// KERNEL_ASSIGN(grad_mask[(((b * deformable_group + deformable_group_index) * kernel_h * kernel_w + offset_c / 2) * height_col + h) * width_col + w], mask_req, mval);
|
325 |
+
grad_mask[(((b * deformable_group + deformable_group_index) * kernel_h * kernel_w + offset_c / 2) * height_col + h) * width_col + w] = mval;
|
326 |
+
}
|
327 |
+
}
|
328 |
+
|
329 |
+
void modulated_deformable_im2col_cuda(cudaStream_t stream,
|
330 |
+
const float* data_im, const float* data_offset, const float* data_mask,
|
331 |
+
const int batch_size, const int channels, const int height_im, const int width_im,
|
332 |
+
const int height_col, const int width_col, const int kernel_h, const int kernel_w,
|
333 |
+
const int pad_h, const int pad_w, const int stride_h, const int stride_w,
|
334 |
+
const int dilation_h, const int dilation_w,
|
335 |
+
const int deformable_group, float* data_col) {
|
336 |
+
// num_axes should be smaller than block size
|
337 |
+
const int channel_per_deformable_group = channels / deformable_group;
|
338 |
+
const int num_kernels = channels * batch_size * height_col * width_col;
|
339 |
+
modulated_deformable_im2col_gpu_kernel
|
340 |
+
<<<GET_BLOCKS(num_kernels), CUDA_NUM_THREADS,
|
341 |
+
0, stream>>>(
|
342 |
+
num_kernels, data_im, data_offset, data_mask, height_im, width_im, kernel_h, kernel_w,
|
343 |
+
pad_h, pad_w, stride_h, stride_w, dilation_h, dilation_w, channel_per_deformable_group,
|
344 |
+
batch_size, channels, deformable_group, height_col, width_col, data_col);
|
345 |
+
|
346 |
+
cudaError_t err = cudaGetLastError();
|
347 |
+
if (err != cudaSuccess)
|
348 |
+
{
|
349 |
+
printf("error in modulated_deformable_im2col_cuda: %s\n", cudaGetErrorString(err));
|
350 |
+
}
|
351 |
+
|
352 |
+
}
|
353 |
+
|
354 |
+
void modulated_deformable_col2im_cuda(cudaStream_t stream,
|
355 |
+
const float* data_col, const float* data_offset, const float* data_mask,
|
356 |
+
const int batch_size, const int channels, const int height_im, const int width_im,
|
357 |
+
const int height_col, const int width_col, const int kernel_h, const int kernel_w,
|
358 |
+
const int pad_h, const int pad_w, const int stride_h, const int stride_w,
|
359 |
+
const int dilation_h, const int dilation_w,
|
360 |
+
const int deformable_group, float* grad_im){
|
361 |
+
|
362 |
+
const int channel_per_deformable_group = channels / deformable_group;
|
363 |
+
const int num_kernels = channels * kernel_h * kernel_w * batch_size * height_col * width_col;
|
364 |
+
modulated_deformable_col2im_gpu_kernel
|
365 |
+
<<<GET_BLOCKS(num_kernels), CUDA_NUM_THREADS,
|
366 |
+
0, stream>>>(
|
367 |
+
num_kernels, data_col, data_offset, data_mask, channels, height_im, width_im,
|
368 |
+
kernel_h, kernel_w, pad_h, pad_w, stride_h, stride_w,
|
369 |
+
dilation_h, dilation_w, channel_per_deformable_group,
|
370 |
+
batch_size, deformable_group, height_col, width_col, grad_im);
|
371 |
+
cudaError_t err = cudaGetLastError();
|
372 |
+
if (err != cudaSuccess)
|
373 |
+
{
|
374 |
+
printf("error in modulated_deformable_col2im_cuda: %s\n", cudaGetErrorString(err));
|
375 |
+
}
|
376 |
+
|
377 |
+
}
|
378 |
+
|
379 |
+
void modulated_deformable_col2im_coord_cuda(cudaStream_t stream,
|
380 |
+
const float* data_col, const float* data_im, const float* data_offset, const float* data_mask,
|
381 |
+
const int batch_size, const int channels, const int height_im, const int width_im,
|
382 |
+
const int height_col, const int width_col, const int kernel_h, const int kernel_w,
|
383 |
+
const int pad_h, const int pad_w, const int stride_h, const int stride_w,
|
384 |
+
const int dilation_h, const int dilation_w,
|
385 |
+
const int deformable_group,
|
386 |
+
float* grad_offset, float* grad_mask) {
|
387 |
+
const int num_kernels = batch_size * height_col * width_col * 2 * kernel_h * kernel_w * deformable_group;
|
388 |
+
const int channel_per_deformable_group = channels * kernel_h * kernel_w / deformable_group;
|
389 |
+
modulated_deformable_col2im_coord_gpu_kernel
|
390 |
+
<<<GET_BLOCKS(num_kernels), CUDA_NUM_THREADS,
|
391 |
+
0, stream>>>(
|
392 |
+
num_kernels, data_col, data_im, data_offset, data_mask, channels, height_im, width_im,
|
393 |
+
kernel_h, kernel_w, pad_h, pad_w, stride_h, stride_w,
|
394 |
+
dilation_h, dilation_w, channel_per_deformable_group,
|
395 |
+
batch_size, 2 * kernel_h * kernel_w * deformable_group, deformable_group, height_col, width_col,
|
396 |
+
grad_offset, grad_mask);
|
397 |
+
cudaError_t err = cudaGetLastError();
|
398 |
+
if (err != cudaSuccess)
|
399 |
+
{
|
400 |
+
printf("error in modulated_deformable_col2im_coord_cuda: %s\n", cudaGetErrorString(err));
|
401 |
+
}
|
402 |
+
}
|
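Editor's sketch (not part of the diff): the three host wrappers above take height_col / width_col as arguments rather than computing them, so a caller is expected to derive them with the usual convolution output-size formula. A minimal illustration, with illustrative names:

// Sketch only: standard convolution output-size arithmetic assumed for
// height_col / width_col by the im2col / col2im wrappers above.
inline int conv_output_size(int input_size, int kernel, int pad, int stride, int dilation)
{
    return (input_size + 2 * pad - (dilation * (kernel - 1) + 1)) / stride + 1;
}
// e.g. height_col = conv_output_size(height_im, kernel_h, pad_h, stride_h, dilation_h);
//      width_col  = conv_output_size(width_im,  kernel_w, pad_w, stride_w, dilation_w);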
maskrcnn_benchmark/csrc/cuda/dcn_v2_im2col_cuda.h
ADDED
@@ -0,0 +1,101 @@
1 |
+
|
2 |
+
/*!
|
3 |
+
******************* BEGIN Caffe Copyright Notice and Disclaimer ****************
|
4 |
+
*
|
5 |
+
* COPYRIGHT
|
6 |
+
*
|
7 |
+
* All contributions by the University of California:
|
8 |
+
* Copyright (c) 2014-2017 The Regents of the University of California (Regents)
|
9 |
+
* All rights reserved.
|
10 |
+
*
|
11 |
+
* All other contributions:
|
12 |
+
* Copyright (c) 2014-2017, the respective contributors
|
13 |
+
* All rights reserved.
|
14 |
+
*
|
15 |
+
* Caffe uses a shared copyright model: each contributor holds copyright over
|
16 |
+
* their contributions to Caffe. The project versioning records all such
|
17 |
+
* contribution and copyright details. If a contributor wants to further mark
|
18 |
+
* their specific copyright on a particular contribution, they should indicate
|
19 |
+
* their copyright solely in the commit message of the change when it is
|
20 |
+
* committed.
|
21 |
+
*
|
22 |
+
* LICENSE
|
23 |
+
*
|
24 |
+
* Redistribution and use in source and binary forms, with or without
|
25 |
+
* modification, are permitted provided that the following conditions are met:
|
26 |
+
*
|
27 |
+
* 1. Redistributions of source code must retain the above copyright notice, this
|
28 |
+
* list of conditions and the following disclaimer.
|
29 |
+
* 2. Redistributions in binary form must reproduce the above copyright notice,
|
30 |
+
* this list of conditions and the following disclaimer in the documentation
|
31 |
+
* and/or other materials provided with the distribution.
|
32 |
+
*
|
33 |
+
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
|
34 |
+
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
|
35 |
+
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
36 |
+
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
|
37 |
+
* ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
|
38 |
+
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
|
39 |
+
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
|
40 |
+
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
41 |
+
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
42 |
+
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
43 |
+
*
|
44 |
+
* CONTRIBUTION AGREEMENT
|
45 |
+
*
|
46 |
+
* By contributing to the BVLC/caffe repository through pull-request, comment,
|
47 |
+
* or otherwise, the contributor releases their content to the
|
48 |
+
* license and copyright terms herein.
|
49 |
+
*
|
50 |
+
***************** END Caffe Copyright Notice and Disclaimer ********************
|
51 |
+
*
|
52 |
+
* Copyright (c) 2018 Microsoft
|
53 |
+
* Licensed under The MIT License [see LICENSE for details]
|
54 |
+
* \file modulated_deformable_im2col.h
|
55 |
+
* \brief Function definitions of converting an image to
|
56 |
+
* column matrix based on kernel, padding, dilation, and offset.
|
57 |
+
* These functions are mainly used in deformable convolution operators.
|
58 |
+
* \ref: https://arxiv.org/abs/1811.11168
|
59 |
+
* \author Yuwen Xiong, Haozhi Qi, Jifeng Dai, Xizhou Zhu, Han Hu
|
60 |
+
*/
|
61 |
+
|
62 |
+
/***************** Adapted by Charles Shang *********************/
|
63 |
+
|
64 |
+
#ifndef DCN_V2_IM2COL_CUDA
|
65 |
+
#define DCN_V2_IM2COL_CUDA
|
66 |
+
|
67 |
+
#ifdef __cplusplus
|
68 |
+
extern "C"
|
69 |
+
{
|
70 |
+
#endif
|
71 |
+
|
72 |
+
void modulated_deformable_im2col_cuda(cudaStream_t stream,
|
73 |
+
const float *data_im, const float *data_offset, const float *data_mask,
|
74 |
+
const int batch_size, const int channels, const int height_im, const int width_im,
|
75 |
+
const int height_col, const int width_col, const int kernel_h, const int kernel_w,
|
76 |
+
const int pad_h, const int pad_w, const int stride_h, const int stride_w,
|
77 |
+
const int dilation_h, const int dilation_w,
|
78 |
+
const int deformable_group, float *data_col);
|
79 |
+
|
80 |
+
void modulated_deformable_col2im_cuda(cudaStream_t stream,
|
81 |
+
const float *data_col, const float *data_offset, const float *data_mask,
|
82 |
+
const int batch_size, const int channels, const int height_im, const int width_im,
|
83 |
+
const int height_col, const int width_col, const int kernel_h, const int kenerl_w,
|
84 |
+
const int pad_h, const int pad_w, const int stride_h, const int stride_w,
|
85 |
+
const int dilation_h, const int dilation_w,
|
86 |
+
const int deformable_group, float *grad_im);
|
87 |
+
|
88 |
+
void modulated_deformable_col2im_coord_cuda(cudaStream_t stream,
|
89 |
+
const float *data_col, const float *data_im, const float *data_offset, const float *data_mask,
|
90 |
+
const int batch_size, const int channels, const int height_im, const int width_im,
|
91 |
+
const int height_col, const int width_col, const int kernel_h, const int kenerl_w,
|
92 |
+
const int pad_h, const int pad_w, const int stride_h, const int stride_w,
|
93 |
+
const int dilation_h, const int dilation_w,
|
94 |
+
const int deformable_group,
|
95 |
+
float *grad_offset, float *grad_mask);
|
96 |
+
|
97 |
+
#ifdef __cplusplus
|
98 |
+
}
|
99 |
+
#endif
|
100 |
+
|
101 |
+
#endif
|
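Editor's note (not part of the diff): these declarations follow the usual im2col formulation of modulated deformable convolution. Once modulated_deformable_im2col_cuda has gathered the offset- and mask-weighted samples into a column buffer, the convolution itself reduces to a per-image matrix product,

$$ \mathrm{out}_{(C_\mathrm{out} \times H_\mathrm{col} W_\mathrm{col})} = W_{(C_\mathrm{out} \times C_\mathrm{in} k_h k_w)} \cdot \mathrm{columns}_{(C_\mathrm{in} k_h k_w \times H_\mathrm{col} W_\mathrm{col})} + b, $$

which matches how the BLAS calls in dcn_v2_cuda.cu earlier in this diff consume the column buffer.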
maskrcnn_benchmark/csrc/cuda/dcn_v2_psroi_pooling_cuda.cu
ADDED
@@ -0,0 +1,419 @@
1 |
+
/*!
|
2 |
+
* Copyright (c) 2017 Microsoft
|
3 |
+
* Licensed under The MIT License [see LICENSE for details]
|
4 |
+
* \file deformable_psroi_pooling.cu
|
5 |
+
* \brief
|
6 |
+
* \author Yi Li, Guodong Zhang, Jifeng Dai
|
7 |
+
*/
|
8 |
+
/***************** Adapted by Charles Shang *********************/
|
9 |
+
|
10 |
+
#include <cstdio>
|
11 |
+
#include <algorithm>
|
12 |
+
#include <cstring>
|
13 |
+
#include <iostream>
|
14 |
+
|
15 |
+
#include <ATen/ATen.h>
|
16 |
+
#include <ATen/cuda/CUDAContext.h>
|
17 |
+
|
18 |
+
#include <THC/THC.h>
|
19 |
+
#include <THC/THCAtomics.cuh>
|
20 |
+
#include <THC/THCDeviceUtils.cuh>
|
21 |
+
|
22 |
+
#define CUDA_KERNEL_LOOP(i, n) \
|
23 |
+
for (int i = blockIdx.x * blockDim.x + threadIdx.x; \
|
24 |
+
i < (n); \
|
25 |
+
i += blockDim.x * gridDim.x)
|
26 |
+
|
27 |
+
const int CUDA_NUM_THREADS = 1024;
|
28 |
+
inline int GET_BLOCKS(const int N)
|
29 |
+
{
|
30 |
+
return (N + CUDA_NUM_THREADS - 1) / CUDA_NUM_THREADS;
|
31 |
+
}
|
32 |
+
|
33 |
+
template <typename T>
|
34 |
+
__device__ T bilinear_interp(
|
35 |
+
const T *data,
|
36 |
+
const T x,
|
37 |
+
const T y,
|
38 |
+
const int width,
|
39 |
+
const int height)
|
40 |
+
{
|
41 |
+
int x1 = floor(x);
|
42 |
+
int x2 = ceil(x);
|
43 |
+
int y1 = floor(y);
|
44 |
+
int y2 = ceil(y);
|
45 |
+
T dist_x = static_cast<T>(x - x1);
|
46 |
+
T dist_y = static_cast<T>(y - y1);
|
47 |
+
T value11 = data[y1 * width + x1];
|
48 |
+
T value12 = data[y2 * width + x1];
|
49 |
+
T value21 = data[y1 * width + x2];
|
50 |
+
T value22 = data[y2 * width + x2];
|
51 |
+
T value = (1 - dist_x) * (1 - dist_y) * value11 +
|
52 |
+
(1 - dist_x) * dist_y * value12 +
|
53 |
+
dist_x * (1 - dist_y) * value21 +
|
54 |
+
dist_x * dist_y * value22;
|
55 |
+
return value;
|
56 |
+
}
|
57 |
+
|
58 |
+
template <typename T>
|
59 |
+
__global__ void DeformablePSROIPoolForwardKernel(
|
60 |
+
const int count,
|
61 |
+
const T *bottom_data,
|
62 |
+
const T spatial_scale,
|
63 |
+
const int channels,
|
64 |
+
const int height, const int width,
|
65 |
+
const int pooled_height, const int pooled_width,
|
66 |
+
const T *bottom_rois, const T *bottom_trans,
|
67 |
+
const int no_trans,
|
68 |
+
const T trans_std,
|
69 |
+
const int sample_per_part,
|
70 |
+
const int output_dim,
|
71 |
+
const int group_size,
|
72 |
+
const int part_size,
|
73 |
+
const int num_classes,
|
74 |
+
const int channels_each_class,
|
75 |
+
T *top_data,
|
76 |
+
T *top_count)
|
77 |
+
{
|
78 |
+
CUDA_KERNEL_LOOP(index, count)
|
79 |
+
{
|
80 |
+
// The output is in order (n, ctop, ph, pw)
|
81 |
+
int pw = index % pooled_width;
|
82 |
+
int ph = (index / pooled_width) % pooled_height;
|
83 |
+
int ctop = (index / pooled_width / pooled_height) % output_dim;
|
84 |
+
int n = index / pooled_width / pooled_height / output_dim;
|
85 |
+
|
86 |
+
// [start, end) interval for spatial sampling
|
87 |
+
const T *offset_bottom_rois = bottom_rois + n * 5;
|
88 |
+
int roi_batch_ind = offset_bottom_rois[0];
|
89 |
+
T roi_start_w = static_cast<T>(round(offset_bottom_rois[1])) * spatial_scale - 0.5;
|
90 |
+
T roi_start_h = static_cast<T>(round(offset_bottom_rois[2])) * spatial_scale - 0.5;
|
91 |
+
T roi_end_w = static_cast<T>(round(offset_bottom_rois[3]) + 1.) * spatial_scale - 0.5;
|
92 |
+
T roi_end_h = static_cast<T>(round(offset_bottom_rois[4]) + 1.) * spatial_scale - 0.5;
|
93 |
+
|
94 |
+
// Force too small ROIs to be 1x1
|
95 |
+
T roi_width = max(roi_end_w - roi_start_w, 0.1); //avoid 0
|
96 |
+
T roi_height = max(roi_end_h - roi_start_h, 0.1);
|
97 |
+
|
98 |
+
// Compute w and h at bottom
|
99 |
+
T bin_size_h = roi_height / static_cast<T>(pooled_height);
|
100 |
+
T bin_size_w = roi_width / static_cast<T>(pooled_width);
|
101 |
+
|
102 |
+
T sub_bin_size_h = bin_size_h / static_cast<T>(sample_per_part);
|
103 |
+
T sub_bin_size_w = bin_size_w / static_cast<T>(sample_per_part);
|
104 |
+
|
105 |
+
int part_h = floor(static_cast<T>(ph) / pooled_height * part_size);
|
106 |
+
int part_w = floor(static_cast<T>(pw) / pooled_width * part_size);
|
107 |
+
int class_id = ctop / channels_each_class;
|
108 |
+
T trans_x = no_trans ? static_cast<T>(0) : bottom_trans[(((n * num_classes + class_id) * 2) * part_size + part_h) * part_size + part_w] * trans_std;
|
109 |
+
T trans_y = no_trans ? static_cast<T>(0) : bottom_trans[(((n * num_classes + class_id) * 2 + 1) * part_size + part_h) * part_size + part_w] * trans_std;
|
110 |
+
|
111 |
+
T wstart = static_cast<T>(pw) * bin_size_w + roi_start_w;
|
112 |
+
wstart += trans_x * roi_width;
|
113 |
+
T hstart = static_cast<T>(ph) * bin_size_h + roi_start_h;
|
114 |
+
hstart += trans_y * roi_height;
|
115 |
+
|
116 |
+
T sum = 0;
|
117 |
+
int count = 0;
|
118 |
+
int gw = floor(static_cast<T>(pw) * group_size / pooled_width);
|
119 |
+
int gh = floor(static_cast<T>(ph) * group_size / pooled_height);
|
120 |
+
gw = min(max(gw, 0), group_size - 1);
|
121 |
+
gh = min(max(gh, 0), group_size - 1);
|
122 |
+
|
123 |
+
const T *offset_bottom_data = bottom_data + (roi_batch_ind * channels) * height * width;
|
124 |
+
for (int ih = 0; ih < sample_per_part; ih++)
|
125 |
+
{
|
126 |
+
for (int iw = 0; iw < sample_per_part; iw++)
|
127 |
+
{
|
128 |
+
T w = wstart + iw * sub_bin_size_w;
|
129 |
+
T h = hstart + ih * sub_bin_size_h;
|
130 |
+
// bilinear interpolation
|
131 |
+
if (w < -0.5 || w > width - 0.5 || h < -0.5 || h > height - 0.5)
|
132 |
+
{
|
133 |
+
continue;
|
134 |
+
}
|
135 |
+
w = min(max(w, 0.), width - 1.);
|
136 |
+
h = min(max(h, 0.), height - 1.);
|
137 |
+
int c = (ctop * group_size + gh) * group_size + gw;
|
138 |
+
T val = bilinear_interp(offset_bottom_data + c * height * width, w, h, width, height);
|
139 |
+
sum += val;
|
140 |
+
count++;
|
141 |
+
}
|
142 |
+
}
|
143 |
+
top_data[index] = count == 0 ? static_cast<T>(0) : sum / count;
|
144 |
+
top_count[index] = count;
|
145 |
+
}
|
146 |
+
}
|
147 |
+
|
148 |
+
template <typename T>
|
149 |
+
__global__ void DeformablePSROIPoolBackwardAccKernel(
|
150 |
+
const int count,
|
151 |
+
const T *top_diff,
|
152 |
+
const T *top_count,
|
153 |
+
const int num_rois,
|
154 |
+
const T spatial_scale,
|
155 |
+
const int channels,
|
156 |
+
const int height, const int width,
|
157 |
+
const int pooled_height, const int pooled_width,
|
158 |
+
const int output_dim,
|
159 |
+
T *bottom_data_diff, T *bottom_trans_diff,
|
160 |
+
const T *bottom_data,
|
161 |
+
const T *bottom_rois,
|
162 |
+
const T *bottom_trans,
|
163 |
+
const int no_trans,
|
164 |
+
const T trans_std,
|
165 |
+
const int sample_per_part,
|
166 |
+
const int group_size,
|
167 |
+
const int part_size,
|
168 |
+
const int num_classes,
|
169 |
+
const int channels_each_class)
|
170 |
+
{
|
171 |
+
CUDA_KERNEL_LOOP(index, count)
|
172 |
+
{
|
173 |
+
// The output is in order (n, ctop, ph, pw)
|
174 |
+
int pw = index % pooled_width;
|
175 |
+
int ph = (index / pooled_width) % pooled_height;
|
176 |
+
int ctop = (index / pooled_width / pooled_height) % output_dim;
|
177 |
+
int n = index / pooled_width / pooled_height / output_dim;
|
178 |
+
|
179 |
+
// [start, end) interval for spatial sampling
|
180 |
+
const T *offset_bottom_rois = bottom_rois + n * 5;
|
181 |
+
int roi_batch_ind = offset_bottom_rois[0];
|
182 |
+
T roi_start_w = static_cast<T>(round(offset_bottom_rois[1])) * spatial_scale - 0.5;
|
183 |
+
T roi_start_h = static_cast<T>(round(offset_bottom_rois[2])) * spatial_scale - 0.5;
|
184 |
+
T roi_end_w = static_cast<T>(round(offset_bottom_rois[3]) + 1.) * spatial_scale - 0.5;
|
185 |
+
T roi_end_h = static_cast<T>(round(offset_bottom_rois[4]) + 1.) * spatial_scale - 0.5;
|
186 |
+
|
187 |
+
// Force too small ROIs to be 1x1
|
188 |
+
T roi_width = max(roi_end_w - roi_start_w, 0.1); //avoid 0
|
189 |
+
T roi_height = max(roi_end_h - roi_start_h, 0.1);
|
190 |
+
|
191 |
+
// Compute w and h at bottom
|
192 |
+
T bin_size_h = roi_height / static_cast<T>(pooled_height);
|
193 |
+
T bin_size_w = roi_width / static_cast<T>(pooled_width);
|
194 |
+
|
195 |
+
T sub_bin_size_h = bin_size_h / static_cast<T>(sample_per_part);
|
196 |
+
T sub_bin_size_w = bin_size_w / static_cast<T>(sample_per_part);
|
197 |
+
|
198 |
+
int part_h = floor(static_cast<T>(ph) / pooled_height * part_size);
|
199 |
+
int part_w = floor(static_cast<T>(pw) / pooled_width * part_size);
|
200 |
+
int class_id = ctop / channels_each_class;
|
201 |
+
T trans_x = no_trans ? static_cast<T>(0) : bottom_trans[(((n * num_classes + class_id) * 2) * part_size + part_h) * part_size + part_w] * trans_std;
|
202 |
+
T trans_y = no_trans ? static_cast<T>(0) : bottom_trans[(((n * num_classes + class_id) * 2 + 1) * part_size + part_h) * part_size + part_w] * trans_std;
|
203 |
+
|
204 |
+
T wstart = static_cast<T>(pw) * bin_size_w + roi_start_w;
|
205 |
+
wstart += trans_x * roi_width;
|
206 |
+
T hstart = static_cast<T>(ph) * bin_size_h + roi_start_h;
|
207 |
+
hstart += trans_y * roi_height;
|
208 |
+
|
209 |
+
if (top_count[index] <= 0)
|
210 |
+
{
|
211 |
+
continue;
|
212 |
+
}
|
213 |
+
T diff_val = top_diff[index] / top_count[index];
|
214 |
+
const T *offset_bottom_data = bottom_data + roi_batch_ind * channels * height * width;
|
215 |
+
T *offset_bottom_data_diff = bottom_data_diff + roi_batch_ind * channels * height * width;
|
216 |
+
int gw = floor(static_cast<T>(pw) * group_size / pooled_width);
|
217 |
+
int gh = floor(static_cast<T>(ph) * group_size / pooled_height);
|
218 |
+
gw = min(max(gw, 0), group_size - 1);
|
219 |
+
gh = min(max(gh, 0), group_size - 1);
|
220 |
+
|
221 |
+
for (int ih = 0; ih < sample_per_part; ih++)
|
222 |
+
{
|
223 |
+
for (int iw = 0; iw < sample_per_part; iw++)
|
224 |
+
{
|
225 |
+
T w = wstart + iw * sub_bin_size_w;
|
226 |
+
T h = hstart + ih * sub_bin_size_h;
|
227 |
+
// bilinear interpolation
|
228 |
+
if (w < -0.5 || w > width - 0.5 || h < -0.5 || h > height - 0.5)
|
229 |
+
{
|
230 |
+
continue;
|
231 |
+
}
|
232 |
+
w = min(max(w, 0.), width - 1.);
|
233 |
+
h = min(max(h, 0.), height - 1.);
|
234 |
+
int c = (ctop * group_size + gh) * group_size + gw;
|
235 |
+
// backward on feature
|
236 |
+
int x0 = floor(w);
|
237 |
+
int x1 = ceil(w);
|
238 |
+
int y0 = floor(h);
|
239 |
+
int y1 = ceil(h);
|
240 |
+
T dist_x = w - x0, dist_y = h - y0;
|
241 |
+
T q00 = (1 - dist_x) * (1 - dist_y);
|
242 |
+
T q01 = (1 - dist_x) * dist_y;
|
243 |
+
T q10 = dist_x * (1 - dist_y);
|
244 |
+
T q11 = dist_x * dist_y;
|
245 |
+
int bottom_index_base = c * height * width;
|
246 |
+
atomicAdd(offset_bottom_data_diff + bottom_index_base + y0 * width + x0, q00 * diff_val);
|
247 |
+
atomicAdd(offset_bottom_data_diff + bottom_index_base + y1 * width + x0, q01 * diff_val);
|
248 |
+
atomicAdd(offset_bottom_data_diff + bottom_index_base + y0 * width + x1, q10 * diff_val);
|
249 |
+
atomicAdd(offset_bottom_data_diff + bottom_index_base + y1 * width + x1, q11 * diff_val);
|
250 |
+
|
251 |
+
if (no_trans)
|
252 |
+
{
|
253 |
+
continue;
|
254 |
+
}
|
255 |
+
T U00 = offset_bottom_data[bottom_index_base + y0 * width + x0];
|
256 |
+
T U01 = offset_bottom_data[bottom_index_base + y1 * width + x0];
|
257 |
+
T U10 = offset_bottom_data[bottom_index_base + y0 * width + x1];
|
258 |
+
T U11 = offset_bottom_data[bottom_index_base + y1 * width + x1];
|
259 |
+
T diff_x = (U11 * dist_y + U10 * (1 - dist_y) - U01 * dist_y - U00 * (1 - dist_y)) * trans_std * diff_val;
|
260 |
+
diff_x *= roi_width;
|
261 |
+
T diff_y = (U11 * dist_x + U01 * (1 - dist_x) - U10 * dist_x - U00 * (1 - dist_x)) * trans_std * diff_val;
|
262 |
+
diff_y *= roi_height;
|
263 |
+
|
264 |
+
atomicAdd(bottom_trans_diff + (((n * num_classes + class_id) * 2) * part_size + part_h) * part_size + part_w, diff_x);
|
265 |
+
atomicAdd(bottom_trans_diff + (((n * num_classes + class_id) * 2 + 1) * part_size + part_h) * part_size + part_w, diff_y);
|
266 |
+
}
|
267 |
+
}
|
268 |
+
}
|
269 |
+
}
|
270 |
+
|
271 |
+
std::tuple<at::Tensor, at::Tensor>
|
272 |
+
dcn_v2_psroi_pooling_cuda_forward(const at::Tensor &input,
|
273 |
+
const at::Tensor &bbox,
|
274 |
+
const at::Tensor &trans,
|
275 |
+
const int no_trans,
|
276 |
+
const float spatial_scale,
|
277 |
+
const int output_dim,
|
278 |
+
const int group_size,
|
279 |
+
const int pooled_size,
|
280 |
+
const int part_size,
|
281 |
+
const int sample_per_part,
|
282 |
+
const float trans_std)
|
283 |
+
{
|
284 |
+
AT_ASSERTM(input.type().is_cuda(), "input must be a CUDA tensor");
|
285 |
+
AT_ASSERTM(bbox.type().is_cuda(), "rois must be a CUDA tensor");
|
286 |
+
AT_ASSERTM(trans.type().is_cuda(), "trans must be a CUDA tensor");
|
287 |
+
|
288 |
+
const int batch = input.size(0);
|
289 |
+
const int channels = input.size(1);
|
290 |
+
const int height = input.size(2);
|
291 |
+
const int width = input.size(3);
|
292 |
+
const int channels_trans = no_trans ? 2 : trans.size(1);
|
293 |
+
const int num_bbox = bbox.size(0);
|
294 |
+
|
295 |
+
AT_ASSERTM(channels == output_dim, "input channels and output channels must equal");
|
296 |
+
auto pooled_height = pooled_size;
|
297 |
+
auto pooled_width = pooled_size;
|
298 |
+
|
299 |
+
auto out = at::empty({num_bbox, output_dim, pooled_height, pooled_width}, input.options());
|
300 |
+
long out_size = num_bbox * output_dim * pooled_height * pooled_width;
|
301 |
+
auto top_count = at::zeros({num_bbox, output_dim, pooled_height, pooled_width}, input.options());
|
302 |
+
|
303 |
+
const int num_classes = no_trans ? 1 : channels_trans / 2;
|
304 |
+
const int channels_each_class = no_trans ? output_dim : output_dim / num_classes;
|
305 |
+
|
306 |
+
cudaStream_t stream = at::cuda::getCurrentCUDAStream();
|
307 |
+
|
308 |
+
if (out.numel() == 0)
|
309 |
+
{
|
310 |
+
THCudaCheck(cudaGetLastError());
|
311 |
+
return std::make_tuple(out, top_count);
|
312 |
+
}
|
313 |
+
|
314 |
+
dim3 grid(std::min(THCCeilDiv(out_size, 512L), 4096L));
|
315 |
+
dim3 block(512);
|
316 |
+
|
317 |
+
AT_DISPATCH_FLOATING_TYPES(input.type(), "dcn_v2_psroi_pooling_cuda_forward", [&] {
|
318 |
+
DeformablePSROIPoolForwardKernel<scalar_t><<<grid, block, 0, stream>>>(
|
319 |
+
out_size,
|
320 |
+
input.contiguous().data<scalar_t>(),
|
321 |
+
spatial_scale,
|
322 |
+
channels,
|
323 |
+
height, width,
|
324 |
+
pooled_height,
|
325 |
+
pooled_width,
|
326 |
+
bbox.contiguous().data<scalar_t>(),
|
327 |
+
trans.contiguous().data<scalar_t>(),
|
328 |
+
no_trans,
|
329 |
+
trans_std,
|
330 |
+
sample_per_part,
|
331 |
+
output_dim,
|
332 |
+
group_size,
|
333 |
+
part_size,
|
334 |
+
num_classes,
|
335 |
+
channels_each_class,
|
336 |
+
out.data<scalar_t>(),
|
337 |
+
top_count.data<scalar_t>());
|
338 |
+
});
|
339 |
+
THCudaCheck(cudaGetLastError());
|
340 |
+
return std::make_tuple(out, top_count);
|
341 |
+
}
|
342 |
+
|
343 |
+
std::tuple<at::Tensor, at::Tensor>
|
344 |
+
dcn_v2_psroi_pooling_cuda_backward(const at::Tensor &out_grad,
|
345 |
+
const at::Tensor &input,
|
346 |
+
const at::Tensor &bbox,
|
347 |
+
const at::Tensor &trans,
|
348 |
+
const at::Tensor &top_count,
|
349 |
+
const int no_trans,
|
350 |
+
const float spatial_scale,
|
351 |
+
const int output_dim,
|
352 |
+
const int group_size,
|
353 |
+
const int pooled_size,
|
354 |
+
const int part_size,
|
355 |
+
const int sample_per_part,
|
356 |
+
const float trans_std)
|
357 |
+
{
|
358 |
+
AT_ASSERTM(out_grad.type().is_cuda(), "out_grad must be a CUDA tensor");
|
359 |
+
AT_ASSERTM(input.type().is_cuda(), "input must be a CUDA tensor");
|
360 |
+
AT_ASSERTM(bbox.type().is_cuda(), "bbox must be a CUDA tensor");
|
361 |
+
AT_ASSERTM(trans.type().is_cuda(), "trans must be a CUDA tensor");
|
362 |
+
AT_ASSERTM(top_count.type().is_cuda(), "top_count must be a CUDA tensor");
|
363 |
+
|
364 |
+
const int batch = input.size(0);
|
365 |
+
const int channels = input.size(1);
|
366 |
+
const int height = input.size(2);
|
367 |
+
const int width = input.size(3);
|
368 |
+
const int channels_trans = no_trans ? 2 : trans.size(1);
|
369 |
+
const int num_bbox = bbox.size(0);
|
370 |
+
|
371 |
+
AT_ASSERTM(channels == output_dim, "input channels and output channels must equal");
|
372 |
+
auto pooled_height = pooled_size;
|
373 |
+
auto pooled_width = pooled_size;
|
374 |
+
long out_size = num_bbox * output_dim * pooled_height * pooled_width;
|
375 |
+
const int num_classes = no_trans ? 1 : channels_trans / 2;
|
376 |
+
const int channels_each_class = no_trans ? output_dim : output_dim / num_classes;
|
377 |
+
|
378 |
+
auto input_grad = at::zeros({batch, channels, height, width}, out_grad.options());
|
379 |
+
auto trans_grad = at::zeros_like(trans);
|
380 |
+
|
381 |
+
if (input_grad.numel() == 0)
|
382 |
+
{
|
383 |
+
THCudaCheck(cudaGetLastError());
|
384 |
+
return std::make_tuple(input_grad, trans_grad);
|
385 |
+
}
|
386 |
+
|
387 |
+
dim3 grid(std::min(THCCeilDiv(out_size, 512L), 4096L));
|
388 |
+
dim3 block(512);
|
389 |
+
cudaStream_t stream = at::cuda::getCurrentCUDAStream();
|
390 |
+
|
391 |
+
AT_DISPATCH_FLOATING_TYPES(out_grad.type(), "dcn_v2_psroi_pooling_cuda_backward", [&] {
|
392 |
+
DeformablePSROIPoolBackwardAccKernel<scalar_t><<<grid, block, 0, stream>>>(
|
393 |
+
out_size,
|
394 |
+
out_grad.contiguous().data<scalar_t>(),
|
395 |
+
top_count.contiguous().data<scalar_t>(),
|
396 |
+
num_bbox,
|
397 |
+
spatial_scale,
|
398 |
+
channels,
|
399 |
+
height,
|
400 |
+
width,
|
401 |
+
pooled_height,
|
402 |
+
pooled_width,
|
403 |
+
output_dim,
|
404 |
+
input_grad.contiguous().data<scalar_t>(),
|
405 |
+
trans_grad.contiguous().data<scalar_t>(),
|
406 |
+
input.contiguous().data<scalar_t>(),
|
407 |
+
bbox.contiguous().data<scalar_t>(),
|
408 |
+
trans.contiguous().data<scalar_t>(),
|
409 |
+
no_trans,
|
410 |
+
trans_std,
|
411 |
+
sample_per_part,
|
412 |
+
group_size,
|
413 |
+
part_size,
|
414 |
+
num_classes,
|
415 |
+
channels_each_class);
|
416 |
+
});
|
417 |
+
THCudaCheck(cudaGetLastError());
|
418 |
+
return std::make_tuple(input_grad, trans_grad);
|
419 |
+
}
|
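Editor's note (not part of the diff): bilinear_interp above evaluates standard bilinear interpolation of the four neighbouring samples, with $\Delta x = x - x_1$ and $\Delta y = y - y_1$ and $v_{ij}$ the sample at $(x_i, y_j)$:

$$ f(x, y) = (1-\Delta x)(1-\Delta y)\,v_{11} + (1-\Delta x)\,\Delta y\,v_{12} + \Delta x\,(1-\Delta y)\,v_{21} + \Delta x\,\Delta y\,v_{22}. $$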
maskrcnn_benchmark/csrc/cuda/nms.cu
ADDED
@@ -0,0 +1,131 @@
1 |
+
// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
|
2 |
+
#include <ATen/ATen.h>
|
3 |
+
#include <ATen/cuda/CUDAContext.h>
|
4 |
+
|
5 |
+
#include <THC/THC.h>
|
6 |
+
#include <THC/THCDeviceUtils.cuh>
|
7 |
+
|
8 |
+
#include <vector>
|
9 |
+
#include <iostream>
|
10 |
+
|
11 |
+
int const threadsPerBlock = sizeof(unsigned long long) * 8;
|
12 |
+
|
13 |
+
__device__ inline float devIoU(float const * const a, float const * const b) {
|
14 |
+
float left = max(a[0], b[0]), right = min(a[2], b[2]);
|
15 |
+
float top = max(a[1], b[1]), bottom = min(a[3], b[3]);
|
16 |
+
float width = max(right - left + 1, 0.f), height = max(bottom - top + 1, 0.f);
|
17 |
+
float interS = width * height;
|
18 |
+
float Sa = (a[2] - a[0] + 1) * (a[3] - a[1] + 1);
|
19 |
+
float Sb = (b[2] - b[0] + 1) * (b[3] - b[1] + 1);
|
20 |
+
return interS / (Sa + Sb - interS);
|
21 |
+
}
|
22 |
+
|
23 |
+
__global__ void nms_kernel(const int n_boxes, const float nms_overlap_thresh,
|
24 |
+
const float *dev_boxes, unsigned long long *dev_mask) {
|
25 |
+
const int row_start = blockIdx.y;
|
26 |
+
const int col_start = blockIdx.x;
|
27 |
+
|
28 |
+
// if (row_start > col_start) return;
|
29 |
+
|
30 |
+
const int row_size =
|
31 |
+
min(n_boxes - row_start * threadsPerBlock, threadsPerBlock);
|
32 |
+
const int col_size =
|
33 |
+
min(n_boxes - col_start * threadsPerBlock, threadsPerBlock);
|
34 |
+
|
35 |
+
__shared__ float block_boxes[threadsPerBlock * 5];
|
36 |
+
if (threadIdx.x < col_size) {
|
37 |
+
block_boxes[threadIdx.x * 5 + 0] =
|
38 |
+
dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 0];
|
39 |
+
block_boxes[threadIdx.x * 5 + 1] =
|
40 |
+
dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 1];
|
41 |
+
block_boxes[threadIdx.x * 5 + 2] =
|
42 |
+
dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 2];
|
43 |
+
block_boxes[threadIdx.x * 5 + 3] =
|
44 |
+
dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 3];
|
45 |
+
block_boxes[threadIdx.x * 5 + 4] =
|
46 |
+
dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 4];
|
47 |
+
}
|
48 |
+
__syncthreads();
|
49 |
+
|
50 |
+
if (threadIdx.x < row_size) {
|
51 |
+
const int cur_box_idx = threadsPerBlock * row_start + threadIdx.x;
|
52 |
+
const float *cur_box = dev_boxes + cur_box_idx * 5;
|
53 |
+
int i = 0;
|
54 |
+
unsigned long long t = 0;
|
55 |
+
int start = 0;
|
56 |
+
if (row_start == col_start) {
|
57 |
+
start = threadIdx.x + 1;
|
58 |
+
}
|
59 |
+
for (i = start; i < col_size; i++) {
|
60 |
+
if (devIoU(cur_box, block_boxes + i * 5) > nms_overlap_thresh) {
|
61 |
+
t |= 1ULL << i;
|
62 |
+
}
|
63 |
+
}
|
64 |
+
const int col_blocks = THCCeilDiv(n_boxes, threadsPerBlock);
|
65 |
+
dev_mask[cur_box_idx * col_blocks + col_start] = t;
|
66 |
+
}
|
67 |
+
}
|
68 |
+
|
69 |
+
// boxes is a N x 5 tensor
|
70 |
+
at::Tensor nms_cuda(const at::Tensor boxes, float nms_overlap_thresh) {
|
71 |
+
using scalar_t = float;
|
72 |
+
AT_ASSERTM(boxes.type().is_cuda(), "boxes must be a CUDA tensor");
|
73 |
+
auto scores = boxes.select(1, 4);
|
74 |
+
auto order_t = std::get<1>(scores.sort(0, /* descending=*/true));
|
75 |
+
auto boxes_sorted = boxes.index_select(0, order_t);
|
76 |
+
|
77 |
+
int boxes_num = boxes.size(0);
|
78 |
+
|
79 |
+
const int col_blocks = THCCeilDiv(boxes_num, threadsPerBlock);
|
80 |
+
|
81 |
+
scalar_t* boxes_dev = boxes_sorted.data<scalar_t>();
|
82 |
+
|
83 |
+
THCState *state = at::globalContext().lazyInitCUDA(); // TODO replace with getTHCState
|
84 |
+
|
85 |
+
unsigned long long* mask_dev = NULL;
|
86 |
+
//THCudaCheck(THCudaMalloc(state, (void**) &mask_dev,
|
87 |
+
// boxes_num * col_blocks * sizeof(unsigned long long)));
|
88 |
+
|
89 |
+
mask_dev = (unsigned long long*) THCudaMalloc(state, boxes_num * col_blocks * sizeof(unsigned long long));
|
90 |
+
|
91 |
+
dim3 blocks(THCCeilDiv(boxes_num, threadsPerBlock),
|
92 |
+
THCCeilDiv(boxes_num, threadsPerBlock));
|
93 |
+
dim3 threads(threadsPerBlock);
|
94 |
+
nms_kernel<<<blocks, threads>>>(boxes_num,
|
95 |
+
nms_overlap_thresh,
|
96 |
+
boxes_dev,
|
97 |
+
mask_dev);
|
98 |
+
|
99 |
+
std::vector<unsigned long long> mask_host(boxes_num * col_blocks);
|
100 |
+
THCudaCheck(cudaMemcpy(&mask_host[0],
|
101 |
+
mask_dev,
|
102 |
+
sizeof(unsigned long long) * boxes_num * col_blocks,
|
103 |
+
cudaMemcpyDeviceToHost));
|
104 |
+
|
105 |
+
std::vector<unsigned long long> remv(col_blocks);
|
106 |
+
memset(&remv[0], 0, sizeof(unsigned long long) * col_blocks);
|
107 |
+
|
108 |
+
at::Tensor keep = at::empty({boxes_num}, boxes.options().dtype(at::kLong).device(at::kCPU));
|
109 |
+
int64_t* keep_out = keep.data<int64_t>();
|
110 |
+
|
111 |
+
int num_to_keep = 0;
|
112 |
+
for (int i = 0; i < boxes_num; i++) {
|
113 |
+
int nblock = i / threadsPerBlock;
|
114 |
+
int inblock = i % threadsPerBlock;
|
115 |
+
|
116 |
+
if (!(remv[nblock] & (1ULL << inblock))) {
|
117 |
+
keep_out[num_to_keep++] = i;
|
118 |
+
unsigned long long *p = &mask_host[0] + i * col_blocks;
|
119 |
+
for (int j = nblock; j < col_blocks; j++) {
|
120 |
+
remv[j] |= p[j];
|
121 |
+
}
|
122 |
+
}
|
123 |
+
}
|
124 |
+
|
125 |
+
THCudaFree(state, mask_dev);
|
126 |
+
// TODO improve this part
|
127 |
+
return std::get<0>(order_t.index({
|
128 |
+
keep.narrow(/*dim=*/0, /*start=*/0, /*length=*/num_to_keep).to(
|
129 |
+
order_t.device(), keep.scalar_type())
|
130 |
+
}).sort(0, false));
|
131 |
+
}
|
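Editor's sketch (not part of the diff) of the bitmask bookkeeping used by nms_kernel / nms_cuda above: each of the N score-sorted boxes owns ceil(N / 64) 64-bit words, one bit per later box whose IoU exceeds the threshold.

// Sketch only: size of the suppression mask allocated with THCudaMalloc above.
#include <cstddef>

inline size_t nms_mask_bytes(int boxes_num)
{
    const int threads_per_block = sizeof(unsigned long long) * 8;  // 64 bits per word
    const int col_blocks = (boxes_num + threads_per_block - 1) / threads_per_block;
    return static_cast<size_t>(boxes_num) * col_blocks * sizeof(unsigned long long);
}
// e.g. 20000 boxes -> 20000 * 313 words, roughly 50 MB of device memory for the mask.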
maskrcnn_benchmark/csrc/cuda/vision.h
ADDED
@@ -0,0 +1,121 @@
1 |
+
// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
|
2 |
+
#pragma once
|
3 |
+
#include <torch/extension.h>
|
4 |
+
|
5 |
+
|
6 |
+
at::Tensor SigmoidFocalLoss_forward_cuda(
|
7 |
+
const at::Tensor& logits,
|
8 |
+
const at::Tensor& targets,
|
9 |
+
const int num_classes,
|
10 |
+
const float gamma,
|
11 |
+
const float alpha);
|
12 |
+
|
13 |
+
at::Tensor SigmoidFocalLoss_backward_cuda(
|
14 |
+
const at::Tensor& logits,
|
15 |
+
const at::Tensor& targets,
|
16 |
+
const at::Tensor& d_losses,
|
17 |
+
const int num_classes,
|
18 |
+
const float gamma,
|
19 |
+
const float alpha);
|
20 |
+
|
21 |
+
at::Tensor ROIAlign_forward_cuda(const at::Tensor& input,
|
22 |
+
const at::Tensor& rois,
|
23 |
+
const float spatial_scale,
|
24 |
+
const int pooled_height,
|
25 |
+
const int pooled_width,
|
26 |
+
const int sampling_ratio);
|
27 |
+
|
28 |
+
at::Tensor ROIAlign_backward_cuda(const at::Tensor& grad,
|
29 |
+
const at::Tensor& rois,
|
30 |
+
const float spatial_scale,
|
31 |
+
const int pooled_height,
|
32 |
+
const int pooled_width,
|
33 |
+
const int batch_size,
|
34 |
+
const int channels,
|
35 |
+
const int height,
|
36 |
+
const int width,
|
37 |
+
const int sampling_ratio);
|
38 |
+
|
39 |
+
|
40 |
+
std::tuple<at::Tensor, at::Tensor> ROIPool_forward_cuda(const at::Tensor& input,
|
41 |
+
const at::Tensor& rois,
|
42 |
+
const float spatial_scale,
|
43 |
+
const int pooled_height,
|
44 |
+
const int pooled_width);
|
45 |
+
|
46 |
+
at::Tensor ROIPool_backward_cuda(const at::Tensor& grad,
|
47 |
+
const at::Tensor& input,
|
48 |
+
const at::Tensor& rois,
|
49 |
+
const at::Tensor& argmax,
|
50 |
+
const float spatial_scale,
|
51 |
+
const int pooled_height,
|
52 |
+
const int pooled_width,
|
53 |
+
const int batch_size,
|
54 |
+
const int channels,
|
55 |
+
const int height,
|
56 |
+
const int width);
|
57 |
+
|
58 |
+
at::Tensor nms_cuda(const at::Tensor boxes, float nms_overlap_thresh);
|
59 |
+
|
60 |
+
|
61 |
+
at::Tensor compute_flow_cuda(const at::Tensor& boxes,
|
62 |
+
const int height,
|
63 |
+
const int width);
|
64 |
+
|
65 |
+
at::Tensor
|
66 |
+
dcn_v2_cuda_forward(const at::Tensor &input,
|
67 |
+
const at::Tensor &weight,
|
68 |
+
const at::Tensor &bias,
|
69 |
+
const at::Tensor &offset,
|
70 |
+
const at::Tensor &mask,
|
71 |
+
const int kernel_h,
|
72 |
+
const int kernel_w,
|
73 |
+
const int stride_h,
|
74 |
+
const int stride_w,
|
75 |
+
const int pad_h,
|
76 |
+
const int pad_w,
|
77 |
+
const int dilation_h,
|
78 |
+
const int dilation_w,
|
79 |
+
const int deformable_group);
|
80 |
+
|
81 |
+
std::vector<at::Tensor>
|
82 |
+
dcn_v2_cuda_backward(const at::Tensor &input,
|
83 |
+
const at::Tensor &weight,
|
84 |
+
const at::Tensor &bias,
|
85 |
+
const at::Tensor &offset,
|
86 |
+
const at::Tensor &mask,
|
87 |
+
const at::Tensor &grad_output,
|
88 |
+
int kernel_h, int kernel_w,
|
89 |
+
int stride_h, int stride_w,
|
90 |
+
int pad_h, int pad_w,
|
91 |
+
int dilation_h, int dilation_w,
|
92 |
+
int deformable_group);
|
93 |
+
|
94 |
+
|
95 |
+
std::tuple<at::Tensor, at::Tensor>
|
96 |
+
dcn_v2_psroi_pooling_cuda_forward(const at::Tensor &input,
|
97 |
+
const at::Tensor &bbox,
|
98 |
+
const at::Tensor &trans,
|
99 |
+
const int no_trans,
|
100 |
+
const float spatial_scale,
|
101 |
+
const int output_dim,
|
102 |
+
const int group_size,
|
103 |
+
const int pooled_size,
|
104 |
+
const int part_size,
|
105 |
+
const int sample_per_part,
|
106 |
+
const float trans_std);
|
107 |
+
|
108 |
+
std::tuple<at::Tensor, at::Tensor>
|
109 |
+
dcn_v2_psroi_pooling_cuda_backward(const at::Tensor &out_grad,
|
110 |
+
const at::Tensor &input,
|
111 |
+
const at::Tensor &bbox,
|
112 |
+
const at::Tensor &trans,
|
113 |
+
const at::Tensor &top_count,
|
114 |
+
const int no_trans,
|
115 |
+
const float spatial_scale,
|
116 |
+
const int output_dim,
|
117 |
+
const int group_size,
|
118 |
+
const int pooled_size,
|
119 |
+
const int part_size,
|
120 |
+
const int sample_per_part,
|
121 |
+
const float trans_std);
|
maskrcnn_benchmark/csrc/dcn_v2.h
ADDED
@@ -0,0 +1,145 @@
1 |
+
#pragma once
|
2 |
+
|
3 |
+
#include "cpu/vision.h"
|
4 |
+
|
5 |
+
#ifdef WITH_CUDA
|
6 |
+
#include "cuda/vision.h"
|
7 |
+
#endif
|
8 |
+
|
9 |
+
at::Tensor
|
10 |
+
dcn_v2_forward(const at::Tensor &input,
|
11 |
+
const at::Tensor &weight,
|
12 |
+
const at::Tensor &bias,
|
13 |
+
const at::Tensor &offset,
|
14 |
+
const at::Tensor &mask,
|
15 |
+
const int kernel_h,
|
16 |
+
const int kernel_w,
|
17 |
+
const int stride_h,
|
18 |
+
const int stride_w,
|
19 |
+
const int pad_h,
|
20 |
+
const int pad_w,
|
21 |
+
const int dilation_h,
|
22 |
+
const int dilation_w,
|
23 |
+
const int deformable_group)
|
24 |
+
{
|
25 |
+
if (input.type().is_cuda())
|
26 |
+
{
|
27 |
+
#ifdef WITH_CUDA
|
28 |
+
return dcn_v2_cuda_forward(input, weight, bias, offset, mask,
|
29 |
+
kernel_h, kernel_w,
|
30 |
+
stride_h, stride_w,
|
31 |
+
pad_h, pad_w,
|
32 |
+
dilation_h, dilation_w,
|
33 |
+
deformable_group);
|
34 |
+
#else
|
35 |
+
AT_ERROR("Not compiled with GPU support");
|
36 |
+
#endif
|
37 |
+
}
|
38 |
+
AT_ERROR("Not implemented on the CPU");
|
39 |
+
}
|
40 |
+
|
41 |
+
std::vector<at::Tensor>
|
42 |
+
dcn_v2_backward(const at::Tensor &input,
|
43 |
+
const at::Tensor &weight,
|
44 |
+
const at::Tensor &bias,
|
45 |
+
const at::Tensor &offset,
|
46 |
+
const at::Tensor &mask,
|
47 |
+
const at::Tensor &grad_output,
|
48 |
+
int kernel_h, int kernel_w,
|
49 |
+
int stride_h, int stride_w,
|
50 |
+
int pad_h, int pad_w,
|
51 |
+
int dilation_h, int dilation_w,
|
52 |
+
int deformable_group)
|
53 |
+
{
|
54 |
+
if (input.type().is_cuda())
|
55 |
+
{
|
56 |
+
#ifdef WITH_CUDA
|
57 |
+
return dcn_v2_cuda_backward(input,
|
58 |
+
weight,
|
59 |
+
bias,
|
60 |
+
offset,
|
61 |
+
mask,
|
62 |
+
grad_output,
|
63 |
+
kernel_h, kernel_w,
|
64 |
+
stride_h, stride_w,
|
65 |
+
pad_h, pad_w,
|
66 |
+
dilation_h, dilation_w,
|
67 |
+
deformable_group);
|
68 |
+
#else
|
69 |
+
AT_ERROR("Not compiled with GPU support");
|
70 |
+
#endif
|
71 |
+
}
|
72 |
+
AT_ERROR("Not implemented on the CPU");
|
73 |
+
}
|
74 |
+
|
75 |
+
std::tuple<at::Tensor, at::Tensor>
|
76 |
+
dcn_v2_psroi_pooling_forward(const at::Tensor &input,
|
77 |
+
const at::Tensor &bbox,
|
78 |
+
const at::Tensor &trans,
|
79 |
+
const int no_trans,
|
80 |
+
const float spatial_scale,
|
81 |
+
const int output_dim,
|
82 |
+
const int group_size,
|
83 |
+
const int pooled_size,
|
84 |
+
const int part_size,
|
85 |
+
const int sample_per_part,
|
86 |
+
const float trans_std)
|
87 |
+
{
|
88 |
+
if (input.type().is_cuda())
|
89 |
+
{
|
90 |
+
#ifdef WITH_CUDA
|
91 |
+
return dcn_v2_psroi_pooling_cuda_forward(input,
|
92 |
+
bbox,
|
93 |
+
trans,
|
94 |
+
no_trans,
|
95 |
+
spatial_scale,
|
96 |
+
output_dim,
|
97 |
+
group_size,
|
98 |
+
pooled_size,
|
99 |
+
part_size,
|
100 |
+
sample_per_part,
|
101 |
+
trans_std);
|
102 |
+
#else
|
103 |
+
AT_ERROR("Not compiled with GPU support");
|
104 |
+
#endif
|
105 |
+
}
|
106 |
+
AT_ERROR("Not implemented on the CPU");
|
107 |
+
}
|
108 |
+
|
109 |
+
std::tuple<at::Tensor, at::Tensor>
|
110 |
+
dcn_v2_psroi_pooling_backward(const at::Tensor &out_grad,
|
111 |
+
const at::Tensor &input,
|
112 |
+
const at::Tensor &bbox,
|
113 |
+
const at::Tensor &trans,
|
114 |
+
const at::Tensor &top_count,
|
115 |
+
const int no_trans,
|
116 |
+
const float spatial_scale,
|
117 |
+
const int output_dim,
|
118 |
+
const int group_size,
|
119 |
+
const int pooled_size,
|
120 |
+
const int part_size,
|
121 |
+
const int sample_per_part,
|
122 |
+
const float trans_std)
|
123 |
+
{
|
124 |
+
if (input.type().is_cuda())
|
125 |
+
{
|
126 |
+
#ifdef WITH_CUDA
|
127 |
+
return dcn_v2_psroi_pooling_cuda_backward(out_grad,
|
128 |
+
input,
|
129 |
+
bbox,
|
130 |
+
trans,
|
131 |
+
top_count,
|
132 |
+
no_trans,
|
133 |
+
spatial_scale,
|
134 |
+
output_dim,
|
135 |
+
group_size,
|
136 |
+
pooled_size,
|
137 |
+
part_size,
|
138 |
+
sample_per_part,
|
139 |
+
trans_std);
|
140 |
+
#else
|
141 |
+
AT_ERROR("Not compiled with GPU support");
|
142 |
+
#endif
|
143 |
+
}
|
144 |
+
AT_ERROR("Not implemented on the CPU");
|
145 |
+
}
|
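For orientation: the offset and mask tensors consumed by `dcn_v2_forward` follow the usual DCNv2 layout, with two offset values (dy, dx) per kernel tap per deformable group and one modulation scalar per tap per group. The helper below is a minimal illustrative sketch of the expected channel counts and output spatial size under standard convolution arithmetic; it is not part of the repository.

```python
# Illustrative helper only (not part of this repo): expected tensor shapes
# for a modulated deformable convolution in the standard DCNv2 layout.
def dcn_v2_expected_shapes(h, w, kernel_h, kernel_w, stride_h, stride_w,
                           pad_h, pad_w, dilation_h, dilation_w,
                           deformable_group):
    # offset: 2 coordinates (dy, dx) per kernel tap per deformable group
    offset_channels = 2 * deformable_group * kernel_h * kernel_w
    # mask: 1 modulation scalar per kernel tap per deformable group
    mask_channels = deformable_group * kernel_h * kernel_w
    # standard convolution output size
    out_h = (h + 2 * pad_h - (dilation_h * (kernel_h - 1) + 1)) // stride_h + 1
    out_w = (w + 2 * pad_w - (dilation_w * (kernel_w - 1) + 1)) // stride_w + 1
    return offset_channels, mask_channels, (out_h, out_w)

# e.g. a 3x3 deformable conv, one group, 64x64 input, stride 1, pad 1:
# offset has 18 channels, mask has 9, and the output stays 64x64.
print(dcn_v2_expected_shapes(64, 64, 3, 3, 1, 1, 1, 1, 1, 1, 1))
```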
maskrcnn_benchmark/csrc/nms.h
ADDED
@@ -0,0 +1,28 @@
// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
#pragma once
#include "cpu/vision.h"

#ifdef WITH_CUDA
#include "cuda/vision.h"
#endif


at::Tensor nms(const at::Tensor& dets,
               const at::Tensor& scores,
               const float threshold) {

  if (dets.type().is_cuda()) {
#ifdef WITH_CUDA
    // TODO raise error if not compiled with CUDA
    if (dets.numel() == 0)
      return at::empty({0}, dets.options().dtype(at::kLong).device(at::kCPU));
    auto b = at::cat({dets, scores.unsqueeze(1)}, 1);
    return nms_cuda(b, threshold);
#else
    AT_ERROR("Not compiled with GPU support");
#endif
  }

  at::Tensor result = nms_cpu(dets, scores, threshold);
  return result;
}
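The dispatcher above takes boxes in (x1, y1, x2, y2) form plus a matching score vector and returns the indices of the boxes that survive greedy IoU suppression. The following pure-PyTorch sketch is only a reference for reading that contract; it is not the kernel actually used and is not guaranteed to be bit-exact with `nms_cpu`/`nms_cuda`.

```python
import torch

def nms_reference(dets, scores, threshold):
    """Greedy NMS over (x1, y1, x2, y2) boxes; returns indices to keep."""
    x1, y1, x2, y2 = dets.unbind(dim=1)
    areas = (x2 - x1 + 1) * (y2 - y1 + 1)
    order = scores.argsort(descending=True)
    keep = []
    while order.numel() > 0:
        i = order[0].item()          # highest-scoring remaining box
        keep.append(i)
        if order.numel() == 1:
            break
        # intersection of box i with every remaining box
        xx1 = torch.max(x1[order[1:]], x1[i])
        yy1 = torch.max(y1[order[1:]], y1[i])
        xx2 = torch.min(x2[order[1:]], x2[i])
        yy2 = torch.min(y2[order[1:]], y2[i])
        w = (xx2 - xx1 + 1).clamp(min=0)
        h = (yy2 - yy1 + 1).clamp(min=0)
        inter = w * h
        iou = inter / (areas[i] + areas[order[1:]] - inter)
        # drop boxes that overlap box i too much
        order = order[1:][iou <= threshold]
    return torch.as_tensor(keep, dtype=torch.long)
```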
maskrcnn_benchmark/csrc/vision.cpp
ADDED
@@ -0,0 +1,21 @@
// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
#include "nms.h"
#include "ROIAlign.h"
#include "ROIPool.h"
#include "SigmoidFocalLoss.h"
#include "dcn_v2.h"


PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
  m.def("nms", &nms, "non-maximum suppression");
  m.def("roi_align_forward", &ROIAlign_forward, "ROIAlign_forward");
  m.def("roi_align_backward", &ROIAlign_backward, "ROIAlign_backward");
  m.def("roi_pool_forward", &ROIPool_forward, "ROIPool_forward");
  m.def("roi_pool_backward", &ROIPool_backward, "ROIPool_backward");
  m.def("sigmoid_focalloss_forward", &SigmoidFocalLoss_forward, "SigmoidFocalLoss_forward");
  m.def("sigmoid_focalloss_backward", &SigmoidFocalLoss_backward, "SigmoidFocalLoss_backward");
  m.def("dcn_v2_forward", &dcn_v2_forward, "dcn_v2_forward");
  m.def("dcn_v2_backward", &dcn_v2_backward, "dcn_v2_backward");
  m.def("dcn_v2_psroi_pooling_forward", &dcn_v2_psroi_pooling_forward, "dcn_v2_psroi_pooling_forward");
  m.def("dcn_v2_psroi_pooling_backward", &dcn_v2_psroi_pooling_backward, "dcn_v2_psroi_pooling_backward");
}
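`PYBIND11_MODULE` exposes these operators to Python under whatever name the build gives the extension. In upstream maskrcnn-benchmark the extension is compiled as `maskrcnn_benchmark._C`, and that name is assumed in the hedged usage sketch below; the boxes and expected output are made up for illustration.

```python
import torch
# Assumption: the extension was built as maskrcnn_benchmark._C
# (the module name used by upstream maskrcnn-benchmark's setup.py).
from maskrcnn_benchmark import _C

boxes = torch.tensor([[ 0.,  0., 10., 10.],
                      [ 1.,  1., 11., 11.],
                      [50., 50., 60., 60.]])
scores = torch.tensor([0.9, 0.8, 0.7])

# Dispatches to nms_cpu or nms_cuda depending on the tensor device;
# the two heavily overlapping boxes should collapse to one.
keep = _C.nms(boxes, scores, 0.5)
print(keep)  # e.g. tensor([0, 2])
```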
maskrcnn_benchmark/data/README.md
ADDED
@@ -0,0 +1,90 @@
# Setting Up Datasets
This file describes how to perform training on other datasets.

Currently, only the Pascal VOC dataset can be loaded from its original format and evaluated with Pascal-style results.

We expect the annotations from other datasets to be converted to COCO json format, and
the output will be in COCO style (i.e. AP, AP50, AP75, APs, APm, APl for bbox and segm).

## Creating Symlinks for PASCAL VOC

We assume that your symlinked `datasets/voc/VOC<year>` directory has the following structure:

```
VOC<year>
|_ JPEGImages
|  |_ <im-1-name>.jpg
|  |_ ...
|  |_ <im-N-name>.jpg
|_ Annotations
|  |_ pascal_train<year>.json (optional)
|  |_ pascal_val<year>.json (optional)
|  |_ pascal_test<year>.json (optional)
|  |_ <im-1-name>.xml
|  |_ ...
|  |_ <im-N-name>.xml
|_ VOCdevkit<year>
```

Create symlinks for `voc/VOC<year>`:

```
cd ~/github/maskrcnn-benchmark
mkdir -p datasets/voc/VOC<year>
ln -s /path/to/VOC<year> datasets/voc/VOC<year>
```
Example configuration files for PASCAL VOC can be found [here](https://github.com/facebookresearch/maskrcnn-benchmark/blob/master/configs/pascal_voc/).

### PASCAL VOC Annotations in COCO Format
To output COCO-style evaluation results, PASCAL VOC annotations in COCO json format are required; they can be downloaded from [here](https://storage.googleapis.com/coco-dataset/external/PASCAL_VOC.zip)
via http://cocodataset.org/#external.

## Creating Symlinks for Cityscapes

We assume that your symlinked `datasets/cityscapes` directory has the following structure:

```
cityscapes
|_ images
|  |_ <im-1-name>.jpg
|  |_ ...
|  |_ <im-N-name>.jpg
|_ annotations
|  |_ instanceonly_gtFile_train.json
|  |_ ...
|_ raw
   |_ gtFine
      |_ ...
   |_ README.md
```

Create symlinks for `cityscapes`:

```
cd ~/github/maskrcnn-benchmark
mkdir -p datasets/cityscapes
ln -s /path/to/cityscapes datasets/cityscapes
```

### Steps to Convert Cityscapes Annotations to COCO Format
1. Download gtFine_trainvaltest.zip from https://www.cityscapes-dataset.com/downloads/ (login required)
2. Extract it to /path/to/gtFine_trainvaltest
```
cityscapes
|_ gtFine_trainvaltest.zip
|_ gtFine_trainvaltest
   |_ gtFine
```
3. Run the commands below to convert the annotations

```
cd ~/github
git clone https://github.com/mcordts/cityscapesScripts.git
cd cityscapesScripts
cp ~/github/maskrcnn-benchmark/tools/cityscapes/instances2dict_with_polygons.py cityscapesscripts/evaluation
python setup.py install
cd ~/github/maskrcnn-benchmark
python tools/cityscapes/convert_cityscapes_to_coco.py --datadir /path/to/cityscapes --outdir /path/to/cityscapes/annotations
```

Example configuration files for Cityscapes can be found [here](https://github.com/facebookresearch/maskrcnn-benchmark/blob/master/configs/cityscapes/).
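As a quick sanity check after creating the symlinks, a short illustrative script can confirm the expected layout before training. The paths below are examples only; substitute your own year and dataset root.

```python
from pathlib import Path

# Illustrative check only: adjust the year and root to your setup.
voc_root = Path("datasets/voc/VOC2007")
for sub in ("JPEGImages", "Annotations"):
    path = voc_root / sub
    print(f"{path}: {'ok' if path.is_dir() else 'MISSING'}")
```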
maskrcnn_benchmark/data/__init__.py
ADDED
@@ -0,0 +1,2 @@
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
from .build import make_data_loader
maskrcnn_benchmark/data/__pycache__/__init__.cpython-37.pyc
ADDED
Binary file (196 Bytes)
maskrcnn_benchmark/data/__pycache__/build.cpython-37.pyc
ADDED
Binary file (4.81 kB)
maskrcnn_benchmark/data/__pycache__/collate_batch.cpython-37.pyc
ADDED
Binary file (973 Bytes)
maskrcnn_benchmark/data/build.py
ADDED
@@ -0,0 +1,176 @@
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
import bisect
import copy
import logging

import torch.utils.data
from maskrcnn_benchmark.utils.comm import get_world_size
from maskrcnn_benchmark.utils.imports import import_file

from . import datasets as D
from . import samplers

from .collate_batch import BatchCollator
from .transforms import build_transforms


def build_dataset(dataset_list, transforms, dataset_catalog, is_train=True):
    """
    Arguments:
        dataset_list (list[str]): Contains the names of the datasets, i.e.,
            coco_2014_train, coco_2014_val, etc
        transforms (callable): transforms to apply to each (image, target) sample
        dataset_catalog (DatasetCatalog): contains the information on how to
            construct a dataset.
        is_train (bool): whether to set up the dataset for training or testing
    """
    if not isinstance(dataset_list, (list, tuple)):
        raise RuntimeError(
            "dataset_list should be a list of strings, got {}".format(dataset_list)
        )
    datasets = []
    for dataset_name in dataset_list:
        data = dataset_catalog.get(dataset_name)
        factory = getattr(D, data["factory"])
        args = data["args"]
        # for COCODataset and WordDataset, we want to remove images without
        # annotations during training
        if data["factory"] in ["COCODataset",
                               "WordDataset"]:
            args["remove_images_without_annotations"] = is_train
        if data["factory"] == "PascalVOCDataset":
            args["use_difficult"] = not is_train
        args["transforms"] = transforms
        # make dataset from factory
        dataset = factory(**args)
        datasets.append(dataset)

    # for testing, return a list of datasets
    if not is_train:
        return datasets

    # for training, concatenate all datasets into a single one
    dataset = datasets[0]
    if len(datasets) > 1:
        dataset = D.ConcatDataset(datasets)

    return [dataset]


def make_data_sampler(dataset, shuffle, distributed):
    if distributed:
        return samplers.DistributedSampler(dataset, shuffle=shuffle)
    if shuffle:
        sampler = torch.utils.data.sampler.RandomSampler(dataset)
    else:
        sampler = torch.utils.data.sampler.SequentialSampler(dataset)
    return sampler


def _quantize(x, bins):
    bins = copy.copy(bins)
    bins = sorted(bins)
    quantized = list(map(lambda y: bisect.bisect_right(bins, y), x))
    return quantized


def _compute_aspect_ratios(dataset):
    aspect_ratios = []
    for i in range(len(dataset)):
        img_info = dataset.get_img_info(i)
        aspect_ratio = float(img_info["height"]) / float(img_info["width"])
        aspect_ratios.append(aspect_ratio)
    return aspect_ratios


def make_batch_data_sampler(
    dataset, sampler, aspect_grouping, images_per_batch, num_iters=None, start_iter=0
):
    if aspect_grouping:
        if not isinstance(aspect_grouping, (list, tuple)):
            aspect_grouping = [aspect_grouping]
        aspect_ratios = _compute_aspect_ratios(dataset)
        group_ids = _quantize(aspect_ratios, aspect_grouping)
        batch_sampler = samplers.GroupedBatchSampler(
            sampler, group_ids, images_per_batch, drop_uneven=False
        )
    else:
        batch_sampler = torch.utils.data.sampler.BatchSampler(
            sampler, images_per_batch, drop_last=False
        )
    if num_iters is not None:
        batch_sampler = samplers.IterationBasedBatchSampler(
            batch_sampler, num_iters, start_iter
        )
    return batch_sampler


def make_data_loader(cfg, is_train=True, is_distributed=False, start_iter=0):
    num_gpus = get_world_size()
    if is_train:
        images_per_batch = cfg.SOLVER.IMS_PER_BATCH
        assert images_per_batch % num_gpus == 0, (
            "SOLVER.IMS_PER_BATCH ({}) must be divisible by the number "
            "of GPUs ({}) used.".format(images_per_batch, num_gpus)
        )
        images_per_gpu = images_per_batch // num_gpus
        shuffle = True
        num_iters = cfg.SOLVER.MAX_ITER
    else:
        images_per_batch = cfg.TEST.IMS_PER_BATCH
        assert images_per_batch % num_gpus == 0, (
            "TEST.IMS_PER_BATCH ({}) must be divisible by the number "
            "of GPUs ({}) used.".format(images_per_batch, num_gpus)
        )
        images_per_gpu = images_per_batch // num_gpus
        shuffle = False if not is_distributed else True
        num_iters = None
        start_iter = 0

    if images_per_gpu > 1:
        logger = logging.getLogger(__name__)
        logger.warning(
            "When using more than one image per GPU you may encounter "
            "an out-of-memory (OOM) error if your GPU does not have "
            "sufficient memory. If this happens, you can reduce "
            "SOLVER.IMS_PER_BATCH (for training) or "
            "TEST.IMS_PER_BATCH (for inference). For training, you must "
            "also adjust the learning rate and schedule length according "
            "to the linear scaling rule. See for example: "
            "https://github.com/facebookresearch/Detectron/blob/master/configs/getting_started/tutorial_1gpu_e2e_faster_rcnn_R-50-FPN.yaml#L14"
        )

    # group images which have similar aspect ratio. In this case, we only
    # group in two cases: those with width / height > 1, and the other way around,
    # but the code supports more general grouping strategies
    aspect_grouping = [1] if cfg.DATALOADER.ASPECT_RATIO_GROUPING else []

    paths_catalog = import_file(
        "maskrcnn_benchmark.config.paths_catalog", cfg.PATHS_CATALOG, True
    )
    DatasetCatalog = paths_catalog.DatasetCatalog
    dataset_list = cfg.DATASETS.TRAIN if is_train else cfg.DATASETS.TEST

    transforms = build_transforms(cfg, is_train)
    datasets = build_dataset(dataset_list, transforms, DatasetCatalog, is_train)

    data_loaders = []
    for dataset in datasets:
        sampler = make_data_sampler(dataset, shuffle, is_distributed)
        batch_sampler = make_batch_data_sampler(
            dataset, sampler, aspect_grouping, images_per_gpu, num_iters, start_iter
        )
        collator = BatchCollator(cfg.DATALOADER.SIZE_DIVISIBILITY)
        num_workers = cfg.DATALOADER.NUM_WORKERS
        data_loader = torch.utils.data.DataLoader(
            dataset,
            num_workers=num_workers,
            batch_sampler=batch_sampler,
            collate_fn=collator,
        )
        data_loaders.append(data_loader)
    if is_train:
        # during training, a single (possibly concatenated) data_loader is returned
        assert len(data_loaders) == 1
        return data_loaders[0]
    return data_loaders
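In a typical training script, make_data_loader is driven entirely by the config. The sketch below shows the usual call pattern; the global `cfg` object and `merge_from_file`/`freeze` calls follow the upstream maskrcnn-benchmark (yacs) convention and are an assumption here, while the config path is one of the files added in this commit.

```python
# Hedged usage sketch; assumes the upstream-style global yacs cfg object.
from maskrcnn_benchmark.config import cfg
from maskrcnn_benchmark.data import make_data_loader

cfg.merge_from_file("configs/ctw/r50_baseline.yaml")
cfg.freeze()

# During training a single (possibly concatenated) loader is returned.
data_loader = make_data_loader(cfg, is_train=True, is_distributed=False, start_iter=0)
for iteration, (images, targets, image_ids) in enumerate(data_loader):
    ...  # forward/backward pass
```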
maskrcnn_benchmark/data/collate_batch.py
ADDED
@@ -0,0 +1,20 @@
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
from maskrcnn_benchmark.structures.image_list import to_image_list


class BatchCollator(object):
    """
    From a list of samples from the dataset,
    returns the batched images and targets.
    This should be passed to the DataLoader
    """

    def __init__(self, size_divisible=0):
        self.size_divisible = size_divisible

    def __call__(self, batch):
        transposed_batch = list(zip(*batch))
        images = to_image_list(transposed_batch[0], self.size_divisible)
        targets = transposed_batch[1]
        img_ids = transposed_batch[2]
        return images, targets, img_ids
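Each dataset item is an (image, target, index) triple, so the collator effectively transposes the batch and pads the images into one ImageList. A small illustrative sketch of that behaviour follows; the stand-in tensors and string targets are placeholders, not real samples.

```python
import torch
# Illustrative only: stand-in tensors and string targets instead of real
# images and BoxList objects.
from maskrcnn_benchmark.data.collate_batch import BatchCollator

batch = [
    (torch.zeros(3, 480, 640), "target_0", 0),
    (torch.zeros(3, 512, 768), "target_1", 1),
]
collator = BatchCollator(size_divisible=32)
images, targets, img_ids = collator(batch)

# `images` is an ImageList padded so H and W are multiples of 32;
# `targets` == ("target_0", "target_1") and `img_ids` == (0, 1).
print(images.tensors.shape, targets, img_ids)
```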
maskrcnn_benchmark/data/datasets/__init__.py
ADDED
@@ -0,0 +1,8 @@
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
from .coco import COCODataset
from .voc import PascalVOCDataset
from .concat_dataset import ConcatDataset
from .word_dataset import WordDataset

__all__ = ["COCODataset", "ConcatDataset", "PascalVOCDataset",
           "WordDataset"]
maskrcnn_benchmark/data/datasets/__pycache__/__init__.cpython-37.pyc
ADDED
Binary file (388 Bytes)
maskrcnn_benchmark/data/datasets/__pycache__/char_dataset.cpython-37.pyc
ADDED
Binary file (7.64 kB)
maskrcnn_benchmark/data/datasets/__pycache__/coco.cpython-37.pyc
ADDED
Binary file (4.43 kB)
maskrcnn_benchmark/data/datasets/__pycache__/concat_dataset.cpython-37.pyc
ADDED
Binary file (985 Bytes)
maskrcnn_benchmark/data/datasets/__pycache__/voc.cpython-37.pyc
ADDED
Binary file (4.13 kB)
maskrcnn_benchmark/data/datasets/__pycache__/word_dataset.cpython-37.pyc
ADDED
Binary file (4.56 kB)
maskrcnn_benchmark/data/datasets/coco.py
ADDED
@@ -0,0 +1,101 @@
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
import torch
import torchvision

from maskrcnn_benchmark.structures.bounding_box import BoxList
from maskrcnn_benchmark.structures.segmentation_mask import SegmentationMask
from maskrcnn_benchmark.structures.keypoint import PersonKeypoints


min_keypoints_per_image = 10


def _count_visible_keypoints(anno):
    return sum(sum(1 for v in ann["keypoints"][2::3] if v > 0) for ann in anno)


def _has_only_empty_bbox(anno):
    return all(any(o <= 1 for o in obj["bbox"][2:]) for obj in anno)


def has_valid_annotation(anno):
    # if it's empty, there is no annotation
    if len(anno) == 0:
        return False
    # if all boxes have close to zero area, there is no annotation
    if _has_only_empty_bbox(anno):
        return False
    # keypoint tasks have a slightly different criterion for considering
    # an annotation valid
    if "keypoints" not in anno[0]:
        return True
    # for keypoint detection tasks, only consider images valid if they
    # contain at least min_keypoints_per_image visible keypoints
    if _count_visible_keypoints(anno) >= min_keypoints_per_image:
        return True
    return False


class COCODataset(torchvision.datasets.coco.CocoDetection):
    def __init__(
        self, ann_file, root, remove_images_without_annotations, transforms=None
    ):
        super(COCODataset, self).__init__(root, ann_file)
        # sort indices for reproducible results
        self.ids = sorted(self.ids)

        # filter images without detection annotations
        if remove_images_without_annotations:
            ids = []
            for img_id in self.ids:
                ann_ids = self.coco.getAnnIds(imgIds=img_id, iscrowd=None)
                anno = self.coco.loadAnns(ann_ids)
                if has_valid_annotation(anno):
                    ids.append(img_id)
            self.ids = ids

        self.json_category_id_to_contiguous_id = {
            v: i + 1 for i, v in enumerate(self.coco.getCatIds())
        }
        self.contiguous_category_id_to_json_id = {
            v: k for k, v in self.json_category_id_to_contiguous_id.items()
        }
        self.id_to_img_map = {k: v for k, v in enumerate(self.ids)}
        self.transforms = transforms

    def __getitem__(self, idx):
        img, anno = super(COCODataset, self).__getitem__(idx)

        # filter crowd annotations
        # TODO might be better to add an extra field
        anno = [obj for obj in anno if obj["iscrowd"] == 0]

        boxes = [obj["bbox"] for obj in anno]
        boxes = torch.as_tensor(boxes).reshape(-1, 4)  # guard against no boxes
        target = BoxList(boxes, img.size, mode="xywh").convert("xyxy")

        classes = [obj["category_id"] for obj in anno]
        classes = [self.json_category_id_to_contiguous_id[c] for c in classes]
        classes = torch.tensor(classes)
        target.add_field("labels", classes)

        masks = [obj["segmentation"] for obj in anno]
        masks = SegmentationMask(masks, img.size, mode='poly')
        target.add_field("masks", masks)

        if anno and "keypoints" in anno[0]:
            keypoints = [obj["keypoints"] for obj in anno]
            keypoints = PersonKeypoints(keypoints, img.size)
            target.add_field("keypoints", keypoints)

        target = target.clip_to_image(remove_empty=True)

        if self.transforms is not None:
            img, target = self.transforms(img, target)

        return img, target, idx

    def get_img_info(self, index):
        img_id = self.id_to_img_map[index]
        img_data = self.coco.imgs[img_id]
        return img_data
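The `json_category_id_to_contiguous_id` map exists because COCO category ids are sparse (1..90 with gaps) while the model expects contiguous labels starting at 1. A toy illustration of the same remapping logic, with made-up category ids standing in for `getCatIds()`:

```python
# Toy illustration of the category-id remapping done in COCODataset.__init__;
# the id list below is hypothetical.
coco_cat_ids = [1, 2, 3, 5, 9, 17]
json_to_contiguous = {v: i + 1 for i, v in enumerate(coco_cat_ids)}
contiguous_to_json = {v: k for k, v in json_to_contiguous.items()}

print(json_to_contiguous)     # {1: 1, 2: 2, 3: 3, 5: 4, 9: 5, 17: 6}
print(contiguous_to_json[4])  # 5 (the original COCO id)
```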
maskrcnn_benchmark/data/datasets/concat_dataset.py
ADDED
@@ -0,0 +1,23 @@
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
import bisect

from torch.utils.data.dataset import ConcatDataset as _ConcatDataset


class ConcatDataset(_ConcatDataset):
    """
    Same as torch.utils.data.dataset.ConcatDataset, but exposes an extra
    method for querying the sizes of the images
    """

    def get_idxs(self, idx):
        dataset_idx = bisect.bisect_right(self.cumulative_sizes, idx)
        if dataset_idx == 0:
            sample_idx = idx
        else:
            sample_idx = idx - self.cumulative_sizes[dataset_idx - 1]
        return dataset_idx, sample_idx

    def get_img_info(self, idx):
        dataset_idx, sample_idx = self.get_idxs(idx)
        return self.datasets[dataset_idx].get_img_info(sample_idx)
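The bisect over `cumulative_sizes` maps a global index back to a (dataset, local index) pair. A small worked example of the same arithmetic, assuming two concatenated datasets of lengths 3 and 5 (the sizes are hypothetical):

```python
import bisect

# Hypothetical cumulative sizes for datasets of length 3 and 5.
cumulative_sizes = [3, 8]

def get_idxs(idx):
    dataset_idx = bisect.bisect_right(cumulative_sizes, idx)
    sample_idx = idx if dataset_idx == 0 else idx - cumulative_sizes[dataset_idx - 1]
    return dataset_idx, sample_idx

print(get_idxs(2))  # (0, 2) -> third item of the first dataset
print(get_idxs(3))  # (1, 0) -> first item of the second dataset
print(get_idxs(7))  # (1, 4) -> last item of the second dataset
```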