diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000000000000000000000000000000000000..6d4eb2c8a1abeaba24a7fa2180cf5dd6db196906 --- /dev/null +++ b/LICENSE @@ -0,0 +1,25 @@ +FCOS for non-commercial purposes + +Copyright (c) 2019 the authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + +* Redistributions of source code must retain the above copyright notice, this + list of conditions and the following disclaimer. + +* Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. diff --git a/configs/ctw/r50_baseline.yaml b/configs/ctw/r50_baseline.yaml new file mode 100644 index 0000000000000000000000000000000000000000..29c27d35c825154636745bbe5a85f581ffe0dfd8 --- /dev/null +++ b/configs/ctw/r50_baseline.yaml @@ -0,0 +1,70 @@ +OUTPUT_DIR: "./output/ctw" +MODEL: + META_ARCHITECTURE: "GeneralizedRCNN" + WEIGHT: "catalog://ImageNetPretrained/MSRA/R-50" + BACKBONE: + CONV_BODY: "R-50-FPN" + RESNETS: + BACKBONE_OUT_CHANNELS: 256 + RPN: + USE_FPN: True + ANCHOR_STRIDE: (4, 8, 16, 32, 64) + ASPECT_RATIOS: (0.25, 0.5, 1.0, 2.0, 4.0) + ROI_HEADS: + USE_FPN: True + SCORE_THRESH: 0.85 + NMS: 0.3 + ROI_BOX_HEAD: + DEFORMABLE_POOLING: False + POOLER_RESOLUTION: 7 + POOLER_SCALES: (0.25, 0.125, 0.0625, 0.03125) + POOLER_SAMPLING_RATIO: 2 + FEATURE_EXTRACTOR: "FPN2MLPFeatureExtractor" + PREDICTOR: "FPNPredictor" + NUM_CLASSES: 2 + CLASS_WEIGHT: 1.0 + ## Boundary + BOUNDARY_ON: True + ROI_BOUNDARY_HEAD: + DEFORMABLE_POOLING: True + FEATURE_EXTRACTOR: "BoundaryRCNNFPNFeatureExtractor" + POOLER_RESOLUTION: 14 + POOLER_SCALES: (0.25, 0.125, 0.0625, 0.03125) + POOLER_SAMPLING_RATIO: 2 + PREDICTOR: "BoundaryRCNNC4Predictor" + RESOLUTION: 48 + SHARE_BOX_FEATURE_EXTRACTOR: False + BO_WEIGHT: 0.1 + Loss_balance: 1.1 + +PROCESS: + PNMS: True + NMS_THRESH: 0.25 +DATASETS: + TRAIN: ("CTW1500_train",) + TEST: ("CTW1500_test",) + Test_Visual: True +DATALOADER: + SIZE_DIVISIBILITY: 32 +SOLVER: + BASE_LR: 0.0025 + BIAS_LR_FACTOR: 2 + WEIGHT_DECAY: 0.0001 + STEPS: (30000, 40000) + MAX_ITER: 45000 + IMS_PER_BATCH: 1 + CHECKPOINT_PERIOD: 1000 +INPUT: + + MIN_SIZE_TRAIN: (400,600,720,1000,1200) + MAX_SIZE_TRAIN: 2000 + MIN_SIZE_TEST: 720 + MAX_SIZE_TEST: 1280 + CROP_PROB_TRAIN: 1.0 + ROTATE_PROB_TRAIN: 0.0 + ROTATE_DEGREE: (0,30,60,90,210,150,180,210,240,270,300,330,360) + +TEST: + IMS_PER_BATCH: 1 + + diff --git a/configs/ic/r50_baseline.yaml b/configs/ic/r50_baseline.yaml new file mode 100644 index 0000000000000000000000000000000000000000..056a3fc7a39ffc40e13da3a0d10af9166cc200da --- 
/dev/null +++ b/configs/ic/r50_baseline.yaml @@ -0,0 +1,75 @@ +OUTPUT_DIR: "./output/ic15" +MODEL: + META_ARCHITECTURE: "GeneralizedRCNN" + WEIGHT: catalog://ImageNetPretrained/MSRA/R-50 + BACKBONE: + CONV_BODY: "R-50-FPN" + RESNETS: + BACKBONE_OUT_CHANNELS: 256 + RPN: + USE_FPN: True + ANCHOR_STRIDE: (4, 8, 16, 32, 64) + ASPECT_RATIOS: (0.25, 0.5, 1.0, 2.0, 4.0) + ROI_HEADS: + USE_FPN: True + SCORE_THRESH: 0.52 # ic15 + NMS: 0.89 + ROI_BOX_HEAD: + DEFORMABLE_POOLING: False + POOLER_RESOLUTION: 7 + POOLER_SCALES: (0.25, 0.125, 0.0625, 0.03125) + POOLER_SAMPLING_RATIO: 2 + FEATURE_EXTRACTOR: "FPN2MLPFeatureExtractor" + PREDICTOR: "FPNPredictor" + NUM_CLASSES: 2 + CLASS_WEIGHT: 1.0 + ## Boundary + BOUNDARY_ON: True + ROI_BOUNDARY_HEAD: + DEFORMABLE_POOLING: False + FEATURE_EXTRACTOR: "BoundaryRCNNFPNFeatureExtractor" + POOLER_RESOLUTION: 14 + POOLER_SCALES: (0.25, 0.125, 0.0625, 0.03125) + POOLER_SAMPLING_RATIO: 2 + PREDICTOR: "BoundaryRCNNC4Predictor" + RESOLUTION: 48 + SHARE_BOX_FEATURE_EXTRACTOR: False + BO_WEIGHT: 0.1 + Loss_balance: 1.0 + +PROCESS: + PNMS: True + NMS_THRESH: 0.25 +DATASETS: + TRAIN: ("ic15_train",) + TEST: ("ic15_test",) + Test_Visual: True +DATALOADER: + SIZE_DIVISIBILITY: 32 +SOLVER: + BASE_LR: 0.00025 + BIAS_LR_FACTOR: 2 + WEIGHT_DECAY: 0.0001 +# STEPS: (120000, 160000) + STEPS: (5000, 10000) # fine-tune +# MAX_ITER: 180000 + MAX_ITER: 190500 # fine-tune + IMS_PER_BATCH: 1 + CHECKPOINT_PERIOD: 5000 +INPUT: + + MIN_SIZE_TRAIN: (400,600,720,1000,1200) + MAX_SIZE_TRAIN: 2000 + MIN_SIZE_TEST: 1200 + MAX_SIZE_TEST: 2000 + + CROP_PROB_TRAIN: 1.0 + ROTATE_PROB_TRAIN: 0.3 # fine-tune +# ROTATE_PROB_TRAIN: 1.0 +# ROTATE_DEGREE: (0,30,60,90,210,150,180,210,240,270,300,330,360) + ROTATE_DEGREE: (10,) # fine-tune + +TEST: + IMS_PER_BATCH: 1 + + diff --git a/demo/1.jpg b/demo/1.jpg new file mode 100644 index 0000000000000000000000000000000000000000..d0be654dbc9fbb13e9e6a4b9453caa3562daba47 Binary files /dev/null and b/demo/1.jpg differ diff --git a/demo/2.jpg b/demo/2.jpg new file mode 100644 index 0000000000000000000000000000000000000000..507950bc42f98aacbce40b33a0118becec43d93f Binary files /dev/null and b/demo/2.jpg differ diff --git a/demo/example1.jpg b/demo/example1.jpg new file mode 100644 index 0000000000000000000000000000000000000000..fb8063a68c4a7422810a9a83d323a78cd0faf67c Binary files /dev/null and b/demo/example1.jpg differ diff --git a/demo/example_results.jpg b/demo/example_results.jpg new file mode 100644 index 0000000000000000000000000000000000000000..b8b7c8e2d1a68952152703111be1d9f478619257 Binary files /dev/null and b/demo/example_results.jpg differ diff --git a/maskrcnn_benchmark/__init__.py b/maskrcnn_benchmark/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..5c7f19c6c00a4ac3f2f2bc66f892e44bcbd72612 --- /dev/null +++ b/maskrcnn_benchmark/__init__.py @@ -0,0 +1 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 
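The two YAML files above override the defaults declared in maskrcnn_benchmark/config/defaults.py later in this patch. A minimal sketch of how such a config is typically consumed, assuming the standard yacs workflow (the actual train/test entry-point script is not part of this diff):

    # Sketch only: the entry-point script is assumed, not included in this patch.
    from maskrcnn_benchmark.config import cfg   # exposes the _C node from defaults.py

    cfg.merge_from_file("configs/ctw/r50_baseline.yaml")   # YAML values override the defaults
    cfg.merge_from_list(["SOLVER.IMS_PER_BATCH", 1])        # optional command-line style overrides
    cfg.freeze()                                            # lock the config before building the model

    print(cfg.MODEL.ROI_HEADS.SCORE_THRESH)                 # 0.85 for CTW-1500, 0.52 for IC15

Tuple-valued entries such as ANCHOR_STRIDE: (4, 8, 16, 32, 64) are read as strings by YAML and parsed back into Python tuples by yacs during the merge, so they must match the type of the corresponding default.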
diff --git a/maskrcnn_benchmark/__pycache__/__init__.cpython-37.pyc b/maskrcnn_benchmark/__pycache__/__init__.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..239b08be4b4cfdf3d70754418e36b0ebe8b19083 Binary files /dev/null and b/maskrcnn_benchmark/__pycache__/__init__.cpython-37.pyc differ diff --git a/maskrcnn_benchmark/config/__init__.py b/maskrcnn_benchmark/config/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..22a15023b1b06dad1f8c36924cdbb96bf1f5dc8d --- /dev/null +++ b/maskrcnn_benchmark/config/__init__.py @@ -0,0 +1,2 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +from .defaults import _C as cfg diff --git a/maskrcnn_benchmark/config/__pycache__/__init__.cpython-37.pyc b/maskrcnn_benchmark/config/__pycache__/__init__.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b4dcd8c5e9719cf492218a3621e7b022ebbcab6f Binary files /dev/null and b/maskrcnn_benchmark/config/__pycache__/__init__.cpython-37.pyc differ diff --git a/maskrcnn_benchmark/config/__pycache__/defaults.cpython-37.pyc b/maskrcnn_benchmark/config/__pycache__/defaults.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..51b32a43281423b8a8127c3a035e20516eb30225 Binary files /dev/null and b/maskrcnn_benchmark/config/__pycache__/defaults.cpython-37.pyc differ diff --git a/maskrcnn_benchmark/config/__pycache__/paths_catalog.cpython-37.pyc b/maskrcnn_benchmark/config/__pycache__/paths_catalog.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..18400dd1d95f9ca56e88aea30264e2f28cb2199c Binary files /dev/null and b/maskrcnn_benchmark/config/__pycache__/paths_catalog.cpython-37.pyc differ diff --git a/maskrcnn_benchmark/config/defaults.py b/maskrcnn_benchmark/config/defaults.py new file mode 100644 index 0000000000000000000000000000000000000000..aa35ac474b5d42a99361d1ac5ba2d8e164ae0a2c --- /dev/null +++ b/maskrcnn_benchmark/config/defaults.py @@ -0,0 +1,471 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +import os + +from yacs.config import CfgNode as CN + + +# ----------------------------------------------------------------------------- +# Convention about Training / Test specific parameters +# ----------------------------------------------------------------------------- +# Whenever an argument can be either used for training or for testing, the +# corresponding name will be post-fixed by a _TRAIN for a training parameter, +# or _TEST for a test-specific parameter. +# For example, the number of images during training will be +# IMAGES_PER_BATCH_TRAIN, while the number of images for testing will be +# IMAGES_PER_BATCH_TEST + +# ----------------------------------------------------------------------------- +# Config definition +# ----------------------------------------------------------------------------- + +_C = CN() + +_C.MODEL = CN() +_C.MODEL.RPN_ONLY = False +_C.MODEL.MASK_ON = False +_C.MODEL.FCOS_ON = False +_C.MODEL.KE_ON = False +_C.MODEL.BOUNDARY_ON = False +_C.MODEL.MSR_ON = False +_C.MODEL.RETINANET_ON = False +_C.MODEL.KEYPOINT_ON = False +_C.MODEL.DEVICE = "cuda" +_C.MODEL.META_ARCHITECTURE = "GeneralizedRCNN" +_C.MODEL.CLS_AGNOSTIC_BBOX_REG = False + +# If the WEIGHT starts with a catalog://, like :R-50, the code will look for +# the path in paths_catalog. 
Else, it will use it as the specified absolute +# path +_C.MODEL.WEIGHT = "" + + +# ----------------------------------------------------------------------------- +# INPUT +# ----------------------------------------------------------------------------- +_C.INPUT = CN() +# Size of the smallest side of the image during training +_C.INPUT.MIN_SIZE_TRAIN = (800,) # (800,) +# The range of the smallest side for multi-scale training +_C.INPUT.MIN_SIZE_RANGE_TRAIN = (-1, -1) # -1 means disabled and it will use MIN_SIZE_TRAIN +# Maximum size of the side of the image during training +_C.INPUT.MAX_SIZE_TRAIN = 1333 +# Size of the smallest side of the image during testing +_C.INPUT.MIN_SIZE_TEST = 1000 +# Maximum size of the side of the image during testing +_C.INPUT.MAX_SIZE_TEST = 1333 +# Values to be used for image normalization +_C.INPUT.PIXEL_MEAN = [102.9801, 115.9465, 122.7717] +# Values to be used for image normalization +_C.INPUT.PIXEL_STD = [1., 1., 1.] +# Convert image to BGR format (for Caffe2 models), in range 0-255 +_C.INPUT.TO_BGR255 = True +_C.INPUT.CROP_PROB_TRAIN = 1.0 +_C.INPUT.ROTATE_PROB_TRAIN = 0.3 +_C.INPUT.ROTATE_DEGREE = (0,15,-15,45,-45,90,-90) +# _C.INPUT.ROTATE_DEGREE = 15 + + + + +# ----------------------------------------------------------------------------- +# Dataset +# ----------------------------------------------------------------------------- +_C.DATASETS = CN() +# List of the dataset names for training, as present in paths_catalog.py +_C.DATASETS.TRAIN = () +# List of the dataset names for testing, as present in paths_catalog.py +_C.DATASETS.TEST = () +_C.DATASETS.Test_Visual = False +# ----------------------------------------------------------------------------- +# DataLoader +# ----------------------------------------------------------------------------- +_C.DATALOADER = CN() +# Number of data loading threads +_C.DATALOADER.NUM_WORKERS = 4 +# If > 0, this enforces that each collated batch should have a size divisible +# by SIZE_DIVISIBILITY +_C.DATALOADER.SIZE_DIVISIBILITY = 0 +# If True, each batch should contain only images for which the aspect ratio +# is compatible. This groups portrait images together, and landscape images +# are not batched with portrait images. 
+_C.DATALOADER.ASPECT_RATIO_GROUPING = True + + +# ---------------------------------------------------------------------------- # +# Backbone options +# ---------------------------------------------------------------------------- # +_C.MODEL.BACKBONE = CN() + +# The backbone conv body to use +# The string must match a function that is imported in modeling.model_builder +# (e.g., 'FPN.add_fpn_ResNet101_conv5_body' to specify a ResNet-101-FPN +# backbone) +_C.MODEL.BACKBONE.CONV_BODY = "R-50-C4" + +# Add StopGrad at a specified stage so the bottom layers are frozen +_C.MODEL.BACKBONE.FREEZE_CONV_BODY_AT = 2 +# GN for backbone + +##123123123 +_C.MODEL.BACKBONE.USE_GN = False + + +# ---------------------------------------------------------------------------- # +# FPN options +# ---------------------------------------------------------------------------- # +_C.MODEL.FPN = CN() + +# 123123123 +_C.MODEL.FPN.USE_GN = False +_C.MODEL.FPN.USE_RELU = False + +#############123123123 +_C.MODEL.FPN.USE_DEFORMABLE = False + + +# ---------------------------------------------------------------------------- # +# Group Norm options +# ---------------------------------------------------------------------------- # +_C.MODEL.GROUP_NORM = CN() +# Number of dimensions per group in GroupNorm (-1 if using NUM_GROUPS) +_C.MODEL.GROUP_NORM.DIM_PER_GP = -1 +# Number of groups in GroupNorm (-1 if using DIM_PER_GP) +_C.MODEL.GROUP_NORM.NUM_GROUPS = 32 +# GroupNorm's small constant in the denominator +_C.MODEL.GROUP_NORM.EPSILON = 1e-5 + + +# ---------------------------------------------------------------------------- # +# RPN options +# ---------------------------------------------------------------------------- # +_C.MODEL.RPN = CN() +_C.MODEL.RPN.USE_FPN = False +# Base RPN anchor sizes given in absolute pixels w.r.t. the scaled network input +_C.MODEL.RPN.ANCHOR_SIZES = (32, 64, 128, 256, 512) +# Stride of the feature map that RPN is attached. +# For FPN, number of strides should match number of scales +_C.MODEL.RPN.ANCHOR_STRIDE = (16,) +# RPN anchor aspect ratios +_C.MODEL.RPN.ASPECT_RATIOS = (0.5, 1.0, 2.0) +# Remove RPN anchors that go outside the image by RPN_STRADDLE_THRESH pixels +# Set to -1 or a large value, e.g. 
100000, to disable pruning anchors +_C.MODEL.RPN.STRADDLE_THRESH = 0 +# Minimum overlap required between an anchor and ground-truth box for the +# (anchor, gt box) pair to be a positive example (IoU >= FG_IOU_THRESHOLD +# ==> positive RPN example) +_C.MODEL.RPN.FG_IOU_THRESHOLD = 0.7 +# Maximum overlap allowed between an anchor and ground-truth box for the +# (anchor, gt box) pair to be a negative example (IoU < BG_IOU_THRESHOLD +# ==> negative RPN example) +_C.MODEL.RPN.BG_IOU_THRESHOLD = 0.3 +# Total number of RPN examples per image +_C.MODEL.RPN.BATCH_SIZE_PER_IMAGE = 256 +# Target fraction of foreground (positive) examples per RPN minibatch +_C.MODEL.RPN.POSITIVE_FRACTION = 0.5 +# Number of top scoring RPN proposals to keep before applying NMS +# When FPN is used, this is *per FPN level* (not total) +_C.MODEL.RPN.PRE_NMS_TOP_N_TRAIN = 12000 + +_C.MODEL.RPN.PRE_NMS_TOP_N_TEST = 6000 +# Number of top scoring RPN proposals to keep after applying NMS +_C.MODEL.RPN.POST_NMS_TOP_N_TRAIN = 2000 +_C.MODEL.RPN.POST_NMS_TOP_N_TEST = 1000 +# NMS threshold used on RPN proposals +_C.MODEL.RPN.NMS_THRESH = 0.7 +# Proposal height and width both need to be greater than RPN_MIN_SIZE +# (at the scale used during training or inference) +_C.MODEL.RPN.MIN_SIZE = 0 +# Number of top scoring RPN proposals to keep after combining proposals from +# all FPN levels +_C.MODEL.RPN.FPN_POST_NMS_TOP_N_TRAIN = 2000 +_C.MODEL.RPN.FPN_POST_NMS_TOP_N_TEST = 2000 +# Custom rpn head, empty to use default conv or separable conv +_C.MODEL.RPN.RPN_HEAD = "SingleConvRPNHead_1" + + +# ---------------------------------------------------------------------------- # +# ROI HEADS options +# ---------------------------------------------------------------------------- # +_C.MODEL.ROI_HEADS = CN() +_C.MODEL.ROI_HEADS.USE_FPN = False +# Overlap threshold for an RoI to be considered foreground (if >= FG_IOU_THRESHOLD) +_C.MODEL.ROI_HEADS.FG_IOU_THRESHOLD = 0.5 +# Overlap threshold for an RoI to be considered background +# (class = 0 if overlap in [0, BG_IOU_THRESHOLD)) +_C.MODEL.ROI_HEADS.BG_IOU_THRESHOLD = 0.5 +# Default weights on (dx, dy, dw, dh) for normalizing bbox regression targets +# These are empirically chosen to approximately lead to unit variance targets +_C.MODEL.ROI_HEADS.BBOX_REG_WEIGHTS = (10., 10., 5., 5.) +# RoI minibatch size *per image* (number of regions of interest [ROIs]) +# Total number of RoIs per training minibatch = +# TRAIN.BATCH_SIZE_PER_IM * TRAIN.IMS_PER_BATCH +# E.g., a common configuration is: 512 * 2 * 8 = 8192 +_C.MODEL.ROI_HEADS.BATCH_SIZE_PER_IMAGE = 512 +# Target fraction of RoI minibatch that is labeled foreground (i.e.
class > 0) +_C.MODEL.ROI_HEADS.POSITIVE_FRACTION = 0.25 + +# Only used on test mode + +# Minimum score threshold (assuming scores in a [0, 1] range); a value chosen to +# balance obtaining high recall with not having too many low precision +# detections that will slow down inference post processing steps (like NMS) +_C.MODEL.ROI_HEADS.SCORE_THRESH = 0.05 +# Overlap threshold used for non-maximum suppression (suppress boxes with +# IoU >= this threshold) +_C.MODEL.ROI_HEADS.NMS = 0.5 +# Maximum number of detections to return per image (100 is based on the limit established for the COCO dataset) +_C.MODEL.ROI_HEADS.DETECTIONS_PER_IMG = 100 + + +_C.MODEL.ROI_BOX_HEAD = CN() +_C.MODEL.ROI_BOX_HEAD.FEATURE_EXTRACTOR = "ResNet50Conv5ROIFeatureExtractor" +_C.MODEL.ROI_BOX_HEAD.PREDICTOR = "FastRCNNPredictor" +_C.MODEL.ROI_BOX_HEAD.POOLER_RESOLUTION = 14 +_C.MODEL.ROI_BOX_HEAD.POOLER_SAMPLING_RATIO = 0 +_C.MODEL.ROI_BOX_HEAD.POOLER_SCALES = (1.0 / 16,) +_C.MODEL.ROI_BOX_HEAD.NUM_CLASSES = 81 +# Hidden layer dimension when using an MLP for the RoI box head +_C.MODEL.ROI_BOX_HEAD.MLP_HEAD_DIM = 1024 +# GN +#####123123123 +_C.MODEL.ROI_BOX_HEAD.USE_GN = False +# Dilation +_C.MODEL.ROI_BOX_HEAD.DILATION = 1 +_C.MODEL.ROI_BOX_HEAD.CONV_HEAD_DIM = 256 + +#### 123123 +_C.MODEL.ROI_BOX_HEAD.NUM_STACKED_CONVS = 4 +_C.MODEL.ROI_BOX_HEAD.CLASS_WEIGHT = 0.1 +_C.MODEL.ROI_BOX_HEAD.DEFORMABLE_POOLING = False + +_C.MODEL.ROI_MASK_HEAD = CN() +# Whether or not resize and translate masks to the input image. +_C.MODEL.ROI_MASK_HEAD.POSTPROCESS_MASKS = False +_C.MODEL.ROI_MASK_HEAD.POSTPROCESS_MASKS_THRESHOLD = 0.5 +_C.MODEL.ROI_MASK_HEAD.DILATION = 1 +_C.MODEL.ROI_MASK_HEAD.USE_GN = False + +# Boundary edge +_C.MODEL.ROI_BOUNDARY_HEAD = CN() +_C.MODEL.ROI_BOUNDARY_HEAD.DEFORMABLE_POOLING = False + +_C.MODEL.ROI_BOUNDARY_HEAD.FEATURE_EXTRACTOR = "ResNet50Conv5ROIFeatureExtractor" +_C.MODEL.ROI_BOUNDARY_HEAD.POOLER_RESOLUTION = 14 +_C.MODEL.ROI_BOUNDARY_HEAD.POOLER_SCALES = (1.0 / 16,) +_C.MODEL.ROI_BOUNDARY_HEAD.POOLER_SAMPLING_RATIO = 0 +_C.MODEL.ROI_BOUNDARY_HEAD.CONV_LAYERS = (256, 256, 256, 256) + +_C.MODEL.ROI_BOUNDARY_HEAD.PREDICTOR = "KERCNNC4Predictor" +_C.MODEL.ROI_BOUNDARY_HEAD.RESOLUTION = 14 +_C.MODEL.ROI_BOUNDARY_HEAD.SHARE_BOX_FEATURE_EXTRACTOR = True +_C.MODEL.ROI_BOUNDARY_HEAD.BO_WEIGHT = 1.0 +_C.MODEL.ROI_BOUNDARY_HEAD.Loss_balance = 1.2 + +# ---------------------------------------------------------------------------- # +# ResNe[X]t options (ResNets = {ResNet, ResNeXt} +# Note that parts of a resnet may be used for both the backbone and the head +# These options apply to both +# ---------------------------------------------------------------------------- # +_C.MODEL.RESNETS = CN() + +# Number of groups to use; 1 ==> ResNet; > 1 ==> ResNeXt +_C.MODEL.RESNETS.NUM_GROUPS = 1 + +# Baseline width of each group +_C.MODEL.RESNETS.WIDTH_PER_GROUP = 64 + +# Place the stride 2 conv on the 1x1 filter +# Use True only for the original MSRA ResNet; use False for C2 and Torch models +_C.MODEL.RESNETS.STRIDE_IN_1X1 = True + +# Residual transformation function +_C.MODEL.RESNETS.TRANS_FUNC = "BottleneckWithFixedBatchNorm" +_C.MODEL.RESNETS.DEF_FUNC = "DeformableConvWithFixedBatchNorm" +# ResNet's stem function (conv1 and pool1) +_C.MODEL.RESNETS.STEM_FUNC = "StemWithFixedBatchNorm" +_C.MODEL.RESNETS.DEF_START_MODULE = "NA" + +#########123123123 +_C.MODEL.RESNETS.DEFORM_POOLING = False + +# Apply dilation in stage "res5" +_C.MODEL.RESNETS.RES5_DILATION = 1 + +_C.MODEL.RESNETS.BACKBONE_OUT_CHANNELS = 256 * 4 
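The ROI_HEADS comments above describe the two test-time knobs: SCORE_THRESH first drops low-confidence detections, then NMS greedily suppresses overlapping boxes. A minimal PyTorch reference of that post-processing, assuming the same +1 box-area convention as the nms_cpu kernel added later in this patch (an illustrative sketch, not code from this repository):

    import torch

    def postprocess_reference(boxes, scores, score_thresh=0.05, nms_thresh=0.5):
        # Step 1: ROI_HEADS.SCORE_THRESH -- discard low-scoring detections.
        keep = scores > score_thresh
        boxes, scores = boxes[keep], scores[keep]
        # Step 2: ROI_HEADS.NMS -- greedy suppression of boxes with IoU >= nms_thresh.
        x1, y1, x2, y2 = boxes.unbind(dim=1)
        areas = (x2 - x1 + 1) * (y2 - y1 + 1)          # +1 convention, as in nms_cpu_kernel
        order = scores.argsort(descending=True)
        selected = []
        while order.numel() > 0:
            i = order[0]
            selected.append(i.item())
            rest = order[1:]
            if rest.numel() == 0:
                break
            xx1 = torch.maximum(x1[rest], x1[i])
            yy1 = torch.maximum(y1[rest], y1[i])
            xx2 = torch.minimum(x2[rest], x2[i])
            yy2 = torch.minimum(y2[rest], y2[i])
            inter = (xx2 - xx1 + 1).clamp(min=0) * (yy2 - yy1 + 1).clamp(min=0)
            iou = inter / (areas[i] + areas[rest] - inter)
            order = rest[iou < nms_thresh]
        return boxes[selected], scores[selected]

With the CTW-1500 config above this corresponds to score_thresh=0.85 and nms_thresh=0.3; the IC15 config uses 0.52 and 0.89.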
+_C.MODEL.RESNETS.RES2_OUT_CHANNELS = 256 +_C.MODEL.RESNETS.STEM_OUT_CHANNELS = 64 + +# ---------------------------------------------------------------------------- # +# FCOS Options +# ---------------------------------------------------------------------------- # +_C.MODEL.FCOS = CN() +_C.MODEL.FCOS.NUM_CLASSES = 81 # the number of classes including background +_C.MODEL.FCOS.FPN_STRIDES = [8, 16, 32, 64, 128] +_C.MODEL.FCOS.PRIOR_PROB = 0.01 +_C.MODEL.FCOS.INFERENCE_TH = 0.05 +_C.MODEL.FCOS.NMS_TH = 0.4 +_C.MODEL.FCOS.PRE_NMS_TOP_N = 1000 + +# Focal loss parameter: alpha +_C.MODEL.FCOS.LOSS_ALPHA = 0.25 +# Focal loss parameter: gamma +_C.MODEL.FCOS.LOSS_GAMMA = 2.0 +_C.MODEL.FCOS.SIZES_OF_INTEREST = [64, 128, 256, 512] + +# the number of convolutions used in the cls and bbox tower +_C.MODEL.FCOS.NUM_CONVS = 4 + +# ---------------------------------------------------------------------------- # +# RetinaNet Options (Follow the Detectron version) +# ---------------------------------------------------------------------------- # +_C.MODEL.RETINANET = CN() + +# This is the number of foreground classes and background. +_C.MODEL.RETINANET.NUM_CLASSES = 81 + +# Anchor aspect ratios to use +_C.MODEL.RETINANET.ANCHOR_SIZES = (32, 64, 128, 256, 512) +_C.MODEL.RETINANET.ASPECT_RATIOS = (0.5, 1.0, 2.0) +_C.MODEL.RETINANET.ANCHOR_STRIDES = (8, 16, 32, 64, 128) +_C.MODEL.RETINANET.STRADDLE_THRESH = 0 + +# Anchor scales per octave +_C.MODEL.RETINANET.OCTAVE = 2.0 +_C.MODEL.RETINANET.SCALES_PER_OCTAVE = 3 + +# Use C5 or P5 to generate P6 +_C.MODEL.RETINANET.USE_C5 = True + +# Convolutions to use in the cls and bbox tower +# NOTE: this doesn't include the last conv for logits +_C.MODEL.RETINANET.NUM_CONVS = 4 + +# Weight for bbox_regression loss +_C.MODEL.RETINANET.BBOX_REG_WEIGHT = 4.0 + +# Smooth L1 loss beta for bbox regression +_C.MODEL.RETINANET.BBOX_REG_BETA = 0.11 + +# During inference, #locs to select based on cls score before NMS is performed +# per FPN level +_C.MODEL.RETINANET.PRE_NMS_TOP_N = 1000 + +# IoU overlap ratio for labeling an anchor as positive +# Anchors with >= iou overlap are labeled positive +_C.MODEL.RETINANET.FG_IOU_THRESHOLD = 0.5 + +# IoU overlap ratio for labeling an anchor as negative +# Anchors with < iou overlap are labeled negative +_C.MODEL.RETINANET.BG_IOU_THRESHOLD = 0.4 + +# Focal loss parameter: alpha +_C.MODEL.RETINANET.LOSS_ALPHA = 0.25 + +# Focal loss parameter: gamma +_C.MODEL.RETINANET.LOSS_GAMMA = 2.0 + +# Prior prob for the positives at the beginning of training. 
This is used to set +# the bias init for the logits layer +_C.MODEL.RETINANET.PRIOR_PROB = 0.01 + +# Inference cls score threshold, anchors with score > INFERENCE_TH are +# considered for inference +_C.MODEL.RETINANET.INFERENCE_TH = 0.05 + +# NMS threshold used in RetinaNet +_C.MODEL.RETINANET.NMS_TH = 0.4 + + +# ---------------------------------------------------------------------------- # +# FBNet options +# ---------------------------------------------------------------------------- # +_C.MODEL.FBNET = CN() +_C.MODEL.FBNET.ARCH = "default" +# custom arch +_C.MODEL.FBNET.ARCH_DEF = "" +_C.MODEL.FBNET.BN_TYPE = "bn" +_C.MODEL.FBNET.SCALE_FACTOR = 1.0 +# the output channels will be divisible by WIDTH_DIVISOR +_C.MODEL.FBNET.WIDTH_DIVISOR = 1 +_C.MODEL.FBNET.DW_CONV_SKIP_BN = True +_C.MODEL.FBNET.DW_CONV_SKIP_RELU = True + +# > 0 scale, == 0 skip, < 0 same dimension +_C.MODEL.FBNET.DET_HEAD_LAST_SCALE = 1.0 +_C.MODEL.FBNET.DET_HEAD_BLOCKS = [] +# overwrite the stride for the head, 0 to use original value +_C.MODEL.FBNET.DET_HEAD_STRIDE = 0 + +# > 0 scale, == 0 skip, < 0 same dimension +_C.MODEL.FBNET.KPTS_HEAD_LAST_SCALE = 0.0 +_C.MODEL.FBNET.KPTS_HEAD_BLOCKS = [] +# overwrite the stride for the head, 0 to use original value +_C.MODEL.FBNET.KPTS_HEAD_STRIDE = 0 + +# > 0 scale, == 0 skip, < 0 same dimension +_C.MODEL.FBNET.MASK_HEAD_LAST_SCALE = 0.0 +_C.MODEL.FBNET.MASK_HEAD_BLOCKS = [] +# overwrite the stride for the head, 0 to use original value +_C.MODEL.FBNET.MASK_HEAD_STRIDE = 0 + +# 0 to use all blocks defined in arch_def +_C.MODEL.FBNET.RPN_HEAD_BLOCKS = 0 +_C.MODEL.FBNET.RPN_BN_TYPE = "" + + +# ---------------------------------------------------------------------------- # +# Solver +# ---------------------------------------------------------------------------- # +_C.SOLVER = CN() +_C.SOLVER.MAX_ITER = 40000 + +_C.SOLVER.BASE_LR = 0.001 +_C.SOLVER.BIAS_LR_FACTOR = 2 + +_C.SOLVER.MOMENTUM = 0.9 + +_C.SOLVER.WEIGHT_DECAY = 0.0005 +_C.SOLVER.WEIGHT_DECAY_BIAS = 0 + +_C.SOLVER.GAMMA = 0.1 +_C.SOLVER.STEPS = (30000,) + +_C.SOLVER.WARMUP_FACTOR = 1.0 / 3 +_C.SOLVER.WARMUP_ITERS = 500 +_C.SOLVER.WARMUP_METHOD = "linear" + +_C.SOLVER.CHECKPOINT_PERIOD = 2500 + +# Number of images per batch +# This is global, so if we have 8 GPUs and IMS_PER_BATCH = 16, each GPU will +# see 2 images per batch +_C.SOLVER.IMS_PER_BATCH = 4 + +# ---------------------------------------------------------------------------- # +# Specific test options +# ---------------------------------------------------------------------------- # +_C.TEST = CN() +_C.TEST.EXPECTED_RESULTS = [] +_C.TEST.EXPECTED_RESULTS_SIGMA_TOL = 4 +# Number of images per batch +# This is global, so if we have 8 GPUs and IMS_PER_BATCH = 16, each GPU will +# see 2 images per batch +_C.TEST.IMS_PER_BATCH = 16 +# Number of detections per image +_C.TEST.DETECTIONS_PER_IMG = 100 + + +# ---------------------------------------------------------------------------- # +# Misc options +# ---------------------------------------------------------------------------- # +_C.OUTPUT_DIR = "./1" +_C.IS_LOAD_OPTIMIZER = True +_C.IS_LOAD_SCHEDULER = True +_C.PROCESS = CN() + +#####123123123 +_C.PROCESS.PNMS = False +_C.PROCESS.NMS_THRESH = 0.4 + +_C.PATHS_CATALOG = os.path.join(os.path.dirname(__file__), "paths_catalog.py") diff --git a/maskrcnn_benchmark/config/paths_catalog.py b/maskrcnn_benchmark/config/paths_catalog.py new file mode 100644 index 0000000000000000000000000000000000000000..0ad2540e07c484eff858363003f3015a5f99713d --- /dev/null +++ 
b/maskrcnn_benchmark/config/paths_catalog.py @@ -0,0 +1,120 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +"""Centralized catalog of paths.""" + +import os + +class DatasetCatalog(object): + DATA_DIR = "/home/zhangbq/ws/ct/dataset/" + DATASETS = { + "ic15_train": ( + "ic15/ic15_train_images", + "ic15/annotations/ic15_train.json" + ), + "ic15_test": ( + "ic15/ic15_test_images", + "ic15/annotations/ic15_test.json" + ), + "CTW1500_train": ( + "ctw/ctw_train_images", + "ctw/annotations/ctw_train.json" + ), + "CTW1500_test": ( + "ctw/ctw_test_images", + "ctw/annotations/ctw_test.json" + ) + +} + + @staticmethod + def get(name): + data_dir = DatasetCatalog.DATA_DIR + attrs = DatasetCatalog.DATASETS[name] + if "coco" in name: + args = dict( + root=os.path.join(data_dir, attrs["img_dir"]), + ann_file=os.path.join(data_dir, attrs["ann_file"]), + ) + return dict( + factory="COCODataset", + args=args, + ) + elif "voc" in name: + args = dict( + data_dir=os.path.join(data_dir, attrs["data_dir"]), + split=attrs["split"], + ) + return dict( + factory="PascalVOCDataset", + args=args, + ) + elif True: + args = dict( + root=os.path.join(data_dir, attrs[0]), + ann_file=os.path.join(data_dir, attrs[1]), + ) + return dict( + factory="WordDataset", + args=args, + ) + raise RuntimeError("Dataset not available: {}".format(name)) + + +class ModelCatalog(object): + S3_C2_DETECTRON_URL = "https://dl.fbaipublicfiles.com/detectron" + C2_IMAGENET_MODELS = { + "MSRA/R-50": "ImageNetPretrained/MSRA/R-50.pkl", + "MSRA/R-50-GN": "ImageNetPretrained/47261647/R-50-GN.pkl", + "MSRA/R-101": "ImageNetPretrained/MSRA/R-101.pkl", + "MSRA/R-101-GN": "ImageNetPretrained/47592356/R-101-GN.pkl", + "FAIR/20171220/X-101-32x8d": "ImageNetPretrained/20171220/X-101-32x8d.pkl", + } + + C2_DETECTRON_SUFFIX = "output/train/{}coco_2014_train%3A{}coco_2014_valminusminival/generalized_rcnn/model_final.pkl" + C2_DETECTRON_MODELS = { + "35857197/e2e_faster_rcnn_R-50-C4_1x": "01_33_49.iAX0mXvW", + "35857345/e2e_faster_rcnn_R-50-FPN_1x": "01_36_30.cUF7QR7I", + "35857890/e2e_faster_rcnn_R-101-FPN_1x": "01_38_50.sNxI7sX7", + "36761737/e2e_faster_rcnn_X-101-32x8d-FPN_1x": "06_31_39.5MIHi1fZ", + "35858791/e2e_mask_rcnn_R-50-C4_1x": "01_45_57.ZgkA7hPB", + "35858933/e2e_mask_rcnn_R-50-FPN_1x": "01_48_14.DzEQe4wC", + "35861795/e2e_mask_rcnn_R-101-FPN_1x": "02_31_37.KqyEK4tT", + "36761843/e2e_mask_rcnn_X-101-32x8d-FPN_1x": "06_35_59.RZotkLKI", + "37129812/e2e_mask_rcnn_X-152-32x8d-FPN-IN5k_1.44x": "09_35_36.8pzTQKYK", + # keypoints + "37697547/e2e_keypoint_rcnn_R-50-FPN_1x": "08_42_54.kdzV35ao" + } + + @staticmethod + def get(name): + if name.startswith("Caffe2Detectron/COCO"): + return ModelCatalog.get_c2_detectron_12_2017_baselines(name) + if name.startswith("ImageNetPretrained"): + return ModelCatalog.get_c2_imagenet_pretrained(name) + raise RuntimeError("model not present in the catalog {}".format(name)) + + @staticmethod + def get_c2_imagenet_pretrained(name): + prefix = ModelCatalog.S3_C2_DETECTRON_URL + name = name[len("ImageNetPretrained/"):] + name = ModelCatalog.C2_IMAGENET_MODELS[name] + url = "/".join([prefix, name]) + return url + + @staticmethod + def get_c2_detectron_12_2017_baselines(name): + # Detectron C2 models are stored following the structure + # prefix//2012_2017_baselines/.yaml./suffix + # we use as identifiers in the catalog Caffe2Detectron/COCO// + prefix = ModelCatalog.S3_C2_DETECTRON_URL + dataset_tag = "keypoints_" if "keypoint" in name else "" + suffix = 
ModelCatalog.C2_DETECTRON_SUFFIX.format(dataset_tag, dataset_tag) + # remove identification prefix + name = name[len("Caffe2Detectron/COCO/"):] + # split in and + model_id, model_name = name.split("/") + # parsing to make it match the url address from the Caffe2 models + model_name = "{}.yaml".format(model_name) + signature = ModelCatalog.C2_DETECTRON_MODELS[name] + unique_name = ".".join([model_name, signature]) + url = "/".join([prefix, model_id, "12_2017_baselines", unique_name, suffix]) + return url diff --git a/maskrcnn_benchmark/csrc/ROIAlign.h b/maskrcnn_benchmark/csrc/ROIAlign.h new file mode 100644 index 0000000000000000000000000000000000000000..3907deab2a750a9f83f0f3ef38fee279c1445c61 --- /dev/null +++ b/maskrcnn_benchmark/csrc/ROIAlign.h @@ -0,0 +1,46 @@ +// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +#pragma once + +#include "cpu/vision.h" + +#ifdef WITH_CUDA +#include "cuda/vision.h" +#endif + +// Interface for Python +at::Tensor ROIAlign_forward(const at::Tensor& input, + const at::Tensor& rois, + const float spatial_scale, + const int pooled_height, + const int pooled_width, + const int sampling_ratio) { + if (input.type().is_cuda()) { +#ifdef WITH_CUDA + return ROIAlign_forward_cuda(input, rois, spatial_scale, pooled_height, pooled_width, sampling_ratio); +#else + AT_ERROR("Not compiled with GPU support"); +#endif + } + return ROIAlign_forward_cpu(input, rois, spatial_scale, pooled_height, pooled_width, sampling_ratio); +} + +at::Tensor ROIAlign_backward(const at::Tensor& grad, + const at::Tensor& rois, + const float spatial_scale, + const int pooled_height, + const int pooled_width, + const int batch_size, + const int channels, + const int height, + const int width, + const int sampling_ratio) { + if (grad.type().is_cuda()) { +#ifdef WITH_CUDA + return ROIAlign_backward_cuda(grad, rois, spatial_scale, pooled_height, pooled_width, batch_size, channels, height, width, sampling_ratio); +#else + AT_ERROR("Not compiled with GPU support"); +#endif + } + AT_ERROR("Not implemented on the CPU"); +} + diff --git a/maskrcnn_benchmark/csrc/ROIPool.h b/maskrcnn_benchmark/csrc/ROIPool.h new file mode 100644 index 0000000000000000000000000000000000000000..200fd7390b4629747f0ea9e16c0823ac5f099ac1 --- /dev/null +++ b/maskrcnn_benchmark/csrc/ROIPool.h @@ -0,0 +1,48 @@ +// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 
+#pragma once + +#include "cpu/vision.h" + +#ifdef WITH_CUDA +#include "cuda/vision.h" +#endif + + +std::tuple ROIPool_forward(const at::Tensor& input, + const at::Tensor& rois, + const float spatial_scale, + const int pooled_height, + const int pooled_width) { + if (input.type().is_cuda()) { +#ifdef WITH_CUDA + return ROIPool_forward_cuda(input, rois, spatial_scale, pooled_height, pooled_width); +#else + AT_ERROR("Not compiled with GPU support"); +#endif + } + AT_ERROR("Not implemented on the CPU"); +} + +at::Tensor ROIPool_backward(const at::Tensor& grad, + const at::Tensor& input, + const at::Tensor& rois, + const at::Tensor& argmax, + const float spatial_scale, + const int pooled_height, + const int pooled_width, + const int batch_size, + const int channels, + const int height, + const int width) { + if (grad.type().is_cuda()) { +#ifdef WITH_CUDA + return ROIPool_backward_cuda(grad, input, rois, argmax, spatial_scale, pooled_height, pooled_width, batch_size, channels, height, width); +#else + AT_ERROR("Not compiled with GPU support"); +#endif + } + AT_ERROR("Not implemented on the CPU"); +} + + + diff --git a/maskrcnn_benchmark/csrc/SigmoidFocalLoss.h b/maskrcnn_benchmark/csrc/SigmoidFocalLoss.h new file mode 100644 index 0000000000000000000000000000000000000000..308861e44774dffd89b3f5ebff7cc6c5491fe3a5 --- /dev/null +++ b/maskrcnn_benchmark/csrc/SigmoidFocalLoss.h @@ -0,0 +1,41 @@ +#pragma once + +#include "cpu/vision.h" + +#ifdef WITH_CUDA +#include "cuda/vision.h" +#endif + +// Interface for Python +at::Tensor SigmoidFocalLoss_forward( + const at::Tensor& logits, + const at::Tensor& targets, + const int num_classes, + const float gamma, + const float alpha) { + if (logits.type().is_cuda()) { +#ifdef WITH_CUDA + return SigmoidFocalLoss_forward_cuda(logits, targets, num_classes, gamma, alpha); +#else + AT_ERROR("Not compiled with GPU support"); +#endif + } + AT_ERROR("Not implemented on the CPU"); +} + +at::Tensor SigmoidFocalLoss_backward( + const at::Tensor& logits, + const at::Tensor& targets, + const at::Tensor& d_losses, + const int num_classes, + const float gamma, + const float alpha) { + if (logits.type().is_cuda()) { +#ifdef WITH_CUDA + return SigmoidFocalLoss_backward_cuda(logits, targets, d_losses, num_classes, gamma, alpha); +#else + AT_ERROR("Not compiled with GPU support"); +#endif + } + AT_ERROR("Not implemented on the CPU"); +} diff --git a/maskrcnn_benchmark/csrc/cpu/ROIAlign_cpu.cpp b/maskrcnn_benchmark/csrc/cpu/ROIAlign_cpu.cpp new file mode 100644 index 0000000000000000000000000000000000000000..d35aedf27ea581b9241d44b87dcca2e901b5064e --- /dev/null +++ b/maskrcnn_benchmark/csrc/cpu/ROIAlign_cpu.cpp @@ -0,0 +1,257 @@ +// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 
+#include "cpu/vision.h" + +// implementation taken from Caffe2 +template +struct PreCalc { + int pos1; + int pos2; + int pos3; + int pos4; + T w1; + T w2; + T w3; + T w4; +}; + +template +void pre_calc_for_bilinear_interpolate( + const int height, + const int width, + const int pooled_height, + const int pooled_width, + const int iy_upper, + const int ix_upper, + T roi_start_h, + T roi_start_w, + T bin_size_h, + T bin_size_w, + int roi_bin_grid_h, + int roi_bin_grid_w, + std::vector>& pre_calc) { + int pre_calc_index = 0; + for (int ph = 0; ph < pooled_height; ph++) { + for (int pw = 0; pw < pooled_width; pw++) { + for (int iy = 0; iy < iy_upper; iy++) { + const T yy = roi_start_h + ph * bin_size_h + + static_cast(iy + .5f) * bin_size_h / + static_cast(roi_bin_grid_h); // e.g., 0.5, 1.5 + for (int ix = 0; ix < ix_upper; ix++) { + const T xx = roi_start_w + pw * bin_size_w + + static_cast(ix + .5f) * bin_size_w / + static_cast(roi_bin_grid_w); + + T x = xx; + T y = yy; + // deal with: inverse elements are out of feature map boundary + if (y < -1.0 || y > height || x < -1.0 || x > width) { + // empty + PreCalc pc; + pc.pos1 = 0; + pc.pos2 = 0; + pc.pos3 = 0; + pc.pos4 = 0; + pc.w1 = 0; + pc.w2 = 0; + pc.w3 = 0; + pc.w4 = 0; + pre_calc[pre_calc_index] = pc; + pre_calc_index += 1; + continue; + } + + if (y <= 0) { + y = 0; + } + if (x <= 0) { + x = 0; + } + + int y_low = (int)y; + int x_low = (int)x; + int y_high; + int x_high; + + if (y_low >= height - 1) { + y_high = y_low = height - 1; + y = (T)y_low; + } else { + y_high = y_low + 1; + } + + if (x_low >= width - 1) { + x_high = x_low = width - 1; + x = (T)x_low; + } else { + x_high = x_low + 1; + } + + T ly = y - y_low; + T lx = x - x_low; + T hy = 1. - ly, hx = 1. - lx; + T w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx; + + // save weights and indeces + PreCalc pc; + pc.pos1 = y_low * width + x_low; + pc.pos2 = y_low * width + x_high; + pc.pos3 = y_high * width + x_low; + pc.pos4 = y_high * width + x_high; + pc.w1 = w1; + pc.w2 = w2; + pc.w3 = w3; + pc.w4 = w4; + pre_calc[pre_calc_index] = pc; + + pre_calc_index += 1; + } + } + } + } +} + +template +void ROIAlignForward_cpu_kernel( + const int nthreads, + const T* bottom_data, + const T& spatial_scale, + const int channels, + const int height, + const int width, + const int pooled_height, + const int pooled_width, + const int sampling_ratio, + const T* bottom_rois, + //int roi_cols, + T* top_data) { + //AT_ASSERT(roi_cols == 4 || roi_cols == 5); + int roi_cols = 5; + + int n_rois = nthreads / channels / pooled_width / pooled_height; + // (n, c, ph, pw) is an element in the pooled output + // can be parallelized using omp + // #pragma omp parallel for num_threads(32) + for (int n = 0; n < n_rois; n++) { + int index_n = n * channels * pooled_width * pooled_height; + + // roi could have 4 or 5 columns + const T* offset_bottom_rois = bottom_rois + n * roi_cols; + int roi_batch_ind = 0; + if (roi_cols == 5) { + roi_batch_ind = offset_bottom_rois[0]; + offset_bottom_rois++; + } + + // Do not using rounding; this implementation detail is critical + T roi_start_w = offset_bottom_rois[0] * spatial_scale; + T roi_start_h = offset_bottom_rois[1] * spatial_scale; + T roi_end_w = offset_bottom_rois[2] * spatial_scale; + T roi_end_h = offset_bottom_rois[3] * spatial_scale; + // T roi_start_w = round(offset_bottom_rois[0] * spatial_scale); + // T roi_start_h = round(offset_bottom_rois[1] * spatial_scale); + // T roi_end_w = round(offset_bottom_rois[2] * spatial_scale); + // T roi_end_h = 
round(offset_bottom_rois[3] * spatial_scale); + + // Force malformed ROIs to be 1x1 + T roi_width = std::max(roi_end_w - roi_start_w, (T)1.); + T roi_height = std::max(roi_end_h - roi_start_h, (T)1.); + T bin_size_h = static_cast(roi_height) / static_cast(pooled_height); + T bin_size_w = static_cast(roi_width) / static_cast(pooled_width); + + // We use roi_bin_grid to sample the grid and mimic integral + int roi_bin_grid_h = (sampling_ratio > 0) + ? sampling_ratio + : ceil(roi_height / pooled_height); // e.g., = 2 + int roi_bin_grid_w = + (sampling_ratio > 0) ? sampling_ratio : ceil(roi_width / pooled_width); + + // We do average (integral) pooling inside a bin + const T count = roi_bin_grid_h * roi_bin_grid_w; // e.g. = 4 + + // we want to precalculate indeces and weights shared by all chanels, + // this is the key point of optimiation + std::vector> pre_calc( + roi_bin_grid_h * roi_bin_grid_w * pooled_width * pooled_height); + pre_calc_for_bilinear_interpolate( + height, + width, + pooled_height, + pooled_width, + roi_bin_grid_h, + roi_bin_grid_w, + roi_start_h, + roi_start_w, + bin_size_h, + bin_size_w, + roi_bin_grid_h, + roi_bin_grid_w, + pre_calc); + + for (int c = 0; c < channels; c++) { + int index_n_c = index_n + c * pooled_width * pooled_height; + const T* offset_bottom_data = + bottom_data + (roi_batch_ind * channels + c) * height * width; + int pre_calc_index = 0; + + for (int ph = 0; ph < pooled_height; ph++) { + for (int pw = 0; pw < pooled_width; pw++) { + int index = index_n_c + ph * pooled_width + pw; + + T output_val = 0.; + for (int iy = 0; iy < roi_bin_grid_h; iy++) { + for (int ix = 0; ix < roi_bin_grid_w; ix++) { + PreCalc pc = pre_calc[pre_calc_index]; + output_val += pc.w1 * offset_bottom_data[pc.pos1] + + pc.w2 * offset_bottom_data[pc.pos2] + + pc.w3 * offset_bottom_data[pc.pos3] + + pc.w4 * offset_bottom_data[pc.pos4]; + + pre_calc_index += 1; + } + } + output_val /= count; + + top_data[index] = output_val; + } // for pw + } // for ph + } // for c + } // for n +} + +at::Tensor ROIAlign_forward_cpu(const at::Tensor& input, + const at::Tensor& rois, + const float spatial_scale, + const int pooled_height, + const int pooled_width, + const int sampling_ratio) { + AT_ASSERTM(!input.type().is_cuda(), "input must be a CPU tensor"); + AT_ASSERTM(!rois.type().is_cuda(), "rois must be a CPU tensor"); + + auto num_rois = rois.size(0); + auto channels = input.size(1); + auto height = input.size(2); + auto width = input.size(3); + + auto output = at::empty({num_rois, channels, pooled_height, pooled_width}, input.options()); + auto output_size = num_rois * pooled_height * pooled_width * channels; + + if (output.numel() == 0) { + return output; + } + + AT_DISPATCH_FLOATING_TYPES(input.type(), "ROIAlign_forward", [&] { + ROIAlignForward_cpu_kernel( + output_size, + input.data(), + spatial_scale, + channels, + height, + width, + pooled_height, + pooled_width, + sampling_ratio, + rois.data(), + output.data()); + }); + return output; +} diff --git a/maskrcnn_benchmark/csrc/cpu/dcn_v2_cpu.cpp b/maskrcnn_benchmark/csrc/cpu/dcn_v2_cpu.cpp new file mode 100644 index 0000000000000000000000000000000000000000..fc8a2a13c4dab47fbbc5dfc83dfc269a9ff68ecf --- /dev/null +++ b/maskrcnn_benchmark/csrc/cpu/dcn_v2_cpu.cpp @@ -0,0 +1,74 @@ +#include + +#include +#include + + +at::Tensor +dcn_v2_cpu_forward(const at::Tensor &input, + const at::Tensor &weight, + const at::Tensor &bias, + const at::Tensor &offset, + const at::Tensor &mask, + const int kernel_h, + const int kernel_w, + const int 
stride_h, + const int stride_w, + const int pad_h, + const int pad_w, + const int dilation_h, + const int dilation_w, + const int deformable_group) +{ + AT_ERROR("Not implemented on the CPU"); +} + +std::vector<at::Tensor> +dcn_v2_cpu_backward(const at::Tensor &input, + const at::Tensor &weight, + const at::Tensor &bias, + const at::Tensor &offset, + const at::Tensor &mask, + const at::Tensor &grad_output, + int kernel_h, int kernel_w, + int stride_h, int stride_w, + int pad_h, int pad_w, + int dilation_h, int dilation_w, + int deformable_group) +{ + AT_ERROR("Not implemented on the CPU"); +} + +std::tuple<at::Tensor, at::Tensor> +dcn_v2_psroi_pooling_cpu_forward(const at::Tensor &input, + const at::Tensor &bbox, + const at::Tensor &trans, + const int no_trans, + const float spatial_scale, + const int output_dim, + const int group_size, + const int pooled_size, + const int part_size, + const int sample_per_part, + const float trans_std) +{ + AT_ERROR("Not implemented on the CPU"); +} + +std::tuple<at::Tensor, at::Tensor> +dcn_v2_psroi_pooling_cpu_backward(const at::Tensor &out_grad, + const at::Tensor &input, + const at::Tensor &bbox, + const at::Tensor &trans, + const at::Tensor &top_count, + const int no_trans, + const float spatial_scale, + const int output_dim, + const int group_size, + const int pooled_size, + const int part_size, + const int sample_per_part, + const float trans_std) +{ + AT_ERROR("Not implemented on the CPU"); +} diff --git a/maskrcnn_benchmark/csrc/cpu/nms_cpu.cpp b/maskrcnn_benchmark/csrc/cpu/nms_cpu.cpp new file mode 100644 index 0000000000000000000000000000000000000000..1153dea04f032c67c41bd0d2a285376a72c5a595 --- /dev/null +++ b/maskrcnn_benchmark/csrc/cpu/nms_cpu.cpp @@ -0,0 +1,75 @@ +// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +#include "cpu/vision.h" + + +template <typename scalar_t> +at::Tensor nms_cpu_kernel(const at::Tensor& dets, + const at::Tensor& scores, + const float threshold) { + AT_ASSERTM(!dets.type().is_cuda(), "dets must be a CPU tensor"); + AT_ASSERTM(!scores.type().is_cuda(), "scores must be a CPU tensor"); + AT_ASSERTM(dets.type() == scores.type(), "dets should have the same type as scores"); + + if (dets.numel() == 0) { + return at::empty({0}, dets.options().dtype(at::kLong).device(at::kCPU)); + } + + auto x1_t = dets.select(1, 0).contiguous(); + auto y1_t = dets.select(1, 1).contiguous(); + auto x2_t = dets.select(1, 2).contiguous(); + auto y2_t = dets.select(1, 3).contiguous(); + + at::Tensor areas_t = (x2_t - x1_t + 1) * (y2_t - y1_t + 1); + + auto order_t = std::get<1>(scores.sort(0, /* descending=*/true)); + + auto ndets = dets.size(0); + at::Tensor suppressed_t = at::zeros({ndets}, dets.options().dtype(at::kByte).device(at::kCPU)); + + auto suppressed = suppressed_t.data<uint8_t>(); + auto order = order_t.data<int64_t>(); + auto x1 = x1_t.data<scalar_t>(); + auto y1 = y1_t.data<scalar_t>(); + auto x2 = x2_t.data<scalar_t>(); + auto y2 = y2_t.data<scalar_t>(); + auto areas = areas_t.data<scalar_t>(); + + for (int64_t _i = 0; _i < ndets; _i++) { + auto i = order[_i]; + if (suppressed[i] == 1) + continue; + auto ix1 = x1[i]; + auto iy1 = y1[i]; + auto ix2 = x2[i]; + auto iy2 = y2[i]; + auto iarea = areas[i]; + + for (int64_t _j = _i + 1; _j < ndets; _j++) { + auto j = order[_j]; + if (suppressed[j] == 1) + continue; + auto xx1 = std::max(ix1, x1[j]); + auto yy1 = std::max(iy1, y1[j]); + auto xx2 = std::min(ix2, x2[j]); + auto yy2 = std::min(iy2, y2[j]); + + auto w = std::max(static_cast<scalar_t>(0), xx2 - xx1 + 1); + auto h = std::max(static_cast<scalar_t>(0), yy2 - yy1 + 1); + auto inter = w * h; + auto ovr = inter / (iarea + areas[j] - inter); + if (ovr >= threshold) + suppressed[j] = 1; + } + }
+ return at::nonzero(suppressed_t == 0).squeeze(1); +} + +at::Tensor nms_cpu(const at::Tensor& dets, + const at::Tensor& scores, + const float threshold) { + at::Tensor result; + AT_DISPATCH_FLOATING_TYPES(dets.type(), "nms", [&] { + result = nms_cpu_kernel(dets, scores, threshold); + }); + return result; +} diff --git a/maskrcnn_benchmark/csrc/cpu/vision.h b/maskrcnn_benchmark/csrc/cpu/vision.h new file mode 100644 index 0000000000000000000000000000000000000000..19539caf9c5aa8b8025f786c3e54e23de300cf5e --- /dev/null +++ b/maskrcnn_benchmark/csrc/cpu/vision.h @@ -0,0 +1,73 @@ +// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +#pragma once +#include + + +at::Tensor ROIAlign_forward_cpu(const at::Tensor& input, + const at::Tensor& rois, + const float spatial_scale, + const int pooled_height, + const int pooled_width, + const int sampling_ratio); + + +at::Tensor nms_cpu(const at::Tensor& dets, + const at::Tensor& scores, + const float threshold); +at::Tensor +dcn_v2_cpu_forward(const at::Tensor &input, + const at::Tensor &weight, + const at::Tensor &bias, + const at::Tensor &offset, + const at::Tensor &mask, + const int kernel_h, + const int kernel_w, + const int stride_h, + const int stride_w, + const int pad_h, + const int pad_w, + const int dilation_h, + const int dilation_w, + const int deformable_group); + +std::vector +dcn_v2_cpu_backward(const at::Tensor &input, + const at::Tensor &weight, + const at::Tensor &bias, + const at::Tensor &offset, + const at::Tensor &mask, + const at::Tensor &grad_output, + int kernel_h, int kernel_w, + int stride_h, int stride_w, + int pad_h, int pad_w, + int dilation_h, int dilation_w, + int deformable_group); + + +std::tuple +dcn_v2_psroi_pooling_cpu_forward(const at::Tensor &input, + const at::Tensor &bbox, + const at::Tensor &trans, + const int no_trans, + const float spatial_scale, + const int output_dim, + const int group_size, + const int pooled_size, + const int part_size, + const int sample_per_part, + const float trans_std); + +std::tuple +dcn_v2_psroi_pooling_cpu_backward(const at::Tensor &out_grad, + const at::Tensor &input, + const at::Tensor &bbox, + const at::Tensor &trans, + const at::Tensor &top_count, + const int no_trans, + const float spatial_scale, + const int output_dim, + const int group_size, + const int pooled_size, + const int part_size, + const int sample_per_part, + const float trans_std); diff --git a/maskrcnn_benchmark/csrc/cuda/ROIAlign_cuda.cu b/maskrcnn_benchmark/csrc/cuda/ROIAlign_cuda.cu new file mode 100644 index 0000000000000000000000000000000000000000..1142fb37597141122ee63161d0abd7beac510a74 --- /dev/null +++ b/maskrcnn_benchmark/csrc/cuda/ROIAlign_cuda.cu @@ -0,0 +1,346 @@ +// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 
+#include +#include + +#include +#include +#include + +// TODO make it in a common file +#define CUDA_1D_KERNEL_LOOP(i, n) \ + for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n; \ + i += blockDim.x * gridDim.x) + + +template +__device__ T bilinear_interpolate(const T* bottom_data, + const int height, const int width, + T y, T x, + const int index /* index for debug only*/) { + + // deal with cases that inverse elements are out of feature map boundary + if (y < -1.0 || y > height || x < -1.0 || x > width) { + //empty + return 0; + } + + if (y <= 0) y = 0; + if (x <= 0) x = 0; + + int y_low = (int) y; + int x_low = (int) x; + int y_high; + int x_high; + + if (y_low >= height - 1) { + y_high = y_low = height - 1; + y = (T) y_low; + } else { + y_high = y_low + 1; + } + + if (x_low >= width - 1) { + x_high = x_low = width - 1; + x = (T) x_low; + } else { + x_high = x_low + 1; + } + + T ly = y - y_low; + T lx = x - x_low; + T hy = 1. - ly, hx = 1. - lx; + // do bilinear interpolation + T v1 = bottom_data[y_low * width + x_low]; + T v2 = bottom_data[y_low * width + x_high]; + T v3 = bottom_data[y_high * width + x_low]; + T v4 = bottom_data[y_high * width + x_high]; + T w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx; + + T val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4); + + return val; +} + +template +__global__ void RoIAlignForward(const int nthreads, const T* bottom_data, + const T spatial_scale, const int channels, + const int height, const int width, + const int pooled_height, const int pooled_width, + const int sampling_ratio, + const T* bottom_rois, T* top_data) { + CUDA_1D_KERNEL_LOOP(index, nthreads) { + // (n, c, ph, pw) is an element in the pooled output + int pw = index % pooled_width; + int ph = (index / pooled_width) % pooled_height; + int c = (index / pooled_width / pooled_height) % channels; + int n = index / pooled_width / pooled_height / channels; + + const T* offset_bottom_rois = bottom_rois + n * 5; + int roi_batch_ind = offset_bottom_rois[0]; + + // Do not using rounding; this implementation detail is critical + T roi_start_w = offset_bottom_rois[1] * spatial_scale; + T roi_start_h = offset_bottom_rois[2] * spatial_scale; + T roi_end_w = offset_bottom_rois[3] * spatial_scale; + T roi_end_h = offset_bottom_rois[4] * spatial_scale; + // T roi_start_w = round(offset_bottom_rois[1] * spatial_scale); + // T roi_start_h = round(offset_bottom_rois[2] * spatial_scale); + // T roi_end_w = round(offset_bottom_rois[3] * spatial_scale); + // T roi_end_h = round(offset_bottom_rois[4] * spatial_scale); + + // Force malformed ROIs to be 1x1 + T roi_width = max(roi_end_w - roi_start_w, (T)1.); + T roi_height = max(roi_end_h - roi_start_h, (T)1.); + T bin_size_h = static_cast(roi_height) / static_cast(pooled_height); + T bin_size_w = static_cast(roi_width) / static_cast(pooled_width); + + const T* offset_bottom_data = bottom_data + (roi_batch_ind * channels + c) * height * width; + + // We use roi_bin_grid to sample the grid and mimic integral + int roi_bin_grid_h = (sampling_ratio > 0) ? sampling_ratio : ceil(roi_height / pooled_height); // e.g., = 2 + int roi_bin_grid_w = (sampling_ratio > 0) ? sampling_ratio : ceil(roi_width / pooled_width); + + // We do average (integral) pooling inside a bin + const T count = roi_bin_grid_h * roi_bin_grid_w; // e.g. 
= 4 + + T output_val = 0.; + for (int iy = 0; iy < roi_bin_grid_h; iy ++) // e.g., iy = 0, 1 + { + const T y = roi_start_h + ph * bin_size_h + static_cast(iy + .5f) * bin_size_h / static_cast(roi_bin_grid_h); // e.g., 0.5, 1.5 + for (int ix = 0; ix < roi_bin_grid_w; ix ++) + { + const T x = roi_start_w + pw * bin_size_w + static_cast(ix + .5f) * bin_size_w / static_cast(roi_bin_grid_w); + + T val = bilinear_interpolate(offset_bottom_data, height, width, y, x, index); + output_val += val; + } + } + output_val /= count; + + top_data[index] = output_val; + } +} + + +template +__device__ void bilinear_interpolate_gradient( + const int height, const int width, + T y, T x, + T & w1, T & w2, T & w3, T & w4, + int & x_low, int & x_high, int & y_low, int & y_high, + const int index /* index for debug only*/) { + + // deal with cases that inverse elements are out of feature map boundary + if (y < -1.0 || y > height || x < -1.0 || x > width) { + //empty + w1 = w2 = w3 = w4 = 0.; + x_low = x_high = y_low = y_high = -1; + return; + } + + if (y <= 0) y = 0; + if (x <= 0) x = 0; + + y_low = (int) y; + x_low = (int) x; + + if (y_low >= height - 1) { + y_high = y_low = height - 1; + y = (T) y_low; + } else { + y_high = y_low + 1; + } + + if (x_low >= width - 1) { + x_high = x_low = width - 1; + x = (T) x_low; + } else { + x_high = x_low + 1; + } + + T ly = y - y_low; + T lx = x - x_low; + T hy = 1. - ly, hx = 1. - lx; + + // reference in forward + // T v1 = bottom_data[y_low * width + x_low]; + // T v2 = bottom_data[y_low * width + x_high]; + // T v3 = bottom_data[y_high * width + x_low]; + // T v4 = bottom_data[y_high * width + x_high]; + // T val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4); + + w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx; + + return; +} + +template +__global__ void RoIAlignBackwardFeature(const int nthreads, const T* top_diff, + const int num_rois, const T spatial_scale, + const int channels, const int height, const int width, + const int pooled_height, const int pooled_width, + const int sampling_ratio, + T* bottom_diff, + const T* bottom_rois) { + CUDA_1D_KERNEL_LOOP(index, nthreads) { + // (n, c, ph, pw) is an element in the pooled output + int pw = index % pooled_width; + int ph = (index / pooled_width) % pooled_height; + int c = (index / pooled_width / pooled_height) % channels; + int n = index / pooled_width / pooled_height / channels; + + const T* offset_bottom_rois = bottom_rois + n * 5; + int roi_batch_ind = offset_bottom_rois[0]; + + // Do not using rounding; this implementation detail is critical + T roi_start_w = offset_bottom_rois[1] * spatial_scale; + T roi_start_h = offset_bottom_rois[2] * spatial_scale; + T roi_end_w = offset_bottom_rois[3] * spatial_scale; + T roi_end_h = offset_bottom_rois[4] * spatial_scale; + // T roi_start_w = round(offset_bottom_rois[1] * spatial_scale); + // T roi_start_h = round(offset_bottom_rois[2] * spatial_scale); + // T roi_end_w = round(offset_bottom_rois[3] * spatial_scale); + // T roi_end_h = round(offset_bottom_rois[4] * spatial_scale); + + // Force malformed ROIs to be 1x1 + T roi_width = max(roi_end_w - roi_start_w, (T)1.); + T roi_height = max(roi_end_h - roi_start_h, (T)1.); + T bin_size_h = static_cast(roi_height) / static_cast(pooled_height); + T bin_size_w = static_cast(roi_width) / static_cast(pooled_width); + + T* offset_bottom_diff = bottom_diff + (roi_batch_ind * channels + c) * height * width; + + int top_offset = (n * channels + c) * pooled_height * pooled_width; + const T* offset_top_diff = top_diff + top_offset; + 
const T top_diff_this_bin = offset_top_diff[ph * pooled_width + pw]; + + // We use roi_bin_grid to sample the grid and mimic integral + int roi_bin_grid_h = (sampling_ratio > 0) ? sampling_ratio : ceil(roi_height / pooled_height); // e.g., = 2 + int roi_bin_grid_w = (sampling_ratio > 0) ? sampling_ratio : ceil(roi_width / pooled_width); + + // We do average (integral) pooling inside a bin + const T count = roi_bin_grid_h * roi_bin_grid_w; // e.g. = 4 + + for (int iy = 0; iy < roi_bin_grid_h; iy ++) // e.g., iy = 0, 1 + { + const T y = roi_start_h + ph * bin_size_h + static_cast(iy + .5f) * bin_size_h / static_cast(roi_bin_grid_h); // e.g., 0.5, 1.5 + for (int ix = 0; ix < roi_bin_grid_w; ix ++) + { + const T x = roi_start_w + pw * bin_size_w + static_cast(ix + .5f) * bin_size_w / static_cast(roi_bin_grid_w); + + T w1, w2, w3, w4; + int x_low, x_high, y_low, y_high; + + bilinear_interpolate_gradient(height, width, y, x, + w1, w2, w3, w4, + x_low, x_high, y_low, y_high, + index); + + T g1 = top_diff_this_bin * w1 / count; + T g2 = top_diff_this_bin * w2 / count; + T g3 = top_diff_this_bin * w3 / count; + T g4 = top_diff_this_bin * w4 / count; + + if (x_low >= 0 && x_high >= 0 && y_low >= 0 && y_high >= 0) + { + atomicAdd(offset_bottom_diff + y_low * width + x_low, static_cast(g1)); + atomicAdd(offset_bottom_diff + y_low * width + x_high, static_cast(g2)); + atomicAdd(offset_bottom_diff + y_high * width + x_low, static_cast(g3)); + atomicAdd(offset_bottom_diff + y_high * width + x_high, static_cast(g4)); + } // if + } // ix + } // iy + } // CUDA_1D_KERNEL_LOOP +} // RoIAlignBackward + + +at::Tensor ROIAlign_forward_cuda(const at::Tensor& input, + const at::Tensor& rois, + const float spatial_scale, + const int pooled_height, + const int pooled_width, + const int sampling_ratio) { + AT_ASSERTM(input.type().is_cuda(), "input must be a CUDA tensor"); + AT_ASSERTM(rois.type().is_cuda(), "rois must be a CUDA tensor"); + + auto num_rois = rois.size(0); + auto channels = input.size(1); + auto height = input.size(2); + auto width = input.size(3); + + auto output = at::empty({num_rois, channels, pooled_height, pooled_width}, input.options()); + auto output_size = num_rois * pooled_height * pooled_width * channels; + cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + + dim3 grid(std::min(THCCeilDiv((long)output_size, 512L), 4096L)); + dim3 block(512); + + if (output.numel() == 0) { + THCudaCheck(cudaGetLastError()); + return output; + } + + AT_DISPATCH_FLOATING_TYPES(input.type(), "ROIAlign_forward", [&] { + RoIAlignForward<<>>( + output_size, + input.contiguous().data(), + spatial_scale, + channels, + height, + width, + pooled_height, + pooled_width, + sampling_ratio, + rois.contiguous().data(), + output.data()); + }); + THCudaCheck(cudaGetLastError()); + return output; +} + +// TODO remove the dependency on input and use instead its sizes -> save memory +at::Tensor ROIAlign_backward_cuda(const at::Tensor& grad, + const at::Tensor& rois, + const float spatial_scale, + const int pooled_height, + const int pooled_width, + const int batch_size, + const int channels, + const int height, + const int width, + const int sampling_ratio) { + AT_ASSERTM(grad.type().is_cuda(), "grad must be a CUDA tensor"); + AT_ASSERTM(rois.type().is_cuda(), "rois must be a CUDA tensor"); + + auto num_rois = rois.size(0); + auto grad_input = at::zeros({batch_size, channels, height, width}, grad.options()); + + cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + + dim3 grid(std::min(THCCeilDiv((long)grad.numel(), 
512L), 4096L)); + dim3 block(512); + + // handle possibly empty gradients + if (grad.numel() == 0) { + THCudaCheck(cudaGetLastError()); + return grad_input; + } + + AT_DISPATCH_FLOATING_TYPES(grad.type(), "ROIAlign_backward", [&] { + RoIAlignBackwardFeature<<>>( + grad.numel(), + grad.contiguous().data(), + num_rois, + spatial_scale, + channels, + height, + width, + pooled_height, + pooled_width, + sampling_ratio, + grad_input.data(), + rois.contiguous().data()); + }); + THCudaCheck(cudaGetLastError()); + return grad_input; +} diff --git a/maskrcnn_benchmark/csrc/cuda/ROIPool_cuda.cu b/maskrcnn_benchmark/csrc/cuda/ROIPool_cuda.cu new file mode 100644 index 0000000000000000000000000000000000000000..8f072ffc2bd6de310f0d92c8c513dd9cfcc80dbc --- /dev/null +++ b/maskrcnn_benchmark/csrc/cuda/ROIPool_cuda.cu @@ -0,0 +1,202 @@ +// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +#include +#include + +#include +#include +#include + + +// TODO make it in a common file +#define CUDA_1D_KERNEL_LOOP(i, n) \ + for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n; \ + i += blockDim.x * gridDim.x) + + +template +__global__ void RoIPoolFForward(const int nthreads, const T* bottom_data, + const T spatial_scale, const int channels, const int height, + const int width, const int pooled_height, const int pooled_width, + const T* bottom_rois, T* top_data, int* argmax_data) { + CUDA_1D_KERNEL_LOOP(index, nthreads) { + // (n, c, ph, pw) is an element in the pooled output + int pw = index % pooled_width; + int ph = (index / pooled_width) % pooled_height; + int c = (index / pooled_width / pooled_height) % channels; + int n = index / pooled_width / pooled_height / channels; + + const T* offset_bottom_rois = bottom_rois + n * 5; + int roi_batch_ind = offset_bottom_rois[0]; + int roi_start_w = round(offset_bottom_rois[1] * spatial_scale); + int roi_start_h = round(offset_bottom_rois[2] * spatial_scale); + int roi_end_w = round(offset_bottom_rois[3] * spatial_scale); + int roi_end_h = round(offset_bottom_rois[4] * spatial_scale); + + // Force malformed ROIs to be 1x1 + int roi_width = max(roi_end_w - roi_start_w + 1, 1); + int roi_height = max(roi_end_h - roi_start_h + 1, 1); + T bin_size_h = static_cast(roi_height) + / static_cast(pooled_height); + T bin_size_w = static_cast(roi_width) + / static_cast(pooled_width); + + int hstart = static_cast(floor(static_cast(ph) + * bin_size_h)); + int wstart = static_cast(floor(static_cast(pw) + * bin_size_w)); + int hend = static_cast(ceil(static_cast(ph + 1) + * bin_size_h)); + int wend = static_cast(ceil(static_cast(pw + 1) + * bin_size_w)); + + // Add roi offsets and clip to input boundaries + hstart = min(max(hstart + roi_start_h, 0), height); + hend = min(max(hend + roi_start_h, 0), height); + wstart = min(max(wstart + roi_start_w, 0), width); + wend = min(max(wend + roi_start_w, 0), width); + bool is_empty = (hend <= hstart) || (wend <= wstart); + + // Define an empty pooling region to be zero + T maxval = is_empty ? 
0 : -FLT_MAX; + // If nothing is pooled, argmax = -1 causes nothing to be backprop'd + int maxidx = -1; + const T* offset_bottom_data = + bottom_data + (roi_batch_ind * channels + c) * height * width; + for (int h = hstart; h < hend; ++h) { + for (int w = wstart; w < wend; ++w) { + int bottom_index = h * width + w; + if (offset_bottom_data[bottom_index] > maxval) { + maxval = offset_bottom_data[bottom_index]; + maxidx = bottom_index; + } + } + } + top_data[index] = maxval; + argmax_data[index] = maxidx; + } +} + +template +__global__ void RoIPoolFBackward(const int nthreads, const T* top_diff, + const int* argmax_data, const int num_rois, const T spatial_scale, + const int channels, const int height, const int width, + const int pooled_height, const int pooled_width, T* bottom_diff, + const T* bottom_rois) { + CUDA_1D_KERNEL_LOOP(index, nthreads) { + // (n, c, ph, pw) is an element in the pooled output + int pw = index % pooled_width; + int ph = (index / pooled_width) % pooled_height; + int c = (index / pooled_width / pooled_height) % channels; + int n = index / pooled_width / pooled_height / channels; + + const T* offset_bottom_rois = bottom_rois + n * 5; + int roi_batch_ind = offset_bottom_rois[0]; + int bottom_offset = (roi_batch_ind * channels + c) * height * width; + int top_offset = (n * channels + c) * pooled_height * pooled_width; + const T* offset_top_diff = top_diff + top_offset; + T* offset_bottom_diff = bottom_diff + bottom_offset; + const int* offset_argmax_data = argmax_data + top_offset; + + int argmax = offset_argmax_data[ph * pooled_width + pw]; + if (argmax != -1) { + atomicAdd( + offset_bottom_diff + argmax, + static_cast(offset_top_diff[ph * pooled_width + pw])); + + } + } +} + +std::tuple ROIPool_forward_cuda(const at::Tensor& input, + const at::Tensor& rois, + const float spatial_scale, + const int pooled_height, + const int pooled_width) { + AT_ASSERTM(input.type().is_cuda(), "input must be a CUDA tensor"); + AT_ASSERTM(rois.type().is_cuda(), "rois must be a CUDA tensor"); + + auto num_rois = rois.size(0); + auto channels = input.size(1); + auto height = input.size(2); + auto width = input.size(3); + + auto output = at::empty({num_rois, channels, pooled_height, pooled_width}, input.options()); + auto output_size = num_rois * pooled_height * pooled_width * channels; + auto argmax = at::zeros({num_rois, channels, pooled_height, pooled_width}, input.options().dtype(at::kInt)); + + cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + + dim3 grid(std::min(THCCeilDiv((long)output_size, 512L), 4096L)); + dim3 block(512); + + if (output.numel() == 0) { + THCudaCheck(cudaGetLastError()); + return std::make_tuple(output, argmax); + } + + AT_DISPATCH_FLOATING_TYPES(input.type(), "ROIPool_forward", [&] { + RoIPoolFForward<<>>( + output_size, + input.contiguous().data(), + spatial_scale, + channels, + height, + width, + pooled_height, + pooled_width, + rois.contiguous().data(), + output.data(), + argmax.data()); + }); + THCudaCheck(cudaGetLastError()); + return std::make_tuple(output, argmax); +} + +// TODO remove the dependency on input and use instead its sizes -> save memory +at::Tensor ROIPool_backward_cuda(const at::Tensor& grad, + const at::Tensor& input, + const at::Tensor& rois, + const at::Tensor& argmax, + const float spatial_scale, + const int pooled_height, + const int pooled_width, + const int batch_size, + const int channels, + const int height, + const int width) { + AT_ASSERTM(grad.type().is_cuda(), "grad must be a CUDA tensor"); + 
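+  // Editor's note (added comment): unlike RoIAlign, RoIPool's forward pass stored an
+  // argmax index per output cell, so RoIPoolFBackward above routes each upstream gradient
+  // to exactly one input location with a single atomicAdd; argmax == -1 (an empty bin)
+  // propagates nothing.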
AT_ASSERTM(rois.type().is_cuda(), "rois must be a CUDA tensor"); + // TODO add more checks + + auto num_rois = rois.size(0); + auto grad_input = at::zeros({batch_size, channels, height, width}, grad.options()); + + cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + + dim3 grid(std::min(THCCeilDiv((long)grad.numel(), 512L), 4096L)); + dim3 block(512); + + // handle possibly empty gradients + if (grad.numel() == 0) { + THCudaCheck(cudaGetLastError()); + return grad_input; + } + + AT_DISPATCH_FLOATING_TYPES(grad.type(), "ROIPool_backward", [&] { + RoIPoolFBackward<<>>( + grad.numel(), + grad.contiguous().data(), + argmax.data(), + num_rois, + spatial_scale, + channels, + height, + width, + pooled_height, + pooled_width, + grad_input.data(), + rois.contiguous().data()); + }); + THCudaCheck(cudaGetLastError()); + return grad_input; +} diff --git a/maskrcnn_benchmark/csrc/cuda/SigmoidFocalLoss_cuda.cu b/maskrcnn_benchmark/csrc/cuda/SigmoidFocalLoss_cuda.cu new file mode 100644 index 0000000000000000000000000000000000000000..7d40767bbb690eb8e55397bca83af636c7e0531c --- /dev/null +++ b/maskrcnn_benchmark/csrc/cuda/SigmoidFocalLoss_cuda.cu @@ -0,0 +1,188 @@ +// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +// This file is modified from https://github.com/pytorch/pytorch/blob/master/modules/detectron/sigmoid_focal_loss_op.cu +// Cheng-Yang Fu +// cyfu@cs.unc.edu +#include +#include + +#include +#include +#include + +#include + +// TODO make it in a common file +#define CUDA_1D_KERNEL_LOOP(i, n) \ + for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n; \ + i += blockDim.x * gridDim.x) + + +template +__global__ void SigmoidFocalLossForward(const int nthreads, + const T* logits, + const int* targets, + const int num_classes, + const float gamma, + const float alpha, + const int num, + T* losses) { + CUDA_1D_KERNEL_LOOP(i, nthreads) { + + int n = i / num_classes; + int d = i % num_classes; // current class[0~79]; + int t = targets[n]; // target class [1~80]; + + // Decide it is positive or negative case. + T c1 = (t == (d+1)); + T c2 = (t>=0 & t != (d+1)); + + T zn = (1.0 - alpha); + T zp = (alpha); + + // p = 1. / 1. + expf(-x); p = sigmoid(x) + T p = 1. / (1. + expf(-logits[i])); + + // (1-p)**gamma * log(p) where + T term1 = powf((1. - p), gamma) * logf(max(p, FLT_MIN)); + + // p**gamma * log(1-p) + T term2 = powf(p, gamma) * + (-1. * logits[i] * (logits[i] >= 0) - + logf(1. + expf(logits[i] - 2. * logits[i] * (logits[i] >= 0)))); + + losses[i] = 0.0; + losses[i] += -c1 * term1 * zp; + losses[i] += -c2 * term2 * zn; + + } // CUDA_1D_KERNEL_LOOP +} // SigmoidFocalLossForward + + +template +__global__ void SigmoidFocalLossBackward(const int nthreads, + const T* logits, + const int* targets, + const T* d_losses, + const int num_classes, + const float gamma, + const float alpha, + const int num, + T* d_logits) { + CUDA_1D_KERNEL_LOOP(i, nthreads) { + + int n = i / num_classes; + int d = i % num_classes; // current class[0~79]; + int t = targets[n]; // target class [1~80], 0 is background; + + // Decide it is positive or negative case. + T c1 = (t == (d+1)); + T c2 = (t>=0 & t != (d+1)); + + T zn = (1.0 - alpha); + T zp = (alpha); + // p = 1. / 1. + expf(-x); p = sigmoid(x) + T p = 1. / (1. + expf(-logits[i])); + + // (1-p)**g * (1 - p - g*p*log(p) + T term1 = powf((1. - p), gamma) * + (1. - p - (p * gamma * logf(max(p, FLT_MIN)))); + + // (p**g) * (g*(1-p)*log(1-p) - p) + T term2 = powf(p, gamma) * + ((-1. * logits[i] * (logits[i] >= 0) - + logf(1. 
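+    // Editor's note (added comment): together with the -x * (x >= 0) term above, the
+    // logf(1 + expf(x - 2 * x * (x >= 0))) factor is the numerically stable form of
+    // log(1 - p) for p = sigmoid(x): it equals -max(x, 0) - log(1 + exp(-|x|)) and avoids
+    // overflow in expf for large |x|.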
+ expf(logits[i] - 2. * logits[i] * (logits[i] >= 0)))) * + (1. - p) * gamma - p); + d_logits[i] = 0.0; + d_logits[i] += -c1 * term1 * zp; + d_logits[i] += -c2 * term2 * zn; + d_logits[i] = d_logits[i] * d_losses[i]; + + } // CUDA_1D_KERNEL_LOOP +} // SigmoidFocalLossBackward + + +at::Tensor SigmoidFocalLoss_forward_cuda( + const at::Tensor& logits, + const at::Tensor& targets, + const int num_classes, + const float gamma, + const float alpha) { + AT_ASSERTM(logits.type().is_cuda(), "logits must be a CUDA tensor"); + AT_ASSERTM(targets.type().is_cuda(), "targets must be a CUDA tensor"); + AT_ASSERTM(logits.dim() == 2, "logits should be NxClass"); + + const int num_samples = logits.size(0); + + auto losses = at::empty({num_samples, logits.size(1)}, logits.options()); + auto losses_size = num_samples * logits.size(1); + cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + + dim3 grid(std::min(THCCeilDiv(losses_size, 512L), 4096L)); + dim3 block(512); + + if (losses.numel() == 0) { + THCudaCheck(cudaGetLastError()); + return losses; + } + + AT_DISPATCH_FLOATING_TYPES(logits.type(), "SigmoidFocalLoss_forward", [&] { + SigmoidFocalLossForward<<>>( + losses_size, + logits.contiguous().data(), + targets.contiguous().data(), + num_classes, + gamma, + alpha, + num_samples, + losses.data()); + }); + THCudaCheck(cudaGetLastError()); + return losses; +} + + +at::Tensor SigmoidFocalLoss_backward_cuda( + const at::Tensor& logits, + const at::Tensor& targets, + const at::Tensor& d_losses, + const int num_classes, + const float gamma, + const float alpha) { + AT_ASSERTM(logits.type().is_cuda(), "logits must be a CUDA tensor"); + AT_ASSERTM(targets.type().is_cuda(), "targets must be a CUDA tensor"); + AT_ASSERTM(d_losses.type().is_cuda(), "d_losses must be a CUDA tensor"); + + AT_ASSERTM(logits.dim() == 2, "logits should be NxClass"); + + const int num_samples = logits.size(0); + AT_ASSERTM(logits.size(1) == num_classes, "logits.size(1) should be num_classes"); + + auto d_logits = at::zeros({num_samples, num_classes}, logits.options()); + auto d_logits_size = num_samples * logits.size(1); + cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + + dim3 grid(std::min(THCCeilDiv(d_logits_size, 512L), 4096L)); + dim3 block(512); + + if (d_logits.numel() == 0) { + THCudaCheck(cudaGetLastError()); + return d_logits; + } + + AT_DISPATCH_FLOATING_TYPES(logits.type(), "SigmoidFocalLoss_backward", [&] { + SigmoidFocalLossBackward<<>>( + d_logits_size, + logits.contiguous().data(), + targets.contiguous().data(), + d_losses.contiguous().data(), + num_classes, + gamma, + alpha, + num_samples, + d_logits.data()); + }); + + THCudaCheck(cudaGetLastError()); + return d_logits; +} + diff --git a/maskrcnn_benchmark/csrc/cuda/dcn_v2_cuda.cu b/maskrcnn_benchmark/csrc/cuda/dcn_v2_cuda.cu new file mode 100644 index 0000000000000000000000000000000000000000..767ed8fb12b6218223b8331342de91b0f13ea3d4 --- /dev/null +++ b/maskrcnn_benchmark/csrc/cuda/dcn_v2_cuda.cu @@ -0,0 +1,335 @@ +#include +#include "cuda/dcn_v2_im2col_cuda.h" + +#include +#include + +#include +#include +#include + +extern THCState *state; + +// author: Charles Shang +// https://github.com/torch/cunn/blob/master/lib/THCUNN/generic/SpatialConvolutionMM.cu + +// [batch gemm] +// https://github.com/pytorch/pytorch/blob/master/aten/src/THC/generic/THCTensorMathBlas.cu + +__global__ void createBatchGemmBuffer(const float **input_b, float **output_b, + float **columns_b, const float **ones_b, + const float **weight_b, const float **bias_b, + float *input, float 
*output, + float *columns, float *ones, + float *weight, float *bias, + const int input_stride, const int output_stride, + const int columns_stride, const int ones_stride, + const int num_batches) +{ + const int idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx < num_batches) + { + input_b[idx] = input + idx * input_stride; + output_b[idx] = output + idx * output_stride; + columns_b[idx] = columns + idx * columns_stride; + ones_b[idx] = ones + idx * ones_stride; + // share weights and bias within a Mini-Batch + weight_b[idx] = weight; + bias_b[idx] = bias; + } +} + +at::Tensor +dcn_v2_cuda_forward(const at::Tensor &input, + const at::Tensor &weight, + const at::Tensor &bias, + const at::Tensor &offset, + const at::Tensor &mask, + const int kernel_h, + const int kernel_w, + const int stride_h, + const int stride_w, + const int pad_h, + const int pad_w, + const int dilation_h, + const int dilation_w, + const int deformable_group) +{ + using scalar_t = float; + // THCAssertSameGPU(THCudaTensor_checkGPU(state, 5, input, weight, bias, offset, mask)); + AT_ASSERTM(input.type().is_cuda(), "input must be a CUDA tensor"); + AT_ASSERTM(weight.type().is_cuda(), "weight must be a CUDA tensor"); + AT_ASSERTM(bias.type().is_cuda(), "bias must be a CUDA tensor"); + AT_ASSERTM(offset.type().is_cuda(), "offset must be a CUDA tensor"); + AT_ASSERTM(mask.type().is_cuda(), "mask must be a CUDA tensor"); + + const int batch = input.size(0); + const int channels = input.size(1); + const int height = input.size(2); + const int width = input.size(3); + + const int channels_out = weight.size(0); + const int channels_kernel = weight.size(1); + const int kernel_h_ = weight.size(2); + const int kernel_w_ = weight.size(3); + + // printf("Kernels: %d %d %d %d\n", kernel_h_, kernel_w_, kernel_w, kernel_h); + // printf("Channels: %d %d\n", channels, channels_kernel); + // printf("Channels: %d %d\n", channels_out, channels_kernel); + + AT_ASSERTM(kernel_h_ == kernel_h && kernel_w_ == kernel_w, + "Input shape and kernel shape wont match: (%d x %d vs %d x %d).", kernel_h_, kernel_w, kernel_h_, kernel_w_); + + AT_ASSERTM(channels == channels_kernel, + "Input shape and kernel channels wont match: (%d vs %d).", channels, channels_kernel); + + const int height_out = (height + 2 * pad_h - (dilation_h * (kernel_h - 1) + 1)) / stride_h + 1; + const int width_out = (width + 2 * pad_w - (dilation_w * (kernel_w - 1) + 1)) / stride_w + 1; + + auto ones = at::ones({batch, height_out, width_out}, input.options()); + auto columns = at::empty({batch, channels * kernel_h * kernel_w, 1 * height_out * width_out}, input.options()); + auto output = at::empty({batch, channels_out, height_out, width_out}, input.options()); + + // prepare for batch-wise computing, which is significantly faster than instance-wise computing + // when batch size is large. 
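+    // Editor's note, a sketch of the batching trick (names follow the code below): one
+    // device pointer array per operand is filled so that entry b points at sample b's slice, e.g.
+    //   input_b[b]  = input  + b * channels * height * width;
+    //   output_b[b] = output + b * channels_out * height_out * width_out;
+    // while weight_b[b] and bias_b[b] all alias the shared weight and bias. A first
+    // THCudaBlas_SgemmBatched (ones * bias, beta = 0) broadcasts the bias into output, and
+    // a second batched GEMM over the im2col columns accumulates weight * columns on top of
+    // it with beta = 1.0f, replacing a per-sample GEMM loop.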
+ // launch batch threads + int matrices_size = batch * sizeof(float *); + auto input_b = static_cast(THCudaMalloc(state, matrices_size)); + auto output_b = static_cast(THCudaMalloc(state, matrices_size)); + auto columns_b = static_cast(THCudaMalloc(state, matrices_size)); + auto ones_b = static_cast(THCudaMalloc(state, matrices_size)); + auto weight_b = static_cast(THCudaMalloc(state, matrices_size)); + auto bias_b = static_cast(THCudaMalloc(state, matrices_size)); + + const int block = 128; + const int grid = (batch + block - 1) / block; + + createBatchGemmBuffer<<>>( + input_b, output_b, + columns_b, ones_b, + weight_b, bias_b, + input.data(), + output.data(), + columns.data(), + ones.data(), + weight.data(), + bias.data(), + channels * width * height, + channels_out * width_out * height_out, + channels * kernel_h * kernel_w * height_out * width_out, + height_out * width_out, + batch); + + long m_ = channels_out; + long n_ = height_out * width_out; + long k_ = 1; + THCudaBlas_SgemmBatched(state, + 't', + 'n', + n_, + m_, + k_, + 1.0f, + ones_b, k_, + bias_b, k_, + 0.0f, + output_b, n_, + batch); + + modulated_deformable_im2col_cuda(THCState_getCurrentStream(state), + input.data(), + offset.data(), + mask.data(), + batch, channels, height, width, + height_out, width_out, kernel_h, kernel_w, + pad_h, pad_w, stride_h, stride_w, dilation_h, dilation_w, + deformable_group, + columns.data()); + + long m = channels_out; + long n = height_out * width_out; + long k = channels * kernel_h * kernel_w; + THCudaBlas_SgemmBatched(state, + 'n', + 'n', + n, + m, + k, + 1.0f, + (const float **)columns_b, n, + weight_b, k, + 1.0f, + output_b, n, + batch); + + THCudaFree(state, input_b); + THCudaFree(state, output_b); + THCudaFree(state, columns_b); + THCudaFree(state, ones_b); + THCudaFree(state, weight_b); + THCudaFree(state, bias_b); + return output; +} + +__global__ void createBatchGemmBufferBackward( + float **grad_output_b, + float **columns_b, + float **ones_b, + float **weight_b, + float **grad_weight_b, + float **grad_bias_b, + float *grad_output, + float *columns, + float *ones, + float *weight, + float *grad_weight, + float *grad_bias, + const int grad_output_stride, + const int columns_stride, + const int ones_stride, + const int num_batches) +{ + const int idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx < num_batches) + { + grad_output_b[idx] = grad_output + idx * grad_output_stride; + columns_b[idx] = columns + idx * columns_stride; + ones_b[idx] = ones + idx * ones_stride; + + // share weights and bias within a Mini-Batch + weight_b[idx] = weight; + grad_weight_b[idx] = grad_weight; + grad_bias_b[idx] = grad_bias; + } +} + +std::vector dcn_v2_cuda_backward(const at::Tensor &input, + const at::Tensor &weight, + const at::Tensor &bias, + const at::Tensor &offset, + const at::Tensor &mask, + const at::Tensor &grad_output, + int kernel_h, int kernel_w, + int stride_h, int stride_w, + int pad_h, int pad_w, + int dilation_h, int dilation_w, + int deformable_group) +{ + + THArgCheck(input.is_contiguous(), 1, "input tensor has to be contiguous"); + THArgCheck(weight.is_contiguous(), 2, "weight tensor has to be contiguous"); + + AT_ASSERTM(input.type().is_cuda(), "input must be a CUDA tensor"); + AT_ASSERTM(weight.type().is_cuda(), "weight must be a CUDA tensor"); + AT_ASSERTM(bias.type().is_cuda(), "bias must be a CUDA tensor"); + AT_ASSERTM(offset.type().is_cuda(), "offset must be a CUDA tensor"); + AT_ASSERTM(mask.type().is_cuda(), "mask must be a CUDA tensor"); + + const int batch = 
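+    // Editor's note (added comment): the backward pass processes one sample at a time:
+    // grad_output is multiplied by the shared weight to recover column gradients,
+    // col2im_coord scatters those into grad_offset / grad_mask, col2im scatters them into
+    // grad_input, and an im2col of the input feeds the GEMM / GEMV that accumulate
+    // grad_weight and grad_bias across the batch.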
input.size(0); + const int channels = input.size(1); + const int height = input.size(2); + const int width = input.size(3); + + const int channels_out = weight.size(0); + const int channels_kernel = weight.size(1); + const int kernel_h_ = weight.size(2); + const int kernel_w_ = weight.size(3); + + AT_ASSERTM(kernel_h_ == kernel_h && kernel_w_ == kernel_w, + "Input shape and kernel shape wont match: (%d x %d vs %d x %d).", kernel_h_, kernel_w, kernel_h_, kernel_w_); + + AT_ASSERTM(channels == channels_kernel, + "Input shape and kernel channels wont match: (%d vs %d).", channels, channels_kernel); + + const int height_out = (height + 2 * pad_h - (dilation_h * (kernel_h - 1) + 1)) / stride_h + 1; + const int width_out = (width + 2 * pad_w - (dilation_w * (kernel_w - 1) + 1)) / stride_w + 1; + + auto ones = at::ones({height_out, width_out}, input.options()); + auto columns = at::empty({channels * kernel_h * kernel_w, 1 * height_out * width_out}, input.options()); + auto output = at::empty({batch, channels_out, height_out, width_out}, input.options()); + + auto grad_input = at::zeros_like(input); + auto grad_weight = at::zeros_like(weight); + auto grad_bias = at::zeros_like(bias); + auto grad_offset = at::zeros_like(offset); + auto grad_mask = at::zeros_like(mask); + + using scalar_t = float; + + for (int b = 0; b < batch; b++) + { + auto input_n = input.select(0, b); + auto offset_n = offset.select(0, b); + auto mask_n = mask.select(0, b); + auto grad_output_n = grad_output.select(0, b); + auto grad_input_n = grad_input.select(0, b); + auto grad_offset_n = grad_offset.select(0, b); + auto grad_mask_n = grad_mask.select(0, b); + + long m = channels * kernel_h * kernel_w; + long n = height_out * width_out; + long k = channels_out; + + THCudaBlas_Sgemm(state, 'n', 't', n, m, k, 1.0f, + grad_output_n.data(), n, + weight.data(), m, 0.0f, + columns.data(), n); + + // gradient w.r.t. input coordinate data + modulated_deformable_col2im_coord_cuda(THCState_getCurrentStream(state), + columns.data(), + input_n.data(), + offset_n.data(), + mask_n.data(), + 1, channels, height, width, + height_out, width_out, kernel_h, kernel_w, + pad_h, pad_w, stride_h, stride_w, + dilation_h, dilation_w, deformable_group, + grad_offset_n.data(), + grad_mask_n.data()); + // gradient w.r.t. input data + modulated_deformable_col2im_cuda(THCState_getCurrentStream(state), + columns.data(), + offset_n.data(), + mask_n.data(), + 1, channels, height, width, + height_out, width_out, kernel_h, kernel_w, + pad_h, pad_w, stride_h, stride_w, + dilation_h, dilation_w, deformable_group, + grad_input_n.data()); + + // gradient w.r.t. weight, dWeight should accumulate across the batch and group + modulated_deformable_im2col_cuda(THCState_getCurrentStream(state), + input_n.data(), + offset_n.data(), + mask_n.data(), + 1, channels, height, width, + height_out, width_out, kernel_h, kernel_w, + pad_h, pad_w, stride_h, stride_w, + dilation_h, dilation_w, deformable_group, + columns.data()); + + long m_ = channels_out; + long n_ = channels * kernel_h * kernel_w; + long k_ = height_out * width_out; + + THCudaBlas_Sgemm(state, 't', 'n', n_, m_, k_, 1.0f, + columns.data(), k_, + grad_output_n.data(), k_, 1.0f, + grad_weight.data(), n_); + + // gradient w.r.t. 
bias + // long m_ = channels_out; + // long k__ = height_out * width_out; + THCudaBlas_Sgemv(state, + 't', + k_, m_, 1.0f, + grad_output_n.data(), k_, + ones.data(), 1, 1.0f, + grad_bias.data(), 1); + } + + return { + grad_input, grad_offset, grad_mask, grad_weight, grad_bias + }; +} \ No newline at end of file diff --git a/maskrcnn_benchmark/csrc/cuda/dcn_v2_im2col_cuda.cu b/maskrcnn_benchmark/csrc/cuda/dcn_v2_im2col_cuda.cu new file mode 100644 index 0000000000000000000000000000000000000000..4183793ba0b9230d5c78af72b3050c070dad5268 --- /dev/null +++ b/maskrcnn_benchmark/csrc/cuda/dcn_v2_im2col_cuda.cu @@ -0,0 +1,402 @@ +#include "dcn_v2_im2col_cuda.h" +#include +#include +#include + +#include +#include + +#include +#include +#include + +#define CUDA_KERNEL_LOOP(i, n) \ + for (int i = blockIdx.x * blockDim.x + threadIdx.x; \ + i < (n); \ + i += blockDim.x * gridDim.x) + +const int CUDA_NUM_THREADS = 1024; +inline int GET_BLOCKS(const int N) +{ + return (N + CUDA_NUM_THREADS - 1) / CUDA_NUM_THREADS; +} + + +__device__ float dmcn_im2col_bilinear(const float *bottom_data, const int data_width, + const int height, const int width, float h, float w) +{ + int h_low = floor(h); + int w_low = floor(w); + int h_high = h_low + 1; + int w_high = w_low + 1; + + float lh = h - h_low; + float lw = w - w_low; + float hh = 1 - lh, hw = 1 - lw; + + float v1 = 0; + if (h_low >= 0 && w_low >= 0) + v1 = bottom_data[h_low * data_width + w_low]; + float v2 = 0; + if (h_low >= 0 && w_high <= width - 1) + v2 = bottom_data[h_low * data_width + w_high]; + float v3 = 0; + if (h_high <= height - 1 && w_low >= 0) + v3 = bottom_data[h_high * data_width + w_low]; + float v4 = 0; + if (h_high <= height - 1 && w_high <= width - 1) + v4 = bottom_data[h_high * data_width + w_high]; + + float w1 = hh * hw, w2 = hh * lw, w3 = lh * hw, w4 = lh * lw; + + float val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4); + return val; +} + +__device__ float dmcn_get_gradient_weight(float argmax_h, float argmax_w, + const int h, const int w, const int height, const int width) +{ + if (argmax_h <= -1 || argmax_h >= height || argmax_w <= -1 || argmax_w >= width) + { + //empty + return 0; + } + + int argmax_h_low = floor(argmax_h); + int argmax_w_low = floor(argmax_w); + int argmax_h_high = argmax_h_low + 1; + int argmax_w_high = argmax_w_low + 1; + + float weight = 0; + if (h == argmax_h_low && w == argmax_w_low) + weight = (h + 1 - argmax_h) * (w + 1 - argmax_w); + if (h == argmax_h_low && w == argmax_w_high) + weight = (h + 1 - argmax_h) * (argmax_w + 1 - w); + if (h == argmax_h_high && w == argmax_w_low) + weight = (argmax_h + 1 - h) * (w + 1 - argmax_w); + if (h == argmax_h_high && w == argmax_w_high) + weight = (argmax_h + 1 - h) * (argmax_w + 1 - w); + return weight; +} + +__device__ float dmcn_get_coordinate_weight(float argmax_h, float argmax_w, + const int height, const int width, const float *im_data, + const int data_width, const int bp_dir) +{ + if (argmax_h <= -1 || argmax_h >= height || argmax_w <= -1 || argmax_w >= width) + { + //empty + return 0; + } + + int argmax_h_low = floor(argmax_h); + int argmax_w_low = floor(argmax_w); + int argmax_h_high = argmax_h_low + 1; + int argmax_w_high = argmax_w_low + 1; + + float weight = 0; + + if (bp_dir == 0) + { + if (argmax_h_low >= 0 && argmax_w_low >= 0) + weight += -1 * (argmax_w_low + 1 - argmax_w) * im_data[argmax_h_low * data_width + argmax_w_low]; + if (argmax_h_low >= 0 && argmax_w_high <= width - 1) + weight += -1 * (argmax_w - argmax_w_low) * im_data[argmax_h_low * data_width + 
argmax_w_high]; + if (argmax_h_high <= height - 1 && argmax_w_low >= 0) + weight += (argmax_w_low + 1 - argmax_w) * im_data[argmax_h_high * data_width + argmax_w_low]; + if (argmax_h_high <= height - 1 && argmax_w_high <= width - 1) + weight += (argmax_w - argmax_w_low) * im_data[argmax_h_high * data_width + argmax_w_high]; + } + else if (bp_dir == 1) + { + if (argmax_h_low >= 0 && argmax_w_low >= 0) + weight += -1 * (argmax_h_low + 1 - argmax_h) * im_data[argmax_h_low * data_width + argmax_w_low]; + if (argmax_h_low >= 0 && argmax_w_high <= width - 1) + weight += (argmax_h_low + 1 - argmax_h) * im_data[argmax_h_low * data_width + argmax_w_high]; + if (argmax_h_high <= height - 1 && argmax_w_low >= 0) + weight += -1 * (argmax_h - argmax_h_low) * im_data[argmax_h_high * data_width + argmax_w_low]; + if (argmax_h_high <= height - 1 && argmax_w_high <= width - 1) + weight += (argmax_h - argmax_h_low) * im_data[argmax_h_high * data_width + argmax_w_high]; + } + + return weight; +} + +__global__ void modulated_deformable_im2col_gpu_kernel(const int n, + const float *data_im, const float *data_offset, const float *data_mask, + const int height, const int width, const int kernel_h, const int kernel_w, + const int pad_h, const int pad_w, + const int stride_h, const int stride_w, + const int dilation_h, const int dilation_w, + const int channel_per_deformable_group, + const int batch_size, const int num_channels, const int deformable_group, + const int height_col, const int width_col, + float *data_col) +{ + // launch channels * batch_size * height_col * width_col cores + CUDA_KERNEL_LOOP(index, n) + { + // NOTE(CharlesShang): different from Dai Jifeng's MXNet implementation, col_buffer is of shape (c*kw*kh, N, oh, ow) + // here columns is of shape (N, c*kw*kh, oh * ow), need to adapt axis + + // index index of output matrix + const int w_col = index % width_col; + const int h_col = (index / width_col) % height_col; + // const int b_col = (index / width_col / height_col) % batch_size; + const int b_col = (index / width_col / height_col / num_channels) % batch_size; + // const int c_im = (index / width_col / height_col) / batch_size; + const int c_im = (index / width_col / height_col) % num_channels; + // const int c_col = c_im * kernel_h * kernel_w; + const int c_col = c_im * kernel_h * kernel_w; + + // compute deformable group index + const int deformable_group_index = c_im / channel_per_deformable_group; + + const int h_in = h_col * stride_h - pad_h; + const int w_in = w_col * stride_w - pad_w; + + // float *data_col_ptr = data_col + ((c_col * batch_size + b_col) * height_col + h_col) * width_col + w_col; + float *data_col_ptr = data_col + ((b_col * num_channels * kernel_w * kernel_h + c_col) * height_col + h_col) * width_col + w_col; + //const float* data_im_ptr = data_im + ((b_col * num_channels + c_im) * height + h_in) * width + w_in; + const float *data_im_ptr = data_im + (b_col * num_channels + c_im) * height * width; + const float *data_offset_ptr = data_offset + (b_col * deformable_group + deformable_group_index) * 2 * kernel_h * kernel_w * height_col * width_col; + + const float *data_mask_ptr = data_mask + (b_col * deformable_group + deformable_group_index) * kernel_h * kernel_w * height_col * width_col; + + for (int i = 0; i < kernel_h; ++i) + { + for (int j = 0; j < kernel_w; ++j) + { + const int data_offset_h_ptr = ((2 * (i * kernel_w + j)) * height_col + h_col) * width_col + w_col; + const int data_offset_w_ptr = ((2 * (i * kernel_w + j) + 1) * height_col + h_col) * width_col + 
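+        // Editor's note (added comment): this inner loop is the core of modulated deformable
+        // convolution: each kernel tap (i, j) reads a learned (offset_h, offset_w) pair and a
+        // scalar mask, samples the input bilinearly at the fractionally shifted location, and
+        // writes mask * value into the column buffer later consumed by the batched GEMM in
+        // dcn_v2_cuda_forward.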
w_col; + const int data_mask_hw_ptr = ((i * kernel_w + j) * height_col + h_col) * width_col + w_col; + const float offset_h = data_offset_ptr[data_offset_h_ptr]; + const float offset_w = data_offset_ptr[data_offset_w_ptr]; + const float mask = data_mask_ptr[data_mask_hw_ptr]; + float val = static_cast(0); + const float h_im = h_in + i * dilation_h + offset_h; + const float w_im = w_in + j * dilation_w + offset_w; + //if (h_im >= 0 && w_im >= 0 && h_im < height && w_im < width) { + if (h_im > -1 && w_im > -1 && h_im < height && w_im < width) + { + //const float map_h = i * dilation_h + offset_h; + //const float map_w = j * dilation_w + offset_w; + //const int cur_height = height - h_in; + //const int cur_width = width - w_in; + //val = dmcn_im2col_bilinear(data_im_ptr, width, cur_height, cur_width, map_h, map_w); + val = dmcn_im2col_bilinear(data_im_ptr, width, height, width, h_im, w_im); + } + *data_col_ptr = val * mask; + // data_col_ptr += batch_size * height_col * width_col; + data_col_ptr += height_col * width_col; + } + } + } +} + +__global__ void modulated_deformable_col2im_gpu_kernel(const int n, + const float *data_col, const float *data_offset, const float *data_mask, + const int channels, const int height, const int width, + const int kernel_h, const int kernel_w, + const int pad_h, const int pad_w, + const int stride_h, const int stride_w, + const int dilation_h, const int dilation_w, + const int channel_per_deformable_group, + const int batch_size, const int deformable_group, + const int height_col, const int width_col, + float *grad_im) +{ + CUDA_KERNEL_LOOP(index, n) + { + const int j = (index / width_col / height_col / batch_size) % kernel_w; + const int i = (index / width_col / height_col / batch_size / kernel_w) % kernel_h; + const int c = index / width_col / height_col / batch_size / kernel_w / kernel_h; + // compute the start and end of the output + + const int deformable_group_index = c / channel_per_deformable_group; + + int w_out = index % width_col; + int h_out = (index / width_col) % height_col; + int b = (index / width_col / height_col) % batch_size; + int w_in = w_out * stride_w - pad_w; + int h_in = h_out * stride_h - pad_h; + + const float *data_offset_ptr = data_offset + (b * deformable_group + deformable_group_index) * 2 * kernel_h * kernel_w * height_col * width_col; + const float *data_mask_ptr = data_mask + (b * deformable_group + deformable_group_index) * kernel_h * kernel_w * height_col * width_col; + const int data_offset_h_ptr = ((2 * (i * kernel_w + j)) * height_col + h_out) * width_col + w_out; + const int data_offset_w_ptr = ((2 * (i * kernel_w + j) + 1) * height_col + h_out) * width_col + w_out; + const int data_mask_hw_ptr = ((i * kernel_w + j) * height_col + h_out) * width_col + w_out; + const float offset_h = data_offset_ptr[data_offset_h_ptr]; + const float offset_w = data_offset_ptr[data_offset_w_ptr]; + const float mask = data_mask_ptr[data_mask_hw_ptr]; + const float cur_inv_h_data = h_in + i * dilation_h + offset_h; + const float cur_inv_w_data = w_in + j * dilation_w + offset_w; + + const float cur_top_grad = data_col[index] * mask; + const int cur_h = (int)cur_inv_h_data; + const int cur_w = (int)cur_inv_w_data; + for (int dy = -2; dy <= 2; dy++) + { + for (int dx = -2; dx <= 2; dx++) + { + if (cur_h + dy >= 0 && cur_h + dy < height && + cur_w + dx >= 0 && cur_w + dx < width && + abs(cur_inv_h_data - (cur_h + dy)) < 1 && + abs(cur_inv_w_data - (cur_w + dx)) < 1) + { + int cur_bottom_grad_pos = ((b * channels + c) * height + cur_h + dy) * 
width + cur_w + dx; + float weight = dmcn_get_gradient_weight(cur_inv_h_data, cur_inv_w_data, cur_h + dy, cur_w + dx, height, width); + atomicAdd(grad_im + cur_bottom_grad_pos, weight * cur_top_grad); + } + } + } + } +} + +__global__ void modulated_deformable_col2im_coord_gpu_kernel(const int n, + const float *data_col, const float *data_im, + const float *data_offset, const float *data_mask, + const int channels, const int height, const int width, + const int kernel_h, const int kernel_w, + const int pad_h, const int pad_w, + const int stride_h, const int stride_w, + const int dilation_h, const int dilation_w, + const int channel_per_deformable_group, + const int batch_size, const int offset_channels, const int deformable_group, + const int height_col, const int width_col, + float *grad_offset, float *grad_mask) +{ + CUDA_KERNEL_LOOP(index, n) + { + float val = 0, mval = 0; + int w = index % width_col; + int h = (index / width_col) % height_col; + int c = (index / width_col / height_col) % offset_channels; + int b = (index / width_col / height_col) / offset_channels; + // compute the start and end of the output + + const int deformable_group_index = c / (2 * kernel_h * kernel_w); + const int col_step = kernel_h * kernel_w; + int cnt = 0; + const float *data_col_ptr = data_col + deformable_group_index * channel_per_deformable_group * batch_size * width_col * height_col; + const float *data_im_ptr = data_im + (b * deformable_group + deformable_group_index) * channel_per_deformable_group / kernel_h / kernel_w * height * width; + const float *data_offset_ptr = data_offset + (b * deformable_group + deformable_group_index) * 2 * kernel_h * kernel_w * height_col * width_col; + const float *data_mask_ptr = data_mask + (b * deformable_group + deformable_group_index) * kernel_h * kernel_w * height_col * width_col; + + const int offset_c = c - deformable_group_index * 2 * kernel_h * kernel_w; + + for (int col_c = (offset_c / 2); col_c < channel_per_deformable_group; col_c += col_step) + { + const int col_pos = (((col_c * batch_size + b) * height_col) + h) * width_col + w; + const int bp_dir = offset_c % 2; + + int j = (col_pos / width_col / height_col / batch_size) % kernel_w; + int i = (col_pos / width_col / height_col / batch_size / kernel_w) % kernel_h; + int w_out = col_pos % width_col; + int h_out = (col_pos / width_col) % height_col; + int w_in = w_out * stride_w - pad_w; + int h_in = h_out * stride_h - pad_h; + const int data_offset_h_ptr = (((2 * (i * kernel_w + j)) * height_col + h_out) * width_col + w_out); + const int data_offset_w_ptr = (((2 * (i * kernel_w + j) + 1) * height_col + h_out) * width_col + w_out); + const int data_mask_hw_ptr = (((i * kernel_w + j) * height_col + h_out) * width_col + w_out); + const float offset_h = data_offset_ptr[data_offset_h_ptr]; + const float offset_w = data_offset_ptr[data_offset_w_ptr]; + const float mask = data_mask_ptr[data_mask_hw_ptr]; + float inv_h = h_in + i * dilation_h + offset_h; + float inv_w = w_in + j * dilation_w + offset_w; + if (inv_h <= -1 || inv_w <= -1 || inv_h >= height || inv_w >= width) + { + inv_h = inv_w = -2; + } + else + { + mval += data_col_ptr[col_pos] * dmcn_im2col_bilinear(data_im_ptr + cnt * height * width, width, height, width, inv_h, inv_w); + } + const float weight = dmcn_get_coordinate_weight( + inv_h, inv_w, + height, width, data_im_ptr + cnt * height * width, width, bp_dir); + val += weight * data_col_ptr[col_pos] * mask; + cnt += 1; + } + // KERNEL_ASSIGN(grad_offset[index], offset_req, val); + grad_offset[index] 
= val; + if (offset_c % 2 == 0) + // KERNEL_ASSIGN(grad_mask[(((b * deformable_group + deformable_group_index) * kernel_h * kernel_w + offset_c / 2) * height_col + h) * width_col + w], mask_req, mval); + grad_mask[(((b * deformable_group + deformable_group_index) * kernel_h * kernel_w + offset_c / 2) * height_col + h) * width_col + w] = mval; + } +} + +void modulated_deformable_im2col_cuda(cudaStream_t stream, + const float* data_im, const float* data_offset, const float* data_mask, + const int batch_size, const int channels, const int height_im, const int width_im, + const int height_col, const int width_col, const int kernel_h, const int kernel_w, + const int pad_h, const int pad_w, const int stride_h, const int stride_w, + const int dilation_h, const int dilation_w, + const int deformable_group, float* data_col) { + // num_axes should be smaller than block size + const int channel_per_deformable_group = channels / deformable_group; + const int num_kernels = channels * batch_size * height_col * width_col; + modulated_deformable_im2col_gpu_kernel + <<>>( + num_kernels, data_im, data_offset, data_mask, height_im, width_im, kernel_h, kernel_w, + pad_h, pad_w, stride_h, stride_w, dilation_h, dilation_w, channel_per_deformable_group, + batch_size, channels, deformable_group, height_col, width_col, data_col); + + cudaError_t err = cudaGetLastError(); + if (err != cudaSuccess) + { + printf("error in modulated_deformable_im2col_cuda: %s\n", cudaGetErrorString(err)); + } + +} + +void modulated_deformable_col2im_cuda(cudaStream_t stream, + const float* data_col, const float* data_offset, const float* data_mask, + const int batch_size, const int channels, const int height_im, const int width_im, + const int height_col, const int width_col, const int kernel_h, const int kernel_w, + const int pad_h, const int pad_w, const int stride_h, const int stride_w, + const int dilation_h, const int dilation_w, + const int deformable_group, float* grad_im){ + + const int channel_per_deformable_group = channels / deformable_group; + const int num_kernels = channels * kernel_h * kernel_w * batch_size * height_col * width_col; + modulated_deformable_col2im_gpu_kernel + <<>>( + num_kernels, data_col, data_offset, data_mask, channels, height_im, width_im, + kernel_h, kernel_w, pad_h, pad_h, stride_h, stride_w, + dilation_h, dilation_w, channel_per_deformable_group, + batch_size, deformable_group, height_col, width_col, grad_im); + cudaError_t err = cudaGetLastError(); + if (err != cudaSuccess) + { + printf("error in modulated_deformable_col2im_cuda: %s\n", cudaGetErrorString(err)); + } + +} + +void modulated_deformable_col2im_coord_cuda(cudaStream_t stream, + const float* data_col, const float* data_im, const float* data_offset, const float* data_mask, + const int batch_size, const int channels, const int height_im, const int width_im, + const int height_col, const int width_col, const int kernel_h, const int kernel_w, + const int pad_h, const int pad_w, const int stride_h, const int stride_w, + const int dilation_h, const int dilation_w, + const int deformable_group, + float* grad_offset, float* grad_mask) { + const int num_kernels = batch_size * height_col * width_col * 2 * kernel_h * kernel_w * deformable_group; + const int channel_per_deformable_group = channels * kernel_h * kernel_w / deformable_group; + modulated_deformable_col2im_coord_gpu_kernel + <<>>( + num_kernels, data_col, data_im, data_offset, data_mask, channels, height_im, width_im, + kernel_h, kernel_w, pad_h, pad_w, stride_h, stride_w, + 
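+      // Editor's note: in modulated_deformable_col2im_cuda above, the kernel launch passes
+      // "pad_h, pad_h"; given the kernel's (pad_h, pad_w) parameter order, the second
+      // argument should presumably be pad_w (this only matters for non-square padding).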
dilation_h, dilation_w, channel_per_deformable_group, + batch_size, 2 * kernel_h * kernel_w * deformable_group, deformable_group, height_col, width_col, + grad_offset, grad_mask); + cudaError_t err = cudaGetLastError(); + if (err != cudaSuccess) + { + printf("error in modulated_deformable_col2im_coord_cuda: %s\n", cudaGetErrorString(err)); + } +} \ No newline at end of file diff --git a/maskrcnn_benchmark/csrc/cuda/dcn_v2_im2col_cuda.h b/maskrcnn_benchmark/csrc/cuda/dcn_v2_im2col_cuda.h new file mode 100644 index 0000000000000000000000000000000000000000..c85683198e0f6f908c294aef45314d79d9de8451 --- /dev/null +++ b/maskrcnn_benchmark/csrc/cuda/dcn_v2_im2col_cuda.h @@ -0,0 +1,101 @@ + +/*! + ******************* BEGIN Caffe Copyright Notice and Disclaimer **************** + * + * COPYRIGHT + * + * All contributions by the University of California: + * Copyright (c) 2014-2017 The Regents of the University of California (Regents) + * All rights reserved. + * + * All other contributions: + * Copyright (c) 2014-2017, the respective contributors + * All rights reserved. + * + * Caffe uses a shared copyright model: each contributor holds copyright over + * their contributions to Caffe. The project versioning records all such + * contribution and copyright details. If a contributor wants to further mark + * their specific copyright on a particular contribution, they should indicate + * their copyright solely in the commit message of the change when it is + * committed. + * + * LICENSE + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * CONTRIBUTION AGREEMENT + * + * By contributing to the BVLC/caffe repository through pull-request, comment, + * or otherwise, the contributor releases their content to the + * license and copyright terms herein. + * + ***************** END Caffe Copyright Notice and Disclaimer ******************** + * + * Copyright (c) 2018 Microsoft + * Licensed under The MIT License [see LICENSE for details] + * \file modulated_deformable_im2col.h + * \brief Function definitions of converting an image to + * column matrix based on kernel, padding, dilation, and offset. + * These functions are mainly used in deformable convolution operators. 
+ * \ref: https://arxiv.org/abs/1811.11168 + * \author Yuwen Xiong, Haozhi Qi, Jifeng Dai, Xizhou Zhu, Han Hu + */ + +/***************** Adapted by Charles Shang *********************/ + +#ifndef DCN_V2_IM2COL_CUDA +#define DCN_V2_IM2COL_CUDA + +#ifdef __cplusplus +extern "C" +{ +#endif + + void modulated_deformable_im2col_cuda(cudaStream_t stream, + const float *data_im, const float *data_offset, const float *data_mask, + const int batch_size, const int channels, const int height_im, const int width_im, + const int height_col, const int width_col, const int kernel_h, const int kenerl_w, + const int pad_h, const int pad_w, const int stride_h, const int stride_w, + const int dilation_h, const int dilation_w, + const int deformable_group, float *data_col); + + void modulated_deformable_col2im_cuda(cudaStream_t stream, + const float *data_col, const float *data_offset, const float *data_mask, + const int batch_size, const int channels, const int height_im, const int width_im, + const int height_col, const int width_col, const int kernel_h, const int kenerl_w, + const int pad_h, const int pad_w, const int stride_h, const int stride_w, + const int dilation_h, const int dilation_w, + const int deformable_group, float *grad_im); + + void modulated_deformable_col2im_coord_cuda(cudaStream_t stream, + const float *data_col, const float *data_im, const float *data_offset, const float *data_mask, + const int batch_size, const int channels, const int height_im, const int width_im, + const int height_col, const int width_col, const int kernel_h, const int kenerl_w, + const int pad_h, const int pad_w, const int stride_h, const int stride_w, + const int dilation_h, const int dilation_w, + const int deformable_group, + float *grad_offset, float *grad_mask); + +#ifdef __cplusplus +} +#endif + +#endif \ No newline at end of file diff --git a/maskrcnn_benchmark/csrc/cuda/dcn_v2_psroi_pooling_cuda.cu b/maskrcnn_benchmark/csrc/cuda/dcn_v2_psroi_pooling_cuda.cu new file mode 100644 index 0000000000000000000000000000000000000000..07b438e1957b8190e842e6873cd7feee805535e5 --- /dev/null +++ b/maskrcnn_benchmark/csrc/cuda/dcn_v2_psroi_pooling_cuda.cu @@ -0,0 +1,419 @@ +/*! 
+ * Copyright (c) 2017 Microsoft + * Licensed under The MIT License [see LICENSE for details] + * \file deformable_psroi_pooling.cu + * \brief + * \author Yi Li, Guodong Zhang, Jifeng Dai +*/ +/***************** Adapted by Charles Shang *********************/ + +#include +#include +#include +#include + +#include +#include + +#include +#include +#include + +#define CUDA_KERNEL_LOOP(i, n) \ + for (int i = blockIdx.x * blockDim.x + threadIdx.x; \ + i < (n); \ + i += blockDim.x * gridDim.x) + +const int CUDA_NUM_THREADS = 1024; +inline int GET_BLOCKS(const int N) +{ + return (N + CUDA_NUM_THREADS - 1) / CUDA_NUM_THREADS; +} + +template +__device__ T bilinear_interp( + const T *data, + const T x, + const T y, + const int width, + const int height) +{ + int x1 = floor(x); + int x2 = ceil(x); + int y1 = floor(y); + int y2 = ceil(y); + T dist_x = static_cast(x - x1); + T dist_y = static_cast(y - y1); + T value11 = data[y1 * width + x1]; + T value12 = data[y2 * width + x1]; + T value21 = data[y1 * width + x2]; + T value22 = data[y2 * width + x2]; + T value = (1 - dist_x) * (1 - dist_y) * value11 + + (1 - dist_x) * dist_y * value12 + + dist_x * (1 - dist_y) * value21 + + dist_x * dist_y * value22; + return value; +} + +template +__global__ void DeformablePSROIPoolForwardKernel( + const int count, + const T *bottom_data, + const T spatial_scale, + const int channels, + const int height, const int width, + const int pooled_height, const int pooled_width, + const T *bottom_rois, const T *bottom_trans, + const int no_trans, + const T trans_std, + const int sample_per_part, + const int output_dim, + const int group_size, + const int part_size, + const int num_classes, + const int channels_each_class, + T *top_data, + T *top_count) +{ + CUDA_KERNEL_LOOP(index, count) + { + // The output is in order (n, ctop, ph, pw) + int pw = index % pooled_width; + int ph = (index / pooled_width) % pooled_height; + int ctop = (index / pooled_width / pooled_height) % output_dim; + int n = index / pooled_width / pooled_height / output_dim; + + // [start, end) interval for spatial sampling + const T *offset_bottom_rois = bottom_rois + n * 5; + int roi_batch_ind = offset_bottom_rois[0]; + T roi_start_w = static_cast(round(offset_bottom_rois[1])) * spatial_scale - 0.5; + T roi_start_h = static_cast(round(offset_bottom_rois[2])) * spatial_scale - 0.5; + T roi_end_w = static_cast(round(offset_bottom_rois[3]) + 1.) * spatial_scale - 0.5; + T roi_end_h = static_cast(round(offset_bottom_rois[4]) + 1.) * spatial_scale - 0.5; + + // Force too small ROIs to be 1x1 + T roi_width = max(roi_end_w - roi_start_w, 0.1); //avoid 0 + T roi_height = max(roi_end_h - roi_start_h, 0.1); + + // Compute w and h at bottom + T bin_size_h = roi_height / static_cast(pooled_height); + T bin_size_w = roi_width / static_cast(pooled_width); + + T sub_bin_size_h = bin_size_h / static_cast(sample_per_part); + T sub_bin_size_w = bin_size_w / static_cast(sample_per_part); + + int part_h = floor(static_cast(ph) / pooled_height * part_size); + int part_w = floor(static_cast(pw) / pooled_width * part_size); + int class_id = ctop / channels_each_class; + T trans_x = no_trans ? static_cast(0) : bottom_trans[(((n * num_classes + class_id) * 2) * part_size + part_h) * part_size + part_w] * trans_std; + T trans_y = no_trans ? 
static_cast(0) : bottom_trans[(((n * num_classes + class_id) * 2 + 1) * part_size + part_h) * part_size + part_w] * trans_std; + + T wstart = static_cast(pw) * bin_size_w + roi_start_w; + wstart += trans_x * roi_width; + T hstart = static_cast(ph) * bin_size_h + roi_start_h; + hstart += trans_y * roi_height; + + T sum = 0; + int count = 0; + int gw = floor(static_cast(pw) * group_size / pooled_width); + int gh = floor(static_cast(ph) * group_size / pooled_height); + gw = min(max(gw, 0), group_size - 1); + gh = min(max(gh, 0), group_size - 1); + + const T *offset_bottom_data = bottom_data + (roi_batch_ind * channels) * height * width; + for (int ih = 0; ih < sample_per_part; ih++) + { + for (int iw = 0; iw < sample_per_part; iw++) + { + T w = wstart + iw * sub_bin_size_w; + T h = hstart + ih * sub_bin_size_h; + // bilinear interpolation + if (w < -0.5 || w > width - 0.5 || h < -0.5 || h > height - 0.5) + { + continue; + } + w = min(max(w, 0.), width - 1.); + h = min(max(h, 0.), height - 1.); + int c = (ctop * group_size + gh) * group_size + gw; + T val = bilinear_interp(offset_bottom_data + c * height * width, w, h, width, height); + sum += val; + count++; + } + } + top_data[index] = count == 0 ? static_cast(0) : sum / count; + top_count[index] = count; + } +} + +template +__global__ void DeformablePSROIPoolBackwardAccKernel( + const int count, + const T *top_diff, + const T *top_count, + const int num_rois, + const T spatial_scale, + const int channels, + const int height, const int width, + const int pooled_height, const int pooled_width, + const int output_dim, + T *bottom_data_diff, T *bottom_trans_diff, + const T *bottom_data, + const T *bottom_rois, + const T *bottom_trans, + const int no_trans, + const T trans_std, + const int sample_per_part, + const int group_size, + const int part_size, + const int num_classes, + const int channels_each_class) +{ + CUDA_KERNEL_LOOP(index, count) + { + // The output is in order (n, ctop, ph, pw) + int pw = index % pooled_width; + int ph = (index / pooled_width) % pooled_height; + int ctop = (index / pooled_width / pooled_height) % output_dim; + int n = index / pooled_width / pooled_height / output_dim; + + // [start, end) interval for spatial sampling + const T *offset_bottom_rois = bottom_rois + n * 5; + int roi_batch_ind = offset_bottom_rois[0]; + T roi_start_w = static_cast(round(offset_bottom_rois[1])) * spatial_scale - 0.5; + T roi_start_h = static_cast(round(offset_bottom_rois[2])) * spatial_scale - 0.5; + T roi_end_w = static_cast(round(offset_bottom_rois[3]) + 1.) * spatial_scale - 0.5; + T roi_end_h = static_cast(round(offset_bottom_rois[4]) + 1.) * spatial_scale - 0.5; + + // Force too small ROIs to be 1x1 + T roi_width = max(roi_end_w - roi_start_w, 0.1); //avoid 0 + T roi_height = max(roi_end_h - roi_start_h, 0.1); + + // Compute w and h at bottom + T bin_size_h = roi_height / static_cast(pooled_height); + T bin_size_w = roi_width / static_cast(pooled_width); + + T sub_bin_size_h = bin_size_h / static_cast(sample_per_part); + T sub_bin_size_w = bin_size_w / static_cast(sample_per_part); + + int part_h = floor(static_cast(ph) / pooled_height * part_size); + int part_w = floor(static_cast(pw) / pooled_width * part_size); + int class_id = ctop / channels_each_class; + T trans_x = no_trans ? static_cast(0) : bottom_trans[(((n * num_classes + class_id) * 2) * part_size + part_h) * part_size + part_w] * trans_std; + T trans_y = no_trans ? 
static_cast(0) : bottom_trans[(((n * num_classes + class_id) * 2 + 1) * part_size + part_h) * part_size + part_w] * trans_std; + + T wstart = static_cast(pw) * bin_size_w + roi_start_w; + wstart += trans_x * roi_width; + T hstart = static_cast(ph) * bin_size_h + roi_start_h; + hstart += trans_y * roi_height; + + if (top_count[index] <= 0) + { + continue; + } + T diff_val = top_diff[index] / top_count[index]; + const T *offset_bottom_data = bottom_data + roi_batch_ind * channels * height * width; + T *offset_bottom_data_diff = bottom_data_diff + roi_batch_ind * channels * height * width; + int gw = floor(static_cast(pw) * group_size / pooled_width); + int gh = floor(static_cast(ph) * group_size / pooled_height); + gw = min(max(gw, 0), group_size - 1); + gh = min(max(gh, 0), group_size - 1); + + for (int ih = 0; ih < sample_per_part; ih++) + { + for (int iw = 0; iw < sample_per_part; iw++) + { + T w = wstart + iw * sub_bin_size_w; + T h = hstart + ih * sub_bin_size_h; + // bilinear interpolation + if (w < -0.5 || w > width - 0.5 || h < -0.5 || h > height - 0.5) + { + continue; + } + w = min(max(w, 0.), width - 1.); + h = min(max(h, 0.), height - 1.); + int c = (ctop * group_size + gh) * group_size + gw; + // backward on feature + int x0 = floor(w); + int x1 = ceil(w); + int y0 = floor(h); + int y1 = ceil(h); + T dist_x = w - x0, dist_y = h - y0; + T q00 = (1 - dist_x) * (1 - dist_y); + T q01 = (1 - dist_x) * dist_y; + T q10 = dist_x * (1 - dist_y); + T q11 = dist_x * dist_y; + int bottom_index_base = c * height * width; + atomicAdd(offset_bottom_data_diff + bottom_index_base + y0 * width + x0, q00 * diff_val); + atomicAdd(offset_bottom_data_diff + bottom_index_base + y1 * width + x0, q01 * diff_val); + atomicAdd(offset_bottom_data_diff + bottom_index_base + y0 * width + x1, q10 * diff_val); + atomicAdd(offset_bottom_data_diff + bottom_index_base + y1 * width + x1, q11 * diff_val); + + if (no_trans) + { + continue; + } + T U00 = offset_bottom_data[bottom_index_base + y0 * width + x0]; + T U01 = offset_bottom_data[bottom_index_base + y1 * width + x0]; + T U10 = offset_bottom_data[bottom_index_base + y0 * width + x1]; + T U11 = offset_bottom_data[bottom_index_base + y1 * width + x1]; + T diff_x = (U11 * dist_y + U10 * (1 - dist_y) - U01 * dist_y - U00 * (1 - dist_y)) * trans_std * diff_val; + diff_x *= roi_width; + T diff_y = (U11 * dist_x + U01 * (1 - dist_x) - U10 * dist_x - U00 * (1 - dist_x)) * trans_std * diff_val; + diff_y *= roi_height; + + atomicAdd(bottom_trans_diff + (((n * num_classes + class_id) * 2) * part_size + part_h) * part_size + part_w, diff_x); + atomicAdd(bottom_trans_diff + (((n * num_classes + class_id) * 2 + 1) * part_size + part_h) * part_size + part_w, diff_y); + } + } + } +} + +std::tuple +dcn_v2_psroi_pooling_cuda_forward(const at::Tensor &input, + const at::Tensor &bbox, + const at::Tensor &trans, + const int no_trans, + const float spatial_scale, + const int output_dim, + const int group_size, + const int pooled_size, + const int part_size, + const int sample_per_part, + const float trans_std) +{ + AT_ASSERTM(input.type().is_cuda(), "input must be a CUDA tensor"); + AT_ASSERTM(bbox.type().is_cuda(), "rois must be a CUDA tensor"); + AT_ASSERTM(trans.type().is_cuda(), "trans must be a CUDA tensor"); + + const int batch = input.size(0); + const int channels = input.size(1); + const int height = input.size(2); + const int width = input.size(3); + const int channels_trans = no_trans ? 
2 : trans.size(1); + const int num_bbox = bbox.size(0); + + AT_ASSERTM(channels == output_dim, "input channels and output channels must equal"); + auto pooled_height = pooled_size; + auto pooled_width = pooled_size; + + auto out = at::empty({num_bbox, output_dim, pooled_height, pooled_width}, input.options()); + long out_size = num_bbox * output_dim * pooled_height * pooled_width; + auto top_count = at::zeros({num_bbox, output_dim, pooled_height, pooled_width}, input.options()); + + const int num_classes = no_trans ? 1 : channels_trans / 2; + const int channels_each_class = no_trans ? output_dim : output_dim / num_classes; + + cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + + if (out.numel() == 0) + { + THCudaCheck(cudaGetLastError()); + return std::make_tuple(out, top_count); + } + + dim3 grid(std::min(THCCeilDiv(out_size, 512L), 4096L)); + dim3 block(512); + + AT_DISPATCH_FLOATING_TYPES(input.type(), "dcn_v2_psroi_pooling_cuda_forward", [&] { + DeformablePSROIPoolForwardKernel<<>>( + out_size, + input.contiguous().data(), + spatial_scale, + channels, + height, width, + pooled_height, + pooled_width, + bbox.contiguous().data(), + trans.contiguous().data(), + no_trans, + trans_std, + sample_per_part, + output_dim, + group_size, + part_size, + num_classes, + channels_each_class, + out.data(), + top_count.data()); + }); + THCudaCheck(cudaGetLastError()); + return std::make_tuple(out, top_count); +} + +std::tuple +dcn_v2_psroi_pooling_cuda_backward(const at::Tensor &out_grad, + const at::Tensor &input, + const at::Tensor &bbox, + const at::Tensor &trans, + const at::Tensor &top_count, + const int no_trans, + const float spatial_scale, + const int output_dim, + const int group_size, + const int pooled_size, + const int part_size, + const int sample_per_part, + const float trans_std) +{ + AT_ASSERTM(out_grad.type().is_cuda(), "out_grad must be a CUDA tensor"); + AT_ASSERTM(input.type().is_cuda(), "input must be a CUDA tensor"); + AT_ASSERTM(bbox.type().is_cuda(), "bbox must be a CUDA tensor"); + AT_ASSERTM(trans.type().is_cuda(), "trans must be a CUDA tensor"); + AT_ASSERTM(top_count.type().is_cuda(), "top_count must be a CUDA tensor"); + + const int batch = input.size(0); + const int channels = input.size(1); + const int height = input.size(2); + const int width = input.size(3); + const int channels_trans = no_trans ? 2 : trans.size(1); + const int num_bbox = bbox.size(0); + + AT_ASSERTM(channels == output_dim, "input channels and output channels must equal"); + auto pooled_height = pooled_size; + auto pooled_width = pooled_size; + long out_size = num_bbox * output_dim * pooled_height * pooled_width; + const int num_classes = no_trans ? 1 : channels_trans / 2; + const int channels_each_class = no_trans ? 
output_dim : output_dim / num_classes; + + auto input_grad = at::zeros({batch, channels, height, width}, out_grad.options()); + auto trans_grad = at::zeros_like(trans); + + if (input_grad.numel() == 0) + { + THCudaCheck(cudaGetLastError()); + return std::make_tuple(input_grad, trans_grad); + } + + dim3 grid(std::min(THCCeilDiv(out_size, 512L), 4096L)); + dim3 block(512); + cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + + AT_DISPATCH_FLOATING_TYPES(out_grad.type(), "dcn_v2_psroi_pooling_cuda_backward", [&] { + DeformablePSROIPoolBackwardAccKernel<<>>( + out_size, + out_grad.contiguous().data(), + top_count.contiguous().data(), + num_bbox, + spatial_scale, + channels, + height, + width, + pooled_height, + pooled_width, + output_dim, + input_grad.contiguous().data(), + trans_grad.contiguous().data(), + input.contiguous().data(), + bbox.contiguous().data(), + trans.contiguous().data(), + no_trans, + trans_std, + sample_per_part, + group_size, + part_size, + num_classes, + channels_each_class); + }); + THCudaCheck(cudaGetLastError()); + return std::make_tuple(input_grad, trans_grad); +} \ No newline at end of file diff --git a/maskrcnn_benchmark/csrc/cuda/nms.cu b/maskrcnn_benchmark/csrc/cuda/nms.cu new file mode 100644 index 0000000000000000000000000000000000000000..833d8523a5809d99a1078a144a384c864a9d8df9 --- /dev/null +++ b/maskrcnn_benchmark/csrc/cuda/nms.cu @@ -0,0 +1,131 @@ +// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +#include +#include + +#include +#include + +#include +#include + +int const threadsPerBlock = sizeof(unsigned long long) * 8; + +__device__ inline float devIoU(float const * const a, float const * const b) { + float left = max(a[0], b[0]), right = min(a[2], b[2]); + float top = max(a[1], b[1]), bottom = min(a[3], b[3]); + float width = max(right - left + 1, 0.f), height = max(bottom - top + 1, 0.f); + float interS = width * height; + float Sa = (a[2] - a[0] + 1) * (a[3] - a[1] + 1); + float Sb = (b[2] - b[0] + 1) * (b[3] - b[1] + 1); + return interS / (Sa + Sb - interS); +} + +__global__ void nms_kernel(const int n_boxes, const float nms_overlap_thresh, + const float *dev_boxes, unsigned long long *dev_mask) { + const int row_start = blockIdx.y; + const int col_start = blockIdx.x; + + // if (row_start > col_start) return; + + const int row_size = + min(n_boxes - row_start * threadsPerBlock, threadsPerBlock); + const int col_size = + min(n_boxes - col_start * threadsPerBlock, threadsPerBlock); + + __shared__ float block_boxes[threadsPerBlock * 5]; + if (threadIdx.x < col_size) { + block_boxes[threadIdx.x * 5 + 0] = + dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 0]; + block_boxes[threadIdx.x * 5 + 1] = + dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 1]; + block_boxes[threadIdx.x * 5 + 2] = + dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 2]; + block_boxes[threadIdx.x * 5 + 3] = + dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 3]; + block_boxes[threadIdx.x * 5 + 4] = + dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 4]; + } + __syncthreads(); + + if (threadIdx.x < row_size) { + const int cur_box_idx = threadsPerBlock * row_start + threadIdx.x; + const float *cur_box = dev_boxes + cur_box_idx * 5; + int i = 0; + unsigned long long t = 0; + int start = 0; + if (row_start == col_start) { + start = threadIdx.x + 1; + } + for (i = start; i < col_size; i++) { + if (devIoU(cur_box, block_boxes + i * 5) > nms_overlap_thresh) { + t |= 1ULL << i; + } + } + const int col_blocks = 
THCCeilDiv(n_boxes, threadsPerBlock); + dev_mask[cur_box_idx * col_blocks + col_start] = t; + } +} + +// boxes is a N x 5 tensor +at::Tensor nms_cuda(const at::Tensor boxes, float nms_overlap_thresh) { + using scalar_t = float; + AT_ASSERTM(boxes.type().is_cuda(), "boxes must be a CUDA tensor"); + auto scores = boxes.select(1, 4); + auto order_t = std::get<1>(scores.sort(0, /* descending=*/true)); + auto boxes_sorted = boxes.index_select(0, order_t); + + int boxes_num = boxes.size(0); + + const int col_blocks = THCCeilDiv(boxes_num, threadsPerBlock); + + scalar_t* boxes_dev = boxes_sorted.data(); + + THCState *state = at::globalContext().lazyInitCUDA(); // TODO replace with getTHCState + + unsigned long long* mask_dev = NULL; + //THCudaCheck(THCudaMalloc(state, (void**) &mask_dev, + // boxes_num * col_blocks * sizeof(unsigned long long))); + + mask_dev = (unsigned long long*) THCudaMalloc(state, boxes_num * col_blocks * sizeof(unsigned long long)); + + dim3 blocks(THCCeilDiv(boxes_num, threadsPerBlock), + THCCeilDiv(boxes_num, threadsPerBlock)); + dim3 threads(threadsPerBlock); + nms_kernel<<>>(boxes_num, + nms_overlap_thresh, + boxes_dev, + mask_dev); + + std::vector mask_host(boxes_num * col_blocks); + THCudaCheck(cudaMemcpy(&mask_host[0], + mask_dev, + sizeof(unsigned long long) * boxes_num * col_blocks, + cudaMemcpyDeviceToHost)); + + std::vector remv(col_blocks); + memset(&remv[0], 0, sizeof(unsigned long long) * col_blocks); + + at::Tensor keep = at::empty({boxes_num}, boxes.options().dtype(at::kLong).device(at::kCPU)); + int64_t* keep_out = keep.data(); + + int num_to_keep = 0; + for (int i = 0; i < boxes_num; i++) { + int nblock = i / threadsPerBlock; + int inblock = i % threadsPerBlock; + + if (!(remv[nblock] & (1ULL << inblock))) { + keep_out[num_to_keep++] = i; + unsigned long long *p = &mask_host[0] + i * col_blocks; + for (int j = nblock; j < col_blocks; j++) { + remv[j] |= p[j]; + } + } + } + + THCudaFree(state, mask_dev); + // TODO improve this part + return std::get<0>(order_t.index({ + keep.narrow(/*dim=*/0, /*start=*/0, /*length=*/num_to_keep).to( + order_t.device(), keep.scalar_type()) + }).sort(0, false)); +} diff --git a/maskrcnn_benchmark/csrc/cuda/vision.h b/maskrcnn_benchmark/csrc/cuda/vision.h new file mode 100644 index 0000000000000000000000000000000000000000..ff02d612304120f86dfc0940a745250594adb267 --- /dev/null +++ b/maskrcnn_benchmark/csrc/cuda/vision.h @@ -0,0 +1,121 @@ +// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 
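The `nms_cuda` routine above takes an N x 5 tensor (x1, y1, x2, y2, score), sorts the boxes by score on the GPU, and returns the indices to keep. From Python it is reached through the `nms` dispatcher in `csrc/nms.h` and the binding registered in `csrc/vision.cpp` later in this diff. A minimal usage sketch, assuming the extension is built under the name `maskrcnn_benchmark._C` (that module name comes from the project's build setup, not from this file):

```
import torch
from maskrcnn_benchmark import _C  # assumed name of the compiled extension

# two heavily overlapping boxes plus one disjoint box, in (x1, y1, x2, y2) format
boxes = torch.tensor([[10., 10., 50., 50.],
                      [12., 12., 52., 52.],
                      [100., 100., 150., 150.]], device="cuda")
scores = torch.tensor([0.9, 0.8, 0.7], device="cuda")

# the wrapper appends the scores as a fifth column and calls nms_cuda on the GPU path
keep = _C.nms(boxes, scores, 0.5)
print(keep)  # expected: tensor([0, 2]) -- the second box is suppressed
```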
+#pragma once +#include + + +at::Tensor SigmoidFocalLoss_forward_cuda( + const at::Tensor& logits, + const at::Tensor& targets, + const int num_classes, + const float gamma, + const float alpha); + +at::Tensor SigmoidFocalLoss_backward_cuda( + const at::Tensor& logits, + const at::Tensor& targets, + const at::Tensor& d_losses, + const int num_classes, + const float gamma, + const float alpha); + +at::Tensor ROIAlign_forward_cuda(const at::Tensor& input, + const at::Tensor& rois, + const float spatial_scale, + const int pooled_height, + const int pooled_width, + const int sampling_ratio); + +at::Tensor ROIAlign_backward_cuda(const at::Tensor& grad, + const at::Tensor& rois, + const float spatial_scale, + const int pooled_height, + const int pooled_width, + const int batch_size, + const int channels, + const int height, + const int width, + const int sampling_ratio); + + +std::tuple ROIPool_forward_cuda(const at::Tensor& input, + const at::Tensor& rois, + const float spatial_scale, + const int pooled_height, + const int pooled_width); + +at::Tensor ROIPool_backward_cuda(const at::Tensor& grad, + const at::Tensor& input, + const at::Tensor& rois, + const at::Tensor& argmax, + const float spatial_scale, + const int pooled_height, + const int pooled_width, + const int batch_size, + const int channels, + const int height, + const int width); + +at::Tensor nms_cuda(const at::Tensor boxes, float nms_overlap_thresh); + + +at::Tensor compute_flow_cuda(const at::Tensor& boxes, + const int height, + const int width); + +at::Tensor +dcn_v2_cuda_forward(const at::Tensor &input, + const at::Tensor &weight, + const at::Tensor &bias, + const at::Tensor &offset, + const at::Tensor &mask, + const int kernel_h, + const int kernel_w, + const int stride_h, + const int stride_w, + const int pad_h, + const int pad_w, + const int dilation_h, + const int dilation_w, + const int deformable_group); + +std::vector +dcn_v2_cuda_backward(const at::Tensor &input, + const at::Tensor &weight, + const at::Tensor &bias, + const at::Tensor &offset, + const at::Tensor &mask, + const at::Tensor &grad_output, + int kernel_h, int kernel_w, + int stride_h, int stride_w, + int pad_h, int pad_w, + int dilation_h, int dilation_w, + int deformable_group); + + +std::tuple +dcn_v2_psroi_pooling_cuda_forward(const at::Tensor &input, + const at::Tensor &bbox, + const at::Tensor &trans, + const int no_trans, + const float spatial_scale, + const int output_dim, + const int group_size, + const int pooled_size, + const int part_size, + const int sample_per_part, + const float trans_std); + +std::tuple +dcn_v2_psroi_pooling_cuda_backward(const at::Tensor &out_grad, + const at::Tensor &input, + const at::Tensor &bbox, + const at::Tensor &trans, + const at::Tensor &top_count, + const int no_trans, + const float spatial_scale, + const int output_dim, + const int group_size, + const int pooled_size, + const int part_size, + const int sample_per_part, + const float trans_std); diff --git a/maskrcnn_benchmark/csrc/dcn_v2.h b/maskrcnn_benchmark/csrc/dcn_v2.h new file mode 100644 index 0000000000000000000000000000000000000000..9c718a4969e26b7fb04358db10b71a0fa953c20c --- /dev/null +++ b/maskrcnn_benchmark/csrc/dcn_v2.h @@ -0,0 +1,145 @@ +#pragma once + +#include "cpu/vision.h" + +#ifdef WITH_CUDA +#include "cuda/vision.h" +#endif + +at::Tensor +dcn_v2_forward(const at::Tensor &input, + const at::Tensor &weight, + const at::Tensor &bias, + const at::Tensor &offset, + const at::Tensor &mask, + const int kernel_h, + const int kernel_w, + const int 
stride_h, + const int stride_w, + const int pad_h, + const int pad_w, + const int dilation_h, + const int dilation_w, + const int deformable_group) +{ + if (input.type().is_cuda()) + { +#ifdef WITH_CUDA + return dcn_v2_cuda_forward(input, weight, bias, offset, mask, + kernel_h, kernel_w, + stride_h, stride_w, + pad_h, pad_w, + dilation_h, dilation_w, + deformable_group); +#else + AT_ERROR("Not compiled with GPU support"); +#endif + } + AT_ERROR("Not implemented on the CPU"); +} + +std::vector +dcn_v2_backward(const at::Tensor &input, + const at::Tensor &weight, + const at::Tensor &bias, + const at::Tensor &offset, + const at::Tensor &mask, + const at::Tensor &grad_output, + int kernel_h, int kernel_w, + int stride_h, int stride_w, + int pad_h, int pad_w, + int dilation_h, int dilation_w, + int deformable_group) +{ + if (input.type().is_cuda()) + { +#ifdef WITH_CUDA + return dcn_v2_cuda_backward(input, + weight, + bias, + offset, + mask, + grad_output, + kernel_h, kernel_w, + stride_h, stride_w, + pad_h, pad_w, + dilation_h, dilation_w, + deformable_group); +#else + AT_ERROR("Not compiled with GPU support"); +#endif + } + AT_ERROR("Not implemented on the CPU"); +} + +std::tuple +dcn_v2_psroi_pooling_forward(const at::Tensor &input, + const at::Tensor &bbox, + const at::Tensor &trans, + const int no_trans, + const float spatial_scale, + const int output_dim, + const int group_size, + const int pooled_size, + const int part_size, + const int sample_per_part, + const float trans_std) +{ + if (input.type().is_cuda()) + { +#ifdef WITH_CUDA + return dcn_v2_psroi_pooling_cuda_forward(input, + bbox, + trans, + no_trans, + spatial_scale, + output_dim, + group_size, + pooled_size, + part_size, + sample_per_part, + trans_std); +#else + AT_ERROR("Not compiled with GPU support"); +#endif + } + AT_ERROR("Not implemented on the CPU"); +} + +std::tuple +dcn_v2_psroi_pooling_backward(const at::Tensor &out_grad, + const at::Tensor &input, + const at::Tensor &bbox, + const at::Tensor &trans, + const at::Tensor &top_count, + const int no_trans, + const float spatial_scale, + const int output_dim, + const int group_size, + const int pooled_size, + const int part_size, + const int sample_per_part, + const float trans_std) +{ + if (input.type().is_cuda()) + { +#ifdef WITH_CUDA + return dcn_v2_psroi_pooling_cuda_backward(out_grad, + input, + bbox, + trans, + top_count, + no_trans, + spatial_scale, + output_dim, + group_size, + pooled_size, + part_size, + sample_per_part, + trans_std); +#else + AT_ERROR("Not compiled with GPU support"); +#endif + } + AT_ERROR("Not implemented on the CPU"); +} diff --git a/maskrcnn_benchmark/csrc/nms.h b/maskrcnn_benchmark/csrc/nms.h new file mode 100644 index 0000000000000000000000000000000000000000..312fed4a7cb7c1bc6c2345b5e5d678cc6c1a7141 --- /dev/null +++ b/maskrcnn_benchmark/csrc/nms.h @@ -0,0 +1,28 @@ +// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 
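The deformable PS-RoI pooling wrappers above forward to the CUDA kernels shown earlier and are exported to Python in `csrc/vision.cpp`. A rough calling sketch, assuming the compiled extension is importable as `maskrcnn_benchmark._C`; the tensor shapes follow the kernel code (RoIs are N x 5 with the batch index in column 0, and `trans` holds per-part x/y offsets):

```
import torch
from maskrcnn_benchmark import _C  # assumed extension module name

feat = torch.randn(1, 256, 64, 64, device="cuda")                # (N, C, H, W) feature map
rois = torch.tensor([[0., 8., 8., 120., 120.]], device="cuda")   # (batch_idx, x1, y1, x2, y2)
trans = torch.zeros(1, 2, 7, 7, device="cuda")                   # per-part (x, y) offsets

out, top_count = _C.dcn_v2_psroi_pooling_forward(
    feat, rois, trans,
    0,       # no_trans: 0 -> apply the offsets in `trans`
    0.0625,  # spatial_scale (1/16 for a stride-16 feature map)
    256,     # output_dim (asserted to equal the input channels)
    1,       # group_size
    7,       # pooled_size
    7,       # part_size
    4,       # sample_per_part
    0.1)     # trans_std
print(out.shape)  # -> torch.Size([1, 256, 7, 7])
```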
+#pragma once +#include "cpu/vision.h" + +#ifdef WITH_CUDA +#include "cuda/vision.h" +#endif + + +at::Tensor nms(const at::Tensor& dets, + const at::Tensor& scores, + const float threshold) { + + if (dets.type().is_cuda()) { +#ifdef WITH_CUDA + // TODO raise error if not compiled with CUDA + if (dets.numel() == 0) + return at::empty({0}, dets.options().dtype(at::kLong).device(at::kCPU)); + auto b = at::cat({dets, scores.unsqueeze(1)}, 1); + return nms_cuda(b, threshold); +#else + AT_ERROR("Not compiled with GPU support"); +#endif + } + + at::Tensor result = nms_cpu(dets, scores, threshold); + return result; +} diff --git a/maskrcnn_benchmark/csrc/vision.cpp b/maskrcnn_benchmark/csrc/vision.cpp new file mode 100644 index 0000000000000000000000000000000000000000..5d5fbfb63e035dd1efd01ca3fa226c88cc1f2409 --- /dev/null +++ b/maskrcnn_benchmark/csrc/vision.cpp @@ -0,0 +1,21 @@ +// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +#include "nms.h" +#include "ROIAlign.h" +#include "ROIPool.h" +#include "SigmoidFocalLoss.h" +#include "dcn_v2.h" + + +PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { + m.def("nms", &nms, "non-maximum suppression"); + m.def("roi_align_forward", &ROIAlign_forward, "ROIAlign_forward"); + m.def("roi_align_backward", &ROIAlign_backward, "ROIAlign_backward"); + m.def("roi_pool_forward", &ROIPool_forward, "ROIPool_forward"); + m.def("roi_pool_backward", &ROIPool_backward, "ROIPool_backward"); + m.def("sigmoid_focalloss_forward", &SigmoidFocalLoss_forward, "SigmoidFocalLoss_forward"); + m.def("sigmoid_focalloss_backward", &SigmoidFocalLoss_backward, "SigmoidFocalLoss_backward"); + m.def("dcn_v2_forward", &dcn_v2_forward, "dcn_v2_forward"); + m.def("dcn_v2_backward", &dcn_v2_backward, "dcn_v2_backward"); + m.def("dcn_v2_psroi_pooling_forward", &dcn_v2_psroi_pooling_forward, "dcn_v2_psroi_pooling_forward"); + m.def("dcn_v2_psroi_pooling_backward", &dcn_v2_psroi_pooling_backward, "dcn_v2_psroi_pooling_backward"); +} diff --git a/maskrcnn_benchmark/data/README.md b/maskrcnn_benchmark/data/README.md new file mode 100644 index 0000000000000000000000000000000000000000..8ae85e0567cbe71ef1f1df4137cbf549240065d2 --- /dev/null +++ b/maskrcnn_benchmark/data/README.md @@ -0,0 +1,90 @@ +# Setting Up Datasets +This file describes how to perform training on other datasets. + +Only Pascal VOC dataset can be loaded from its original format and be outputted to Pascal style results currently. + +We expect the annotations from other datasets be converted to COCO json format, and +the output will be in COCO-style. (i.e. AP, AP50, AP75, APs, APm, APl for bbox and segm) + +## Creating Symlinks for PASCAL VOC + +We assume that your symlinked `datasets/voc/VOC` directory has the following structure: + +``` +VOC +|_ JPEGImages +| |_ .jpg +| |_ ... +| |_ .jpg +|_ Annotations +| |_ pascal_train.json (optional) +| |_ pascal_val.json (optional) +| |_ pascal_test.json (optional) +| |_ .xml +| |_ ... +| |_ .xml +|_ VOCdevkit +``` + +Create symlinks for `voc/VOC`: + +``` +cd ~/github/maskrcnn-benchmark +mkdir -p datasets/voc/VOC +ln -s /path/to/VOC /datasets/voc/VOC +``` +Example configuration files for PASCAL VOC could be found [here](https://github.com/facebookresearch/maskrcnn-benchmark/blob/master/configs/pascal_voc/). 
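If the COCO-style json files listed in the layout above are in place, a quick sanity check is to open one of them with `pycocotools` (the path below follows the symlinked layout shown above; see the next section for where to obtain the json files):

```
from pycocotools.coco import COCO

coco = COCO("datasets/voc/VOC/Annotations/pascal_train.json")
print(len(coco.getImgIds()), "images,", len(coco.getAnnIds()), "annotations")
```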
+ +### PASCAL VOC Annotations in COCO Format +To output COCO-style evaluation result, PASCAL VOC annotations in COCO json format is required and could be downloaded from [here](https://storage.googleapis.com/coco-dataset/external/PASCAL_VOC.zip) +via http://cocodataset.org/#external. + +## Creating Symlinks for Cityscapes: + +We assume that your symlinked `datasets/cityscapes` directory has the following structure: + +``` +cityscapes +|_ images +| |_ .jpg +| |_ ... +| |_ .jpg +|_ annotations +| |_ instanceonly_gtFile_train.json +| |_ ... +|_ raw + |_ gtFine + |_ ... + |_ README.md +``` + +Create symlinks for `cityscapes`: + +``` +cd ~/github/maskrcnn-benchmark +mkdir -p datasets/cityscapes +ln -s /path/to/cityscapes datasets/data/cityscapes +``` + +### Steps to convert Cityscapes Annotations to COCO Format +1. Download gtFine_trainvaltest.zip from https://www.cityscapes-dataset.com/downloads/ (login required) +2. Extract it to /path/to/gtFine_trainvaltest +``` +cityscapes +|_ gtFine_trainvaltest.zip +|_ gtFine_trainvaltest + |_ gtFine +``` +3. Run the below commands to convert the annotations + +``` +cd ~/github +git clone https://github.com/mcordts/cityscapesScripts.git +cd cityscapesScripts +cp ~/github/maskrcnn-benchmark/tools/cityscapes/instances2dict_with_polygons.py cityscapesscripts/evaluation +python setup.py install +cd ~/github/maskrcnn-benchmark +python tools/cityscapes/convert_cityscapes_to_coco.py --datadir /path/to/cityscapes --outdir /path/to/cityscapes/annotations +``` + +Example configuration files for Cityscapes could be found [here](https://github.com/facebookresearch/maskrcnn-benchmark/blob/master/configs/cityscapes/). diff --git a/maskrcnn_benchmark/data/__init__.py b/maskrcnn_benchmark/data/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..2ba1e52473f97615cc41f82aef279fff4d194527 --- /dev/null +++ b/maskrcnn_benchmark/data/__init__.py @@ -0,0 +1,2 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +from .build import make_data_loader diff --git a/maskrcnn_benchmark/data/__pycache__/__init__.cpython-37.pyc b/maskrcnn_benchmark/data/__pycache__/__init__.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..a3364ff853c4e75938313a943503d2d7dadeead1 Binary files /dev/null and b/maskrcnn_benchmark/data/__pycache__/__init__.cpython-37.pyc differ diff --git a/maskrcnn_benchmark/data/__pycache__/build.cpython-37.pyc b/maskrcnn_benchmark/data/__pycache__/build.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b84186aeb391fafd4fc23962880ff0174c230fa1 Binary files /dev/null and b/maskrcnn_benchmark/data/__pycache__/build.cpython-37.pyc differ diff --git a/maskrcnn_benchmark/data/__pycache__/collate_batch.cpython-37.pyc b/maskrcnn_benchmark/data/__pycache__/collate_batch.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..150b3a691c5813b5c1cd13cba77da8a0d47f8ebc Binary files /dev/null and b/maskrcnn_benchmark/data/__pycache__/collate_batch.cpython-37.pyc differ diff --git a/maskrcnn_benchmark/data/build.py b/maskrcnn_benchmark/data/build.py new file mode 100644 index 0000000000000000000000000000000000000000..24fbc5c1f4897b40cb13c204767315e549c18d28 --- /dev/null +++ b/maskrcnn_benchmark/data/build.py @@ -0,0 +1,176 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 
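For orientation, `build_dataset` below looks each dataset name up in the catalog and expects a factory name plus keyword arguments back. A hedged sketch of such an entry; the keys inside `args` are illustrative and mirror the `COCODataset` constructor defined later in this package:

```
# What dataset_catalog.get("coco_2014_train") is expected to return,
# as consumed by build_dataset() below:
entry = {
    "factory": "COCODataset",
    "args": {
        "root": "datasets/coco/train2014",                                  # illustrative path
        "ann_file": "datasets/coco/annotations/instances_train2014.json",  # illustrative path
    },
}
# build_dataset() then injects e.g. args["remove_images_without_annotations"]
# and args["transforms"] before instantiating the factory.
```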
+import bisect +import copy +import logging + +import torch.utils.data +from maskrcnn_benchmark.utils.comm import get_world_size +from maskrcnn_benchmark.utils.imports import import_file + +from . import datasets as D +from . import samplers + +from .collate_batch import BatchCollator +from .transforms import build_transforms + + +def build_dataset(dataset_list, transforms, dataset_catalog, is_train=True): + """ + Arguments: + dataset_list (list[str]): Contains the names of the datasets, i.e., + coco_2014_trian, coco_2014_val, etc + transforms (callable): transforms to apply to each (image, target) sample + dataset_catalog (DatasetCatalog): contains the information on how to + construct a dataset. + is_train (bool): whether to setup the dataset for training or testing + """ + if not isinstance(dataset_list, (list, tuple)): + raise RuntimeError( + "dataset_list should be a list of strings, got {}".format(dataset_list) + ) + datasets = [] + for dataset_name in dataset_list: + data = dataset_catalog.get(dataset_name) + factory = getattr(D, data["factory"]) + args = data["args"] + # for COCODataset, we want to remove images without annotations + # during training + if data["factory"] in ["COCODataset", + "WordDataset"]: + args["remove_images_without_annotations"] = is_train + if data["factory"] == "PascalVOCDataset": + args["use_difficult"] = not is_train + args["transforms"] = transforms + # make dataset from factory + dataset = factory(**args) + datasets.append(dataset) + + # for testing, return a list of datasets + if not is_train: + return datasets + + # for training, concatenate all datasets into a single one + dataset = datasets[0] + if len(datasets) > 1: + dataset = D.ConcatDataset(datasets) + + return [dataset] + + +def make_data_sampler(dataset, shuffle, distributed): + if distributed: + return samplers.DistributedSampler(dataset, shuffle=shuffle) + if shuffle: + sampler = torch.utils.data.sampler.RandomSampler(dataset) + else: + sampler = torch.utils.data.sampler.SequentialSampler(dataset) + return sampler + + +def _quantize(x, bins): + bins = copy.copy(bins) + bins = sorted(bins) + quantized = list(map(lambda y: bisect.bisect_right(bins, y), x)) + return quantized + + +def _compute_aspect_ratios(dataset): + aspect_ratios = [] + for i in range(len(dataset)): + img_info = dataset.get_img_info(i) + aspect_ratio = float(img_info["height"]) / float(img_info["width"]) + aspect_ratios.append(aspect_ratio) + return aspect_ratios + + +def make_batch_data_sampler( + dataset, sampler, aspect_grouping, images_per_batch, num_iters=None, start_iter=0 +): + if aspect_grouping: + if not isinstance(aspect_grouping, (list, tuple)): + aspect_grouping = [aspect_grouping] + aspect_ratios = _compute_aspect_ratios(dataset) + group_ids = _quantize(aspect_ratios, aspect_grouping) + batch_sampler = samplers.GroupedBatchSampler( + sampler, group_ids, images_per_batch, drop_uneven=False + ) + else: + batch_sampler = torch.utils.data.sampler.BatchSampler( + sampler, images_per_batch, drop_last=False + ) + if num_iters is not None: + batch_sampler = samplers.IterationBasedBatchSampler( + batch_sampler, num_iters, start_iter + ) + return batch_sampler + + +def make_data_loader(cfg, is_train=True, is_distributed=False, start_iter=0): + num_gpus = get_world_size() + if is_train: + images_per_batch = cfg.SOLVER.IMS_PER_BATCH + assert ( + images_per_batch % num_gpus == 0 + ), "SOLVER.IMS_PER_BATCH ({}) must be divisible by the number " + "of GPUs ({}) used.".format(images_per_batch, num_gpus) + images_per_gpu = 
images_per_batch // num_gpus + shuffle = True + num_iters = cfg.SOLVER.MAX_ITER + else: + images_per_batch = cfg.TEST.IMS_PER_BATCH + assert ( + images_per_batch % num_gpus == 0 + ), "TEST.IMS_PER_BATCH ({}) must be divisible by the number " + "of GPUs ({}) used.".format(images_per_batch, num_gpus) + images_per_gpu = images_per_batch // num_gpus + shuffle = False if not is_distributed else True + num_iters = None + start_iter = 0 + + if images_per_gpu > 1: + logger = logging.getLogger(__name__) + logger.warning( + "When using more than one image per GPU you may encounter " + "an out-of-memory (OOM) error if your GPU does not have " + "sufficient memory. If this happens, you can reduce " + "SOLVER.IMS_PER_BATCH (for training) or " + "TEST.IMS_PER_BATCH (for inference). For training, you must " + "also adjust the learning rate and schedule length according " + "to the linear scaling rule. See for example: " + "https://github.com/facebookresearch/Detectron/blob/master/configs/getting_started/tutorial_1gpu_e2e_faster_rcnn_R-50-FPN.yaml#L14" + ) + + # group images which have similar aspect ratio. In this case, we only + # group in two cases: those with width / height > 1, and the other way around, + # but the code supports more general grouping strategy + aspect_grouping = [1] if cfg.DATALOADER.ASPECT_RATIO_GROUPING else [] + + paths_catalog = import_file( + "maskrcnn_benchmark.config.paths_catalog", cfg.PATHS_CATALOG, True + ) + DatasetCatalog = paths_catalog.DatasetCatalog + dataset_list = cfg.DATASETS.TRAIN if is_train else cfg.DATASETS.TEST + + transforms = build_transforms(cfg, is_train) + datasets = build_dataset(dataset_list, transforms, DatasetCatalog, is_train) + + data_loaders = [] + for dataset in datasets: + sampler = make_data_sampler(dataset, shuffle, is_distributed) + batch_sampler = make_batch_data_sampler( + dataset, sampler, aspect_grouping, images_per_gpu, num_iters, start_iter + ) + collator = BatchCollator(cfg.DATALOADER.SIZE_DIVISIBILITY) + num_workers = cfg.DATALOADER.NUM_WORKERS + data_loader = torch.utils.data.DataLoader( + dataset, + num_workers=num_workers, + batch_sampler=batch_sampler, + collate_fn=collator, + ) + data_loaders.append(data_loader) + if is_train: + # during training, a single (possibly concatenated) data_loader is returned + assert len(data_loaders) == 1 + return data_loaders[0] + return data_loaders diff --git a/maskrcnn_benchmark/data/collate_batch.py b/maskrcnn_benchmark/data/collate_batch.py new file mode 100644 index 0000000000000000000000000000000000000000..a7f03416741cfb4c04de613f7d2c8f2050258d73 --- /dev/null +++ b/maskrcnn_benchmark/data/collate_batch.py @@ -0,0 +1,20 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +from maskrcnn_benchmark.structures.image_list import to_image_list + + +class BatchCollator(object): + """ + From a list of samples from the dataset, + returns the batched images and targets. 
+ This should be passed to the DataLoader + """ + + def __init__(self, size_divisible=0): + self.size_divisible = size_divisible + + def __call__(self, batch): + transposed_batch = list(zip(*batch)) + images = to_image_list(transposed_batch[0], self.size_divisible) + targets = transposed_batch[1] + img_ids = transposed_batch[2] + return images, targets, img_ids diff --git a/maskrcnn_benchmark/data/datasets/__init__.py b/maskrcnn_benchmark/data/datasets/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e2ab8384e78842d06b639ac631511368b93bf01a --- /dev/null +++ b/maskrcnn_benchmark/data/datasets/__init__.py @@ -0,0 +1,8 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +from .coco import COCODataset +from .voc import PascalVOCDataset +from .concat_dataset import ConcatDataset +from .word_dataset import WordDataset + +__all__ = ["COCODataset", "ConcatDataset", "PascalVOCDataset", + "WordDataset"] diff --git a/maskrcnn_benchmark/data/datasets/__pycache__/__init__.cpython-37.pyc b/maskrcnn_benchmark/data/datasets/__pycache__/__init__.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b3c932e33972edd8cd5b473b425a126458dcca33 Binary files /dev/null and b/maskrcnn_benchmark/data/datasets/__pycache__/__init__.cpython-37.pyc differ diff --git a/maskrcnn_benchmark/data/datasets/__pycache__/char_dataset.cpython-37.pyc b/maskrcnn_benchmark/data/datasets/__pycache__/char_dataset.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..9dae9b30ea4a38b29ae1c3a23511426b69f10078 Binary files /dev/null and b/maskrcnn_benchmark/data/datasets/__pycache__/char_dataset.cpython-37.pyc differ diff --git a/maskrcnn_benchmark/data/datasets/__pycache__/coco.cpython-37.pyc b/maskrcnn_benchmark/data/datasets/__pycache__/coco.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..7842be5c123e881e9dd3c6663be0e0d4e4ea6dab Binary files /dev/null and b/maskrcnn_benchmark/data/datasets/__pycache__/coco.cpython-37.pyc differ diff --git a/maskrcnn_benchmark/data/datasets/__pycache__/concat_dataset.cpython-37.pyc b/maskrcnn_benchmark/data/datasets/__pycache__/concat_dataset.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..5ff6ac314c989677abab6f6dc52748dd6900000d Binary files /dev/null and b/maskrcnn_benchmark/data/datasets/__pycache__/concat_dataset.cpython-37.pyc differ diff --git a/maskrcnn_benchmark/data/datasets/__pycache__/voc.cpython-37.pyc b/maskrcnn_benchmark/data/datasets/__pycache__/voc.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c2a07fed0e252f636ada7fcdaf9cdda93e0216d2 Binary files /dev/null and b/maskrcnn_benchmark/data/datasets/__pycache__/voc.cpython-37.pyc differ diff --git a/maskrcnn_benchmark/data/datasets/__pycache__/word_dataset.cpython-37.pyc b/maskrcnn_benchmark/data/datasets/__pycache__/word_dataset.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ff7695f750c19697b1333daaf47df21f3addd769 Binary files /dev/null and b/maskrcnn_benchmark/data/datasets/__pycache__/word_dataset.cpython-37.pyc differ diff --git a/maskrcnn_benchmark/data/datasets/coco.py b/maskrcnn_benchmark/data/datasets/coco.py new file mode 100644 index 0000000000000000000000000000000000000000..d0e42b437db2fab29d4fab59a813c932c9355516 --- /dev/null +++ b/maskrcnn_benchmark/data/datasets/coco.py @@ -0,0 +1,101 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 
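The `BatchCollator` just defined is what `make_data_loader` (further above) hands to `torch.utils.data.DataLoader` as `collate_fn`: it pads all images in a batch into a single `ImageList` whose sides are multiples of `size_divisible`. A minimal wiring sketch, with `dataset` standing in for any dataset from this package whose `__getitem__` returns an `(image, target, index)` triple:

```
import torch
from maskrcnn_benchmark.data.collate_batch import BatchCollator

collator = BatchCollator(size_divisible=32)  # matches DATALOADER.SIZE_DIVISIBILITY in the configs
loader = torch.utils.data.DataLoader(
    dataset,              # assumed: yields (image, target, index) per sample
    batch_size=2,
    collate_fn=collator,
)
images, targets, img_ids = next(iter(loader))
# `images` is an ImageList; `targets` is a tuple of BoxList objects
```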
+import torch +import torchvision + +from maskrcnn_benchmark.structures.bounding_box import BoxList +from maskrcnn_benchmark.structures.segmentation_mask import SegmentationMask +from maskrcnn_benchmark.structures.keypoint import PersonKeypoints + + +min_keypoints_per_image = 10 + + +def _count_visible_keypoints(anno): + return sum(sum(1 for v in ann["keypoints"][2::3] if v > 0) for ann in anno) + + +def _has_only_empty_bbox(anno): + return all(any(o <= 1 for o in obj["bbox"][2:]) for obj in anno) + + +def has_valid_annotation(anno): + # if it's empty, there is no annotation + if len(anno) == 0: + return False + # if all boxes have close to zero area, there is no annotation + if _has_only_empty_bbox(anno): + return False + # keypoints task have a slight different critera for considering + # if an annotation is valid + if "keypoints" not in anno[0]: + return True + # for keypoint detection tasks, only consider valid images those + # containing at least min_keypoints_per_image + if _count_visible_keypoints(anno) >= min_keypoints_per_image: + return True + return False + + +class COCODataset(torchvision.datasets.coco.CocoDetection): + def __init__( + self, ann_file, root, remove_images_without_annotations, transforms=None + ): + super(COCODataset, self).__init__(root, ann_file) + # sort indices for reproducible results + self.ids = sorted(self.ids) + + # filter images without detection annotations + if remove_images_without_annotations: + ids = [] + for img_id in self.ids: + ann_ids = self.coco.getAnnIds(imgIds=img_id, iscrowd=None) + anno = self.coco.loadAnns(ann_ids) + if has_valid_annotation(anno): + ids.append(img_id) + self.ids = ids + + self.json_category_id_to_contiguous_id = { + v: i + 1 for i, v in enumerate(self.coco.getCatIds()) + } + self.contiguous_category_id_to_json_id = { + v: k for k, v in self.json_category_id_to_contiguous_id.items() + } + self.id_to_img_map = {k: v for k, v in enumerate(self.ids)} + self.transforms = transforms + + def __getitem__(self, idx): + img, anno = super(COCODataset, self).__getitem__(idx) + + # filter crowd annotations + # TODO might be better to add an extra field + anno = [obj for obj in anno if obj["iscrowd"] == 0] + + boxes = [obj["bbox"] for obj in anno] + boxes = torch.as_tensor(boxes).reshape(-1, 4) # guard against no boxes + target = BoxList(boxes, img.size, mode="xywh").convert("xyxy") + + classes = [obj["category_id"] for obj in anno] + classes = [self.json_category_id_to_contiguous_id[c] for c in classes] + classes = torch.tensor(classes) + target.add_field("labels", classes) + + masks = [obj["segmentation"] for obj in anno] + masks = SegmentationMask(masks, img.size, mode='poly') + target.add_field("masks", masks) + + if anno and "keypoints" in anno[0]: + keypoints = [obj["keypoints"] for obj in anno] + keypoints = PersonKeypoints(keypoints, img.size) + target.add_field("keypoints", keypoints) + + target = target.clip_to_image(remove_empty=True) + + if self.transforms is not None: + img, target = self.transforms(img, target) + + return img, target, idx + + def get_img_info(self, index): + img_id = self.id_to_img_map[index] + img_data = self.coco.imgs[img_id] + return img_data diff --git a/maskrcnn_benchmark/data/datasets/concat_dataset.py b/maskrcnn_benchmark/data/datasets/concat_dataset.py new file mode 100644 index 0000000000000000000000000000000000000000..e5e087c42036f27132ca2c6e1d5252af5fee4a97 --- /dev/null +++ b/maskrcnn_benchmark/data/datasets/concat_dataset.py @@ -0,0 +1,23 @@ +# Copyright (c) Facebook, Inc. and its affiliates. 
All Rights Reserved. +import bisect + +from torch.utils.data.dataset import ConcatDataset as _ConcatDataset + + +class ConcatDataset(_ConcatDataset): + """ + Same as torch.utils.data.dataset.ConcatDataset, but exposes an extra + method for querying the sizes of the image + """ + + def get_idxs(self, idx): + dataset_idx = bisect.bisect_right(self.cumulative_sizes, idx) + if dataset_idx == 0: + sample_idx = idx + else: + sample_idx = idx - self.cumulative_sizes[dataset_idx - 1] + return dataset_idx, sample_idx + + def get_img_info(self, idx): + dataset_idx, sample_idx = self.get_idxs(idx) + return self.datasets[dataset_idx].get_img_info(sample_idx) diff --git a/maskrcnn_benchmark/data/datasets/evaluation/__init__.py b/maskrcnn_benchmark/data/datasets/evaluation/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..d5687182c7268d7b2fcc5fcbf6a35dc27341281d --- /dev/null +++ b/maskrcnn_benchmark/data/datasets/evaluation/__init__.py @@ -0,0 +1,25 @@ +from maskrcnn_benchmark.data import datasets + + +from .word import word_evaluation + + +def evaluate(dataset, predictions, output_folder, **kwargs): + """evaluate dataset using different methods based on dataset type. + Args: + dataset: Dataset object + predictions(list[BoxList]): each item in the list represents the + prediction results for one image. + output_folder: output folder, to save evaluation files or results. + **kwargs: other args. + Returns: + evaluation result + """ + args = dict( + dataset=dataset, predictions=predictions, output_folder=output_folder, **kwargs + ) + if isinstance(dataset, datasets.WordDataset): + return word_evaluation(**args) + else: + dataset_name = dataset.__class__.__name__ + raise NotImplementedError("Unsupported dataset type {}.".format(dataset_name)) diff --git a/maskrcnn_benchmark/data/datasets/evaluation/__pycache__/__init__.cpython-37.pyc b/maskrcnn_benchmark/data/datasets/evaluation/__pycache__/__init__.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c72a8f32ac3a03d8df8a710bc0b3f7debfe1ca8f Binary files /dev/null and b/maskrcnn_benchmark/data/datasets/evaluation/__pycache__/__init__.cpython-37.pyc differ diff --git a/maskrcnn_benchmark/data/datasets/evaluation/word/__init__.py b/maskrcnn_benchmark/data/datasets/evaluation/word/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..efcf8ce034944e58a34592ed22e82adaa266808b --- /dev/null +++ b/maskrcnn_benchmark/data/datasets/evaluation/word/__init__.py @@ -0,0 +1,21 @@ +from .word_eval import do_coco_evaluation +# from util import io_ + +def word_evaluation( + dataset, + predictions, + output_folder, + box_only, + iou_types, + expected_results, + expected_results_sigma_tol, +): + return do_coco_evaluation( + dataset=dataset, + predictions=predictions, + box_only=box_only, + output_folder=output_folder, + iou_types=iou_types, + expected_results=expected_results, + expected_results_sigma_tol=expected_results_sigma_tol, + ) diff --git a/maskrcnn_benchmark/data/datasets/evaluation/word/__pycache__/__init__.cpython-37.pyc b/maskrcnn_benchmark/data/datasets/evaluation/word/__pycache__/__init__.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c84d02958ed66b0ab303f0cdd523e8c0b289ea06 Binary files /dev/null and b/maskrcnn_benchmark/data/datasets/evaluation/word/__pycache__/__init__.cpython-37.pyc differ diff --git a/maskrcnn_benchmark/data/datasets/evaluation/word/__pycache__/alfashape.cpython-37.pyc 
b/maskrcnn_benchmark/data/datasets/evaluation/word/__pycache__/alfashape.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c1d3a519c736f089b17eda3404babf46f1cc061a Binary files /dev/null and b/maskrcnn_benchmark/data/datasets/evaluation/word/__pycache__/alfashape.cpython-37.pyc differ diff --git a/maskrcnn_benchmark/data/datasets/evaluation/word/__pycache__/area_of_polygon.cpython-37.pyc b/maskrcnn_benchmark/data/datasets/evaluation/word/__pycache__/area_of_polygon.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..a25b55023c40456c0f70a8d8bc9f2da077122cfb Binary files /dev/null and b/maskrcnn_benchmark/data/datasets/evaluation/word/__pycache__/area_of_polygon.cpython-37.pyc differ diff --git a/maskrcnn_benchmark/data/datasets/evaluation/word/__pycache__/io_.cpython-37.pyc b/maskrcnn_benchmark/data/datasets/evaluation/word/__pycache__/io_.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..2b05afd5505226acb78fbb81e3048366fbfde324 Binary files /dev/null and b/maskrcnn_benchmark/data/datasets/evaluation/word/__pycache__/io_.cpython-37.pyc differ diff --git a/maskrcnn_benchmark/data/datasets/evaluation/word/__pycache__/word_eval.cpython-37.pyc b/maskrcnn_benchmark/data/datasets/evaluation/word/__pycache__/word_eval.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..fa07d9c7c8c9998cbf7fc6d7d82a84447cc0a487 Binary files /dev/null and b/maskrcnn_benchmark/data/datasets/evaluation/word/__pycache__/word_eval.cpython-37.pyc differ diff --git a/maskrcnn_benchmark/data/datasets/evaluation/word/alfashape.py b/maskrcnn_benchmark/data/datasets/evaluation/word/alfashape.py new file mode 100644 index 0000000000000000000000000000000000000000..9043c54b2cc8a27a37702649c8acff865f741790 --- /dev/null +++ b/maskrcnn_benchmark/data/datasets/evaluation/word/alfashape.py @@ -0,0 +1,89 @@ +import numpy as np +from scipy.spatial import Delaunay +from .area_of_polygon import area_of_polygon_crd +import networkx as nx + +def sqrt_sum(a, b): + x = (a[0]-b[0]) + y = (a[1]-b[1]) + return np.sqrt(x*x+y*y) + +def shapeToSomePolygons(shape): + G = nx.Graph() + allnodes = set() + for line in shape: + G.add_nodes_from(line) + G.add_edge(line[0], line[1]) + allnodes.add(line[0]) + allnodes.add(line[1]) + + result = [] + + while allnodes: + node = allnodes.pop() + new_node = next(iter(G[node]), None) + if not new_node: continue + + G.remove_edge(node, new_node) + temp = nx.shortest_path(G, node, new_node) + for j,t in enumerate(temp): + if t in allnodes: + allnodes.remove(t) + result.append(temp) + return result + +def getAlfaShapes(pts,alfas=1): + tri_ind = [(0,1),(1,2),(2,0)] + tri = Delaunay(pts) + lenghts={} + for s in tri.simplices: + for ind in tri_ind: + a = pts[s[ind[0]]] + b = pts[s[ind[1]]] + # print('a---', a) + # print('b---', b) + line = (a, b) + # line = ((a[0], a[1]), (b[0], b[1])) + lenghts[line] = sqrt_sum(a, b) + + ls = sorted(lenghts.values()) + + mean_length = np.mean(ls) + mean_length_index = ls.index(next(filter(lambda x: x>=mean_length, ls))) + magic_numbers = [ls[i] for i in range(mean_length_index, len(ls))] + magic_numbers[0] = 0 + sum_magic = np.sum(magic_numbers) + for i in range(2, len(magic_numbers)): + magic_numbers[i] += magic_numbers[i-1] + magic_numbers = [m /sum_magic for m in magic_numbers] + + rez = [] + for alfa in alfas: + i = magic_numbers.index(next(filter(lambda z: z > alfa, magic_numbers), magic_numbers[-1])) + av_length = ls[mean_length_index+i] + + lines 
= {} + + for s in tri.simplices: + used = True + for ind in tri_ind: + if lenghts[(pts[s[ind[0]]], pts[s[ind[1]]])] > av_length: + used = False + break + if used == False: continue + + for ind in tri_ind: + i,j= s[ind[0]],s[ind[1]] + line = (pts[min(i,j)], pts[max(i,j)]) + lines[line] = line in lines + + good_lines = [] + for v in lines: + if not lines[v]: + good_lines.append(v) + + result = shapeToSomePolygons(good_lines) + result.sort(key=area_of_polygon_crd, reverse=True) + rez.append(result) + return rez + diff --git a/maskrcnn_benchmark/data/datasets/evaluation/word/area_of_polygon.py b/maskrcnn_benchmark/data/datasets/evaluation/word/area_of_polygon.py new file mode 100644 index 0000000000000000000000000000000000000000..73694a0f91b56b9bff08bfea02e89c8d106624ae --- /dev/null +++ b/maskrcnn_benchmark/data/datasets/evaluation/word/area_of_polygon.py @@ -0,0 +1,38 @@ +import numpy as np + +def area_of_polygon_xy(x, y): + """Calculates the area of an arbitrary polygon given its verticies""" + area = 0.0 + for i in range(-1, len(x)-1): + area += x[i] * (y[i+1] - y[i-1]) + return abs(area) / 2.0 + +def area_of_polygon_crd(cordinates): + """Calculates the area of an arbitrary polygon given its verticies""" + x = [v[0] for v in cordinates] + y = [v[1] for v in cordinates] + return area_of_polygon_xy(x,y) + +def area_of_polygon(**kwargs): + if 'x' in kwargs and 'y' in kwargs: + x = kwargs['x'] + y = kwargs['y'] + return area_of_polygon_xy(x, y) + + if 'coordinates' in kwargs: + cordinates = kwargs['coordinates'] + return area_of_polygon_crd(cordinates) + + print("Wrong parameters") + return None + +def length_of_way(cordinates): + """Length of the way""" + if len(cordinates)<2: + return 0 + leng = 0 + for i in range(1,len(cordinates)): + crd = cordinates + dist = distance(crd[i-1],crd[i-1]) + leng = leng + dist + return leng \ No newline at end of file diff --git a/maskrcnn_benchmark/data/datasets/evaluation/word/io_.py b/maskrcnn_benchmark/data/datasets/evaluation/word/io_.py new file mode 100644 index 0000000000000000000000000000000000000000..0976223422731574789f5ed7fc30c167a2db03fc --- /dev/null +++ b/maskrcnn_benchmark/data/datasets/evaluation/word/io_.py @@ -0,0 +1,216 @@ +#coding=utf-8 +''' +Created on 2016年9月27日 + +@author: dengdan + +Tool functions for file system operation and I/O. +In the style of linux shell commands +''' +import os +import pickle as pkl +# import commands +import logging + +# import util + +def mkdir(path): + """ + If the target directory does not exists, it and its parent directories will created. + """ + path = get_absolute_path(path) + if not exists(path): + os.makedirs(path) + return path + +def make_parent_dir(path): + """make the parent directories for a file.""" + parent_dir = get_dir(path) + mkdir(parent_dir) + + +def pwd(): + return os.getcwd() + +def dump(path, obj): + path = get_absolute_path(path) + parent_path = get_dir(path) + mkdir(parent_path) + with open(path, 'w') as f: + logging.info('dumping file:' + path); + pkl.dump(obj, f) + +def load(path): + path = get_absolute_path(path) + with open(path, 'r') as f: + data = pkl.load(f) + return data + +def join_path(a, *p): + return os.path.join(a, *p) + +def is_dir(path): + path = get_absolute_path(path) + return os.path.isdir(path) + + +def is_path(path): + path = get_absolute_path(path) + return os.path.ispath(path) + +def get_dir(path): + ''' + return the directory it belongs to. 
+ if path is a directory itself, itself will be return + ''' + path = get_absolute_path(path) + if is_dir(path): + return path; + return os.path.split(path)[0] + +def get_filename(path): + return os.path.split(path)[1] + +def get_absolute_path(p): + if p.startswith('~'): + p = os.path.expanduser(p) + return os.path.abspath(p) + +def cd(p): + p = get_absolute_path(p) + os.chdir(p) + +# def ls(path = '.', suffix = None): +# """ +# list files in a directory. +# return file names in a list +# """ +# path = get_absolute_path(path) +# files = os.listdir(path) +# +# if suffix is None: +# return files +# +# filtered = [] +# for f in files: +# if util.str.ends_with(f, suffix, ignore_case = True): +# filtered.append(f) +# +# return filtered + +def find_files(pattern): + import glob + return glob.glob(pattern) + +def read_lines(p): + """return the text in a file in lines as a list """ + p = get_absolute_path(p) + f = open(p,'r') + return f.readlines() + +def write_lines(p, lines): + p = get_absolute_path(p) + make_parent_dir(p) + with open(p, 'w') as f: + for line in lines: + f.write(line) + + +# def cat(p): +# """return the text in a file as a whole""" +# cmd = 'cat ' + p +# return commands.getoutput(cmd) + +def exists(path): + path = get_absolute_path(path) + return os.path.exists(path) + +def load_mat(path): + import scipy.io as sio + path = get_absolute_path(path) + return sio.loadmat(path) + +def dump_mat(path, dict_obj, append = True): + import scipy.io as sio + path = get_absolute_path(path) + make_parent_dir(path) + sio.savemat(file_name = path, mdict = dict_obj, appendmat = append) + +def dir_mat(path): + ''' + list the variables in mat file. + return a list: [(name, shape, dtype), ...] + ''' + import scipy.io as sio + path = get_absolute_path(path) + return sio.whosmat(path) + +SIZE_UNIT_K = 1024 +SIZE_UNIT_M = SIZE_UNIT_K ** 2 +SIZE_UNIT_G = SIZE_UNIT_K ** 3 +def get_file_size(path, unit = SIZE_UNIT_K): + size = os.path.getsize(get_absolute_path(path)) + return size * 1.0 / unit + + +def create_h5(path): + import h5py + path = get_absolute_path(path) + make_parent_dir(path) + return h5py.File(path, 'w'); + +def open_h5(path, mode = 'r'): + import h5py + path = get_absolute_path(path) + return h5py.File(path, mode); + +def read_h5(h5, key): + return h5[key][:] +def read_h5_attrs(h5, key, attrs): + return h5[key].attrs[attrs] + +def copy(src, dest): + import shutil + shutil.copy(get_absolute_path(src), get_absolute_path(dest)) + +cp = copy + +def remove(p): + import os + os.remove(get_absolute_path(p)) +rm = remove + +# def search(pattern, path, file_only = True): +# """ +# Search files whose name matches the give pattern. The search scope +# is the directory and sub-directories of 'path'. 
+# """ +# path = get_absolute_path(path) +# pattern_here = util.io.join_path(path, pattern) +# targets = [] +# +# # find matchings in current directory +# candidates = find_files(pattern_here) +# for can in candidates: +# if util.io.is_dir(can) and file_only: +# continue +# else: +# targets.append(can) +# +# # find matching in sub-dirs +# files = ls(path) +# for f in files: +# fpath = util.io.join_path(path, f) +# if is_dir(fpath): +# targets_in_sub_dir = search(pattern, fpath, file_only) +# targets.extend(targets_in_sub_dir) +# return targets + +def dump_json(path, data): + import json + path = get_absolute_path(path) + make_parent_dir(path) + + with open(path, 'w') as f: + json.dump(data, f) + return path \ No newline at end of file diff --git a/maskrcnn_benchmark/data/datasets/evaluation/word/util/__init__.py b/maskrcnn_benchmark/data/datasets/evaluation/word/util/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..89b681378328a3f30ec9de2d9743de066d5c7632 --- /dev/null +++ b/maskrcnn_benchmark/data/datasets/evaluation/word/util/__init__.py @@ -0,0 +1,62 @@ +# import log +# import dtype +# # import plt +# import np +# import img +# _img = img +# import dec +# import rand +# import mod +# import proc +# import test +# import neighbour as nb +# #import mask +# import str_ as str +# import io as sys_io +# import io_ as io +# import feature +# import thread_ as thread +# import caffe_ as caffe +# # import tf +# import cmd +# import ml +# import sys +# import url +# from .misc import * +# from .logger import * +# # log.init_logger('~/temp/log/log_' + get_date_str() + '.log') +# +# def exit(code = 0): +# sys.exit(0) +# +# is_main = mod.is_main +# init_logger = log.init_logger +# +# def sit(img, path = None, name = ""): +# if path is None: +# _count = get_count(); +# path = '~/temp/no-use/images/%s_%d_%s.jpg'%(log.get_date_str(), _count, name) +# +# if type(img) == list: +# plt.show_images(images = img, path = path, show = False, axis_off = True, save = True) +# else: +# plt.imwrite(path, img) +# +# return path +# _count = 0; +# +# def get_count(): +# global _count; +# _count += 1; +# return _count +# +# def cit(img, path = None, rgb = True, name = ""): +# _count = get_count(); +# if path is None: +# img = np.np.asarray(img, dtype = np.np.uint8) +# path = '~/temp/no-use/%s_%d_%s.jpg'%(log.get_date_str(), _count, name) +# _img.imwrite(path, img, rgb = rgb) +# return path +# +# def argv(index): +# return sys.argv[index] diff --git a/maskrcnn_benchmark/data/datasets/evaluation/word/util/__pycache__/__init__.cpython-37.pyc b/maskrcnn_benchmark/data/datasets/evaluation/word/util/__pycache__/__init__.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..7e881f4c7d31e4210c752fe810650653d798f370 Binary files /dev/null and b/maskrcnn_benchmark/data/datasets/evaluation/word/util/__pycache__/__init__.cpython-37.pyc differ diff --git a/maskrcnn_benchmark/data/datasets/evaluation/word/util/__pycache__/io_.cpython-37.pyc b/maskrcnn_benchmark/data/datasets/evaluation/word/util/__pycache__/io_.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..84ab18e8447d7e9f4d154922c07cc4cdc0dd7531 Binary files /dev/null and b/maskrcnn_benchmark/data/datasets/evaluation/word/util/__pycache__/io_.cpython-37.pyc differ diff --git a/maskrcnn_benchmark/data/datasets/evaluation/word/util/caffe_.py b/maskrcnn_benchmark/data/datasets/evaluation/word/util/caffe_.py new file mode 100644 index 
0000000000000000000000000000000000000000..cc34c8368e81b687d949eaca7bdcc46dede5e561 --- /dev/null +++ b/maskrcnn_benchmark/data/datasets/evaluation/word/util/caffe_.py @@ -0,0 +1,70 @@ +# encoding=utf-8 + +import util +def get_data(net, name): + import caffe + if isinstance(net, caffe._caffe.Solver): + net = net.net + return net.blobs[name].data[...] + +def get_params(net, name = None): + import caffe + if isinstance(net, caffe._caffe.Solver): + net = net.net + params = net.params[name] + p = [] + for param in params: + p.append(param.data[...]) + return p + +def draw_log(log_path, output_names, show = False, save_path = None, from_to = None, smooth = False): + pattern = "Train net output: word_bbox_loc_loss = " + log_path = util.io.get_absolute_path(log_path) + f = open(log_path,'r') + iterations = [] + outputs = {} + plt = util.plt.plt + for line in f.readlines(): + if util.str.contains(line, 'Iteration') and util.str.contains(line, 'loss = '): + print line + s = line.split('Iteration')[-1] + iter_num = util.str.find_all(s, '\d+')[0] + iter_num = int(iter_num) + iterations.append(iter_num) + + if util.str.contains(line, "Train net output #"): + s = util.str.split(line, 'Train net output #\d+\:')[-1] + s = s.split('(')[0] + output = util.str.find_all(s, '\d*\.*\d+e*\-*\d*\.*\d*')[-1] + output = eval(output) + output = float(output) + for name in output_names: + ptr = ' '+ name + ' =' + if util.str.contains(line, ptr): + if name not in outputs: + outputs[name] = [] + print line + print '\t', iter_num, name, output + outputs[name].append(output) + if len(outputs)==0: + print 'No output named:', output_names + return + for name in outputs: + output = outputs[name] + if smooth: + output = util.np.smooth(output) + start = 0 + end = len(output) + + if from_to is not None: + start = from_to[0] + end = from_to[1] + line_style = util.plt.get_random_line_style() + plt.plot(iterations[start: end], output[start: end], line_style, label = name) + + plt.legend() + + if save_path is not None: + util.plt.save_image(save_path) + if show: + util.plt.show() diff --git a/maskrcnn_benchmark/data/datasets/evaluation/word/util/caffe_.py~ b/maskrcnn_benchmark/data/datasets/evaluation/word/util/caffe_.py~ new file mode 100644 index 0000000000000000000000000000000000000000..fced85251199da6de7b51392a65b3c75794995d7 --- /dev/null +++ b/maskrcnn_benchmark/data/datasets/evaluation/word/util/caffe_.py~ @@ -0,0 +1,72 @@ +# encoding=utf-8 + +import util +def get_data(net, name): + import caffe + if isinstance(net, caffe._caffe.Solver): + net = net.net + return net.blobs[name].data[...] 
+ +def get_params(net, name = None): + import caffe + if isinstance(net, caffe._caffe.Solver): + net = net.net + params = net.params[name] + p = [] + for param in params: + p.append(param.data[...]) + return p + +def draw_log(log_path, output_names, show = False, save_path = None, from_to = None, smooth = False): + pattern = "Train net output: word_bbox_loc_loss = " + log_path = util.io.get_absolute_path(log_path) + f = open(log_path,'r') + iterations = [] + outputs = {} + plt = util.plt.plt + for line in f.readlines(): + if util.str.contains(line, 'Iteration') and util.str.contains(line, 'loss = '): + print line + s = line.split('Iteration')[-1] + iter_num = util.str.find_all(s, '\d+')[0] + iter_num = int(iter_num) + iterations.append(iter_num) + + if util.str.contains(line, "Train net output #"): + s = util.str.split(line, 'Train net output #\d+\:')[-1] + s = s.split('(')[0] + output = util.str.find_all(s, '\d*\.*\d+e*\-*\d*\.*\d*')[-1] + output = eval(output) + output = float(output) + for name in output_names: + ptr = ' '+ name + ' =' + if util.str.contains(line, ptr): + if name not in outputs: + outputs[name] = [] + print line + print '\t', iter_num, name, output + outputs[name].append(output) + if len(outputs)==0: + print 'No output named:', output_names + return + for name in outputs: + output = outputs[name] + if smooth: + output = util.np.smooth(output) + start = 0 + end = len(output) + + import pdb + pdb.set_trace() + if from_to is not None: + start = from_to[0] + end = from_to[1] + line_style = util.plt.get_random_line_style() + plt.plot(iterations[start: end], output[start: end], line_style, label = name) + + plt.legend() + + if save_path is not None: + util.plt.save_image(save_path) + if show: + util.plt.show() diff --git a/maskrcnn_benchmark/data/datasets/evaluation/word/util/cmd.py b/maskrcnn_benchmark/data/datasets/evaluation/word/util/cmd.py new file mode 100644 index 0000000000000000000000000000000000000000..0003c2805772bd9f68c705c8f759e4a76e5b2ca8 --- /dev/null +++ b/maskrcnn_benchmark/data/datasets/evaluation/word/util/cmd.py @@ -0,0 +1,6 @@ +#encoding = utf-8 + +def cmd(cmd): + import commands + return commands.getoutput(cmd) + diff --git a/maskrcnn_benchmark/data/datasets/evaluation/word/util/dec.py b/maskrcnn_benchmark/data/datasets/evaluation/word/util/dec.py new file mode 100644 index 0000000000000000000000000000000000000000..dd80e90be1c610d2c46bc8b8b02fd6070d94ee6d --- /dev/null +++ b/maskrcnn_benchmark/data/datasets/evaluation/word/util/dec.py @@ -0,0 +1,78 @@ +#encoding=utf-8 +import logging +import time +def print_calling(fn): + def wrapper(*args1, ** args2): + s = "calling function %s"%(fn.__name__) + logging.info(s) + start = time.time() + ret = fn(*args1, **args2) + end = time.time() +# s = "%s. 
time used = %f seconds"%(s, (end - start)) + s = "function [%s] has been called, taking %f seconds"%(fn.__name__, (end - start)) + logging.debug(s) + return ret + return wrapper + + +def print_test(fn): + def wrapper(*args1, ** args2): + s = "running test: %s..."%(fn.__name__) + logging.info(s) + ret = fn(*args1, **args2) + s = "running test: %s...succeed"%(fn.__name__) + logging.debug(s) + return ret + return wrapper + +def print_calling_in_short(fn): + def wrapper(*args1, ** args2): + start = time.time() + ret = fn(*args1, **args2) + end = time.time() + s = "function [%s] has been called, taking %f seconds"%(fn.__name__, (end - start)) + logging.debug(s) + return ret + return wrapper + +import collections +counter = collections.defaultdict(int) +count_times =collections.defaultdict(int) +def print_calling_in_short_for_tf(fn): + import tensorflow as tf + import util + def wrapper(*args1, ** args2): + start = time.time() + thread_name = util.thread.get_current_thread_name() + ret = fn(*args1, **args2) + end = time.time() + counter[fn.__name__] = counter[fn.__name__] + (end - start) + count_times[fn.__name__] += 1 + all_time = sum([counter[name] for name in counter]) * 1.0 + for name in counter: +# tf.logging.info('\t %s: %f, %f seconds'%(name, counter[name] / all_time, counter[name])) + tf.logging.info('\t %s: %d callings, %fsper calling'%(name, count_times[name], counter[name] * 1.0 / count_times[name])) + s = "Thread [%s]:function [%s] has been called, taking %f seconds"%(thread_name, fn.__name__, (end - start)) + tf.logging.info(s) + return ret + return wrapper + +def timeit(fn): + import util + def wrapper(*args1, ** args2): + start = time.time() + thread_name = util.thread.get_current_thread_name() + ret = fn(*args1, **args2) + end = time.time() + counter[fn.__name__] = counter[fn.__name__] + (end - start) + count_times[fn.__name__] += 1 + all_time = sum([counter[name] for name in counter]) * 1.0 + for name in counter: + logging.info('\t %s: %f, %f seconds'%(name, counter[name] / all_time, counter[name])) + logging.info('\t %s: %d callings, %f seconds per calling'%(name, count_times[name], counter[name] * 1.0 / count_times[name])) + s = "Thread [%s]:function [%s] has been called, taking %f seconds"%(thread_name, fn.__name__, (end - start)) +# logging.info(s) + return ret + return wrapper + + diff --git a/maskrcnn_benchmark/data/datasets/evaluation/word/util/dtype.py b/maskrcnn_benchmark/data/datasets/evaluation/word/util/dtype.py new file mode 100644 index 0000000000000000000000000000000000000000..baedb192be4bbddd52bc0105a344a0484c890fe1 --- /dev/null +++ b/maskrcnn_benchmark/data/datasets/evaluation/word/util/dtype.py @@ -0,0 +1,39 @@ +#coding=utf-8 +''' +Created on 2016年9月27日 +@author: dengdan +''' +import numpy as np + +float32 = 'float32' +floatX = float32 +int32 = 'int32' +uint8 = 'uint8' +string = 'str' + +def cast(obj, dtype): + if isinstance(obj, list): + return np.asarray(obj, dtype = floatX) + return np.cast[dtype](obj) + +def int(obj): + return cast(obj, 'int') + +def double(obj): + return cast(obj, 'double') + +def is_number(obj): + try: + obj + 1 + except: + return False + return True + +def is_str(s): + return type(s) == str + +def is_list(s): + return type(s) == list + +def is_tuple(s): + return type(s) == tuple diff --git a/maskrcnn_benchmark/data/datasets/evaluation/word/util/event.py b/maskrcnn_benchmark/data/datasets/evaluation/word/util/event.py new file mode 100644 index 0000000000000000000000000000000000000000..5612e818f66fa1fa633b8995b97701700c560b62 --- 
/dev/null +++ b/maskrcnn_benchmark/data/datasets/evaluation/word/util/event.py @@ -0,0 +1,12 @@ +import cv2 +import logging +def wait_key(target = None): + key = cv2.waitKey()& 0xFF + if target == None: + return key + if type(target) == str: + target = ord(target) + while key != target: + key = cv2.waitKey()& 0xFF + + logging.debug('Key Pression caught:%s'%(target)) diff --git a/maskrcnn_benchmark/data/datasets/evaluation/word/util/feature.py b/maskrcnn_benchmark/data/datasets/evaluation/word/util/feature.py new file mode 100644 index 0000000000000000000000000000000000000000..6dd24e0a24459b16e6032bf33d013a1654fc9f41 --- /dev/null +++ b/maskrcnn_benchmark/data/datasets/evaluation/word/util/feature.py @@ -0,0 +1,14 @@ +# encoding utf-8 +def hog(img, bins =9, pixels_per_cell=(8, 8), cells_per_block=(2, 2), transform_sqrt=False, feature_vector=True): + """ + Extract hog feature from image. + See detail at https://github.com/scikit-image/scikit-image/blob/master/skimage/feature/_hog.py + """ + from skimage.feature import hog + return hog(img, + orientations = bins, + pixels_per_cell = pixels_per_cell, + cells_per_block = cells_per_block, + visualise = False, + transform_sqrt=False, + feature_vector=True) diff --git a/maskrcnn_benchmark/data/datasets/evaluation/word/util/img.py b/maskrcnn_benchmark/data/datasets/evaluation/word/util/img.py new file mode 100644 index 0000000000000000000000000000000000000000..59db386776210cad4abe6dd85b0e0f8821f06a3e --- /dev/null +++ b/maskrcnn_benchmark/data/datasets/evaluation/word/util/img.py @@ -0,0 +1,521 @@ +#coding=utf-8 +''' +@author: dengdan +''' +import cv2 +import numpy as np +import logging +import math +import event +import util + +IMREAD_GRAY = 0 +IMREAD_COLOR = 1 +IMREAD_UNCHANGED = -1 + + + +COLOR_WHITE =(255, 255, 255) +COLOR_BLACK = (0, 0, 0) +COLOR_GREEN = (0, 255, 0) + +COLOR_RGB_RED = (255, 0, 0) +COLOR_BGR_RED = (0, 0, 255) + +COLOR_RGB_BLUE = (0, 0, 255) +COLOR_BGR_BLUE = (255, 0, 0) + +COLOR_RGB_YELLOW = (255, 255, 0) +COLOR_BGR_YELLOW = (0, 255, 255) + + +COLOR_RGB_GRAY = (47, 79, 79) + +COLOR_RGB_PINK = (255, 192, 203) +def imread(path, rgb = False, mode = cv2.IMREAD_COLOR): + path = util.io.get_absolute_path(path) + img = cv2.imread(path, mode) + if img is None: + raise IOError('File not found:%s'%(path)) + + if rgb: + img = bgr2rgb(img) + return img + +def imshow(winname, img, block = True, position = None, maximized = False, rgb = False): + if isinstance(img, str): + img = imread(path = img) + + cv2.namedWindow(winname, cv2.WINDOW_NORMAL) + if rgb: + img = rgb2bgr(img) + cv2.imshow(winname, img) + if position is not None: +# cv2.moveWindow(winname, position[0], position[1]) + move_win(winname, position) + + if maximized: + maximize_win(winname) + + + if block: +# cv2.waitKey(0) + event.wait_key(" ") + cv2.destroyAllWindows() + + +def imwrite(path, img, rgb = False): + if rgb: + img = rgb2bgr(img) + path = util.io.get_absolute_path(path) + util.io.make_parent_dir(path) + cv2.imwrite(path, img) + +def move_win(winname, position = (0, 0)): + """ + move pyplot window + """ + cv2.moveWindow(winname, position[0], position[1]) + +def maximize_win(winname): + cv2.setWindowProperty(winname, cv2.WND_PROP_FULLSCREEN, True); + +def eq_color(target, color): + for i, c in enumerate(color): + if target[i] != color[i]: + return False + return True + +def is_white(color): + for c in color: + if c < 255: + return False + return True + +def black(shape): + if len(np.shape(shape)) >= 2: + shape = get_shape(shape) + shape = [int(v) for v in shape] + return 
np.zeros(shape, np.uint8) + +def white(shape, value = 255): + if len(np.shape(shape)) >= 2: + shape = get_shape(shape) + return np.ones(shape, np.uint8) * np.uint8(value) + +def bgr2rgb(img): + return cv2.cvtColor(img, cv2.COLOR_BGR2RGB) + +def rgb2bgr(img): + return cv2.cvtColor(img, cv2.COLOR_RGB2BGR) + + +def rgb2gray(img): + return cv2.cvtColor(img, cv2.COLOR_RGB2GRAY) + +def bgr2gray(img): + return cv2.cvtColor(img, cv2.COLOR_BGR2GRAY) + + +def ds_size(image_size, kernel_size, stride): + """calculate the size of downsampling result""" + image_x, image_y = image_size + + + kernel_x, kernel_y = kernel_size + stride_x, stride_y = stride + + def f(iw, kw, sw): + return int(np.floor((iw - kw) / sw) + 1) + + output_size = (f(image_x, kernel_x, stride_x), f(image_y, kernel_y, stride_y)) + return output_size + + + +def get_roi(img, p1, p2): + """ + extract region of interest from an image. + p1, p2: two tuples standing for two opposite corners of the rectangle bounding the roi. + Their order is arbitrary. + """ + x1, y1 = p1 + x2, y2 = p2 + + x_min = min([x1, x2]) + y_min = min([y1, y2]) + x_max = max([x1, x2]) + 1 + y_max = max([y1, y2]) + 1 + + return img[y_min: y_max, x_min: x_max] + +def rectangle(img, left_up, right_bottom, color, border_width = 1): + left_up = (int(left_up[0]), int(left_up[1])) + right_bottom = (int(right_bottom[0]), int(right_bottom[1])) + cv2.rectangle(img, left_up, right_bottom, color, border_width) + + +def circle(img, center, r, color, border_width = 1): + center = (int(center[0]), int(center[1])) + cv2.circle(img, center, r, color, border_width) + +def render_points(img, points, color): + for p in points: + x, y = p + img[y][x] = color + + +def draw_contours(img, contours, idx = -1, color = 1, border_width = 1): +# img = img.copy() + cv2.drawContours(img, contours, idx, color, border_width) + return img + +def get_contour_rect_box(contour): + x,y,w,h = cv2.boundingRect(contour) + return x, y, w, h + +def get_contour_region_in_rect(img, contour): + x, y, w, h = get_contour_rect_box(contour) + lu, rb = (x, y), (x + w, y + h) + return get_roi(img, lu, rb) + +def get_contour_min_area_box(contour): + rect = cv2.minAreaRect(contour) + box = cv2.cv.BoxPoints(rect) + box = np.int0(box) + return box + +def get_contour_region_in_min_area_rect(img, cnt): + # find the min area rect of contour + rect = cv2.minAreaRect(cnt) + angle = rect[-1] + box = cv2.cv.BoxPoints(rect) + box_cnt = points_to_contour(box) + + # find the rectangle containing box_cnt, and set it as ROI + outer_rect = get_contour_rect_box(box_cnt) + x, y, w, h = outer_rect + img = get_roi(img, (x, y), (x + w, y + h)) + box = [(ox - x, oy - y) for (ox, oy) in box] + + # rotate ROI and corner points + rows, cols = get_shape(img) + M = cv2.getRotationMatrix2D((cols/2,rows/2), angle, scale = 1) + dst = cv2.warpAffine(img,M,(cols,rows)) + bar_xy = np.hstack((box, np.ones((4, 1)))) + new_corners = np.dot(M, np.transpose(bar_xy)) + new_corners = util.dtype.int(np.transpose(new_corners)) +# cnt = points_to_contour(new_corners) + + xs = new_corners[:, 0] + ys = new_corners[:, 1] + lu = (min(xs), min(ys)) + rb = (max(xs), max(ys)) + return get_roi(dst, lu, rb) + + +def contour_to_points(contour): + return np.asarray([c[0] for c in contour]) + + +def points_to_contour(points): + contours = [[list(p)]for p in points] + return np.asarray(contours, dtype = np.int32) + +def points_to_contours(points): + return np.asarray([points_to_contour(points)]) + +def get_contour_region_iou(I, cnt1, cnt2): + """ + calculate the iou of two 
contours + """ + mask1 = util.img.black(I) + draw_contours(mask1, [cnt1], color = 1, border_width = -1) + + mask2 = util.img.black(I) + draw_contours(mask2, [cnt2], color = 1, border_width = -1) + + union_mask = ((mask1 + mask2) >=1) * 1 + intersect_mask = (mask1 * mask2 >= 1) * 1 + + return np.sum(intersect_mask) * 1.0 / np.sum(union_mask) + + +def fill_bbox(img, box, color = 1): + """ + filling a bounding box with color. + box: a list of 4 points, in clockwise order, as the four vertice of a bounding box + """ + util.test.assert_equal(np.shape(box), (4, 2)) + cnt = to_contours(box) + draw_contours(img, cnt, color = color, border_width = -1) + +def get_rect_points(left_up, right_bottom): + """ + given the left up and right bottom points of a rectangle, return its four points + """ + right_bottom, left_up = np.asarray(right_bottom), np.asarray(left_up) + w, h = right_bottom - left_up + x, y = left_up + points = [(x, y), (x + w, y), (x + w, y + h), (x, y + h)] + return points + +def rect_perimeter(left_up, right_bottom): + """ + calculate the perimeter of the rectangle described by its left-up and right-bottom point. + """ + return sum(np.asarray(right_bottom) - np.asarray(left_up)) * 2 + +def rect_area(left_up, right_bottom): + wh = np.asarray(right_bottom) - np.asarray(left_up) + 1 + return np.prod(wh) + +def apply_mask(img, mask): + """ + the img will be masked in place. + """ + c = np.shape(img)[-1] + for i in range(c): + img[:, :, i] = img[:, :, i] * mask + return img + +def get_shape(img): + """ + return the height and width of an image + """ + return np.shape(img)[0:2] + +def get_wh(img): + return np.shape(img)[0:2][::-1] + +def get_value(img, x, y = None): + if y == None: + y = x[1] + x = x[0] + + return img[y][x] + +def set_value(img, xy, val): + x, y = xy + img[y][x] = val + + +def filter2D(img, kernel): + dst = cv2.filter2D(img, -1, kernel) + return dst + +def average_blur(img, shape = (5, 5)): + return cv2.blur(img, shape) + +def gaussian_blur(img, shape = (5, 5), sigma = 0): + # sigma --> sigmaX, sigmaY + blur = cv2.GaussianBlur(img,shape, sigma) + return blur + +def bilateral_blur(img, d = 9, sigmaColor = 75, sigmaSpace = 75): + dst = cv2.bilateralFilter(img, d, sigmaColor, sigmaSpace) + return dst + +BLUR_AVERAGE = 'average' +BLUR_GAUSSIAN = 'gaussian' +BLUR_BILATERAL = 'bilateral' + + +_blur_dict = { + BLUR_AVERAGE: average_blur, + BLUR_GAUSSIAN: gaussian_blur, + BLUR_BILATERAL: bilateral_blur +} + +def blur(img, blur_type): + fn = _blur_dict[blur_type] + return fn(img) + +def put_text(img, text, pos, scale = 1, color = COLOR_WHITE, thickness = 1): + pos = np.int32(pos) + font = cv2.FONT_HERSHEY_SIMPLEX + cv2.putText(img = img, text = text, org = tuple(pos), fontFace = font, fontScale = scale, color = color, thickness = thickness) + +def resize(img, f = None, fx = None, fy = None, size = None, interpolation = cv2.INTER_LINEAR): + """ + size: (w, h) + """ + h, w = get_shape(img) + if fx != None and fy != None: + return cv2.resize(img, None, fx = fx, fy = fy, interpolation = interpolation) + + if size != None: + size = util.dtype.int(size) +# size = (size[1], size[0]) + size = tuple(size) + return cv2.resize(img, size, interpolation = interpolation) + + return cv2.resize(img, None, fx = f, fy = f, interpolation = interpolation) + +def translate(img, delta_x, delta_y, size = None): + M = np.float32([[1,0, delta_x],[0,1, delta_y]]) + if size == None: + size = get_wh(img) + + dst = cv2.warpAffine(img,M, size) + return dst + + +def rotate_about_center(src, angle, scale=1.): + 
"""https://www.oschina.net/translate/opencv-rotation""" + w = src.shape[1] + h = src.shape[0] + rangle = np.deg2rad(angle) # angle in radians + # now calculate new image width and height + nw = (abs(np.sin(rangle)*h) + abs(np.cos(rangle)*w))*scale + nh = (abs(np.cos(rangle)*h) + abs(np.sin(rangle)*w))*scale + # ask OpenCV for the rotation matrix + rot_mat = cv2.getRotationMatrix2D((nw*0.5, nh*0.5), angle, scale) + # calculate the move from the old center to the new center combined + # with the rotation + rot_move = np.dot(rot_mat, np.array([(nw-w)*0.5, (nh-h)*0.5,0])) + # the move only affects the translation, so update the translation + # part of the transform + rot_mat[0,2] += rot_move[0] + rot_mat[1,2] += rot_move[1] + return cv2.warpAffine(src, rot_mat, (int(math.ceil(nw)), int(math.ceil(nh))), flags=cv2.INTER_LANCZOS4), rot_mat + + +def get_rect_iou(rects1, rects2): + """ + calculate the iou between rects1 and rects2 + each rect consists of four points:[min_x, min_y, max_x, max_y] + return: a iou matrix, len(rects1) * len(rects2) + """ + rects1, rects2 = np.asarray(rects1), np.asarray(rects2) + + def _to_matrix(p, ps): + p = np.ones((len(ps), 1)) * p + ps = np.reshape(ps, (len(ps), 1)) + temp =np.hstack([p, ps]) + return temp + + def _get_max(p, ps): + return np.max(_to_matrix(p, ps), axis = 1) + + def _get_min(p, ps): + return np.min(_to_matrix(p, ps), axis = 1) + + + def _get_area(rect): + w, h = rect[:, 2] - rect[:, 0] + 1.0 , rect[:, 3] - rect[:, 1] + 1.0 + return w * h + + def _get_inter(rect1, rects2): + x1 = _get_max(rect1[0], rects2[:, 0]) + y1 = _get_max(rect1[1], rects2[:, 1]) + + x2 = _get_min(rect1[2], rects2[:, 2]) + y2 = _get_min(rect1[3], rects2[:, 3]) + + w,h = x2-x1 +1, y2 - y1 + 1 + areas = w * h + areas[np.where(w < 0)] = 0 + areas[np.where(h < 0)] = 0 + return areas + + area2 = _get_area(rects2) + area1 = _get_area(rects1) + iou = np.zeros((len(rects1), len(rects2))) + for ri in range(len(rects1)): + inter = _get_inter(rects1[ri, :], rects2) + union = area1[ri] + area2 - inter + iou[ri, :] = np.transpose( inter / union) + return iou + +def find_contours(mask): + mask = np.asarray(mask, dtype = np.uint8) + mask = mask.copy() + contours, _ = cv2.findContours(mask, mode = cv2.RETR_CCOMP, + method = cv2.CHAIN_APPROX_SIMPLE) + return contours + +def find_two_level_contours(mask): + mask = mask.copy() + contours, tree = cv2.findContours(mask, mode = cv2.RETR_CCOMP, + method = cv2.CHAIN_APPROX_SIMPLE) + return contours, tree + + +def is_in_contour(point, cnt): + """tell whether a point is in contour or not. + In-contour here includes both the 'in contour' and 'on contour' cases. + point:(x, y) + cnt: a cv2 contour + """ + # doc of pointPolygonTest: http://docs.opencv.org/2.4/modules/imgproc/doc/structural_analysis_and_shape_descriptors.html?highlight=pointpolygontest#cv.PointPolygonTest + # the last argument means only tell if in or not, without calculating the shortest distance + in_cnt = cv2.pointPolygonTest(cnt, point, False) + return in_cnt >= 0; + +def convex_hull(contour): + hull = cv2.convexHull(contour, returnPoints=1) + return hull + +def random_color_3(): + c = util.rand.randint(low = 0, high = 255, shape = (3, )) +# c = np.uint8(c) + return c + +def get_contour_area(cnt): + return cv2.contourArea(cnt) + +def is_valid_jpg(jpg_file): + with open(jpg_file, 'rb') as f: + f.seek(-2, 2) + return f.read() == '\xff\xd9' + + + +def rotate_point_by_90(x, y, k, w = 1.0, h = 1.0): + """ + Rotate a point xy on an image by k * 90 + degrees. + Params: + x, y: a point, (x, y). 
If not normalized within 0 and 1, the + width and height of the image should be specified clearly. + w, h: the width and height of image + k: k * 90 degrees will be rotated + """ + k = k % 4 + + if k == 0: + return x, y + elif k == 1: + return y, w - x + elif k == 2: + return w - x, h - y + elif k == 3: + return h - y, x + + +def min_area_rect(xs, ys): + """ + Args: + xs: numpy ndarray with shape=(N,4). N is the number of oriented bboxes. 4 contains [x1, x2, x3, x4] + ys: numpy ndarray with shape=(N,4), [y1, y2, y3, y4] + Note that [(x1, y1), (x2, y2), (x3, y3), (x4, y4)] can represent an oriented bbox. + Return: + the oriented rects sorrounding the box, in the format:[cx, cy, w, h, theta]. + """ + xs = np.asarray(xs, dtype = np.float32) + ys = np.asarray(ys, dtype = np.float32) + + num_rects = xs.shape[0] + box = np.empty((num_rects, 5))#cx, cy, w, h, theta + for idx in xrange(num_rects): + points = zip(xs[idx, :], ys[idx, :]) + cnt = points_to_contour(points) + rect = cv2.minAreaRect(cnt) + cx, cy = rect[0] + w, h = rect[1] + theta = rect[2] + box[idx, :] = [cx, cy, w, h, theta] + + box = np.asarray(box, dtype = xs.dtype) + return box diff --git a/maskrcnn_benchmark/data/datasets/evaluation/word/util/io_.py b/maskrcnn_benchmark/data/datasets/evaluation/word/util/io_.py new file mode 100644 index 0000000000000000000000000000000000000000..0976223422731574789f5ed7fc30c167a2db03fc --- /dev/null +++ b/maskrcnn_benchmark/data/datasets/evaluation/word/util/io_.py @@ -0,0 +1,216 @@ +#coding=utf-8 +''' +Created on 2016年9月27日 + +@author: dengdan + +Tool functions for file system operation and I/O. +In the style of linux shell commands +''' +import os +import pickle as pkl +# import commands +import logging + +# import util + +def mkdir(path): + """ + If the target directory does not exists, it and its parent directories will created. + """ + path = get_absolute_path(path) + if not exists(path): + os.makedirs(path) + return path + +def make_parent_dir(path): + """make the parent directories for a file.""" + parent_dir = get_dir(path) + mkdir(parent_dir) + + +def pwd(): + return os.getcwd() + +def dump(path, obj): + path = get_absolute_path(path) + parent_path = get_dir(path) + mkdir(parent_path) + with open(path, 'w') as f: + logging.info('dumping file:' + path); + pkl.dump(obj, f) + +def load(path): + path = get_absolute_path(path) + with open(path, 'r') as f: + data = pkl.load(f) + return data + +def join_path(a, *p): + return os.path.join(a, *p) + +def is_dir(path): + path = get_absolute_path(path) + return os.path.isdir(path) + + +def is_path(path): + path = get_absolute_path(path) + return os.path.ispath(path) + +def get_dir(path): + ''' + return the directory it belongs to. + if path is a directory itself, itself will be return + ''' + path = get_absolute_path(path) + if is_dir(path): + return path; + return os.path.split(path)[0] + +def get_filename(path): + return os.path.split(path)[1] + +def get_absolute_path(p): + if p.startswith('~'): + p = os.path.expanduser(p) + return os.path.abspath(p) + +def cd(p): + p = get_absolute_path(p) + os.chdir(p) + +# def ls(path = '.', suffix = None): +# """ +# list files in a directory. 
+# return file names in a list +# """ +# path = get_absolute_path(path) +# files = os.listdir(path) +# +# if suffix is None: +# return files +# +# filtered = [] +# for f in files: +# if util.str.ends_with(f, suffix, ignore_case = True): +# filtered.append(f) +# +# return filtered + +def find_files(pattern): + import glob + return glob.glob(pattern) + +def read_lines(p): + """return the text in a file in lines as a list """ + p = get_absolute_path(p) + f = open(p,'r') + return f.readlines() + +def write_lines(p, lines): + p = get_absolute_path(p) + make_parent_dir(p) + with open(p, 'w') as f: + for line in lines: + f.write(line) + + +# def cat(p): +# """return the text in a file as a whole""" +# cmd = 'cat ' + p +# return commands.getoutput(cmd) + +def exists(path): + path = get_absolute_path(path) + return os.path.exists(path) + +def load_mat(path): + import scipy.io as sio + path = get_absolute_path(path) + return sio.loadmat(path) + +def dump_mat(path, dict_obj, append = True): + import scipy.io as sio + path = get_absolute_path(path) + make_parent_dir(path) + sio.savemat(file_name = path, mdict = dict_obj, appendmat = append) + +def dir_mat(path): + ''' + list the variables in mat file. + return a list: [(name, shape, dtype), ...] + ''' + import scipy.io as sio + path = get_absolute_path(path) + return sio.whosmat(path) + +SIZE_UNIT_K = 1024 +SIZE_UNIT_M = SIZE_UNIT_K ** 2 +SIZE_UNIT_G = SIZE_UNIT_K ** 3 +def get_file_size(path, unit = SIZE_UNIT_K): + size = os.path.getsize(get_absolute_path(path)) + return size * 1.0 / unit + + +def create_h5(path): + import h5py + path = get_absolute_path(path) + make_parent_dir(path) + return h5py.File(path, 'w'); + +def open_h5(path, mode = 'r'): + import h5py + path = get_absolute_path(path) + return h5py.File(path, mode); + +def read_h5(h5, key): + return h5[key][:] +def read_h5_attrs(h5, key, attrs): + return h5[key].attrs[attrs] + +def copy(src, dest): + import shutil + shutil.copy(get_absolute_path(src), get_absolute_path(dest)) + +cp = copy + +def remove(p): + import os + os.remove(get_absolute_path(p)) +rm = remove + +# def search(pattern, path, file_only = True): +# """ +# Search files whose name matches the give pattern. The search scope +# is the directory and sub-directories of 'path'. 
+# """ +# path = get_absolute_path(path) +# pattern_here = util.io.join_path(path, pattern) +# targets = [] +# +# # find matchings in current directory +# candidates = find_files(pattern_here) +# for can in candidates: +# if util.io.is_dir(can) and file_only: +# continue +# else: +# targets.append(can) +# +# # find matching in sub-dirs +# files = ls(path) +# for f in files: +# fpath = util.io.join_path(path, f) +# if is_dir(fpath): +# targets_in_sub_dir = search(pattern, fpath, file_only) +# targets.extend(targets_in_sub_dir) +# return targets + +def dump_json(path, data): + import json + path = get_absolute_path(path) + make_parent_dir(path) + + with open(path, 'w') as f: + json.dump(data, f) + return path \ No newline at end of file diff --git a/maskrcnn_benchmark/data/datasets/evaluation/word/util/log.py b/maskrcnn_benchmark/data/datasets/evaluation/word/util/log.py new file mode 100644 index 0000000000000000000000000000000000000000..c1fdfaac6d3564c5b59ad7ca51f02da00f355438 --- /dev/null +++ b/maskrcnn_benchmark/data/datasets/evaluation/word/util/log.py @@ -0,0 +1,47 @@ +#coding=utf-8 +''' +Created on 2016年10月12日 + +@author: dengdan +''' +import datetime +import logging +import util +import sys + +def get_date_str(): + now = datetime.datetime.now() + return now.strftime('%Y-%m-%d %H:%M:%S') + +def init_logger(log_file = None, log_path = None, log_level = logging.DEBUG, mode = 'w', stdout = True): + """ + log_path: 日志文件的文件夹路径 + mode: 'a', append; 'w', 覆盖原文件写入. + """ + fmt = '%(asctime)s %(filename)s[line:%(lineno)d] %(levelname)s: %(message)s' + if log_path is None: + log_path = '~/temp/log/' + if log_file is None: + log_file = 'log_' + get_date_str() + '.log' + log_file = util.io.join_path(log_path, log_file) + # 此处不能使用logging输出 + print('log file path:' + log_file); + util.io.make_parent_dir(log_file) + logging.basicConfig(level = log_level, + format= fmt, + filename= util.io.get_absolute_path(log_file), + filemode=mode) + + if stdout: + console = logging.StreamHandler(stream = sys.stdout) + console.setLevel(log_level) + formatter = logging.Formatter(fmt) + console.setFormatter(formatter) + logging.getLogger('').addHandler(console) + +# console = logging.StreamHandler(stream = sys.stderr) +# console.setLevel(log_level) +# formatter = logging.Formatter(fmt) +# console.setFormatter(formatter) +# logging.getLogger('').addHandler(console) + diff --git a/maskrcnn_benchmark/data/datasets/evaluation/word/util/logger.py b/maskrcnn_benchmark/data/datasets/evaluation/word/util/logger.py new file mode 100644 index 0000000000000000000000000000000000000000..0dab12dc305b88e880d1babde3ba3c7825132802 --- /dev/null +++ b/maskrcnn_benchmark/data/datasets/evaluation/word/util/logger.py @@ -0,0 +1,133 @@ +# A simple torch style logger +# (C) Wei YANG 2017 +from __future__ import absolute_import +# import matplotlib.pyplot as plt +import matplotlib +matplotlib.use('pdf') +import matplotlib.pyplot as plt +import os +import sys +import numpy as np + +__all__ = ['Logger', 'LoggerMonitor', 'savefig'] + +def savefig(fname, dpi=None): + dpi = 150 if dpi == None else dpi + plt.savefig(fname, dpi=dpi) + +def plot_overlap(logger, names=None): + names = logger.names if names == None else names + numbers = logger.numbers + for _, name in enumerate(names): + x = np.arange(len(numbers[name])) + plt.plot(x, np.asarray(numbers[name])) + return [logger.title + '(' + name + ')' for name in names] + +class Logger(object): + '''Save training process to log file with simple plot function.''' + def __init__(self, fpath, 
title=None, resume=False): + self.file = None + self.resume = resume + self.title = '' if title == None else title + if fpath is not None: + if resume: + self.file = open(fpath, 'r') + name = self.file.readline() + self.names = name.rstrip().split('\t') + self.numbers = {} + for _, name in enumerate(self.names): + self.numbers[name] = [] + + for numbers in self.file: + numbers = numbers.rstrip().split('\t') + for i in range(0, len(numbers)): + self.numbers[self.names[i]].append(numbers[i]) + self.file.close() + self.file = open(fpath, 'a') + else: + self.file = open(fpath, 'w') + + def set_names(self, names): + if self.resume: + pass + # initialize numbers as empty list + self.numbers = {} + self.names = names + for _, name in enumerate(self.names): + self.file.write(name) + self.file.write('\t') + self.numbers[name] = [] + self.file.write('\n') + self.file.flush() + + + def append(self, numbers): + assert len(self.names) == len(numbers), 'Numbers do not match names' + for index, num in enumerate(numbers): + self.file.write("{0:.6f}".format(num)) + self.file.write('\t') + self.numbers[self.names[index]].append(num) + self.file.write('\n') + self.file.flush() + + def plot(self, names=None): + print 'plot' + ''' + names = self.names if names == None else names + numbers = self.numbers + for _, name in enumerate(names): + x = np.arange(len(numbers[name])) + plt.plot(x, np.asarray(numbers[name])) + plt.legend([self.title + '(' + name + ')' for name in names]) + plt.grid(True) + ''' + + def close(self): + if self.file is not None: + self.file.close() + +class LoggerMonitor(object): + '''Load and visualize multiple logs.''' + def __init__ (self, paths): + '''paths is a distionary with {name:filepath} pair''' + self.loggers = [] + for title, path in paths.items(): + logger = Logger(path, title=title, resume=True) + self.loggers.append(logger) + + def plot(self, names=None): + plt.figure() + plt.subplot(121) + legend_text = [] + for logger in self.loggers: + legend_text += plot_overlap(logger, names) + plt.legend(legend_text, bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.) 
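# The legend collects one entry per logger/metric pair built by plot_overlap above and is
# anchored just outside the top-right corner of the axes (bbox_to_anchor=(1.05, 1), loc=2).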
+ plt.grid(True) + +if __name__ == '__main__': + # # Example + # logger = Logger('test.txt') + # logger.set_names(['Train loss', 'Valid loss','Test loss']) + + # length = 100 + # t = np.arange(length) + # train_loss = np.exp(-t / 10.0) + np.random.rand(length) * 0.1 + # valid_loss = np.exp(-t / 10.0) + np.random.rand(length) * 0.1 + # test_loss = np.exp(-t / 10.0) + np.random.rand(length) * 0.1 + + # for i in range(0, length): + # logger.append([train_loss[i], valid_loss[i], test_loss[i]]) + # logger.plot() + + # Example: logger monitor + paths = { + 'resadvnet20':'/home/wyang/code/pytorch-classification/checkpoint/cifar10/resadvnet20/log.txt', + 'resadvnet32':'/home/wyang/code/pytorch-classification/checkpoint/cifar10/resadvnet32/log.txt', + 'resadvnet44':'/home/wyang/code/pytorch-classification/checkpoint/cifar10/resadvnet44/log.txt', + } + + field = ['Valid Acc.'] + + monitor = LoggerMonitor(paths) + monitor.plot(names=field) + savefig('test.eps') \ No newline at end of file diff --git a/maskrcnn_benchmark/data/datasets/evaluation/word/util/mask.py b/maskrcnn_benchmark/data/datasets/evaluation/word/util/mask.py new file mode 100644 index 0000000000000000000000000000000000000000..d660607b1a798c38ed0495ec4acb3b14de735d35 --- /dev/null +++ b/maskrcnn_benchmark/data/datasets/evaluation/word/util/mask.py @@ -0,0 +1,82 @@ +import cv2 +import numpy as np + +import util +from util import nb as neighbour + + +def find_white_components(mask, min_area = 0): + mask = (mask == 0) * 1 + return find_black_components(mask, min_area); + +def find_black_components(mask, min_area = 0): + """ + find components of zeros. + mask is a 0-1 matrix, ndarray. + """ + neighbour_type = neighbour.N4 + visited = mask.copy() + c_mask = util.img.black(mask) + + root_idx = [1] + def get_new_root(): + root_idx[0] += 1 + return root_idx[0] + + def is_visited(xy): + x, y = xy + return visited[y][x] + + def set_visited(xy): + x, y = xy + visited[y][x] = 255 + + def set_root(xy, root): + x, y = xy + c_mask[y][x] = root + + def get_root(xy): + x, y = xy + return c_mask[y][x] + + rows, cols = np.shape(mask) + q = [] + for y in xrange(rows): + for x in xrange(cols): + xy = (x, y) + if is_visited(xy): + continue + + q.append(xy) + new_root = get_new_root() + while len(q) > 0: + cp = q.pop() + set_root(cp, new_root) + set_visited(cp) + nbs = neighbour.get_neighbours(cp[0], cp[1], cols, rows, neighbour_type) + for nb in nbs: + if not is_visited(nb) and nb not in q: +# q.append(nb) + q.insert(0, nb) + + components = {} + for y in xrange(rows): + for x in xrange(cols): + root = get_root((x, y)) + if root == 0: + continue + + if root not in components: + components[root] = [] + + components[root].append((x,y)) + + ret = [] + + for root in components: + if len(components[root]) >= min_area: + ret.append(components[root]) + + return ret + + diff --git a/maskrcnn_benchmark/data/datasets/evaluation/word/util/misc.py b/maskrcnn_benchmark/data/datasets/evaluation/word/util/misc.py new file mode 100644 index 0000000000000000000000000000000000000000..324309c3f9b7a3f5e3430fd53575779c394f283f --- /dev/null +++ b/maskrcnn_benchmark/data/datasets/evaluation/word/util/misc.py @@ -0,0 +1,74 @@ +'''Some helper functions for PyTorch, including: + - get_mean_and_std: calculate the mean and std value of dataset. + - msr_init: net parameter initialization. + - progress_bar: progress bar mimic xlua.progress. 
+''' +import errno +import os +import sys +import time +import math + +import torch.nn as nn +import torch.nn.init as init +from torch.autograd import Variable + +__all__ = ['get_mean_and_std', 'init_params', 'mkdir_p', 'AverageMeter'] + + +def get_mean_and_std(dataset): + '''Compute the mean and std value of dataset.''' + dataloader = trainloader = torch.utils.data.DataLoader(dataset, batch_size=1, shuffle=True, num_workers=2) + + mean = torch.zeros(3) + std = torch.zeros(3) + print('==> Computing mean and std..') + for inputs, targets in dataloader: + for i in range(3): + mean[i] += inputs[:,i,:,:].mean() + std[i] += inputs[:,i,:,:].std() + mean.div_(len(dataset)) + std.div_(len(dataset)) + return mean, std + +def init_params(net): + '''Init layer parameters.''' + for m in net.modules(): + if isinstance(m, nn.Conv2d): + init.kaiming_normal(m.weight, mode='fan_out') + if m.bias: + init.constant(m.bias, 0) + elif isinstance(m, nn.BatchNorm2d): + init.constant(m.weight, 1) + init.constant(m.bias, 0) + elif isinstance(m, nn.Linear): + init.normal(m.weight, std=1e-3) + if m.bias: + init.constant(m.bias, 0) + +def mkdir_p(path): + '''make dir if not exist''' + try: + os.makedirs(path) + except OSError as exc: # Python >2.5 + if exc.errno == errno.EEXIST and os.path.isdir(path): + pass + else: + raise + +class AverageMeter(object): + """Computes and stores the average and current value""" + def __init__(self): + self.reset() + + def reset(self): + self.val = 0 + self.avg = 0 + self.sum = 0 + self.count = 0 + + def update(self, val, n=1): + self.val = val + self.sum += val * n + self.count += n + self.avg = self.sum / self.count \ No newline at end of file diff --git a/maskrcnn_benchmark/data/datasets/evaluation/word/util/ml.py b/maskrcnn_benchmark/data/datasets/evaluation/word/util/ml.py new file mode 100644 index 0000000000000000000000000000000000000000..12bee2ba8caefb32d4337071c6b45889693b8f62 --- /dev/null +++ b/maskrcnn_benchmark/data/datasets/evaluation/word/util/ml.py @@ -0,0 +1,25 @@ +import logging +import cv2 +import numpy as np +import util.dec +import util.np + +@util.dec.print_calling +def kmeans(samples, k, criteria = None, attempts = 3, flags = cv2.KMEANS_RANDOM_CENTERS): + if criteria == None: + criteria = (cv2.TERM_CRITERIA_EPS + cv2.TERM_CRITERIA_MAX_ITER, 10, 1.0) + samples = np.asarray(samples, dtype = np.float32) + _,labels,centers = cv2.kmeans(samples, k, criteria, attempts, flags) + labels = util.np.flatten(labels) + clusters = [None]*k + for idx, label in enumerate(labels): + if clusters[label] is None: + clusters[label] = [] + clusters[label].append(idx) + + for idx, cluster in enumerate(clusters): + if cluster == None: + logging.warn('Empty cluster appeared.') + clusters[idx] = [] + + return labels, clusters, centers diff --git a/maskrcnn_benchmark/data/datasets/evaluation/word/util/mod.py b/maskrcnn_benchmark/data/datasets/evaluation/word/util/mod.py new file mode 100644 index 0000000000000000000000000000000000000000..033d664457cf14d8e116f28e01192f8de23c1a82 --- /dev/null +++ b/maskrcnn_benchmark/data/datasets/evaluation/word/util/mod.py @@ -0,0 +1,57 @@ +#coding=utf-8 + +import logging + +def add_to_path(path): + ''' + add path to sys.path. + ''' + import sys; + sys.path.insert(0, path); + +def add_ancester_dir_to_path(fp, p): + ''' + add ancester directory to sys.path. + fp: usually __file__ + p : the relative path to be added. 
+ ''' + import util + parent_path = util.io.get_dir(fp) + path = util.io.join_path(parent_path, p) + add_to_path(path) + +def is_main(mod_name): + return mod_name == '__main__' + +def import_by_name(mod_name): + __import__(mod_name) + return get_mod_by_name(mod_name) + +def try_import_by_name(mod_name, error_path): + try: + import_by_name(mod_name) + except ImportError: + logging.info('adding %s to sys.path'%(error_path)) + add_to_path(error_path) + import_by_name(mod_name) + + return get_mod_by_name(mod_name) + +def get_mod_by_name(mod_name): + import sys + return sys.modules[mod_name] + +def load_mod_from_path(path, keep_name = True): + """" + Params: + path + keep_name: if True, the filename will be used as module name. + """ + import util + import imp + path = util.io.get_absolute_path(path) + file_name = util.io.get_filename(path) + module_name = file_name.split('.')[0] + if not keep_name: + module_name = '%s_%d'%(module_name, util.get_count()) + return imp.load_source(module_name, path) diff --git a/maskrcnn_benchmark/data/datasets/evaluation/word/util/neighbour.py b/maskrcnn_benchmark/data/datasets/evaluation/word/util/neighbour.py new file mode 100644 index 0000000000000000000000000000000000000000..1f1826b88d55ccee198e77ad6874ff7976f1d0d5 --- /dev/null +++ b/maskrcnn_benchmark/data/datasets/evaluation/word/util/neighbour.py @@ -0,0 +1,86 @@ +#encoding=utf-8 + +import numpy as np + +N1 = 'n1' +N2 = 'n2' +N4 = 'n4' +N8 = 'n8' + +def _in_image(c, w, h): + cx, cy = c + return cx >=0 and cx < w and cy >= 0 and cy < h + +def n1(x, y, w, h): + """down and right""" + neighbours = [] + candidates = [(x, y + 1), (x + 1, y)]; + + for c in candidates: + if _in_image(c, w, h): + neighbours.append(c) + + return neighbours + + +def n2(x, y, w, h): + neighbours = [] + candidates = [(x, y + 1), (x + 1, y), (x + 1, y + 1), (x - 1, y + 1)]; + for c in candidates: + if _in_image(c, w, h): + neighbours.append(c) + + return neighbours; + +def n4(x, y, w, h): + neighbours = [] + candidates = [(x, y - 1),(x, y + 1), (x + 1, y), (x - 1, y)]; + for c in candidates: + if _in_image(c, w, h): + neighbours.append(c) + return neighbours + + +def n8(x, y, w, h): + neighbours = [] + candidates = [(x + 1, y - 1),(x, y - 1),(x - 1, y - 1), (x - 1, y),(x, y + 1), (x + 1, y), (x + 1, y + 1), (x - 1, y + 1)]; + for c in candidates: + if _in_image(c, w, h): + neighbours.append(c) + + return neighbours; + + +def n1_count(w, h): + return 2 * w * h - w - h + +def n2_count(w, h): + return 4 * w * h - 3 * w - 3 * h + 2 + + +_dict1 = {N1:n1, N2:n2, N4:n4, N8:n8}; +_dict2 = {N1:n1_count, N2:n2_count}; + +def get_neighbours(x, y, w, h, neighbour_type): + if neighbour_type in _dict1: + fn = _dict1[neighbour_type] + return fn(x, y, w, h) + raise NotImplementedError("unknown neighbour type '%s'" % (neighbour_type)) + +def count_neighbours(w, h, neighbour_type): + if neighbour_type in _dict2: + fn = _dict2[neighbour_type] + return fn(w, h) + raise NotImplementedError("unknown neighbour type '%s'" % (neighbour_type)) + + +if __name__ == "__main__": + w, h = 10, 10 + np.testing.assert_equal(len(n4(0, 0, w, h)), 2) + np.testing.assert_equal(len(n8(0, 0, w, h)), 3) + + np.testing.assert_equal(len(n4(0, 2, w, h)), 3) + np.testing.assert_equal(len(n8(0, 2, w, h)), 5) + + np.testing.assert_equal(len(n4(3, 3, w, h)), 4) + np.testing.assert_equal(len(n8(3, 3, w, h)), 8) diff --git a/maskrcnn_benchmark/data/datasets/evaluation/word/util/np.py b/maskrcnn_benchmark/data/datasets/evaluation/word/util/np.py new file mode 100644 index 
0000000000000000000000000000000000000000..0faf6d0107e9aab0981f0eaf8d218eb706cb81f9 --- /dev/null +++ b/maskrcnn_benchmark/data/datasets/evaluation/word/util/np.py @@ -0,0 +1,171 @@ +import numpy as np +import copy + +TINY = np.exp(-100) +concat = np.concatenate +def is_2D(m): + ''' + judge if a matrix is 2-D or not + ''' + return len(np.shape(m)) == 2 + +def norm1(v): + return np.sum(np.abs(v)) + +def norm2(v): + return np.sqrt(np.sum(v ** 2)) + +def norm2_squared(v): + return np.sum(v ** 2) + + +def cos_dist(v1, v2): + length1 = norm2(v1) + length2 = norm2(v2) + return np.dot(v1, v2) / (length1 * length2) + +def eu_dist(v1, v2): + v = v1 - v2 + return norm2(v) + +def chi_squared_dist(f1, f2): + dist = 0 + for ff1, ff2 in zip(f1, f2): + if ff1 + ff2 == 0:# color feature values are supposed to be non-negative. If this case happened, it means both ne and de are 0s + continue; + dist += (ff1 - ff2) ** 2 * 1.0/ (ff1 + ff2) + return np.sqrt(dist) + +def flatten(arr, ndim = 1): + """ + flatten an multi-dimensional array to a certain degree. + ndim: the number of dimensions after flatten + """ + arr = np.asarray(arr) + dims = len(arr.shape) + shape = [np.prod(arr.shape[0: dims + 1 - ndim])] + shape.extend(arr.shape[dims + 1 - ndim: dims]) + return np.reshape(arr, shape) + +def arcsin(sins, xs = None): + """ + cal arcsin. + xs: if this parameter is provided, the returned arcsins will be within [0, 2*pi) + otherwise the default [-pi/2, pi/2] + """ + arcs = np.arcsin(sins); + if xs != None: + xs = np.asarray(xs) + sins = np.asarray(sins) + # if x > 0, then the corresponding mask value is -1. The resulting angle unchanged: v = 0 - (-v) = v. else, v = pi - v + add_pi = xs < 0 + pi_mask = add_pi * np.pi + # 0 --> 1, 1 --> -1 + arc_mask = 2 * add_pi - 1 + arcs = pi_mask - arcs * arc_mask + + # if x >= 0 and sin < 0, v = 2*pi + v + add_2_pi = (xs >= 0) * (sins < 0) + pi_mask = add_2_pi * 2 * np.pi + arcs = pi_mask + arcs + return arcs + +def sin(ys = None, lengths = None, xs = None, angles = None): + """ + calculate sin with multiple kinds of parameters + """ + if not angles is None: + return np.sin(angles) + + if ys is None: + raise ValueError('ys must be provided when "angles" is None ') + + if lengths is None: + if xs is None: + raise ValueError('xs must be provided when "lengths" is None ') + lengths = np.sqrt(xs ** 2 + ys ** 2) + + if not np.iterable(lengths): + sins = ys / lengths if lengths > 0 else 0 + else: + lengths = np.asarray(lengths) + shape = lengths.shape + ys = flatten(ys) + lengths = flatten(lengths) + sins = [y / length if length > 0 else 0 for (y, length) in zip(ys, lengths)] + sins = np.reshape(sins, shape) + return sins + +def sum_all(m): + """ + sum up all the elements in a multi-dimension array + """ + return np.sum(m) + + +def clone(obj, deep = False): + if not deep: + return copy.copy(obj) + return copy.deepcopy(obj) + +def empty_list(length, etype): + empty_list = [None] * length + for i in xrange(length): + if etype == list: + empty_list[i] = [] + else: + raise NotImplementedError + + return empty_list + +def shuffle(arr): + import random + random.shuffle(arr) + +def is_empty(a): + ''' + tell whether an array is empty. + If a is multidimensional, it is empty when it contains no entry in the last dimension. 
+ ''' + if a is None: + return True + + shape = np.shape(a) + if np.prod(shape) == 0: + return True + + return False + +def angle_with_x(x, y): + """ + return the arctan x/y, in range [-pi, pi] + """ + return np.arctan2(y, x) + +def has_infty(x): + test = x == np.infty + return np.sum(test) > 0 + +def has_nan(x): + x = np.asarray(x) + test = x != x + return np.sum(test) > 0 + +def has_nan_or_infty(x): + if has_nan(x): + return True + + if has_infty(x): + return True + + +def iterable(x): + return np.iterable(x) + +def smooth(arr): + result = [0] * len(arr) + s = 0 + for idx, n in enumerate(arr): + s += n + result[idx] = s * 1.0 / (idx + 1) + return result diff --git a/maskrcnn_benchmark/data/datasets/evaluation/word/util/np.py~ b/maskrcnn_benchmark/data/datasets/evaluation/word/util/np.py~ new file mode 100644 index 0000000000000000000000000000000000000000..614d195ea3e6b001e6df3b9ffc4075efad8a4a63 --- /dev/null +++ b/maskrcnn_benchmark/data/datasets/evaluation/word/util/np.py~ @@ -0,0 +1,171 @@ +import numpy as np +import copy + +TINY = np.exp(-100) +concat = np.concatenate +def is_2D(m): + ''' + judge if a matrix is 2-D or not + ''' + return len(np.shape(m)) == 2 + +def norm1(v): + return np.sum(np.abs(v)) + +def norm2(v): + return np.sqrt(np.sum(v ** 2)) + +def norm2_squared(v): + return np.sum(v ** 2) + + +def cos_dist(v1, v2): + length1 = norm2(v1) + length2 = norm2(v2) + return np.dot(v1, v2) / (length1 * length2) + +def eu_dist(v1, v2): + v = v1 - v2 + return norm2(v) + +def chi_squared_dist(f1, f2): + dist = 0 + for ff1, ff2 in zip(f1, f2): + if ff1 + ff2 == 0:# color feature values are supposed to be non-negative. If this case happened, it means both ne and de are 0s + continue; + dist += (ff1 - ff2) ** 2 * 1.0/ (ff1 + ff2) + return np.sqrt(dist) + +def flatten(arr, ndim = 1): + """ + flatten an multi-dimensional array to a certain degree. + ndim: the number of dimensions after flatten + """ + arr = np.asarray(arr) + dims = len(arr.shape) + shape = [np.prod(arr.shape[0: dims + 1 - ndim])] + shape.extend(arr.shape[dims + 1 - ndim: dims]) + return np.reshape(arr, shape) + +def arcsin(sins, xs = None): + """ + cal arcsin. + xs: if this parameter is provided, the returned arcsins will be within [0, 2*pi) + otherwise the default [-pi/2, pi/2] + """ + arcs = np.arcsin(sins); + if xs != None: + xs = np.asarray(xs) + sins = np.asarray(sins) + # if x > 0, then the corresponding mask value is -1. The resulting angle unchanged: v = 0 - (-v) = v. 
else, v = pi - v + add_pi = xs < 0 + pi_mask = add_pi * np.pi + # 0 --> 1, 1 --> -1 + arc_mask = 2 * add_pi - 1 + arcs = pi_mask - arcs * arc_mask + + # if x >= 0 and sin < 0, v = 2*pi + v + add_2_pi = (xs >= 0) * (sins < 0) + pi_mask = add_2_pi * 2 * np.pi + arcs = pi_mask + arcs + return arcs + +def sin(ys = None, lengths = None, xs = None, angles = None): + """ + calculate sin with multiple kinds of parameters + """ + if not angles is None: + return np.sin(angles) + + if ys is None: + raise ValueError('ys must be provided when "angles" is None ') + + if lengths is None: + if xs is None: + raise ValueError('xs must be provided when "lengths" is None ') + lengths = np.sqrt(xs ** 2 + ys ** 2) + + if not np.iterable(lengths): + sins = ys / lengths if lengths > 0 else 0 + else: + lengths = np.asarray(lengths) + shape = lengths.shape + ys = flatten(ys) + lengths = flatten(lengths) + sins = [y / length if length > 0 else 0 for (y, length) in zip(ys, lengths)] + sins = np.reshape(sins, shape) + return sins + +def sum_all(m): + """ + sum up all the elements in a multi-dimension array + """ + return np.sum(m) + + +def clone(obj, deep = False): + if not deep: + return copy.copy(obj) + return copy.deepcopy(obj) + +def empty_list(length, etype): + empty_list = [None] * length + for i in xrange(length): + if etype == list: + empty_list[i] = [] + else: + raise NotImplementedError + + return empty_list + +def shuffle(arr): + import random + random.shuffle(arr) + +def is_empty(a): + ''' + tell whether an array is empty. + If a is multidimensional, it is empty when it contains no entry in the last dimension. + ''' + if a is None: + return True + + shape = np.shape(a) + if np.prod(shape) == 0: + return True + + return False + +def angle_with_x(x, y): + """ + return the arctan x/y, in range [-pi, pi] + """ + return np.arctan2(y, x) + +def has_infty(x): + test = x == np.infty + return np.sum(test) > 0 + +def has_nan(x): + x = np.asarray(x) + test = x != x + return np.sum(test) > 0 + +def has_nan_or_infty(x): + if has_nan(x): + return True + + if has_infty(x): + return True + + +def iterable(x): + return np.iterable(x) + +def smooth(arr): + result = [0] * len(arr) + s = 0 + for idx, n in arr: + s += n + result[idx] = s * 1.0 / (idx + 1) + return result diff --git a/maskrcnn_benchmark/data/datasets/evaluation/word/util/plt.py b/maskrcnn_benchmark/data/datasets/evaluation/word/util/plt.py new file mode 100644 index 0000000000000000000000000000000000000000..6eb61833ff09c6db1308fedb1082eeaea194334b --- /dev/null +++ b/maskrcnn_benchmark/data/datasets/evaluation/word/util/plt.py @@ -0,0 +1,191 @@ +#coding=utf-8 +''' +Created on 2016-9-27 + +@author: dengdan +''' +import matplotlib.pyplot as plt +import numpy as np +import util + +def hist(x, title = None, normed = False, show = True, save = False, save_path = None, bin_count = 100, bins = None): + x = np.asarray(x) + if len(np.shape(x)) > 1: +# x = np.reshape(x, np.prod(x.shape)) + x = util.np.flatten(x) + if bins == None: + bins = np.linspace(start = min(x), stop = max(x), num = bin_count, endpoint = True, retstep = False) + plt.figure(num = title) + plt.hist(x, bins, normed = normed) + if save: + if save_path is None: + raise ValueError + path = util.io.join_path(save_path, title + '.png') + save_image(path) + if show: + plt.show() + #util.img.imshow(title, path, block = block) + +def plot_solver_data(solver_path): + data = util.io.load(solver_path) + training_losses = data.training_losses + training_accuracies = data.training_accuracies + val_losses = 
data.val_losses + val_accuracies = data.val_accuracies + plt.figure(solver_path) + + n = len(training_losses) + x = range(n) + + plt.plot(x, training_losses, 'r-', label = 'Training Loss') + + if len(training_accuracies) > 0: + plt.plot(x, training_accuracies, 'r--', label = 'Training Accuracy') + + if len(val_losses) > 0: + n = len(val_losses) + x = range(n) + plt.plot(x, val_losses, 'g-', label = 'Validation Loss') + + if len(val_accuracies) > 0: + plt.plot(x, val_accuracies, 'g--', label = 'Validation Accuracy') + plt.legend() + plt.show() + + +def rectangle(xy, width, height, color = 'red', linewidth = 1, fill = False, alpha = None, axis = None): + """ + draw a rectangle on plt axis + """ + import matplotlib.patches as patches + rect = patches.Rectangle( + xy = xy, + width = width, + height = height, + alpha = alpha, + color = color, + fill = fill, + linewidth = linewidth + ) + if axis is not None: + axis.add_patch(rect) + return rect + +rect = rectangle + +def maximize_figure(): + mng = plt.get_current_fig_manager() + mng.full_screen_toggle() + +def line(xy_start, xy_end, color = 'red', linewidth = 1, alpha = None, axis = None): + """ + draw a line on plt axis + """ + from matplotlib.lines import Line2D + num = 100 + xdata = np.linspace(xy_start[0], xy_end[0], num = num) + ydata = np.linspace(xy_start[1], xy_end[1], num = num) + line = Line2D( + alpha = alpha, + color = color, + linewidth = linewidth, + xdata = xdata, + ydata = ydata + ) + if axis is not None: + axis.add_line(line) + return line + +def imshow(title = None, img = None, gray = False): + show_images([img], [title], gray = gray) + +def show_images(images, titles = None, shape = None, share_axis = False, + bgr2rgb = False, maximized = False, + show = True, gray = False, save = False, colorbar = False, + path = None, axis_off = False, vertical = False, subtitle = None): + + if shape == None: + if vertical: + shape = (len(images), 1) + else: + shape = (1, len(images)) + + ret_axes = [] + ax0 = None + for idx, img in enumerate(images): + if bgr2rgb: + img = util.img.bgr2rgb(img) + loc = (idx / shape[1], idx % shape[1]) + if idx == 0: + ax = plt.subplot2grid(shape, loc) + ax0 = ax + else: + if share_axis: + ax = plt.subplot2grid(shape, loc, sharex = ax0, sharey = ax0) + else: + ax = plt.subplot2grid(shape, loc) + if len(np.shape(img)) == 2 and gray: + img_ax = ax.imshow(img, cmap = 'gray') + else: + img_ax = ax.imshow(img) + + if len(np.shape(img)) == 2 and colorbar: + plt.colorbar(img_ax, ax = ax) + if titles != None: + ax.set_title(titles[idx]) + + if axis_off: + plt.axis('off') +# plt.xticks([]), plt.yticks([]) + ret_axes.append(ax) + + if subtitle is not None: + set_subtitle(subtitle) + if maximized: + maximize_figure() + + if save: + if path is None: + raise ValueError('path can not be None when save is True') + save_image(path) + if show: + plt.show() + return ret_axes + +def save_image(path, img = None, dpi = 150): + path = util.io.get_absolute_path(path) + util.io.make_parent_dir(path) + if img is None: + plt.gcf().savefig(path, dpi = dpi) + else: + plt.imsave(path, img) + +imwrite = save_image + +def to_ROI(ax, ROI): + xy1, xy2 = ROI + xmin, ymin = xy1 + xmax, ymax = xy2 + ax.set_xlim(xmin, xmax) + #ax.extent + ax.set_ylim(ymax, ymin) + +def set_subtitle(title, fontsize = 12): + plt.gcf().suptitle(title, fontsize=fontsize) + +def show(maximized = False): + if maximized: + maximize_figure() + plt.show() + +def draw(): + plt.gcf().canvas.draw() + +def get_random_line_style(): + colors = ['r', 'g', 'b'] + line_types = 
['-']#, '--', '-.', ':'] + idx = util.rand.randint(len(colors)) + color = colors[idx] + idx = util.rand.randint(len(line_types)) + line_type = line_types[idx] + return color + line_type diff --git a/maskrcnn_benchmark/data/datasets/evaluation/word/util/plt.py~ b/maskrcnn_benchmark/data/datasets/evaluation/word/util/plt.py~ new file mode 100644 index 0000000000000000000000000000000000000000..36129f23c783dc9324cadef5e94f1d4ef4bbfcca --- /dev/null +++ b/maskrcnn_benchmark/data/datasets/evaluation/word/util/plt.py~ @@ -0,0 +1,188 @@ +#coding=utf-8 +''' +Created on 2016-9-27 + +@author: dengdan +''' +import matplotlib.pyplot as plt +import numpy as np +import util + +def hist(x, title, normed = False, show = True, save = False, save_path = None, bin_count = 100, bins = None): + x = np.asarray(x) + if len(np.shape(x)) > 1: +# x = np.reshape(x, np.prod(x.shape)) + x = util.np.flatten(x) + if bins == None: + bins = np.linspace(start = min(x), stop = max(x), num = bin_count, endpoint = True, retstep = False) + plt.figure(num = title) + plt.hist(x, bins, normed = normed) + if save: + if save_path is None: + raise ValueError + path = util.io.join_path(save_path, title + '.png') + save_image(path) + if show: + plt.show() + #util.img.imshow(title, path, block = block) + +def plot_solver_data(solver_path): + data = util.io.load(solver_path) + training_losses = data.training_losses + training_accuracies = data.training_accuracies + val_losses = data.val_losses + val_accuracies = data.val_accuracies + plt.figure(solver_path) + + n = len(training_losses) + x = range(n) + + plt.plot(x, training_losses, 'r-', label = 'Training Loss') + + if len(training_accuracies) > 0: + plt.plot(x, training_accuracies, 'r--', label = 'Training Accuracy') + + if len(val_losses) > 0: + n = len(val_losses) + x = range(n) + plt.plot(x, val_losses, 'g-', label = 'Validation Loss') + + if len(val_accuracies) > 0: + plt.plot(x, val_accuracies, 'g--', label = 'Validation Accuracy') + plt.legend() + plt.show() + + +def rectangle(xy, width, height, color = 'red', linewidth = 1, fill = False, alpha = None, axis = None): + """ + draw a rectangle on plt axis + """ + import matplotlib.patches as patches + rect = patches.Rectangle( + xy = xy, + width = width, + height = height, + alpha = alpha, + color = color, + fill = fill, + linewidth = linewidth + ) + if axis is not None: + axis.add_patch(rect) + return rect + +rect = rectangle + +def maximize_figure(): + mng = plt.get_current_fig_manager() + mng.full_screen_toggle() + +def line(xy_start, xy_end, color = 'red', linewidth = 1, alpha = None, axis = None): + """ + draw a line on plt axis + """ + from matplotlib.lines import Line2D + num = 100 + xdata = np.linspace(xy_start[0], xy_end[0], num = num) + ydata = np.linspace(xy_start[1], xy_end[1], num = num) + line = Line2D( + alpha = alpha, + color = color, + linewidth = linewidth, + xdata = xdata, + ydata = ydata + ) + if axis is not None: + axis.add_line(line) + return line + +def imshow(title = None, img = None, gray = False): + show_images([img], [title], gray = gray) + +def show_images(images, titles = None, shape = None, share_axis = False, + bgr2rgb = False, maximized = False, + show = True, gray = False, save = False, + path = None, axis_off = False, vertical = False, subtitle = None): + + if shape == None: + if vertical: + shape = (len(images), 1) + else: + shape = (1, len(images)) + + ret_axes = [] + ax0 = None + for idx, img in enumerate(images): + if bgr2rgb: + img = util.img.bgr2rgb(img) + loc = (idx / shape[1], idx % 
shape[1]) + if idx == 0: + ax = plt.subplot2grid(shape, loc) + ax0 = ax + else: + if share_axis: + ax = plt.subplot2grid(shape, loc, sharex = ax0, sharey = ax0) + else: + ax = plt.subplot2grid(shape, loc) + if len(np.shape(img)) == 2 and gray: + ax.imshow(img, cmap = 'gray') + else: + ax.imshow(img) + if titles != None: + ax.set_title(titles[idx]) + + if axis_off: + plt.axis('off') +# plt.xticks([]), plt.yticks([]) + ret_axes.append(ax) + + if subtitle is not None: + set_subtitle(subtitle) + if maximized: + maximize_figure() + + if save: + if path is None: + raise ValueError('path can not be None when save is True') + save_image(path) + if show: + plt.show() + return ret_axes + +def save_image(path, img = None, dpi = 150): + path = util.io.get_absolute_path(path) + util.io.make_parent_dir(path) + if img is None: + plt.gcf().savefig(path, dpi = dpi) + else: + plt.imsave(path, img) + +imwrite = save_image + +def to_ROI(ax, ROI): + xy1, xy2 = ROI + xmin, ymin = xy1 + xmax, ymax = xy2 + ax.set_xlim(xmin, xmax) + #ax.extent + ax.set_ylim(ymax, ymin) + +def set_subtitle(title, fontsize = 12): + plt.gcf().suptitle(title, fontsize=fontsize) + +def show(maximized = False): + if maximized: + maximize_figure() + plt.show() + +def draw(): + plt.gcf().canvas.draw() + +def get_random_line_style(): + colors = ['r', 'g', 'b', 'k'] + line_types = ['-']#, '--', '-.', ':'] + idx = util.rand.randint(len(colors)) + color = colors[idx] + idx = util.rand.randint(len(line_types)) + line_type = line_types[idx] + return color + line_type diff --git a/maskrcnn_benchmark/data/datasets/evaluation/word/util/proc.py b/maskrcnn_benchmark/data/datasets/evaluation/word/util/proc.py new file mode 100644 index 0000000000000000000000000000000000000000..a6621c00b1cc3f4efa60b1dbaac72d8717f565b3 --- /dev/null +++ b/maskrcnn_benchmark/data/datasets/evaluation/word/util/proc.py @@ -0,0 +1,51 @@ +import multiprocessing + +def cpu_count(): + return multiprocessing.cpu_count() + +def get_pool(processes): + pool = multiprocessing.Pool(processes = processes) + return pool + +def wait_for_pool(pool): + pool.close() + pool.join() + +def set_proc_name(name): + import setproctitle + setproctitle.setproctitle(name) + +def kill(pid): + import util + if type(pid) == list: + for p in pid: + kill(p) + elif type(pid) == int: + cmd = 'kill -9 %d'%(pid) + print cmd + print util.cmd.cmd(cmd) + elif type(pid) == str: + pids = get_pid(pid) + kill(pids) + else: + raise ValueError, 'Not supported parameter type:', type(pid) + +def ps_aux_grep(pattern): + import util + cmd = 'ps aux|grep %s'%(pattern) + return util.cmd.cmd(cmd) + + +def get_pid(pattern): + import util + cmd = 'ps aux|grep %s'%(pattern) + results = util.cmd.cmd(cmd) + results = util.str.split(results, '\n') + pids = [] + for result in results: + info = result.split() + if len(info) > 0: + pid = int(info[1]) + pids.append(pid) + return pids + diff --git a/maskrcnn_benchmark/data/datasets/evaluation/word/util/rand.py b/maskrcnn_benchmark/data/datasets/evaluation/word/util/rand.py new file mode 100644 index 0000000000000000000000000000000000000000..34ffa45c2fa36805d16ac5543bb453221b67a175 --- /dev/null +++ b/maskrcnn_benchmark/data/datasets/evaluation/word/util/rand.py @@ -0,0 +1,37 @@ +#coding=utf-8 +''' +Created on 2016年9月27日 + +@author: dengdan +''' +import numpy as np +import time +import random + +rng = np.random.RandomState(int(time.time())) + +rand = np.random.rand +""" +Create an array of the given shape and populate it with random samples from a uniform distribution over [0, 1) 
+""" + + +def normal(shape, mu = 0, sigma_square = 1): + return rng.normal(mu, np.sqrt(sigma_square), shape) + +def randint(low = 2 ** 30, high = None, shape = None): + """ + low: the higher bound except when high is not None. + high: when it is not none, low must be smaller than it + shape: if not provided, a scalar will be returned + """ + return rng.randint(low = low, high = high, size = shape) + +def shuffle(lst): + random.shuffle(lst) + +def sample(lst, n): + return random.sample(lst, n) + + + \ No newline at end of file diff --git a/maskrcnn_benchmark/data/datasets/evaluation/word/util/statistic.py b/maskrcnn_benchmark/data/datasets/evaluation/word/util/statistic.py new file mode 100644 index 0000000000000000000000000000000000000000..69dab91c46cd93c0e666dca9aa067a7cbe384ac5 --- /dev/null +++ b/maskrcnn_benchmark/data/datasets/evaluation/word/util/statistic.py @@ -0,0 +1,16 @@ +#coding=utf-8 +''' +Created on 2016年10月8日 + +@author: dengdan +''' +import numpy as np +import util.np + +def D(x): + x = util.np.flatten(x) + return np.var(x) + +def E(x): + x = util.np.flatten(x) + return np.average(x) diff --git a/maskrcnn_benchmark/data/datasets/evaluation/word/util/str_.py b/maskrcnn_benchmark/data/datasets/evaluation/word/util/str_.py new file mode 100644 index 0000000000000000000000000000000000000000..3fda36820bc711e2548df52d70472adc8c20b99b --- /dev/null +++ b/maskrcnn_benchmark/data/datasets/evaluation/word/util/str_.py @@ -0,0 +1,94 @@ +# encoding = utf-8 +def int_array_to_str(arr): + """turn an int array to a str""" + return "".join(map(chr, arr)) + +def join(arr, splitter=','): + temp = [] + for e in arr: + temp.append(e) + temp.append(splitter) + temp.pop() + return "".join(temp) + +def is_str(s): + return type(s) == str + +def to_lowercase(s): + return str.lower(s) + +def to_uppercase(s): + return str.upper(s) + +def ends_with(s, suffix, ignore_case = False): + """ + suffix: str, list, or tuple + """ + if is_str(suffix): + suffix = [suffix] + suffix = list(suffix) + if ignore_case: + for idx, suf in enumerate(suffix): + suffix[idx] = to_lowercase(suf) + s = to_lowercase(s) + suffix = tuple(suffix) + return s.endswith(suffix) + +def starts_with(s, prefix, ignore_case = False): + """ + prefix: str, list, or tuple + """ + if is_str(prefix): + prefix = [prefix] + prefix = list(prefix) + if ignore_case: + for idx, pre in enumerate(prefix): + prefix[idx] = to_lowercase(pre) + s = to_lowercase(s) + prefix = tuple(prefix) + return s.startswith(prefix) + + +def contains(s, target, ignore_case = False): + if ignore_case: + s = to_lowercase(s) + target = to_lowercase(target) + return s.find(target) >= 0 + +def index_of(s, target): + return s.find(target) + +def replace_all(s, old, new, reg = False): + if reg: + import re + targets = re.findall(old, s) + for t in targets: + s = s.replace(t, new) + else: + s = s.replace(old, new) + return s + +def remove_all(s, sub): + return replace_all(s, sub, '') + +def split(s, splitter, reg = False): + if not reg: + return s.split(splitter) + import re + return re.split(splitter, s) + +def remove_invisible(s): + s = replace_all(s, ' ', '') + s = replace_all(s, '\n', '') + s = replace_all(s, '\t', '') + s = replace_all(s, '\r', '') + return s + +def find_all(s, pattern): + import re + return re.findall(pattern, s) + +def is_none_or_empty(s): + if s is None: + return True + return len(s)==0; diff --git a/maskrcnn_benchmark/data/datasets/evaluation/word/util/str_.py~ b/maskrcnn_benchmark/data/datasets/evaluation/word/util/str_.py~ new file mode 100644 index 
0000000000000000000000000000000000000000..7b88e8f7e1e09d6d3e73b0d40f2abccf5c43e8a3 --- /dev/null +++ b/maskrcnn_benchmark/data/datasets/evaluation/word/util/str_.py~ @@ -0,0 +1,75 @@ +# encoding = utf-8 +def int_array_to_str(arr): + """turn an int array to a str""" + return "".join(map(chr, arr)) + +def join(arr, splitter=','): + temp = [] + for e in arr: + temp.append(e) + temp.append(splitter) + temp.pop() + return "".join(temp) + +def is_str(s): + return type(s) == str + +def to_lowercase(s): + return str.lower(s) + +def ends_with(s, suffix, ignore_case = False): + """ + suffix: str, list, or tuple + """ + if is_str(suffix): + suffix = [suffix] + suffix = list(suffix) + if ignore_case: + for idx, suf in enumerate(suffix): + suffix[idx] = to_lowercase(suf) + s = to_lowercase(s) + suffix = tuple(suffix) + return s.endswith(suffix) + +def starts_with(s, prefix, ignore_case = False): + """ + prefix: str, list, or tuple + """ + if is_str(prefix): + prefix = [prefix] + prefix = list(prefix) + if ignore_case: + for idx, pre in enumerate(prefix): + prefix[idx] = to_lowercase(pre) + s = to_lowercase(s) + prefix = tuple(prefix) + return s.startswith(prefix) + + +def contains(s, target, ignore_case = False): + if ignore_case: + s = to_lowercase(s) + target = to_lowercase(target) + return s.find(target) >= 0 + +def replace_all(s, old, new): + return s.replace(old, new) + +def remove_all(s, sub): + return replace_all(s, sub, '') + +def split(s, splitter): +# return s.split(splitter) + import re + return re.split(splitter, s) +def remove_invisible(s): + s = replace_all(s, ' ', '') + s = replace_all(s, '\n', '') + s = replace_all(s, '\t', '') + s = replace_all(s, '\r', '') + return s + +def find_all(s, pattern): + import re + return re.findall(pattern, s) + diff --git a/maskrcnn_benchmark/data/datasets/evaluation/word/util/t.py b/maskrcnn_benchmark/data/datasets/evaluation/word/util/t.py new file mode 100644 index 0000000000000000000000000000000000000000..72ee2de7dadfe782f0f9092e2dc0233d93e468a5 --- /dev/null +++ b/maskrcnn_benchmark/data/datasets/evaluation/word/util/t.py @@ -0,0 +1,25 @@ +#encoding=utf-8 +""" +for theano shortcuts +""" +import theano +import theano.tensor as T +import util.rand + +trng = T.shared_randomstreams.RandomStreams(util.rand.randint()) +scan_until = theano.scan_module.until + +def add_noise(input, noise_level): + noise = trng.binomial(size = input.shape, n = 1, p = 1 - noise_level) + return noise * input + +def crop_into(large, small): + """ + center crop large image into small. 
+ both 'large' and 'small' are 4D: (batch_size, channels, h, w) + """ + + h1, w1 = large.shape[2:] + h2, w2 = small.shape[2:] + y, x = (h1 - h2) / 2, (w1 - h2)/2 + return large[:, :, y: y + h2, x: x + w2 ] \ No newline at end of file diff --git a/maskrcnn_benchmark/data/datasets/evaluation/word/util/test.py b/maskrcnn_benchmark/data/datasets/evaluation/word/util/test.py new file mode 100644 index 0000000000000000000000000000000000000000..ae99b2778f346df88890a0f3e2c1d0b730a5309d --- /dev/null +++ b/maskrcnn_benchmark/data/datasets/evaluation/word/util/test.py @@ -0,0 +1,7 @@ +#encoding = utf-8 +import numpy as np + +assert_true = np.testing.assert_ +assert_equal = np.testing.assert_equal +assert_array_equal = np.testing.assert_array_equal +assert_almost_equal = np.testing.assert_almost_equal diff --git a/maskrcnn_benchmark/data/datasets/evaluation/word/util/tf.py b/maskrcnn_benchmark/data/datasets/evaluation/word/util/tf.py new file mode 100644 index 0000000000000000000000000000000000000000..5db3b39e69a20717c7d840e537027ce0d833306c --- /dev/null +++ b/maskrcnn_benchmark/data/datasets/evaluation/word/util/tf.py @@ -0,0 +1,269 @@ +from __future__ import print_function + + +try: + import tensorflow as tf + from tensorflow.python.ops import nn + relu = nn.relu + slim = tf.contrib.slim + sigmoid = nn.sigmoid + softmax = nn.softmax +except: + print("tensorflow is not installed, util.tf can not be used.") + +def is_gpu_available(cuda_only=True): + """ + code from https://github.com/tensorflow/tensorflow/blob/master/tensorflow/python/platform/test.py + Returns whether TensorFlow can access a GPU. + Args: + cuda_only: limit the search to CUDA gpus. + Returns: + True iff a gpu device of the requested kind is available. + """ + from tensorflow.python.client import device_lib as _device_lib + + if cuda_only: + return any((x.device_type == 'GPU') + for x in _device_lib.list_local_devices()) + else: + return any((x.device_type == 'GPU' or x.device_type == 'SYCL') + for x in _device_lib.list_local_devices()) + + + +def get_available_gpus(num_gpus = None): + """ + Modified on http://stackoverflow.com/questions/38559755/how-to-get-current-available-gpus-in-tensorflow + However, the original code will occupy all available gpu memory. + The modified code need a parameter: num_gpus. It does nothing but return the device handler name + It will work well on single-maching-training, but I don't know whether it will work well on a cluster. 
+ """ + if num_gpus == None: + from tensorflow.python.client import device_lib as _device_lib + local_device_protos = _device_lib.list_local_devices() + return [x.name for x in local_device_protos if x.device_type == 'GPU'] + else: + return ['/gpu:%d'%(idx) for idx in xrange(num_gpus)] + +def get_latest_ckpt(path): +# tf.train.latest_checkpoint + import util + path = util.io.get_absolute_path(path) + if util.io.is_dir(path): + ckpt = tf.train.get_checkpoint_state(path) + if ckpt is not None: + ckpt_path = ckpt.model_checkpoint_path + else: + ckpt_path = None + else: + ckpt_path = path; + return ckpt_path + +def get_all_ckpts(path): + ckpt = tf.train.get_checkpoint_state(path) + all_ckpts = ckpt.all_model_checkpoint_paths + ckpts = [str(c) for c in all_ckpts] + return ckpts + +def get_iter(ckpt): + import util + iter_ = int(util.str.find_all(ckpt, '.ckpt-\d+')[0].split('-')[-1]) + return iter_ + +def get_init_fn(checkpoint_path, train_dir, ignore_missing_vars = False, + checkpoint_exclude_scopes = None, model_name = None, checkpoint_model_scope = None): + """ + code from github/SSD-tensorflow/tf_utils.py + Returns a function run by the chief worker to warm-start the training. + Note that the init_fn is only run when initializing the model during the very + first global step. + + checkpoint_path: the checkpoint to be restored + train_dir: the directory where checkpoints are stored during training. + ignore_missing_vars: if False and there are variables in the model but not in the checkpoint, an error will be raised. + checkpoint_model_scope and model_name: if the root scope of checkpoints and the model in session is different, + (but the sub-scopes are all the same), specify them clearly + checkpoint_exclude_scopes: variables to be excluded when restoring from checkpoint_path. + Returns: + An init function run by the supervisor. + """ + import util + if util.str.is_none_or_empty(checkpoint_path): + return None + # Warn the user if a checkpoint exists in the train_dir. Then ignore. + if tf.train.latest_checkpoint(train_dir): + tf.logging.info( + 'Ignoring --checkpoint_path because a checkpoint already exists in %s' + % train_dir) + return None + + exclusions = [] + if checkpoint_exclude_scopes: + exclusions = [scope.strip() + for scope in checkpoint_exclude_scopes.split(',')] + + # TODO(sguada) variables.filter_variables() + variables_to_restore = [] + for var in slim.get_model_variables(): + excluded = False + for exclusion in exclusions: + if var.op.name.startswith(exclusion): + excluded = True + break + if not excluded: + variables_to_restore.append(var) + # Change model scope if necessary. + if checkpoint_model_scope is not None: + variables_to_restore = {checkpoint_model_scope + '/' + var.op.name : var for var in variables_to_restore} + tf.logging.info('variables_to_restore: %r'%(variables_to_restore)) + checkpoint_path = get_latest_ckpt(checkpoint_path) + tf.logging.info('Fine-tuning from %s. Ignoring missing vars: %s' % (checkpoint_path, ignore_missing_vars)) + print ('checkpoint_path', checkpoint_path) + return slim.assign_from_checkpoint_fn( + checkpoint_path, + variables_to_restore, + ignore_missing_vars=ignore_missing_vars) + + +def get_variables_to_train(flags = None): + """code from github/SSD-tensorflow/tf_utils.py + Returns a list of variables to train. + + Returns: + A list of variables to train by the optimizer. 
+ """ + if flags is None or flags.trainable_scopes is None: + return tf.trainable_variables() + else: + scopes = [scope.strip() for scope in flags.trainable_scopes.split(',')] + + variables_to_train = [] + for scope in scopes: + variables = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope) + variables_to_train.extend(variables) + return variables_to_train + +def Print(tensor, data, msg = '', file = None, mode = 'w'): + from tensorflow.python.ops import control_flow_ops + import util + def np_print(*args): + if util.str.contains(msg, '%'): + message = msg%tuple(args) + else: + message = msg + ' %'*len(args)%tuple(args) + if file is not None: + file_path = util.io.get_absolute_path(file) + print('writting message to file(%s):'%(file_path), message) + with open(file_path, mode) as f: + print(message, file = f) + else: + print(message) + return control_flow_ops.with_dependencies([tf.py_func(np_print, data, [])], tensor) + +def get_variable_names_in_checkpoint(path, return_shapes = False, return_reader = False): + """ + Args: + path: the path to training directory containing checkpoints, + or path to checkpoint + Return: + a list of variable names in the checkpoint + """ + import util + ckpt = get_latest_ckpt(path) + ckpt_reader = tf.train.NewCheckpointReader(ckpt) + ckpt_vars = ckpt_reader.get_variable_to_shape_map() + names = [var for var in ckpt_vars] + if return_shapes: + return names, ckpt_vars + def get(name): + return ckpt_reader.get_tensor(name) + if return_reader: + return names, get + return names + + + +def min_area_rect(xs, ys): + import util + rects = tf.py_func(util.img.min_area_rect, [xs, ys], xs.dtype) + rects.set_shape([None, 5]) + return rects + + +def gpu_config(config = None, allow_growth = None, gpu_memory_fraction = None): + if config is None: + config = tf.ConfigProto() + + if allow_growth is not None: + config.gpu_options.allow_growth = allow_growth + + if gpu_memory_fraction is not None: + config.gpu_options.per_process_gpu_memory_fraction = gpu_memory_fraction + + return config + +def wait_for_checkpoint(path): + from tensorflow.contrib.training.python.training import evaluation + return evaluation.checkpoints_iterator(path) + +def focal_loss(labels, logits, gamma = 2.0, alpha = 0.75, normalize = True): + labels = tf.where(labels > 0, tf.ones_like(labels), tf.zeros_like(labels)) + labels = tf.cast(labels, tf.float32) + probs = tf.sigmoid(logits) + CE = tf.nn.sigmoid_cross_entropy_with_logits(labels = labels, logits = logits) + + alpha_t = tf.ones_like(logits) * alpha + alpha_t = tf.where(labels > 0, alpha_t, 1.0 - alpha_t) + probs_t = tf.where(labels > 0, probs, 1.0 - probs) + + focal_matrix = alpha_t * tf.pow((1.0 - probs_t), gamma) + fl = focal_matrix * CE + + fl = tf.reduce_sum(fl) + if normalize: + #n_pos = tf.reduce_sum(labels) + #fl = fl / tf.cast(n_pos, tf.float32) + total_weights = tf.stop_gradient(tf.reduce_sum(focal_matrix)) + fl = fl / total_weights + return fl + + +def focal_loss_layer_initializer(sigma = 0.01, pi = 0.01): + import numpy as np + b0 = - np.log((1 - pi) / pi) + return tf.random_normal_initializer(stddev = sigma), \ + tf.constant_initializer(b0) + + +def sum_gradients(clone_grads, do_summary = False): + averaged_grads = [] + for grad_and_vars in zip(*clone_grads): + grads = [] + var = grad_and_vars[0][1] + try: + for g, v in grad_and_vars: + assert v == var + grads.append(g) + grad = tf.add_n(grads, name = v.op.name + '_summed_gradients') + except: + import pdb + pdb.set_trace() + + averaged_grads.append((grad, v)) + + if do_summary: 
+ tf.summary.histogram("variables_and_gradients_" + grad.op.name, grad) + tf.summary.histogram("variables_and_gradients_" + v.op.name, v) + tf.summary.scalar("variables_and_gradients_" + grad.op.name+\ + '_mean/var_mean', tf.reduce_mean(grad)/tf.reduce_mean(var)) + tf.summary.scalar("variables_and_gradients_" + v.op.name+'_mean',tf.reduce_mean(var)) + return averaged_grads + +def get_update_op(): + """ + Extremely important for BatchNorm + """ + update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS) + if update_ops: + return tf.group(*update_ops) + return None diff --git a/maskrcnn_benchmark/data/datasets/evaluation/word/util/thread_.py b/maskrcnn_benchmark/data/datasets/evaluation/word/util/thread_.py new file mode 100644 index 0000000000000000000000000000000000000000..907e01f922fecf85a68072df7c9dd5557f8be13f --- /dev/null +++ b/maskrcnn_benchmark/data/datasets/evaluation/word/util/thread_.py @@ -0,0 +1,62 @@ +import threading +from threading import Thread + +def get_current_thread(): + return threading.current_thread() + +def get_current_thread_name(): + return get_current_thread().getName() + +def is_alive(t): + return t.is_alive() + +def create_and_start(name, target, daemon = True): + t = Thread(target= target) + t.daemon = True + t.setName(name) + t.start() + return t + + + +class ThreadPool(object): + def __init__(self, capacity = 10): + import threadpool + self.num_threads = capacity + self.pool = threadpool.ThreadPool(10) + + def add(self, fn, args): + import threadpool + if type(args) == list: + args = [(args, None)] + elif type(args) == dict: + args = [(None, args)] + else: + raise ValueError, "Unsuported args,", type(args) + request = threadpool.makeRequests(fn, args)[0] + self.pool.putRequest(request, block = False) + self.pool.poll() + + def join(self): + self.pool.wait() + +class ProcessPool(object): + """ + Remember that function in function is not supported by multiprocessing. 
+ """ + def __init__(self, capacity = 8): + from multiprocessing import Pool + + self.capacity = capacity + self.pool = Pool(capacity) + + def add(self, fn, args): + self.pool.apply_async(fn, args) +# self.pool.poll() +# self.pool.poll + + def join(self): + self.pool.close() + self.pool.join() + + diff --git a/maskrcnn_benchmark/data/datasets/evaluation/word/util/url.py b/maskrcnn_benchmark/data/datasets/evaluation/word/util/url.py new file mode 100644 index 0000000000000000000000000000000000000000..c08e6cac66aaa0092dc8ffa4945b653d0015f818 --- /dev/null +++ b/maskrcnn_benchmark/data/datasets/evaluation/word/util/url.py @@ -0,0 +1,17 @@ +import sys +import os +from six.moves import urllib + +import util +def download(url, path): + filename = path.split('/')[-1] + if not util.io.exists(path): + def _progress(count, block_size, total_size): + sys.stdout.write('\r-----Downloading %s %.1f%%' % (filename, + float(count * block_size) / float(total_size) * 100.0)) + sys.stdout.flush() + path, _ = urllib.request.urlretrieve(url, path, _progress) + print() + statinfo = os.stat(path) + print('Successfully downloaded', filename, statinfo.st_size, 'bytes.') + \ No newline at end of file diff --git a/maskrcnn_benchmark/data/datasets/evaluation/word/word_eval.py b/maskrcnn_benchmark/data/datasets/evaluation/word/word_eval.py new file mode 100644 index 0000000000000000000000000000000000000000..462875e4d54adf792039b2eac4b83da0c0d07423 --- /dev/null +++ b/maskrcnn_benchmark/data/datasets/evaluation/word/word_eval.py @@ -0,0 +1,782 @@ +import logging +import tempfile +import os +import torch +from collections import OrderedDict +import itertools +from tqdm import tqdm +from .util import io_ +from maskrcnn_benchmark.modeling.roi_heads.boundary_head.inference import Masker +from maskrcnn_benchmark.structures.bounding_box import BoxList +from maskrcnn_benchmark.structures.boxlist_ops import boxlist_iou + +from maskrcnn_benchmark.config import cfg +from shapely.geometry import * +import cv2 +import numpy as np +import csv +from .alfashape import getAlfaShapes +import torch.nn as nn + + +def do_coco_evaluation( + dataset, + predictions, + box_only, # False + output_folder, + iou_types, # 'segm' + expected_results, # [] + expected_results_sigma_tol, # 4 +): + logger = logging.getLogger("maskrcnn_benchmark.inference") + + if box_only: + logger.info("Evaluating bbox proposals") + areas = {"all": "", "small": "s", "medium": "m", "large": "l"} + res = COCOResults("box_proposal") + for limit in [100, 1000]: + for area, suffix in areas.items(): + stats = evaluate_box_proposals( + predictions, dataset, area=area, limit=limit + ) + key = "AR{}@{:d}".format(suffix, limit) + res.results["box_proposal"][key] = stats["ar"].item() + logger.info(res) + check_expected_results(res, expected_results, expected_results_sigma_tol) + if output_folder: + torch.save(res, os.path.join(output_folder, "box_proposals.pth")) + return + logger.info("Preparing results for COCO format") + coco_results = {} + if "bbox" in iou_types: + logger.info("Preparing bbox results") + coco_results["bbox"] = prepare_for_coco_detection(predictions, dataset) + if "bo" in iou_types: + logger.info("Preparing bo results") + coco_results["bo"] = prepare_for_boundary_segmentation(predictions, dataset) + logger.info("Do not apply evaluating predictions") + for iou_type in iou_types: + with tempfile.NamedTemporaryFile() as f: + file_path = f.name + if output_folder: + if not os.path.isdir(output_folder): + print('creating dir: ' + output_folder) + 
os.mkdir(output_folder) + file_path = os.path.join(output_folder, iou_type + ".json") + res = evaluate_predictions_on_coco( + dataset.coco, coco_results[iou_type], file_path, iou_type + ) + + return None + + +def prepare_for_coco_detection(predictions, dataset): + # assert isinstance(dataset, COCODataset) + coco_results = [] + for image_id, prediction in enumerate(predictions): + original_id = dataset.id_to_img_map[image_id] + if len(prediction) == 0: + continue + + # TODO replace with get_img_info? + image_width = dataset.coco.imgs[original_id]["width"] + image_height = dataset.coco.imgs[original_id]["height"] + prediction = prediction.resize((image_width, image_height)) + prediction = prediction.convert("xywh") + + boxes = prediction.bbox.tolist() + scores = prediction.get_field("scores").tolist() + labels = prediction.get_field("labels").tolist() + + mapped_labels = [dataset.contiguous_category_id_to_json_id[i] for i in labels] + + coco_results.extend( + [ + { + "image_id": original_id, + "category_id": mapped_labels[k], + "bbox": box, + "score": scores[k], + } + for k, box in enumerate(boxes) + ] + ) + return coco_results + + +def contour_to_xys(cnt, image_shape): + """Convert rect to xys, i.e., eight points + The `image_shape` is used to to make sure all points return are valid, i.e., within image area + """ + rect = cv2.minAreaRect(cnt) + h, w = image_shape[0:2] + + def get_valid_x(x): + if x < 0: + return 0 + if x >= w: + return w - 1 + return x + + def get_valid_y(y): + if y < 0: + return 0 + if y >= h: + return h - 1 + return y + + points = cv2.boxPoints(rect) + points = np.int0(points) + for i_xy, (x, y) in enumerate(points): + x = get_valid_x(x) + y = get_valid_y(y) + points[i_xy, :] = [x, y] + points = np.reshape(points, -1) + return points + + +def contour_to_valid(cnt, image_shape): + """Convert rect to xys, i.e., eight points + The `image_shape` is used to to make sure all points return are valid, i.e., within image area + """ + # rect = cv2.minAreaRect(cnt) + if len(cnt.shape) != 3: + assert 1 < 0 + rect = cnt.reshape([cnt.shape[0], cnt.shape[2]]) + h, w = image_shape[0:2] + + def get_valid_x(x): + if x < 0: + return 0 + if x >= w: + return w - 1 + return x + + def get_valid_y(y): + if y < 0: + return 0 + if y >= h: + return h - 1 + return y + for i_xy, (x, y) in enumerate(rect): + x = get_valid_x(x) + y = get_valid_y(y) + rect[i_xy, :] = [x, y] + + points = np.reshape(rect, -1) + return points + + +def _nms_y(heat, kernel=3): + pad = (kernel - 1) // 2 + hmax = nn.functional.max_pool2d( + heat, (1, kernel), stride=1, padding=(0, pad)) + keep = (hmax == heat).float() + return heat * keep + + +def _nms_x(heat, kernel=3): + pad = (kernel - 1) // 2 + hmax = nn.functional.max_pool2d( + heat, (kernel, 1), stride=1, padding=(pad, 0)) + keep = (hmax == heat).float() + return heat * keep + +def CTW_order_lr(map_in): + + line_out_l2r = [] + line_out_r2l = [] + + map_in = torch.tensor(map_in) + value, top = torch.topk(map_in, 2, dim=0) + value = value.numpy() + top = top.numpy() + top_th = np.where(value[1] > 0.1)[0] # L + # print(top_th) + if len(top_th) == 0: + return [] + top1 = np.sort(top, axis=0) + for i in range(len(top_th)): + line_out_l2r.append([top_th[i], top1[0][top_th[i]]]) + line_out_r2l.append([top_th[i], top1[1][top_th[i]]]) + line_out = line_out_l2r+line_out_r2l[::-1] + # print(line_out) + return line_out + +def CTW_order_bt(map_in): + + line_out_t2b = [] + line_out_b2t = [] + + map_in = torch.tensor(map_in) + value, top = torch.topk(map_in, 2, dim=1) + value = 
value.numpy() + top = top.numpy() + top_th = np.where(value[:, 1] > 0.1)[0] # H + if len(top_th) == 0: + return [] + top1 = np.sort(top, axis=1) + for i in range(len(top_th)): + line_out_b2t.append([top1[top_th[i]][0], top_th[i]]) + line_out_t2b.append([top1[top_th[i]][1], top_th[i]]) + line_out = line_out_b2t[::-1] + line_out_t2b + # print(line_out) + return line_out + +def boundary_to_mask_ic(bo_x, bo_y, name, num): + + # NMS Hmap and Vmap + Vmap = _nms_x(bo_x, kernel=5) + Hmap = _nms_y(bo_y, kernel=3) + Vmap = Vmap[0] + Hmap = Hmap[0] + ploys_Alfa_x = Vmap.clone().numpy() + ploys_Alfa_y = Hmap.clone().numpy() + + # Threshold Hmap and Vmap + thresh = 0.5 + ploys_Alfa_x[ploys_Alfa_x < thresh] = 0 + ploys_Alfa_x[ploys_Alfa_x >= thresh] = 1 + ploys_Alfa_y[ploys_Alfa_y < thresh] = 0 + ploys_Alfa_y[ploys_Alfa_y >= thresh] = 1 + # Output points with strong texture inforamtion in both maps + ploys_Alfa = ploys_Alfa_x + ploys_Alfa_y + ploys_Alfa[ploys_Alfa < 2] = 0 + ploys_Alfa[ploys_Alfa == 2] = 1 + img_draw = np.zeros([ploys_Alfa_y.shape[-1], ploys_Alfa_y.shape[-1]], dtype=np.uint8) + + # calculate polygon by Alpha-Shape Algorithm + if ploys_Alfa.sum() == 0: + return img_draw + ploys_Alfa_inds = np.argwhere(ploys_Alfa == 1) + zero_detect_x = ploys_Alfa_inds[:, 0] - ploys_Alfa_inds[0, 0] + zero_detect_y = ploys_Alfa_inds[:, 1] - ploys_Alfa_inds[0, 1] + if np.where(zero_detect_x != 0)[0].shape[0] == 0 or np.where(zero_detect_y != 0)[0].shape[0] == 0 or \ + ploys_Alfa_inds.shape[0] < 4: + draw_line = ploys_Alfa_inds[np.newaxis, np.newaxis, :, :] + cv2.fillPoly(img_draw, draw_line, 1) + return img_draw + ploys_Alfa_inds = ploys_Alfa_inds.tolist() + ploys_Alfa_inds = [tuple(ploys_Alfa_ind) for ploys_Alfa_ind in ploys_Alfa_inds] + lines = getAlfaShapes(ploys_Alfa_inds, alfas=[1]) + draw_line = np.array(lines) + if len(draw_line.shape) == 4: + if draw_line.shape[1] == 1: + draw_line[0, 0, :, :] = draw_line[0, 0, :, ::-1] + cv2.fillPoly(img_draw, draw_line, 1) + else: + i_draw = 0 + for draw_l in draw_line[0]: + img_draw_new = np.zeros([28, 28], dtype=np.uint8) + draw_l = draw_l[np.newaxis, np.newaxis, :, :] + cv2.fillPoly(img_draw, np.int32(draw_l), 1) + cv2.fillPoly(img_draw_new, np.int32(draw_l), 1) + i_draw += 1 + + else: + for i, line in enumerate(lines[0]): + draw_line = np.array(line) + draw_line = draw_line[np.newaxis, np.newaxis, :, :] + draw_line[0, 0, :, :] = draw_line[0, 0, :, ::-1] + cv2.fillPoly(img_draw, draw_line, 1) + return img_draw + +def boundary_to_mask_ctw(bo_x,bo_y, name, num, image_name_name,p_temp_box): + w_half = (p_temp_box[2] - p_temp_box[0]) * .5 + h_half = (p_temp_box[3] - p_temp_box[1]) * .5 + thresh_total = 0.5 + + if w_half >= h_half: + # point re-scoring + bo_x = _nms_x(bo_x, kernel=9) + bo_x = bo_x[0] + bo_y = bo_y[0] + ploys_Alfa_x = bo_x.clone().numpy() + ploys_Alfa_y = bo_y.clone().numpy() + thresh_x = thresh_total + thresh_y = thresh_total + ploys_Alfa_x_1 = bo_x.clone().numpy() + ploys_Alfa_y_1 = bo_y.clone().numpy() + ploys_Alfa__1 = ploys_Alfa_x_1 + ploys_Alfa_y_1 + ploys_Alfa_x[ploys_Alfa_x < thresh_x] = 0 + ploys_Alfa_x[ploys_Alfa_x >= thresh_x] = 1 + ploys_Alfa_y[ploys_Alfa_y < thresh_y] = 0 + ploys_Alfa_y[ploys_Alfa_y >= thresh_y] = 1 + ploys_Alfa = ploys_Alfa_x + ploys_Alfa_y + ploys_Alfa[ploys_Alfa < 2] = 0 + ploys_Alfa[ploys_Alfa == 2] = 1 + ploys_Alfa *= ploys_Alfa__1 + # rebuild text region from contour points + img_draw = np.zeros([ploys_Alfa_y.shape[-1], ploys_Alfa_y.shape[-1]], dtype=np.uint8) + if ploys_Alfa.sum() == 0: + return img_draw + lines = 
CTW_order_lr(ploys_Alfa) + else: + bo_y = _nms_y(bo_y,kernel=9) + bo_x = bo_x[0] + bo_y = bo_y[0] + ploys_Alfa_x = bo_x.clone().numpy() + ploys_Alfa_y = bo_y.clone().numpy() + thresh_x = thresh_total + thresh_y = thresh_total + ploys_Alfa_x_1 = bo_x.clone().numpy() + ploys_Alfa_y_1 = bo_y.clone().numpy() + ploys_Alfa__1 = ploys_Alfa_x_1 + ploys_Alfa_y_1 + ploys_Alfa_x[ploys_Alfa_x < thresh_x] = 0 + ploys_Alfa_x[ploys_Alfa_x >= thresh_x] = 1 + ploys_Alfa_y[ploys_Alfa_y < thresh_y] = 0 + ploys_Alfa_y[ploys_Alfa_y >= thresh_y] = 1 + ploys_Alfa = ploys_Alfa_x + ploys_Alfa_y + ploys_Alfa[ploys_Alfa < 2] = 0 + ploys_Alfa[ploys_Alfa == 2] = 1 + ploys_Alfa *= ploys_Alfa__1 + img_draw = np.zeros([ploys_Alfa_y.shape[-1], ploys_Alfa_y.shape[-1]], dtype=np.uint8) + if ploys_Alfa.sum() == 0: + return img_draw + lines = CTW_order_bt(ploys_Alfa) + if len(lines) <=10: + return img_draw + draw_line = np.array(lines) + draw_line = draw_line[np.newaxis, np.newaxis, :, :] + cv2.fillPoly(img_draw, draw_line, 1) + img_draw = img_draw.astype(np.uint8) + kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (5, 5)) + img_draw = cv2.morphologyEx(img_draw, cv2.MORPH_CLOSE, kernel) + return img_draw + +def mask_to_roRect(mask, img_shape): + ## convert mask into rotated rect + e = mask[0, :, :] + _, countours, hier = cv2.findContours(e.clone().numpy(), cv2.RETR_CCOMP, cv2.CHAIN_APPROX_NONE) # Aarlog + if len(countours) == 0: + return np.zeros((1, 8)) + t_c = countours[0].copy() + quad = contour_to_xys(t_c, img_shape) + return quad + + +def mask_to_contours(mask, img_shape): + e = mask[0, :, :] + + _, countours, hier = cv2.findContours(e.clone().numpy(), cv2.RETR_CCOMP, cv2.CHAIN_APPROX_NONE) # Aarlog + + if len(countours) == 0: + return np.zeros((1, 8)) + t_c = countours[0].copy() + quad = contour_to_valid(t_c, img_shape) + return quad + + +def write_result_as_txt(image_name, bboxes, path): + if not os.path.exists(path): + os.makedirs(path) + + filename = io_.join_path(path, '%s.txt' % (image_name)) + lines = [] + for b_idx, bbox in enumerate(bboxes): + if len(bbox) < 6: + continue + values = [int(v) for v in bbox] + # line = "%d, %d, %d, %d, %d, %d, %d, %d\n"%tuple(values) + line = "%d" % values[0] + for v_id in range(1, len(values)): + line += ", %d" % values[v_id] + line += '\n' + lines.append(line) + io_.write_lines(filename, lines) + + +def prepare_for_boundary_segmentation(predictions, dataset): + import pycocotools.mask as mask_util + import numpy as np + + masker = Masker(threshold=0.5, padding=1) + coco_results = [] + + for image_id, prediction in tqdm(enumerate(predictions)): + original_id = dataset.id_to_img_map[image_id] + image_name = dataset.coco.imgs[original_id]["file_name"].split('.')[0] + im_w_name = dataset.coco.imgs[original_id]["file_name"] + if len(prediction) == 0: + continue + + # TODO replace with get_img_info? 
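+        # rescale the prediction to the original image resolution, then decode the
+        # predicted x/y boundary maps (mask_x / mask_y) into per-instance text masks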
+ image_width = dataset.coco.imgs[original_id]["width"] + image_height = dataset.coco.imgs[original_id]["height"] + prediction = prediction.resize((image_width, image_height)) + masks_x = prediction.get_field("mask_x") + masks_y = prediction.get_field("mask_y") + + if 'ic15' in cfg.DATASETS.TEST[0]: + masks = [boundary_to_mask_ic(mask_x, mask_y, dataset.coco.imgs[original_id]["file_name"], number) for + mask_x, mask_y, number in zip(masks_x, masks_y,list(range(masks_x.shape[0])))] + elif 'CTW' in cfg.DATASETS.TEST[0]: + masks = [boundary_to_mask_ctw(mask_x, mask_y, dataset.coco.imgs[original_id]["file_name"], number, image_name, + p_temp) for + mask_x, mask_y, number, p_temp in zip(masks_x, masks_y, + list(range(masks_x.shape[0])), prediction.bbox)] + else: + print('Please add your own construction code!') + assert 1<0 + + masks = torch.from_numpy(np.array(masks)[:, np.newaxis, :, :]) + # Masker is necessary only if masks haven't been already resized. + if list(masks.shape[-2:]) != [image_height, image_width]: + masks = masker(masks.expand(1, -1, -1, -1, -1), prediction) + masks = masks[0] + + scores = prediction.get_field("scores").tolist() + labels = prediction.get_field("labels").tolist() + if 'ic15' in cfg.DATASETS.TEST[0]: + rects = [mask_to_roRect(mask, [image_height, image_width]) for mask in masks] + if 'CTW' in cfg.DATASETS.TEST[0]: + contours = [mask_to_contours(mask, [image_height, image_width]) for mask in masks] + # output for evaluation + write_result_as_txt(image_name, contours, './output/ctw/results.txt') + # visualization + if cfg.DATASETS.Test_Visual: + im_write = cv2.imread( + '../ct/dataset/ctw/ctw_test_images/' + im_w_name)[:, :,::-1] + for box in contours: + box = np.array(box) + box = np.around(box).astype(np.int32) + cv2.polylines(im_write[:, :, ::-1], [box.astype(np.int32).reshape((-1, 1, 2))], True, color=(0, 255, 0), thickness=2) # 0,255,255 y 0,255,0 g + cv2.imwrite('./det_visual/' + im_w_name,im_write[:, :, ::-1]) + + if 'ic15' in cfg.DATASETS.TEST[0]: + mapped_labels = [dataset.contiguous_category_id_to_json_id[i] for i in labels] + esd = [] + for k, rect in enumerate(rects): + if rect.all() == 0: + continue + else: + esd.append( + { + "image_id": original_id, + "category_id": mapped_labels[k], + "seg_rorect": rect.tolist(), + "score": scores[k], + } + ) + if cfg.PROCESS.PNMS: + pnms_thresh = cfg.PROCESS.NMS_THRESH + keep = esd_pnms(esd, pnms_thresh) + new_esd = [] + for i in keep: + new_esd.append(esd[i]) + coco_results.extend(new_esd) + # visualization + if cfg.DATASETS.Test_Visual: + im_write = cv2.imread( + '../ct/dataset/ic15/ic15_test_images/' + im_w_name)[ + :, :, ::-1] + for i in keep: + box = esd[i] + # print(box) + # assert 1<0 + box = np.array(box['seg_rorect']) + box = np.around(box).astype(np.int32) + cv2.polylines(im_write[:, :, ::-1], [box.astype(np.int32).reshape((-1, 1, 2))], True, + color=(0, 255, 0), thickness=2) # 0,255,255 y 0,255,0 g + cv2.imwrite('./det_visual/' + im_w_name, im_write[:, :, ::-1]) + else: + coco_results.extend(esd) + + + return coco_results + +def ke_to_quad(ke, mty, img_shape): + mt = mty[:].argmax() + quad = paraToQuad_v3(ke, mt) + return quad + + +# polynms +def py_cpu_pnms(dets, scores, thresh): + pts = [] + for det in dets: + pts.append([[det[i][0], det[i][1]] for i in range(len(det))]) + order = scores.argsort()[::-1] + areas = np.zeros(scores.shape) + order = scores.argsort()[::-1] + inter_areas = np.zeros((scores.shape[0], scores.shape[0])) + for il in range(len(pts)): + poly = Polygon(pts[il]) + areas[il] = 
poly.area + for jl in range(il, len(pts)): + polyj = Polygon(pts[jl]) + try: + inS = poly.intersection(polyj) + except: + print(poly, polyj) + inter_areas[il][jl] = inS.area + inter_areas[jl][il] = inS.area + + keep = [] + while order.size > 0: + i = order[0] + keep.append(i) + ovr = inter_areas[i][order[1:]] / (areas[i] + areas[order[1:]] - inter_areas[i][order[1:]]) + inds = np.where(ovr <= thresh)[0] + order = order[inds + 1] + return keep + + +def esd_pnms(esd, pnms_thresh): + scores = [] + dets = [] + for ele in esd: + score = ele['score'] + quad = ele['seg_rorect'] + # det = np.array([[quad[0][0], quad[0][1]], [quad[1][0], quad[1][1]],[quad[2][0], quad[2][1]],[quad[3][0], quad[3][1]]]) + det = np.array([[quad[0], quad[1]], [quad[2], quad[3]], [quad[4], quad[5]], [quad[6], quad[7]]]) + scores.append(score) + dets.append(det) + scores = np.array(scores) + dets = np.array(dets) + keep = py_cpu_pnms(dets, scores, pnms_thresh) + return keep + + +# inspired from Detectron +def evaluate_box_proposals( + predictions, dataset, thresholds=None, area="all", limit=None +): + """Evaluate detection proposal recall metrics. This function is a much + faster alternative to the official COCO API recall evaluation code. However, + it produces slightly different results. + """ + # Record max overlap value for each gt box + # Return vector of overlap values + areas = { + "all": 0, + "small": 1, + "medium": 2, + "large": 3, + "96-128": 4, + "128-256": 5, + "256-512": 6, + "512-inf": 7, + } + area_ranges = [ + [0 ** 2, 1e5 ** 2], # all + [0 ** 2, 32 ** 2], # small + [32 ** 2, 96 ** 2], # medium + [96 ** 2, 1e5 ** 2], # large + [96 ** 2, 128 ** 2], # 96-128 + [128 ** 2, 256 ** 2], # 128-256 + [256 ** 2, 512 ** 2], # 256-512 + [512 ** 2, 1e5 ** 2], + ] # 512-inf + assert area in areas, "Unknown area range: {}".format(area) + area_range = area_ranges[areas[area]] + gt_overlaps = [] + num_pos = 0 + + for image_id, prediction in enumerate(predictions): + original_id = dataset.id_to_img_map[image_id] + + # TODO replace with get_img_info? 
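+        # rescale proposals to the original image size so IoU against the COCO
+        # ground-truth boxes is computed in the same coordinate frame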
+ image_width = dataset.coco.imgs[original_id]["width"] + image_height = dataset.coco.imgs[original_id]["height"] + prediction = prediction.resize((image_width, image_height)) + + # sort predictions in descending order + # TODO maybe remove this and make it explicit in the documentation + inds = prediction.get_field("objectness").sort(descending=True)[1] + prediction = prediction[inds] + + ann_ids = dataset.coco.getAnnIds(imgIds=original_id) + anno = dataset.coco.loadAnns(ann_ids) + gt_boxes = [obj["bbox"] for obj in anno if obj["iscrowd"] == 0] + gt_boxes = torch.as_tensor(gt_boxes).reshape(-1, 4) # guard against no boxes + gt_boxes = BoxList(gt_boxes, (image_width, image_height), mode="xywh").convert( + "xyxy" + ) + gt_areas = torch.as_tensor([obj["area"] for obj in anno if obj["iscrowd"] == 0]) + + if len(gt_boxes) == 0: + continue + + valid_gt_inds = (gt_areas >= area_range[0]) & (gt_areas <= area_range[1]) + gt_boxes = gt_boxes[valid_gt_inds] + + num_pos += len(gt_boxes) + + if len(gt_boxes) == 0: + continue + + if len(prediction) == 0: + continue + + if limit is not None and len(prediction) > limit: + prediction = prediction[:limit] + + overlaps = boxlist_iou(prediction, gt_boxes) + + _gt_overlaps = torch.zeros(len(gt_boxes)) + for j in range(min(len(prediction), len(gt_boxes))): + # find which proposal box maximally covers each gt box + # and get the iou amount of coverage for each gt box + max_overlaps, argmax_overlaps = overlaps.max(dim=0) + + # find which gt box is 'best' covered (i.e. 'best' = most iou) + gt_ovr, gt_ind = max_overlaps.max(dim=0) + assert gt_ovr >= 0 + # find the proposal box that covers the best covered gt box + box_ind = argmax_overlaps[gt_ind] + # record the iou coverage of this gt box + _gt_overlaps[j] = overlaps[box_ind, gt_ind] + assert _gt_overlaps[j] == gt_ovr + # mark the proposal box and the gt box as used + overlaps[box_ind, :] = -1 + overlaps[:, gt_ind] = -1 + + # append recorded iou coverage level + gt_overlaps.append(_gt_overlaps) + gt_overlaps = torch.cat(gt_overlaps, dim=0) + gt_overlaps, _ = torch.sort(gt_overlaps) + + if thresholds is None: + step = 0.05 + thresholds = torch.arange(0.5, 0.95 + 1e-5, step, dtype=torch.float32) + recalls = torch.zeros_like(thresholds) + # compute recall for each iou threshold + for i, t in enumerate(thresholds): + recalls[i] = (gt_overlaps >= t).float().sum() / float(num_pos) + # ar = 2 * np.trapz(recalls, thresholds) + ar = recalls.mean() + return { + "ar": ar, + "recalls": recalls, + "thresholds": thresholds, + "gt_overlaps": gt_overlaps, + "num_pos": num_pos, + } + + +def evaluate_predictions_on_coco( + coco_gt, coco_results, json_result_file, iou_type="bbox" +): + import json + + print('writing results to ' + json_result_file) + with open(json_result_file, "w") as f: + json.dump(coco_results, f) + + # from pycocotools.cocoeval import COCOeval + + # coco_dt = coco_gt.loadRes(str(json_result_file)) + # # coco_dt = coco_gt.loadRes(coco_results) + # coco_eval = COCOeval(coco_gt, coco_dt, iou_type) + # coco_eval.evaluate() + # coco_eval.accumulate() + # coco_eval.summarize() + # return coco_eval + return None + + +class COCOResults(object): + METRICS = { + "bbox": ["AP", "AP50", "AP75", "APs", "APm", "APl"], + "segm": ["AP", "AP50", "AP75", "APs", "APm", "APl"], + "box_proposal": [ + "AR@100", + "ARs@100", + "ARm@100", + "ARl@100", + "AR@1000", + "ARs@1000", + "ARm@1000", + "ARl@1000", + ], + "keypoint": ["AP", "AP50", "AP75", "APm", "APl"], + } + + def __init__(self, *iou_types): + allowed_types = ("box_proposal", 
"bbox", "segm") + assert all(iou_type in allowed_types for iou_type in iou_types) + results = OrderedDict() + for iou_type in iou_types: + results[iou_type] = OrderedDict( + [(metric, -1) for metric in COCOResults.METRICS[iou_type]] + ) + self.results = results + + def update(self, coco_eval): + if coco_eval is None: + return + from pycocotools.cocoeval import COCOeval + + assert isinstance(coco_eval, COCOeval) + s = coco_eval.stats + iou_type = coco_eval.params.iouType + res = self.results[iou_type] + metrics = COCOResults.METRICS[iou_type] + for idx, metric in enumerate(metrics): + res[metric] = s[idx] + + def __repr__(self): + # TODO make it pretty + return repr(self.results) + + +def check_expected_results(results, expected_results, sigma_tol): + if not expected_results: + return + + logger = logging.getLogger("maskrcnn_benchmark.inference") + for task, metric, (mean, std) in expected_results: + actual_val = results.results[task][metric] + lo = mean - sigma_tol * std + hi = mean + sigma_tol * std + ok = (lo < actual_val) and (actual_val < hi) + msg = ( + "{} > {} sanity check (actual vs. expected): " + "{:.3f} vs. mean={:.4f}, std={:.4}, range=({:.4f}, {:.4f})" + ).format(task, metric, actual_val, mean, std, lo, hi) + if not ok: + msg = "FAIL: " + msg + logger.error(msg) + else: + msg = "PASS: " + msg + logger.info(msg) + + +def paraToQuad_v3(kes, mt): + ms = (kes[0, 0], kes[6, 0]) + xs = [kes[i, 0] for i in range(1, 5)] # 1 2 3 4 + ys = [kes[i, 0] for i in range(7, 11)] # 7 8 9 10 + crs = (kes[5, 0], kes[11, 0]) + ms = Point(ms) + crs = Point(crs) + vp = [] + all_types = [[1, 2, 3, 4], [1, 2, 4, 3], [1, 3, 2, 4], [1, 3, 4, 2], [1, 4, 2, 3], [1, 4, 3, 2], \ + [2, 1, 3, 4], [2, 1, 4, 3], [2, 3, 1, 4], [2, 3, 4, 1], [2, 4, 1, 3], [2, 4, 3, 1], \ + [3, 1, 2, 4], [3, 1, 4, 2], [3, 2, 1, 4], [3, 2, 4, 1], [3, 4, 1, 2], [3, 4, 2, 1], \ + [4, 1, 2, 3], [4, 1, 3, 2], [4, 2, 1, 3], [4, 2, 3, 1], [4, 3, 1, 2], [4, 3, 2, 1]] + all_types = [[all_types[iat][0] - 1, all_types[iat][1] - 1, all_types[iat][2] - 1, all_types[iat][3] - 1] for iat in + range(24)] + + tpe = all_types[mt] + p1 = Point((xs[0], ys[tpe[0]])) + p2 = Point((xs[1], ys[tpe[1]])) + p3 = Point((xs[2], ys[tpe[2]])) + p4 = Point((xs[3], ys[tpe[3]])) + pts = [p1, p2, p3, p4] + scs = [0, 1, 2, 3] + for it in itertools.permutations(scs, 4): + poly = Polygon([(pts[it[0]].x, pts[it[0]].y), (pts[it[1]].x, pts[it[1]].y),(pts[it[2]].x, pts[it[2]].y), (pts[it[3]].x, pts[it[3]].y)]) + if poly.is_valid and ms.within(poly) and crs.within(poly): + quad = [(pts[it[0]].x, pts[it[0]].y), (pts[it[1]].x, pts[it[1]].y),(pts[it[2]].x, pts[it[2]].y), (pts[it[3]].x, pts[it[3]].y)] + lr = LinearRing(quad) + if lr.is_ccw: + return [(int(iq[0]), int(iq[1])) for iq in quad] + else: + quad = [quad[0], quad[3], quad[2], quad[1]] + return [(int(iq[0]), int(iq[1])) for iq in quad] + + return [(int(iq[0]), int(iq[1])) for iq in quad] + + return None + diff --git a/maskrcnn_benchmark/data/datasets/list_dataset.py b/maskrcnn_benchmark/data/datasets/list_dataset.py new file mode 100644 index 0000000000000000000000000000000000000000..9058d35b3d4279048732074f4a8dbb6edd4c9ed0 --- /dev/null +++ b/maskrcnn_benchmark/data/datasets/list_dataset.py @@ -0,0 +1,36 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 
+""" +Simple dataset class that wraps a list of path names +""" + +from PIL import Image + +from maskrcnn_benchmark.structures.bounding_box import BoxList + + +class ListDataset(object): + def __init__(self, image_lists, transforms=None): + self.image_lists = image_lists + self.transforms = transforms + + def __getitem__(self, item): + img = Image.open(self.image_lists[item]).convert("RGB") + + # dummy target + w, h = img.size + target = BoxList([[0, 0, w, h]], img.size, mode="xyxy") + + if self.transforms is not None: + img, target = self.transforms(img, target) + + return img, target + + def __len__(self): + return len(self.image_lists) + + def get_img_info(self, item): + """ + Return the image dimensions for the image, without + loading and pre-processing it + """ + pass diff --git a/maskrcnn_benchmark/data/datasets/voc.py b/maskrcnn_benchmark/data/datasets/voc.py new file mode 100644 index 0000000000000000000000000000000000000000..459985bd12a47ffe5a246cbf8e00b7930b991a1c --- /dev/null +++ b/maskrcnn_benchmark/data/datasets/voc.py @@ -0,0 +1,134 @@ +import os + +import torch +import torch.utils.data +from PIL import Image +import sys + +if sys.version_info[0] == 2: + import xml.etree.cElementTree as ET +else: + import xml.etree.ElementTree as ET + + +from maskrcnn_benchmark.structures.bounding_box import BoxList + + +class PascalVOCDataset(torch.utils.data.Dataset): + + CLASSES = ( + "__background__ ", + "aeroplane", + "bicycle", + "bird", + "boat", + "bottle", + "bus", + "car", + "cat", + "chair", + "cow", + "diningtable", + "dog", + "horse", + "motorbike", + "person", + "pottedplant", + "sheep", + "sofa", + "train", + "tvmonitor", + ) + + def __init__(self, data_dir, split, use_difficult=False, transforms=None): + self.root = data_dir + self.image_set = split + self.keep_difficult = use_difficult + self.transforms = transforms + + self._annopath = os.path.join(self.root, "Annotations", "%s.xml") + self._imgpath = os.path.join(self.root, "JPEGImages", "%s.jpg") + self._imgsetpath = os.path.join(self.root, "ImageSets", "Main", "%s.txt") + + with open(self._imgsetpath % self.image_set) as f: + self.ids = f.readlines() + self.ids = [x.strip("\n") for x in self.ids] + self.id_to_img_map = {k: v for k, v in enumerate(self.ids)} + + cls = PascalVOCDataset.CLASSES + self.class_to_ind = dict(zip(cls, range(len(cls)))) + + def __getitem__(self, index): + img_id = self.ids[index] + img = Image.open(self._imgpath % img_id).convert("RGB") + + target = self.get_groundtruth(index) + target = target.clip_to_image(remove_empty=True) + + if self.transforms is not None: + img, target = self.transforms(img, target) + + return img, target, index + + def __len__(self): + return len(self.ids) + + def get_groundtruth(self, index): + img_id = self.ids[index] + anno = ET.parse(self._annopath % img_id).getroot() + anno = self._preprocess_annotation(anno) + + height, width = anno["im_info"] + target = BoxList(anno["boxes"], (width, height), mode="xyxy") + target.add_field("labels", anno["labels"]) + target.add_field("difficult", anno["difficult"]) + return target + + def _preprocess_annotation(self, target): + boxes = [] + gt_classes = [] + difficult_boxes = [] + TO_REMOVE = 1 + + for obj in target.iter("object"): + difficult = int(obj.find("difficult").text) == 1 + if not self.keep_difficult and difficult: + continue + name = obj.find("name").text.lower().strip() + bb = obj.find("bndbox") + # Make pixel indexes 0-based + # Refer to 
"https://github.com/rbgirshick/py-faster-rcnn/blob/master/lib/datasets/pascal_voc.py#L208-L211" + box = [ + bb.find("xmin").text, + bb.find("ymin").text, + bb.find("xmax").text, + bb.find("ymax").text, + ] + bndbox = tuple( + map(lambda x: x - TO_REMOVE, list(map(int, box))) + ) + + boxes.append(bndbox) + gt_classes.append(self.class_to_ind[name]) + difficult_boxes.append(difficult) + + size = target.find("size") + im_info = tuple(map(int, (size.find("height").text, size.find("width").text))) + + res = { + "boxes": torch.tensor(boxes, dtype=torch.float32), + "labels": torch.tensor(gt_classes), + "difficult": torch.tensor(difficult_boxes), + "im_info": im_info, + } + return res + + def get_img_info(self, index): + img_id = self.ids[index] + anno = ET.parse(self._annopath % img_id).getroot() + size = anno.find("size") + im_info = tuple(map(int, (size.find("height").text, size.find("width").text))) + return {"height": im_info[0], "width": im_info[1]} + + def map_class_id_to_class_name(self, class_id): + return PascalVOCDataset.CLASSES[class_id] diff --git a/maskrcnn_benchmark/data/datasets/word_dataset.py b/maskrcnn_benchmark/data/datasets/word_dataset.py new file mode 100644 index 0000000000000000000000000000000000000000..bc1653472826506811e09aad785f58443ea584af --- /dev/null +++ b/maskrcnn_benchmark/data/datasets/word_dataset.py @@ -0,0 +1,107 @@ +import torch +import torchvision + +from maskrcnn_benchmark.structures.bounding_box import BoxList +from maskrcnn_benchmark.structures.segmentation_mask import SegmentationMask + +from maskrcnn_benchmark.structures.ke import textKES +from maskrcnn_benchmark.structures.mty import MTY + +DEBUG = 0 + +class WordDataset(torchvision.datasets.coco.CocoDetection): + def __init__( + self, ann_file, root, remove_images_without_annotations, transforms=None + ): + super(WordDataset, self).__init__(root, ann_file) + # sort indices for reproducible results + self.ids = sorted(self.ids) + + # filter images without detection annotations + if remove_images_without_annotations: + self.ids = [ + img_id + for img_id in self.ids + if len(self.coco.getAnnIds(imgIds=img_id, iscrowd=None)) > 0 + ] + + self.json_category_id_to_contiguous_id = { + v: i + 1 for i, v in enumerate(self.coco.getCatIds()) + } + self.contiguous_category_id_to_json_id = { + v: k for k, v in self.json_category_id_to_contiguous_id.items() + } + self.id_to_img_map = {k: v for k, v in enumerate(self.ids)} + self.transforms = transforms + + def kes_encode(self, kes): + kes_encode = [] + for i in kes: + mnx = i[0] + mny = i[1] + assert(len(i)%3 == 0) + npts = int(len(i)/3-2) + for index in range(npts): + i[3+index*3] = (i[3+index*3]+mnx)/2 + i[4+index*3] = (i[4+index*3]+mny)/2 + kes_encode.append(i) + return kes_encode + + def kes_gen(self, kes): + kes_gen_out = [] + for i in kes: + mnx = i[0] + mny = i[1] + cx= i[27] + cy= i[28] + assert(len(i)%3 == 0) + ot = [mnx, i[3],i[6],i[9],i[12], cx,\ + mny, i[16],i[19],i[22],i[25], cy] + kes_gen_out.append(ot) + return kes_gen_out + + def __getitem__(self, idx): + img, anno = super(WordDataset, self).__getitem__(idx) + # filter crowd annotations + # TODO might be better to add an extra field + anno = [obj for obj in anno if obj["iscrowd"] == 0] + + boxes = [obj["bbox"] for obj in anno] + if DEBUG: print('len(boxes)', len(boxes), boxes[0]) + boxes = torch.as_tensor(boxes).reshape(-1, 4) # guard against no boxes + target = BoxList(boxes, img.size, mode="xywh").convert("xyxy") + + classes = [obj["category_id"] for obj in anno] + if DEBUG: print('len(classes)', 
len(classes), classes[0]) + classes = [self.json_category_id_to_contiguous_id[c] for c in classes] + classes = torch.tensor(classes) + target.add_field("labels", classes) + + masks = [obj["segmentation"] for obj in anno] + if DEBUG: print('len(masks)', len(masks), masks[0]) + masks = SegmentationMask(masks, img.size) + target.add_field("masks", masks) + + if anno and 'keypoints' in anno[0]: + kes = [obj["keypoints"] for obj in anno] + kes = self.kes_gen(kes) + if DEBUG: print('len(kes)', len(kes), kes[0]) + kes = textKES(kes, img.size) + target.add_field("kes", kes) + + if anno and 'match_type' in anno[0]: + mty = [obj["match_type"] for obj in anno] + mty = MTY(mty, img.size) + target.add_field("mty", mty) + + target = target.clip_to_image(remove_empty=True) + + if self.transforms is not None: + img, target = self.transforms(img, target) + + return img, target, idx + + def get_img_info(self, index): + img_id = self.id_to_img_map[index] + img_data = self.coco.imgs[img_id] + return img_data diff --git a/maskrcnn_benchmark/data/samplers/__init__.py b/maskrcnn_benchmark/data/samplers/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..27982cbe68c6173a911e700273f25973acbf04bd --- /dev/null +++ b/maskrcnn_benchmark/data/samplers/__init__.py @@ -0,0 +1,6 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +from .distributed import DistributedSampler +from .grouped_batch_sampler import GroupedBatchSampler +from .iteration_based_batch_sampler import IterationBasedBatchSampler + +__all__ = ["DistributedSampler", "GroupedBatchSampler", "IterationBasedBatchSampler"] diff --git a/maskrcnn_benchmark/data/samplers/__pycache__/__init__.cpython-37.pyc b/maskrcnn_benchmark/data/samplers/__pycache__/__init__.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..7467061ce7570f7d5ec8e73311376709ac572853 Binary files /dev/null and b/maskrcnn_benchmark/data/samplers/__pycache__/__init__.cpython-37.pyc differ diff --git a/maskrcnn_benchmark/data/samplers/__pycache__/distributed.cpython-37.pyc b/maskrcnn_benchmark/data/samplers/__pycache__/distributed.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..76a9e61b1ed44a3ee95e336ce85ef62429b884a2 Binary files /dev/null and b/maskrcnn_benchmark/data/samplers/__pycache__/distributed.cpython-37.pyc differ diff --git a/maskrcnn_benchmark/data/samplers/__pycache__/grouped_batch_sampler.cpython-37.pyc b/maskrcnn_benchmark/data/samplers/__pycache__/grouped_batch_sampler.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d87ffbb208e8739735c8648a23454c3cfc9033a4 Binary files /dev/null and b/maskrcnn_benchmark/data/samplers/__pycache__/grouped_batch_sampler.cpython-37.pyc differ diff --git a/maskrcnn_benchmark/data/samplers/__pycache__/iteration_based_batch_sampler.cpython-37.pyc b/maskrcnn_benchmark/data/samplers/__pycache__/iteration_based_batch_sampler.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..1a679ada15a0288db36dcb2181e1dfbc537f4f37 Binary files /dev/null and b/maskrcnn_benchmark/data/samplers/__pycache__/iteration_based_batch_sampler.cpython-37.pyc differ diff --git a/maskrcnn_benchmark/data/samplers/distributed.py b/maskrcnn_benchmark/data/samplers/distributed.py new file mode 100644 index 0000000000000000000000000000000000000000..27a280f9ac767e299f996c8c0e1ba4c37a4f2759 --- /dev/null +++ b/maskrcnn_benchmark/data/samplers/distributed.py @@ -0,0 +1,66 @@ +# Copyright (c) Facebook, Inc. 
and its affiliates. All Rights Reserved. +# Code is copy-pasted exactly as in torch.utils.data.distributed. +# FIXME remove this once c10d fixes the bug it has +import math +import torch +import torch.distributed as dist +from torch.utils.data.sampler import Sampler + + +class DistributedSampler(Sampler): + """Sampler that restricts data loading to a subset of the dataset. + It is especially useful in conjunction with + :class:`torch.nn.parallel.DistributedDataParallel`. In such case, each + process can pass a DistributedSampler instance as a DataLoader sampler, + and load a subset of the original dataset that is exclusive to it. + .. note:: + Dataset is assumed to be of constant size. + Arguments: + dataset: Dataset used for sampling. + num_replicas (optional): Number of processes participating in + distributed training. + rank (optional): Rank of the current process within num_replicas. + """ + + def __init__(self, dataset, num_replicas=None, rank=None, shuffle=True): + if num_replicas is None: + if not dist.is_available(): + raise RuntimeError("Requires distributed package to be available") + num_replicas = dist.get_world_size() + if rank is None: + if not dist.is_available(): + raise RuntimeError("Requires distributed package to be available") + rank = dist.get_rank() + self.dataset = dataset + self.num_replicas = num_replicas + self.rank = rank + self.epoch = 0 + self.num_samples = int(math.ceil(len(self.dataset) * 1.0 / self.num_replicas)) + self.total_size = self.num_samples * self.num_replicas + self.shuffle = shuffle + + def __iter__(self): + if self.shuffle: + # deterministically shuffle based on epoch + g = torch.Generator() + g.manual_seed(self.epoch) + indices = torch.randperm(len(self.dataset), generator=g).tolist() + else: + indices = torch.arange(len(self.dataset)).tolist() + + # add extra samples to make it evenly divisible + indices += indices[: (self.total_size - len(indices))] + assert len(indices) == self.total_size + + # subsample + offset = self.num_samples * self.rank + indices = indices[offset : offset + self.num_samples] + assert len(indices) == self.num_samples + + return iter(indices) + + def __len__(self): + return self.num_samples + + def set_epoch(self, epoch): + self.epoch = epoch diff --git a/maskrcnn_benchmark/data/samplers/grouped_batch_sampler.py b/maskrcnn_benchmark/data/samplers/grouped_batch_sampler.py new file mode 100644 index 0000000000000000000000000000000000000000..0960cd1f49ec7fb11bc586235653380f4b0fd02f --- /dev/null +++ b/maskrcnn_benchmark/data/samplers/grouped_batch_sampler.py @@ -0,0 +1,115 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +import itertools + +import torch +from torch.utils.data.sampler import BatchSampler +from torch.utils.data.sampler import Sampler + + +class GroupedBatchSampler(BatchSampler): + """ + Wraps another sampler to yield a mini-batch of indices. + It enforces that elements from the same group should appear in groups of batch_size. + It also tries to provide mini-batches which follows an ordering which is + as close as possible to the ordering from the original sampler. + + Arguments: + sampler (Sampler): Base sampler. + batch_size (int): Size of mini-batch. 
+ drop_uneven (bool): If ``True``, the sampler will drop the batches whose + size is less than ``batch_size`` + + """ + + def __init__(self, sampler, group_ids, batch_size, drop_uneven=False): + if not isinstance(sampler, Sampler): + raise ValueError( + "sampler should be an instance of " + "torch.utils.data.Sampler, but got sampler={}".format(sampler) + ) + self.sampler = sampler + self.group_ids = torch.as_tensor(group_ids) + assert self.group_ids.dim() == 1 + self.batch_size = batch_size + self.drop_uneven = drop_uneven + + self.groups = torch.unique(self.group_ids).sort(0)[0] + + self._can_reuse_batches = False + + def _prepare_batches(self): + dataset_size = len(self.group_ids) + # get the sampled indices from the sampler + sampled_ids = torch.as_tensor(list(self.sampler)) + # potentially not all elements of the dataset were sampled + # by the sampler (e.g., DistributedSampler). + # construct a tensor which contains -1 if the element was + # not sampled, and a non-negative number indicating the + # order where the element was sampled. + # for example. if sampled_ids = [3, 1] and dataset_size = 5, + # the order is [-1, 1, -1, 0, -1] + order = torch.full((dataset_size,), -1, dtype=torch.int64) + order[sampled_ids] = torch.arange(len(sampled_ids)) + + # get a mask with the elements that were sampled + mask = order >= 0 + + # find the elements that belong to each individual cluster + clusters = [(self.group_ids == i) & mask for i in self.groups] + # get relative order of the elements inside each cluster + # that follows the order from the sampler + relative_order = [order[cluster] for cluster in clusters] + # with the relative order, find the absolute order in the + # sampled space + permutation_ids = [s[s.sort()[1]] for s in relative_order] + # permute each cluster so that they follow the order from + # the sampler + permuted_clusters = [sampled_ids[idx] for idx in permutation_ids] + + # splits each cluster in batch_size, and merge as a list of tensors + splits = [c.split(self.batch_size) for c in permuted_clusters] + merged = tuple(itertools.chain.from_iterable(splits)) + + # now each batch internally has the right order, but + # they are grouped by clusters. Find the permutation between + # different batches that brings them as close as possible to + # the order that we have in the sampler. 
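+        # (illustrative example: with batch_size = 2, sampled_ids = [3, 1, 0, 4, 2]
+        # and clusters {0, 1, 2} / {3, 4}, merged is ([1, 0], [2], [3, 4]) and the
+        # reordering below yields [[3, 4], [1, 0], [2]], because 3 was sampled first.)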
For that, we will consider the + # ordering as coming from the first element of each batch, and sort + # correspondingly + first_element_of_batch = [t[0].item() for t in merged if t.numel() > 0] + # get and inverse mapping from sampled indices and the position where + # they occur (as returned by the sampler) + inv_sampled_ids_map = {v: k for k, v in enumerate(sampled_ids.tolist())} + # from the first element in each batch, get a relative ordering + first_index_of_batch = torch.as_tensor( + [inv_sampled_ids_map[s] for s in first_element_of_batch] + ) + + # permute the batches so that they approximately follow the order + # from the sampler + permutation_order = first_index_of_batch.sort(0)[1].tolist() + # finally, permute the batches + batches = [merged[i].tolist() for i in permutation_order] + + if self.drop_uneven: + kept = [] + for batch in batches: + if len(batch) == self.batch_size: + kept.append(batch) + batches = kept + return batches + + def __iter__(self): + if self._can_reuse_batches: + batches = self._batches + self._can_reuse_batches = False + else: + batches = self._prepare_batches() + self._batches = batches + return iter(batches) + + def __len__(self): + if not hasattr(self, "_batches"): + self._batches = self._prepare_batches() + self._can_reuse_batches = True + return len(self._batches) diff --git a/maskrcnn_benchmark/data/samplers/iteration_based_batch_sampler.py b/maskrcnn_benchmark/data/samplers/iteration_based_batch_sampler.py new file mode 100644 index 0000000000000000000000000000000000000000..93452b64696dc9b2cd2a347b8051729864bf9510 --- /dev/null +++ b/maskrcnn_benchmark/data/samplers/iteration_based_batch_sampler.py @@ -0,0 +1,31 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +from torch.utils.data.sampler import BatchSampler + + +class IterationBasedBatchSampler(BatchSampler): + """ + Wraps a BatchSampler, resampling from it until + a specified number of iterations have been sampled + """ + + def __init__(self, batch_sampler, num_iterations, start_iter=0): + self.batch_sampler = batch_sampler + self.num_iterations = num_iterations + self.start_iter = start_iter + + def __iter__(self): + iteration = self.start_iter + while iteration <= self.num_iterations: + # if the underlying sampler has a set_epoch method, like + # DistributedSampler, used for making each process see + # a different split of the dataset, then set it + if hasattr(self.batch_sampler.sampler, "set_epoch"): + self.batch_sampler.sampler.set_epoch(iteration) + for batch in self.batch_sampler: + iteration += 1 + if iteration > self.num_iterations: + break + yield batch + + def __len__(self): + return self.num_iterations diff --git a/maskrcnn_benchmark/data/transforms/__init__.py b/maskrcnn_benchmark/data/transforms/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..892b9cec0c2bc59162196ef9243e9aedcdcbaee6 --- /dev/null +++ b/maskrcnn_benchmark/data/transforms/__init__.py @@ -0,0 +1,9 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 
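+# Usage sketch (illustrative only; assumes a merged yacs cfg from
+# maskrcnn_benchmark.config and a hypothetical config path):
+#
+#   from maskrcnn_benchmark.config import cfg
+#   from maskrcnn_benchmark.data.transforms import build_transforms
+#
+#   cfg.merge_from_file("path/to/config.yaml")
+#   train_transforms = build_transforms(cfg, is_train=True)
+#   image, target = train_transforms(image, target)  # PIL image and BoxList-style target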
+from .transforms import Compose +from .transforms import Resize +from .transforms import RandomHorizontalFlip +from .transforms import ToTensor +from .transforms import Normalize +from .transforms import RandomCrop + +from .build import build_transforms diff --git a/maskrcnn_benchmark/data/transforms/__pycache__/__init__.cpython-37.pyc b/maskrcnn_benchmark/data/transforms/__pycache__/__init__.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..7de1b84ba522fa114d24097cc0476ab15a7f9608 Binary files /dev/null and b/maskrcnn_benchmark/data/transforms/__pycache__/__init__.cpython-37.pyc differ diff --git a/maskrcnn_benchmark/data/transforms/__pycache__/build.cpython-37.pyc b/maskrcnn_benchmark/data/transforms/__pycache__/build.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b6ac82a547e423c0cd236419d376f87559fe2f63 Binary files /dev/null and b/maskrcnn_benchmark/data/transforms/__pycache__/build.cpython-37.pyc differ diff --git a/maskrcnn_benchmark/data/transforms/__pycache__/transforms.cpython-37.pyc b/maskrcnn_benchmark/data/transforms/__pycache__/transforms.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..fc854cb82ddc2351287695ad3028b547b10e9d98 Binary files /dev/null and b/maskrcnn_benchmark/data/transforms/__pycache__/transforms.cpython-37.pyc differ diff --git a/maskrcnn_benchmark/data/transforms/build.py b/maskrcnn_benchmark/data/transforms/build.py new file mode 100644 index 0000000000000000000000000000000000000000..09b09a92985d23cfef9e78284584e10b66d5fde0 --- /dev/null +++ b/maskrcnn_benchmark/data/transforms/build.py @@ -0,0 +1,54 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +from . import transforms as T + + +def build_transforms(cfg, is_train=True): + if is_train: + if cfg.INPUT.MIN_SIZE_RANGE_TRAIN[0] == -1: + min_size = cfg.INPUT.MIN_SIZE_TRAIN + else: + assert len(cfg.INPUT.MIN_SIZE_RANGE_TRAIN) == 2, \ + "MIN_SIZE_RANGE_TRAIN must have two elements (lower bound, upper bound)" + min_size = range( + cfg.INPUT.MIN_SIZE_RANGE_TRAIN[0], + cfg.INPUT.MIN_SIZE_RANGE_TRAIN[1] + 1 + ) + max_size = cfg.INPUT.MAX_SIZE_TRAIN + # max_size = None + + flip_prob = 0.5 # cfg.INPUT.FLIP_PROB_TRAIN + rotate_prob = cfg.INPUT.ROTATE_PROB_TRAIN + rotate_degree = cfg.INPUT.ROTATE_DEGREE + crop_prob = cfg.INPUT.CROP_PROB_TRAIN + else: + min_size = cfg.INPUT.MIN_SIZE_TEST + max_size = cfg.INPUT.MAX_SIZE_TEST + # max_size = None + + + flip_prob = 0 + rotate_prob = 0 + rotate_degree = 0 + crop_prob = 0 + + to_bgr255 = cfg.INPUT.TO_BGR255 + normalize_transform = T.Normalize( + mean=cfg.INPUT.PIXEL_MEAN, std=cfg.INPUT.PIXEL_STD, to_bgr255=to_bgr255 + ) + + transform = T.Compose( + [ + T.RandomCrop(crop_prob), + T.RandomBrightness(crop_prob), + T.RandomContrast(crop_prob), + T.RandomHue(crop_prob), + T.RandomSaturation(crop_prob), + T.RandomGamma(crop_prob), + T.Resize(min_size, max_size), + T.RandomHorizontalFlip(flip_prob), + T.RandomRotation(rotate_prob, rotate_degree), + T.ToTensor(), + normalize_transform, + ] + ) + return transform diff --git a/maskrcnn_benchmark/data/transforms/transforms.py b/maskrcnn_benchmark/data/transforms/transforms.py new file mode 100644 index 0000000000000000000000000000000000000000..c5549a3de99e16cfb3562cece52482a111b7f506 --- /dev/null +++ b/maskrcnn_benchmark/data/transforms/transforms.py @@ -0,0 +1,468 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 
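+# All transforms in this module follow a two-argument convention: they are called as
+# transform(image, target) and return the updated pair, so geometric ops keep the
+# BoxList / SegmentationMask annotations in sync with the image. Minimal sketch
+# (illustrative sizes only):
+#
+#   resize = Resize(min_size=(800,), max_size=1333)
+#   image, target = resize(image, target)  # target is resized to match the new image size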
+import random + +import torch +import torchvision +from torchvision.transforms import functional as F + +from maskrcnn_benchmark.structures.bounding_box import BoxList +from maskrcnn_benchmark.structures.segmentation_mask import SegmentationMask + +from maskrcnn_benchmark.structures.ke import textKES +from maskrcnn_benchmark.structures.mty import MTY +import numpy as np +from PIL import Image +from shapely.geometry import * +import cv2 +from maskrcnn_benchmark.config import cfg + + +class Compose(object): + def __init__(self, transforms): + self.transforms = transforms + + def __call__(self, image, target): + for t in self.transforms: + image, target = t(image, target) + return image, target + + def __repr__(self): + format_string = self.__class__.__name__ + "(" + for t in self.transforms: + format_string += "\n" + format_string += " {0}".format(t) + format_string += "\n)" + return format_string + + +class Resize(object): + def __init__(self, min_size, max_size): + if not isinstance(min_size, (list, tuple)): + min_size = (min_size,) + self.min_size = min_size + self.max_size = max_size + + # modified from torchvision to add support for max size + def get_size(self, image_size): + + # if test ic15 + + #oh = 1200 + #ow = 2000 + #return (oh, ow) + + w, h = image_size + size = random.choice(self.min_size) + max_size = self.max_size + if max_size is not None: + min_original_size = float(min((w, h))) + max_original_size = float(max((w, h))) + if max_original_size / min_original_size * size > max_size: + size = int(round(max_size * min_original_size / max_original_size)) + + if (w <= h and w == size) or (h <= w and h == size): + return (h, w) + + if w < h: + ow = size + oh = int(size * h / w) + else: + oh = size + ow = int(size * w / h) + + return (oh, ow) + + def __call__(self, image, target): + size = self.get_size(image.size) + image = F.resize(image, size) + if isinstance(target, list): + target = [t.resize(image.size) for t in target] + else: + target = target.resize(image.size) + return image, target + + +class RandomHorizontalFlip(object): + def __init__(self, prob=0.5): + self.prob = prob + + def __call__(self, image, target): + if random.random() < self.prob: + image = F.hflip(image) + if isinstance(target, list): + target = [t.transpose(0) for t in target] + else: + target = target.transpose(0) + return image, target + + +class ToTensor(object): + def __call__(self, image, target): + return F.to_tensor(image), target + + +class Normalize(object): + def __init__(self, mean, std, to_bgr255=True): + self.mean = mean + self.std = std + self.to_bgr255 = to_bgr255 + + def __call__(self, image, target): + if self.to_bgr255: + image = image[[2, 1, 0]] * 255 + image = F.normalize(image, mean=self.mean, std=self.std) + return image, target + + +class RandomCrop(object): + """Random crop with repeatedly expanding the range to included box borders.""" + def __init__(self, prob, init_crop_size=(0.5, 1.0)): + + if (not isinstance(init_crop_size, list)) and (not isinstance(init_crop_size, tuple)): + raise ValueError('Paremeter init_crop_size should be a list or tuple!') + elif len(init_crop_size) != 2: + raise ValueError('Length of init_crop_size should be 2!') + elif not (init_crop_size[0] <= 1 and init_crop_size[0] >= 0 and init_crop_size[1] <= 1 and init_crop_size[1] >= 0): + raise ValueError('Elements of init_crop_size should be within [0, 1]!') + self.prob = prob + self.init_crop_size = init_crop_size + + def __call__(self, image, target): + if random.random() >= self.prob: + return image, 
target + + if isinstance(target, list): + target0 = target[0] + else: + target0 = target + while True: + # Initial Crop Region + crop_region = self.initial_crop_region(image) + + # Adjust Crop Region + crop_region, keep_target = self.adjust_crop_region(crop_region, target0) + if crop_region is None and keep_target is None: + continue + + if isinstance(target, list): + # check empty char + new_t1 = target[1].crop(crop_region) + if len(new_t1) < 1: return image, target + + image = image.crop(crop_region.numpy()) + if isinstance(target, list): + target0 = keep_target.crop(crop_region) + others = [t.crop(crop_region, remove_empty=True) for t in target[1:]] + target = [target0] + others + else: + target = keep_target.crop(crop_region) + + return image, target + + def initial_crop_region(self, image): + width, height = image.size + ratio_w, ratio_h = torch.empty(2).uniform_(self.init_crop_size[0], self.init_crop_size[1]) + crop_width, crop_height = int(width*ratio_w), int(height*ratio_h) + crop_xmin = torch.randint(width-crop_width, (1,)) + crop_ymin = torch.randint(height-crop_height, (1,)) + crop_xmax = crop_xmin + crop_width + crop_ymax = crop_ymin + crop_height + crop_region = torch.Tensor([crop_xmin, crop_ymin, crop_xmax, crop_ymax]) + return crop_region + + def intersect_area(self, bbox, bboxes): + inter_xmin = torch.max(bbox[0], bboxes[:, 0]) + inter_ymin = torch.max(bbox[1], bboxes[:, 1]) + inter_xmax = torch.min(bbox[2], bboxes[:, 2]) + inter_ymax = torch.min(bbox[3], bboxes[:, 3]) + inter_width = torch.max(torch.Tensor([0]), inter_xmax-inter_xmin) + inter_height = torch.max(torch.Tensor([0]), inter_ymax-inter_ymin) + return inter_width*inter_height + + def adjust_crop_region(self, crop_region, target): + keep_indies_ = torch.zeros((len(target)), dtype=torch.uint8) + while True: + inter_area = self.intersect_area(crop_region, target.bbox) + keep_indies = (inter_area > 0) + if torch.sum(keep_indies) == 0: + return None, None + keep_target = target[keep_indies] + if keep_indies.equal(keep_indies_): + return crop_region, keep_target + keep_bbox = keep_target.bbox + crop_xmin = torch.min(crop_region[0], torch.min(keep_bbox[:, 0])) + crop_ymin = torch.min(crop_region[1], torch.min(keep_bbox[:, 1])) + crop_xmax = torch.max(crop_region[2], torch.max(keep_bbox[:, 2])) + crop_ymax = torch.max(crop_region[3], torch.max(keep_bbox[:, 3])) + crop_region = torch.Tensor([crop_xmin, crop_ymin, crop_xmax, crop_ymax]) + keep_indies_ = keep_indies + +class RandomBrightness(object): + def __init__(self, prob=0.5): + self.prob = prob + + def __call__(self, image, target): + if random.random() < self.prob: + brightness_factor = random.uniform(0.5, 2) + image = F.adjust_brightness(image, brightness_factor) + return image, target + +class RandomContrast(object): + def __init__(self, prob=0.5): + self.prob = prob + + def __call__(self, image, target): + if random.random() < self.prob: + contrast_factor = random.uniform(0.5, 2) + image = F.adjust_contrast(image, contrast_factor) + return image, target + +class RandomHue(object): + def __init__(self, prob=0.5): + self.prob = prob + + def __call__(self, image, target): + if random.random() < self.prob: + hue_factor = random.uniform(-0.25, 0.25) + image = F.adjust_hue(image, hue_factor) + return image, target + +class RandomSaturation(object): + def __init__(self, prob=0.5): + self.prob = prob + + def __call__(self, image, target): + if random.random() < self.prob: + saturation_factor = random.uniform(0.5, 2) + image = F.adjust_saturation(image, saturation_factor) 
+ return image, target + +class RandomGamma(object): + def __init__(self, prob=0.5): + self.prob = prob + + def __call__(self, image, target): + if random.random() < self.prob: + gamma_factor = random.uniform(0.5, 2) + image = F.adjust_gamma(image, gamma_factor) + return image, target + + +class RandomRotation(object): + def __init__(self, prob = 0.3, degree = 5): + self.prob = prob + self.degree = degree + + def kes_encode(self, kes): + kes_encode = [] + for i in kes: + mnx = i[0] + mny = i[1] + assert(len(i)%3 == 0) + npts = int(len(i)/3-2) + for index in range(npts): + i[3+index*3] = (i[3+index*3]+mnx)/2 + i[4+index*3] = (i[4+index*3]+mny)/2 + kes_encode.append(i) + return kes_encode + + def kes_gen(self, kes): + kes_gen_out = [] + for i in kes: + mnx = i[0] + mny = i[1] + cx= i[27] + cy= i[28] + assert(len(i)%3 == 0) + ot = [mnx, i[3],i[6],i[9],i[12], cx,\ + mny, i[16],i[19],i[22],i[25], cy] + kes_gen_out.append(ot) + return kes_gen_out + + def __call__(self, image, target): + if random.random() < self.prob: + image1 = image + target1 = target + img = np.array(image) + w = image.size[0] + h = image.size[1] + pri_points = [] + for i in range(len(target.extra_fields['masks'].instances)): + assert(len(target.extra_fields['masks'].instances[i].polygons)==1), 'one text instance should have only one polygon.' + tensor_box = target.extra_fields['masks'].instances[i].polygons[0].polygons + + points_x = np.array([tensor_box[0][0],tensor_box[0][2],tensor_box[0][4],tensor_box[0][6]]) + points_y = np.array([tensor_box[0][1],tensor_box[0][3],tensor_box[0][5],tensor_box[0][7]]) + smaller_x = np.where(points_x <= 0) + larger_x = np.where(points_x >= w) + smaller_y = np.where(points_y <= 0) + larger_y = np.where(points_y >= h) + points_x[smaller_x] = 1 + points_x[larger_x] = w - 1 + points_y[smaller_y] = 1 + points_y[larger_y] = h -1 + pri_points.append((int(points_x[0]),int(points_y[0]))) + pri_points.append((int(points_x[1]),int(points_y[1]))) + pri_points.append((int(points_x[2]),int(points_y[2]))) + pri_points.append((int(points_x[3]),int(points_y[3]))) + + #get the transform image and points + height, width = img.shape[:2] + + # if ROTATE_DEGREE = (0,30,60,90,210,150,180,210,240,270,300,330,360) + #de_ro = random.choice(self.degree) + #matrix = cv2.getRotationMatrix2D((width / 2, height / 2) ,de_ro, 1.0) + + # if ROTATE_DEGREE = 10 + matrix = cv2.getRotationMatrix2D((width / 2, height / 2), random.uniform(-self.degree[0],self.degree[0]), 1.0) + + cos = np.abs(matrix[0,0]) + sin = np.abs(matrix[0,1]) + new_W = int((height * sin) + (width * cos)) + new_H = int((height * cos) + (width * sin)) + matrix[0,2] += (new_W/2) - width/2 + matrix[1,2] += ((new_H/2)) - height/2 + img = cv2.warpAffine(img, matrix, (new_W,new_H)) + + change_points = [] + for i in range(int(len(pri_points))): + x_r,y_r = cv2.transform(np.array([[pri_points[i]]]),matrix).squeeze() + change_points.append([x_r,y_r]) + + image = Image.fromarray(img) + + keypoints_len = len(change_points) + tran_boxes = [] + n = keypoints_len/4 + + for i in range(int(n)): + tran_boxes.append(change_points[0 + i*4: 4 + i*4]) + + tran_boxes = np.array(tran_boxes).reshape(-1,2) + tran_x = [] + tran_y = [] + for k in range(len(tran_boxes)): + tran_x.append(int(tran_boxes[k][0])) + tran_y.append(int(tran_boxes[k][1])) + max_x = max(tran_x) + min_x = min(tran_x) + max_y = max(tran_y) + min_y = min(tran_x) + ctr_x = new_W / 2 + ctr_y = new_H / 2 + origin_xmin = ctr_x - width / 2 + origin_xmax = ctr_x + width / 2 + origin_ymin = ctr_y - height / 2 + 
origin_ymax = ctr_y + height / 2 + cut_xmax = origin_xmax + cut_xmin = origin_xmin + cut_ymax = origin_ymax + cut_ymin = origin_ymin + if max_x >= origin_xmax: + cut_xmax = max_x + if min_x <= origin_xmin: + cut_xmin = min_x + if max_y >= origin_ymax: + cut_ymax = max_y + if min_y <= origin_ymin: + cut_ymin = min_y + for i in range(len(tran_boxes)): + tran_x[i] = tran_x[i] - cut_xmin + tran_y[i] = tran_y[i] - cut_ymin + image = image.crop((cut_xmin,cut_ymin,cut_xmax,cut_ymax)) + tran_x = np.array(tran_x) + tran_y = np.array(tran_y) + + boxes = [] + masks = [] + mty = [] + kes = [] + #GET FORMAT OF BOXES,MASKS + for idx in range(int(tran_x.size/4)): + x_points = [tran_x[4 * idx], tran_x[4*idx+1],tran_x[4*idx+2],tran_x[4*idx+3]] + y_points = [tran_y[4 * idx], tran_y[4*idx+1],tran_y[4*idx+2],tran_y[4*idx+3]] + + l1 = LineString([(x_points[0], y_points[0]), (x_points[2], y_points[2])]) + l2 = LineString([(x_points[1], y_points[1]), (x_points[3], y_points[3])]) + p_l1l2 = l1.intersection(l2) + poly1 = Polygon([(x_points[0], y_points[0]), (x_points[1], y_points[1]), + (x_points[2], y_points[2]), (x_points[3], y_points[3])]) + if not poly1.is_valid: + continue + if not p_l1l2.within(poly1): + continue + if poly1.area <= 10: + continue + x_min = min(x_points) + x_max = max(x_points) + y_min = min(y_points) + y_max = max(y_points) + width = max(0, x_max - x_min + 1) + height = max(0, y_max - y_min + 1) + if width == 0 or height == 0: + continue + boxes.append([x_min,y_min,width,height]) + + #get mask format + one_point = [[tran_x[4*idx],tran_y[4*idx],tran_x[4*idx+1],tran_y[4*idx+1],tran_x[4*idx+2],tran_y[4*idx+2],tran_x[4*idx+3],tran_y[4*idx+3]]] + masks.append(one_point) + + #get matchtype format + mean_x = np.mean(x_points) + mean_y = np.mean(y_points) + xt_sort = np.sort(x_points) + yt_sort = np.sort(y_points) + xt_argsort = list(np.argsort(x_points)) + yt_argsort = list(np.argsort(y_points)) + ldx = [] + for ildx in range(4): + ldx.append(yt_argsort.index(xt_argsort[ildx])) + all_types = [[1,2,3,4],[1,2,4,3],[1,3,2,4],[1,3,4,2],[1,4,2,3],[1,4,3,2],\ + [2,1,3,4],[2,1,4,3],[2,3,1,4],[2,3,4,1],[2,4,1,3],[2,4,3,1],\ + [3,1,2,4],[3,1,4,2],[3,2,1,4],[3,2,4,1],[3,4,1,2],[3,4,2,1],\ + [4,1,2,3],[4,1,3,2],[4,2,1,3],[4,2,3,1],[4,3,1,2],[4,3,2,1]] + all_types = [[all_types[iat][0]-1,all_types[iat][1]-1,all_types[iat][2]-1,all_types[iat][3]-1] for iat in range(24)] + match_type = all_types.index(ldx) + mty.append(match_type) + + half_x = (xt_sort + mean_x) / 2 + half_y = (yt_sort + mean_y) / 2 + + keypoints = [] + keypoints.append(mean_x) + keypoints.append(mean_y) + keypoints.append(2) + for i in range(4): + keypoints.append(half_x[i]) + keypoints.append(mean_y) + keypoints.append(2) + for i in range(4): + keypoints.append(mean_x) + keypoints.append(half_y[i]) + keypoints.append(2) + try: + keypoints.append(int(p_l1l2.x)) + keypoints.append(int(p_l1l2.y)) + keypoints.append(2) + except Exception as e: + continue + kes.append(keypoints) + #IF ENCOUNTER THAT NO BOX IN A TRANSFORMED IMAGE, RETURN PRIMARY IMAGE AND TARGET + if kes == []: + image = image1 + target = target1 + return image,target + classes = [] + for i in range(len(boxes)): + classes.append(1) + classes = torch.tensor(classes) + #GET NEW TARGET + boxes = torch.as_tensor(boxes).reshape(-1, 4) + target = BoxList(boxes, image.size, mode="xywh").convert("xyxy") + + target.add_field("labels",classes) + + masks = SegmentationMask(masks, image.size) + target.add_field("masks", masks) + + return image,target diff --git 
a/maskrcnn_benchmark/engine/__init__.py b/maskrcnn_benchmark/engine/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..5c7f19c6c00a4ac3f2f2bc66f892e44bcbd72612 --- /dev/null +++ b/maskrcnn_benchmark/engine/__init__.py @@ -0,0 +1 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. diff --git a/maskrcnn_benchmark/engine/__pycache__/__init__.cpython-37.pyc b/maskrcnn_benchmark/engine/__pycache__/__init__.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e163be9e859a22628af58e88c1d81aef428bbf03 Binary files /dev/null and b/maskrcnn_benchmark/engine/__pycache__/__init__.cpython-37.pyc differ diff --git a/maskrcnn_benchmark/engine/__pycache__/inference.cpython-37.pyc b/maskrcnn_benchmark/engine/__pycache__/inference.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..fd6e6aeda2ccfb22b503e31b714ce6d6c8de9157 Binary files /dev/null and b/maskrcnn_benchmark/engine/__pycache__/inference.cpython-37.pyc differ diff --git a/maskrcnn_benchmark/engine/__pycache__/trainer.cpython-37.pyc b/maskrcnn_benchmark/engine/__pycache__/trainer.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..db00539f6e5b6cde7325020046e6126958d8fb76 Binary files /dev/null and b/maskrcnn_benchmark/engine/__pycache__/trainer.cpython-37.pyc differ diff --git a/maskrcnn_benchmark/engine/inference.py b/maskrcnn_benchmark/engine/inference.py new file mode 100644 index 0000000000000000000000000000000000000000..77e7396d1e68f77301daee9af1c14707237bf5a9 --- /dev/null +++ b/maskrcnn_benchmark/engine/inference.py @@ -0,0 +1,129 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +import logging +import time +import os + +import torch +from tqdm import tqdm + +from maskrcnn_benchmark.data.datasets.evaluation import evaluate +from ..utils.comm import is_main_process, get_world_size +from ..utils.comm import all_gather +from ..utils.comm import synchronize +from ..utils.timer import Timer, get_time_str + + +def compute_on_dataset(model, data_loader, device, timer=None): + model.eval() + results_dict = {} + cpu_device = torch.device("cpu") + for _, batch in enumerate(tqdm(data_loader)): + images, targets, image_ids = batch + images = images.to(device) + with torch.no_grad(): + if timer: + timer.tic() + output = model(images) + if timer: + torch.cuda.synchronize() + timer.toc() + output = [o.to(cpu_device) for o in output] + results_dict.update( + {img_id: result for img_id, result in zip(image_ids, output)} + ) + return results_dict + + +def _accumulate_predictions_from_multiple_gpus(predictions_per_gpu): + all_predictions = all_gather(predictions_per_gpu) + if not is_main_process(): + return + # merge the list of dicts + predictions = {} + for p in all_predictions: + predictions.update(p) + # convert a dict where the key is the index in a list + image_ids = list(sorted(predictions.keys())) + if len(image_ids) != image_ids[-1] + 1: + logger = logging.getLogger("maskrcnn_benchmark.inference") + logger.warning( + "Number of images that were gathered from multiple processes is not " + "a contiguous set. 
Some images might be missing from the evaluation" + ) + + # convert to a list + predictions = [predictions[i] for i in image_ids] + return predictions + + +def inference( + model, + data_loader, + dataset_name, + iou_types=("bbox",), + box_only=False, + device="cuda", + expected_results=(), + expected_results_sigma_tol=4, + output_folder=None, +): + + logger = logging.getLogger("maskrcnn_benchmark.inference") + dataset = data_loader.dataset + logger.info("Start evaluation on {} dataset({} images).".format(dataset_name, len(dataset))) + + extra_args = dict( + box_only=box_only, + iou_types=iou_types, + expected_results=expected_results, + expected_results_sigma_tol=expected_results_sigma_tol, + ) + + # load predictions if exists + prediction_file = os.path.join(output_folder, 'predictions.pth') + if os.path.isfile(prediction_file): + predictions = torch.load(prediction_file) + logger.info("Found prediction results at {}".format(prediction_file)) + + return evaluate(dataset=dataset, + predictions=predictions, + output_folder=output_folder, + **extra_args) + + # convert to a torch.device for efficiency + device = torch.device(device) + num_devices = get_world_size() + total_timer = Timer() + inference_timer = Timer() + total_timer.tic() + predictions = compute_on_dataset(model, data_loader, device, inference_timer) + # wait for all processes to complete before measuring the time + synchronize() + total_time = total_timer.toc() + total_time_str = get_time_str(total_time) + logger.info( + "Total run time: {} ({} s / img per device, on {} devices)".format( + total_time_str, total_time * num_devices / len(dataset), num_devices + ) + ) + total_infer_time = get_time_str(inference_timer.total_time) + logger.info( + "Model inference time: {} ({} s / img per device, on {} devices)".format( + total_infer_time, + inference_timer.total_time * num_devices / len(dataset), + num_devices, + ) + ) + + predictions = _accumulate_predictions_from_multiple_gpus(predictions) + if not is_main_process(): + return + + if output_folder: + torch.save(predictions, os.path.join(output_folder, "predictions.pth")) + + + return evaluate(dataset=dataset, + predictions=predictions, + output_folder=output_folder, + **extra_args) diff --git a/maskrcnn_benchmark/engine/trainer.py b/maskrcnn_benchmark/engine/trainer.py new file mode 100644 index 0000000000000000000000000000000000000000..a13fda4b7597e94a0e0bdea4008574de655d3fee --- /dev/null +++ b/maskrcnn_benchmark/engine/trainer.py @@ -0,0 +1,130 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +import datetime +import logging +import time + +import torch +import torch.distributed as dist + +from maskrcnn_benchmark.utils.comm import get_world_size +from maskrcnn_benchmark.utils.metric_logger import MetricLogger + + +def reduce_loss_dict(loss_dict): + """ + Reduce the loss dictionary from all processes so that process with rank + 0 has the averaged results. Returns a dict with the same fields as + loss_dict, after reduction. 
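+    Note that ``dist.reduce`` targets rank 0 only, so on the other ranks the
+    returned values are not guaranteed to be the averaged losses.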
+ """ + world_size = get_world_size() + if world_size < 2: + return loss_dict + with torch.no_grad(): + loss_names = [] + all_losses = [] + for k in sorted(loss_dict.keys()): + loss_names.append(k) + all_losses.append(loss_dict[k]) + all_losses = torch.stack(all_losses, dim=0) + dist.reduce(all_losses, dst=0) + if dist.get_rank() == 0: + # only main process gets accumulated, so only divide by + # world_size in this case + all_losses /= world_size + reduced_losses = {k: v for k, v in zip(loss_names, all_losses)} + return reduced_losses + + +def do_train( + model, + data_loader, + optimizer, + scheduler, + checkpointer, + device, + checkpoint_period, + arguments, +): + logger = logging.getLogger("maskrcnn_benchmark.trainer") + logger.info("Start training") + meters = MetricLogger(delimiter=" ") + max_iter = len(data_loader) + start_iter = arguments["iteration"] + + model.train() + start_training_time = time.time() + end = time.time() + for iteration, (images, targets, _) in enumerate(data_loader, start_iter): + data_time = time.time() - end + iteration = iteration + 1 + arguments["iteration"] = iteration + + scheduler.step() + + images = images.to(device) + if isinstance(targets[0], list): + targets = [[target[0].to(device) for target in targets], + [target[1].to(device) for target in targets]] + else: + targets = [target.to(device) for target in targets] + + loss_dict = model(images, targets) + + del targets + + losses = sum(loss for loss in loss_dict.values()) + + # reduce losses over all GPUs for logging purposes + loss_dict_reduced = reduce_loss_dict(loss_dict) + losses_reduced = sum(loss for loss in loss_dict_reduced.values()) + meters.update(loss=losses_reduced, **loss_dict_reduced) + + optimizer.zero_grad() + losses.backward() + optimizer.step() + + batch_time = time.time() - end + end = time.time() + meters.update(time=batch_time, data=data_time) + + eta_seconds = meters.time.global_avg * (max_iter - iteration) + eta_string = str(datetime.timedelta(seconds=int(eta_seconds))) + + losses = losses.float() + del losses, loss_dict, loss_dict_reduced, losses_reduced + + if iteration % 20 == 0 or iteration == max_iter: + logger.info( + meters.delimiter.join( + [ + "eta: {eta}", + "iter: {iter}", + "{meters}", + "lr: {lr:.6f}", + "max mem: {memory:.0f}", + ] + ).format( + eta=eta_string, + iter=iteration, + meters=str(meters), + lr=optimizer.param_groups[0]["lr"], + memory=torch.cuda.max_memory_allocated() / 1024.0 / 1024.0, + ) + ) + + del meters + meters = MetricLogger(delimiter=" ") + + if iteration % checkpoint_period == 0: + checkpointer.save("model_{:07d}".format(iteration), **arguments) + + if iteration == max_iter: + checkpointer.save("model_final", **arguments) + + total_training_time = time.time() - start_training_time + total_time_str = str(datetime.timedelta(seconds=total_training_time)) + logger.info( + "Total training time: {} ({:.4f} s / it)".format( + total_time_str, total_training_time / (max_iter) + ) + ) diff --git a/maskrcnn_benchmark/layers/__init__.py b/maskrcnn_benchmark/layers/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..00c2d2b57abf56a8749329f6bf2092ffee021dca --- /dev/null +++ b/maskrcnn_benchmark/layers/__init__.py @@ -0,0 +1,26 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 
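+# The nms / ROIAlign / ROIPool / DCN ops re-exported below bind to the compiled
+# maskrcnn_benchmark._C extension and require it to be built. The pure-Python losses
+# can be used directly; an illustrative sketch for IOULoss, whose inputs are
+# per-location (left, top, right, bottom) distances:
+#
+#   criterion = IOULoss()
+#   loss = criterion(pred_ltrb, target_ltrb)  # both N x 4 tensors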
+import torch + +from .batch_norm import FrozenBatchNorm2d +from .misc import Conv2d +from .misc import ConvTranspose2d +from .misc import BatchNorm2d +from .misc import interpolate +from .nms import nms +from .roi_align import ROIAlign +from .roi_align import roi_align +from .roi_pool import ROIPool +from .roi_pool import roi_pool +from .smooth_l1_loss import smooth_l1_loss +from .sigmoid_focal_loss import SigmoidFocalLoss +from .iou_loss import IOULoss +from .scale import Scale +from .deform_conv_v2 import DCN, DCNPooling +from .iou import iou_regress +from .focal_loss import Focal_Loss + +__all__ = ["nms", "roi_align", "ROIAlign", "roi_pool", "ROIPool", + "smooth_l1_loss", "Conv2d", "ConvTranspose2d", "interpolate", + "BatchNorm2d", "FrozenBatchNorm2d", "SigmoidFocalLoss", "IOULoss", + "Scale", "DCN", "DCNPooling", "iou_regress","Focal_Loss"] + diff --git a/maskrcnn_benchmark/layers/__pycache__/__init__.cpython-37.pyc b/maskrcnn_benchmark/layers/__pycache__/__init__.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..2de2e6df60b22e4d34b270bfa1d5123303159a5f Binary files /dev/null and b/maskrcnn_benchmark/layers/__pycache__/__init__.cpython-37.pyc differ diff --git a/maskrcnn_benchmark/layers/__pycache__/batch_norm.cpython-37.pyc b/maskrcnn_benchmark/layers/__pycache__/batch_norm.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d6ddb28b907c036c3359dac2c62ac9435754d8fc Binary files /dev/null and b/maskrcnn_benchmark/layers/__pycache__/batch_norm.cpython-37.pyc differ diff --git a/maskrcnn_benchmark/layers/__pycache__/deform_conv_v2.cpython-37.pyc b/maskrcnn_benchmark/layers/__pycache__/deform_conv_v2.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..84cd84e702e4930348b1fabacf33bc5b27072a8d Binary files /dev/null and b/maskrcnn_benchmark/layers/__pycache__/deform_conv_v2.cpython-37.pyc differ diff --git a/maskrcnn_benchmark/layers/__pycache__/focal_loss.cpython-37.pyc b/maskrcnn_benchmark/layers/__pycache__/focal_loss.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..1fb8a09da5180a3a452ee3ce7a0b37f137645030 Binary files /dev/null and b/maskrcnn_benchmark/layers/__pycache__/focal_loss.cpython-37.pyc differ diff --git a/maskrcnn_benchmark/layers/__pycache__/iou.cpython-37.pyc b/maskrcnn_benchmark/layers/__pycache__/iou.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..029e65110ae616b3271b9eeeb034470d8dc83550 Binary files /dev/null and b/maskrcnn_benchmark/layers/__pycache__/iou.cpython-37.pyc differ diff --git a/maskrcnn_benchmark/layers/__pycache__/iou_loss.cpython-37.pyc b/maskrcnn_benchmark/layers/__pycache__/iou_loss.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..7bab390fd58b575424f6a94aca79e6232948339a Binary files /dev/null and b/maskrcnn_benchmark/layers/__pycache__/iou_loss.cpython-37.pyc differ diff --git a/maskrcnn_benchmark/layers/__pycache__/misc.cpython-37.pyc b/maskrcnn_benchmark/layers/__pycache__/misc.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..48f7ee3beb974425895be0f0c9dd3e7259561c1b Binary files /dev/null and b/maskrcnn_benchmark/layers/__pycache__/misc.cpython-37.pyc differ diff --git a/maskrcnn_benchmark/layers/__pycache__/nms.cpython-37.pyc b/maskrcnn_benchmark/layers/__pycache__/nms.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..290eb866c51b4acaf55b20db2db0aad52121412c Binary files /dev/null 
and b/maskrcnn_benchmark/layers/__pycache__/nms.cpython-37.pyc differ diff --git a/maskrcnn_benchmark/layers/__pycache__/roi_align.cpython-37.pyc b/maskrcnn_benchmark/layers/__pycache__/roi_align.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..5cdcc07a13bdd2e2dc1486ab7c3b1cb7224ac7aa Binary files /dev/null and b/maskrcnn_benchmark/layers/__pycache__/roi_align.cpython-37.pyc differ diff --git a/maskrcnn_benchmark/layers/__pycache__/roi_pool.cpython-37.pyc b/maskrcnn_benchmark/layers/__pycache__/roi_pool.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..7f41a2990f4c1e590312d810b45a8ddee667f222 Binary files /dev/null and b/maskrcnn_benchmark/layers/__pycache__/roi_pool.cpython-37.pyc differ diff --git a/maskrcnn_benchmark/layers/__pycache__/scale.cpython-37.pyc b/maskrcnn_benchmark/layers/__pycache__/scale.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..fd3459342f2a1fa08b8d454e3375fac678b78524 Binary files /dev/null and b/maskrcnn_benchmark/layers/__pycache__/scale.cpython-37.pyc differ diff --git a/maskrcnn_benchmark/layers/__pycache__/sigmoid_focal_loss.cpython-37.pyc b/maskrcnn_benchmark/layers/__pycache__/sigmoid_focal_loss.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d5305f02112bafb171e1ca04f7b16bb445fbd7e4 Binary files /dev/null and b/maskrcnn_benchmark/layers/__pycache__/sigmoid_focal_loss.cpython-37.pyc differ diff --git a/maskrcnn_benchmark/layers/__pycache__/smooth_l1_loss.cpython-37.pyc b/maskrcnn_benchmark/layers/__pycache__/smooth_l1_loss.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..6318911f3715dd9c18e614f5d7e75c428a564b99 Binary files /dev/null and b/maskrcnn_benchmark/layers/__pycache__/smooth_l1_loss.cpython-37.pyc differ diff --git a/maskrcnn_benchmark/layers/_utils.py b/maskrcnn_benchmark/layers/_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..3dabc127b221d67eae7587ab4905416fa5fcf121 --- /dev/null +++ b/maskrcnn_benchmark/layers/_utils.py @@ -0,0 +1,39 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +import glob +import os.path + +import torch + +try: + from torch.utils.cpp_extension import load as load_ext + from torch.utils.cpp_extension import CUDA_HOME +except ImportError: + raise ImportError("The cpp layer extensions requires PyTorch 0.4 or higher") + + +def _load_C_extensions(): + this_dir = os.path.dirname(os.path.abspath(__file__)) + this_dir = os.path.dirname(this_dir) + this_dir = os.path.join(this_dir, "csrc") + + main_file = glob.glob(os.path.join(this_dir, "*.cpp")) + source_cpu = glob.glob(os.path.join(this_dir, "cpu", "*.cpp")) + source_cuda = glob.glob(os.path.join(this_dir, "cuda", "*.cu")) + + source = main_file + source_cpu + + extra_cflags = [] + if torch.cuda.is_available() and CUDA_HOME is not None: + source.extend(source_cuda) + extra_cflags = ["-DWITH_CUDA"] + source = [os.path.join(this_dir, s) for s in source] + extra_include_paths = [this_dir] + return load_ext( + "torchvision", + source, + extra_cflags=extra_cflags, + extra_include_paths=extra_include_paths, + ) + + +_C = _load_C_extensions() diff --git a/maskrcnn_benchmark/layers/batch_norm.py b/maskrcnn_benchmark/layers/batch_norm.py new file mode 100644 index 0000000000000000000000000000000000000000..903607ac3895947d1aa6d6c4766624af0e97bc71 --- /dev/null +++ b/maskrcnn_benchmark/layers/batch_norm.py @@ -0,0 +1,24 @@ +# Copyright (c) Facebook, Inc. 
and its affiliates. All Rights Reserved. +import torch +from torch import nn + + +class FrozenBatchNorm2d(nn.Module): + """ + BatchNorm2d where the batch statistics and the affine parameters + are fixed + """ + + def __init__(self, n): + super(FrozenBatchNorm2d, self).__init__() + self.register_buffer("weight", torch.ones(n)) + self.register_buffer("bias", torch.zeros(n)) + self.register_buffer("running_mean", torch.zeros(n)) + self.register_buffer("running_var", torch.ones(n)) + + def forward(self, x): + scale = self.weight * self.running_var.rsqrt() + bias = self.bias - self.running_mean * scale + scale = scale.reshape(1, -1, 1, 1) + bias = bias.reshape(1, -1, 1, 1) + return x * scale + bias diff --git a/maskrcnn_benchmark/layers/deform_conv_v2.py b/maskrcnn_benchmark/layers/deform_conv_v2.py new file mode 100644 index 0000000000000000000000000000000000000000..ce309848025dad326f9c891f3e6865095a571624 --- /dev/null +++ b/maskrcnn_benchmark/layers/deform_conv_v2.py @@ -0,0 +1,308 @@ +from __future__ import absolute_import +from __future__ import print_function +from __future__ import division + +import math +import torch +from torch import nn +from torch.autograd import Function +from torch.nn.modules.utils import _pair +from torch.autograd.function import once_differentiable + +import maskrcnn_benchmark._C as _backend + + +class _DCNv2(Function): + @staticmethod + def forward(ctx, input, offset, mask, weight, bias, + stride, padding, dilation, deformable_groups): + ctx.stride = _pair(stride) + ctx.padding = _pair(padding) + ctx.dilation = _pair(dilation) + ctx.kernel_size = _pair(weight.shape[2:4]) + ctx.deformable_groups = deformable_groups + output = _backend.dcn_v2_forward(input, weight, bias, + offset, mask, + ctx.kernel_size[0], ctx.kernel_size[1], + ctx.stride[0], ctx.stride[1], + ctx.padding[0], ctx.padding[1], + ctx.dilation[0], ctx.dilation[1], + ctx.deformable_groups) + ctx.save_for_backward(input, offset, mask, weight, bias) + return output + + @staticmethod + @once_differentiable + def backward(ctx, grad_output): + input, offset, mask, weight, bias = ctx.saved_tensors + grad_input, grad_offset, grad_mask, grad_weight, grad_bias = \ + _backend.dcn_v2_backward(input, weight, + bias, + offset, mask, + grad_output, + ctx.kernel_size[0], ctx.kernel_size[1], + ctx.stride[0], ctx.stride[1], + ctx.padding[0], ctx.padding[1], + ctx.dilation[0], ctx.dilation[1], + ctx.deformable_groups) + + return grad_input, grad_offset, grad_mask, grad_weight, grad_bias,\ + None, None, None, None, + + +dcn_v2_conv = _DCNv2.apply + + +class DCNv2(nn.Module): + + def __init__(self, in_channels, out_channels, + kernel_size, stride, padding, dilation=1, deformable_groups=1): + super(DCNv2, self).__init__() + self.in_channels = in_channels + self.out_channels = out_channels + self.kernel_size = _pair(kernel_size) + self.stride = _pair(stride) + self.padding = _pair(padding) + self.dilation = _pair(dilation) + self.deformable_groups = deformable_groups + + self.weight = nn.Parameter(torch.Tensor( + out_channels, in_channels, *self.kernel_size)) + self.bias = nn.Parameter(torch.Tensor(out_channels)) + self.reset_parameters() + + def reset_parameters(self): + n = self.in_channels + for k in self.kernel_size: + n *= k + stdv = 1. 
/ math.sqrt(n) + self.weight.data.uniform_(-stdv, stdv) + self.bias.data.zero_() + + def forward(self, input, offset, mask): + assert 2 * self.deformable_groups * self.kernel_size[0] * self.kernel_size[1] == \ + offset.shape[1] + assert self.deformable_groups * self.kernel_size[0] * self.kernel_size[1] == \ + mask.shape[1] + return dcn_v2_conv(input, offset, mask, + self.weight, + self.bias, + self.stride, + self.padding, + self.dilation, + self.deformable_groups) + + +class DCN(DCNv2): + + def __init__(self, in_channels, out_channels, + kernel_size, stride, padding=0, + dilation=1, deformable_groups=2, + groups=None, bias=True): + """ + groups and bias are two dummy args which have no effect + """ + super(DCN, self).__init__(in_channels, out_channels, + kernel_size, stride, padding, dilation, deformable_groups) + + channels_ = self.deformable_groups * 3 * self.kernel_size[0] * self.kernel_size[1] + self.conv_offset_mask = nn.Conv2d(self.in_channels, + channels_, + kernel_size=self.kernel_size, + stride=self.stride, + padding=self.padding, + bias=True) + self.init_offset() + + def init_offset(self): + self.conv_offset_mask.weight.data.zero_() + self.conv_offset_mask.bias.data.zero_() + + def forward(self, input): + out = self.conv_offset_mask(input) + o1, o2, mask = torch.chunk(out, 3, dim=1) + offset = torch.cat((o1, o2), dim=1) + mask = torch.sigmoid(mask) + return dcn_v2_conv(input, offset, mask, + self.weight, self.bias, + self.stride, + self.padding, + self.dilation, + self.deformable_groups) + + + +class _DCNv2Pooling(Function): + @staticmethod + def forward(ctx, input, rois, offset, + spatial_scale, + pooled_size, + output_dim, + no_trans, + group_size=1, + part_size=None, + sample_per_part=4, + trans_std=.0): + ctx.spatial_scale = spatial_scale + ctx.no_trans = int(no_trans) + ctx.output_dim = output_dim + ctx.group_size = group_size + ctx.pooled_size = pooled_size + ctx.part_size = pooled_size if part_size is None else part_size + ctx.sample_per_part = sample_per_part + ctx.trans_std = trans_std + + output, output_count = \ + _backend.dcn_v2_psroi_pooling_forward(input, rois, offset, + ctx.no_trans, ctx.spatial_scale, + ctx.output_dim, ctx.group_size, + ctx.pooled_size, ctx.part_size, + ctx.sample_per_part, ctx.trans_std) + ctx.save_for_backward(input, rois, offset, output_count) + return output + + @staticmethod + @once_differentiable + def backward(ctx, grad_output): + input, rois, offset, output_count = ctx.saved_tensors + grad_input, grad_offset = \ + _backend.dcn_v2_psroi_pooling_backward(grad_output, + input, + rois, + offset, + output_count, + ctx.no_trans, + ctx.spatial_scale, + ctx.output_dim, + ctx.group_size, + ctx.pooled_size, + ctx.part_size, + ctx.sample_per_part, + ctx.trans_std) + + return grad_input, None, grad_offset, \ + None, None, None, None, None, None, None, None + + +dcn_v2_pooling = _DCNv2Pooling.apply + + +class DCNv2Pooling(nn.Module): + + def __init__(self, + spatial_scale, + pooled_size, + output_dim, + no_trans, + group_size=1, + part_size=None, + sample_per_part=4, + trans_std=.0): + super(DCNv2Pooling, self).__init__() + self.spatial_scale = spatial_scale + self.pooled_size = pooled_size + self.output_dim = output_dim + self.no_trans = no_trans + self.group_size = group_size + self.part_size = pooled_size if part_size is None else part_size + self.sample_per_part = sample_per_part + self.trans_std = trans_std + + def forward(self, input, rois, offset): + assert input.shape[1] == self.output_dim + if self.no_trans: + offset = input.new() + return 
dcn_v2_pooling(input, rois, offset, + self.spatial_scale, + self.pooled_size, + self.output_dim, + self.no_trans, + self.group_size, + self.part_size, + self.sample_per_part, + self.trans_std) + + +class DCNPooling(DCNv2Pooling): + + def __init__(self, + spatial_scale, + pooled_size, + output_dim, + no_trans, + group_size=1, + part_size=None, + sample_per_part=4, + trans_std=.0, + deform_fc_dim=1024): + # don't support non square pooling + pooled_size = pooled_size[0] + super(DCNPooling, self).__init__(spatial_scale, + pooled_size, + output_dim, + no_trans, + group_size, + part_size, + sample_per_part, + trans_std) + + self.deform_fc_dim = deform_fc_dim + + if not no_trans: + self.offset_mask_fc = nn.Sequential( + nn.Linear(self.pooled_size * self.pooled_size * + self.output_dim, self.deform_fc_dim), + nn.ReLU(inplace=True), + nn.Linear(self.deform_fc_dim, self.deform_fc_dim), + nn.ReLU(inplace=True), + nn.Linear(self.deform_fc_dim, self.pooled_size * + self.pooled_size * 3) + ) + self.offset_mask_fc[4].weight.data.zero_() + self.offset_mask_fc[4].bias.data.zero_() + + def forward(self, input, rois, debug=False): + offset = input.new() + + if not self.no_trans: + + # do roi_align first + n = rois.shape[0] + roi = dcn_v2_pooling(input, rois, offset, + self.spatial_scale, + self.pooled_size, + self.output_dim, + True, # no trans + self.group_size, + self.part_size, + self.sample_per_part, + self.trans_std) + + # build mask and offset + offset_mask = self.offset_mask_fc(roi.view(n, -1)) + offset_mask = offset_mask.view( + n, 3, self.pooled_size, self.pooled_size) + o1, o2, mask = torch.chunk(offset_mask, 3, dim=1) + offset = torch.cat((o1, o2), dim=1) + mask = torch.sigmoid(mask) + + # do pooling with offset and mask + return dcn_v2_pooling(input, rois, offset, + self.spatial_scale, + self.pooled_size, + self.output_dim, + self.no_trans, + self.group_size, + self.part_size, + self.sample_per_part, + self.trans_std) * mask + # only roi_align + return dcn_v2_pooling(input, rois, offset, + self.spatial_scale, + self.pooled_size, + self.output_dim, + self.no_trans, + self.group_size, + self.part_size, + self.sample_per_part, + self.trans_std) diff --git a/maskrcnn_benchmark/layers/focal_loss.py b/maskrcnn_benchmark/layers/focal_loss.py new file mode 100644 index 0000000000000000000000000000000000000000..d29ca1b85ccb2584fb4e09cc13cba558544998ab --- /dev/null +++ b/maskrcnn_benchmark/layers/focal_loss.py @@ -0,0 +1,61 @@ +import torch +from torch import nn +from torch.nn import functional as F + +def Focal_Loss(pred, gt): + # print('yes!!') + + + + ce = nn.CrossEntropyLoss() + alpha = 0.25 + gamma = 2 + # logp = ce(input, target) + p = torch.sigmoid(pred) + + loss = -alpha * (1 - p) ** gamma * (gt * torch.log(p)) - \ + (1 - alpha) * p ** gamma * ((1 - gt) * torch.log(1 - p)) + + return loss.mean() + + + + + + + + + + + + # pred =torch.sigmoid(pred) + # pos_inds = gt.eq(1).float() + # neg_inds = gt.lt(1).float() + # + # loss = 0 + # + # pos_loss = torch.log(pred + 1e-10) * torch.pow(pred, 2) * pos_inds + # # neg_loss = torch.log(1 - pred) * torch.pow(pred, 2) * neg_weights * neg_inds + # neg_loss = torch.log(1 - pred) * torch.pow(1 - pred, 2) * neg_inds + # + # num_pos = pos_inds.float().sum() + # num_neg = neg_inds.float().sum() + # + # pos_loss = pos_loss.sum() + # neg_loss = neg_loss.sum() + # + # if num_pos == 0: + # loss = loss - neg_loss + # else: + # # loss = loss - (pos_loss + neg_loss) / (num_pos) + # loss = loss - (pos_loss + neg_loss ) + # return loss * 5 + + + + + # if weight is not None 
and weight.sum() > 0: + # return (losses * weight).sum() / weight.sum() + # else: + # assert losses.numel() != 0 + # return losses.mean() \ No newline at end of file diff --git a/maskrcnn_benchmark/layers/iou.py b/maskrcnn_benchmark/layers/iou.py new file mode 100644 index 0000000000000000000000000000000000000000..8f47d270958c76c6328fdcec3cea72fbd7967ca5 --- /dev/null +++ b/maskrcnn_benchmark/layers/iou.py @@ -0,0 +1,59 @@ +import torch +import numpy as np + + +def iou_regress(input, target, beta=1. / 9, size_average=True): + """ + very similar to the smooth_l1_loss from pytorch, but with + the extra beta parameter + """ + + + if len(input)==0: + return input.sum() * 0 + + width_i = input[:, 2] - input[:, 0] + height_i = input[:, 3] - input[:, 1] + width_t = target[:, 2] - target[:, 0] + height_t = target[:, 3] - target[:, 1] + + wh_if = torch.zeros_like(width_i) + wh_if[width_i > 0] += 1 + wh_if[height_i > 0] += 1 + + uion_i = width_i * height_i + uion_t = width_t * height_t + + x_1_max = torch.stack([input[:,0],target[:, 0]], 0) + y_1_max = torch.stack([input[:,1],target[:, 1]], 0) + x_2_min = torch.stack([input[:, 2], target[:, 2]], 0) + y_2_min = torch.stack([input[:, 3], target[:, 3]], 0) + + x_1_max = torch.max(x_1_max, 0, keepdim=True) + y_1_max = torch.max(y_1_max, 0, keepdim=True) + x_2_min = torch.min(x_2_min, 0, keepdim=True) + y_2_min = torch.min(y_2_min, 0, keepdim=True) + + width_inter = x_2_min[0] - x_1_max[0] + height_inter = y_2_min[0] - y_1_max[0] + N1, N2 = height_inter.shape + width_inter = width_inter.view([N2]) + + height_inter = height_inter.view([N2]) + + inter_area = width_inter * height_inter + area_union = uion_i + uion_t - inter_area + + wh_if[width_inter > 0] += 1 + wh_if[height_inter > 0] += 1 + wh_if [wh_if != 4] = 0 + wh_if [wh_if > 1] = 1 + + inter_area *= wh_if + area_union *= wh_if + + iou_loss_map = -torch.log((inter_area + 1.0) / (area_union + 1.0)) + iou_loss_map = iou_loss_map * wh_if + + del wh_if + return iou_loss_map.sum() \ No newline at end of file diff --git a/maskrcnn_benchmark/layers/iou_loss.py b/maskrcnn_benchmark/layers/iou_loss.py new file mode 100644 index 0000000000000000000000000000000000000000..af398dd63877ec05b3fbd1ce45dd576e2e7d722a --- /dev/null +++ b/maskrcnn_benchmark/layers/iou_loss.py @@ -0,0 +1,36 @@ +import torch +from torch import nn + + +class IOULoss(nn.Module): + def forward(self, pred, target, weight=None): + pred_left = pred[:, 0] + pred_top = pred[:, 1] + pred_right = pred[:, 2] + pred_bottom = pred[:, 3] + + target_left = target[:, 0] + target_top = target[:, 1] + target_right = target[:, 2] + target_bottom = target[:, 3] + + target_aera = (target_left + target_right) * \ + (target_top + target_bottom) + pred_aera = (pred_left + pred_right) * \ + (pred_top + pred_bottom) + + w_intersect = torch.min(pred_left, target_left) + \ + torch.min(pred_right, target_right) + h_intersect = torch.min(pred_bottom, target_bottom) + \ + torch.min(pred_top, target_top) + + area_intersect = w_intersect * h_intersect + area_union = target_aera + pred_aera - area_intersect + + losses = -torch.log((area_intersect + 1.0) / (area_union + 1.0)) + + if weight is not None and weight.sum() > 0: + return (losses * weight).sum() / weight.sum() + else: + assert losses.numel() != 0 + return losses.mean() diff --git a/maskrcnn_benchmark/layers/misc.py b/maskrcnn_benchmark/layers/misc.py new file mode 100644 index 0000000000000000000000000000000000000000..a8cf1c680c06b57412bfdf7a1c4a9c53f4acdbbd --- /dev/null +++ b/maskrcnn_benchmark/layers/misc.py 
@@ -0,0 +1,110 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +""" +helper class that supports empty tensors on some nn functions. + +Ideally, add support directly in PyTorch to empty tensors in +those functions. + +This can be removed once https://github.com/pytorch/pytorch/issues/12013 +is implemented +""" + +import math +import torch +from torch.nn.modules.utils import _ntuple + + +class _NewEmptyTensorOp(torch.autograd.Function): + @staticmethod + def forward(ctx, x, new_shape): + ctx.shape = x.shape + return x.new_empty(new_shape) + + @staticmethod + def backward(ctx, grad): + shape = ctx.shape + return _NewEmptyTensorOp.apply(grad, shape), None + + +class Conv2d(torch.nn.Conv2d): + def forward(self, x): + if x.numel() > 0: + return super(Conv2d, self).forward(x) + # get output shape + + output_shape = [ + (i + 2 * p - (di * (k - 1) + 1)) // d + 1 + for i, p, di, k, d in zip( + x.shape[-2:], self.padding, self.dilation, self.kernel_size, self.stride + ) + ] + output_shape = [x.shape[0], self.weight.shape[0]] + output_shape + return _NewEmptyTensorOp.apply(x, output_shape) + + +class ConvTranspose2d(torch.nn.ConvTranspose2d): + def forward(self, x): + if x.numel() > 0: + return super(ConvTranspose2d, self).forward(x) + # get output shape + + output_shape = [ + (i - 1) * d - 2 * p + (di * (k - 1) + 1) + op + for i, p, di, k, d, op in zip( + x.shape[-2:], + self.padding, + self.dilation, + self.kernel_size, + self.stride, + self.output_padding, + ) + ] + output_shape = [x.shape[0], self.bias.shape[0]] + output_shape + return _NewEmptyTensorOp.apply(x, output_shape) + + +class BatchNorm2d(torch.nn.BatchNorm2d): + def forward(self, x): + if x.numel() > 0: + return super(BatchNorm2d, self).forward(x) + # get output shape + output_shape = x.shape + return _NewEmptyTensorOp.apply(x, output_shape) + + +def interpolate( + input, size=None, scale_factor=None, mode="nearest", align_corners=None +): + if input.numel() > 0: + return torch.nn.functional.interpolate( + input, size, scale_factor, mode, align_corners + ) + + def _check_size_scale_factor(dim): + if size is None and scale_factor is None: + raise ValueError("either size or scale_factor should be defined") + if size is not None and scale_factor is not None: + raise ValueError("only one of size or scale_factor should be defined") + if ( + scale_factor is not None + and isinstance(scale_factor, tuple) + and len(scale_factor) != dim + ): + raise ValueError( + "scale_factor shape must match input shape. " + "Input is {}D, scale_factor size is {}".format(dim, len(scale_factor)) + ) + + def _output_size(dim): + _check_size_scale_factor(dim) + if size is not None: + return size + scale_factors = _ntuple(dim)(scale_factor) + # math.floor might return float in py2.7 + return [ + int(math.floor(input.size(i + 2) * scale_factors[i])) for i in range(dim) + ] + + output_shape = tuple(_output_size(2)) + output_shape = input.shape[:-2] + output_shape + return _NewEmptyTensorOp.apply(input, output_shape) diff --git a/maskrcnn_benchmark/layers/nms.py b/maskrcnn_benchmark/layers/nms.py new file mode 100644 index 0000000000000000000000000000000000000000..1e80b555045d85e509c917f940ee9bc62738fee7 --- /dev/null +++ b/maskrcnn_benchmark/layers/nms.py @@ -0,0 +1,7 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 
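+# Usage sketch (illustrative; assumes the usual maskrcnn-benchmark binding where
+# `boxes` is an N x 4 tensor of (x1, y1, x2, y2), `scores` is an N tensor, and the
+# call returns the indices of the boxes kept after suppression):
+#
+#   keep = nms(boxes, scores, 0.5)
+#   boxes, scores = boxes[keep], scores[keep]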
+# from ._utils import _C +from maskrcnn_benchmark import _C + +nms = _C.nms +# nms.__doc__ = """ +# This function performs Non-maximum suppresion""" diff --git a/maskrcnn_benchmark/layers/roi_align.py b/maskrcnn_benchmark/layers/roi_align.py new file mode 100644 index 0000000000000000000000000000000000000000..170c8f18696aed19c4b9533a51933264530a1530 --- /dev/null +++ b/maskrcnn_benchmark/layers/roi_align.py @@ -0,0 +1,68 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +import torch +from torch import nn +from torch.autograd import Function +from torch.autograd.function import once_differentiable +from torch.nn.modules.utils import _pair + +from maskrcnn_benchmark import _C + + +class _ROIAlign(Function): + @staticmethod + def forward(ctx, input, roi, output_size, spatial_scale, sampling_ratio): + ctx.save_for_backward(roi) + ctx.output_size = _pair(output_size) + ctx.spatial_scale = spatial_scale + ctx.sampling_ratio = sampling_ratio + ctx.input_shape = input.size() + output = _C.roi_align_forward( + input, roi, spatial_scale, output_size[0], output_size[1], sampling_ratio + ) + return output + + @staticmethod + @once_differentiable + def backward(ctx, grad_output): + rois, = ctx.saved_tensors + output_size = ctx.output_size + spatial_scale = ctx.spatial_scale + sampling_ratio = ctx.sampling_ratio + bs, ch, h, w = ctx.input_shape + grad_input = _C.roi_align_backward( + grad_output, + rois, + spatial_scale, + output_size[0], + output_size[1], + bs, + ch, + h, + w, + sampling_ratio, + ) + return grad_input, None, None, None, None + + +roi_align = _ROIAlign.apply + + +class ROIAlign(nn.Module): + def __init__(self, output_size, spatial_scale, sampling_ratio): + super(ROIAlign, self).__init__() + self.output_size = output_size + self.spatial_scale = spatial_scale + self.sampling_ratio = sampling_ratio + + def forward(self, input, rois): + return roi_align( + input, rois, self.output_size, self.spatial_scale, self.sampling_ratio + ) + + def __repr__(self): + tmpstr = self.__class__.__name__ + "(" + tmpstr += "output_size=" + str(self.output_size) + tmpstr += ", spatial_scale=" + str(self.spatial_scale) + tmpstr += ", sampling_ratio=" + str(self.sampling_ratio) + tmpstr += ")" + return tmpstr diff --git a/maskrcnn_benchmark/layers/roi_pool.py b/maskrcnn_benchmark/layers/roi_pool.py new file mode 100644 index 0000000000000000000000000000000000000000..c0e42756ee6fcd779387255391a30079a28f5e60 --- /dev/null +++ b/maskrcnn_benchmark/layers/roi_pool.py @@ -0,0 +1,63 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 
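+# RoIPool: each region of interest is mapped into feature-map coordinates via
+# spatial_scale, divided into an output_size x output_size grid, and
+# max-pooled per cell, producing a fixed-size feature for every proposal.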
+import torch +from torch import nn +from torch.autograd import Function +from torch.autograd.function import once_differentiable +from torch.nn.modules.utils import _pair + +from maskrcnn_benchmark import _C + + +class _ROIPool(Function): + @staticmethod + def forward(ctx, input, roi, output_size, spatial_scale): + ctx.output_size = _pair(output_size) + ctx.spatial_scale = spatial_scale + ctx.input_shape = input.size() + output, argmax = _C.roi_pool_forward( + input, roi, spatial_scale, output_size[0], output_size[1] + ) + ctx.save_for_backward(input, roi, argmax) + return output + + @staticmethod + @once_differentiable + def backward(ctx, grad_output): + input, rois, argmax = ctx.saved_tensors + output_size = ctx.output_size + spatial_scale = ctx.spatial_scale + bs, ch, h, w = ctx.input_shape + grad_input = _C.roi_pool_backward( + grad_output, + input, + rois, + argmax, + spatial_scale, + output_size[0], + output_size[1], + bs, + ch, + h, + w, + ) + return grad_input, None, None, None + + +roi_pool = _ROIPool.apply + + +class ROIPool(nn.Module): + def __init__(self, output_size, spatial_scale): + super(ROIPool, self).__init__() + self.output_size = output_size + self.spatial_scale = spatial_scale + + def forward(self, input, rois): + return roi_pool(input, rois, self.output_size, self.spatial_scale) + + def __repr__(self): + tmpstr = self.__class__.__name__ + "(" + tmpstr += "output_size=" + str(self.output_size) + tmpstr += ", spatial_scale=" + str(self.spatial_scale) + tmpstr += ")" + return tmpstr diff --git a/maskrcnn_benchmark/layers/scale.py b/maskrcnn_benchmark/layers/scale.py new file mode 100644 index 0000000000000000000000000000000000000000..2c25622e939b6cc19e07c485a6910c1a5ff8da3c --- /dev/null +++ b/maskrcnn_benchmark/layers/scale.py @@ -0,0 +1,11 @@ +import torch +from torch import nn + + +class Scale(nn.Module): + def __init__(self, init_value=1.0): + super(Scale, self).__init__() + self.scale = nn.Parameter(torch.FloatTensor([init_value])) + + def forward(self, input): + return input * self.scale diff --git a/maskrcnn_benchmark/layers/sigmoid_focal_loss.py b/maskrcnn_benchmark/layers/sigmoid_focal_loss.py new file mode 100644 index 0000000000000000000000000000000000000000..c42b4d69900e6222d972ee1296648eae97fec511 --- /dev/null +++ b/maskrcnn_benchmark/layers/sigmoid_focal_loss.py @@ -0,0 +1,76 @@ +import torch +from torch import nn +from torch.autograd import Function +from torch.autograd.function import once_differentiable + +from maskrcnn_benchmark import _C + +# TODO: Use JIT to replace CUDA implementation in the future. 
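+# Sigmoid focal loss (Lin et al., "Focal Loss for Dense Object Detection"):
+#   FL(p_t) = -alpha_t * (1 - p_t)^gamma * log(p_t)
+# computed per class with a sigmoid, so well-classified examples (p_t close
+# to 1) are down-weighted by the (1 - p_t)^gamma factor; the CPU fallback
+# below implements the same formula as the CUDA kernel.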
+class _SigmoidFocalLoss(Function): + @staticmethod + def forward(ctx, logits, targets, gamma, alpha): + ctx.save_for_backward(logits, targets) + num_classes = logits.shape[1] + ctx.num_classes = num_classes + ctx.gamma = gamma + ctx.alpha = alpha + + losses = _C.sigmoid_focalloss_forward( + logits, targets, num_classes, gamma, alpha + ) + return losses + + @staticmethod + @once_differentiable + def backward(ctx, d_loss): + logits, targets = ctx.saved_tensors + num_classes = ctx.num_classes + gamma = ctx.gamma + alpha = ctx.alpha + d_loss = d_loss.contiguous() + d_logits = _C.sigmoid_focalloss_backward( + logits, targets, d_loss, num_classes, gamma, alpha + ) + return d_logits, None, None, None, None + + +sigmoid_focal_loss_cuda = _SigmoidFocalLoss.apply + + +def sigmoid_focal_loss_cpu(logits, targets, gamma, alpha): + num_classes = logits.shape[1] + gamma = gamma[0] + alpha = alpha[0] + dtype = targets.dtype + device = targets.device + class_range = torch.arange(1, num_classes+1, dtype=dtype, device=device).unsqueeze(0) + + t = targets.unsqueeze(1) + p = torch.sigmoid(logits) + term1 = (1 - p) ** gamma * torch.log(p) + term2 = p ** gamma * torch.log(1 - p) + return -(t == class_range).float() * term1 * alpha - ((t != class_range) * (t >= 0)).float() * term2 * (1 - alpha) + + +class SigmoidFocalLoss(nn.Module): + def __init__(self, gamma, alpha): + super(SigmoidFocalLoss, self).__init__() + self.gamma = gamma + self.alpha = alpha + + def forward(self, logits, targets): + device = logits.device + if logits.is_cuda: + loss_func = sigmoid_focal_loss_cuda + else: + loss_func = sigmoid_focal_loss_cpu + + loss = loss_func(logits, targets, self.gamma, self.alpha) + return loss.sum() + + def __repr__(self): + tmpstr = self.__class__.__name__ + "(" + tmpstr += "gamma=" + str(self.gamma) + tmpstr += ", alpha=" + str(self.alpha) + tmpstr += ")" + return tmpstr diff --git a/maskrcnn_benchmark/layers/smooth_l1_loss.py b/maskrcnn_benchmark/layers/smooth_l1_loss.py new file mode 100644 index 0000000000000000000000000000000000000000..408a06e6a2c229c549e9ad2143826e3f7212e909 --- /dev/null +++ b/maskrcnn_benchmark/layers/smooth_l1_loss.py @@ -0,0 +1,18 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +import torch +import numpy as np + +# TODO maybe push this to nn? +def smooth_l1_loss(input, target, beta=1. 
/ 9, size_average=True): + """ + very similar to the smooth_l1_loss from pytorch, but with + the extra beta parameter + """ + n = torch.abs(input - target) + cond = n < beta + loss = torch.where(cond, 0.5 * n ** 2 / beta, n - 0.5 * beta) + if size_average: + return loss.mean() + return loss.sum() + + diff --git a/maskrcnn_benchmark/modeling/__init__.py b/maskrcnn_benchmark/modeling/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/maskrcnn_benchmark/modeling/__pycache__/__init__.cpython-37.pyc b/maskrcnn_benchmark/modeling/__pycache__/__init__.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..2ab781901b2f3a1ed9bc6142ffe9ef422cf4be65 Binary files /dev/null and b/maskrcnn_benchmark/modeling/__pycache__/__init__.cpython-37.pyc differ diff --git a/maskrcnn_benchmark/modeling/__pycache__/balanced_positive_negative_sampler.cpython-37.pyc b/maskrcnn_benchmark/modeling/__pycache__/balanced_positive_negative_sampler.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c463f1dc7267420042ba6eca641cc669c9882e05 Binary files /dev/null and b/maskrcnn_benchmark/modeling/__pycache__/balanced_positive_negative_sampler.cpython-37.pyc differ diff --git a/maskrcnn_benchmark/modeling/__pycache__/box_coder.cpython-37.pyc b/maskrcnn_benchmark/modeling/__pycache__/box_coder.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..73f36b72d46c5df237708f116322d2d8f5d30307 Binary files /dev/null and b/maskrcnn_benchmark/modeling/__pycache__/box_coder.cpython-37.pyc differ diff --git a/maskrcnn_benchmark/modeling/__pycache__/make_layers.cpython-37.pyc b/maskrcnn_benchmark/modeling/__pycache__/make_layers.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..49885f2475ca254faf872c9aae3e4b5ac88a7062 Binary files /dev/null and b/maskrcnn_benchmark/modeling/__pycache__/make_layers.cpython-37.pyc differ diff --git a/maskrcnn_benchmark/modeling/__pycache__/matcher.cpython-37.pyc b/maskrcnn_benchmark/modeling/__pycache__/matcher.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..71a7bccbdb73f65666d37ec0d6ddf30d61c2abfb Binary files /dev/null and b/maskrcnn_benchmark/modeling/__pycache__/matcher.cpython-37.pyc differ diff --git a/maskrcnn_benchmark/modeling/__pycache__/poolers.cpython-37.pyc b/maskrcnn_benchmark/modeling/__pycache__/poolers.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..eb65166368223e96fe2f5976828acf0c8cbd2f39 Binary files /dev/null and b/maskrcnn_benchmark/modeling/__pycache__/poolers.cpython-37.pyc differ diff --git a/maskrcnn_benchmark/modeling/__pycache__/registry.cpython-37.pyc b/maskrcnn_benchmark/modeling/__pycache__/registry.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..2fdf723eb51a74ddb0e88a38c48d930aa07469e2 Binary files /dev/null and b/maskrcnn_benchmark/modeling/__pycache__/registry.cpython-37.pyc differ diff --git a/maskrcnn_benchmark/modeling/__pycache__/utils.cpython-37.pyc b/maskrcnn_benchmark/modeling/__pycache__/utils.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..dd77f347e8904a48f0a85f60e3b67228067067f7 Binary files /dev/null and b/maskrcnn_benchmark/modeling/__pycache__/utils.cpython-37.pyc differ diff --git a/maskrcnn_benchmark/modeling/backbone/__init__.py b/maskrcnn_benchmark/modeling/backbone/__init__.py new file mode 100644 index 
0000000000000000000000000000000000000000..537ebe56e683f4c665bb9b60fed9a1811645d8e5 --- /dev/null +++ b/maskrcnn_benchmark/modeling/backbone/__init__.py @@ -0,0 +1,3 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +from .backbone import build_backbone +from . import fbnet diff --git a/maskrcnn_benchmark/modeling/backbone/__pycache__/__init__.cpython-37.pyc b/maskrcnn_benchmark/modeling/backbone/__pycache__/__init__.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..3c1acb7b7c716b8cae13cb8a0788ef4100f8d005 Binary files /dev/null and b/maskrcnn_benchmark/modeling/backbone/__pycache__/__init__.cpython-37.pyc differ diff --git a/maskrcnn_benchmark/modeling/backbone/__pycache__/backbone.cpython-37.pyc b/maskrcnn_benchmark/modeling/backbone/__pycache__/backbone.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d30113b53867c399ad4c68d8346e59d2028bdb73 Binary files /dev/null and b/maskrcnn_benchmark/modeling/backbone/__pycache__/backbone.cpython-37.pyc differ diff --git a/maskrcnn_benchmark/modeling/backbone/__pycache__/fbnet.cpython-37.pyc b/maskrcnn_benchmark/modeling/backbone/__pycache__/fbnet.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..a9e29322bb280acb1f3dc5c2e9d4bd86d13d1a31 Binary files /dev/null and b/maskrcnn_benchmark/modeling/backbone/__pycache__/fbnet.cpython-37.pyc differ diff --git a/maskrcnn_benchmark/modeling/backbone/__pycache__/fbnet_builder.cpython-37.pyc b/maskrcnn_benchmark/modeling/backbone/__pycache__/fbnet_builder.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..0558ee4cfbf387774e878d7f68741835dc235434 Binary files /dev/null and b/maskrcnn_benchmark/modeling/backbone/__pycache__/fbnet_builder.cpython-37.pyc differ diff --git a/maskrcnn_benchmark/modeling/backbone/__pycache__/fbnet_modeldef.cpython-37.pyc b/maskrcnn_benchmark/modeling/backbone/__pycache__/fbnet_modeldef.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..1a908e013e2e49b86c7b958b36384a409ff74144 Binary files /dev/null and b/maskrcnn_benchmark/modeling/backbone/__pycache__/fbnet_modeldef.cpython-37.pyc differ diff --git a/maskrcnn_benchmark/modeling/backbone/__pycache__/fpn.cpython-37.pyc b/maskrcnn_benchmark/modeling/backbone/__pycache__/fpn.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..0f4e225cbc5c8ea55f42c87760258133bc66d39f Binary files /dev/null and b/maskrcnn_benchmark/modeling/backbone/__pycache__/fpn.cpython-37.pyc differ diff --git a/maskrcnn_benchmark/modeling/backbone/__pycache__/msr.cpython-37.pyc b/maskrcnn_benchmark/modeling/backbone/__pycache__/msr.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..9872875bf5c3f7fc2b1103b3f447f15b6952df1d Binary files /dev/null and b/maskrcnn_benchmark/modeling/backbone/__pycache__/msr.cpython-37.pyc differ diff --git a/maskrcnn_benchmark/modeling/backbone/__pycache__/pan.cpython-37.pyc b/maskrcnn_benchmark/modeling/backbone/__pycache__/pan.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..8fc79c0279cec443d54aa1225f818a8fe413d29c Binary files /dev/null and b/maskrcnn_benchmark/modeling/backbone/__pycache__/pan.cpython-37.pyc differ diff --git a/maskrcnn_benchmark/modeling/backbone/__pycache__/resnet.cpython-37.pyc b/maskrcnn_benchmark/modeling/backbone/__pycache__/resnet.cpython-37.pyc new file mode 100644 index 
0000000000000000000000000000000000000000..68a221cd9b27243a66508dd51c81fc199a617f15 Binary files /dev/null and b/maskrcnn_benchmark/modeling/backbone/__pycache__/resnet.cpython-37.pyc differ diff --git a/maskrcnn_benchmark/modeling/backbone/backbone.py b/maskrcnn_benchmark/modeling/backbone/backbone.py new file mode 100644 index 0000000000000000000000000000000000000000..105d6dc54c888e8a25482c95be7b27d12abad47c --- /dev/null +++ b/maskrcnn_benchmark/modeling/backbone/backbone.py @@ -0,0 +1,119 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +from collections import OrderedDict + +from torch import nn + +from maskrcnn_benchmark.modeling import registry +from maskrcnn_benchmark.modeling.make_layers import conv_with_kaiming_uniform +from . import fpn as fpn_module +from .pan import PAN +from .msr import MSR +from . import resnet + + +@registry.BACKBONES.register("R-50-C4") +@registry.BACKBONES.register("R-50-C5") +@registry.BACKBONES.register("R-101-C4") +@registry.BACKBONES.register("R-101-C5") +def build_resnet_backbone(cfg): + body = resnet.ResNet(cfg) + model = nn.Sequential(OrderedDict([("body", body)])) + model.out_channels = cfg.MODEL.RESNETS.BACKBONE_OUT_CHANNELS + return model + + +@registry.BACKBONES.register("R-50-FPN") +@registry.BACKBONES.register("R-101-FPN") +@registry.BACKBONES.register("R-152-FPN") +def build_resnet_fpn_backbone(cfg): + in_channels_stage2 = cfg.MODEL.RESNETS.RES2_OUT_CHANNELS # 256 + in_channels_list = [ + in_channels_stage2, # 256 + in_channels_stage2 * 2, # 512 + in_channels_stage2 * 4, # 1024 + in_channels_stage2 * 8, # 2048 + ] + body = resnet.ResNet(cfg) + out_channels = cfg.MODEL.RESNETS.BACKBONE_OUT_CHANNELS # 256 + fpn = fpn_module.FPN( + in_channels_list=in_channels_list, + out_channels=out_channels, + conv_block=conv_with_kaiming_uniform( + cfg.MODEL.FPN.USE_GN, cfg.MODEL.FPN.USE_RELU, + cfg.MODEL.FPN.USE_DEFORMABLE + ), + top_blocks=fpn_module.LastLevelMaxPool(), + ) + if cfg.MODEL.MSR_ON: + model = MSR(body, in_channels_list, fpn=fpn) + else: + model = nn.Sequential(OrderedDict([("body", body), ("fpn", fpn)])) + model.out_channels = out_channels + return model + + +@registry.BACKBONES.register("R-50-PAN") +@registry.BACKBONES.register("R-101-PAN") +@registry.BACKBONES.register("R-152-PAN") +def build_resnet_fpn_backbone(cfg): + in_channels_stage2 = cfg.MODEL.RESNETS.RES2_OUT_CHANNELS + in_channels_list = [ + in_channels_stage2, + in_channels_stage2 * 2, + in_channels_stage2 * 4, + in_channels_stage2 * 8, + ] + body = resnet.ResNet(cfg) + out_channels = cfg.MODEL.RESNETS.BACKBONE_OUT_CHANNELS + fpn = fpn_module.FPN( + in_channels_list=in_channels_list, + out_channels=out_channels, + conv_block=conv_with_kaiming_uniform( + cfg.MODEL.FPN.USE_GN, cfg.MODEL.FPN.USE_RELU, + cfg.MODEL.FPN.USE_DEFORMABLE + ), + top_blocks=fpn_module.LastLevelMaxPool(), + ) + pan = PAN() + if cfg.MODEL.MSR_ON: + model = MSR(body, in_channels_list, fpn=fpn, pan=pan) + else: + model = nn.Sequential(OrderedDict([("body", body), + ("pan", pan), + ("fpn", fpn)])) + model.out_channels = out_channels + return model + + +@registry.BACKBONES.register("R-50-FPN-RETINANET") +@registry.BACKBONES.register("R-101-FPN-RETINANET") +def build_resnet_fpn_p3p7_backbone(cfg): + body = resnet.ResNet(cfg) + in_channels_stage2 = cfg.MODEL.RESNETS.RES2_OUT_CHANNELS + out_channels = cfg.MODEL.RESNETS.BACKBONE_OUT_CHANNELS + in_channels_p6p7 = in_channels_stage2 * 8 if cfg.MODEL.RETINANET.USE_C5 \ + else out_channels + fpn = fpn_module.FPN( + in_channels_list=[ + 0, + 
in_channels_stage2 * 2, + in_channels_stage2 * 4, + in_channels_stage2 * 8, + ], + out_channels=out_channels, + conv_block=conv_with_kaiming_uniform( + cfg.MODEL.FPN.USE_GN, cfg.MODEL.FPN.USE_RELU + ), + top_blocks=fpn_module.LastLevelP6P7(in_channels_p6p7, out_channels), + ) + model = nn.Sequential(OrderedDict([("body", body), ("fpn", fpn)])) + model.out_channels = out_channels + return model + + +def build_backbone(cfg): + assert cfg.MODEL.BACKBONE.CONV_BODY in registry.BACKBONES, \ + "cfg.MODEL.BACKBONE.CONV_BODY: {} are not registered in registry".format( + cfg.MODEL.BACKBONE.CONV_BODY + ) + return registry.BACKBONES[cfg.MODEL.BACKBONE.CONV_BODY](cfg) diff --git a/maskrcnn_benchmark/modeling/backbone/fbnet.py b/maskrcnn_benchmark/modeling/backbone/fbnet.py new file mode 100644 index 0000000000000000000000000000000000000000..0d8cf1522f61dd77c4c8617a1555a004509e4352 --- /dev/null +++ b/maskrcnn_benchmark/modeling/backbone/fbnet.py @@ -0,0 +1,252 @@ +from __future__ import absolute_import, division, print_function, unicode_literals + +import copy +import json +import logging +from collections import OrderedDict + +from . import ( + fbnet_builder as mbuilder, + fbnet_modeldef as modeldef, +) +import torch.nn as nn +from maskrcnn_benchmark.modeling import registry +from maskrcnn_benchmark.modeling.rpn import rpn +from maskrcnn_benchmark.modeling import poolers + + +logger = logging.getLogger(__name__) + + +def create_builder(cfg): + bn_type = cfg.MODEL.FBNET.BN_TYPE + if bn_type == "gn": + bn_type = (bn_type, cfg.GROUP_NORM.NUM_GROUPS) + factor = cfg.MODEL.FBNET.SCALE_FACTOR + + arch = cfg.MODEL.FBNET.ARCH + arch_def = cfg.MODEL.FBNET.ARCH_DEF + if len(arch_def) > 0: + arch_def = json.loads(arch_def) + if arch in modeldef.MODEL_ARCH: + if len(arch_def) > 0: + assert ( + arch_def == modeldef.MODEL_ARCH[arch] + ), "Two architectures with the same name {},\n{},\n{}".format( + arch, arch_def, modeldef.MODEL_ARCH[arch] + ) + arch_def = modeldef.MODEL_ARCH[arch] + else: + assert arch_def is not None and len(arch_def) > 0 + arch_def = mbuilder.unify_arch_def(arch_def) + + rpn_stride = arch_def.get("rpn_stride", None) + if rpn_stride is not None: + assert ( + cfg.MODEL.RPN.ANCHOR_STRIDE[0] == rpn_stride + ), "Needs to set cfg.MODEL.RPN.ANCHOR_STRIDE to {}, got {}".format( + rpn_stride, cfg.MODEL.RPN.ANCHOR_STRIDE + ) + width_divisor = cfg.MODEL.FBNET.WIDTH_DIVISOR + dw_skip_bn = cfg.MODEL.FBNET.DW_CONV_SKIP_BN + dw_skip_relu = cfg.MODEL.FBNET.DW_CONV_SKIP_RELU + + logger.info( + "Building fbnet model with arch {} (without scaling):\n{}".format( + arch, arch_def + ) + ) + + builder = mbuilder.FBNetBuilder( + width_ratio=factor, + bn_type=bn_type, + width_divisor=width_divisor, + dw_skip_bn=dw_skip_bn, + dw_skip_relu=dw_skip_relu, + ) + + return builder, arch_def + + +def _get_trunk_cfg(arch_def): + """ Get all stages except the last one """ + num_stages = mbuilder.get_num_stages(arch_def) + trunk_stages = arch_def.get("backbone", range(num_stages - 1)) + ret = mbuilder.get_blocks(arch_def, stage_indices=trunk_stages) + return ret + + +class FBNetTrunk(nn.Module): + def __init__( + self, builder, arch_def, dim_in, + ): + super(FBNetTrunk, self).__init__() + self.first = builder.add_first(arch_def["first"], dim_in=dim_in) + trunk_cfg = _get_trunk_cfg(arch_def) + self.stages = builder.add_blocks(trunk_cfg["stages"]) + + # return features for each stage + def forward(self, x): + y = self.first(x) + y = self.stages(y) + ret = [y] + return ret + + +@registry.BACKBONES.register("FBNet") +def 
add_conv_body(cfg, dim_in=3): + builder, arch_def = create_builder(cfg) + + body = FBNetTrunk(builder, arch_def, dim_in) + model = nn.Sequential(OrderedDict([("body", body)])) + model.out_channels = builder.last_depth + + return model + + +def _get_rpn_stage(arch_def, num_blocks): + rpn_stage = arch_def.get("rpn") + ret = mbuilder.get_blocks(arch_def, stage_indices=rpn_stage) + if num_blocks > 0: + logger.warn('Use last {} blocks in {} as rpn'.format(num_blocks, ret)) + block_count = len(ret["stages"]) + assert num_blocks <= block_count, "use block {}, block count {}".format( + num_blocks, block_count + ) + blocks = range(block_count - num_blocks, block_count) + ret = mbuilder.get_blocks(ret, block_indices=blocks) + return ret["stages"] + + +class FBNetRPNHead(nn.Module): + def __init__( + self, cfg, in_channels, builder, arch_def, + ): + super(FBNetRPNHead, self).__init__() + assert in_channels == builder.last_depth + + rpn_bn_type = cfg.MODEL.FBNET.RPN_BN_TYPE + if len(rpn_bn_type) > 0: + builder.bn_type = rpn_bn_type + + use_blocks = cfg.MODEL.FBNET.RPN_HEAD_BLOCKS + stages = _get_rpn_stage(arch_def, use_blocks) + + self.head = builder.add_blocks(stages) + self.out_channels = builder.last_depth + + def forward(self, x): + x = [self.head(y) for y in x] + return x + + +@registry.RPN_HEADS.register("FBNet.rpn_head") +def add_rpn_head(cfg, in_channels, num_anchors): + builder, model_arch = create_builder(cfg) + builder.last_depth = in_channels + + assert in_channels == builder.last_depth + # builder.name_prefix = "[rpn]" + + rpn_feature = FBNetRPNHead(cfg, in_channels, builder, model_arch) + rpn_regressor = rpn.RPNHeadConvRegressor( + cfg, rpn_feature.out_channels, num_anchors) + return nn.Sequential(rpn_feature, rpn_regressor) + + +def _get_head_stage(arch, head_name, blocks): + # use default name 'head' if the specific name 'head_name' does not existed + if head_name not in arch: + head_name = "head" + head_stage = arch.get(head_name) + ret = mbuilder.get_blocks(arch, stage_indices=head_stage, block_indices=blocks) + return ret["stages"] + + +# name mapping for head names in arch def and cfg +ARCH_CFG_NAME_MAPPING = { + "bbox": "ROI_BOX_HEAD", + "kpts": "ROI_KEYPOINT_HEAD", + "mask": "ROI_MASK_HEAD", +} + + +class FBNetROIHead(nn.Module): + def __init__( + self, cfg, in_channels, builder, arch_def, + head_name, use_blocks, stride_init, last_layer_scale, + ): + super(FBNetROIHead, self).__init__() + assert in_channels == builder.last_depth + assert isinstance(use_blocks, list) + + head_cfg_name = ARCH_CFG_NAME_MAPPING[head_name] + self.pooler = poolers.make_pooler(cfg, head_cfg_name) + + stage = _get_head_stage(arch_def, head_name, use_blocks) + + assert stride_init in [0, 1, 2] + if stride_init != 0: + stage[0]["block"][3] = stride_init + blocks = builder.add_blocks(stage) + + last_info = copy.deepcopy(arch_def["last"]) + last_info[1] = last_layer_scale + last = builder.add_last(last_info) + + self.head = nn.Sequential(OrderedDict([ + ("blocks", blocks), + ("last", last) + ])) + + self.out_channels = builder.last_depth + + def forward(self, x, proposals): + x = self.pooler(x, proposals) + x = self.head(x) + return x + + +@registry.ROI_BOX_FEATURE_EXTRACTORS.register("FBNet.roi_head") +def add_roi_head(cfg, in_channels): + builder, model_arch = create_builder(cfg) + builder.last_depth = in_channels + # builder.name_prefix = "_[bbox]_" + + return FBNetROIHead( + cfg, in_channels, builder, model_arch, + head_name="bbox", + use_blocks=cfg.MODEL.FBNET.DET_HEAD_BLOCKS, + 
stride_init=cfg.MODEL.FBNET.DET_HEAD_STRIDE, + last_layer_scale=cfg.MODEL.FBNET.DET_HEAD_LAST_SCALE, + ) + + +@registry.ROI_KEYPOINT_FEATURE_EXTRACTORS.register("FBNet.roi_head_keypoints") +def add_roi_head_keypoints(cfg, in_channels): + builder, model_arch = create_builder(cfg) + builder.last_depth = in_channels + # builder.name_prefix = "_[kpts]_" + + return FBNetROIHead( + cfg, in_channels, builder, model_arch, + head_name="kpts", + use_blocks=cfg.MODEL.FBNET.KPTS_HEAD_BLOCKS, + stride_init=cfg.MODEL.FBNET.KPTS_HEAD_STRIDE, + last_layer_scale=cfg.MODEL.FBNET.KPTS_HEAD_LAST_SCALE, + ) + + +@registry.ROI_MASK_FEATURE_EXTRACTORS.register("FBNet.roi_head_mask") +def add_roi_head_mask(cfg, in_channels): + builder, model_arch = create_builder(cfg) + builder.last_depth = in_channels + # builder.name_prefix = "_[mask]_" + + return FBNetROIHead( + cfg, in_channels, builder, model_arch, + head_name="mask", + use_blocks=cfg.MODEL.FBNET.MASK_HEAD_BLOCKS, + stride_init=cfg.MODEL.FBNET.MASK_HEAD_STRIDE, + last_layer_scale=cfg.MODEL.FBNET.MASK_HEAD_LAST_SCALE, + ) diff --git a/maskrcnn_benchmark/modeling/backbone/fbnet_builder.py b/maskrcnn_benchmark/modeling/backbone/fbnet_builder.py new file mode 100644 index 0000000000000000000000000000000000000000..112a04074c31307d9080e0bf61115f79d4a9e0d4 --- /dev/null +++ b/maskrcnn_benchmark/modeling/backbone/fbnet_builder.py @@ -0,0 +1,829 @@ +""" +FBNet model builder +""" + +from __future__ import absolute_import, division, print_function, unicode_literals + +import copy +import logging +import math +from collections import OrderedDict + +import torch +import torch.nn as nn +from maskrcnn_benchmark.layers import ( + BatchNorm2d, + Conv2d, + FrozenBatchNorm2d, + interpolate, +) +from maskrcnn_benchmark.layers.misc import _NewEmptyTensorOp + + +logger = logging.getLogger(__name__) + + +def _py2_round(x): + return math.floor(x + 0.5) if x >= 0.0 else math.ceil(x - 0.5) + + +def _get_divisible_by(num, divisible_by, min_val): + ret = int(num) + if divisible_by > 0 and num % divisible_by != 0: + ret = int((_py2_round(num / divisible_by) or min_val) * divisible_by) + return ret + + +PRIMITIVES = { + "skip": lambda C_in, C_out, expansion, stride, **kwargs: Identity( + C_in, C_out, stride + ), + "ir_k3": lambda C_in, C_out, expansion, stride, **kwargs: IRFBlock( + C_in, C_out, expansion, stride, **kwargs + ), + "ir_k5": lambda C_in, C_out, expansion, stride, **kwargs: IRFBlock( + C_in, C_out, expansion, stride, kernel=5, **kwargs + ), + "ir_k7": lambda C_in, C_out, expansion, stride, **kwargs: IRFBlock( + C_in, C_out, expansion, stride, kernel=7, **kwargs + ), + "ir_k1": lambda C_in, C_out, expansion, stride, **kwargs: IRFBlock( + C_in, C_out, expansion, stride, kernel=1, **kwargs + ), + "shuffle": lambda C_in, C_out, expansion, stride, **kwargs: IRFBlock( + C_in, C_out, expansion, stride, shuffle_type="mid", pw_group=4, **kwargs + ), + "basic_block": lambda C_in, C_out, expansion, stride, **kwargs: CascadeConv3x3( + C_in, C_out, stride + ), + "shift_5x5": lambda C_in, C_out, expansion, stride, **kwargs: ShiftBlock5x5( + C_in, C_out, expansion, stride + ), + # layer search 2 + "ir_k3_e1": lambda C_in, C_out, expansion, stride, **kwargs: IRFBlock( + C_in, C_out, 1, stride, kernel=3, **kwargs + ), + "ir_k3_e3": lambda C_in, C_out, expansion, stride, **kwargs: IRFBlock( + C_in, C_out, 3, stride, kernel=3, **kwargs + ), + "ir_k3_e6": lambda C_in, C_out, expansion, stride, **kwargs: IRFBlock( + C_in, C_out, 6, stride, kernel=3, **kwargs + ), + "ir_k3_s4": lambda C_in, 
C_out, expansion, stride, **kwargs: IRFBlock( + C_in, C_out, 4, stride, kernel=3, shuffle_type="mid", pw_group=4, **kwargs + ), + "ir_k5_e1": lambda C_in, C_out, expansion, stride, **kwargs: IRFBlock( + C_in, C_out, 1, stride, kernel=5, **kwargs + ), + "ir_k5_e3": lambda C_in, C_out, expansion, stride, **kwargs: IRFBlock( + C_in, C_out, 3, stride, kernel=5, **kwargs + ), + "ir_k5_e6": lambda C_in, C_out, expansion, stride, **kwargs: IRFBlock( + C_in, C_out, 6, stride, kernel=5, **kwargs + ), + "ir_k5_s4": lambda C_in, C_out, expansion, stride, **kwargs: IRFBlock( + C_in, C_out, 4, stride, kernel=5, shuffle_type="mid", pw_group=4, **kwargs + ), + # layer search se + "ir_k3_e1_se": lambda C_in, C_out, expansion, stride, **kwargs: IRFBlock( + C_in, C_out, 1, stride, kernel=3, se=True, **kwargs + ), + "ir_k3_e3_se": lambda C_in, C_out, expansion, stride, **kwargs: IRFBlock( + C_in, C_out, 3, stride, kernel=3, se=True, **kwargs + ), + "ir_k3_e6_se": lambda C_in, C_out, expansion, stride, **kwargs: IRFBlock( + C_in, C_out, 6, stride, kernel=3, se=True, **kwargs + ), + "ir_k3_s4_se": lambda C_in, C_out, expansion, stride, **kwargs: IRFBlock( + C_in, + C_out, + 4, + stride, + kernel=3, + shuffle_type="mid", + pw_group=4, + se=True, + **kwargs + ), + "ir_k5_e1_se": lambda C_in, C_out, expansion, stride, **kwargs: IRFBlock( + C_in, C_out, 1, stride, kernel=5, se=True, **kwargs + ), + "ir_k5_e3_se": lambda C_in, C_out, expansion, stride, **kwargs: IRFBlock( + C_in, C_out, 3, stride, kernel=5, se=True, **kwargs + ), + "ir_k5_e6_se": lambda C_in, C_out, expansion, stride, **kwargs: IRFBlock( + C_in, C_out, 6, stride, kernel=5, se=True, **kwargs + ), + "ir_k5_s4_se": lambda C_in, C_out, expansion, stride, **kwargs: IRFBlock( + C_in, + C_out, + 4, + stride, + kernel=5, + shuffle_type="mid", + pw_group=4, + se=True, + **kwargs + ), + # layer search 3 (in addition to layer search 2) + "ir_k3_s2": lambda C_in, C_out, expansion, stride, **kwargs: IRFBlock( + C_in, C_out, 1, stride, kernel=3, shuffle_type="mid", pw_group=2, **kwargs + ), + "ir_k5_s2": lambda C_in, C_out, expansion, stride, **kwargs: IRFBlock( + C_in, C_out, 1, stride, kernel=5, shuffle_type="mid", pw_group=2, **kwargs + ), + "ir_k3_s2_se": lambda C_in, C_out, expansion, stride, **kwargs: IRFBlock( + C_in, + C_out, + 1, + stride, + kernel=3, + shuffle_type="mid", + pw_group=2, + se=True, + **kwargs + ), + "ir_k5_s2_se": lambda C_in, C_out, expansion, stride, **kwargs: IRFBlock( + C_in, + C_out, + 1, + stride, + kernel=5, + shuffle_type="mid", + pw_group=2, + se=True, + **kwargs + ), + # layer search 4 (in addition to layer search 3) + "ir_k3_sep": lambda C_in, C_out, expansion, stride, **kwargs: IRFBlock( + C_in, C_out, expansion, stride, kernel=3, cdw=True, **kwargs + ), + "ir_k33_e1": lambda C_in, C_out, expansion, stride, **kwargs: IRFBlock( + C_in, C_out, 1, stride, kernel=3, cdw=True, **kwargs + ), + "ir_k33_e3": lambda C_in, C_out, expansion, stride, **kwargs: IRFBlock( + C_in, C_out, 3, stride, kernel=3, cdw=True, **kwargs + ), + "ir_k33_e6": lambda C_in, C_out, expansion, stride, **kwargs: IRFBlock( + C_in, C_out, 6, stride, kernel=3, cdw=True, **kwargs + ), + # layer search 5 (in addition to layer search 4) + "ir_k7_e1": lambda C_in, C_out, expansion, stride, **kwargs: IRFBlock( + C_in, C_out, 1, stride, kernel=7, **kwargs + ), + "ir_k7_e3": lambda C_in, C_out, expansion, stride, **kwargs: IRFBlock( + C_in, C_out, 3, stride, kernel=7, **kwargs + ), + "ir_k7_e6": lambda C_in, C_out, expansion, stride, **kwargs: IRFBlock( + C_in, C_out, 
6, stride, kernel=7, **kwargs + ), + "ir_k7_sep": lambda C_in, C_out, expansion, stride, **kwargs: IRFBlock( + C_in, C_out, expansion, stride, kernel=7, cdw=True, **kwargs + ), + "ir_k7_sep_e1": lambda C_in, C_out, expansion, stride, **kwargs: IRFBlock( + C_in, C_out, 1, stride, kernel=7, cdw=True, **kwargs + ), + "ir_k7_sep_e3": lambda C_in, C_out, expansion, stride, **kwargs: IRFBlock( + C_in, C_out, 3, stride, kernel=7, cdw=True, **kwargs + ), + "ir_k7_sep_e6": lambda C_in, C_out, expansion, stride, **kwargs: IRFBlock( + C_in, C_out, 6, stride, kernel=7, cdw=True, **kwargs + ), +} + + +class Identity(nn.Module): + def __init__(self, C_in, C_out, stride): + super(Identity, self).__init__() + self.conv = ( + ConvBNRelu( + C_in, + C_out, + kernel=1, + stride=stride, + pad=0, + no_bias=1, + use_relu="relu", + bn_type="bn", + ) + if C_in != C_out or stride != 1 + else None + ) + + def forward(self, x): + if self.conv: + out = self.conv(x) + else: + out = x + return out + + +class CascadeConv3x3(nn.Sequential): + def __init__(self, C_in, C_out, stride): + assert stride in [1, 2] + ops = [ + Conv2d(C_in, C_in, 3, stride, 1, bias=False), + BatchNorm2d(C_in), + nn.ReLU(inplace=True), + Conv2d(C_in, C_out, 3, 1, 1, bias=False), + BatchNorm2d(C_out), + ] + super(CascadeConv3x3, self).__init__(*ops) + self.res_connect = (stride == 1) and (C_in == C_out) + + def forward(self, x): + y = super(CascadeConv3x3, self).forward(x) + if self.res_connect: + y += x + return y + + +class Shift(nn.Module): + def __init__(self, C, kernel_size, stride, padding): + super(Shift, self).__init__() + self.C = C + kernel = torch.zeros((C, 1, kernel_size, kernel_size), dtype=torch.float32) + ch_idx = 0 + + assert stride in [1, 2] + self.stride = stride + self.padding = padding + self.kernel_size = kernel_size + self.dilation = 1 + + hks = kernel_size // 2 + ksq = kernel_size ** 2 + + for i in range(kernel_size): + for j in range(kernel_size): + if i == hks and j == hks: + num_ch = C // ksq + C % ksq + else: + num_ch = C // ksq + kernel[ch_idx : ch_idx + num_ch, 0, i, j] = 1 + ch_idx += num_ch + + self.register_parameter("bias", None) + self.kernel = nn.Parameter(kernel, requires_grad=False) + + def forward(self, x): + if x.numel() > 0: + return nn.functional.conv2d( + x, + self.kernel, + self.bias, + (self.stride, self.stride), + (self.padding, self.padding), + self.dilation, + self.C, # groups + ) + + output_shape = [ + (i + 2 * p - (di * (k - 1) + 1)) // d + 1 + for i, p, di, k, d in zip( + x.shape[-2:], + (self.padding, self.dilation), + (self.dilation, self.dilation), + (self.kernel_size, self.kernel_size), + (self.stride, self.stride), + ) + ] + output_shape = [x.shape[0], self.C] + output_shape + return _NewEmptyTensorOp.apply(x, output_shape) + + +class ShiftBlock5x5(nn.Sequential): + def __init__(self, C_in, C_out, expansion, stride): + assert stride in [1, 2] + self.res_connect = (stride == 1) and (C_in == C_out) + + C_mid = _get_divisible_by(C_in * expansion, 8, 8) + + ops = [ + # pw + Conv2d(C_in, C_mid, 1, 1, 0, bias=False), + BatchNorm2d(C_mid), + nn.ReLU(inplace=True), + # shift + Shift(C_mid, 5, stride, 2), + # pw-linear + Conv2d(C_mid, C_out, 1, 1, 0, bias=False), + BatchNorm2d(C_out), + ] + super(ShiftBlock5x5, self).__init__(*ops) + + def forward(self, x): + y = super(ShiftBlock5x5, self).forward(x) + if self.res_connect: + y += x + return y + + +class ChannelShuffle(nn.Module): + def __init__(self, groups): + super(ChannelShuffle, self).__init__() + self.groups = groups + + def forward(self, x): + 
"""Channel shuffle: [N,C,H,W] -> [N,g,C/g,H,W] -> [N,C/g,g,H,w] -> [N,C,H,W]""" + N, C, H, W = x.size() + g = self.groups + assert C % g == 0, "Incompatible group size {} for input channel {}".format( + g, C + ) + return ( + x.view(N, g, int(C / g), H, W) + .permute(0, 2, 1, 3, 4) + .contiguous() + .view(N, C, H, W) + ) + + +class ConvBNRelu(nn.Sequential): + def __init__( + self, + input_depth, + output_depth, + kernel, + stride, + pad, + no_bias, + use_relu, + bn_type, + group=1, + *args, + **kwargs + ): + super(ConvBNRelu, self).__init__() + + assert use_relu in ["relu", None] + if isinstance(bn_type, (list, tuple)): + assert len(bn_type) == 2 + assert bn_type[0] == "gn" + gn_group = bn_type[1] + bn_type = bn_type[0] + assert bn_type in ["bn", "af", "gn", None] + assert stride in [1, 2, 4] + + op = Conv2d( + input_depth, + output_depth, + kernel_size=kernel, + stride=stride, + padding=pad, + bias=not no_bias, + groups=group, + *args, + **kwargs + ) + nn.init.kaiming_normal_(op.weight, mode="fan_out", nonlinearity="relu") + if op.bias is not None: + nn.init.constant_(op.bias, 0.0) + self.add_module("conv", op) + + if bn_type == "bn": + bn_op = BatchNorm2d(output_depth) + elif bn_type == "gn": + bn_op = nn.GroupNorm(num_groups=gn_group, num_channels=output_depth) + elif bn_type == "af": + bn_op = FrozenBatchNorm2d(output_depth) + if bn_type is not None: + self.add_module("bn", bn_op) + + if use_relu == "relu": + self.add_module("relu", nn.ReLU(inplace=True)) + + +class SEModule(nn.Module): + reduction = 4 + + def __init__(self, C): + super(SEModule, self).__init__() + mid = max(C // self.reduction, 8) + conv1 = Conv2d(C, mid, 1, 1, 0) + conv2 = Conv2d(mid, C, 1, 1, 0) + + self.op = nn.Sequential( + nn.AdaptiveAvgPool2d(1), conv1, nn.ReLU(inplace=True), conv2, nn.Sigmoid() + ) + + def forward(self, x): + return x * self.op(x) + + +class Upsample(nn.Module): + def __init__(self, scale_factor, mode, align_corners=None): + super(Upsample, self).__init__() + self.scale = scale_factor + self.mode = mode + self.align_corners = align_corners + + def forward(self, x): + return interpolate( + x, scale_factor=self.scale, mode=self.mode, + align_corners=self.align_corners + ) + + +def _get_upsample_op(stride): + assert ( + stride in [1, 2, 4] + or stride in [-1, -2, -4] + or (isinstance(stride, tuple) and all(x in [-1, -2, -4] for x in stride)) + ) + + scales = stride + ret = None + if isinstance(stride, tuple) or stride < 0: + scales = [-x for x in stride] if isinstance(stride, tuple) else -stride + stride = 1 + ret = Upsample(scale_factor=scales, mode="nearest", align_corners=None) + + return ret, stride + + +class IRFBlock(nn.Module): + def __init__( + self, + input_depth, + output_depth, + expansion, + stride, + bn_type="bn", + kernel=3, + width_divisor=1, + shuffle_type=None, + pw_group=1, + se=False, + cdw=False, + dw_skip_bn=False, + dw_skip_relu=False, + ): + super(IRFBlock, self).__init__() + + assert kernel in [1, 3, 5, 7], kernel + + self.use_res_connect = stride == 1 and input_depth == output_depth + self.output_depth = output_depth + + mid_depth = int(input_depth * expansion) + mid_depth = _get_divisible_by(mid_depth, width_divisor, width_divisor) + + # pw + self.pw = ConvBNRelu( + input_depth, + mid_depth, + kernel=1, + stride=1, + pad=0, + no_bias=1, + use_relu="relu", + bn_type=bn_type, + group=pw_group, + ) + + # negative stride to do upsampling + self.upscale, stride = _get_upsample_op(stride) + + # dw + if kernel == 1: + self.dw = nn.Sequential() + elif cdw: + dw1 = ConvBNRelu( + 
mid_depth, + mid_depth, + kernel=kernel, + stride=stride, + pad=(kernel // 2), + group=mid_depth, + no_bias=1, + use_relu="relu", + bn_type=bn_type, + ) + dw2 = ConvBNRelu( + mid_depth, + mid_depth, + kernel=kernel, + stride=1, + pad=(kernel // 2), + group=mid_depth, + no_bias=1, + use_relu="relu" if not dw_skip_relu else None, + bn_type=bn_type if not dw_skip_bn else None, + ) + self.dw = nn.Sequential(OrderedDict([("dw1", dw1), ("dw2", dw2)])) + else: + self.dw = ConvBNRelu( + mid_depth, + mid_depth, + kernel=kernel, + stride=stride, + pad=(kernel // 2), + group=mid_depth, + no_bias=1, + use_relu="relu" if not dw_skip_relu else None, + bn_type=bn_type if not dw_skip_bn else None, + ) + + # pw-linear + self.pwl = ConvBNRelu( + mid_depth, + output_depth, + kernel=1, + stride=1, + pad=0, + no_bias=1, + use_relu=None, + bn_type=bn_type, + group=pw_group, + ) + + self.shuffle_type = shuffle_type + if shuffle_type is not None: + self.shuffle = ChannelShuffle(pw_group) + + self.se4 = SEModule(output_depth) if se else nn.Sequential() + + self.output_depth = output_depth + + def forward(self, x): + y = self.pw(x) + if self.shuffle_type == "mid": + y = self.shuffle(y) + if self.upscale is not None: + y = self.upscale(y) + y = self.dw(y) + y = self.pwl(y) + if self.use_res_connect: + y += x + y = self.se4(y) + return y + + +def _expand_block_cfg(block_cfg): + assert isinstance(block_cfg, list) + ret = [] + for idx in range(block_cfg[2]): + cur = copy.deepcopy(block_cfg) + cur[2] = 1 + cur[3] = 1 if idx >= 1 else cur[3] + ret.append(cur) + return ret + + +def expand_stage_cfg(stage_cfg): + """ For a single stage """ + assert isinstance(stage_cfg, list) + ret = [] + for x in stage_cfg: + ret += _expand_block_cfg(x) + return ret + + +def expand_stages_cfg(stage_cfgs): + """ For a list of stages """ + assert isinstance(stage_cfgs, list) + ret = [] + for x in stage_cfgs: + ret.append(expand_stage_cfg(x)) + return ret + + +def _block_cfgs_to_list(block_cfgs): + assert isinstance(block_cfgs, list) + ret = [] + for stage_idx, stage in enumerate(block_cfgs): + stage = expand_stage_cfg(stage) + for block_idx, block in enumerate(stage): + cur = {"stage_idx": stage_idx, "block_idx": block_idx, "block": block} + ret.append(cur) + return ret + + +def _add_to_arch(arch, info, name): + """ arch = [{block_0}, {block_1}, ...] + info = [ + # stage 0 + [ + block0_info, + block1_info, + ... + ], ... + ] + convert to: + arch = [ + { + block_0, + name: block0_info, + }, + { + block_1, + name: block1_info, + }, ... + ] + """ + assert isinstance(arch, list) and all(isinstance(x, dict) for x in arch) + assert isinstance(info, list) and all(isinstance(x, list) for x in info) + idx = 0 + for stage_idx, stage in enumerate(info): + for block_idx, block in enumerate(stage): + assert ( + arch[idx]["stage_idx"] == stage_idx + and arch[idx]["block_idx"] == block_idx + ), "Index ({}, {}) does not match for block {}".format( + stage_idx, block_idx, arch[idx] + ) + assert name not in arch[idx] + arch[idx][name] = block + idx += 1 + + +def unify_arch_def(arch_def): + """ unify the arch_def to: + { + ..., + "arch": [ + { + "stage_idx": idx, + "block_idx": idx, + ... + }, + {}, ... + ] + } + """ + ret = copy.deepcopy(arch_def) + + assert "block_cfg" in arch_def and "stages" in arch_def["block_cfg"] + assert "stages" not in ret + # copy 'first', 'last' etc. 
inside arch_def['block_cfg'] to ret + ret.update({x: arch_def["block_cfg"][x] for x in arch_def["block_cfg"]}) + ret["stages"] = _block_cfgs_to_list(arch_def["block_cfg"]["stages"]) + del ret["block_cfg"] + + assert "block_op_type" in arch_def + _add_to_arch(ret["stages"], arch_def["block_op_type"], "block_op_type") + del ret["block_op_type"] + + return ret + + +def get_num_stages(arch_def): + ret = 0 + for x in arch_def["stages"]: + ret = max(x["stage_idx"], ret) + ret = ret + 1 + return ret + + +def get_blocks(arch_def, stage_indices=None, block_indices=None): + ret = copy.deepcopy(arch_def) + ret["stages"] = [] + for block in arch_def["stages"]: + keep = True + if stage_indices not in (None, []) and block["stage_idx"] not in stage_indices: + keep = False + if block_indices not in (None, []) and block["block_idx"] not in block_indices: + keep = False + if keep: + ret["stages"].append(block) + return ret + + +class FBNetBuilder(object): + def __init__( + self, + width_ratio, + bn_type="bn", + width_divisor=1, + dw_skip_bn=False, + dw_skip_relu=False, + ): + self.width_ratio = width_ratio + self.last_depth = -1 + self.bn_type = bn_type + self.width_divisor = width_divisor + self.dw_skip_bn = dw_skip_bn + self.dw_skip_relu = dw_skip_relu + + def add_first(self, stage_info, dim_in=3, pad=True): + # stage_info: [c, s, kernel] + assert len(stage_info) >= 2 + channel = stage_info[0] + stride = stage_info[1] + out_depth = self._get_divisible_width(int(channel * self.width_ratio)) + kernel = 3 + if len(stage_info) > 2: + kernel = stage_info[2] + + out = ConvBNRelu( + dim_in, + out_depth, + kernel=kernel, + stride=stride, + pad=kernel // 2 if pad else 0, + no_bias=1, + use_relu="relu", + bn_type=self.bn_type, + ) + self.last_depth = out_depth + return out + + def add_blocks(self, blocks): + """ blocks: [{}, {}, ...] 
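+            Each dict is expected to provide 'stage_idx', 'block_idx',
+            'block_op_type' and a 'block' entry of the form [t, c, n, s]
+            with n == 1, as produced by unify_arch_def.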
+ """ + assert isinstance(blocks, list) and all( + isinstance(x, dict) for x in blocks + ), blocks + + modules = OrderedDict() + for block in blocks: + stage_idx = block["stage_idx"] + block_idx = block["block_idx"] + block_op_type = block["block_op_type"] + tcns = block["block"] + n = tcns[2] + assert n == 1 + nnblock = self.add_ir_block(tcns, [block_op_type]) + nn_name = "xif{}_{}".format(stage_idx, block_idx) + assert nn_name not in modules + modules[nn_name] = nnblock + ret = nn.Sequential(modules) + return ret + + def add_last(self, stage_info): + """ skip last layer if channel_scale == 0 + use the same output channel if channel_scale < 0 + """ + assert len(stage_info) == 2 + channels = stage_info[0] + channel_scale = stage_info[1] + + if channel_scale == 0.0: + return nn.Sequential() + + if channel_scale > 0: + last_channel = ( + int(channels * self.width_ratio) if self.width_ratio > 1.0 else channels + ) + last_channel = int(last_channel * channel_scale) + else: + last_channel = int(self.last_depth * (-channel_scale)) + last_channel = self._get_divisible_width(last_channel) + + if last_channel == 0: + return nn.Sequential() + + dim_in = self.last_depth + ret = ConvBNRelu( + dim_in, + last_channel, + kernel=1, + stride=1, + pad=0, + no_bias=1, + use_relu="relu", + bn_type=self.bn_type, + ) + self.last_depth = last_channel + return ret + + # def add_final_pool(self, model, blob_in, kernel_size): + # ret = model.AveragePool(blob_in, "final_avg", kernel=kernel_size, stride=1) + # return ret + + def _add_ir_block( + self, dim_in, dim_out, stride, expand_ratio, block_op_type, **kwargs + ): + ret = PRIMITIVES[block_op_type]( + dim_in, + dim_out, + expansion=expand_ratio, + stride=stride, + bn_type=self.bn_type, + width_divisor=self.width_divisor, + dw_skip_bn=self.dw_skip_bn, + dw_skip_relu=self.dw_skip_relu, + **kwargs + ) + return ret, ret.output_depth + + def add_ir_block(self, tcns, block_op_types, **kwargs): + t, c, n, s = tcns + assert n == 1 + out_depth = self._get_divisible_width(int(c * self.width_ratio)) + dim_in = self.last_depth + op, ret_depth = self._add_ir_block( + dim_in, + out_depth, + stride=s, + expand_ratio=t, + block_op_type=block_op_types[0], + **kwargs + ) + self.last_depth = ret_depth + return op + + def _get_divisible_width(self, width): + ret = _get_divisible_by(int(width), self.width_divisor, self.width_divisor) + return ret diff --git a/maskrcnn_benchmark/modeling/backbone/fbnet_modeldef.py b/maskrcnn_benchmark/modeling/backbone/fbnet_modeldef.py new file mode 100644 index 0000000000000000000000000000000000000000..fb1c96b3a4dbe735682ae81361ee0efed75cbb25 --- /dev/null +++ b/maskrcnn_benchmark/modeling/backbone/fbnet_modeldef.py @@ -0,0 +1,218 @@ +from __future__ import absolute_import, division, print_function, unicode_literals + + +def add_archs(archs): + global MODEL_ARCH + for x in archs: + assert x not in MODEL_ARCH, "Duplicated model name {} existed".format(x) + MODEL_ARCH[x] = archs[x] + + +MODEL_ARCH = { + "default": { + "block_op_type": [ + # stage 0 + ["ir_k3"], + # stage 1 + ["ir_k3"] * 2, + # stage 2 + ["ir_k3"] * 3, + # stage 3 + ["ir_k3"] * 7, + # stage 4, bbox head + ["ir_k3"] * 4, + # stage 5, rpn + ["ir_k3"] * 3, + # stage 5, mask head + ["ir_k3"] * 5, + ], + "block_cfg": { + "first": [32, 2], + "stages": [ + # [t, c, n, s] + # stage 0 + [[1, 16, 1, 1]], + # stage 1 + [[6, 24, 2, 2]], + # stage 2 + [[6, 32, 3, 2]], + # stage 3 + [[6, 64, 4, 2], [6, 96, 3, 1]], + # stage 4, bbox head + [[4, 160, 1, 2], [6, 160, 2, 1], [6, 240, 1, 1]], + # [[6, 
160, 3, 2], [6, 320, 1, 1]], + # stage 5, rpn head + [[6, 96, 3, 1]], + # stage 6, mask head + [[4, 160, 1, 1], [6, 160, 3, 1], [3, 80, 1, -2]], + ], + # [c, channel_scale] + "last": [0, 0.0], + "backbone": [0, 1, 2, 3], + "rpn": [5], + "bbox": [4], + "mask": [6], + }, + }, + "xirb16d_dsmask": { + "block_op_type": [ + # stage 0 + ["ir_k3"], + # stage 1 + ["ir_k3"] * 2, + # stage 2 + ["ir_k3"] * 3, + # stage 3 + ["ir_k3"] * 7, + # stage 4, bbox head + ["ir_k3"] * 4, + # stage 5, mask head + ["ir_k3"] * 5, + # stage 6, rpn + ["ir_k3"] * 3, + ], + "block_cfg": { + "first": [16, 2], + "stages": [ + # [t, c, n, s] + # stage 0 + [[1, 16, 1, 1]], + # stage 1 + [[6, 32, 2, 2]], + # stage 2 + [[6, 48, 3, 2]], + # stage 3 + [[6, 96, 4, 2], [6, 128, 3, 1]], + # stage 4, bbox head + [[4, 128, 1, 2], [6, 128, 2, 1], [6, 160, 1, 1]], + # stage 5, mask head + [[4, 128, 1, 2], [6, 128, 2, 1], [6, 128, 1, -2], [3, 64, 1, -2]], + # stage 6, rpn head + [[6, 128, 3, 1]], + ], + # [c, channel_scale] + "last": [0, 0.0], + "backbone": [0, 1, 2, 3], + "rpn": [6], + "bbox": [4], + "mask": [5], + }, + }, + "mobilenet_v2": { + "block_op_type": [ + # stage 0 + ["ir_k3"], + # stage 1 + ["ir_k3"] * 2, + # stage 2 + ["ir_k3"] * 3, + # stage 3 + ["ir_k3"] * 7, + # stage 4 + ["ir_k3"] * 4, + ], + "block_cfg": { + "first": [32, 2], + "stages": [ + # [t, c, n, s] + # stage 0 + [[1, 16, 1, 1]], + # stage 1 + [[6, 24, 2, 2]], + # stage 2 + [[6, 32, 3, 2]], + # stage 3 + [[6, 64, 4, 2], [6, 96, 3, 1]], + # stage 4 + [[6, 160, 3, 1], [6, 320, 1, 1]], + ], + # [c, channel_scale] + "last": [0, 0.0], + "backbone": [0, 1, 2, 3], + "bbox": [4], + }, + }, +} + + +MODEL_ARCH_CHAM = { + "cham_v1a": { + "block_op_type": [ + # stage 0 + ["ir_k3"], + # stage 1 + ["ir_k7"] * 2, + # stage 2 + ["ir_k3"] * 5, + # stage 3 + ["ir_k5"] * 7 + ["ir_k3"] * 5, + # stage 4, bbox head + ["ir_k3"] * 5, + # stage 5, rpn + ["ir_k3"] * 3, + ], + "block_cfg": { + "first": [32, 2], + "stages": [ + # [t, c, n, s] + # stage 0 + [[1, 24, 1, 1]], + # stage 1 + [[4, 48, 2, 2]], + # stage 2 + [[7, 64, 5, 2]], + # stage 3 + [[12, 56, 7, 2], [8, 88, 5, 1]], + # stage 4, bbox head + [[7, 152, 4, 2], [10, 104, 1, 1]], + # stage 5, rpn head + [[8, 88, 3, 1]], + ], + # [c, channel_scale] + "last": [0, 0.0], + "backbone": [0, 1, 2, 3], + "rpn": [5], + "bbox": [4], + }, + }, + "cham_v2": { + "block_op_type": [ + # stage 0 + ["ir_k3"], + # stage 1 + ["ir_k5"] * 4, + # stage 2 + ["ir_k7"] * 6, + # stage 3 + ["ir_k5"] * 3 + ["ir_k3"] * 6, + # stage 4, bbox head + ["ir_k3"] * 7, + # stage 5, rpn + ["ir_k3"] * 1, + ], + "block_cfg": { + "first": [32, 2], + "stages": [ + # [t, c, n, s] + # stage 0 + [[1, 24, 1, 1]], + # stage 1 + [[8, 32, 4, 2]], + # stage 2 + [[5, 48, 6, 2]], + # stage 3 + [[9, 56, 3, 2], [6, 56, 6, 1]], + # stage 4, bbox head + [[2, 160, 6, 2], [6, 112, 1, 1]], + # stage 5, rpn head + [[6, 56, 1, 1]], + ], + # [c, channel_scale] + "last": [0, 0.0], + "backbone": [0, 1, 2, 3], + "rpn": [5], + "bbox": [4], + }, + }, +} +add_archs(MODEL_ARCH_CHAM) diff --git a/maskrcnn_benchmark/modeling/backbone/fpn.py b/maskrcnn_benchmark/modeling/backbone/fpn.py new file mode 100644 index 0000000000000000000000000000000000000000..abd171776d8e10f4ac657303f5d1bfad624569dd --- /dev/null +++ b/maskrcnn_benchmark/modeling/backbone/fpn.py @@ -0,0 +1,98 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 
+import torch +import torch.nn.functional as F +from torch import nn + + +class FPN(nn.Module): + """ + Module that adds FPN on top of a list of feature maps. + The feature maps are currently supposed to be in increasing depth order, and must be consecutive + """ + + def __init__( + self, in_channels_list, out_channels, conv_block, top_blocks=None + ): + """ + Arguments: + in_channels_list (list[int]): number of channels for each feature map that + will be fed + out_channels (int): number of channels of the FPN representation + top_blocks (nn.Module or None): if provided, an extra operation will + be performed on the output of the last (smallest resolution) + FPN output, and the result will extend the result list + """ + super(FPN, self).__init__() + self.inner_blocks = [] + self.layer_blocks = [] + for idx, in_channels in enumerate(in_channels_list, 1): + inner_block = "fpn_inner{}".format(idx) + layer_block = "fpn_layer{}".format(idx) + + if in_channels == 0: + continue + inner_block_module = conv_block(in_channels, out_channels, 1) + layer_block_module = conv_block(out_channels, out_channels, 3, 1) + self.add_module(inner_block, inner_block_module) + self.add_module(layer_block, layer_block_module) + self.inner_blocks.append(inner_block) + self.layer_blocks.append(layer_block) + self.top_blocks = top_blocks + + def forward(self, x): + """ + Arguments: + x (list[Tensor]): feature maps for each feature level. + Returns: + results (tuple[Tensor]): feature maps after FPN layers. + They are ordered from highest resolution first. + """ + last_inner = getattr(self, self.inner_blocks[-1])(x[-1]) + results = [] + results.append(getattr(self, self.layer_blocks[-1])(last_inner)) + for feature, inner_block, layer_block in zip( + x[:-1][::-1], self.inner_blocks[:-1][::-1], self.layer_blocks[:-1][::-1] + ): + if not inner_block: + continue + inner_top_down = F.interpolate(last_inner, scale_factor=2, mode="nearest") + inner_lateral = getattr(self, inner_block)(feature) + # TODO use size instead of scale to make it robust to different sizes + # inner_top_down = F.upsample(last_inner, size=inner_lateral.shape[-2:], + # mode='bilinear', align_corners=False) + last_inner = inner_lateral + inner_top_down + results.insert(0, getattr(self, layer_block)(last_inner)) + + if isinstance(self.top_blocks, LastLevelP6P7): + last_results = self.top_blocks(x[-1], results[-1]) + results.extend(last_results) + elif isinstance(self.top_blocks, LastLevelMaxPool): + last_results = self.top_blocks(results[-1]) + results.extend(last_results) + + return tuple(results) + + +class LastLevelMaxPool(nn.Module): + def forward(self, x): + return [F.max_pool2d(x, 1, 2, 0)] + + +class LastLevelP6P7(nn.Module): + """ + This module is used in RetinaNet to generate extra layers, P6 and P7. 
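+    P6 is produced from C5 (or from P5 when in_channels == out_channels) by a
+    stride-2 3x3 conv, and P7 from ReLU(P6) by another stride-2 3x3 conv.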
+ """ + def __init__(self, in_channels, out_channels): + super(LastLevelP6P7, self).__init__() + self.p6 = nn.Conv2d(in_channels, out_channels, 3, 2, 1) + self.p7 = nn.Conv2d(out_channels, out_channels, 3, 2, 1) + for module in [self.p6, self.p7]: + nn.init.kaiming_uniform_(module.weight, a=1) + nn.init.constant_(module.bias, 0) + self.use_P5 = in_channels == out_channels + + def forward(self, c5, p5): + x = p5 if self.use_P5 else c5 + p6 = self.p6(x) + p7 = self.p7(F.relu(p6)) + return [p6, p7] diff --git a/maskrcnn_benchmark/modeling/backbone/msr.py b/maskrcnn_benchmark/modeling/backbone/msr.py new file mode 100644 index 0000000000000000000000000000000000000000..3ed5a66cd1a7a926d6554482c47296dd617d9e2f --- /dev/null +++ b/maskrcnn_benchmark/modeling/backbone/msr.py @@ -0,0 +1,65 @@ +import torch +from torch import nn +from torch.nn import functional as F + + +class ConcatUpConv(nn.Module): + def __init__(self, inplanes, outplanes, upsample=True): + super(ConcatUpConv, self).__init__() + out_channels = outplanes + self.upsample = upsample + self.con_1x1 = nn.Conv2d(inplanes, outplanes, 1, bias=False) + nn.init.kaiming_uniform_(self.con_1x1.weight, a=1) + self.nor_1 = nn.BatchNorm2d(out_channels) + self.leakyrelu_1 = nn.ReLU() + if self.upsample: + self.con_3x3 = nn.Conv2d(outplanes, out_channels // 2, + kernel_size=3, stride=1, padding=1, bias=False) + nn.init.kaiming_uniform_(self.con_3x3.weight, a=1) + self.nor_3 = nn.BatchNorm2d(out_channels // 2) + self.leakyrelu_3 = nn.ReLU() + + def forward(self, x1, x2): + fusion = torch.cat([x1, x2], dim=1) + out_1 = self.leakyrelu_1(self.nor_1(self.con_1x1(fusion))) + out = None + if self.upsample: + out = self.leakyrelu_3(self.nor_3(self.con_3x3(out_1))) + out = F.interpolate(out, scale_factor=2, mode='bilinear', align_corners=False) + return out, out_1 + + +class MSR(nn.Module): + def __init__(self, body, channels, fpn=None, pan=None): + super(MSR, self).__init__() + self.body = body + cucs = nn.ModuleList() + channel = channels[0] + cucs.append(ConcatUpConv(channel * 2, channel, upsample=False)) + for i, channel in enumerate(channels[1:]): + cucs.append(ConcatUpConv(channel * 2, channel)) + self.cucs = cucs + if fpn is not None: + self.fpn = fpn + if pan is not None: + self.pan = pan + + def forward(self, x): + outputs = self.body(x) + + re_x = F.interpolate(x, scale_factor=0.5, + mode='bilinear', align_corners=False) + output_re = self.body(re_x)[-1] + low = F.interpolate(output_re, + size=outputs[-1].shape[2:], + mode='bilinear', align_corners=False) + new_outputs = [] + for cuc, high in zip(self.cucs[::-1], outputs[::-1]): + low, out = cuc(high, low) + new_outputs.append(out) + outs = new_outputs[::-1] + if hasattr(self, 'pan'): + outs = self.pan(outs) + if hasattr(self, 'fpn'): + outs = self.fpn(outs) + return outs diff --git a/maskrcnn_benchmark/modeling/backbone/pan.py b/maskrcnn_benchmark/modeling/backbone/pan.py new file mode 100644 index 0000000000000000000000000000000000000000..e9703e271b3987ff380e5222232592678cafef61 --- /dev/null +++ b/maskrcnn_benchmark/modeling/backbone/pan.py @@ -0,0 +1,177 @@ +import torch.nn as nn +import torch.nn.functional as F + + +class FPA(nn.Module): + def __init__(self, channels=2048): + """ + Feature Pyramid Attention + :type channels: int + """ + super(FPA, self).__init__() + channels_mid = int(channels / 4) + + self.channels_cond = channels + + # Master branch + self.conv_master = nn.Conv2d(self.channels_cond, channels, kernel_size=1, bias=False) + self.bn_master = nn.BatchNorm2d(channels) + + # 
Global pooling branch + self.conv_gpb = nn.Conv2d(self.channels_cond, channels, kernel_size=1, bias=False) + #self.bn_gpb = nn.BatchNorm2d(channels) + + # C333 because of the shape of last feature maps is (16, 16). + self.conv7x7_1 = nn.Conv2d(self.channels_cond, channels_mid, kernel_size=(7, 7), stride=2, padding=3, bias=False) + self.bn1_1 = nn.BatchNorm2d(channels_mid) + self.conv5x5_1 = nn.Conv2d(channels_mid, channels_mid, kernel_size=(5, 5), stride=2, padding=2, bias=False) + self.bn2_1 = nn.BatchNorm2d(channels_mid) + self.conv3x3_1 = nn.Conv2d(channels_mid, channels_mid, kernel_size=(3, 3), stride=2, padding=1, bias=False) + self.bn3_1 = nn.BatchNorm2d(channels_mid) + + self.conv7x7_2 = nn.Conv2d(channels_mid, channels_mid, kernel_size=(7, 7), stride=1, padding=3, bias=False) + self.bn1_2 = nn.BatchNorm2d(channels_mid) + self.conv5x5_2 = nn.Conv2d(channels_mid, channels_mid, kernel_size=(5, 5), stride=1, padding=2, bias=False) + self.bn2_2 = nn.BatchNorm2d(channels_mid) + self.conv3x3_2 = nn.Conv2d(channels_mid, channels_mid, kernel_size=(3, 3), stride=1, padding=1, bias=False) + self.bn3_2 = nn.BatchNorm2d(channels_mid) + + self.bn_upsample_1 = nn.BatchNorm2d(channels) + self.conv1x1_up1 = nn.Conv2d(channels_mid, channels, kernel_size=(1, 1), stride=1, padding=0, bias=False) + + self.relu = nn.ReLU(inplace=True) + + def forward(self, x): + """ + :param x: Shape: [b, 2048, h, w] + :return: out: Feature maps. Shape: [b, 2048, h, w] + """ + # Master branch + x_master = self.conv_master(x) + x_master = self.bn_master(x_master) + + # Global pooling branch + x_gpb = nn.AvgPool2d(x.shape[2:])(x).view(x.shape[0], self.channels_cond, 1, 1) + x_gpb = self.conv_gpb(x_gpb) + #x_gpb = self.bn_gpb(x_gpb) + + # Branch 1 + x1_1 = self.conv7x7_1(x) + x1_1 = self.bn1_1(x1_1) + x1_1 = self.relu(x1_1) + x1_2 = self.conv7x7_2(x1_1) + x1_2 = self.bn1_2(x1_2) + + # Branch 2 + x2_1 = self.conv5x5_1(x1_1) + x2_1 = self.bn2_1(x2_1) + x2_1 = self.relu(x2_1) + x2_2 = self.conv5x5_2(x2_1) + x2_2 = self.bn2_2(x2_2) + + # Branch 3 + x3_1 = self.conv3x3_1(x2_1) + x3_1 = self.bn3_1(x3_1) + x3_1 = self.relu(x3_1) + x3_2 = self.conv3x3_2(x3_1) + x3_2 = self.bn3_2(x3_2) + + # Merge branch 1 and 2 + x3_upsample = F.upsample(x3_2, size=x2_2.shape[-2:], + mode='bilinear', align_corners=False) + + x2_merge = self.relu(x2_2 + x3_upsample) + + x2_upsample = F.upsample(x2_merge, size=x1_2.shape[-2:], + mode='bilinear', align_corners=False) + x1_merge = self.relu(x1_2 + x2_upsample) + + x1_merge_upsample = F.upsample(x1_merge, size=x_master.shape[-2:], + mode='bilinear', align_corners=False) + x1_merge_upsample_ch = self.relu(self.bn_upsample_1(self.conv1x1_up1(x1_merge_upsample))) + x_master = x_master * x1_merge_upsample_ch + # + out = self.relu(x_master + x_gpb) + + return out + + +class GAU(nn.Module): + def __init__(self, channels_high, channels_low, upsample=True): + super(GAU, self).__init__() + # Global Attention Upsample + self.upsample = upsample + self.conv3x3 = nn.Conv2d(channels_low, channels_low, kernel_size=3, padding=1, bias=False) + self.bn_low = nn.BatchNorm2d(channels_low) + + self.conv1x1 = nn.Conv2d(channels_high, channels_low, kernel_size=1, padding=0, bias=False) + #self.bn_high = nn.BatchNorm2d(channels_low) + + if upsample: + self.conv_upsample = nn.ConvTranspose2d(channels_high, channels_low, kernel_size=4, stride=2, padding=1, bias=False) + self.bn_upsample = nn.BatchNorm2d(channels_low) + else: + self.conv_reduction = nn.Conv2d(channels_high, channels_low, kernel_size=1, padding=0, bias=False) + 
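+            # no upsampling: a 1x1 conv (followed by BN) only reduces the high-level
+            # features from channels_high to channels_low before the fusion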
self.bn_reduction = nn.BatchNorm2d(channels_low) + self.relu = nn.ReLU(inplace=True) + + def forward(self, fms_high, fms_low, fm_mask=None): + """ + Use the high level features with abundant catagory information to weight the low level features with pixel + localization information. In the meantime, we further use mask feature maps with catagory-specific information + to localize the mask position. + :param fms_high: Features of high level. Tensor. + :param fms_low: Features of low level. Tensor. + :param fm_mask: + :return: fms_att_upsample + """ + b, c, h, w = fms_high.shape + + fms_high_gp = nn.AvgPool2d(fms_high.shape[2:])(fms_high).view(len(fms_high), c, 1, 1) + fms_high_gp = self.conv1x1(fms_high_gp) + # fms_high_gp = self.bn_high(fms_high_gp)# arlog, when the spatial size HxW = 1x1, the BN cannot be used. + fms_high_gp = self.relu(fms_high_gp) + + # fms_low_mask = torch.cat([fms_low, fm_mask], dim=1) + fms_low_mask = self.conv3x3(fms_low) + fms_low_mask = self.bn_low(fms_low_mask) + + fms_att = fms_low_mask * fms_high_gp + if self.upsample: + out = self.relu( + self.bn_upsample(self.conv_upsample(fms_high)) + fms_att) + else: + out = self.relu( + self.bn_reduction(self.conv_reduction(fms_high)) + fms_att) + return out + + +class PAN(nn.Module): + def __init__(self): + """ + :param blocks: Blocks of the network with reverse sequential. + """ + super(PAN, self).__init__() + channels_blocks = [2048, 1024, 512, 256] + + self.fpa = FPA(channels=channels_blocks[0]) + + self.gau_block1 = GAU(channels_blocks[0], channels_blocks[1]) + self.gau_block2 = GAU(channels_blocks[1], channels_blocks[2]) + self.gau_block3 = GAU(channels_blocks[2], channels_blocks[3]) + self.gau = [self.gau_block1, self.gau_block2, self.gau_block3] + + def forward(self, fms): + """ + :param fms: Feature maps of forward propagation in the network with reverse sequential. shape:[b, c, h, w] + :return: fm_high. [b, 256, h, w] + """ + feats = [] + for i, fm_low in enumerate(fms[::-1]): + if i == 0: + fm_high = self.fpa(fm_low) + else: + fm_high = self.gau[int(i-1)](fm_high, fm_low) + feats.append(fm_high) + feats.reverse() + return tuple(feats) diff --git a/maskrcnn_benchmark/modeling/backbone/resnet.py b/maskrcnn_benchmark/modeling/backbone/resnet.py new file mode 100644 index 0000000000000000000000000000000000000000..39adf1520463abcf5778a674c7e4d5fb3dc0163d --- /dev/null +++ b/maskrcnn_benchmark/modeling/backbone/resnet.py @@ -0,0 +1,498 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +""" +Variant of the resnet module that takes cfg as an argument. +Example usage. Strings may be specified in the config file. + model = ResNet( + "StemWithFixedBatchNorm", + "BottleneckWithFixedBatchNorm", + "ResNet50StagesTo4", + ) +OR: + model = ResNet( + "StemWithGN", + "BottleneckWithGN", + "ResNet50StagesTo4", + ) +Custom implementations may be written in user code and hooked in via the +`register_*` functions. +""" +from collections import namedtuple + +import torch +import torch.nn.functional as F +from torch import nn + +from maskrcnn_benchmark.layers import FrozenBatchNorm2d +from maskrcnn_benchmark.layers import Conv2d +from maskrcnn_benchmark.modeling.make_layers import group_norm +from maskrcnn_benchmark.layers import DCN +from maskrcnn_benchmark.utils.registry import Registry + + +# ResNet stage specification +StageSpec = namedtuple( + "StageSpec", + [ + "index", # Index of the stage, eg 1, 2, ..,. 
5 + "block_count", # Number of residual blocks in the stage + "return_features", # True => return the last feature map from this stage + ], +) + +# ----------------------------------------------------------------------------- +# Standard ResNet models +# ----------------------------------------------------------------------------- +# ResNet-50 (including all stages) +ResNet50StagesTo5 = tuple( + StageSpec(index=i, block_count=c, return_features=r) + for (i, c, r) in ((1, 3, False), (2, 4, False), (3, 6, False), (4, 3, True)) +) +# ResNet-50 up to stage 4 (excludes stage 5) +ResNet50StagesTo4 = tuple( + StageSpec(index=i, block_count=c, return_features=r) + for (i, c, r) in ((1, 3, False), (2, 4, False), (3, 6, True)) +) +# ResNet-101 (including all stages) +ResNet101StagesTo5 = tuple( + StageSpec(index=i, block_count=c, return_features=r) + for (i, c, r) in ((1, 3, False), (2, 4, False), (3, 23, False), (4, 3, True)) +) +# ResNet-101 up to stage 4 (excludes stage 5) +ResNet101StagesTo4 = tuple( + StageSpec(index=i, block_count=c, return_features=r) + for (i, c, r) in ((1, 3, False), (2, 4, False), (3, 23, True)) +) +# ResNet-50-FPN (including all stages) +ResNet50FPNStagesTo5 = tuple( + StageSpec(index=i, block_count=c, return_features=r) + for (i, c, r) in ((1, 3, True), (2, 4, True), (3, 6, True), (4, 3, True)) +) +# ResNet-101-FPN (including all stages) +ResNet101FPNStagesTo5 = tuple( + StageSpec(index=i, block_count=c, return_features=r) + for (i, c, r) in ((1, 3, True), (2, 4, True), (3, 23, True), (4, 3, True)) +) +# ResNet-152-FPN (including all stages) +ResNet152FPNStagesTo5 = tuple( + StageSpec(index=i, block_count=c, return_features=r) + for (i, c, r) in ((1, 3, True), (2, 8, True), (3, 36, True), (4, 3, True)) +) + +class ResNet(nn.Module): + def __init__(self, cfg): + super(ResNet, self).__init__() + + # If we want to use the cfg in forward(), then we should make a copy + # of it and store it for later use: + # self.cfg = cfg.clone() + + # Translate string names to implementations + stem_module = _STEM_MODULES[cfg.MODEL.RESNETS.STEM_FUNC] + stage_specs = _STAGE_SPECS[cfg.MODEL.BACKBONE.CONV_BODY] + transformation_module = _TRANSFORMATION_MODULES[cfg.MODEL.RESNETS.TRANS_FUNC] + deformable_module = _TRANSFORMATION_MODULES[cfg.MODEL.RESNETS.DEF_FUNC] + start_module = cfg.MODEL.RESNETS.DEF_START_MODULE + _DEF_IDX = {"C3": 1, "C4": 2, "C5": 3} + if start_module in _DEF_IDX: + start_idx = _DEF_IDX[start_module] + else: + start_idx = 65535 + + # Construct the stem module + self.stem = stem_module(cfg) + + # Constuct the specified ResNet stages + num_groups = cfg.MODEL.RESNETS.NUM_GROUPS + width_per_group = cfg.MODEL.RESNETS.WIDTH_PER_GROUP + in_channels = cfg.MODEL.RESNETS.STEM_OUT_CHANNELS + stage2_bottleneck_channels = num_groups * width_per_group + stage2_out_channels = cfg.MODEL.RESNETS.RES2_OUT_CHANNELS + self.stages = [] + self.return_features = {} + for i, stage_spec in enumerate(stage_specs): + name = "layer" + str(stage_spec.index) + stage2_relative_factor = 2 ** (stage_spec.index - 1) + bottleneck_channels = stage2_bottleneck_channels * stage2_relative_factor + out_channels = stage2_out_channels * stage2_relative_factor + if i >= start_idx: + trans_mod = deformable_module + else: + trans_mod = transformation_module + module = _make_stage( + trans_mod, + in_channels, + bottleneck_channels, + out_channels, + stage_spec.block_count, + num_groups, + cfg.MODEL.RESNETS.STRIDE_IN_1X1, + first_stride=int(stage_spec.index > 1) + 1, + ) + in_channels = out_channels + 
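+            # e.g. with the usual R-50 defaults (WIDTH_PER_GROUP=64, RES2_OUT_CHANNELS=256),
+            # stage index 3 gives a relative factor of 2**(3-1)=4, i.e. bottleneck_channels=256 and out_channels=1024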
self.add_module(name, module) + self.stages.append(name) + self.return_features[name] = stage_spec.return_features + + # Optionally freeze (requires_grad=False) parts of the backbone + self._freeze_backbone(cfg.MODEL.BACKBONE.FREEZE_CONV_BODY_AT) + + def _freeze_backbone(self, freeze_at): + if freeze_at < 0: + return + for stage_index in range(freeze_at): + if stage_index == 0: + m = self.stem # stage 0 is the stem + else: + m = getattr(self, "layer" + str(stage_index)) + for p in m.parameters(): + p.requires_grad = False + + def forward(self, x): + outputs = [] + x = self.stem(x) + for stage_name in self.stages: + x = getattr(self, stage_name)(x) + if self.return_features[stage_name]: + outputs.append(x) + return outputs + + +class ResNetHead(nn.Module): + def __init__( + self, + block_module, + stages, + num_groups=1, + width_per_group=64, + stride_in_1x1=True, + stride_init=None, + res2_out_channels=256, + dilation=1 + ): + super(ResNetHead, self).__init__() + + stage2_relative_factor = 2 ** (stages[0].index - 1) + # print('stage2_relative_factor---',stage2_relative_factor) + + stage2_bottleneck_channels = num_groups * width_per_group + # print('stage2_bottleneck_channels---',stage2_bottleneck_channels) + + out_channels = res2_out_channels * stage2_relative_factor + # print('out_channels---',out_channels) + + in_channels = out_channels // 2 + # print('in_channels---',in_channels) + # + bottleneck_channels = stage2_bottleneck_channels * stage2_relative_factor + # print('bottleneck_channels---',bottleneck_channels) + + block_module = _TRANSFORMATION_MODULES[block_module] + # print('block_module---',block_module) + + + self.stages = [] + stride = stride_init + for stage in stages: + name = "layer" + str(stage.index) + if not stride: + stride = int(stage.index > 1) + 1 + # print('stride---', stride) + print('stage.block_count---', stage.block_count) + module = _make_stage( + block_module, + in_channels, + bottleneck_channels, + out_channels, + stage.block_count, + num_groups, + stride_in_1x1, + first_stride=stride, + dilation=dilation + ) + stride = None + self.add_module(name, module) + self.stages.append(name) + self.out_channels = out_channels + + def forward(self, x): + for stage in self.stages: + x = getattr(self, stage)(x) + print('x-----------',x.shape) + return x + + +def _make_stage( + transformation_module, + in_channels, + bottleneck_channels, + out_channels, + block_count, + num_groups, + stride_in_1x1, + first_stride, + dilation=1 +): + blocks = [] + stride = first_stride + for _ in range(block_count): + blocks.append( + transformation_module( + in_channels, + bottleneck_channels, + out_channels, + num_groups, + stride_in_1x1, + stride, + dilation=dilation + ) + ) + stride = 1 + in_channels = out_channels + return nn.Sequential(*blocks) + + +class Bottleneck(nn.Module): + def __init__( + self, + in_channels, + bottleneck_channels, + out_channels, + num_groups, + stride_in_1x1, + stride, + dilation, + norm_func, + conv_func=Conv2d + ): + super(Bottleneck, self).__init__() + + self.downsample = None + if in_channels != out_channels: + down_stride = stride if dilation == 1 else 1 + self.downsample = nn.Sequential( + conv_func( + in_channels, out_channels, + kernel_size=1, stride=down_stride, bias=False + ), + norm_func(out_channels), + ) + for modules in [self.downsample,]: + for l in modules.modules(): + if isinstance(l, Conv2d): + nn.init.kaiming_uniform_(l.weight, a=1) + + if dilation > 1: + stride = 1 # reset to be 1 + + # The original MSRA ResNet models have stride in the 
first 1x1 conv + # The subsequent fb.torch.resnet and Caffe2 ResNe[X]t implementations have + # stride in the 3x3 conv + stride_1x1, stride_3x3 = (stride, 1) if stride_in_1x1 else (1, stride) + + self.conv1 = conv_func( + in_channels, + bottleneck_channels, + kernel_size=1, + stride=stride_1x1, + bias=False, + ) + self.bn1 = norm_func(bottleneck_channels) + # TODO: specify init for the above + + self.conv2 = conv_func( + bottleneck_channels, + bottleneck_channels, + kernel_size=3, + stride=stride_3x3, + padding=dilation, + bias=False, + groups=num_groups, + dilation=dilation + ) + self.bn2 = norm_func(bottleneck_channels) + + self.conv3 = Conv2d( + bottleneck_channels, out_channels, kernel_size=1, bias=False + ) + self.bn3 = norm_func(out_channels) + + for l in [self.conv1, self.conv2, self.conv3,]: + nn.init.kaiming_uniform_(l.weight, a=1) + + def forward(self, x): + identity = x + + out = self.conv1(x) + out = self.bn1(out) + out = F.relu_(out) + + out = self.conv2(out) + out = self.bn2(out) + out = F.relu_(out) + + out0 = self.conv3(out) + out = self.bn3(out0) + + if self.downsample is not None: + identity = self.downsample(x) + + out += identity + out = F.relu_(out) + + return out + + +class BaseStem(nn.Module): + def __init__(self, cfg, norm_func): + super(BaseStem, self).__init__() + + out_channels = cfg.MODEL.RESNETS.STEM_OUT_CHANNELS + + self.conv1 = Conv2d( + 3, out_channels, kernel_size=7, stride=2, padding=3, bias=False + ) + self.bn1 = norm_func(out_channels) + + for l in [self.conv1,]: + nn.init.kaiming_uniform_(l.weight, a=1) + + def forward(self, x): + x = self.conv1(x) + x = self.bn1(x) + x = F.relu_(x) + x = F.max_pool2d(x, kernel_size=3, stride=2, padding=1) + return x + +############################################# + +class BottleneckWithFixedBatchNorm(Bottleneck): + def __init__( + self, + in_channels, + bottleneck_channels, + out_channels, + num_groups=1, + stride_in_1x1=True, + stride=1, + dilation=1 + ): + super(BottleneckWithFixedBatchNorm, self).__init__( + in_channels=in_channels, + bottleneck_channels=bottleneck_channels, + out_channels=out_channels, + num_groups=num_groups, + stride_in_1x1=stride_in_1x1, + stride=stride, + dilation=dilation, + norm_func=FrozenBatchNorm2d + ) + + +class DeformableConvWithFixedBatchNorm(Bottleneck): + def __init__( + self, + in_channels, + bottleneck_channels, + out_channels, + num_groups=1, + stride_in_1x1=True, + stride=1, + dilation=1 + ): + super(DeformableConvWithFixedBatchNorm, self).__init__( + in_channels=in_channels, + bottleneck_channels=bottleneck_channels, + out_channels=out_channels, + num_groups=num_groups, + stride_in_1x1=stride_in_1x1, + stride=stride, + dilation=dilation, + norm_func=FrozenBatchNorm2d, + conv_func=DCN + ) + + +class StemWithFixedBatchNorm(BaseStem): + def __init__(self, cfg): + super(StemWithFixedBatchNorm, self).__init__( + cfg, norm_func=FrozenBatchNorm2d + ) + + +class BottleneckWithGN(Bottleneck): + def __init__( + self, + in_channels, + bottleneck_channels, + out_channels, + num_groups=1, + stride_in_1x1=True, + stride=1, + dilation=1 + ): + super(BottleneckWithGN, self).__init__( + in_channels=in_channels, + bottleneck_channels=bottleneck_channels, + out_channels=out_channels, + num_groups=num_groups, + stride_in_1x1=stride_in_1x1, + stride=stride, + dilation=dilation, + norm_func=group_norm + ) + + +class DeformableConvWithGN(Bottleneck): + def __init__( + self, + in_channels, + bottleneck_channels, + out_channels, + num_groups=1, + stride_in_1x1=True, + stride=1, + dilation=1 + ): + 
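+        # same bottleneck as BottleneckWithGN, but conv_func=DCN makes the block's convolutions deformable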
super(DeformableConvWithGN, self).__init__( + in_channels=in_channels, + bottleneck_channels=bottleneck_channels, + out_channels=out_channels, + num_groups=num_groups, + stride_in_1x1=stride_in_1x1, + stride=stride, + dilation=dilation, + norm_func=group_norm, + conv_func=DCN + ) + + +class StemWithGN(BaseStem): + def __init__(self, cfg): + super(StemWithGN, self).__init__(cfg, norm_func=group_norm) + + +_TRANSFORMATION_MODULES = Registry({ + "BottleneckWithFixedBatchNorm": BottleneckWithFixedBatchNorm, + "BottleneckWithGN": BottleneckWithGN, + "DeformableConvWithFixedBatchNorm": DeformableConvWithFixedBatchNorm, + "DeformableConvWithGN": DeformableConvWithGN, +}) + +_STEM_MODULES = Registry({ + "StemWithFixedBatchNorm": StemWithFixedBatchNorm, + "StemWithGN": StemWithGN, +}) + +_STAGE_SPECS = Registry({ + "R-50-C4": ResNet50StagesTo4, + "R-50-C5": ResNet50StagesTo5, + "R-101-C4": ResNet101StagesTo4, + "R-101-C5": ResNet101StagesTo5, + "R-50-FPN": ResNet50FPNStagesTo5, + "R-50-FPN-RETINANET": ResNet50FPNStagesTo5, + "R-101-FPN": ResNet101FPNStagesTo5, + "R-101-PAN": ResNet101FPNStagesTo5, + "R-101-FPN-RETINANET": ResNet101FPNStagesTo5, + "R-152-FPN": ResNet152FPNStagesTo5, + "R-152-PAN": ResNet152FPNStagesTo5, +}) diff --git a/maskrcnn_benchmark/modeling/balanced_positive_negative_sampler.py b/maskrcnn_benchmark/modeling/balanced_positive_negative_sampler.py new file mode 100644 index 0000000000000000000000000000000000000000..c0bd00444d3b1bdefa1a4015e8e6af72166817cf --- /dev/null +++ b/maskrcnn_benchmark/modeling/balanced_positive_negative_sampler.py @@ -0,0 +1,68 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +import torch + + +class BalancedPositiveNegativeSampler(object): + """ + This class samples batches, ensuring that they contain a fixed proportion of positives + """ + + def __init__(self, batch_size_per_image, positive_fraction): + """ + Arguments: + batch_size_per_image (int): number of elements to be selected per image + positive_fraction (float): percentace of positive elements per batch + """ + self.batch_size_per_image = batch_size_per_image + self.positive_fraction = positive_fraction + + def __call__(self, matched_idxs): + """ + Arguments: + matched idxs: list of tensors containing -1, 0 or positive values. + Each tensor corresponds to a specific image. + -1 values are ignored, 0 are considered as negatives and > 0 as + positives. + + Returns: + pos_idx (list[tensor]) + neg_idx (list[tensor]) + + Returns two lists of binary masks for each image. + The first list contains the positive elements that were selected, + and the second list the negative example. 
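+            For example, with batch_size_per_image=512 and positive_fraction=0.25,
+            at most 128 positives are kept per image and the remaining slots are
+            filled with negatives.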
+ """ + pos_idx = [] + neg_idx = [] + for matched_idxs_per_image in matched_idxs: + positive = torch.nonzero(matched_idxs_per_image >= 1).squeeze(1) + negative = torch.nonzero(matched_idxs_per_image == 0).squeeze(1) + + num_pos = int(self.batch_size_per_image * self.positive_fraction) + # protect against not enough positive examples + num_pos = min(positive.numel(), num_pos) + num_neg = self.batch_size_per_image - num_pos + # protect against not enough negative examples + num_neg = min(negative.numel(), num_neg) + + # randomly select positive and negative examples + perm1 = torch.randperm(positive.numel(), device=positive.device)[:num_pos] + perm2 = torch.randperm(negative.numel(), device=negative.device)[:num_neg] + + pos_idx_per_image = positive[perm1] + neg_idx_per_image = negative[perm2] + + # create binary mask from indices + pos_idx_per_image_mask = torch.zeros_like( + matched_idxs_per_image, dtype=torch.uint8 + ) + neg_idx_per_image_mask = torch.zeros_like( + matched_idxs_per_image, dtype=torch.uint8 + ) + pos_idx_per_image_mask[pos_idx_per_image] = 1 + neg_idx_per_image_mask[neg_idx_per_image] = 1 + + pos_idx.append(pos_idx_per_image_mask) + neg_idx.append(neg_idx_per_image_mask) + + return pos_idx, neg_idx diff --git a/maskrcnn_benchmark/modeling/box_coder.py b/maskrcnn_benchmark/modeling/box_coder.py new file mode 100644 index 0000000000000000000000000000000000000000..5579503fa55c92b82690fe55dd9715447ab8f081 --- /dev/null +++ b/maskrcnn_benchmark/modeling/box_coder.py @@ -0,0 +1,193 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +import math + +import torch +import pandas as pd +from maskrcnn_benchmark.data.datasets.evaluation.word import io_ +class BoxCoder(object): + """ + This class encodes and decodes a set of bounding boxes into the representation used for training the regressors. + """ + + def __init__(self, weights, bbox_xform_clip=math.log(1000. 
/ 16)): + """ + Arguments: + weights (4-element tuple) + bbox_xform_clip (float) + """ + self.weights = weights + self.bbox_xform_clip = bbox_xform_clip + + def encode(self, reference_boxes, proposals): + """ + Encode a set of proposals with respect to some + reference boxes + + Arguments: + reference_boxes (Tensor): reference boxes + proposals (Tensor): boxes to be encoded + """ + TO_REMOVE = 1 # TODO remove + ex_widths = proposals[:, 2] - proposals[:, 0] + TO_REMOVE + ex_heights = proposals[:, 3] - proposals[:, 1] + TO_REMOVE + ex_ctr_x = proposals[:, 0] + 0.5 * ex_widths + ex_ctr_y = proposals[:, 1] + 0.5 * ex_heights + + gt_widths = reference_boxes[:, 2] - reference_boxes[:, 0] + TO_REMOVE + gt_heights = reference_boxes[:, 3] - reference_boxes[:, 1] + TO_REMOVE + gt_ctr_x = reference_boxes[:, 0] + 0.5 * gt_widths + gt_ctr_y = reference_boxes[:, 1] + 0.5 * gt_heights + + wx, wy, ww, wh = self.weights + targets_dx = wx * (gt_ctr_x - ex_ctr_x) / ex_widths + targets_dy = wy * (gt_ctr_y - ex_ctr_y) / ex_heights + targets_dw = ww * torch.log(gt_widths / ex_widths) + targets_dh = wh * torch.log(gt_heights / ex_heights) + + targets = torch.stack((targets_dx, targets_dy, targets_dw, targets_dh), dim=1) + return targets + + def encode_iou(self, reference_boxes, proposals): + """ + Encode a set of proposals with respect to some + reference boxes + + Arguments: + reference_boxes (Tensor): reference boxes + proposals (Tensor): boxes to be encoded + """ + TO_REMOVE = 1 # TODO remove + ex_widths = proposals[:, 2] - proposals[:, 0] + TO_REMOVE + ex_heights = proposals[:, 3] - proposals[:, 1] + TO_REMOVE + ex_ctr_x = proposals[:, 0] + 0.5 * ex_widths + ex_ctr_y = proposals[:, 1] + 0.5 * ex_heights + + gt_widths = reference_boxes[:, 2] - reference_boxes[:, 0] + TO_REMOVE + gt_heights = reference_boxes[:, 3] - reference_boxes[:, 1] + TO_REMOVE + gt_ctr_x = reference_boxes[:, 0] + 0.5 * gt_widths + gt_ctr_y = reference_boxes[:, 1] + 0.5 * gt_heights + + wx, wy, ww, wh = self.weights + targets_dx = wx * (gt_ctr_x - ex_ctr_x) / ex_widths + targets_dy = wy * (gt_ctr_y - ex_ctr_y) / ex_heights + targets_dw = ww * torch.log(gt_widths / ex_widths) + targets_dh = wh * torch.log(gt_heights / ex_heights) + + targets = torch.stack((targets_dx, targets_dy, targets_dw, targets_dh), dim=1) + return targets + + + def decode(self, rel_codes, boxes): + """ + From a set of original boxes and encoded relative box offsets, + get the decoded boxes. + + Arguments: + rel_codes (Tensor): encoded boxes # predict [2, 12000, 4] + boxes (Tensor): reference boxes. 
# anchor [2, 12000, 4] xmin0 ymin1 xmax2 ymax3 + """ + boxes = boxes.to(rel_codes.dtype) + + + TO_REMOVE = 1 # TODO remove + widths = boxes[:, 2] - boxes[:, 0] + TO_REMOVE + heights = boxes[:, 3] - boxes[:, 1] + TO_REMOVE + ctr_x = boxes[:, 0] + 0.5 * widths + ctr_y = boxes[:, 1] + 0.5 * heights + + wx, wy, ww, wh = self.weights + dx = rel_codes[:, 0::4] / wx + dy = rel_codes[:, 1::4] / wy + dw = rel_codes[:, 2::4] / ww + dh = rel_codes[:, 3::4] / wh + + dw = torch.clamp(dw, max=self.bbox_xform_clip) + dh = torch.clamp(dh, max=self.bbox_xform_clip) + + pred_ctr_x = dx * widths[:, None] + ctr_x[:, None] + pred_ctr_y = dy * heights[:, None] + ctr_y[:, None] + pred_w = torch.exp(dw) * widths[:, None] + pred_h = torch.exp(dh) * heights[:, None] + + ############################## + + pred_boxes = torch.zeros_like(rel_codes) + pred_boxes[:, 0::4] = pred_ctr_x - 0.5 * pred_w + pred_boxes[:, 1::4] = pred_ctr_y - 0.5 * pred_h + pred_boxes[:, 2::4] = pred_ctr_x + 0.5 * pred_w - 1 + pred_boxes[:, 3::4] = pred_ctr_y + 0.5 * pred_h - 1 + + return pred_boxes + + + def decode_iou(self, rel_codes, boxes, num_p = 8): + """ + From a set of original boxes and encoded relative box offsets, + get the decoded boxes. + + Arguments: + rel_codes (Tensor): encoded boxes # predict [2, 12000, 4] + boxes (Tensor): reference boxes. # anchor [2, 12000, 4] xmin0 ymin1 xmax2 ymax3 + """ + boxes = boxes.to(rel_codes.dtype) + + TO_REMOVE = 1 # TODO remove + widths = boxes[:, 2] - boxes[:, 0] + TO_REMOVE + heights = boxes[:, 3] - boxes[:, 1] + TO_REMOVE + + ctr_x = boxes[:, 0] + 0.5 * widths + ctr_y = boxes[:, 1] + 0.5 * heights + # 123 + # 8#4 + # 765 + if num_p == 8: # 8 boundary points + x_1 = boxes[:, 0] + widths * rel_codes[:, 0] + y_1 = boxes[:, 1] + heights * rel_codes[:, 1] + x_2 = ctr_x + widths * rel_codes[:, 2] + y_2 = boxes[:, 1] + heights * rel_codes[:, 3] + x_3 = boxes[:, 2] + widths * rel_codes[:, 4] + y_3 = boxes[:, 1] + heights * rel_codes[:, 5] + x_4 = boxes[:, 2] + widths * rel_codes[:, 6] + y_4 = ctr_y + heights * rel_codes[:, 7] + x_5 = boxes[:, 2] + widths * rel_codes[:, 8] + y_5 = boxes[:, 3] + heights * rel_codes[:, 9] + x_6 = ctr_x + widths * rel_codes[:, 10] + y_6 = boxes[:, 3] + heights * rel_codes[:, 11] + x_7 = boxes[:, 0] + widths * rel_codes[:, 12] + y_7 = boxes[:, 3] + heights * rel_codes[:, 13] + x_8 = boxes[:, 0] + widths * rel_codes[:, 14] + y_8 = ctr_y + heights * rel_codes[:, 15] + x_total = torch.stack([x_1, x_2, x_3, x_4, x_5, x_6, x_7, x_8], 0) + y_total = torch.stack([y_1, y_2, y_3, y_4, y_5, y_6, y_7, y_8], 0) + + x_min = torch.min(x_total, 0, keepdim=True) # [1, N] + x_max = torch.max(x_total, 0, keepdim=True) + + y_min = torch.min(y_total, 0, keepdim=True) + y_max = torch.max(y_total, 0, keepdim=True) + + N1, N2 = x_min[0].shape + x_min = x_min[0].view([N2]) + x_max = x_max[0].view([N2]) + y_min = y_min[0].view([N2]) + y_max = y_max[0].view([N2]) + + x_min = torch.stack([x_min, ctr_x], 0) + x_max = torch.stack([x_max, ctr_x], 0) + y_min = torch.stack([y_min, ctr_y], 0) + y_max = torch.stack([y_max, ctr_y], 0) + + x_min = torch.min(x_min, 0, keepdim=True) # [1, N] + x_max = torch.max(x_max, 0, keepdim=True) + y_min = torch.min(y_min, 0, keepdim=True) + y_max = torch.max(y_max, 0, keepdim=True) + + pred_boxes = torch.zeros_like(boxes) + + pred_boxes[:, 0] = x_min[0][0, :] + pred_boxes[:, 1] = y_min[0][0, :] + pred_boxes[:, 2] = x_max[0][0, :] + pred_boxes[:, 3] = y_max[0][0, :] + + + return pred_boxes diff --git a/maskrcnn_benchmark/modeling/detector/__init__.py 
b/maskrcnn_benchmark/modeling/detector/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..ff421e281e16e6623bab2551b242ea003d1f2166 --- /dev/null +++ b/maskrcnn_benchmark/modeling/detector/__init__.py @@ -0,0 +1,2 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +from .detectors import build_detection_model diff --git a/maskrcnn_benchmark/modeling/detector/__pycache__/__init__.cpython-37.pyc b/maskrcnn_benchmark/modeling/detector/__pycache__/__init__.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..674c0fcc2a09067ad812c6e7f1f2c295bb13d495 Binary files /dev/null and b/maskrcnn_benchmark/modeling/detector/__pycache__/__init__.cpython-37.pyc differ diff --git a/maskrcnn_benchmark/modeling/detector/__pycache__/detectors.cpython-37.pyc b/maskrcnn_benchmark/modeling/detector/__pycache__/detectors.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..525398659531677234501163fa548df7e21deb7e Binary files /dev/null and b/maskrcnn_benchmark/modeling/detector/__pycache__/detectors.cpython-37.pyc differ diff --git a/maskrcnn_benchmark/modeling/detector/__pycache__/generalized_rcnn.cpython-37.pyc b/maskrcnn_benchmark/modeling/detector/__pycache__/generalized_rcnn.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..5fa1e99e5a9a2c671b6d3b09474c1a3be7bc370a Binary files /dev/null and b/maskrcnn_benchmark/modeling/detector/__pycache__/generalized_rcnn.cpython-37.pyc differ diff --git a/maskrcnn_benchmark/modeling/detector/detectors.py b/maskrcnn_benchmark/modeling/detector/detectors.py new file mode 100644 index 0000000000000000000000000000000000000000..af2100cac15830cd60be5911aa15d0d7c9309a17 --- /dev/null +++ b/maskrcnn_benchmark/modeling/detector/detectors.py @@ -0,0 +1,10 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +from .generalized_rcnn import GeneralizedRCNN + + +_DETECTION_META_ARCHITECTURES = {"GeneralizedRCNN": GeneralizedRCNN} + + +def build_detection_model(cfg): + meta_arch = _DETECTION_META_ARCHITECTURES[cfg.MODEL.META_ARCHITECTURE] + return meta_arch(cfg) diff --git a/maskrcnn_benchmark/modeling/detector/generalized_rcnn.py b/maskrcnn_benchmark/modeling/detector/generalized_rcnn.py new file mode 100644 index 0000000000000000000000000000000000000000..6dfab58deee63d3483927a50f1a8b3a548119ee6 --- /dev/null +++ b/maskrcnn_benchmark/modeling/detector/generalized_rcnn.py @@ -0,0 +1,73 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +""" +Implements the Generalized R-CNN framework +""" + +import torch +from torch import nn + +from maskrcnn_benchmark.structures.image_list import to_image_list + +from ..backbone import build_backbone +from ..rpn.rpn import build_rpn +from ..roi_heads.roi_heads import build_roi_heads +import numpy as np +import cv2 + +class GeneralizedRCNN(nn.Module): + """ + Main class for Generalized R-CNN. Currently supports boxes and masks. + It consists of three main parts: + - backbone + - rpn + - heads: takes the features + the proposals from the RPN and computes + detections / masks from it. 
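+    During training, forward() returns a dict of losses from the RPN and the
+    ROI heads; at test time it returns the predicted BoxLists (or the raw RPN
+    proposals when no ROI heads are configured).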
+ """ + + def __init__(self, cfg): + super(GeneralizedRCNN, self).__init__() + + self.cfg = cfg.clone() + self.backbone = build_backbone(cfg) + self.rpn = build_rpn(cfg, self.backbone.out_channels) + self.roi_heads = build_roi_heads(cfg, self.backbone.out_channels) + + + def forward(self, images, targets=None): + """ + Arguments: + images (list[Tensor] or ImageList): images to be processed + targets (list[BoxList]): ground-truth boxes present in the image (optional) + + Returns: + result (list[BoxList] or dict[Tensor]): the output from the model. + During training, it returns a dict[Tensor] which contains the losses. + During testing, it returns list[BoxList] contains additional fields + like `scores`, `labels` and `mask` (for Mask R-CNN models). + + """ + if self.training and targets is None: + raise ValueError("In training mode, targets should be passed") + + + images = to_image_list(images) + + features = self.backbone(images.tensors) + proposals, proposal_losses = self.rpn(images, features, targets) + if self.roi_heads: + x, result, detector_losses = self.roi_heads(features, proposals, targets) + else: + #self.warm_start -= 1 + # RPN-only models don't have roi_heads + x = features + result = proposals + detector_losses = {} + + if self.training: + losses = {} + losses.update(detector_losses) + losses.update(proposal_losses) + + return losses + else: + return result diff --git a/maskrcnn_benchmark/modeling/make_layers.py b/maskrcnn_benchmark/modeling/make_layers.py new file mode 100644 index 0000000000000000000000000000000000000000..1656fb4f0ea4aeb65260f46beb80e8bd14fcc091 --- /dev/null +++ b/maskrcnn_benchmark/modeling/make_layers.py @@ -0,0 +1,126 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +""" +Miscellaneous utility functions +""" + +import torch +from torch import nn +from torch.nn import functional as F +from maskrcnn_benchmark.config import cfg +from maskrcnn_benchmark.layers import Conv2d, DCN +from maskrcnn_benchmark.modeling.poolers import Pooler + + +def get_group_gn(dim, dim_per_gp, num_groups): + """get number of groups used by GroupNorm, based on number of channels.""" + assert dim_per_gp == -1 or num_groups == -1, \ + "GroupNorm: can only specify G or C/G." 
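+    # e.g. dim=256 with num_groups=32 (dim_per_gp=-1) -> 32 groups;
+    #      dim=256 with dim_per_gp=8 (num_groups=-1)  -> 256 // 8 = 32 groups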
+ + if dim_per_gp > 0: + assert dim % dim_per_gp == 0, \ + "dim: {}, dim_per_gp: {}".format(dim, dim_per_gp) + group_gn = dim // dim_per_gp + else: + assert dim % num_groups == 0, \ + "dim: {}, num_groups: {}".format(dim, num_groups) + group_gn = num_groups + + return group_gn + + +def group_norm(out_channels, affine=True, divisor=1): + out_channels = out_channels // divisor + dim_per_gp = cfg.MODEL.GROUP_NORM.DIM_PER_GP // divisor + num_groups = cfg.MODEL.GROUP_NORM.NUM_GROUPS // divisor + eps = cfg.MODEL.GROUP_NORM.EPSILON # default: 1e-5 + return torch.nn.GroupNorm( + get_group_gn(out_channels, dim_per_gp, num_groups), + out_channels, + eps, + affine + ) + + +def make_conv3x3( + in_channels, + out_channels, + dilation=1, + stride=1, + use_gn=False, + use_relu=False, + kaiming_init=True +): + conv = Conv2d( + in_channels, + out_channels, + kernel_size=3, + stride=stride, + padding=dilation, + dilation=dilation, + bias=False if use_gn else True + ) + if kaiming_init: + nn.init.kaiming_normal_( + conv.weight, mode="fan_out", nonlinearity="relu" + ) + else: + torch.nn.init.normal_(conv.weight, std=0.01) + if not use_gn: + nn.init.constant_(conv.bias, 0) + module = [conv,] + if use_gn: + module.append(group_norm(out_channels)) + if use_relu: + module.append(nn.ReLU(inplace=True)) + if len(module) > 1: + return nn.Sequential(*module) + return conv + + +def make_fc(dim_in, hidden_dim, use_gn=False): + ''' + Caffe2 implementation uses XavierFill, which in fact + corresponds to kaiming_uniform_ in PyTorch + ''' + if use_gn: + fc = nn.Linear(dim_in, hidden_dim, bias=False) + nn.init.kaiming_uniform_(fc.weight, a=1) + return nn.Sequential(fc, group_norm(hidden_dim)) + fc = nn.Linear(dim_in, hidden_dim) + nn.init.kaiming_uniform_(fc.weight, a=1) + nn.init.constant_(fc.bias, 0) + return fc + + +def conv_with_kaiming_uniform(use_gn=False, use_relu=False, use_deformable=False): + def make_conv( + in_channels, out_channels, kernel_size, stride=1, dilation=1 + ): + if use_deformable: + conv_func = DCN + else: + conv_func = Conv2d + conv = conv_func( + in_channels, + out_channels, + kernel_size=kernel_size, + stride=stride, + padding=dilation * (kernel_size - 1) // 2, + dilation=dilation, + bias=False if use_gn else True + ) + # Caffe2 implementation uses XavierFill, which in fact + # corresponds to kaiming_uniform_ in PyTorch + nn.init.kaiming_uniform_(conv.weight, a=1) + if not use_gn: + nn.init.constant_(conv.bias, 0) + module = [conv,] + if use_gn: + module.append(group_norm(out_channels)) + if use_relu: + module.append(nn.ReLU(inplace=True)) + if len(module) > 1: + return nn.Sequential(*module) + return conv + + return make_conv diff --git a/maskrcnn_benchmark/modeling/matcher.py b/maskrcnn_benchmark/modeling/matcher.py new file mode 100644 index 0000000000000000000000000000000000000000..35ec5f1fe819526055c10607f05d47ac88277de6 --- /dev/null +++ b/maskrcnn_benchmark/modeling/matcher.py @@ -0,0 +1,112 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +import torch + + +class Matcher(object): + """ + This class assigns to each predicted "element" (e.g., a box) a ground-truth + element. Each predicted element will have exactly zero or one matches; each + ground-truth element may be assigned to zero or more predicted elements. + + Matching is based on the MxN match_quality_matrix, that characterizes how well + each (ground-truth, predicted)-pair match. For example, if the elements are + boxes, the matrix may contain box IoU overlap values. 
+ + The matcher returns a tensor of size N containing the index of the ground-truth + element m that matches to prediction n. If there is no match, a negative value + is returned. + """ + + BELOW_LOW_THRESHOLD = -1 + BETWEEN_THRESHOLDS = -2 + + def __init__(self, high_threshold, low_threshold, allow_low_quality_matches=False): + """ + Args: + high_threshold (float): quality values greater than or equal to + this value are candidate matches. + low_threshold (float): a lower quality threshold used to stratify + matches into three levels: + 1) matches >= high_threshold + 2) BETWEEN_THRESHOLDS matches in [low_threshold, high_threshold) + 3) BELOW_LOW_THRESHOLD matches in [0, low_threshold) + allow_low_quality_matches (bool): if True, produce additional matches + for predictions that have only low-quality match candidates. See + set_low_quality_matches_ for more details. + """ + assert low_threshold <= high_threshold + self.high_threshold = high_threshold + self.low_threshold = low_threshold + self.allow_low_quality_matches = allow_low_quality_matches + + def __call__(self, match_quality_matrix): + """ + Args: + match_quality_matrix (Tensor[float]): an MxN tensor, containing the + pairwise quality between M ground-truth elements and N predicted elements. + + Returns: + matches (Tensor[int64]): an N tensor where N[i] is a matched gt in + [0, M - 1] or a negative value indicating that prediction i could not + be matched. + """ + if match_quality_matrix.numel() == 0: + # empty targets or proposals not supported during training + if match_quality_matrix.shape[0] == 0: + raise ValueError( + "No ground-truth boxes available for one of the images " + "during training") + else: + raise ValueError( + "No proposal boxes available for one of the images " + "during training") + + # match_quality_matrix is M (gt) x N (predicted) + # Max over gt elements (dim 0) to find best gt candidate for each prediction + matched_vals, matches = match_quality_matrix.max(dim=0) + if self.allow_low_quality_matches: + all_matches = matches.clone() + + # Assign candidate matches with low quality to negative (unassigned) values + below_low_threshold = matched_vals < self.low_threshold + between_thresholds = (matched_vals >= self.low_threshold) & ( + matched_vals < self.high_threshold + ) + matches[below_low_threshold] = Matcher.BELOW_LOW_THRESHOLD + matches[between_thresholds] = Matcher.BETWEEN_THRESHOLDS + + if self.allow_low_quality_matches: + self.set_low_quality_matches_(matches, all_matches, match_quality_matrix) + + return matches + + def set_low_quality_matches_(self, matches, all_matches, match_quality_matrix): + """ + Produce additional matches for predictions that have only low-quality matches. + Specifically, for each ground-truth find the set of predictions that have + maximum overlap with it (including ties); for each prediction in that set, if + it is unmatched, then match it to the ground-truth with which it has the highest + quality value. 
+ """ + # For each gt, find the prediction with which it has highest quality + highest_quality_foreach_gt, _ = match_quality_matrix.max(dim=1) + # Find highest quality match available, even if it is low, including ties + gt_pred_pairs_of_highest_quality = torch.nonzero( + match_quality_matrix == highest_quality_foreach_gt[:, None] + ) + # Example gt_pred_pairs_of_highest_quality: + # tensor([[ 0, 39796], + # [ 1, 32055], + # [ 1, 32070], + # [ 2, 39190], + # [ 2, 40255], + # [ 3, 40390], + # [ 3, 41455], + # [ 4, 45470], + # [ 5, 45325], + # [ 5, 46390]]) + # Each row is a (gt index, prediction index) + # Note how gt items 1, 2, 3, and 5 each have two ties + + pred_inds_to_update = gt_pred_pairs_of_highest_quality[:, 1] + matches[pred_inds_to_update] = all_matches[pred_inds_to_update] diff --git a/maskrcnn_benchmark/modeling/poolers.py b/maskrcnn_benchmark/modeling/poolers.py new file mode 100644 index 0000000000000000000000000000000000000000..0164f439b8668fb136611249eb8301a2d90e7d1d --- /dev/null +++ b/maskrcnn_benchmark/modeling/poolers.py @@ -0,0 +1,151 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +import torch +import torch.nn.functional as F +from torch import nn + +from maskrcnn_benchmark.layers import ROIAlign +from maskrcnn_benchmark.layers import DCNPooling + +from .utils import cat + + +class LevelMapper(object): + """Determine which FPN level each RoI in a set of RoIs should map to based + on the heuristic in the FPN paper. + """ + + def __init__(self, k_min, k_max, canonical_scale=224, canonical_level=4, eps=1e-6): + """ + Arguments: + k_min (int) + k_max (int) + canonical_scale (int) + canonical_level (int) + eps (float) + """ + self.k_min = k_min + self.k_max = k_max + self.s0 = canonical_scale + self.lvl0 = canonical_level + self.eps = eps + + def __call__(self, boxlists): + """ + Arguments: + boxlists (list[BoxList]) + """ + # Compute level ids + s = torch.sqrt(cat([boxlist.area() for boxlist in boxlists])) + + # Eqn.(1) in FPN paper + target_lvls = torch.floor(self.lvl0 + torch.log2(s / self.s0 + self.eps)) + target_lvls = torch.clamp(target_lvls, min=self.k_min, max=self.k_max) + return target_lvls.to(torch.int64) - self.k_min + + def get_random(self, level): + """ Generate a random roi for target level + """ + xmin, ymin, xmax, ymax = torch.tensor + + +class Pooler(nn.Module): + """ + Pooler for Detection with or without FPN. + It currently hard-code ROIAlign in the implementation, + but that can be made more generic later on. + Also, the requirement of passing the scales is not strictly necessary, as they + can be inferred from the size of the feature map / size of original image, + which is available thanks to the BoxList. 
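+    RoIs are distributed over the pooling levels by the LevelMapper above,
+    which follows Eqn.(1) of the FPN paper:
+    target_level = floor(canonical_level + log2(sqrt(area) / canonical_scale + eps)),
+    clamped to [k_min, k_max].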
+ """ + + def __init__(self, output_size, scales, sampling_ratio, + deformable=False, output_channel=256): + """ + Arguments: + output_size (list[tuple[int]] or list[int]): output size for the pooled region + scales (list[float]): scales for each Pooler + sampling_ratio (int): sampling ratio for ROIAlign + """ + super(Pooler, self).__init__() + poolers = [] + for scale in scales: + poolers.append( + ROIAlign( + output_size, spatial_scale=scale, sampling_ratio=sampling_ratio + ) if not deformable else + DCNPooling(spatial_scale=scale, pooled_size=output_size, no_trans=False, + group_size=1, trans_std=0.1, output_dim=output_channel) + ) + self.poolers = nn.ModuleList(poolers) + self.output_size = output_size + # get the levels in the feature map by leveraging the fact that the network always + # downsamples by a factor of 2 at each level. + lvl_min = -torch.log2(torch.tensor(scales[0], dtype=torch.float32)).item() + lvl_max = -torch.log2(torch.tensor(scales[-1], dtype=torch.float32)).item() + self.map_levels = LevelMapper(lvl_min, lvl_max, canonical_scale=160) + + def convert_to_roi_format(self, boxes): + concat_boxes = cat([b.bbox for b in boxes], dim=0) + device, dtype = concat_boxes.device, concat_boxes.dtype + ids = cat( + [ + torch.full((len(b), 1), i, dtype=dtype, device=device) + for i, b in enumerate(boxes) + ], + dim=0, + ) + rois = torch.cat([ids, concat_boxes], dim=1) + return rois + + def forward(self, x, boxes): + """ + Arguments: + x (list[Tensor]): feature maps for each level + boxes (list[BoxList]): boxes to be used to perform the pooling operation. + Returns: + result (Tensor) + """ + num_levels = len(self.poolers) + rois = self.convert_to_roi_format(boxes) + if num_levels == 1: + return self.poolers[0](x[0], rois) + + levels = self.map_levels(boxes) + + num_rois = len(rois) + num_channels = x[0].shape[1] + output_size = self.output_size[0] + + dtype, device = x[0].dtype, x[0].device + result = torch.zeros( + (num_rois, num_channels, output_size, output_size), + dtype=dtype, + device=device, + ) + for level, (per_level_feature, pooler) in enumerate(zip(x, self.poolers)): + idx_in_level = torch.nonzero(levels == level).squeeze(1) + rois_per_level = rois[idx_in_level] + if idx_in_level.numel() == 0: + if num_rois == 0: + continue + # create a roi and do one empty forward pass + new_level = idx_in_level.new_tensor((0,)) + new_rois = rois[new_level] + result[new_level] = result[new_level] \ + + pooler(per_level_feature, new_rois) * 0.0 + else: + result[idx_in_level] = pooler(per_level_feature, rois_per_level) + + return result + + +def make_pooler(cfg, head_name): + resolution = cfg.MODEL[head_name].POOLER_RESOLUTION + scales = cfg.MODEL[head_name].POOLER_SCALES + sampling_ratio = cfg.MODEL[head_name].POOLER_SAMPLING_RATIO + pooler = Pooler( + output_size=(resolution, resolution), + scales=scales, + sampling_ratio=sampling_ratio, + ) + return pooler diff --git a/maskrcnn_benchmark/modeling/registry.py b/maskrcnn_benchmark/modeling/registry.py new file mode 100644 index 0000000000000000000000000000000000000000..e14fb118c458d0ba97d2a699be3004c6bdd3913c --- /dev/null +++ b/maskrcnn_benchmark/modeling/registry.py @@ -0,0 +1,12 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 
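+# Each registry maps a config string to an implementation. A typical registration
+# (sketch, assuming the decorator form of utils.registry.Registry) looks like:
+#
+#   @registry.ROI_BOX_FEATURE_EXTRACTORS.register("FPN2MLPFeatureExtractor")
+#   class FPN2MLPFeatureExtractor(nn.Module):
+#       ...
+#
+# so config values such as MODEL.ROI_BOX_HEAD.FEATURE_EXTRACTOR select a class by name.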
+ +from maskrcnn_benchmark.utils.registry import Registry + +BACKBONES = Registry() +RPN_HEADS = Registry() +ROI_BOX_FEATURE_EXTRACTORS = Registry() +ROI_BOX_PREDICTOR = Registry() +ROI_KEYPOINT_FEATURE_EXTRACTORS = Registry() +ROI_KEYPOINT_PREDICTOR = Registry() +ROI_MASK_FEATURE_EXTRACTORS = Registry() +ROI_MASK_PREDICTOR = Registry() diff --git a/maskrcnn_benchmark/modeling/roi_heads/__init__.py b/maskrcnn_benchmark/modeling/roi_heads/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/maskrcnn_benchmark/modeling/roi_heads/__pycache__/__init__.cpython-37.pyc b/maskrcnn_benchmark/modeling/roi_heads/__pycache__/__init__.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..200603ec4fc014adada1cc6d180f84c22c92d9a9 Binary files /dev/null and b/maskrcnn_benchmark/modeling/roi_heads/__pycache__/__init__.cpython-37.pyc differ diff --git a/maskrcnn_benchmark/modeling/roi_heads/__pycache__/roi_heads.cpython-37.pyc b/maskrcnn_benchmark/modeling/roi_heads/__pycache__/roi_heads.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..50e17ea8bccbccce7553e5d955ed608e15878c22 Binary files /dev/null and b/maskrcnn_benchmark/modeling/roi_heads/__pycache__/roi_heads.cpython-37.pyc differ diff --git a/maskrcnn_benchmark/modeling/roi_heads/boundary_head/__init__.py b/maskrcnn_benchmark/modeling/roi_heads/boundary_head/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/maskrcnn_benchmark/modeling/roi_heads/boundary_head/__pycache__/__init__.cpython-37.pyc b/maskrcnn_benchmark/modeling/roi_heads/boundary_head/__pycache__/__init__.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..671bf7a8e888f033a1b68d3a691bd4024518a426 Binary files /dev/null and b/maskrcnn_benchmark/modeling/roi_heads/boundary_head/__pycache__/__init__.cpython-37.pyc differ diff --git a/maskrcnn_benchmark/modeling/roi_heads/boundary_head/__pycache__/boundary_head.cpython-37.pyc b/maskrcnn_benchmark/modeling/roi_heads/boundary_head/__pycache__/boundary_head.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..83bbc12a984d6e8dc18152757b28c65a55ad0345 Binary files /dev/null and b/maskrcnn_benchmark/modeling/roi_heads/boundary_head/__pycache__/boundary_head.cpython-37.pyc differ diff --git a/maskrcnn_benchmark/modeling/roi_heads/boundary_head/__pycache__/inference.cpython-37.pyc b/maskrcnn_benchmark/modeling/roi_heads/boundary_head/__pycache__/inference.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..adaf00725f22784d78ced491b33369f9540a3bfe Binary files /dev/null and b/maskrcnn_benchmark/modeling/roi_heads/boundary_head/__pycache__/inference.cpython-37.pyc differ diff --git a/maskrcnn_benchmark/modeling/roi_heads/boundary_head/__pycache__/ke_head.cpython-37.pyc b/maskrcnn_benchmark/modeling/roi_heads/boundary_head/__pycache__/ke_head.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..194d3a457ba237a4916d16b7609302cc6d18f608 Binary files /dev/null and b/maskrcnn_benchmark/modeling/roi_heads/boundary_head/__pycache__/ke_head.cpython-37.pyc differ diff --git a/maskrcnn_benchmark/modeling/roi_heads/boundary_head/__pycache__/loss.cpython-37.pyc b/maskrcnn_benchmark/modeling/roi_heads/boundary_head/__pycache__/loss.cpython-37.pyc new file mode 100644 index 
0000000000000000000000000000000000000000..07f33dcf36472ecf3c90e17717fb50cfcb034e28 Binary files /dev/null and b/maskrcnn_benchmark/modeling/roi_heads/boundary_head/__pycache__/loss.cpython-37.pyc differ diff --git a/maskrcnn_benchmark/modeling/roi_heads/boundary_head/__pycache__/roi_boundary_feature_extractors.cpython-37.pyc b/maskrcnn_benchmark/modeling/roi_heads/boundary_head/__pycache__/roi_boundary_feature_extractors.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..aaeaa85fe3187a5b97ec7d912f56ea518ff0e7f0 Binary files /dev/null and b/maskrcnn_benchmark/modeling/roi_heads/boundary_head/__pycache__/roi_boundary_feature_extractors.cpython-37.pyc differ diff --git a/maskrcnn_benchmark/modeling/roi_heads/boundary_head/__pycache__/roi_boundary_predictors.cpython-37.pyc b/maskrcnn_benchmark/modeling/roi_heads/boundary_head/__pycache__/roi_boundary_predictors.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..8899f61e1f2d33be72ea139133231f50c0b45342 Binary files /dev/null and b/maskrcnn_benchmark/modeling/roi_heads/boundary_head/__pycache__/roi_boundary_predictors.cpython-37.pyc differ diff --git a/maskrcnn_benchmark/modeling/roi_heads/boundary_head/__pycache__/roi_ke_feature_extractors.cpython-37.pyc b/maskrcnn_benchmark/modeling/roi_heads/boundary_head/__pycache__/roi_ke_feature_extractors.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..34d4eb611ed7e32740b31dfb5f5733410128c1e4 Binary files /dev/null and b/maskrcnn_benchmark/modeling/roi_heads/boundary_head/__pycache__/roi_ke_feature_extractors.cpython-37.pyc differ diff --git a/maskrcnn_benchmark/modeling/roi_heads/boundary_head/__pycache__/roi_ke_predictors.cpython-37.pyc b/maskrcnn_benchmark/modeling/roi_heads/boundary_head/__pycache__/roi_ke_predictors.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..7777e886dcfa288907e56652e6ba98dfd4de29c4 Binary files /dev/null and b/maskrcnn_benchmark/modeling/roi_heads/boundary_head/__pycache__/roi_ke_predictors.cpython-37.pyc differ diff --git a/maskrcnn_benchmark/modeling/roi_heads/boundary_head/boundary_head.py b/maskrcnn_benchmark/modeling/roi_heads/boundary_head/boundary_head.py new file mode 100644 index 0000000000000000000000000000000000000000..643e58b01e04cab324420ce9a09f0310f2a97d91 --- /dev/null +++ b/maskrcnn_benchmark/modeling/roi_heads/boundary_head/boundary_head.py @@ -0,0 +1,104 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +import torch +from torch import nn + +from maskrcnn_benchmark.structures.bounding_box import BoxList + +from .roi_boundary_feature_extractors import make_roi_boundary_feature_extractor +from .roi_boundary_predictors import make_roi_boundary_predictor +from .inference import make_roi_boundary_post_processor +from .loss import make_roi_boundary_loss_evaluator + +def keep_only_positive_boxes(boxes): + """ + Given a set of BoxList containing the `labels` field, + return a set of BoxList for which `labels > 0`. 
+ + Arguments: + boxes (list of BoxList) + """ + assert isinstance(boxes, (list, tuple)) + assert isinstance(boxes[0], BoxList) + assert boxes[0].has_field("labels") + positive_boxes = [] + positive_inds = [] + num_boxes = 0 + for boxes_per_image in boxes: + labels = boxes_per_image.get_field("labels") + inds_mask = labels > 0 + inds = inds_mask.nonzero().squeeze(1) + positive_boxes.append(boxes_per_image[inds]) + positive_inds.append(inds_mask) + return positive_boxes, positive_inds + + +def keep_only_positive_boxes(boxes): + """ + Given a set of BoxList containing the `labels` field, + return a set of BoxList for which `labels > 0`. + + Arguments: + boxes (list of BoxList) + """ + assert isinstance(boxes, (list, tuple)) + assert isinstance(boxes[0], BoxList) + assert boxes[0].has_field("labels") + positive_boxes = [] + positive_inds = [] + num_boxes = 0 + for boxes_per_image in boxes: + labels = boxes_per_image.get_field("labels") + inds_mask = labels > 0 + inds = inds_mask.nonzero().squeeze(1) + positive_boxes.append(boxes_per_image[inds]) + positive_inds.append(inds_mask) + return positive_boxes, positive_inds + + +class ROIBOHead(torch.nn.Module): + def __init__(self, cfg, in_channels): + super(ROIBOHead, self).__init__() + self.cfg = cfg.clone() + self.feature_extractor = make_roi_boundary_feature_extractor(cfg, in_channels) + self.predictor = make_roi_boundary_predictor(cfg) + self.post_processor = make_roi_boundary_post_processor(cfg) + self.loss_evaluator = make_roi_boundary_loss_evaluator(cfg) + + def forward(self, features, proposals, targets=None): + """ + Arguments: + features (list[Tensor]): feature-maps from possibly several levels + proposals (list[BoxList]): proposal boxes + targets (list[BoxList], optional): the ground-truth targets. + + Returns: + x (Tensor): the result of the feature extractor + proposals (list[BoxList]): during training, the original proposals + are returned. During testing, the predicted boxlists are returned + with the `mask` field set + losses (dict[Tensor]): During training, returns the losses for the + head. During testing, returns an empty dict. + """ + + if self.training: + # during training, only focus on positive boxes + with torch.no_grad(): + # proposals = self.loss_evaluator.subsample(proposals, targets) + all_proposals = proposals + proposals, positive_inds = keep_only_positive_boxes(proposals) + + x = self.feature_extractor(features, proposals) + outputs_x, outputs_y= self.predictor(x) + + if not self.training: + result = self.post_processor(outputs_x, outputs_y, proposals) + + return x, result, {}, {}, {} + + loss_bo, loss_x, loss_y = self.loss_evaluator(proposals, outputs_x, outputs_y, targets) + + return x, proposals, dict(loss_bo=loss_bo), dict(loss_bo_x=loss_x), dict(loss_bo_y=loss_y) + + +def build_roi_boundary_head(cfg, in_channels): + return ROIBOHead(cfg, in_channels) diff --git a/maskrcnn_benchmark/modeling/roi_heads/boundary_head/inference.py b/maskrcnn_benchmark/modeling/roi_heads/boundary_head/inference.py new file mode 100644 index 0000000000000000000000000000000000000000..e734da2b274434d001fecaec37d4437e890edfda --- /dev/null +++ b/maskrcnn_benchmark/modeling/roi_heads/boundary_head/inference.py @@ -0,0 +1,207 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 
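+# Post-processing for the boundary head: the per-RoI x/y boundary logits are
+# passed through a sigmoid and attached to each predicted box as the extra
+# fields "mask_x" and "mask_y" (see MaskPostProcessor below).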
+import numpy as np +import torch +from torch import nn +from maskrcnn_benchmark.layers.misc import interpolate + +from maskrcnn_benchmark.structures.bounding_box import BoxList + + +# TODO check if want to return a single BoxList or a composite +# object +class MaskPostProcessor(nn.Module): + """ + From the results of the CNN, post process the masks + by taking the mask corresponding to the class with max + probability (which are of fixed size and directly output + by the CNN) and return the masks in the mask field of the BoxList. + + If a masker object is passed, it will additionally + project the masks in the image according to the locations in boxes, + """ + + def __init__(self, masker=None): + super(MaskPostProcessor, self).__init__() + self.masker = masker + + def forward(self, x, y, boxes): + """ + Arguments: + x (Tensor): the mask logits + boxes (list[BoxList]): bounding boxes that are used as + reference, one for ech image + + Returns: + results (list[BoxList]): one BoxList for each image, containing + the extra field mask + """ + mask_prob_x = x.sigmoid() + mask_prob_y = y.sigmoid() + # select masks coresponding to the predicted classes + num_masks = x.shape[0] # 286 + labels = [bbox.get_field("labels") for bbox in boxes] + labels = torch.cat(labels) + index = torch.arange(num_masks, device=labels.device) + mask_prob_x = mask_prob_x[index, 0][:, None] + mask_prob_y = mask_prob_y[index, 0][:, None] + + boxes_per_image = [len(box) for box in boxes] # boxes for one image + mask_prob_x = mask_prob_x.split(boxes_per_image, dim=0) + mask_prob_y = mask_prob_y.split(boxes_per_image, dim=0) + + if self.masker: + print('yes!!!') + mask_prob_x = self.masker(mask_prob_x, boxes) + mask_prob_y = self.masker(mask_prob_y, boxes) + + results = [] + for prob_x, prob_y, box in zip(mask_prob_x, mask_prob_y, boxes): + bbox = BoxList(box.bbox, box.size, mode="xyxy") + for field in box.fields(): + bbox.add_field(field, box.get_field(field)) + bbox.add_field("mask_x", prob_x) + bbox.add_field("mask_y", prob_y) + results.append(bbox) + return results + + +class MaskPostProcessorCOCOFormat(MaskPostProcessor): + """ + From the results of the CNN, post process the results + so that the masks are pasted in the image, and + additionally convert the results to COCO format. 
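+    Each mask is RLE-encoded with pycocotools.mask.encode and the encoded
+    dictionaries are stored back in the "mask" field of the returned results.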
+ """ + + def forward(self, x, boxes): + import pycocotools.mask as mask_util + import numpy as np + + results = super(MaskPostProcessorCOCOFormat, self).forward(x, boxes) + for result in results: + masks = result.get_field("mask").cpu() + rles = [ + mask_util.encode(np.array(mask[0, :, :, np.newaxis], order="F"))[0] + for mask in masks + ] + for rle in rles: + rle["counts"] = rle["counts"].decode("utf-8") + result.add_field("mask", rles) + return results + + +# the next two functions should be merged inside Masker +# but are kept here for the moment while we need them +# temporarily gor paste_mask_in_image +def expand_boxes(boxes, scale): + w_half = (boxes[:, 2] - boxes[:, 0]) * .5 + h_half = (boxes[:, 3] - boxes[:, 1]) * .5 + x_c = (boxes[:, 2] + boxes[:, 0]) * .5 + y_c = (boxes[:, 3] + boxes[:, 1]) * .5 + + w_half *= scale + h_half *= scale + + boxes_exp = torch.zeros_like(boxes) + boxes_exp[:, 0] = x_c - w_half + boxes_exp[:, 2] = x_c + w_half + boxes_exp[:, 1] = y_c - h_half + boxes_exp[:, 3] = y_c + h_half + return boxes_exp + + +def expand_masks(mask, padding): + N = mask.shape[0] + M = mask.shape[-1] + pad2 = 2 * padding + scale = float(M + pad2) / M + padded_mask = mask.new_zeros((N, 1, M + pad2, M + pad2)) + + padded_mask[:, :, padding:-padding, padding:-padding] = mask + return padded_mask, scale + + +def paste_mask_in_image(mask, box, im_h, im_w, thresh=0.5, padding=1): + padded_mask, scale = expand_masks(mask[None], padding=padding) + mask = padded_mask[0, 0] + box = expand_boxes(box[None], scale)[0] + box = box.to(dtype=torch.int32) + TO_REMOVE = 1 + w = int(box[2] - box[0] + TO_REMOVE) + h = int(box[3] - box[1] + TO_REMOVE) + w = max(w, 1) + h = max(h, 1) + + # Set shape to [batchxCxHxW] + mask = mask.expand((1, 1, -1, -1)) + + # Resize mask + mask = mask.to(torch.float32) + mask = interpolate(mask, size=(h, w), mode='bilinear', align_corners=False) + mask = mask[0][0] + + if thresh >= 0: + mask = mask > thresh + else: + # for visualization and debugging, we also + # allow it to return an unmodified mask + mask = (mask * 255).to(torch.uint8) + + im_mask = torch.zeros((im_h, im_w), dtype=torch.uint8) + x_0 = max(box[0], 0) + x_1 = min(box[2] + 1, im_w) + y_0 = max(box[1], 0) + y_1 = min(box[3] + 1, im_h) + + im_mask[y_0:y_1, x_0:x_1] = mask[ + (y_0 - box[1]) : (y_1 - box[1]), (x_0 - box[0]) : (x_1 - box[0]) + ] + return im_mask + + +class Masker(object): + """ + Projects a set of masks in an image on the locations specified by the bounding boxes + """ + + def __init__(self, threshold=0.5, padding=1): + self.threshold = threshold + self.padding = padding + + def forward_single_image(self, masks, boxes): + boxes = boxes.convert("xyxy") + im_w, im_h = boxes.size + res = [ + paste_mask_in_image(mask[0], box, im_h, im_w, self.threshold, self.padding) + for mask, box in zip(masks, boxes.bbox) + ] + if len(res) > 0: + res = torch.stack(res, dim=0)[:, None] + else: + res = masks.new_empty((0, 1, masks.shape[-2], masks.shape[-1])) + return res + + def __call__(self, masks, boxes): + if isinstance(boxes, BoxList): + boxes = [boxes] + + # Make some sanity check + assert len(boxes) == len(masks), "Masks and boxes should have the same length." + + # TODO: Is this JIT compatible? + # If not we should make it compatible. + results = [] + for mask, box in zip(masks, boxes): + assert mask.shape[0] == len(box), "Number of objects should be the same." 
+ result = self.forward_single_image(mask, box) + results.append(result) + return results + + +def make_roi_boundary_post_processor(cfg): + if cfg.MODEL.ROI_MASK_HEAD.POSTPROCESS_MASKS: + mask_threshold = cfg.MODEL.ROI_MASK_HEAD.POSTPROCESS_MASKS_THRESHOLD ## 0.5 + masker = Masker(threshold=mask_threshold, padding=1) + else: + masker = None + mask_post_processor = MaskPostProcessor(masker) + return mask_post_processor diff --git a/maskrcnn_benchmark/modeling/roi_heads/boundary_head/inference.pybk b/maskrcnn_benchmark/modeling/roi_heads/boundary_head/inference.pybk new file mode 100644 index 0000000000000000000000000000000000000000..60a9d4cba3aba34fd33366890571fb8a88fd8030 --- /dev/null +++ b/maskrcnn_benchmark/modeling/roi_heads/boundary_head/inference.pybk @@ -0,0 +1,293 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +import numpy as np +import torch +from torch import nn +import torch.nn.functional as F + +from maskrcnn_benchmark.structures.bounding_box import BoxList + +import cv2 + +# TODO check if want to return a single BoxList or a composite +# object +class KEPostProcessor(nn.Module): + """ + From the results of the CNN, post process the kes + by taking the ke corresponding to the class with max + probability (which are of fixed size and directly output + by the CNN) and return the kes in the ke field of the BoxList. + + If a keer object is passed, it will additionally + project the kes in the image according to the locations in boxes, + """ + + def __init__(self, keer=None): + super(KEPostProcessor, self).__init__() + self.keer = keer + + def forward(self, x, boxes): + """ + Arguments: + x (Tensor): the ke logits + boxes (list[BoxList]): bounding boxes that are used as + reference, one for ech image + + Returns: + results (list[BoxList]): one BoxList for each image, containing + the extra field ke + """ + # ke_prob = x.sigmoid() + + # select kes coresponding to the predicted classes + num_proposals = x.shape[0] + labels = [bbox.get_field("labels") for bbox in boxes] + labels = torch.cat(labels) + index = torch.arange(num_proposals, device=labels.device) + ####### outputs + + ke_prob = x[index] + # print("labels", labels) + # print("x",x.size()) + # print("ke_",ke_prob.size()) + # assert(0) + + boxes_per_image = [len(box) for box in boxes] + ke_prob = ke_prob.split(boxes_per_image, dim=0) + + if self.keer: + ke_prob = self.keer(ke_prob, boxes) + + results = [] + for prob, box in zip(ke_prob, boxes): + bbox = BoxList(box.bbox, box.size, mode="xyxy") + for field in box.fields(): + bbox.add_field(field, box.get_field(field)) + bbox.add_field("ke", prob) + results.append(bbox) + + return results + + +class KEPostProcessorCOCOFormat(KEPostProcessor): + """ + From the results of the CNN, post process the results + so that the kes are pasted in the image, and + additionally convert the results to COCO format. 
+ """ + + def forward(self, x, boxes): + # import pycocotools.mask as mask_util + import numpy as np + + results = super(KEPostProcessorCOCOFormat, self).forward(x, boxes) + for result in results: + kes = result.get_field("ke").cpu() + rles = [ + ke_util.encode(np.array(ke[0, :, :, np.newaxis], order="F"))[0] + for ke in kes + ] + for rle in rles: + rle["counts"] = rle["counts"].decode("utf-8") + result.add_field("ke", rles) + return results + + +# the next two functions should be merged inside keer +# but are kept here for the moment while we need them +# temporarily gor paste_ke_in_image +def expand_boxes(boxes, scale): + w_half = (boxes[:, 2] - boxes[:, 0]) * .5 + h_half = (boxes[:, 3] - boxes[:, 1]) * .5 + x_c = (boxes[:, 2] + boxes[:, 0]) * .5 + y_c = (boxes[:, 3] + boxes[:, 1]) * .5 + + w_half *= scale + h_half *= scale + + boxes_exp = torch.zeros_like(boxes) + boxes_exp[:, 0] = x_c - w_half + boxes_exp[:, 2] = x_c + w_half + boxes_exp[:, 1] = y_c - h_half + boxes_exp[:, 3] = y_c + h_half + return boxes_exp + + +def expand_kes(ke, padding): + N = ke.shape[0] + M = ke.shape[-1] + # print("NM ", N ,M) + pad2 = 2 * padding + scale = float(M + pad2) / M + padded_ke = ke.new_zeros((N, 1, M + pad2, M + pad2)) + padded_ke[:, :, padding:-padding, padding:-padding] = ke + # print("padded_ke ", padded_ke.size()) + return padded_ke, scale + + +def paste_ke_in_image(ke, box, im_h, im_w, thresh=0.5, padding=1): + # print("ke ", ke.size(), ke[None].size()) + padded_ke, scale = expand_kes(ke[None], padding=padding) + ke = padded_ke[0, 0] + box = expand_boxes(box[None], scale)[0] + box = box.to(dtype=torch.int32) + + TO_REMOVE = 1 + w = int(box[2] - box[0] + TO_REMOVE) + h = int(box[3] - box[1] + TO_REMOVE) + w = max(w, 1) + h = max(h, 1) + + # Set shape to [batchxCxHxW] + ke = ke.expand((1, 1, -1, -1)) + + # print("ke 2", ke.size()) + # Resize ke + ke = ke.to(torch.float32) + ke = F.interpolate(ke, size=(h, w), mode='bilinear', align_corners=False) + ke = ke[0][0] + + # print("ke3 ", ke.size()) + + if thresh >= 0: + ke = ke > thresh + else: + # for visualization and debugging, we also + # allow it to return an unmodified ke + ke = (ke * 255).to(torch.uint8) + + im_ke = torch.zeros((im_h, im_w), dtype=torch.uint8) + x_0 = max(box[0], 0) + x_1 = min(box[2] + 1, im_w) + y_0 = max(box[1], 0) + y_1 = min(box[3] + 1, im_h) + + im_ke[y_0:y_1, x_0:x_1] = ke[ + (y_0 - box[1]) : (y_1 - box[1]), (x_0 - box[0]) : (x_1 - box[0]) + ] + # print("im_ke ", im_ke.size()) + return im_ke + +def scores_to_probs(scores): + """Transforms CxHxW of scores to probabilities spatially.""" + channels = scores.shape[0] + for c in range(channels): + temp = scores[c, :, :] + max_score = temp.max() + temp = np.exp(temp - max_score) / np.sum(np.exp(temp - max_score)) + scores[c, :, :] = temp + return scores + +def heatmaps_to_kes(maps, rois): + # This function converts a discrete image coordinate in a HEATMAP_SIZE x + # HEATMAP_SIZE image to a continuous ke coordinate. We maintain + # consistency with ke_to_heatmap_labels by using the conversion from + # Heckbert 1990: c = d + 0.5, where d is a discrete coordinate and c is a + # continuous coordinate. 
+ rois =rois.numpy() + maps = maps.numpy() + offset_x = rois[:, 0] + offset_y = rois[:, 1] + + widths = rois[:, 2] - rois[:, 0] + heights = rois[:, 3] - rois[:, 1] + widths = np.maximum(widths, 1) + heights = np.maximum(heights, 1) + widths_ceil = np.ceil(widths) + heights_ceil = np.ceil(heights) + + # NCHW to NHWC for use with OpenCV + maps = np.transpose(maps, [0, 2, 3, 1]) + # min_size = cfg.KRCNN.INFERENCE_MIN_SIZE + + num_kes = 10 + xy_preds = np.zeros( + (len(rois), 4, num_kes), dtype=np.float32) + for i in range(len(rois)): + # if min_size > 0: + # roi_map_width = int(np.maximum(widths_ceil[i], min_size)) + # roi_map_height = int(np.maximum(heights_ceil[i], min_size)) + # else: + # roi_map_width = widths_ceil[i] + # roi_map_height = heights_ceil[i] + roi_map_width = int(widths_ceil[i]) + roi_map_height = int(heights_ceil[i]) + + width_correction = widths[i] / roi_map_width + height_correction = heights[i] / roi_map_height + roi_map = cv2.resize( + maps[i], (roi_map_width, roi_map_height), + interpolation=cv2.INTER_CUBIC) + # Bring back to CHW + roi_map = np.transpose(roi_map, [2, 0, 1]) + roi_map_probs = scores_to_probs(roi_map.copy()) + w = roi_map.shape[2] + for k in range(num_kes): + pos = roi_map[k, :, :].argmax() + x_int = pos % w + y_int = (pos - x_int) // w + assert (roi_map_probs[k, y_int, x_int] == + roi_map_probs[k, :, :].max()) + x = (x_int + 0.5) * width_correction + y = (y_int + 0.5) * height_correction + xy_preds[i, 0, k] = x + offset_x[i] + xy_preds[i, 1, k] = y + offset_y[i] + xy_preds[i, 2, k] = roi_map[k, y_int, x_int] + xy_preds[i, 3, k] = roi_map_probs[k, y_int, x_int] + + return xy_preds + +class KEer(object): + """ + Projects a set of kes in an image on the locations + specified by the bounding boxes + """ + + def __init__(self, threshold=0.5, padding=1): + self.threshold = threshold + self.padding = padding + + def forward_single_image(self, kes, boxes): + boxes = boxes.convert("xyxy") + im_w, im_h = boxes.size + # print("KEer kes.size()", kes.size(), kes[0].size(), kes[0][0].size()) + # assert(0) + # res = [ + # paste_ke_in_image(ke[0], box, im_h, im_w, self.threshold, self.padding) + # for ke, box in zip(kes, boxes.bbox) + # ] + res = heatmaps_to_kes(kes, boxes.bbox) + + if len(res) > 0: + # res = torch.stack(res, dim=0)[:, None] + res = torch.from_numpy(res) + else: + res = kes.new_empty((0, 1, kes.shape[-2], kes.shape[-1])) + print("res inference.py", res.size()) + return res + + def __call__(self, kes, boxes): + if isinstance(boxes, BoxList): + boxes = [boxes] + + # Make some sanity check + assert len(boxes) == len(kes), "kes and boxes should have the same length." + + # TODO: Is this JIT compatible? + # If not we should make it compatible. + results = [] + for ke, box in zip(kes, boxes): + assert ke.shape[0] == len(box), "Number of objects should be the same." 
+ # print("ke inference.py", ke.size()) + result = self.forward_single_image(ke, box) + results.append(result) + return results + + +def make_roi_ke_post_processor(cfg): + if cfg.MODEL.ROI_KE_HEAD.POSTPROCESS_KES: + ke_threshold = cfg.MODEL.ROI_KE_HEAD.POSTPROCESS_KES_THRESHOLD + keer = KEer(threshold=ke_threshold, padding=1) + else: + keer = None + ke_post_processor = KEPostProcessor(keer) + return ke_post_processor diff --git a/maskrcnn_benchmark/modeling/roi_heads/boundary_head/loss.py b/maskrcnn_benchmark/modeling/roi_heads/boundary_head/loss.py new file mode 100644 index 0000000000000000000000000000000000000000..00b659e1abde19746ef13aae30fa3bb2f298a57c --- /dev/null +++ b/maskrcnn_benchmark/modeling/roi_heads/boundary_head/loss.py @@ -0,0 +1,259 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +import torch +from torch.nn import functional as F + +from maskrcnn_benchmark.layers import smooth_l1_loss +from maskrcnn_benchmark.modeling.matcher import Matcher +from maskrcnn_benchmark.structures.boxlist_ops import boxlist_iou +from maskrcnn_benchmark.modeling.utils import cat + +from maskrcnn_benchmark.modeling.balanced_positive_negative_sampler import ( + BalancedPositiveNegativeSampler +) +# import torch import torch.nn as nn +from maskrcnn_benchmark.structures.ke import kes_to_heat_map +import numpy as np +import os, time +import cv2 +DEBUG = 0 + +from scipy.ndimage.morphology import distance_transform_edt + + +def onehot_to_binary_edges(mask, radius): + """ + Converts a segmentation mask (K,H,W) to a binary edgemap (1,H,W) + """ + if radius < 0: + return mask + + # We need to pad the borders for boundary conditions + + mask = np.pad(mask, ((1, 1), (1, 1)), mode='constant', constant_values=0) + mask = distance_transform_edt(mask) + mask = mask[1:-1, 1:-1] + mask[mask > radius] = 0 + mask = (mask > 0).astype(np.uint8) + return mask + + +def project_masks_on_boxes(segmentation_masks, proposals, discretization_size): + """ + Given segmentation masks and the bounding boxes corresponding + to the location of the masks in the image, this function + crops and resizes the masks in the position defined by the + boxes. This prepares the masks for them to be fed to the + loss computation as the targets. + + Arguments: + segmentation_masks: an instance of SegmentationMask + proposals: an instance of BoxList + """ + masks = [] + M = discretization_size + device = proposals.bbox.device + proposals = proposals.convert("xyxy") + assert segmentation_masks.size == proposals.size, "{}, {}".format( + segmentation_masks, proposals + ) + + # FIXME: CPU computation bottleneck, this should be parallelized + proposals = proposals.bbox.to(torch.device("cpu")) + for segmentation_mask, proposal in zip(segmentation_masks, proposals): + # crop the masks, resize them to the desired resolution and + # then convert them to the tensor representation. 
+ cropped_mask = segmentation_mask.crop(proposal) + scaled_mask = cropped_mask.resize((M, M)) + mask = scaled_mask.get_mask_tensor() + mask = mask.numpy().astype(np.uint8) + mask = onehot_to_binary_edges(mask, 2) + mask = torch.from_numpy(mask) + masks.append(mask) + if len(masks) == 0: + return torch.empty(0, dtype=torch.float32, device=device) + return torch.stack(masks, dim=0).to(device, dtype=torch.float32) + + +def project_kes_to_heatmap(kes, mty, proposals, discretization_size): + proposals = proposals.convert('xyxy') + out_x, out_y, valid_x, valid_y, out_mty, valid_mty = kes_to_heat_map(kes.kes_x, kes.kes_y, mty.mty, proposals.bbox, discretization_size) + return out_x, out_y, valid_x, valid_y, out_mty, valid_mty + +def _within_box(points_x, points_y, boxes): + """Validate which kes are contained inside a given box. + points: NxKx2 + boxes: Nx4 + output: NxK + """ + x_within = (points_x[..., :, 0] >= boxes[:, 0, None]) & (points_x[..., :, 0] <= boxes[:, 2, None]) + y_within = (points_y[..., :, 0] >= boxes[:, 1, None]) & (points_y[..., :, 0] <= boxes[:, 3, None]) + return x_within & y_within + +_TOTAL_SKIPPED = 0 + +def balance_ce_loss(pre_mk, target_mk): + pre_mk = torch.sigmoid(pre_mk) + + pos_inds = target_mk.eq(1) + pos_num = torch.sum(pos_inds).float() + neg_num = torch.sum(1 - pos_inds).float() + loss = -(target_mk * torch.log(pre_mk + 1e-4)) / pos_num - ((1 - target_mk) * torch.log(1 - pre_mk + 1e-4)) / neg_num + return loss.sum() + + +def edge_loss(input, target): + n, c, h, w = input.size() + + log_p = input.transpose(1, 2).transpose(2, 3).contiguous().view(1, -1) + target_t = target.transpose(1, 2).transpose(2, 3).contiguous().view(1, -1) + pos_index = (target_t == 1) + neg_index = (target_t == 0) + pos_index = pos_index.data.cpu().numpy().astype(bool) + neg_index = neg_index.data.cpu().numpy().astype(bool) + weight = torch.Tensor(log_p.size()).fill_(0) + weight = weight.numpy() + pos_num = pos_index.sum() + neg_num = neg_index.sum() + sum_num = pos_num + neg_num + weight[pos_index] = neg_num * 1.0 / sum_num + weight[neg_index] = pos_num * 1.0 / sum_num + weight = torch.from_numpy(weight) + weight = weight.cuda() + loss = F.binary_cross_entropy_with_logits(log_p, target_t, weight, size_average=True) + # del pos_index, neg_index + # del weight + return loss + +class BORCNNLossComputation(object): + def __init__(self, proposal_matcher, fg_bg_sampler, discretization_size, cfg): + """ + Arguments: + proposal_matcher (Matcher) + discretization_size (int) + """ + self.proposal_matcher = proposal_matcher + self.fg_bg_sampler = fg_bg_sampler + self.discretization_size = discretization_size + self.cfg = cfg.clone() + + def match_targets_to_proposals(self, proposal, target): + match_quality_matrix = boxlist_iou(target, proposal) + matched_idxs = self.proposal_matcher(match_quality_matrix) + target = target.copy_with_fields(["labels", "masks"]) + matched_targets = target[matched_idxs.clamp(min=0)] + matched_targets.add_field("matched_idxs", matched_idxs) + return matched_targets + + def prepare_targets(self, proposals, targets): + labels = [] + masks = [] + for proposals_per_image, targets_per_image in zip(proposals, targets): + matched_targets = self.match_targets_to_proposals( + proposals_per_image, targets_per_image + ) + matched_idxs = matched_targets.get_field("matched_idxs") + + labels_per_image = matched_targets.get_field("labels") + labels_per_image = labels_per_image.to(dtype=torch.int64) + + # this can probably be removed, but is left here for clarity + # and completeness + 
neg_inds = matched_idxs == Matcher.BELOW_LOW_THRESHOLD + labels_per_image[neg_inds] = 0 + + # mask scores are only computed on positive samples + positive_inds = torch.nonzero(labels_per_image > 0).squeeze(1) + + segmentation_masks = matched_targets.get_field("masks") + segmentation_masks = segmentation_masks[positive_inds] + + positive_proposals = proposals_per_image[positive_inds] + + masks_per_image = project_masks_on_boxes( + segmentation_masks, positive_proposals, self.discretization_size + ) + + labels.append(labels_per_image) + masks.append(masks_per_image) + + return labels, masks + + def subsample(self, proposals, targets): + """ + This method performs the positive/negative sampling, and return + the sampled proposals. + Note: this function keeps a state. + + Arguments: + proposals (list[BoxList]) + targets (list[BoxList]) + """ + + labels, kes, mty = self.prepare_targets(proposals, targets) + sampled_pos_inds, sampled_neg_inds = self.fg_bg_sampler(labels) + + proposals = list(proposals) + # add corresponding label and regression_targets information to the bounding boxes + for labels_per_image, kes_per_image, mty_per_image, proposals_per_image in zip( + labels, kes, mty, proposals + ): + proposals_per_image.add_field("labels", labels_per_image) + proposals_per_image.add_field("kes", kes_per_image) + proposals_per_image.add_field("mty", mty_per_image) + + # distributed sampled proposals, that were obtained on all feature maps + # concatenated via the fg_bg_sampler, into individual feature map levels + for img_idx, (pos_inds_img, neg_inds_img) in enumerate( + zip(sampled_pos_inds, sampled_neg_inds) + ): + # img_sampled_inds = torch.nonzero(pos_inds_img | neg_inds_img).squeeze(1) + img_sampled_inds = torch.nonzero(pos_inds_img).squeeze(1) + proposals_per_image = proposals[img_idx][img_sampled_inds] + proposals[img_idx] = proposals_per_image + + self._proposals = proposals + return proposals + + def __call__(self, proposals, ke_logits_x, ke_logits_y, targets): + """ + Arguments: + proposals (list[BoxList]) + mask_logits (Tensor) + targets (list[BoxList]) + + Return: + mask_loss (Tensor): scalar tensor containing the loss + """ + labels, mask_targets = self.prepare_targets(proposals, targets) + + labels = cat(labels, dim=0) + mask_targets = cat(mask_targets, dim=0) + positive_inds = torch.nonzero(labels > 0).squeeze(1) + + if mask_targets.numel() == 0: + return 0 + + sb, sh, sw = mask_targets.shape + mask_loss_x = edge_loss( ke_logits_x[positive_inds, 0].view([sb, 1, sh, sw]), mask_targets.view([sb, 1, sh, sw])) + mask_loss_y = edge_loss( ke_logits_y[positive_inds, 0].view([sb, 1, sh, sw]), mask_targets.view([sb, 1, sh, sw])) + + mask_loss = mask_loss_x + mask_loss_y + + return mask_loss , mask_loss_x, mask_loss_y + +def make_roi_boundary_loss_evaluator(cfg): + matcher = Matcher( + cfg.MODEL.ROI_HEADS.FG_IOU_THRESHOLD, + cfg.MODEL.ROI_HEADS.BG_IOU_THRESHOLD, + allow_low_quality_matches=False, + ) + + fg_bg_sampler = BalancedPositiveNegativeSampler( + cfg.MODEL.ROI_HEADS.BATCH_SIZE_PER_IMAGE, cfg.MODEL.ROI_HEADS.POSITIVE_FRACTION + ) + + loss_evaluator = BORCNNLossComputation( + matcher, fg_bg_sampler, cfg.MODEL.ROI_BOUNDARY_HEAD.RESOLUTION, cfg + ) + + return loss_evaluator diff --git a/maskrcnn_benchmark/modeling/roi_heads/boundary_head/roi_boundary_feature_extractors.py b/maskrcnn_benchmark/modeling/roi_heads/boundary_head/roi_boundary_feature_extractors.py new file mode 100644 index 0000000000000000000000000000000000000000..96fe5b019a54ae06799065cf39adea7ba452442d --- /dev/null +++ 
b/maskrcnn_benchmark/modeling/roi_heads/boundary_head/roi_boundary_feature_extractors.py @@ -0,0 +1,69 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +from torch import nn +from torch.nn import functional as F + +# from ..box_head.roi_box_feature_extractors import ResNet50Conv5ROIFeatureExtractor +from maskrcnn_benchmark.modeling.poolers import Pooler +from maskrcnn_benchmark.modeling.make_layers import make_conv3x3 + + +class BOUNDARYRCNNFPNFeatureExtractor(nn.Module): + """ + Heads for FPN for classification + """ + + def __init__(self, cfg, in_channels): + """ + Arguments: + num_classes (int): number of output classes + input_size (int): number of channels of the input once it's flattened + representation_size (int): size of the intermediate representation + """ + super(BOUNDARYRCNNFPNFeatureExtractor, self).__init__() + + resolution = cfg.MODEL.ROI_BOUNDARY_HEAD.POOLER_RESOLUTION + scales = cfg.MODEL.ROI_BOUNDARY_HEAD.POOLER_SCALES + sampling_ratio = cfg.MODEL.ROI_BOUNDARY_HEAD.POOLER_SAMPLING_RATIO + pooler = Pooler( + output_size=(resolution, resolution), + scales=scales, + sampling_ratio=sampling_ratio, + deformable=cfg.MODEL.ROI_BOUNDARY_HEAD.DEFORMABLE_POOLING + # deformable = True + ) + input_size = in_channels + self.pooler = pooler + + layers = cfg.MODEL.ROI_BOUNDARY_HEAD.CONV_LAYERS + use_gn = cfg.MODEL.ROI_MASK_HEAD.USE_GN + dilation = cfg.MODEL.ROI_MASK_HEAD.DILATION + + next_feature = input_size + self.blocks = [] + for layer_idx, layer_features in enumerate(layers, 1): + layer_name = "boundary_fcn{}".format(layer_idx) + module = make_conv3x3( + next_feature, layer_features, + dilation=dilation, stride=1, use_gn=use_gn + ) + self.add_module(layer_name, module) + next_feature = layer_features + self.blocks.append(layer_name) + + def forward(self, x, proposals): + x = self.pooler(x, proposals) + + for layer_name in self.blocks: + x = F.relu(getattr(self, layer_name)(x)) + + return x + + +_ROI_KE_FEATURE_EXTRACTORS = { + "BoundaryRCNNFPNFeatureExtractor": BOUNDARYRCNNFPNFeatureExtractor, +} + + +def make_roi_boundary_feature_extractor(cfg, in_channels): + func = _ROI_KE_FEATURE_EXTRACTORS[cfg.MODEL.ROI_BOUNDARY_HEAD.FEATURE_EXTRACTOR] + return func(cfg, in_channels) diff --git a/maskrcnn_benchmark/modeling/roi_heads/boundary_head/roi_boundary_predictors.py b/maskrcnn_benchmark/modeling/roi_heads/boundary_head/roi_boundary_predictors.py new file mode 100644 index 0000000000000000000000000000000000000000..9727592b5ca4d6280a4c017d5501f40f6a0d16d5 --- /dev/null +++ b/maskrcnn_benchmark/modeling/roi_heads/boundary_head/roi_boundary_predictors.py @@ -0,0 +1,78 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 
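The boundary feature extractor above is an FPN ROI pooler followed by a stack of 3x3 convolutions whose widths come from ROI_BOUNDARY_HEAD.CONV_LAYERS. A rough stand-alone sketch of that conv tower follows, assuming four 256-channel layers and a pre-pooled 14x14 input; the real code builds its layers through make_conv3x3 and feeds them from the Pooler.

# Rough stand-alone sketch of the boundary-head conv tower (assumed widths
# (256, 256, 256, 256); the Pooler is replaced by a dummy pooled tensor).
import torch
from torch import nn
import torch.nn.functional as F

class BoundaryConvTower(nn.Module):
    def __init__(self, in_channels=256, layers=(256, 256, 256, 256)):
        super().__init__()
        blocks = []
        next_feature = in_channels
        for width in layers:
            blocks.append(nn.Conv2d(next_feature, width, 3, stride=1, padding=1))
            next_feature = width
        self.blocks = nn.ModuleList(blocks)

    def forward(self, x):
        for conv in self.blocks:
            x = F.relu(conv(x))
        return x

rois = torch.randn(8, 256, 14, 14)       # 8 pooled proposals from the FPN pooler
print(BoundaryConvTower()(rois).shape)   # torch.Size([8, 256, 14, 14])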
+import torch +from torch import nn +from torch.nn import functional as F + +from maskrcnn_benchmark.layers import Conv2d +from maskrcnn_benchmark.layers import ConvTranspose2d + +from maskrcnn_benchmark import layers + +class BOUNDARYRCNNC4Predictor(nn.Module): + def __init__(self, cfg): + super(BOUNDARYRCNNC4Predictor, self).__init__() + dim_reduced = cfg.MODEL.ROI_BOUNDARY_HEAD.CONV_LAYERS[-1] + self.resol = cfg.MODEL.ROI_BOUNDARY_HEAD.RESOLUTION # 56 + + if cfg.MODEL.ROI_HEADS.USE_FPN: + num_inputs = dim_reduced + else: + stage_index = 4 + stage2_relative_factor = 2 ** (stage_index - 1) + res2_out_channels = cfg.MODEL.RESNETS.RES2_OUT_CHANNELS #256 + num_inputs = res2_out_channels * stage2_relative_factor + + self.bo_input_xy = Conv2d(num_inputs, num_inputs, 1, 1, 0) + nn.init.kaiming_normal_(self.bo_input_xy.weight, + mode='fan_out', nonlinearity='relu') + nn.init.constant_(self.bo_input_xy.bias, 0) + + self.conv5_bo_xy = ConvTranspose2d(num_inputs, dim_reduced, 2, 2, 0) + nn.init.kaiming_normal_(self.conv5_bo_xy.weight, + mode='fan_out', nonlinearity='relu') + nn.init.constant_(self.conv5_bo_xy.bias, 0) + + self.bo_input_1_1 = Conv2d(dim_reduced, dim_reduced, 1, 1, 0) + nn.init.kaiming_normal_(self.bo_input_1_1.weight, + mode='fan_out', nonlinearity='relu') + nn.init.constant_(self.bo_input_1_1.bias, 0) + + self.bo_input_2_1 = Conv2d(dim_reduced, dim_reduced, 1, 1, 0) + nn.init.kaiming_normal_(self.bo_input_2_1.weight, + mode='fan_out', nonlinearity='relu') + nn.init.constant_(self.bo_input_2_1.bias, 0) + + self.conv5_bo_x = Conv2d(dim_reduced, 1, (3, 1), 1, (1,0)) # H W + nn.init.kaiming_normal_(self.conv5_bo_x.weight, + mode='fan_out', nonlinearity='relu') # 'relu' + nn.init.constant_(self.conv5_bo_x.bias, 0) + + self.conv5_bo_y = Conv2d(dim_reduced, 1, (1, 3), 1, (0,1)) # H W + nn.init.kaiming_normal_(self.conv5_bo_y.weight, + mode='fan_out', nonlinearity='relu') + nn.init.constant_(self.conv5_bo_y.bias, 0) + self.up_scale=2 + + + def forward(self, ft): + ft = self.bo_input_xy(ft) + ft_2x = self.conv5_bo_xy(ft) + + ft_2x = layers.interpolate(ft_2x, size = (48,48), mode='bilinear', align_corners=True) + + x = self.bo_input_1_1(ft_2x) + y = self.bo_input_2_1(ft_2x) + + x = self.conv5_bo_x(x) + y = self.conv5_bo_y(y) + + return x, y + + + +_ROI_KE_PREDICTOR = {"BoundaryRCNNC4Predictor": BOUNDARYRCNNC4Predictor} + + +def make_roi_boundary_predictor(cfg): + func = _ROI_KE_PREDICTOR[cfg.MODEL.ROI_BOUNDARY_HEAD.PREDICTOR] + return func(cfg) diff --git a/maskrcnn_benchmark/modeling/roi_heads/box_head/__init__.py b/maskrcnn_benchmark/modeling/roi_heads/box_head/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/maskrcnn_benchmark/modeling/roi_heads/box_head/__pycache__/__init__.cpython-37.pyc b/maskrcnn_benchmark/modeling/roi_heads/box_head/__pycache__/__init__.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..1c47b2586ccedadd8b3fe5c31aca37cc36bfb1ac Binary files /dev/null and b/maskrcnn_benchmark/modeling/roi_heads/box_head/__pycache__/__init__.cpython-37.pyc differ diff --git a/maskrcnn_benchmark/modeling/roi_heads/box_head/__pycache__/box_head.cpython-37.pyc b/maskrcnn_benchmark/modeling/roi_heads/box_head/__pycache__/box_head.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..99992e37d12c3eacec8dcc131c10dd2635f55332 Binary files /dev/null and b/maskrcnn_benchmark/modeling/roi_heads/box_head/__pycache__/box_head.cpython-37.pyc differ diff 
--git a/maskrcnn_benchmark/modeling/roi_heads/box_head/__pycache__/inference.cpython-37.pyc b/maskrcnn_benchmark/modeling/roi_heads/box_head/__pycache__/inference.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b93e667ee972fbdad5d2a4587e140a72d4cd8b11 Binary files /dev/null and b/maskrcnn_benchmark/modeling/roi_heads/box_head/__pycache__/inference.cpython-37.pyc differ diff --git a/maskrcnn_benchmark/modeling/roi_heads/box_head/__pycache__/loss.cpython-37.pyc b/maskrcnn_benchmark/modeling/roi_heads/box_head/__pycache__/loss.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ebc6843988b68dad375ba351116c84c74d6ef36c Binary files /dev/null and b/maskrcnn_benchmark/modeling/roi_heads/box_head/__pycache__/loss.cpython-37.pyc differ diff --git a/maskrcnn_benchmark/modeling/roi_heads/box_head/__pycache__/roi_box_feature_extractors.cpython-37.pyc b/maskrcnn_benchmark/modeling/roi_heads/box_head/__pycache__/roi_box_feature_extractors.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..3094234af1a6183f20b5395a5b3d2b013c072d57 Binary files /dev/null and b/maskrcnn_benchmark/modeling/roi_heads/box_head/__pycache__/roi_box_feature_extractors.cpython-37.pyc differ diff --git a/maskrcnn_benchmark/modeling/roi_heads/box_head/__pycache__/roi_box_predictors.cpython-37.pyc b/maskrcnn_benchmark/modeling/roi_heads/box_head/__pycache__/roi_box_predictors.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e60df9f2aca4ce51866f378e65e6709d420a1b37 Binary files /dev/null and b/maskrcnn_benchmark/modeling/roi_heads/box_head/__pycache__/roi_box_predictors.cpython-37.pyc differ diff --git a/maskrcnn_benchmark/modeling/roi_heads/box_head/box_head.py b/maskrcnn_benchmark/modeling/roi_heads/box_head/box_head.py new file mode 100644 index 0000000000000000000000000000000000000000..482081b8de7431282c8a017cd34d965c8f355bb0 --- /dev/null +++ b/maskrcnn_benchmark/modeling/roi_heads/box_head/box_head.py @@ -0,0 +1,71 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +import torch +from torch import nn + +from .roi_box_feature_extractors import make_roi_box_feature_extractor +from .roi_box_predictors import make_roi_box_predictor +from .inference import make_roi_box_post_processor +from .loss import make_roi_box_loss_evaluator + + +class ROIBoxHead(torch.nn.Module): + """ + Generic Box Head class. + """ + + def __init__(self, cfg, in_channels): + super(ROIBoxHead, self).__init__() + self.feature_extractor = make_roi_box_feature_extractor(cfg, in_channels) + self.predictor = make_roi_box_predictor( + cfg, self.feature_extractor.out_channels) + self.post_processor = make_roi_box_post_processor(cfg) + self.loss_evaluator = make_roi_box_loss_evaluator(cfg) + + def forward(self, features, proposals, targets=None): + """ + Arguments: + features (list[Tensor]): feature-maps from possibly several levels + proposals (list[BoxList]): proposal boxes + targets (list[BoxList], optional): the ground-truth targets. + + Returns: + x (Tensor): the result of the feature extractor + proposals (list[BoxList]): during training, the subsampled proposals + are returned. During testing, the predicted boxlists are returned + losses (dict[Tensor]): During training, returns the losses for the + head. During testing, returns an empty dict. 
+ """ + + if self.training: + # Faster R-CNN subsamples during training the proposals with a fixed + # positive / negative ratio + with torch.no_grad(): + proposals = self.loss_evaluator.subsample(proposals, targets) + + # extract features that will be fed to the final classifier. The + # feature_extractor generally corresponds to the pooler + heads + x = self.feature_extractor(features, proposals) + # final classifier that converts the features into predictions + class_logits, box_regression = self.predictor(x) + + if not self.training: + result = self.post_processor((class_logits, box_regression), proposals) + return x, result, {} + + loss_classifier, loss_box_reg = self.loss_evaluator( + [class_logits], [box_regression] + ) + return ( + x, + proposals, + dict(loss_classifier=loss_classifier, loss_box_reg=loss_box_reg), + ) + + +def build_roi_box_head(cfg, in_channels): + """ + Constructs a new box head. + By default, uses ROIBoxHead, but if it turns out not to be enough, just register a new class + and make it a parameter in the config + """ + return ROIBoxHead(cfg, in_channels) diff --git a/maskrcnn_benchmark/modeling/roi_heads/box_head/inference.py b/maskrcnn_benchmark/modeling/roi_heads/box_head/inference.py new file mode 100644 index 0000000000000000000000000000000000000000..595a2e61620fbd345bc36060c43191792fc010ea --- /dev/null +++ b/maskrcnn_benchmark/modeling/roi_heads/box_head/inference.py @@ -0,0 +1,167 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +import torch +import torch.nn.functional as F +from torch import nn + +from maskrcnn_benchmark.structures.bounding_box import BoxList +from maskrcnn_benchmark.structures.boxlist_ops import boxlist_nms +from maskrcnn_benchmark.structures.boxlist_ops import cat_boxlist +from maskrcnn_benchmark.modeling.box_coder import BoxCoder + + +class PostProcessor(nn.Module): + """ + From a set of classification scores, box regression and proposals, + computes the post-processed boxes, and applies NMS to obtain the + final results + """ + + def __init__( + self, + score_thresh=0.05, + nms=0.5, + detections_per_img=100, + box_coder=None, + cls_agnostic_bbox_reg=False + ): + """ + Arguments: + score_thresh (float) + nms (float) + detections_per_img (int) + box_coder (BoxCoder) + """ + super(PostProcessor, self).__init__() + self.score_thresh = score_thresh + self.nms = nms + self.detections_per_img = detections_per_img + if box_coder is None: + box_coder = BoxCoder(weights=(10., 10., 5., 5.)) + self.box_coder = box_coder + self.cls_agnostic_bbox_reg = cls_agnostic_bbox_reg + + def forward(self, x, boxes): + """ + Arguments: + x (tuple[tensor, tensor]): x contains the class logits + and the box_regression from the model. 
+ boxes (list[BoxList]): bounding boxes that are used as + reference, one for ech image + + Returns: + results (list[BoxList]): one BoxList for each image, containing + the extra fields labels and scores + """ + class_logits, box_regression = x + class_prob = F.softmax(class_logits, -1) + + # TODO think about a representation of batch of boxes + image_shapes = [box.size for box in boxes] + boxes_per_image = [len(box) for box in boxes] + concat_boxes = torch.cat([a.bbox for a in boxes], dim=0) + + if self.cls_agnostic_bbox_reg: + box_regression = box_regression[:, -4:] + proposals = self.box_coder.decode( + box_regression.view(sum(boxes_per_image), -1), concat_boxes + ) + if self.cls_agnostic_bbox_reg: + proposals = proposals.repeat(1, class_prob.shape[1]) + + num_classes = class_prob.shape[1] + + proposals = proposals.split(boxes_per_image, dim=0) + class_prob = class_prob.split(boxes_per_image, dim=0) + + results = [] + for prob, boxes_per_img, image_shape in zip( + class_prob, proposals, image_shapes + ): + boxlist = self.prepare_boxlist(boxes_per_img, prob, image_shape) + boxlist = boxlist.clip_to_image(remove_empty=False) + boxlist = self.filter_results(boxlist, num_classes) + results.append(boxlist) + return results + + def prepare_boxlist(self, boxes, scores, image_shape): + """ + Returns BoxList from `boxes` and adds probability scores information + as an extra field + `boxes` has shape (#detections, 4 * #classes), where each row represents + a list of predicted bounding boxes for each of the object classes in the + dataset (including the background class). The detections in each row + originate from the same object proposal. + `scores` has shape (#detection, #classes), where each row represents a list + of object detection confidence scores for each of the object classes in the + dataset (including the background class). `scores[i, j]`` corresponds to the + box at `boxes[i, j * 4:(j + 1) * 4]`. + """ + boxes = boxes.reshape(-1, 4) + scores = scores.reshape(-1) + boxlist = BoxList(boxes, image_shape, mode="xyxy") + boxlist.add_field("scores", scores) + return boxlist + + def filter_results(self, boxlist, num_classes): + """Returns bounding-box detection results by thresholding on scores and + applying non-maximum suppression (NMS). + """ + # unwrap the boxlist to avoid additional overhead. 
+ # if we had multi-class NMS, we could perform this directly on the boxlist + boxes = boxlist.bbox.reshape(-1, num_classes * 4) + scores = boxlist.get_field("scores").reshape(-1, num_classes) + + device = scores.device + result = [] + # Apply threshold on detection probabilities and apply NMS + # Skip j = 0, because it's the background class + inds_all = scores > self.score_thresh + for j in range(1, num_classes): + inds = inds_all[:, j].nonzero().squeeze(1) + scores_j = scores[inds, j] + boxes_j = boxes[inds, j * 4 : (j + 1) * 4] + boxlist_for_class = BoxList(boxes_j, boxlist.size, mode="xyxy") + boxlist_for_class.add_field("scores", scores_j) + boxlist_for_class = boxlist_nms( + boxlist_for_class, self.nms + ) + num_labels = len(boxlist_for_class) + boxlist_for_class.add_field( + "labels", torch.full((num_labels,), j, dtype=torch.int64, device=device) + ) + result.append(boxlist_for_class) + + result = cat_boxlist(result) + number_of_detections = len(result) + + # Limit to max_per_image detections **over all classes** + if number_of_detections > self.detections_per_img > 0: + cls_scores = result.get_field("scores") + image_thresh, _ = torch.kthvalue( + cls_scores.cpu(), number_of_detections - self.detections_per_img + 1 + ) + keep = cls_scores >= image_thresh.item() + keep = torch.nonzero(keep).squeeze(1) + result = result[keep] + return result + + +def make_roi_box_post_processor(cfg): + use_fpn = cfg.MODEL.ROI_HEADS.USE_FPN + + bbox_reg_weights = cfg.MODEL.ROI_HEADS.BBOX_REG_WEIGHTS + box_coder = BoxCoder(weights=bbox_reg_weights) + + score_thresh = cfg.MODEL.ROI_HEADS.SCORE_THRESH + nms_thresh = cfg.MODEL.ROI_HEADS.NMS + detections_per_img = cfg.MODEL.ROI_HEADS.DETECTIONS_PER_IMG + cls_agnostic_bbox_reg = cfg.MODEL.CLS_AGNOSTIC_BBOX_REG + + postprocessor = PostProcessor( + score_thresh, + nms_thresh, + detections_per_img, + box_coder, + cls_agnostic_bbox_reg + ) + return postprocessor diff --git a/maskrcnn_benchmark/modeling/roi_heads/box_head/loss.py b/maskrcnn_benchmark/modeling/roi_heads/box_head/loss.py new file mode 100644 index 0000000000000000000000000000000000000000..9f2771d029e6d027b29e60b83d268f03628d3a14 --- /dev/null +++ b/maskrcnn_benchmark/modeling/roi_heads/box_head/loss.py @@ -0,0 +1,193 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +import torch +from torch.nn import functional as F + +from maskrcnn_benchmark.layers import smooth_l1_loss +from maskrcnn_benchmark.modeling.box_coder import BoxCoder +from maskrcnn_benchmark.modeling.matcher import Matcher +from maskrcnn_benchmark.structures.boxlist_ops import boxlist_iou +from maskrcnn_benchmark.modeling.balanced_positive_negative_sampler import ( + BalancedPositiveNegativeSampler +) +from maskrcnn_benchmark.modeling.utils import cat + + +class FastRCNNLossComputation(object): + """ + Computes the loss for Faster R-CNN. 
+ Also supports FPN + """ + + def __init__( + self, + proposal_matcher, + fg_bg_sampler, + box_coder, + cls_agnostic_bbox_reg=False + ): + """ + Arguments: + proposal_matcher (Matcher) + fg_bg_sampler (BalancedPositiveNegativeSampler) + box_coder (BoxCoder) + """ + self.proposal_matcher = proposal_matcher + self.fg_bg_sampler = fg_bg_sampler + self.box_coder = box_coder + self.cls_agnostic_bbox_reg = cls_agnostic_bbox_reg + + def match_targets_to_proposals(self, proposal, target): + match_quality_matrix = boxlist_iou(target, proposal) + matched_idxs = self.proposal_matcher(match_quality_matrix) + # Fast RCNN only need "labels" field for selecting the targets + target = target.copy_with_fields("labels") + # get the targets corresponding GT for each proposal + # NB: need to clamp the indices because we can have a single + # GT in the image, and matched_idxs can be -2, which goes + # out of bounds + matched_targets = target[matched_idxs.clamp(min=0)] + matched_targets.add_field("matched_idxs", matched_idxs) + return matched_targets + + def prepare_targets(self, proposals, targets): + labels = [] + regression_targets = [] + for proposals_per_image, targets_per_image in zip(proposals, targets): + matched_targets = self.match_targets_to_proposals( + proposals_per_image, targets_per_image + ) + matched_idxs = matched_targets.get_field("matched_idxs") + + labels_per_image = matched_targets.get_field("labels") + labels_per_image = labels_per_image.to(dtype=torch.int64) + + # Label background (below the low threshold) + bg_inds = matched_idxs == Matcher.BELOW_LOW_THRESHOLD + labels_per_image[bg_inds] = 0 + + # Label ignore proposals (between low and high thresholds) + ignore_inds = matched_idxs == Matcher.BETWEEN_THRESHOLDS + labels_per_image[ignore_inds] = -1 # -1 is ignored by sampler + + # compute regression targets + regression_targets_per_image = self.box_coder.encode( + matched_targets.bbox, proposals_per_image.bbox + ) + + labels.append(labels_per_image) + regression_targets.append(regression_targets_per_image) + + return labels, regression_targets + + def subsample(self, proposals, targets): + """ + This method performs the positive/negative sampling, and return + the sampled proposals. + Note: this function keeps a state. + + Arguments: + proposals (list[BoxList]) + targets (list[BoxList]) + """ + + labels, regression_targets = self.prepare_targets(proposals, targets) + sampled_pos_inds, sampled_neg_inds = self.fg_bg_sampler(labels) + + proposals = list(proposals) + # add corresponding label and regression_targets information to the bounding boxes + for labels_per_image, regression_targets_per_image, proposals_per_image in zip( + labels, regression_targets, proposals + ): + proposals_per_image.add_field("labels", labels_per_image) + proposals_per_image.add_field( + "regression_targets", regression_targets_per_image + ) + + # distributed sampled proposals, that were obtained on all feature maps + # concatenated via the fg_bg_sampler, into individual feature map levels + for img_idx, (pos_inds_img, neg_inds_img) in enumerate( + zip(sampled_pos_inds, sampled_neg_inds) + ): + img_sampled_inds = torch.nonzero(pos_inds_img | neg_inds_img).squeeze(1) + proposals_per_image = proposals[img_idx][img_sampled_inds] + proposals[img_idx] = proposals_per_image + + self._proposals = proposals + return proposals + + def __call__(self, class_logits, box_regression): + """ + Computes the loss for Faster R-CNN. + This requires that the subsample method has been called beforehand. 
+ + Arguments: + class_logits (list[Tensor]) + box_regression (list[Tensor]) + + Returns: + classification_loss (Tensor) + box_loss (Tensor) + """ + + class_logits = cat(class_logits, dim=0) + box_regression = cat(box_regression, dim=0) + device = class_logits.device + + if not hasattr(self, "_proposals"): + raise RuntimeError("subsample needs to be called before") + + proposals = self._proposals + + labels = cat([proposal.get_field("labels") for proposal in proposals], dim=0) + regression_targets = cat( + [proposal.get_field("regression_targets") for proposal in proposals], dim=0 + ) + + classification_loss = F.cross_entropy(class_logits, labels) + + # get indices that correspond to the regression targets for + # the corresponding ground truth labels, to be used with + # advanced indexing + sampled_pos_inds_subset = torch.nonzero(labels > 0).squeeze(1) + labels_pos = labels[sampled_pos_inds_subset] + if self.cls_agnostic_bbox_reg: + map_inds = torch.tensor([4, 5, 6, 7], device=device) + else: + map_inds = 4 * labels_pos[:, None] + torch.tensor( + [0, 1, 2, 3], device=device) + + box_loss = smooth_l1_loss( + box_regression[sampled_pos_inds_subset[:, None], map_inds], + regression_targets[sampled_pos_inds_subset], + size_average=False, + beta=1, + ) + box_loss = box_loss / labels.numel() + + return classification_loss, box_loss + + +def make_roi_box_loss_evaluator(cfg): + matcher = Matcher( + cfg.MODEL.ROI_HEADS.FG_IOU_THRESHOLD, + cfg.MODEL.ROI_HEADS.BG_IOU_THRESHOLD, + allow_low_quality_matches=False, + ) + + bbox_reg_weights = cfg.MODEL.ROI_HEADS.BBOX_REG_WEIGHTS + box_coder = BoxCoder(weights=bbox_reg_weights) + + fg_bg_sampler = BalancedPositiveNegativeSampler( + cfg.MODEL.ROI_HEADS.BATCH_SIZE_PER_IMAGE, cfg.MODEL.ROI_HEADS.POSITIVE_FRACTION + ) + + cls_agnostic_bbox_reg = cfg.MODEL.CLS_AGNOSTIC_BBOX_REG + + loss_evaluator = FastRCNNLossComputation( + matcher, + fg_bg_sampler, + box_coder, + cls_agnostic_bbox_reg + ) + + return loss_evaluator diff --git a/maskrcnn_benchmark/modeling/roi_heads/box_head/roi_box_feature_extractors.py b/maskrcnn_benchmark/modeling/roi_heads/box_head/roi_box_feature_extractors.py new file mode 100644 index 0000000000000000000000000000000000000000..e4406deedc2ce5430bf54d75868ea1a438b7bc57 --- /dev/null +++ b/maskrcnn_benchmark/modeling/roi_heads/box_head/roi_box_feature_extractors.py @@ -0,0 +1,153 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 
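In the box-head loss above, only positive samples contribute to the smooth L1 term, and the regression deltas for a sample with label j live in columns 4*j .. 4*j+3 of box_regression. A small worked illustration of that indexing, with toy values rather than real model outputs:

# Illustration of the class-specific regression indexing in
# FastRCNNLossComputation.__call__: label j selects columns 4*j .. 4*j+3.
import torch

num_classes = 3                          # background + 2 foreground classes
box_regression = torch.arange(2 * num_classes * 4, dtype=torch.float32).view(2, -1)
labels_pos = torch.tensor([1, 2])        # labels of the two positive samples

map_inds = 4 * labels_pos[:, None] + torch.tensor([0, 1, 2, 3])
rows = torch.arange(len(labels_pos))[:, None]
print(map_inds)
# tensor([[ 4,  5,  6,  7],
#         [ 8,  9, 10, 11]])
print(box_regression[rows, map_inds])    # the per-class deltas fed to smooth_l1_loss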
+import torch +from torch import nn +from torch.nn import functional as F + +from maskrcnn_benchmark.modeling import registry +from maskrcnn_benchmark.modeling.backbone import resnet +from maskrcnn_benchmark.modeling.poolers import Pooler +from maskrcnn_benchmark.modeling.make_layers import group_norm +from maskrcnn_benchmark.modeling.make_layers import make_fc + + +@registry.ROI_BOX_FEATURE_EXTRACTORS.register("ResNet50Conv5ROIFeatureExtractor") +class ResNet50Conv5ROIFeatureExtractor(nn.Module): + def __init__(self, config, in_channels): + super(ResNet50Conv5ROIFeatureExtractor, self).__init__() + + resolution = config.MODEL.ROI_BOX_HEAD.POOLER_RESOLUTION + scales = config.MODEL.ROI_BOX_HEAD.POOLER_SCALES + sampling_ratio = config.MODEL.ROI_BOX_HEAD.POOLER_SAMPLING_RATIO + pooler = Pooler( + output_size=(resolution, resolution), + scales=scales, + sampling_ratio=sampling_ratio, + deformable=_C.MODEL.ROI_BOX_HEAD.DEFORMABLE_POOLING + ) + + stage = resnet.StageSpec(index=4, block_count=3, return_features=False) + head = resnet.ResNetHead( + block_module=config.MODEL.RESNETS.TRANS_FUNC, + stages=(stage,), + num_groups=config.MODEL.RESNETS.NUM_GROUPS, + width_per_group=config.MODEL.RESNETS.WIDTH_PER_GROUP, + stride_in_1x1=config.MODEL.RESNETS.STRIDE_IN_1X1, + stride_init=None, + res2_out_channels=config.MODEL.RESNETS.RES2_OUT_CHANNELS, + dilation=config.MODEL.RESNETS.RES5_DILATION + ) + + self.pooler = pooler + self.head = head + self.out_channels = head.out_channels + + def forward(self, x, proposals): + x = self.pooler(x, proposals) + x = self.head(x) + return x + + +@registry.ROI_BOX_FEATURE_EXTRACTORS.register("FPN2MLPFeatureExtractor") +class FPN2MLPFeatureExtractor(nn.Module): + """ + Heads for FPN for classification + """ + + def __init__(self, cfg, in_channels): + super(FPN2MLPFeatureExtractor, self).__init__() + + resolution = cfg.MODEL.ROI_BOX_HEAD.POOLER_RESOLUTION + scales = cfg.MODEL.ROI_BOX_HEAD.POOLER_SCALES + sampling_ratio = cfg.MODEL.ROI_BOX_HEAD.POOLER_SAMPLING_RATIO + pooler = Pooler( + output_size=(resolution, resolution), + scales=scales, + sampling_ratio=sampling_ratio, + deformable=cfg.MODEL.RESNETS.DEFORM_POOLING + ) + input_size = in_channels * resolution ** 2 + representation_size = cfg.MODEL.ROI_BOX_HEAD.MLP_HEAD_DIM + use_gn = cfg.MODEL.ROI_BOX_HEAD.USE_GN + self.pooler = pooler + self.fc6 = make_fc(input_size, representation_size, use_gn) + self.fc7 = make_fc(representation_size, representation_size, use_gn) + self.out_channels = representation_size + + def forward(self, x, proposals): + x = self.pooler(x, proposals) + x = x.view(x.size(0), -1) + + x = F.relu(self.fc6(x)) + x = F.relu(self.fc7(x)) + + return x + + +@registry.ROI_BOX_FEATURE_EXTRACTORS.register("FPNXconv1fcFeatureExtractor") +class FPNXconv1fcFeatureExtractor(nn.Module): + """ + Heads for FPN for classification + """ + + def __init__(self, cfg, in_channels): + super(FPNXconv1fcFeatureExtractor, self).__init__() + + resolution = cfg.MODEL.ROI_BOX_HEAD.POOLER_RESOLUTION + scales = cfg.MODEL.ROI_BOX_HEAD.POOLER_SCALES + sampling_ratio = cfg.MODEL.ROI_BOX_HEAD.POOLER_SAMPLING_RATIO + pooler = Pooler( + output_size=(resolution, resolution), + scales=scales, + sampling_ratio=sampling_ratio, + ) + self.pooler = pooler + + use_gn = cfg.MODEL.ROI_BOX_HEAD.USE_GN + conv_head_dim = cfg.MODEL.ROI_BOX_HEAD.CONV_HEAD_DIM + num_stacked_convs = cfg.MODEL.ROI_BOX_HEAD.NUM_STACKED_CONVS + dilation = cfg.MODEL.ROI_BOX_HEAD.DILATION + + xconvs = [] + for ix in range(num_stacked_convs): + xconvs.append( + nn.Conv2d( 
+ in_channels, + conv_head_dim, + kernel_size=3, + stride=1, + padding=dilation, + dilation=dilation, + bias=False if use_gn else True + ) + ) + in_channels = conv_head_dim + if use_gn: + xconvs.append(group_norm(in_channels)) + xconvs.append(nn.ReLU(inplace=True)) + + self.add_module("xconvs", nn.Sequential(*xconvs)) + for modules in [self.xconvs,]: + for l in modules.modules(): + if isinstance(l, nn.Conv2d): + torch.nn.init.normal_(l.weight, std=0.01) + if not use_gn: + torch.nn.init.constant_(l.bias, 0) + + input_size = conv_head_dim * resolution ** 2 + representation_size = cfg.MODEL.ROI_BOX_HEAD.MLP_HEAD_DIM + self.fc6 = make_fc(input_size, representation_size, use_gn=False) + self.out_channels = representation_size + + def forward(self, x, proposals): + x = self.pooler(x, proposals) + x = self.xconvs(x) + x = x.view(x.size(0), -1) + x = F.relu(self.fc6(x)) + return x + + +def make_roi_box_feature_extractor(cfg, in_channels): + func = registry.ROI_BOX_FEATURE_EXTRACTORS[ + cfg.MODEL.ROI_BOX_HEAD.FEATURE_EXTRACTOR + ] + return func(cfg, in_channels) diff --git a/maskrcnn_benchmark/modeling/roi_heads/box_head/roi_box_predictors.py b/maskrcnn_benchmark/modeling/roi_heads/box_head/roi_box_predictors.py new file mode 100644 index 0000000000000000000000000000000000000000..66ee4ace585cff5ea2933553d3e800f03757eba9 --- /dev/null +++ b/maskrcnn_benchmark/modeling/roi_heads/box_head/roi_box_predictors.py @@ -0,0 +1,62 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +from maskrcnn_benchmark.modeling import registry +from torch import nn + + +@registry.ROI_BOX_PREDICTOR.register("FastRCNNPredictor") +class FastRCNNPredictor(nn.Module): + def __init__(self, config, in_channels): + super(FastRCNNPredictor, self).__init__() + assert in_channels is not None + + num_inputs = in_channels + + num_classes = config.MODEL.ROI_BOX_HEAD.NUM_CLASSES + self.avgpool = nn.AdaptiveAvgPool2d(1) + self.cls_score = nn.Linear(num_inputs, num_classes) + num_bbox_reg_classes = 2 if config.MODEL.CLS_AGNOSTIC_BBOX_REG else num_classes + self.bbox_pred = nn.Linear(num_inputs, num_bbox_reg_classes * 4) + + nn.init.normal_(self.cls_score.weight, mean=0, std=0.01) + nn.init.constant_(self.cls_score.bias, 0) + + nn.init.normal_(self.bbox_pred.weight, mean=0, std=0.001) + nn.init.constant_(self.bbox_pred.bias, 0) + + def forward(self, x): + x = self.avgpool(x) + x = x.view(x.size(0), -1) + cls_logit = self.cls_score(x) + bbox_pred = self.bbox_pred(x) + return cls_logit, bbox_pred + + +@registry.ROI_BOX_PREDICTOR.register("FPNPredictor") +class FPNPredictor(nn.Module): + def __init__(self, cfg, in_channels): + super(FPNPredictor, self).__init__() + num_classes = cfg.MODEL.ROI_BOX_HEAD.NUM_CLASSES + representation_size = in_channels + + self.cls_score = nn.Linear(representation_size, num_classes) + num_bbox_reg_classes = 2 if cfg.MODEL.CLS_AGNOSTIC_BBOX_REG else num_classes + self.bbox_pred = nn.Linear(representation_size, num_bbox_reg_classes * 4) + + nn.init.normal_(self.cls_score.weight, std=0.01) + nn.init.normal_(self.bbox_pred.weight, std=0.001) + for l in [self.cls_score, self.bbox_pred]: + nn.init.constant_(l.bias, 0) + + def forward(self, x): + if x.ndimension() == 4: + assert list(x.shape[2:]) == [1, 1] + x = x.view(x.size(0), -1) + scores = self.cls_score(x) + bbox_deltas = self.bbox_pred(x) + + return scores, bbox_deltas + + +def make_roi_box_predictor(cfg, in_channels): + func = registry.ROI_BOX_PREDICTOR[cfg.MODEL.ROI_BOX_HEAD.PREDICTOR] + return func(cfg, in_channels) diff --git 
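make_roi_box_feature_extractor above resolves which extractor class to build from a config string through a registry. The following is a toy version of that lookup pattern; the names here are hypothetical and simplified, not the real maskrcnn_benchmark registry API.

# Toy registry lookup: map a config string to a class and instantiate it.
ROI_BOX_FEATURE_EXTRACTORS = {}

def register(name):
    def wrapper(cls):
        ROI_BOX_FEATURE_EXTRACTORS[name] = cls
        return cls
    return wrapper

@register("FPN2MLPFeatureExtractor")
class ToyFPN2MLPFeatureExtractor:
    def __init__(self, cfg, in_channels):
        self.in_channels = in_channels

def make_extractor(name, cfg=None, in_channels=256):
    return ROI_BOX_FEATURE_EXTRACTORS[name](cfg, in_channels)

print(type(make_extractor("FPN2MLPFeatureExtractor")).__name__)
# ToyFPN2MLPFeatureExtractor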
a/maskrcnn_benchmark/modeling/roi_heads/roi_heads.py b/maskrcnn_benchmark/modeling/roi_heads/roi_heads.py new file mode 100644 index 0000000000000000000000000000000000000000..614a36203c95ffc5d01373d9bdf50b1c11c9790d --- /dev/null +++ b/maskrcnn_benchmark/modeling/roi_heads/roi_heads.py @@ -0,0 +1,78 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +import torch + +from .box_head.box_head import build_roi_box_head +from .boundary_head.boundary_head import build_roi_boundary_head +class CombinedROIHeads(torch.nn.ModuleDict): + """ + Combines a set of individual heads (for box prediction or masks) into a single + head. + """ + + def __init__(self, cfg, heads): + super(CombinedROIHeads, self).__init__(heads) + self.cfg = cfg.clone() + if cfg.MODEL.BOUNDARY_ON and cfg.MODEL.ROI_BOUNDARY_HEAD.SHARE_BOX_FEATURE_EXTRACTOR: + self.mask.feature_extractor = self.box.feature_extractor + self.bo.feature_extractor = self.bo.feature_extractor + + def forward(self, features, proposals, targets=None, prefix=''): + """ + prefix (str): Some model may use auxiliary heads which don't share rpn, + use this to separate the loss names + """ + losses = {} + # TODO rename x to roi_box_features, if it doesn't increase memory consumption + x, detections, loss_box = self.box(features, proposals, targets) + losses.update(loss_box) + if self.cfg.MODEL.MASK_ON: + mask_features = features + # optimization: during training, if we share the feature extractor between + # the box and the mask heads, then we can reuse the features already computed + if ( + self.training + and self.cfg.MODEL.ROI_MASK_HEAD.SHARE_BOX_FEATURE_EXTRACTOR + ): + mask_features = x + # During training, self.box() will return the unaltered proposals as "detections" + # this makes the API consistent during training and testing + + x, detections, loss_mask = self.mask(mask_features, detections, targets) + + losses.update(loss_mask) + + if self.cfg.MODEL.BOUNDARY_ON: + bo_features = features + if ( + self.training + and self.cfg.MODEL.ROI_BOUNDARY_HEAD.SHARE_BOX_FEATURE_EXTRACTOR + ): + bo_features = x + x, detections, loss_bo, loss_bo_x, loss_bo_y = self.bound(bo_features, detections, targets) + losses.update(loss_bo) + losses.update(loss_bo_x) + losses.update(loss_bo_y) + + losses = {prefix + k: losses[k] for k in losses} + + + + return x, detections, losses + + +def build_roi_heads(cfg, in_channels): + # individually create the heads, that will be combined together + # afterwards + roi_heads = [] + if cfg.MODEL.RETINANET_ON: + return [] + + if not cfg.MODEL.RPN_ONLY: + roi_heads.append(("box", build_roi_box_head(cfg, in_channels))) + if cfg.MODEL.BOUNDARY_ON: + roi_heads.append(("bound", build_roi_boundary_head(cfg, in_channels))) + # combine individual heads in a single module + if roi_heads: + roi_heads = CombinedROIHeads(cfg, roi_heads) + + return roi_heads diff --git a/maskrcnn_benchmark/modeling/rpn/__init__.py b/maskrcnn_benchmark/modeling/rpn/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..b01f30cfddd8ed97d5a39f55641fbc929297d885 --- /dev/null +++ b/maskrcnn_benchmark/modeling/rpn/__init__.py @@ -0,0 +1,2 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 
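roi_heads.py above chains the heads: the box head runs first, its detections feed the boundary head, and every loss dict is merged under an optional prefix. A schematic of that forward flow, with toy callables standing in for the real heads (not the actual module code):

# Schematic of CombinedROIHeads.forward: box head -> boundary head -> merged losses.
def box_head(features, proposals, targets=None):
    return "box_feats", proposals, {"loss_classifier": 0.7, "loss_box_reg": 0.3}

def boundary_head(features, detections, targets=None):
    return "bo_feats", detections, {"loss_bo": 0.2}, {"loss_bo_x": 0.1}, {"loss_bo_y": 0.1}

def combined_forward(features, proposals, targets=None, prefix=""):
    losses = {}
    x, detections, loss_box = box_head(features, proposals, targets)
    losses.update(loss_box)
    x, detections, loss_bo, loss_bo_x, loss_bo_y = boundary_head(features, detections, targets)
    losses.update(loss_bo)
    losses.update(loss_bo_x)
    losses.update(loss_bo_y)
    return x, detections, {prefix + k: v for k, v in losses.items()}

print(combined_forward(None, ["p0", "p1"], prefix="aux_")[2])
# {'aux_loss_classifier': 0.7, 'aux_loss_box_reg': 0.3, 'aux_loss_bo': 0.2, ...}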
+# from .rpn import build_rpn diff --git a/maskrcnn_benchmark/modeling/rpn/__pycache__/__init__.cpython-37.pyc b/maskrcnn_benchmark/modeling/rpn/__pycache__/__init__.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..1f8adc17c922e0e094f9d6993ec5ad10f71dad47 Binary files /dev/null and b/maskrcnn_benchmark/modeling/rpn/__pycache__/__init__.cpython-37.pyc differ diff --git a/maskrcnn_benchmark/modeling/rpn/__pycache__/anchor_generator.cpython-37.pyc b/maskrcnn_benchmark/modeling/rpn/__pycache__/anchor_generator.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ec46ba8427ecaa1225303687066c180588f7dd2a Binary files /dev/null and b/maskrcnn_benchmark/modeling/rpn/__pycache__/anchor_generator.cpython-37.pyc differ diff --git a/maskrcnn_benchmark/modeling/rpn/__pycache__/inference.cpython-37.pyc b/maskrcnn_benchmark/modeling/rpn/__pycache__/inference.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..dd3fb12fe494f0e7164921545b8c7cfffb1dde18 Binary files /dev/null and b/maskrcnn_benchmark/modeling/rpn/__pycache__/inference.cpython-37.pyc differ diff --git a/maskrcnn_benchmark/modeling/rpn/__pycache__/loss.cpython-37.pyc b/maskrcnn_benchmark/modeling/rpn/__pycache__/loss.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..7fb46be527bd179c65b6fc9a07877d941c2854c8 Binary files /dev/null and b/maskrcnn_benchmark/modeling/rpn/__pycache__/loss.cpython-37.pyc differ diff --git a/maskrcnn_benchmark/modeling/rpn/__pycache__/rpn.cpython-37.pyc b/maskrcnn_benchmark/modeling/rpn/__pycache__/rpn.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..40623a7cc1a203c6aefac289709d3a40dec55115 Binary files /dev/null and b/maskrcnn_benchmark/modeling/rpn/__pycache__/rpn.cpython-37.pyc differ diff --git a/maskrcnn_benchmark/modeling/rpn/__pycache__/utils.cpython-37.pyc b/maskrcnn_benchmark/modeling/rpn/__pycache__/utils.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..16d829a945683875f785401af5537d39b697662c Binary files /dev/null and b/maskrcnn_benchmark/modeling/rpn/__pycache__/utils.cpython-37.pyc differ diff --git a/maskrcnn_benchmark/modeling/rpn/anchor_generator.py b/maskrcnn_benchmark/modeling/rpn/anchor_generator.py new file mode 100644 index 0000000000000000000000000000000000000000..5314d0c9d0bf449cb8ee1ca3eea0385cb7c2a8e5 --- /dev/null +++ b/maskrcnn_benchmark/modeling/rpn/anchor_generator.py @@ -0,0 +1,292 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 
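The anchor generator that follows tiles a small set of base "cell" anchors over every feature-map location by adding per-cell (x, y) shifts spaced one stride apart. A minimal sketch of that shifting step for a single FPN level, with toy base anchors (illustrative only):

# Minimal sketch of grid_anchors for one level: shift base anchors to every cell.
import torch

def shift_anchors(base_anchors, grid_h, grid_w, stride):
    shifts_x = torch.arange(0, grid_w * stride, step=stride, dtype=torch.float32)
    shifts_y = torch.arange(0, grid_h * stride, step=stride, dtype=torch.float32)
    shift_y, shift_x = torch.meshgrid(shifts_y, shifts_x)
    shifts = torch.stack((shift_x.reshape(-1), shift_y.reshape(-1),
                          shift_x.reshape(-1), shift_y.reshape(-1)), dim=1)
    # (H*W, 1, 4) + (1, A, 4) -> (H*W*A, 4) anchors in xyxy format
    return (shifts.view(-1, 1, 4) + base_anchors.view(1, -1, 4)).reshape(-1, 4)

base = torch.tensor([[-8., -8., 8., 8.], [-16., -8., 16., 8.]])  # 2 toy cell anchors
print(shift_anchors(base, grid_h=2, grid_w=3, stride=16).shape)   # torch.Size([12, 4])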
+import math + +import numpy as np +import torch +from torch import nn + +from maskrcnn_benchmark.structures.bounding_box import BoxList + + +class BufferList(nn.Module): + """ + Similar to nn.ParameterList, but for buffers + """ + + def __init__(self, buffers=None): + super(BufferList, self).__init__() + if buffers is not None: + self.extend(buffers) + + def extend(self, buffers): + offset = len(self) + for i, buffer in enumerate(buffers): + self.register_buffer(str(offset + i), buffer) + return self + + def __len__(self): + return len(self._buffers) + + def __iter__(self): + return iter(self._buffers.values()) + + +class AnchorGenerator(nn.Module): + """ + For a set of image sizes and feature maps, computes a set + of anchors + """ + def __init__( + self, + sizes=(128, 256, 512), # 32, 64, 128, 256, 512 + aspect_ratios=(0.5, 1.0, 2.0), # 0.25, 0.5, 1.0, 2.0, 4.0 + anchor_strides=(8, 16, 32), # 4, 8, 16, 32, 64 + straddle_thresh=0, # 0 + ): + super(AnchorGenerator, self).__init__() + + if len(anchor_strides) == 1: + anchor_stride = anchor_strides[0] + cell_anchors = [ + generate_anchors(anchor_stride, sizes, aspect_ratios).float() + ] + else: + + # This step is done + + if len(anchor_strides) != len(sizes): + raise RuntimeError("FPN should have #anchor_strides == #sizes") + + cell_anchors = [ + generate_anchors( + anchor_stride, + size if isinstance(size, (tuple, list)) else (size,), + aspect_ratios + ).float() + for anchor_stride, size in zip(anchor_strides, sizes) + ] + self.strides = anchor_strides + self.cell_anchors = BufferList(cell_anchors) + self.straddle_thresh = straddle_thresh + + def num_anchors_per_location(self): + return [len(cell_anchors) for cell_anchors in self.cell_anchors] + + def grid_anchors(self, grid_sizes): + anchors = [] + for size, stride, base_anchors in zip( + grid_sizes, self.strides, self.cell_anchors + ): + grid_height, grid_width = size + device = base_anchors.device + shifts_x = torch.arange( + 0, grid_width * stride, step=stride, dtype=torch.float32, device=device + ) + + shifts_y = torch.arange( + 0, grid_height * stride, step=stride, dtype=torch.float32, device=device + ) + shift_y, shift_x = torch.meshgrid(shifts_y, shifts_x) + shift_x = shift_x.reshape(-1) + shift_y = shift_y.reshape(-1) + shifts = torch.stack((shift_x, shift_y, shift_x, shift_y), dim=1) + + anchors.append( + (shifts.view(-1, 1, 4) + base_anchors.view(1, -1, 4)).reshape(-1, 4) + ) + + return anchors + + def add_visibility_to(self, boxlist): + image_width, image_height = boxlist.size + anchors = boxlist.bbox + if self.straddle_thresh >= 0: + inds_inside = ( + (anchors[..., 0] >= -self.straddle_thresh) + & (anchors[..., 1] >= -self.straddle_thresh) + & (anchors[..., 2] < image_width + self.straddle_thresh) + & (anchors[..., 3] < image_height + self.straddle_thresh) + ) + else: + device = anchors.device + inds_inside = torch.ones(anchors.shape[0], dtype=torch.uint8, device=device) + boxlist.add_field("visibility", inds_inside) + + def forward(self, image_list, feature_maps): + grid_sizes = [feature_map.shape[-2:] for feature_map in feature_maps] # size of features + anchors_over_all_feature_maps = self.grid_anchors(grid_sizes) + anchors = [] + for i, (image_height, image_width) in enumerate(image_list.image_sizes): + anchors_in_image = [] + for anchors_per_feature_map in anchors_over_all_feature_maps: + boxlist = BoxList( + anchors_per_feature_map, (image_width, image_height), mode="xyxy" + ) + self.add_visibility_to(boxlist) + anchors_in_image.append(boxlist) + 
anchors.append(anchors_in_image) + return anchors # [image,number,[n,4]] + + +def make_anchor_generator(config): + anchor_sizes = config.MODEL.RPN.ANCHOR_SIZES # 32, 64, 128, 256, 512 + aspect_ratios = config.MODEL.RPN.ASPECT_RATIOS # 0.25, 0.5, 1.0, 2.0, 4.0 + anchor_stride = config.MODEL.RPN.ANCHOR_STRIDE # 4, 8, 16, 32, 64 + straddle_thresh = config.MODEL.RPN.STRADDLE_THRESH #0 + + if config.MODEL.RPN.USE_FPN: # This step is done + assert len(anchor_stride) == len( + anchor_sizes + ), "FPN should have len(ANCHOR_STRIDE) == len(ANCHOR_SIZES)" + else: + assert len(anchor_stride) == 1, "Non-FPN should have a single ANCHOR_STRIDE" + anchor_generator = AnchorGenerator( + anchor_sizes, aspect_ratios, anchor_stride, straddle_thresh + ) + return anchor_generator + + +def make_anchor_generator_retinanet(config): + anchor_sizes = config.MODEL.RETINANET.ANCHOR_SIZES + aspect_ratios = config.MODEL.RETINANET.ASPECT_RATIOS + anchor_strides = config.MODEL.RETINANET.ANCHOR_STRIDES + straddle_thresh = config.MODEL.RETINANET.STRADDLE_THRESH + octave = config.MODEL.RETINANET.OCTAVE + scales_per_octave = config.MODEL.RETINANET.SCALES_PER_OCTAVE + + assert len(anchor_strides) == len(anchor_sizes), "Only support FPN now" + new_anchor_sizes = [] + for size in anchor_sizes: + per_layer_anchor_sizes = [] + for scale_per_octave in range(scales_per_octave): + octave_scale = octave ** (scale_per_octave / float(scales_per_octave)) + per_layer_anchor_sizes.append(octave_scale * size) + new_anchor_sizes.append(tuple(per_layer_anchor_sizes)) + + anchor_generator = AnchorGenerator( + tuple(new_anchor_sizes), aspect_ratios, anchor_strides, straddle_thresh + ) + return anchor_generator + +# Copyright (c) 2017-present, Facebook, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +############################################################################## +# +# Based on: +# -------------------------------------------------------- +# Faster R-CNN +# Copyright (c) 2015 Microsoft +# Licensed under The MIT License [see LICENSE for details] +# Written by Ross Girshick and Sean Bell +# -------------------------------------------------------- + + +# Verify that we compute the same anchors as Shaoqing's matlab implementation: +# +# >> load output/rpn_cachedir/faster_rcnn_VOC2007_ZF_stage1_rpn/anchors.mat +# >> anchors +# +# anchors = +# +# -83 -39 100 56 +# -175 -87 192 104 +# -359 -183 376 200 +# -55 -55 72 72 +# -119 -119 136 136 +# -247 -247 264 264 +# -35 -79 52 96 +# -79 -167 96 184 +# -167 -343 184 360 + +# array([[ -83., -39., 100., 56.], +# [-175., -87., 192., 104.], +# [-359., -183., 376., 200.], +# [ -55., -55., 72., 72.], +# [-119., -119., 136., 136.], +# [-247., -247., 264., 264.], +# [ -35., -79., 52., 96.], +# [ -79., -167., 96., 184.], +# [-167., -343., 184., 360.]]) + + +def generate_anchors( + stride=16, sizes=(32, 64, 128, 256, 512), aspect_ratios=(0.5, 1, 2) +): + """Generates a matrix of anchor boxes in (x1, y1, x2, y2) format. 
Anchors + are centered on stride / 2, have (approximate) sqrt areas of the specified + sizes, and aspect ratios as given. + """ + return _generate_anchors( + stride, + np.array(sizes, dtype=np.float) / stride, + np.array(aspect_ratios, dtype=np.float), + ) + + +def _generate_anchors(base_size, scales, aspect_ratios): + """Generate anchor (reference) windows by enumerating aspect ratios X + scales wrt a reference (0, 0, base_size - 1, base_size - 1) window. + """ + anchor = np.array([1, 1, base_size, base_size], dtype=np.float) - 1 + anchors = _ratio_enum(anchor, aspect_ratios) + anchors = np.vstack( + [_scale_enum(anchors[i, :], scales) for i in range(anchors.shape[0])] + ) + return torch.from_numpy(anchors) + + +def _whctrs(anchor): + """Return width, height, x center, and y center for an anchor (window).""" + w = anchor[2] - anchor[0] + 1 + h = anchor[3] - anchor[1] + 1 + x_ctr = anchor[0] + 0.5 * (w - 1) + y_ctr = anchor[1] + 0.5 * (h - 1) + return w, h, x_ctr, y_ctr + + +def _mkanchors(ws, hs, x_ctr, y_ctr): + """Given a vector of widths (ws) and heights (hs) around a center + (x_ctr, y_ctr), output a set of anchors (windows). + """ + ws = ws[:, np.newaxis] + hs = hs[:, np.newaxis] + anchors = np.hstack( + ( + x_ctr - 0.5 * (ws - 1), + y_ctr - 0.5 * (hs - 1), + x_ctr + 0.5 * (ws - 1), + y_ctr + 0.5 * (hs - 1), + ) + ) + return anchors + + +def _ratio_enum(anchor, ratios): + """Enumerate a set of anchors for each aspect ratio wrt an anchor.""" + w, h, x_ctr, y_ctr = _whctrs(anchor) + size = w * h + size_ratios = size / ratios + ws = np.round(np.sqrt(size_ratios)) + hs = np.round(ws * ratios) + anchors = _mkanchors(ws, hs, x_ctr, y_ctr) + return anchors + + +def _scale_enum(anchor, scales): + """Enumerate a set of anchors for each scale wrt an anchor.""" + w, h, x_ctr, y_ctr = _whctrs(anchor) + ws = w * scales + hs = h * scales + anchors = _mkanchors(ws, hs, x_ctr, y_ctr) + return anchors diff --git a/maskrcnn_benchmark/modeling/rpn/fcos/__init__.py b/maskrcnn_benchmark/modeling/rpn/fcos/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/maskrcnn_benchmark/modeling/rpn/fcos/__pycache__/__init__.cpython-37.pyc b/maskrcnn_benchmark/modeling/rpn/fcos/__pycache__/__init__.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..4ac106b0dc0a110a88c4c5bb96c6b2a32b01ebe7 Binary files /dev/null and b/maskrcnn_benchmark/modeling/rpn/fcos/__pycache__/__init__.cpython-37.pyc differ diff --git a/maskrcnn_benchmark/modeling/rpn/fcos/__pycache__/fcos.cpython-37.pyc b/maskrcnn_benchmark/modeling/rpn/fcos/__pycache__/fcos.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..6d1a79212b4451708988487c84fafada2b80b9ae Binary files /dev/null and b/maskrcnn_benchmark/modeling/rpn/fcos/__pycache__/fcos.cpython-37.pyc differ diff --git a/maskrcnn_benchmark/modeling/rpn/fcos/__pycache__/inference.cpython-37.pyc b/maskrcnn_benchmark/modeling/rpn/fcos/__pycache__/inference.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d28c4630201020e2365ff585ea67db29907af0e2 Binary files /dev/null and b/maskrcnn_benchmark/modeling/rpn/fcos/__pycache__/inference.cpython-37.pyc differ diff --git a/maskrcnn_benchmark/modeling/rpn/fcos/__pycache__/loss.cpython-37.pyc b/maskrcnn_benchmark/modeling/rpn/fcos/__pycache__/loss.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b39c77d13f4b149176a594c16cfb431c5ef4ae1a Binary files 
/dev/null and b/maskrcnn_benchmark/modeling/rpn/fcos/__pycache__/loss.cpython-37.pyc differ diff --git a/maskrcnn_benchmark/modeling/rpn/fcos/fcos.py b/maskrcnn_benchmark/modeling/rpn/fcos/fcos.py new file mode 100644 index 0000000000000000000000000000000000000000..0492218ece4a437392a049fcb389f3aaf38ef11f --- /dev/null +++ b/maskrcnn_benchmark/modeling/rpn/fcos/fcos.py @@ -0,0 +1,199 @@ +import math +import torch +import torch.nn.functional as F +from torch import nn + +from .inference import make_fcos_postprocessor +from .loss import make_fcos_loss_evaluator + +from maskrcnn_benchmark.layers import Scale + + +class FCOSHead(torch.nn.Module): + def __init__(self, cfg, in_channels): + """ + Arguments: + in_channels (int): number of channels of the input feature + """ + super(FCOSHead, self).__init__() + # TODO: Implement the sigmoid version first. + num_classes = cfg.MODEL.FCOS.NUM_CLASSES - 1 + + cls_tower = [] + bbox_tower = [] + for i in range(cfg.MODEL.FCOS.NUM_CONVS): + cls_tower.append( + nn.Conv2d( + in_channels, + in_channels, + kernel_size=3, + stride=1, + padding=1 + ) + ) + cls_tower.append(nn.GroupNorm(32, in_channels)) + cls_tower.append(nn.ReLU()) + bbox_tower.append( + nn.Conv2d( + in_channels, + in_channels, + kernel_size=3, + stride=1, + padding=1 + ) + ) + bbox_tower.append(nn.GroupNorm(32, in_channels)) + bbox_tower.append(nn.ReLU()) + + self.add_module('cls_tower', nn.Sequential(*cls_tower)) + self.add_module('bbox_tower', nn.Sequential(*bbox_tower)) + self.cls_logits = nn.Conv2d( + in_channels, num_classes, kernel_size=3, stride=1, + padding=1 + ) + self.bbox_pred = nn.Conv2d( + in_channels, 4, kernel_size=3, stride=1, + padding=1 + ) + self.centerness = nn.Conv2d( + in_channels, 1, kernel_size=3, stride=1, + padding=1 + ) + + # initialization + for modules in [self.cls_tower, self.bbox_tower, + self.cls_logits, self.bbox_pred, + self.centerness]: + for l in modules.modules(): + if isinstance(l, nn.Conv2d): + torch.nn.init.normal_(l.weight, std=0.01) + torch.nn.init.constant_(l.bias, 0) + + # initialize the bias for focal loss + prior_prob = cfg.MODEL.FCOS.PRIOR_PROB + bias_value = -math.log((1 - prior_prob) / prior_prob) + torch.nn.init.constant_(self.cls_logits.bias, bias_value) + + self.scales = nn.ModuleList([Scale(init_value=1.0) for _ in range(5)]) + + def forward(self, x): + logits = [] + bbox_reg = [] + centerness = [] + for l, feature in enumerate(x): + cls_tower = self.cls_tower(feature) + logits.append(self.cls_logits(cls_tower)) + centerness.append(self.centerness(cls_tower)) + bbox_reg.append(torch.exp(self.scales[l]( + self.bbox_pred(self.bbox_tower(feature)) + ))) + return logits, bbox_reg, centerness + + +class FCOSModule(torch.nn.Module): + """ + Module for FCOS computation. Takes feature maps from the backbone and + FCOS outputs and losses. Only Test on FPN now. 
+ """ + + def __init__(self, cfg, in_channels): + super(FCOSModule, self).__init__() + + self.cfg = cfg.clone() + + head = FCOSHead(cfg, in_channels) + + box_selector_train = make_fcos_postprocessor(cfg, is_train=True) + box_selector_test = make_fcos_postprocessor(cfg) + + loss_evaluator = make_fcos_loss_evaluator(cfg) + self.head = head + self.box_selector_train = box_selector_train + self.box_selector_test = box_selector_test + self.loss_evaluator = loss_evaluator + self.fpn_strides = cfg.MODEL.FCOS.FPN_STRIDES + + def forward(self, images, features, targets=None): + """ + Arguments: + images (ImageList): images for which we want to compute the predictions + features (list[Tensor]): features computed from the images that are + used for computing the predictions. Each tensor in the list + correspond to different feature levels + targets (list[BoxList): ground-truth boxes present in the image (optional) + + Returns: + boxes (list[BoxList]): the predicted boxes from the RPN, one BoxList per + image. + losses (dict[Tensor]): the losses for the model during training. During + testing, it is an empty dict. + """ + box_cls, box_regression, centerness = self.head(features) + locations = self.compute_locations(features) + + if self.training: + return self._forward_train( + locations, box_cls, + box_regression, + centerness, targets, images.image_sizes + ) + else: + return self._forward_test( + locations, box_cls, box_regression, + centerness, images.image_sizes + ) + + def _forward_train(self, locations, box_cls, box_regression, + centerness, targets, image_sizes): + loss_box_cls, loss_box_reg, loss_centerness = self.loss_evaluator( + locations, box_cls, box_regression, centerness, targets + ) + if self.cfg.MODEL.RPN_ONLY: + boxes = None + else: + with torch.no_grad(): + boxes = self.box_selector_train( + locations, box_cls, box_regression, + centerness, image_sizes) + losses = { + "loss_cls": loss_box_cls, + "loss_reg": loss_box_reg, + "loss_centerness": loss_centerness + } + return boxes, losses + + def _forward_test(self, locations, box_cls, box_regression, centerness, image_sizes): + boxes = self.box_selector_test( + locations, box_cls, box_regression, + centerness, image_sizes + ) + return boxes, {} + + def compute_locations(self, features): + locations = [] + for level, feature in enumerate(features): + h, w = feature.size()[-2:] + locations_per_level = self.compute_locations_per_level( + h, w, self.fpn_strides[level], + feature.device + ) + locations.append(locations_per_level) + return locations + + def compute_locations_per_level(self, h, w, stride, device): + shifts_x = torch.arange( + 0, w * stride, step=stride, + dtype=torch.float32, device=device + ) + shifts_y = torch.arange( + 0, h * stride, step=stride, + dtype=torch.float32, device=device + ) + shift_y, shift_x = torch.meshgrid(shifts_y, shifts_x) + shift_x = shift_x.reshape(-1) + shift_y = shift_y.reshape(-1) + locations = torch.stack((shift_x, shift_y), dim=1) + stride // 2 + return locations + +def build_fcos(cfg, in_channels): + return FCOSModule(cfg, in_channels) diff --git a/maskrcnn_benchmark/modeling/rpn/fcos/inference.py b/maskrcnn_benchmark/modeling/rpn/fcos/inference.py new file mode 100644 index 0000000000000000000000000000000000000000..72e5bf911142367d03fcb2ea4dc6d8fd17004dc8 --- /dev/null +++ b/maskrcnn_benchmark/modeling/rpn/fcos/inference.py @@ -0,0 +1,209 @@ +import torch + +from ..inference import RPNPostProcessor +from ..utils import permute_and_flatten + +from maskrcnn_benchmark.modeling.box_coder import 
BoxCoder +from maskrcnn_benchmark.modeling.utils import cat +from maskrcnn_benchmark.structures.bounding_box import BoxList +from maskrcnn_benchmark.structures.boxlist_ops import cat_boxlist +from maskrcnn_benchmark.structures.boxlist_ops import boxlist_nms +from maskrcnn_benchmark.structures.boxlist_ops import remove_small_boxes + + +class FCOSPostProcessor(torch.nn.Module): + """ + Performs post-processing on the outputs of the RetinaNet boxes. + This is only used in the testing. + """ + def __init__( + self, + pre_nms_thresh, + pre_nms_top_n, + nms_thresh, + fpn_post_nms_top_n, + min_size, + num_classes, + ): + """ + Arguments: + pre_nms_thresh (float) + pre_nms_top_n (int) + nms_thresh (float) + fpn_post_nms_top_n (int) + min_size (int) + num_classes (int) + box_coder (BoxCoder) + """ + super(FCOSPostProcessor, self).__init__() + self.pre_nms_thresh = pre_nms_thresh + self.pre_nms_top_n = pre_nms_top_n + self.nms_thresh = nms_thresh + self.fpn_post_nms_top_n = fpn_post_nms_top_n + self.min_size = min_size + self.num_classes = num_classes + + def forward_for_single_feature_map( + self, locations, box_cls, + box_regression, centerness, + image_sizes): + """ + Arguments: + anchors: list[BoxList] + box_cls: tensor of size N, A * C, H, W + box_regression: tensor of size N, A * 4, H, W + """ + N, C, H, W = box_cls.shape + + # put in the same format as locations + box_cls = box_cls.view(N, C, H, W).permute(0, 2, 3, 1) + box_cls = box_cls.reshape(N, -1, C).sigmoid() + box_regression = box_regression.view(N, 4, H, W).permute(0, 2, 3, 1) + box_regression = box_regression.reshape(N, -1, 4) + centerness = centerness.view(N, 1, H, W).permute(0, 2, 3, 1) + centerness = centerness.reshape(N, -1).sigmoid() + + candidate_inds = box_cls > self.pre_nms_thresh + pre_nms_top_n = candidate_inds.view(N, -1).sum(1) + pre_nms_top_n = pre_nms_top_n.clamp(max=self.pre_nms_top_n) + #print('pre_nms_top_n: ', pre_nms_top_n) + + # multiply the classification scores with centerness scores + box_cls = box_cls * centerness[:, :, None] + + results = [] + for i in range(N): + per_box_cls = box_cls[i] + per_candidate_inds = candidate_inds[i] + per_box_cls = per_box_cls[per_candidate_inds] + + per_candidate_nonzeros = per_candidate_inds.nonzero() + per_box_loc = per_candidate_nonzeros[:, 0] + per_class = per_candidate_nonzeros[:, 1] + 1 + + per_box_regression = box_regression[i] + per_box_regression = per_box_regression[per_box_loc] + per_locations = locations[per_box_loc] + + per_pre_nms_top_n = pre_nms_top_n[i] + + if per_candidate_inds.sum().item() > per_pre_nms_top_n.item(): + per_box_cls, top_k_indices = \ + per_box_cls.topk(per_pre_nms_top_n, sorted=False) + per_class = per_class[top_k_indices] + per_box_regression = per_box_regression[top_k_indices] + per_locations = per_locations[top_k_indices] + + detections = torch.stack([ + per_locations[:, 0] - per_box_regression[:, 0], + per_locations[:, 1] - per_box_regression[:, 1], + per_locations[:, 0] + per_box_regression[:, 2], + per_locations[:, 1] + per_box_regression[:, 3], + ], dim=1) + + h, w = image_sizes[i] + boxlist = BoxList(detections, (int(w), int(h)), mode="xyxy") + boxlist.add_field("labels", per_class) + boxlist.add_field("scores", per_box_cls) + boxlist = boxlist.clip_to_image(remove_empty=False) + boxlist = remove_small_boxes(boxlist, self.min_size) + results.append(boxlist) + + return results + + def forward(self, locations, box_cls, box_regression, centerness, image_sizes): + """ + Arguments: + anchors: list[list[BoxList]] + box_cls: list[tensor] + 
box_regression: list[tensor] + image_sizes: list[(h, w)] + Returns: + boxlists (list[BoxList]): the post-processed anchors, after + applying box decoding and NMS + """ + sampled_boxes = [] + for _, (l, o, b, c) in enumerate(zip(locations, box_cls, box_regression, centerness)): + sampled_boxes.append( + self.forward_for_single_feature_map( + l, o, b, c, image_sizes + ) + ) + + boxlists = list(zip(*sampled_boxes)) + boxlists = [cat_boxlist(boxlist) for boxlist in boxlists] + boxlists = self.select_over_all_levels(boxlists) + + return boxlists + + # TODO very similar to filter_results from PostProcessor + # but filter_results is per image + # TODO Yang: solve this issue in the future. No good solution + # right now. + def select_over_all_levels(self, boxlists): + num_images = len(boxlists) + results = [] + for i in range(num_images): + scores = boxlists[i].get_field("scores") + labels = boxlists[i].get_field("labels") + boxes = boxlists[i].bbox + boxlist = boxlists[i] + result = [] + # skip the background + for j in range(1, self.num_classes): + inds = (labels == j).nonzero().view(-1) + + scores_j = scores[inds] + boxes_j = boxes[inds, :].view(-1, 4) + boxlist_for_class = BoxList(boxes_j, boxlist.size, mode="xyxy") + boxlist_for_class.add_field("scores", scores_j) + boxlist_for_class = boxlist_nms( + boxlist_for_class, self.nms_thresh, + score_field="scores" + ) + num_labels = len(boxlist_for_class) + boxlist_for_class.add_field( + "labels", torch.full((num_labels,), j, + dtype=torch.int64, + device=scores.device) + ) + result.append(boxlist_for_class) + + result = cat_boxlist(result) + number_of_detections = len(result) + + # Limit to max_per_image detections **over all classes** + if number_of_detections > self.fpn_post_nms_top_n > 0: + cls_scores = result.get_field("scores") + image_thresh, _ = torch.kthvalue( + cls_scores.cpu(), + number_of_detections - self.fpn_post_nms_top_n + 1 + ) + keep = cls_scores >= image_thresh.item() + keep = torch.nonzero(keep).squeeze(1) + result = result[keep] + results.append(result) + return results + + +def make_fcos_postprocessor(config, is_train=False): + pre_nms_thresh = config.MODEL.FCOS.INFERENCE_TH + pre_nms_top_n = config.MODEL.FCOS.PRE_NMS_TOP_N + nms_thresh = config.MODEL.FCOS.NMS_TH + fpn_post_nms_top_n = config.TEST.DETECTIONS_PER_IMG + + if is_train: + fpn_post_nms_top_n = config.MODEL.RPN.FPN_POST_NMS_TOP_N_TRAIN + pre_nms_top_n = config.MODEL.RPN.PRE_NMS_TOP_N_TRAIN + pre_nms_thresh = 0.01 + + box_selector = FCOSPostProcessor( + pre_nms_thresh=pre_nms_thresh, + pre_nms_top_n=pre_nms_top_n, + nms_thresh=nms_thresh, + fpn_post_nms_top_n=fpn_post_nms_top_n, + min_size=0, + num_classes=config.MODEL.FCOS.NUM_CLASSES + ) + + return box_selector diff --git a/maskrcnn_benchmark/modeling/rpn/fcos/loss.py b/maskrcnn_benchmark/modeling/rpn/fcos/loss.py new file mode 100644 index 0000000000000000000000000000000000000000..3ae915c416d5d4182b01b36738e2ee0485c5f7eb --- /dev/null +++ b/maskrcnn_benchmark/modeling/rpn/fcos/loss.py @@ -0,0 +1,194 @@ +""" +This file contains specific functions for computing losses of FCOS +file +""" + +import torch +from torch.nn import functional as F +from torch import nn + +from ..utils import concat_box_prediction_layers +from maskrcnn_benchmark.layers import IOULoss +from maskrcnn_benchmark.layers import SigmoidFocalLoss +from maskrcnn_benchmark.modeling.matcher import Matcher +from maskrcnn_benchmark.modeling.utils import cat +from maskrcnn_benchmark.structures.boxlist_ops import boxlist_iou +from 
maskrcnn_benchmark.structures.boxlist_ops import cat_boxlist + + +INF = 100000000 + + +class FCOSLossComputation(object): + """ + This class computes the FCOS losses. + """ + + def __init__(self, cfg): + self.cls_loss_func = SigmoidFocalLoss( + cfg.MODEL.FCOS.LOSS_GAMMA, + cfg.MODEL.FCOS.LOSS_ALPHA + ) + # we make use of IOU Loss for bounding boxes regression, + # but we found that L1 in log scale can yield a similar performance + self.box_reg_loss_func = IOULoss() + self.centerness_loss_func = nn.BCEWithLogitsLoss() + # generate sizes of interest + soi = [] + prev_size = -1 + for s in cfg.MODEL.FCOS.SIZES_OF_INTEREST: + soi.append([prev_size, s]) + prev_size = s + soi.append([prev_size, INF]) + self.object_sizes_of_interest = soi + + def prepare_targets(self, points, targets): + object_sizes_of_interest = self.object_sizes_of_interest + expanded_object_sizes_of_interest = [] + for l, points_per_level in enumerate(points): + object_sizes_of_interest_per_level = \ + points_per_level.new_tensor(object_sizes_of_interest[l]) + expanded_object_sizes_of_interest.append( + object_sizes_of_interest_per_level[None].expand(len(points_per_level), -1) + ) + + expanded_object_sizes_of_interest = torch.cat(expanded_object_sizes_of_interest, dim=0) + num_points_per_level = [len(points_per_level) for points_per_level in points] + points_all_level = torch.cat(points, dim=0) + labels, reg_targets = self.compute_targets_for_locations( + points_all_level, targets, expanded_object_sizes_of_interest + ) + + for i in range(len(labels)): + labels[i] = torch.split(labels[i], num_points_per_level, dim=0) + reg_targets[i] = torch.split(reg_targets[i], num_points_per_level, dim=0) + + labels_level_first = [] + reg_targets_level_first = [] + for level in range(len(points)): + labels_level_first.append( + torch.cat([labels_per_im[level] for labels_per_im in labels], dim=0) + ) + reg_targets_level_first.append( + torch.cat([reg_targets_per_im[level] for reg_targets_per_im in reg_targets], dim=0) + ) + + return labels_level_first, reg_targets_level_first + + def compute_targets_for_locations(self, locations, targets, object_sizes_of_interest): + labels = [] + reg_targets = [] + xs, ys = locations[:, 0], locations[:, 1] + + for im_i in range(len(targets)): + targets_per_im = targets[im_i] + assert targets_per_im.mode == "xyxy" + bboxes = targets_per_im.bbox + labels_per_im = targets_per_im.get_field("labels") + area = targets_per_im.area() + + l = xs[:, None] - bboxes[:, 0][None] + t = ys[:, None] - bboxes[:, 1][None] + r = bboxes[:, 2][None] - xs[:, None] + b = bboxes[:, 3][None] - ys[:, None] + reg_targets_per_im = torch.stack([l, t, r, b], dim=2) + + is_in_boxes = reg_targets_per_im.min(dim=2)[0] > 0 + + max_reg_targets_per_im = reg_targets_per_im.max(dim=2)[0] + # limit the regression range for each location + is_cared_in_the_level = \ + (max_reg_targets_per_im >= object_sizes_of_interest[:, [0]]) & \ + (max_reg_targets_per_im <= object_sizes_of_interest[:, [1]]) + + locations_to_gt_area = area[None].repeat(len(locations), 1) + locations_to_gt_area[is_in_boxes == 0] = INF + locations_to_gt_area[is_cared_in_the_level == 0] = INF + + # if there are still more than one objects for a location, + # we choose the one with minimal area + locations_to_min_aera, locations_to_gt_inds = locations_to_gt_area.min(dim=1) + + reg_targets_per_im = reg_targets_per_im[range(len(locations)), locations_to_gt_inds] + labels_per_im = labels_per_im[locations_to_gt_inds] + labels_per_im[locations_to_min_aera == INF] = 0 + + 
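+        # Locations whose best match stayed at INF were either outside every ground-truth
+        # box or outside the size-of-interest range of their FPN level, so the assignment
+        # above marks them as background (label 0) before the per-image lists are collected.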
labels.append(labels_per_im) + reg_targets.append(reg_targets_per_im) + + return labels, reg_targets + + def compute_centerness_targets(self, reg_targets): + left_right = reg_targets[:, [0, 2]] + top_bottom = reg_targets[:, [1, 3]] + centerness = (left_right.min(dim=-1)[0] / left_right.max(dim=-1)[0]) * \ + (top_bottom.min(dim=-1)[0] / top_bottom.max(dim=-1)[0]) + return torch.sqrt(centerness) + + def __call__(self, locations, box_cls, box_regression, centerness, targets): + """ + Arguments: + locations (list[BoxList]) + box_cls (list[Tensor]) + box_regression (list[Tensor]) + centerness (list[Tensor]) + targets (list[BoxList]) + + Returns: + cls_loss (Tensor) + reg_loss (Tensor) + centerness_loss (Tensor) + """ + N = box_cls[0].size(0) + num_classes = box_cls[0].size(1) + labels, reg_targets = self.prepare_targets(locations, targets) + + box_cls_flatten = [] + box_regression_flatten = [] + centerness_flatten = [] + labels_flatten = [] + reg_targets_flatten = [] + for l in range(len(labels)): + box_cls_flatten.append(box_cls[l].permute(0, 2, 3, 1).reshape(-1, num_classes)) + box_regression_flatten.append(box_regression[l].permute(0, 2, 3, 1).reshape(-1, 4)) + labels_flatten.append(labels[l].reshape(-1)) + reg_targets_flatten.append(reg_targets[l].reshape(-1, 4)) + centerness_flatten.append(centerness[l].reshape(-1)) + + box_cls_flatten = torch.cat(box_cls_flatten, dim=0) + box_regression_flatten = torch.cat(box_regression_flatten, dim=0) + centerness_flatten = torch.cat(centerness_flatten, dim=0) + labels_flatten = torch.cat(labels_flatten, dim=0) + reg_targets_flatten = torch.cat(reg_targets_flatten, dim=0) + + pos_inds = torch.nonzero(labels_flatten > 0).squeeze(1) + cls_loss = self.cls_loss_func( + box_cls_flatten, + labels_flatten.int() + ) / (pos_inds.numel() + N) # add N to avoid dividing by a zero + + box_regression_flatten = box_regression_flatten[pos_inds] + reg_targets_flatten = reg_targets_flatten[pos_inds] + centerness_flatten = centerness_flatten[pos_inds] + + if pos_inds.numel() > 0: + centerness_targets = self.compute_centerness_targets(reg_targets_flatten) + reg_loss = self.box_reg_loss_func( + box_regression_flatten, + reg_targets_flatten, + centerness_targets + ) + centerness_loss = self.centerness_loss_func( + centerness_flatten, + centerness_targets + ) + else: + reg_loss = box_regression_flatten.sum() + centerness_loss = centerness_flatten.sum() + + return cls_loss, reg_loss, centerness_loss + + +def make_fcos_loss_evaluator(cfg): + loss_evaluator = FCOSLossComputation(cfg) + return loss_evaluator diff --git a/maskrcnn_benchmark/modeling/rpn/inference.py b/maskrcnn_benchmark/modeling/rpn/inference.py new file mode 100644 index 0000000000000000000000000000000000000000..9a2e3871e42fac9fcef3db00da626ec0386d68b2 --- /dev/null +++ b/maskrcnn_benchmark/modeling/rpn/inference.py @@ -0,0 +1,199 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 
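As a quick, self-contained check of the centerness target defined in compute_centerness_targets above (the square root of min(l,r)/max(l,r) * min(t,b)/max(t,b)), the snippet below evaluates the same formula on two made-up (l, t, r, b) regression targets; the numbers are illustrative only and are not taken from the repository.

import torch

def centerness_targets(reg_targets):
    # reg_targets: (N, 4) distances (l, t, r, b) from each location to the box sides
    left_right = reg_targets[:, [0, 2]]
    top_bottom = reg_targets[:, [1, 3]]
    c = (left_right.min(dim=-1)[0] / left_right.max(dim=-1)[0]) * \
        (top_bottom.min(dim=-1)[0] / top_bottom.max(dim=-1)[0])
    return torch.sqrt(c)

demo = torch.tensor([[50., 20., 50., 20.],    # location at the box centre -> 1.0
                     [ 5., 10., 95., 30.]])   # location near a corner   -> ~0.13
print(centerness_targets(demo))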
+import torch + +from maskrcnn_benchmark.modeling.box_coder import BoxCoder +from maskrcnn_benchmark.structures.bounding_box import BoxList +from maskrcnn_benchmark.structures.boxlist_ops import cat_boxlist +from maskrcnn_benchmark.structures.boxlist_ops import boxlist_nms +from maskrcnn_benchmark.structures.boxlist_ops import remove_small_boxes + +from ..utils import cat +from .utils import permute_and_flatten + +class RPNPostProcessor(torch.nn.Module): + """ + Performs post-processing on the outputs of the RPN boxes, before feeding the + proposals to the heads + """ + + def __init__( + self, + pre_nms_top_n, + post_nms_top_n, + nms_thresh, + min_size, + box_coder=None, + fpn_post_nms_top_n=None, + ): + """ + Arguments: + pre_nms_top_n (int) + post_nms_top_n (int) + nms_thresh (float) + min_size (int) + box_coder (BoxCoder) + fpn_post_nms_top_n (int) + """ + super(RPNPostProcessor, self).__init__() + self.pre_nms_top_n = pre_nms_top_n # 12000 + self.post_nms_top_n = post_nms_top_n # 2000 + self.nms_thresh = nms_thresh # 0.7 + self.min_size = min_size # 0 + + if box_coder is None: + box_coder = BoxCoder(weights=(1.0, 1.0, 1.0, 1.0)) + self.box_coder = box_coder + + if fpn_post_nms_top_n is None: + fpn_post_nms_top_n = post_nms_top_n + self.fpn_post_nms_top_n = fpn_post_nms_top_n # 2000 + + def add_gt_proposals(self, proposals, targets): + """ + Arguments: + proposals: list[BoxList] + targets: list[BoxList] + """ + # Get the device we're operating on + device = proposals[0].bbox.device + + gt_boxes = [target.copy_with_fields([]) for target in targets] + + # later cat of bbox requires all fields to be present for all bbox + # so we need to add a dummy for objectness that's missing + for gt_box in gt_boxes: + gt_box.add_field("objectness", torch.ones(len(gt_box), device=device)) + + proposals = [ + cat_boxlist((proposal, gt_box)) + for proposal, gt_box in zip(proposals, gt_boxes) + ] + + return proposals + + def forward_for_single_feature_map(self, anchors, objectness, box_regression): + """ + Arguments: + anchors: list[BoxList] # [image,number,[n,4]] + objectness: tensor of size N, A, H, W + box_regression: tensor of size N, A * 4, H, W + """ + device = objectness.device + N, A, H, W = objectness.shape + # put in the same format as anchors + objectness = permute_and_flatten(objectness, N, A, 1, H, W).view(N, -1) # N H*W*A*1 + objectness = objectness.sigmoid() + box_regression = permute_and_flatten(box_regression, N, A, 18, H, W) # N H*W*A 4 + num_anchors = A * H * W # 391040 97760 + + pre_nms_top_n = min(self.pre_nms_top_n, num_anchors) #12000 + objectness, topk_idx = objectness.topk(pre_nms_top_n, dim=1, sorted=True) + # objectness = objectness.cpu() + batch_idx = torch.arange(N, device=device)[:, None] + box_regression = box_regression[batch_idx, topk_idx] + image_shapes = [box.size for box in anchors] + concat_anchors = torch.cat([a.bbox for a in anchors], dim=0) + concat_anchors = concat_anchors.reshape(N, -1, 4)[batch_idx, topk_idx] + proposals = self.box_coder.decode_iou( + box_regression.view(-1, 18), concat_anchors.view(-1, 4) + ) + + proposals = proposals.view(N, -1, 4) + + result = [] + for proposal, score, im_shape in zip(proposals, objectness, image_shapes): + boxlist = BoxList(proposal, im_shape, mode="xyxy") + boxlist.add_field("objectness", score) + boxlist = boxlist.clip_to_image(remove_empty=False) + boxlist = remove_small_boxes(boxlist, self.min_size) + boxlist = boxlist_nms( + boxlist, + self.nms_thresh, + max_proposals=self.post_nms_top_n, + score_field="objectness", + ) 
+ result.append(boxlist) + return result + + def forward(self, anchors, objectness, box_regression, targets=None): + """ + Arguments: + anchors: list[list[BoxList]] + objectness: list[tensor] + box_regression: list[tensor] + + Returns: + boxlists (list[BoxList]): the post-processed anchors, after + applying box decoding and NMS + """ + sampled_boxes = [] + num_levels = len(objectness) # classification + anchors = list(zip(*anchors)) # [image,number,[n,4]] + # i =-1 + for a, o, b in zip(anchors, objectness, box_regression): + sampled_boxes.append(self.forward_for_single_feature_map(a, o, b)) + + + boxlists = list(zip(*sampled_boxes)) + boxlists = [cat_boxlist(boxlist) for boxlist in boxlists] + + if num_levels > 1: + boxlists = self.select_over_all_levels(boxlists) + + # append ground-truth bboxes to proposals + if self.training and targets is not None: + boxlists = self.add_gt_proposals(boxlists, targets) + + return boxlists + + def select_over_all_levels(self, boxlists): + num_images = len(boxlists) + # different behavior during training and during testing: + # during training, post_nms_top_n is over *all* the proposals combined, while + # during testing, it is over the proposals for each image + # TODO resolve this difference and make it consistent. It should be per image, + # and not per batch + if self.training: + objectness = torch.cat( + [boxlist.get_field("objectness") for boxlist in boxlists], dim=0 + ) + box_sizes = [len(boxlist) for boxlist in boxlists] + post_nms_top_n = min(self.fpn_post_nms_top_n, len(objectness)) + _, inds_sorted = torch.topk(objectness, post_nms_top_n, dim=0, sorted=True) + inds_mask = torch.zeros_like(objectness, dtype=torch.uint8) + inds_mask[inds_sorted] = 1 + inds_mask = inds_mask.split(box_sizes) + for i in range(num_images): + boxlists[i] = boxlists[i][inds_mask[i]] + else: + for i in range(num_images): + objectness = boxlists[i].get_field("objectness") + post_nms_top_n = min(self.fpn_post_nms_top_n, len(objectness)) + _, inds_sorted = torch.topk( + objectness, post_nms_top_n, dim=0, sorted=True + ) + boxlists[i] = boxlists[i][inds_sorted] + return boxlists + + +def make_rpn_postprocessor(config, rpn_box_coder, is_train): + fpn_post_nms_top_n = config.MODEL.RPN.FPN_POST_NMS_TOP_N_TRAIN # 2000 + if not is_train: + fpn_post_nms_top_n = config.MODEL.RPN.FPN_POST_NMS_TOP_N_TEST + + pre_nms_top_n = config.MODEL.RPN.PRE_NMS_TOP_N_TRAIN # 12000 + post_nms_top_n = config.MODEL.RPN.POST_NMS_TOP_N_TRAIN # 2000 + if not is_train: + pre_nms_top_n = config.MODEL.RPN.PRE_NMS_TOP_N_TEST + post_nms_top_n = config.MODEL.RPN.POST_NMS_TOP_N_TEST + nms_thresh = config.MODEL.RPN.NMS_THRESH # 0.7 + min_size = config.MODEL.RPN.MIN_SIZE # 0 + box_selector = RPNPostProcessor( + pre_nms_top_n=pre_nms_top_n, #12000 + post_nms_top_n=post_nms_top_n, #2000 + nms_thresh=nms_thresh, # 0.7 + min_size=min_size, # 0 + box_coder=rpn_box_coder, + fpn_post_nms_top_n=fpn_post_nms_top_n, #2000 + ) + return box_selector diff --git a/maskrcnn_benchmark/modeling/rpn/loss.py b/maskrcnn_benchmark/modeling/rpn/loss.py new file mode 100644 index 0000000000000000000000000000000000000000..d39d03ce96eeca40d81df45eaa262674c21dbccc --- /dev/null +++ b/maskrcnn_benchmark/modeling/rpn/loss.py @@ -0,0 +1,153 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 
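The RPN post-processor defined in inference.py above is built from the config via make_rpn_postprocessor. The sketch below shows the usual construction and call pattern; it assumes a loaded cfg plus RPN outputs (anchors, objectness, box_regression, targets) shaped as described in forward_for_single_feature_map, and is an illustration rather than repository code.

from maskrcnn_benchmark.modeling.box_coder import BoxCoder
from maskrcnn_benchmark.modeling.rpn.inference import make_rpn_postprocessor

rpn_box_coder = BoxCoder(weights=(1.0, 1.0, 1.0, 1.0))
box_selector_train = make_rpn_postprocessor(cfg, rpn_box_coder, is_train=True)
box_selector_test = make_rpn_postprocessor(cfg, rpn_box_coder, is_train=False)

# Training: ground-truth boxes are appended to the proposals and the
# FPN_POST_NMS_TOP_N_TRAIN cut is applied over the whole batch.
proposals = box_selector_train(anchors, objectness, box_regression, targets)

# Testing: the FPN_POST_NMS_TOP_N_TEST cut is applied per image instead.
proposals = box_selector_test(anchors, objectness, box_regression)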
+""" +This file contains specific functions for computing losses on the RPN +file +""" + +import torch +from torch.nn import functional as F +from maskrcnn_benchmark.config import cfg + +from .utils import concat_box_prediction_layers + +from ..balanced_positive_negative_sampler import BalancedPositiveNegativeSampler +from ..utils import cat + +from maskrcnn_benchmark.layers import smooth_l1_loss +from maskrcnn_benchmark.layers import iou_regress +from maskrcnn_benchmark.modeling.matcher import Matcher +from maskrcnn_benchmark.structures.boxlist_ops import boxlist_iou +from maskrcnn_benchmark.structures.boxlist_ops import cat_boxlist + + +class RPNLossComputation(object): + """ + This class computes the RPN loss. + """ + + def __init__(self, proposal_matcher, fg_bg_sampler, box_coder, + generate_labels_func): + """ + Arguments: + proposal_matcher (Matcher) + fg_bg_sampler (BalancedPositiveNegativeSampler) + box_coder (BoxCoder) + """ + self.proposal_matcher = proposal_matcher + self.fg_bg_sampler = fg_bg_sampler + self.box_coder = box_coder + self.copied_fields = [] + self.generate_labels_func = generate_labels_func + self.discard_cases = ['not_visibility', 'between_thresholds'] + + def match_targets_to_anchors(self, anchor, target, copied_fields=[]): + + match_quality_matrix = boxlist_iou(target, anchor) + matched_idxs = self.proposal_matcher(match_quality_matrix) + # RPN doesn't need any fields from target for creating the labels, so clear them all + target = target.copy_with_fields(copied_fields) + # get the targets corresponding GT for each anchor + # NB: need to clamp the indices because we can have a single + # GT in the image, and matched_idxs can be -2, which goes + # out of bounds + matched_targets = target[matched_idxs.clamp(min=0)] + matched_targets.add_field("matched_idxs", matched_idxs) + return matched_targets + + def prepare_targets(self, anchors, targets): + labels = [] + regression_targets = [] + + for anchors_per_image, targets_per_image in zip(anchors, targets): + matched_targets = self.match_targets_to_anchors( + anchors_per_image, targets_per_image, self.copied_fields + ) + + matched_idxs = matched_targets.get_field("matched_idxs") + labels_per_image = self.generate_labels_func(matched_targets) + labels_per_image = labels_per_image.to(dtype=torch.float32) + + # Background (negative examples) + bg_indices = matched_idxs == Matcher.BELOW_LOW_THRESHOLD + labels_per_image[bg_indices] = 0 + + # discard anchors that go out of the boundaries of the image + if "not_visibility" in self.discard_cases: + labels_per_image[~anchors_per_image.get_field("visibility")] = -1 + + # discard indices that are between thresholds + if "between_thresholds" in self.discard_cases: + inds_to_discard = matched_idxs == Matcher.BETWEEN_THRESHOLDS + labels_per_image[inds_to_discard] = -1 + regression_targets_per_image = matched_targets.bbox + labels.append(labels_per_image) + regression_targets.append(regression_targets_per_image) + + return labels, regression_targets + + + def __call__(self, anchors, objectness, box_regression, targets): + """ + Arguments: + anchors (list[BoxList]) + objectness (list[Tensor]) + box_regression (list[Tensor]) + targets (list[BoxList]) + + Returns: + objectness_loss (Tensor) + box_loss (Tensor + """ + anchors = [cat_boxlist(anchors_per_image) for anchors_per_image in anchors] + + labels, regression_targets = self.prepare_targets(anchors, targets) + sampled_pos_inds, sampled_neg_inds = self.fg_bg_sampler(labels) + sampled_pos_inds = 
torch.nonzero(torch.cat(sampled_pos_inds, dim=0)).squeeze(1) + sampled_neg_inds = torch.nonzero(torch.cat(sampled_neg_inds, dim=0)).squeeze(1) + + sampled_inds = torch.cat([sampled_pos_inds, sampled_neg_inds], dim=0) + objectness, box_regression = \ + concat_box_prediction_layers(objectness, box_regression) + objectness = objectness.squeeze() # [1041820] + labels = torch.cat(labels, dim=0) + regression_targets = torch.cat(regression_targets, dim=0) + + box_loss = iou_regress( + box_regression[sampled_pos_inds], + regression_targets[sampled_pos_inds], + beta=1.0 / 9, + size_average=False, + ) / (sampled_inds.numel()) + + box_loss *= cfg.MODEL.ROI_BOUNDARY_HEAD.Loss_balance + + objectness_loss = F.binary_cross_entropy_with_logits( + objectness[sampled_inds], labels[sampled_inds] + ) + return objectness_loss, box_loss + +# This function should be overwritten in RetinaNet 11 +def generate_rpn_labels(matched_targets): + matched_idxs = matched_targets.get_field("matched_idxs") + labels_per_image = matched_idxs >= 0 + return labels_per_image + + +def make_rpn_loss_evaluator(cfg, box_coder): + matcher = Matcher( + cfg.MODEL.RPN.FG_IOU_THRESHOLD, + cfg.MODEL.RPN.BG_IOU_THRESHOLD, + allow_low_quality_matches=True, + ) + + fg_bg_sampler = BalancedPositiveNegativeSampler( + cfg.MODEL.RPN.BATCH_SIZE_PER_IMAGE, cfg.MODEL.RPN.POSITIVE_FRACTION + ) + + loss_evaluator = RPNLossComputation( + matcher, + fg_bg_sampler, + box_coder, + generate_rpn_labels + ) + return loss_evaluator diff --git a/maskrcnn_benchmark/modeling/rpn/retinanet/__init__.py b/maskrcnn_benchmark/modeling/rpn/retinanet/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/maskrcnn_benchmark/modeling/rpn/retinanet/__pycache__/__init__.cpython-37.pyc b/maskrcnn_benchmark/modeling/rpn/retinanet/__pycache__/__init__.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..cd36d5fbbe62db89fc5c32cc62146d1808c9459a Binary files /dev/null and b/maskrcnn_benchmark/modeling/rpn/retinanet/__pycache__/__init__.cpython-37.pyc differ diff --git a/maskrcnn_benchmark/modeling/rpn/retinanet/__pycache__/inference.cpython-37.pyc b/maskrcnn_benchmark/modeling/rpn/retinanet/__pycache__/inference.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..57e0916dfc6fd334502ba62f3293cfc1338c10b0 Binary files /dev/null and b/maskrcnn_benchmark/modeling/rpn/retinanet/__pycache__/inference.cpython-37.pyc differ diff --git a/maskrcnn_benchmark/modeling/rpn/retinanet/__pycache__/loss.cpython-37.pyc b/maskrcnn_benchmark/modeling/rpn/retinanet/__pycache__/loss.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..4f498585c7958def9ad05cf056265d2e790624aa Binary files /dev/null and b/maskrcnn_benchmark/modeling/rpn/retinanet/__pycache__/loss.cpython-37.pyc differ diff --git a/maskrcnn_benchmark/modeling/rpn/retinanet/__pycache__/retinanet.cpython-37.pyc b/maskrcnn_benchmark/modeling/rpn/retinanet/__pycache__/retinanet.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..559289d998ca6b6e6a6290d7d6933534544b9e26 Binary files /dev/null and b/maskrcnn_benchmark/modeling/rpn/retinanet/__pycache__/retinanet.cpython-37.pyc differ diff --git a/maskrcnn_benchmark/modeling/rpn/retinanet/inference.py b/maskrcnn_benchmark/modeling/rpn/retinanet/inference.py new file mode 100644 index 0000000000000000000000000000000000000000..09c17adfc2565871632cffecd51738a2cbd9acb2 --- /dev/null +++ 
b/maskrcnn_benchmark/modeling/rpn/retinanet/inference.py @@ -0,0 +1,194 @@ +import torch + +from ..inference import RPNPostProcessor +from ..utils import permute_and_flatten + +from maskrcnn_benchmark.modeling.box_coder import BoxCoder +from maskrcnn_benchmark.modeling.utils import cat +from maskrcnn_benchmark.structures.bounding_box import BoxList +from maskrcnn_benchmark.structures.boxlist_ops import cat_boxlist +from maskrcnn_benchmark.structures.boxlist_ops import boxlist_nms +from maskrcnn_benchmark.structures.boxlist_ops import remove_small_boxes + + +class RetinaNetPostProcessor(RPNPostProcessor): + """ + Performs post-processing on the outputs of the RetinaNet boxes. + This is only used in the testing. + """ + def __init__( + self, + pre_nms_thresh, + pre_nms_top_n, + nms_thresh, + fpn_post_nms_top_n, + min_size, + num_classes, + box_coder=None, + ): + """ + Arguments: + pre_nms_thresh (float) + pre_nms_top_n (int) + nms_thresh (float) + fpn_post_nms_top_n (int) + min_size (int) + num_classes (int) + box_coder (BoxCoder) + """ + super(RetinaNetPostProcessor, self).__init__( + pre_nms_thresh, 0, nms_thresh, min_size + ) + self.pre_nms_thresh = pre_nms_thresh + self.pre_nms_top_n = pre_nms_top_n + self.nms_thresh = nms_thresh + self.fpn_post_nms_top_n = fpn_post_nms_top_n + self.min_size = min_size + self.num_classes = num_classes + + if box_coder is None: + box_coder = BoxCoder(weights=(10., 10., 5., 5.)) + self.box_coder = box_coder + + def add_gt_proposals(self, proposals, targets): + """ + This function is not used in RetinaNet + """ + pass + + def forward_for_single_feature_map( + self, anchors, box_cls, box_regression): + """ + Arguments: + anchors: list[BoxList] + box_cls: tensor of size N, A * C, H, W + box_regression: tensor of size N, A * 4, H, W + """ + device = box_cls.device + N, _, H, W = box_cls.shape + A = box_regression.size(1) // 4 + C = box_cls.size(1) // A + + # put in the same format as anchors + box_cls = permute_and_flatten(box_cls, N, A, C, H, W) + box_cls = box_cls.sigmoid() + + box_regression = permute_and_flatten(box_regression, N, A, 4, H, W) + box_regression = box_regression.reshape(N, -1, 4) + + num_anchors = A * H * W + + candidate_inds = box_cls > self.pre_nms_thresh + + pre_nms_top_n = candidate_inds.view(N, -1).sum(1) + pre_nms_top_n = pre_nms_top_n.clamp(max=self.pre_nms_top_n) + + results = [] + for per_box_cls, per_box_regression, per_pre_nms_top_n, \ + per_candidate_inds, per_anchors in zip( + box_cls, + box_regression, + pre_nms_top_n, + candidate_inds, + anchors): + + # Sort and select TopN + # TODO most of this can be made out of the loop for + # all images. + # TODO:Yang: Not easy to do. Because the numbers of detections are + # different in each image. Therefore, this part needs to be done + # per image. 
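+            # Per image: drop scores below pre_nms_thresh, keep the per_pre_nms_top_n best,
+            # then map the surviving indices back to their anchor location and class id
+            # before decoding the corresponding regression outputs into boxes.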
+ per_box_cls = per_box_cls[per_candidate_inds] + + per_box_cls, top_k_indices = \ + per_box_cls.topk(per_pre_nms_top_n, sorted=False) + + per_candidate_nonzeros = \ + per_candidate_inds.nonzero()[top_k_indices, :] + + per_box_loc = per_candidate_nonzeros[:, 0] + per_class = per_candidate_nonzeros[:, 1] + per_class += 1 + + detections = self.box_coder.decode( + per_box_regression[per_box_loc, :].view(-1, 4), + per_anchors.bbox[per_box_loc, :].view(-1, 4) + ) + + boxlist = BoxList(detections, per_anchors.size, mode="xyxy") + boxlist.add_field("labels", per_class) + boxlist.add_field("scores", per_box_cls) + boxlist = boxlist.clip_to_image(remove_empty=False) + boxlist = remove_small_boxes(boxlist, self.min_size) + results.append(boxlist) + + return results + + # TODO very similar to filter_results from PostProcessor + # but filter_results is per image + # TODO Yang: solve this issue in the future. No good solution + # right now. + def select_over_all_levels(self, boxlists): + num_images = len(boxlists) + results = [] + for i in range(num_images): + scores = boxlists[i].get_field("scores") + labels = boxlists[i].get_field("labels") + boxes = boxlists[i].bbox + boxlist = boxlists[i] + result = [] + # skip the background + for j in range(1, self.num_classes): + inds = (labels == j).nonzero().view(-1) + + scores_j = scores[inds] + boxes_j = boxes[inds, :].view(-1, 4) + boxlist_for_class = BoxList(boxes_j, boxlist.size, mode="xyxy") + boxlist_for_class.add_field("scores", scores_j) + boxlist_for_class = boxlist_nms( + boxlist_for_class, self.nms_thresh, + score_field="scores" + ) + num_labels = len(boxlist_for_class) + boxlist_for_class.add_field( + "labels", torch.full((num_labels,), j, + dtype=torch.int64, + device=scores.device) + ) + result.append(boxlist_for_class) + + result = cat_boxlist(result) + number_of_detections = len(result) + + # Limit to max_per_image detections **over all classes** + if number_of_detections > self.fpn_post_nms_top_n > 0: + cls_scores = result.get_field("scores") + image_thresh, _ = torch.kthvalue( + cls_scores.cpu(), + number_of_detections - self.fpn_post_nms_top_n + 1 + ) + keep = cls_scores >= image_thresh.item() + keep = torch.nonzero(keep).squeeze(1) + result = result[keep] + results.append(result) + return results + + +def make_retinanet_postprocessor(config, rpn_box_coder, is_train): + pre_nms_thresh = config.MODEL.RETINANET.INFERENCE_TH + pre_nms_top_n = config.MODEL.RETINANET.PRE_NMS_TOP_N + nms_thresh = config.MODEL.RETINANET.NMS_TH + fpn_post_nms_top_n = config.TEST.DETECTIONS_PER_IMG + min_size = 0 + + box_selector = RetinaNetPostProcessor( + pre_nms_thresh=pre_nms_thresh, + pre_nms_top_n=pre_nms_top_n, + nms_thresh=nms_thresh, + fpn_post_nms_top_n=fpn_post_nms_top_n, + min_size=min_size, + num_classes=config.MODEL.RETINANET.NUM_CLASSES, + box_coder=rpn_box_coder, + ) + + return box_selector diff --git a/maskrcnn_benchmark/modeling/rpn/retinanet/loss.py b/maskrcnn_benchmark/modeling/rpn/retinanet/loss.py new file mode 100644 index 0000000000000000000000000000000000000000..080e2153ba59e90e620f30a5adc5426a1551e4e8 --- /dev/null +++ b/maskrcnn_benchmark/modeling/rpn/retinanet/loss.py @@ -0,0 +1,107 @@ +""" +This file contains specific functions for computing losses on the RetinaNet +file +""" + +import torch +from torch.nn import functional as F + +from ..utils import concat_box_prediction_layers + +from maskrcnn_benchmark.layers import smooth_l1_loss +from maskrcnn_benchmark.layers import SigmoidFocalLoss +from maskrcnn_benchmark.modeling.matcher import 
Matcher +from maskrcnn_benchmark.modeling.utils import cat +from maskrcnn_benchmark.structures.boxlist_ops import boxlist_iou +from maskrcnn_benchmark.structures.boxlist_ops import cat_boxlist +from maskrcnn_benchmark.modeling.rpn.loss import RPNLossComputation + +class RetinaNetLossComputation(RPNLossComputation): + """ + This class computes the RetinaNet loss. + """ + + def __init__(self, proposal_matcher, box_coder, + generate_labels_func, + sigmoid_focal_loss, + bbox_reg_beta=0.11, + regress_norm=1.0): + """ + Arguments: + proposal_matcher (Matcher) + box_coder (BoxCoder) + """ + self.proposal_matcher = proposal_matcher + self.box_coder = box_coder + self.box_cls_loss_func = sigmoid_focal_loss + self.bbox_reg_beta = bbox_reg_beta + self.copied_fields = ['labels'] + self.generate_labels_func = generate_labels_func + self.discard_cases = ['between_thresholds'] + self.regress_norm = regress_norm + + def __call__(self, anchors, box_cls, box_regression, targets): + """ + Arguments: + anchors (list[BoxList]) + box_cls (list[Tensor]) + box_regression (list[Tensor]) + targets (list[BoxList]) + + Returns: + retinanet_cls_loss (Tensor) + retinanet_regression_loss (Tensor + """ + anchors = [cat_boxlist(anchors_per_image) for anchors_per_image in anchors] + labels, regression_targets = self.prepare_targets(anchors, targets) + + N = len(labels) + box_cls, box_regression = \ + concat_box_prediction_layers(box_cls, box_regression) + + labels = torch.cat(labels, dim=0) + regression_targets = torch.cat(regression_targets, dim=0) + pos_inds = torch.nonzero(labels > 0).squeeze(1) + + retinanet_regression_loss = smooth_l1_loss( + box_regression[pos_inds], + regression_targets[pos_inds], + beta=self.bbox_reg_beta, + size_average=False, + ) / (max(1, pos_inds.numel() * self.regress_norm)) + + labels = labels.int() + + retinanet_cls_loss = self.box_cls_loss_func( + box_cls, + labels + ) / (pos_inds.numel() + N) + + return retinanet_cls_loss, retinanet_regression_loss + + +def generate_retinanet_labels(matched_targets): + labels_per_image = matched_targets.get_field("labels") + return labels_per_image + + +def make_retinanet_loss_evaluator(cfg, box_coder): + matcher = Matcher( + cfg.MODEL.RETINANET.FG_IOU_THRESHOLD, + cfg.MODEL.RETINANET.BG_IOU_THRESHOLD, + allow_low_quality_matches=True, + ) + sigmoid_focal_loss = SigmoidFocalLoss( + cfg.MODEL.RETINANET.LOSS_GAMMA, + cfg.MODEL.RETINANET.LOSS_ALPHA + ) + + loss_evaluator = RetinaNetLossComputation( + matcher, + box_coder, + generate_retinanet_labels, + sigmoid_focal_loss, + bbox_reg_beta = cfg.MODEL.RETINANET.BBOX_REG_BETA, + regress_norm = cfg.MODEL.RETINANET.BBOX_REG_WEIGHT, + ) + return loss_evaluator diff --git a/maskrcnn_benchmark/modeling/rpn/retinanet/retinanet.py b/maskrcnn_benchmark/modeling/rpn/retinanet/retinanet.py new file mode 100644 index 0000000000000000000000000000000000000000..1599b29b2e9bbb626b31d652022fbbd034bf5e30 --- /dev/null +++ b/maskrcnn_benchmark/modeling/rpn/retinanet/retinanet.py @@ -0,0 +1,152 @@ +import math +import torch +import torch.nn.functional as F +from torch import nn + +from .inference import make_retinanet_postprocessor +from .loss import make_retinanet_loss_evaluator +from ..anchor_generator import make_anchor_generator_retinanet + +from maskrcnn_benchmark.modeling.box_coder import BoxCoder + + +class RetinaNetHead(torch.nn.Module): + """ + Adds a RetinNet head with classification and regression heads + """ + + def __init__(self, cfg, in_channels): + """ + Arguments: + in_channels (int): number of channels of the 
input feature + num_anchors (int): number of anchors to be predicted + """ + super(RetinaNetHead, self).__init__() + # TODO: Implement the sigmoid version first. + num_classes = cfg.MODEL.RETINANET.NUM_CLASSES - 1 + num_anchors = len(cfg.MODEL.RETINANET.ASPECT_RATIOS) \ + * cfg.MODEL.RETINANET.SCALES_PER_OCTAVE + + cls_tower = [] + bbox_tower = [] + for i in range(cfg.MODEL.RETINANET.NUM_CONVS): + cls_tower.append( + nn.Conv2d( + in_channels, + in_channels, + kernel_size=3, + stride=1, + padding=1 + ) + ) + cls_tower.append(nn.ReLU()) + bbox_tower.append( + nn.Conv2d( + in_channels, + in_channels, + kernel_size=3, + stride=1, + padding=1 + ) + ) + bbox_tower.append(nn.ReLU()) + + self.add_module('cls_tower', nn.Sequential(*cls_tower)) + self.add_module('bbox_tower', nn.Sequential(*bbox_tower)) + self.cls_logits = nn.Conv2d( + in_channels, num_anchors * num_classes, kernel_size=3, stride=1, + padding=1 + ) + self.bbox_pred = nn.Conv2d( + in_channels, num_anchors * 4, kernel_size=3, stride=1, + padding=1 + ) + + # Initialization + for modules in [self.cls_tower, self.bbox_tower, self.cls_logits, + self.bbox_pred]: + for l in modules.modules(): + if isinstance(l, nn.Conv2d): + torch.nn.init.normal_(l.weight, std=0.01) + torch.nn.init.constant_(l.bias, 0) + + + # retinanet_bias_init + prior_prob = cfg.MODEL.RETINANET.PRIOR_PROB + bias_value = -math.log((1 - prior_prob) / prior_prob) + torch.nn.init.constant_(self.cls_logits.bias, bias_value) + + def forward(self, x): + logits = [] + bbox_reg = [] + for feature in x: + logits.append(self.cls_logits(self.cls_tower(feature))) + bbox_reg.append(self.bbox_pred(self.bbox_tower(feature))) + return logits, bbox_reg + + +class RetinaNetModule(torch.nn.Module): + """ + Module for RetinaNet computation. Takes feature maps from the backbone and + RetinaNet outputs and losses. Only Test on FPN now. + """ + + def __init__(self, cfg, in_channels): + super(RetinaNetModule, self).__init__() + + self.cfg = cfg.clone() + + anchor_generator = make_anchor_generator_retinanet(cfg) + head = RetinaNetHead(cfg, in_channels) + box_coder = BoxCoder(weights=(10., 10., 5., 5.)) + + box_selector_test = make_retinanet_postprocessor(cfg, box_coder, is_train=False) + + loss_evaluator = make_retinanet_loss_evaluator(cfg, box_coder) + + self.anchor_generator = anchor_generator + self.head = head + self.box_selector_test = box_selector_test + self.loss_evaluator = loss_evaluator + + def forward(self, images, features, targets=None): + """ + Arguments: + images (ImageList): images for which we want to compute the predictions + features (list[Tensor]): features computed from the images that are + used for computing the predictions. Each tensor in the list + correspond to different feature levels + targets (list[BoxList): ground-truth boxes present in the image (optional) + + Returns: + boxes (list[BoxList]): the predicted boxes from the RPN, one BoxList per + image. + losses (dict[Tensor]): the losses for the model during training. During + testing, it is an empty dict. 
+ """ + box_cls, box_regression = self.head(features) + anchors = self.anchor_generator(images, features) + + if self.training: + return self._forward_train(anchors, box_cls, box_regression, targets) + else: + return self._forward_test(anchors, box_cls, box_regression) + + def _forward_train(self, anchors, box_cls, box_regression, targets): + + loss_box_cls, loss_box_reg = self.loss_evaluator( + anchors, box_cls, box_regression, targets + ) + losses = { + "loss_retina_cls": loss_box_cls, + "loss_retina_reg": loss_box_reg, + } + return anchors, losses + + def _forward_test(self, anchors, box_cls, box_regression): + boxes = self.box_selector_test(anchors, box_cls, box_regression) + return boxes, {} + + +def build_retinanet(cfg, in_channels): + return RetinaNetModule(cfg, in_channels) diff --git a/maskrcnn_benchmark/modeling/rpn/rpn.py b/maskrcnn_benchmark/modeling/rpn/rpn.py new file mode 100644 index 0000000000000000000000000000000000000000..8b027855cf114594c437f7d867a187b496b3bc80 --- /dev/null +++ b/maskrcnn_benchmark/modeling/rpn/rpn.py @@ -0,0 +1,321 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +import torch +import torch.nn.functional as F +from torch import nn +import math +from maskrcnn_benchmark.modeling import registry +from maskrcnn_benchmark.modeling.box_coder import BoxCoder +from maskrcnn_benchmark.modeling.rpn.retinanet.retinanet import build_retinanet +from maskrcnn_benchmark.modeling.rpn.fcos.fcos import build_fcos +from .loss import make_rpn_loss_evaluator +from .anchor_generator import make_anchor_generator +from .inference import make_rpn_postprocessor + + +class RPNHeadConvRegressor(nn.Module): + """ + A simple RPN Head for classification and bbox regression + """ + + def __init__(self, cfg, in_channels, num_anchors): + """ + Arguments: + cfg : config + in_channels (int): number of channels of the input feature + num_anchors (int): number of anchors to be predicted + """ + super(RPNHeadConvRegressor, self).__init__() + self.cls_logits = nn.Conv2d(in_channels, num_anchors, kernel_size=1, stride=1) + self.bbox_pred = nn.Conv2d( + in_channels, num_anchors * 4, kernel_size=1, stride=1 + ) + + for l in [self.cls_logits, self.bbox_pred]: + torch.nn.init.normal_(l.weight, std=0.01) + torch.nn.init.constant_(l.bias, 0) + + def forward(self, x): + assert isinstance(x, (list, tuple)) + logits = [self.cls_logits(y) for y in x] + bbox_reg = [self.bbox_pred(y) for y in x] + + return logits, bbox_reg + + +class RPNHeadFeatureSingleConv(nn.Module): + """ + Adds a simple RPN Head with one conv to extract the feature + """ + + def __init__(self, cfg, in_channels): + """ + Arguments: + cfg : config + in_channels (int): number of channels of the input feature + """ + super(RPNHeadFeatureSingleConv, self).__init__() + self.conv = nn.Conv2d( + in_channels, in_channels, kernel_size=3, stride=1, padding=1 + ) + + for l in [self.conv]: + torch.nn.init.normal_(l.weight, std=0.01) + torch.nn.init.constant_(l.bias, 0) + + self.out_channels = in_channels + + def forward(self, x): + assert isinstance(x, (list, tuple)) + x = [F.relu(self.conv(z)) for z in x] + + return x + + +@registry.RPN_HEADS.register("SingleConvRPNHead_1") +class RPNHead(nn.Module): + """ + Adds a simple RPN Head with classification and regression heads + """ + + def __init__(self, cfg, in_channels, num_anchors): + """ + Arguments: + cfg : config + in_channels (int): number of channels of the input feature + num_anchors (int): number of anchors to be predicted + """ + super(RPNHead, self).__init__() 
+ self.conv = nn.Conv2d( + in_channels, in_channels, kernel_size=3, stride=1, padding=1 + ) + self.cls_logits = nn.Conv2d(in_channels, num_anchors, kernel_size=1, stride=1) + self.bbox_pred_new = nn.Conv2d( + in_channels, num_anchors * 18, kernel_size=1, stride=1 + ) + + for l in [self.conv, self.cls_logits, self.bbox_pred_new]: + torch.nn.init.normal_(l.weight, std=0.01) + torch.nn.init.constant_(l.bias, 0) + + def forward(self, x): + + logits = [] + bbox_reg = [] + for feature in x: + t = F.relu(self.conv(feature)) + logits.append(self.cls_logits(t)) + bbox_reg.append(self.bbox_pred_new(t)) + return logits, bbox_reg + + +class RPNModule(torch.nn.Module): + """ + Module for RPN computation. Takes feature maps from the backbone and RPN + proposals and losses. Works for both FPN and non-FPN. + """ + + def __init__(self, cfg, in_channels): + super(RPNModule, self).__init__() + + self.cfg = cfg.clone() + + anchor_generator = make_anchor_generator(cfg) + + rpn_head = registry.RPN_HEADS[cfg.MODEL.RPN.RPN_HEAD] + head = rpn_head( + cfg, in_channels, anchor_generator.num_anchors_per_location()[0] + ) + + rpn_box_coder = BoxCoder(weights=(1.0, 1.0, 1.0, 1.0)) + + box_selector_train = make_rpn_postprocessor(cfg, rpn_box_coder, is_train=True) + box_selector_test = make_rpn_postprocessor(cfg, rpn_box_coder, is_train=False) + + loss_evaluator = make_rpn_loss_evaluator(cfg, rpn_box_coder) + + self.anchor_generator = anchor_generator + self.head = head + self.box_selector_train = box_selector_train + self.box_selector_test = box_selector_test + self.loss_evaluator = loss_evaluator + + def forward(self, images, features, targets=None, prefix=''): + """ + Arguments: + images (ImageList): images for which we want to compute the predictions + features (list[Tensor]): features computed from the images that are + used for computing the predictions. Each tensor in the list + correspond to different feature levels + targets (list[BoxList): ground-truth boxes present in the image (optional) + + Returns: + boxes (list[BoxList]): the predicted boxes from the RPN, one BoxList per + image. + losses (dict[Tensor]): the losses for the model during training. During + testing, it is an empty dict. + """ + objectness, rpn_box_regression = self.head(features) # len = 5 + anchors = self.anchor_generator(images, features) + + if self.training: + return self._forward_train(anchors, objectness, + rpn_box_regression, targets, prefix) + else: + return self._forward_test(anchors, objectness, rpn_box_regression) + + def _forward_train(self, anchors, objectness, rpn_box_regression, # [image,number,[n,4]] + targets, prefix): + if self.cfg.MODEL.RPN_ONLY: + # When training an RPN-only model, the loss is determined by the + # predicted objectness and rpn_box_regression values and there is + # no need to transform the anchors into predicted boxes; this is an + # optimization that avoids the unnecessary transformation. + boxes = anchors + else: + # print('\n---end-to-end model---\n') + # For end-to-end models, anchors must be transformed into boxes and + # sampled into a training batch. 
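For orientation: unlike a stock RPN head, bbox_pred_new above emits num_anchors * 18 channels per location rather than num_anchors * 4. Judging from decode_iou further down in this file, the 18 values per anchor appear to be laid out as follows (a reading of the code, not an official spec):

# rel_codes[0:16]  -> (x_1, y_1, ..., x_8, y_8): offsets of eight boundary points,
#                     x offsets scaled by the anchor width, y offsets by its height
# rel_codes[16:18] -> (dx, dy): offset of the box centre, scaled by half the anchor
#                     width / height
# decode_iou then takes the min/max over the eight points (together with the shifted
# centre) to recover an axis-aligned proposal box.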
+ with torch.no_grad(): + boxes = self.box_selector_train( + anchors, objectness, rpn_box_regression, targets + ) + anchors_new = list(zip(*anchors)) + regress_new = regress_to_box(anchors_new, rpn_box_regression) + + loss_objectness, loss_rpn_box_reg = self.loss_evaluator( + anchors, objectness, regress_new, targets + ) + losses = { + prefix + "loss_objectness": loss_objectness, + prefix + "loss_rpn_box_reg": loss_rpn_box_reg, + } + return boxes, losses + + def _forward_test(self, anchors, objectness, rpn_box_regression): + boxes = self.box_selector_test(anchors, objectness, rpn_box_regression) + if self.cfg.MODEL.RPN_ONLY: + # For end-to-end models, the RPN proposals are an intermediate state + # and don't bother to sort them in decreasing score order. For RPN-only + # models, the proposals are the final output and we return them in + # high-to-low confidence order. + inds = [ + box.get_field("objectness").sort(descending=True)[1] for box in boxes + ] + boxes = [box[ind] for box, ind in zip(boxes, inds)] + return boxes, {} + + +def build_rpn(cfg, in_channels): + """ + This gives the gist of it. Not super important because it doesn't change as much + """ + if cfg.MODEL.FCOS_ON: + return build_fcos(cfg, in_channels) + if cfg.MODEL.RETINANET_ON: + return build_retinanet(cfg, in_channels) + + return RPNModule(cfg, in_channels) + + +def regress_to_box(anchor_define,regress_pre): + + boxes_total = [] + num_f = 0 + for a, b in zip(anchor_define, regress_pre): + boxes_total.append(forward_feature_map(a, b)) + num_f += 1 + return boxes_total + +def forward_feature_map(anchors_define, boxes_regression): + N, A, H, W = boxes_regression.shape + + boxes_regression = faltten(boxes_regression, N, A, 18, H, W) # + + # image_shapes = [box.size for box in anchors_define] + concat_anchors = torch.cat([a.bbox for a in anchors_define], dim=0) + concat_anchors = concat_anchors.reshape(N, -1, 4) + proposals = decode_iou(boxes_regression.view(-1, 18), concat_anchors.view(-1, 4)) + box_temp_post = proposals.view(N, -1, 4) + + return box_temp_post + +def faltten(layer, N, A, C, H, W): + layer = layer.view(N, -1, C, H, W) + layer = layer.permute(0, 3, 4, 1, 2) #N H W A C + layer = layer.reshape(N, -1, C) # N H*W*A C + return layer + +def decode_iou( rel_codes, boxes, num_p = 8): + """ + From a set of original boxes and encoded relative box offsets, + get the decoded boxes. + + Arguments: + rel_codes (Tensor): encoded boxes # predict [2, 12000, 4] + boxes (Tensor): reference boxes. 
# anchor [2, 12000, 4] xmin0 ymin1 xmax2 ymax3 + """ + boxes = boxes.to(rel_codes.dtype) + + TO_REMOVE = 1 # TODO remove + widths = boxes[:, 2] - boxes[:, 0] + TO_REMOVE + heights = boxes[:, 3] - boxes[:, 1] + TO_REMOVE + dx = rel_codes[:, 16] + dy = rel_codes[:, 17] + + ctr_x = boxes[:, 0] + 0.5 * widths + ctr_y = boxes[:, 1] + 0.5 * heights + + ctr_x_new = dx * widths * 0.5 + ctr_x + ctr_y_new = dy * heights * 0.5 + ctr_y + # 123 + # 8#4 + # 765 + if num_p == 8: # 8 boundary points + x_1 = boxes[:, 0] + widths * rel_codes[:, 0] + y_1 = boxes[:, 1] + heights * rel_codes[:, 1] + x_2 = ctr_x + widths * rel_codes[:, 2] + y_2 = boxes[:, 1] + heights * rel_codes[:, 3] + x_3 = boxes[:, 2] + widths * rel_codes[:, 4] + y_3 = boxes[:, 1] + heights * rel_codes[:, 5] + x_4 = boxes[:, 2] + widths * rel_codes[:, 6] + y_4 = ctr_y + heights * rel_codes[:, 7] + x_5 = boxes[:, 2] + widths * rel_codes[:, 8] + y_5 = boxes[:, 3] + heights * rel_codes[:, 9] + x_6 = ctr_x + widths * rel_codes[:, 10] + y_6 = boxes[:, 3] + heights * rel_codes[:, 11] + x_7 = boxes[:, 0] + widths * rel_codes[:, 12] + y_7 = boxes[:, 3] + heights * rel_codes[:, 13] + x_8 = boxes[:, 0] + widths * rel_codes[:, 14] + y_8 = ctr_y + heights * rel_codes[:, 15] + x_total = torch.stack([x_1, x_2, x_3, x_4, x_5, x_6, x_7, x_8], 0) # [8, N] + y_total = torch.stack([y_1, y_2, y_3, y_4, y_5, y_6, y_7, y_8], 0) + + x_min = torch.min(x_total, 0, keepdim=True) # [1, N] + x_max = torch.max(x_total, 0, keepdim=True) # [1, N] + y_min = torch.min(y_total, 0, keepdim=True) # [1, N] + y_max = torch.max(y_total, 0, keepdim=True) # [1, N] + + N1, N2 = x_min[0].shape + x_min = x_min[0].view([N2]) + x_max = x_max[0].view([N2]) + y_min = y_min[0].view([N2]) + y_max = y_max[0].view([N2]) + + x_min = torch.stack([x_min, ctr_x_new], 0) + x_max = torch.stack([x_max, ctr_x_new], 0) + y_min = torch.stack([y_min, ctr_y_new], 0) + y_max = torch.stack([y_max, ctr_y_new], 0) + + x_min = torch.min(x_min, 0, keepdim=True) # [1, N] + x_max = torch.max(x_max, 0, keepdim=True) # [1, N] + y_min = torch.min(y_min, 0, keepdim=True) # [1, N] + y_max = torch.max(y_max, 0, keepdim=True) # [1, N] + + pred_boxes = torch.zeros_like(boxes) + + pred_boxes[:, 0] = x_min[0][0, :] + pred_boxes[:, 1] = y_min[0][0, :] + pred_boxes[:, 2] = x_max[0][0, :] + pred_boxes[:, 3] = y_max[0][0, :] + + return pred_boxes \ No newline at end of file diff --git a/maskrcnn_benchmark/modeling/rpn/utils.py b/maskrcnn_benchmark/modeling/rpn/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..d29a5a7d97c56bc2ce60af3f562d40e5ed98125b --- /dev/null +++ b/maskrcnn_benchmark/modeling/rpn/utils.py @@ -0,0 +1,41 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +""" +Utility functions minipulating the prediction layers +""" + +from ..utils import cat + +import torch + +def permute_and_flatten(layer, N, A, C, H, W): + layer = layer.view(N, -1, C, H, W) + layer = layer.permute(0, 3, 4, 1, 2) #N H W A C + layer = layer.reshape(N, -1, C) # N H*W*A C + return layer + + +def concat_box_prediction_layers(box_cls, box_regression): + box_cls_flattened = [] + box_regression_flattened = [] + # for each feature level, permute the outputs to make them be in the + # same format as the labels. 
Note that the labels are computed for + # all feature levels concatenated, so we keep the same representation + # for the objectness and the box_regression + for box_cls_per_level, box_regression_per_level in zip( + box_cls, box_regression + ): + N, AxC, H, W = box_cls_per_level.shape + Ax4 = box_regression_per_level.shape[1] + A = 5 + C = AxC // A # 1 + + box_cls_per_level = permute_and_flatten( box_cls_per_level, N, A, C, H, W) + box_cls_flattened.append(box_cls_per_level) + box_regression_flattened.append(box_regression_per_level) + # concatenate on the first dimension (representing the feature levels), to + # take into account the way the labels were generated (with all feature maps + # being concatenated as well) + box_cls = cat(box_cls_flattened, dim=1).reshape(-1, C) + box_regression = cat(box_regression_flattened, dim=1).reshape(-1, 4) + + return box_cls, box_regression diff --git a/maskrcnn_benchmark/modeling/utils.py b/maskrcnn_benchmark/modeling/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..5b1d79a812ab3db034cf817583281c006b11b90a --- /dev/null +++ b/maskrcnn_benchmark/modeling/utils.py @@ -0,0 +1,16 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +""" +Miscellaneous utility functions +""" + +import torch + + +def cat(tensors, dim=0): + """ + Efficient version of torch.cat that avoids a copy if there is only a single element in a list + """ + assert isinstance(tensors, (list, tuple)) + if len(tensors) == 1: + return tensors[0] + return torch.cat(tensors, dim) diff --git a/maskrcnn_benchmark/solver/__init__.py b/maskrcnn_benchmark/solver/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..75f40530cccb6b989d33193de92a6c26a07cf751 --- /dev/null +++ b/maskrcnn_benchmark/solver/__init__.py @@ -0,0 +1,4 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +from .build import make_optimizer +from .build import make_lr_scheduler +from .lr_scheduler import WarmupMultiStepLR diff --git a/maskrcnn_benchmark/solver/__pycache__/__init__.cpython-37.pyc b/maskrcnn_benchmark/solver/__pycache__/__init__.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d989ebb97bd46ebece2fc664b09b180c85b10090 Binary files /dev/null and b/maskrcnn_benchmark/solver/__pycache__/__init__.cpython-37.pyc differ diff --git a/maskrcnn_benchmark/solver/__pycache__/build.cpython-37.pyc b/maskrcnn_benchmark/solver/__pycache__/build.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..dfad300d379e1097ee9470260ff9f55ef465b583 Binary files /dev/null and b/maskrcnn_benchmark/solver/__pycache__/build.cpython-37.pyc differ diff --git a/maskrcnn_benchmark/solver/__pycache__/lr_scheduler.cpython-37.pyc b/maskrcnn_benchmark/solver/__pycache__/lr_scheduler.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..fd084ad3466d7e25a3aac716d11bb65aed11e00f Binary files /dev/null and b/maskrcnn_benchmark/solver/__pycache__/lr_scheduler.cpython-37.pyc differ diff --git a/maskrcnn_benchmark/solver/build.py b/maskrcnn_benchmark/solver/build.py new file mode 100644 index 0000000000000000000000000000000000000000..865a4ec8d1b3d996b0618e3b2b77bd1b44acfa96 --- /dev/null +++ b/maskrcnn_benchmark/solver/build.py @@ -0,0 +1,31 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 
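The solver package that starts here builds the optimizer with one parameter group per tensor, so bias terms can receive their own learning rate (BASE_LR * BIAS_LR_FACTOR) and weight decay (WEIGHT_DECAY_BIAS). A minimal usage sketch, assuming a populated cfg and model and a hypothetical data_loader:

from maskrcnn_benchmark.solver import make_optimizer, make_lr_scheduler

optimizer = make_optimizer(cfg, model)         # per-parameter lr / weight decay
scheduler = make_lr_scheduler(cfg, optimizer)  # WarmupMultiStepLR, defined below

for iteration, batch in enumerate(data_loader):
    ...                                        # forward / backward pass elided
    optimizer.step()
    scheduler.step()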
+import torch + +from .lr_scheduler import WarmupMultiStepLR + + +def make_optimizer(cfg, model): + params = [] + for key, value in model.named_parameters(): + if not value.requires_grad: + continue + lr = cfg.SOLVER.BASE_LR + weight_decay = cfg.SOLVER.WEIGHT_DECAY + if "bias" in key: + lr = cfg.SOLVER.BASE_LR * cfg.SOLVER.BIAS_LR_FACTOR + weight_decay = cfg.SOLVER.WEIGHT_DECAY_BIAS + params += [{"params": [value], "lr": lr, "weight_decay": weight_decay}] + + optimizer = torch.optim.SGD(params, lr, momentum=cfg.SOLVER.MOMENTUM) + return optimizer + + +def make_lr_scheduler(cfg, optimizer): + return WarmupMultiStepLR( + optimizer, + cfg.SOLVER.STEPS, + cfg.SOLVER.GAMMA, + warmup_factor=cfg.SOLVER.WARMUP_FACTOR, + warmup_iters=cfg.SOLVER.WARMUP_ITERS, + warmup_method=cfg.SOLVER.WARMUP_METHOD, + ) diff --git a/maskrcnn_benchmark/solver/lr_scheduler.py b/maskrcnn_benchmark/solver/lr_scheduler.py new file mode 100644 index 0000000000000000000000000000000000000000..d7d45b6c6f98e66a5da5b8b84a50258a517bb1e4 --- /dev/null +++ b/maskrcnn_benchmark/solver/lr_scheduler.py @@ -0,0 +1,52 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +from bisect import bisect_right + +import torch + + +# FIXME ideally this would be achieved with a CombinedLRScheduler, +# separating MultiStepLR with WarmupLR +# but the current LRScheduler design doesn't allow it +class WarmupMultiStepLR(torch.optim.lr_scheduler._LRScheduler): + def __init__( + self, + optimizer, + milestones, + gamma=0.1, + warmup_factor=1.0 / 3, + warmup_iters=500, + warmup_method="linear", + last_epoch=-1, + ): + if not list(milestones) == sorted(milestones): + raise ValueError( + "Milestones should be a list of" " increasing integers. Got {}", + milestones, + ) + + if warmup_method not in ("constant", "linear"): + raise ValueError( + "Only 'constant' or 'linear' warmup_method accepted" + "got {}".format(warmup_method) + ) + self.milestones = milestones + self.gamma = gamma + self.warmup_factor = warmup_factor + self.warmup_iters = warmup_iters + self.warmup_method = warmup_method + super(WarmupMultiStepLR, self).__init__(optimizer, last_epoch) + + def get_lr(self): + warmup_factor = 1 + if self.last_epoch < self.warmup_iters: + if self.warmup_method == "constant": + warmup_factor = self.warmup_factor + elif self.warmup_method == "linear": + alpha = float(self.last_epoch) / self.warmup_iters + warmup_factor = self.warmup_factor * (1 - alpha) + alpha + return [ + base_lr + * warmup_factor + * self.gamma ** bisect_right(self.milestones, self.last_epoch) + for base_lr in self.base_lrs + ] diff --git a/maskrcnn_benchmark/structures/__init__.py b/maskrcnn_benchmark/structures/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/maskrcnn_benchmark/structures/__pycache__/__init__.cpython-37.pyc b/maskrcnn_benchmark/structures/__pycache__/__init__.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c3e9bf87ce7d981f3d0ac69bc1875c9aab3a82a7 Binary files /dev/null and b/maskrcnn_benchmark/structures/__pycache__/__init__.cpython-37.pyc differ diff --git a/maskrcnn_benchmark/structures/__pycache__/bounding_box.cpython-37.pyc b/maskrcnn_benchmark/structures/__pycache__/bounding_box.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..9fd7c3e37ffc1920db7a269caaaf0fed05e1ccb1 Binary files /dev/null and b/maskrcnn_benchmark/structures/__pycache__/bounding_box.cpython-37.pyc differ diff --git 
a/maskrcnn_benchmark/structures/__pycache__/boxlist_ops.cpython-37.pyc b/maskrcnn_benchmark/structures/__pycache__/boxlist_ops.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..20ccdf81f21989509a19735dcae158249fce70df Binary files /dev/null and b/maskrcnn_benchmark/structures/__pycache__/boxlist_ops.cpython-37.pyc differ diff --git a/maskrcnn_benchmark/structures/__pycache__/image_list.cpython-37.pyc b/maskrcnn_benchmark/structures/__pycache__/image_list.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ae2762abe21312d6e2dea08a90f2653734245050 Binary files /dev/null and b/maskrcnn_benchmark/structures/__pycache__/image_list.cpython-37.pyc differ diff --git a/maskrcnn_benchmark/structures/__pycache__/ke.cpython-37.pyc b/maskrcnn_benchmark/structures/__pycache__/ke.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c0901e970575ca48573ada6619d22555f8cbfd7e Binary files /dev/null and b/maskrcnn_benchmark/structures/__pycache__/ke.cpython-37.pyc differ diff --git a/maskrcnn_benchmark/structures/__pycache__/keypoint.cpython-37.pyc b/maskrcnn_benchmark/structures/__pycache__/keypoint.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..8761c854ca0519614cb9b6391718ea5214d8d7e2 Binary files /dev/null and b/maskrcnn_benchmark/structures/__pycache__/keypoint.cpython-37.pyc differ diff --git a/maskrcnn_benchmark/structures/__pycache__/mty.cpython-37.pyc b/maskrcnn_benchmark/structures/__pycache__/mty.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..4f2faaf1273900153149166e8a36f3e0293a5c67 Binary files /dev/null and b/maskrcnn_benchmark/structures/__pycache__/mty.cpython-37.pyc differ diff --git a/maskrcnn_benchmark/structures/__pycache__/segmentation_mask.cpython-37.pyc b/maskrcnn_benchmark/structures/__pycache__/segmentation_mask.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..1cbae8444905986fa9fcc3cc4bb1609ee6eeb6bb Binary files /dev/null and b/maskrcnn_benchmark/structures/__pycache__/segmentation_mask.cpython-37.pyc differ diff --git a/maskrcnn_benchmark/structures/bounding_box.py b/maskrcnn_benchmark/structures/bounding_box.py new file mode 100644 index 0000000000000000000000000000000000000000..5a1ecf746c1c6183d83d0613f0a13686ecb2a04b --- /dev/null +++ b/maskrcnn_benchmark/structures/bounding_box.py @@ -0,0 +1,271 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +import torch + +# transpose +FLIP_LEFT_RIGHT = 0 +FLIP_TOP_BOTTOM = 1 + + +class BoxList(object): + """ + This class represents a set of bounding boxes. + The bounding boxes are represented as a Nx4 Tensor. + In order to uniquely determine the bounding boxes with respect + to an image, we also store the corresponding image dimensions. + They can contain extra information that is specific to each bounding box, such as + labels. 
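A minimal usage sketch of the BoxList container whose methods follow below (coordinates and labels are illustrative):

import torch
from maskrcnn_benchmark.structures.bounding_box import BoxList

boxes = BoxList(torch.tensor([[0., 0., 10., 10.]]), image_size=(32, 32), mode="xyxy")
boxes.add_field("labels", torch.tensor([1]))
boxes_xywh = boxes.convert("xywh")   # widths/heights are pixel-inclusive (TO_REMOVE = 1), so w = h = 11
resized = boxes.resize((64, 64))     # coordinates scale by 2x; extra fields are carried along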
+ """ + + def __init__(self, bbox, image_size, mode="xyxy"): + device = bbox.device if isinstance(bbox, torch.Tensor) else torch.device("cpu") + bbox = torch.as_tensor(bbox, dtype=torch.float32, device=device) + if bbox.ndimension() != 2: + raise ValueError( + "bbox should have 2 dimensions, got {}".format(bbox.ndimension()) + ) + if bbox.size(-1) != 4: + raise ValueError( + "last dimension of bbox should have a " + "size of 4, got {}".format(bbox.size(-1)) + ) + if mode not in ("xyxy", "xywh"): + raise ValueError("mode should be 'xyxy' or 'xywh'") + + self.bbox = bbox + self.size = image_size # (image_width, image_height) + self.mode = mode + self.extra_fields = {} + + def add_field(self, field, field_data): + self.extra_fields[field] = field_data + + def get_field(self, field): + return self.extra_fields[field] + + def has_field(self, field): + return field in self.extra_fields + + def fields(self): + return list(self.extra_fields.keys()) + + def _copy_extra_fields(self, bbox): + for k, v in bbox.extra_fields.items(): + self.extra_fields[k] = v + + def convert(self, mode): + if mode not in ("xyxy", "xywh"): + raise ValueError("mode should be 'xyxy' or 'xywh'") + if mode == self.mode: + return self + # we only have two modes, so don't need to check + # self.mode + xmin, ymin, xmax, ymax = self._split_into_xyxy() + if mode == "xyxy": + bbox = torch.cat((xmin, ymin, xmax, ymax), dim=-1) + bbox = BoxList(bbox, self.size, mode=mode) + else: + TO_REMOVE = 1 + bbox = torch.cat( + (xmin, ymin, xmax - xmin + TO_REMOVE, ymax - ymin + TO_REMOVE), dim=-1 + ) + bbox = BoxList(bbox, self.size, mode=mode) + bbox._copy_extra_fields(self) + return bbox + + def _split_into_xyxy(self): + if self.mode == "xyxy": + xmin, ymin, xmax, ymax = self.bbox.split(1, dim=-1) + return xmin, ymin, xmax, ymax + elif self.mode == "xywh": + TO_REMOVE = 1 + xmin, ymin, w, h = self.bbox.split(1, dim=-1) + return ( + xmin, + ymin, + xmin + (w - TO_REMOVE).clamp(min=0), + ymin + (h - TO_REMOVE).clamp(min=0), + ) + else: + raise RuntimeError("Should not be here") + + def resize(self, size, *args, **kwargs): + """ + Returns a resized copy of this bounding box + + :param size: The requested size in pixels, as a 2-tuple: + (width, height). 
+ """ + + ratios = tuple(float(s) / float(s_orig) for s, s_orig in zip(size, self.size)) + if ratios[0] == ratios[1]: + ratio = ratios[0] + scaled_box = self.bbox * ratio + bbox = BoxList(scaled_box, size, mode=self.mode) + # bbox._copy_extra_fields(self) + for k, v in self.extra_fields.items(): + if not isinstance(v, torch.Tensor): + v = v.resize(size, *args, **kwargs) + bbox.add_field(k, v) + return bbox + + ratio_width, ratio_height = ratios + xmin, ymin, xmax, ymax = self._split_into_xyxy() + scaled_xmin = xmin * ratio_width + scaled_xmax = xmax * ratio_width + scaled_ymin = ymin * ratio_height + scaled_ymax = ymax * ratio_height + scaled_box = torch.cat( + (scaled_xmin, scaled_ymin, scaled_xmax, scaled_ymax), dim=-1 + ) + bbox = BoxList(scaled_box, size, mode="xyxy") + # bbox._copy_extra_fields(self) + for k, v in self.extra_fields.items(): + if not isinstance(v, torch.Tensor): + v = v.resize(size, *args, **kwargs) + bbox.add_field(k, v) + + return bbox.convert(self.mode) + + def transpose(self, method): + """ + Transpose bounding box (flip or rotate in 90 degree steps) + :param method: One of :py:attr:`PIL.Image.FLIP_LEFT_RIGHT`, + :py:attr:`PIL.Image.FLIP_TOP_BOTTOM`, :py:attr:`PIL.Image.ROTATE_90`, + :py:attr:`PIL.Image.ROTATE_180`, :py:attr:`PIL.Image.ROTATE_270`, + :py:attr:`PIL.Image.TRANSPOSE` or :py:attr:`PIL.Image.TRANSVERSE`. + """ + if method not in (FLIP_LEFT_RIGHT, FLIP_TOP_BOTTOM): + raise NotImplementedError( + "Only FLIP_LEFT_RIGHT and FLIP_TOP_BOTTOM implemented" + ) + + image_width, image_height = self.size + xmin, ymin, xmax, ymax = self._split_into_xyxy() + if method == FLIP_LEFT_RIGHT: + TO_REMOVE = 1 + transposed_xmin = image_width - xmax - TO_REMOVE + transposed_xmax = image_width - xmin - TO_REMOVE + transposed_ymin = ymin + transposed_ymax = ymax + elif method == FLIP_TOP_BOTTOM: + transposed_xmin = xmin + transposed_xmax = xmax + transposed_ymin = image_height - ymax + transposed_ymax = image_height - ymin + + transposed_boxes = torch.cat( + (transposed_xmin, transposed_ymin, transposed_xmax, transposed_ymax), dim=-1 + ) + bbox = BoxList(transposed_boxes, self.size, mode="xyxy") + # bbox._copy_extra_fields(self) + for k, v in self.extra_fields.items(): + if not isinstance(v, torch.Tensor): + v = v.transpose(method) + bbox.add_field(k, v) + return bbox.convert(self.mode) + + def crop(self, box, remove_empty=False): + """ + Cropss a rectangular region from this bounding box. The box is a + 4-tuple defining the left, upper, right, and lower pixel + coordinate. + """ + xmin, ymin, xmax, ymax = self._split_into_xyxy() + w, h = box[2] - box[0], box[3] - box[1] + cropped_xmin = (xmin - box[0]).clamp(min=0, max=w) + cropped_ymin = (ymin - box[1]).clamp(min=0, max=h) + cropped_xmax = (xmax - box[0]).clamp(min=0, max=w) + cropped_ymax = (ymax - box[1]).clamp(min=0, max=h) + + # TODO should I filter empty boxes here? 
+ if False: + is_empty = (cropped_xmin == cropped_xmax) | (cropped_ymin == cropped_ymax) + + cropped_box = torch.cat( + (cropped_xmin, cropped_ymin, cropped_xmax, cropped_ymax), dim=-1 + ) + bbox = BoxList(cropped_box, (w, h), mode="xyxy") + # bbox._copy_extra_fields(self) + for k, v in self.extra_fields.items(): + if not isinstance(v, torch.Tensor): + v = v.crop(box) + bbox.add_field(k, v) + + if remove_empty: + box = bbox.bbox + keep = (box[:, 3] > box[:, 1]) & (box[:, 2] > box[:, 0]) + bbox = bbox[keep] + return bbox.convert(self.mode) + + # Tensor-like methods + + def to(self, device): + bbox = BoxList(self.bbox.to(device), self.size, self.mode) + for k, v in self.extra_fields.items(): + if hasattr(v, "to"): + v = v.to(device) + bbox.add_field(k, v) + return bbox + + def __getitem__(self, item): + bbox = BoxList(self.bbox[item], self.size, self.mode) + for k, v in self.extra_fields.items(): + bbox.add_field(k, v[item]) + return bbox + + def __len__(self): + return self.bbox.shape[0] + + def clip_to_image(self, remove_empty=True): + TO_REMOVE = 1 + self.bbox[:, 0].clamp_(min=0, max=self.size[0] - TO_REMOVE) + self.bbox[:, 1].clamp_(min=0, max=self.size[1] - TO_REMOVE) + self.bbox[:, 2].clamp_(min=0, max=self.size[0] - TO_REMOVE) + self.bbox[:, 3].clamp_(min=0, max=self.size[1] - TO_REMOVE) + if remove_empty: + box = self.bbox + keep = (box[:, 3] > box[:, 1]) & (box[:, 2] > box[:, 0]) + return self[keep] + return self + + def area(self): + box = self.bbox + if self.mode == "xyxy": + TO_REMOVE = 1 + area = (box[:, 2] - box[:, 0] + TO_REMOVE) * (box[:, 3] - box[:, 1] + TO_REMOVE) + elif self.mode == "xywh": + area = box[:, 2] * box[:, 3] + else: + raise RuntimeError("Should not be here") + + return area + + def copy_with_fields(self, fields, skip_missing=False): + bbox = BoxList(self.bbox, self.size, self.mode) + if not isinstance(fields, (list, tuple)): + fields = [fields] + for field in fields: + if self.has_field(field): + bbox.add_field(field, self.get_field(field)) + elif not skip_missing: + raise KeyError("Field '{}' not found in {}".format(field, self)) + return bbox + + def __repr__(self): + s = self.__class__.__name__ + "(" + s += "num_boxes={}, ".format(len(self)) + s += "image_width={}, ".format(self.size[0]) + s += "image_height={}, ".format(self.size[1]) + s += "mode={})".format(self.mode) + return s + + +if __name__ == "__main__": + bbox = BoxList([[0, 0, 10, 10], [0, 0, 5, 5]], (10, 10)) + s_bbox = bbox.resize((5, 5)) + print(s_bbox) + print(s_bbox.bbox) + + t_bbox = bbox.transpose(0) + print(t_bbox) + print(t_bbox.bbox) diff --git a/maskrcnn_benchmark/structures/boxlist_ops.py b/maskrcnn_benchmark/structures/boxlist_ops.py new file mode 100644 index 0000000000000000000000000000000000000000..dc51212f4ff7abc6d978df75d3de44f956f38f67 --- /dev/null +++ b/maskrcnn_benchmark/structures/boxlist_ops.py @@ -0,0 +1,128 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +import torch + +from .bounding_box import BoxList + +from maskrcnn_benchmark.layers import nms as _box_nms + + +def boxlist_nms(boxlist, nms_thresh, max_proposals=-1, score_field="scores"): + """ + Performs non-maximum suppression on a boxlist, with scores specified + in a boxlist field via score_field. 
+ + Arguments: + boxlist(BoxList) + nms_thresh (float) + max_proposals (int): if > 0, then only the top max_proposals are kept + after non-maximum suppression + score_field (str) + """ + if nms_thresh <= 0: + return boxlist + mode = boxlist.mode + boxlist = boxlist.convert("xyxy") + boxes = boxlist.bbox + score = boxlist.get_field(score_field) + keep = _box_nms(boxes, score, nms_thresh) + if max_proposals > 0: + keep = keep[: max_proposals] + boxlist = boxlist[keep] + return boxlist.convert(mode) + + +def remove_small_boxes(boxlist, min_size): + """ + Only keep boxes with both sides >= min_size + + Arguments: + boxlist (Boxlist) + min_size (int) + """ + # TODO maybe add an API for querying the ws / hs + xywh_boxes = boxlist.convert("xywh").bbox + _, _, ws, hs = xywh_boxes.unbind(dim=1) + keep = ( + (ws >= min_size) & (hs >= min_size) + ).nonzero().squeeze(1) + return boxlist[keep] + + +# implementation from https://github.com/kuangliu/torchcv/blob/master/torchcv/utils/box.py +# with slight modifications +def boxlist_iou(boxlist1, boxlist2): + """Compute the intersection over union of two set of boxes. + The box order must be (xmin, ymin, xmax, ymax). + + Arguments: + box1: (BoxList) bounding boxes, sized [N,4]. + box2: (BoxList) bounding boxes, sized [M,4]. + + Returns: + (tensor) iou, sized [N,M]. + + Reference: + https://github.com/chainer/chainercv/blob/master/chainercv/utils/bbox/bbox_iou.py + """ + if boxlist1.size != boxlist2.size: + raise RuntimeError( + "boxlists should have same image size, got {}, {}".format(boxlist1, boxlist2)) + + N = len(boxlist1) + M = len(boxlist2) + + area1 = boxlist1.area() + area2 = boxlist2.area() + + box1, box2 = boxlist1.bbox, boxlist2.bbox + + lt = torch.max(box1[:, None, :2], box2[:, :2]) # [N,M,2] + rb = torch.min(box1[:, None, 2:], box2[:, 2:]) # [N,M,2] + + TO_REMOVE = 1 + + wh = (rb - lt + TO_REMOVE).clamp(min=0) # [N,M,2] + inter = wh[:, :, 0] * wh[:, :, 1] # [N,M] + + iou = inter / (area1[:, None] + area2 - inter) + return iou + + +# TODO redundant, remove +def _cat(tensors, dim=0): + """ + Efficient version of torch.cat that avoids a copy if there is only a single element in a list + """ + assert isinstance(tensors, (list, tuple)) + if len(tensors) == 1: + return tensors[0] + return torch.cat(tensors, dim) + + +def cat_boxlist(bboxes): + """ + Concatenates a list of BoxList (having the same image size) into a + single BoxList + + Arguments: + bboxes (list[BoxList]) + """ + assert isinstance(bboxes, (list, tuple)) + assert all(isinstance(bbox, BoxList) for bbox in bboxes) + + size = bboxes[0].size + assert all(bbox.size == size for bbox in bboxes) + + mode = bboxes[0].mode + assert all(bbox.mode == mode for bbox in bboxes) + + fields = set(bboxes[0].fields()) + assert all(set(bbox.fields()) == fields for bbox in bboxes) + + cat_boxes = BoxList(_cat([bbox.bbox for bbox in bboxes], dim=0), size, mode) + + for field in fields: + data = _cat([bbox.get_field(field) for bbox in bboxes], dim=0) + cat_boxes.add_field(field, data) + + return cat_boxes diff --git a/maskrcnn_benchmark/structures/image_list.py b/maskrcnn_benchmark/structures/image_list.py new file mode 100644 index 0000000000000000000000000000000000000000..590b87a65a23aa94234022bcc530cb00e1e25b47 --- /dev/null +++ b/maskrcnn_benchmark/structures/image_list.py @@ -0,0 +1,72 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 
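A small usage sketch of the boxlist_ops helpers defined above (box coordinates are illustrative):

import torch
from maskrcnn_benchmark.structures.bounding_box import BoxList
from maskrcnn_benchmark.structures.boxlist_ops import boxlist_iou, cat_boxlist

a = BoxList(torch.tensor([[0., 0., 10., 10.]]), (32, 32), mode="xyxy")
b = BoxList(torch.tensor([[5., 5., 15., 15.]]), (32, 32), mode="xyxy")
iou = boxlist_iou(a, b)        # [1, 1] tensor; both boxlists must share the same image size
merged = cat_boxlist([a, b])   # single BoxList with two boxes (fields must match across inputs)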
+from __future__ import division + +import torch + + +class ImageList(object): + """ + Structure that holds a list of images (of possibly + varying sizes) as a single tensor. + This works by padding the images to the same size, + and storing in a field the original sizes of each image + """ + + def __init__(self, tensors, image_sizes): + """ + Arguments: + tensors (tensor) + image_sizes (list[tuple[int, int]]) + """ + self.tensors = tensors + self.image_sizes = image_sizes + + def to(self, *args, **kwargs): + cast_tensor = self.tensors.to(*args, **kwargs) + return ImageList(cast_tensor, self.image_sizes) + + +def to_image_list(tensors, size_divisible=0): + """ + tensors can be an ImageList, a torch.Tensor or + an iterable of Tensors. It can't be a numpy array. + When tensors is an iterable of Tensors, it pads + the Tensors with zeros so that they have the same + shape + """ + if isinstance(tensors, torch.Tensor) and size_divisible > 0: + tensors = [tensors] + + if isinstance(tensors, ImageList): + return tensors + elif isinstance(tensors, torch.Tensor): + # single tensor shape can be inferred + if tensors.dim() == 3: + tensors = tensors[None] + assert tensors.dim() == 4 + image_sizes = [tensor.shape[-2:] for tensor in tensors] + return ImageList(tensors, image_sizes) + elif isinstance(tensors, (tuple, list)): + max_size = tuple(max(s) for s in zip(*[img.shape for img in tensors])) + + # TODO Ideally, just remove this and let me model handle arbitrary + # input sizs + if size_divisible > 0: + import math + + stride = size_divisible + max_size = list(max_size) + max_size[1] = int(math.ceil(max_size[1] / stride) * stride) + max_size[2] = int(math.ceil(max_size[2] / stride) * stride) + max_size = tuple(max_size) + + batch_shape = (len(tensors),) + max_size + batched_imgs = tensors[0].new(*batch_shape).zero_() + for img, pad_img in zip(tensors, batched_imgs): + pad_img[: img.shape[0], : img.shape[1], : img.shape[2]].copy_(img) + + image_sizes = [im.shape[-2:] for im in tensors] + + return ImageList(batched_imgs, image_sizes) + else: + raise TypeError("Unsupported type for to_image_list: {}".format(type(tensors))) diff --git a/maskrcnn_benchmark/structures/ke.py b/maskrcnn_benchmark/structures/ke.py new file mode 100644 index 0000000000000000000000000000000000000000..41e777e803f0ad206bb7042ddd35395c107a84eb --- /dev/null +++ b/maskrcnn_benchmark/structures/ke.py @@ -0,0 +1,164 @@ +import torch + + +# transpose +FLIP_LEFT_RIGHT = 0 +FLIP_TOP_BOTTOM = 1 + +class KES(object): + def __init__(self, kes, size, mode=None): + # FIXME remove check once we have better integration with device + # in my version this would consistently return a CPU tensor + device = kes.device if isinstance(kes, torch.Tensor) else torch.device('cpu') + kes = torch.as_tensor(kes, dtype=torch.float32, device=device) + if len(kes.size()) == 2: + kes = kes.unsqueeze(2) + if not kes.size()[0] ==0: + assert(kes.size()[-2] == 12), str(kes.size()) # 12kes + + num_kes = kes.shape[0] + kes_x = kes[:, :6, 0] # 4+2=6 + kes_y = kes[:, 6:, 0] + # TODO remove once support or zero in dim is in + if not kes.size()[0] ==0: + assert(kes_x.size() == kes_y.size()), str(kes_x.size())+' '+str(kes_y.size()) + + if num_kes > 0: + kes = kes.view(num_kes, -1, 1) + kes_x = kes_x.view(num_kes, -1, 1) + kes_y = kes_y.view(num_kes, -1, 1) + + # TODO should I split them? 
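# Each instance carries 12 values: the first six are x-entries (meanx, xmin, x2,
# x3, xmax, cx -- see textKES.NAMES below) and the last six are the corresponding
# y-entries; kes_x / kes_y keep the two halves apart for resizing and flipping.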
+ self.kes = kes + self.kes_x = kes_x + self.kes_y = kes_y + + self.size = size + self.mode = mode + + def crop(self, box): + w, h = box[2] - box[0], box[3] - box[1] + k = self.kes.clone() + k[:, :6, 0] -= box[0] + k[:, 6:, 0] -= box[1] + return type(self)(k, (w, h), self.mode) + + def resize(self, size, *args, **kwargs): + ratios = tuple(float(s) / float(s_orig) for s, s_orig in zip(size, self.size)) + ratio_w, ratio_h = ratios + resized_data_x = self.kes_x.clone() + resized_data_x[..., :] *= ratio_w + + resized_data_y = self.kes_y.clone() + resized_data_y[..., :] *= ratio_h + + resized_data = torch.cat((resized_data_x, resized_data_y), dim=-2) + return type(self)(resized_data, size, self.mode) + + def transpose(self, method): + if method not in (FLIP_LEFT_RIGHT,): + raise NotImplementedError( + "Only FLIP_LEFT_RIGHT implemented") + + flip_inds = type(self).FLIP_INDS + flipped_data_x = self.kes_x[:, flip_inds] + width = self.size[0] + TO_REMOVE = 1 + # Flip x coordinates + flipped_data_x[..., :] = width - flipped_data_x[..., :] - TO_REMOVE + + flipped_data_y = self.kes_y.clone() + flipped_data = torch.cat((flipped_data_x, flipped_data_y), dim=-2) + return type(self)(flipped_data, self.size, self.mode) + + def to(self, *args, **kwargs): + return type(self)(self.kes.to(*args, **kwargs), self.size, self.mode) + + def __getitem__(self, item): + return type(self)(self.kes[item], self.size, self.mode) + + def __repr__(self): + s = self.__class__.__name__ + '(' + s += 'num_instances_x={}, '.format(len(self.kes_x)) + s += 'num_instances_y={}, '.format(len(self.kes_y)) + s += 'image_width={}, '.format(self.size[0]) + s += 'image_height={})'.format(self.size[1]) + return s + + +def _create_flip_indices(names, flip_map): + full_flip_map = flip_map.copy() + full_flip_map.update({v: k for k, v in flip_map.items()}) + flipped_names = [i if i not in full_flip_map else full_flip_map[i] for i in names] + flip_indices = [names.index(i) for i in flipped_names] + return torch.tensor(flip_indices) + + +class textKES(KES): + NAMES = [ # x and y + 'meanx', + 'xmin', + 'x2', + 'x3', + 'xmax', + 'cx' + # 'meany', + # 'ymin', + # 'y2', + # 'y3', + # 'ymax', + # 'cy' + ] + FLIP_MAP = { + 'xmin': 'xmax', + 'x2': 'x3', + } + + +# TODO this doesn't look great +textKES.FLIP_INDS = _create_flip_indices(textKES.NAMES, textKES.FLIP_MAP) + + +# TODO make this nicer, this is a direct translation from C2 (but removing the inner loop) +def kes_to_heat_map(kes_x, kes_y, mty, rois, heatmap_size): + if rois.numel() == 0: + return rois.new().long(), rois.new().long() + offset_x = rois[:, 0] + offset_y = rois[:, 1] + scale_x = heatmap_size / (rois[:, 2] - rois[:, 0]) + scale_y = heatmap_size / (rois[:, 3] - rois[:, 1]) + + offset_x = offset_x[:, None] + offset_y = offset_y[:, None] + scale_x = scale_x[:, None] + scale_y = scale_y[:, None] + + x = kes_x[..., 0] + y = kes_y[..., 0] + + x_boundary_inds = x == rois[:, 2][:, None] + y_boundary_inds = y == rois[:, 3][:, None] + + x = (x - offset_x) * scale_x + x = x.floor().long() + y = (y - offset_y) * scale_y + y = y.floor().long() + + x[x_boundary_inds] = heatmap_size - 1 + y[y_boundary_inds] = heatmap_size - 1 + + valid_loc_x = (x >= 0) & (x < heatmap_size) + valid_x = (valid_loc_x).long() + + valid_loc_y = (y >= 0) & (y < heatmap_size) + valid_y = (valid_loc_y).long() + + valid_mty = ((x >= 0) & (x < heatmap_size)) & ((y >= 0) & (y < heatmap_size)) + valid_mty = valid_mty.sum(dim=1)>0 + valid_mty = (valid_mty).long() + + heatmap_x = x + heatmap_y = y + + mty = mty + return 
heatmap_x, heatmap_y, valid_x, valid_y, mty, valid_mty diff --git a/maskrcnn_benchmark/structures/keypoint.py b/maskrcnn_benchmark/structures/keypoint.py new file mode 100644 index 0000000000000000000000000000000000000000..a6881f72f4f757855105638f2f7a9fca81760bb7 --- /dev/null +++ b/maskrcnn_benchmark/structures/keypoint.py @@ -0,0 +1,188 @@ +import torch + + +# transpose +FLIP_LEFT_RIGHT = 0 +FLIP_TOP_BOTTOM = 1 + +class Keypoints(object): + def __init__(self, keypoints, size, mode=None): + # FIXME remove check once we have better integration with device + # in my version this would consistently return a CPU tensor + device = keypoints.device if isinstance(keypoints, torch.Tensor) else torch.device('cpu') + keypoints = torch.as_tensor(keypoints, dtype=torch.float32, device=device) + num_keypoints = keypoints.shape[0] + if num_keypoints: + keypoints = keypoints.view(num_keypoints, -1, 3) + + # TODO should I split them? + # self.visibility = keypoints[..., 2] + self.keypoints = keypoints# [..., :2] + + self.size = size + self.mode = mode + self.extra_fields = {} + + def crop(self, box): + raise NotImplementedError() + + def resize(self, size, *args, **kwargs): + ratios = tuple(float(s) / float(s_orig) for s, s_orig in zip(size, self.size)) + ratio_w, ratio_h = ratios + resized_data = self.keypoints.clone() + resized_data[..., 0] *= ratio_w + resized_data[..., 1] *= ratio_h + keypoints = type(self)(resized_data, size, self.mode) + for k, v in self.extra_fields.items(): + keypoints.add_field(k, v) + return keypoints + + def transpose(self, method): + if method not in (FLIP_LEFT_RIGHT,): + raise NotImplementedError( + "Only FLIP_LEFT_RIGHT implemented") + + flip_inds = type(self).FLIP_INDS + flipped_data = self.keypoints[:, flip_inds] + width = self.size[0] + TO_REMOVE = 1 + # Flip x coordinates + flipped_data[..., 0] = width - flipped_data[..., 0] - TO_REMOVE + + # Maintain COCO convention that if visibility == 0, then x, y = 0 + inds = flipped_data[..., 2] == 0 + flipped_data[inds] = 0 + + keypoints = type(self)(flipped_data, self.size, self.mode) + for k, v in self.extra_fields.items(): + keypoints.add_field(k, v) + return keypoints + + def to(self, *args, **kwargs): + keypoints = type(self)(self.keypoints.to(*args, **kwargs), self.size, self.mode) + for k, v in self.extra_fields.items(): + if hasattr(v, "to"): + v = v.to(*args, **kwargs) + keypoints.add_field(k, v) + return keypoints + + def __getitem__(self, item): + keypoints = type(self)(self.keypoints[item], self.size, self.mode) + for k, v in self.extra_fields.items(): + keypoints.add_field(k, v[item]) + return keypoints + + def add_field(self, field, field_data): + self.extra_fields[field] = field_data + + def get_field(self, field): + return self.extra_fields[field] + + def __repr__(self): + s = self.__class__.__name__ + '(' + s += 'num_instances={}, '.format(len(self.keypoints)) + s += 'image_width={}, '.format(self.size[0]) + s += 'image_height={})'.format(self.size[1]) + return s + + +def _create_flip_indices(names, flip_map): + full_flip_map = flip_map.copy() + full_flip_map.update({v: k for k, v in flip_map.items()}) + flipped_names = [i if i not in full_flip_map else full_flip_map[i] for i in names] + flip_indices = [names.index(i) for i in flipped_names] + return torch.tensor(flip_indices) + + +class PersonKeypoints(Keypoints): + NAMES = [ + 'nose', + 'left_eye', + 'right_eye', + 'left_ear', + 'right_ear', + 'left_shoulder', + 'right_shoulder', + 'left_elbow', + 'right_elbow', + 'left_wrist', + 'right_wrist', + 'left_hip', + 
'right_hip', + 'left_knee', + 'right_knee', + 'left_ankle', + 'right_ankle' + ] + FLIP_MAP = { + 'left_eye': 'right_eye', + 'left_ear': 'right_ear', + 'left_shoulder': 'right_shoulder', + 'left_elbow': 'right_elbow', + 'left_wrist': 'right_wrist', + 'left_hip': 'right_hip', + 'left_knee': 'right_knee', + 'left_ankle': 'right_ankle' + } + + +# TODO this doesn't look great +PersonKeypoints.FLIP_INDS = _create_flip_indices(PersonKeypoints.NAMES, PersonKeypoints.FLIP_MAP) +def kp_connections(keypoints): + kp_lines = [ + [keypoints.index('left_eye'), keypoints.index('right_eye')], + [keypoints.index('left_eye'), keypoints.index('nose')], + [keypoints.index('right_eye'), keypoints.index('nose')], + [keypoints.index('right_eye'), keypoints.index('right_ear')], + [keypoints.index('left_eye'), keypoints.index('left_ear')], + [keypoints.index('right_shoulder'), keypoints.index('right_elbow')], + [keypoints.index('right_elbow'), keypoints.index('right_wrist')], + [keypoints.index('left_shoulder'), keypoints.index('left_elbow')], + [keypoints.index('left_elbow'), keypoints.index('left_wrist')], + [keypoints.index('right_hip'), keypoints.index('right_knee')], + [keypoints.index('right_knee'), keypoints.index('right_ankle')], + [keypoints.index('left_hip'), keypoints.index('left_knee')], + [keypoints.index('left_knee'), keypoints.index('left_ankle')], + [keypoints.index('right_shoulder'), keypoints.index('left_shoulder')], + [keypoints.index('right_hip'), keypoints.index('left_hip')], + ] + return kp_lines +PersonKeypoints.CONNECTIONS = kp_connections(PersonKeypoints.NAMES) + + +# TODO make this nicer, this is a direct translation from C2 (but removing the inner loop) +def keypoints_to_heat_map(keypoints, rois, heatmap_size): + if rois.numel() == 0: + return rois.new().long(), rois.new().long() + offset_x = rois[:, 0] + offset_y = rois[:, 1] + scale_x = heatmap_size / (rois[:, 2] - rois[:, 0]) + scale_y = heatmap_size / (rois[:, 3] - rois[:, 1]) + + offset_x = offset_x[:, None] + offset_y = offset_y[:, None] + scale_x = scale_x[:, None] + scale_y = scale_y[:, None] + + x = keypoints[..., 0] + y = keypoints[..., 1] + + x_boundary_inds = x == rois[:, 2][:, None] + y_boundary_inds = y == rois[:, 3][:, None] + + x = (x - offset_x) * scale_x + x = x.floor().long() + y = (y - offset_y) * scale_y + y = y.floor().long() + + x[x_boundary_inds] = heatmap_size - 1 + y[y_boundary_inds] = heatmap_size - 1 + + valid_loc = (x >= 0) & (y >= 0) & (x < heatmap_size) & (y < heatmap_size) + vis = keypoints[..., 2] > 0 + valid = (valid_loc & vis).long() + + lin_ind = y * heatmap_size + x + heatmaps = lin_ind * valid + + return heatmaps, valid diff --git a/maskrcnn_benchmark/structures/mty.py b/maskrcnn_benchmark/structures/mty.py new file mode 100644 index 0000000000000000000000000000000000000000..f5758683d1958a8b02fe7b8ffff0193d1236169f --- /dev/null +++ b/maskrcnn_benchmark/structures/mty.py @@ -0,0 +1,59 @@ +import torch + +# transpose +FLIP_LEFT_RIGHT = 0 +FLIP_TOP_BOTTOM = 1 + +all_types = [[1,2,3,4],[1,2,4,3],[1,3,2,4],[1,3,4,2],[1,4,2,3],[1,4,3,2],\ + [2,1,3,4],[2,1,4,3],[2,3,1,4],[2,3,4,1],[2,4,1,3],[2,4,3,1],\ + [3,1,2,4],[3,1,4,2],[3,2,1,4],[3,2,4,1],[3,4,1,2],[3,4,2,1],\ + [4,1,2,3],[4,1,3,2],[4,2,1,3],[4,2,3,1],[4,3,1,2],[4,3,2,1]] +aty= [[all_types[iat][0]-1,all_types[iat][1]-1,all_types[iat][2]-1,all_types[iat][3]-1] for iat in range(24)] + +class MTY(object): + def __init__(self, mty, size, mode=None): + # FIXME remove check once we have better integration with device + # in my version this would consistently 
return a CPU tensor + device = mty.device if isinstance(mty, torch.Tensor) else torch.device('cpu') + mty = torch.as_tensor(mty, dtype=torch.int64, device=device) + + # TODO should I split them? + assert(len(mty.size()) == 1), str(mty.size()) + self.mty = mty + + self.size = size + self.mode = mode + + def crop(self, box): + w, h = box[2] - box[0], box[3] - box[1] + return type(self)(self.mty, (w, h), self.mode) + + def resize(self, size, *args, **kwargs): + return type(self)(self.mty, size, self.mode) + + def transpose(self, method): + if method not in (FLIP_LEFT_RIGHT,): + raise NotImplementedError( + "Only FLIP_LEFT_RIGHT implemented") + + flipped_data = self.mty.clone() + for i in range(self.mty.size()[0]): + revs = [it for it in aty[self.mty[i]]] + revs.reverse() + flip_type = aty.index(revs) + flipped_data[i] = flip_type + + return type(self)(flipped_data, self.size, self.mode) + + def to(self, *args, **kwargs): + return type(self)(self.mty.to(*args, **kwargs), self.size, self.mode) + + def __getitem__(self, item): + return type(self)(self.mty[item], self.size, self.mode) + + def __repr__(self): + s = self.__class__.__name__ + '(' + s += 'num_instances={}, '.format(len(self.mty)) + s += 'image_width={}, '.format(self.size[0]) + s += 'image_height={})'.format(self.size[1]) + return s diff --git a/maskrcnn_benchmark/structures/segmentation_mask.py b/maskrcnn_benchmark/structures/segmentation_mask.py new file mode 100644 index 0000000000000000000000000000000000000000..5e1ba07767df487c9b4cccca4a87540a4bce3b99 --- /dev/null +++ b/maskrcnn_benchmark/structures/segmentation_mask.py @@ -0,0 +1,535 @@ +import cv2 +import copy +import torch +import numpy as np +from maskrcnn_benchmark.layers.misc import interpolate + +import pycocotools.mask as mask_utils + +# transpose +FLIP_LEFT_RIGHT = 0 +FLIP_TOP_BOTTOM = 1 + + +""" ABSTRACT +Segmentations come in either: +1) Binary masks +2) Polygons + +Binary masks can be represented in a contiguous array +and operations can be carried out more efficiently, +therefore BinaryMaskList handles them together. + +Polygons are handled separately for each instance, +by PolygonInstance and instances are handled by +PolygonList. + +SegmentationList is supposed to represent both, +therefore it wraps the functions of BinaryMaskList +and PolygonList to make it transparent. +""" + + +class BinaryMaskList(object): + """ + This class handles binary masks for all objects in the image + """ + + def __init__(self, masks, size): + """ + Arguments: + masks: Either torch.tensor of [num_instances, H, W] + or list of torch.tensors of [H, W] with num_instances elems, + or RLE (Run Length Encoding) - interpreted as list of dicts, + or BinaryMaskList. + size: absolute image size, width first + + After initialization, a hard copy will be made, to leave the + initializing source data intact. 
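A minimal usage sketch of the segmentation structures introduced in the module docstring above (SegmentationMask itself is defined at the end of this file; the polygon values are illustrative):

import torch
from maskrcnn_benchmark.structures.segmentation_mask import SegmentationMask, FLIP_LEFT_RIGHT

polys = [[[0., 0., 10., 0., 10., 10., 0., 10.]]]   # one instance made of one 4-point polygon
segm = SegmentationMask(polys, size=(32, 32), mode="poly")
rasterized = segm.convert("mask")                  # wraps a BinaryMaskList internally
flipped = segm.transpose(FLIP_LEFT_RIGHT)
cropped = segm.crop((0, 0, 16, 16))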
+ """ + + if isinstance(masks, torch.Tensor): + # The raw data representation is passed as argument + masks = masks.clone() + elif isinstance(masks, (list, tuple)): + if isinstance(masks[0], torch.Tensor): + masks = torch.stack(masks, dim=2).clone() + elif isinstance(masks[0], dict) and "count" in masks[0]: + # RLE interpretation + + masks = mask_utils + else: + RuntimeError( + "Type of `masks[0]` could not be interpreted: %s" % type(masks) + ) + elif isinstance(masks, BinaryMaskList): + # just hard copy the BinaryMaskList instance's underlying data + masks = masks.masks.clone() + else: + RuntimeError( + "Type of `masks` argument could not be interpreted:%s" % type(masks) + ) + + if len(masks.shape) == 2: + # if only a single instance mask is passed + masks = masks[None] + + assert len(masks.shape) == 3 + assert masks.shape[1] == size[1], "%s != %s" % (masks.shape[1], size[1]) + assert masks.shape[2] == size[0], "%s != %s" % (masks.shape[2], size[0]) + + self.masks = masks + self.size = tuple(size) + + def transpose(self, method): + dim = 1 if method == FLIP_TOP_BOTTOM else 2 + flipped_masks = self.masks.flip(dim) + return BinaryMaskList(flipped_masks, self.size) + + def crop(self, box): + assert isinstance(box, (list, tuple, torch.Tensor)), str(type(box)) + # box is assumed to be xyxy + current_width, current_height = self.size + xmin, ymin, xmax, ymax = [round(float(b)) for b in box] + + assert xmin <= xmax and ymin <= ymax, str(box) + xmin = min(max(xmin, 0), current_width - 1) + ymin = min(max(ymin, 0), current_height - 1) + + xmax = min(max(xmax, 0), current_width) + ymax = min(max(ymax, 0), current_height) + + xmax = max(xmax, xmin + 1) + ymax = max(ymax, ymin + 1) + + width, height = xmax - xmin, ymax - ymin + cropped_masks = self.masks[:, ymin:ymax, xmin:xmax] + cropped_size = width, height + return BinaryMaskList(cropped_masks, cropped_size) + + def resize(self, size): + try: + iter(size) + except TypeError: + assert isinstance(size, (int, float)) + size = size, size + width, height = map(int, size) + + assert width > 0 + assert height > 0 + + # Height comes first here! 
+ resized_masks = torch.nn.functional.interpolate( + input=self.masks[None].float(), + size=(height, width), + mode="bilinear", + align_corners=False, + )[0].type_as(self.masks) + resized_size = width, height + return BinaryMaskList(resized_masks, resized_size) + + def convert_to_polygon(self): + contours = self._findContours() + return PolygonList(contours, self.size) + + def to(self, *args, **kwargs): + return self + + def _findContours(self): + contours = [] + masks = self.masks.detach().numpy() + for mask in masks: + mask = cv2.UMat(mask) + contour, hierarchy = cv2.findContours( + mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_TC89_L1 + ) + + reshaped_contour = [] + for entity in contour: + assert len(entity.shape) == 3 + assert entity.shape[1] == 1, "Hierarchical contours are not allowed" + reshaped_contour.append(entity.reshape(-1).tolist()) + contours.append(reshaped_contour) + return contours + + def __len__(self): + return len(self.masks) + + def __getitem__(self, index): + # Probably it can cause some overhead + # but preserves consistency + masks = self.masks[index].clone() + return BinaryMaskList(masks, self.size) + + def __iter__(self): + return iter(self.masks) + + def __repr__(self): + s = self.__class__.__name__ + "(" + s += "num_instances={}, ".format(len(self.masks)) + s += "image_width={}, ".format(self.size[0]) + s += "image_height={})".format(self.size[1]) + return s + + +class PolygonInstance(object): + """ + This class holds a set of polygons that represents a single instance + of an object mask. The object can be represented as a set of + polygons + """ + + def __init__(self, polygons, size): + """ + Arguments: + a list of lists of numbers. + The first level refers to all the polygons that compose the + object, and the second level to the polygon coordinates. + """ + if isinstance(polygons, (list, tuple)): + valid_polygons = [] + for p in polygons: + p = torch.as_tensor(p, dtype=torch.float32) + if len(p) >= 6: # 3 * 2 coordinates + valid_polygons.append(p) + polygons = valid_polygons + + elif isinstance(polygons, PolygonInstance): + polygons = copy.copy(polygons.polygons) + else: + RuntimeError( + "Type of argument `polygons` is not allowed:%s" % (type(polygons)) + ) + + """ This crashes the training way too many times... 
+ for p in polygons: + assert p[::2].min() >= 0 + assert p[::2].max() < size[0] + assert p[1::2].min() >= 0 + assert p[1::2].max() , size[1] + """ + + self.polygons = polygons + self.size = tuple(size) + + def transpose(self, method): + if method not in (FLIP_LEFT_RIGHT, FLIP_TOP_BOTTOM): + raise NotImplementedError( + "Only FLIP_LEFT_RIGHT and FLIP_TOP_BOTTOM implemented" + ) + + flipped_polygons = [] + width, height = self.size + if method == FLIP_LEFT_RIGHT: + dim = width + idx = 0 + elif method == FLIP_TOP_BOTTOM: + dim = height + idx = 1 + + for poly in self.polygons: + p = poly.clone() + TO_REMOVE = 1 + p[idx::2] = dim - poly[idx::2] - TO_REMOVE + flipped_polygons.append(p) + + return PolygonInstance(flipped_polygons, size=self.size) + + def crop(self, box): + assert isinstance(box, (list, tuple, torch.Tensor)), str(type(box)) + + # box is assumed to be xyxy + current_width, current_height = self.size + xmin, ymin, xmax, ymax = map(float, box) + + assert xmin <= xmax and ymin <= ymax, str(box) + xmin = min(max(xmin, 0), current_width - 1) + ymin = min(max(ymin, 0), current_height - 1) + + xmax = min(max(xmax, 0), current_width) + ymax = min(max(ymax, 0), current_height) + + xmax = max(xmax, xmin + 1) + ymax = max(ymax, ymin + 1) + + w, h = xmax - xmin, ymax - ymin + + cropped_polygons = [] + for poly in self.polygons: + p = poly.clone() + p[0::2] = p[0::2] - xmin # .clamp(min=0, max=w) + p[1::2] = p[1::2] - ymin # .clamp(min=0, max=h) + cropped_polygons.append(p) + + return PolygonInstance(cropped_polygons, size=(w, h)) + + def resize(self, size): + try: + iter(size) + except TypeError: + assert isinstance(size, (int, float)) + size = size, size + + ratios = tuple(float(s) / float(s_orig) for s, s_orig in zip(size, self.size)) + + if ratios[0] == ratios[1]: + ratio = ratios[0] + scaled_polys = [p * ratio for p in self.polygons] + return PolygonInstance(scaled_polys, size) + + ratio_w, ratio_h = ratios + scaled_polygons = [] + for poly in self.polygons: + p = poly.clone() + p[0::2] *= ratio_w + p[1::2] *= ratio_h + scaled_polygons.append(p) + + return PolygonInstance(scaled_polygons, size=size) + + def convert_to_binarymask(self): + width, height = self.size + # formatting for COCO PythonAPI + polygons = [p.numpy() for p in self.polygons] + rles = mask_utils.frPyObjects(polygons, height, width) + rle = mask_utils.merge(rles) + mask = mask_utils.decode(rle) + mask = torch.from_numpy(mask) + return mask + + def __len__(self): + return len(self.polygons) + + def __repr__(self): + s = self.__class__.__name__ + "(" + s += "num_groups={}, ".format(len(self.polygons)) + s += "image_width={}, ".format(self.size[0]) + s += "image_height={}, ".format(self.size[1]) + return s + + +class PolygonList(object): + """ + This class handles PolygonInstances for all objects in the image + """ + + def __init__(self, polygons, size): + """ + Arguments: + polygons: + a list of list of lists of numbers. The first + level of the list correspond to individual instances, + the second level to all the polygons that compose the + object, and the third level to the polygon coordinates. + + OR + + a list of PolygonInstances. 
+ + OR + + a PolygonList + + size: absolute image size + + """ + if isinstance(polygons, (list, tuple)): + if len(polygons) == 0: + polygons = [[[]]] + if isinstance(polygons[0], (list, tuple)): + assert isinstance(polygons[0][0], (list, tuple)), str( + type(polygons[0][0]) + ) + else: + assert isinstance(polygons[0], PolygonInstance), str(type(polygons[0])) + + elif isinstance(polygons, PolygonList): + size = polygons.size + polygons = polygons.polygons + + else: + RuntimeError( + "Type of argument `polygons` is not allowed:%s" % (type(polygons)) + ) + + assert isinstance(size, (list, tuple)), str(type(size)) + + self.polygons = [] + for p in polygons: + p = PolygonInstance(p, size) + if len(p) > 0: + self.polygons.append(p) + + self.size = tuple(size) + + def transpose(self, method): + if method not in (FLIP_LEFT_RIGHT, FLIP_TOP_BOTTOM): + raise NotImplementedError( + "Only FLIP_LEFT_RIGHT and FLIP_TOP_BOTTOM implemented" + ) + + flipped_polygons = [] + for polygon in self.polygons: + flipped_polygons.append(polygon.transpose(method)) + + return PolygonList(flipped_polygons, size=self.size) + + def crop(self, box): + w, h = box[2] - box[0], box[3] - box[1] + cropped_polygons = [] + for polygon in self.polygons: + cropped_polygons.append(polygon.crop(box)) + + cropped_size = w, h + return PolygonList(cropped_polygons, cropped_size) + + def resize(self, size): + resized_polygons = [] + for polygon in self.polygons: + resized_polygons.append(polygon.resize(size)) + + resized_size = size + return PolygonList(resized_polygons, resized_size) + + def to(self, *args, **kwargs): + return self + + def convert_to_binarymask(self): + if len(self) > 0: + masks = torch.stack([p.convert_to_binarymask() for p in self.polygons]) + else: + size = self.size + masks = torch.empty([0, size[1], size[0]], dtype=torch.uint8) + + return BinaryMaskList(masks, size=self.size) + + def __len__(self): + return len(self.polygons) + + def __getitem__(self, item): + if isinstance(item, int): + selected_polygons = [self.polygons[item]] + elif isinstance(item, slice): + selected_polygons = self.polygons[item] + else: + # advanced indexing on a single dimension + selected_polygons = [] + if isinstance(item, torch.Tensor) and item.dtype == torch.uint8: + item = item.nonzero() + item = item.squeeze(1) if item.numel() > 0 else item + item = item.tolist() + for i in item: + selected_polygons.append(self.polygons[i]) + return PolygonList(selected_polygons, size=self.size) + + def __iter__(self): + return iter(self.polygons) + + def __repr__(self): + s = self.__class__.__name__ + "(" + s += "num_instances={}, ".format(len(self.polygons)) + s += "image_width={}, ".format(self.size[0]) + s += "image_height={})".format(self.size[1]) + return s + + +class SegmentationMask(object): + + """ + This class stores the segmentations for all objects in the image. + It wraps BinaryMaskList and PolygonList conveniently. + """ + + def __init__(self, instances, size, mode="poly"): + """ + Arguments: + instances: two types + (1) polygon + (2) binary mask + size: (width, height) + mode: 'poly', 'mask'. 
if mode is 'mask', convert mask of any format to binary mask + """ + + assert isinstance(size, (list, tuple)) + assert len(size) == 2 + if isinstance(size[0], torch.Tensor): + assert isinstance(size[1], torch.Tensor) + size = size[0].item(), size[1].item() + + assert isinstance(size[0], (int, float)) + assert isinstance(size[1], (int, float)) + + if mode == "poly": + self.instances = PolygonList(instances, size) + elif mode == "mask": + self.instances = BinaryMaskList(instances, size) + else: + raise NotImplementedError("Unknown mode: %s" % str(mode)) + + self.mode = mode + self.size = tuple(size) + + def transpose(self, method): + flipped_instances = self.instances.transpose(method) + return SegmentationMask(flipped_instances, self.size, self.mode) + + def crop(self, box): + cropped_instances = self.instances.crop(box) + cropped_size = cropped_instances.size + return SegmentationMask(cropped_instances, cropped_size, self.mode) + + def resize(self, size, *args, **kwargs): + resized_instances = self.instances.resize(size) + resized_size = size + return SegmentationMask(resized_instances, resized_size, self.mode) + + def to(self, *args, **kwargs): + return self + + def convert(self, mode): + if mode == self.mode: + return self + + if mode == "poly": + converted_instances = self.instances.convert_to_polygon() + elif mode == "mask": + converted_instances = self.instances.convert_to_binarymask() + else: + raise NotImplementedError("Unknown mode: %s" % str(mode)) + + return SegmentationMask(converted_instances, self.size, mode) + + def get_mask_tensor(self): + instances = self.instances + if self.mode == "poly": + instances = instances.convert_to_binarymask() + # If there is only 1 instance + return instances.masks.squeeze(0) + + def __len__(self): + return len(self.instances) + + def __getitem__(self, item): + selected_instances = self.instances.__getitem__(item) + return SegmentationMask(selected_instances, self.size, self.mode) + + def __iter__(self): + self.iter_idx = 0 + return self + + def __next__(self): + if self.iter_idx < self.__len__(): + next_segmentation = self.__getitem__(self.iter_idx) + self.iter_idx += 1 + return next_segmentation + raise StopIteration() + + next = __next__ # Python 2 compatibility + + def __repr__(self): + s = self.__class__.__name__ + "(" + s += "num_instances={}, ".format(len(self.instances)) + s += "image_width={}, ".format(self.size[0]) + s += "image_height={}, ".format(self.size[1]) + s += "mode={})".format(self.mode) + return s diff --git a/maskrcnn_benchmark/utils/README.md b/maskrcnn_benchmark/utils/README.md new file mode 100644 index 0000000000000000000000000000000000000000..9765b24a730b77556104187ac3ef5439ab0859fd --- /dev/null +++ b/maskrcnn_benchmark/utils/README.md @@ -0,0 +1,5 @@ +# Utility functions + +This folder contain utility functions that are not used in the +core library, but are useful for building models or training +code using the config system. 
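For orientation, here is a minimal usage sketch of the SegmentationMask / PolygonList / BinaryMaskList structures defined above. This is a hand-written example, not code from the repository; it assumes the file is importable as maskrcnn_benchmark.structures.segmentation_mask (as in upstream maskrcnn-benchmark) and that pycocotools is installed for the polygon-to-mask conversion:

from maskrcnn_benchmark.structures.segmentation_mask import SegmentationMask

# one instance made of a single polygon, given as flat [x1, y1, x2, y2, ...] coordinates
polygons = [[[10, 10, 60, 10, 60, 40, 10, 40]]]

masks = SegmentationMask(polygons, size=(100, 80), mode="poly")  # size is (width, height)
masks = masks.resize((50, 40))      # scales every polygon by the width/height ratios
masks = masks.crop([0, 0, 30, 30])  # xyxy box; coordinates are shifted by (xmin, ymin)

binary = masks.convert("mask")      # PolygonList -> BinaryMaskList via COCO RLE decoding
print(binary.get_mask_tensor().shape)  # torch.Size([30, 30]) for this single instance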
diff --git a/maskrcnn_benchmark/utils/__init__.py b/maskrcnn_benchmark/utils/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/maskrcnn_benchmark/utils/__pycache__/__init__.cpython-37.pyc b/maskrcnn_benchmark/utils/__pycache__/__init__.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..478558ef880084e6a2e3a227739eb8f598b9e6e8 Binary files /dev/null and b/maskrcnn_benchmark/utils/__pycache__/__init__.cpython-37.pyc differ diff --git a/maskrcnn_benchmark/utils/__pycache__/c2_model_loading.cpython-37.pyc b/maskrcnn_benchmark/utils/__pycache__/c2_model_loading.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..80db9f7391ecfe0185dcc70d1dce27efb7b28214 Binary files /dev/null and b/maskrcnn_benchmark/utils/__pycache__/c2_model_loading.cpython-37.pyc differ diff --git a/maskrcnn_benchmark/utils/__pycache__/chars.cpython-37.pyc b/maskrcnn_benchmark/utils/__pycache__/chars.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..5b079ba502c033f66ce8bc3ee26b7804ccb00a47 Binary files /dev/null and b/maskrcnn_benchmark/utils/__pycache__/chars.cpython-37.pyc differ diff --git a/maskrcnn_benchmark/utils/__pycache__/checkpoint.cpython-37.pyc b/maskrcnn_benchmark/utils/__pycache__/checkpoint.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..218c09ad70e4057624442578f4bef3548f0a703a Binary files /dev/null and b/maskrcnn_benchmark/utils/__pycache__/checkpoint.cpython-37.pyc differ diff --git a/maskrcnn_benchmark/utils/__pycache__/collect_env.cpython-37.pyc b/maskrcnn_benchmark/utils/__pycache__/collect_env.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..9dcea6d95dba11c3ee8021015b932da57e73c8b9 Binary files /dev/null and b/maskrcnn_benchmark/utils/__pycache__/collect_env.cpython-37.pyc differ diff --git a/maskrcnn_benchmark/utils/__pycache__/comm.cpython-37.pyc b/maskrcnn_benchmark/utils/__pycache__/comm.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..a379af37b9249d18c5f4ab59c2d669af5f6dd4f9 Binary files /dev/null and b/maskrcnn_benchmark/utils/__pycache__/comm.cpython-37.pyc differ diff --git a/maskrcnn_benchmark/utils/__pycache__/env.cpython-37.pyc b/maskrcnn_benchmark/utils/__pycache__/env.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b6efe3e59d8490caad73ba355b0739695a35485f Binary files /dev/null and b/maskrcnn_benchmark/utils/__pycache__/env.cpython-37.pyc differ diff --git a/maskrcnn_benchmark/utils/__pycache__/imports.cpython-37.pyc b/maskrcnn_benchmark/utils/__pycache__/imports.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..0e660149ec763a317cae0e30aab436585ed72b27 Binary files /dev/null and b/maskrcnn_benchmark/utils/__pycache__/imports.cpython-37.pyc differ diff --git a/maskrcnn_benchmark/utils/__pycache__/logger.cpython-37.pyc b/maskrcnn_benchmark/utils/__pycache__/logger.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..155a0c02ea5abb57c2d0ca70e532f4b0e3221a31 Binary files /dev/null and b/maskrcnn_benchmark/utils/__pycache__/logger.cpython-37.pyc differ diff --git a/maskrcnn_benchmark/utils/__pycache__/miscellaneous.cpython-37.pyc b/maskrcnn_benchmark/utils/__pycache__/miscellaneous.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..7d6b49b716c5b20cba1aca496a1526b811e9428e Binary 
files /dev/null and b/maskrcnn_benchmark/utils/__pycache__/miscellaneous.cpython-37.pyc differ diff --git a/maskrcnn_benchmark/utils/__pycache__/model_serialization.cpython-37.pyc b/maskrcnn_benchmark/utils/__pycache__/model_serialization.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..8df189759e6c901fa822c6f4f1000455765e75ad Binary files /dev/null and b/maskrcnn_benchmark/utils/__pycache__/model_serialization.cpython-37.pyc differ diff --git a/maskrcnn_benchmark/utils/__pycache__/model_zoo.cpython-37.pyc b/maskrcnn_benchmark/utils/__pycache__/model_zoo.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..3211d8a0a435d4cadcf85fc556a4ce4b66eead56 Binary files /dev/null and b/maskrcnn_benchmark/utils/__pycache__/model_zoo.cpython-37.pyc differ diff --git a/maskrcnn_benchmark/utils/__pycache__/registry.cpython-37.pyc b/maskrcnn_benchmark/utils/__pycache__/registry.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c40cfa8baab3c427083602ebb6a9a8dacb17f248 Binary files /dev/null and b/maskrcnn_benchmark/utils/__pycache__/registry.cpython-37.pyc differ diff --git a/maskrcnn_benchmark/utils/__pycache__/timer.cpython-37.pyc b/maskrcnn_benchmark/utils/__pycache__/timer.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..25c633890986b8b6647c0b0bd8a5b6213d62174f Binary files /dev/null and b/maskrcnn_benchmark/utils/__pycache__/timer.cpython-37.pyc differ diff --git a/maskrcnn_benchmark/utils/c2_model_loading.py b/maskrcnn_benchmark/utils/c2_model_loading.py new file mode 100644 index 0000000000000000000000000000000000000000..041d7e0141d52c2b6390d13a437062477b493fd5 --- /dev/null +++ b/maskrcnn_benchmark/utils/c2_model_loading.py @@ -0,0 +1,177 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 
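+# Helpers for loading Detectron/Caffe2 pickled weights (.pkl) and remapping their layer
+# names onto the PyTorch module naming used by this codebase.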
+import logging +import pickle +from collections import OrderedDict + +import torch + +from maskrcnn_benchmark.utils.model_serialization import load_state_dict +from maskrcnn_benchmark.utils.registry import Registry + + +def _rename_basic_resnet_weights(layer_keys): + layer_keys = [k.replace("_", ".") for k in layer_keys] + layer_keys = [k.replace(".w", ".weight") for k in layer_keys] + layer_keys = [k.replace(".bn", "_bn") for k in layer_keys] + layer_keys = [k.replace(".b", ".bias") for k in layer_keys] + layer_keys = [k.replace("_bn.s", "_bn.scale") for k in layer_keys] + layer_keys = [k.replace(".biasranch", ".branch") for k in layer_keys] + layer_keys = [k.replace("bbox.pred", "bbox_pred") for k in layer_keys] + layer_keys = [k.replace("cls.score", "cls_score") for k in layer_keys] + layer_keys = [k.replace("res.conv1_", "conv1_") for k in layer_keys] + + # RPN / Faster RCNN + layer_keys = [k.replace(".biasbox", ".bbox") for k in layer_keys] + layer_keys = [k.replace("conv.rpn", "rpn.conv") for k in layer_keys] + layer_keys = [k.replace("rpn.bbox.pred", "rpn.bbox_pred") for k in layer_keys] + layer_keys = [k.replace("rpn.cls.logits", "rpn.cls_logits") for k in layer_keys] + + # Affine-Channel -> BatchNorm enaming + layer_keys = [k.replace("_bn.scale", "_bn.weight") for k in layer_keys] + + # Make torchvision-compatible + layer_keys = [k.replace("conv1_bn.", "bn1.") for k in layer_keys] + + layer_keys = [k.replace("res2.", "layer1.") for k in layer_keys] + layer_keys = [k.replace("res3.", "layer2.") for k in layer_keys] + layer_keys = [k.replace("res4.", "layer3.") for k in layer_keys] + layer_keys = [k.replace("res5.", "layer4.") for k in layer_keys] + + layer_keys = [k.replace(".branch2a.", ".conv1.") for k in layer_keys] + layer_keys = [k.replace(".branch2a_bn.", ".bn1.") for k in layer_keys] + layer_keys = [k.replace(".branch2b.", ".conv2.") for k in layer_keys] + layer_keys = [k.replace(".branch2b_bn.", ".bn2.") for k in layer_keys] + layer_keys = [k.replace(".branch2c.", ".conv3.") for k in layer_keys] + layer_keys = [k.replace(".branch2c_bn.", ".bn3.") for k in layer_keys] + + layer_keys = [k.replace(".branch1.", ".downsample.0.") for k in layer_keys] + layer_keys = [k.replace(".branch1_bn.", ".downsample.1.") for k in layer_keys] + + # GroupNorm + layer_keys = [k.replace("conv1.gn.s", "bn1.weight") for k in layer_keys] + layer_keys = [k.replace("conv1.gn.bias", "bn1.bias") for k in layer_keys] + layer_keys = [k.replace("conv2.gn.s", "bn2.weight") for k in layer_keys] + layer_keys = [k.replace("conv2.gn.bias", "bn2.bias") for k in layer_keys] + layer_keys = [k.replace("conv3.gn.s", "bn3.weight") for k in layer_keys] + layer_keys = [k.replace("conv3.gn.bias", "bn3.bias") for k in layer_keys] + layer_keys = [k.replace("downsample.0.gn.s", "downsample.1.weight") \ + for k in layer_keys] + layer_keys = [k.replace("downsample.0.gn.bias", "downsample.1.bias") \ + for k in layer_keys] + + return layer_keys + +def _rename_fpn_weights(layer_keys, stage_names): + for mapped_idx, stage_name in enumerate(stage_names, 1): + suffix = "" + if mapped_idx < 4: + suffix = ".lateral" + layer_keys = [ + k.replace("fpn.inner.layer{}.sum{}".format(stage_name, suffix), "fpn_inner{}".format(mapped_idx)) for k in layer_keys + ] + layer_keys = [k.replace("fpn.layer{}.sum".format(stage_name), "fpn_layer{}".format(mapped_idx)) for k in layer_keys] + + + layer_keys = [k.replace("rpn.conv.fpn2", "rpn.conv") for k in layer_keys] + layer_keys = [k.replace("rpn.bbox_pred.fpn2", "rpn.bbox_pred") for k in 
layer_keys] + layer_keys = [ + k.replace("rpn.cls_logits.fpn2", "rpn.cls_logits") for k in layer_keys + ] + + return layer_keys + + +def _rename_weights_for_resnet(weights, stage_names): + original_keys = sorted(weights.keys()) + layer_keys = sorted(weights.keys()) + + # for X-101, rename output to fc1000 to avoid conflicts afterwards + layer_keys = [k if k != "pred_b" else "fc1000_b" for k in layer_keys] + layer_keys = [k if k != "pred_w" else "fc1000_w" for k in layer_keys] + + # performs basic renaming: _ -> . , etc + layer_keys = _rename_basic_resnet_weights(layer_keys) + + # FPN + layer_keys = _rename_fpn_weights(layer_keys, stage_names) + + # Mask R-CNN + layer_keys = [k.replace("mask.fcn.logits", "mask_fcn_logits") for k in layer_keys] + layer_keys = [k.replace(".[mask].fcn", "mask_fcn") for k in layer_keys] + layer_keys = [k.replace("conv5.mask", "conv5_mask") for k in layer_keys] + + # Keypoint R-CNN + layer_keys = [k.replace("kps.score.lowres", "kps_score_lowres") for k in layer_keys] + layer_keys = [k.replace("kps.score", "kps_score") for k in layer_keys] + layer_keys = [k.replace("conv.fcn", "conv_fcn") for k in layer_keys] + + # Rename for our RPN structure + layer_keys = [k.replace("rpn.", "rpn.head.") for k in layer_keys] + + key_map = {k: v for k, v in zip(original_keys, layer_keys)} + + logger = logging.getLogger(__name__) + logger.info("Remapping C2 weights") + max_c2_key_size = max([len(k) for k in original_keys if "_momentum" not in k]) + + new_weights = OrderedDict() + for k in original_keys: + v = weights[k] + if "_momentum" in k: + continue + # if 'fc1000' in k: + # continue + w = torch.from_numpy(v) + # if "bn" in k: + # w = w.view(1, -1, 1, 1) + logger.info("C2 name: {: <{}} mapped name: {}".format(k, max_c2_key_size, key_map[k])) + new_weights[key_map[k]] = w + + return new_weights + + +def _load_c2_pickled_weights(file_path): + with open(file_path, "rb") as f: + if torch._six.PY3: + data = pickle.load(f, encoding="latin1") + else: + data = pickle.load(f) + if "blobs" in data: + weights = data["blobs"] + else: + weights = data + return weights + + +_C2_STAGE_NAMES = { + "R-50": ["1.2", "2.3", "3.5", "4.2"], + "R-101": ["1.2", "2.3", "3.22", "4.2"], + "R-152": ["1.2", "2.7", "3.35", "4.2"], +} + +C2_FORMAT_LOADER = Registry() + + +@C2_FORMAT_LOADER.register("R-50-C4") +@C2_FORMAT_LOADER.register("R-50-C5") +@C2_FORMAT_LOADER.register("R-101-C4") +@C2_FORMAT_LOADER.register("R-101-C5") +@C2_FORMAT_LOADER.register("R-50-FPN") +@C2_FORMAT_LOADER.register("R-50-FPN-RETINANET") +@C2_FORMAT_LOADER.register("R-101-FPN") +@C2_FORMAT_LOADER.register("R-101-PAN") +@C2_FORMAT_LOADER.register("R-101-FPN-RETINANET") +@C2_FORMAT_LOADER.register("R-152-FPN") +@C2_FORMAT_LOADER.register("R-152-PAN") +def load_resnet_c2_format(cfg, f): + state_dict = _load_c2_pickled_weights(f) + conv_body = cfg.MODEL.BACKBONE.CONV_BODY + arch = conv_body.replace("-C4", "").replace("-C5", "").replace("-FPN", "") + arch = arch.replace("-RETINANET", "").replace("-PAN", "") + stages = _C2_STAGE_NAMES[arch] + state_dict = _rename_weights_for_resnet(state_dict, stages) + return dict(model=state_dict) + + +def load_c2_format(cfg, f): + return C2_FORMAT_LOADER[cfg.MODEL.BACKBONE.CONV_BODY](cfg, f) diff --git a/maskrcnn_benchmark/utils/chars.py b/maskrcnn_benchmark/utils/chars.py new file mode 100644 index 0000000000000000000000000000000000000000..71772ab85dec2b42458e25593b611e5f24e465d2 --- /dev/null +++ b/maskrcnn_benchmark/utils/chars.py @@ -0,0 +1,199 @@ +import os + +import cv2 +import numpy as np + + 
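As a quick illustration of the Caffe2-to-PyTorch key remapping implemented in c2_model_loading.py above, a small hand-traced sketch; it calls the private helper directly and is only meant as a sanity check, assuming the package is importable:

from maskrcnn_benchmark.utils.c2_model_loading import _rename_basic_resnet_weights

# Caffe2/Detectron blob names -> torchvision-style ResNet parameter names
print(_rename_basic_resnet_weights(["res2_0_branch2a_w"]))     # ['layer1.0.conv1.weight']
print(_rename_basic_resnet_weights(["res2_0_branch2a_bn_s"]))  # ['layer1.0.bn1.weight']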
+def char2num(char): + if char in "0123456789": + num = ord(char) - ord("0") + 1 + elif char in "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ": + num = ord(char.lower()) - ord("a") + 11 + else: + num = 0 + return num + + +def num2char(num): + chars = "_0123456789abcdefghijklmnopqrstuvwxyz" + char = chars[num] + # if num >=1 and num <=10: + # char = chr(ord('0') + num - 1) + # elif num > 10 and num <= 36: + # char = chr(ord('a') + num - 11) + # else: + # print('error number:%d'%(num)) + # exit() + return char + + +def getstr_grid(seg, box, threshold=192): + pos = 255 - (seg[0] * 255).astype(np.uint8) + mask_index = np.argmax(seg, axis=0) + mask_index = mask_index.astype(np.uint8) + pos = pos.astype(np.uint8) + string, score, rec_scores, char_polygons = seg2text( + pos, mask_index, seg, box, threshold=threshold + ) + return string, score, rec_scores, char_polygons + + +def seg2text(gray, mask, seg, box, threshold=192): + ## input numpy + img_h, img_w = gray.shape + box_w = box[2] - box[0] + box_h = box[3] - box[1] + ratio_h = float(box_h) / img_h + ratio_w = float(box_w) / img_w + # SE1=cv2.getStructuringElement(cv2.MORPH_RECT,(3,3)) + # gray = cv2.erode(gray,SE1) + # gray = cv2.dilate(gray,SE1) + # gray = cv2.morphologyEx(gray,cv2.MORPH_CLOSE,SE1) + ret, thresh = cv2.threshold(gray, threshold, 255, cv2.THRESH_BINARY) + try: + _, contours, _ = cv2.findContours(thresh, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE) + except: + contours, _ = cv2.findContours(thresh, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE) + chars = [] + scores = [] + char_polygons = [] + for i in range(len(contours)): + char = {} + temp = np.zeros((img_h, img_w)).astype(np.uint8) + cv2.drawContours(temp, [contours[i]], 0, (255), -1) + x, y, w, h = cv2.boundingRect(contours[i]) + c_x, c_y = x + w / 2, y + h / 2 + perimeter = cv2.arcLength(contours[i], True) + epsilon = 0.01 * cv2.arcLength(contours[i], True) + approx = cv2.approxPolyDP(contours[i], epsilon, True) + pts = approx.reshape((-1, 2)) + pts[:, 0] = pts[:, 0] * ratio_w + box[0] + pts[:, 1] = pts[:, 1] * ratio_h + box[1] + polygon = list(pts.reshape((-1,))) + polygon = list(map(int, polygon)) + if len(polygon) >= 6: + char_polygons.append(polygon) + # x1 = x * ratio_w + box[0] + # y1 = y * ratio_h + box[1] + # x3 = (x + w) * ratio_w + box[0] + # y3 = (y + h) * ratio_h + box[1] + # polygon = [x1, y1, x3, y1, x3, y3, x1, y3] + regions = seg[1:, temp == 255].reshape((36, -1)) + cs = np.mean(regions, axis=1) + sym = num2char(np.argmax(cs.reshape((-1))) + 1) + char["x"] = c_x + char["y"] = c_y + char["s"] = sym + char["cs"] = cs.reshape((-1, 1)) + scores.append(np.max(char["cs"], axis=0)[0]) + + chars.append(char) + chars = sorted(chars, key=lambda x: x["x"]) + string = "" + css = [] + for char in chars: + string = string + char["s"] + css.append(char["cs"]) + if len(scores) > 0: + score = sum(scores) / len(scores) + else: + score = 0.00 + if not css: + css = [0.0] + return string, score, np.hstack(css), char_polygons + + +# def get_tight_rect(points, start_x, start_y, image_height, image_width, scale): +# points = list(points) +# ps = sorted(points, key=lambda x: x[0]) +# +# if ps[1][1] > ps[0][1]: +# px1 = ps[0][0] * scale + start_x +# py1 = ps[0][1] * scale + start_y +# px4 = ps[1][0] * scale + start_x +# py4 = ps[1][1] * scale + start_y +# else: +# px1 = ps[1][0] * scale + start_x +# py1 = ps[1][1] * scale + start_y +# px4 = ps[0][0] * scale + start_x +# py4 = ps[0][1] * scale + start_y +# if ps[3][1] > ps[2][1]: +# px2 = ps[2][0] * scale + start_x +# py2 = ps[2][1] * 
scale + start_y +# px3 = ps[3][0] * scale + start_x +# py3 = ps[3][1] * scale + start_y +# else: +# px2 = ps[3][0] * scale + start_x +# py2 = ps[3][1] * scale + start_y +# px3 = ps[2][0] * scale + start_x +# py3 = ps[2][1] * scale + start_y +# +# if px1 < 0: +# px1 = 1 +# if px1 > image_width: +# px1 = image_width - 1 +# if px2 < 0: +# px2 = 1 +# if px2 > image_width: +# px2 = image_width - 1 +# if px3 < 0: +# px3 = 1 +# if px3 > image_width: +# px3 = image_width - 1 +# if px4 < 0: +# px4 = 1 +# if px4 > image_width: +# px4 = image_width - 1 +# +# if py1 < 0: +# py1 = 1 +# if py1 > image_height: +# py1 = image_height - 1 +# if py2 < 0: +# py2 = 1 +# if py2 > image_height: +# py2 = image_height - 1 +# if py3 < 0: +# py3 = 1 +# if py3 > image_height: +# py3 = image_height - 1 +# if py4 < 0: +# py4 = 1 +# if py4 > image_height: +# py4 = image_height - 1 +# return [px1, py1, px2, py2, px3, py3, px4, py4] + +def get_tight_rect(points, start_x, start_y, image_height, image_width, scale): + points = list(points) + ps = sorted(points, key=lambda x: x[0]) + + if ps[1][1] > ps[0][1]: + px1 = ps[0][0] * scale + start_x + py1 = ps[0][1] * scale + start_y + px4 = ps[1][0] * scale + start_x + py4 = ps[1][1] * scale + start_y + else: + px1 = ps[1][0] * scale + start_x + py1 = ps[1][1] * scale + start_y + px4 = ps[0][0] * scale + start_x + py4 = ps[0][1] * scale + start_y + if ps[3][1] > ps[2][1]: + px2 = ps[2][0] * scale + start_x + py2 = ps[2][1] * scale + start_y + px3 = ps[3][0] * scale + start_x + py3 = ps[3][1] * scale + start_y + else: + px2 = ps[3][0] * scale + start_x + py2 = ps[3][1] * scale + start_y + px3 = ps[2][0] * scale + start_x + py3 = ps[2][1] * scale + start_y + + px1 = min(max(px1, 1), image_width - 1) + px2 = min(max(px2, 1), image_width - 1) + px3 = min(max(px3, 1), image_width - 1) + px4 = min(max(px4, 1), image_width - 1) + py1 = min(max(py1, 1), image_height - 1) + py2 = min(max(py2, 1), image_height - 1) + py3 = min(max(py3, 1), image_height - 1) + py4 = min(max(py4, 1), image_height - 1) + return [px1, py1, px2, py2, px3, py3, px4, py4] diff --git a/maskrcnn_benchmark/utils/checkpoint.py b/maskrcnn_benchmark/utils/checkpoint.py new file mode 100644 index 0000000000000000000000000000000000000000..fdb2293cd99cb78ce97e58ed3493dddf49716033 --- /dev/null +++ b/maskrcnn_benchmark/utils/checkpoint.py @@ -0,0 +1,141 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 
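+# Checkpointer: saves/loads model, optimizer and scheduler state and tracks the latest checkpoint on disk.
+# DetectronCheckpointer: additionally resolves catalog:// entries, caches http(s) downloads,
+# and converts Caffe2 .pkl weights via load_c2_format.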
+import logging +import os + +import torch + +from maskrcnn_benchmark.utils.model_serialization import load_state_dict +from maskrcnn_benchmark.utils.c2_model_loading import load_c2_format +from maskrcnn_benchmark.utils.imports import import_file +from maskrcnn_benchmark.utils.model_zoo import cache_url + + +class Checkpointer(object): + def __init__( + self, + model, + optimizer=None, + scheduler=None, + save_dir="", + save_to_disk=None, + logger=None, + ): + self.model = model + self.optimizer = optimizer + self.scheduler = scheduler + self.save_dir = save_dir + self.save_to_disk = save_to_disk + if logger is None: + logger = logging.getLogger(__name__) + self.logger = logger + + def save(self, name, **kwargs): + if not self.save_dir: + return + + if not self.save_to_disk: + return + + data = {} + data["model"] = self.model.state_dict() + if self.optimizer is not None: + data["optimizer"] = self.optimizer.state_dict() + if self.scheduler is not None: + data["scheduler"] = self.scheduler.state_dict() + data.update(kwargs) + + save_file = os.path.join(self.save_dir, "{}.pth".format(name)) + self.logger.info("Saving checkpoint to {}".format(save_file)) + torch.save(data, save_file) + self.tag_last_checkpoint(save_file) + + def load(self, f=None): + if self.has_checkpoint(): + # override argument with existing checkpoint + f = self.get_checkpoint_file() + if not f: + # no checkpoint could be found + self.logger.info("No checkpoint found. Initializing model from scratch") + return {} + + self.logger.info("Loading checkpoint from {}".format(f)) + + checkpoint = self._load_file(f) + self._load_model(checkpoint) + if "optimizer" in checkpoint and self.optimizer: + self.logger.info("Loading optimizer from {}".format(f)) + self.optimizer.load_state_dict(checkpoint.pop("optimizer")) + if "scheduler" in checkpoint and self.scheduler: + self.logger.info("Loading scheduler from {}".format(f)) + self.scheduler.load_state_dict(checkpoint.pop("scheduler")) + + # return any further checkpoint data + return checkpoint + + def has_checkpoint(self): + save_file = os.path.join(self.save_dir, "last_checkpoint") + return os.path.exists(save_file) + + def get_checkpoint_file(self): + save_file = os.path.join(self.save_dir, "last_checkpoint") + try: + with open(save_file, "r") as f: + last_saved = f.read() + last_saved = last_saved.strip() + except IOError: + # if file doesn't exist, maybe because it has just been + # deleted by a separate process + last_saved = "" + return last_saved + + def tag_last_checkpoint(self, last_filename): + save_file = os.path.join(self.save_dir, "last_checkpoint") + with open(save_file, "w") as f: + f.write(last_filename) + + def _load_file(self, f): + return torch.load(f, map_location=torch.device("cpu")) + + def _load_model(self, checkpoint): + load_state_dict(self.model, checkpoint.pop("model")) + + +class DetectronCheckpointer(Checkpointer): + def __init__( + self, + cfg, + model, + optimizer=None, + scheduler=None, + save_dir="", + save_to_disk=None, + logger=None, + ): + super(DetectronCheckpointer, self).__init__( + model, optimizer, scheduler, save_dir, save_to_disk, logger + ) + self.cfg = cfg.clone() + + def _load_file(self, f): + # catalog lookup + if f.startswith("catalog://"): + paths_catalog = import_file( + "maskrcnn_benchmark.config.paths_catalog", self.cfg.PATHS_CATALOG, True + ) + catalog_f = paths_catalog.ModelCatalog.get(f[len("catalog://") :]) + # self.logger.info("{} points to {}".format(f, catalog_f)) + f = catalog_f + # download url files + if 
f.startswith("http"): + # if the file is a url path, download it and cache it + cached_f = cache_url(f) + # self.logger.info("url {} cached in {}".format(f, cached_f)) + f = cached_f + # convert Caffe2 checkpoint from pkl + if f.endswith(".pkl"): + return load_c2_format(self.cfg, f) + # load native detectron.pytorch checkpoint + loaded = super(DetectronCheckpointer, self)._load_file(f) + if "model" not in loaded: + loaded = dict(model=loaded) + return loaded diff --git a/maskrcnn_benchmark/utils/collect_env.py b/maskrcnn_benchmark/utils/collect_env.py new file mode 100644 index 0000000000000000000000000000000000000000..2d0641dda61c9950cb54d0552106246248e571ef --- /dev/null +++ b/maskrcnn_benchmark/utils/collect_env.py @@ -0,0 +1,14 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +import PIL + +from torch.utils.collect_env import get_pretty_env_info + + +def get_pil_version(): + return "\n Pillow ({})".format(PIL.__version__) + + +def collect_env_info(): + env_str = get_pretty_env_info() + env_str += get_pil_version() + return env_str diff --git a/maskrcnn_benchmark/utils/comm.py b/maskrcnn_benchmark/utils/comm.py new file mode 100644 index 0000000000000000000000000000000000000000..46d7c55ce04b4180def3909cd0989c21e544085f --- /dev/null +++ b/maskrcnn_benchmark/utils/comm.py @@ -0,0 +1,117 @@ +""" +This file contains primitives for multi-gpu communication. +This is useful when doing distributed training. +""" + +import pickle +import time + +import torch +import torch.distributed as dist + + +def get_world_size(): + if not dist.is_available(): + return 1 + if not dist.is_initialized(): + return 1 + return dist.get_world_size() + + +def get_rank(): + if not dist.is_available(): + return 0 + if not dist.is_initialized(): + return 0 + return dist.get_rank() + + +def is_main_process(): + return get_rank() == 0 + + +def synchronize(): + """ + Helper function to synchronize (barrier) among all processes when + using distributed training + """ + if not dist.is_available(): + return + if not dist.is_initialized(): + return + world_size = dist.get_world_size() + if world_size == 1: + return + dist.barrier() + + +def all_gather(data): + """ + Run all_gather on arbitrary picklable data (not necessarily tensors) + Args: + data: any picklable object + Returns: + list[data]: list of data gathered from each rank + """ + world_size = get_world_size() + if world_size == 1: + return [data] + + # serialized to a Tensor + buffer = pickle.dumps(data) + storage = torch.ByteStorage.from_buffer(buffer) + tensor = torch.ByteTensor(storage).to("cuda") + + # obtain Tensor size of each rank + local_size = torch.IntTensor([tensor.numel()]).to("cuda") + size_list = [torch.IntTensor([0]).to("cuda") for _ in range(world_size)] + dist.all_gather(size_list, local_size) + size_list = [int(size.item()) for size in size_list] + max_size = max(size_list) + + # receiving Tensor from all ranks + # we pad the tensor because torch all_gather does not support + # gathering tensors of different shapes + tensor_list = [] + for _ in size_list: + tensor_list.append(torch.ByteTensor(size=(max_size,)).to("cuda")) + if local_size != max_size: + padding = torch.ByteTensor(size=(max_size - local_size,)).to("cuda") + tensor = torch.cat((tensor, padding), dim=0) + dist.all_gather(tensor_list, tensor) + + data_list = [] + for size, tensor in zip(size_list, tensor_list): + buffer = tensor.cpu().numpy().tobytes()[:size] + data_list.append(pickle.loads(buffer)) + + return data_list + + +def reduce_dict(input_dict, 
average=True): + """ + Args: + input_dict (dict): all the values will be reduced + average (bool): whether to do average or sum + Reduce the values in the dictionary from all processes so that process with rank + 0 has the averaged results. Returns a dict with the same fields as + input_dict, after reduction. + """ + world_size = get_world_size() + if world_size < 2: + return input_dict + with torch.no_grad(): + names = [] + values = [] + # sort the keys so that they are consistent across processes + for k in sorted(input_dict.keys()): + names.append(k) + values.append(input_dict[k]) + values = torch.stack(values, dim=0) + dist.reduce(values, dst=0) + if dist.get_rank() == 0 and average: + # only main process gets accumulated, so only divide by + # world_size in this case + values /= world_size + reduced_dict = {k: v for k, v in zip(names, values)} + return reduced_dict diff --git a/maskrcnn_benchmark/utils/cv2_util.py b/maskrcnn_benchmark/utils/cv2_util.py new file mode 100644 index 0000000000000000000000000000000000000000..0bbc0fb2d08337bfd8242cbedd514a41d8d7353f --- /dev/null +++ b/maskrcnn_benchmark/utils/cv2_util.py @@ -0,0 +1,24 @@ +""" +Module for cv2 utility functions and maintaining version compatibility +between 3.x and 4.x +""" +import cv2 + + +def findContours(*args, **kwargs): + """ + Wraps cv2.findContours to maintain compatiblity between versions + 3 and 4 + + Returns: + contours, hierarchy + """ + if cv2.__version__.startswith('4'): + contours, hierarchy = cv2.findContours(*args, **kwargs) + elif cv2.__version__.startswith('3'): + _, contours, hierarchy = cv2.findContours(*args, **kwargs) + else: + raise AssertionError( + 'cv2 must be either version 3 or 4 to call this method') + + return contours, hierarchy diff --git a/maskrcnn_benchmark/utils/env.py b/maskrcnn_benchmark/utils/env.py new file mode 100644 index 0000000000000000000000000000000000000000..1c7db32e41ec266ead9734f90d0173b4feff61ef --- /dev/null +++ b/maskrcnn_benchmark/utils/env.py @@ -0,0 +1,37 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +import os + +from maskrcnn_benchmark.utils.imports import import_file + + +def setup_environment(): + """Perform environment setup work. The default setup is a no-op, but this + function allows the user to specify a Python source file that performs + custom setup work that may be necessary to their computing environment. + """ + custom_module_path = os.environ.get("TORCH_DETECTRON_ENV_MODULE") + if custom_module_path: + setup_custom_environment(custom_module_path) + else: + # The default setup is a no-op + pass + + +def setup_custom_environment(custom_module_path): + """Load custom environment setup from a Python source file and run the setup + function. + """ + module = import_file("maskrcnn_benchmark.utils.env.custom_module", custom_module_path) + assert hasattr(module, "setup_environment") and callable( + module.setup_environment + ), ( + "Custom environment module defined in {} does not have the " + "required callable attribute 'setup_environment'." + ).format( + custom_module_path + ) + module.setup_environment() + + +# Force environment setup when this module is imported +setup_environment() diff --git a/maskrcnn_benchmark/utils/imports.py b/maskrcnn_benchmark/utils/imports.py new file mode 100644 index 0000000000000000000000000000000000000000..53e27e2bcfd6d9dd57579f48d42811072daf0df5 --- /dev/null +++ b/maskrcnn_benchmark/utils/imports.py @@ -0,0 +1,23 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 
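+# Python 2/3 compatible import of a module from an explicit file path
+# (used e.g. to load the paths_catalog and custom environment modules).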
+import torch + +if torch._six.PY3: + import importlib + import importlib.util + import sys + + + # from https://stackoverflow.com/questions/67631/how-to-import-a-module-given-the-full-path?utm_medium=organic&utm_source=google_rich_qa&utm_campaign=google_rich_qa + def import_file(module_name, file_path, make_importable=False): + spec = importlib.util.spec_from_file_location(module_name, file_path) + module = importlib.util.module_from_spec(spec) + spec.loader.exec_module(module) + if make_importable: + sys.modules[module_name] = module + return module +else: + import imp + + def import_file(module_name, file_path, make_importable=None): + module = imp.load_source(module_name, file_path) + return module diff --git a/maskrcnn_benchmark/utils/logger.py b/maskrcnn_benchmark/utils/logger.py new file mode 100644 index 0000000000000000000000000000000000000000..13847a3a76b481e132190ee0757b3539fb8981ae --- /dev/null +++ b/maskrcnn_benchmark/utils/logger.py @@ -0,0 +1,25 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +import logging +import os +import sys + + +def setup_logger(name, save_dir, distributed_rank, filename="log.txt"): + logger = logging.getLogger(name) + logger.setLevel(logging.DEBUG) + # don't log results for the non-master process + if distributed_rank > 0: + return logger + ch = logging.StreamHandler(stream=sys.stdout) + ch.setLevel(logging.DEBUG) + formatter = logging.Formatter("%(asctime)s %(name)s %(levelname)s: %(message)s") + ch.setFormatter(formatter) + logger.addHandler(ch) + + if save_dir: + fh = logging.FileHandler(os.path.join(save_dir, filename)) + fh.setLevel(logging.DEBUG) + fh.setFormatter(formatter) + logger.addHandler(fh) + + return logger diff --git a/maskrcnn_benchmark/utils/metric_logger.py b/maskrcnn_benchmark/utils/metric_logger.py new file mode 100644 index 0000000000000000000000000000000000000000..5e37a72ea4b4c85546de98210295a6adb134a297 --- /dev/null +++ b/maskrcnn_benchmark/utils/metric_logger.py @@ -0,0 +1,66 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +from collections import defaultdict +from collections import deque + +import torch + + +class SmoothedValue(object): + """Track a series of values and provide access to smoothed values over a + window or the global series average. 
+ """ + + def __init__(self, window_size=20): + self.deque = deque(maxlen=window_size) + self.series = [] + self.total = 0.0 + self.count = 0 + + def update(self, value): + self.deque.append(value) + self.series.append(value) + self.count += 1 + self.total += value + + @property + def median(self): + d = torch.tensor(list(self.deque)) + return d.median().item() + + @property + def avg(self): + d = torch.tensor(list(self.deque)) + return d.mean().item() + + @property + def global_avg(self): + return self.total / self.count + + +class MetricLogger(object): + def __init__(self, delimiter="\t"): + self.meters = defaultdict(SmoothedValue) + self.delimiter = delimiter + + def update(self, **kwargs): + for k, v in kwargs.items(): + if isinstance(v, torch.Tensor): + v = v.item() + assert isinstance(v, (float, int)) + self.meters[k].update(v) + + def __getattr__(self, attr): + if attr in self.meters: + return self.meters[attr] + if attr in self.__dict__: + return self.__dict__[attr] + raise AttributeError("'{}' object has no attribute '{}'".format( + type(self).__name__, attr)) + + def __str__(self): + loss_str = [] + for name, meter in self.meters.items(): + loss_str.append( + "{}: {:.4f} ({:.4f})".format(name, meter.median, meter.global_avg) + ) + return self.delimiter.join(loss_str) diff --git a/maskrcnn_benchmark/utils/miscellaneous.py b/maskrcnn_benchmark/utils/miscellaneous.py new file mode 100644 index 0000000000000000000000000000000000000000..db9a8b3679ceea2a5cd2b807421793bbbd3d3677 --- /dev/null +++ b/maskrcnn_benchmark/utils/miscellaneous.py @@ -0,0 +1,11 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +import errno +import os + + +def mkdir(path): + try: + os.makedirs(path) + except OSError as e: + if e.errno != errno.EEXIST: + raise diff --git a/maskrcnn_benchmark/utils/model_serialization.py b/maskrcnn_benchmark/utils/model_serialization.py new file mode 100644 index 0000000000000000000000000000000000000000..a95ad8b2a7a787d62dc3ea580b2dfd30e358da28 --- /dev/null +++ b/maskrcnn_benchmark/utils/model_serialization.py @@ -0,0 +1,80 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +from collections import OrderedDict +import logging + +import torch + +from maskrcnn_benchmark.utils.imports import import_file + + +def align_and_update_state_dicts(model_state_dict, loaded_state_dict): + """ + Strategy: suppose that the models that we will create will have prefixes appended + to each of its keys, for example due to an extra level of nesting that the original + pre-trained weights from ImageNet won't contain. For example, model.state_dict() + might return backbone[0].body.res2.conv1.weight, while the pre-trained model contains + res2.conv1.weight. We thus want to match both parameters together. + For that, we look for each model weight, look among all loaded keys if there is one + that is a suffix of the current weight name, and use it if that's the case. + If multiple matches exist, take the one with longest size + of the corresponding name. For example, for the same model as before, the pretrained + weight file can contain both res2.conv1.weight, as well as conv1.weight. In this case, + we want to match backbone[0].body.conv1.weight to conv1.weight, and + backbone[0].body.res2.conv1.weight to res2.conv1.weight. 
+ """ + current_keys = sorted(list(model_state_dict.keys())) + loaded_keys = sorted(list(loaded_state_dict.keys())) + # get a matrix of string matches, where each (i, j) entry correspond to the size of the + # loaded_key string, if it matches + match_matrix = [ + len(j) if i.endswith(j) else 0 for i in current_keys for j in loaded_keys + ] + match_matrix = torch.as_tensor(match_matrix).view( + len(current_keys), len(loaded_keys) + ) + max_match_size, idxs = match_matrix.max(1) + # remove indices that correspond to no-match + idxs[max_match_size == 0] = -1 + + # used for logging + max_size = max([len(key) for key in current_keys]) if current_keys else 1 + max_size_loaded = max([len(key) for key in loaded_keys]) if loaded_keys else 1 + log_str_template = "{: <{}} loaded from {: <{}} of shape {}" + logger = logging.getLogger(__name__) + for idx_new, idx_old in enumerate(idxs.tolist()): + if idx_old == -1: + continue + key = current_keys[idx_new] + key_old = loaded_keys[idx_old] + model_state_dict[key] = loaded_state_dict[key_old] + logger.info( + log_str_template.format( + key, + max_size, + key_old, + max_size_loaded, + tuple(loaded_state_dict[key_old].shape), + ) + ) + + +def strip_prefix_if_present(state_dict, prefix): + keys = sorted(state_dict.keys()) + if not all(key.startswith(prefix) for key in keys): + return state_dict + stripped_state_dict = OrderedDict() + for key, value in state_dict.items(): + stripped_state_dict[key.replace(prefix, "")] = value + return stripped_state_dict + + +def load_state_dict(model, loaded_state_dict): + model_state_dict = model.state_dict() + # if the state_dict comes from a model that was wrapped in a + # DataParallel or DistributedDataParallel during serialization, + # remove the "module" prefix before performing the matching + loaded_state_dict = strip_prefix_if_present(loaded_state_dict, prefix="module.") + align_and_update_state_dicts(model_state_dict, loaded_state_dict) + + # use strict loading + model.load_state_dict(model_state_dict) diff --git a/maskrcnn_benchmark/utils/model_zoo.py b/maskrcnn_benchmark/utils/model_zoo.py new file mode 100644 index 0000000000000000000000000000000000000000..92c1ed7e5dab54bd9fa3358185c71f9d5fcf26a8 --- /dev/null +++ b/maskrcnn_benchmark/utils/model_zoo.py @@ -0,0 +1,58 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +import os +import sys + +try: + from torch.utils.model_zoo import _download_url_to_file, urlparse, HASH_REGEX +except ImportError: + # support for pytorch 1.1.0dev + from torch.hub import _download_url_to_file, urlparse, HASH_REGEX + +from maskrcnn_benchmark.utils.comm import is_main_process +from maskrcnn_benchmark.utils.comm import synchronize + + +# very similar to https://github.com/pytorch/pytorch/blob/master/torch/utils/model_zoo.py +# but with a few improvements and modifications +def cache_url(url, model_dir=None, progress=True): + r"""Loads the Torch serialized object at the given URL. + If the object is already present in `model_dir`, it's deserialized and + returned. The filename part of the URL should follow the naming convention + ``filename-.ext`` where ```` is the first eight or more + digits of the SHA256 hash of the contents of the file. The hash is used to + ensure unique names and to verify the contents of the file. + The default value of `model_dir` is ``$TORCH_HOME/models`` where + ``$TORCH_HOME`` defaults to ``~/.torch``. The default directory can be + overridden with the ``$TORCH_MODEL_ZOO`` environment variable. 
+ Args: + url (string): URL of the object to download + model_dir (string, optional): directory in which to save the object + progress (bool, optional): whether or not to display a progress bar to stderr + Example: + >>> cached_file = maskrcnn_benchmark.utils.model_zoo.cache_url('https://s3.amazonaws.com/pytorch/models/resnet18-5c106cde.pth') + """ + if model_dir is None: + torch_home = os.path.expanduser(os.getenv('TORCH_HOME', '~/.torch')) + model_dir = os.getenv('TORCH_MODEL_ZOO', os.path.join(torch_home, 'models')) + if not os.path.exists(model_dir): + os.makedirs(model_dir) + parts = urlparse(url) + filename = os.path.basename(parts.path) + if filename == "model_final.pkl": + # workaround as pre-trained Caffe2 models from Detectron have all the same filename + # so make the full path the filename by replacing / with _ + filename = parts.path.replace("/", "_") + cached_file = os.path.join(model_dir, filename) + if not os.path.exists(cached_file) and is_main_process(): + sys.stderr.write('Downloading: "{}" to {}\n'.format(url, cached_file)) + hash_prefix = HASH_REGEX.search(filename) + if hash_prefix is not None: + hash_prefix = hash_prefix.group(1) + # workaround: Caffe2 models don't have a hash, but follow the R-50 convention, + # which matches the hash PyTorch uses. So we skip the hash matching + # if the hash_prefix is less than 6 characters + if len(hash_prefix) < 6: + hash_prefix = None + _download_url_to_file(url, cached_file, hash_prefix, progress=progress) + synchronize() + return cached_file diff --git a/maskrcnn_benchmark/utils/registry.py b/maskrcnn_benchmark/utils/registry.py new file mode 100644 index 0000000000000000000000000000000000000000..c3204e14148fe3341307c5d24ba9154c07449511 --- /dev/null +++ b/maskrcnn_benchmark/utils/registry.py @@ -0,0 +1,45 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. + + +def _register_generic(module_dict, module_name, module): + assert module_name not in module_dict + module_dict[module_name] = module + + +class Registry(dict): + ''' + A helper class for managing registering modules, it extends a dictionary + and provides a register functions. + + Eg. creeting a registry: + some_registry = Registry({"default": default_module}) + + There're two ways of registering new modules: + 1): normal way is just calling register function: + def foo(): + ... + some_registry.register("foo_module", foo) + 2): used as decorator when declaring the module: + @some_registry.register("foo_module") + @some_registry.register("foo_modeul_nickname") + def foo(): + ... + + Access of module is just like using a dictionary, eg: + f = some_registry["foo_modeul"] + ''' + def __init__(self, *args, **kwargs): + super(Registry, self).__init__(*args, **kwargs) + + def register(self, module_name, module=None): + # used as function call + if module is not None: + _register_generic(self, module_name, module) + return + + # used as decorator + def register_fn(fn): + _register_generic(self, module_name, fn) + return fn + + return register_fn diff --git a/maskrcnn_benchmark/utils/timer.py b/maskrcnn_benchmark/utils/timer.py new file mode 100644 index 0000000000000000000000000000000000000000..935af1a30811abd81de29afd2cfec6cf6880cc5e --- /dev/null +++ b/maskrcnn_benchmark/utils/timer.py @@ -0,0 +1,46 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 
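+# Minimal wall-clock timer with tic/toc, running totals and average-time formatting.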
+ + +import time +import datetime + + +class Timer(object): + def __init__(self): + self.reset() + + @property + def average_time(self): + return self.total_time / self.calls if self.calls > 0 else 0.0 + + def tic(self): + # using time.time instead of time.clock because time time.clock + # does not normalize for multithreading + self.start_time = time.time() + + def toc(self, average=True): + self.add(time.time() - self.start_time) + if average: + return self.average_time + else: + return self.diff + + def add(self, time_diff): + self.diff = time_diff + self.total_time += self.diff + self.calls += 1 + + def reset(self): + self.total_time = 0.0 + self.calls = 0 + self.start_time = 0.0 + self.diff = 0.0 + + def avg_time_str(self): + time_str = str(datetime.timedelta(seconds=self.average_time)) + return time_str + + +def get_time_str(time_diff): + time_str = str(datetime.timedelta(seconds=time_diff)) + return time_str diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..a67b697bd543bc0648f92a63535180d18e870985 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,5 @@ +ninja +yacs +cython +matplotlib +tqdm diff --git a/setup.py b/setup.py new file mode 100644 index 0000000000000000000000000000000000000000..837c2cd15f4624f630540ef6993dcb9123adb39b --- /dev/null +++ b/setup.py @@ -0,0 +1,69 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +#!/usr/bin/env python + +import glob +import os + +import torch +from setuptools import find_packages +from setuptools import setup +from torch.utils.cpp_extension import CUDA_HOME +from torch.utils.cpp_extension import CppExtension +from torch.utils.cpp_extension import CUDAExtension + +requirements = ["torch", "torchvision"] + + +def get_extensions(): + this_dir = os.path.dirname(os.path.abspath(__file__)) + extensions_dir = os.path.join(this_dir, "maskrcnn_benchmark", "csrc") + + main_file = glob.glob(os.path.join(extensions_dir, "*.cpp")) + source_cpu = glob.glob(os.path.join(extensions_dir, "cpu", "*.cpp")) + source_cuda = glob.glob(os.path.join(extensions_dir, "cuda", "*.cu")) + + sources = main_file + source_cpu + extension = CppExtension + + extra_compile_args = {"cxx": []} + define_macros = [] + + if (torch.cuda.is_available() and CUDA_HOME is not None) or os.getenv("FORCE_CUDA", "0") == "1": + extension = CUDAExtension + sources += source_cuda + define_macros += [("WITH_CUDA", None)] + extra_compile_args["nvcc"] = [ + "-DCUDA_HAS_FP16=1", + "-D__CUDA_NO_HALF_OPERATORS__", + "-D__CUDA_NO_HALF_CONVERSIONS__", + "-D__CUDA_NO_HALF2_OPERATORS__", + ] + + sources = [os.path.join(extensions_dir, s) for s in sources] + + include_dirs = [extensions_dir] + + ext_modules = [ + extension( + "maskrcnn_benchmark._C", + sources, + include_dirs=include_dirs, + define_macros=define_macros, + extra_compile_args=extra_compile_args, + ) + ] + + return ext_modules + + +setup( + name="maskrcnn_benchmark", + version="0.1", + author="fmassa", + url="https://github.com/facebookresearch/maskrcnn-benchmark", + description="object detection in pytorch", + packages=find_packages(exclude=("configs", "tests",)), + # install_requires=requirements, + ext_modules=get_extensions(), + cmdclass={"build_ext": torch.utils.cpp_extension.BuildExtension}, +) diff --git a/test_contour.sh b/test_contour.sh new file mode 100644 index 0000000000000000000000000000000000000000..b6cb1d6f0a2b5aab192ac88725b537c26d3fd813 --- /dev/null +++ b/test_contour.sh @@ -0,0 +1,3 @@ +export NGPUS=1 +CUDA_VISIBLE_DEVICES=0 
python -m torch.distributed.launch --nproc_per_node=$NGPUS tools/test_net.py \ + --config-file "configs/ctw/r50_baseline.yaml" diff --git a/tools/demo.py b/tools/demo.py new file mode 100644 index 0000000000000000000000000000000000000000..7f4b2b4da1c7ebf6593da0dd07fce157c889ddd6 --- /dev/null +++ b/tools/demo.py @@ -0,0 +1,620 @@ +import os +import cv2 +import torch +from torchvision import transforms as T +import torch.nn as nn + +from maskrcnn_benchmark.modeling.detector import build_detection_model +from maskrcnn_benchmark.utils.checkpoint import DetectronCheckpointer +from maskrcnn_benchmark.structures.image_list import to_image_list +from maskrcnn_benchmark.config import cfg +from maskrcnn_benchmark.utils.chars import getstr_grid, get_tight_rect +from maskrcnn_benchmark.data.datasets.evaluation.word.alfashape import getAlfaShapes +from maskrcnn_benchmark.modeling.roi_heads.boundary_head.inference import Masker +from shapely.geometry import * +import random +from torchvision.transforms import functional as F + +from PIL import Image +import numpy as np +import argparse + +class Resize(object): + def __init__(self, min_size, max_size): + if not isinstance(min_size, (list, tuple)): + min_size = (min_size,) + self.min_size = min_size + self.max_size = max_size + + # modified from torchvision to add support for max size + def get_size(self, image_size): + w, h = image_size + size = random.choice(self.min_size) + max_size = self.max_size + if max_size is not None: + min_original_size = float(min((w, h))) + max_original_size = float(max((w, h))) + if max_original_size / min_original_size * size > max_size: + size = int(round(max_size * min_original_size / max_original_size)) + + if (w <= h and w == size) or (h <= w and h == size): + return (h, w) + + if w < h: + ow = size + oh = int(size * h / w) + else: + oh = size + ow = int(size * w / h) + + return (oh, ow) + + def __call__(self, image): + size = self.get_size(image.size) + image = F.resize(image, size) + return image + +class TextDemo(object): + def __init__( + self, + cfg, + confidence_threshold=0.7, + min_image_size=(1200,2000), + output_polygon=True + ): + self.cfg = cfg.clone() + self.model = build_detection_model(cfg) + self.model.eval() + self.device = torch.device(cfg.MODEL.DEVICE) + self.model.to(self.device) + self.min_image_size = min_image_size + + checkpointer = DetectronCheckpointer(cfg, self.model, save_dir=cfg.OUTPUT_DIR) + _ = checkpointer.load(cfg.MODEL.WEIGHT) + + self.transforms = self.build_transform() + self.cpu_device = torch.device("cpu") + self.confidence_threshold = confidence_threshold + self.output_polygon = output_polygon + + def build_transform(self): + """ + Creates a basic transformation that was used to train the models + """ + cfg = self.cfg + # we are loading images with OpenCV, so we don't need to convert them + # to BGR, they are already! So all we need to do is to normalize + # by 255 if we want to convert to BGR255 format, or flip the channels + # if we want it to be in RGB in [0-1] range. 
+ if cfg.INPUT.TO_BGR255: + to_bgr_transform = T.Lambda(lambda x: x * 255) + else: + to_bgr_transform = T.Lambda(lambda x: x[[2, 1, 0]]) + + normalize_transform = T.Normalize( + mean=cfg.INPUT.PIXEL_MEAN, std=cfg.INPUT.PIXEL_STD + ) + min_size = cfg.INPUT.MIN_SIZE_TEST + max_size = cfg.INPUT.MAX_SIZE_TEST + + transform = T.Compose( + [ + T.ToPILImage(), + Resize(min_size, max_size), + T.ToTensor(), + to_bgr_transform, + normalize_transform, + ] + ) + return transform + + def run_on_opencv_image(self, image): + """ + Arguments: + image (np.ndarray): an image as returned by OpenCV + Returns: + result_polygons (list): detection results + result_words (list): recognition results + """ + result_polygons = self.compute_prediction(image) + return result_polygons + + def contour_to_valid(self, cnt, image_shape): + """Convert rect to xys, i.e., eight points + The `image_shape` is used to to make sure all points return are valid, i.e., within image area + """ + # rect = cv2.minAreaRect(cnt) + if len(cnt.shape) != 3: + assert 1 < 0 + rect = cnt.reshape([cnt.shape[0], cnt.shape[2]]) + h, w = image_shape[0:2] + + def get_valid_x(x): + if x < 0: + return 0 + if x >= w: + return w - 1 + return x + + def get_valid_y(y): + if y < 0: + return 0 + if y >= h: + return h - 1 + return y + for i_xy, (x, y) in enumerate(rect): + x = get_valid_x(x) + y = get_valid_y(y) + rect[i_xy, :] = [x, y] + + points = np.reshape(rect, -1) + return points + + def _nms_y(self, heat, kernel=3): + pad = (kernel - 1) // 2 + hmax = nn.functional.max_pool2d( + heat, (1, kernel), stride=1, padding=(0, pad)) + keep = (hmax == heat).float() + return heat * keep + + def _nms_x(self, heat, kernel=3): + pad = (kernel - 1) // 2 + hmax = nn.functional.max_pool2d( + heat, (kernel, 1), stride=1, padding=(pad, 0)) + keep = (hmax == heat).float() + return heat * keep + + def CTW_order_lr(self, map_in): + line_out_l2r = [] + line_out_r2l = [] + + map_in = torch.tensor(map_in) + value, top = torch.topk(map_in, 2, dim=0) + value = value.numpy() + top = top.numpy() + top_th = np.where(value[1] > 0.1)[0] # L + # print(top_th) + if len(top_th) == 0: + return [] + top1 = np.sort(top, axis=0) + for i in range(len(top_th)): + line_out_l2r.append([top_th[i], top1[0][top_th[i]]]) + line_out_r2l.append([top_th[i], top1[1][top_th[i]]]) + line_out = line_out_l2r+line_out_r2l[::-1] + # print(line_out) + return line_out + + def CTW_order_bt(self, map_in): + line_out_t2b = [] + line_out_b2t = [] + + map_in = torch.tensor(map_in) + value, top = torch.topk(map_in, 2, dim=1) + value = value.numpy() + top = top.numpy() + top_th = np.where(value[:, 1] > 0.1)[0] # H + if len(top_th) == 0: + return [] + top1 = np.sort(top, axis=1) + for i in range(len(top_th)): + line_out_b2t.append([top1[top_th[i]][0], top_th[i]]) + line_out_t2b.append([top1[top_th[i]][1], top_th[i]]) + line_out = line_out_b2t[::-1] + line_out_t2b + # print(line_out) + return line_out + + def boundary_to_mask_ic(self, bo_x, bo_y): + + # NMS Hmap and Vmap + Vmap = self._nms_x(bo_x, kernel=5) + Hmap = self._nms_y(bo_y, kernel=3) + Vmap = Vmap[0] + Hmap = Hmap[0] + ploys_Alfa_x = Vmap.clone().numpy() + ploys_Alfa_y = Hmap.clone().numpy() + + # Threshold Hmap and Vmap + thresh = 0.5 + ploys_Alfa_x[ploys_Alfa_x < thresh] = 0 + ploys_Alfa_x[ploys_Alfa_x >= thresh] = 1 + ploys_Alfa_y[ploys_Alfa_y < thresh] = 0 + ploys_Alfa_y[ploys_Alfa_y >= thresh] = 1 + # Output points with strong texture inforamtion in both maps + ploys_Alfa = ploys_Alfa_x + ploys_Alfa_y + ploys_Alfa[ploys_Alfa < 2] = 0 + 
ploys_Alfa[ploys_Alfa == 2] = 1 + img_draw = np.zeros([ploys_Alfa_y.shape[-1], ploys_Alfa_y.shape[-1]], dtype=np.uint8) + + # calculate polygon by Alpha-Shape Algorithm + if ploys_Alfa.sum() == 0: + return img_draw + ploys_Alfa_inds = np.argwhere(ploys_Alfa == 1) + zero_detect_x = ploys_Alfa_inds[:, 0] - ploys_Alfa_inds[0, 0] + zero_detect_y = ploys_Alfa_inds[:, 1] - ploys_Alfa_inds[0, 1] + if np.where(zero_detect_x != 0)[0].shape[0] == 0 or np.where(zero_detect_y != 0)[0].shape[0] == 0 or \ + ploys_Alfa_inds.shape[0] < 4: + draw_line = ploys_Alfa_inds[np.newaxis, np.newaxis, :, :] + cv2.fillPoly(img_draw, draw_line, 1) + return img_draw + ploys_Alfa_inds = ploys_Alfa_inds.tolist() + ploys_Alfa_inds = [tuple(ploys_Alfa_ind) for ploys_Alfa_ind in ploys_Alfa_inds] + lines = getAlfaShapes(ploys_Alfa_inds, alfas=[1]) + draw_line = np.array(lines) + if len(draw_line.shape) == 4: + if draw_line.shape[1] == 1: + draw_line[0, 0, :, :] = draw_line[0, 0, :, ::-1] + cv2.fillPoly(img_draw, draw_line, 1) + else: + i_draw = 0 + for draw_l in draw_line[0]: + img_draw_new = np.zeros([28, 28], dtype=np.uint8) + draw_l = draw_l[np.newaxis, np.newaxis, :, :] + cv2.fillPoly(img_draw, np.int32(draw_l), 1) + cv2.fillPoly(img_draw_new, np.int32(draw_l), 1) + i_draw += 1 + + else: + for i, line in enumerate(lines[0]): + draw_line = np.array(line) + draw_line = draw_line[np.newaxis, np.newaxis, :, :] + draw_line[0, 0, :, :] = draw_line[0, 0, :, ::-1] + cv2.fillPoly(img_draw, draw_line, 1) + return img_draw + + def boundary_to_mask_ctw(self, bo_x, bo_y, p_temp_box): + w_half = (p_temp_box[2] - p_temp_box[0]) * .5 + h_half = (p_temp_box[3] - p_temp_box[1]) * .5 + thresh_total = 0.5 + + if w_half >= h_half: + # point re-scoring + bo_x = self._nms_x(bo_x, kernel=9) + bo_x = bo_x[0] + bo_y = bo_y[0] + ploys_Alfa_x = bo_x.clone().numpy() + ploys_Alfa_y = bo_y.clone().numpy() + thresh_x = thresh_total + thresh_y = thresh_total + ploys_Alfa_x_1 = bo_x.clone().numpy() + ploys_Alfa_y_1 = bo_y.clone().numpy() + ploys_Alfa__1 = ploys_Alfa_x_1 + ploys_Alfa_y_1 + ploys_Alfa_x[ploys_Alfa_x < thresh_x] = 0 + ploys_Alfa_x[ploys_Alfa_x >= thresh_x] = 1 + ploys_Alfa_y[ploys_Alfa_y < thresh_y] = 0 + ploys_Alfa_y[ploys_Alfa_y >= thresh_y] = 1 + ploys_Alfa = ploys_Alfa_x + ploys_Alfa_y + ploys_Alfa[ploys_Alfa < 2] = 0 + ploys_Alfa[ploys_Alfa == 2] = 1 + ploys_Alfa *= ploys_Alfa__1 + # rebuild text region from contour points + img_draw = np.zeros([ploys_Alfa_y.shape[-1], ploys_Alfa_y.shape[-1]], dtype=np.uint8) + if ploys_Alfa.sum() == 0: + return img_draw + lines = self.CTW_order_lr(ploys_Alfa) + else: + bo_y = self._nms_y(bo_y,kernel=9) + bo_x = bo_x[0] + bo_y = bo_y[0] + ploys_Alfa_x = bo_x.clone().numpy() + ploys_Alfa_y = bo_y.clone().numpy() + thresh_x = thresh_total + thresh_y = thresh_total + ploys_Alfa_x_1 = bo_x.clone().numpy() + ploys_Alfa_y_1 = bo_y.clone().numpy() + ploys_Alfa__1 = ploys_Alfa_x_1 + ploys_Alfa_y_1 + ploys_Alfa_x[ploys_Alfa_x < thresh_x] = 0 + ploys_Alfa_x[ploys_Alfa_x >= thresh_x] = 1 + ploys_Alfa_y[ploys_Alfa_y < thresh_y] = 0 + ploys_Alfa_y[ploys_Alfa_y >= thresh_y] = 1 + ploys_Alfa = ploys_Alfa_x + ploys_Alfa_y + ploys_Alfa[ploys_Alfa < 2] = 0 + ploys_Alfa[ploys_Alfa == 2] = 1 + ploys_Alfa *= ploys_Alfa__1 + img_draw = np.zeros([ploys_Alfa_y.shape[-1], ploys_Alfa_y.shape[-1]], dtype=np.uint8) + if ploys_Alfa.sum() == 0: + return img_draw + lines = self.CTW_order_bt(ploys_Alfa) + if len(lines) <=10: + return img_draw + draw_line = np.array(lines) + draw_line = draw_line[np.newaxis, np.newaxis, :, :] + 
cv2.fillPoly(img_draw, draw_line, 1) + img_draw = img_draw.astype(np.uint8) + kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (5, 5)) + img_draw = cv2.morphologyEx(img_draw, cv2.MORPH_CLOSE, kernel) + return img_draw + + def contour_to_xys(self, cnt, image_shape): + """Convert rect to xys, i.e., eight points. + The `image_shape` is used to make sure all returned points are valid, i.e., within the image area + """ + rect = cv2.minAreaRect(cnt) + h, w = image_shape[0:2] + + def get_valid_x(x): + if x < 0: + return 0 + if x >= w: + return w - 1 + return x + + def get_valid_y(y): + if y < 0: + return 0 + if y >= h: + return h - 1 + return y + + points = cv2.boxPoints(rect) + points = np.int0(points) + for i_xy, (x, y) in enumerate(points): + x = get_valid_x(x) + y = get_valid_y(y) + points[i_xy, :] = [x, y] + points = np.reshape(points, -1) + return points + + def mask_to_roRect(self, mask, img_shape): + ## convert mask into rotated rect + e = mask[0, :, :] + _, countours, hier = cv2.findContours(e.clone().numpy(), cv2.RETR_CCOMP, cv2.CHAIN_APPROX_NONE) # Aarlog + if len(countours) == 0: + return np.zeros((1, 8)) + t_c = countours[0].copy() + quad = self.contour_to_xys(t_c, img_shape) + return quad + + def mask_to_contours(self, mask, img_shape): + e = mask[0, :, :] + + _, countours, hier = cv2.findContours(e.clone().numpy(), cv2.RETR_CCOMP, cv2.CHAIN_APPROX_NONE) # Aarlog + + if len(countours) == 0: + return np.zeros((1, 8)) + t_c = countours[0].copy() + quad = self.contour_to_valid(t_c, img_shape) + return quad + + def py_cpu_pnms(self, dets, scores, thresh): + pts = [] + for det in dets: + pts.append([[det[i][0], det[i][1]] for i in range(len(det))]) + order = scores.argsort()[::-1] + areas = np.zeros(scores.shape) + inter_areas = np.zeros((scores.shape[0], scores.shape[0])) + for il in range(len(pts)): + poly = Polygon(pts[il]) + areas[il] = poly.area + for jl in range(il, len(pts)): + polyj = Polygon(pts[jl]) + try: + # intersection of invalid polygons can fail; treat such pairs as non-overlapping + inter_area = poly.intersection(polyj).area + except Exception: + print(poly, polyj) + inter_area = 0.0 + inter_areas[il][jl] = inter_area + inter_areas[jl][il] = inter_area + + keep = [] + while order.size > 0: + i = order[0] + keep.append(i) + ovr = inter_areas[i][order[1:]] / (areas[i] + areas[order[1:]] - inter_areas[i][order[1:]]) + inds = np.where(ovr <= thresh)[0] + order = order[inds + 1] + return keep + + def esd_pnms(self, esd, pnms_thresh): + scores = [] + dets = [] + for ele in esd: + score = ele['score'] + quad = ele['seg_rorect'] + # det = np.array([[quad[0][0], quad[0][1]], [quad[1][0], quad[1][1]],[quad[2][0], quad[2][1]],[quad[3][0], quad[3][1]]]) + det = np.array([[quad[0], quad[1]], [quad[2], quad[3]], [quad[4], quad[5]], [quad[6], quad[7]]]) + scores.append(score) + dets.append(det) + scores = np.array(scores) + dets = np.array(dets) + keep = self.py_cpu_pnms(dets, scores, pnms_thresh) + return keep + + def compute_prediction(self, original_image): + # apply pre-processing to image + image = self.transforms(original_image) + # convert to an ImageList, padded so that it is divisible by + # cfg.DATALOADER.SIZE_DIVISIBILITY + image_list = to_image_list(image, self.cfg.DATALOADER.SIZE_DIVISIBILITY) + image_list = image_list.to(self.device) + # compute predictions + with torch.no_grad(): + output = self.model(image_list) + prediction = [o.to(self.cpu_device) for o in output][0] + #global_predictions = predictions[0] + #char_predictions = predictions[1] + #char_mask = char_predictions['char_mask'] + #char_boxes = char_predictions['boxes'] + #words, rec_scores =
self.process_char_mask(char_mask, char_boxes) + #seq_words = char_predictions['seq_outputs'] + #seq_scores = char_predictions['seq_scores'] + + # reshape prediction (a BoxList) into the original image size + image_height, image_width = original_image.shape[:-1] + prediction = prediction.resize((image_width, image_height)) + if len(prediction) == 0: + return + prediction = prediction.convert("xywh") + boxes = prediction.bbox.tolist() + scores = prediction.get_field("scores").tolist() + masks_x = prediction.get_field("mask_x") + masks_y = prediction.get_field("mask_y") + #masks = [self.boundary_to_mask_ic(mask_x, mask_y) for + # mask_x, mask_y in zip(masks_x, masks_y)] + masks = [self.boundary_to_mask_ctw(mask_x, mask_y, p_temp) for + mask_x, mask_y, p_temp in zip(masks_x, masks_y, prediction.bbox)] + masks = torch.from_numpy(np.array(masks)[:, np.newaxis, :, :]) + # Masker is necessary only if masks haven't been already resized. + masker = Masker(threshold=0.5, padding=1) + if list(masks.shape[-2:]) != [image_height, image_width]: + masks = masker(masks.expand(1, -1, -1, -1, -1), prediction) + masks = masks[0] + + ''' + rects = [self.mask_to_roRect(mask, [image_height, image_width]) for mask in masks] + + esd = [] + for k, rect in enumerate(rects): + if rect.all() == 0: + continue + else: + esd.append( + { + "seg_rorect": rect.tolist(), + "score": scores[k], + } + ) + + if cfg.PROCESS.PNMS: + pnms_thresh = cfg.PROCESS.NMS_THRESH + keep = self.esd_pnms(esd, pnms_thresh) + im_write = cv2.imread('./demo/1.jpg')[:, :, ::-1] + for i in keep: + box = esd[i] + # print(box) + # assert 1<0 + box = np.array(box['seg_rorect']) + box = np.around(box).astype(np.int32) + cv2.polylines(im_write[:, :, ::-1], [box.astype(np.int32).reshape((-1, 1, 2))], True, + color=(0, 255, 0), thickness=2) # 0,255,255 y 0,255,0 g + cv2.imwrite('./demo/example_results.jpg', im_write[:, :, ::-1]) + + ''' + contours = [self.mask_to_contours(mask, [image_height, image_width]) for mask in masks] + ''' + im_write = original_image[:, :, ::-1] + for box in contours: + box = np.array(box) + box = np.around(box).astype(np.int32) + cv2.polylines(im_write[:, :, ::-1], [box.astype(np.int32).reshape((-1, 1, 2))], True, color=(0, 255, 0), thickness=2) # 0,255,255 y 0,255,0 g + cv2.imwrite('./demo/example_results.jpg', im_write[:, :, ::-1]) + ''' + + return contours, np.array(masks.repeat(1,3,1,1)).astype(np.bool_).transpose(0,2,3,1) + + def process_char_mask(self, char_masks, boxes, threshold=192): + texts, rec_scores = [], [] + for index in range(char_masks.shape[0]): + box = list(boxes[index]) + box = list(map(int, box)) + text, rec_score, _, _ = getstr_grid(char_masks[index,:,:,:].copy(), box, threshold=threshold) + texts.append(text) + rec_scores.append(rec_score) + return texts, rec_scores + + def mask2polygon(self, mask, box, im_size, threshold=0.5, output_polygon=True): + # mask 32*128 + image_width, image_height = im_size[1], im_size[0] + box_h = box[3] - box[1] + box_w = box[2] - box[0] + cls_polys = (mask*255).astype(np.uint8) + poly_map = np.array(Image.fromarray(cls_polys).resize((box_w, box_h))) + poly_map = poly_map.astype(np.float32) / 255 + poly_map=cv2.GaussianBlur(poly_map,(3,3),sigmaX=3) + ret, poly_map = cv2.threshold(poly_map,0.5,1,cv2.THRESH_BINARY) + if output_polygon: + SE1=cv2.getStructuringElement(cv2.MORPH_RECT,(3,3)) + poly_map = cv2.erode(poly_map,SE1) + poly_map = cv2.dilate(poly_map,SE1); + poly_map = cv2.morphologyEx(poly_map,cv2.MORPH_CLOSE,SE1) + try: + _, contours, _ = cv2.findContours((poly_map * 
255).astype(np.uint8), cv2.RETR_LIST, cv2.CHAIN_APPROX_NONE) + except: + contours, _ = cv2.findContours((poly_map * 255).astype(np.uint8), cv2.RETR_LIST, cv2.CHAIN_APPROX_NONE) + if len(contours)==0: + print(contours) + print(len(contours)) + return None + max_area=0 + max_cnt = contours[0] + for cnt in contours: + area=cv2.contourArea(cnt) + if area > max_area: + max_area = area + max_cnt = cnt + perimeter = cv2.arcLength(max_cnt,True) + epsilon = 0.01*cv2.arcLength(max_cnt,True) + approx = cv2.approxPolyDP(max_cnt,epsilon,True) + pts = approx.reshape((-1,2)) + pts[:,0] = pts[:,0] + box[0] + pts[:,1] = pts[:,1] + box[1] + polygon = list(pts.reshape((-1,))) + polygon = list(map(int, polygon)) + if len(polygon)<6: + return None + else: + SE1=cv2.getStructuringElement(cv2.MORPH_RECT,(3,3)) + poly_map = cv2.erode(poly_map,SE1) + poly_map = cv2.dilate(poly_map,SE1); + poly_map = cv2.morphologyEx(poly_map,cv2.MORPH_CLOSE,SE1) + idy,idx=np.where(poly_map == 1) + xy=np.vstack((idx,idy)) + xy=np.transpose(xy) + hull = cv2.convexHull(xy, clockwise=True) + #reverse order of points. + if hull is None: + return None + hull=hull[::-1] + #find minimum area bounding box. + rect = cv2.minAreaRect(hull) + corners = cv2.boxPoints(rect) + corners = np.array(corners, dtype="int") + pts = get_tight_rect(corners, box[0], box[1], image_height, image_width, 1) + polygon = [x * 1.0 for x in pts] + polygon = list(map(int, polygon)) + return polygon + + def visualization(self, image, polygons, masks): + green = np.ones(image.shape).astype(np.uint8) + green[...,0] = 0 + green[...,1] = 255 + green[...,2] = 0 + for mask in masks: + image[mask] = image[mask] * 0.5 + green[mask] * 0.5 + ''' + for polygon in polygons: + pts = np.array(polygon, np.int32) + pts = pts.reshape((-1,1,2)) + xmin = min(pts[:,0,0]) + ymin = min(pts[:,0,1]) + cv2.polylines(image,[pts],True,(0,0,255)) + #cv2.putText(image, word, (xmin, ymin), cv2.FONT_HERSHEY_COMPLEX, 1, (0,0,255), 2) + ''' + return image + + +def main(args): + # update the config options with the config file + cfg.merge_from_file(args.config_file) + # manual override some options + # cfg.merge_from_list(["MODEL.DEVICE", "cpu"]) + + text_demo = TextDemo( + cfg, + min_image_size=(1200,2000), + confidence_threshold=0.85, + output_polygon=True + ) + # load image and then run prediction + + image = cv2.imread(args.image_path) + result_polygons, result_masks = text_demo.run_on_opencv_image(image) + image = text_demo.visualization(image, result_polygons, result_masks) + cv2.imwrite(args.visu_path, image) + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description='parameters for demo') + parser.add_argument("--config-file", type=str, default='./configs/ctw/r50_baseline.yaml') + parser.add_argument("--image_path", type=str, default='./det_visual/1223.jpg') + parser.add_argument("--visu_path", type=str, default='./demo/example_results.jpg') + args = parser.parse_args() + main(args) diff --git a/tools/test_net.py b/tools/test_net.py new file mode 100644 index 0000000000000000000000000000000000000000..e96ab47825679ab40718dc390a1688a91a5eb629 --- /dev/null +++ b/tools/test_net.py @@ -0,0 +1,95 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 
+# Set up custom environment before nearly anything else is imported +# NOTE: this should be the first import (no not reorder) +from maskrcnn_benchmark.utils.env import setup_environment # noqa F401 isort:skip + +import argparse +import os + +import torch +from maskrcnn_benchmark.config import cfg +from maskrcnn_benchmark.data import make_data_loader +from maskrcnn_benchmark.engine.inference import inference +from maskrcnn_benchmark.modeling.detector import build_detection_model +from maskrcnn_benchmark.utils.checkpoint import DetectronCheckpointer +from maskrcnn_benchmark.utils.collect_env import collect_env_info +from maskrcnn_benchmark.utils.comm import synchronize, get_rank +from maskrcnn_benchmark.utils.logger import setup_logger +from maskrcnn_benchmark.utils.miscellaneous import mkdir + + +def main(): + parser = argparse.ArgumentParser(description="PyTorch Object Detection Inference") + parser.add_argument( + "--config-file", + default="/private/home/fmassa/github/detectron.pytorch_v2/configs/e2e_faster_rcnn_R_50_C4_1x_caffe2.yaml", + metavar="FILE", + help="path to config file", + ) + parser.add_argument("--local_rank", type=int, default=0) + parser.add_argument( + "opts", + help="Modify config options using the command-line", + default=None, + nargs=argparse.REMAINDER, + ) + + args = parser.parse_args() + + num_gpus = int(os.environ["WORLD_SIZE"]) if "WORLD_SIZE" in os.environ else 1 + distributed = num_gpus > 1 + + if distributed: + torch.cuda.set_device(args.local_rank) + torch.distributed.init_process_group( + backend="nccl", init_method="env://" + ) + synchronize() + + cfg.merge_from_file(args.config_file) + cfg.merge_from_list(args.opts) + cfg.freeze() + + save_dir = "" + logger = setup_logger("maskrcnn_benchmark", save_dir, get_rank()) + logger.info("Using {} GPUs".format(num_gpus)) + logger.info(cfg) + + logger.info("Collecting env info (might take some time)") + logger.info("\n" + collect_env_info()) + + model = build_detection_model(cfg) + model.to(cfg.MODEL.DEVICE) + + output_dir = cfg.OUTPUT_DIR + checkpointer = DetectronCheckpointer(cfg, model, save_dir=output_dir) + _ = checkpointer.load(cfg.MODEL.WEIGHT) + + iou_types = ("bbox",) + if cfg.MODEL.BOUNDARY_ON: + iou_types = iou_types + ("bo",) + output_folders = [None] * len(cfg.DATASETS.TEST) + dataset_names = cfg.DATASETS.TEST + if cfg.OUTPUT_DIR: + for idx, dataset_name in enumerate(dataset_names): + output_folder = os.path.join(cfg.OUTPUT_DIR, "inference", dataset_name) + mkdir(output_folder) + output_folders[idx] = output_folder + data_loaders_val = make_data_loader(cfg, is_train=False, is_distributed=distributed) + for output_folder, dataset_name, data_loader_val in zip(output_folders, dataset_names, data_loaders_val): + inference( + model, + data_loader_val, + dataset_name=dataset_name, + iou_types=iou_types, + box_only=False if cfg.MODEL.FCOS_ON or cfg.MODEL.RETINANET_ON else cfg.MODEL.RPN_ONLY, + device=cfg.MODEL.DEVICE, + expected_results=cfg.TEST.EXPECTED_RESULTS, + expected_results_sigma_tol=cfg.TEST.EXPECTED_RESULTS_SIGMA_TOL, + output_folder=output_folder, + ) + synchronize() + + +if __name__ == "__main__": + main() diff --git a/tools/train_net.py b/tools/train_net.py new file mode 100644 index 0000000000000000000000000000000000000000..9b62001757b1d18b8cb74581af27dbded97be731 --- /dev/null +++ b/tools/train_net.py @@ -0,0 +1,174 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 
+r""" +Basic training script for PyTorch +""" + +# Set up custom environment before nearly anything else is imported +# NOTE: this should be the first import (no not reorder) +from maskrcnn_benchmark.utils.env import setup_environment # noqa F401 isort:skip + +import argparse +import os + +import torch +from maskrcnn_benchmark.config import cfg +from maskrcnn_benchmark.data import make_data_loader +from maskrcnn_benchmark.solver import make_lr_scheduler +from maskrcnn_benchmark.solver import make_optimizer +from maskrcnn_benchmark.engine.inference import inference +from maskrcnn_benchmark.engine.trainer import do_train +from maskrcnn_benchmark.modeling.detector import build_detection_model +from maskrcnn_benchmark.utils.checkpoint import DetectronCheckpointer +from maskrcnn_benchmark.utils.collect_env import collect_env_info +from maskrcnn_benchmark.utils.comm import synchronize, get_rank +from maskrcnn_benchmark.utils.imports import import_file +from maskrcnn_benchmark.utils.logger import setup_logger +from maskrcnn_benchmark.utils.miscellaneous import mkdir + + +def train(cfg, local_rank, distributed): + model = build_detection_model(cfg) + device = torch.device(cfg.MODEL.DEVICE) + model.to(device) + + optimizer = make_optimizer(cfg, model) + scheduler = make_lr_scheduler(cfg, optimizer) + + if distributed: + model = torch.nn.parallel.DistributedDataParallel( + model, device_ids=[local_rank], output_device=local_rank, + # this should be removed if we update BatchNorm stats + broadcast_buffers=False, + ) + + arguments = {} + arguments["iteration"] = 0 + + output_dir = cfg.OUTPUT_DIR + + save_to_disk = get_rank() == 0 + checkpointer = DetectronCheckpointer( + cfg, model, optimizer, scheduler, output_dir, save_to_disk + ) + extra_checkpoint_data = checkpointer.load(cfg.MODEL.WEIGHT) + arguments.update(extra_checkpoint_data) + + data_loader = make_data_loader( + cfg, + is_train=True, + is_distributed=distributed, + start_iter=arguments["iteration"], + ) + + checkpoint_period = cfg.SOLVER.CHECKPOINT_PERIOD + + do_train( + model, + data_loader, + optimizer, + scheduler, + checkpointer, + device, + checkpoint_period, + arguments, + ) + + return model + + +def run_test(cfg, model, distributed): + if distributed: + model = model.module + torch.cuda.empty_cache() # TODO check if it helps + iou_types = ("bbox",) + if cfg.MODEL.MASK_ON: + iou_types = iou_types + ("segm",) + if cfg.MODEL.KEYPOINT_ON: + iou_types = iou_types + ("keypoints",) + output_folders = [None] * len(cfg.DATASETS.TEST) + dataset_names = cfg.DATASETS.TEST + if cfg.OUTPUT_DIR: + for idx, dataset_name in enumerate(dataset_names): + output_folder = os.path.join(cfg.OUTPUT_DIR, "inference", dataset_name) + mkdir(output_folder) + output_folders[idx] = output_folder + data_loaders_val = make_data_loader(cfg, is_train=False, is_distributed=distributed) + for output_folder, dataset_name, data_loader_val in zip(output_folders, dataset_names, data_loaders_val): + inference( + model, + data_loader_val, + dataset_name=dataset_name, + iou_types=iou_types, + box_only=False if cfg.MODEL.FCOS_ON or cfg.MODEL.RETINANET_ON else cfg.MODEL.RPN_ONLY, + device=cfg.MODEL.DEVICE, + expected_results=cfg.TEST.EXPECTED_RESULTS, + expected_results_sigma_tol=cfg.TEST.EXPECTED_RESULTS_SIGMA_TOL, + output_folder=output_folder, + ) + synchronize() + + +def main(): + parser = argparse.ArgumentParser(description="PyTorch Object Detection Training") + parser.add_argument( + "--config-file", + default="", + metavar="FILE", + help="path to config file", + type=str, 
+ ) + parser.add_argument("--local_rank", type=int, default=0) + parser.add_argument( + "--skip-test", + dest="skip_test", + help="Do not test the final model", + action="store_true", + ) + parser.add_argument( + "opts", + help="Modify config options using the command-line", + default=None, + nargs=argparse.REMAINDER, + ) + + args = parser.parse_args() + + num_gpus = int(os.environ["WORLD_SIZE"]) if "WORLD_SIZE" in os.environ else 1 + args.distributed = num_gpus > 1 + + if args.distributed: + torch.cuda.set_device(args.local_rank) + torch.distributed.init_process_group( + backend="nccl", init_method="env://" + ) + synchronize() + + cfg.merge_from_file(args.config_file) + cfg.merge_from_list(args.opts) + cfg.freeze() + + output_dir = cfg.OUTPUT_DIR + if output_dir: + mkdir(output_dir) + + logger = setup_logger("maskrcnn_benchmark", output_dir, get_rank()) + logger.info("Using {} GPUs".format(num_gpus)) + logger.info(args) + + logger.info("Collecting env info (might take some time)") + logger.info("\n" + collect_env_info()) + + logger.info("Loaded configuration file {}".format(args.config_file)) + with open(args.config_file, "r") as cf: + config_str = "\n" + cf.read() + logger.info(config_str) + logger.info("Running with config:\n{}".format(cfg)) + + model = train(cfg, args.local_rank, args.distributed) + + if not args.skip_test: + run_test(cfg, model, args.distributed) + + +if __name__ == "__main__": + main() diff --git a/train_contour.sh b/train_contour.sh new file mode 100644 index 0000000000000000000000000000000000000000..c96e18b37e8ed6ddc38c1df123f9ed436f554278 --- /dev/null +++ b/train_contour.sh @@ -0,0 +1,6 @@ +# export NCCL_P2P_DISABLE=1 +export NGPUS=1 +CUDA_VISIBLE_DEVICES=1 python -m torch.distributed.launch --nproc_per_node=$NGPUS tools/train_net.py \ + --config-file "configs/ic/r50_baseline.yaml" \ + --skip-test +
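Usage note: a minimal single-image inference sketch, assuming a trained checkpoint is available (either as the latest checkpoint under OUTPUT_DIR or referenced by MODEL.WEIGHT in the config). tools/demo.py loads the config, runs detection on one image, and writes the visualized polygons to the path given by --visu_path; the sample image and output path below are the ones bundled under demo/:

python tools/demo.py \
    --config-file configs/ctw/r50_baseline.yaml \
    --image_path demo/1.jpg \
    --visu_path demo/example_results.jpg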