Qihang Yu committed on
Commit a06fad0
1 Parent(s): f6d10ab

Add kMaX-DeepLab
Files changed (40)
  1. app.py +71 -4
  2. configs/coco/panoptic-segmentation/kmax_convnext_base.yaml +13 -0
  3. configs/coco/panoptic-segmentation/kmax_convnext_large.yaml +13 -0
  4. configs/coco/panoptic-segmentation/kmax_convnext_small.yaml +13 -0
  5. configs/coco/panoptic-segmentation/kmax_convnext_tiny.yaml +13 -0
  6. configs/coco/panoptic-segmentation/kmax_r50.yaml +91 -0
  7. convert-pretrained-model-to-d2.py +36 -0
  8. convert-tf-weights-to-d2.py +400 -0
  9. demo/demo.ipynb +213 -0
  10. demo/demo.py +156 -0
  11. demo/predictor.py +166 -0
  12. docs/clustering_view_of_mask_transformer.png +0 -0
  13. docs/kmax_decoder.png +0 -0
  14. kmax_deeplab/__init__.py +15 -0
  15. kmax_deeplab/config.py +96 -0
  16. kmax_deeplab/data/__init__.py +1 -0
  17. kmax_deeplab/data/dataset_mappers/__init__.py +0 -0
  18. kmax_deeplab/data/dataset_mappers/coco_panoptic_kmaxdeeplab_dataset_mapper.py +326 -0
  19. kmax_deeplab/data/datasets/__init__.py +3 -0
  20. kmax_deeplab/data/datasets/register_coco_panoptic_annos_semseg.py +182 -0
  21. kmax_deeplab/evaluation/__init__.py +0 -0
  22. kmax_deeplab/evaluation/instance_evaluation.py +107 -0
  23. kmax_deeplab/evaluation/panoptic_evaluation.py +269 -0
  24. kmax_deeplab/kmax_model.py +446 -0
  25. kmax_deeplab/modeling/__init__.py +4 -0
  26. kmax_deeplab/modeling/backbone/__init__.py +0 -0
  27. kmax_deeplab/modeling/backbone/convnext.py +210 -0
  28. kmax_deeplab/modeling/backbone/resnet.py +697 -0
  29. kmax_deeplab/modeling/criterion.py +432 -0
  30. kmax_deeplab/modeling/matcher.py +128 -0
  31. kmax_deeplab/modeling/meta_arch/__init__.py +0 -0
  32. kmax_deeplab/modeling/meta_arch/kmax_deeplab_head.py +88 -0
  33. kmax_deeplab/modeling/pixel_decoder/__init__.py +0 -0
  34. kmax_deeplab/modeling/pixel_decoder/kmax_pixel_decoder.py +370 -0
  35. kmax_deeplab/modeling/transformer_decoder/__init__.py +1 -0
  36. kmax_deeplab/modeling/transformer_decoder/kmax_transformer_decoder.py +453 -0
  37. pakages.txt +4 -0
  38. requirements.txt +34 -0
  39. train_net.py +266 -0
  40. train_net_utils.py +225 -0
app.py CHANGED
@@ -1,7 +1,74 @@
- import gradio as gr
-
- def greet(name):
-     return "Hello " + name + "!!"
-
- iface = gr.Interface(fn=greet, inputs="text", outputs="text")
- iface.launch()
+ import os
+ import sys
+
+ os.system("pip install gdown")
+
+ os.system("pip install imutils")
+
+ os.system('pip install detectron2 -f https://dl.fbaipublicfiles.com/detectron2/wheels/cu102/torch1.9/index.html')
+
+ os.system("pip install git+https://github.com/cocodataset/panopticapi.git")
+
+ import gradio as gr
+ # check pytorch installation:
+ import detectron2
+ from detectron2.utils.logger import setup_logger
+
+ # import some common libraries
+ import numpy as np
+ import cv2
+ import torch
+
+ # import some common detectron2 utilities
+ from detectron2 import model_zoo
+ from detectron2.engine import DefaultPredictor
+ from detectron2.config import get_cfg
+ from detectron2.utils.visualizer import Visualizer, ColorMode
+ from detectron2.data import MetadataCatalog
+ from detectron2.projects.deeplab import add_deeplab_config
+ coco_metadata = MetadataCatalog.get("coco_2017_val_panoptic")
+
+ # import kMaXDeepLab project
+ from kmax_deeplab import add_kmax_deeplab_config
+
+ from PIL import Image
+ import imutils
+
+ cfg = get_cfg()
+ cfg.MODEL.DEVICE='cpu'
+ add_deeplab_config(cfg)
+ add_kmax_deeplab_config(cfg)
+ cfg.merge_from_file("configs/coco/panoptic-segmentation/kmax_convnext_large.yaml")
+ os.system("gdown 1b6rEnKw4PNTdqSdWpmb0P9dsvN0pkOiN")
+ cfg.MODEL.WEIGHTS = './kmax_convnext_large.pth'
+ cfg.MODEL.KMAX_DEEPLAB.TEST.SEMANTIC_ON = True
+ cfg.MODEL.KMAX_DEEPLAB.TEST.INSTANCE_ON = True
+ cfg.MODEL.KMAX_DEEPLAB.TEST.PANOPTIC_ON = True
+ predictor = DefaultPredictor(cfg)
+
+ os.system("wget https://i.imgur.com/Vj17K5z.jpg")
+
+ def inference(img):
+     im = cv2.imread(img)
+     im = imutils.resize(im, width=512)
+     outputs = predictor(im)
+     v = Visualizer(im[:, :, ::-1], coco_metadata, scale=1.2, instance_mode=ColorMode.IMAGE_BW)
+     panoptic_result = v.draw_panoptic_seg(outputs["panoptic_seg"][0].to("cpu"), outputs["panoptic_seg"][1]).get_image()
+     v = Visualizer(im[:, :, ::-1], coco_metadata, scale=1.2, instance_mode=ColorMode.IMAGE_BW)
+     instance_result = v.draw_instance_predictions(outputs["instances"].to("cpu")).get_image()
+     v = Visualizer(im[:, :, ::-1], coco_metadata, scale=1.2, instance_mode=ColorMode.IMAGE_BW)
+     semantic_result = v.draw_sem_seg(outputs["sem_seg"].argmax(0).to("cpu")).get_image()
+     return Image.fromarray(np.uint8(panoptic_result)).convert('RGB'),Image.fromarray(np.uint8(instance_result)).convert('RGB'),Image.fromarray(np.uint8(semantic_result)).convert('RGB')
+
+
+ title = "kMaX-DeepLab"
+ description = "Gradio demo for kMaX-DeepLab. To use it, simply upload your image, or click one of the examples to load them. Read more at the links below."
+
+ article = "<p style='text-align: center'><a href='https://arxiv.org/abs/2112.01527' target='_blank'>kMaX-DeepLab</a> | <a href='https://github.com/google-research/deeplab2' target='_blank'>Github Repo</a></p>"
+
+ examples = [['Vj17K5z.jpg']]
+
+ gr.Interface(inference, inputs=gr.inputs.Image(type="filepath"), outputs=[gr.outputs.Image(label="Panoptic segmentation",type="pil"),gr.outputs.Image(label="instance segmentation",type="pil"),gr.outputs.Image(label="semantic segmentation",type="pil")], title=title,
+              description=description,
+              article=article,
+              examples=examples).launch(enable_queue=True,cache_examples=True)
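
The Gradio interface above wires `inference` into a web UI, but the same function can be exercised directly once app.py has run (so `predictor`, `inference`, and the downloaded `Vj17K5z.jpg` exist). A minimal sketch, not part of the commit:

    # Hypothetical smoke test of the demo's inference() helper (assumes app.py was executed first).
    panoptic_img, instance_img, semantic_img = inference("Vj17K5z.jpg")
    panoptic_img.save("panoptic.png")    # each return value is a PIL RGB image
    instance_img.save("instance.png")
    semantic_img.save("semantic.png")
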
configs/coco/panoptic-segmentation/kmax_convnext_base.yaml ADDED
@@ -0,0 +1,13 @@
+ _BASE_: kmax_r50.yaml
+ MODEL:
+   # backbone part.
+   BACKBONE:
+     NAME: "D2ConvNeXt"
+   WEIGHTS: "./convnext_base_22k_1k_384.pkl"
+   CONVNEXT:
+     IN_CHANNELS: 3
+     DEPTHS: [3, 3, 27, 3]
+     DIMS: [128, 256, 512, 1024]
+     # https://github.com/google-research/deeplab2/blob/main/configs/coco/kmax_deeplab/kmax_meta_convnext_base_os32.textproto#L28
+     DROP_PATH_RATE: 0.5
+     OUT_INDICES: [0, 1, 2, 3]
configs/coco/panoptic-segmentation/kmax_convnext_large.yaml ADDED
@@ -0,0 +1,13 @@
+ _BASE_: kmax_r50.yaml
+ MODEL:
+   # backbone part.
+   BACKBONE:
+     NAME: "D2ConvNeXt"
+   WEIGHTS: "./convnext_large_22k_1k_384.pkl"
+   CONVNEXT:
+     IN_CHANNELS: 3
+     DEPTHS: [3, 3, 27, 3]
+     DIMS: [192, 384, 768, 1536]
+     # https://github.com/google-research/deeplab2/blob/main/configs/coco/kmax_deeplab/kmax_meta_convnext_large_os32.textproto#L28
+     DROP_PATH_RATE: 0.6
+     OUT_INDICES: [0, 1, 2, 3]
configs/coco/panoptic-segmentation/kmax_convnext_small.yaml ADDED
@@ -0,0 +1,13 @@
+ _BASE_: kmax_r50.yaml
+ MODEL:
+   # backbone part.
+   BACKBONE:
+     NAME: "D2ConvNeXt"
+   WEIGHTS: "./convnext_small_22k_1k_384.pkl"
+   CONVNEXT:
+     IN_CHANNELS: 3
+     DEPTHS: [3, 3, 27, 3]
+     DIMS: [96, 192, 384, 768]
+     # https://github.com/google-research/deeplab2/blob/main/configs/coco/kmax_deeplab/kmax_meta_convnext_small_os32.textproto#L28
+     DROP_PATH_RATE: 0.4
+     OUT_INDICES: [0, 1, 2, 3]
configs/coco/panoptic-segmentation/kmax_convnext_tiny.yaml ADDED
@@ -0,0 +1,13 @@
+ _BASE_: kmax_r50.yaml
+ MODEL:
+   # backbone part.
+   BACKBONE:
+     NAME: "D2ConvNeXt"
+   WEIGHTS: "./convnext_tiny_22k_1k_384.pkl"
+   CONVNEXT:
+     IN_CHANNELS: 3
+     DEPTHS: [3, 3, 9, 3]
+     DIMS: [96, 192, 384, 768]
+     # https://github.com/google-research/deeplab2/blob/main/configs/coco/kmax_deeplab/kmax_meta_convnext_tiny_os32.textproto#L28
+     DROP_PATH_RATE: 0.3
+     OUT_INDICES: [0, 1, 2, 3]
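
The four ConvNeXt configs above differ only in the backbone fields they override on top of kmax_r50.yaml. For reference, a small Python summary with values copied from the files (not part of the commit):

    # ConvNeXt backbone variants used by the configs above.
    CONVNEXT_VARIANTS = {
        "tiny":  dict(depths=[3, 3, 9, 3],  dims=[96, 192, 384, 768],   drop_path_rate=0.3),
        "small": dict(depths=[3, 3, 27, 3], dims=[96, 192, 384, 768],   drop_path_rate=0.4),
        "base":  dict(depths=[3, 3, 27, 3], dims=[128, 256, 512, 1024], drop_path_rate=0.5),
        "large": dict(depths=[3, 3, 27, 3], dims=[192, 384, 768, 1536], drop_path_rate=0.6),
    }
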
configs/coco/panoptic-segmentation/kmax_r50.yaml ADDED
@@ -0,0 +1,91 @@
+ MODEL:
+   # backbone part.
+   BACKBONE:
+     FREEZE_AT: 0
+     NAME: "custom_bn_build_resnet_backbone" # we customize the momentum and eps in syncbn, to align with tf implementation.
+   WEIGHTS: "../R-50.pkl"
+   PIXEL_MEAN: [127.5, 127.5, 127.5]
+   PIXEL_STD: [127.5, 127.5, 127.5]
+   RESNETS:
+     DEPTH: 50
+     STEM_TYPE: "basic" # not used
+     STEM_OUT_CHANNELS: 64
+     STRIDE_IN_1X1: False
+     OUT_FEATURES: ["res2", "res3", "res4", "res5"]
+     NORM: "SyncBN"
+     RES5_MULTI_GRID: [1, 1, 1] # not used
+
+   # kmax part.
+   META_ARCHITECTURE: "kMaXDeepLab"
+   SEM_SEG_HEAD:
+     NAME: "kMaXDeepLabHead"
+     IGNORE_VALUE: 255
+     NUM_CLASSES: 133
+     LOSS_WEIGHT: 1.0
+
+   KMAX_DEEPLAB:
+     SAVE_VIS_NUM: 0
+     SHARE_FINAL_MATCHING: True
+     DEEP_SUPERVISION: True
+     NO_OBJECT_WEIGHT: 1e-5
+     CLASS_WEIGHT: 3.0
+     DICE_WEIGHT: 3.0
+     MASK_WEIGHT: 0.3
+     INSDIS_WEIGHT: 1.0
+     AUX_SEMANTIC_WEIGHT: 1.0
+
+     PIXEL_DEC:
+       NAME: "kMaXPixelDecoder"
+       IN_FEATURES: ["res2", "res3", "res4", "res5"]
+       DEC_LAYERS: [1, 5, 1, 1]
+       LAYER_TYPES: ["axial", "axial", "bottleneck", "bottleneck"]
+       DEC_CHANNELS: [512, 256, 128, 64]
+
+     TRANS_DEC:
+       NAME: "kMaXTransformerDecoder"
+       DEC_LAYERS: [2, 2, 2]
+       NUM_OBJECT_QUERIES: 128
+       IN_CHANNELS: [2048, 1024, 512] # [512 * 4, 256 * 4, 128 * 4]
+       DROP_PATH_PROB: 0.2
+
+     TEST:
+       SEMANTIC_ON: False
+       INSTANCE_ON: False # Save some time :)
+       PANOPTIC_ON: True
+       OBJECT_MASK_THRESHOLD: 0.4
+       CLASS_THRESHOLD_THING: 0.7
+       CLASS_THRESHOLD_STUFF: 0.5
+       REORDER_CLASS_WEIGHT: 1.0
+       REORDER_MASK_WEIGHT: 1.0
+       OVERLAP_THRESHOLD: 0.8
+
+ DATASETS:
+   TRAIN: ("coco_2017_train_panoptic",)
+   TEST: ("coco_2017_val_panoptic",)
+ SOLVER:
+   IMS_PER_BATCH: 64
+   BASE_LR: 0.0005
+   LR_SCHEDULER_NAME: "TF2WarmupPolyLR"
+   MAX_ITER: 150000
+   WARMUP_ITERS: 5000
+   WEIGHT_DECAY: 0.05
+   OPTIMIZER: "ADAMW"
+   BACKBONE_MULTIPLIER: 0.1
+   CLIP_GRADIENTS:
+     ENABLED: False
+   AMP:
+     ENABLED: True
+ INPUT:
+   IMAGE_SIZE: [1281, 1281]
+   MIN_SCALE: 0.2
+   MAX_SCALE: 2.0
+   FORMAT: "RGB"
+   DATASET_MAPPER_NAME: "coco_panoptic_lsj"
+   MIN_SIZE_TEST: 1281
+   MAX_SIZE_TEST: 1281
+ TEST:
+   EVAL_PERIOD: 5000
+ DATALOADER:
+   FILTER_EMPTY_ANNOTATIONS: True
+   NUM_WORKERS: 4
+ VERSION: 2
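
kmax_r50.yaml is the base config that the ConvNeXt variants inherit from via `_BASE_`. A minimal sketch of loading it in Python, mirroring what app.py and demo/demo.py in this commit do (the checkpoint path is a placeholder):

    from detectron2.config import get_cfg
    from detectron2.projects.deeplab import add_deeplab_config
    from kmax_deeplab import add_kmax_deeplab_config

    cfg = get_cfg()
    add_deeplab_config(cfg)       # register DeepLab keys used by the base config
    add_kmax_deeplab_config(cfg)  # register MODEL.KMAX_DEEPLAB / MODEL.CONVNEXT keys
    cfg.merge_from_file("configs/coco/panoptic-segmentation/kmax_r50.yaml")
    cfg.MODEL.WEIGHTS = "/path/to/kmax_r50.pth"  # placeholder checkpoint path
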
convert-pretrained-model-to-d2.py ADDED
@@ -0,0 +1,36 @@
+ #!/usr/bin/env python
+ # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+
+ import pickle as pkl
+ import sys
+
+ import torch
+
+ """
+ Usage:
+   # download pretrained swin model:
+   wget https://github.com/SwinTransformer/storage/releases/download/v1.0.0/swin_tiny_patch4_window7_224.pth
+   # run the conversion
+   ./convert-pretrained-model-to-d2.py swin_tiny_patch4_window7_224.pth swin_tiny_patch4_window7_224.pkl
+   # Then, use swin_tiny_patch4_window7_224.pkl with the following changes in config:
+ MODEL:
+   WEIGHTS: "/path/to/swin_tiny_patch4_window7_224.pkl"
+ INPUT:
+   FORMAT: "RGB"
+ """
+
+ if __name__ == "__main__":
+     input = sys.argv[1]
+
+     obj = torch.load(input, map_location="cpu")["model"]
+
+     # Clean unused convnext weight
+     if "norm.weight" in obj:
+         del obj["norm.weight"]
+     if "norm.bias" in obj:
+         del obj["norm.bias"]
+
+     res = {"model": obj, "__author__": "third_party", "matching_heuristics": True}
+
+     with open(sys.argv[2], "wb") as f:
+         pkl.dump(res, f)
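
The ConvNeXt configs above reference local files such as ./convnext_large_22k_1k_384.pkl, which this script would produce from an ImageNet-pretrained checkpoint, analogous to the Swin example in its docstring. A hedged sketch in the style of app.py (the download URL is an assumption based on the official ConvNeXt release naming; verify it before use):

    import os

    # Download an ImageNet-22k pre-trained ConvNeXt-Large checkpoint (URL is an assumption).
    os.system("wget https://dl.fbaipublicfiles.com/convnext/convnext_large_22k_1k_384.pth")
    # Convert to the detectron2 pickle referenced by kmax_convnext_large.yaml
    # (WEIGHTS: "./convnext_large_22k_1k_384.pkl").
    os.system("python convert-pretrained-model-to-d2.py convnext_large_22k_1k_384.pth convnext_large_22k_1k_384.pkl")
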
convert-tf-weights-to-d2.py ADDED
@@ -0,0 +1,400 @@
1
+ import tensorflow as tf
2
+ import pickle as pkl
3
+ import sys
4
+
5
+ import torch
6
+ import numpy as np
7
+
8
+ def load_tf_weights(ckpt_path):
9
+ # https://stackoverflow.com/questions/40118062/how-to-read-weights-saved-in-tensorflow-checkpoint-file
10
+ from tensorflow.python.training import py_checkpoint_reader
11
+ reader = py_checkpoint_reader.NewCheckpointReader(ckpt_path)
12
+ state_dict = {}
13
+ for k in reader.get_variable_to_shape_map():
14
+ if '.OPTIMIZER_SLOT' in k or 'optimizer' in k or '_CHECKPOINTABLE_OBJECT_GRAPH' in k or 'save_counter' in k or 'global_step' in k:
15
+ continue
16
+ v = reader.get_tensor(k)
17
+ state_dict[k.replace('/.ATTRIBUTES/VARIABLE_VALUE', '')] = v
18
+ for k in sorted(state_dict.keys()):
19
+ print(k, state_dict[k].shape)
20
+ return state_dict
21
+
22
+ def map_bn(name1, name2):
23
+ res = {}
24
+ res[name1 + '/gamma'] = name2 + ".weight"
25
+ res[name1 + '/beta'] = name2 + ".bias"
26
+ res[name1 + '/moving_mean'] = name2 + ".running_mean"
27
+ res[name1 + '/moving_variance'] = name2 + ".running_var"
28
+ return res
29
+
30
+
31
+ def map_conv(name1, name2, dw=False, bias=False):
32
+ res = {}
33
+ if dw:
34
+ res[name1 + '/depthwise_kernel'] = name2 + ".weight"
35
+ else:
36
+ res[name1 + '/kernel'] = name2 + ".weight"
37
+ if bias:
38
+ res[name1 + '/bias'] = name2 + ".bias"
39
+ return res
40
+
41
+
42
+ def tf_2_torch_mapping_r50():
43
+ res = {}
44
+ res.update(map_conv('encoder/_stem/_conv', 'backbone.stem.conv1'))
45
+ res.update(map_bn('encoder/_stem/_batch_norm', 'backbone.stem.conv1.norm'))
46
+ block_num = {2: 3, 3: 4, 4: 6, 5: 3}
47
+ for stage_idx in range(2, 6):
48
+ for block_idx in range(1, block_num[stage_idx] + 1):
49
+ res.update(map_conv(f'encoder/_stage{stage_idx}/_block{block_idx}/_conv1_bn_act/_conv',
50
+ f'backbone.res{stage_idx}.{block_idx-1}.conv1'))
51
+ res.update(map_bn(f'encoder/_stage{stage_idx}/_block{block_idx}/_conv1_bn_act/_batch_norm',
52
+ f'backbone.res{stage_idx}.{block_idx-1}.conv1.norm'))
53
+ res.update(map_conv(f'encoder/_stage{stage_idx}/_block{block_idx}/_conv2_bn_act/_conv',
54
+ f'backbone.res{stage_idx}.{block_idx-1}.conv2'))
55
+ res.update(map_bn(f'encoder/_stage{stage_idx}/_block{block_idx}/_conv2_bn_act/_batch_norm',
56
+ f'backbone.res{stage_idx}.{block_idx-1}.conv2.norm'))
57
+ res.update(map_conv(f'encoder/_stage{stage_idx}/_block{block_idx}/_conv3_bn/_conv',
58
+ f'backbone.res{stage_idx}.{block_idx-1}.conv3'))
59
+ res.update(map_bn(f'encoder/_stage{stage_idx}/_block{block_idx}/_conv3_bn/_batch_norm',
60
+ f'backbone.res{stage_idx}.{block_idx-1}.conv3.norm'))
61
+ res.update(map_conv(f'encoder/_stage{stage_idx}/_block{block_idx}/_shortcut/_conv',
62
+ f'backbone.res{stage_idx}.{block_idx-1}.shortcut'))
63
+ res.update(map_bn(f'encoder/_stage{stage_idx}/_block{block_idx}/_shortcut/_batch_norm',
64
+ f'backbone.res{stage_idx}.{block_idx-1}.shortcut.norm'))
65
+ return res
66
+
67
+ def tf_2_torch_mapping_convnext():
68
+ res = {}
69
+ for i in range(4):
70
+ if i == 0:
71
+ res.update(map_conv(f'encoder/downsample_layers/{i}/layer_with_weights-0',
72
+ f'backbone.downsample_layers.{i}.0', bias=True))
73
+ res.update(map_bn(f'encoder/downsample_layers/{i}/layer_with_weights-1',
74
+ f'backbone.downsample_layers.{i}.1'))
75
+ else:
76
+ res.update(map_conv(f'encoder/downsample_layers/{i}/layer_with_weights-1',
77
+ f'backbone.downsample_layers.{i}.1', bias=True))
78
+ res.update(map_bn(f'encoder/downsample_layers/{i}/layer_with_weights-0',
79
+ f'backbone.downsample_layers.{i}.0'))
80
+
81
+ block_num = {0: 3, 1: 3, 2: 27, 3: 3}
82
+ for stage_idx in range(4):
83
+ for block_idx in range(block_num[stage_idx]):
84
+ res.update(map_conv(f'encoder/stages/{stage_idx}/layer_with_weights-{block_idx}/depthwise_conv',
85
+ f'backbone.stages.{stage_idx}.{block_idx}.dwconv', bias=True))
86
+ res.update(map_bn(f'encoder/stages/{stage_idx}/layer_with_weights-{block_idx}/norm',
87
+ f'backbone.stages.{stage_idx}.{block_idx}.norm'))
88
+ res.update(map_conv(f'encoder/stages/{stage_idx}/layer_with_weights-{block_idx}/pointwise_conv1',
89
+ f'backbone.stages.{stage_idx}.{block_idx}.pwconv1', bias=True))
90
+ res.update(map_conv(f'encoder/stages/{stage_idx}/layer_with_weights-{block_idx}/pointwise_conv2',
91
+ f'backbone.stages.{stage_idx}.{block_idx}.pwconv2', bias=True))
92
+ res[f'encoder/stages/{stage_idx}/layer_with_weights-{block_idx}/layer_scale'] = f'backbone.stages.{stage_idx}.{block_idx}.gamma'
93
+
94
+ return res
95
+
96
+ def tf_2_torch_mapping_pixel_dec():
97
+ res = {}
98
+ for i in range(4):
99
+ res.update(map_bn(f'pixel_decoder/_backbone_norms/{i}', f'sem_seg_head.pixel_decoder._in_norms.{i}'))
100
+ res.update(map_bn(f'pixel_decoder/_backbone_norms/{i}', f'sem_seg_head.pixel_decoder._in_norms.{i}'))
101
+ res.update(map_bn(f'pixel_decoder/_backbone_norms/{i}', f'sem_seg_head.pixel_decoder._in_norms.{i}'))
102
+ res.update(map_bn(f'pixel_decoder/_backbone_norms/{i}', f'sem_seg_head.pixel_decoder._in_norms.{i}'))
103
+
104
+ for i in range(3):
105
+ res.update(map_conv(f'pixel_decoder/_skip_connections/{i}/_resized_conv_bn1/_conv',
106
+ f'sem_seg_head.pixel_decoder._resized_fuses.{i}._conv_bn_low.conv'))
107
+ res.update(map_bn(f'pixel_decoder/_skip_connections/{i}/_resized_conv_bn1/_batch_norm',
108
+ f'sem_seg_head.pixel_decoder._resized_fuses.{i}._conv_bn_low.norm'))
109
+ res.update(map_conv(f'pixel_decoder/_skip_connections/{i}/_resized_conv_bn2/_conv',
110
+ f'sem_seg_head.pixel_decoder._resized_fuses.{i}._conv_bn_high.conv'))
111
+ res.update(map_bn(f'pixel_decoder/_skip_connections/{i}/_resized_conv_bn2/_batch_norm',
112
+ f'sem_seg_head.pixel_decoder._resized_fuses.{i}._conv_bn_high.norm'))
113
+
114
+ num_blocks = {0: 1, 1:5, 2:1, 3:1}
115
+ for stage_idx in range(4):
116
+ for block_idx in range(1, 1+num_blocks[stage_idx]):
117
+ res.update(map_conv(f'pixel_decoder/_stages/{stage_idx}/_block{block_idx}/_shortcut/_conv',
118
+ f'sem_seg_head.pixel_decoder._stages.{stage_idx}._blocks.{block_idx-1}._shortcut.conv'))
119
+ res.update(map_bn(f'pixel_decoder/_stages/{stage_idx}/_block{block_idx}/_shortcut/_batch_norm',
120
+ f'sem_seg_head.pixel_decoder._stages.{stage_idx}._blocks.{block_idx-1}._shortcut.norm'))
121
+ res.update(map_conv(f'pixel_decoder/_stages/{stage_idx}/_block{block_idx}/_conv1_bn_act/_conv',
122
+ f'sem_seg_head.pixel_decoder._stages.{stage_idx}._blocks.{block_idx-1}._conv1_bn_act.conv'))
123
+ res.update(map_bn(f'pixel_decoder/_stages/{stage_idx}/_block{block_idx}/_conv1_bn_act/_batch_norm',
124
+ f'sem_seg_head.pixel_decoder._stages.{stage_idx}._blocks.{block_idx-1}._conv1_bn_act.norm'))
125
+ res.update(map_conv(f'pixel_decoder/_stages/{stage_idx}/_block{block_idx}/_conv3_bn/_conv',
126
+ f'sem_seg_head.pixel_decoder._stages.{stage_idx}._blocks.{block_idx-1}._conv3_bn.conv'))
127
+ res.update(map_bn(f'pixel_decoder/_stages/{stage_idx}/_block{block_idx}/_conv3_bn/_batch_norm',
128
+ f'sem_seg_head.pixel_decoder._stages.{stage_idx}._blocks.{block_idx-1}._conv3_bn.norm'))
129
+ if stage_idx <= 1:
130
+ for attn in ['height', 'width']:
131
+ res.update(map_bn(f'pixel_decoder/_stages/{stage_idx}/_block{block_idx}/_attention/_{attn}_axis/_batch_norm_qkv',
132
+ f'sem_seg_head.pixel_decoder._stages.{stage_idx}._blocks.{block_idx-1}._attention._{attn}_axis._batch_norm_qkv'))
133
+ res.update(map_bn(f'pixel_decoder/_stages/{stage_idx}/_block{block_idx}/_attention/_{attn}_axis/_batch_norm_retrieved_output',
134
+ f'sem_seg_head.pixel_decoder._stages.{stage_idx}._blocks.{block_idx-1}._attention._{attn}_axis._batch_norm_retrieved_output'))
135
+ res.update(map_bn(f'pixel_decoder/_stages/{stage_idx}/_block{block_idx}/_attention/_{attn}_axis/_batch_norm_similarity',
136
+ f'sem_seg_head.pixel_decoder._stages.{stage_idx}._blocks.{block_idx-1}._attention._{attn}_axis._batch_norm_similarity'))
137
+ res[f'pixel_decoder/_stages/{stage_idx}/_block{block_idx}/_attention/_{attn}_axis/_key_rpe/embeddings'] = (
138
+ f'sem_seg_head.pixel_decoder._stages.{stage_idx}._blocks.{block_idx-1}._attention._{attn}_axis._key_rpe._embeddings.weight')
139
+ res[f'pixel_decoder/_stages/{stage_idx}/_block{block_idx}/_attention/_{attn}_axis/_query_rpe/embeddings'] = (
140
+ f'sem_seg_head.pixel_decoder._stages.{stage_idx}._blocks.{block_idx-1}._attention._{attn}_axis._query_rpe._embeddings.weight')
141
+ res[f'pixel_decoder/_stages/{stage_idx}/_block{block_idx}/_attention/_{attn}_axis/_value_rpe/embeddings'] = (
142
+ f'sem_seg_head.pixel_decoder._stages.{stage_idx}._blocks.{block_idx-1}._attention._{attn}_axis._value_rpe._embeddings.weight')
143
+ res[f'pixel_decoder/_stages/{stage_idx}/_block{block_idx}/_attention/_{attn}_axis/qkv_kernel'] = (
144
+ f'sem_seg_head.pixel_decoder._stages.{stage_idx}._blocks.{block_idx-1}._attention._{attn}_axis.qkv_transform.conv.weight')
145
+ else:
146
+ res.update(map_conv(f'pixel_decoder/_stages/{stage_idx}/_block{block_idx}/_conv2_bn_act/_conv',
147
+ f'sem_seg_head.pixel_decoder._stages.{stage_idx}._blocks.{block_idx-1}._conv2_bn_act.conv'))
148
+ res.update(map_bn(f'pixel_decoder/_stages/{stage_idx}/_block{block_idx}/_conv2_bn_act/_batch_norm',
149
+ f'sem_seg_head.pixel_decoder._stages.{stage_idx}._blocks.{block_idx-1}._conv2_bn_act.norm'))
150
+ return res
151
+
152
+
153
+ def tf_2_torch_mapping_predcitor(prefix_tf, prefix_torch):
154
+ res = {}
155
+ res.update(map_bn(prefix_tf + 'pixel_space_feature_batch_norm',
156
+ prefix_torch + '_pixel_space_head_last_convbn.norm'))
157
+ res[prefix_tf + 'pixel_space_head/conv_block/_conv1_bn_act/_depthwise/_depthwise_conv/depthwise_kernel'] = (
158
+ prefix_torch + '_pixel_space_head_conv0bnact.conv.weight'
159
+ )
160
+ res.update(map_bn(prefix_tf + 'pixel_space_head/conv_block/_conv1_bn_act/_depthwise/_batch_norm',
161
+ prefix_torch + '_pixel_space_head_conv0bnact.norm'))
162
+ res.update(map_conv(prefix_tf + 'pixel_space_head/conv_block/_conv1_bn_act/_pointwise/_conv',
163
+ prefix_torch + '_pixel_space_head_conv1bnact.conv'))
164
+ res.update(map_bn(prefix_tf + 'pixel_space_head/conv_block/_conv1_bn_act/_pointwise/_batch_norm',
165
+ prefix_torch + '_pixel_space_head_conv1bnact.norm'))
166
+ res.update(map_conv(prefix_tf + 'pixel_space_head/final_conv',
167
+ prefix_torch + '_pixel_space_head_last_convbn.conv', bias=True))
168
+ res.update(map_bn(prefix_tf + 'pixel_space_mask_batch_norm',
169
+ prefix_torch + '_pixel_space_mask_batch_norm'))
170
+ res.update(map_conv(prefix_tf + 'transformer_class_head/_conv',
171
+ prefix_torch + '_transformer_class_head.conv', bias=True))
172
+ res.update(map_conv(prefix_tf + 'transformer_mask_head/_conv',
173
+ prefix_torch + '_transformer_mask_head.conv'))
174
+ res.update(map_bn(prefix_tf + 'transformer_mask_head/_batch_norm',
175
+ prefix_torch + '_transformer_mask_head.norm'))
176
+
177
+ return res
178
+
179
+
180
+ def tf_2_torch_mapping_trans_dec():
181
+ res = {}
182
+
183
+ res.update(map_bn('transformer_decoder/_class_embedding_projection/_batch_norm',
184
+ 'sem_seg_head.predictor._class_embedding_projection.norm'))
185
+ res.update(map_conv('transformer_decoder/_class_embedding_projection/_conv',
186
+ 'sem_seg_head.predictor._class_embedding_projection.conv'))
187
+ res.update(map_bn('transformer_decoder/_mask_embedding_projection/_batch_norm',
188
+ 'sem_seg_head.predictor._mask_embedding_projection.norm'))
189
+ res.update(map_conv('transformer_decoder/_mask_embedding_projection/_conv',
190
+ 'sem_seg_head.predictor._mask_embedding_projection.conv'))
191
+
192
+ res['transformer_decoder/cluster_centers'] = 'sem_seg_head.predictor._cluster_centers.weight'
193
+
194
+ res.update(tf_2_torch_mapping_predcitor(
195
+ prefix_tf = '',
196
+ prefix_torch = 'sem_seg_head.predictor._predcitor.'
197
+ ))
198
+ for kmax_idx in range(6):
199
+ res.update(tf_2_torch_mapping_predcitor(
200
+ prefix_tf = f'transformer_decoder/_kmax_decoder/{kmax_idx}/_block1_transformer/_auxiliary_clustering_predictor/_',
201
+ prefix_torch = f'sem_seg_head.predictor._kmax_transformer_layers.{kmax_idx}._predcitor.'
202
+ ))
203
+ common_prefix_tf = f'transformer_decoder/_kmax_decoder/{kmax_idx}/_block1_transformer/'
204
+ common_prefix_torch = f'sem_seg_head.predictor._kmax_transformer_layers.{kmax_idx}.'
205
+ res.update(map_bn(common_prefix_tf + '_kmeans_memory_batch_norm_retrieved_value',
206
+ common_prefix_torch + '_kmeans_query_batch_norm_retrieved_value'))
207
+ res.update(map_bn(common_prefix_tf + '_kmeans_memory_conv3_bn/_batch_norm',
208
+ common_prefix_torch + '_kmeans_query_conv3_bn.norm'))
209
+ res.update(map_conv(common_prefix_tf + '_kmeans_memory_conv3_bn/_conv',
210
+ common_prefix_torch + '_kmeans_query_conv3_bn.conv'))
211
+ res.update(map_bn(common_prefix_tf + '_memory_attention/_batch_norm_retrieved_value',
212
+ common_prefix_torch + '_query_self_attention._batch_norm_retrieved_value'))
213
+ res.update(map_bn(common_prefix_tf + '_memory_attention/_batch_norm_similarity',
214
+ common_prefix_torch + '_query_self_attention._batch_norm_similarity'))
215
+
216
+ res.update(map_bn(common_prefix_tf + '_memory_conv1_bn_act/_batch_norm',
217
+ common_prefix_torch + '_query_conv1_bn_act.norm'))
218
+ res.update(map_conv(common_prefix_tf + '_memory_conv1_bn_act/_conv',
219
+ common_prefix_torch + '_query_conv1_bn_act.conv'))
220
+
221
+ res.update(map_bn(common_prefix_tf + '_memory_conv3_bn/_batch_norm',
222
+ common_prefix_torch + '_query_conv3_bn.norm'))
223
+ res.update(map_conv(common_prefix_tf + '_memory_conv3_bn/_conv',
224
+ common_prefix_torch + '_query_conv3_bn.conv'))
225
+
226
+ res.update(map_bn(common_prefix_tf + '_memory_ffn_conv1_bn_act/_batch_norm',
227
+ common_prefix_torch + '_query_ffn_conv1_bn_act.norm'))
228
+ res.update(map_conv(common_prefix_tf + '_memory_ffn_conv1_bn_act/_conv',
229
+ common_prefix_torch + '_query_ffn_conv1_bn_act.conv'))
230
+
231
+ res.update(map_bn(common_prefix_tf + '_memory_ffn_conv2_bn/_batch_norm',
232
+ common_prefix_torch + '_query_ffn_conv2_bn.norm'))
233
+ res.update(map_conv(common_prefix_tf + '_memory_ffn_conv2_bn/_conv',
234
+ common_prefix_torch + '_query_ffn_conv2_bn.conv'))
235
+
236
+ res.update(map_bn(common_prefix_tf + '_memory_qkv_conv_bn/_batch_norm',
237
+ common_prefix_torch + '_query_qkv_conv_bn.norm'))
238
+ res.update(map_conv(common_prefix_tf + '_memory_qkv_conv_bn/_conv',
239
+ common_prefix_torch + '_query_qkv_conv_bn.conv'))
240
+
241
+ res.update(map_bn(common_prefix_tf + '_pixel_conv1_bn_act/_batch_norm',
242
+ common_prefix_torch + '_pixel_conv1_bn_act.norm'))
243
+ res.update(map_conv(common_prefix_tf + '_pixel_conv1_bn_act/_conv',
244
+ common_prefix_torch + '_pixel_conv1_bn_act.conv'))
245
+
246
+ res.update(map_bn(common_prefix_tf + '_pixel_v_conv_bn/_batch_norm',
247
+ common_prefix_torch + '_pixel_v_conv_bn.norm'))
248
+ res.update(map_conv(common_prefix_tf + '_pixel_v_conv_bn/_conv',
249
+ common_prefix_torch + '_pixel_v_conv_bn.conv'))
250
+
251
+ return res
252
+
253
+
254
+ def tf_2_torch_mapping_aux_semanic_dec():
255
+ res = {}
256
+ res.update(map_conv('semantic_decoder/_aspp/_conv_bn_act/_conv',
257
+ 'sem_seg_head.predictor._auxiliary_semantic_predictor._aspp._aspp_conv0.conv'))
258
+ res.update(map_bn('semantic_decoder/_aspp/_conv_bn_act/_batch_norm',
259
+ 'sem_seg_head.predictor._auxiliary_semantic_predictor._aspp._aspp_conv0.norm'))
260
+
261
+ res.update(map_conv('semantic_decoder/_aspp/_aspp_pool/_conv_bn_act/_conv',
262
+ 'sem_seg_head.predictor._auxiliary_semantic_predictor._aspp._aspp_pool.conv'))
263
+ res.update(map_bn('semantic_decoder/_aspp/_aspp_pool/_conv_bn_act/_batch_norm',
264
+ 'sem_seg_head.predictor._auxiliary_semantic_predictor._aspp._aspp_pool.norm'))
265
+
266
+ res.update(map_conv('semantic_decoder/_aspp/_proj_conv_bn_act/_conv',
267
+ 'sem_seg_head.predictor._auxiliary_semantic_predictor._aspp._proj_conv_bn_act.conv'))
268
+ res.update(map_bn('semantic_decoder/_aspp/_proj_conv_bn_act/_batch_norm',
269
+ 'sem_seg_head.predictor._auxiliary_semantic_predictor._aspp._proj_conv_bn_act.norm'))
270
+ for i in range(1, 4):
271
+ res.update(map_conv(f'semantic_decoder/_aspp/_aspp_conv{i}/_conv_bn_act/_conv',
272
+ f'sem_seg_head.predictor._auxiliary_semantic_predictor._aspp._aspp_conv{i}.conv'))
273
+ res.update(map_bn(f'semantic_decoder/_aspp/_aspp_conv{i}/_conv_bn_act/_batch_norm',
274
+ f'sem_seg_head.predictor._auxiliary_semantic_predictor._aspp._aspp_conv{i}.norm'))
275
+
276
+ res.update({
277
+ 'semantic_decoder/_fusion_conv1/_conv1_bn_act/_depthwise/_depthwise_conv/depthwise_kernel':
278
+ 'sem_seg_head.predictor._auxiliary_semantic_predictor._low_level_fusion_os8_conv0_bn_act.conv.weight'})
279
+ res.update(map_bn('semantic_decoder/_fusion_conv1/_conv1_bn_act/_depthwise/_batch_norm',
280
+ 'sem_seg_head.predictor._auxiliary_semantic_predictor._low_level_fusion_os8_conv0_bn_act.norm'))
281
+ res.update({
282
+ 'semantic_decoder/_fusion_conv1/_conv1_bn_act/_pointwise/_conv/kernel':
283
+ 'sem_seg_head.predictor._auxiliary_semantic_predictor._low_level_fusion_os8_conv1_bn_act.conv.weight'})
284
+ res.update(map_bn('semantic_decoder/_fusion_conv1/_conv1_bn_act/_pointwise/_batch_norm',
285
+ 'sem_seg_head.predictor._auxiliary_semantic_predictor._low_level_fusion_os8_conv1_bn_act.norm'))
286
+
287
+ res.update({
288
+ 'semantic_decoder/_fusion_conv2/_conv1_bn_act/_depthwise/_depthwise_conv/depthwise_kernel':
289
+ 'sem_seg_head.predictor._auxiliary_semantic_predictor._low_level_fusion_os4_conv0_bn_act.conv.weight'})
290
+ res.update(map_bn('semantic_decoder/_fusion_conv2/_conv1_bn_act/_depthwise/_batch_norm',
291
+ 'sem_seg_head.predictor._auxiliary_semantic_predictor._low_level_fusion_os4_conv0_bn_act.norm'))
292
+ res.update({
293
+ 'semantic_decoder/_fusion_conv2/_conv1_bn_act/_pointwise/_conv/kernel':
294
+ 'sem_seg_head.predictor._auxiliary_semantic_predictor._low_level_fusion_os4_conv1_bn_act.conv.weight'})
295
+ res.update(map_bn('semantic_decoder/_fusion_conv2/_conv1_bn_act/_pointwise/_batch_norm',
296
+ 'sem_seg_head.predictor._auxiliary_semantic_predictor._low_level_fusion_os4_conv1_bn_act.norm'))
297
+
298
+ res.update({
299
+ 'semantic_decoder/_low_level_conv1/_conv/kernel':
300
+ 'sem_seg_head.predictor._auxiliary_semantic_predictor._low_level_projection_os8.conv.weight'})
301
+ res.update(map_bn('semantic_decoder/_low_level_conv1/_batch_norm',
302
+ 'sem_seg_head.predictor._auxiliary_semantic_predictor._low_level_projection_os8.norm'))
303
+ res.update({
304
+ 'semantic_decoder/_low_level_conv2/_conv/kernel':
305
+ 'sem_seg_head.predictor._auxiliary_semantic_predictor._low_level_projection_os4.conv.weight'})
306
+ res.update(map_bn('semantic_decoder/_low_level_conv2/_batch_norm',
307
+ 'sem_seg_head.predictor._auxiliary_semantic_predictor._low_level_projection_os4.norm'))
308
+
309
+
310
+ res.update({
311
+ 'semantic_head_without_last_layer/_conv1_bn_act/_depthwise/_depthwise_conv/depthwise_kernel':
312
+ 'sem_seg_head.predictor._auxiliary_semantic_predictor.conv_block_0.conv.weight'})
313
+ res.update(map_bn('semantic_head_without_last_layer/_conv1_bn_act/_depthwise/_batch_norm',
314
+ 'sem_seg_head.predictor._auxiliary_semantic_predictor.conv_block_0.norm'))
315
+ res.update({
316
+ 'semantic_head_without_last_layer/_conv1_bn_act/_pointwise/_conv/kernel':
317
+ 'sem_seg_head.predictor._auxiliary_semantic_predictor.conv_block_1.conv.weight'})
318
+ res.update(map_bn('semantic_head_without_last_layer/_conv1_bn_act/_pointwise/_batch_norm',
319
+ 'sem_seg_head.predictor._auxiliary_semantic_predictor.conv_block_1.norm'))
320
+
321
+ res.update({
322
+ 'semantic_last_layer/kernel':
323
+ 'sem_seg_head.predictor._auxiliary_semantic_predictor.final_conv.conv.weight'})
324
+ res.update({
325
+ 'semantic_last_layer/bias':
326
+ 'sem_seg_head.predictor._auxiliary_semantic_predictor.final_conv.conv.bias'})
327
+ return res
328
+
329
+
330
+ # python3 convert-tf-weights-to-d2.py kmax_resnet50_coco_train/ckpt-150000 tf_kmax_r50.pkl
331
+
332
+ if __name__ == "__main__":
333
+ input = sys.argv[1]
334
+
335
+ state_dict = load_tf_weights(input)
336
+ #exit()
337
+
338
+ state_dict_torch = {}
339
+
340
+ mapping_key = {}
341
+ if 'resnet50' in input:
342
+ mapping_key.update(tf_2_torch_mapping_r50())
343
+ elif 'convnext' in input:
344
+ mapping_key.update(tf_2_torch_mapping_convnext())
345
+ mapping_key.update(tf_2_torch_mapping_pixel_dec())
346
+ mapping_key.update(tf_2_torch_mapping_trans_dec())
347
+
348
+ mapping_key.update(tf_2_torch_mapping_aux_semanic_dec())
349
+
350
+ for k in state_dict.keys():
351
+ value = state_dict[k]
352
+ k2 = mapping_key[k]
353
+ rank = len(value.shape)
354
+
355
+ if '_batch_norm_retrieved_output' in k2 or '_batch_norm_similarity' in k2 or '_batch_norm_retrieved_value' in k2:
356
+ value = np.reshape(value, [-1])
357
+ elif 'qkv_transform.conv.weight' in k2:
358
+ # (512, 1024) -> (1024, 512, 1)
359
+ value = np.transpose(value, (1, 0))[:, :, None]
360
+ elif '_cluster_centers.weight' in k2:
361
+ # (1, 128, 256) -> (256, 128)
362
+ value = np.transpose(value[0], (1, 0))
363
+ elif '_pixel_conv1_bn_act.conv.weight' in k2:
364
+ # (1, 512, 256) -> (256, 512, 1, 1)
365
+ value = np.transpose(value, (2, 1, 0))[:, :, :, None]
366
+ elif '_pixel_v_conv_bn.conv.weight' in k2:
367
+ # (1, 256, 256) -> (256, 256, 1, 1)
368
+ value = np.transpose(value, (2, 1, 0))[:, :, :, None]
369
+ elif '_pixel_space_head_conv0bnact.conv.weight' in k2:
370
+ # (5, 5, 256, 1) -> (256, 1, 5, 5)
371
+ value = np.transpose(value, (2, 3, 0, 1))
372
+ elif '/layer_scale' in k:
373
+ value = np.reshape(value, [-1])
374
+ elif 'pwconv1.weight' in k2 or 'pwconv2.weight' in k2:
375
+ # (128, 512) -> (512, 128)
376
+ value = np.transpose(value, (1, 0))
377
+ elif ('_low_level_fusion_os4_conv0_bn_act.conv.weight' in k2
378
+ or '_low_level_fusion_os8_conv0_bn_act.conv.weight' in k2
379
+ or 'sem_seg_head.predictor._auxiliary_semantic_predictor.conv_block_0.conv.weight' in k2):
380
+ value = np.transpose(value, (2, 3, 0, 1))
381
+ else:
382
+ if rank == 1: # bias, norm etc
383
+ pass
384
+ elif rank == 2: # _query_rpe
385
+ pass
386
+ elif rank == 3: # conv 1d kernel, etc
387
+ value = np.transpose(value, (2, 1, 0))
388
+ elif rank == 4: # conv 2d kernel, etc
389
+ value = np.transpose(value, (3, 2, 0, 1))
390
+
391
+ state_dict_torch[k2] = value
392
+
393
+ res = {"model": state_dict_torch, "__author__": "third_party", "matching_heuristics": True}
394
+
395
+ with open(sys.argv[2], "wb") as f:
396
+ pkl.dump(res, f)
397
+
398
+
399
+ # r50: 52.85 -> 52.71 w/ eps 1e-3
400
+ # convnext-base: 56.85 -> 56.97 w/ eps 1e-3
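
Most of the special cases in the conversion loop above are axis re-orderings between TensorFlow and PyTorch layout conventions; the default rank-4 branch handles ordinary 2D convolutions. A minimal illustration with toy shapes (not tied to any particular layer of the model):

    import numpy as np

    # TF stores 2D conv kernels as (H, W, C_in, C_out); PyTorch nn.Conv2d expects (C_out, C_in, H, W).
    tf_kernel = np.zeros((3, 3, 64, 256))
    torch_kernel = np.transpose(tf_kernel, (3, 2, 0, 1))  # same permutation as the rank-4 case above
    assert torch_kernel.shape == (256, 64, 3, 3)
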
demo/demo.ipynb ADDED
@@ -0,0 +1,213 @@
1
+ {
2
+ "cells": [
3
+ {
4
+ "attachments": {},
5
+ "cell_type": "markdown",
6
+ "metadata": {},
7
+ "source": [
8
+ "# kMaX-DeepLab Demo\n",
9
+ "This notebook is modified by Qihang Yu, with reference from [Mask2Former's script](https://colab.research.google.com/drive/1uIWE5KbGFSjrxey2aRd5pWkKNY1_SaNq)"
10
+ ]
11
+ },
12
+ {
13
+ "attachments": {},
14
+ "cell_type": "markdown",
15
+ "metadata": {},
16
+ "source": [
17
+ "# Install detectron2"
18
+ ]
19
+ },
20
+ {
21
+ "cell_type": "code",
22
+ "execution_count": null,
23
+ "metadata": {},
24
+ "outputs": [],
25
+ "source": [
26
+ "# Install detectron2\n",
27
+ "import torch\n",
28
+ "TORCH_VERSION = \".\".join(torch.__version__.split(\".\")[:2])\n",
29
+ "CUDA_VERSION = torch.__version__.split(\"+\")[-1]\n",
30
+ "print(\"torch: \", TORCH_VERSION, \"; cuda: \", CUDA_VERSION)\n",
31
+ "# Install detectron2 that matches the above pytorch version\n",
32
+ "# See https://detectron2.readthedocs.io/tutorials/install.html for instructions\n",
33
+ "!pip install detectron2 -f https://dl.fbaipublicfiles.com/detectron2/wheels/$CUDA_VERSION/torch$TORCH_VERSION/index.html"
34
+ ]
35
+ },
36
+ {
37
+ "attachments": {},
38
+ "cell_type": "markdown",
39
+ "metadata": {},
40
+ "source": [
41
+ "# Install kMaX-DeepLab"
42
+ ]
43
+ },
44
+ {
45
+ "cell_type": "code",
46
+ "execution_count": null,
47
+ "metadata": {},
48
+ "outputs": [],
49
+ "source": [
50
+ "# clone and install kMaX-DeepLab\n",
51
+ "!git clone https://github.com/yucornetto/kmaxdeeplab_detectron2.git\n",
52
+ "%cd kmaxdeeplab_detectron2\n",
53
+ "!pip install -U opencv-python\n",
54
+ "!pip install git+https://github.com/cocodataset/panopticapi.git\n",
55
+ "!pip install -r requirements.txt"
56
+ ]
57
+ },
58
+ {
59
+ "cell_type": "code",
60
+ "execution_count": null,
61
+ "metadata": {},
62
+ "outputs": [],
63
+ "source": [
64
+ "# You may need to restart your runtime prior to this, to let your installation take effect\n",
65
+ "%cd /content/kmaxdeeplab_detectron2\n",
66
+ "# Some basic setup:\n",
67
+ "# Setup detectron2 logger\n",
68
+ "import detectron2\n",
69
+ "from detectron2.utils.logger import setup_logger\n",
70
+ "setup_logger()\n",
71
+ "setup_logger(name=\"kmax_deeplab\")\n",
72
+ "\n",
73
+ "# import some common libraries\n",
74
+ "import numpy as np\n",
75
+ "import cv2\n",
76
+ "import torch\n",
77
+ "from google.colab.patches import cv2_imshow\n",
78
+ "\n",
79
+ "# import some common detectron2 utilities\n",
80
+ "from detectron2 import model_zoo\n",
81
+ "from detectron2.engine import DefaultPredictor\n",
82
+ "from detectron2.config import get_cfg\n",
83
+ "from detectron2.utils.visualizer import Visualizer, ColorMode\n",
84
+ "from detectron2.data import MetadataCatalog\n",
85
+ "from detectron2.projects.deeplab import add_deeplab_config\n",
86
+ "coco_metadata = MetadataCatalog.get(\"coco_2017_val_panoptic\")\n",
87
+ "\n",
88
+ "# import Mask2Former project\n",
89
+ "from kmax_deeplab import add_kmax_deeplab_config"
90
+ ]
91
+ },
92
+ {
93
+ "attachments": {},
94
+ "cell_type": "markdown",
95
+ "metadata": {},
96
+ "source": [
97
+ "# Run a pre-trained Mask2Former model\n",
98
+ "We first download an image from the COCO dataset:"
99
+ ]
100
+ },
101
+ {
102
+ "cell_type": "code",
103
+ "execution_count": null,
104
+ "metadata": {},
105
+ "outputs": [],
106
+ "source": [
107
+ "!wget http://images.cocodataset.org/val2017/000000005477.jpg -q -O input.jpg\n",
108
+ "im = cv2.imread(\"./input.jpg\")\n",
109
+ "cv2_imshow(im)"
110
+ ]
111
+ },
112
+ {
113
+ "attachments": {},
114
+ "cell_type": "markdown",
115
+ "metadata": {},
116
+ "source": [
117
+ "Then, we create a detectron2 config and a detectron2 `DefaultPredictor` to run inference on this image."
118
+ ]
119
+ },
120
+ {
121
+ "cell_type": "code",
122
+ "execution_count": null,
123
+ "metadata": {},
124
+ "outputs": [],
125
+ "source": [
126
+ "cfg = get_cfg()\n",
127
+ "add_deeplab_config(cfg)\n",
128
+ "add_kmax_deeplab_config(cfg)\n",
129
+ "cfg.merge_from_file(\"configs/coco/panoptic-segmentation/kmax_convnext_large.yaml\")\n",
130
+ "cfg.MODEL.WEIGHTS = 'https://drive.google.com/uc?id=1b6rEnKw4PNTdqSdWpmb0P9dsvN0pkOiN&export=download'\n",
131
+ "cfg.MODEL.KMAX_DEEPLAB.TEST.SEMANTIC_ON = True\n",
132
+ "cfg.MODEL.KMAX_DEEPLAB.TEST.INSTANCE_ON = True\n",
133
+ "cfg.MODEL.KMAX_DEEPLAB.TEST.PANOPTIC_ON = True\n",
134
+ "predictor = DefaultPredictor(cfg)\n",
135
+ "outputs = predictor(im)"
136
+ ]
137
+ },
138
+ {
139
+ "cell_type": "code",
140
+ "execution_count": null,
141
+ "metadata": {},
142
+ "outputs": [],
143
+ "source": [
144
+ "# Show panoptic/instance/semantic predictions: \n",
145
+ "v = Visualizer(im[:, :, ::-1], coco_metadata, scale=1.2, instance_mode=ColorMode.IMAGE_BW)\n",
146
+ "panoptic_result = v.draw_panoptic_seg(outputs[\"panoptic_seg\"][0].to(\"cpu\"), outputs[\"panoptic_seg\"][1]).get_image()\n",
147
+ "v = Visualizer(im[:, :, ::-1], coco_metadata, scale=1.2, instance_mode=ColorMode.IMAGE_BW)\n",
148
+ "instance_result = v.draw_instance_predictions(outputs[\"instances\"].to(\"cpu\")).get_image()\n",
149
+ "v = Visualizer(im[:, :, ::-1], coco_metadata, scale=1.2, instance_mode=ColorMode.IMAGE_BW)\n",
150
+ "semantic_result = v.draw_sem_seg(outputs[\"sem_seg\"].argmax(0).to(\"cpu\")).get_image()\n",
151
+ "print(\"Panoptic segmentation (top), instance segmentation (middle), semantic segmentation (bottom)\")\n",
152
+ "cv2_imshow(np.concatenate((panoptic_result, instance_result, semantic_result), axis=0)[:, :, ::-1])"
153
+ ]
154
+ },
155
+ {
156
+ "attachments": {},
157
+ "cell_type": "markdown",
158
+ "metadata": {},
159
+ "source": [
160
+ "Let's try an image not from COCO as well:"
161
+ ]
162
+ },
163
+ {
164
+ "cell_type": "code",
165
+ "execution_count": null,
166
+ "metadata": {},
167
+ "outputs": [],
168
+ "source": [
169
+ "# Download a sample image and display. Replace path here to try your own images!\n",
170
+ "!wget https://web.eecs.umich.edu/~fouhey/fun/desk/desk.jpg\n",
171
+ "im = cv2.imread(\"./desk.jpg\")\n",
172
+ "cv2_imshow(im)"
173
+ ]
174
+ },
175
+ {
176
+ "cell_type": "code",
177
+ "execution_count": null,
178
+ "metadata": {},
179
+ "outputs": [],
180
+ "source": [
181
+ "outputs = predictor(im)\n",
182
+ "# Show panoptic/instance/semantic predictions: \n",
183
+ "v = Visualizer(im[:, :, ::-1], coco_metadata, scale=1.2, instance_mode=ColorMode.IMAGE_BW)\n",
184
+ "panoptic_result = v.draw_panoptic_seg(outputs[\"panoptic_seg\"][0].to(\"cpu\"), outputs[\"panoptic_seg\"][1]).get_image()\n",
185
+ "v = Visualizer(im[:, :, ::-1], coco_metadata, scale=1.2, instance_mode=ColorMode.IMAGE_BW)\n",
186
+ "instance_result = v.draw_instance_predictions(outputs[\"instances\"].to(\"cpu\")).get_image()\n",
187
+ "v = Visualizer(im[:, :, ::-1], coco_metadata, scale=1.2, instance_mode=ColorMode.IMAGE_BW)\n",
188
+ "semantic_result = v.draw_sem_seg(outputs[\"sem_seg\"].argmax(0).to(\"cpu\")).get_image()\n",
189
+ "print(\"Panoptic segmentation (top), instance segmentation (middle), semantic segmentation (bottom)\")\n",
190
+ "cv2_imshow(np.concatenate((panoptic_result, instance_result, semantic_result), axis=0)[:, :, ::-1])"
191
+ ]
192
+ }
193
+ ],
194
+ "metadata": {
195
+ "kernelspec": {
196
+ "display_name": "Python 3",
197
+ "language": "python",
198
+ "name": "python3"
199
+ },
200
+ "language_info": {
201
+ "name": "python",
202
+ "version": "3.9.6 (default, Oct 18 2022, 12:41:40) \n[Clang 14.0.0 (clang-1400.0.29.202)]"
203
+ },
204
+ "orig_nbformat": 4,
205
+ "vscode": {
206
+ "interpreter": {
207
+ "hash": "31f2aee4e71d21fbe5cf8b01ff0e069b9275f58929596ceb00d14d90e3e16cd6"
208
+ }
209
+ }
210
+ },
211
+ "nbformat": 4,
212
+ "nbformat_minor": 2
213
+ }
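
The notebook visualizes the predictions directly; for completeness, a hypothetical snippet showing how the raw `outputs` dict returned by the predictor could be inspected (field names follow the standard detectron2 panoptic output format):

    # Hypothetical inspection of the predictor outputs; not part of the notebook.
    panoptic_seg, segments_info = outputs["panoptic_seg"]
    print("panoptic map shape:", tuple(panoptic_seg.shape))
    print("number of predicted segments:", len(segments_info))
    print("category ids present:", sorted({s["category_id"] for s in segments_info}))
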
demo/demo.py ADDED
@@ -0,0 +1,156 @@
1
+ # Copyright (c) Facebook, Inc. and its affiliates.
2
+ # Modified by Bowen Cheng from: https://github.com/facebookresearch/detectron2/blob/master/demo/demo.py
3
+ import argparse
4
+ import glob
5
+ import multiprocessing as mp
6
+ import os
7
+
8
+ # fmt: off
9
+ import sys
10
+ sys.path.insert(1, os.path.join(sys.path[0], '..'))
11
+ # fmt: on
12
+
13
+ import tempfile
14
+ import time
15
+ import warnings
16
+
17
+ import cv2
18
+ import numpy as np
19
+ import tqdm
20
+
21
+ from detectron2.config import get_cfg
22
+ from detectron2.data.detection_utils import read_image
23
+ from detectron2.projects.deeplab import add_deeplab_config
24
+ from detectron2.utils.logger import setup_logger
25
+
26
+ from kmax_deeplab import add_kmax_deeplab_config
27
+ from predictor import VisualizationDemo
28
+
29
+
30
+ # constants
31
+ WINDOW_NAME = "kmaxdeeplab demo"
32
+
33
+
34
+ def setup_cfg(args):
35
+ # load config from file and command-line arguments
36
+ cfg = get_cfg()
37
+ add_deeplab_config(cfg)
38
+ add_kmax_deeplab_config(cfg)
39
+ cfg.merge_from_file(args.config_file)
40
+ cfg.merge_from_list(args.opts)
41
+ cfg.freeze()
42
+ return cfg
43
+
44
+
45
+ def get_parser():
46
+ parser = argparse.ArgumentParser(description="kmaxdeeplab demo for builtin configs")
47
+ parser.add_argument(
48
+ "--config-file",
49
+ default="configs/coco/panoptic-segmentation/kmax_convnext_large.yaml",
50
+ metavar="FILE",
51
+ help="path to config file",
52
+ )
53
+ parser.add_argument("--webcam", action="store_true", help="Take inputs from webcam.")
54
+ parser.add_argument("--video-input", help="Path to video file.")
55
+ parser.add_argument(
56
+ "--input",
57
+ nargs="+",
58
+ help="A list of space separated input images; "
59
+ "or a single glob pattern such as 'directory/*.jpg'",
60
+ )
61
+ parser.add_argument(
62
+ "--output",
63
+ help="A file or directory to save output visualizations. "
64
+ "If not given, will show output in an OpenCV window.",
65
+ )
66
+
67
+ parser.add_argument(
68
+ "--confidence-threshold",
69
+ type=float,
70
+ default=0.5,
71
+ help="Minimum score for instance predictions to be shown",
72
+ )
73
+ parser.add_argument(
74
+ "--opts",
75
+ help="Modify config options using the command-line 'KEY VALUE' pairs",
76
+ default=[],
77
+ nargs=argparse.REMAINDER,
78
+ )
79
+ return parser
80
+
81
+
82
+ def test_opencv_video_format(codec, file_ext):
83
+ with tempfile.TemporaryDirectory(prefix="video_format_test") as dir:
84
+ filename = os.path.join(dir, "test_file" + file_ext)
85
+ writer = cv2.VideoWriter(
86
+ filename=filename,
87
+ fourcc=cv2.VideoWriter_fourcc(*codec),
88
+ fps=float(30),
89
+ frameSize=(10, 10),
90
+ isColor=True,
91
+ )
92
+ [writer.write(np.zeros((10, 10, 3), np.uint8)) for _ in range(30)]
93
+ writer.release()
94
+ if os.path.isfile(filename):
95
+ return True
96
+ return False
97
+
98
+
99
+ if __name__ == "__main__":
100
+ mp.set_start_method("spawn", force=True)
101
+ args = get_parser().parse_args()
102
+ setup_logger(name="fvcore")
103
+ logger = setup_logger()
104
+ logger.info("Arguments: " + str(args))
105
+
106
+ cfg = setup_cfg(args)
107
+
108
+ demo = VisualizationDemo(cfg)
109
+
110
+ if args.input:
111
+ if len(args.input) == 1:
112
+ args.input = glob.glob(os.path.expanduser(args.input[0]))
113
+ assert args.input, "The input path(s) was not found"
114
+ for path in tqdm.tqdm(args.input, disable=not args.output):
115
+ # use PIL, to be consistent with evaluation
116
+ img = read_image(path, format="BGR")
117
+ start_time = time.time()
118
+ predictions, visualized_output = demo.run_on_image(img)
119
+ logger.info(
120
+ "{}: {} in {:.2f}s".format(
121
+ path,
122
+ "detected {} instances".format(len(predictions["instances"]))
123
+ if "instances" in predictions
124
+ else "finished",
125
+ time.time() - start_time,
126
+ )
127
+ )
128
+
129
+ ## Below are raw outputs.
130
+ # panoptic_seg, segments_info = predictions["panoptic_seg"]
131
+ # print(panoptic_seg.shape, segments_info)
132
+
133
+ if args.output:
134
+ if os.path.isdir(args.output):
135
+ assert os.path.isdir(args.output), args.output
136
+ out_filename = os.path.join(args.output, os.path.basename(path))
137
+ else:
138
+ assert len(args.input) == 1, "Please specify a directory with args.output"
139
+ out_filename = args.output
140
+ visualized_output.save(out_filename)
141
+ else:
142
+ cv2.namedWindow(WINDOW_NAME, cv2.WINDOW_NORMAL)
143
+ cv2.imshow(WINDOW_NAME, visualized_output.get_image()[:, :, ::-1])
144
+ if cv2.waitKey(0) == 27:
145
+ break # esc to quit
146
+ elif args.webcam:
147
+ assert args.input is None, "Cannot have both --input and --webcam!"
148
+ assert args.output is None, "output not yet supported with --webcam!"
149
+ cam = cv2.VideoCapture(0)
150
+ for vis in tqdm.tqdm(demo.run_on_video(cam)):
151
+ cv2.namedWindow(WINDOW_NAME, cv2.WINDOW_NORMAL)
152
+ cv2.imshow(WINDOW_NAME, vis)
153
+ if cv2.waitKey(1) == 27:
154
+ break # esc to quit
155
+ cam.release()
156
+ cv2.destroyAllWindows()
demo/predictor.py ADDED
@@ -0,0 +1,166 @@
1
+ # Copyright (c) Facebook, Inc. and its affiliates.
2
+ # Copied from: https://github.com/facebookresearch/detectron2/blob/master/demo/predictor.py
3
+ import atexit
4
+ import bisect
5
+ import multiprocessing as mp
6
+ from collections import deque
7
+
8
+ import cv2
9
+ import torch
10
+
11
+ from detectron2.data import MetadataCatalog
12
+ from detectron2.engine.defaults import DefaultPredictor
13
+ from detectron2.utils.video_visualizer import VideoVisualizer
14
+ from detectron2.utils.visualizer import ColorMode, Visualizer
15
+
16
+
17
+ class VisualizationDemo(object):
18
+ def __init__(self, cfg, instance_mode=ColorMode.IMAGE, parallel=False):
19
+ """
20
+ Args:
21
+ cfg (CfgNode):
22
+ instance_mode (ColorMode):
23
+ parallel (bool): whether to run the model in different processes from visualization.
24
+ Useful since the visualization logic can be slow.
25
+ """
26
+ self.metadata = MetadataCatalog.get(
27
+ cfg.DATASETS.TEST[0] if len(cfg.DATASETS.TEST) else "__unused"
28
+ )
29
+ self.cpu_device = torch.device("cpu")
30
+ self.instance_mode = instance_mode
31
+
32
+ self.parallel = parallel
33
+ if parallel:
34
+ num_gpu = torch.cuda.device_count()
35
+ self.predictor = AsyncPredictor(cfg, num_gpus=num_gpu)
36
+ else:
37
+ self.predictor = DefaultPredictor(cfg)
38
+
39
+ def run_on_image(self, image):
40
+ """
41
+ Args:
42
+ image (np.ndarray): an image of shape (H, W, C) (in BGR order).
43
+ This is the format used by OpenCV.
44
+ Returns:
45
+ predictions (dict): the output of the model.
46
+ vis_output (VisImage): the visualized image output.
47
+ """
48
+ vis_output = None
49
+ predictions = self.predictor(image)
50
+ # Convert image from OpenCV BGR format to Matplotlib RGB format.
51
+ image = image[:, :, ::-1]
52
+ visualizer = Visualizer(image, self.metadata, instance_mode=self.instance_mode)
53
+ if "panoptic_seg" in predictions:
54
+ panoptic_seg, segments_info = predictions["panoptic_seg"]
55
+ vis_output = visualizer.draw_panoptic_seg_predictions(
56
+ panoptic_seg.to(self.cpu_device), segments_info
57
+ )
58
+ else:
59
+ if "sem_seg" in predictions:
60
+ vis_output = visualizer.draw_sem_seg(
61
+ predictions["sem_seg"].argmax(dim=0).to(self.cpu_device)
62
+ )
63
+ if "instances" in predictions:
64
+ instances = predictions["instances"].to(self.cpu_device)
65
+ vis_output = visualizer.draw_instance_predictions(predictions=instances)
66
+
67
+ return predictions, vis_output
68
+
69
+ def _frame_from_video(self, video):
70
+ while video.isOpened():
71
+ success, frame = video.read()
72
+ if success:
73
+ yield frame
74
+ else:
75
+ break
76
+
77
+
78
+ class AsyncPredictor:
79
+ """
80
+ A predictor that runs the model asynchronously, possibly on >1 GPUs.
81
+ Because rendering the visualization takes considerably amount of time,
82
+ this helps improve throughput a little bit when rendering videos.
83
+ """
84
+
85
+ class _StopToken:
86
+ pass
87
+
88
+ class _PredictWorker(mp.Process):
89
+ def __init__(self, cfg, task_queue, result_queue):
90
+ self.cfg = cfg
91
+ self.task_queue = task_queue
92
+ self.result_queue = result_queue
93
+ super().__init__()
94
+
95
+ def run(self):
96
+ predictor = DefaultPredictor(self.cfg)
97
+
98
+ while True:
99
+ task = self.task_queue.get()
100
+ if isinstance(task, AsyncPredictor._StopToken):
101
+ break
102
+ idx, data = task
103
+ result = predictor(data)
104
+ self.result_queue.put((idx, result))
105
+
106
+ def __init__(self, cfg, num_gpus: int = 1):
107
+ """
108
+ Args:
109
+ cfg (CfgNode):
110
+ num_gpus (int): if 0, will run on CPU
111
+ """
112
+ num_workers = max(num_gpus, 1)
113
+ self.task_queue = mp.Queue(maxsize=num_workers * 3)
114
+ self.result_queue = mp.Queue(maxsize=num_workers * 3)
115
+ self.procs = []
116
+ for gpuid in range(max(num_gpus, 1)):
117
+ cfg = cfg.clone()
118
+ cfg.defrost()
119
+ cfg.MODEL.DEVICE = "cuda:{}".format(gpuid) if num_gpus > 0 else "cpu"
120
+ self.procs.append(
121
+ AsyncPredictor._PredictWorker(cfg, self.task_queue, self.result_queue)
122
+ )
123
+
124
+ self.put_idx = 0
125
+ self.get_idx = 0
126
+ self.result_rank = []
127
+ self.result_data = []
128
+
129
+ for p in self.procs:
130
+ p.start()
131
+ atexit.register(self.shutdown)
132
+
133
+ def put(self, image):
134
+ self.put_idx += 1
135
+ self.task_queue.put((self.put_idx, image))
136
+
137
+ def get(self):
138
+ self.get_idx += 1 # the index needed for this request
139
+ if len(self.result_rank) and self.result_rank[0] == self.get_idx:
140
+ res = self.result_data[0]
141
+ del self.result_data[0], self.result_rank[0]
142
+ return res
143
+
144
+ while True:
145
+ # make sure the results are returned in the correct order
146
+ idx, res = self.result_queue.get()
147
+ if idx == self.get_idx:
148
+ return res
149
+ insert = bisect.bisect(self.result_rank, idx)
150
+ self.result_rank.insert(insert, idx)
151
+ self.result_data.insert(insert, res)
152
+
153
+ def __len__(self):
154
+ return self.put_idx - self.get_idx
155
+
156
+ def __call__(self, image):
157
+ self.put(image)
158
+ return self.get()
159
+
160
+ def shutdown(self):
161
+ for _ in self.procs:
162
+ self.task_queue.put(AsyncPredictor._StopToken())
163
+
164
+ @property
165
+ def default_buffer_size(self):
166
+ return len(self.procs) * 5
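
Beyond demo/demo.py's command line, VisualizationDemo can also be driven programmatically. A hedged sketch, assuming it is run from inside the demo/ directory (as demo.py arranges via sys.path) and that `cfg` has been built as in setup_cfg:

    # Hypothetical programmatic use of VisualizationDemo; paths are placeholders.
    import cv2
    from predictor import VisualizationDemo

    demo = VisualizationDemo(cfg)        # cfg built as in demo/demo.py's setup_cfg
    img = cv2.imread("input.jpg")        # BGR image, as run_on_image expects
    predictions, vis_output = demo.run_on_image(img)
    vis_output.save("visualized.jpg")
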
docs/clustering_view_of_mask_transformer.png ADDED
docs/kmax_decoder.png ADDED
kmax_deeplab/__init__.py ADDED
@@ -0,0 +1,15 @@
+ from . import data # register all new datasets
+ from . import modeling
+
+ # config
+ from .config import add_kmax_deeplab_config
+
+ # dataset loading
+ from .data.dataset_mappers.coco_panoptic_kmaxdeeplab_dataset_mapper import COCOPanoptickMaXDeepLabDatasetMapper
+
+
+ # models
+ from .kmax_model import kMaXDeepLab
+
+ # evaluation
+ from .evaluation.instance_evaluation import InstanceSegEvaluator
kmax_deeplab/config.py ADDED
@@ -0,0 +1,96 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # -*- coding: utf-8 -*-
2
+ from detectron2.config import CfgNode as CN
3
+
4
+
5
+ def add_kmax_deeplab_config(cfg):
6
+ """
7
+ Add config for KMAX_DEEPLAB.
8
+ """
9
+ # NOTE: configs from original maskformer
10
+ # data config
11
+ # select the dataset mapper
12
+ cfg.INPUT.DATASET_MAPPER_NAME = "coco_panoptic_kmaxdeeplab"
13
+ # Color augmentation
14
+ # Pad image and segmentation GT in dataset mapper.
15
+ cfg.INPUT.SIZE_DIVISIBILITY = -1
16
+
17
+ # solver config
18
+ # weight decay on embedding
19
+ cfg.SOLVER.WEIGHT_DECAY_EMBED = 0.05
20
+ # optimizer
21
+ cfg.SOLVER.OPTIMIZER = "ADAMW"
22
+ cfg.SOLVER.BACKBONE_MULTIPLIER = 0.1
23
+
24
+ # kMaX-DeepLab model config
25
+ cfg.MODEL.KMAX_DEEPLAB = CN()
26
+
27
+ # whether to share matching results
28
+ cfg.MODEL.KMAX_DEEPLAB.SHARE_FINAL_MATCHING = True
29
+
30
+ # vis
31
+ cfg.MODEL.KMAX_DEEPLAB.SAVE_VIS_NUM = 0
32
+
33
+ # loss
34
+ cfg.MODEL.KMAX_DEEPLAB.DEEP_SUPERVISION = True
35
+ cfg.MODEL.KMAX_DEEPLAB.SKIP_CONN_INIT_VALUE = 0.0
36
+ cfg.MODEL.KMAX_DEEPLAB.NO_OBJECT_WEIGHT = 1e-5
37
+ cfg.MODEL.KMAX_DEEPLAB.CLASS_WEIGHT = 3.0
38
+ cfg.MODEL.KMAX_DEEPLAB.DICE_WEIGHT = 3.0
39
+ cfg.MODEL.KMAX_DEEPLAB.MASK_WEIGHT = 0.3
40
+ cfg.MODEL.KMAX_DEEPLAB.INSDIS_WEIGHT = 1.0
41
+ cfg.MODEL.KMAX_DEEPLAB.AUX_SEMANTIC_WEIGHT = 1.0
42
+
43
+ cfg.MODEL.KMAX_DEEPLAB.PIXEL_INSDIS_TEMPERATURE = 1.5
44
+ cfg.MODEL.KMAX_DEEPLAB.PIXEL_INSDIS_SAMPLE_K = 4096
45
+ cfg.MODEL.KMAX_DEEPLAB.AUX_SEMANTIC_TEMPERATURE = 2.0
46
+ cfg.MODEL.KMAX_DEEPLAB.UX_SEMANTIC_SAMPLE_K = 4096
47
+
48
+
49
+ # pixel decoder config
50
+ cfg.MODEL.KMAX_DEEPLAB.PIXEL_DEC = CN()
51
+ cfg.MODEL.KMAX_DEEPLAB.PIXEL_DEC.NAME = "kMaXPixelDecoder"
52
+ cfg.MODEL.KMAX_DEEPLAB.PIXEL_DEC.IN_FEATURES = ['res2', 'res3', 'res4', 'res5']
53
+ cfg.MODEL.KMAX_DEEPLAB.PIXEL_DEC.DEC_LAYERS = [1, 5, 1, 1]
54
+ cfg.MODEL.KMAX_DEEPLAB.PIXEL_DEC.LAYER_TYPES = ["axial", "axial", "bottleneck", "bottleneck"]
55
+ cfg.MODEL.KMAX_DEEPLAB.PIXEL_DEC.DEC_CHANNELS = [512, 256, 128, 64]
56
+ cfg.MODEL.KMAX_DEEPLAB.PIXEL_DEC.DROP_PATH_PROB = 0.0
57
+
58
+ # transformer decoder config
59
+ cfg.MODEL.KMAX_DEEPLAB.TRANS_DEC = CN()
60
+ cfg.MODEL.KMAX_DEEPLAB.TRANS_DEC.NAME = "kMaXTransformerDecoder"
61
+ cfg.MODEL.KMAX_DEEPLAB.TRANS_DEC.DEC_LAYERS = [2, 2, 2]
62
+ cfg.MODEL.KMAX_DEEPLAB.TRANS_DEC.NUM_OBJECT_QUERIES = 128
63
+ cfg.MODEL.KMAX_DEEPLAB.TRANS_DEC.IN_CHANNELS = [2048, 1024, 512]
64
+ cfg.MODEL.KMAX_DEEPLAB.TRANS_DEC.DROP_PATH_PROB = 0.0
65
+
66
+ # kMaX-DeepLab inference config
67
+ cfg.MODEL.KMAX_DEEPLAB.TEST = CN()
68
+ cfg.MODEL.KMAX_DEEPLAB.TEST.SEMANTIC_ON = False
69
+ cfg.MODEL.KMAX_DEEPLAB.TEST.INSTANCE_ON = False
70
+ cfg.MODEL.KMAX_DEEPLAB.TEST.PANOPTIC_ON = True
71
+ cfg.MODEL.KMAX_DEEPLAB.TEST.OBJECT_MASK_THRESHOLD = 0.4
72
+ cfg.MODEL.KMAX_DEEPLAB.TEST.CLASS_THRESHOLD_THING = 0.7
73
+ cfg.MODEL.KMAX_DEEPLAB.TEST.CLASS_THRESHOLD_STUFF = 0.5
74
+ cfg.MODEL.KMAX_DEEPLAB.TEST.REORDER_CLASS_WEIGHT = 1.0
75
+ cfg.MODEL.KMAX_DEEPLAB.TEST.REORDER_MASK_WEIGHT = 1.0
76
+ cfg.MODEL.KMAX_DEEPLAB.TEST.OVERLAP_THRESHOLD = 0.8
77
+ cfg.MODEL.KMAX_DEEPLAB.TEST.SEM_SEG_POSTPROCESSING_BEFORE_INFERENCE = False
78
+
79
+ # Sometimes `backbone.size_divisibility` is set to 0 for some backbone (e.g. ResNet)
80
+ # you can use this config to override
81
+ cfg.MODEL.KMAX_DEEPLAB.SIZE_DIVISIBILITY = -1
82
+
83
+ # https://github.com/SHI-Labs/OneFormer/blob/main/oneformer/config.py#L197
84
+ cfg.MODEL.CONVNEXT = CN()
85
+ cfg.MODEL.CONVNEXT.IN_CHANNELS = 3
86
+ cfg.MODEL.CONVNEXT.DEPTHS = [3, 3, 27, 3]
87
+ cfg.MODEL.CONVNEXT.DIMS = [192, 384, 768, 1536]
88
+ cfg.MODEL.CONVNEXT.DROP_PATH_RATE = 0.6
89
+ cfg.MODEL.CONVNEXT.LSIT = 1e-6
90
+ cfg.MODEL.CONVNEXT.OUT_INDICES = [0, 1, 2, 3]
91
+ cfg.MODEL.CONVNEXT.OUT_FEATURES = ["res2", "res3", "res4", "res5"]
92
+
93
+ cfg.INPUT.IMAGE_SIZE = [1281, 1281]
94
+ cfg.INPUT.MIN_SCALE = 0.2
95
+ cfg.INPUT.MAX_SCALE = 2.0
96
+
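
A minimal sketch of how these defaults are typically consumed: build a base detectron2 config, add the DeepLab and kMaX-DeepLab keys, then merge one of the YAMLs under configs/coco/panoptic-segmentation/ (the checkpoint path below is a placeholder):

from detectron2.config import get_cfg
from detectron2.projects.deeplab import add_deeplab_config
from kmax_deeplab import add_kmax_deeplab_config

cfg = get_cfg()
add_deeplab_config(cfg)       # DeepLab keys expected by the backbone / sem-seg head
add_kmax_deeplab_config(cfg)  # the keys registered in this file
cfg.merge_from_file("configs/coco/panoptic-segmentation/kmax_r50.yaml")
cfg.MODEL.WEIGHTS = "path/to/weights.pth"  # placeholder checkpoint path
cfg.freeze()
print(cfg.MODEL.KMAX_DEEPLAB.TRANS_DEC.NUM_OBJECT_QUERIES)  # 128 by default
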
kmax_deeplab/data/__init__.py ADDED
@@ -0,0 +1 @@
1
+ from . import datasets
kmax_deeplab/data/dataset_mappers/__init__.py ADDED
File without changes
kmax_deeplab/data/dataset_mappers/coco_panoptic_kmaxdeeplab_dataset_mapper.py ADDED
@@ -0,0 +1,326 @@
1
+ # Reference: https://github.com/facebookresearch/Mask2Former/blob/main/mask2former/data/dataset_mappers/coco_panoptic_new_baseline_dataset_mapper.py
2
+ # modified by Qihang Yu
3
+ import copy
4
+ import logging
5
+
6
+ import numpy as np
7
+ import torch
8
+ import random
9
+
10
+ from detectron2.config import configurable
11
+ from detectron2.data import detection_utils as utils
12
+ from detectron2.data import transforms as T
13
+ from detectron2.projects.point_rend import ColorAugSSDTransform
14
+ from detectron2.structures import BitMasks, Boxes, Instances
15
+
16
+ import os
17
+
18
+ __all__ = ["COCOPanoptickMaXDeepLabDatasetMapper"]
19
+
20
+
21
+ def build_transform_gen(cfg, is_train, scale_ratio=1.0):
22
+ """
23
+ Create a list of default :class:`Augmentation` from config.
24
+ Now it includes resizing and flipping.
25
+ Returns:
26
+ list[Augmentation]
27
+ """
28
+ image_size = cfg.INPUT.IMAGE_SIZE
29
+ assert is_train
30
+
31
+ min_scale = cfg.INPUT.MIN_SCALE * scale_ratio
32
+ max_scale = cfg.INPUT.MAX_SCALE * scale_ratio
33
+
34
+
35
+ augmentation = [
36
+ T.ResizeScale(
37
+ min_scale=min_scale, max_scale=max_scale, target_height=image_size[0], target_width=image_size[1]
38
+ ),
39
+ ColorAugSSDTransform(img_format=cfg.INPUT.FORMAT),
40
+ T.RandomCrop(crop_type="absolute", crop_size=(image_size[0], image_size[1])),
41
+ T.RandomFlip(),
42
+ ]
43
+
44
+ return augmentation
45
+
46
+
47
+ class COCOPanoptickMaXDeepLabDatasetMapper:
48
+ """
49
+ A callable which takes a dataset dict in Detectron2 Dataset format,
50
+ and maps it into a format used by kMaX-DeepLab.
51
+
52
+ The callable currently does the following:
53
+
54
+ 1. Reads the image from "file_name"
55
+ 2. Applies geometric transforms to the image and annotation
56
+ 3. Finds and applies suitable cropping to the image and annotation
57
+ 4. Prepares the image and annotation as Tensors
58
+ """
59
+
60
+ @configurable
61
+ def __init__(
62
+ self,
63
+ is_train=True,
64
+ *,
65
+ tfm_gens,
66
+ tfm_gens_copy_paste,
67
+ image_format,
68
+ image_size,
69
+ ):
70
+ """
71
+ NOTE: this interface is experimental.
72
+ Args:
73
+ is_train: for training or inference
74
+ augmentations: a list of augmentations or deterministic transforms to apply
75
+ tfm_gens: data augmentation
76
+ tfm_gens_copy_paste: data augmentation
77
+ image_format: an image format supported by :func:`detection_utils.read_image`
78
+ image_size: expected image size
79
+ """
80
+ self.tfm_gens = tfm_gens
81
+ self.tfm_gens_copy_paste = tfm_gens_copy_paste
82
+ if is_train:
83
+ logging.getLogger(__name__).info(
84
+ "[COCOPanopticDeepLab2DatasetMapper] Full TransformGens used in training: {}, {}".format(
85
+ str(self.tfm_gens), str(self.tfm_gens_copy_paste)
86
+ )
87
+ )
88
+ else:
89
+ logging.getLogger(__name__).info(
90
+ "[COCOPanopticDeepLab2DatasetMapper] Full TransformGens used in testing: {}".format(
91
+ str(self.tfm_gens)
92
+ )
93
+ )
94
+ self.img_format = image_format
95
+ self.is_train = is_train
96
+ self.image_size = image_size
97
+
98
+ dataset_root = os.getenv("DETECTRON2_DATASETS", "datasets")
99
+ image_dir = os.path.join(dataset_root, "coco/train2017")
100
+ gt_dir = os.path.join(dataset_root, "coco/panoptic_train2017")
101
+ semseg_dir = os.path.join(dataset_root, "coco/panoptic_semseg_train2017")
102
+ json_file = os.path.join(dataset_root, "coco/annotations/panoptic_train2017.json")
103
+ from ..datasets import register_coco_panoptic_annos_semseg
104
+ meta_data = register_coco_panoptic_annos_semseg.get_metadata()
105
+ self.dataset_dict_all = register_coco_panoptic_annos_semseg.load_coco_panoptic_json(
106
+ json_file, image_dir, gt_dir, semseg_dir, meta_data
107
+ )
108
+ self.filename2idx = {}
109
+ for idx, dataset_dict in enumerate(self.dataset_dict_all):
110
+ self.filename2idx[dataset_dict["file_name"].split('/')[-1].replace('.jpg', '')] = idx
111
+
112
+
113
+ @classmethod
114
+ def from_config(cls, cfg, is_train=True):
115
+ # Build augmentation
116
+ tfm_gens = build_transform_gen(cfg, is_train)
117
+ tfm_gens_copy_paste = build_transform_gen(cfg, is_train, scale_ratio=0.5)
118
+ ret = {
119
+ "is_train": is_train,
120
+ "tfm_gens": tfm_gens,
121
+ "tfm_gens_copy_paste": tfm_gens_copy_paste,
122
+ "image_format": cfg.INPUT.FORMAT,
123
+ "image_size": cfg.INPUT.IMAGE_SIZE
124
+ }
125
+ return ret
126
+
127
+ def read_dataset_dict(self, dataset_dict, is_copy_paste=False):
128
+ """
129
+ Args:
130
+ dataset_dict (dict): Metadata of one image, in Detectron2 Dataset format.
131
+
132
+ Returns:
133
+ dict: a format that builtin models in detectron2 accept
134
+ """
135
+ dataset_dict = copy.deepcopy(dataset_dict) # it will be modified by code below
136
+ image = utils.read_image(dataset_dict["file_name"], format=self.img_format)
137
+ utils.check_image_size(dataset_dict, image)
138
+
139
+ if not is_copy_paste:
140
+ image, transforms = T.apply_transform_gens(self.tfm_gens, image)
141
+ else:
142
+ image, transforms = T.apply_transform_gens(self.tfm_gens_copy_paste, image)
143
+
144
+ dataset_dict["image"] = np.ascontiguousarray(image.transpose(2, 0, 1))
145
+
146
+ if not self.is_train:
147
+ dataset_dict.pop("annotations", None)
148
+ return dataset_dict, None
149
+
150
+ # We pad the image manually, for copy-paste purpose.
151
+ padded_image = np.zeros((3, self.image_size[0], self.image_size[1]), dtype=dataset_dict["image"].dtype)
152
+ new_h, new_w = dataset_dict["image"].shape[1:]
153
+ offset_h, offset_w = 0, 0 # following the d2 panoptic deeplab implementation to only perform bottom/right padding.
154
+ padded_image[:, offset_h:offset_h+new_h, offset_w:offset_w+new_w] = dataset_dict["image"]
155
+ dataset_dict["image"] = padded_image
156
+ if "pan_seg_file_name" in dataset_dict:
157
+ pan_seg_gt = utils.read_image(dataset_dict.pop("pan_seg_file_name"), "RGB")
158
+
159
+ # apply the same transformation to panoptic segmentation
160
+ pan_seg_gt = transforms.apply_segmentation(pan_seg_gt)
161
+
162
+ from panopticapi.utils import rgb2id
163
+
164
+ pan_seg_gt = rgb2id(pan_seg_gt) # int32 # H x W
165
+ # Similarly, we manually pad the label, and we use label -1 to indicate those padded pixels.
166
+ # In this way, we can mask out the padded pixel values to 0 after normalization, which aligns the
167
+ # behavior between training and testing.
168
+ padded_pan_seg_gt = np.zeros((self.image_size[0], self.image_size[1]), dtype=pan_seg_gt.dtype)
169
+ is_real_pixels = np.zeros((self.image_size[0], self.image_size[1]), dtype=bool)  # builtin bool: np.bool is removed in newer NumPy
170
+ padded_pan_seg_gt[offset_h:offset_h+new_h, offset_w:offset_w+new_w] = pan_seg_gt
171
+ is_real_pixels[offset_h:offset_h+new_h, offset_w:offset_w+new_w] = True
172
+ dataset_dict["is_real_pixels"] = is_real_pixels
173
+ pan_seg_gt = padded_pan_seg_gt
174
+ return dataset_dict, pan_seg_gt
175
+
176
+ # This should never happen.
177
+ raise NotImplementedError
178
+
179
+ def call_copypaste(self, dataset_dict):
180
+ """
181
+ Args:
182
+ dataset_dict (dict): Metadata of one image, in Detectron2 Dataset format.
183
+
184
+ Returns:
185
+ dict: a format that builtin models in detectron2 accept
186
+ """
187
+ # Read main image.
188
+ dataset_dict, pan_seg_gt = self.read_dataset_dict(dataset_dict, is_copy_paste=False)
189
+ # Read copy-paste image.
190
+ # We add the main image index as a bias to the random number, in case the same random numbers are generated across devices.
191
+ main_image_idx = self.filename2idx[dataset_dict["file_name"].split('/')[-1].replace('.jpg', '')]
192
+ random_image_idx = main_image_idx + random.randint(0, len(self.dataset_dict_all) - 1)
193
+ random_image_idx = random_image_idx % len(self.dataset_dict_all)
194
+ dataset_dict_copy_paste = copy.deepcopy(self.dataset_dict_all[random_image_idx])
195
+ dataset_dict_copy_paste, pan_seg_gt_copy_paste = self.read_dataset_dict(dataset_dict_copy_paste, is_copy_paste=True)
196
+
197
+ # Copy data_dict_copy_paste onto data_dict. 0 means keep original pixel, 1 means use copy-paste pixel.
198
+ copy_paste_masks = np.zeros((pan_seg_gt.shape[-2], pan_seg_gt.shape[-1]))
199
+
200
+ segments_info_copy_paste = dataset_dict_copy_paste["segments_info"]
201
+ all_ids = []
202
+ thing_ids = []
203
+ for segment_info_copy_paste in segments_info_copy_paste:
204
+ class_id = segment_info_copy_paste["category_id"]
205
+ if not segment_info_copy_paste["iscrowd"]:
206
+ # -1 is reserved for padded pixels.
207
+ if segment_info_copy_paste["id"] in [-1, 0]:
208
+ print(segment_info_copy_paste)
209
+ raise ValueError("id should not be -1, 0")
210
+ all_ids.append(segment_info_copy_paste["id"])
211
+ if segment_info_copy_paste["isthing"]: # All thing classes are copy-pasted.
212
+ thing_ids.append(segment_info_copy_paste["id"])
213
+
214
+ # Shuffle and randomly select kept label ids.
215
+ random.shuffle(all_ids)
216
+ keep_number = random.randint(0, len(all_ids))
217
+
218
+ for index, label_id in enumerate(all_ids):
219
+ # randomly copy labels, but keep all thing classes.
220
+ if index < keep_number or label_id in thing_ids:
221
+ copy_paste_masks[pan_seg_gt_copy_paste == label_id] = 1
222
+
223
+ # We merge the image and copy-paste image based on the copy-paste mask.
224
+ dataset_dict["image"] = (dataset_dict["image"] * (1.0 - copy_paste_masks).astype(dataset_dict["image"].dtype) +
225
+ dataset_dict_copy_paste["image"] * copy_paste_masks.astype(dataset_dict["image"].dtype))
226
+ dataset_dict["image"] = torch.as_tensor(dataset_dict["image"])
227
+
228
+ dataset_dict["is_real_pixels"] = (dataset_dict["is_real_pixels"] * (1.0 - copy_paste_masks).astype(dataset_dict["is_real_pixels"].dtype) +
229
+ dataset_dict_copy_paste["is_real_pixels"] * copy_paste_masks.astype(dataset_dict["is_real_pixels"].dtype))
230
+ dataset_dict["is_real_pixels"] = torch.as_tensor(dataset_dict["is_real_pixels"])
231
+ # We set all ids in copy-paste image to be negative, so that there will be no overlap between original id and copy-paste id.
232
+ pan_seg_gt_copy_paste = -pan_seg_gt_copy_paste
233
+ pan_seg_gt = (pan_seg_gt * (1.0 - copy_paste_masks).astype(pan_seg_gt.dtype) +
234
+ pan_seg_gt_copy_paste * copy_paste_masks.astype(pan_seg_gt.dtype))
235
+
236
+ # We use 4x downsampled gt for final supervision.
237
+ pan_seg_gt = pan_seg_gt[::4, ::4]
238
+ sem_seg_gt = -np.ones_like(pan_seg_gt) # H x W, init with -1
239
+
240
+ # We then process the obtained pan_seg_gt to training format.
241
+ image_shape = dataset_dict["image"].shape[1:] # h, w
242
+ segments_info = dataset_dict["segments_info"]
243
+ instances = Instances(image_shape)
244
+ classes = []
245
+ masks = []
246
+ valid_pixel_num = 0
247
+ # As the two images may share same stuff classes, we use a dict to track existing stuff and merge them.
248
+ stuff_class_to_idx = {}
249
+ for segment_info in segments_info:
250
+ class_id = segment_info["category_id"]
251
+ if not segment_info["iscrowd"]:
252
+ # -1 is reserved to indicate padded pixels.
253
+ if segment_info["id"] in [-1, 0]:
254
+ print(segment_info)
255
+ raise ValueError("id should not be -1, 0")
256
+ binary_mask = (pan_seg_gt == segment_info["id"])
257
+ # As it is possible that some masks are removed during the copy-paste process, we need
258
+ # to double check that the mask still exists.
259
+ valid_pixel_num_ = binary_mask.sum()
260
+ valid_pixel_num += valid_pixel_num_
261
+ if valid_pixel_num_ > 0:
262
+ sem_seg_gt[binary_mask] = class_id
263
+ if not segment_info["isthing"]:
264
+ # For original image, stuff should only appear once.
265
+ if class_id in stuff_class_to_idx:
266
+ raise ValueError('class_id should not already be in stuff_class_to_idx!')
267
+ else:
268
+ stuff_class_to_idx[class_id] = len(masks)
269
+ classes.append(class_id)
270
+ masks.append(binary_mask)
271
+
272
+ for segment_info in segments_info_copy_paste:
273
+ class_id = segment_info["category_id"]
274
+ if not segment_info["iscrowd"]:
275
+ # -1 is reserved to indicate padded pixels.
276
+ if segment_info["id"] in [-1, 0]:
277
+ print(segment_info)
278
+ raise ValueError("id should not be -1, 0")
279
+ # Note that copy-paste id is negative.
280
+ binary_mask = (pan_seg_gt == -segment_info["id"])
281
+ valid_pixel_num_ = binary_mask.sum()
282
+ valid_pixel_num += valid_pixel_num_
283
+ if valid_pixel_num_ > 0:
284
+ sem_seg_gt[binary_mask] = class_id
285
+ if not segment_info["isthing"]:
286
+ # The stuff in copy-paste image already appeared in original image.
287
+ if class_id in stuff_class_to_idx:
288
+ # Merge into original stuff masks.
289
+ masks[stuff_class_to_idx[class_id]] = np.logical_or(masks[stuff_class_to_idx[class_id]], binary_mask)
290
+ continue
291
+ else:
292
+ stuff_class_to_idx[class_id] = len(masks)
293
+ classes.append(class_id)
294
+ masks.append(binary_mask)
295
+
296
+ classes = np.array(classes)
297
+ instances.gt_classes = torch.tensor(classes, dtype=torch.int64)
298
+ sem_seg_gt = torch.tensor(sem_seg_gt, dtype=torch.int64)
299
+
300
+ if len(masks) == 0:
301
+ # Some image does not have annotation (all ignored)
302
+ instances.gt_masks = torch.zeros((0, pan_seg_gt.shape[-2], pan_seg_gt.shape[-1]))
303
+ instances.gt_boxes = Boxes(torch.zeros((0, 4)))
304
+ else:
305
+ masks = BitMasks(
306
+ torch.stack([torch.from_numpy(np.ascontiguousarray(x.copy())) for x in masks])
307
+ )
308
+ instances.gt_masks = masks.tensor
309
+ instances.gt_boxes = masks.get_bounding_boxes()
310
+
311
+ dataset_dict["instances"] = instances
312
+ dataset_dict["sem_seg_gt"] = sem_seg_gt
313
+ dataset_dict["valid_pixel_num"] = valid_pixel_num
314
+ return dataset_dict
315
+
316
+ def __call__(self, dataset_dict):
317
+ res = self.call_copypaste(dataset_dict)
318
+ while ("instances" in res and res["instances"].gt_masks.shape[0] == 0) or ("valid_pixel_num" in res and res["valid_pixel_num"] <= 4096):
319
+ # this gt is empty or contains too many void pixels, let's re-generate one.
320
+ main_image_idx = self.filename2idx[dataset_dict["file_name"].split('/')[-1].replace('.jpg', '')]
321
+ random_image_idx = main_image_idx + random.randint(0, len(self.dataset_dict_all) - 1)
322
+ random_image_idx = random_image_idx % len(self.dataset_dict_all)
323
+ dataset_dict = self.dataset_dict_all[random_image_idx]
324
+ res = self.call_copypaste(dataset_dict)
325
+
326
+ return res
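
The heart of the copy-paste augmentation above is a per-pixel blend driven by a binary mask, with the pasted segment ids negated so they can never collide with the ids of the main image. A toy NumPy illustration of that arithmetic (all values invented):

import numpy as np

pan_main = np.array([[1, 1], [2, 2]], dtype=np.int32)     # segment ids, main image
pan_paste = np.array([[7, 7], [7, 8]], dtype=np.int32)    # segment ids, copy-paste image
copy_mask = np.array([[0, 1], [0, 1]], dtype=np.float64)  # 1 = take the copy-paste pixel

pan_paste = -pan_paste  # negate pasted ids so they stay disjoint from the originals
merged = (pan_main * (1.0 - copy_mask).astype(pan_main.dtype)
          + pan_paste * copy_mask.astype(pan_main.dtype))
print(merged)  # [[ 1 -7]
               #  [ 2 -8]]
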
kmax_deeplab/data/datasets/__init__.py ADDED
@@ -0,0 +1,3 @@
1
+ from . import (
2
+ register_coco_panoptic_annos_semseg,
3
+ )
kmax_deeplab/data/datasets/register_coco_panoptic_annos_semseg.py ADDED
@@ -0,0 +1,182 @@
1
+ # Reference: https://github.com/facebookresearch/Mask2Former/blob/main/mask2former/data/datasets/register_coco_panoptic_annos_semseg.py
2
+
3
+ import json
4
+ import os
5
+
6
+ from detectron2.data import DatasetCatalog, MetadataCatalog
7
+ from detectron2.data.datasets import load_sem_seg
8
+ from detectron2.data.datasets.builtin_meta import COCO_CATEGORIES
9
+ from detectron2.utils.file_io import PathManager
10
+
11
+
12
+ _PREDEFINED_SPLITS_COCO_PANOPTIC = {
13
+ "coco_2017_train_panoptic": (
14
+ # This is the original panoptic annotation directory
15
+ "coco/panoptic_train2017",
16
+ "coco/annotations/panoptic_train2017.json",
17
+ # This directory contains semantic annotations that are
18
+ # converted from panoptic annotations.
19
+ # It is used by PanopticFPN.
20
+ # You can use the script at detectron2/datasets/prepare_panoptic_fpn.py
21
+ # to create these directories.
22
+ "coco/panoptic_semseg_train2017",
23
+ ),
24
+ "coco_2017_val_panoptic": (
25
+ "coco/panoptic_val2017",
26
+ "coco/annotations/panoptic_val2017.json",
27
+ "coco/panoptic_semseg_val2017",
28
+ ),
29
+ }
30
+
31
+
32
+ def get_metadata():
33
+ meta = {}
34
+ # The following metadata maps contiguous id from [0, #thing categories +
35
+ # #stuff categories) to their names and colors. We have to replica of the
36
+ # same name and color under "thing_*" and "stuff_*" because the current
37
+ # visualization function in D2 handles thing and class classes differently
38
+ # due to some heuristic used in Panoptic FPN. We keep the same naming to
39
+ # enable reusing existing visualization functions.
40
+ thing_classes = [k["name"] for k in COCO_CATEGORIES if k["isthing"] == 1]
41
+ thing_colors = [k["color"] for k in COCO_CATEGORIES if k["isthing"] == 1]
42
+ stuff_classes = [k["name"] for k in COCO_CATEGORIES]
43
+ stuff_colors = [k["color"] for k in COCO_CATEGORIES]
44
+
45
+ meta["thing_classes"] = thing_classes
46
+ meta["thing_colors"] = thing_colors
47
+ meta["stuff_classes"] = stuff_classes
48
+ meta["stuff_colors"] = stuff_colors
49
+
50
+ # Convert category id for training:
51
+ # category id: like semantic segmentation, it is the class id for each
52
+ # pixel. Since there are some classes not used in evaluation, the category
53
+ # id is not always contiguous and thus we have two set of category ids:
54
+ # - original category id: category id in the original dataset, mainly
55
+ # used for evaluation.
56
+ # - contiguous category id: [0, #classes), in order to train the linear
57
+ # softmax classifier.
58
+ thing_dataset_id_to_contiguous_id = {}
59
+ stuff_dataset_id_to_contiguous_id = {}
60
+
61
+ for i, cat in enumerate(COCO_CATEGORIES):
62
+ if cat["isthing"]:
63
+ thing_dataset_id_to_contiguous_id[cat["id"]] = i
64
+ # else:
65
+ # stuff_dataset_id_to_contiguous_id[cat["id"]] = i
66
+
67
+ # in order to use sem_seg evaluator
68
+ stuff_dataset_id_to_contiguous_id[cat["id"]] = i
69
+
70
+ meta["thing_dataset_id_to_contiguous_id"] = thing_dataset_id_to_contiguous_id
71
+ meta["stuff_dataset_id_to_contiguous_id"] = stuff_dataset_id_to_contiguous_id
72
+
73
+ return meta
74
+
75
+
76
+ def load_coco_panoptic_json(json_file, image_dir, gt_dir, semseg_dir, meta):
77
+ """
78
+ Args:
79
+ image_dir (str): path to the raw dataset. e.g., "~/coco/train2017".
80
+ gt_dir (str): path to the raw annotations. e.g., "~/coco/panoptic_train2017".
81
+ json_file (str): path to the json file. e.g., "~/coco/annotations/panoptic_train2017.json".
82
+ Returns:
83
+ list[dict]: a list of dicts in Detectron2 standard format. (See
84
+ `Using Custom Datasets </tutorials/datasets.html>`_ )
85
+ """
86
+
87
+ def _convert_category_id(segment_info, meta):
88
+ if segment_info["category_id"] in meta["thing_dataset_id_to_contiguous_id"]:
89
+ segment_info["category_id"] = meta["thing_dataset_id_to_contiguous_id"][
90
+ segment_info["category_id"]
91
+ ]
92
+ segment_info["isthing"] = True
93
+ else:
94
+ segment_info["category_id"] = meta["stuff_dataset_id_to_contiguous_id"][
95
+ segment_info["category_id"]
96
+ ]
97
+ segment_info["isthing"] = False
98
+ return segment_info
99
+
100
+ with PathManager.open(json_file) as f:
101
+ json_info = json.load(f)
102
+
103
+ ret = []
104
+ for ann in json_info["annotations"]:
105
+ image_id = int(ann["image_id"])
106
+ # TODO: currently we assume image and label has the same filename but
107
+ # different extension, and images have extension ".jpg" for COCO. Need
108
+ # to make image extension a user-provided argument if we extend this
109
+ # function to support other COCO-like datasets.
110
+ image_file = os.path.join(image_dir, os.path.splitext(ann["file_name"])[0] + ".jpg")
111
+ label_file = os.path.join(gt_dir, ann["file_name"])
112
+ sem_label_file = os.path.join(semseg_dir, ann["file_name"])
113
+ segments_info = [_convert_category_id(x, meta) for x in ann["segments_info"]]
114
+ ret.append(
115
+ {
116
+ "file_name": image_file,
117
+ "image_id": image_id,
118
+ "pan_seg_file_name": label_file,
119
+ "sem_seg_file_name": sem_label_file,
120
+ "segments_info": segments_info,
121
+ }
122
+ )
123
+ assert len(ret), f"No images found in {image_dir}!"
124
+ assert PathManager.isfile(ret[0]["file_name"]), ret[0]["file_name"]
125
+ assert PathManager.isfile(ret[0]["pan_seg_file_name"]), ret[0]["pan_seg_file_name"]
126
+ assert PathManager.isfile(ret[0]["sem_seg_file_name"]), ret[0]["sem_seg_file_name"]
127
+ return ret
128
+
129
+
130
+ def register_coco_panoptic_annos_sem_seg(
131
+ name, metadata, image_root, panoptic_root, panoptic_json, sem_seg_root, instances_json
132
+ ):
133
+ panoptic_name = name
134
+ delattr(MetadataCatalog.get(panoptic_name), "thing_classes")
135
+ delattr(MetadataCatalog.get(panoptic_name), "thing_colors")
136
+ MetadataCatalog.get(panoptic_name).set(
137
+ thing_classes=metadata["thing_classes"],
138
+ thing_colors=metadata["thing_colors"],
139
+ # thing_dataset_id_to_contiguous_id=metadata["thing_dataset_id_to_contiguous_id"],
140
+ )
141
+
142
+ # the name is "coco_2017_train_panoptic_with_sem_seg" and "coco_2017_val_panoptic_with_sem_seg"
143
+ semantic_name = name + "_with_sem_seg"
144
+ DatasetCatalog.register(
145
+ semantic_name,
146
+ lambda: load_coco_panoptic_json(panoptic_json, image_root, panoptic_root, sem_seg_root, metadata),
147
+ )
148
+ MetadataCatalog.get(semantic_name).set(
149
+ sem_seg_root=sem_seg_root,
150
+ panoptic_root=panoptic_root,
151
+ image_root=image_root,
152
+ panoptic_json=panoptic_json,
153
+ json_file=instances_json,
154
+ evaluator_type="coco_panoptic_seg",
155
+ ignore_label=255,
156
+ label_divisor=1000,
157
+ **metadata,
158
+ )
159
+
160
+
161
+ def register_all_coco_panoptic_annos_sem_seg(root):
162
+ for (
163
+ prefix,
164
+ (panoptic_root, panoptic_json, semantic_root),
165
+ ) in _PREDEFINED_SPLITS_COCO_PANOPTIC.items():
166
+ prefix_instances = prefix[: -len("_panoptic")]
167
+ instances_meta = MetadataCatalog.get(prefix_instances)
168
+ image_root, instances_json = instances_meta.image_root, instances_meta.json_file
169
+
170
+ register_coco_panoptic_annos_sem_seg(
171
+ prefix,
172
+ get_metadata(),
173
+ image_root,
174
+ os.path.join(root, panoptic_root),
175
+ os.path.join(root, panoptic_json),
176
+ os.path.join(root, semantic_root),
177
+ instances_json,
178
+ )
179
+
180
+
181
+ _root = os.getenv("DETECTRON2_DATASETS", "datasets")
182
+ register_all_coco_panoptic_annos_sem_seg(_root)
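
Once this module has been imported (e.g. via import kmax_deeplab), the splits above are reachable through detectron2's catalogs under the "_with_sem_seg" suffix. A small sketch; materializing the dataset dicts assumes DETECTRON2_DATASETS points at a prepared COCO layout:

from detectron2.data import DatasetCatalog, MetadataCatalog
import kmax_deeplab  # noqa: F401 -- triggers the registration at the bottom of this file

meta = MetadataCatalog.get("coco_2017_val_panoptic_with_sem_seg")
print(len(meta.thing_classes), len(meta.stuff_classes))  # 80 thing / 133 total categories

dicts = DatasetCatalog.get("coco_2017_val_panoptic_with_sem_seg")  # reads panoptic_val2017.json
print(sorted(dicts[0].keys()))
# ['file_name', 'image_id', 'pan_seg_file_name', 'segments_info', 'sem_seg_file_name']
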
kmax_deeplab/evaluation/__init__.py ADDED
File without changes
kmax_deeplab/evaluation/instance_evaluation.py ADDED
@@ -0,0 +1,107 @@
1
+ # Reference: https://github.com/facebookresearch/Mask2Former/blob/main/mask2former/evaluation/instance_evaluation.py
2
+ import contextlib
3
+ import copy
4
+ import io
5
+ import itertools
6
+ import json
7
+ import logging
8
+ import numpy as np
9
+ import os
10
+ import pickle
11
+ from collections import OrderedDict
12
+ import pycocotools.mask as mask_util
13
+ import torch
14
+ from pycocotools.coco import COCO
15
+ from pycocotools.cocoeval import COCOeval
16
+ from tabulate import tabulate
17
+
18
+ import detectron2.utils.comm as comm
19
+ from detectron2.config import CfgNode
20
+ from detectron2.data import MetadataCatalog
21
+ from detectron2.data.datasets.coco import convert_to_coco_json
22
+ from detectron2.evaluation.coco_evaluation import COCOEvaluator, _evaluate_predictions_on_coco
23
+ from detectron2.evaluation.fast_eval_api import COCOeval_opt
24
+ from detectron2.structures import Boxes, BoxMode, pairwise_iou
25
+ from detectron2.utils.file_io import PathManager
26
+ from detectron2.utils.logger import create_small_table
27
+
28
+
29
+ # Modified from COCOEvaluator for instance segmentation
30
+ class InstanceSegEvaluator(COCOEvaluator):
31
+ """
32
+ Evaluate AR for object proposals, AP for instance detection/segmentation, AP
33
+ for keypoint detection outputs using COCO's metrics.
34
+ See http://cocodataset.org/#detection-eval and
35
+ http://cocodataset.org/#keypoints-eval to understand its metrics.
36
+ The metrics range from 0 to 100 (instead of 0 to 1), where a -1 or NaN means
37
+ the metric cannot be computed (e.g. due to no predictions made).
38
+
39
+ In addition to COCO, this evaluator is able to support any bounding box detection,
40
+ instance segmentation, or keypoint detection dataset.
41
+ """
42
+
43
+ def _eval_predictions(self, predictions, img_ids=None):
44
+ """
45
+ Evaluate predictions. Fill self._results with the metrics of the tasks.
46
+ """
47
+ self._logger.info("Preparing results for COCO format ...")
48
+ coco_results = list(itertools.chain(*[x["instances"] for x in predictions]))
49
+ tasks = self._tasks or self._tasks_from_predictions(coco_results)
50
+
51
+ # unmap the category ids for COCO
52
+ if hasattr(self._metadata, "thing_dataset_id_to_contiguous_id"):
53
+ dataset_id_to_contiguous_id = self._metadata.thing_dataset_id_to_contiguous_id
54
+ # all_contiguous_ids = list(dataset_id_to_contiguous_id.values())
55
+ # num_classes = len(all_contiguous_ids)
56
+ # assert min(all_contiguous_ids) == 0 and max(all_contiguous_ids) == num_classes - 1
57
+
58
+ reverse_id_mapping = {v: k for k, v in dataset_id_to_contiguous_id.items()}
59
+ for result in coco_results:
60
+ category_id = result["category_id"]
61
+ # assert category_id < num_classes, (
62
+ # f"A prediction has class={category_id}, "
63
+ # f"but the dataset only has {num_classes} classes and "
64
+ # f"predicted class id should be in [0, {num_classes - 1}]."
65
+ # )
66
+ assert category_id in reverse_id_mapping, (
67
+ f"A prediction has class={category_id}, "
68
+ f"but the dataset only has class ids in {dataset_id_to_contiguous_id}."
69
+ )
70
+ result["category_id"] = reverse_id_mapping[category_id]
71
+
72
+ if self._output_dir:
73
+ file_path = os.path.join(self._output_dir, "coco_instances_results.json")
74
+ self._logger.info("Saving results to {}".format(file_path))
75
+ with PathManager.open(file_path, "w") as f:
76
+ f.write(json.dumps(coco_results))
77
+ f.flush()
78
+
79
+ if not self._do_evaluation:
80
+ self._logger.info("Annotations are not available for evaluation.")
81
+ return
82
+
83
+ self._logger.info(
84
+ "Evaluating predictions with {} COCO API...".format(
85
+ "unofficial" if self._use_fast_impl else "official"
86
+ )
87
+ )
88
+ for task in sorted(tasks):
89
+ assert task in {"bbox", "segm", "keypoints"}, f"Got unknown task: {task}!"
90
+ coco_eval = (
91
+ _evaluate_predictions_on_coco(
92
+ self._coco_api,
93
+ coco_results,
94
+ task,
95
+ kpt_oks_sigmas=self._kpt_oks_sigmas,
96
+ use_fast_impl=self._use_fast_impl,
97
+ img_ids=img_ids,
98
+ max_dets_per_image=self._max_dets_per_image,
99
+ )
100
+ if len(coco_results) > 0
101
+ else None # cocoapi does not handle empty results very well
102
+ )
103
+
104
+ res = self._derive_coco_results(
105
+ coco_eval, task, class_names=self._metadata.get("thing_classes")
106
+ )
107
+ self._results[task] = res
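
The key difference from the stock COCOEvaluator is that the contiguous-to-dataset id inversion above is guarded by a membership check instead of assuming a dense [0, num_classes) range. A toy illustration of that inversion with made-up ids:

# dataset id -> contiguous training id (sparse, as in COCO where ids have gaps)
thing_dataset_id_to_contiguous_id = {1: 0, 3: 1, 7: 2}
reverse_id_mapping = {v: k for k, v in thing_dataset_id_to_contiguous_id.items()}

predicted_contiguous_ids = [2, 0, 1]
restored = [reverse_id_mapping[c] for c in predicted_contiguous_ids]
print(restored)  # [7, 1, 3] -- dataset category ids expected by the COCO API
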
kmax_deeplab/evaluation/panoptic_evaluation.py ADDED
@@ -0,0 +1,269 @@
1
+ # Reference: https://github.com/cocodataset/panopticapi/blob/master/panopticapi/evaluation.py
2
+ # Reference: https://github.com/open-mmlab/mmdetection/pull/7538
3
+
4
+ #!/usr/bin/env python
5
+ from __future__ import absolute_import
6
+ from __future__ import division
7
+ from __future__ import print_function
8
+ from __future__ import unicode_literals
9
+ import os, sys
10
+ import numpy as np
11
+ import json
12
+ import time
13
+ from datetime import timedelta
14
+ from collections import defaultdict
15
+ import argparse
16
+ import multiprocessing
17
+
18
+ import PIL.Image as Image
19
+
20
+ from panopticapi.utils import get_traceback, rgb2id
21
+
22
+ OFFSET = 256 * 256 * 256
23
+ VOID = 0
24
+
25
+ class PQStatCat():
26
+ def __init__(self):
27
+ self.iou = 0.0
28
+ self.tp = 0
29
+ self.fp = 0
30
+ self.fn = 0
31
+
32
+ def __iadd__(self, pq_stat_cat):
33
+ self.iou += pq_stat_cat.iou
34
+ self.tp += pq_stat_cat.tp
35
+ self.fp += pq_stat_cat.fp
36
+ self.fn += pq_stat_cat.fn
37
+ return self
38
+
39
+
40
+ class PQStat():
41
+ def __init__(self):
42
+ self.pq_per_cat = defaultdict(PQStatCat)
43
+
44
+ def __getitem__(self, i):
45
+ return self.pq_per_cat[i]
46
+
47
+ def __iadd__(self, pq_stat):
48
+ for label, pq_stat_cat in pq_stat.pq_per_cat.items():
49
+ self.pq_per_cat[label] += pq_stat_cat
50
+ return self
51
+
52
+ def pq_average(self, categories, isthing):
53
+ pq, sq, rq, n = 0, 0, 0, 0
54
+ per_class_results = {}
55
+ for label, label_info in categories.items():
56
+ if isthing is not None:
57
+ cat_isthing = label_info['isthing'] == 1
58
+ if isthing != cat_isthing:
59
+ continue
60
+ iou = self.pq_per_cat[label].iou
61
+ tp = self.pq_per_cat[label].tp
62
+ fp = self.pq_per_cat[label].fp
63
+ fn = self.pq_per_cat[label].fn
64
+ if tp + fp + fn == 0:
65
+ per_class_results[label] = {'pq': 0.0, 'sq': 0.0, 'rq': 0.0}
66
+ continue
67
+ n += 1
68
+ pq_class = iou / (tp + 0.5 * fp + 0.5 * fn)
69
+ sq_class = iou / tp if tp != 0 else 0
70
+ rq_class = tp / (tp + 0.5 * fp + 0.5 * fn)
71
+ per_class_results[label] = {'pq': pq_class, 'sq': sq_class, 'rq': rq_class}
72
+ pq += pq_class
73
+ sq += sq_class
74
+ rq += rq_class
75
+
76
+ return {'pq': pq / n, 'sq': sq / n, 'rq': rq / n, 'n': n}, per_class_results
77
+
78
+
79
+ @get_traceback
80
+ def pq_compute_single_core(proc_id, annotation_set, gt_folder, pred_folder, categories):
81
+ pq_stat = PQStat()
82
+
83
+ idx = 0
84
+ for gt_ann, pred_ann in annotation_set:
85
+ if idx % 100 == 0:
86
+ print('Core: {}, {} from {} images processed'.format(proc_id, idx, len(annotation_set)))
87
+ idx += 1
88
+
89
+ pan_gt = np.array(Image.open(os.path.join(gt_folder, gt_ann['file_name'])), dtype=np.uint32)
90
+ pan_gt = rgb2id(pan_gt)
91
+ pan_pred = np.array(Image.open(os.path.join(pred_folder, pred_ann['file_name'])), dtype=np.uint32)
92
+ pan_pred = rgb2id(pan_pred)
93
+
94
+ gt_segms = {el['id']: el for el in gt_ann['segments_info']}
95
+ pred_segms = {el['id']: el for el in pred_ann['segments_info']}
96
+
97
+ # predicted segments area calculation + prediction sanity checks
98
+ pred_labels_set = set(el['id'] for el in pred_ann['segments_info'])
99
+ labels, labels_cnt = np.unique(pan_pred, return_counts=True)
100
+ for label, label_cnt in zip(labels, labels_cnt):
101
+ if label not in pred_segms:
102
+ if label == VOID:
103
+ continue
104
+ raise KeyError('In the image with ID {} segment with ID {} is presented in PNG and not presented in JSON.'.format(gt_ann['image_id'], label))
105
+ pred_segms[label]['area'] = label_cnt
106
+ pred_labels_set.remove(label)
107
+ if pred_segms[label]['category_id'] not in categories:
108
+ raise KeyError('In the image with ID {} segment with ID {} has unknown category_id {}.'.format(gt_ann['image_id'], label, pred_segms[label]['category_id']))
109
+ if len(pred_labels_set) != 0:
110
+ raise KeyError('In the image with ID {} the following segment IDs {} are presented in JSON and not presented in PNG.'.format(gt_ann['image_id'], list(pred_labels_set)))
111
+
112
+ # confusion matrix calculation
113
+ pan_gt_pred = pan_gt.astype(np.uint64) * OFFSET + pan_pred.astype(np.uint64)
114
+ gt_pred_map = {}
115
+ labels, labels_cnt = np.unique(pan_gt_pred, return_counts=True)
116
+ for label, intersection in zip(labels, labels_cnt):
117
+ gt_id = label // OFFSET
118
+ pred_id = label % OFFSET
119
+ gt_pred_map[(gt_id, pred_id)] = intersection
120
+
121
+ # count all matched pairs
122
+ gt_matched = set()
123
+ pred_matched = set()
124
+ for label_tuple, intersection in gt_pred_map.items():
125
+ gt_label, pred_label = label_tuple
126
+ if gt_label not in gt_segms:
127
+ continue
128
+ if pred_label not in pred_segms:
129
+ continue
130
+ if gt_segms[gt_label]['iscrowd'] == 1:
131
+ continue
132
+ if gt_segms[gt_label]['category_id'] != pred_segms[pred_label]['category_id']:
133
+ continue
134
+
135
+ union = pred_segms[pred_label]['area'] + gt_segms[gt_label]['area'] - intersection - gt_pred_map.get((VOID, pred_label), 0)
136
+ iou = intersection / union
137
+ if iou > 0.5:
138
+ pq_stat[gt_segms[gt_label]['category_id']].tp += 1
139
+ pq_stat[gt_segms[gt_label]['category_id']].iou += iou
140
+ gt_matched.add(gt_label)
141
+ pred_matched.add(pred_label)
142
+
143
+ # count false positives
144
+ crowd_labels_dict = {}
145
+ for gt_label, gt_info in gt_segms.items():
146
+ if gt_label in gt_matched:
147
+ continue
148
+ # crowd segments are ignored
149
+ if gt_info['iscrowd'] == 1:
150
+ crowd_labels_dict[gt_info['category_id']] = gt_label
151
+ continue
152
+ pq_stat[gt_info['category_id']].fn += 1
153
+
154
+ # count false positives
155
+ for pred_label, pred_info in pred_segms.items():
156
+ if pred_label in pred_matched:
157
+ continue
158
+ # intersection of the segment with VOID
159
+ intersection = gt_pred_map.get((VOID, pred_label), 0)
160
+ # plus intersection with corresponding CROWD region if it exists
161
+ if pred_info['category_id'] in crowd_labels_dict:
162
+ intersection += gt_pred_map.get((crowd_labels_dict[pred_info['category_id']], pred_label), 0)
163
+ # predicted segment is ignored if more than half of the segment correspond to VOID and CROWD regions
164
+ if intersection / pred_info['area'] > 0.5:
165
+ continue
166
+ pq_stat[pred_info['category_id']].fp += 1
167
+ print('Core: {}, all {} images processed'.format(proc_id, len(annotation_set)))
168
+ return pq_stat
169
+
170
+
171
+ def pq_compute_multi_core(matched_annotations_list, gt_folder, pred_folder, categories):
172
+ cpu_num = multiprocessing.cpu_count()
173
+ annotations_split = np.array_split(matched_annotations_list, cpu_num)
174
+ print("Number of cores: {}, images per core: {}".format(cpu_num, len(annotations_split[0])))
175
+ workers = multiprocessing.Pool(processes=cpu_num)
176
+ processes = []
177
+ for proc_id, annotation_set in enumerate(annotations_split):
178
+ p = workers.apply_async(pq_compute_single_core,
179
+ (proc_id, annotation_set, gt_folder, pred_folder, categories))
180
+ processes.append(p)
181
+
182
+ # https://github.com/open-mmlab/mmdetection/pull/7538
183
+ # Close the process pool, otherwise it will lead to memory
184
+ # leaking problems.
185
+ workers.close()
186
+ workers.join()
187
+
188
+
189
+ pq_stat = PQStat()
190
+ for p in processes:
191
+ pq_stat += p.get()
192
+ return pq_stat
193
+
194
+
195
+ def pq_compute(gt_json_file, pred_json_file, gt_folder=None, pred_folder=None):
196
+
197
+ start_time = time.time()
198
+ with open(gt_json_file, 'r') as f:
199
+ gt_json = json.load(f)
200
+ with open(pred_json_file, 'r') as f:
201
+ pred_json = json.load(f)
202
+
203
+ if gt_folder is None:
204
+ gt_folder = gt_json_file.replace('.json', '')
205
+ if pred_folder is None:
206
+ pred_folder = pred_json_file.replace('.json', '')
207
+ categories = {el['id']: el for el in gt_json['categories']}
208
+
209
+ print("Evaluation panoptic segmentation metrics:")
210
+ print("Ground truth:")
211
+ print("\tSegmentation folder: {}".format(gt_folder))
212
+ print("\tJSON file: {}".format(gt_json_file))
213
+ print("Prediction:")
214
+ print("\tSegmentation folder: {}".format(pred_folder))
215
+ print("\tJSON file: {}".format(pred_json_file))
216
+
217
+ if not os.path.isdir(gt_folder):
218
+ raise Exception("Folder {} with ground truth segmentations doesn't exist".format(gt_folder))
219
+ if not os.path.isdir(pred_folder):
220
+ raise Exception("Folder {} with predicted segmentations doesn't exist".format(pred_folder))
221
+
222
+ pred_annotations = {el['image_id']: el for el in pred_json['annotations']}
223
+ matched_annotations_list = []
224
+ for gt_ann in gt_json['annotations']:
225
+ image_id = gt_ann['image_id']
226
+ if image_id not in pred_annotations:
227
+ raise Exception('no prediction for the image with id: {}'.format(image_id))
228
+ matched_annotations_list.append((gt_ann, pred_annotations[image_id]))
229
+
230
+ pq_stat = pq_compute_multi_core(matched_annotations_list, gt_folder, pred_folder, categories)
231
+
232
+ metrics = [("All", None), ("Things", True), ("Stuff", False)]
233
+ results = {}
234
+ for name, isthing in metrics:
235
+ results[name], per_class_results = pq_stat.pq_average(categories, isthing=isthing)
236
+ if name == 'All':
237
+ results['per_class'] = per_class_results
238
+ print("{:10s}| {:>5s} {:>5s} {:>5s} {:>5s}".format("", "PQ", "SQ", "RQ", "N"))
239
+ print("-" * (10 + 7 * 4))
240
+
241
+ for name, _isthing in metrics:
242
+ print("{:10s}| {:5.1f} {:5.1f} {:5.1f} {:5d}".format(
243
+ name,
244
+ 100 * results[name]['pq'],
245
+ 100 * results[name]['sq'],
246
+ 100 * results[name]['rq'],
247
+ results[name]['n'])
248
+ )
249
+
250
+ t_delta = time.time() - start_time
251
+ print("Time elapsed: {:0.2f} seconds".format(t_delta))
252
+
253
+ return results
254
+
255
+
256
+ if __name__ == "__main__":
257
+ parser = argparse.ArgumentParser()
258
+ parser.add_argument('--gt_json_file', type=str,
259
+ help="JSON file with ground truth data")
260
+ parser.add_argument('--pred_json_file', type=str,
261
+ help="JSON file with predictions data")
262
+ parser.add_argument('--gt_folder', type=str, default=None,
263
+ help="Folder with ground turth COCO format segmentations. \
264
+ Default: X if the corresponding json file is X.json")
265
+ parser.add_argument('--pred_folder', type=str, default=None,
266
+ help="Folder with prediction COCO format segmentations. \
267
+ Default: X if the corresponding json file is X.json")
268
+ args = parser.parse_args()
269
+ pq_compute(args.gt_json_file, args.pred_json_file, args.gt_folder, args.pred_folder)
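
For reference, the per-category statistics accumulated above reduce to PQ = sum(IoU) / (TP + 0.5*FP + 0.5*FN), SQ = sum(IoU) / TP and RQ = TP / (TP + 0.5*FP + 0.5*FN), so PQ = SQ * RQ. A worked toy example for a single category (numbers invented):

iou_sum, tp, fp, fn = 1.6, 2, 1, 1  # two matches with IoU 0.9 and 0.7, one FP, one FN
rq = tp / (tp + 0.5 * fp + 0.5 * fn)       # 2 / 3
sq = iou_sum / tp                          # 0.8
pq = iou_sum / (tp + 0.5 * fp + 0.5 * fn)  # 1.6 / 3
assert abs(pq - sq * rq) < 1e-9            # PQ factorizes into SQ * RQ
print(round(pq, 3), round(sq, 3), round(rq, 3))  # 0.533 0.8 0.667
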
kmax_deeplab/kmax_model.py ADDED
@@ -0,0 +1,446 @@
1
+ # Reference: https://github.com/facebookresearch/Mask2Former/blob/main/mask2former/maskformer_model.py
2
+ # Reference: https://github.com/google-research/deeplab2/blob/main/model/kmax_deeplab.py
3
+ # Reference: https://github.com/google-research/deeplab2/blob/main/model/post_processor/max_deeplab.py
4
+ # Modified by Qihang Yu
5
+
6
+ from typing import Tuple, List
7
+
8
+ import torch
9
+ from torch import nn
10
+ from torch.nn import functional as F
11
+
12
+ from detectron2.config import configurable
13
+ from detectron2.data import MetadataCatalog
14
+ from detectron2.modeling import META_ARCH_REGISTRY, build_backbone, build_sem_seg_head
15
+ from detectron2.modeling.backbone import Backbone
16
+ from detectron2.modeling.postprocessing import sem_seg_postprocess
17
+ from detectron2.structures import Boxes, ImageList, Instances
18
+ from detectron2.utils.memory import retry_if_cuda_oom
19
+
20
+ from .modeling.criterion import SetCriterion
21
+ from .modeling.matcher import HungarianMatcher
22
+ from torch.cuda.amp import autocast
23
+
24
+
25
+ @META_ARCH_REGISTRY.register()
26
+ class kMaXDeepLab(nn.Module):
27
+ """
28
+ Main class for the kMaX-DeepLab mask classification segmentation architecture.
29
+ """
30
+
31
+ @configurable
32
+ def __init__(
33
+ self,
34
+ *,
35
+ backbone: Backbone,
36
+ sem_seg_head: nn.Module,
37
+ criterion: nn.Module,
38
+ num_queries: int,
39
+ object_mask_threshold: float,
40
+ class_threshold_thing: float,
41
+ class_threshold_stuff: float,
42
+ overlap_threshold: float,
43
+ reorder_class_weight: float,
44
+ reorder_mask_weight: float,
45
+ metadata,
46
+ size_divisibility: int,
47
+ sem_seg_postprocess_before_inference: bool,
48
+ pixel_mean: Tuple[float],
49
+ pixel_std: Tuple[float],
50
+ # inference
51
+ semantic_on: bool,
52
+ panoptic_on: bool,
53
+ instance_on: bool,
54
+ test_topk_per_image: int,
55
+ input_shape: List[int]
56
+ ):
57
+ """
58
+ Args:
59
+ backbone: a backbone module, must follow detectron2's backbone interface
60
+ sem_seg_head: a module that predicts semantic segmentation from backbone features
61
+ criterion: a module that defines the loss
62
+ num_queries: int, number of queries
63
+ object_mask_threshold: float, threshold to filter query based on classification score
64
+ for panoptic segmentation inference
65
+ overlap_threshold: overlap threshold used in general inference for panoptic segmentation
66
+ metadata: dataset meta, get `thing` and `stuff` category names for panoptic
67
+ segmentation inference
68
+ size_divisibility: Some backbones require the input height and width to be divisible by a
69
+ specific integer. We can use this to override such requirement.
70
+ sem_seg_postprocess_before_inference: whether to resize the prediction back
71
+ to original input size before semantic segmentation inference or after.
72
+ For high-resolution dataset like Mapillary, resizing predictions before
73
+ inference will cause OOM error.
74
+ pixel_mean, pixel_std: list or tuple with #channels element, representing
75
+ the per-channel mean and std to be used to normalize the input image
76
+ semantic_on: bool, whether to output semantic segmentation prediction
77
+ instance_on: bool, whether to output instance segmentation prediction
78
+ panoptic_on: bool, whether to output panoptic segmentation prediction
79
+ test_topk_per_image: int, instance segmentation parameter, keep topk instances per image
80
+ """
81
+ super().__init__()
82
+ self.backbone = backbone
83
+ self.sem_seg_head = sem_seg_head
84
+ self.criterion = criterion
85
+ self.num_queries = num_queries
86
+ self.overlap_threshold = overlap_threshold
87
+ self.object_mask_threshold = object_mask_threshold
88
+ self.class_threshold_thing = class_threshold_thing
89
+ self.class_threshold_stuff = class_threshold_stuff
90
+ self.reorder_class_weight = reorder_class_weight
91
+ self.reorder_mask_weight = reorder_mask_weight
92
+ self.metadata = metadata
93
+ if size_divisibility < 0:
94
+ # use backbone size_divisibility if not set
95
+ size_divisibility = self.backbone.size_divisibility
96
+ self.size_divisibility = size_divisibility
97
+ self.sem_seg_postprocess_before_inference = sem_seg_postprocess_before_inference
98
+ self.register_buffer("pixel_mean", torch.Tensor(pixel_mean).view(-1, 1, 1), False)
99
+ self.register_buffer("pixel_std", torch.Tensor(pixel_std).view(-1, 1, 1), False)
100
+
101
+ # additional args
102
+ self.semantic_on = semantic_on
103
+ self.instance_on = instance_on
104
+ self.panoptic_on = panoptic_on
105
+ self.test_topk_per_image = test_topk_per_image
106
+
107
+ if not self.semantic_on:
108
+ assert self.sem_seg_postprocess_before_inference
109
+
110
+ self.input_shape = input_shape
111
+
112
+ @classmethod
113
+ def from_config(cls, cfg):
114
+ backbone = build_backbone(cfg)
115
+ sem_seg_head = build_sem_seg_head(cfg, backbone.output_shape())
116
+
117
+ # Loss parameters:
118
+ deep_supervision = cfg.MODEL.KMAX_DEEPLAB.DEEP_SUPERVISION
119
+ no_object_weight = cfg.MODEL.KMAX_DEEPLAB.NO_OBJECT_WEIGHT
120
+ share_final_matching = cfg.MODEL.KMAX_DEEPLAB.SHARE_FINAL_MATCHING
121
+
122
+ # loss weights
123
+ class_weight = cfg.MODEL.KMAX_DEEPLAB.CLASS_WEIGHT
124
+ dice_weight = cfg.MODEL.KMAX_DEEPLAB.DICE_WEIGHT
125
+ mask_weight = cfg.MODEL.KMAX_DEEPLAB.MASK_WEIGHT
126
+ insdis_weight = cfg.MODEL.KMAX_DEEPLAB.INSDIS_WEIGHT
127
+ aux_semantic_weight = cfg.MODEL.KMAX_DEEPLAB.AUX_SEMANTIC_WEIGHT
128
+
129
+ # building criterion
130
+ matcher = HungarianMatcher()
131
+
132
+ weight_dict = {"loss_ce": class_weight, "loss_mask": mask_weight, "loss_dice": dice_weight,
133
+ "loss_pixel_insdis": insdis_weight, "loss_aux_semantic": aux_semantic_weight}
134
+
135
+ if deep_supervision:
136
+ dec_layers = sum(cfg.MODEL.KMAX_DEEPLAB.TRANS_DEC.DEC_LAYERS)
137
+ aux_weight_dict = {}
138
+ for i in range(dec_layers):
139
+ aux_weight_dict.update({k + f"_{i}": v for k, v in weight_dict.items()})
140
+ weight_dict.update(aux_weight_dict)
141
+
142
+ losses = ["labels", "masks"]
143
+ if insdis_weight > 0:
144
+ losses += ["pixels"]
145
+ if aux_semantic_weight > 0:
146
+ losses += ["aux_semantic"]
147
+
148
+ criterion = SetCriterion(
149
+ sem_seg_head.num_classes,
150
+ matcher=matcher,
151
+ weight_dict=weight_dict,
152
+ eos_coef=no_object_weight,
153
+ losses=losses,
154
+ share_final_matching=share_final_matching,
155
+ pixel_insdis_temperature=cfg.MODEL.KMAX_DEEPLAB.PIXEL_INSDIS_TEMPERATURE,
156
+ pixel_insdis_sample_k=cfg.MODEL.KMAX_DEEPLAB.PIXEL_INSDIS_SAMPLE_K,
157
+ aux_semantic_temperature=cfg.MODEL.KMAX_DEEPLAB.AUX_SEMANTIC_TEMPERATURE,
158
+ aux_semantic_sample_k=cfg.MODEL.KMAX_DEEPLAB.UX_SEMANTIC_SAMPLE_K
159
+ )
160
+
161
+ return {
162
+ "backbone": backbone,
163
+ "sem_seg_head": sem_seg_head,
164
+ "criterion": criterion,
165
+ "num_queries": cfg.MODEL.KMAX_DEEPLAB.TRANS_DEC.NUM_OBJECT_QUERIES,
166
+ "object_mask_threshold": cfg.MODEL.KMAX_DEEPLAB.TEST.OBJECT_MASK_THRESHOLD,
167
+ "class_threshold_thing": cfg.MODEL.KMAX_DEEPLAB.TEST.CLASS_THRESHOLD_THING,
168
+ "class_threshold_stuff": cfg.MODEL.KMAX_DEEPLAB.TEST.CLASS_THRESHOLD_STUFF,
169
+ "overlap_threshold": cfg.MODEL.KMAX_DEEPLAB.TEST.OVERLAP_THRESHOLD,
170
+ "reorder_class_weight": cfg.MODEL.KMAX_DEEPLAB.TEST.REORDER_CLASS_WEIGHT,
171
+ "reorder_mask_weight": cfg.MODEL.KMAX_DEEPLAB.TEST.REORDER_MASK_WEIGHT,
172
+ "metadata": MetadataCatalog.get(cfg.DATASETS.TRAIN[0]),
173
+ "size_divisibility": cfg.MODEL.KMAX_DEEPLAB.SIZE_DIVISIBILITY,
174
+ "sem_seg_postprocess_before_inference": (
175
+ cfg.MODEL.KMAX_DEEPLAB.TEST.SEM_SEG_POSTPROCESSING_BEFORE_INFERENCE
176
+ or cfg.MODEL.KMAX_DEEPLAB.TEST.PANOPTIC_ON
177
+ or cfg.MODEL.KMAX_DEEPLAB.TEST.INSTANCE_ON
178
+ ),
179
+ "pixel_mean": cfg.MODEL.PIXEL_MEAN,
180
+ "pixel_std": cfg.MODEL.PIXEL_STD,
181
+ # inference
182
+ "semantic_on": cfg.MODEL.KMAX_DEEPLAB.TEST.SEMANTIC_ON,
183
+ "instance_on": cfg.MODEL.KMAX_DEEPLAB.TEST.INSTANCE_ON,
184
+ "panoptic_on": cfg.MODEL.KMAX_DEEPLAB.TEST.PANOPTIC_ON,
185
+ "test_topk_per_image": cfg.TEST.DETECTIONS_PER_IMAGE,
186
+ "input_shape": cfg.INPUT.IMAGE_SIZE
187
+ }
188
+
189
+ @property
190
+ def device(self):
191
+ return self.pixel_mean.device
192
+
193
+ def forward(self, batched_inputs):
194
+ """
195
+ Args:
196
+ batched_inputs: a list, batched outputs of :class:`DatasetMapper`.
197
+ Each item in the list contains the inputs for one image.
198
+ For now, each item in the list is a dict that contains:
199
+ * "image": Tensor, image in (C, H, W) format.
200
+ * "instances": per-region ground truth
201
+ * Other information that's included in the original dicts, such as:
202
+ "height", "width" (int): the output resolution of the model (may be different
203
+ from input resolution), used in inference.
204
+ Returns:
205
+ list[dict]:
206
+ each dict has the results for one image. The dict contains the following keys:
207
+
208
+ * "sem_seg":
209
+ A Tensor that represents the
210
+ per-pixel segmentation prediced by the head.
211
+ The prediction has shape KxHxW that represents the logits of
212
+ each class for each pixel.
213
+ * "panoptic_seg":
214
+ A tuple that represent panoptic output
215
+ panoptic_seg (Tensor): of shape (height, width) where the values are ids for each segment.
216
+ segments_info (list[dict]): Describe each segment in `panoptic_seg`.
217
+ Each dict contains keys "id", "category_id", "isthing".
218
+ """
219
+ images = [x["image"].to(self.device) for x in batched_inputs]
220
+ images = [(x - self.pixel_mean) / self.pixel_std for x in images]
221
+ if "is_real_pixels" in batched_inputs[0]:
222
+ is_real_pixels = [x["is_real_pixels"] for x in batched_inputs]
223
+ # Set all padded pixel values to 0.
224
+ images = [x * y.to(x) for x, y in zip(images, is_real_pixels)]
225
+
226
+ # We perform zero padding to ensure input shape equal to self.input_shape.
227
+ # The padding is done on the right and bottom sides.
228
+ for idx in range(len(images)):
229
+ cur_height, cur_width = images[idx].shape[-2:]
230
+ padding = (0, max(0, self.input_shape[1] - cur_width), 0, max(0, self.input_shape[0] - cur_height), 0, 0)
231
+ images[idx] = F.pad(images[idx], padding, value=0)
232
+ images = ImageList.from_tensors(images, -1)
233
+
234
+ if self.training:
235
+ # mask classification target
236
+ if "instances" in batched_inputs[0]:
237
+ gt_instances = [x["instances"].to(self.device) for x in batched_inputs]
238
+ gt_semantic = [x["sem_seg_gt"].to(self.device) for x in batched_inputs]
239
+ targets = self.prepare_targets(gt_instances, gt_semantic, images)
240
+ else:
241
+ targets = None
242
+
243
+ features = self.backbone(images.tensor)
244
+ outputs = self.sem_seg_head(features)
245
+
246
+ if self.training:
247
+
248
+ with autocast(enabled=False):
249
+ # bipartite matching-based loss
250
+ for output_key in ["pixel_feature", "pred_masks", "pred_logits", "aux_semantic_pred"]:
251
+ if output_key in outputs:
252
+ outputs[output_key] = outputs[output_key].float()
253
+ for i in range(len(outputs["aux_outputs"])):
254
+ for output_key in ["pixel_feature", "pred_masks", "pred_logits"]:
255
+ outputs["aux_outputs"][i][output_key] = outputs["aux_outputs"][i][output_key].float()
256
+
257
+ losses = self.criterion(outputs, targets)
258
+
259
+ for k in list(losses.keys()):
260
+ if k in self.criterion.weight_dict:
261
+ losses[k] *= self.criterion.weight_dict[k]
262
+ else:
263
+ # remove this loss if not specified in `weight_dict`
264
+ losses.pop(k)
265
+ return losses
266
+ else:
267
+ mask_cls_results = outputs["pred_logits"]
268
+ mask_pred_results = outputs["pred_masks"]
269
+
270
+ align_corners = (images.tensor.shape[-1] % 2 == 1)
271
+ # upsample masks
272
+ mask_pred_results = F.interpolate(
273
+ mask_pred_results,
274
+ size=(images.tensor.shape[-2], images.tensor.shape[-1]),
275
+ mode="bilinear",
276
+ align_corners=align_corners,
277
+ )
278
+
279
+ del outputs
280
+
281
+ processed_results = []
282
+ for mask_cls_result, mask_pred_result, input_per_image, image_size in zip(
283
+ mask_cls_results, mask_pred_results, batched_inputs, images.image_sizes
284
+ ):
285
+ height = input_per_image.get("height", image_size[0])
286
+ width = input_per_image.get("width", image_size[1])
287
+ cur_image = input_per_image["image"].to(self.device)
288
+ processed_results.append({})
289
+ scale_factor = max(images.tensor.shape[-2:]) / max(height, width)
290
+ ori_height, ori_width = round(height * scale_factor), round(width * scale_factor)
291
+ mask_pred_result = mask_pred_result[:, :ori_height, :ori_width].expand(1, -1, -1, -1)
292
+ cur_image = cur_image[:, :ori_height, :ori_width].expand(1, -1, -1, -1)
293
+ mask_pred_result = F.interpolate(
294
+ mask_pred_result, size=(height, width), mode="bilinear", align_corners=align_corners
295
+ )[0]
296
+ cur_image = F.interpolate(
297
+ cur_image.float(), size=(height, width), mode="bilinear", align_corners=align_corners
298
+ )[0].to(torch.uint8)
299
+
300
+ if self.sem_seg_postprocess_before_inference:
301
+ mask_cls_result = mask_cls_result.to(mask_pred_result)
302
+
303
+ # semantic segmentation inference
304
+ if self.semantic_on:
305
+ r = retry_if_cuda_oom(self.semantic_inference)(mask_cls_result, mask_pred_result)
306
+ if not self.sem_seg_postprocess_before_inference:
307
+ r = retry_if_cuda_oom(sem_seg_postprocess)(r, image_size, height, width)
308
+ processed_results[-1]["sem_seg"] = r
309
+
310
+ # panoptic segmentation inference
311
+ if self.panoptic_on:
312
+ panoptic_r = retry_if_cuda_oom(self.panoptic_inference)(mask_cls_result, mask_pred_result)
313
+ processed_results[-1]["panoptic_seg"] = panoptic_r
314
+ processed_results[-1]["original_image"] = cur_image
315
+
316
+ # instance segmentation inference
317
+ if self.instance_on:
318
+ instance_r = retry_if_cuda_oom(self.instance_inference)(mask_cls_result, mask_pred_result)
319
+ processed_results[-1]["instances"] = instance_r
320
+
321
+ return processed_results
322
+
323
+ def prepare_targets(self, targets, targets_semantic, images):
324
+ new_targets = []
325
+ for targets_per_image, semantic_gt_mask in zip(targets, targets_semantic):
326
+ gt_masks = targets_per_image.gt_masks
327
+ new_targets.append(
328
+ {
329
+ "labels": targets_per_image.gt_classes,
330
+ "masks": gt_masks,
331
+ "semantic_masks": semantic_gt_mask
332
+ }
333
+ )
334
+ return new_targets
335
+
336
+ def semantic_inference(self, mask_cls, mask_pred):
337
+ # For the class probabilities, we exclude the void class, following
338
+ # https://github.com/google-research/deeplab2/blob/main/model/post_processor/max_deeplab.py#L199
339
+ mask_cls = F.softmax(mask_cls, dim=-1)[..., :-1]
340
+ mask_pred = F.softmax(mask_pred, dim=0)
341
+ semseg = torch.einsum("qc,qhw->chw", mask_cls, mask_pred)
342
+ return semseg
343
+
344
+ def panoptic_inference(self, mask_cls, mask_pred):
345
+ # mask_cls: N x C
346
+ # mask_pred: N x H x W
347
+ # some hyper-params
348
+ num_mask_slots = mask_pred.shape[0]
349
+ cls_threshold_thing = self.class_threshold_thing
350
+ cls_threshold_stuff = self.class_threshold_stuff
351
+ object_mask_threshold = self.object_mask_threshold
352
+ overlap_threshold = self.overlap_threshold
353
+ reorder_class_weight = self.reorder_class_weight
354
+ reorder_mask_weight = self.reorder_mask_weight
355
+
356
+ # https://github.com/google-research/deeplab2/blob/main/model/post_processor/max_deeplab.py#L675
357
+ # https://github.com/google-research/deeplab2/blob/main/model/post_processor/max_deeplab.py#L199
358
+ cls_scores, cls_labels = F.softmax(mask_cls, dim=-1)[..., :-1].max(-1) # N
359
+ mask_scores = F.softmax(mask_pred, dim=0)
360
+ binary_masks = mask_scores > object_mask_threshold # N x H x W
361
+ mask_scores_flat = mask_scores.flatten(1) # N x HW
362
+ binary_masks_flat = binary_masks.flatten(1).float() # N x HW
363
+ pixel_number_flat = binary_masks_flat.sum(1) # N
364
+ mask_scores_flat = (mask_scores_flat * binary_masks_flat).sum(1) / torch.clamp(pixel_number_flat, min=1.0) # N
365
+
366
+ reorder_score = (cls_scores ** reorder_class_weight) * (mask_scores_flat ** reorder_mask_weight) # N
367
+ reorder_indices = torch.argsort(reorder_score, dim=-1, descending=True)
368
+
369
+ panoptic_seg = torch.zeros((mask_pred.shape[1], mask_pred.shape[2]),
370
+ dtype=torch.int32, device=mask_pred.device)
371
+ segments_info = []
372
+
373
+ current_segment_id = 0
374
+ stuff_memory_list = {}
375
+ for i in range(num_mask_slots):
376
+ cur_idx = reorder_indices[i].item() # 1
377
+ cur_binary_mask = binary_masks[cur_idx] # H x W
378
+ cur_cls_score = cls_scores[cur_idx].item() # 1
379
+ cur_cls_label = cls_labels[cur_idx].item() # 1
380
+ is_thing = cur_cls_label in self.metadata.thing_dataset_id_to_contiguous_id.values()
381
+ is_confident = (is_thing and cur_cls_score > cls_threshold_thing) or (
382
+ (not is_thing) and cur_cls_score > cls_threshold_stuff)
383
+
384
+ original_pixel_number = cur_binary_mask.float().sum()
385
+ new_binary_mask = torch.logical_and(cur_binary_mask, (panoptic_seg == 0))
386
+ new_pixel_number = new_binary_mask.float().sum()
387
+ is_not_overlap_too_much = new_pixel_number > (original_pixel_number * overlap_threshold)
388
+
389
+ if is_confident and is_not_overlap_too_much:
390
+ # merge stuff regions
391
+ if not is_thing:
392
+ if int(cur_cls_label) in stuff_memory_list.keys():
393
+ panoptic_seg[new_binary_mask] = stuff_memory_list[int(cur_cls_label)]
394
+ continue
395
+ else:
396
+ stuff_memory_list[int(cur_cls_label)] = current_segment_id + 1
397
+
398
+ current_segment_id += 1
399
+ panoptic_seg[new_binary_mask] = current_segment_id
400
+
401
+ segments_info.append(
402
+ {
403
+ "id": current_segment_id,
404
+ "isthing": bool(is_thing),
405
+ "category_id": int(cur_cls_label),
406
+ }
407
+ )
408
+
409
+ return panoptic_seg, segments_info
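# Illustrative sketch (not part of the committed file): how the reorder score above ranks mask slots.
# With reorder_class_weight = reorder_mask_weight = 1.0:
#   cls_scores       = torch.tensor([0.9, 0.5])
#   mask_scores_flat = torch.tensor([0.4, 0.6])
#   reorder_score    = cls_scores ** 1.0 * mask_scores_flat ** 1.0   # tensor([0.36, 0.30])
#   torch.argsort(reorder_score, descending=True)                    # tensor([0, 1])
# so the slot with the higher combined confidence claims overlapping pixels first in the greedy loop.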
410
+
411
+
412
+ def instance_inference(self, mask_cls, mask_pred):
413
+ # mask_pred is already processed to have the same shape as original input
414
+ image_size = mask_pred.shape[-2:]
415
+
416
+ mask_pred = mask_pred.softmax(dim=0)
417
+ # [Q, K]
418
+ scores = F.softmax(mask_cls[:, :-1], dim=-1)
419
+ labels = torch.arange(self.sem_seg_head.num_classes, device=self.device).unsqueeze(0).repeat(self.num_queries, 1).flatten(0, 1)
420
+ scores_per_image, topk_indices = scores.flatten(0, 1).topk(self.test_topk_per_image, sorted=False)
421
+ labels_per_image = labels[topk_indices]
422
+
423
+ topk_indices = topk_indices // self.sem_seg_head.num_classes
424
+ mask_pred = mask_pred[topk_indices]
425
+
426
+ # if this is panoptic segmentation, we only keep the "thing" classes
427
+ if self.panoptic_on:
428
+ keep = torch.zeros_like(scores_per_image).bool()
429
+ for i, lab in enumerate(labels_per_image):
430
+ keep[i] = lab in self.metadata.thing_dataset_id_to_contiguous_id.values()
431
+
432
+ scores_per_image = scores_per_image[keep]
433
+ labels_per_image = labels_per_image[keep]
434
+ mask_pred = mask_pred[keep]
435
+
436
+ result = Instances(image_size)
437
+ result.pred_masks = (mask_pred > self.object_mask_threshold).float()
438
+ result.pred_boxes = Boxes(torch.zeros(mask_pred.size(0), 4))
439
+ # Uncomment the following to get boxes from masks (this is slow)
440
+ # result.pred_boxes = BitMasks(mask_pred > 0).get_bounding_boxes()
441
+
442
+ # calculate average mask prob
443
+ mask_scores_per_image = (mask_pred.flatten(1) * result.pred_masks.flatten(1)).sum(1) / (result.pred_masks.flatten(1).sum(1) + 1e-6)
444
+ result.scores = scores_per_image * mask_scores_per_image
445
+ result.pred_classes = labels_per_image
446
+ return result
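A minimal, self-contained sketch (illustrative only, not part of the committed file; it assumes just PyTorch and made-up sizes) of the flattened top-k trick used in instance_inference above, showing how `topk_indices // num_classes` recovers the query index and `labels[topk_indices]` the class label:

import torch

num_queries, num_classes, k = 3, 4, 2                      # hypothetical sizes
scores = torch.rand(num_queries, num_classes)               # stand-in for F.softmax(mask_cls[:, :-1], dim=-1)
labels = torch.arange(num_classes).unsqueeze(0).repeat(num_queries, 1).flatten(0, 1)

topk_scores, topk_indices = scores.flatten(0, 1).topk(k, sorted=False)
topk_labels = labels[topk_indices]                          # class id of each kept prediction
topk_queries = topk_indices // num_classes                  # which mask slot each prediction came from
print(topk_labels, topk_queries)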
kmax_deeplab/modeling/__init__.py ADDED
@@ -0,0 +1,4 @@
1
+ from .backbone.convnext import D2ConvNeXt
2
+ from .backbone.resnet import custom_bn_build_resnet_backbone
3
+ from .pixel_decoder.kmax_pixel_decoder import kMaXPixelDecoder
4
+ from .meta_arch.kmax_deeplab_head import kMaXDeepLabHead
kmax_deeplab/modeling/backbone/__init__.py ADDED
File without changes
kmax_deeplab/modeling/backbone/convnext.py ADDED
@@ -0,0 +1,210 @@
1
+ # reference: https://github.com/SHI-Labs/OneFormer/blob/main/oneformer/modeling/backbone/convnext.py
2
+
3
+
4
+ from functools import partial
5
+
6
+ import torch
7
+ import torch.nn as nn
8
+ import torch.nn.functional as F
9
+ from timm.models.layers import DropPath
10
+
11
+ from detectron2.modeling import BACKBONE_REGISTRY, Backbone, ShapeSpec
12
+ from torch.cuda.amp import autocast
13
+
14
+
15
+ class Block(nn.Module):
16
+ r""" ConvNeXt Block. There are two equivalent implementations:
17
+ (1) DwConv -> LayerNorm (channels_first) -> 1x1 Conv -> GELU -> 1x1 Conv; all in (N, C, H, W)
18
+ (2) DwConv -> Permute to (N, H, W, C); LayerNorm (channels_last) -> Linear -> GELU -> Linear; Permute back
19
+ We use (2) as we find it slightly faster in PyTorch
20
+
21
+ Args:
22
+ dim (int): Number of input channels.
23
+ drop_path (float): Stochastic depth rate. Default: 0.0
24
+ layer_scale_init_value (float): Init value for Layer Scale. Default: 1e-6.
25
+ """
26
+ def __init__(self, dim, drop_path=0., layer_scale_init_value=1e-6):
27
+ super().__init__()
28
+ self.dwconv = nn.Conv2d(dim, dim, kernel_size=7, padding=3, groups=dim) # depthwise conv
29
+ self.norm = LayerNorm(dim, eps=1e-6)
30
+ self.pwconv1 = nn.Linear(dim, 4 * dim) # pointwise/1x1 convs, implemented with linear layers
31
+ self.act = nn.GELU()
32
+ self.pwconv2 = nn.Linear(4 * dim, dim)
33
+ self.gamma = nn.Parameter(layer_scale_init_value * torch.ones((dim)),
34
+ requires_grad=True) if layer_scale_init_value > 0 else None
35
+ self.drop_path = DropPath(drop_path) if drop_path > 0. else nn.Identity()
36
+
37
+ def forward(self, x):
38
+ input = x
39
+ x = self.dwconv(x)
40
+ x = x.permute(0, 2, 3, 1) # (N, C, H, W) -> (N, H, W, C)
41
+ x = self.norm(x)
42
+ x = self.pwconv1(x)
43
+ x = self.act(x)
44
+ x = self.pwconv2(x)
45
+ if self.gamma is not None:
46
+ x = self.gamma * x
47
+ x = x.permute(0, 3, 1, 2) # (N, H, W, C) -> (N, C, H, W)
48
+
49
+ x = input + self.drop_path(x)
50
+ return x
51
+
52
+ class LayerNorm(nn.Module):
53
+ r""" LayerNorm that supports two data formats: channels_last (default) or channels_first.
54
+ The ordering of the dimensions in the inputs. channels_last corresponds to inputs with
55
+ shape (batch_size, height, width, channels) while channels_first corresponds to inputs
56
+ with shape (batch_size, channels, height, width).
57
+ """
58
+ def __init__(self, normalized_shape, eps=1e-6, data_format="channels_last"):
59
+ super().__init__()
60
+ self.weight = nn.Parameter(torch.ones(normalized_shape))
61
+ self.bias = nn.Parameter(torch.zeros(normalized_shape))
62
+ self.eps = eps
63
+ self.data_format = data_format
64
+ if self.data_format not in ["channels_last", "channels_first"]:
65
+ raise NotImplementedError
66
+ self.normalized_shape = (normalized_shape, )
67
+
68
+ def forward(self, x):
69
+ with autocast(enabled=False):
70
+ x = x.float()
71
+ if self.data_format == "channels_last":
72
+ return F.layer_norm(x, self.normalized_shape, self.weight, self.bias, self.eps)
73
+ elif self.data_format == "channels_first":
74
+ u = x.mean(1, keepdim=True)
75
+ s = (x - u).pow(2).mean(1, keepdim=True)
76
+ x = (x - u) / torch.sqrt(s + self.eps)
77
+ x = self.weight[:, None, None] * x + self.bias[:, None, None]
78
+ return x
79
+
80
+
81
+ class ConvNeXt(nn.Module):
82
+ r""" ConvNeXt
83
+ A PyTorch impl of : `A ConvNet for the 2020s` -
84
+ https://arxiv.org/pdf/2201.03545.pdf
85
+
86
+ Args:
87
+ in_chans (int): Number of input image channels. Default: 3
88
+ num_classes (int): Number of classes for classification head. Default: 1000
89
+ depths (tuple(int)): Number of blocks at each stage. Default: [3, 3, 9, 3]
90
+ dims (int): Feature dimension at each stage. Default: [96, 192, 384, 768]
91
+ drop_path_rate (float): Stochastic depth rate. Default: 0.
92
+ layer_scale_init_value (float): Init value for Layer Scale. Default: 1e-6.
93
+ head_init_scale (float): Init scaling value for classifier weights and biases. Default: 1.
94
+ """
95
+ def __init__(self, in_chans=3, depths=[3, 3, 9, 3], dims=[96, 192, 384, 768],
96
+ drop_path_rate=0., layer_scale_init_value=1e-6, out_indices=[0, 1, 2, 3],
97
+ ):
98
+ super().__init__()
99
+
100
+ self.num_features = dims
101
+
102
+ self.downsample_layers = nn.ModuleList() # stem and 3 intermediate downsampling conv layers
103
+ stem = nn.Sequential(
104
+ nn.Conv2d(in_chans, dims[0], kernel_size=4, stride=4),
105
+ LayerNorm(dims[0], eps=1e-6, data_format="channels_first")
106
+ )
107
+ self.downsample_layers.append(stem)
108
+ for i in range(3):
109
+ downsample_layer = nn.Sequential(
110
+ LayerNorm(dims[i], eps=1e-6, data_format="channels_first"),
111
+ nn.Conv2d(dims[i], dims[i+1], kernel_size=2, stride=2),
112
+ )
113
+ self.downsample_layers.append(downsample_layer)
114
+
115
+ self.stages = nn.ModuleList() # 4 feature resolution stages, each consisting of multiple residual blocks
116
+ dp_rates=[x.item() for x in torch.linspace(0, drop_path_rate, sum(depths))]
117
+ cur = 0
118
+ for i in range(4):
119
+ stage = nn.Sequential(
120
+ *[Block(dim=dims[i], drop_path=dp_rates[cur + j],
121
+ layer_scale_init_value=layer_scale_init_value) for j in range(depths[i])]
122
+ )
123
+ self.stages.append(stage)
124
+ cur += depths[i]
125
+
126
+ self.out_indices = out_indices
127
+
128
+ def forward_features(self, x):
129
+ outs = {}
130
+ for i in range(4):
131
+ # We add zero padding here for downstream tasks.
132
+ # ref: https://github.com/google-research/deeplab2/blob/main/model/pixel_encoder/convnext.py#L128
133
+ if i == 0:
134
+ x = F.pad(x, (1, 2, 1, 2, 0, 0, 0, 0), "constant", 0)
135
+ else:
136
+ x = F.pad(x, (0, 1, 0, 1, 0, 0, 0, 0), "constant", 0)
137
+ x = self.downsample_layers[i](x)
138
+ x = self.stages[i](x)
139
+ if i in self.out_indices:
140
+ outs["res{}".format(i + 2)] = x
141
+
142
+ return outs
143
+
144
+ def forward(self, x):
145
+ x = self.forward_features(x)
146
+ return x
147
+
148
+ @BACKBONE_REGISTRY.register()
149
+ class D2ConvNeXt(ConvNeXt, Backbone):
150
+ def __init__(self, cfg, input_shape):
151
+
152
+ in_chans = cfg.MODEL.CONVNEXT.IN_CHANNELS
153
+ depths = cfg.MODEL.CONVNEXT.DEPTHS
154
+ dims = cfg.MODEL.CONVNEXT.DIMS
155
+ drop_path_rate = cfg.MODEL.CONVNEXT.DROP_PATH_RATE
156
+ layer_scale_init_value = cfg.MODEL.CONVNEXT.LSIT
157
+ out_indices = cfg.MODEL.CONVNEXT.OUT_INDICES
158
+
159
+ super().__init__(
160
+ in_chans=in_chans,
161
+ depths=depths,
162
+ dims=dims,
163
+ drop_path_rate=drop_path_rate,
164
+ layer_scale_init_value=layer_scale_init_value,
165
+ out_indices=out_indices,
166
+ )
167
+
168
+ self._out_features = cfg.MODEL.CONVNEXT.OUT_FEATURES
169
+
170
+ self._out_feature_strides = {
171
+ "res2": 4,
172
+ "res3": 8,
173
+ "res4": 16,
174
+ "res5": 32,
175
+ }
176
+ self._out_feature_channels = {
177
+ "res2": self.num_features[0],
178
+ "res3": self.num_features[1],
179
+ "res4": self.num_features[2],
180
+ "res5": self.num_features[3],
181
+ }
182
+
183
+ def forward(self, x):
184
+ """
185
+ Args:
186
+ x: Tensor of shape (N,C,H,W). H, W must be a multiple of ``self.size_divisibility``.
187
+ Returns:
188
+ dict[str->Tensor]: names and the corresponding features
189
+ """
190
+ assert (
191
+ x.dim() == 4
192
+ ), f"ConvNeXt takes an input of shape (N, C, H, W). Got {x.shape} instead!"
193
+ outputs = {}
194
+ y = super().forward(x)
195
+ for k in y.keys():
196
+ if k in self._out_features:
197
+ outputs[k] = y[k]
198
+ return outputs
199
+
200
+ def output_shape(self):
201
+ return {
202
+ name: ShapeSpec(
203
+ channels=self._out_feature_channels[name], stride=self._out_feature_strides[name]
204
+ )
205
+ for name in self._out_features
206
+ }
207
+
208
+ @property
209
+ def size_divisibility(self):
210
+ return -1
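A quick usage sketch for the backbone defined above (illustrative only, not part of the committed file; it assumes the module path added in this commit and that timm/detectron2 from requirements.txt are installed):

import torch
from kmax_deeplab.modeling.backbone.convnext import ConvNeXt

backbone = ConvNeXt(depths=[3, 3, 9, 3], dims=[96, 192, 384, 768])   # ConvNeXt-Tiny sizes
with torch.no_grad():
    feats = backbone(torch.randn(1, 3, 224, 224))
for name, f in feats.items():
    print(name, tuple(f.shape))   # res2..res5 at roughly strides 4/8/16/32 (the extra zero padding shifts sizes slightly)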
kmax_deeplab/modeling/backbone/resnet.py ADDED
@@ -0,0 +1,697 @@
1
+ # Reference: https://github.com/facebookresearch/detectron2/blob/main/detectron2/modeling/backbone/resnet.py
2
+ # Modified by Qihang Yu
3
+
4
+ import numpy as np
5
+ import fvcore.nn.weight_init as weight_init
6
+ import torch
7
+ import torch.nn.functional as F
8
+ from torch import nn
9
+
10
+ from detectron2.layers import (
11
+ CNNBlockBase,
12
+ Conv2d,
13
+ DeformConv,
14
+ ModulatedDeformConv,
15
+ #ShapeSpec,
16
+ #get_norm,
17
+ )
18
+
19
+ from detectron2.modeling import BACKBONE_REGISTRY, Backbone, ShapeSpec
20
+
21
+ from ..pixel_decoder.kmax_pixel_decoder import get_norm
22
+
23
+ __all__ = [
24
+ "ResNetBlockBase",
25
+ "BasicBlock",
26
+ "BottleneckBlock",
27
+ "DeformBottleneckBlock",
28
+ "BasicStem",
29
+ "ResNet",
30
+ "make_stage",
31
+ "custom_bn_build_resnet_backbone",
32
+ ]
33
+
34
+
35
+ class BasicBlock(CNNBlockBase):
36
+ """
37
+ The basic residual block for ResNet-18 and ResNet-34 defined in :paper:`ResNet`,
38
+ with two 3x3 conv layers and a projection shortcut if needed.
39
+ """
40
+
41
+ def __init__(self, in_channels, out_channels, *, stride=1, norm="BN"):
42
+ """
43
+ Args:
44
+ in_channels (int): Number of input channels.
45
+ out_channels (int): Number of output channels.
46
+ stride (int): Stride for the first conv.
47
+ norm (str or callable): normalization for all conv layers.
48
+ See :func:`layers.get_norm` for supported format.
49
+ """
50
+ super().__init__(in_channels, out_channels, stride)
51
+
52
+ if in_channels != out_channels:
53
+ self.shortcut = Conv2d(
54
+ in_channels,
55
+ out_channels,
56
+ kernel_size=1,
57
+ stride=stride,
58
+ bias=False,
59
+ norm=get_norm(norm, out_channels),
60
+ )
61
+ else:
62
+ self.shortcut = None
63
+
64
+ self.conv1 = Conv2d(
65
+ in_channels,
66
+ out_channels,
67
+ kernel_size=3,
68
+ stride=stride,
69
+ padding=1,
70
+ bias=False,
71
+ norm=get_norm(norm, out_channels),
72
+ )
73
+
74
+ self.conv2 = Conv2d(
75
+ out_channels,
76
+ out_channels,
77
+ kernel_size=3,
78
+ stride=1,
79
+ padding=1,
80
+ bias=False,
81
+ norm=get_norm(norm, out_channels),
82
+ )
83
+
84
+ for layer in [self.conv1, self.conv2, self.shortcut]:
85
+ if layer is not None: # shortcut can be None
86
+ weight_init.c2_msra_fill(layer)
87
+
88
+ def forward(self, x):
89
+ out = self.conv1(x)
90
+ out = F.relu_(out)
91
+ out = self.conv2(out)
92
+
93
+ if self.shortcut is not None:
94
+ shortcut = self.shortcut(x)
95
+ else:
96
+ shortcut = x
97
+
98
+ out += shortcut
99
+ out = F.relu_(out)
100
+ return out
101
+
102
+
103
+ class BottleneckBlock(CNNBlockBase):
104
+ """
105
+ The standard bottleneck residual block used by ResNet-50, 101 and 152
106
+ defined in :paper:`ResNet`. It contains 3 conv layers with kernels
107
+ 1x1, 3x3, 1x1, and a projection shortcut if needed.
108
+ """
109
+
110
+ def __init__(
111
+ self,
112
+ in_channels,
113
+ out_channels,
114
+ *,
115
+ bottleneck_channels,
116
+ stride=1,
117
+ num_groups=1,
118
+ norm="BN",
119
+ stride_in_1x1=False,
120
+ dilation=1,
121
+ ):
122
+ """
123
+ Args:
124
+ bottleneck_channels (int): number of output channels for the 3x3
125
+ "bottleneck" conv layers.
126
+ num_groups (int): number of groups for the 3x3 conv layer.
127
+ norm (str or callable): normalization for all conv layers.
128
+ See :func:`layers.get_norm` for supported format.
129
+ stride_in_1x1 (bool): when stride>1, whether to put stride in the
130
+ first 1x1 convolution or the bottleneck 3x3 convolution.
131
+ dilation (int): the dilation rate of the 3x3 conv layer.
132
+ """
133
+ super().__init__(in_channels, out_channels, stride)
134
+
135
+ if in_channels != out_channels:
136
+ self.shortcut = Conv2d(
137
+ in_channels,
138
+ out_channels,
139
+ kernel_size=1,
140
+ stride=stride,
141
+ bias=False,
142
+ norm=get_norm(norm, out_channels),
143
+ )
144
+ else:
145
+ self.shortcut = None
146
+
147
+ # The original MSRA ResNet models have stride in the first 1x1 conv
148
+ # The subsequent fb.torch.resnet and Caffe2 ResNe[X]t implementations have
149
+ # stride in the 3x3 conv
150
+ stride_1x1, stride_3x3 = (stride, 1) if stride_in_1x1 else (1, stride)
151
+
152
+ self.conv1 = Conv2d(
153
+ in_channels,
154
+ bottleneck_channels,
155
+ kernel_size=1,
156
+ stride=stride_1x1,
157
+ bias=False,
158
+ norm=get_norm(norm, bottleneck_channels),
159
+ )
160
+
161
+ self.conv2 = Conv2d(
162
+ bottleneck_channels,
163
+ bottleneck_channels,
164
+ kernel_size=3,
165
+ stride=stride_3x3,
166
+ padding=1 * dilation,
167
+ bias=False,
168
+ groups=num_groups,
169
+ dilation=dilation,
170
+ norm=get_norm(norm, bottleneck_channels),
171
+ )
172
+
173
+ self.conv3 = Conv2d(
174
+ bottleneck_channels,
175
+ out_channels,
176
+ kernel_size=1,
177
+ bias=False,
178
+ norm=get_norm(norm, out_channels),
179
+ )
180
+
181
+ for layer in [self.conv1, self.conv2, self.conv3, self.shortcut]:
182
+ if layer is not None: # shortcut can be None
183
+ weight_init.c2_msra_fill(layer)
184
+
185
+ # Zero-initialize the last normalization in each residual branch,
186
+ # so that at the beginning, the residual branch starts with zeros,
187
+ # and each residual block behaves like an identity.
188
+ # See Sec 5.1 in "Accurate, Large Minibatch SGD: Training ImageNet in 1 Hour":
189
+ # "For BN layers, the learnable scaling coefficient γ is initialized
190
+ # to be 1, except for each residual block's last BN
191
+ # where γ is initialized to be 0."
192
+
193
+ # nn.init.constant_(self.conv3.norm.weight, 0)
194
+ # TODO this somehow hurts performance when training GN models from scratch.
195
+ # Add it as an option when we need to use this code to train a backbone.
196
+
197
+ def forward(self, x):
198
+ out = self.conv1(x)
199
+ out = F.relu_(out)
200
+
201
+ out = self.conv2(out)
202
+ out = F.relu_(out)
203
+
204
+ out = self.conv3(out)
205
+
206
+ if self.shortcut is not None:
207
+ shortcut = self.shortcut(x)
208
+ else:
209
+ shortcut = x
210
+
211
+ out += shortcut
212
+ out = F.relu_(out)
213
+ return out
214
+
215
+
216
+ class DeformBottleneckBlock(CNNBlockBase):
217
+ """
218
+ Similar to :class:`BottleneckBlock`, but with :paper:`deformable conv <deformconv>`
219
+ in the 3x3 convolution.
220
+ """
221
+
222
+ def __init__(
223
+ self,
224
+ in_channels,
225
+ out_channels,
226
+ *,
227
+ bottleneck_channels,
228
+ stride=1,
229
+ num_groups=1,
230
+ norm="BN",
231
+ stride_in_1x1=False,
232
+ dilation=1,
233
+ deform_modulated=False,
234
+ deform_num_groups=1,
235
+ ):
236
+ super().__init__(in_channels, out_channels, stride)
237
+ self.deform_modulated = deform_modulated
238
+
239
+ if in_channels != out_channels:
240
+ self.shortcut = Conv2d(
241
+ in_channels,
242
+ out_channels,
243
+ kernel_size=1,
244
+ stride=stride,
245
+ bias=False,
246
+ norm=get_norm(norm, out_channels),
247
+ )
248
+ else:
249
+ self.shortcut = None
250
+
251
+ stride_1x1, stride_3x3 = (stride, 1) if stride_in_1x1 else (1, stride)
252
+
253
+ self.conv1 = Conv2d(
254
+ in_channels,
255
+ bottleneck_channels,
256
+ kernel_size=1,
257
+ stride=stride_1x1,
258
+ bias=False,
259
+ norm=get_norm(norm, bottleneck_channels),
260
+ )
261
+
262
+ if deform_modulated:
263
+ deform_conv_op = ModulatedDeformConv
264
+ # offset channels are 2 (or 3 if modulated) * kernel_size * kernel_size
265
+ offset_channels = 27
266
+ else:
267
+ deform_conv_op = DeformConv
268
+ offset_channels = 18
269
+
270
+ self.conv2_offset = Conv2d(
271
+ bottleneck_channels,
272
+ offset_channels * deform_num_groups,
273
+ kernel_size=3,
274
+ stride=stride_3x3,
275
+ padding=1 * dilation,
276
+ dilation=dilation,
277
+ )
278
+ self.conv2 = deform_conv_op(
279
+ bottleneck_channels,
280
+ bottleneck_channels,
281
+ kernel_size=3,
282
+ stride=stride_3x3,
283
+ padding=1 * dilation,
284
+ bias=False,
285
+ groups=num_groups,
286
+ dilation=dilation,
287
+ deformable_groups=deform_num_groups,
288
+ norm=get_norm(norm, bottleneck_channels),
289
+ )
290
+
291
+ self.conv3 = Conv2d(
292
+ bottleneck_channels,
293
+ out_channels,
294
+ kernel_size=1,
295
+ bias=False,
296
+ norm=get_norm(norm, out_channels),
297
+ )
298
+
299
+ for layer in [self.conv1, self.conv2, self.conv3, self.shortcut]:
300
+ if layer is not None: # shortcut can be None
301
+ weight_init.c2_msra_fill(layer)
302
+
303
+ nn.init.constant_(self.conv2_offset.weight, 0)
304
+ nn.init.constant_(self.conv2_offset.bias, 0)
305
+
306
+ def forward(self, x):
307
+ out = self.conv1(x)
308
+ out = F.relu_(out)
309
+
310
+ if self.deform_modulated:
311
+ offset_mask = self.conv2_offset(out)
312
+ offset_x, offset_y, mask = torch.chunk(offset_mask, 3, dim=1)
313
+ offset = torch.cat((offset_x, offset_y), dim=1)
314
+ mask = mask.sigmoid()
315
+ out = self.conv2(out, offset, mask)
316
+ else:
317
+ offset = self.conv2_offset(out)
318
+ out = self.conv2(out, offset)
319
+ out = F.relu_(out)
320
+
321
+ out = self.conv3(out)
322
+
323
+ if self.shortcut is not None:
324
+ shortcut = self.shortcut(x)
325
+ else:
326
+ shortcut = x
327
+
328
+ out += shortcut
329
+ out = F.relu_(out)
330
+ return out
331
+
332
+
333
+ class BasicStem(CNNBlockBase):
334
+ """
335
+ The standard ResNet stem (layers before the first residual block),
336
+ with a conv, relu and max_pool.
337
+ """
338
+
339
+ def __init__(self, in_channels=3, out_channels=64, norm="BN"):
340
+ """
341
+ Args:
342
+ norm (str or callable): norm after the first conv layer.
343
+ See :func:`layers.get_norm` for supported format.
344
+ """
345
+ super().__init__(in_channels, out_channels, 4)
346
+ self.in_channels = in_channels
347
+ self.conv1 = Conv2d(
348
+ in_channels,
349
+ out_channels,
350
+ kernel_size=7,
351
+ stride=2,
352
+ padding=3,
353
+ bias=False,
354
+ norm=get_norm(norm, out_channels),
355
+ )
356
+ weight_init.c2_msra_fill(self.conv1)
357
+
358
+ def forward(self, x):
359
+ x = self.conv1(x)
360
+ x = F.relu_(x)
361
+ x = F.max_pool2d(x, kernel_size=3, stride=2, padding=1)
362
+ return x
363
+
364
+
365
+ class ResNet(Backbone):
366
+ """
367
+ Implement :paper:`ResNet`.
368
+ """
369
+
370
+ def __init__(self, stem, stages, num_classes=None, out_features=None, freeze_at=0):
371
+ """
372
+ Args:
373
+ stem (nn.Module): a stem module
374
+ stages (list[list[CNNBlockBase]]): several (typically 4) stages,
375
+ each contains multiple :class:`CNNBlockBase`.
376
+ num_classes (None or int): if None, will not perform classification.
377
+ Otherwise, will create a linear layer.
378
+ out_features (list[str]): name of the layers whose outputs should
379
+ be returned in forward. Can be anything in "stem", "linear", or "res2" ...
380
+ If None, will return the output of the last layer.
381
+ freeze_at (int): The number of stages at the beginning to freeze.
382
+ see :meth:`freeze` for detailed explanation.
383
+ """
384
+ super().__init__()
385
+ self.stem = stem
386
+ self.num_classes = num_classes
387
+
388
+ current_stride = self.stem.stride
389
+ self._out_feature_strides = {"stem": current_stride}
390
+ self._out_feature_channels = {"stem": self.stem.out_channels}
391
+
392
+ self.stage_names, self.stages = [], []
393
+
394
+ if out_features is not None:
395
+ # Avoid keeping unused layers in this module. They consume extra memory
396
+ # and may cause allreduce to fail
397
+ num_stages = max(
398
+ [{"res2": 1, "res3": 2, "res4": 3, "res5": 4}.get(f, 0) for f in out_features]
399
+ )
400
+ stages = stages[:num_stages]
401
+ for i, blocks in enumerate(stages):
402
+ assert len(blocks) > 0, len(blocks)
403
+ for block in blocks:
404
+ assert isinstance(block, CNNBlockBase), block
405
+
406
+ name = "res" + str(i + 2)
407
+ stage = nn.Sequential(*blocks)
408
+
409
+ self.add_module(name, stage)
410
+ self.stage_names.append(name)
411
+ self.stages.append(stage)
412
+
413
+ self._out_feature_strides[name] = current_stride = int(
414
+ current_stride * np.prod([k.stride for k in blocks])
415
+ )
416
+ self._out_feature_channels[name] = curr_channels = blocks[-1].out_channels
417
+ self.stage_names = tuple(self.stage_names) # Make it static for scripting
418
+
419
+ if num_classes is not None:
420
+ self.avgpool = nn.AdaptiveAvgPool2d((1, 1))
421
+ self.linear = nn.Linear(curr_channels, num_classes)
422
+
423
+ # Sec 5.1 in "Accurate, Large Minibatch SGD: Training ImageNet in 1 Hour":
424
+ # "The 1000-way fully-connected layer is initialized by
425
+ # drawing weights from a zero-mean Gaussian with standard deviation of 0.01."
426
+ nn.init.normal_(self.linear.weight, std=0.01)
427
+ name = "linear"
428
+
429
+ if out_features is None:
430
+ out_features = [name]
431
+ self._out_features = out_features
432
+ assert len(self._out_features)
433
+ children = [x[0] for x in self.named_children()]
434
+ for out_feature in self._out_features:
435
+ assert out_feature in children, "Available children: {}".format(", ".join(children))
436
+ self.freeze(freeze_at)
437
+
438
+ def forward(self, x):
439
+ """
440
+ Args:
441
+ x: Tensor of shape (N,C,H,W). H, W must be a multiple of ``self.size_divisibility``.
442
+
443
+ Returns:
444
+ dict[str->Tensor]: names and the corresponding features
445
+ """
446
+ assert x.dim() == 4, f"ResNet takes an input of shape (N, C, H, W). Got {x.shape} instead!"
447
+ outputs = {}
448
+ x = self.stem(x)
449
+ if "stem" in self._out_features:
450
+ outputs["stem"] = x
451
+ for name, stage in zip(self.stage_names, self.stages):
452
+ x = stage(x)
453
+ if name in self._out_features:
454
+ outputs[name] = x
455
+ if self.num_classes is not None:
456
+ x = self.avgpool(x)
457
+ x = torch.flatten(x, 1)
458
+ x = self.linear(x)
459
+ if "linear" in self._out_features:
460
+ outputs["linear"] = x
461
+ return outputs
462
+
463
+ def output_shape(self):
464
+ return {
465
+ name: ShapeSpec(
466
+ channels=self._out_feature_channels[name], stride=self._out_feature_strides[name]
467
+ )
468
+ for name in self._out_features
469
+ }
470
+
471
+ def freeze(self, freeze_at=0):
472
+ """
473
+ Freeze the first several stages of the ResNet. Commonly used in
474
+ fine-tuning.
475
+
476
+ Layers that produce the same feature map spatial size are defined as one
477
+ "stage" by :paper:`FPN`.
478
+
479
+ Args:
480
+ freeze_at (int): number of stages to freeze.
481
+ `1` means freezing the stem. `2` means freezing the stem and
482
+ one residual stage, etc.
483
+
484
+ Returns:
485
+ nn.Module: this ResNet itself
486
+ """
487
+ if freeze_at >= 1:
488
+ self.stem.freeze()
489
+ for idx, stage in enumerate(self.stages, start=2):
490
+ if freeze_at >= idx:
491
+ for block in stage.children():
492
+ block.freeze()
493
+ return self
494
+
495
+ @staticmethod
496
+ def make_stage(block_class, num_blocks, *, in_channels, out_channels, **kwargs):
497
+ """
498
+ Create a list of blocks of the same type that forms one ResNet stage.
499
+
500
+ Args:
501
+ block_class (type): a subclass of CNNBlockBase that's used to create all blocks in this
502
+ stage. A module of this type must not change spatial resolution of inputs unless its
503
+ stride != 1.
504
+ num_blocks (int): number of blocks in this stage
505
+ in_channels (int): input channels of the entire stage.
506
+ out_channels (int): output channels of **every block** in the stage.
507
+ kwargs: other arguments passed to the constructor of
508
+ `block_class`. If the argument name is "xx_per_block", the
509
+ argument is a list of values to be passed to each block in the
510
+ stage. Otherwise, the same argument is passed to every block
511
+ in the stage.
512
+
513
+ Returns:
514
+ list[CNNBlockBase]: a list of block module.
515
+
516
+ Examples:
517
+ ::
518
+ stage = ResNet.make_stage(
519
+ BottleneckBlock, 3, in_channels=16, out_channels=64,
520
+ bottleneck_channels=16, num_groups=1,
521
+ stride_per_block=[2, 1, 1],
522
+ dilations_per_block=[1, 1, 2]
523
+ )
524
+
525
+ Usually, layers that produce the same feature map spatial size are defined as one
526
+ "stage" (in :paper:`FPN`). Under such definition, ``stride_per_block[1:]`` should
527
+ all be 1.
528
+ """
529
+ blocks = []
530
+ for i in range(num_blocks):
531
+ curr_kwargs = {}
532
+ for k, v in kwargs.items():
533
+ if k.endswith("_per_block"):
534
+ assert len(v) == num_blocks, (
535
+ f"Argument '{k}' of make_stage should have the "
536
+ f"same length as num_blocks={num_blocks}."
537
+ )
538
+ newk = k[: -len("_per_block")]
539
+ assert newk not in kwargs, f"Cannot call make_stage with both {k} and {newk}!"
540
+ curr_kwargs[newk] = v[i]
541
+ else:
542
+ curr_kwargs[k] = v
543
+
544
+ blocks.append(
545
+ block_class(in_channels=in_channels, out_channels=out_channels, **curr_kwargs)
546
+ )
547
+ in_channels = out_channels
548
+ return blocks
549
+
550
+ @staticmethod
551
+ def make_default_stages(depth, block_class=None, **kwargs):
552
+ """
553
+ Create a list of ResNet stages from a pre-defined depth (one of 18, 34, 50, 101, 152).
554
+ If it doesn't create the ResNet variant you need, please use :meth:`make_stage`
555
+ instead for fine-grained customization.
556
+
557
+ Args:
558
+ depth (int): depth of ResNet
559
+ block_class (type): the CNN block class. Has to accept
560
+ `bottleneck_channels` argument for depth > 50.
561
+ By default it is BasicBlock or BottleneckBlock, based on the
562
+ depth.
563
+ kwargs:
564
+ other arguments to pass to `make_stage`. Should not contain
565
+ stride and channels, as they are predefined for each depth.
566
+
567
+ Returns:
568
+ list[list[CNNBlockBase]]: modules in all stages; see arguments of
569
+ :class:`ResNet.__init__`.
570
+ """
571
+ num_blocks_per_stage = {
572
+ 18: [2, 2, 2, 2],
573
+ 34: [3, 4, 6, 3],
574
+ 50: [3, 4, 6, 3],
575
+ 101: [3, 4, 23, 3],
576
+ 152: [3, 8, 36, 3],
577
+ }[depth]
578
+ if block_class is None:
579
+ block_class = BasicBlock if depth < 50 else BottleneckBlock
580
+ if depth < 50:
581
+ in_channels = [64, 64, 128, 256]
582
+ out_channels = [64, 128, 256, 512]
583
+ else:
584
+ in_channels = [64, 256, 512, 1024]
585
+ out_channels = [256, 512, 1024, 2048]
586
+ ret = []
587
+ for (n, s, i, o) in zip(num_blocks_per_stage, [1, 2, 2, 2], in_channels, out_channels):
588
+ if depth >= 50:
589
+ kwargs["bottleneck_channels"] = o // 4
590
+ ret.append(
591
+ ResNet.make_stage(
592
+ block_class=block_class,
593
+ num_blocks=n,
594
+ stride_per_block=[s] + [1] * (n - 1),
595
+ in_channels=i,
596
+ out_channels=o,
597
+ **kwargs,
598
+ )
599
+ )
600
+ return ret
601
+
602
+
603
+ ResNetBlockBase = CNNBlockBase
604
+ """
605
+ Alias for backward compatibility.
606
+ """
607
+
608
+
609
+ def make_stage(*args, **kwargs):
610
+ """
611
+ Deprecated alias for backward compatibility.
612
+ """
613
+ return ResNet.make_stage(*args, **kwargs)
614
+
615
+
616
+ @BACKBONE_REGISTRY.register()
617
+ def custom_bn_build_resnet_backbone(cfg, input_shape):
618
+ """
619
+ Create a ResNet instance from config.
620
+
621
+ Returns:
622
+ ResNet: a :class:`ResNet` instance.
623
+ """
624
+ # need registration of new blocks/stems?
625
+ norm = cfg.MODEL.RESNETS.NORM
626
+ stem = BasicStem(
627
+ in_channels=input_shape.channels,
628
+ out_channels=cfg.MODEL.RESNETS.STEM_OUT_CHANNELS,
629
+ norm=norm,
630
+ )
631
+
632
+ # fmt: off
633
+ freeze_at = cfg.MODEL.BACKBONE.FREEZE_AT
634
+ out_features = cfg.MODEL.RESNETS.OUT_FEATURES
635
+ depth = cfg.MODEL.RESNETS.DEPTH
636
+ num_groups = cfg.MODEL.RESNETS.NUM_GROUPS
637
+ width_per_group = cfg.MODEL.RESNETS.WIDTH_PER_GROUP
638
+ bottleneck_channels = num_groups * width_per_group
639
+ in_channels = cfg.MODEL.RESNETS.STEM_OUT_CHANNELS
640
+ out_channels = cfg.MODEL.RESNETS.RES2_OUT_CHANNELS
641
+ stride_in_1x1 = cfg.MODEL.RESNETS.STRIDE_IN_1X1
642
+ res5_dilation = cfg.MODEL.RESNETS.RES5_DILATION
643
+ deform_on_per_stage = cfg.MODEL.RESNETS.DEFORM_ON_PER_STAGE
644
+ deform_modulated = cfg.MODEL.RESNETS.DEFORM_MODULATED
645
+ deform_num_groups = cfg.MODEL.RESNETS.DEFORM_NUM_GROUPS
646
+ # fmt: on
647
+ assert res5_dilation in {1, 2}, "res5_dilation cannot be {}.".format(res5_dilation)
648
+
649
+ num_blocks_per_stage = {
650
+ 18: [2, 2, 2, 2],
651
+ 34: [3, 4, 6, 3],
652
+ 50: [3, 4, 6, 3],
653
+ 101: [3, 4, 23, 3],
654
+ 152: [3, 8, 36, 3],
655
+ }[depth]
656
+
657
+ if depth in [18, 34]:
658
+ assert out_channels == 64, "Must set MODEL.RESNETS.RES2_OUT_CHANNELS = 64 for R18/R34"
659
+ assert not any(
660
+ deform_on_per_stage
661
+ ), "MODEL.RESNETS.DEFORM_ON_PER_STAGE unsupported for R18/R34"
662
+ assert res5_dilation == 1, "Must set MODEL.RESNETS.RES5_DILATION = 1 for R18/R34"
663
+ assert num_groups == 1, "Must set MODEL.RESNETS.NUM_GROUPS = 1 for R18/R34"
664
+
665
+ stages = []
666
+
667
+ for idx, stage_idx in enumerate(range(2, 6)):
668
+ # res5_dilation is used this way as a convention in R-FCN & Deformable Conv paper
669
+ dilation = res5_dilation if stage_idx == 5 else 1
670
+ first_stride = 1 if idx == 0 or (stage_idx == 5 and dilation == 2) else 2
671
+ stage_kargs = {
672
+ "num_blocks": num_blocks_per_stage[idx],
673
+ "stride_per_block": [first_stride] + [1] * (num_blocks_per_stage[idx] - 1),
674
+ "in_channels": in_channels,
675
+ "out_channels": out_channels,
676
+ "norm": norm,
677
+ }
678
+ # Use BasicBlock for R18 and R34.
679
+ if depth in [18, 34]:
680
+ stage_kargs["block_class"] = BasicBlock
681
+ else:
682
+ stage_kargs["bottleneck_channels"] = bottleneck_channels
683
+ stage_kargs["stride_in_1x1"] = stride_in_1x1
684
+ stage_kargs["dilation"] = dilation
685
+ stage_kargs["num_groups"] = num_groups
686
+ if deform_on_per_stage[idx]:
687
+ stage_kargs["block_class"] = DeformBottleneckBlock
688
+ stage_kargs["deform_modulated"] = deform_modulated
689
+ stage_kargs["deform_num_groups"] = deform_num_groups
690
+ else:
691
+ stage_kargs["block_class"] = BottleneckBlock
692
+ blocks = ResNet.make_stage(**stage_kargs)
693
+ in_channels = out_channels
694
+ out_channels *= 2
695
+ bottleneck_channels *= 2
696
+ stages.append(blocks)
697
+ return ResNet(stem, stages, out_features=out_features, freeze_at=freeze_at)
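A stand-alone trace (illustrative only, not part of the committed file; it assumes the default depth=50, STEM_OUT_CHANNELS=64, RES2_OUT_CHANNELS=256, NUM_GROUPS=1, WIDTH_PER_GROUP=64 config) of how the loop above grows channels and strides per stage:

num_blocks_per_stage = [3, 4, 6, 3]            # depth = 50
in_channels, out_channels, bottleneck_channels = 64, 256, 64
for idx, stage_idx in enumerate(range(2, 6)):
    first_stride = 1 if idx == 0 else 2
    print(f"res{stage_idx}: blocks={num_blocks_per_stage[idx]}, in={in_channels}, "
          f"out={out_channels}, bottleneck={bottleneck_channels}, first_stride={first_stride}")
    in_channels = out_channels
    out_channels *= 2
    bottleneck_channels *= 2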
kmax_deeplab/modeling/criterion.py ADDED
@@ -0,0 +1,432 @@
1
+ # Reference: https://github.com/facebookresearch/Mask2Former/blob/main/mask2former/modeling/criterion.py
2
+ # Reference: https://github.com/google-research/deeplab2/blob/main/model/loss/max_deeplab_loss.py
3
+ # Modified by Qihang Yu
4
+
5
+ import torch
6
+ import torch.nn.functional as F
7
+ from torch import nn
8
+
9
+ _SOFTMAX_MASKING_CONSTANT = -99999.0
10
+
11
+ # https://www.tensorflow.org/api_docs/python/tf/math/divide_no_nan
12
+ def divide_no_nan(x: torch.Tensor, y: torch.Tensor):
13
+ return torch.nan_to_num(x / y, nan=0.0, posinf=0.0, neginf=0.0)
14
+
15
+
16
+ # https://github.com/google-research/deeplab2/blob/main/model/loss/base_loss.py#L393
17
+ def focal_cross_entropy_loss(
18
+ pred: torch.Tensor,
19
+ gt: torch.Tensor,
20
+ weight: torch.Tensor, # This is for PQ-loss weighting
21
+ focal_loss_alpha: float = 0.75,
22
+ focal_loss_gamma: float = 0.0,
23
+ background_channel_index: int = -1):
24
+ """
25
+ pred: B x N x C
26
+ gt: B x N
27
+ weight: B x N
28
+ """
29
+ pred = pred.transpose(1, 2) # B x C x N
30
+ gt = F.one_hot(gt, num_classes=pred.shape[1]).transpose(1, 2).to(pred) # B x C x N
31
+ loss = F.cross_entropy(pred, gt, reduction="none") # B x N
32
+ if focal_loss_gamma == 0.0:
33
+ focal_loss = loss
34
+ else:
35
+ pred = F.softmax(pred, dim=1) # B x C x N
36
+ pt = (pred * gt).sum(1) # B x N
37
+ focal_loss = torch.pow(1.0 - pt, focal_loss_gamma) * loss # B x N
38
+
39
+ if focal_loss_alpha >= 0:
40
+ alpha_weights = (
41
+ focal_loss_alpha * (1.0 - gt[:, background_channel_index])
42
+ + (1 - focal_loss_alpha) * gt[:, background_channel_index]) # B x N
43
+ focal_loss = alpha_weights * focal_loss # B x N
44
+
45
+ focal_loss = focal_loss * weight # B x N
46
+ focal_loss = focal_loss.flatten(1)
47
+ num_non_zero = (focal_loss != 0.0).to(focal_loss).sum(-1) # B
48
+ num_non_zero = torch.clamp(num_non_zero, min=1.0)
49
+ loss_sum_per_sample = focal_loss.sum(-1) # B
50
+ return divide_no_nan(loss_sum_per_sample, num_non_zero).mean() # 1
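# Illustrative only: with the default focal_loss_alpha = 0.75, a prediction slot whose ground-truth class
# is a real category is weighted by 0.75, while a slot assigned to the void/background channel is weighted
# by 0.25, before the per-sample averaging over non-zero entries above.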
51
+
52
+
53
+ # https://github.com/google-research/deeplab2/blob/main/model/loss/max_deeplab_loss.py#L50
54
+ def _gumbel_topk_sample(logits: torch.Tensor, k: int):
55
+ """Samples k points from the softmax distribution with Gumbel-Top-k trick."""
56
+ # Note that torch.rand is [0, 1), we need to make it (0, 1) to ensure the log is valid.
57
+ gumbel_noise = torch.rand(size=logits.shape, dtype=logits.dtype, device=logits.device)
58
+ gumbel_noise = -torch.log(-torch.log(gumbel_noise))
59
+ _, indices = torch.topk(logits + gumbel_noise, k)
60
+ return indices
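# Illustrative only: with logits = torch.log(torch.tensor([[1., 1., 2., 4.]])), _gumbel_topk_sample(logits, k=2)
# returns two distinct column indices per row, drawn without replacement with probabilities proportional to
# softmax(logits), so index 3 is the most frequent first pick.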
61
+
62
+
63
+ # https://github.com/google-research/deeplab2/blob/main/model/loss/max_deeplab_loss.py#L576
64
+ def pixelwise_insdis_loss(
65
+ pixel_feature: torch.Tensor,
66
+ gt_mask: torch.Tensor,
67
+ sample_temperature: float,
68
+ sample_k: int,
69
+ instance_discrimination_temperature: float,
70
+ pixel_gt_void_mask: torch.Tensor,
71
+ inverse_gt_mask_area: torch.Tensor
72
+ ):
73
+
74
+ # pixel_feature: B x C x H x W
75
+ # gt_mask: B x N x H x W
76
+ pixel_feature = pixel_feature.flatten(2) # B x C x HW
77
+ gt_mask = gt_mask.flatten(2) # B x N x HW
78
+ pixel_gt_void_mask = pixel_gt_void_mask.flatten(1) # B x HW
79
+ inverse_gt_mask_area = inverse_gt_mask_area.flatten(1) # B x HW
80
+
81
+ sample_logits = torch.log(inverse_gt_mask_area) * sample_temperature # B x HW
82
+ # sample_logits.masked_fill_(pixel_gt_void_mask, float('-inf'))
83
+ sample_logits += pixel_gt_void_mask.to(sample_logits) * _SOFTMAX_MASKING_CONSTANT
84
+
85
+ sample_indices = _gumbel_topk_sample(sample_logits, sample_k) # B x K
86
+ # Sample ground truth one-hot encodings and compute gt_similarity.
87
+ pixel_gt_sampled_feature = torch.gather(gt_mask, dim=2, index=sample_indices.unsqueeze(1).repeat(1, gt_mask.shape[1], 1)) # B x N x K
88
+ sampled_gt_similarity = torch.einsum('bnk,bnj->bkj', pixel_gt_sampled_feature, pixel_gt_sampled_feature) # B x K x K
89
+
90
+ # Normalize the ground truth similarity into a distribution (sum to 1).
91
+ pixel_normalizing_constant = sampled_gt_similarity.sum(dim=1, keepdim=True) # B x 1 x K
92
+ sampled_gt_similarity /= torch.clamp(pixel_normalizing_constant, min=1.0) # B x K x K
93
+
94
+ # Sample predicted features and compute pred_similarity.
95
+ pixel_pred_sampled_feature = torch.gather(pixel_feature, dim=2, index=sample_indices.unsqueeze(1).repeat(1, pixel_feature.shape[1], 1)) # B x C x K
96
+ sampled_pred_similarity = torch.einsum('bck,bcj->bkj', pixel_pred_sampled_feature, pixel_pred_sampled_feature) # B x K x K
97
+ sampled_pred_similarity /= instance_discrimination_temperature # B x K x K
98
+ loss = F.cross_entropy(sampled_pred_similarity, sampled_gt_similarity, reduction="none") # B x K
99
+
100
+ num_non_zero = (loss != 0.0).to(loss).sum(-1) # B
101
+ num_non_zero = torch.clamp(num_non_zero, min=1.0)
102
+ loss_sum_per_sample = loss.sum(-1) # B
103
+ return divide_no_nan(loss_sum_per_sample, num_non_zero).mean() # 1
104
+
105
+
106
+ def aux_semantic_loss(
107
+ pred_semantic_logits: torch.Tensor,
108
+ ground_truth_semantic: torch.Tensor,
109
+ sample_temperature: float,
110
+ sample_k: int,
111
+ pixel_gt_void_mask: torch.Tensor,
112
+ inverse_gt_mask_area: torch.Tensor,
113
+ num_classes: int):
114
+
115
+ pred_semantic_logits = pred_semantic_logits.flatten(2) # B x C x HW
116
+ ground_truth_semantic = ground_truth_semantic.flatten(1) # B x HW
117
+ pixel_gt_void_mask = pixel_gt_void_mask.flatten(1) # B x HW
118
+ inverse_gt_mask_area = inverse_gt_mask_area.flatten(1) # B x HW
119
+
120
+ sample_logits = torch.log(inverse_gt_mask_area) * sample_temperature # B x HW
121
+ sample_logits += pixel_gt_void_mask.to(sample_logits) * _SOFTMAX_MASKING_CONSTANT
122
+
123
+ sample_indices = _gumbel_topk_sample(sample_logits, sample_k) # B x K
124
+ sampled_ground_truth_semantic = torch.gather(ground_truth_semantic, dim=1, index=sample_indices) # B x K
125
+ sampled_pred_semantic_logits = torch.gather(pred_semantic_logits, dim=2, index=sample_indices.unsqueeze(1).repeat(1, pred_semantic_logits.shape[1], 1)) # B x C x K
126
+ # ignore the class index num_classes.
127
+ keep_mask = (sampled_ground_truth_semantic != num_classes) # B x K
128
+ loss = F.cross_entropy(sampled_pred_semantic_logits, sampled_ground_truth_semantic, ignore_index=num_classes, reduction='none') # B x K
129
+ loss = loss * keep_mask.to(loss)
130
+ num_non_zero = (loss != 0.0).to(loss).sum(-1) # B
131
+ num_non_zero = torch.clamp(num_non_zero, min=1.0)
132
+ loss_sum_per_sample = loss.sum(-1) # B
133
+ return divide_no_nan(loss_sum_per_sample, num_non_zero).mean() # 1
134
+
135
+
136
+ # https://github.com/google-research/deeplab2/blob/c4a533c14fac1a1071a6d24c5379c31a69a3e5e6/model/loss/base_loss.py#L56
137
+ # https://github.com/google-research/deeplab2/blob/main/model/loss/base_loss.py#L510
138
+ def dice_loss(
139
+ inputs: torch.Tensor,
140
+ targets: torch.Tensor,
141
+ pixel_gt_void_mask: torch.Tensor,
142
+ matched_cls_prob: torch.Tensor
143
+ ):
144
+ """
145
+ Compute the DICE loss, similar to generalized IOU for masks
146
+ Args:
147
+ inputs: A float tensor of arbitrary shape.
148
+ The predictions for each example.
149
+ targets: A float tensor with the same shape as inputs. Stores the binary
150
+ classification label for each element in inputs
151
+ (0 for the negative class and 1 for the positive class).
152
+ """
153
+ inputs = inputs.softmax(1) # B N HW
154
+ # https://github.com/google-research/deeplab2/blob/main/model/loss/base_loss.py#L111
155
+ inputs = inputs.masked_fill(pixel_gt_void_mask.unsqueeze(1), 0) # remove void pixels.
156
+ smooth = 1.0
157
+ intersection = 2 * (inputs * targets).sum(-1) + smooth # B x N
158
+ denominator = inputs.sum(-1) + targets.sum(-1) + smooth # B x N
159
+ loss = 1.0 - divide_no_nan(intersection, denominator)
160
+ loss *= matched_cls_prob
161
+ # Note: kMaX-DeepLab sum over num_masks and avg over batches. But here batch and num_mask are one
162
+ # https://github.com/google-research/deeplab2/blob/c4a533c14fac1a1071a6d24c5379c31a69a3e5e6/model/loss/base_loss.py#L559
163
+ # https://github.com/google-research/deeplab2/blob/c4a533c14fac1a1071a6d24c5379c31a69a3e5e6/model/loss/max_deeplab_loss.py#L402
164
+ # As the existing of modifer, it equals to multiplier by 0.75
165
+ return (loss.sum(1) * 0.75/128).mean() # sum over masks and mean over batches.
166
+
167
+
168
+ def softmax_ce_loss(
169
+ inputs: torch.Tensor,
170
+ targets: torch.Tensor,
171
+ pixel_gt_void_mask: torch.Tensor,
172
+ ):
173
+ """
174
+ Args:
175
+ inputs: A float tensor of arbitrary shape.
176
+ The predictions for each example.
177
+ targets: A float tensor with the same shape as inputs. Stores the binary
178
+ classification label for each element in inputs
179
+ (0 for the negative class and 1 for the positive class).
180
+ Returns:
181
+ Loss tensor
182
+ """
183
+ loss = F.cross_entropy(inputs, targets, reduction="none") # B x HW
184
+ loss = loss.masked_fill(pixel_gt_void_mask, 0) # remove void pixels.
185
+
186
+ num_non_zero = (loss != 0.0).to(loss).sum(-1) # B
187
+ num_non_zero = torch.clamp(num_non_zero, min=1.0)
188
+ loss_sum_per_sample = loss.sum(-1) # B
189
+ return divide_no_nan(loss_sum_per_sample, num_non_zero).mean() # 1
190
+
191
+
192
+ class SetCriterion(nn.Module):
193
+ """This class computes the loss for DETR.
194
+ The process happens in two steps:
195
+ 1) we compute hungarian assignment between ground truth boxes and the outputs of the model
196
+ 2) we supervise each pair of matched ground-truth / prediction (supervise class and box)
197
+ """
198
+
199
+ def __init__(self, num_classes, matcher, weight_dict, eos_coef, losses, share_final_matching,
200
+ pixel_insdis_temperature=1.5, pixel_insdis_sample_k=4096,
201
+ aux_semantic_temperature=2.0, aux_semantic_sample_k=4096):
202
+ """Create the criterion.
203
+ Parameters:
204
+ num_classes: number of object categories, omitting the special no-object category
205
+ matcher: module able to compute a matching between targets and proposals
206
+ eos_coef: relative classification weight applied to the no-object category
207
+ losses: list of all the losses to be applied. See get_loss for list of available losses.
208
+ """
209
+ super().__init__()
210
+ self.num_classes = num_classes
211
+ self.matcher = matcher
212
+ self.weight_dict = weight_dict
213
+ self.eos_coef = eos_coef
214
+ self.losses = losses
215
+ self.share_final_matching = share_final_matching
216
+ self.pixel_insdis_temperature = pixel_insdis_temperature
217
+ self.pixel_insdis_sample_k = pixel_insdis_sample_k
218
+ self.aux_semantic_temperature = aux_semantic_temperature
219
+ self.aux_semantic_sample_k = aux_semantic_sample_k
220
+
221
+ def loss_labels(self, outputs, targets):
222
+ """Classification loss (NLL)
223
+ targets dicts must contain the key "labels" containing a tensor of dim [nb_target_boxes]
224
+ """
225
+ assert "pred_logits" in outputs
226
+ src_logits = outputs["pred_logits"] # B x N x C
227
+ target_classes = targets["labels"] # B x N
228
+ pq_loss_class_weight = targets["pq_loss_class_weight"]
229
+ losses = {"loss_ce": focal_cross_entropy_loss(src_logits, target_classes, pq_loss_class_weight)}
230
+ return losses
231
+
232
+ def loss_masks(self, outputs, targets):
233
+ """Compute the losses related to the masks: the focal loss and the dice loss.
234
+ targets dicts must contain the key "masks" containing a tensor of dim [nb_target_boxes, h, w]
235
+ """
236
+ src_masks = outputs["pred_masks"] # B x N x H x W
237
+ target_masks = targets["masks"]
238
+ pq_loss_mask_weight = targets["pq_loss_mask_weight"]
239
+ pixel_gt_void_mask = targets["pixel_gt_void_mask"]
240
+
241
+ src_masks = src_masks.flatten(2) # B x N x HW
242
+ target_masks = target_masks.flatten(2) # B x N x HW
243
+ pixel_gt_void_mask = pixel_gt_void_mask.flatten(1) # B x HW
244
+
245
+ losses = {
246
+ "loss_mask": softmax_ce_loss(src_masks, target_masks, pixel_gt_void_mask),
247
+ "loss_dice": dice_loss(src_masks, target_masks, pixel_gt_void_mask, pq_loss_mask_weight),
248
+ }
249
+
250
+ return losses
251
+
252
+ def loss_pixels(self, outputs, targets):
253
+ pixel_feature = outputs["pixel_feature"]
254
+ target_masks = targets["masks"]
255
+ pixel_gt_void_mask = targets["pixel_gt_void_mask"]
256
+ inverse_gt_mask_area = targets["inverse_gt_mask_area"]
257
+
258
+ losses = {"loss_pixel_insdis": pixelwise_insdis_loss(
259
+ pixel_feature=pixel_feature,
260
+ gt_mask=target_masks,
261
+ sample_temperature=self.pixel_insdis_temperature,
262
+ sample_k=self.pixel_insdis_sample_k,
263
+ instance_discrimination_temperature=0.3,
264
+ pixel_gt_void_mask=pixel_gt_void_mask,
265
+ inverse_gt_mask_area=inverse_gt_mask_area
266
+ )}
267
+
268
+ del target_masks
269
+ return losses
270
+
271
+ def loss_semantic(self, outputs, targets):
272
+ pred_semantic_logits = outputs["aux_semantic_pred"]
273
+ ground_truth_semantic = targets["ground_truth_semantic"]
274
+ pixel_gt_void_mask = targets["pixel_gt_void_mask"].flatten(1)
275
+ inverse_gt_mask_area = targets["inverse_gt_mask_area"].flatten(1)
276
+
277
+ losses = {"loss_aux_semantic": aux_semantic_loss(
278
+ pred_semantic_logits=pred_semantic_logits,
279
+ ground_truth_semantic=ground_truth_semantic,
280
+ sample_temperature=self.aux_semantic_temperature,
281
+ sample_k=self.aux_semantic_sample_k,
282
+ pixel_gt_void_mask=pixel_gt_void_mask,
283
+ inverse_gt_mask_area=inverse_gt_mask_area,
284
+ num_classes=self.num_classes
285
+ )}
286
+ return losses
287
+
288
+ @torch.no_grad()
289
+ def _get_src_permutation_idx(self, indices):
290
+ # permute predictions following indices
291
+ # torch.full_like gives a tensor full of i in shape of src.shape
292
+ # at each iter, i is the index, src is the src ind in shape of (N)
293
+ # so batch_idx is concat of (0,0,...), (1,1,...), with shape (N0+N1+N2+...+Nb)
294
+ # so if we flatten gt/pred across bathces, this gives the batch_id of each sample
295
+ batch_idx = torch.cat([torch.full_like(src, i) for i, (src, _) in enumerate(indices)])
296
+ # src_idx is src_ind concated to shape (N0+N1+N2+...+Nb)
297
+ # it is a flattened concat of mask_id at each batch
298
+ src_idx = torch.cat([src for (src, _) in indices])
299
+ return batch_idx, src_idx
300
+
301
+
302
+ def get_loss(self, loss, outputs, targets):
303
+ loss_map = {
304
+ 'labels': self.loss_labels,
305
+ 'masks': self.loss_masks,
306
+ 'pixels': self.loss_pixels,
307
+ 'aux_semantic': self.loss_semantic,
308
+ }
309
+ assert loss in loss_map, f"do you really want to compute {loss} loss?"
310
+ return loss_map[loss](outputs, targets)
311
+
312
+ @torch.no_grad()
313
+ def process_gt(self, outputs, targets, indices, matched_dice, matched_cls_prob, process_semantic=False):
314
+ # Permute&Pad Pred&GT for loss compuation.
315
+ # By controling process_gt, we can share the matching results for all preds.
316
+ src_idx = self._get_src_permutation_idx(indices)
317
+
318
+ src_masks = outputs["pred_masks"].detach() # B x N x H x W
319
+
320
+ # Pad and permute the target_mask to B x N x H x W
321
+ target_masks = torch.zeros_like(src_masks)
322
+ target_masks_o = torch.cat([t["masks"][J] for t, (_, J) in zip(targets, indices)]).to(target_masks)
323
+ target_masks[src_idx] = target_masks_o
324
+
325
+ # Pad and permute the matched_cls_prob to B x N
326
+ matched_cls_prob_o = torch.cat([cls_prob for cls_prob in matched_cls_prob])
327
+ matched_cls_prob_o = torch.clamp(matched_cls_prob_o, min=self.eos_coef)
328
+ # https://github.com/google-research/deeplab2/blob/main/model/loss/max_deeplab_loss.py#L1034
329
+ # no penalty for unmatched masks.
330
+ matched_cls_prob = torch.full(
331
+ src_masks.shape[:2], 0, dtype=src_masks.dtype, device=src_masks.device
332
+ ) # B x N
333
+ matched_cls_prob[src_idx] = matched_cls_prob_o.to(matched_cls_prob)
334
+
335
+ # pixel_gt_void_mask is used to indicate those pixels without labels.
336
+ pixel_gt_void_mask = (target_masks.sum(1) < 1) # B x H x W
337
+
338
+ # inverse_gt_mask_area is used to sample pixels.
339
+ mask_gt_area = target_masks.sum(2).sum(2) # B x N
340
+ pixel_gt_area = torch.einsum('bnhw,bn->bhw', target_masks, mask_gt_area) # B x H x W
341
+ inverse_gt_mask_area = (pixel_gt_area.shape[1] * pixel_gt_area.shape[2]) / torch.clamp(pixel_gt_area, min=1.0) # B x H x W
342
+
343
+ src_logits = outputs["pred_logits"] # B x N x C
344
+ # Pad and permute the target_classes to B x N
345
+ target_classes_o = torch.cat([t["labels"][J] for t, (_, J) in zip(targets, indices)])
346
+ # This serves as a padding.
347
+ target_classes = torch.full(
348
+ src_logits.shape[:2], self.num_classes, dtype=torch.int64, device=src_logits.device
349
+ )
350
+ # We put real GT to those corresponds to src_idx, and put void into other places.
351
+ target_classes[src_idx] = target_classes_o
352
+
353
+ src_masks_prob = src_masks.softmax(1)
354
+ void_mask = pixel_gt_void_mask.to(src_masks_prob) # B x H x W
355
+ # compute iou instead of dice for void overlapping.
356
+ def computer_iou_score(x, y):
357
+ # x : B x N x H x W
358
+ # y : B x H x W
359
+ x = x.flatten(2) # B x N x L
360
+ y = y.flatten(1) # B x L
361
+ intersection = torch.einsum('bnl,bl->bn', x, y) # B x N
362
+ denominator = x.sum(-1) # B x N
363
+ return intersection / (denominator + 1e-5) # B x N
364
+
365
+ # Pad and permute the matched_dice to B x N
366
+ matched_dice_o = torch.cat([dice for dice in matched_dice])
367
+ matched_dice = computer_iou_score(src_masks_prob, void_mask) # unmatched masks use their dice with void
368
+ matched_dice[src_idx] = matched_dice_o.to(matched_dice)
369
+ matched_dice = torch.clamp(matched_dice, min=self.eos_coef)
370
+
371
+
372
+ processed_gt = {"masks": target_masks, "labels": target_classes,
373
+ "pq_loss_mask_weight": matched_cls_prob,
374
+ "pq_loss_class_weight": matched_dice,
375
+ "pixel_gt_void_mask": pixel_gt_void_mask,
376
+ "inverse_gt_mask_area": inverse_gt_mask_area,}
377
+
378
+ if process_semantic:
379
+ # To obtain semantic gt
380
+ ground_truth_semantic = [t["semantic_masks"] for t in targets]
381
+ ground_truth_semantic = torch.stack(ground_truth_semantic, dim=0) # B x H x W
382
+ # self.num_classes is set to ignore label
383
+ ground_truth_semantic[ground_truth_semantic==-1] = self.num_classes
384
+ processed_gt.update({"ground_truth_semantic": ground_truth_semantic})
385
+
386
+ return processed_gt
387
+
388
+
389
+ def forward(self, outputs, targets):
390
+ """This performs the loss computation.
391
+ Parameters:
392
+ outputs: dict of tensors, see the output specification of the model for the format
393
+ targets: list of dicts, such that len(targets) == batch_size.
394
+ The expected keys in each dict depends on the losses applied, see each loss' doc
395
+ """
396
+ outputs_without_aux = {k: v for k, v in outputs.items() if k != "aux_outputs"}
397
+ indices, matched_dice, matched_cls_prob = self.matcher(outputs_without_aux, targets)
398
+ # Pad GT to the same number as the predictions.
399
+ processed_targets = self.process_gt(outputs, targets, indices, matched_dice, matched_cls_prob, process_semantic=True)
400
+ # Compute all the requested losses
401
+ losses = {}
402
+ for loss in self.losses:
403
+ losses.update(self.get_loss(loss, outputs, processed_targets))
404
+
405
+ if "aux_outputs" in outputs:
406
+ for i, aux_outputs in enumerate(outputs["aux_outputs"]):
407
+ # If share_final_matching is set, the matching from the final output is reused for all auxiliary predictions.
408
+ if not self.share_final_matching:
409
+ indices, matched_dice, matched_cls_prob = self.matcher(aux_outputs, targets)
410
+ if not self.share_final_matching:
411
+ processed_targets = self.process_gt(aux_outputs, targets, indices, matched_dice, matched_cls_prob)
412
+ for loss in self.losses:
413
+ if loss in ['aux_semantic']:
414
+ # Only for final output.
415
+ continue
416
+ l_dict = self.get_loss(loss, aux_outputs, processed_targets)
417
+ l_dict = {k + f"_{i}": v for k, v in l_dict.items()}
418
+ losses.update(l_dict)
419
+ return losses
420
+
421
+ def __repr__(self):
422
+ head = "Criterion " + self.__class__.__name__
423
+ body = [
424
+ "matcher: {}".format(self.matcher.__repr__(_repr_indent=8)),
425
+ "losses: {}".format(self.losses),
426
+ "weight_dict: {}".format(self.weight_dict),
427
+ "num_classes: {}".format(self.num_classes),
428
+ "eos_coef: {}".format(self.eos_coef),
429
+ ]
430
+ _repr_indent = 4
431
+ lines = [head] + [" " * _repr_indent + line for line in body]
432
+ return "\n".join(lines)
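The comments in process_gt above explain how void pixels and inverse-area pixel weights are derived from the padded ground-truth masks. Below is a minimal, self-contained sketch (not part of this commit) that reproduces just those two steps on a toy batch so the tensor shapes are easy to follow; all numbers are made up for illustration.

import torch

# Toy batch: B=1 image, N=3 (padded) GT masks on a 4x4 grid.
B, N, H, W = 1, 3, 4, 4
target_masks = torch.zeros(B, N, H, W)
target_masks[0, 0, :2, :] = 1.0   # mask 0: top half
target_masks[0, 1, 2:, :2] = 1.0  # mask 1: bottom-left quarter
# mask 2 stays empty; the bottom-right quarter belongs to no mask (void).

# Pixels covered by no GT mask are "void" and are excluded from the mask losses.
pixel_gt_void_mask = (target_masks.sum(1) < 1)                              # B x H x W
# Inverse-area weights: pixels belonging to small masks get larger sampling weights.
mask_gt_area = target_masks.sum(2).sum(2)                                   # B x N
pixel_gt_area = torch.einsum('bnhw,bn->bhw', target_masks, mask_gt_area)    # B x H x W
inverse_gt_mask_area = (H * W) / torch.clamp(pixel_gt_area, min=1.0)        # B x H x W

print(pixel_gt_void_mask[0])    # True exactly on the uncovered bottom-right quarter
print(inverse_gt_mask_area[0])  # larger values on the smaller mask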
kmax_deeplab/modeling/matcher.py ADDED
@@ -0,0 +1,128 @@
1
+ # Reference: https://github.com/facebookresearch/Mask2Former/blob/main/mask2former/modeling/matcher.py
2
+ # Reference: https://github.com/google-research/deeplab2/blob/main/model/loss/max_deeplab_loss.py
3
+ # Modified by Qihang Yu
4
+
5
+ """
6
+ Modules to compute the matching cost and solve the corresponding LSAP.
7
+ """
8
+ import torch
9
+ import torch.nn.functional as F
10
+ from scipy.optimize import linear_sum_assignment
11
+ from torch import nn
12
+ from torch.cuda.amp import autocast
13
+ import numpy as np
14
+
15
+
16
+ # https://github.com/google-research/deeplab2/blob/c4a533c14fac1a1071a6d24c5379c31a69a3e5e6/model/loss/max_deeplab_loss.py#L158
17
+ @torch.no_grad()
18
+ def compute_mask_similarity(inputs: torch.Tensor, targets: torch.Tensor):
19
+ """
20
+ Compute a dice-style mask similarity (higher means more similar), used as the matching score.
21
+ Args:
22
+ inputs: A float tensor of arbitrary shape.
23
+ The predictions for each example.
24
+ targets: A float tensor with the same shape as inputs. Stores the binary
25
+ classification label for each element in inputs
26
+ (0 for the negative class and 1 for the positive class).
27
+ """
28
+ denominator_epsilon = 1e-5
29
+ inputs = F.softmax(inputs, dim=0)
30
+ inputs = inputs.flatten(1) # N x HW
31
+
32
+ pixel_gt_non_void_mask = (targets.sum(0, keepdim=True) > 0).to(inputs) # 1xHW
33
+ inputs = inputs * pixel_gt_non_void_mask
34
+
35
+ intersection = torch.einsum("nc,mc->nm", inputs, targets)
36
+ denominator = (inputs.sum(-1)[:, None] + targets.sum(-1)[None, :]) / 2.0
37
+ return intersection / (denominator + denominator_epsilon)
38
+
39
+
40
+ # https://github.com/google-research/deeplab2/blob/c4a533c14fac1a1071a6d24c5379c31a69a3e5e6/model/loss/max_deeplab_loss.py#L941
41
+ @torch.no_grad()
42
+ def compute_class_similarity(inputs: torch.Tensor, targets: torch.Tensor):
43
+ pred_class_prob = inputs.softmax(-1)[..., :-1] # exclude the void class
44
+ return pred_class_prob[:, targets]
45
+
46
+
47
+ class HungarianMatcher(nn.Module):
48
+ """This class computes an assignment between the targets and the predictions of the network
49
+
50
+ For efficiency reasons, the targets don't include the no_object. Because of this, in general,
51
+ there are more predictions than targets. In this case, we do a 1-to-1 matching of the best predictions,
52
+ while the others are un-matched (and thus treated as non-objects).
53
+ """
54
+
55
+ def __init__(self):
56
+ """Creates the matcher
57
+
58
+ Note:
59
+ Unlike the Mask2Former matcher, this matcher takes no cost weights:
60
+ the matching cost is simply the product of the predicted class probability
61
+ and the mask (dice-style) similarity, so no relative weights are needed.
62
+ """
63
+ super().__init__()
64
+
65
+ @torch.no_grad()
66
+ def memory_efficient_forward(self, outputs, targets):
67
+ """More memory-friendly matching"""
68
+ bs, num_queries = outputs["pred_logits"].shape[:2]
69
+
70
+ indices = []
71
+ matched_dice = []
72
+ matched_cls_prob = []
73
+ # Iterate through batch size
74
+ for b in range(bs):
75
+ with autocast(enabled=False):
76
+ class_similarity = compute_class_similarity(outputs["pred_logits"][b].float(), targets[b]["labels"])
77
+ out_mask = outputs["pred_masks"][b].flatten(1) # [num_queries, H_pred, W_pred]
78
+ # gt masks are already padded when preparing target
79
+ tgt_mask = targets[b]["masks"].to(out_mask).flatten(1)
80
+ with autocast(enabled=False):
81
+ mask_similarity = compute_mask_similarity(out_mask.float(), tgt_mask.float())
82
+
83
+ # Final cost matrix
84
+ C = - mask_similarity * class_similarity
85
+ C = C.reshape(num_queries, -1).cpu() # N x M , N = num_queries, M = num_gt
86
+
87
+ # the assignment will be truncated to a square matrix.
88
+ row_ind, col_ind = linear_sum_assignment(C)
89
+ matched_dice.append(mask_similarity[row_ind, col_ind].detach())
90
+ matched_cls_prob.append(class_similarity[row_ind, col_ind].detach())
91
+ indices.append((row_ind, col_ind)) # row_ind: matched query (prediction) indices; col_ind: matched GT indices
92
+
93
+ indices = [
94
+ (torch.as_tensor(i, dtype=torch.int64), torch.as_tensor(j, dtype=torch.int64))
95
+ for i, j in indices
96
+ ]
97
+
98
+ return indices, matched_dice, matched_cls_prob
99
+
100
+
101
+ @torch.no_grad()
102
+ def forward(self, outputs, targets):
103
+ """Performs the matching
104
+
105
+ Params:
106
+ outputs: This is a dict that contains at least these entries:
107
+ "pred_logits": Tensor of dim [batch_size, num_queries, num_classes] with the classification logits
108
+ "pred_masks": Tensor of dim [batch_size, num_queries, H_pred, W_pred] with the predicted masks
109
+
110
+ targets: This is a list of targets (len(targets) = batch_size), where each target is a dict containing:
111
+ "labels": Tensor of dim [num_target_boxes] (where num_target_boxes is the number of ground-truth
112
+ objects in the target) containing the class labels
113
+ "masks": Tensor of dim [num_target_boxes, H_gt, W_gt] containing the target masks
114
+
115
+ Returns:
116
+ A list of size batch_size, containing tuples of (index_i, index_j) where:
117
+ - index_i is the indices of the selected predictions (in order)
118
+ - index_j is the indices of the corresponding selected targets (in order)
119
+ For each batch element, it holds:
120
+ len(index_i) = len(index_j) = min(num_queries, num_target_boxes)
121
+ """
122
+ return self.memory_efficient_forward(outputs, targets)
123
+
124
+ def __repr__(self, _repr_indent=4):
125
+ head = "Matcher " + self.__class__.__name__
126
+ body = []
127
+ lines = [head] + [" " * _repr_indent + line for line in body]
128
+ return "\n".join(lines)
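To make the matching concrete, here is a small sketch (not part of the commit) of how the product cost above turns into a one-to-one assignment. The numbers are invented; with num_queries > num_gt, linear_sum_assignment selects one query per ground-truth mask and the remaining queries stay unmatched.

import torch
from scipy.optimize import linear_sum_assignment

# 4 queries, 2 GT masks; rows are queries, columns are GT masks.
mask_similarity = torch.tensor([[0.9, 0.1],
                                [0.2, 0.8],
                                [0.3, 0.3],
                                [0.1, 0.2]])
class_similarity = torch.tensor([[0.7, 0.2],
                                 [0.1, 0.9],
                                 [0.5, 0.5],
                                 [0.3, 0.3]])
C = -(mask_similarity * class_similarity)            # lower cost = better match
row_ind, col_ind = linear_sum_assignment(C.numpy())
row_ind, col_ind = torch.as_tensor(row_ind), torch.as_tensor(col_ind)
print(row_ind, col_ind)                              # tensor([0, 1]) tensor([0, 1]): query 0 -> GT 0, query 1 -> GT 1
print(mask_similarity[row_ind, col_ind])             # the values returned as matched_dice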
kmax_deeplab/modeling/meta_arch/__init__.py ADDED
File without changes
kmax_deeplab/modeling/meta_arch/kmax_deeplab_head.py ADDED
@@ -0,0 +1,88 @@
1
+ # Reference: https://github.com/facebookresearch/Mask2Former/blob/main/mask2former/modeling/meta_arch/mask_former_head.py
2
+ # Modified by Qihang Yu
3
+
4
+ from typing import Dict
5
+
6
+ from torch import nn
7
+ from torch.nn import functional as F
8
+
9
+ from detectron2.config import configurable
10
+ from detectron2.layers import ShapeSpec
11
+ from detectron2.modeling import SEM_SEG_HEADS_REGISTRY
12
+
13
+ from ..transformer_decoder.kmax_transformer_decoder import build_transformer_decoder
14
+
15
+
16
+ def build_pixel_decoder(cfg, input_shape):
17
+ """
18
+ Build a pixel decoder from `cfg.MODEL.KMAX_DEEPLAB.PIXEL_DEC.NAME`.
19
+ """
20
+ name = cfg.MODEL.KMAX_DEEPLAB.PIXEL_DEC.NAME
21
+ model = SEM_SEG_HEADS_REGISTRY.get(name)(cfg, input_shape)
22
+ forward_features = getattr(model, "forward_features", None)
23
+ if not callable(forward_features):
24
+ raise ValueError(
25
+ "Only SEM_SEG_HEADS with forward_features method can be used as pixel decoder. "
26
+ f"Please implement forward_features for {name} to only return mask features."
27
+ )
28
+ return model
29
+
30
+
31
+ @SEM_SEG_HEADS_REGISTRY.register()
32
+ class kMaXDeepLabHead(nn.Module):
33
+
34
+ @configurable
35
+ def __init__(
36
+ self,
37
+ input_shape: Dict[str, ShapeSpec],
38
+ *,
39
+ num_classes: int,
40
+ pixel_decoder: nn.Module,
41
+ loss_weight: float = 1.0,
42
+ ignore_value: int = -1,
43
+ transformer_predictor: nn.Module,
44
+ ):
45
+ """
46
+ NOTE: this interface is experimental.
47
+ Args:
48
+ input_shape: shapes (channels and stride) of the input features
49
+ num_classes: number of classes to predict
50
+ pixel_decoder: the pixel decoder module
51
+ loss_weight: loss weight
52
+ ignore_value: category id to be ignored during training.
53
+ transformer_predictor: the transformer decoder that makes prediction
54
+ transformer_in_feature: input feature name to the transformer_predictor
55
+ """
56
+ super().__init__()
57
+ input_shape = sorted(input_shape.items(), key=lambda x: x[1].stride)
58
+ self.in_features = [k for k, v in input_shape]
59
+
60
+ self.ignore_value = ignore_value
61
+ self.common_stride = 4
62
+ self.loss_weight = loss_weight
63
+
64
+ self.pixel_decoder = pixel_decoder
65
+ self.predictor = transformer_predictor
66
+
67
+ self.num_classes = num_classes
68
+
69
+ @classmethod
70
+ def from_config(cls, cfg, input_shape: Dict[str, ShapeSpec]):
71
+ return {
72
+ "input_shape": {
73
+ k: v for k, v in input_shape.items() if k in cfg.MODEL.KMAX_DEEPLAB.PIXEL_DEC.IN_FEATURES
74
+ },
75
+ "ignore_value": cfg.MODEL.SEM_SEG_HEAD.IGNORE_VALUE,
76
+ "num_classes": cfg.MODEL.SEM_SEG_HEAD.NUM_CLASSES,
77
+ "pixel_decoder": build_pixel_decoder(cfg, input_shape),
78
+ "loss_weight": cfg.MODEL.SEM_SEG_HEAD.LOSS_WEIGHT,
79
+ "transformer_predictor": build_transformer_decoder(cfg, input_shape),
80
+ }
81
+
82
+ def forward(self, features):
83
+ return self.layers(features)
84
+
85
+ def layers(self, features):
86
+ panoptic_features, semantic_features, multi_scale_features = self.pixel_decoder.forward_features(features)
87
+ predictions = self.predictor(multi_scale_features, panoptic_features, semantic_features)
88
+ return predictions
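For context, kMaXDeepLabHead is looked up through detectron2's SEM_SEG_HEADS_REGISTRY, just like the pixel decoder above. A minimal sketch of building the head from a config, assuming cfg.MODEL.SEM_SEG_HEAD.NAME is set to "kMaXDeepLabHead" in the yaml configs (this mirrors what detectron2's build_sem_seg_head does internally):

from detectron2.modeling import SEM_SEG_HEADS_REGISTRY

def build_head(cfg, input_shape):
    # Same registry lookup that detectron2's build_sem_seg_head performs.
    name = cfg.MODEL.SEM_SEG_HEAD.NAME          # e.g. "kMaXDeepLabHead" (assumed)
    return SEM_SEG_HEADS_REGISTRY.get(name)(cfg, input_shape)

The returned head then exposes forward(features) -> predictions as defined above.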
kmax_deeplab/modeling/pixel_decoder/__init__.py ADDED
File without changes
kmax_deeplab/modeling/pixel_decoder/kmax_pixel_decoder.py ADDED
@@ -0,0 +1,370 @@
1
+ # Reference: https://github.com/google-research/deeplab2/blob/main/model/pixel_decoder/kmax.py
2
+ # Modified by Qihang Yu
3
+
4
+ from typing import Dict, List
5
+
6
+ import torch
7
+ from torch import nn
8
+ from torch.nn import functional as F
9
+
10
+ from timm.models.layers import DropPath
11
+ from timm.models.layers import trunc_normal_tf_ as trunc_normal_
12
+
13
+ from detectron2.config import configurable
14
+ from detectron2.layers import ShapeSpec
15
+ from detectron2.modeling import SEM_SEG_HEADS_REGISTRY
16
+ from torch.cuda.amp import autocast
17
+
18
+ from ..backbone.convnext import LayerNorm
19
+
20
+ import math
21
+
22
+
23
+ def get_activation(name):
24
+ if name is None or name.lower() == 'none':
25
+ return nn.Identity()
26
+ if name == 'relu':
27
+ return nn.ReLU()
28
+ elif name == 'gelu':
29
+ return nn.GELU()
30
+
31
+
32
+ def get_norm(name, channels):
33
+ if name is None or name.lower() == 'none':
34
+ return nn.Identity()
35
+
36
+ if name.lower() == 'syncbn':
37
+ return nn.SyncBatchNorm(channels, eps=1e-3, momentum=0.01)
38
+
39
+
40
+ class ConvBN(nn.Module):
41
+ def __init__(self, in_channels, out_channels, kernel_size, stride=1, padding=0, dilation=1, groups=1, bias=True, norm=None, act=None,
42
+ conv_type='2d', conv_init='he_normal', norm_init=1.0):
43
+ super().__init__()
44
+
45
+ if conv_type == '2d':
46
+ self.conv = nn.Conv2d(in_channels, out_channels, kernel_size=kernel_size, stride=stride, padding=padding, dilation=dilation, groups=groups, bias=bias)
47
+ elif conv_type == '1d':
48
+ self.conv = nn.Conv1d(in_channels, out_channels, kernel_size=kernel_size, stride=stride, padding=padding, dilation=dilation, groups=groups, bias=bias)
49
+
50
+ self.norm = get_norm(norm, out_channels)
51
+ self.act = get_activation(act)
52
+
53
+ if conv_init == 'normal':
54
+ nn.init.normal_(self.conv.weight, std=.02)
55
+ elif conv_init == 'trunc_normal':
56
+ trunc_normal_(self.conv.weight, std=.02)
57
+ elif conv_init == 'he_normal':
58
+ # https://www.tensorflow.org/api_docs/python/tf/keras/initializers/HeNormal
59
+ trunc_normal_(self.conv.weight, std=math.sqrt(2.0 / in_channels))
60
+ elif conv_init == 'xavier_uniform':
61
+ nn.init.xavier_uniform_(self.conv.weight)
62
+ if bias:
63
+ nn.init.zeros_(self.conv.bias)
64
+
65
+ if norm is not None:
66
+ nn.init.constant_(self.norm.weight, norm_init)
67
+
68
+ def forward(self, x):
69
+ return self.act(self.norm(self.conv(x)))
70
+
71
+
72
+ MAX_SPAN = 255
73
+ def _compute_relative_distance_matrix(query_length, key_length):
74
+ if (key_length - query_length) % 2:
75
+ raise ValueError('Key_length should be query_length + 2 * memory_flange.')
76
+ key_index = torch.arange(key_length)
77
+ query_index = torch.arange(query_length) + (key_length - query_length) // 2
78
+ distance_matrix = key_index[None, :] - query_index[:, None]
79
+ # Shift the distance_matrix so that it is >= 0. Each entry of the
81
+ # shifted distance_matrix then indexes a relative positional embedding.
81
+ distance_matrix = distance_matrix + MAX_SPAN - 1
82
+ return distance_matrix
83
+
84
+
85
+ class RelativePositionalEncoding(nn.Module):
86
+ def __init__(self, query_length, key_length, depth):
87
+ super().__init__()
88
+ self._embeddings = nn.Embedding(MAX_SPAN * 2 - 1, depth)
89
+ trunc_normal_(self._embeddings.weight, std=1.0)
90
+ self._relative_distance_matrix = _compute_relative_distance_matrix(query_length, key_length)
91
+ self.query_length = query_length
92
+ self.key_length = key_length
93
+ self.depth = depth
94
+
95
+ def forward(self):
96
+ return self._embeddings.weight[self._relative_distance_matrix.reshape(-1)].reshape(self.query_length, self.key_length, self.depth)
97
+
98
+
99
+ # https://github.com/google-research/deeplab2/blob/main/model/layers/axial_layers.py#L36
100
+ class AxialAttention(nn.Module):
101
+ def __init__(self, in_planes, query_shape=56, total_key_depth=512, total_value_depth=1024, num_heads=8):
102
+ assert (total_key_depth % num_heads == 0) and (total_value_depth % num_heads == 0)
103
+ super().__init__()
104
+ self._in_planes = in_planes
105
+ self._query_shape = query_shape
106
+ self._total_key_depth = total_key_depth
107
+ self._total_value_depth = total_value_depth
108
+ self._num_heads = num_heads
109
+ self._key_depth_per_head = total_key_depth // num_heads
110
+
111
+ self.qkv_transform = ConvBN(in_planes, self._total_key_depth * 2 + self._total_value_depth, kernel_size=1, stride=1,
112
+ padding=0, bias=False, norm=None, act=None, conv_type='1d')
113
+ trunc_normal_(self.qkv_transform.conv.weight, std=in_planes ** -0.5)
114
+
115
+ self._query_rpe = RelativePositionalEncoding(query_shape, query_shape, self._key_depth_per_head)
116
+ self._key_rpe = RelativePositionalEncoding(query_shape, query_shape, self._key_depth_per_head)
117
+ self._value_rpe = RelativePositionalEncoding(query_shape, query_shape, total_value_depth // num_heads)
118
+
119
+ self._batch_norm_qkv = get_norm('syncbn', self._total_key_depth * 2 + self._total_value_depth)
120
+ self._batch_norm_similarity = get_norm('syncbn', num_heads * 3)
121
+ self._batch_norm_retrieved_output = get_norm('syncbn', self._total_value_depth * 2)
122
+
123
+
124
+ def forward(self, x):
125
+ N, C, L = x.shape
126
+ qkv = self._batch_norm_qkv(self.qkv_transform(x))
127
+ q, k, v = torch.split(qkv, [self._total_key_depth, self._total_key_depth, self._total_value_depth], dim=1)
128
+ q = q.reshape(N, self._num_heads, self._total_key_depth // self._num_heads, L)
129
+ k = k.reshape(N, self._num_heads, self._total_key_depth // self._num_heads, L)
130
+ v = v.reshape(N, self._num_heads, self._total_value_depth // self._num_heads, L)
131
+
132
+ similarity_logits = []
133
+ content_similarity = torch.einsum('bhdl,bhdm->bhlm', q, k)
134
+ query_rpe = self._query_rpe()
135
+ query_rpe_similarity = torch.einsum('bhdl,lmd->bhlm', q, query_rpe)
136
+ key_rpe = self._key_rpe()
137
+ key_rpe_similarity = torch.einsum('bhdm,lmd->bhlm', k, key_rpe)
138
+ similarity_logits = torch.cat([content_similarity, query_rpe_similarity, key_rpe_similarity], dim=1)
139
+ similarity_logits = self._batch_norm_similarity(similarity_logits).reshape(N, 3, self._num_heads, L, L).sum(dim=1)
140
+
141
+ with autocast(enabled=False):
142
+ weights = F.softmax(similarity_logits.float(), dim=-1)
143
+
144
+ retrieved_content = torch.einsum('bhlm,bhdm->bhdl', weights, v)
145
+ value_rpe = self._value_rpe()
146
+ retrieved_rpe = torch.einsum('bhlm,lmd->bhdl', weights, value_rpe)
147
+
148
+ retrieved_output = torch.cat([retrieved_content, retrieved_rpe], dim=1).reshape(N, 2*self._total_value_depth, L)
149
+ retrieved_output = self._batch_norm_retrieved_output(retrieved_output).reshape(N, 2, self._total_value_depth, L).sum(1)
150
+
151
+ return retrieved_output
152
+
153
+
154
+ # https://github.com/google-research/deeplab2/blob/main/model/layers/axial_layers.py#L316
155
+ class AxialAttention2D(nn.Module):
156
+ def __init__(self, in_planes, query_shape=[56, 56], filters=512, key_expansion=1, value_expansion=2, num_heads=8):
157
+ super().__init__()
158
+ total_key_depth = int(round(filters * key_expansion))
159
+ total_value_depth = int(round(filters * value_expansion))
160
+ self._total_key_depth = total_key_depth
161
+ self._total_value_depth = total_value_depth
162
+ self._height_axis = AxialAttention(
163
+ in_planes=in_planes,
164
+ query_shape=query_shape[0],
165
+ total_key_depth=total_key_depth,
166
+ total_value_depth=total_value_depth,
167
+ num_heads=num_heads)
168
+ self._width_axis = AxialAttention(
169
+ in_planes=total_value_depth,
170
+ query_shape=query_shape[1],
171
+ total_key_depth=total_key_depth,
172
+ total_value_depth=total_value_depth,
173
+ num_heads=num_heads)
174
+
175
+ def forward(self, x):
176
+ # N C H W -> N W C H
177
+ N, C, H, W = x.shape
178
+ x = x.permute(0, 3, 1, 2).contiguous()
179
+ x = x.reshape(N*W, C, H)
180
+ x = self._height_axis(x)
181
+ # N W C H -> N H C W
182
+ x = x.reshape(N, W, self._total_value_depth, H).permute(0, 3, 2, 1).contiguous()
183
+ x = x.reshape(N*H, self._total_value_depth, W)
184
+ x = self._width_axis(x)
185
+ x = x.reshape(N, H, self._total_value_depth, W).permute(0, 2, 1, 3).contiguous()
186
+ x = x.reshape(N, self._total_value_depth, H, W)
187
+ return x
188
+
189
+
190
+ # https://github.com/google-research/deeplab2/blob/main/model/layers/axial_blocks.py#L36
191
+ class SingleBlock(nn.Module):
192
+
193
+ def __init__(self, inplanes, filter_list, block_type, query_shape=[56, 56], key_expansion=1, value_expansion=2, num_heads=8, drop_path_prob=0.0):
194
+ super(SingleBlock, self).__init__()
195
+ self._block_type = block_type.lower()
196
+ self._filter_list = filter_list
197
+ self._conv1_bn_act = ConvBN(inplanes, self._filter_list[0], kernel_size=1, bias=False, norm='syncbn', act='gelu')
198
+ if self._block_type == 'axial':
199
+ self._attention = AxialAttention2D(in_planes=self._filter_list[0], query_shape=query_shape, filters=self._filter_list[1],
200
+ key_expansion=key_expansion, value_expansion=value_expansion, num_heads=num_heads)
201
+ output_channel = filter_list[1] * value_expansion
202
+ elif self._block_type == 'bottleneck':
203
+ self._conv2_bn_act = ConvBN(self._filter_list[0], self._filter_list[1], kernel_size=3, padding=1, bias=False, norm='syncbn', act='gelu')
204
+ output_channel = filter_list[1]
205
+ self._conv3_bn = ConvBN(output_channel, self._filter_list[2], kernel_size=1, bias=False, norm='syncbn', act=None, norm_init=0.0)
206
+
207
+ self._shortcut = None
208
+ if inplanes != self._filter_list[-1]:
209
+ self._shortcut = ConvBN(inplanes, self._filter_list[-1], kernel_size=1, bias=False, norm='syncbn', act=None)
210
+ self.drop_path = DropPath(drop_path_prob) if drop_path_prob > 0. else nn.Identity()
211
+
212
+ def forward(self, x):
213
+ x = F.gelu(x)
214
+
215
+ shortcut = x
216
+ if self._shortcut is not None:
217
+ shortcut = self._shortcut(shortcut)
218
+
219
+ x = self._conv1_bn_act(x)
220
+ if self._block_type == 'axial':
221
+ x = self._attention(x)
222
+ x = F.gelu(x)
223
+ elif self._block_type == 'bottleneck':
224
+ x = self._conv2_bn_act(x)
225
+ x = self._conv3_bn(x)
226
+
227
+ x = self.drop_path(x) + shortcut
228
+
229
+ return x
230
+
231
+
232
+ # https://github.com/google-research/deeplab2/blob/main/model/layers/axial_block_groups.py#L42
233
+ class BlockGroup(nn.Module):
234
+ def __init__(self, inplanes, base_filter, num_blocks, block_type, **kwargs):
235
+ super().__init__()
236
+ self._num_blocks = num_blocks
237
+ block_type = block_type.lower()
238
+ if block_type == 'axial':
239
+ # https://github.com/google-research/deeplab2/blob/main/model/layers/axial_block_groups.py#L247
240
+ filter_list = [base_filter * 2, base_filter, base_filter * 4]
241
+ elif block_type == 'bottleneck':
242
+ # https://github.com/google-research/deeplab2/blob/main/model/layers/axial_block_groups.py#L250
243
+ filter_list = [base_filter, base_filter, base_filter * 4]
244
+
245
+ self._blocks = nn.ModuleList()
246
+ for i in range(num_blocks):
247
+ self._blocks.append(SingleBlock(inplanes=inplanes, filter_list=filter_list, block_type=block_type, **kwargs))
248
+ inplanes = filter_list[-1]
249
+
250
+ def forward(self, x):
251
+ for i in range(self._num_blocks):
252
+ x = self._blocks[i](x)
253
+ return x
254
+
255
+
256
+ # https://github.com/google-research/deeplab2/blob/7a01a7165e97b3325ad7ea9b6bcc02d67fecd07a/model/layers/resized_fuse.py#L31
257
+ class ResizedFuse(nn.Module):
258
+ def __init__(self, low_in_channels, high_in_channels, out_channels):
259
+ super().__init__()
260
+ self.low_in_channels = low_in_channels
261
+ self.high_in_channels = high_in_channels
262
+ self.out_channels = out_channels
263
+ if low_in_channels != out_channels:
264
+ self._conv_bn_low = ConvBN(low_in_channels, out_channels, kernel_size=1, bias=False, norm='syncbn', act=None)
265
+ if high_in_channels != out_channels:
266
+ self._conv_bn_high = ConvBN(high_in_channels, out_channels, kernel_size=1, bias=False, norm='syncbn', act=None)
267
+
268
+ def forward(self, lowres_x, highres_x):
269
+
270
+ align_corners = (lowres_x.shape[-1] % 2 == 1)
271
+ if self.low_in_channels != self.out_channels:
272
+ lowres_x = F.gelu(lowres_x)
273
+ lowres_x = self._conv_bn_low(lowres_x)
274
+ lowres_x = F.interpolate(lowres_x, size=highres_x.shape[2:], mode='bilinear', align_corners=align_corners)
275
+ else:
276
+ lowres_x = F.interpolate(lowres_x, size=highres_x.shape[2:], mode='bilinear', align_corners=align_corners)
277
+
278
+ if self.high_in_channels != self.out_channels:
279
+ highres_x = F.gelu(highres_x)
280
+ highres_x = self._conv_bn_high(highres_x)
281
+
282
+ return lowres_x + highres_x
283
+
284
+
285
+ @SEM_SEG_HEADS_REGISTRY.register()
286
+ class kMaXPixelDecoder(nn.Module):
287
+ @configurable
288
+ def __init__(
289
+ self,
290
+ input_shape: Dict[str, ShapeSpec],
291
+ *,
292
+ dec_layers: List[int],
293
+ dec_channels: List[int],
294
+ layer_types: List[str],
295
+ drop_path_prob: float,
296
+ spatial_shape: List[int],
297
+ ):
298
+ """
299
+ NOTE: this interface is experimental.
300
+ Args:
301
+ """
302
+ super().__init__()
303
+ self.num_stages = len(input_shape)
304
+ assert self.num_stages == len(dec_layers) and self.num_stages == len(dec_channels) and self.num_stages == len(layer_types)
305
+ # For now, we hard code all hyper-parameters.
306
+ block_types = ['axial', 'axial', 'bottleneck', 'bottleneck']
307
+ input_shape = sorted(input_shape.items(), key=lambda x: -x[1].stride)
308
+ self.in_features = [k for k, v in input_shape] # starting from "res5" to "res2"
309
+ in_channels = [v.channels for k, v in input_shape]
310
+
311
+ add_one = (spatial_shape[0] % 2, spatial_shape[1] % 2)
312
+ query_shape = [
313
+ (spatial_shape[0]//32+add_one[0], spatial_shape[1]//32+add_one[1]),
314
+ (spatial_shape[0]//16+add_one[0], spatial_shape[1]//16+add_one[1]),
315
+ (spatial_shape[0]//8+add_one[0], spatial_shape[1]//8+add_one[1]),
316
+ (spatial_shape[0]//4+add_one[0], spatial_shape[1]//4+add_one[1])]
317
+
318
+ self._in_norms = nn.ModuleList()
319
+ self._stages = nn.ModuleList()
320
+ self._resized_fuses = nn.ModuleList()
321
+
322
+ for i in range(self.num_stages):
323
+ self._in_norms.append(LayerNorm(in_channels[i], data_format="channels_first"))
324
+ inplanes = in_channels[i] if i == 0 else dec_channels[i]
325
+ self._stages.append(BlockGroup(inplanes=inplanes,
326
+ base_filter=dec_channels[i], num_blocks=dec_layers[i], block_type=block_types[i],
327
+ query_shape=query_shape[i], key_expansion=1, value_expansion=2, num_heads=8, drop_path_prob=0.0))
328
+
329
+ if i > 0:
330
+ self._resized_fuses.append(ResizedFuse(
331
+ low_in_channels=dec_channels[i-1] * 4,
332
+ high_in_channels=in_channels[i],
333
+ out_channels=dec_channels[i]))
334
+
335
+
336
+ @classmethod
337
+ def from_config(cls, cfg, input_shape: Dict[str, ShapeSpec]):
338
+ ret = {}
339
+ ret["input_shape"] = {
340
+ k: v for k, v in input_shape.items() if k in cfg.MODEL.KMAX_DEEPLAB.PIXEL_DEC.IN_FEATURES
341
+ }
342
+ ret["dec_layers"] = cfg.MODEL.KMAX_DEEPLAB.PIXEL_DEC.DEC_LAYERS
343
+ ret["dec_channels"] = cfg.MODEL.KMAX_DEEPLAB.PIXEL_DEC.DEC_CHANNELS
344
+ ret["layer_types"] = cfg.MODEL.KMAX_DEEPLAB.PIXEL_DEC.LAYER_TYPES
345
+ ret["drop_path_prob"] = cfg.MODEL.KMAX_DEEPLAB.PIXEL_DEC.DROP_PATH_PROB
346
+ ret["spatial_shape"] = cfg.INPUT.IMAGE_SIZE # We expect the height == width
347
+ return ret
348
+
349
+
350
+ def forward_features(self, features):
351
+ out = []
352
+ multi_scale_features = []
353
+
354
+ x = self._in_norms[0](features[self.in_features[0]])
355
+
356
+ for idx in range(self.num_stages - 1):
357
+ x = self._stages[idx](x)
358
+ out.append(x)
359
+ x = self._resized_fuses[idx](
360
+ lowres_x=x,
361
+ highres_x=self._in_norms[idx+1](features[self.in_features[idx+1]]))
362
+
363
+ x = self._stages[-1](x)
364
+ out.append(x)
365
+ multi_scale_features = out[:3] # OS32, 16, 8, they are used for kmax_transformer_decoder.
366
+ panoptic_features = out[-1] # OS4, it is used for final mask prediction.
367
+ # OS 32, 8, 4
368
+ semantic_features = [features[self.in_features[0]], features[self.in_features[2]], features[self.in_features[3]]]
369
+ return panoptic_features, semantic_features, multi_scale_features
370
+
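The relative positional encoding above indexes a shared embedding table by relative offset. A tiny sketch (not part of the commit) of what _compute_relative_distance_matrix returns for a short axis:

import torch

MAX_SPAN = 255
def compute_relative_distance_matrix(query_length, key_length):
    key_index = torch.arange(key_length)
    query_index = torch.arange(query_length) + (key_length - query_length) // 2
    return key_index[None, :] - query_index[:, None] + MAX_SPAN - 1

print(compute_relative_distance_matrix(4, 4))
# tensor([[254, 255, 256, 257],
#         [253, 254, 255, 256],
#         [252, 253, 254, 255],
#         [251, 252, 253, 254]])
# Positions at the same relative offset share the same row of the
# (2 * MAX_SPAN - 1)-row embedding table, which is what AxialAttention
# relies on for its query/key/value relative-position terms.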
kmax_deeplab/modeling/transformer_decoder/__init__.py ADDED
@@ -0,0 +1 @@
1
+ from .kmax_transformer_decoder import kMaXTransformerDecoder
kmax_deeplab/modeling/transformer_decoder/kmax_transformer_decoder.py ADDED
@@ -0,0 +1,453 @@
1
+ # Reference: https://github.com/google-research/deeplab2/blob/main/model/transformer_decoder/kmax.py
2
+ # Modified by Qihang Yu
3
+
4
+ from typing import List
5
+ import torch
6
+ from torch import nn
7
+ from torch.nn import functional as F
8
+ from torch.cuda.amp import autocast
9
+
10
+ from timm.models.layers import DropPath
11
+ from timm.models.layers import trunc_normal_tf_ as trunc_normal_
12
+
13
+ from detectron2.config import configurable
14
+ from detectron2.utils.registry import Registry
15
+
16
+ from ..pixel_decoder.kmax_pixel_decoder import get_norm, ConvBN
17
+
18
+ import math
19
+
20
+
21
+ TRANSFORMER_DECODER_REGISTRY = Registry("TRANSFORMER_MODULE")
22
+ TRANSFORMER_DECODER_REGISTRY.__doc__ = """
23
+ Registry for transformer module.
24
+ """
25
+ def build_transformer_decoder(cfg, input_shape_from_backbone):
26
+ """
27
+ Build a transformer decoder from `cfg.MODEL.KMAX_DEEPLAB.TRANS_DEC.NAME`.
28
+ """
29
+ name = cfg.MODEL.KMAX_DEEPLAB.TRANS_DEC.NAME
30
+ return TRANSFORMER_DECODER_REGISTRY.get(name)(cfg, input_shape_from_backbone)
31
+
32
+
33
+ # https://github.com/google-research/deeplab2/blob/7a01a7165e97b3325ad7ea9b6bcc02d67fecd07a/model/decoder/max_deeplab.py#L60
34
+ def add_bias_towards_void(query_class_logits, void_prior_prob=0.9):
35
+ class_logits_shape = query_class_logits.shape
36
+ init_bias = [0.0] * class_logits_shape[-1]
37
+ init_bias[-1] = math.log(
38
+ (class_logits_shape[-1] - 1) * void_prior_prob / (1 - void_prior_prob))
39
+ return query_class_logits + torch.tensor(init_bias, dtype=query_class_logits.dtype).to(query_class_logits)
40
+
41
+
42
+ # https://github.com/google-research/deeplab2/blob/7a01a7165e97b3325ad7ea9b6bcc02d67fecd07a/model/layers/dual_path_transformer.py#L41
43
+ class AttentionOperation(nn.Module):
44
+ def __init__(self, channels_v, num_heads):
45
+ super().__init__()
46
+ self._batch_norm_similarity = get_norm('syncbn', num_heads)
47
+ self._batch_norm_retrieved_value = get_norm('syncbn', channels_v)
48
+
49
+ def forward(self, query, key, value):
50
+ N, _, _, L = query.shape
51
+ _, num_heads, C, _ = value.shape
52
+ similarity_logits = torch.einsum('bhdl,bhdm->bhlm', query, key)
53
+ similarity_logits = self._batch_norm_similarity(similarity_logits)
54
+
55
+ with autocast(enabled=False):
56
+ attention_weights = F.softmax(similarity_logits.float(), dim=-1)
57
+ retrieved_value = torch.einsum(
58
+ 'bhlm,bhdm->bhdl', attention_weights, value)
59
+ retrieved_value = retrieved_value.reshape(N, num_heads * C, L)
60
+ retrieved_value = self._batch_norm_retrieved_value(
61
+ retrieved_value)
62
+ retrieved_value = F.gelu(retrieved_value)
63
+ return retrieved_value
64
+
65
+
66
+ # https://github.com/google-research/deeplab2/blob/main/model/kmax_deeplab.py#L32
67
+ class kMaXPredictor(nn.Module):
68
+ def __init__(self, in_channel_pixel, in_channel_query, num_classes=133+1):
69
+ super().__init__()
70
+ self._pixel_space_head_conv0bnact = ConvBN(in_channel_pixel, in_channel_pixel, kernel_size=5, groups=in_channel_pixel, padding=2, bias=False,
71
+ norm='syncbn', act='gelu', conv_init='xavier_uniform')
72
+ self._pixel_space_head_conv1bnact = ConvBN(in_channel_pixel, 256, kernel_size=1, bias=False, norm='syncbn', act='gelu')
73
+ self._pixel_space_head_last_convbn = ConvBN(256, 128, kernel_size=1, bias=True, norm='syncbn', act=None)
74
+ trunc_normal_(self._pixel_space_head_last_convbn.conv.weight, std=0.01)
75
+
76
+ self._transformer_mask_head = ConvBN(256, 128, kernel_size=1, bias=False, norm='syncbn', act=None, conv_type='1d')
77
+ self._transformer_class_head = ConvBN(256, num_classes, kernel_size=1, norm=None, act=None, conv_type='1d')
78
+ trunc_normal_(self._transformer_class_head.conv.weight, std=0.01)
79
+
80
+ self._pixel_space_mask_batch_norm = get_norm('syncbn', channels=1)
81
+ nn.init.constant_(self._pixel_space_mask_batch_norm.weight, 0.1)
82
+
83
+
84
+ def forward(self, mask_embeddings, class_embeddings, pixel_feature):
85
+ # mask_embeddings/class_embeddings: B x C x N
86
+ # pixel feature: B x C x H x W
87
+ pixel_space_feature = self._pixel_space_head_conv0bnact(pixel_feature)
88
+ pixel_space_feature = self._pixel_space_head_conv1bnact(pixel_space_feature)
89
+ pixel_space_feature = self._pixel_space_head_last_convbn(pixel_space_feature)
90
+ pixel_space_normalized_feature = F.normalize(pixel_space_feature, p=2, dim=1)
91
+
92
+ cluster_class_logits = self._transformer_class_head(class_embeddings).permute(0, 2, 1).contiguous()
93
+ cluster_class_logits = add_bias_towards_void(cluster_class_logits)
94
+ cluster_mask_kernel = self._transformer_mask_head(mask_embeddings)
95
+ mask_logits = torch.einsum('bchw,bcn->bnhw',
96
+ pixel_space_normalized_feature, cluster_mask_kernel)
97
+
98
+ mask_logits = self._pixel_space_mask_batch_norm(mask_logits.unsqueeze(dim=1)).squeeze(dim=1)
99
+
100
+
101
+ return {
102
+ 'class_logits': cluster_class_logits,
103
+ 'mask_logits': mask_logits,
104
+ 'pixel_feature': pixel_space_normalized_feature}
105
+
106
+
107
+ # https://github.com/google-research/deeplab2/blob/7a01a7165e97b3325ad7ea9b6bcc02d67fecd07a/model/layers/dual_path_transformer.py#L107
108
+ class kMaXTransformerLayer(nn.Module):
109
+ def __init__(
110
+ self,
111
+ num_classes=133,
112
+ in_channel_pixel=2048,
113
+ in_channel_query=256,
114
+ base_filters=128,
115
+ num_heads=8,
116
+ bottleneck_expansion=2,
117
+ key_expansion=1,
118
+ value_expansion=2,
119
+ drop_path_prob=0.0,
120
+ ):
121
+ super().__init__()
122
+
123
+ self._num_classes = num_classes
124
+ self._num_heads = num_heads
125
+ self._bottleneck_channels = int(round(base_filters * bottleneck_expansion))
126
+ self._total_key_depth = int(round(base_filters * key_expansion))
127
+ self._total_value_depth = int(round(base_filters * value_expansion))
128
+
129
+ # Per the TF2 implementation, the same drop path prob is applied to:
130
+ # 1. k-means update for object query
131
+ # 2. self/cross-attention for object query
132
+ # 3. ffn for object query
133
+ self.drop_path_kmeans = DropPath(drop_path_prob) if drop_path_prob > 0. else nn.Identity()
134
+ self.drop_path_attn = DropPath(drop_path_prob) if drop_path_prob > 0. else nn.Identity()
135
+ self.drop_path_ffn = DropPath(drop_path_prob) if drop_path_prob > 0. else nn.Identity()
136
+
137
+ initialization_std = self._bottleneck_channels ** -0.5
138
+ self._query_conv1_bn_act = ConvBN(in_channel_query, self._bottleneck_channels, kernel_size=1, bias=False,
139
+ norm='syncbn', act='gelu', conv_type='1d')
140
+
141
+ self._pixel_conv1_bn_act = ConvBN(in_channel_pixel, self._bottleneck_channels, kernel_size=1, bias=False,
142
+ norm='syncbn', act='gelu')
143
+
144
+ self._query_qkv_conv_bn = ConvBN(self._bottleneck_channels, self._total_key_depth * 2 + self._total_value_depth, kernel_size=1, bias=False,
145
+ norm='syncbn', act=None, conv_type='1d')
146
+ trunc_normal_(self._query_qkv_conv_bn.conv.weight, std=initialization_std)
147
+
148
+ self._pixel_v_conv_bn = ConvBN(self._bottleneck_channels, self._total_value_depth, kernel_size=1, bias=False,
149
+ norm='syncbn', act=None)
150
+ trunc_normal_(self._pixel_v_conv_bn.conv.weight, std=initialization_std)
151
+
152
+ self._query_self_attention = AttentionOperation(channels_v=self._total_value_depth, num_heads=num_heads)
153
+
154
+ self._query_conv3_bn = ConvBN(self._total_value_depth, in_channel_query, kernel_size=1, bias=False,
155
+ norm='syncbn', act=None, conv_type='1d', norm_init=0.0)
156
+
157
+ self._query_ffn_conv1_bn_act = ConvBN(in_channel_query, 2048, kernel_size=1, bias=False,
158
+ norm='syncbn', act='gelu', conv_type='1d')
159
+ self._query_ffn_conv2_bn = ConvBN(2048, in_channel_query, kernel_size=1, bias=False,
160
+ norm='syncbn', act=None, conv_type='1d', norm_init=0.0)
161
+
162
+ self._predcitor = kMaXPredictor(in_channel_pixel=self._bottleneck_channels,
163
+ in_channel_query=self._bottleneck_channels, num_classes=num_classes)
164
+ self._kmeans_query_batch_norm_retrieved_value = get_norm('syncbn', self._total_value_depth)
165
+ self._kmeans_query_conv3_bn = ConvBN(self._total_value_depth, in_channel_query, kernel_size=1, bias=False,
166
+ norm='syncbn', act=None, conv_type='1d', norm_init=0.0)
167
+
168
+
169
+ def forward(self, pixel_feature, query_feature):
170
+ N, C, H, W = pixel_feature.shape
171
+ _, D, L = query_feature.shape
172
+ pixel_space = self._pixel_conv1_bn_act(F.gelu(pixel_feature)) # N C H W
173
+ query_space = self._query_conv1_bn_act(query_feature) # N x C x L
174
+
175
+ # k-means cross-attention.
176
+ pixel_value = self._pixel_v_conv_bn(pixel_space) # N C H W
177
+ pixel_value = pixel_value.reshape(N, self._total_value_depth, H*W)
178
+ # k-means assignment.
179
+ prediction_result = self._predcitor(
180
+ mask_embeddings=query_space, class_embeddings=query_space, pixel_feature=pixel_space)
181
+ clustering_result = prediction_result['mask_logits'].flatten(2).detach() # N L HW
182
+
183
+ with torch.no_grad():
184
+ clustering_result = prediction_result['mask_logits'].flatten(2).detach() # N L HW
185
+ index = clustering_result.max(1, keepdim=True)[1]
186
+ clustering_result = torch.zeros_like(clustering_result, memory_format=torch.legacy_contiguous_format).scatter_(1, index, 1.0)
187
+
188
+ with autocast(enabled=False):
189
+ # k-means update.
190
+ kmeans_update = torch.einsum('blm,bdm->bdl', clustering_result.float(), pixel_value.float()) # N x C x L
191
+
192
+ kmeans_update = self._kmeans_query_batch_norm_retrieved_value(kmeans_update)
193
+ kmeans_update = self._kmeans_query_conv3_bn(kmeans_update)
194
+ query_feature = query_feature + self.drop_path_kmeans(kmeans_update)
195
+
196
+ # query self-attention.
197
+ query_qkv = self._query_qkv_conv_bn(query_space)
198
+ query_q, query_k, query_v = torch.split(query_qkv,
199
+ [self._total_key_depth, self._total_key_depth, self._total_value_depth], dim=1)
200
+ query_q = query_q.reshape(N, self._num_heads, self._total_key_depth//self._num_heads, L)
201
+ query_k = query_k.reshape(N, self._num_heads, self._total_key_depth//self._num_heads, L)
202
+ query_v = query_v.reshape(N, self._num_heads, self._total_value_depth//self._num_heads, L)
203
+ self_attn_update = self._query_self_attention(query_q, query_k, query_v)
204
+ self_attn_update = self._query_conv3_bn(self_attn_update)
205
+ query_feature = query_feature + self.drop_path_attn(self_attn_update)
206
+ query_feature = F.gelu(query_feature)
207
+
208
+ # FFN.
209
+ ffn_update = self._query_ffn_conv1_bn_act(query_feature)
210
+ ffn_update = self._query_ffn_conv2_bn(ffn_update)
211
+ query_feature = query_feature + self.drop_path_ffn(ffn_update)
212
+ query_feature = F.gelu(query_feature)
213
+
214
+ return query_feature, prediction_result
215
+
216
+
217
+ class ASPP(nn.Module):
218
+ def __init__(self, in_channels, output_channels, atrous_rates):
219
+ super().__init__()
220
+
221
+ self._aspp_conv0 = ConvBN(in_channels, output_channels, kernel_size=1, bias=False,
222
+ norm='syncbn', act='gelu')
223
+
224
+ rate1, rate2, rate3 = atrous_rates
225
+ self._aspp_conv1 = ConvBN(in_channels, output_channels, kernel_size=3, dilation=rate1, padding=rate1, bias=False,
226
+ norm='syncbn', act='gelu')
227
+
228
+ self._aspp_conv2 = ConvBN(in_channels, output_channels, kernel_size=3, dilation=rate2, padding=rate2, bias=False,
229
+ norm='syncbn', act='gelu')
230
+
231
+ self._aspp_conv3 = ConvBN(in_channels, output_channels, kernel_size=3, dilation=rate3, padding=rate3, bias=False,
232
+ norm='syncbn', act='gelu')
233
+
234
+ self._avg_pool = nn.AdaptiveAvgPool2d(1)
235
+ self._aspp_pool = ConvBN(in_channels, output_channels, kernel_size=1, bias=False,
236
+ norm='syncbn', act='gelu')
237
+
238
+ self._proj_conv_bn_act = ConvBN(output_channels * 5, output_channels, kernel_size=1, bias=False,
239
+ norm='syncbn', act='gelu')
240
+ # https://github.com/google-research/deeplab2/blob/main/model/decoder/aspp.py#L249
241
+ self._proj_drop = nn.Dropout(p=0.1)
242
+
243
+ def forward(self, x):
244
+ results = []
245
+ results.append(self._aspp_conv0(x))
246
+ results.append(self._aspp_conv1(x))
247
+ results.append(self._aspp_conv2(x))
248
+ results.append(self._aspp_conv3(x))
249
+ align_corners = (x.shape[-1] % 2 == 1)
250
+ results.append(F.interpolate(self._aspp_pool(self._avg_pool(x)), size=x.shape[-2:], mode='bilinear', align_corners=align_corners))
251
+
252
+ x = torch.cat(results, dim=1)
253
+ x = self._proj_conv_bn_act(x)
254
+ x = self._proj_drop(x)
255
+
256
+ return x
257
+
258
+
259
+ class SemanticPredictor(nn.Module):
260
+ def __init__(self, in_channels, os8_channels, os4_channels, num_classes):
261
+ super().__init__()
262
+
263
+ # Below is PanopticDeepLabSingleDecoder
264
+ self._aspp = ASPP(
265
+ in_channels=in_channels,
266
+ # https://github.com/google-research/deeplab2/blob/main/configs/coco/kmax_deeplab/kmax_meta_r50_os32.textproto#L35
267
+ output_channels=256,
268
+ # https://github.com/google-research/deeplab2/blob/main/configs/coco/kmax_deeplab/kmax_meta_r50_os32.textproto#L36
269
+ atrous_rates=[6,12,18])
270
+
271
+ self._low_level_projection_os8 = ConvBN(os8_channels, 64, kernel_size=1, bias=False,
272
+ norm='syncbn', act='gelu')
273
+
274
+ self._low_level_fusion_os8_conv0_bn_act = ConvBN(256 + 64, 256 + 64, groups=256 + 64, kernel_size=5, padding=2, bias=False,
275
+ norm='syncbn', act='gelu', conv_init='xavier_uniform')
276
+ self._low_level_fusion_os8_conv1_bn_act = ConvBN(256 + 64, 256, kernel_size=1,bias=False,
277
+ norm='syncbn', act='gelu')
278
+
279
+ self._low_level_projection_os4 = ConvBN(os4_channels, 32, kernel_size=1, bias=False,
280
+ norm='syncbn', act='gelu')
281
+
282
+ self._low_level_fusion_os4_conv0_bn_act = ConvBN(256 + 32, 256 + 32, groups=256 + 32, kernel_size=5, padding=2, bias=False,
283
+ norm='syncbn', act='gelu', conv_init='xavier_uniform')
284
+ self._low_level_fusion_os4_conv1_bn_act = ConvBN(256 + 32, 256, kernel_size=1,bias=False,
285
+ norm='syncbn', act='gelu')
286
+
287
+ # Below is PanopticDeepLabSingleHead
288
+ self.conv_block_0 = ConvBN(256, 256, groups=256, kernel_size=5, padding=2, bias=False,
289
+ norm='syncbn', act='gelu', conv_init='xavier_uniform')
290
+ self.conv_block_1 = ConvBN(256, 256, kernel_size=1,bias=False,
291
+ norm='syncbn', act='gelu')
292
+ self.final_conv = ConvBN(256, num_classes, kernel_size=1, norm=None, act=None)
293
+ trunc_normal_(self.final_conv.conv.weight, std=0.01)
294
+
295
+ def forward(self, x, low_features_os8, low_features_os4):
296
+ x = self._aspp(x)
297
+ align_corners = (x.shape[-1] % 2 == 1)
298
+ low_features_os8 = self._low_level_projection_os8(low_features_os8)
299
+ x = F.interpolate(x, size=low_features_os8.shape[-2:], mode='bilinear', align_corners=align_corners)
300
+ x = torch.concat([x, low_features_os8], dim=1)
301
+ x = self._low_level_fusion_os8_conv0_bn_act(x)
302
+ x = self._low_level_fusion_os8_conv1_bn_act(x)
303
+
304
+ low_features_os4 = self._low_level_projection_os4(low_features_os4)
305
+ x = F.interpolate(x, size=low_features_os4.shape[-2:], mode='bilinear', align_corners=align_corners)
306
+ x = torch.concat([x, low_features_os4], dim=1)
307
+ x = self._low_level_fusion_os4_conv0_bn_act(x)
308
+ x = self._low_level_fusion_os4_conv1_bn_act(x)
309
+
310
+ x = self.conv_block_0(x)
311
+ x = self.conv_block_1(x)
312
+ x = self.final_conv(x)
313
+ return x
314
+
315
+
316
+ @TRANSFORMER_DECODER_REGISTRY.register()
317
+ class kMaXTransformerDecoder(nn.Module):
318
+
319
+ @configurable
320
+ def __init__(
321
+ self,
322
+ *,
323
+ dec_layers: List[int],
324
+ in_channels: List[int],
325
+ num_classes: int,
326
+ num_queries: int,
327
+ drop_path_prob: float,
328
+ add_aux_semantic_pred: bool,
329
+ input_shape_from_backbone,
330
+ ):
331
+ """
332
+ NOTE: this interface is experimental.
333
+ Args:
334
+ """
335
+ super().__init__()
336
+
337
+ # define Transformer decoder here
338
+ self._kmax_transformer_layers = nn.ModuleList()
339
+ self._num_blocks = dec_layers
340
+ os2channels = {32: in_channels[0], 16: in_channels[1], 8: in_channels[2]}
341
+
342
+ for index, output_stride in enumerate([32, 16, 8]):
343
+ for _ in range(self._num_blocks[index]):
344
+ self._kmax_transformer_layers.append(
345
+ kMaXTransformerLayer(num_classes=num_classes+1,
346
+ in_channel_pixel=os2channels[output_stride],
347
+ in_channel_query=256,
348
+ base_filters=128,
349
+ num_heads=8,
350
+ bottleneck_expansion=2,
351
+ key_expansion=1,
352
+ value_expansion=2,
353
+ drop_path_prob=drop_path_prob)
354
+ )
355
+
356
+
357
+ self._num_queries = num_queries
358
+ # learnable query features
359
+ self._cluster_centers = nn.Embedding(256, num_queries)
360
+ trunc_normal_(self._cluster_centers.weight, std=1.0)
361
+
362
+ self._class_embedding_projection = ConvBN(256, 256, kernel_size=1, bias=False, norm='syncbn', act='gelu',
363
+ conv_type='1d')
364
+
365
+ self._mask_embedding_projection = ConvBN(256, 256, kernel_size=1, bias=False, norm='syncbn', act='gelu',
366
+ conv_type='1d')
367
+
368
+ self._predcitor = kMaXPredictor(in_channel_pixel=256,
369
+ in_channel_query=256, num_classes=num_classes+1)
370
+
371
+
372
+ self._add_aux_semantic_pred = add_aux_semantic_pred
373
+ if add_aux_semantic_pred:
374
+ self._auxiliary_semantic_predictor = SemanticPredictor(
375
+ in_channels=input_shape_from_backbone['res5'].channels,
376
+ os8_channels=input_shape_from_backbone['res3'].channels,
377
+ os4_channels=input_shape_from_backbone['res2'].channels,
378
+ # +1 for void.
379
+ num_classes=num_classes+1)
380
+
381
+
382
+ @classmethod
383
+ def from_config(cls, cfg, input_shape_from_backbone):
384
+ ret = {}
385
+ ret["dec_layers"] = cfg.MODEL.KMAX_DEEPLAB.TRANS_DEC.DEC_LAYERS
386
+ ret["in_channels"] = cfg.MODEL.KMAX_DEEPLAB.TRANS_DEC.IN_CHANNELS
387
+ ret["num_classes"] = cfg.MODEL.SEM_SEG_HEAD.NUM_CLASSES
388
+ ret["num_queries"] = cfg.MODEL.KMAX_DEEPLAB.TRANS_DEC.NUM_OBJECT_QUERIES
389
+ ret["drop_path_prob"] = cfg.MODEL.KMAX_DEEPLAB.TRANS_DEC.DROP_PATH_PROB
390
+ ret["add_aux_semantic_pred"] = (cfg.MODEL.KMAX_DEEPLAB.AUX_SEMANTIC_WEIGHT > 0)
391
+ ret["input_shape_from_backbone"] = input_shape_from_backbone
392
+ return ret
393
+
394
+
395
+ def forward(self, x, panoptic_features, semantic_features):
396
+ B = x[0].shape[0]
397
+ cluster_centers = self._cluster_centers.weight.unsqueeze(0).repeat(B, 1, 1) # B x C x L
398
+
399
+ current_transformer_idx = 0
400
+
401
+ predictions_class = []
402
+ predictions_mask = []
403
+ predictions_pixel_feature = []
404
+
405
+ for i, feat in enumerate(x):
406
+ for _ in range(self._num_blocks[i]):
407
+ cluster_centers, prediction_result = self._kmax_transformer_layers[current_transformer_idx](
408
+ pixel_feature=feat, query_feature=cluster_centers
409
+ )
410
+ predictions_class.append(prediction_result['class_logits'])
411
+ predictions_mask.append(prediction_result['mask_logits'])
412
+ predictions_pixel_feature.append(prediction_result['pixel_feature'])
413
+ current_transformer_idx += 1
414
+
415
+ class_embeddings = self._class_embedding_projection(cluster_centers)
416
+ mask_embeddings = self._mask_embedding_projection(cluster_centers)
417
+
418
+ # Final predictions.
419
+ prediction_result = self._predcitor(
420
+ class_embeddings=class_embeddings,
421
+ mask_embeddings=mask_embeddings,
422
+ pixel_feature=panoptic_features,
423
+ )
424
+ predictions_class.append(prediction_result['class_logits'])
425
+ predictions_mask.append(prediction_result['mask_logits'])
426
+ predictions_pixel_feature.append(prediction_result['pixel_feature'])
427
+
428
+ out = {
429
+ 'pred_logits': predictions_class[-1],
430
+ 'pred_masks': predictions_mask[-1],
431
+ 'pixel_feature': predictions_pixel_feature[-1],
432
+ 'aux_outputs': self._set_aux_loss(
433
+ predictions_class, predictions_mask, predictions_pixel_feature
434
+ ),
435
+ }
436
+
437
+ if self._add_aux_semantic_pred and self.training:
438
+ semantic_features, low_features_os8, low_features_os4 = semantic_features
439
+ aux_semantic_prediction = self._auxiliary_semantic_predictor(
440
+ x=semantic_features, low_features_os8=low_features_os8, low_features_os4=low_features_os4)
441
+ out.update({'aux_semantic_pred': aux_semantic_prediction,})
442
+ return out
443
+
444
+
445
+ @torch.jit.unused
446
+ def _set_aux_loss(self, outputs_class, outputs_seg_masks, outputs_pixel_feature):
447
+ target_size = outputs_seg_masks[-1].shape[-2:]
448
+ align_corners = (target_size[0] % 2 == 1)
449
+ return [
450
+ {"pred_logits": a, "pred_masks": F.interpolate(b, size=target_size, mode="bilinear", align_corners=align_corners),
451
+ "pixel_feature": F.interpolate(c, size=target_size, mode="bilinear", align_corners=align_corners),}
452
+ for a, b, c in zip(outputs_class[:-1], outputs_seg_masks[:-1], outputs_pixel_feature[:-1])
453
+ ]
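The core of kMaXTransformerLayer above is the k-means cross-attention: pixels are hard-assigned to their best cluster (query) via an argmax over the mask logits, and each cluster center is then updated by pooling the pixel values assigned to it. A stripped-down sketch of just that update (not part of the commit; shapes and values are illustrative):

import torch

B, L, HW, D = 1, 3, 6, 4                 # batch, clusters (queries), pixels, channels
mask_logits = torch.randn(B, L, HW)      # N x L x HW, as in the layer above
pixel_value = torch.randn(B, D, HW)      # N x C x HW

with torch.no_grad():
    index = mask_logits.max(1, keepdim=True)[1]                          # winning cluster per pixel
    assignment = torch.zeros_like(mask_logits).scatter_(1, index, 1.0)   # one-hot over clusters

# Pool the values of the pixels assigned to each cluster (the k-means update).
kmeans_update = torch.einsum('blm,bdm->bdl', assignment, pixel_value)    # N x C x L
print(assignment.sum(1))    # all ones: each pixel belongs to exactly one cluster
print(kmeans_update.shape)  # torch.Size([1, 4, 3])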
pakages.txt ADDED
@@ -0,0 +1,4 @@
1
+ libtinfo5
2
+ libsm6
3
+ libxext6
4
+ python3-opencv
requirements.txt ADDED
@@ -0,0 +1,34 @@
1
+ pyyaml==5.1
2
+ torch==1.9.0
3
+ torchvision==0.10.0
4
+
5
+ docutils==0.16
6
+ # https://github.com/sphinx-doc/sphinx/commit/7acd3ada3f38076af7b2b5c9f3b60bb9c2587a3d
7
+ sphinx==3.2.0
8
+ recommonmark==0.6.0
9
+ sphinx_rtd_theme
10
+ # Dependencies here are only those required by import
11
+ termcolor
12
+ numpy
13
+ tqdm
14
+ matplotlib
15
+ termcolor
16
+ yacs
17
+ tabulate
18
+ cloudpickle
19
+ Pillow
20
+ future
21
+ fvcore
22
+ omegaconf>=2.1.0.dev24
23
+ hydra-core>=1.1.0.dev5
24
+
25
+ opencv-python-headless
26
+
27
+
28
+ cython
29
+ scipy
30
+ shapely
31
+ timm
32
+ h5py
33
+ submitit
34
+ scikit-image
train_net.py ADDED
@@ -0,0 +1,266 @@
1
+ # Reference: https://github.com/facebookresearch/Mask2Former/blob/main/train_net.py
2
+ # Modified by Qihang Yu
3
+
4
+ try:
5
+ # ignore ShapelyDeprecationWarning from fvcore
6
+ from shapely.errors import ShapelyDeprecationWarning
7
+ import warnings
8
+ warnings.filterwarnings('ignore', category=ShapelyDeprecationWarning)
9
+ except:
10
+ pass
11
+
12
+ import copy
13
+ import itertools
14
+ import os
15
+
16
+ from typing import Any, Dict, List, Set
17
+
18
+ import torch
19
+
20
+ import detectron2.utils.comm as comm
21
+ from detectron2.checkpoint import DetectionCheckpointer
22
+ from detectron2.config import get_cfg
23
+ from detectron2.data import MetadataCatalog, build_detection_train_loader, build_detection_test_loader
24
+ from detectron2.engine import (
25
+ DefaultTrainer,
26
+ default_argument_parser,
27
+ default_setup,
28
+ launch,
29
+ )
30
+ from detectron2.evaluation import (
31
+ COCOEvaluator,
32
+ DatasetEvaluators,
33
+ SemSegEvaluator,
34
+ verify_results,
35
+ )
36
+ from detectron2.projects.deeplab import add_deeplab_config, build_lr_scheduler
37
+ from detectron2.solver.build import maybe_add_gradient_clipping
38
+ from detectron2.utils.logger import setup_logger
39
+
40
+ # MaskFormer
41
+ from kmax_deeplab import (
42
+ COCOPanoptickMaXDeepLabDatasetMapper,
43
+ add_kmax_deeplab_config,
44
+ )
45
+
46
+ from detectron2.data import MetadataCatalog
47
+
48
+ import train_net_utils
49
+
50
+
51
+ class Trainer(DefaultTrainer):
52
+ """
53
+ Extension of the Trainer class adapted to MaskFormer.
54
+ """
55
+
56
+ @classmethod
57
+ def build_evaluator(cls, cfg, dataset_name, output_folder=None):
58
+ """
59
+ Create evaluator(s) for a given dataset.
60
+ This uses the special metadata "evaluator_type" associated with each
61
+ builtin dataset. For your own dataset, you can simply create an
62
+ evaluator manually in your script and do not have to worry about the
63
+ hacky if-else logic here.
64
+ """
65
+ if output_folder is None:
66
+ output_folder = os.path.join(cfg.OUTPUT_DIR, "inference")
67
+ evaluator_list = []
68
+ evaluator_type = MetadataCatalog.get(dataset_name).evaluator_type
69
+ # panoptic segmentation
70
+ if evaluator_type in [
71
+ "coco_panoptic_seg",
72
+ ]:
73
+ if cfg.MODEL.KMAX_DEEPLAB.TEST.PANOPTIC_ON:
74
+ evaluator_list.append(train_net_utils.COCOPanopticEvaluatorwithVis(dataset_name, output_folder, save_vis_num=cfg.MODEL.KMAX_DEEPLAB.SAVE_VIS_NUM))
75
+ # COCO
76
+ if evaluator_type == "coco_panoptic_seg" and cfg.MODEL.KMAX_DEEPLAB.TEST.INSTANCE_ON:
77
+ evaluator_list.append(COCOEvaluator(dataset_name, output_dir=output_folder))
78
+ if evaluator_type == "coco_panoptic_seg" and cfg.MODEL.KMAX_DEEPLAB.TEST.SEMANTIC_ON:
79
+ evaluator_list.append(SemSegEvaluator(dataset_name, distributed=True, output_dir=output_folder))
80
+ elif len(evaluator_list) == 1:
81
+ return evaluator_list[0]
82
+ return DatasetEvaluators(evaluator_list)
83
+
84
+ @classmethod
85
+ def build_train_loader(cls, cfg):
86
+ # Semantic segmentation dataset mapper
87
+ if cfg.INPUT.DATASET_MAPPER_NAME == "coco_panoptic_lsj":
88
+ mapper = COCOPanoptickMaXDeepLabDatasetMapper(cfg, True)
89
+ return build_detection_train_loader(cfg, mapper=mapper)
90
+ else:
91
+ mapper = None
92
+ return build_detection_train_loader(cfg, mapper=mapper)
93
+
94
+
95
+ @classmethod
96
+ def build_lr_scheduler(cls, cfg, optimizer):
97
+ """
98
+ It now calls :func:`detectron2.solver.build_lr_scheduler`.
99
+ Overwrite it if you'd like a different scheduler.
100
+ """
101
+ name = cfg.SOLVER.LR_SCHEDULER_NAME
102
+ if name == "TF2WarmupPolyLR":
103
+ return train_net_utils.TF2WarmupPolyLR(
104
+ optimizer,
105
+ cfg.SOLVER.MAX_ITER,
106
+ warmup_factor=cfg.SOLVER.WARMUP_FACTOR,
107
+ warmup_iters=cfg.SOLVER.WARMUP_ITERS,
108
+ warmup_method=cfg.SOLVER.WARMUP_METHOD,
109
+ power=cfg.SOLVER.POLY_LR_POWER,
110
+ constant_ending=cfg.SOLVER.POLY_LR_CONSTANT_ENDING,
111
+ )
112
+ else:
113
+ return build_lr_scheduler(cfg, optimizer)
114
+
115
+ @classmethod
116
+ def build_optimizer(cls, cfg, model):
117
+ weight_decay_norm = cfg.SOLVER.WEIGHT_DECAY_NORM
118
+ weight_decay_embed = cfg.SOLVER.WEIGHT_DECAY_EMBED
119
+
120
+ defaults = {}
121
+ defaults["lr"] = cfg.SOLVER.BASE_LR
122
+ defaults["weight_decay"] = cfg.SOLVER.WEIGHT_DECAY
123
+
124
+ from kmax_deeplab.modeling.backbone.convnext import LayerNorm
125
+
126
+ norm_module_types = (
127
+ torch.nn.BatchNorm1d,
128
+ torch.nn.BatchNorm2d,
129
+ torch.nn.BatchNorm3d,
130
+ torch.nn.SyncBatchNorm,
131
+ # NaiveSyncBatchNorm inherits from BatchNorm2d
132
+ torch.nn.GroupNorm,
133
+ torch.nn.InstanceNorm1d,
134
+ torch.nn.InstanceNorm2d,
135
+ torch.nn.InstanceNorm3d,
136
+ torch.nn.LayerNorm,
137
+ torch.nn.LocalResponseNorm,
138
+ LayerNorm
139
+ )
140
+
141
+ params: List[Dict[str, Any]] = []
142
+ memo: Set[torch.nn.parameter.Parameter] = set()
143
+ for module_name, module in model.named_modules():
144
+ for module_param_name, value in module.named_parameters(recurse=False):
145
+ if not value.requires_grad:
146
+ continue
147
+ # Avoid duplicating parameters
148
+ if value in memo:
149
+ continue
150
+ memo.add(value)
151
+
152
+ hyperparams = copy.copy(defaults)
153
+ hyperparams["name"] = (module_name, module_param_name)
154
+ if "backbone" in module_name:
155
+ hyperparams["lr"] = hyperparams["lr"] * cfg.SOLVER.BACKBONE_MULTIPLIER
156
+ if (
157
+ "relative_position_bias_table" in module_param_name
158
+ or "absolute_pos_embed" in module_param_name
159
+ ):
160
+ print(module_param_name)
161
+ hyperparams["weight_decay"] = 0.0
162
+ if isinstance(module, norm_module_types):
163
+ hyperparams["weight_decay"] = weight_decay_norm
164
+ if isinstance(module, torch.nn.Embedding):
165
+ hyperparams["weight_decay"] = weight_decay_embed
166
+ # Rule for kMaX.
167
+ if "_rpe" in module_name:
168
+ # relative positional embedding in axial attention.
169
+ hyperparams["weight_decay"] = 0.0
170
+ if "_cluster_centers" in module_name:
171
+ # cluster center embeddings.
172
+ hyperparams["weight_decay"] = 0.0
173
+ if "bias" in module_param_name:
174
+ # any bias terms.
175
+ hyperparams["weight_decay"] = 0.0
176
+ if "gamma" in module_param_name:
177
+ # gamma term in convnext
178
+ hyperparams["weight_decay"] = 0.0
179
+
180
+ params.append({"params": [value], **hyperparams})
181
+ for param_ in params:
182
+ print(param_["name"], param_["lr"], param_["weight_decay"])
183
+
184
+ def maybe_add_full_model_gradient_clipping(optim):
185
+ # detectron2 doesn't have full model gradient clipping now
186
+ clip_norm_val = cfg.SOLVER.CLIP_GRADIENTS.CLIP_VALUE
187
+ enable = (
188
+ cfg.SOLVER.CLIP_GRADIENTS.ENABLED
189
+ and cfg.SOLVER.CLIP_GRADIENTS.CLIP_TYPE == "full_model"
190
+ and clip_norm_val > 0.0
191
+ )
192
+
193
+ class FullModelGradientClippingOptimizer(optim):
194
+ def step(self, closure=None):
195
+ all_params = itertools.chain(*[x["params"] for x in self.param_groups])
196
+ torch.nn.utils.clip_grad_norm_(all_params, clip_norm_val)
197
+ super().step(closure=closure)
198
+
199
+ return FullModelGradientClippingOptimizer if enable else optim
200
+
201
+ optimizer_type = cfg.SOLVER.OPTIMIZER
202
+ if optimizer_type == "SGD":
203
+ optimizer = maybe_add_full_model_gradient_clipping(torch.optim.SGD)(
204
+ params, cfg.SOLVER.BASE_LR, momentum=cfg.SOLVER.MOMENTUM
205
+ )
206
+ elif optimizer_type == "ADAMW":
207
+ optimizer = maybe_add_full_model_gradient_clipping(torch.optim.AdamW)(
208
+ params, cfg.SOLVER.BASE_LR
209
+ )
210
+ elif optimizer_type == "ADAM":
211
+ optimizer = maybe_add_full_model_gradient_clipping(torch.optim.Adam)(
212
+ params, cfg.SOLVER.BASE_LR
213
+ )
214
+ else:
215
+ raise NotImplementedError(f"no optimizer type {optimizer_type}")
216
+ if not cfg.SOLVER.CLIP_GRADIENTS.CLIP_TYPE == "full_model":
217
+ optimizer = maybe_add_gradient_clipping(cfg, optimizer)
218
+ return optimizer
219
+
220
+
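Editor's note: a quick sanity-check sketch for the per-parameter rules above (not part of the repo). It assumes cfg and model have already been built, e.g. cfg = setup(args) and model = Trainer.build_model(cfg), and relies on torch optimizers preserving the extra "name" key stored in each param group:

    optimizer = Trainer.build_optimizer(cfg, model)
    no_decay = [g["name"] for g in optimizer.param_groups if g["weight_decay"] == 0.0]
    print(f"{len(no_decay)}/{len(optimizer.param_groups)} param groups use zero weight decay")
    # Expect _rpe, _cluster_centers, bias, and ConvNeXt gamma parameters to show up here.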
+ def setup(args):
+     """
+     Create configs and perform basic setups.
+     """
+     cfg = get_cfg()
+     # for poly lr schedule
+     add_deeplab_config(cfg)
+     add_kmax_deeplab_config(cfg)
+     cfg.merge_from_file(args.config_file)
+     cfg.merge_from_list(args.opts)
+     cfg.freeze()
+     default_setup(cfg, args)
+     setup_logger(output=cfg.OUTPUT_DIR, distributed_rank=comm.get_rank(), name="kmax_deeplab")
+     return cfg
+
+
+ def main(args):
+     cfg = setup(args)
+
+     torch.backends.cudnn.enabled = True
+     if args.eval_only:
+         model = Trainer.build_model(cfg)
+         DetectionCheckpointer(model, save_dir=cfg.OUTPUT_DIR).resume_or_load(
+             cfg.MODEL.WEIGHTS, resume=args.resume
+         )
+         res = Trainer.test(cfg, model)
+         if comm.is_main_process():
+             verify_results(cfg, res)
+         return res
+
+     trainer = Trainer(cfg)
+     trainer.resume_or_load(resume=args.resume)
+     return trainer.train()
+
+
+ if __name__ == "__main__":
+     args = default_argument_parser().parse_args()
+     print("Command Line Args:", args)
+     launch(
+         main,
+         args.num_gpus,
+         num_machines=args.num_machines,
+         machine_rank=args.machine_rank,
+         dist_url=args.dist_url,
+         args=(args,),
+     )
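Editor's note: for reference, the eval-only path can also be driven programmatically in a single process. The flags below come from detectron2's default_argument_parser, the config file ships with this commit, and the checkpoint path is a placeholder; this is a hedged sketch roughly equivalent to a single-GPU --eval-only run, not the repo's documented entry point:

    from detectron2.engine import default_argument_parser

    args = default_argument_parser().parse_args([
        "--config-file", "configs/coco/panoptic-segmentation/kmax_r50.yaml",
        "--eval-only",
        "MODEL.WEIGHTS", "/path/to/kmax_r50_checkpoint.pth",  # placeholder path
    ])
    main(args)  # bypasses launch(); fine for a single, non-distributed process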
train_net_utils.py ADDED
@@ -0,0 +1,225 @@
+ import itertools
+ import os
+
+ from typing import List, Optional
+
+ import torch
+ import numpy as np
+ import tempfile
+ from collections import OrderedDict
+ from PIL import Image
+ from tabulate import tabulate
+ import json
+ import contextlib
+
+ import detectron2.utils.comm as comm
+ from detectron2.utils.file_io import PathManager
+ from detectron2.data import MetadataCatalog
+ from detectron2.evaluation import COCOPanopticEvaluator
+
+ from detectron2.utils.visualizer import ColorMode, Visualizer
+ from detectron2.data import MetadataCatalog
+ import io
+ import math
+ from PIL import Image
+
+ from detectron2.solver.lr_scheduler import _get_warmup_factor_at_iter
+
+
+ import logging
+
+ logger = logging.getLogger(__name__)
+
+
+ class TF2WarmupPolyLR(torch.optim.lr_scheduler._LRScheduler):
+     """
+     Poly learning rate schedule used in TF DeepLab2.
+     Reference: https://github.com/google-research/deeplab2/blob/main/trainer/trainer_utils.py#L23
+     """
+
+     def __init__(
+         self,
+         optimizer: torch.optim.Optimizer,
+         max_iters: int,
+         warmup_factor: float = 0.001,
+         warmup_iters: int = 1000,
+         warmup_method: str = "linear",
+         last_epoch: int = -1,
+         power: float = 0.9,
+         constant_ending: float = 0.0,
+     ):
+         self.max_iters = max_iters
+         self.warmup_factor = warmup_factor
+         self.warmup_iters = warmup_iters
+         self.warmup_method = warmup_method
+         self.power = power
+         self.constant_ending = constant_ending
+         super().__init__(optimizer, last_epoch)
+
+     def get_lr(self) -> List[float]:
+         warmup_factor = _get_warmup_factor_at_iter(
+             self.warmup_method, self.last_epoch, self.warmup_iters, self.warmup_factor
+         )
+         if self.constant_ending > 0 and warmup_factor == 1.0:
+             # Constant ending lr.
+             if (
+                 math.pow((1.0 - self.last_epoch / self.max_iters), self.power)
+                 < self.constant_ending
+             ):
+                 return [base_lr * self.constant_ending for base_lr in self.base_lrs]
+         if self.last_epoch < self.warmup_iters:
+             return [
+                 base_lr * warmup_factor
+                 for base_lr in self.base_lrs
+             ]
+         else:
+             return [
+                 base_lr * math.pow((1.0 - self.last_epoch / self.max_iters), self.power)
+                 for base_lr in self.base_lrs
+             ]
+
+     def _compute_values(self) -> List[float]:
+         # The new interface
+         return self.get_lr()
+
+
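Editor's note: a small standalone sketch (not from the repo) of how this schedule behaves: a linear warmup over warmup_iters steps, then lr = base_lr * (1 - iter / max_iters) ** power. It assumes train_net_utils is importable and simply attaches the scheduler to a dummy SGD optimizer:

    import torch
    from train_net_utils import TF2WarmupPolyLR

    param = torch.nn.Parameter(torch.zeros(1))
    opt = torch.optim.SGD([param], lr=0.001)
    sched = TF2WarmupPolyLR(opt, max_iters=1000, warmup_iters=100, power=0.9)
    for it in range(1000):
        opt.step()
        sched.step()
        if it in (0, 99, 499, 999):
            print(it, sched.get_last_lr()[0])  # ramps up toward 1e-3, then decays polynomially to 0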
+ class COCOPanopticEvaluatorwithVis(COCOPanopticEvaluator):
+     """
+     COCO Panoptic Evaluator that supports saving visualizations.
+     TODO(qihangyu): Note that original implementation will also write all predictions to a tmp folder
+     and then run official evaluation script, we may also check how to copy from the tmp folder for visualization.
+     """
+
+     def __init__(self, dataset_name: str, output_dir: Optional[str] = None, save_vis_num=0):
+         super().__init__(dataset_name=dataset_name, output_dir=output_dir)
+         self.metadata = MetadataCatalog.get("coco_2017_val_panoptic_with_sem_seg")
+         self.output_dir = output_dir
+         self.save_vis_num = save_vis_num
+
+     def process(self, inputs, outputs):
+         from panopticapi.utils import id2rgb
+
+         cur_save_num = 0
+         for input, output in zip(inputs, outputs):
+             panoptic_img, segments_info = output["panoptic_seg"]
+             panoptic_seg = panoptic_img.cpu()
+             panoptic_img = panoptic_seg.numpy()
+
+             file_name = os.path.basename(input["file_name"])
+             file_name_png = os.path.splitext(file_name)[0] + ".png"
+             if cur_save_num < self.save_vis_num:
+                 image = output["original_image"]
+                 image = image.permute(1, 2, 0).cpu().numpy()  # [:, :, ::-1]
+                 visualizer = Visualizer(image, self.metadata, instance_mode=ColorMode.IMAGE)
+                 vis_output = visualizer.draw_panoptic_seg_predictions(
+                     panoptic_seg, segments_info
+                 )
+                 if not os.path.exists(os.path.join(self.output_dir, 'vis')):
+                     os.makedirs(os.path.join(self.output_dir, 'vis'))
+                 out_filename = os.path.join(self.output_dir, 'vis', file_name_png)
+                 vis_output.save(out_filename)
+                 cur_save_num += 1
+
+             if segments_info is None:
+                 # If "segments_info" is None, we assume "panoptic_img" is a
+                 # H*W int32 image storing the panoptic_id in the format of
+                 # category_id * label_divisor + instance_id. We reserve -1 for
+                 # VOID label, and add 1 to panoptic_img since the official
+                 # evaluation script uses 0 for VOID label.
+                 label_divisor = self._metadata.label_divisor
+                 segments_info = []
+                 for panoptic_label in np.unique(panoptic_img):
+                     if panoptic_label == -1:
+                         # VOID region.
+                         continue
+                     pred_class = panoptic_label // label_divisor
+                     isthing = (
+                         pred_class in self._metadata.thing_dataset_id_to_contiguous_id.values()
+                     )
+                     segments_info.append(
+                         {
+                             "id": int(panoptic_label) + 1,
+                             "category_id": int(pred_class),
+                             "isthing": bool(isthing),
+                         }
+                     )
+                 # Official evaluation script uses 0 for VOID label.
+                 panoptic_img += 1
+
+
+             with io.BytesIO() as out:
+                 Image.fromarray(id2rgb(panoptic_img)).save(out, format="PNG")
+                 segments_info = [self._convert_category_id(x) for x in segments_info]
+                 self._predictions.append(
+                     {
+                         "image_id": input["image_id"],
+                         "file_name": file_name_png,
+                         "png_string": out.getvalue(),
+                         "segments_info": segments_info,
+                     }
+                 )
+
+     def evaluate(self):
+         comm.synchronize()
+
+         self._predictions = comm.gather(self._predictions)
+         self._predictions = list(itertools.chain(*self._predictions))
+         if not comm.is_main_process():
+             return
+
+         # PanopticApi requires local files
+         gt_json = PathManager.get_local_path(self._metadata.panoptic_json)
+         gt_folder = PathManager.get_local_path(self._metadata.panoptic_root)
+
+         with tempfile.TemporaryDirectory(prefix="panoptic_eval") as pred_dir:
+             logger.info("Writing all panoptic predictions to {} ...".format(pred_dir))
+             for p in self._predictions:
+                 with open(os.path.join(pred_dir, p["file_name"]), "wb") as f:
+                     f.write(p.pop("png_string"))
+
+             with open(gt_json, "r") as f:
+                 json_data = json.load(f)
+             json_data["annotations"] = self._predictions
+
+             output_dir = self._output_dir or pred_dir
+             predictions_json = os.path.join(output_dir, "predictions.json")
+             with PathManager.open(predictions_json, "w") as f:
+                 f.write(json.dumps(json_data))
+
+             from kmax_deeplab.evaluation.panoptic_evaluation import pq_compute
+
+             with contextlib.redirect_stdout(io.StringIO()):
+                 pq_res = pq_compute(
+                     gt_json,
+                     PathManager.get_local_path(predictions_json),
+                     gt_folder=gt_folder,
+                     pred_folder=pred_dir,
+                 )
+
+         res = {}
+         res["PQ"] = 100 * pq_res["All"]["pq"]
+         res["SQ"] = 100 * pq_res["All"]["sq"]
+         res["RQ"] = 100 * pq_res["All"]["rq"]
+         res["PQ_th"] = 100 * pq_res["Things"]["pq"]
+         res["SQ_th"] = 100 * pq_res["Things"]["sq"]
+         res["RQ_th"] = 100 * pq_res["Things"]["rq"]
+         res["PQ_st"] = 100 * pq_res["Stuff"]["pq"]
+         res["SQ_st"] = 100 * pq_res["Stuff"]["sq"]
+         res["RQ_st"] = 100 * pq_res["Stuff"]["rq"]
+
+         results = OrderedDict({"panoptic_seg": res})
+         _print_panoptic_results(pq_res)
+
+         return results
+
+
+ def _print_panoptic_results(pq_res):
+     headers = ["", "PQ", "SQ", "RQ", "#categories"]
+     data = []
+     for name in ["All", "Things", "Stuff"]:
+         row = [name] + [pq_res[name][k] * 100 for k in ["pq", "sq", "rq"]] + [pq_res[name]["n"]]
+         data.append(row)
+     table = tabulate(
+         data, headers=headers, tablefmt="pipe", floatfmt=".3f", stralign="center", numalign="center"
+     )
+     logger.info("Panoptic Evaluation Results:\n" + table)
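Editor's note: a hedged end-to-end usage sketch for this evaluator (not part of the repo). It assumes cfg and a trained kMaX-DeepLab model are already built (for example via Trainer.build_model plus DetectionCheckpointer) and that the model returns "original_image" in its outputs, which the visualization branch above relies on:

    import os
    from detectron2.data import build_detection_test_loader
    from detectron2.evaluation import inference_on_dataset
    from train_net_utils import COCOPanopticEvaluatorwithVis

    os.makedirs("./output/inference", exist_ok=True)
    evaluator = COCOPanopticEvaluatorwithVis(
        "coco_2017_val_panoptic_with_sem_seg", output_dir="./output/inference", save_vis_num=8
    )
    val_loader = build_detection_test_loader(cfg, "coco_2017_val_panoptic_with_sem_seg")
    results = inference_on_dataset(model, val_loader, evaluator)
    print(results["panoptic_seg"]["PQ"])  # PQ/SQ/RQ are reported on a 0-100 scale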