# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import paddle
import paddle.nn as nn
import paddle.nn.functional as F

from paddleseg.cvlibs import manager
from paddleseg.models import layers
from paddleseg.utils import utils


@manager.MODELS.add_component
class BiseNetV1(nn.Layer):
    """
    The BiSeNetV1 implementation based on PaddlePaddle.

    The original article refers to
    Yu, Changqian, et al. "BiSeNet: Bilateral Segmentation Network for
    Real-time Semantic Segmentation"
    (https://paperswithcode.com/paper/bisenet-bilateral-segmentation-network-for)

    Args:
        num_classes (int): The unique number of target classes.
        backbone (paddle.nn.Layer): Backbone network, currently supports
            Resnet18_vd/Resnet34_vd/Resnet50_vd/Resnet101_vd.
        conv_channel (int, optional): The channel number of the intermediate
            convolutions. Default: 128.
        pretrained (str, optional): The path or url of pretrained model.
            Default: None.
    """

    def __init__(self, num_classes, backbone, conv_channel=128,
                 pretrained=None):
        super().__init__()
        self.backbone = backbone
        self.spatial_path = SpatialPath(3, 128)
        self.global_context = nn.Sequential(
            nn.AdaptiveAvgPool2D(1),
            layers.ConvBNReLU(512, conv_channel, 1, bias_attr=False),
        )

        self.arms = nn.LayerList([
            AttentionRefinement(512, conv_channel),
            AttentionRefinement(256, conv_channel),
        ])
        self.refines = nn.LayerList([
            layers.ConvBNReLU(
                conv_channel,
                conv_channel,
                3,
                stride=1,
                padding=1,
                bias_attr=False),
            layers.ConvBNReLU(
                conv_channel,
                conv_channel,
                3,
                stride=1,
                padding=1,
                bias_attr=False),
        ])
        self.heads = nn.LayerList([
            BiSeNetHead(conv_channel, num_classes, 8, True),
            BiSeNetHead(conv_channel, num_classes, 8, True),
            BiSeNetHead(conv_channel * 2, num_classes, 8, False),
        ])
        self.ffm = FeatureFusion(conv_channel * 2, conv_channel * 2, 1)

        self.pretrained = pretrained
        self.init_weight()

    def init_weight(self):
        if self.pretrained is not None:
            utils.load_entire_model(self, self.pretrained)

    def forward(self, x):
        spatial_out = self.spatial_path(x)
        context_blocks = self.backbone(x)
        context_blocks.reverse()

        global_context = self.global_context(context_blocks[0])
        global_context = F.interpolate(
            global_context,
            size=paddle.shape(context_blocks[0])[2:],
            mode='bilinear',
            align_corners=True)

        last_fm = global_context
        pred_out = []
        for i, (fm, arm, refine) in enumerate(
                zip(context_blocks[:2], self.arms, self.refines)):
            fm = arm(fm)
            fm += last_fm
            last_fm = F.interpolate(
                fm,
                size=paddle.shape(context_blocks[i + 1])[2:],
                mode='bilinear',
                align_corners=True)
            last_fm = refine(last_fm)
            pred_out.append(last_fm)
        context_out = last_fm

        concate_fm = self.ffm(spatial_out, context_out)
        pred_out.append(concate_fm)

        output = []
        if self.training:
            for i, head in enumerate(self.heads):
                out = head(pred_out[i])
                output.append(out)
        else:
            out = self.heads[-1](pred_out[-1])
            output.append(out)
        return output
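
# A minimal construction sketch, not part of the original file. It assumes
# paddleseg.models.backbones.ResNet18_vd returns a backbone whose feature
# list carries 256- and 512-channel maps at 1/16 and 1/32 resolution,
# matching the hard-coded AttentionRefinement input channels above
# (ResNet18_vd/ResNet34_vd fit this; deeper variants expose wider features
# and would need adjusted ARM inputs). The helper name is hypothetical.
def _example_build(num_classes=19):
    from paddleseg.models.backbones import ResNet18_vd

    backbone = ResNet18_vd()
    return BiseNetV1(num_classes=num_classes, backbone=backbone)
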
""" def __init__(self, in_channels, out_channels, inner_channel=64): super().__init__() self.conv_7x7 = layers.ConvBNReLU( in_channels, inner_channel, 7, stride=2, padding=3, bias_attr=False) self.conv_3x3_1 = layers.ConvBNReLU( inner_channel, inner_channel, 3, stride=2, padding=1, bias_attr=False) self.conv_3x3_2 = layers.ConvBNReLU( inner_channel, inner_channel, 3, stride=2, padding=1, bias_attr=False) self.conv_1x1 = layers.ConvBNReLU( inner_channel, out_channels, 1, bias_attr=False) def forward(self, x): x = self.conv_7x7(x) x = self.conv_3x3_1(x) x = self.conv_3x3_2(x) x = self.conv_1x1(x) return x class BiSeNetHead(nn.Layer): """ BiSeNet head of BiseNetV1 model Args: in_channels (int): The number of input channels in spatial path module. out_channels (int): The number of output channels in spatial path module. scale (int, float): The scale factor of interpolation. """ def __init__(self, in_channels, out_channels, scale, is_aux=False): super().__init__() inner_channel = 128 if is_aux else 64 self.conv_3x3 = layers.ConvBNReLU( in_channels, inner_channel, 3, stride=1, padding=1, bias_attr=False) self.conv_1x1 = nn.Conv2D(inner_channel, out_channels, 1) self.scale = scale def forward(self, x): x = self.conv_3x3(x) x = self.conv_1x1(x) if self.scale > 1: x = F.interpolate( x, scale_factor=self.scale, mode='bilinear', align_corners=True) return x class AttentionRefinement(nn.Layer): """ AttentionRefinement module of BiseNetV1 model Args: in_channels (int): The number of input channels in spatial path module. out_channels (int): The number of output channels in spatial path module. """ def __init__(self, in_channels, out_channels): super().__init__() self.conv_3x3 = layers.ConvBNReLU( in_channels, out_channels, 3, stride=1, padding=1, bias_attr=False) self.channel_attention = nn.Sequential( nn.AdaptiveAvgPool2D(1), layers.ConvBNReLU( out_channels, out_channels, 1, bias_attr=False), nn.Sigmoid(), ) def forward(self, x): x = self.conv_3x3(x) se = self.channel_attention(x) x = x * se return x class FeatureFusion(nn.Layer): """ AttentionRefinement module of BiseNetV1 model Args: in_channels (int): The number of input channels in spatial path module. out_channels (int): The number of output channels in spatial path module. reduction (int): A factor shrinks convolutional channels. Default: 1. """ def __init__(self, in_channels, out_channels, reduction=1): super().__init__() self.conv_1x1 = layers.ConvBNReLU( in_channels, out_channels, 1, bias_attr=False) self.channel_attention = nn.Sequential( nn.AdaptiveAvgPool2D(1), layers.ConvBNReLU( out_channels, out_channels // reduction, 1, bias_attr=False), layers.ConvBNReLU( out_channels // reduction, out_channels, 1, bias_attr=False), nn.Sigmoid(), ) def forward(self, x1, x2): fm = paddle.concat([x1, x2], axis=1) fm = self.conv_1x1(fm) fm_se = self.channel_attention(fm) output = fm + fm * fm_se return output