@@ -0,0 +1,832 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+import numpy as np
+
+import paddle
+import paddle.nn as nn
+import paddle.nn.functional as F
+
+from paddlers.models.ppseg.cvlibs import manager
+from paddlers.models.ppseg.models import layers
+from paddlers.models.ppseg.utils import utils
+
+
+@manager.MODELS.add_component
+class PointRend(nn.Layer):
+    """
+    The SemanticFPN-PointRend implementation based on PaddlePaddle.
+
+    The original article refers to
+    Kirillov A, Wu Y, He K, et al. "PointRend: Image Segmentation As Rendering."
+    (https://arxiv.org/abs/1912.08193).
+
+    Args:
+        num_classes (int): The unique number of target classes.
+        backbone (Paddle.nn.Layer): Backbone network, currently supporting ResNet50/101.
+        backbone_indices (tuple): Four values in the tuple indicate the indices of the backbone outputs.
+        fpn_inplanes (list, optional): Input channels list (the feature channels from the backbone) for the lateral_conv construction in FPN. Default: [256, 512, 1024, 2048].
+        fpn_outplanes (int, optional): The output channels of FPN. Default: 256.
+        point_num_fcs (int, optional): Number of fc layers in the head of PointHead. Default: 3.
+        point_in_channels (list, optional): Input channels of the fc block in PointHead. Default: [256].
+        point_out_channels (int, optional): The fc block's output channels in PointHead. Default: 256.
+        point_in_index (list, optional): The indices of input features to use in PointHead. Default: [0].
+        point_num_points (int, optional): The number of points sampled in training mode in PointHead. Default: 2048.
+        point_oversample_ratio (int, optional): The oversample ratio of points in training mode in PointHead.
+            sampled_point = num_points * oversample_ratio. Default: 3.
+        point_importance_sample_ratio (float, optional): The importance sample ratio used to compute num_uncertain_points in PointHead. Default: 0.75.
+        point_scale_factor (int, optional): The scale factor of F.interpolate in the refine-seg-logits stage during inference in PointHead. Default: 2.
+        point_subdivision_steps (int, optional): The number of refinement steps in the refine-seg-logits stage during inference in PointHead. Default: 2.
+        point_subdivision_num_points (int, optional): The number of points used to refine the seg logits during inference in PointHead. Default: 8196.
+        point_dropout_ratio (float, optional): If dropout_ratio > 0, apply Dropout (with p = dropout_ratio) before the output in PointHead. Default: 0.
+        point_coarse_pred_each_layer (bool, optional): Whether to concatenate the coarse feature with
+            the output of each fc layer in PointHead. Default: True.
+        point_conv_cfg (str): The Conv config of PointHead. Default: 'Conv1D'.
+        point_input_transform (str): The feature transform method for inputs in PointHead;
+            see the '_transform_inputs' function. Default: 'multiple_select'.
+        PFN_feature_strides (list): The strides of the input feature maps in FPNHead; all strides are supposed to be powers of 2, and the first
+            one belongs to the largest resolution. Default: [4, 8, 16, 32].
+        PFN_in_channels (list): The channels of the input features in FPNHead. Default: [256, 256, 256, 256].
+        PFN_channels (int, optional): The output channels of each scale_head's Conv before the Upsample block in FPNHead. Default: 128.
+        PFN_in_index (list): The indices of input features to use; its length should match that of in_channels in FPNHead. Default: [0, 1, 2, 3].
+        PFN_dropout_ratio (float, optional): If dropout_ratio > 0, apply Dropout (with p = dropout_ratio) before the output in FPNHead. Default: 0.
+        PFN_conv_cfg (str): The Conv config of FPNHead. Default: 'Conv2D'.
+        PFN_input_transform (str): The feature transform method for inputs; see the '_transform_inputs' function in FPNHead. Default: 'multiple_select'.
+        align_corners (bool, optional): An argument of F.interpolate. It should be set to False when the feature size is even,
+            e.g. 1024x512; otherwise it is True, e.g. 769x769. Default: False.
+        pretrained (str, optional): The path or url of the pretrained model. Default: None.
+    """
+
+    def __init__(
+            self,
+            num_classes,
+            backbone,
+            backbone_indices,
+            fpn_inplanes=[256, 512, 1024, 2048],
+            fpn_outplanes=256,
+            point_in_channels=[256],
+            point_out_channels=256,
+            point_in_index=[0],
+            point_num_fcs=3,
+            point_num_points=2048,
+            point_oversample_ratio=3,
+            point_importance_sample_ratio=0.75,
+            point_scale_factor=2,
+            point_subdivision_steps=2,
+            point_subdivision_num_points=8196,
+            point_dropout_ratio=0,
+            point_coarse_pred_each_layer=True,
+            point_input_transform='multiple_select',
+            point_conv_cfg='Conv1D',
+            PFN_feature_strides=[4, 8, 16, 32],
+            PFN_in_channels=[256, 256, 256, 256],
+            PFN_channels=128,
+            PFN_in_index=[0, 1, 2, 3],
+            PFN_dropout_ratio=0,
+            PFN_conv_cfg='Conv2D',
+            PFN_input_transform='multiple_select',
+            align_corners=False,
+            pretrained=None):
+        super(PointRend, self).__init__()
+        self.backbone = backbone
+        self.backbone_indices = backbone_indices
+        self.in_channels = [
+            self.backbone.feat_channels[i] for i in backbone_indices
+        ]
+
+        self.neck = FPNNeck(
+            fpn_inplanes=fpn_inplanes, fpn_outplanes=fpn_outplanes)
+        self.pointhead = PointHead(
+            in_channels=point_in_channels,
+            out_channels=point_out_channels,
+            num_classes=num_classes,
+            in_index=point_in_index,
+            num_fcs=point_num_fcs,
+            num_points=point_num_points,
+            oversample_ratio=point_oversample_ratio,
+            importance_sample_ratio=point_importance_sample_ratio,
+            scale_factor=point_scale_factor,
+            subdivision_steps=point_subdivision_steps,
+            subdivision_num_points=point_subdivision_num_points,
+            dropout_ratio=point_dropout_ratio,
+            align_corners=align_corners,
+            coarse_pred_each_layer=point_coarse_pred_each_layer,
+            input_transform=point_input_transform,
+            conv_cfg=point_conv_cfg)
+        self.fpnhead = FPNHead(
+            feature_strides=PFN_feature_strides,
+            in_channels=PFN_in_channels,
+            channels=PFN_channels,
+            num_class=num_classes,
+            in_index=PFN_in_index,
+            dropout_ratio=PFN_dropout_ratio,
+            conv_cfg=PFN_conv_cfg,
+            input_transform=PFN_input_transform,
+            align_corners=align_corners)
+
+        self.align_corners = align_corners
+        self.pretrained = pretrained
+        self.init_weight()
+
+    def forward(self, x):
+        feats = self.backbone(x)
+        feats = [feats[i] for i in self.backbone_indices]
+        fpn_feats = self.neck(feats)
+        pfn_logits = self.fpnhead(fpn_feats)
+        point_logits = self.pointhead(fpn_feats, pfn_logits)
+
+        if self.training:
+            logit_list = [
+                F.interpolate(
+                    logit,
+                    paddle.shape(x)[2:],
+                    mode='bilinear',
+                    align_corners=self.align_corners) for logit in pfn_logits
+            ]
+            logit_list.append(point_logits)
+        else:
+            logit_list = [
+                F.interpolate(
+                    logit,
+                    paddle.shape(x)[2:],
+                    mode='bilinear',
+                    align_corners=self.align_corners) for logit in point_logits
+            ]
+        return logit_list
+
+    def init_weight(self):
+        if self.pretrained is not None:
+            utils.load_entire_model(self, self.pretrained)
+
+
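+# A minimal usage sketch: the backbone import below is an assumption about
+# paddlers' layout and may need adjusting. PointRend returns a list of
+# logits; in training mode, the last element is the [point_logits, points]
+# pair consumed by the point loss.
+#
+#   from paddlers.models.ppseg.models.backbones import ResNet50_vd
+#   model = PointRend(num_classes=2,
+#                     backbone=ResNet50_vd(),
+#                     backbone_indices=(0, 1, 2, 3))
+#   logits = model(paddle.rand([1, 3, 512, 512]))
+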
+class PointHead(nn.Layer):
+    """
+    The PointHead implementation based on PaddlePaddle.
+
+    PointHead uses a shared multi-layer perceptron (equivalent to
+    nn.Conv1D) to predict the logits of the input points. The fine-grained
+    features and the coarse features are concatenated for the prediction.
+
+    The original article refers to:
+    Kirillov A, Wu Y, He K, et al. "PointRend: Image Segmentation As Rendering."
+    (https://arxiv.org/abs/1912.08193)
+
+    Args:
+        num_classes (int): Number of classes for logits. Default: 19.
+        num_fcs (int, optional): Number of fc layers in the head. Default: 3.
+        in_channels (list): Input channels of the fc block. Default: [256].
+        out_channels (int, optional): The fc block's output channels. Default: 256.
+        in_index (list): The indices of input features to use. Default: [0].
+        num_points (int, optional): The number of points sampled in training mode. Default: 2048.
+        oversample_ratio (int, optional): The oversample ratio of points in training mode.
+            sampled_point = num_points * oversample_ratio. Default: 3.
+        importance_sample_ratio (float, optional): The importance sample ratio used to compute num_uncertain_points. Default: 0.75.
+        scale_factor (int, optional): The scale factor of F.interpolate in the refine-seg-logits stage during inference. Default: 2.
+        subdivision_steps (int, optional): The number of refinement steps in the refine-seg-logits stage during inference. Default: 2.
+        subdivision_num_points (int, optional): The number of points used to refine the seg logits during inference. Default: 8196.
+        dropout_ratio (float, optional): If dropout_ratio > 0, apply Dropout (with p = dropout_ratio) before the output. Default: 0.1.
+        coarse_pred_each_layer (bool, optional): Whether to concatenate the coarse feature with
+            the output of each fc layer. Default: True.
+        conv_cfg (str): The Conv config. Default: 'Conv1D'.
+        input_transform (str): The feature transform method for inputs;
+            see the '_transform_inputs' function. Default: 'multiple_select'.
+        align_corners (bool, optional): An argument of F.interpolate. It should be set to False when the feature size is even,
+            e.g. 1024x512; otherwise it is True, e.g. 769x769. Default: False.
+    """
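+
+    # A worked example of the fc widths under the defaults: with
+    # in_channels=[256] and num_classes=19, each sampled point carries a
+    # 256-dim fine-grained feature plus a 19-dim coarse prediction, so the
+    # first fc sees 256 + 19 = 275 input channels; with
+    # coarse_pred_each_layer=True, every later fc (and fc_seg) also sees
+    # 256 + 19 = 275 channels.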
+
+    def __init__(
+            self,
+            num_classes=19,
+            num_fcs=3,
+            in_channels=[256],
+            out_channels=256,
+            in_index=[0],
+            num_points=2048,
+            oversample_ratio=3,
+            importance_sample_ratio=0.75,
+            scale_factor=2,
+            subdivision_steps=2,
+            subdivision_num_points=8196,
+            dropout_ratio=0.1,
+            coarse_pred_each_layer=True,
+            conv_cfg='Conv1D',
+            input_transform='multiple_select',
+            align_corners=False):
+        super(PointHead, self).__init__()
+
+        self.in_channels = in_channels
+        self.channels = out_channels
+        self.in_index = in_index
+        self.num_classes = num_classes
+        self.num_fcs = num_fcs
+        self.num_points = num_points
+        self.oversample_ratio = oversample_ratio
+        self.importance_sample_ratio = importance_sample_ratio
+        self.scale_factor = scale_factor
+        self.subdivision_steps = subdivision_steps
+        self.subdivision_num_points = paddle.to_tensor(
+            subdivision_num_points, dtype="int32")
+        self.dropout_ratio = dropout_ratio
+        self.coarse_pred_each_layer = coarse_pred_each_layer
+        self.align_corners = align_corners
+        self.input_transform = input_transform
+
+        fc_in_channels = sum(self.in_channels) + self.num_classes
+        fc_channels = self.channels
+        self.fcs = nn.LayerList()
+        for k in range(num_fcs):
+            fc = ConvModule(
+                fc_in_channels,
+                fc_channels,
+                kernel_size=1,
+                stride=1,
+                padding=0,
+                conv_cfg=conv_cfg)
+            self.fcs.append(fc)
+            fc_in_channels = fc_channels
+            fc_in_channels += self.num_classes if self.coarse_pred_each_layer else 0
+        self.fc_seg = nn.Conv1D(
+            fc_in_channels,
+            self.num_classes,
+            kernel_size=1,
+            stride=1,
+            padding=0)
+
+        if self.dropout_ratio > 0:
+            self.dropout = nn.Dropout(self.dropout_ratio)
+        else:
+            self.dropout = None
+
+    def cls_seg(self, feat):
+        """Classify each point with the fc_seg layer."""
+        if self.dropout is not None:
+            feat = self.dropout(feat)
+        output = self.fc_seg(feat)
+        return output
+
+    def _get_fine_grained_point_feats(self, x, points):
+        """
+        Sample from fine-grained features.
+
+        Args:
+            x (list[Tensor]): Feature pyramid from the neck or backbone.
+            points (Tensor): Point coordinates, shape (batch_size,
+                num_points, 2).
+        Returns:
+            fine_grained_feats (Tensor): Sampled fine-grained features,
+                shape (batch_size, sum(channels of x), num_points).
+        """
+
+        fine_grained_feats_list = [
+            point_sample(_, points, align_corners=self.align_corners) for _ in x
+        ]
+        if len(fine_grained_feats_list) > 1:
+            fine_grained_feats = paddle.concat(fine_grained_feats_list, axis=1)
+        else:
+            fine_grained_feats = fine_grained_feats_list[0]
+        return fine_grained_feats
+
+    def _get_coarse_point_feats(self, prev_output, points):
+        """
+        Sample from the coarse prediction.
+
+        Args:
+            prev_output (Tensor): Prediction of the previous decode head.
+            points (Tensor): Point coordinates, shape (batch_size,
+                num_points, 2).
+        Returns:
+            coarse_feats (Tensor): Sampled coarse features, shape (batch_size,
+                num_classes, num_points).
+        """
+
+        coarse_feats = point_sample(
+            prev_output, points, align_corners=self.align_corners)
+        return coarse_feats
+
+    def _transform_inputs(self, inputs):
+        """
+        Transform inputs for the decoder.
+
+        Args:
+            inputs (list[Tensor]): List of multi-level image features.
+        Returns:
+            Tensor or list[Tensor]: The transformed inputs.
+        """
+
+        if self.input_transform == 'resize_concat':
+            inputs = [inputs[i] for i in self.in_index]
+            upsampled_inputs = [
+                F.interpolate(
+                    x,
+                    size=paddle.shape(inputs[0])[2:],
+                    mode='bilinear',
+                    align_corners=self.align_corners) for x in inputs
+            ]
+            inputs = paddle.concat(upsampled_inputs, axis=1)
+        elif self.input_transform == 'multiple_select':
+            inputs = [inputs[i] for i in self.in_index]
+        else:
+            inputs = inputs[self.in_index[0]]
+        return inputs
+
+    def get_points_train(self, seg_logits, uncertainty_func):
+        """
+        Sample points for training.
+
+        Sample points in [0, 1] x [0, 1] coordinate space based on their
+        uncertainty. The uncertainty is calculated for each point using the
+        'uncertainty_func' function, which takes the point's logit prediction
+        as input.
+
+        Args:
+            seg_logits (Tensor): Semantic segmentation logits, shape (
+                batch_size, num_classes, height, width).
+            uncertainty_func (func): uncertainty calculation function.
+        Returns:
+            point_coords (Tensor): A tensor of shape (batch_size, num_points,
+                2) that contains the coordinates of ``num_points`` sampled
+                points.
+        """
+
+        num_points = self.num_points
+        oversample_ratio = self.oversample_ratio
+        importance_sample_ratio = self.importance_sample_ratio
+        assert oversample_ratio >= 1
+        assert 0 <= importance_sample_ratio <= 1
+        batch_size = paddle.shape(seg_logits)[0]
+        num_sampled = int(num_points * oversample_ratio)
+        point_coords = paddle.rand([batch_size, num_sampled, 2])
+        point_logits = point_sample(seg_logits, point_coords)
+
+        # It is crucial to calculate the uncertainty based on the sampled
+        # prediction values of the points. Calculating the uncertainties of
+        # the coarse predictions first and then sampling them for points
+        # leads to incorrect results. To illustrate this: assume
+        # uncertainty_func(logits) = -abs(logits); a sampled point between
+        # two coarse predictions with -1 and 1 logits has 0 logits and
+        # therefore 0 uncertainty. However, if we calculate the uncertainties
+        # of the coarse predictions first, both will have -1 uncertainty,
+        # and the sampled point will get -1 uncertainty.
+        point_uncertainties = uncertainty_func(point_logits)
+        num_uncertain_points = int(importance_sample_ratio * num_points)
+        num_random_points = num_points - num_uncertain_points
+        idx = paddle.topk(
+            point_uncertainties[:, 0, :], k=num_uncertain_points, axis=1)[1]
+        shift = num_sampled * paddle.arange(batch_size, dtype='int64')
+        idx += shift.unsqueeze([-1])
+        idx = idx.reshape([-1])
+        point_coords = paddle.index_select(
+            point_coords.reshape([-1, 2]), idx, axis=0)
+        point_coords = point_coords.reshape(
+            [batch_size, num_uncertain_points, 2])
+        if num_random_points > 0:
+            rand_point_coords = paddle.rand([batch_size, num_random_points, 2])
+            point_coords = paddle.concat((point_coords, rand_point_coords),
+                                         axis=1)
+        return point_coords
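+
+    # Sampling budget under the defaults: num_points=2048 and
+    # oversample_ratio=3 give 6144 uniformly drawn candidates;
+    # importance_sample_ratio=0.75 keeps the 1536 most uncertain of them and
+    # tops up with 512 random points, for 2048 training points per image.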
+
+    def get_points_test(self, seg_logits, uncertainty_func):
+        """
+        Sample points for testing.
+
+        Find the ``num_points`` most uncertain points from ``uncertainty_map``.
+
+        Args:
+            seg_logits (Tensor): A tensor of shape (batch_size, num_classes,
+                height, width) for class-specific or class-agnostic prediction.
+            uncertainty_func (func): uncertainty calculation function.
+        Returns:
+            point_indices (Tensor): A tensor of shape (batch_size, num_points)
+                that contains indices from [0, height x width) of the most
+                uncertain points.
+            point_coords (Tensor): A tensor of shape (batch_size, num_points,
+                2) that contains [0, 1] x [0, 1] normalized coordinates of the
+                most uncertain points from the ``height x width`` grid.
+        """
+
+        num_points = self.subdivision_num_points
+        uncertainty_map = uncertainty_func(seg_logits)
+        batch_size = paddle.shape(uncertainty_map)[0]
+        height = paddle.shape(uncertainty_map)[2]
+        width = paddle.shape(uncertainty_map)[3]
+        h_step = 1.0 / height
+        w_step = 1.0 / width
+
+        uncertainty_map = uncertainty_map.reshape([batch_size, height * width])
+        num_points = paddle.min(paddle.concat([height * width, num_points]))
+        point_indices = paddle.topk(uncertainty_map, num_points, axis=1)[1]
+        point_coords = paddle.zeros([batch_size, num_points, 2],
+                                    dtype='float32')
+        point_coords[:, :, 0] = w_step / 2.0 + (
+            point_indices % width).astype('float32') * w_step
+        point_coords[:, :, 1] = h_step / 2.0 + (
+            point_indices // width).astype('float32') * h_step
+        return point_indices, point_coords
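+
+    # Index-to-coordinate sketch: on a 4x4 map (h_step = w_step = 0.25), flat
+    # index 6 (row 1, col 2) maps to x = 0.125 + 2 * 0.25 = 0.625 and
+    # y = 0.125 + 1 * 0.25 = 0.375, the center of that cell in [0, 1] x [0, 1].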
+
+    def scatter_paddle(self, refined_seg_logits, point_indices, point_logits):
+        """
+        Paddle version of scatter: equal to the PyTorch version
+        scatter(-1, point_indices, point_logits).
+
+        Args:
+            refined_seg_logits (Tensor): shape=[batch_size, channels, height * width]
+            point_indices (Tensor): shape=[batch_size, channels, num_points]
+            point_logits (Tensor): shape=[batch_size, channels, num_points]
+        Returns:
+            Tensor: the scattered refined_seg_logits.
+        """
+
+        original_shape = paddle.shape(refined_seg_logits)
+        new_refined_seg_logits = refined_seg_logits.flatten(0, 1)
+        offsets = (paddle.arange(paddle.shape(new_refined_seg_logits)[0]) *
+                   paddle.shape(new_refined_seg_logits)[1]).unsqueeze(-1)
+        point_indices = point_indices.flatten(0, 1)
+        new_point_indices = (point_indices + offsets).flatten()
+        point_logits = point_logits.flatten()
+        refined_seg_logits = paddle.scatter(
+            refined_seg_logits.flatten(),
+            new_point_indices,
+            point_logits,
+            overwrite=True)
+        return refined_seg_logits.reshape(shape=original_shape)
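+
+    # A small numeric check of the offset trick above: with batch_size=1,
+    # channels=2 and height * width = 4, the flattened logits have 8 entries
+    # and the row offsets are [0, 4], so point_indices [[1], [1]] become flat
+    # indices [1, 5] and each channel scatters into its own slice.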
+
+    def forward_train(self, x, prev_output):
+        with paddle.no_grad():
+            points = self.get_points_train(prev_output, calculate_uncertainty)
+
+        fine_grained_point_feats = self._get_fine_grained_point_feats(
+            x, points)
+        coarse_point_feats = self._get_coarse_point_feats(prev_output, points)
+
+        fusion_point_feats = paddle.concat(
+            [fine_grained_point_feats, coarse_point_feats], axis=1)
+        for fc in self.fcs:
+            fusion_point_feats = fc(fusion_point_feats)
+            if self.coarse_pred_each_layer:
+                fusion_point_feats = paddle.concat(
+                    (fusion_point_feats, coarse_point_feats), axis=1)
+        point_logits = self.cls_seg(fusion_point_feats)
+        return [point_logits, points]
+
+    def forward(self, inputs, prev_output):
+        """
+        Forward function.
+
+        Args:
+            inputs (list[Tensor]): List of multi-level image features.
+            prev_output (Tensor): The output of the previous decode head.
+        Returns:
+            [point_logits, points]: for the point loss in training mode.
+            [refined_seg_logits]: the refined seg logits in inference mode.
+        """
+
+        prev_output = prev_output[0]
+        x = self._transform_inputs(inputs)
+        if self.training:
+            return self.forward_train(x, prev_output)
+        else:
+            refined_seg_logits = prev_output.clone()
+            for _ in range(self.subdivision_steps):
+                refined_seg_logits = F.interpolate(
+                    refined_seg_logits,
+                    scale_factor=self.scale_factor,
+                    mode='bilinear',
+                    align_corners=self.align_corners)
+
+                save_shape = paddle.shape(refined_seg_logits)
+                point_indices, points = self.get_points_test(
+                    refined_seg_logits, calculate_uncertainty)
+                fine_grained_point_feats = self._get_fine_grained_point_feats(
+                    x, points)
+                coarse_point_feats = self._get_coarse_point_feats(
+                    prev_output, points)
+
+                fusion_point_feats = paddle.concat(
+                    [fine_grained_point_feats, coarse_point_feats], axis=1)
+                for fc in self.fcs:
+                    fusion_point_feats = fc(fusion_point_feats)
+                    if self.coarse_pred_each_layer:
+                        fusion_point_feats = paddle.concat(
+                            (fusion_point_feats, coarse_point_feats), axis=1)
+                point_logits = self.cls_seg(fusion_point_feats)
+                point_indices = paddle.unsqueeze(point_indices, axis=1)
+                point_indices = paddle.expand(point_indices,
+                                              [-1, save_shape[1], -1])
+
+                refined_seg_logits = paddle.flatten(refined_seg_logits, 2)
+                refined_seg_logits = self.scatter_paddle(
+                    refined_seg_logits, point_indices, point_logits)
+                refined_seg_logits = refined_seg_logits.reshape(save_shape)
+            return [refined_seg_logits]
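+
+    # Inference refinement sketch: with scale_factor=2 and
+    # subdivision_steps=2, a 128x128 coarse prediction grows to 256x256 and
+    # then 512x512, and at each step only the subdivision_num_points most
+    # uncertain locations are re-predicted and scattered back in.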
+
+
+class FPNHead(nn.Layer):
+    """
+    This head is the implementation of Semantic FPN in PaddlePaddle.
+
+    The original article refers to:
+    Kirillov, A., et al. "Panoptic Feature Pyramid Networks."
+    (https://arxiv.org/abs/1901.02446)
+
+    Args:
+        num_class (int): The unique number of target classes. Default: 19.
+        feature_strides (list): The strides of the input feature maps; all strides are supposed to be powers of 2, and the first
+            one belongs to the largest resolution. Default: [4, 8, 16, 32].
+        in_channels (list): The channels of the input features. Default: [256, 256, 256, 256].
+        channels (int, optional): The output channels of each scale_head's Conv before the Upsample block. Default: 128.
+        in_index (list): The indices of input features to use; its length should match that of in_channels. Default: [0, 1, 2, 3].
+        dropout_ratio (float, optional): If dropout_ratio > 0, apply Dropout (with p = dropout_ratio) before the output. Default: 0.1.
+        conv_cfg (str): The Conv config. Default: 'Conv2D'.
+        input_transform (str): The feature transform method for inputs; see the '_transform_inputs' function. Default: 'multiple_select'.
+        align_corners (bool, optional): An argument of F.interpolate. It should be set to False when the feature size is even,
+            e.g. 1024x512; otherwise it is True, e.g. 769x769. Default: False.
+    """
+
+    def __init__(
+            self,
+            num_class=19,
+            feature_strides=[4, 8, 16, 32],
+            in_channels=[256, 256, 256, 256],
+            channels=128,
+            in_index=[0, 1, 2, 3],
+            dropout_ratio=0.1,
+            conv_cfg='Conv2D',
+            input_transform='multiple_select',
+            align_corners=False):
+        super(FPNHead, self).__init__()
+        assert len(feature_strides) == len(in_channels)
+        assert min(feature_strides) == feature_strides[0]
+        self.feature_strides = feature_strides
+        self.in_channels = in_channels
+        self.channels = channels
+        self.in_index = in_index
+        self.num_class = num_class
+        self.conv_cfg = conv_cfg
+        self.dropout_ratio = dropout_ratio
+        self.input_transform = input_transform
+        self.align_corners = align_corners
+        self.scale_heads = nn.LayerList()
+
+        for i in range(len(feature_strides)):
+            head_length = max(
+                1,
+                int(np.log2(feature_strides[i]) - np.log2(feature_strides[0])))
+            scale_head = []
+            for k in range(head_length):
+                scale_head.append(
+                    ConvModule(
+                        self.in_channels[i] if k == 0 else self.channels,
+                        self.channels,
+                        3,
+                        padding=1,
+                        conv_cfg=self.conv_cfg))
+                if feature_strides[i] != feature_strides[0]:
+                    scale_head.append(
+                        Upsample(
+                            scale_factor=2,
+                            mode='bilinear',
+                            align_corners=self.align_corners))
+            self.scale_heads.append(nn.Sequential(*scale_head))
+
+        self.conv_seg = nn.Conv2D(self.channels, self.num_class, kernel_size=1)
+
+        if self.dropout_ratio is not None:
+            self.dropout = nn.Dropout2D(self.dropout_ratio)
+        else:
+            self.dropout = None
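+
+    # Head-depth sketch: with feature_strides=[4, 8, 16, 32], head_length
+    # works out to 1, 1, 2 and 3, so the branches hold one to three
+    # ConvModules, and every branch except the first interleaves 2x bilinear
+    # Upsample blocks until its output reaches stride 4.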
+
+    def cls_seg(self, feat):
+        if self.dropout is not None:
+            feat = self.dropout(feat)
+        output = self.conv_seg(feat)
+        return output
+
+    def _transform_inputs(self, inputs):
+        """
+        Transform inputs for the decoder.
+
+        Args:
+            inputs (list[Tensor]): List of multi-level image features.
+        Returns:
+            Tensor or list[Tensor]: The transformed inputs.
+        """
+
+        if self.input_transform == 'resize_concat':
+            inputs = [inputs[i] for i in self.in_index]
+            upsampled_inputs = [
+                F.interpolate(
+                    x,
+                    size=paddle.shape(inputs[0])[2:],
+                    mode='bilinear',
+                    align_corners=self.align_corners) for x in inputs
+            ]
+            inputs = paddle.concat(upsampled_inputs, axis=1)
+        elif self.input_transform == 'multiple_select':
+            inputs = [inputs[i] for i in self.in_index]
+        else:
+            inputs = inputs[self.in_index[0]]
+
+        return inputs
+
+    def forward(self, inputs):
+        x = self._transform_inputs(inputs)
+        output = self.scale_heads[0](x[0])
+        for i in range(1, len(self.feature_strides)):
+            output = output + F.interpolate(
+                self.scale_heads[i](x[i]),
+                size=paddle.shape(output)[2:],
+                mode='bilinear',
+                align_corners=self.align_corners)
+        output = self.cls_seg(output)
+        return [output]
+
+
+class FPNNeck(nn.Layer):
+    """
+    The FPN Neck implementation in PaddlePaddle.
+
+    Args:
+        fpn_inplanes (list, optional): Input channels list (the feature channels from the backbone) for the lateral_conv construction. Default: [256, 512, 1024, 2048].
+        fpn_outplanes (int, optional): The output channels. Default: 256.
+    """
+
+    def __init__(
+            self,
+            fpn_inplanes=[256, 512, 1024, 2048],
+            fpn_outplanes=256):
+        super(FPNNeck, self).__init__()
+        self.lateral_convs = []
+        self.fpn_out = []
+
+        for fpn_inplane in fpn_inplanes:
+            self.lateral_convs.append(
+                nn.Sequential(
+                    nn.Conv2D(fpn_inplane, fpn_outplanes, 1),
+                    layers.SyncBatchNorm(fpn_outplanes), nn.ReLU()))
+            self.fpn_out.append(
+                nn.Sequential(
+                    layers.ConvBNReLU(
+                        fpn_outplanes, fpn_outplanes, 3, bias_attr=False)))
+
+        self.lateral_convs = nn.LayerList(self.lateral_convs)
+        self.fpn_out = nn.LayerList(self.fpn_out)
+
+    def forward(self, conv_out):
+        last_out = self.lateral_convs[-1](conv_out[-1])
+        f = last_out
+        fpn_feature_list = [last_out]
+        for i in reversed(range(len(conv_out) - 1)):
+            conv_x = conv_out[i]
+            conv_x = self.lateral_convs[i](conv_x)
+            prev_shape = paddle.shape(conv_x)[2:]
+            f = conv_x + F.interpolate(
+                f, prev_shape, mode='bilinear', align_corners=True)
+            fpn_feature_list.append(self.fpn_out[i](f))
+        return fpn_feature_list
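+
+# A shape sketch under the defaults (input sizes assumed for illustration):
+# given backbone features [N, 256, 128, 128], [N, 512, 64, 64],
+# [N, 1024, 32, 32] and [N, 2048, 16, 16], each lateral conv maps to 256
+# channels and the returned list holds the 16x16 map first, then 32x32,
+# 64x64 and 128x128 as the top-down pathway accumulates.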
+
+
+class ConvModule(nn.Layer):
+    """
+    ConvModule includes Conv1D/Conv2D, optional SyncBatchNorm, and ReLU.
+    """
+
+    def __init__(self,
+                 in_channels,
+                 out_channels,
+                 kernel_size,
+                 padding=0,
+                 stride=1,
+                 conv_cfg='Conv1D',
+                 norm_cfg='None',
+                 **kwargs):
+        super().__init__()
+        if conv_cfg == 'Conv1D':
+            self._conv = nn.Conv1D(
+                in_channels,
+                out_channels,
+                kernel_size,
+                stride=stride,
+                padding=padding,
+                **kwargs)
+        if conv_cfg == 'Conv2D':
+            self._conv = nn.Conv2D(
+                in_channels,
+                out_channels,
+                kernel_size,
+                stride=stride,
+                padding=padding,
+                **kwargs)
+        if 'data_format' in kwargs:
+            data_format = kwargs['data_format']
+        else:
+            data_format = 'NCHW'
+        if norm_cfg != 'None':
+            self._batch_norm = layers.SyncBatchNorm(
+                out_channels, data_format=data_format)
+        else:
+            self._batch_norm = None
+
+    def forward(self, x):
+        x = self._conv(x)
+        if self._batch_norm is not None:
+            x = self._batch_norm(x)
+        x = F.relu(x)
+        return x
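+
+# In this file ConvModule is instantiated with conv_cfg='Conv1D' by PointHead
+# (a shared MLP over points, input shape [N, C, num_points]) and with
+# conv_cfg='Conv2D' by FPNHead's scale heads; neither caller passes norm_cfg,
+# so each module reduces to conv + ReLU.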
+
+
+class Upsample(nn.Layer):
+    """
+    Upsample module.
+    """
+
+    def __init__(self,
+                 size=None,
+                 scale_factor=None,
+                 mode='nearest',
+                 align_corners=None):
+        super(Upsample, self).__init__()
+        self.size = size
+        if isinstance(scale_factor, tuple):
+            self.scale_factor = tuple(float(factor) for factor in scale_factor)
+        else:
+            self.scale_factor = float(scale_factor) if scale_factor else None
+        self.mode = mode
+        self.align_corners = align_corners
+
+    def forward(self, x):
+        if not self.size:
+            return F.interpolate(x, None, self.scale_factor, self.mode,
+                                 self.align_corners)
+        else:
+            return F.interpolate(x, self.size, None, self.mode,
+                                 self.align_corners)
+
+
+def point_sample(input, points, align_corners=False, **kwargs):
+    """
+    A wrapper around :func:`grid_sample` to support 3D point_coords tensors.
+    Unlike :func:`torch.nn.functional.grid_sample` it assumes point_coords to
+    lie inside the ``[0, 1] x [0, 1]`` square.
+
+    Args:
+        input (Tensor): Feature map, shape (N, C, H, W).
+        points (Tensor): Image-based absolute point coordinates (normalized),
+            range [0, 1] x [0, 1], shape (N, P, 2) or (N, Hgrid, Wgrid, 2).
+        align_corners (bool): Whether to align corners. Default: False.
+    Returns:
+        Tensor: Features of `points` on `input`, shape (N, C, P) or
+            (N, C, Hgrid, Wgrid).
+    """
+
+    def denormalize(grid):
+        """Denormalize the input grid from range [0, 1] to [-1, 1].
+
+        Args:
+            grid (Tensor): The grid to be denormalized, range [0, 1].
+        Returns:
+            Tensor: Denormalized grid, range [-1, 1].
+        """
+        return grid * 2.0 - 1.0
+
+    add_dim = False
+    if points.dim() == 3:
+        add_dim = True
+        points = paddle.unsqueeze(points, axis=2)
+    output = F.grid_sample(
+        input, denormalize(points), align_corners=align_corners, **kwargs)
+    if add_dim:
+        output = paddle.squeeze(output, axis=3)
+    return output
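+
+# A quick check of the coordinate convention: a point at (0.5, 0.5)
+# denormalizes to (0.0, 0.0), the center of the feature map in grid_sample's
+# [-1, 1] convention, so the cell-center coordinates produced by
+# get_points_train/get_points_test sample where expected.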
+
+
+def calculate_uncertainty(seg_logits):
+    """
+    Estimate uncertainty based on seg logits.
+
+    For each location of the prediction ``seg_logits`` we estimate the
+    uncertainty as the difference between the top-first and top-second
+    predicted logits.
+
+    Args:
+        seg_logits (Tensor): Semantic segmentation logits,
+            shape (batch_size, num_classes, height, width).
+    Returns:
+        scores (Tensor): Uncertainty scores, with the most uncertain
+            locations having the highest uncertainty score, shape
+            (batch_size, 1, height, width).
+    """
+
+    top2_scores = paddle.topk(seg_logits, k=2, axis=1)[0]
+    return paddle.unsqueeze(top2_scores[:, 1] - top2_scores[:, 0], axis=1)
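+
+# A numeric sketch: for per-pixel logits [3.0, 2.5, -1.0] the top-2 values
+# are (3.0, 2.5) and the score is 2.5 - 3.0 = -0.5; a harder pixel with
+# logits [3.0, 2.9, -1.0] scores -0.1, which is larger, so topk over these
+# scores selects the pixels with the smallest top-1/top-2 margin.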