Преглед изворни кода

[Feature] Add modules for remote sensing detection

Liu Yi пре 4 година
родитељ
комит
16c85bf3c2
100 измењених фајлова са 29530 додато и 1 уклоњено
  1. 5 1
      paddlers/__init__.py
  2. 1 0
      paddlers/datasets/__init__.py
  3. 445 0
      paddlers/datasets/voc.py
  4. 15 0
      paddlers/models/__init__.py
  5. 0 0
      paddlers/models/ppcd/__init__.py
  6. 0 0
      paddlers/models/ppcls/__init__.py
  7. 16 0
      paddlers/models/ppdet/__init__.py
  8. 15 0
      paddlers/models/ppdet/core/__init__.py
  9. 13 0
      paddlers/models/ppdet/core/config/__init__.py
  10. 248 0
      paddlers/models/ppdet/core/config/schema.py
  11. 118 0
      paddlers/models/ppdet/core/config/yaml_helpers.py
  12. 278 0
      paddlers/models/ppdet/core/workspace.py
  13. 21 0
      paddlers/models/ppdet/data/__init__.py
  14. 13 0
      paddlers/models/ppdet/data/crop_utils/__init__.py
  15. 585 0
      paddlers/models/ppdet/data/crop_utils/annotation_cropper.py
  16. 170 0
      paddlers/models/ppdet/data/crop_utils/chip_box_utils.py
  17. 302 0
      paddlers/models/ppdet/data/reader.py
  18. 67 0
      paddlers/models/ppdet/data/shm_utils.py
  19. 29 0
      paddlers/models/ppdet/data/source/__init__.py
  20. 904 0
      paddlers/models/ppdet/data/source/category.py
  21. 251 0
      paddlers/models/ppdet/data/source/coco.py
  22. 197 0
      paddlers/models/ppdet/data/source/dataset.py
  23. 669 0
      paddlers/models/ppdet/data/source/keypoint_coco.py
  24. 636 0
      paddlers/models/ppdet/data/source/mot.py
  25. 191 0
      paddlers/models/ppdet/data/source/sniper_coco.py
  26. 231 0
      paddlers/models/ppdet/data/source/voc.py
  27. 180 0
      paddlers/models/ppdet/data/source/widerface.py
  28. 28 0
      paddlers/models/ppdet/data/transform/__init__.py
  29. 270 0
      paddlers/models/ppdet/data/transform/atss_assigner.py
  30. 1591 0
      paddlers/models/ppdet/data/transform/autoaugment_utils.py
  31. 1080 0
      paddlers/models/ppdet/data/transform/batch_operators.py
  32. 86 0
      paddlers/models/ppdet/data/transform/gridmask_utils.py
  33. 868 0
      paddlers/models/ppdet/data/transform/keypoint_operators.py
  34. 628 0
      paddlers/models/ppdet/data/transform/mot_operators.py
  35. 498 0
      paddlers/models/ppdet/data/transform/op_helper.py
  36. 3025 0
      paddlers/models/ppdet/data/transform/operators.py
  37. 30 0
      paddlers/models/ppdet/engine/__init__.py
  38. 340 0
      paddlers/models/ppdet/engine/callbacks.py
  39. 50 0
      paddlers/models/ppdet/engine/env.py
  40. 177 0
      paddlers/models/ppdet/engine/export_utils.py
  41. 538 0
      paddlers/models/ppdet/engine/tracker.py
  42. 742 0
      paddlers/models/ppdet/engine/trainer.py
  43. 29 0
      paddlers/models/ppdet/metrics/__init__.py
  44. 184 0
      paddlers/models/ppdet/metrics/coco_utils.py
  45. 149 0
      paddlers/models/ppdet/metrics/json_results.py
  46. 401 0
      paddlers/models/ppdet/metrics/keypoint_metrics.py
  47. 444 0
      paddlers/models/ppdet/metrics/map_utils.py
  48. 470 0
      paddlers/models/ppdet/metrics/mcmot_metrics.py
  49. 434 0
      paddlers/models/ppdet/metrics/metrics.py
  50. 1236 0
      paddlers/models/ppdet/metrics/mot_metrics.py
  51. 428 0
      paddlers/models/ppdet/metrics/munkres.py
  52. 393 0
      paddlers/models/ppdet/metrics/widerface_utils.py
  53. 18 0
      paddlers/models/ppdet/model_zoo/__init__.py
  54. 84 0
      paddlers/models/ppdet/model_zoo/model_zoo.py
  55. 45 0
      paddlers/models/ppdet/modeling/__init__.py
  56. 51 0
      paddlers/models/ppdet/modeling/architectures/__init__.py
  57. 91 0
      paddlers/models/ppdet/modeling/architectures/blazeface.py
  58. 144 0
      paddlers/models/ppdet/modeling/architectures/cascade_rcnn.py
  59. 108 0
      paddlers/models/ppdet/modeling/architectures/centernet.py
  60. 69 0
      paddlers/models/ppdet/modeling/architectures/deepsort.py
  61. 93 0
      paddlers/models/ppdet/modeling/architectures/detr.py
  62. 100 0
      paddlers/models/ppdet/modeling/architectures/fairmot.py
  63. 106 0
      paddlers/models/ppdet/modeling/architectures/faster_rcnn.py
  64. 105 0
      paddlers/models/ppdet/modeling/architectures/fcos.py
  65. 87 0
      paddlers/models/ppdet/modeling/architectures/gfl.py
  66. 111 0
      paddlers/models/ppdet/modeling/architectures/jde.py
  67. 287 0
      paddlers/models/ppdet/modeling/architectures/keypoint_hrhrnet.py
  68. 267 0
      paddlers/models/ppdet/modeling/architectures/keypoint_hrnet.py
  69. 135 0
      paddlers/models/ppdet/modeling/architectures/mask_rcnn.py
  70. 141 0
      paddlers/models/ppdet/modeling/architectures/meta_arch.py
  71. 91 0
      paddlers/models/ppdet/modeling/architectures/picodet.py
  72. 102 0
      paddlers/models/ppdet/modeling/architectures/s2anet.py
  73. 110 0
      paddlers/models/ppdet/modeling/architectures/solov2.py
  74. 99 0
      paddlers/models/ppdet/modeling/architectures/sparse_rcnn.py
  75. 93 0
      paddlers/models/ppdet/modeling/architectures/ssd.py
  76. 78 0
      paddlers/models/ppdet/modeling/architectures/tood.py
  77. 98 0
      paddlers/models/ppdet/modeling/architectures/ttfnet.py
  78. 124 0
      paddlers/models/ppdet/modeling/architectures/yolo.py
  79. 23 0
      paddlers/models/ppdet/modeling/assigners/__init__.py
  80. 211 0
      paddlers/models/ppdet/modeling/assigners/atss_assigner.py
  81. 262 0
      paddlers/models/ppdet/modeling/assigners/simota_assigner.py
  82. 158 0
      paddlers/models/ppdet/modeling/assigners/task_aligned_assigner.py
  83. 195 0
      paddlers/models/ppdet/modeling/assigners/utils.py
  84. 49 0
      paddlers/models/ppdet/modeling/backbones/__init__.py
  85. 320 0
      paddlers/models/ppdet/modeling/backbones/blazenet.py
  86. 340 0
      paddlers/models/ppdet/modeling/backbones/darknet.py
  87. 244 0
      paddlers/models/ppdet/modeling/backbones/dla.py
  88. 290 0
      paddlers/models/ppdet/modeling/backbones/esnet.py
  89. 470 0
      paddlers/models/ppdet/modeling/backbones/ghostnet.py
  90. 224 0
      paddlers/models/ppdet/modeling/backbones/hardnet.py
  91. 727 0
      paddlers/models/ppdet/modeling/backbones/hrnet.py
  92. 259 0
      paddlers/models/ppdet/modeling/backbones/lcnet.py
  93. 886 0
      paddlers/models/ppdet/modeling/backbones/lite_hrnet.py
  94. 411 0
      paddlers/models/ppdet/modeling/backbones/mobilenet_v1.py
  95. 479 0
      paddlers/models/ppdet/modeling/backbones/mobilenet_v3.py
  96. 69 0
      paddlers/models/ppdet/modeling/backbones/name_adapter.py
  97. 358 0
      paddlers/models/ppdet/modeling/backbones/res2net.py
  98. 609 0
      paddlers/models/ppdet/modeling/backbones/resnet.py
  99. 139 0
      paddlers/models/ppdet/modeling/backbones/senet.py
  100. 251 0
      paddlers/models/ppdet/modeling/backbones/shufflenet_v2.py

+ 5 - 1
paddlers/__init__.py

@@ -1 +1,5 @@
-from . import datasets, transforms, utils, tools
+from . import tasks, datasets, transforms, utils, tools, models
+
+# TODO, add these info in installation
+env_info = {'place': 'gpu', 'num': 1}
+__version__ = 0.1

+ 1 - 0
paddlers/datasets/__init__.py

@@ -0,0 +1 @@
+from .voc import VOCDetection

+ 445 - 0
paddlers/datasets/voc.py

@@ -0,0 +1,445 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import absolute_import
+import copy
+import os
+import os.path as osp
+import random
+import re
+import numpy as np
+from collections import OrderedDict
+import xml.etree.ElementTree as ET
+from paddle.io import Dataset
+from paddlers.utils import logging, get_num_workers, get_encoding, path_normalization, is_pic
+from paddlers.transforms import Decode, MixupImage
+from paddlers.tools import YOLOAnchorCluster
+
+
+class VOCDetection(Dataset):
+    """读取PascalVOC格式的检测数据集,并对样本进行相应的处理。
+
+    Args:
+        data_dir (str): 数据集所在的目录路径。
+        file_list (str): 描述数据集图片文件和对应标注文件的文件路径(文本内每行路径为相对data_dir的相对路)。
+        label_list (str): 描述数据集包含的类别信息文件路径。
+        transforms (paddlers.det.transforms): 数据集中每个样本的预处理/增强算子。
+        num_workers (int|str): 数据集中样本在预处理过程中的线程或进程数。默认为'auto'。当设为'auto'时,根据
+            系统的实际CPU核数设置`num_workers`: 如果CPU核数的一半大于8,则`num_workers`为8,否则为CPU核数的
+            一半。
+        shuffle (bool): 是否需要对数据集中样本打乱顺序。默认为False。
+        allow_empty (bool): 是否加载负样本。默认为False。
+        empty_ratio (float): 用于指定负样本占总样本数的比例。如果小于0或大于等于1,则保留全部的负样本。默认为1。
+    """
+
+    def __init__(self,
+                 data_dir,
+                 file_list,
+                 label_list,
+                 transforms=None,
+                 num_workers='auto',
+                 shuffle=False,
+                 allow_empty=False,
+                 empty_ratio=1.):
+        # matplotlib.use() must be called *before* pylab, matplotlib.pyplot,
+        # or matplotlib.backends is imported for the first time
+        # pycocotools import matplotlib
+        import matplotlib
+        matplotlib.use('Agg')
+        from pycocotools.coco import COCO
+        super(VOCDetection, self).__init__()
+        self.data_dir = data_dir
+        self.data_fields = None
+        self.transforms = copy.deepcopy(transforms)
+        self.num_max_boxes = 50
+
+        self.use_mix = False
+        if self.transforms is not None:
+            for op in self.transforms.transforms:
+                if isinstance(op, MixupImage):
+                    self.mixup_op = copy.deepcopy(op)
+                    self.use_mix = True
+                    self.num_max_boxes *= 2
+                    break
+
+        self.batch_transforms = None
+        self.num_workers = get_num_workers(num_workers)
+        self.shuffle = shuffle
+        self.allow_empty = allow_empty
+        self.empty_ratio = empty_ratio
+        self.file_list = list()
+        neg_file_list = list()
+        self.labels = list()
+
+        annotations = dict()
+        annotations['images'] = list()
+        annotations['categories'] = list()
+        annotations['annotations'] = list()
+
+        cname2cid = OrderedDict()
+        label_id = 0
+        with open(label_list, 'r', encoding=get_encoding(label_list)) as f:
+            for line in f.readlines():
+                cname2cid[line.strip()] = label_id
+                label_id += 1
+                self.labels.append(line.strip())
+        logging.info("Starting to read file list from dataset...")
+        for k, v in cname2cid.items():
+            annotations['categories'].append({
+                'supercategory': 'component',
+                'id': v + 1,
+                'name': k
+            })
+        ct = 0
+        ann_ct = 0
+        with open(file_list, 'r', encoding=get_encoding(file_list)) as f:
+            while True:
+                line = f.readline()
+                if not line:
+                    break
+                if len(line.strip().split()) > 2:
+                    raise Exception("A space is defined as the separator, "
+                                    "but it exists in image or label name {}."
+                                    .format(line))
+                img_file, xml_file = [
+                    osp.join(data_dir, x) for x in line.strip().split()[:2]
+                ]
+                img_file = path_normalization(img_file)
+                xml_file = path_normalization(xml_file)
+                if not is_pic(img_file):
+                    continue
+                if not osp.isfile(xml_file):
+                    continue
+                if not osp.exists(img_file):
+                    logging.warning('The image file {} does not exist!'.format(
+                        img_file))
+                    continue
+                if not osp.exists(xml_file):
+                    logging.warning('The annotation file {} does not exist!'.
+                                    format(xml_file))
+                    continue
+                tree = ET.parse(xml_file)
+                if tree.find('id') is None:
+                    im_id = np.asarray([ct])
+                else:
+                    ct = int(tree.find('id').text)
+                    im_id = np.asarray([int(tree.find('id').text)])
+                pattern = re.compile('<size>', re.IGNORECASE)
+                size_tag = pattern.findall(
+                    str(ET.tostringlist(tree.getroot())))
+                if len(size_tag) > 0:
+                    size_tag = size_tag[0][1:-1]
+                    size_element = tree.find(size_tag)
+                    pattern = re.compile('<width>', re.IGNORECASE)
+                    width_tag = pattern.findall(
+                        str(ET.tostringlist(size_element)))[0][1:-1]
+                    im_w = float(size_element.find(width_tag).text)
+                    pattern = re.compile('<height>', re.IGNORECASE)
+                    height_tag = pattern.findall(
+                        str(ET.tostringlist(size_element)))[0][1:-1]
+                    im_h = float(size_element.find(height_tag).text)
+                else:
+                    im_w = 0
+                    im_h = 0
+
+                pattern = re.compile('<object>', re.IGNORECASE)
+                obj_match = pattern.findall(
+                    str(ET.tostringlist(tree.getroot())))
+                if len(obj_match) > 0:
+                    obj_tag = obj_match[0][1:-1]
+                    objs = tree.findall(obj_tag)
+                else:
+                    objs = list()
+
+                num_bbox, i = len(objs), 0
+                gt_bbox = np.zeros((num_bbox, 4), dtype=np.float32)
+                gt_class = np.zeros((num_bbox, 1), dtype=np.int32)
+                gt_score = np.zeros((num_bbox, 1), dtype=np.float32)
+                is_crowd = np.zeros((num_bbox, 1), dtype=np.int32)
+                difficult = np.zeros((num_bbox, 1), dtype=np.int32)
+                for obj in objs:
+                    pattern = re.compile('<name>', re.IGNORECASE)
+                    name_tag = pattern.findall(str(ET.tostringlist(obj)))[0][
+                        1:-1]
+                    cname = obj.find(name_tag).text.strip()
+                    pattern = re.compile('<difficult>', re.IGNORECASE)
+                    diff_tag = pattern.findall(str(ET.tostringlist(obj)))
+                    if len(diff_tag) == 0:
+                        _difficult = 0
+                    else:
+                        diff_tag = diff_tag[0][1:-1]
+                        try:
+                            _difficult = int(obj.find(diff_tag).text)
+                        except Exception:
+                            _difficult = 0
+                    pattern = re.compile('<bndbox>', re.IGNORECASE)
+                    box_tag = pattern.findall(str(ET.tostringlist(obj)))
+                    if len(box_tag) == 0:
+                        logging.warning(
+                            "There's no field '<bndbox>' in one of object, "
+                            "so this object will be ignored. xml file: {}".
+                            format(xml_file))
+                        continue
+                    box_tag = box_tag[0][1:-1]
+                    box_element = obj.find(box_tag)
+                    pattern = re.compile('<xmin>', re.IGNORECASE)
+                    xmin_tag = pattern.findall(
+                        str(ET.tostringlist(box_element)))[0][1:-1]
+                    x1 = float(box_element.find(xmin_tag).text)
+                    pattern = re.compile('<ymin>', re.IGNORECASE)
+                    ymin_tag = pattern.findall(
+                        str(ET.tostringlist(box_element)))[0][1:-1]
+                    y1 = float(box_element.find(ymin_tag).text)
+                    pattern = re.compile('<xmax>', re.IGNORECASE)
+                    xmax_tag = pattern.findall(
+                        str(ET.tostringlist(box_element)))[0][1:-1]
+                    x2 = float(box_element.find(xmax_tag).text)
+                    pattern = re.compile('<ymax>', re.IGNORECASE)
+                    ymax_tag = pattern.findall(
+                        str(ET.tostringlist(box_element)))[0][1:-1]
+                    y2 = float(box_element.find(ymax_tag).text)
+                    x1 = max(0, x1)
+                    y1 = max(0, y1)
+                    if im_w > 0.5 and im_h > 0.5:
+                        x2 = min(im_w - 1, x2)
+                        y2 = min(im_h - 1, y2)
+
+                    if not (x2 >= x1 and y2 >= y1):
+                        logging.warning(
+                            "Bounding box for object {} does not satisfy xmin {} <= xmax {} and ymin {} <= ymax {}, "
+                            "so this object is skipped. xml file: {}".format(i, x1, x2, y1, y2, xml_file))
+                        continue
+
+                    gt_bbox[i, :] = [x1, y1, x2, y2]
+                    gt_class[i, 0] = cname2cid[cname]
+                    gt_score[i, 0] = 1.
+                    is_crowd[i, 0] = 0
+                    difficult[i, 0] = _difficult
+                    i += 1
+                    annotations['annotations'].append({
+                        'iscrowd': 0,
+                        'image_id': int(im_id[0]),
+                        'bbox': [x1, y1, x2 - x1, y2 - y1],
+                        'area': float((x2 - x1) * (y2 - y1)),
+                        'category_id': cname2cid[cname] + 1,
+                        'id': ann_ct,
+                        'difficult': _difficult
+                    })
+                    ann_ct += 1
+
+                gt_bbox = gt_bbox[:i, :]
+                gt_class = gt_class[:i, :]
+                gt_score = gt_score[:i, :]
+                is_crowd = is_crowd[:i, :]
+                difficult = difficult[:i, :]
+
+                im_info = {
+                    'im_id': im_id,
+                    'image_shape': np.array(
+                        [im_h, im_w], dtype=np.int32)
+                }
+                label_info = {
+                    'is_crowd': is_crowd,
+                    'gt_class': gt_class,
+                    'gt_bbox': gt_bbox,
+                    'gt_score': gt_score,
+                    'difficult': difficult
+                }
+
+                if gt_bbox.size > 0:
+                    self.file_list.append({
+                        'image': img_file,
+                        **
+                        im_info,
+                        **
+                        label_info
+                    })
+                    annotations['images'].append({
+                        'height': im_h,
+                        'width': im_w,
+                        'id': int(im_id[0]),
+                        'file_name': osp.split(img_file)[1]
+                    })
+                else:
+                    neg_file_list.append({
+                        'image': img_file,
+                        **
+                        im_info,
+                        **
+                        label_info
+                    })
+                ct += 1
+
+                if self.use_mix:
+                    self.num_max_boxes = max(self.num_max_boxes, 2 * len(objs))
+                else:
+                    self.num_max_boxes = max(self.num_max_boxes, len(objs))
+
+        if not ct:
+            logging.error(
+                "No voc record found in %s' % (file_list)", exit=True)
+        self.pos_num = len(self.file_list)
+        if self.allow_empty and neg_file_list:
+            self.file_list += self._sample_empty(neg_file_list)
+        logging.info(
+            "{} samples in file {}, including {} positive samples and {} negative samples.".
+            format(
+                len(self.file_list), file_list, self.pos_num,
+                len(self.file_list) - self.pos_num))
+        self.num_samples = len(self.file_list)
+        self.coco_gt = COCO()
+        self.coco_gt.dataset = annotations
+        self.coco_gt.createIndex()
+
+        self._epoch = 0
+
+    def __getitem__(self, idx):
+        sample = copy.deepcopy(self.file_list[idx])
+        if self.data_fields is not None:
+            sample = {k: sample[k] for k in self.data_fields}
+        if self.use_mix and (self.mixup_op.mixup_epoch == -1 or
+                             self._epoch < self.mixup_op.mixup_epoch):
+            if self.num_samples > 1:
+                mix_idx = random.randint(1, self.num_samples - 1)
+                mix_pos = (mix_idx + idx) % self.num_samples
+            else:
+                mix_pos = 0
+            sample_mix = copy.deepcopy(self.file_list[mix_pos])
+            if self.data_fields is not None:
+                sample_mix = {k: sample_mix[k] for k in self.data_fields}
+            sample = self.mixup_op(sample=[
+                Decode(to_rgb=False)(sample), Decode(to_rgb=False)(sample_mix)
+            ])
+        sample = self.transforms(sample)
+        return sample
+
+    def __len__(self):
+        return self.num_samples
+
+    def set_epoch(self, epoch_id):
+        self._epoch = epoch_id
+
+    def cluster_yolo_anchor(self,
+                            num_anchors,
+                            image_size,
+                            cache=True,
+                            cache_path=None,
+                            iters=300,
+                            gen_iters=1000,
+                            thresh=.25):
+        """
+        Cluster YOLO anchors.
+
+        Reference:
+            https://github.com/ultralytics/yolov5/blob/master/utils/autoanchor.py
+
+        Args:
+            num_anchors (int): number of clusters
+            image_size (list or int): [h, w], being an int means image height and image width are the same.
+            cache (bool): whether using cache
+            cache_path (str or None, optional): cache directory path. If None, use `data_dir` of dataset.
+            iters (int, optional): iters of kmeans algorithm
+            gen_iters (int, optional): iters of genetic algorithm
+            threshold (float, optional): anchor scale threshold
+            verbose (bool, optional): whether print results
+        """
+        if cache_path is None:
+            cache_path = self.data_dir
+        cluster = YOLOAnchorCluster(
+            num_anchors=num_anchors,
+            dataset=self,
+            image_size=image_size,
+            cache=cache,
+            cache_path=cache_path,
+            iters=iters,
+            gen_iters=gen_iters,
+            thresh=thresh)
+        anchors = cluster()
+        return anchors
+
+    def add_negative_samples(self, image_dir, empty_ratio=1):
+        """将背景图片加入训练
+
+        Args:
+            image_dir (str):背景图片所在的文件夹目录。
+            empty_ratio (float or None): 用于指定负样本占总样本数的比例。如果为None,保留数据集初始化是设置的`empty_ratio`值,
+                否则更新原有`empty_ratio`值。如果小于0或大于等于1,则保留全部的负样本。默认为1。
+
+        """
+        import cv2
+        if not osp.isdir(image_dir):
+            raise Exception("{} is not a valid image directory.".format(
+                image_dir))
+        if empty_ratio is not None:
+            self.empty_ratio = empty_ratio
+        image_list = os.listdir(image_dir)
+        max_img_id = max(
+            len(self.file_list) - 1, max(self.coco_gt.getImgIds()))
+        neg_file_list = list()
+        for image in image_list:
+            if not is_pic(image):
+                continue
+            gt_bbox = np.zeros((0, 4), dtype=np.float32)
+            gt_class = np.zeros((0, 1), dtype=np.int32)
+            gt_score = np.zeros((0, 1), dtype=np.float32)
+            is_crowd = np.zeros((0, 1), dtype=np.int32)
+            difficult = np.zeros((0, 1), dtype=np.int32)
+
+            max_img_id += 1
+            im_fname = osp.join(image_dir, image)
+            img_data = cv2.imread(im_fname, cv2.IMREAD_UNCHANGED)
+            im_h, im_w, im_c = img_data.shape
+
+            im_info = {
+                'im_id': np.asarray([max_img_id]),
+                'image_shape': np.array(
+                    [im_h, im_w], dtype=np.int32)
+            }
+            label_info = {
+                'is_crowd': is_crowd,
+                'gt_class': gt_class,
+                'gt_bbox': gt_bbox,
+                'gt_score': gt_score,
+                'difficult': difficult
+            }
+            if 'gt_poly' in self.file_list[0]:
+                label_info['gt_poly'] = []
+
+            neg_file_list.append({
+                'image': im_fname,
+                **
+                im_info,
+                **
+                label_info
+            })
+        if neg_file_list:
+            self.allow_empty = True
+            self.file_list += self._sample_empty(neg_file_list)
+        logging.info(
+            "{} negative samples added. Dataset contains {} positive samples and {} negative samples.".
+            format(
+                len(self.file_list) - self.num_samples, self.pos_num,
+                len(self.file_list) - self.pos_num))
+        self.num_samples = len(self.file_list)
+
+    def _sample_empty(self, neg_file_list):
+        if 0. <= self.empty_ratio < 1.:
+            import random
+            total_num = len(self.file_list)
+            neg_num = total_num - self.pos_num
+            sample_num = min((total_num * self.empty_ratio - neg_num) //
+                             (1 - self.empty_ratio), len(neg_file_list))
+            return random.sample(neg_file_list, sample_num)
+        else:
+            return neg_file_list

+ 15 - 0
paddlers/models/__init__.py

@@ -0,0 +1,15 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from . import ppcd, ppcls, ppdet, ppseg

+ 0 - 0
paddlers/pipeline/__init__.py → paddlers/models/ppcd/__init__.py


+ 0 - 0
paddlers/third_party/ppcd/__init__.py → paddlers/models/ppcls/__init__.py


+ 16 - 0
paddlers/models/ppdet/__init__.py

@@ -0,0 +1,16 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from . import (core, data, engine, modeling, model_zoo, optimizer, metrics,
+               utils, slim)

+ 15 - 0
paddlers/models/ppdet/core/__init__.py

@@ -0,0 +1,15 @@
+#   Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from . import config

+ 13 - 0
paddlers/models/ppdet/core/config/__init__.py

@@ -0,0 +1,13 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.

+ 248 - 0
paddlers/models/ppdet/core/config/schema.py

@@ -0,0 +1,248 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import absolute_import
+from __future__ import print_function
+from __future__ import division
+
+import inspect
+import importlib
+import re
+
+try:
+    from docstring_parser import parse as doc_parse
+except Exception:
+
+    # Optional dependency unavailable: this stub returns None, which
+    # `extract_schema` treats as "no parsed docstring".
+    def doc_parse(*args):
+        pass
+
+
+try:
+    from typeguard import check_type
+except Exception:
+
+    # Optional dependency unavailable: this stub never raises, so
+    # `SchemaDict.find_mismatch_keys` reports no mismatches.
+    def check_type(*args):
+        pass
+
+
+__all__ = ['SchemaValue', 'SchemaDict', 'SharedConfig', 'extract_schema']
+
+
+class SchemaValue(object):
+    """Schema entry describing a single constructor argument.
+
+    Args:
+        name: argument name.
+        doc: description of the argument.
+        type: expected type of the argument, or None when unknown.
+    """
+
+    def __init__(self, name, doc='', type=None):
+        super(SchemaValue, self).__init__()
+        self.name, self.doc, self.type = name, doc, type
+
+    def set_default(self, value):
+        """Store `value` as the `default` attribute of this entry."""
+        setattr(self, 'default', value)
+
+    def has_default(self):
+        """Return True when a `default` attribute is present."""
+        try:
+            self.default
+        except AttributeError:
+            return False
+        return True
+
+
+class SchemaDict(dict):
+    """Dict of config values paired with a schema describing them.
+
+    Besides the stored items, an instance keeps `schema` (key ->
+    SchemaValue), a `strict` flag consulted by `validate`, and the `doc` and
+    `name` strings. Looking up a key that is not stored falls back to the
+    schema (see `__missing__`).
+    """
+
+    def __init__(self, **kwargs):
+        super(SchemaDict, self).__init__()
+        self.schema = {}
+        self.strict = False
+        self.doc = ""
+        # `name` is formatted into validation messages; default it so that
+        # validation never fails with AttributeError before it is assigned
+        self.name = ""
+        self.update(kwargs)
+
+    def __setitem__(self, key, value):
+        # XXX also update regular dict to SchemaDict??
+        # a dict assigned over an existing SchemaDict entry is merged into it
+        # instead of replacing it
+        if isinstance(value, dict) and key in self and isinstance(self[key],
+                                                                  SchemaDict):
+            self[key].update(value)
+        else:
+            super(SchemaDict, self).__setitem__(key, value)
+
+    def __missing__(self, key):
+        """Resolve a key that is not stored.
+
+        Returns the schema default when there is one, otherwise the
+        SchemaValue itself; raises KeyError for keys unknown to the schema.
+        """
+        if self.has_default(key):
+            return self.schema[key].default
+        elif key in self.schema:
+            return self.schema[key]
+        else:
+            raise KeyError(key)
+
+    def copy(self):
+        """Return a shallow copy carrying the same attributes and items."""
+        newone = SchemaDict()
+        newone.__dict__.update(self.__dict__)
+        newone.update(self)
+        return newone
+
+    def set_schema(self, key, value):
+        """Register the SchemaValue `value` for `key`."""
+        assert isinstance(value, SchemaValue)
+        self.schema[key] = value
+
+    def set_strict(self, strict):
+        """Set whether `validate` rejects keys that are not in the schema."""
+        self.strict = strict
+
+    def has_default(self, key):
+        """Return True when the schema knows `key` and it has a default."""
+        return key in self.schema and self.schema[key].has_default()
+
+    def is_default(self, key):
+        """Return True when the value of `key` counts as its default.
+
+        Values that have a `__dict__` (i.e. objects) are always reported as
+        default; other values are compared against the schema default.
+        """
+        if not self.has_default(key):
+            return False
+        if hasattr(self[key], '__dict__'):
+            return True
+        else:
+            return key not in self or self[key] == self.schema[key].default
+
+    def find_default_keys(self):
+        """List keys (stored or in the schema) whose value is the default.
+
+        A key present both as an item and in the schema appears twice.
+        """
+        return [
+            k for k in list(self.keys()) + list(self.schema.keys())
+            if self.is_default(k)
+        ]
+
+    def mandatory(self):
+        """Return True when at least one schema key has no default."""
+        # test the predicate itself, not the truthiness of the key
+        return any(not self.has_default(k) for k in self.schema)
+
+    def find_missing_keys(self):
+        """List required keys that are absent or still hold a placeholder."""
+        missing = [
+            k for k in self.schema.keys()
+            if k not in self and not self.has_default(k)
+        ]
+        placeholders = [k for k in self if self[k] in ('<missing>', '<value>')]
+        return missing + placeholders
+
+    def find_extra_keys(self):
+        """List stored keys that the schema does not know about."""
+        return list(set(self.keys()) - set(self.schema.keys()))
+
+    def find_mismatch_keys(self):
+        """List schema keys whose value fails `check_type`.
+
+        Any exception raised while checking (including a failed lookup of
+        the value) marks the key as mismatching.
+        """
+        mismatch_keys = []
+        for arg in self.schema.values():
+            if arg.type is not None:
+                try:
+                    check_type("{}.{}".format(self.name, arg.name),
+                               self[arg.name], arg.type)
+                except Exception:
+                    mismatch_keys.append(arg.name)
+        return mismatch_keys
+
+    def validate(self):
+        """Check the stored items against the schema.
+
+        Raises:
+            ValueError: a required key is missing, or (in strict mode) an
+                unknown key is present.
+            TypeError: a value does not match its declared type.
+        """
+        missing_keys = self.find_missing_keys()
+        if missing_keys:
+            raise ValueError("Missing param for class<{}>: {}".format(
+                self.name, ", ".join(missing_keys)))
+        extra_keys = self.find_extra_keys()
+        if extra_keys and self.strict:
+            raise ValueError("Extraneous param for class<{}>: {}".format(
+                self.name, ", ".join(extra_keys)))
+        mismatch_keys = self.find_mismatch_keys()
+        if mismatch_keys:
+            raise TypeError("Wrong param type for class<{}>: {}".format(
+                self.name, ", ".join(mismatch_keys)))
+
+
+class SharedConfig(object):
+    """
+    Marker for `__shared__` constructor arguments. Resolution order:
+
+    - a value set for the module itself in the config file wins
+    - otherwise a top-level `key` entry of the config file is used
+    - otherwise `default_value` is the fallback
+
+    Args:
+        key: config[key] will be injected
+        default_value: fallback value
+    """
+
+    def __init__(self, key, default_value=None):
+        super(SharedConfig, self).__init__()
+        self.key, self.default_value = key, default_value
+
+
+def extract_schema(cls):
+    """
+    Extract schema from a given class
+
+    The schema is derived from the signature of `cls.__init__`: every
+    positional parameter except `self` becomes a SchemaValue, documented
+    from the parsed class docstring when available and typed from the
+    annotations (parameters listed in `__inject__` are left untyped).
+
+    Args:
+        cls (type): Class from which to extract.
+
+    Returns:
+        schema (SchemaDict): Extracted schema.
+    """
+    ctor = cls.__init__
+    if hasattr(inspect, 'getfullargspec'):
+        argspec = inspect.getfullargspec(ctor)
+        annotations = argspec.annotations
+        has_kwargs = argspec.varkw is not None
+    else:
+        # python 2 compatibility: `getfullargspec` does not exist there and
+        # the `**kwargs` name is exposed as `keywords`
+        argspec = inspect.getargspec(ctor)
+        # python 2 type hinting workaround, see pep-3107
+        # however, since `typeguard` does not support python 2, type checking
+        # is still python 3 only for now
+        annotations = getattr(ctor, '__annotations__', {})
+        has_kwargs = argspec.keywords is not None
+
+    names = [arg for arg in argspec.args if arg != 'self']
+    defaults = argspec.defaults
+    num_defaults = len(defaults) if defaults is not None else 0
+    num_required = len(names) - num_defaults
+
+    docs = cls.__doc__
+    if docs is None and getattr(cls, '__category__', None) == 'op':
+        docs = cls.__call__.__doc__
+    try:
+        docstring = doc_parse(docs)
+    except Exception:
+        docstring = None
+
+    # parameter name -> description, taken from the parsed docstring
+    comments = {}
+    if docstring is not None:
+        for p in docstring.params:
+            match_obj = re.match('^([a-zA-Z_]+[a-zA-Z_0-9]*).*', p.arg_name)
+            if match_obj is not None:
+                comments[match_obj.group(1)] = p.description
+
+    schema = SchemaDict()
+    schema.name = cls.__name__
+    schema.doc = ""
+    # an empty docstring must not be indexed; it simply yields an empty doc
+    if docs:
+        body = docs[1:] if docs.startswith('\n') else docs
+        schema.doc = body.split("\n")[0].strip()
+    # XXX handle paddle's weird doc convention
+    if '**' == schema.doc[:2] and '**' == schema.doc[-2:]:
+        schema.doc = schema.doc[2:-2].strip()
+    schema.category = getattr(cls, '__category__', None) or 'module'
+    # a constructor accepting **kwargs tolerates unknown keys
+    schema.strict = not has_kwargs
+    schema.pymodule = importlib.import_module(cls.__module__)
+    schema.inject = getattr(cls, '__inject__', [])
+    schema.shared = getattr(cls, '__shared__', [])
+    for idx, name in enumerate(names):
+        comment = comments.get(name) or name
+        if name in schema.inject:
+            type_ = None
+        else:
+            type_ = annotations.get(name) or None
+        value_schema = SchemaValue(name, comment, type_)
+        if name in schema.shared:
+            assert idx >= num_required, "shared config must have default value"
+            default = defaults[idx - num_required]
+            value_schema.set_default(SharedConfig(name, default))
+        elif idx >= num_required:
+            default = defaults[idx - num_required]
+            value_schema.set_default(default)
+        schema.set_schema(name, value_schema)
+
+    return schema

+ 118 - 0
paddlers/models/ppdet/core/config/yaml_helpers.py

@@ -0,0 +1,118 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import importlib
+import inspect
+
+import yaml
+from .schema import SharedConfig
+
+__all__ = ['serializable', 'Callable']
+
+
+def represent_dictionary_order(self, dict_data):
+    """Represent `dict_data` as a plain YAML map built from its items."""
+    pairs = dict_data.items()
+    return self.represent_mapping('tag:yaml.org,2002:map', pairs)
+
+
+def setup_orderdict():
+    """Register `represent_dictionary_order` as the OrderedDict dumper."""
+    import collections
+    yaml.add_representer(collections.OrderedDict, represent_dictionary_order)
+
+
+def _make_python_constructor(cls):
+    """Build a yaml constructor that instantiates `cls` from a node.
+
+    Sequence nodes are passed as positional arguments, any other node is
+    read as a mapping and passed as keyword arguments.
+    """
+
+    def python_constructor(loader, node):
+        if not isinstance(node, yaml.SequenceNode):
+            kwargs = loader.construct_mapping(node, deep=True)
+            try:
+                instance = cls(**kwargs)
+            except Exception as ex:
+                print("Error when construct {} instance from yaml config".
+                      format(cls.__name__))
+                raise ex
+            return instance
+
+        args = loader.construct_sequence(node, deep=True)
+        return cls(*args)
+
+    return python_constructor
+
+
+def _make_python_representer(cls):
+    """Build a yaml representer that dumps instances of `cls` as `!ClassName`.
+
+    The dumped mapping holds the constructor arguments (read back from the
+    instance attributes of the same name) or, when the constructor takes no
+    arguments, the instance `__dict__`. An `_id` entry is never dumped.
+    """
+    if hasattr(inspect, 'getfullargspec'):
+        argspec = inspect.getfullargspec(cls)
+    else:
+        # python 2 compatibility: `getfullargspec` does not exist there
+        argspec = inspect.getargspec(cls.__init__)
+    argnames = [arg for arg in argspec.args if arg != 'self']
+
+    def python_representer(dumper, obj):
+        if argnames:
+            data = {name: getattr(obj, name) for name in argnames}
+        else:
+            # copy: removing `_id` below must not delete the attribute from
+            # the object being dumped
+            data = dict(obj.__dict__)
+        data.pop('_id', None)
+        return dumper.represent_mapping(u'!{}'.format(cls.__name__), data)
+
+    return python_representer
+
+
+def serializable(cls):
+    """
+    Register a yaml constructor and representer for the given class, which
+    must be "trivially serializable". Usable as a class decorator.
+
+    Args:
+        cls: class to be serialized
+
+    Returns: cls
+    """
+    tag = u'!{}'.format(cls.__name__)
+    constructor = _make_python_constructor(cls)
+    yaml.add_constructor(tag, constructor)
+    representer = _make_python_representer(cls)
+    yaml.add_representer(cls, representer)
+    return cls
+
+
+# A SharedConfig is dumped as its fallback value.
+yaml.add_representer(SharedConfig,
+                     lambda d, o: d.represent_data(o.default_value))
+
+
+@serializable
+class Callable(object):
+    """
+    Helper to be used in Yaml for creating arbitrary class objects
+
+    Args:
+        full_type (str): the full module path to target function
+        args (list): positional arguments of the call, empty when omitted
+        kwargs (dict): keyword arguments of the call, empty when omitted
+    """
+
+    def __init__(self, full_type, args=None, kwargs=None):
+        super(Callable, self).__init__()
+        self.full_type = full_type
+        # build fresh containers per instance: mutable default arguments
+        # would be shared by every instance created without them
+        self.args = [] if args is None else args
+        self.kwargs = {} if kwargs is None else kwargs
+
+    def __call__(self):
+        """Resolve `full_type` and call it with the stored arguments.
+
+        A dotted name is split into module path and attribute; a bare name
+        is looked up among the builtins.
+        """
+        if '.' in self.full_type:
+            idx = self.full_type.rfind('.')
+            module = importlib.import_module(self.full_type[:idx])
+            func_name = self.full_type[idx + 1:]
+        else:
+            try:
+                module = importlib.import_module('builtins')
+            except Exception:
+                # python 2 name of the builtins module
+                module = importlib.import_module('__builtin__')
+            func_name = self.full_type
+
+        func = getattr(module, func_name)
+        return func(*self.args, **self.kwargs)

+ 278 - 0
paddlers/models/ppdet/core/workspace.py

@@ -0,0 +1,278 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import absolute_import
+from __future__ import print_function
+from __future__ import division
+
+import importlib
+import os
+import sys
+
+import yaml
+import collections
+
+try:
+    # `import collections` alone does not load the `abc` submodule, so
+    # import it explicitly instead of relying on another module having done so
+    import collections.abc
+    collectionsAbc = collections.abc
+except ImportError:
+    # python 2: the ABCs live directly in `collections`
+    collectionsAbc = collections
+
+from .config.schema import SchemaDict, SharedConfig, extract_schema
+from .config.yaml_helpers import serializable
+
+__all__ = [
+    'global_config',
+    'load_config',
+    'merge_config',
+    'get_registered_modules',
+    'create',
+    'register',
+    'serializable',
+    'dump_value',
+]
+
+
+def dump_value(value):
+    """Render `value` as a single-line string.
+
+    Dicts, tuples, lists and objects with a `__dict__` are dumped as
+    flow-style yaml, stripped of newlines and of '...', and wrapped in
+    single quotes; everything else goes through `str`.
+    """
+    # XXX this is hackish, but collections.abc is not available in python 2
+    structured = hasattr(value, '__dict__') or isinstance(value,
+                                                          (dict, tuple, list))
+    if not structured:
+        # primitive types
+        return str(value)
+
+    dumped = yaml.dump(value, default_flow_style=True)
+    for token in ('\n', '...'):
+        dumped = dumped.replace(token, '')
+    return "'{}'".format(dumped)
+
+
+class AttrDict(dict):
+    """Single level attribute dict, NOT recursive"""
+
+    def __init__(self, **kwargs):
+        super(AttrDict, self).__init__()
+        dict.update(self, kwargs)
+
+    def __getattr__(self, key):
+        # only reached when normal attribute lookup fails: serve dict items
+        if key not in self:
+            raise AttributeError("object has no attribute '{}'".format(key))
+        return self[key]
+
+
+# Module-level registry: `register` stores module schemas in it and
+# `merge_config` merges loaded configs into it by default.
+global_config = AttrDict()
+
+# Config key naming the base yaml files that are loaded before the file itself.
+BASE_KEY = '_BASE_'
+
+
+# parse and load _BASE_ recursively
+def _load_config_with_base(file_path):
+    """Load a yaml config and merge it over the files listed in `_BASE_`.
+
+    Relative base paths are resolved against the directory of `file_path`;
+    a leading `~` is expanded. Later bases override earlier ones and the
+    file itself overrides all of its bases.
+
+    Args:
+        file_path (str): Path of the yaml file to load.
+
+    Returns: the merged config
+    """
+    with open(file_path) as f:
+        # NOTE(review): yaml.Loader can construct arbitrary python objects;
+        # only load config files from trusted sources.
+        file_cfg = yaml.load(f, Loader=yaml.Loader)
+
+    # an empty yaml document loads as None
+    if file_cfg is None:
+        file_cfg = {}
+
+    if BASE_KEY not in file_cfg:
+        return file_cfg
+
+    # NOTE: cfgs outside have higher priority than cfgs in _BASE_
+    all_base_cfg = AttrDict()
+    for base_yml in list(file_cfg[BASE_KEY]):
+        if base_yml.startswith("~"):
+            base_yml = os.path.expanduser(base_yml)
+        if not os.path.isabs(base_yml):
+            base_yml = os.path.join(os.path.dirname(file_path), base_yml)
+
+        # the recursive call opens the file itself
+        base_cfg = _load_config_with_base(base_yml)
+        all_base_cfg = merge_config(base_cfg, all_base_cfg)
+
+    del file_cfg[BASE_KEY]
+    return merge_config(file_cfg, all_base_cfg)
+
+
+def load_config(file_path):
+    """
+    Load a yaml config file (including its `_BASE_` files) and merge it into
+    the global config. The file name without extension is recorded under the
+    `filename` key.
+
+    Args:
+        file_path (str): Path of the config file to be loaded.
+
+    Returns: global config
+    """
+    ext = os.path.splitext(file_path)[1]
+    assert ext in ['.yml', '.yaml'], "only support yaml files for now"
+
+    # load config from file and merge into global config
+    cfg = _load_config_with_base(file_path)
+    stem, _ = os.path.splitext(os.path.basename(file_path))
+    cfg['filename'] = stem
+    merge_config(cfg)
+
+    return global_config
+
+
+def dict_merge(dct, merge_dct):
+    """ Merge ``merge_dct`` into ``dct`` in place, recursing into nested
+    dicts instead of replacing them wholesale (unlike :meth:``dict.update()``,
+    which only handles top-level keys).
+
+    Args:
+        dct: dict onto which the merge is executed
+        merge_dct: dct merged into dct
+
+    Returns: dct
+    """
+    for key, incoming in merge_dct.items():
+        # recurse only when both sides hold a mapping for this key
+        both_mappings = (key in dct and isinstance(dct[key], dict) and
+                         isinstance(incoming, collectionsAbc.Mapping))
+        if not both_mappings:
+            dct[key] = incoming
+            continue
+        dict_merge(dct[key], incoming)
+    return dct
+
+
+def merge_config(config, another_cfg=None):
+    """
+    Merge config into global config or another_cfg.
+
+    Args:
+        config (dict): Config to be merged.
+        another_cfg (dict): Merge target; the global config is used when
+            omitted.
+
+    Returns: the merge target (global config or another_cfg)
+    """
+    # compare against None: an empty target dict is falsy but is still the
+    # dict the caller asked to merge into
+    dct = global_config if another_cfg is None else another_cfg
+    return dict_merge(dct, config)
+
+
+def get_registered_modules():
+    """Return the global config entries whose value is a SchemaDict."""
+    modules = {}
+    for name, entry in global_config.items():
+        if isinstance(entry, SchemaDict):
+            modules[name] = entry
+    return modules
+
+
+def make_partial(cls):
+    """Turn a class declaring `__op__` into a wrapper around that op.
+
+    The class gets a `__call__` that forwards its positional arguments to
+    the op, together with the instance attributes (overridden by the call's
+    keyword arguments) as keyword arguments.
+
+    Args:
+        cls (type): Class with an `__op__` attribute.
+
+    Returns: cls
+    """
+    # re-resolve the op through its defining module by name
+    op_module = importlib.import_module(cls.__op__.__module__)
+    op = getattr(op_module, cls.__op__.__name__)
+    cls.__category__ = getattr(cls, '__category__', None) or 'op'
+
+    def partial_apply(self, *args, **kwargs):
+        kwargs_ = self.__dict__.copy()
+        kwargs_.update(kwargs)
+        return op(*args, **kwargs_)
+
+    # NOTE(review): `__call__` is only installed inside this branch, so a
+    # class setting `__append_doc__ = False` keeps its own `__call__` (if
+    # any) - confirm this is intended.
+    if getattr(cls, '__append_doc__', True):  # XXX should default to True?
+        if sys.version_info[0] > 2:
+            cls.__doc__ = "Wrapper for `{}` OP".format(op.__name__)
+            cls.__init__.__doc__ = op.__doc__
+            cls.__call__ = partial_apply
+            cls.__call__.__doc__ = op.__doc__
+        else:
+            # XXX work around for python 2
+            partial_apply.__doc__ = op.__doc__
+            cls.__call__ = partial_apply
+    return cls
+
+
+def register(cls):
+    """
+    Register a given module class: its extracted schema is stored in the
+    global config under the class name. Classes declaring `__op__` are first
+    converted with `make_partial`. Usable as a class decorator.
+
+    Args:
+        cls (type): Module class to be registered.
+
+    Returns: cls
+
+    Raises:
+        ValueError: a class of the same name is already registered.
+    """
+    name = cls.__name__
+    if name in global_config:
+        raise ValueError("Module class already registered: {}".format(name))
+    is_op_wrapper = hasattr(cls, '__op__')
+    if is_op_wrapper:
+        cls = make_partial(cls)
+    schema = extract_schema(cls)
+    global_config[cls.__name__] = schema
+    return cls
+
+
+def create(cls_or_name, **kwargs):
+    """
+    Create an instance of given module class.
+
+    The constructor arguments are the values stored for the module in the
+    global config, completed by the resolved `shared` values, by the result
+    of the class' `from_config` (when it has one) and by the created
+    `inject` dependencies.
+
+    Args:
+        cls_or_name (type or str): Class of which to create instance.
+        **kwargs: forwarded to `cls.from_config` only; they are not passed
+            to the constructor directly.
+
+    Returns: instance of type `cls_or_name`
+
+    Raises:
+        ValueError: an injected dependency is not present in the global
+            config or has an unsupported type.
+    """
+    assert type(cls_or_name) in [type, str
+                                 ], "should be a class or name of a class"
+    name = type(cls_or_name) == str and cls_or_name or cls_or_name.__name__
+    assert name in global_config and \
+        isinstance(global_config[name], SchemaDict), \
+        "the module {} is not registered".format(name)
+    config = global_config[name]
+    # the class is looked up by name in the module recorded in the schema
+    cls = getattr(config.pymodule, name)
+    cls_kwargs = {}
+    cls_kwargs.update(global_config[name])
+
+    # parse `shared` annotation of registered modules
+    if getattr(config, 'shared', None):
+        for k in config.shared:
+            # when `k` is not stored, this lookup falls back to the schema
+            # default, i.e. the SharedConfig marker
+            target_key = config[k]
+            shared_conf = config.schema[k].default
+            assert isinstance(shared_conf, SharedConfig)
+            if target_key is not None and not isinstance(target_key,
+                                                         SharedConfig):
+                continue  # value is given for the module
+            elif shared_conf.key in global_config:
+                # `key` is present in config
+                cls_kwargs[k] = global_config[shared_conf.key]
+            else:
+                cls_kwargs[k] = shared_conf.default_value
+
+    # let the class derive further arguments from its config
+    if getattr(cls, 'from_config', None):
+        cls_kwargs.update(cls.from_config(config, **kwargs))
+
+    # parse `inject` annotation of registered modules
+    if getattr(config, 'inject', None):
+        for k in config.inject:
+            target_key = config[k]
+            # optional dependency
+            if target_key is None:
+                continue
+
+            if isinstance(target_key, dict) or hasattr(target_key, '__dict__'):
+                # mapping form: `name` selects the module, the remaining
+                # entries override its registered config
+                if 'name' not in target_key.keys():
+                    continue
+                inject_name = str(target_key['name'])
+                if inject_name not in global_config:
+                    raise ValueError(
+                        "Missing injection name {} and check it's name in cfg file".
+                        format(k))
+                target = global_config[inject_name]
+                # NOTE: this writes the overrides into the global config
+                # entry of the injected module
+                for i, v in target_key.items():
+                    if i == 'name':
+                        continue
+                    target[i] = v
+                if isinstance(target, SchemaDict):
+                    cls_kwargs[k] = create(inject_name)
+            elif isinstance(target_key, str):
+                # string form: the value is the name of a global config entry
+                if target_key not in global_config:
+                    raise ValueError("Missing injection config:", target_key)
+                target = global_config[target_key]
+                if isinstance(target, SchemaDict):
+                    cls_kwargs[k] = create(target_key)
+                elif hasattr(target, '__dict__'):  # serialized object
+                    cls_kwargs[k] = target
+            else:
+                raise ValueError("Unsupported injection type:", target_key)
+    # prevent modification of global config values of reference types
+    # (e.g., list, dict) from within the created module instances
+    #kwargs = copy.deepcopy(kwargs)
+    return cls(**cls_kwargs)

+ 21 - 0
paddlers/models/ppdet/data/__init__.py

@@ -0,0 +1,21 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from . import source
+from . import transform
+from . import reader
+
+from .source import *
+from .transform import *
+from .reader import *

+ 13 - 0
paddlers/models/ppdet/data/crop_utils/__init__.py

@@ -0,0 +1,13 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.

+ 585 - 0
paddlers/models/ppdet/data/crop_utils/annotation_cropper.py

@@ -0,0 +1,585 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import copy
+import math
+import random
+import numpy as np
+from copy import deepcopy
+from typing import List, Tuple
+from collections import defaultdict
+
+from .chip_box_utils import nms, transform_chip_boxes2image_boxes
+from .chip_box_utils import find_chips_to_cover_overlaped_boxes
+from .chip_box_utils import transform_chip_box
+from .chip_box_utils import intersection_over_box
+
+
+class AnnoCropper(object):
+    def __init__(self,
+                 image_target_sizes: List[int],
+                 valid_box_ratio_ranges: List[List[float]],
+                 chip_target_size: int,
+                 chip_target_stride: int,
+                 use_neg_chip: bool=False,
+                 max_neg_num_per_im: int=8,
+                 max_per_img: int=-1,
+                 nms_thresh: float=0.5):
+        """
+        Generate chips by chip_target_size and chip_target_stride.
+        These two parameters just like kernel_size and stride in cnn.
+
+        Each image has its raw size. After resizing, then get its target size.
+        The resizing scale = target_size / raw_size.
+        So are chips of the image.
+        box_ratio = box_raw_size / image_raw_size = box_target_size / image_target_size
+        The 'size' above mentioned is the size of long-side of image, box or chip.
+
+        :param image_target_sizes: [2000, 1000]
+        :param valid_box_ratio_ranges:  [[-1, 0.1],[0.08, -1]], one range per
+                                        entry of image_target_sizes
+        :param chip_target_size: 500
+        :param chip_target_stride: 200
+        :param use_neg_chip: whether negative chips are added to the records
+        :param max_neg_num_per_im: upper bound of sampled negative chips per image
+        :param max_per_img: stored as `self.max_per_img`
+        :param nms_thresh: stored as `self.nms_thresh` (a float threshold)
+        """
+        self.target_sizes = image_target_sizes
+        self.valid_box_ratio_ranges = valid_box_ratio_ranges
+        # every scale needs its own valid box ratio range
+        assert len(self.target_sizes) == len(self.valid_box_ratio_ranges)
+        self.scale_num = len(self.target_sizes)
+        self.chip_target_size = chip_target_size  # is target size
+        self.chip_target_stride = chip_target_stride  # is target stride
+        self.use_neg_chip = use_neg_chip
+        self.max_neg_num_per_im = max_neg_num_per_im
+        self.max_per_img = max_per_img
+        self.nms_thresh = nms_thresh
+
+    def crop_anno_records(self, records: List[dict]):
+        """
+        Convert image-level annotation records into chip-level records.
+
+        The main logic:
+        # foreach record(image):
+        #   foreach scale:
+        #     1 generate chips by chip size and stride for each scale
+        #     2 get pos chips
+        #     - validate boxes: current scale; h,w >= 1
+        #     - find pos chips greedily by valid gt boxes in each scale
+        #     - for every valid gt box, find its corresponding pos chips in each scale
+        #     3 get neg chips
+        #     - If given proposals, find neg boxes in them which are not in pos chips
+        #     - If got neg boxes in last step, we find neg chips and assign neg boxes to neg chips such as 2.
+        # 4 sample neg chips if too much each image
+        #   transform this image-scale annotations to chips(pos chips&neg chips) annotations
+
+        :param records, standard coco_record but with extra key `proposals`(Px4), which are predicted by stage1
+                        model and maybe have neg boxes in them.
+        :return: new_records, list of dict like
+        {
+            'im_file': 'fake_image1.jpg',
+            'im_id': np.array([1]),  # new _global_chip_id as im_id
+            'h': h,  # chip height
+            'w': w,  # chip width
+            'is_crowd': is_crowd,  # Nx1 -> Mx1
+            'gt_class': gt_class,  # Nx1 -> Mx1
+            'gt_bbox': gt_bbox,  # Nx4 -> Mx4, 4 represents [x1,y1,x2,y2]
+            'gt_poly': gt_poly,  # [None]xN -> [None]xM
+            'chip': [x1, y1, x2, y2]  # added
+        }
+
+        Attention:
+        ------------------------------>x
+        |
+        |    (x1,y1)------
+        |       |        |
+        |       |        |
+        |       |        |
+        |       |        |
+        |       |        |
+        |       ----------
+        |                 (x2,y2)
+        |
+        ↓
+        y
+
+        If we use [x1, y1, x2, y2] to represent boxes or chips,
+        (x1,y1) is the left-top point which is in the box,
+        but (x2,y2) is the right-bottom point which is not in the box.
+        So x1 in [0, w-1], x2 in [1, w], y1 in [0, h-1], y2 in [1,h].
+        And you can use x2-x1 to get width, and you can use image[y1:y2, x1:x2] to get the box area.
+        """
+
+        # the result list and the chip id counter are reset on every call
+        self.chip_records = []
+        self._global_chip_id = 1
+        for r in records:
+            # per-image accumulators, filled across all scales
+            self._cur_im_pos_chips = [
+            ]  # element: (chip, boxes_idx), chip is [x1, y1, x2, y2], boxes_ids is List[int]
+            self._cur_im_neg_chips = []  # element: (chip, neg_box_num)
+            for scale_i in range(self.scale_num):
+                # sets self._cur_scale and the other per-scale attributes
+                self._get_current_scale_parameters(scale_i, r)
+
+                # Cx4
+                chips = self._create_chips(r['h'], r['w'], self._cur_scale)
+
+                # dict: chipid->[box_id, ...]
+                pos_chip2boxes_idx = self._get_valid_boxes_and_pos_chips(
+                    r['gt_bbox'], chips)
+
+                # dict: chipid->neg_box_num
+                neg_chip2box_num = self._get_neg_boxes_and_chips(
+                    chips,
+                    list(pos_chip2boxes_idx.keys()), r.get('proposals', None))
+
+                self._add_to_cur_im_chips(chips, pos_chip2boxes_idx,
+                                          neg_chip2box_num)
+
+            cur_image_records = self._trans_all_chips2annotations(r)
+            self.chip_records.extend(cur_image_records)
+        return self.chip_records
+
+    def _add_to_cur_im_chips(self, chips, pos_chip2boxes_idx,
+                             neg_chip2box_num):
+        """Append the chips of one scale to the per-image accumulators.
+
+        Each selected chip is copied out of `chips`; positive chips are
+        stored with their box indices, negative chips with their box count.
+        `neg_chip2box_num` may be None, in which case only positive chips
+        are added.
+        """
+        self._cur_im_pos_chips.extend(
+            (np.array(chips[chip_id]), box_ids)  # np.array copies the slice
+            for chip_id, box_ids in pos_chip2boxes_idx.items())
+
+        if neg_chip2box_num is not None:
+            self._cur_im_neg_chips.extend(
+                (np.array(chips[chip_id]), box_num)
+                for chip_id, box_num in neg_chip2box_num.items())
+
+    def _trans_all_chips2annotations(self, r):
+        """Build the chip records of the current image.
+
+        Positive chips are always converted; sampled negative chips are
+        appended only when `use_neg_chip` is set.
+        """
+        gt_bbox, im_file = r['gt_bbox'], r['im_file']
+        is_crowd, gt_class = r['is_crowd'], r['gt_class']
+        # gt_poly = r['gt_poly']   # [None]xN
+        # remaining keys: im_id, h, w
+        records = self._trans_pos_chips2annotations(im_file, gt_bbox, is_crowd,
+                                                    gt_class)
+
+        if self.use_neg_chip:
+            neg_chips = self._sample_neg_chips()
+            records.extend(
+                self._trans_neg_chips2annotations(im_file, neg_chips))
+        return records
+
+    def _trans_pos_chips2annotations(self, im_file, gt_bbox, is_crowd,
+                                     gt_class):
+        """Create one annotation record per positive chip of the image.
+
+        Boxes are mapped into the chip by `transform_chip_box`; the crowd
+        flags and classes are copied for the boxes it keeps. Every record
+        consumes one value of the global chip id counter.
+        """
+        records = []
+        for chip, boxes_idx in self._cur_im_pos_chips:
+            chip_bbox, kept_idx = transform_chip_box(gt_bbox, boxes_idx, chip)
+            left, top, right, bottom = chip
+            records.append({
+                'im_file': im_file,
+                'im_id': np.array([self._global_chip_id]),
+                'h': bottom - top,
+                'w': right - left,
+                'gt_bbox': chip_bbox,
+                'is_crowd': is_crowd[kept_idx].copy(),
+                'gt_class': gt_class[kept_idx].copy(),
+                # 'gt_poly': [None] * len(final_boxes_idx),
+                'chip': chip
+            })
+            self._global_chip_id += 1
+        return records
+
+    def _sample_neg_chips(self):
+        """Pick the negative chips to keep for the current image.
+
+        At most min(pos_num + 2, max_neg_num_per_im) chips are kept. When
+        there are more, the chips with the most negative boxes form a
+        candidate pool 1.5 times that size, which is shuffled and truncated.
+        """
+        quota = min(
+            len(self._cur_im_pos_chips) + 2, self.max_neg_num_per_im)
+        assert quota >= 1
+        if len(self._cur_im_neg_chips) <= quota:
+            return self._cur_im_neg_chips
+
+        pool_size = int(quota * 1.5)
+        # descending by negative box count, ties keep their original order
+        ranked = sorted(
+            self._cur_im_neg_chips, key=lambda item: item[1], reverse=True)
+        pool = ranked[:pool_size]
+        random.shuffle(pool)
+        return pool[:quota]
+
+    def _trans_neg_chips2annotations(self,
+                                     im_file: str,
+                                     sampled_neg_chips: List[Tuple]):
+        chip_records = []
+        for chip, neg_box_num in sampled_neg_chips:
+            x1, y1, x2, y2 = chip
+            chip_h = y2 - y1
+            chip_w = x2 - x1
+            rec = {
+                'im_file': im_file,
+                'im_id': np.array([self._global_chip_id]),
+                'h': chip_h,
+                'w': chip_w,
+                'gt_bbox': np.zeros(
+                    (0, 4), dtype=np.float32),
+                'is_crowd': np.zeros(
+                    (0, 1), dtype=np.int32),
+                'gt_class': np.zeros(
+                    (0, 1), dtype=np.int32),
+                # 'gt_poly': [],
+                'chip': chip
+            }
+            self._global_chip_id += 1
+            chip_records.append(rec)
+        return chip_records
+
+    def _get_current_scale_parameters(self, scale_i, r):
+        im_size = max(r['h'], r['w'])
+        im_target_size = self.target_sizes[scale_i]
+        self._cur_im_size, self._cur_im_target_size = im_size, im_target_size
+        self._cur_scale = self._get_current_scale(im_target_size, im_size)
+        self._cur_valid_ratio_range = self.valid_box_ratio_ranges[scale_i]
+
+    def _get_current_scale(self, im_target_size, im_size):
+        return im_target_size / im_size
+
+    def _create_chips(self, h: int, w: int, scale: float):
+        """
+        Generate chips by chip_target_size and chip_target_stride.
+        These two parameters just like kernel_size and stride in cnn.
+        :return: chips, Cx4, xy in raw size dimension
+        """
+        chip_size = self.chip_target_size  # omit target for simplicity
+        stride = self.chip_target_stride
+        width = int(scale * w)
+        height = int(scale * h)
+        min_chip_location_diff = 20  # in target size
+
+        assert chip_size >= stride
+        chip_overlap = chip_size - stride
+        if (width - chip_overlap
+            ) % stride > min_chip_location_diff:  # 不能被stride整除的部分比较大,则保留
+            w_steps = max(1, int(math.ceil((width - chip_overlap) / stride)))
+        else:  # 不能被stride整除的部分比较小,则丢弃
+            w_steps = max(1, int(math.floor((width - chip_overlap) / stride)))
+        if (height - chip_overlap) % stride > min_chip_location_diff:
+            h_steps = max(1, int(math.ceil((height - chip_overlap) / stride)))
+        else:
+            h_steps = max(1, int(math.floor((height - chip_overlap) / stride)))
+
+        chips = list()
+        for j in range(h_steps):
+            for i in range(w_steps):
+                x1 = i * stride
+                y1 = j * stride
+                x2 = min(x1 + chip_size, width)
+                y2 = min(y1 + chip_size, height)
+                chips.append([x1, y1, x2, y2])
+
+        # check  chip size
+        for item in chips:
+            if item[2] - item[0] > chip_size * 1.1 or item[3] - item[
+                    1] > chip_size * 1.1:
+                raise ValueError(item)
+        chips = np.array(chips, dtype=np.float)
+
+        raw_size_chips = chips / scale
+        return raw_size_chips
+
+    def _get_valid_boxes_and_pos_chips(self, gt_bbox, chips):
+        valid_ratio_range = self._cur_valid_ratio_range
+        im_size = self._cur_im_size
+        scale = self._cur_scale
+        #   Nx4            N
+        valid_boxes, valid_boxes_idx = self._validate_boxes(
+            valid_ratio_range, im_size, gt_bbox, scale)
+        # dict: chipid->[box_id, ...]
+        pos_chip2boxes_idx = self._find_pos_chips(chips, valid_boxes,
+                                                  valid_boxes_idx)
+        return pos_chip2boxes_idx
+
+    def _validate_boxes(self,
+                        valid_ratio_range: List[float],
+                        im_size: int,
+                        gt_boxes: 'np.array of Nx4',
+                        scale: float):
+        """
+        :return: valid_boxes: Nx4, valid_boxes_idx: N
+        """
+        ws = (gt_boxes[:, 2] - gt_boxes[:, 0]).astype(np.int32)
+        hs = (gt_boxes[:, 3] - gt_boxes[:, 1]).astype(np.int32)
+        maxs = np.maximum(ws, hs)
+        box_ratio = maxs / im_size
+        mins = np.minimum(ws, hs)
+        target_mins = mins * scale
+
+        low = valid_ratio_range[0] if valid_ratio_range[0] > 0 else 0
+        high = valid_ratio_range[1] if valid_ratio_range[1] > 0 else np.finfo(
+            np.float).max
+
+        valid_boxes_idx = np.nonzero((low <= box_ratio) & (box_ratio < high) &
+                                     (target_mins >= 2))[0]
+        valid_boxes = gt_boxes[valid_boxes_idx]
+        return valid_boxes, valid_boxes_idx
+
+    def _find_pos_chips(self,
+                        chips: 'Cx4',
+                        valid_boxes: 'Bx4',
+                        valid_boxes_idx: 'B'):
+        """
+        :return: pos_chip2boxes_idx, dict: chipid->[box_id, ...]
+        """
+        iob = intersection_over_box(chips, valid_boxes)  # overlap, CxB
+
+        iob_threshold_to_find_chips = 1.
+        pos_chip_ids, _ = self._find_chips_to_cover_overlaped_boxes(
+            iob, iob_threshold_to_find_chips)
+        pos_chip_ids = set(pos_chip_ids)
+
+        iob_threshold_to_assign_box = 0.5
+        pos_chip2boxes_idx = self._assign_boxes_to_pos_chips(
+            iob, iob_threshold_to_assign_box, pos_chip_ids, valid_boxes_idx)
+        return pos_chip2boxes_idx
+
+    def _find_chips_to_cover_overlaped_boxes(self, iob, overlap_threshold):
+        return find_chips_to_cover_overlaped_boxes(iob, overlap_threshold)
+
+    def _assign_boxes_to_pos_chips(self, iob, overlap_threshold, pos_chip_ids,
+                                   valid_boxes_idx):
+        chip_ids, box_ids = np.nonzero(iob >= overlap_threshold)
+        pos_chip2boxes_idx = defaultdict(list)
+        for chip_id, box_id in zip(chip_ids, box_ids):
+            if chip_id not in pos_chip_ids:
+                continue
+            raw_gt_box_idx = valid_boxes_idx[box_id]
+            pos_chip2boxes_idx[chip_id].append(raw_gt_box_idx)
+        return pos_chip2boxes_idx
+
+    def _get_neg_boxes_and_chips(self,
+                                 chips: 'Cx4',
+                                 pos_chip_ids: 'D',
+                                 proposals: 'Px4'):
+        """
+        :param chips:
+        :param pos_chip_ids:
+        :param proposals:
+        :return: neg_chip2box_num, None or dict: chipid->neg_box_num
+        """
+        if not self.use_neg_chip:
+            return None
+
+        # train proposals maybe None
+        if proposals is None or len(proposals) < 1:
+            return None
+
+        valid_ratio_range = self._cur_valid_ratio_range
+        im_size = self._cur_im_size
+        scale = self._cur_scale
+
+        valid_props, _ = self._validate_boxes(valid_ratio_range, im_size,
+                                              proposals, scale)
+        neg_boxes = self._find_neg_boxes(chips, pos_chip_ids, valid_props)
+        neg_chip2box_num = self._find_neg_chips(chips, pos_chip_ids, neg_boxes)
+        return neg_chip2box_num
+
+    def _find_neg_boxes(self,
+                        chips: 'Cx4',
+                        pos_chip_ids: 'D',
+                        valid_props: 'Px4'):
+        """
+        :return: neg_boxes: Nx4
+        """
+        if len(pos_chip_ids) == 0:
+            return valid_props
+
+        pos_chips = chips[pos_chip_ids]
+        iob = intersection_over_box(pos_chips, valid_props)
+        overlap_per_prop = np.max(iob, axis=0)
+        non_overlap_props_idx = overlap_per_prop < 0.5
+        neg_boxes = valid_props[non_overlap_props_idx]
+        return neg_boxes
+
+    def _find_neg_chips(self,
+                        chips: 'Cx4',
+                        pos_chip_ids: 'D',
+                        neg_boxes: 'Nx4'):
+        """
+        :return: neg_chip2box_num, dict: chipid->neg_box_num
+        """
+        neg_chip_ids = np.setdiff1d(np.arange(len(chips)), pos_chip_ids)
+        neg_chips = chips[neg_chip_ids]
+
+        iob = intersection_over_box(neg_chips, neg_boxes)
+        iob_threshold_to_find_chips = 0.7
+        chosen_neg_chip_ids, chip_id2overlap_box_num = \
+            self._find_chips_to_cover_overlaped_boxes(iob, iob_threshold_to_find_chips)
+
+        neg_chipid2box_num = {}
+        for cid in chosen_neg_chip_ids:
+            box_num = chip_id2overlap_box_num[cid]
+            raw_chip_id = neg_chip_ids[cid]
+            neg_chipid2box_num[raw_chip_id] = box_num
+        return neg_chipid2box_num
+
+    def crop_infer_anno_records(self, records: List[dict]):
+        """
+        transform image record to chips record
+        :param records:
+        :return: new_records, list of dict like
+        {
+            'im_file': 'fake_image1.jpg',
+            'im_id': np.array([1]),  # new _global_chip_id as im_id
+            'h': h,  # chip height
+            'w': w,  # chip width
+            'chip': [x1, y1, x2, y2]  # added
+            'ori_im_h': ori_im_h  # added, origin image height
+            'ori_im_w': ori_im_w  # added, origin image width
+            'scale_i': 0  # added,
+        }
+        """
+        self.chip_records = []
+        self._global_chip_id = 1  # im_id start from 1
+        self._global_chip_id2img_id = {}
+
+        for r in records:
+            for scale_i in range(self.scale_num):
+                self._get_current_scale_parameters(scale_i, r)
+                # Cx4
+                chips = self._create_chips(r['h'], r['w'], self._cur_scale)
+                cur_img_chip_record = self._get_chips_records(r, chips,
+                                                              scale_i)
+                self.chip_records.extend(cur_img_chip_record)
+
+        return self.chip_records
+
+    def _get_chips_records(self, rec, chips, scale_i):
+        cur_img_chip_records = []
+        ori_im_h = rec["h"]
+        ori_im_w = rec["w"]
+        im_file = rec["im_file"]
+        ori_im_id = rec["im_id"]
+        for id, chip in enumerate(chips):
+            chip_rec = {}
+            x1, y1, x2, y2 = chip
+            chip_h = y2 - y1
+            chip_w = x2 - x1
+            chip_rec["im_file"] = im_file
+            chip_rec["im_id"] = self._global_chip_id
+            chip_rec["h"] = chip_h
+            chip_rec["w"] = chip_w
+            chip_rec["chip"] = chip
+            chip_rec["ori_im_h"] = ori_im_h
+            chip_rec["ori_im_w"] = ori_im_w
+            chip_rec["scale_i"] = scale_i
+
+            self._global_chip_id2img_id[self._global_chip_id] = int(ori_im_id)
+            self._global_chip_id += 1
+            cur_img_chip_records.append(chip_rec)
+
+        return cur_img_chip_records
+
+    def aggregate_chips_detections(self, results, records=None):
+        """
+        # 1. transform chip dets to image dets
+        # 2. nms boxes per image;
+        # 3. format output results
+        :param results:
+        :param roidb:
+        :return:
+        """
+        results = deepcopy(results)
+        records = records if records else self.chip_records
+        img_id2bbox = self._transform_chip2image_bboxes(results, records)
+        nms_img_id2bbox = self._nms_dets(img_id2bbox)
+        aggregate_results = self._reformat_results(nms_img_id2bbox)
+        return aggregate_results
+
+    def _transform_chip2image_bboxes(self, results, records):
+        """
+        Map per-chip detections back onto their original images.
+        :param results: iterable of dicts with 'bbox', 'bbox_num' and 'im_id';
+            'im_id' holds chip ids, which start from 1
+        :param records: chip records, indexed by chip id - 1
+        :return: defaultdict: original image id -> list of arrays of the
+            detections that passed the valid-ratio filter of their scale
+        """
+        # 1. Transform chip dets to image dets;
+        # 2. Filter valid range;
+        # 3. Reformat and Aggregate chip dets to Get scale_cls_dets
+        img_id2bbox = defaultdict(list)
+        for result in results:
+            bbox_locs = result['bbox']
+            bbox_nums = result['bbox_num']
+            if len(bbox_locs) == 1 and bbox_locs[0][
+                    0] == -1:  # current batch has no detections
+                # bbox_locs = array([[-1.]], dtype=float32); bbox_nums = [[1]]
+                # MultiClassNMS output: If there is no detected boxes for all images, lod will be set to {1} and Out only contains one value which is -1.
+                continue
+            im_ids = result['im_id']  # replace with range(len(bbox_nums))
+
+            # bbox_locs stores the boxes of all chips in the batch back to
+            # back; last_bbox_num is the offset of the current chip's boxes.
+            last_bbox_num = 0
+            for idx, im_id in enumerate(im_ids):
+
+                cur_bbox_len = bbox_nums[idx]
+                bboxes = bbox_locs[last_bbox_num:last_bbox_num + cur_bbox_len]
+                last_bbox_num += cur_bbox_len
+                # box: [num_id, score, xmin, ymin, xmax, ymax]
+                if len(bboxes) == 0:  # current image has no detections
+                    continue
+
+                chip_rec = records[int(im_id) -
+                                   1]  # im_id starts from 1, type is np.int64
+                image_size = max(chip_rec["ori_im_h"], chip_rec["ori_im_w"])
+
+                bboxes = transform_chip_boxes2image_boxes(
+                    bboxes, chip_rec["chip"], chip_rec["ori_im_h"],
+                    chip_rec["ori_im_w"])
+
+                # Keep only the detections whose size fits the ratio range of
+                # the scale this chip was cut at; columns 2: are the coordinates.
+                scale_i = chip_rec["scale_i"]
+                cur_scale = self._get_current_scale(self.target_sizes[scale_i],
+                                                    image_size)
+                _, valid_boxes_idx = self._validate_boxes(
+                    self.valid_box_ratio_ranges[scale_i], image_size,
+                    bboxes[:, 2:], cur_scale)
+                ori_img_id = self._global_chip_id2img_id[int(im_id)]
+
+                img_id2bbox[ori_img_id].append(bboxes[valid_boxes_idx])
+
+        return img_id2bbox
+
+    def _nms_dets(self, img_id2bbox):
+        # 1. NMS on each image-class
+        # 2. Limit number of detections to MAX_PER_IMAGE if requested
+        max_per_img = self.max_per_img
+        nms_thresh = self.nms_thresh
+
+        for img_id in img_id2bbox:
+            box = img_id2bbox[
+                img_id]  # list of np.array of shape [N, 6], 6 is [label, score, x1, y1, x2, y2]
+            box = np.concatenate(box, axis=0)
+            nms_dets = nms(box, nms_thresh)
+            if max_per_img > 0:
+                if len(nms_dets) > max_per_img:
+                    keep = np.argsort(-nms_dets[:, 1])[:max_per_img]
+                    nms_dets = nms_dets[keep]
+
+            img_id2bbox[img_id] = nms_dets
+
+        return img_id2bbox
+
+    def _reformat_results(self, img_id2bbox):
+        """reformat results"""
+        im_ids = img_id2bbox.keys()
+        results = []
+        for img_id in im_ids:  # output by original im_id order
+            if len(img_id2bbox[img_id]) == 0:
+                bbox = np.array(
+                    [[-1., 0., 0., 0., 0., 0.]])  # edge case: no detections
+                bbox_num = np.array([0])
+            else:
+                # np.array of shape [N, 6], 6 is [label, score, x1, y1, x2, y2]
+                bbox = img_id2bbox[img_id]
+                bbox_num = np.array([len(bbox)])
+            res = dict(
+                im_id=np.array([[img_id]]), bbox=bbox, bbox_num=bbox_num)
+            results.append(res)
+        return results

+ 170 - 0
paddlers/models/ppdet/data/crop_utils/chip_box_utils.py

@@ -0,0 +1,170 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import numpy as np
+
+
+def bbox_area(boxes):
+    return (boxes[:, 2] - boxes[:, 0]) * (boxes[:, 3] - boxes[:, 1])
+
+
+def intersection_over_box(chips, boxes):
+    """
+    intersection area over box area
+    :param chips:  C
+    :param boxes:  B
+    :return: iob, CxB
+    """
+    M = chips.shape[0]
+    N = boxes.shape[0]
+    if M * N == 0:
+        return np.zeros([M, N], dtype='float32')
+
+    box_area = bbox_area(boxes)  # B
+
+    inter_x2y2 = np.minimum(np.expand_dims(chips, 1)[:, :, 2:],
+                            boxes[:, 2:])  # CxBX2
+    inter_x1y1 = np.maximum(np.expand_dims(chips, 1)[:, :, :2],
+                            boxes[:, :2])  # CxBx2
+    inter_wh = inter_x2y2 - inter_x1y1
+    inter_wh = np.clip(inter_wh, a_min=0, a_max=None)
+    inter_area = inter_wh[:, :, 0] * inter_wh[:, :, 1]  # CxB
+
+    iob = inter_area / np.expand_dims(box_area, 0)
+    return iob
+
+
+def clip_boxes(boxes, im_shape):
+    """
+    Clip boxes to image boundaries.
+    :param boxes: [N, 4]
+    :param im_shape: tuple of 2, [h, w]
+    :return: [N, 4]
+    """
+    # x1 >= 0
+    boxes[:, 0] = np.clip(boxes[:, 0], 0, im_shape[1] - 1)
+    # y1 >= 0
+    boxes[:, 1] = np.clip(boxes[:, 1], 0, im_shape[0] - 1)
+    # x2 < im_shape[1]
+    boxes[:, 2] = np.clip(boxes[:, 2], 1, im_shape[1])
+    # y2 < im_shape[0]
+    boxes[:, 3] = np.clip(boxes[:, 3], 1, im_shape[0])
+    return boxes
+
+
+def transform_chip_box(gt_bbox: 'Gx4', boxes_idx: 'B', chip: '4'):
+    boxes_idx = np.array(boxes_idx)
+    cur_gt_bbox = gt_bbox[boxes_idx].copy()  # Bx4
+    x1, y1, x2, y2 = chip
+    cur_gt_bbox[:, 0] -= x1
+    cur_gt_bbox[:, 1] -= y1
+    cur_gt_bbox[:, 2] -= x1
+    cur_gt_bbox[:, 3] -= y1
+    h = y2 - y1
+    w = x2 - x1
+    cur_gt_bbox = clip_boxes(cur_gt_bbox, (h, w))
+    ws = (cur_gt_bbox[:, 2] - cur_gt_bbox[:, 0]).astype(np.int32)
+    hs = (cur_gt_bbox[:, 3] - cur_gt_bbox[:, 1]).astype(np.int32)
+    valid_idx = (ws >= 2) & (hs >= 2)
+    return cur_gt_bbox[valid_idx], boxes_idx[valid_idx]
+
+
+def find_chips_to_cover_overlaped_boxes(iob, overlap_threshold):
+    chip_ids, box_ids = np.nonzero(iob >= overlap_threshold)
+    chip_id2overlap_box_num = np.bincount(chip_ids)  # 1d array
+    chip_id2overlap_box_num = np.pad(
+        chip_id2overlap_box_num, (0, len(iob) - len(chip_id2overlap_box_num)),
+        constant_values=0)
+
+    chosen_chip_ids = []
+    while len(box_ids) > 0:
+        value_counts = np.bincount(chip_ids)  # 1d array
+        max_count_chip_id = np.argmax(value_counts)
+        assert max_count_chip_id not in chosen_chip_ids
+        chosen_chip_ids.append(max_count_chip_id)
+
+        box_ids_in_cur_chip = box_ids[chip_ids == max_count_chip_id]
+        ids_not_in_cur_boxes_mask = np.logical_not(
+            np.isin(box_ids, box_ids_in_cur_chip))
+        chip_ids = chip_ids[ids_not_in_cur_boxes_mask]
+        box_ids = box_ids[ids_not_in_cur_boxes_mask]
+    return chosen_chip_ids, chip_id2overlap_box_num
+
+
+def transform_chip_boxes2image_boxes(chip_boxes, chip, img_h, img_w):
+    chip_boxes = np.array(sorted(chip_boxes, key=lambda item: -item[1]))
+    xmin, ymin, _, _ = chip
+    # Transform to origin image loc
+    chip_boxes[:, 2] += xmin
+    chip_boxes[:, 4] += xmin
+    chip_boxes[:, 3] += ymin
+    chip_boxes[:, 5] += ymin
+    chip_boxes = clip_boxes(chip_boxes, (img_h, img_w))
+    return chip_boxes
+
+
+def nms(dets, thresh):
+    """Apply classic DPM-style greedy NMS."""
+    if dets.shape[0] == 0:
+        return dets[[], :]
+    scores = dets[:, 1]
+    x1 = dets[:, 2]
+    y1 = dets[:, 3]
+    x2 = dets[:, 4]
+    y2 = dets[:, 5]
+
+    areas = (x2 - x1 + 1) * (y2 - y1 + 1)
+    order = scores.argsort()[::-1]
+
+    ndets = dets.shape[0]
+    suppressed = np.zeros((ndets), dtype=np.int)
+
+    # nominal indices
+    # _i, _j
+    # sorted indices
+    # i, j
+    # temp variables for box i's (the box currently under consideration)
+    # ix1, iy1, ix2, iy2, iarea
+
+    # variables for computing overlap with box j (lower scoring box)
+    # xx1, yy1, xx2, yy2
+    # w, h
+    # inter, ovr
+
+    for _i in range(ndets):
+        i = order[_i]
+        if suppressed[i] == 1:
+            continue
+        ix1 = x1[i]
+        iy1 = y1[i]
+        ix2 = x2[i]
+        iy2 = y2[i]
+        iarea = areas[i]
+        for _j in range(_i + 1, ndets):
+            j = order[_j]
+            if suppressed[j] == 1:
+                continue
+            xx1 = max(ix1, x1[j])
+            yy1 = max(iy1, y1[j])
+            xx2 = min(ix2, x2[j])
+            yy2 = min(iy2, y2[j])
+            w = max(0.0, xx2 - xx1 + 1)
+            h = max(0.0, yy2 - yy1 + 1)
+            inter = w * h
+            ovr = inter / (iarea + areas[j] - inter)
+            if ovr >= thresh:
+                suppressed[j] = 1
+    keep = np.where(suppressed == 0)[0]
+    dets = dets[keep, :]
+    return dets

+ 302 - 0
paddlers/models/ppdet/data/reader.py

@@ -0,0 +1,302 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+import traceback
+import six
+import sys
+if sys.version_info >= (3, 0):
+    pass
+else:
+    pass
+import numpy as np
+
+from paddle.io import DataLoader, DistributedBatchSampler
+from paddle.fluid.dataloader.collate import default_collate_fn
+
+from paddlers.models.ppdet.core.workspace import register
+from . import transform
+from .shm_utils import _get_shared_memory_size_in_M
+
+from paddlers.models.ppdet.utils.logger import setup_logger
+# Module-level logger, created under the name 'reader'.
+logger = setup_logger('reader')
+
+# PID of the process that imported this module, captured once at import time.
+MAIN_PID = os.getpid()
+
+
+class Compose(object):
+    def __init__(self, transforms, num_classes=80):
+        self.transforms = transforms
+        self.transforms_cls = []
+        for t in self.transforms:
+            for k, v in t.items():
+                op_cls = getattr(transform, k)
+                f = op_cls(**v)
+                if hasattr(f, 'num_classes'):
+                    f.num_classes = num_classes
+
+                self.transforms_cls.append(f)
+
+    def __call__(self, data):
+        for f in self.transforms_cls:
+            try:
+                data = f(data)
+            except Exception as e:
+                stack_info = traceback.format_exc()
+                logger.warning("fail to map sample transform [{}] "
+                               "with error: {} and stack:\n{}".format(
+                                   f, e, str(stack_info)))
+                raise e
+
+        return data
+
+
+class BatchCompose(Compose):
+    def __init__(self, transforms, num_classes=80, collate_batch=True):
+        super(BatchCompose, self).__init__(transforms, num_classes)
+        self.collate_batch = collate_batch
+
+    def __call__(self, data):
+        for f in self.transforms_cls:
+            try:
+                data = f(data)
+            except Exception as e:
+                stack_info = traceback.format_exc()
+                logger.warning("fail to map batch transform [{}] "
+                               "with error: {} and stack:\n{}".format(
+                                   f, e, str(stack_info)))
+                raise e
+
+        # remove keys which is not needed by model
+        extra_key = ['h', 'w', 'flipped']
+        for k in extra_key:
+            for sample in data:
+                if k in sample:
+                    sample.pop(k)
+
+        # batch data, if user-define batch function needed
+        # use user-defined here
+        if self.collate_batch:
+            batch_data = default_collate_fn(data)
+        else:
+            batch_data = {}
+            for k in data[0].keys():
+                tmp_data = []
+                for i in range(len(data)):
+                    tmp_data.append(data[i][k])
+                if not 'gt_' in k and not 'is_crowd' in k and not 'difficult' in k:
+                    tmp_data = np.stack(tmp_data, axis=0)
+                batch_data[k] = tmp_data
+        return batch_data
+
+
+class BaseDataLoader(object):
+    """
+    Base DataLoader implementation for detection models
+
+    Args:
+        sample_transforms (list): a list of transforms to perform
+                                  on each sample
+        batch_transforms (list): a list of transforms to perform
+                                 on batch
+        batch_size (int): batch size for batch collating, default 1.
+        shuffle (bool): whether to shuffle samples
+        drop_last (bool): whether to drop the last incomplete batch,
+                          default False
+        num_classes (int): class number of dataset, default 80
+        collate_batch (bool): whether to collate batch in dataloader.
+            If set to True, the samples will collate into batch according
+            to the batch size. Otherwise, the ground-truth will not collate,
+            which is used when the number of ground-truth is different in
+            samples.
+        use_shared_memory (bool): whether to use shared memory to
+                accelerate data loading, enable this only if you
+                are sure that the shared memory size of your OS
+                is larger than memory cost of input datas of model.
+                Note that shared memory will be automatically
+                disabled if the shared memory of OS is less than
+                1G, which is not enough for detection models.
+                Default False.
+        **kwargs: stored and later passed to `dataset.set_kwargs`.
+    """
+
+    # NOTE(review): the list defaults below are shared between calls; in the
+    # code visible here they are only stored and iterated, never mutated.
+    def __init__(self,
+                 sample_transforms=[],
+                 batch_transforms=[],
+                 batch_size=1,
+                 shuffle=False,
+                 drop_last=False,
+                 num_classes=80,
+                 collate_batch=True,
+                 use_shared_memory=False,
+                 **kwargs):
+        # sample transform
+        self._sample_transforms = Compose(
+            sample_transforms, num_classes=num_classes)
+
+        # batch transform
+        self._batch_transforms = BatchCompose(batch_transforms, num_classes,
+                                              collate_batch)
+        self.batch_size = batch_size
+        self.shuffle = shuffle
+        self.drop_last = drop_last
+        self.use_shared_memory = use_shared_memory
+        self.kwargs = kwargs
+
+    def __call__(self,
+                 dataset,
+                 worker_num,
+                 batch_sampler=None,
+                 return_list=False):
+        """
+        Bind the loader to a dataset and create the underlying DataLoader.
+
+        Args:
+            dataset: dataset object; it is prepared here
+                (check_or_download_dataset, parse_dataset) and given the
+                sample transforms and the stored kwargs
+            worker_num (int): passed to DataLoader as num_workers
+            batch_sampler: sampler to use; a DistributedBatchSampler is
+                created from batch_size/shuffle/drop_last when None
+            return_list (bool): passed to DataLoader
+
+        Returns:
+            self, ready to be iterated.
+        """
+        self.dataset = dataset
+        self.dataset.check_or_download_dataset()
+        self.dataset.parse_dataset()
+        # get data
+        self.dataset.set_transform(self._sample_transforms)
+        # set kwargs
+        self.dataset.set_kwargs(**self.kwargs)
+        # batch sampler
+        if batch_sampler is None:
+            self._batch_sampler = DistributedBatchSampler(
+                self.dataset,
+                batch_size=self.batch_size,
+                shuffle=self.shuffle,
+                drop_last=self.drop_last)
+        else:
+            self._batch_sampler = batch_sampler
+
+        # DataLoader do not start sub-process in Windows and Mac
+        # system, do not need to use shared memory
+        use_shared_memory = self.use_shared_memory and \
+                            sys.platform not in ['win32', 'darwin']
+        # check whether shared memory size is bigger than 1G(1024M)
+        if use_shared_memory:
+            shm_size = _get_shared_memory_size_in_M()
+            if shm_size is not None and shm_size < 1024.:
+                logger.warning("Shared memory size is less than 1G, "
+                               "disable shared_memory in DataLoader")
+                use_shared_memory = False
+
+        self.dataloader = DataLoader(
+            dataset=self.dataset,
+            batch_sampler=self._batch_sampler,
+            collate_fn=self._batch_transforms,
+            num_workers=worker_num,
+            return_list=return_list,
+            use_shared_memory=use_shared_memory)
+        self.loader = iter(self.dataloader)
+
+        return self
+
+    def __len__(self):
+        """Number of batches, as reported by the batch sampler."""
+        return len(self._batch_sampler)
+
+    def __iter__(self):
+        return self
+
+    def __next__(self):
+        """
+        Return the next batch. When the underlying iterator is exhausted it
+        is re-created, so the loader can be iterated again, and the
+        StopIteration is re-raised.
+        """
+        try:
+            return next(self.loader)
+        except StopIteration:
+            self.loader = iter(self.dataloader)
+            six.reraise(*sys.exc_info())
+
+    def next(self):
+        # python2 compatibility
+        return self.__next__()
+
+
+@register
+class TrainReader(BaseDataLoader):
+    __shared__ = ['num_classes']
+
+    def __init__(self,
+                 sample_transforms=[],
+                 batch_transforms=[],
+                 batch_size=1,
+                 shuffle=True,
+                 drop_last=True,
+                 num_classes=80,
+                 collate_batch=True,
+                 **kwargs):
+        super(TrainReader, self).__init__(sample_transforms, batch_transforms,
+                                          batch_size, shuffle, drop_last,
+                                          num_classes, collate_batch, **kwargs)
+
+
+@register
+class EvalReader(BaseDataLoader):
+    __shared__ = ['num_classes']
+
+    def __init__(self,
+                 sample_transforms=[],
+                 batch_transforms=[],
+                 batch_size=1,
+                 shuffle=False,
+                 drop_last=True,
+                 num_classes=80,
+                 **kwargs):
+        super(EvalReader, self).__init__(sample_transforms, batch_transforms,
+                                         batch_size, shuffle, drop_last,
+                                         num_classes, **kwargs)
+
+
+@register
+class TestReader(BaseDataLoader):
+    __shared__ = ['num_classes']
+
+    def __init__(self,
+                 sample_transforms=[],
+                 batch_transforms=[],
+                 batch_size=1,
+                 shuffle=False,
+                 drop_last=False,
+                 num_classes=80,
+                 **kwargs):
+        super(TestReader, self).__init__(sample_transforms, batch_transforms,
+                                         batch_size, shuffle, drop_last,
+                                         num_classes, **kwargs)
+
+
+@register
+class EvalMOTReader(BaseDataLoader):
+    __shared__ = ['num_classes']
+
+    def __init__(self,
+                 sample_transforms=[],
+                 batch_transforms=[],
+                 batch_size=1,
+                 shuffle=False,
+                 drop_last=False,
+                 num_classes=1,
+                 **kwargs):
+        super(EvalMOTReader, self).__init__(
+            sample_transforms, batch_transforms, batch_size, shuffle,
+            drop_last, num_classes, **kwargs)
+
+
+@register
+class TestMOTReader(BaseDataLoader):
+    __shared__ = ['num_classes']
+
+    def __init__(self,
+                 sample_transforms=[],
+                 batch_transforms=[],
+                 batch_size=1,
+                 shuffle=False,
+                 drop_last=False,
+                 num_classes=1,
+                 **kwargs):
+        super(TestMOTReader, self).__init__(
+            sample_transforms, batch_transforms, batch_size, shuffle,
+            drop_last, num_classes, **kwargs)

+ 67 - 0
paddlers/models/ppdet/data/shm_utils.py

@@ -0,0 +1,67 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+
+# Size suffixes accepted by _parse_size_in_M, ordered so that each entry is
+# 1024 times larger than the previous one ('M' is the 1x reference).
+SIZE_UNIT = ['K', 'M', 'G', 'T']
+# Shell command whose output is scanned for shared-memory entries.
+SHM_QUERY_CMD = 'df -h'
+# Substring that marks an output line of SHM_QUERY_CMD as a shared-memory entry.
+SHM_KEY = 'shm'
+# Mount point preferred when several shared-memory entries are listed.
+SHM_DEFAULT_MOUNT = '/dev/shm'
+
+# [ shared memory size check ]
+# In detection models, image/target data occupies a lot of memory and
+# therefore a lot of shared memory in a multi-process DataLoader. The
+# code below queries the shared memory size so that callers can perform
+# a size check and disable shared memory use when it is not large enough.
+# The shared memory size is obtained as follows:
+# 1. use `df -h` to get all mount info
+# 2. pick the entries whose mount info contains 'shm'
+# 3. if there is exactly one 'shm' entry, return its size
+# 4. if there are multiple 'shm' entries, prefer the one mounted at the
+#    default directory '/dev/shm' used on Linux-like systems; otherwise
+#    return the biggest size among them.
+
+
+def _parse_size_in_M(size_str):
+    """Convert a human-readable size string into megabytes.
+
+    Accepted forms are a number followed by one of the ``SIZE_UNIT``
+    suffixes (case-insensitive), optionally followed by ``B``, ``i`` or
+    ``iB`` (``"64M"``, ``"64MB"``, ``"64Mi"``, ``"64MiB"``), and a bare
+    number such as ``"0"`` or ``"512B"``, which is interpreted as bytes.
+
+    Args:
+        size_str (str): size field, e.g. one column of ``df -h`` output.
+
+    Returns:
+        float: the size expressed in megabytes (1024-based).
+
+    Raises:
+        ValueError: if the unit suffix is unknown or the numeric part
+            cannot be parsed. An explicit exception is used instead of
+            ``assert`` so the check is not stripped under ``python -O``.
+    """
+    text = size_str.strip()
+    # Drop an optional byte / binary marker so "64MiB", "64Mi" and "64MB"
+    # all reduce to "64M". Longest suffix first to avoid a partial strip.
+    for suffix in ('iB', 'i', 'B'):
+        if text.endswith(suffix):
+            text = text[:-len(suffix)]
+            break
+
+    unit = text[-1:].upper()
+    if unit in SIZE_UNIT:
+        num = text[:-1]
+        # 'M' sits at index 1, so it maps to 1024 ** 0.
+        exponent = SIZE_UNIT.index(unit) - 1
+    elif unit.isalpha():
+        raise ValueError("unknown shm size unit {}".format(text[-1]))
+    else:
+        # No unit at all (e.g. the "0" printed for an empty column):
+        # treat the value as a byte count.
+        num = text
+        exponent = -2
+    return float(num) * (1024 ** exponent)
+
+
+def _get_shared_memory_size_in_M():
+    """Probe the shared memory size, in megabytes, via ``SHM_QUERY_CMD``.
+
+    Output lines containing ``SHM_KEY`` are split on whitespace and field
+    index 3 is parsed with ``_parse_size_in_M`` (in the usual ``df -h``
+    layout that is the "Avail" column). With several matching entries the
+    one whose last field equals ``SHM_DEFAULT_MOUNT`` wins; if none
+    matches, the largest parsed value is returned.
+
+    Returns:
+        float or None: the size in megabytes, or ``None`` when the command
+        cannot be run or lists no usable shared-memory entry.
+    """
+    try:
+        # The context manager closes the pipe instead of leaking it.
+        with os.popen(SHM_QUERY_CMD) as pipe:
+            df_infos = pipe.readlines()
+    except Exception:
+        # Best-effort probe: any failure to run the command means
+        # "size unknown", never a crash in the caller.
+        return None
+
+    shm_infos = []
+    for df_info in df_infos:
+        if SHM_KEY not in df_info:
+            continue
+        fields = df_info.split()
+        # Skip truncated rows that do not carry the size field read below.
+        if len(fields) >= 4:
+            shm_infos.append(fields)
+
+    if not shm_infos:
+        return None
+    if len(shm_infos) == 1:
+        return _parse_size_in_M(shm_infos[0][3])
+
+    # Several candidates: prefer the default mount point.
+    for fields in shm_infos:
+        if fields[-1] == SHM_DEFAULT_MOUNT:
+            return _parse_size_in_M(fields[3])
+    return max(_parse_size_in_M(fields[3]) for fields in shm_infos)

+ 29 - 0
paddlers/models/ppdet/data/source/__init__.py

@@ -0,0 +1,29 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from . import coco
+from . import voc
+from . import widerface
+from . import category
+from . import keypoint_coco
+from . import mot
+from . import sniper_coco
+
+from .coco import *
+from .voc import *
+from .widerface import *
+from .category import *
+from .keypoint_coco import *
+from .mot import *
+from .sniper_coco import SniperCOCODataSet

+ 904 - 0
paddlers/models/ppdet/data/source/category.py

@@ -0,0 +1,904 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import os
+
+from paddlers.models.ppdet.data.source.voc import pascalvoc_label
+from paddlers.models.ppdet.data.source.widerface import widerface_label
+from paddlers.models.ppdet.utils.logger import setup_logger
+logger = setup_logger(__name__)
+
+__all__ = ['get_categories']
+
+
+def _load_label_list_categories(anno_file):
+    """Build identity class maps from a one-label-per-line text file.
+
+    A leading 'background' entry is dropped. An empty file yields two
+    empty dicts instead of raising IndexError.
+    """
+    with open(anno_file) as f:
+        cats = [line.strip() for line in f]
+
+    # `cats and` guards the empty-file case before looking at cats[0].
+    if cats and cats[0] == 'background':
+        cats = cats[1:]
+
+    clsid2catid = {i: i for i in range(len(cats))}
+    catid2name = {i: name for i, name in enumerate(cats)}
+    return clsid2catid, catid2name
+
+
+def get_categories(metric_type, anno_file=None, arch=None):
+    """
+    Get class id to category id map and category id
+    to category name map from annotation file.
+
+    Args:
+        metric_type (str): metric type, compared case-insensitively.
+            Supported values: 'coco', 'rbox', 'snipercoco', 'voc', 'oid',
+            'widerface', 'keypointtopdowncocoeval',
+            'keypointtopdownmpiieval', 'mot', 'motdet', 'reid', 'kitti',
+            'bdd100kmot' and 'mcmot'.
+        anno_file (str): annotation file path. For the COCO-style metrics
+            it is read with pycocotools; for 'voc', 'mot', 'motdet',
+            'reid' and 'mcmot' it is a text file with one label per line.
+            When it is missing or not a file, built-in default categories
+            are returned. It is not read for the other metric types.
+        arch (str): when equal to 'keypoint_arch', the keypoint
+            placeholder maps are returned regardless of `metric_type`.
+
+    Returns:
+        tuple: (clsid2catid, catid2name). For keypoint metrics this is
+        (None, {'id': 'keypoint'}).
+
+    Raises:
+        ValueError: if `metric_type` is not one of the supported values.
+    """
+    if arch == 'keypoint_arch':
+        return (None, {'id': 'keypoint'})
+
+    metric = metric_type.lower()
+
+    if metric in ('coco', 'rbox', 'snipercoco'):
+        if anno_file and os.path.isfile(anno_file):
+            # lazy import pycocotools here
+            from pycocotools.coco import COCO
+
+            coco = COCO(anno_file)
+            cats = coco.loadCats(coco.getCatIds())
+
+            clsid2catid = {i: cat['id'] for i, cat in enumerate(cats)}
+            catid2name = {cat['id']: cat['name'] for cat in cats}
+            return clsid2catid, catid2name
+
+        # anno file does not exist: fall back to the DOTA defaults for
+        # 'rbox' and to the COCO17 defaults otherwise
+        if metric == 'rbox':
+            return _dota_category()
+        return _coco17_category()
+
+    if metric == 'voc':
+        if anno_file and os.path.isfile(anno_file):
+            return _load_label_list_categories(anno_file)
+        # anno file does not exist: load all 20 default VOC categories
+        return _vocall_category()
+
+    if metric == 'oid':
+        if anno_file and os.path.isfile(anno_file):
+            logger.warning("only default categories support for OID19")
+        return _oid19_category()
+
+    if metric == 'widerface':
+        return _widerface_category()
+
+    if metric in ('keypointtopdowncocoeval', 'keypointtopdownmpiieval'):
+        return (None, {'id': 'keypoint'})
+
+    if metric in ('mot', 'motdet', 'reid'):
+        if anno_file and os.path.isfile(anno_file):
+            return _load_label_list_categories(anno_file)
+        # anno file does not exist: load default category 'pedestrian'
+        return _mot_category(category='pedestrian')
+
+    if metric in ('kitti', 'bdd100kmot'):
+        return _mot_category(category='vehicle')
+
+    if metric == 'mcmot':
+        if anno_file and os.path.isfile(anno_file):
+            return _load_label_list_categories(anno_file)
+        # anno file does not exist: load all 10 default visdrone categories
+        return _visdrone_category()
+
+    raise ValueError("unknown metric type {}".format(metric_type))
+
+
+def _mot_category(category='pedestrian'):
+    """
+    Build the class maps of a single-category MOT dataset.
+
+    Args:
+        category (str): name of the only category; it gets id 0.
+
+    Returns:
+        tuple: (clsid2catid, catid2name), each with the single key 0.
+    """
+    name_to_id = {category: 0}
+    ordered = sorted(name_to_id.items(), key=lambda pair: pair[1])
+    names = [name for name, _ in ordered]
+
+    catid2name = dict(enumerate(names))
+    clsid2catid = {idx: idx for idx in catid2name}
+
+    return clsid2catid, catid2name
+
+
+def _coco17_category():
+    """
+    Return the default class maps of the COCO2017 dataset.
+
+    Returns:
+        tuple: (clsid2catid, catid2name). `catid2name` maps the 80
+        non-contiguous COCO category ids (1..90, no background entry) to
+        names; `clsid2catid` maps the contiguous class ids 0..79 to those
+        category ids in ascending order.
+    """
+    # Category ids are listed in ascending order; the gaps are ids the
+    # table does not use.
+    catid2name = {
+        1: 'person',
+        2: 'bicycle',
+        3: 'car',
+        4: 'motorcycle',
+        5: 'airplane',
+        6: 'bus',
+        7: 'train',
+        8: 'truck',
+        9: 'boat',
+        10: 'traffic light',
+        11: 'fire hydrant',
+        13: 'stop sign',
+        14: 'parking meter',
+        15: 'bench',
+        16: 'bird',
+        17: 'cat',
+        18: 'dog',
+        19: 'horse',
+        20: 'sheep',
+        21: 'cow',
+        22: 'elephant',
+        23: 'bear',
+        24: 'zebra',
+        25: 'giraffe',
+        27: 'backpack',
+        28: 'umbrella',
+        31: 'handbag',
+        32: 'tie',
+        33: 'suitcase',
+        34: 'frisbee',
+        35: 'skis',
+        36: 'snowboard',
+        37: 'sports ball',
+        38: 'kite',
+        39: 'baseball bat',
+        40: 'baseball glove',
+        41: 'skateboard',
+        42: 'surfboard',
+        43: 'tennis racket',
+        44: 'bottle',
+        46: 'wine glass',
+        47: 'cup',
+        48: 'fork',
+        49: 'knife',
+        50: 'spoon',
+        51: 'bowl',
+        52: 'banana',
+        53: 'apple',
+        54: 'sandwich',
+        55: 'orange',
+        56: 'broccoli',
+        57: 'carrot',
+        58: 'hot dog',
+        59: 'pizza',
+        60: 'donut',
+        61: 'cake',
+        62: 'chair',
+        63: 'couch',
+        64: 'potted plant',
+        65: 'bed',
+        67: 'dining table',
+        70: 'toilet',
+        72: 'tv',
+        73: 'laptop',
+        74: 'mouse',
+        75: 'remote',
+        76: 'keyboard',
+        77: 'cell phone',
+        78: 'microwave',
+        79: 'oven',
+        80: 'toaster',
+        81: 'sink',
+        82: 'refrigerator',
+        84: 'book',
+        85: 'clock',
+        86: 'vase',
+        87: 'scissors',
+        88: 'teddy bear',
+        89: 'hair drier',
+        90: 'toothbrush'
+    }
+
+    # The n-th category id (in table order) is class id n, counted from 0.
+    clsid2catid = {
+        cls_id: cat_id
+        for cls_id, cat_id in enumerate(catid2name)
+    }
+
+    return clsid2catid, catid2name
+
+
+def _dota_category():
+    """
+    Return the default class maps of the DOTA dataset.
+
+    Returns:
+        tuple: (clsid2catid, catid2name). `catid2name` maps category ids
+        1..15 to names (no background entry); `clsid2catid` maps class
+        ids 0..14 to category ids 1..15.
+    """
+    names = (
+        'plane',
+        'baseball-diamond',
+        'bridge',
+        'ground-track-field',
+        'small-vehicle',
+        'large-vehicle',
+        'ship',
+        'tennis-court',
+        'basketball-court',
+        'storage-tank',
+        'soccer-ball-field',
+        'roundabout',
+        'harbor',
+        'swimming-pool',
+        'helicopter', )
+    # Category ids start at 1; id 0 would be the background.
+    catid2name = dict(enumerate(names, 1))
+    clsid2catid = {cat_id - 1: cat_id for cat_id in catid2name}
+    return clsid2catid, catid2name
+
+
+def _vocall_category():
+    """
+    Return the class maps built from the `pascalvoc_label()` table.
+
+    Names are ordered by their label value; class id and category id are
+    both the position of a name in that ordering.
+    """
+    ordered = sorted(pascalvoc_label().items(), key=lambda pair: pair[1])
+    names = [name for name, _ in ordered]
+
+    catid2name = dict(enumerate(names))
+    clsid2catid = {idx: idx for idx in catid2name}
+
+    return clsid2catid, catid2name
+
+
+def _widerface_category():
+    """
+    Return the class maps built from the `widerface_label()` table.
+
+    Names are ordered by their label value; class id and category id are
+    both the position of a name in that ordering.
+    """
+    ordered = sorted(widerface_label().items(), key=lambda pair: pair[1])
+    names = [name for name, _ in ordered]
+
+    catid2name = dict(enumerate(names))
+    clsid2catid = {idx: idx for idx in catid2name}
+
+    return clsid2catid, catid2name
+
+
+def _oid19_category():
+    """
+    Return the built-in class maps used for metric type 'oid'.
+
+    Returns:
+        tuple: (clsid2catid, catid2name). `clsid2catid` maps class ids
+        0..499 to category ids 1..500. `catid2name` maps category ids
+        0..500 to names, including key 0 ('background'), which no class
+        id maps to.
+    """
+    # Class id k corresponds to category id k + 1.
+    clsid2catid = {k: k + 1 for k in range(500)}
+
+    # Hard-coded table; some names occur under more than one id
+    # (e.g. "Bench" at 15 and 72, "Mouse" at 326 and 450).
+    catid2name = {
+        0: "background",
+        1: "Infant bed",
+        2: "Rose",
+        3: "Flag",
+        4: "Flashlight",
+        5: "Sea turtle",
+        6: "Camera",
+        7: "Animal",
+        8: "Glove",
+        9: "Crocodile",
+        10: "Cattle",
+        11: "House",
+        12: "Guacamole",
+        13: "Penguin",
+        14: "Vehicle registration plate",
+        15: "Bench",
+        16: "Ladybug",
+        17: "Human nose",
+        18: "Watermelon",
+        19: "Flute",
+        20: "Butterfly",
+        21: "Washing machine",
+        22: "Raccoon",
+        23: "Segway",
+        24: "Taco",
+        25: "Jellyfish",
+        26: "Cake",
+        27: "Pen",
+        28: "Cannon",
+        29: "Bread",
+        30: "Tree",
+        31: "Shellfish",
+        32: "Bed",
+        33: "Hamster",
+        34: "Hat",
+        35: "Toaster",
+        36: "Sombrero",
+        37: "Tiara",
+        38: "Bowl",
+        39: "Dragonfly",
+        40: "Moths and butterflies",
+        41: "Antelope",
+        42: "Vegetable",
+        43: "Torch",
+        44: "Building",
+        45: "Power plugs and sockets",
+        46: "Blender",
+        47: "Billiard table",
+        48: "Cutting board",
+        49: "Bronze sculpture",
+        50: "Turtle",
+        51: "Broccoli",
+        52: "Tiger",
+        53: "Mirror",
+        54: "Bear",
+        55: "Zucchini",
+        56: "Dress",
+        57: "Volleyball",
+        58: "Guitar",
+        59: "Reptile",
+        60: "Golf cart",
+        61: "Tart",
+        62: "Fedora",
+        63: "Carnivore",
+        64: "Car",
+        65: "Lighthouse",
+        66: "Coffeemaker",
+        67: "Food processor",
+        68: "Truck",
+        69: "Bookcase",
+        70: "Surfboard",
+        71: "Footwear",
+        72: "Bench",
+        73: "Necklace",
+        74: "Flower",
+        75: "Radish",
+        76: "Marine mammal",
+        77: "Frying pan",
+        78: "Tap",
+        79: "Peach",
+        80: "Knife",
+        81: "Handbag",
+        82: "Laptop",
+        83: "Tent",
+        84: "Ambulance",
+        85: "Christmas tree",
+        86: "Eagle",
+        87: "Limousine",
+        88: "Kitchen & dining room table",
+        89: "Polar bear",
+        90: "Tower",
+        91: "Football",
+        92: "Willow",
+        93: "Human head",
+        94: "Stop sign",
+        95: "Banana",
+        96: "Mixer",
+        97: "Binoculars",
+        98: "Dessert",
+        99: "Bee",
+        100: "Chair",
+        101: "Wood-burning stove",
+        102: "Flowerpot",
+        103: "Beaker",
+        104: "Oyster",
+        105: "Woodpecker",
+        106: "Harp",
+        107: "Bathtub",
+        108: "Wall clock",
+        109: "Sports uniform",
+        110: "Rhinoceros",
+        111: "Beehive",
+        112: "Cupboard",
+        113: "Chicken",
+        114: "Man",
+        115: "Blue jay",
+        116: "Cucumber",
+        117: "Balloon",
+        118: "Kite",
+        119: "Fireplace",
+        120: "Lantern",
+        121: "Missile",
+        122: "Book",
+        123: "Spoon",
+        124: "Grapefruit",
+        125: "Squirrel",
+        126: "Orange",
+        127: "Coat",
+        128: "Punching bag",
+        129: "Zebra",
+        130: "Billboard",
+        131: "Bicycle",
+        132: "Door handle",
+        133: "Mechanical fan",
+        134: "Ring binder",
+        135: "Table",
+        136: "Parrot",
+        137: "Sock",
+        138: "Vase",
+        139: "Weapon",
+        140: "Shotgun",
+        141: "Glasses",
+        142: "Seahorse",
+        143: "Belt",
+        144: "Watercraft",
+        145: "Window",
+        146: "Giraffe",
+        147: "Lion",
+        148: "Tire",
+        149: "Vehicle",
+        150: "Canoe",
+        151: "Tie",
+        152: "Shelf",
+        153: "Picture frame",
+        154: "Printer",
+        155: "Human leg",
+        156: "Boat",
+        157: "Slow cooker",
+        158: "Croissant",
+        159: "Candle",
+        160: "Pancake",
+        161: "Pillow",
+        162: "Coin",
+        163: "Stretcher",
+        164: "Sandal",
+        165: "Woman",
+        166: "Stairs",
+        167: "Harpsichord",
+        168: "Stool",
+        169: "Bus",
+        170: "Suitcase",
+        171: "Human mouth",
+        172: "Juice",
+        173: "Skull",
+        174: "Door",
+        175: "Violin",
+        176: "Chopsticks",
+        177: "Digital clock",
+        178: "Sunflower",
+        179: "Leopard",
+        180: "Bell pepper",
+        181: "Harbor seal",
+        182: "Snake",
+        183: "Sewing machine",
+        184: "Goose",
+        185: "Helicopter",
+        186: "Seat belt",
+        187: "Coffee cup",
+        188: "Microwave oven",
+        189: "Hot dog",
+        190: "Countertop",
+        191: "Serving tray",
+        192: "Dog bed",
+        193: "Beer",
+        194: "Sunglasses",
+        195: "Golf ball",
+        196: "Waffle",
+        197: "Palm tree",
+        198: "Trumpet",
+        199: "Ruler",
+        200: "Helmet",
+        201: "Ladder",
+        202: "Office building",
+        203: "Tablet computer",
+        204: "Toilet paper",
+        205: "Pomegranate",
+        206: "Skirt",
+        207: "Gas stove",
+        208: "Cookie",
+        209: "Cart",
+        210: "Raven",
+        211: "Egg",
+        212: "Burrito",
+        213: "Goat",
+        214: "Kitchen knife",
+        215: "Skateboard",
+        216: "Salt and pepper shakers",
+        217: "Lynx",
+        218: "Boot",
+        219: "Platter",
+        220: "Ski",
+        221: "Swimwear",
+        222: "Swimming pool",
+        223: "Drinking straw",
+        224: "Wrench",
+        225: "Drum",
+        226: "Ant",
+        227: "Human ear",
+        228: "Headphones",
+        229: "Fountain",
+        230: "Bird",
+        231: "Jeans",
+        232: "Television",
+        233: "Crab",
+        234: "Microphone",
+        235: "Home appliance",
+        236: "Snowplow",
+        237: "Beetle",
+        238: "Artichoke",
+        239: "Jet ski",
+        240: "Stationary bicycle",
+        241: "Human hair",
+        242: "Brown bear",
+        243: "Starfish",
+        244: "Fork",
+        245: "Lobster",
+        246: "Corded phone",
+        247: "Drink",
+        248: "Saucer",
+        249: "Carrot",
+        250: "Insect",
+        251: "Clock",
+        252: "Castle",
+        253: "Tennis racket",
+        254: "Ceiling fan",
+        255: "Asparagus",
+        256: "Jaguar",
+        257: "Musical instrument",
+        258: "Train",
+        259: "Cat",
+        260: "Rifle",
+        261: "Dumbbell",
+        262: "Mobile phone",
+        263: "Taxi",
+        264: "Shower",
+        265: "Pitcher",
+        266: "Lemon",
+        267: "Invertebrate",
+        268: "Turkey",
+        269: "High heels",
+        270: "Bust",
+        271: "Elephant",
+        272: "Scarf",
+        273: "Barrel",
+        274: "Trombone",
+        275: "Pumpkin",
+        276: "Box",
+        277: "Tomato",
+        278: "Frog",
+        279: "Bidet",
+        280: "Human face",
+        281: "Houseplant",
+        282: "Van",
+        283: "Shark",
+        284: "Ice cream",
+        285: "Swim cap",
+        286: "Falcon",
+        287: "Ostrich",
+        288: "Handgun",
+        289: "Whiteboard",
+        290: "Lizard",
+        291: "Pasta",
+        292: "Snowmobile",
+        293: "Light bulb",
+        294: "Window blind",
+        295: "Muffin",
+        296: "Pretzel",
+        297: "Computer monitor",
+        298: "Horn",
+        299: "Furniture",
+        300: "Sandwich",
+        301: "Fox",
+        302: "Convenience store",
+        303: "Fish",
+        304: "Fruit",
+        305: "Earrings",
+        306: "Curtain",
+        307: "Grape",
+        308: "Sofa bed",
+        309: "Horse",
+        310: "Luggage and bags",
+        311: "Desk",
+        312: "Crutch",
+        313: "Bicycle helmet",
+        314: "Tick",
+        315: "Airplane",
+        316: "Canary",
+        317: "Spatula",
+        318: "Watch",
+        319: "Lily",
+        320: "Kitchen appliance",
+        321: "Filing cabinet",
+        322: "Aircraft",
+        323: "Cake stand",
+        324: "Candy",
+        325: "Sink",
+        326: "Mouse",
+        327: "Wine",
+        328: "Wheelchair",
+        329: "Goldfish",
+        330: "Refrigerator",
+        331: "French fries",
+        332: "Drawer",
+        333: "Treadmill",
+        334: "Picnic basket",
+        335: "Dice",
+        336: "Cabbage",
+        337: "Football helmet",
+        338: "Pig",
+        339: "Person",
+        340: "Shorts",
+        341: "Gondola",
+        342: "Honeycomb",
+        343: "Doughnut",
+        344: "Chest of drawers",
+        345: "Land vehicle",
+        346: "Bat",
+        347: "Monkey",
+        348: "Dagger",
+        349: "Tableware",
+        350: "Human foot",
+        351: "Mug",
+        352: "Alarm clock",
+        353: "Pressure cooker",
+        354: "Human hand",
+        355: "Tortoise",
+        356: "Baseball glove",
+        357: "Sword",
+        358: "Pear",
+        359: "Miniskirt",
+        360: "Traffic sign",
+        361: "Girl",
+        362: "Roller skates",
+        363: "Dinosaur",
+        364: "Porch",
+        365: "Human beard",
+        366: "Submarine sandwich",
+        367: "Screwdriver",
+        368: "Strawberry",
+        369: "Wine glass",
+        370: "Seafood",
+        371: "Racket",
+        372: "Wheel",
+        373: "Sea lion",
+        374: "Toy",
+        375: "Tea",
+        376: "Tennis ball",
+        377: "Waste container",
+        378: "Mule",
+        379: "Cricket ball",
+        380: "Pineapple",
+        381: "Coconut",
+        382: "Doll",
+        383: "Coffee table",
+        384: "Snowman",
+        385: "Lavender",
+        386: "Shrimp",
+        387: "Maple",
+        388: "Cowboy hat",
+        389: "Goggles",
+        390: "Rugby ball",
+        391: "Caterpillar",
+        392: "Poster",
+        393: "Rocket",
+        394: "Organ",
+        395: "Saxophone",
+        396: "Traffic light",
+        397: "Cocktail",
+        398: "Plastic bag",
+        399: "Squash",
+        400: "Mushroom",
+        401: "Hamburger",
+        402: "Light switch",
+        403: "Parachute",
+        404: "Teddy bear",
+        405: "Winter melon",
+        406: "Deer",
+        407: "Musical keyboard",
+        408: "Plumbing fixture",
+        409: "Scoreboard",
+        410: "Baseball bat",
+        411: "Envelope",
+        412: "Adhesive tape",
+        413: "Briefcase",
+        414: "Paddle",
+        415: "Bow and arrow",
+        416: "Telephone",
+        417: "Sheep",
+        418: "Jacket",
+        419: "Boy",
+        420: "Pizza",
+        421: "Otter",
+        422: "Office supplies",
+        423: "Couch",
+        424: "Cello",
+        425: "Bull",
+        426: "Camel",
+        427: "Ball",
+        428: "Duck",
+        429: "Whale",
+        430: "Shirt",
+        431: "Tank",
+        432: "Motorcycle",
+        433: "Accordion",
+        434: "Owl",
+        435: "Porcupine",
+        436: "Sun hat",
+        437: "Nail",
+        438: "Scissors",
+        439: "Swan",
+        440: "Lamp",
+        441: "Crown",
+        442: "Piano",
+        443: "Sculpture",
+        444: "Cheetah",
+        445: "Oboe",
+        446: "Tin can",
+        447: "Mango",
+        448: "Tripod",
+        449: "Oven",
+        450: "Mouse",
+        451: "Barge",
+        452: "Coffee",
+        453: "Snowboard",
+        454: "Common fig",
+        455: "Salad",
+        456: "Marine invertebrates",
+        457: "Umbrella",
+        458: "Kangaroo",
+        459: "Human arm",
+        460: "Measuring cup",
+        461: "Snail",
+        462: "Loveseat",
+        463: "Suit",
+        464: "Teapot",
+        465: "Bottle",
+        466: "Alpaca",
+        467: "Kettle",
+        468: "Trousers",
+        469: "Popcorn",
+        470: "Centipede",
+        471: "Spider",
+        472: "Sparrow",
+        473: "Plate",
+        474: "Bagel",
+        475: "Personal care",
+        476: "Apple",
+        477: "Brassiere",
+        478: "Bathroom cabinet",
+        479: "studio couch",
+        480: "Computer keyboard",
+        481: "Table tennis racket",
+        482: "Sushi",
+        483: "Cabinetry",
+        484: "Street light",
+        485: "Towel",
+        486: "Nightstand",
+        487: "Rabbit",
+        488: "Dolphin",
+        489: "Dog",
+        490: "Jug",
+        491: "Wok",
+        492: "Fire hydrant",
+        493: "Human eye",
+        494: "Skyscraper",
+        495: "Backpack",
+        496: "Potato",
+        497: "Paper towel",
+        498: "Lifejacket",
+        499: "Bicycle wheel",
+        500: "Toilet",
+    }
+
+    return clsid2catid, catid2name
+
+
+def _visdrone_category():
+    """
+    Return the default 10-category visdrone class maps.
+
+    Returns:
+        tuple: (clsid2catid, catid2name). Class id and category id are
+        identical (0..9); `catid2name` maps each id to its name.
+    """
+    names = (
+        'pedestrian',
+        'people',
+        'bicycle',
+        'car',
+        'van',
+        'truck',
+        'tricycle',
+        'awning-tricycle',
+        'bus',
+        'motor', )
+
+    catid2name = dict(enumerate(names))
+    clsid2catid = {cat_id: cat_id for cat_id in catid2name}
+    return clsid2catid, catid2name

+ 251 - 0
paddlers/models/ppdet/data/source/coco.py

@@ -0,0 +1,251 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+import numpy as np
+from paddlers.models.ppdet.core.workspace import register, serializable
+from .dataset import DetDataset
+
+from paddlers.models.ppdet.utils.logger import setup_logger
+logger = setup_logger(__name__)
+
+
+@register
+@serializable
+class COCODataSet(DetDataset):
+    """
+    Load dataset with COCO format.
+
+    Args:
+        dataset_dir (str): root directory for dataset.
+        image_dir (str): directory for images.
+        anno_path (str): coco annotation file path.
+        data_fields (list): key name of data dictionary, at least have 'image'.
+        sample_num (int): number of samples to load, -1 means all.
+        load_crowd (bool): whether to load crowded ground-truth.
+            False as default
+        allow_empty (bool): whether to load empty entry. False as default
+        empty_ratio (float): the ratio of empty record number to total
+            record's, if empty_ratio is out of [0. ,1.), do not sample the
+            records and use all the empty entries. 1. as default
+    """
+
+    def __init__(self,
+                 dataset_dir=None,
+                 image_dir=None,
+                 anno_path=None,
+                 data_fields=['image'],
+                 sample_num=-1,
+                 load_crowd=False,
+                 allow_empty=False,
+                 empty_ratio=1.):
+        super(COCODataSet, self).__init__(dataset_dir, image_dir, anno_path,
+                                          data_fields, sample_num)
+        self.load_image_only = False
+        self.load_semantic = False
+        self.load_crowd = load_crowd
+        self.allow_empty = allow_empty
+        self.empty_ratio = empty_ratio
+
+    def _sample_empty(self, records, num):
+        """
+        Randomly sub-sample the empty records.
+
+        Args:
+            records (list): records without any valid ground-truth box.
+            num (int): number of non-empty records.
+
+        Returns:
+            list: ``records`` unchanged if empty_ratio is out of [0., 1.),
+                otherwise a random sample of at most
+                ``num * empty_ratio / (1 - empty_ratio)`` records.
+        """
+        # if empty_ratio is out of [0. ,1.), do not sample the records
+        if self.empty_ratio < 0. or self.empty_ratio >= 1.:
+            return records
+        import random
+        sample_num = min(
+            int(num * self.empty_ratio / (1 - self.empty_ratio)), len(records))
+        records = random.sample(records, sample_num)
+        return records
+
+    @staticmethod
+    def _has_blank_segmentation(box):
+        """
+        Tell whether a non-crowd annotation carries a segmentation that
+        holds no coordinate at all (e.g. ``[[]]``).
+
+        ``dtype=object`` keeps polygons of different lengths from being
+        forced into a rectangular array.
+        """
+        if box.get('iscrowd') == 1:
+            return False
+        segmentation = box.get('segmentation')
+        if not segmentation:
+            return False
+        return np.array(segmentation, dtype=object).size == 0
+
+    def parse_dataset(self):
+        """
+        Parse the COCO annotation file and store the records in
+        ``self.roidbs``.
+
+        Also fills ``self.catid2clsid`` and ``self.cname2cid``. Images whose
+        file does not exist or whose annotated size is negative are skipped.
+        Images without any valid box are skipped unless ``allow_empty`` is
+        set, in which case they are sub-sampled by ``_sample_empty``.
+        """
+        anno_path = os.path.join(self.dataset_dir, self.anno_path)
+        image_dir = os.path.join(self.dataset_dir, self.image_dir)
+
+        assert anno_path.endswith('.json'), \
+            'invalid coco annotation file: ' + anno_path
+        from pycocotools.coco import COCO
+        coco = COCO(anno_path)
+        img_ids = coco.getImgIds()
+        img_ids.sort()
+        cat_ids = coco.getCatIds()
+        records = []
+        empty_records = []
+        ct = 0
+
+        self.catid2clsid = {catid: i for i, catid in enumerate(cat_ids)}
+        self.cname2cid = {
+            coco.loadCats(catid)[0]['name']: clsid
+            for catid, clsid in self.catid2clsid.items()
+        }
+
+        if 'annotations' not in coco.dataset:
+            self.load_image_only = True
+            logger.warning(
+                'Annotation file: {} does not contains ground truth '
+                'and load image information only.'.format(anno_path))
+
+        for img_id in img_ids:
+            img_anno = coco.loadImgs([img_id])[0]
+            im_fname = img_anno['file_name']
+            im_w = float(img_anno['width'])
+            im_h = float(img_anno['height'])
+
+            im_path = os.path.join(image_dir,
+                                   im_fname) if image_dir else im_fname
+            is_empty = False
+            if not os.path.exists(im_path):
+                logger.warning('Illegal image file: {}, and it will be '
+                               'ignored'.format(im_path))
+                continue
+
+            if im_w < 0 or im_h < 0:
+                logger.warning(
+                    'Illegal width: {} or height: {} in annotation, '
+                    'and im_id: {} will be ignored'.format(im_w, im_h, img_id))
+                continue
+
+            coco_rec = {
+                'im_file': im_path,
+                'im_id': np.array([img_id]),
+                'h': im_h,
+                'w': im_w,
+            } if 'image' in self.data_fields else {}
+
+            if not self.load_image_only:
+                ins_anno_ids = coco.getAnnIds(
+                    imgIds=[img_id],
+                    iscrowd=None if self.load_crowd else False)
+                instances = coco.loadAnns(ins_anno_ids)
+
+                bboxes = []
+                is_rbox_anno = False
+                for inst in instances:
+                    # check gt bbox
+                    if inst.get('ignore', False):
+                        continue
+                    if 'bbox' not in inst or not any(np.array(inst['bbox'])):
+                        continue
+
+                    # read rbox anno or not
+                    is_rbox_anno = len(inst['bbox']) == 5
+                    if is_rbox_anno:
+                        xc, yc, box_w, box_h, angle = inst['bbox']
+                        x1 = xc - box_w / 2.0
+                        y1 = yc - box_h / 2.0
+                    else:
+                        x1, y1, box_w, box_h = inst['bbox']
+                    x2 = x1 + box_w
+                    y2 = y1 + box_h
+                    eps = 1e-5
+                    if inst['area'] > 0 and x2 - x1 > eps and y2 - y1 > eps:
+                        inst['clean_bbox'] = [
+                            round(float(x), 3) for x in [x1, y1, x2, y2]
+                        ]
+                        if is_rbox_anno:
+                            inst['clean_rbox'] = [xc, yc, box_w, box_h, angle]
+                        bboxes.append(inst)
+                    else:
+                        logger.warning(
+                            'Found an invalid bbox in annotations: im_id: {}, '
+                            'area: {} x1: {}, y1: {}, x2: {}, y2: {}.'.format(
+                                img_id, float(inst['area']), x1, y1, x2, y2))
+
+                # Drop boxes with a blank segmentation BEFORE the arrays are
+                # allocated, so that gt_poly and the gt_* arrays always stay
+                # index-aligned (popping inside the fill loop skipped
+                # elements and left the arrays at their old length).
+                if not self.allow_empty:
+                    bboxes = [
+                        box for box in bboxes
+                        if not self._has_blank_segmentation(box)
+                    ]
+
+                num_bbox = len(bboxes)
+                if num_bbox <= 0 and not self.allow_empty:
+                    continue
+                elif num_bbox <= 0:
+                    is_empty = True
+
+                gt_bbox = np.zeros((num_bbox, 4), dtype=np.float32)
+                if is_rbox_anno:
+                    gt_rbox = np.zeros((num_bbox, 5), dtype=np.float32)
+                gt_class = np.zeros((num_bbox, 1), dtype=np.int32)
+                is_crowd = np.zeros((num_bbox, 1), dtype=np.int32)
+                gt_poly = [None] * num_bbox
+
+                has_segmentation = False
+                for i, box in enumerate(bboxes):
+                    catid = box['category_id']
+                    gt_class[i][0] = self.catid2clsid[catid]
+                    gt_bbox[i, :] = box['clean_bbox']
+                    # xc, yc, w, h, theta
+                    if is_rbox_anno:
+                        gt_rbox[i, :] = box['clean_rbox']
+                    is_crowd[i][0] = box['iscrowd']
+                    # check RLE format
+                    if 'segmentation' in box and box['iscrowd'] == 1:
+                        # crowd regions get a placeholder polygon
+                        gt_poly[i] = [[0.0, 0.0, 0.0, 0.0, 0.0, 0.0]]
+                    elif 'segmentation' in box and box['segmentation']:
+                        gt_poly[i] = box['segmentation']
+                        has_segmentation = True
+
+                if has_segmentation and not any(
+                        gt_poly) and not self.allow_empty:
+                    continue
+
+                gt_rec = {
+                    'is_crowd': is_crowd,
+                    'gt_class': gt_class,
+                    'gt_bbox': gt_bbox,
+                }
+                if is_rbox_anno:
+                    gt_rec['gt_rbox'] = gt_rbox
+                gt_rec['gt_poly'] = gt_poly
+
+                for k, v in gt_rec.items():
+                    if k in self.data_fields:
+                        coco_rec[k] = v
+
+                # TODO: remove load_semantic
+                if self.load_semantic and 'semantic' in self.data_fields:
+                    seg_path = os.path.join(self.dataset_dir, 'stuffthingmaps',
+                                            'train2017', im_fname[:-3] + 'png')
+                    coco_rec.update({'semantic': seg_path})
+
+            logger.debug('Load file: {}, im_id: {}, h: {}, w: {}.'.format(
+                im_path, img_id, im_h, im_w))
+            if is_empty:
+                empty_records.append(coco_rec)
+            else:
+                records.append(coco_rec)
+            ct += 1
+            if self.sample_num > 0 and ct >= self.sample_num:
+                break
+        assert ct > 0, 'not found any coco record in %s' % (anno_path)
+        logger.debug('{} samples in file {}'.format(ct, anno_path))
+        if self.allow_empty and len(empty_records) > 0:
+            empty_records = self._sample_empty(empty_records, len(records))
+            records += empty_records
+        self.roidbs = records

+ 197 - 0
paddlers/models/ppdet/data/source/dataset.py

@@ -0,0 +1,197 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+import numpy as np
+
+try:
+    from collections.abc import Sequence
+except Exception:
+    from collections import Sequence
+from paddle.io import Dataset
+from paddlers.models.ppdet.core.workspace import register, serializable
+from paddlers.models.ppdet.utils.download import get_dataset_path
+import copy
+
+
+@serializable
+class DetDataset(Dataset):
+    """
+    Load detection dataset.
+
+    Args:
+        dataset_dir (str): root directory for dataset.
+        image_dir (str): directory for images.
+        anno_path (str): annotation file path.
+        data_fields (list): key name of data dictionary, at least have 'image'.
+        sample_num (int): number of samples to load, -1 means all.
+        use_default_label (bool): whether to load default label list.
+    """
+
+    def __init__(self,
+                 dataset_dir=None,
+                 image_dir=None,
+                 anno_path=None,
+                 data_fields=['image'],
+                 sample_num=-1,
+                 use_default_label=None,
+                 **kwargs):
+        super(DetDataset, self).__init__()
+        self.dataset_dir = dataset_dir if dataset_dir is not None else ''
+        self.anno_path = anno_path
+        self.image_dir = image_dir if image_dir is not None else ''
+        self.data_fields = data_fields
+        self.sample_num = sample_num
+        self.use_default_label = use_default_label
+        self._epoch = 0
+        self._curr_iter = 0
+        # Same defaults as set_kwargs(): with -1 none of the mixup / cutmix /
+        # mosaic branches of __getitem__ is taken. Setting them here keeps
+        # __getitem__ from failing with AttributeError when set_kwargs()
+        # has not been called.
+        self.mixup_epoch = -1
+        self.cutmix_epoch = -1
+        self.mosaic_epoch = -1
+
+    def __len__(self, ):
+        """Return the number of records in ``self.roidbs``."""
+        return len(self.roidbs)
+
+    def __getitem__(self, idx):
+        """
+        Fetch the record at ``idx`` and pass it through ``self.transform``.
+
+        While mixup or cutmix is active one extra randomly chosen record is
+        attached, while mosaic is active three extra records are attached;
+        in those cases a list of records is handed to the transform. Every
+        record is deep-copied and tagged with the current iteration count
+        under the key 'curr_iter'.
+        """
+        # data batch
+        roidb = copy.deepcopy(self.roidbs[idx])
+        if self.mixup_epoch == 0 or self._epoch < self.mixup_epoch:
+            n = len(self.roidbs)
+            idx = np.random.randint(n)
+            roidb = [roidb, copy.deepcopy(self.roidbs[idx])]
+        elif self.cutmix_epoch == 0 or self._epoch < self.cutmix_epoch:
+            n = len(self.roidbs)
+            idx = np.random.randint(n)
+            roidb = [roidb, copy.deepcopy(self.roidbs[idx])]
+        elif self.mosaic_epoch == 0 or self._epoch < self.mosaic_epoch:
+            n = len(self.roidbs)
+            roidb = [roidb, ] + [
+                copy.deepcopy(self.roidbs[np.random.randint(n)])
+                for _ in range(3)
+            ]
+        if isinstance(roidb, Sequence):
+            for r in roidb:
+                r['curr_iter'] = self._curr_iter
+        else:
+            roidb['curr_iter'] = self._curr_iter
+        self._curr_iter += 1
+
+        return self.transform(roidb)
+
+    def check_or_download_dataset(self):
+        """Replace ``self.dataset_dir`` with the result of get_dataset_path."""
+        self.dataset_dir = get_dataset_path(self.dataset_dir, self.anno_path,
+                                            self.image_dir)
+
+    def set_kwargs(self, **kwargs):
+        """
+        Read 'mixup_epoch', 'cutmix_epoch' and 'mosaic_epoch' from kwargs,
+        each falling back to -1 when absent.
+        """
+        self.mixup_epoch = kwargs.get('mixup_epoch', -1)
+        self.cutmix_epoch = kwargs.get('cutmix_epoch', -1)
+        self.mosaic_epoch = kwargs.get('mosaic_epoch', -1)
+
+    def set_transform(self, transform):
+        """Set the callable applied to every record in __getitem__."""
+        self.transform = transform
+
+    def set_epoch(self, epoch_id):
+        """Record the current epoch, compared against the *_epoch limits."""
+        self._epoch = epoch_id
+
+    def parse_dataset(self, ):
+        """Load the records; must be implemented by subclasses."""
+        raise NotImplementedError(
+            "Need to implement parse_dataset method of Dataset")
+
+    def get_anno(self):
+        """
+        Return the annotation path joined onto ``dataset_dir``, or None
+        when ``anno_path`` is None.
+        """
+        if self.anno_path is None:
+            return
+        return os.path.join(self.dataset_dir, self.anno_path)
+
+
+def _is_valid_file(f, extensions=('.jpg', '.jpeg', '.png', '.bmp')):
+    """
+    Check whether path ``f``, lower-cased, ends with one of ``extensions``.
+    """
+    lowered = f.lower()
+    return lowered.endswith(extensions)
+
+
+def _make_dataset(dir):
+    """
+    Recursively collect the image files found under a directory.
+
+    Args:
+        dir (str): directory to scan, '~' is expanded. Symbolic links are
+            followed.
+
+    Returns:
+        list: paths accepted by ``_is_valid_file``, directories and file
+            names visited in sorted order.
+
+    Raises:
+        NotADirectoryError: if ``dir`` is not an existing directory.
+    """
+    dir = os.path.expanduser(dir)
+    if not os.path.isdir(dir):
+        # raising a bare string is itself a TypeError, use a real exception
+        raise NotADirectoryError('{} should be a dir'.format(dir))
+    images = []
+    for root, _, fnames in sorted(os.walk(dir, followlinks=True)):
+        for fname in sorted(fnames):
+            path = os.path.join(root, fname)
+            if _is_valid_file(path):
+                images.append(path)
+    return images
+
+
+@register
+@serializable
+class ImageFolder(DetDataset):
+    """
+    Dataset built from image files or image directories, without labels.
+
+    Args:
+        dataset_dir (str): root directory for dataset.
+        image_dir (str|list): image file / directory, or a list of them.
+        anno_path (str): annotation file path.
+        sample_num (int): number of samples to load, -1 means all.
+        use_default_label (bool): whether to load default label list.
+    """
+
+    def __init__(self,
+                 dataset_dir=None,
+                 image_dir=None,
+                 anno_path=None,
+                 sample_num=-1,
+                 use_default_label=None,
+                 **kwargs):
+        super(ImageFolder, self).__init__(
+            dataset_dir,
+            image_dir,
+            anno_path,
+            sample_num=sample_num,
+            use_default_label=use_default_label)
+        self._imid2path = {}
+        self.roidbs = None
+
+    def check_or_download_dataset(self):
+        if self.dataset_dir:
+            # NOTE: ImageFolder is only used for prediction, in
+            #       infer mode, image_dir is set by set_images
+            #       so we only check anno_path here
+            self.dataset_dir = get_dataset_path(self.dataset_dir,
+                                                self.anno_path, None)
+
+    def parse_dataset(self, ):
+        """Load the image records unless they are already loaded."""
+        if not self.roidbs:
+            self.roidbs = self._load_images()
+
+    def _parse(self):
+        """
+        Expand ``self.image_dir`` into a flat list of image paths.
+
+        Returns:
+            list: for every entry that is a directory, the images found by
+                ``_make_dataset`` under ``dataset_dir``/entry; for every
+                entry that is a valid image file, the entry itself.
+        """
+        image_dir = self.image_dir
+        # A str is a Sequence too: wrap it explicitly, otherwise a single
+        # path would be iterated character by character.
+        if isinstance(image_dir, str) or not isinstance(image_dir, Sequence):
+            image_dir = [image_dir]
+        images = []
+        for im_dir in image_dir:
+            if os.path.isdir(im_dir):
+                im_dir = os.path.join(self.dataset_dir, im_dir)
+                images.extend(_make_dataset(im_dir))
+            elif os.path.isfile(im_dir) and _is_valid_file(im_dir):
+                images.append(im_dir)
+        return images
+
+    def _load_images(self):
+        """
+        Build one record per image and register it in ``self._imid2path``.
+
+        Returns:
+            list: dicts with 'im_id' (1-element array holding a running
+                index) and 'im_file'; at most ``sample_num`` records when
+                ``sample_num`` is positive.
+        """
+        images = self._parse()
+        ct = 0
+        records = []
+        for image in images:
+            assert image != '' and os.path.isfile(image), \
+                    "Image {} not found".format(image)
+            if self.sample_num > 0 and ct >= self.sample_num:
+                break
+            rec = {'im_id': np.array([ct]), 'im_file': image}
+            self._imid2path[ct] = image
+            ct += 1
+            records.append(rec)
+        assert len(records) > 0, "No image file found"
+        return records
+
+    def get_imid2path(self):
+        """Return the mapping from image index to image path."""
+        return self._imid2path
+
+    def set_images(self, images):
+        """Replace the image source and reload the records immediately."""
+        self.image_dir = images
+        self.roidbs = self._load_images()

+ 669 - 0
paddlers/models/ppdet/data/source/keypoint_coco.py

@@ -0,0 +1,669 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+This code is based on https://github.com/open-mmlab/mmpose
+"""
+import os
+import cv2
+import numpy as np
+import json
+import copy
+import pycocotools
+from pycocotools.coco import COCO
+from .dataset import DetDataset
+from paddlers.models.ppdet.core.workspace import register, serializable
+
+
+@serializable
+class KeypointBottomUpBaseDataset(DetDataset):
+    """Base class for bottom-up datasets.
+
+    All datasets should subclass it.
+    All subclasses should overwrite:
+        Methods:`_get_imganno`
+
+    Args:
+        dataset_dir (str): Root path to the dataset.
+        anno_path (str): Relative path to the annotation file.
+        image_dir (str): Path to a directory where images are held.
+            Default: None.
+        num_joints (int): keypoint numbers
+        transform (composed(operators)): A sequence of data transforms.
+        shard (list): [rank, worldsize], the distributed env params
+        test_mode (bool): Store True when building test or
+            validation dataset. Default: False.
+    """
+
+    def __init__(self,
+                 dataset_dir,
+                 image_dir,
+                 anno_path,
+                 num_joints,
+                 transform=[],
+                 shard=[0, 1],
+                 test_mode=False):
+        super().__init__(dataset_dir, image_dir, anno_path)
+        self.image_info = {}
+        self.ann_info = {}
+
+        self.img_prefix = os.path.join(dataset_dir, image_dir)
+        self.transform = transform
+        self.test_mode = test_mode
+
+        self.ann_info['num_joints'] = num_joints
+        self.img_ids = []
+
+    def parse_dataset(self):
+        """Nothing to parse in the base class (was defined twice before)."""
+        return
+
+    def __len__(self):
+        """Get dataset length."""
+        return len(self.img_ids)
+
+    def _get_imganno(self, idx):
+        """Get anno for a single image."""
+        raise NotImplementedError
+
+    def __getitem__(self, idx):
+        """Prepare image for training given the index.
+
+        The record from `_get_imganno` is deep-copied, the image is read
+        from 'image_file' and converted from BGR to RGB, the mask is cast
+        to uint8, then the record goes through `self.transform`.
+        """
+        records = copy.deepcopy(self._get_imganno(idx))
+        records['image'] = cv2.imread(records['image_file'])
+        records['image'] = cv2.cvtColor(records['image'], cv2.COLOR_BGR2RGB)
+        # "+ 0" turns the boolean mask into integers before the cast
+        records['mask'] = (records['mask'] + 0).astype('uint8')
+        records = self.transform(records)
+        return records
+
+
+@register
+@serializable
+class KeypointBottomUpCocoDataset(KeypointBottomUpBaseDataset):
+    """COCO dataset for bottom-up pose estimation.
+
+    The dataset loads raw features and applies the specified transforms
+    to return a dict containing the image tensors and other information.
+
+    COCO keypoint indexes::
+
+        0: 'nose',
+        1: 'left_eye',
+        2: 'right_eye',
+        3: 'left_ear',
+        4: 'right_ear',
+        5: 'left_shoulder',
+        6: 'right_shoulder',
+        7: 'left_elbow',
+        8: 'right_elbow',
+        9: 'left_wrist',
+        10: 'right_wrist',
+        11: 'left_hip',
+        12: 'right_hip',
+        13: 'left_knee',
+        14: 'right_knee',
+        15: 'left_ankle',
+        16: 'right_ankle'
+
+    Args:
+        dataset_dir (str): Root path to the dataset.
+        anno_path (str): Relative path to the annotation file.
+        image_dir (str): Path to a directory where images are held.
+            Default: None.
+        num_joints (int): keypoint numbers
+        transform (composed(operators)): A sequence of data transforms.
+        shard (list): [rank, worldsize], the distributed env params
+        test_mode (bool): Store True when building test or
+            validation dataset. Default: False.
+    """
+
+    def __init__(self,
+                 dataset_dir,
+                 image_dir,
+                 anno_path,
+                 num_joints,
+                 transform=[],
+                 shard=[0, 1],
+                 test_mode=False):
+        super().__init__(dataset_dir, image_dir, anno_path, num_joints,
+                         transform, shard, test_mode)
+
+        self.ann_file = os.path.join(dataset_dir, anno_path)
+        self.shard = shard
+        self.test_mode = test_mode
+
+    def parse_dataset(self):
+        """Load the annotation file and keep this shard's image ids."""
+        self.coco = COCO(self.ann_file)
+
+        img_ids = self.coco.getImgIds()
+        if not self.test_mode:
+            # outside test mode, images without any annotation are dropped
+            img_ids = [
+                img_id for img_id in img_ids
+                if len(self.coco.getAnnIds(
+                    imgIds=img_id, iscrowd=None)) > 0
+            ]
+
+        # every shard takes one contiguous block of equal size
+        rank = self.shard[0]
+        blocknum = int(len(img_ids) / self.shard[1])
+        begin = blocknum * rank
+        end = blocknum * (rank + 1)
+        self.img_ids = img_ids[begin:end]
+        self.num_images = len(self.img_ids)
+
+        mapping = self._get_mapping_id_name(self.coco.imgs)
+        self.id2name, self.name2id = mapping
+        self.dataset_name = 'coco'
+
+        self.catid2clsid = {
+            catid: clsid
+            for clsid, catid in enumerate(self.coco.getCatIds())
+        }
+        print('=> num_images: {}'.format(self.num_images))
+
+    @staticmethod
+    def _get_mapping_id_name(imgs):
+        """
+        Args:
+            imgs (dict): dict of image info.
+
+        Returns:
+            tuple: Image name & id mapping dicts.
+
+            - id2name (dict): Mapping image id to name.
+            - name2id (dict): Mapping image name to id.
+        """
+        id2name = {
+            image_id: image['file_name']
+            for image_id, image in imgs.items()
+        }
+        name2id = {
+            image['file_name']: image_id
+            for image_id, image in imgs.items()
+        }
+        return id2name, name2id
+
+    def _get_imganno(self, idx):
+        """Get anno for a single image.
+
+        Args:
+            idx (int): image idx
+
+        Returns:
+            dict: info for model training
+        """
+        img_id = self.img_ids[idx]
+        anno = self.coco.loadAnns(self.coco.getAnnIds(imgIds=img_id))
+
+        # the mask is computed from ALL annotations, before filtering
+        mask = self._get_mask(anno, idx)
+        kept = []
+        for obj in anno:
+            if obj['iscrowd'] == 0 or obj['num_keypoints'] > 0:
+                kept.append(obj)
+
+        joints, orgsize = self._get_joints(kept, idx)
+
+        return {
+            'im_id': img_id,
+            'image_file': os.path.join(self.img_prefix,
+                                       self.id2name[img_id]),
+            'mask': mask,
+            'joints': joints,
+            'im_shape': orgsize,
+        }
+
+    def _get_joints(self, anno, idx):
+        """Get joints for all people in an image."""
+        num_joints = self.ann_info['num_joints']
+        joints = np.zeros((len(anno), num_joints, 3), dtype=np.float32)
+
+        for person, obj in enumerate(anno):
+            keypoints = np.array(obj['keypoints']).reshape([-1, 3])
+            joints[person, :num_joints, :3] = keypoints
+
+        img_info = self.coco.loadImgs(self.img_ids[idx])[0]
+        height = img_info['height']
+        width = img_info['width']
+        # x is divided by the image width, y by the image height
+        joints[..., 0] /= width
+        joints[..., 1] /= height
+        orgsize = np.array([height, width])
+
+        return joints, orgsize
+
+    def _get_mask(self, anno, idx):
+        """Get ignore masks to mask out losses."""
+        img_info = self.coco.loadImgs(self.img_ids[idx])[0]
+        height = img_info['height']
+        width = img_info['width']
+
+        m = np.zeros((height, width), dtype=np.float32)
+
+        for obj in anno:
+            if 'segmentation' not in obj:
+                continue
+            if obj['iscrowd']:
+                rle = pycocotools.mask.frPyObjects(obj['segmentation'],
+                                                   height, width)
+                m += pycocotools.mask.decode(rle)
+            elif obj['num_keypoints'] == 0:
+                rles = pycocotools.mask.frPyObjects(obj['segmentation'],
+                                                    height, width)
+                for rle in rles:
+                    m += pycocotools.mask.decode(rle)
+
+        # True where no crowd / keypoint-less region was accumulated
+        return m < 0.5
+
+
+@register
+@serializable
+class KeypointBottomUpCrowdPoseDataset(KeypointBottomUpCocoDataset):
+    """CrowdPose dataset for bottom-up pose estimation.
+
+    The dataset loads raw features and applies the specified transforms
+    to return a dict containing the image tensors and other information.
+
+    CrowdPose keypoint indexes::
+
+        0: 'left_shoulder',
+        1: 'right_shoulder',
+        2: 'left_elbow',
+        3: 'right_elbow',
+        4: 'left_wrist',
+        5: 'right_wrist',
+        6: 'left_hip',
+        7: 'right_hip',
+        8: 'left_knee',
+        9: 'right_knee',
+        10: 'left_ankle',
+        11: 'right_ankle',
+        12: 'top_head',
+        13: 'neck'
+
+    Args:
+        dataset_dir (str): Root path to the dataset.
+        anno_path (str): Relative path to the annotation file.
+        image_dir (str): Path to a directory where images are held.
+            Default: None.
+        num_joints (int): keypoint numbers
+        transform (composed(operators)): A sequence of data transforms.
+        shard (list): [rank, worldsize], the distributed env params
+        test_mode (bool): Store True when building test or
+            validation dataset. Default: False.
+    """
+
+    def __init__(self,
+                 dataset_dir,
+                 image_dir,
+                 anno_path,
+                 num_joints,
+                 transform=[],
+                 shard=[0, 1],
+                 test_mode=False):
+        super().__init__(dataset_dir, image_dir, anno_path, num_joints,
+                         transform, shard, test_mode)
+
+        self.ann_file = os.path.join(dataset_dir, anno_path)
+        self.shard = shard
+        self.test_mode = test_mode
+
+    def parse_dataset(self):
+        """Load the annotation file and keep this shard's image ids."""
+        self.coco = COCO(self.ann_file)
+
+        img_ids = self.coco.getImgIds()
+        if not self.test_mode:
+            # outside test mode, images without any annotation are dropped
+            img_ids = [
+                img_id for img_id in img_ids
+                if len(self.coco.getAnnIds(
+                    imgIds=img_id, iscrowd=None)) > 0
+            ]
+
+        # every shard takes one contiguous block of equal size
+        rank = self.shard[0]
+        blocknum = int(len(img_ids) / self.shard[1])
+        begin = blocknum * rank
+        end = blocknum * (rank + 1)
+        self.img_ids = img_ids[begin:end]
+        self.num_images = len(self.img_ids)
+
+        mapping = self._get_mapping_id_name(self.coco.imgs)
+        self.id2name, self.name2id = mapping
+
+        self.dataset_name = 'crowdpose'
+        print('=> num_images: {}'.format(self.num_images))
+
+
+@serializable
+class KeypointTopDownBaseDataset(DetDataset):
+    """Base class for top_down datasets.
+
+    All datasets should subclass it.
+    All subclasses should overwrite:
+        Methods:`_get_db`
+
+    Args:
+        dataset_dir (str): Root path to the dataset.
+        image_dir (str): Path to a directory where images are held.
+        anno_path (str): Relative path to the annotation file.
+        num_joints (int): keypoint numbers
+        transform (composed(operators)): A sequence of data transforms.
+    """
+
+    def __init__(self,
+                 dataset_dir,
+                 image_dir,
+                 anno_path,
+                 num_joints,
+                 transform=[]):
+        super().__init__(dataset_dir, image_dir, anno_path)
+        self.image_info = {}
+        self.ann_info = {'num_joints': num_joints}
+        self.db = []
+
+        self.img_prefix = os.path.join(dataset_dir, image_dir)
+        self.transform = transform
+
+    def __len__(self):
+        """Get dataset length."""
+        return len(self.db)
+
+    def _get_db(self):
+        """Get a sample"""
+        raise NotImplementedError
+
+    def __getitem__(self, idx):
+        """Prepare sample for training given the index.
+
+        The db entry is deep-copied, its image is read from 'image_file'
+        and converted from BGR to RGB, 'score' falls back to 1 when
+        missing, then the record goes through `self.transform`.
+        """
+        records = copy.deepcopy(self.db[idx])
+        read_flags = cv2.IMREAD_COLOR | cv2.IMREAD_IGNORE_ORIENTATION
+        bgr_image = cv2.imread(records['image_file'], read_flags)
+        records['image'] = bgr_image
+        records['image'] = cv2.cvtColor(bgr_image, cv2.COLOR_BGR2RGB)
+        if 'score' not in records:
+            records['score'] = 1
+        return self.transform(records)
+
+
+@register
+@serializable
+class KeypointTopDownCocoDataset(KeypointTopDownBaseDataset):
+    """COCO dataset for top-down pose estimation.
+
+    The dataset loads raw features and applies the specified transforms
+    to return a dict containing the image tensors and other information.
+
+    COCO keypoint indexes:
+
+        0: 'nose',
+        1: 'left_eye',
+        2: 'right_eye',
+        3: 'left_ear',
+        4: 'right_ear',
+        5: 'left_shoulder',
+        6: 'right_shoulder',
+        7: 'left_elbow',
+        8: 'right_elbow',
+        9: 'left_wrist',
+        10: 'right_wrist',
+        11: 'left_hip',
+        12: 'right_hip',
+        13: 'left_knee',
+        14: 'right_knee',
+        15: 'left_ankle',
+        16: 'right_ankle'
+
+    Args:
+        dataset_dir (str): Root path to the dataset.
+        image_dir (str): Path to a directory where images are held.
+        anno_path (str): Relative path to the annotation file.
+        num_joints (int): Number of keypoints.
+        trainsize (list): [w, h] image target size.
+        transform (composed(operators)): A sequence of data transforms.
+        bbox_file (str): Path (relative to ``dataset_dir``) to a detection
+            bbox file. Default: None.
+        use_gt_bbox (bool): Whether to use ground truth bboxes.
+            Default: True.
+        pixel_std (int): The pixel std of the scale.
+            Default: 200.
+        image_thre (float): Score threshold below which detection boxes
+            are discarded. Default: 0.0.
+    """
+
+    def __init__(self,
+                 dataset_dir,
+                 image_dir,
+                 anno_path,
+                 num_joints,
+                 trainsize,
+                 transform=[],
+                 bbox_file=None,
+                 use_gt_bbox=True,
+                 pixel_std=200,
+                 image_thre=0.0):
+        # NOTE: the ``transform=[]`` default is kept for interface
+        # compatibility; it is only forwarded here, never mutated.
+        super().__init__(dataset_dir, image_dir, anno_path, num_joints,
+                         transform)
+
+        self.bbox_file = bbox_file
+        self.use_gt_bbox = use_gt_bbox
+        self.trainsize = trainsize
+        self.pixel_std = pixel_std
+        self.image_thre = image_thre
+        self.dataset_name = 'coco'
+
+    def parse_dataset(self):
+        """Fill ``self.db`` from ground-truth or detected person boxes."""
+        if self.use_gt_bbox:
+            self.db = self._load_coco_keypoint_annotations()
+        else:
+            self.db = self._load_coco_person_detection_results()
+
+    def _load_coco_keypoint_annotations(self):
+        """Build one record per usable person annotation.
+
+        Boxes are clipped to the image. Annotations are dropped when their
+        area is not positive, when the clipped box is empty, or when the
+        largest value in their keypoint list is 0.
+
+        Returns:
+            list[dict]: Records with the keys 'image_file', 'center',
+            'scale', 'joints', 'joints_vis' and 'im_id'.
+        """
+        coco = COCO(self.get_anno())
+        num_joints = self.ann_info['num_joints']
+        gt_db = []
+        for index in coco.getImgIds():
+            im_ann = coco.loadImgs(index)[0]
+            width = im_ann['width']
+            height = im_ann['height']
+            image_file = os.path.join(self.img_prefix, im_ann['file_name'])
+            im_id = int(im_ann["id"])
+
+            ann_ids = coco.getAnnIds(imgIds=index, iscrowd=False)
+            for obj in coco.loadAnns(ann_ids):
+                # Clip the [x, y, w, h] box to the image bounds.
+                x, y, w, h = obj['bbox']
+                x1 = np.max((0, x))
+                y1 = np.max((0, y))
+                x2 = np.min((width - 1, x1 + np.max((0, w - 1))))
+                y2 = np.min((height - 1, y1 + np.max((0, h - 1))))
+                if not (obj['area'] > 0 and x2 >= x1 and y2 >= y1):
+                    continue
+                if max(obj['keypoints']) == 0:
+                    continue
+
+                # Keypoints are stored flat as (x, y, visibility) triples.
+                # np.float64 replaces the np.float alias, which was removed
+                # in NumPy 1.24.
+                kpts = np.array(
+                    obj['keypoints'][:num_joints * 3],
+                    dtype=np.float64).reshape(num_joints, 3)
+                joints = np.zeros((num_joints, 3), dtype=np.float64)
+                joints[:, 0:2] = kpts[:, 0:2]
+                joints_vis = np.zeros((num_joints, 3), dtype=np.float64)
+                # Visibility flags larger than 1 are capped at 1.
+                joints_vis[:, 0:2] = np.minimum(kpts[:, 2], 1)[:, None]
+
+                center, scale = self._box2cs([x1, y1, x2 - x1, y2 - y1])
+                gt_db.append({
+                    'image_file': image_file,
+                    'center': center,
+                    'scale': scale,
+                    'joints': joints,
+                    'joints_vis': joints_vis,
+                    'im_id': im_id,
+                })
+
+        return gt_db
+
+    def _box2cs(self, box):
+        """Convert an [x, y, w, h] box into a (center, scale) pair.
+
+        The box is grown along one side to match the aspect ratio of
+        ``self.trainsize``, expressed in units of ``self.pixel_std`` and
+        enlarged by a factor of 1.25.
+
+        Args:
+            box (sequence): At least four values, [x, y, w, h].
+
+        Returns:
+            tuple: ``center`` and ``scale``, both float32 arrays of shape (2,).
+        """
+        x, y, w, h = box[:4]
+        center = np.array([x + w * 0.5, y + h * 0.5], dtype=np.float32)
+        aspect_ratio = self.trainsize[0] * 1.0 / self.trainsize[1]
+
+        if w > aspect_ratio * h:
+            h = w * 1.0 / aspect_ratio
+        elif w < aspect_ratio * h:
+            w = h * aspect_ratio
+        scale = np.array(
+            [w * 1.0 / self.pixel_std, h * 1.0 / self.pixel_std],
+            dtype=np.float32)
+        if center[0] != -1:
+            scale = scale * 1.25
+
+        return center, scale
+
+    def _load_coco_person_detection_results(self):
+        """Build one record per detected person box from ``bbox_file``.
+
+        Only detections with ``category_id == 1`` and a score of at least
+        ``self.image_thre`` are kept.
+
+        Returns:
+            list[dict] | None: Records with the keys 'image_file', 'im_id',
+            'center', 'scale', 'score', 'joints' and 'joints_vis', or None
+            when the detection file holds no boxes.
+        """
+        bbox_file_path = os.path.join(self.dataset_dir, self.bbox_file)
+        with open(bbox_file_path, 'r') as f:
+            all_boxes = json.load(f)
+
+        if not all_boxes:
+            print('=> Load %s fail!' % bbox_file_path)
+            return None
+
+        num_joints = self.ann_info['num_joints']
+        kpt_db = []
+        for det_res in all_boxes:
+            if det_res['category_id'] != 1:
+                continue
+            # Fall back to the COCO naming scheme when no filename is given.
+            if 'filename' in det_res:
+                file_name = det_res['filename']
+            else:
+                file_name = '%012d.jpg' % det_res['image_id']
+            img_name = os.path.join(self.img_prefix, file_name)
+            box = det_res['bbox']
+            score = det_res['score']
+            im_id = int(det_res['image_id'])
+
+            if score < self.image_thre:
+                continue
+
+            center, scale = self._box2cs(box)
+            # No ground-truth joints exist for detections: zero placeholders,
+            # all marked visible.
+            joints = np.zeros((num_joints, 3), dtype=np.float64)
+            joints_vis = np.ones((num_joints, 3), dtype=np.float64)
+            kpt_db.append({
+                'image_file': img_name,
+                'im_id': im_id,
+                'center': center,
+                'scale': scale,
+                'score': score,
+                'joints': joints,
+                'joints_vis': joints_vis,
+            })
+
+        return kpt_db
+
+
+@register
+@serializable
+class KeypointTopDownMPIIDataset(KeypointTopDownBaseDataset):
+    """MPII dataset for top-down pose estimation.
+
+    The dataset loads raw features and applies the specified transforms
+    to return a dict containing the image tensors and other information.
+
+    MPII keypoint indexes:
+
+        0: 'right_ankle',
+        1: 'right_knee',
+        2: 'right_hip',
+        3: 'left_hip',
+        4: 'left_knee',
+        5: 'left_ankle',
+        6: 'pelvis',
+        7: 'thorax',
+        8: 'upper_neck',
+        9: 'head_top',
+        10: 'right_wrist',
+        11: 'right_elbow',
+        12: 'right_shoulder',
+        13: 'left_shoulder',
+        14: 'left_elbow',
+        15: 'left_wrist',
+
+    Args:
+        dataset_dir (str): Root path to the dataset.
+        image_dir (str): Path to a directory where images are held.
+        anno_path (str): Relative path to the annotation file.
+        num_joints (int): Number of keypoints.
+        transform (composed(operators)): A sequence of data transforms.
+    """
+
+    def __init__(self,
+                 dataset_dir,
+                 image_dir,
+                 anno_path,
+                 num_joints,
+                 transform=[]):
+        # NOTE: the ``transform=[]`` default is kept for interface
+        # compatibility; it is only forwarded here, never mutated.
+        super().__init__(dataset_dir, image_dir, anno_path, num_joints,
+                         transform)
+
+        self.dataset_name = 'mpii'
+
+    def parse_dataset(self):
+        """Read the MPII annotation JSON and fill ``self.db``.
+
+        Each annotation yields one record with the keys 'image_file',
+        'im_id', 'center', 'scale', 'joints' and 'joints_vis'. Centers and
+        joint coordinates are shifted by -1.
+        """
+        with open(self.get_anno()) as anno_file:
+            anno = json.load(anno_file)
+
+        num_joints = self.ann_info['num_joints']
+        gt_db = []
+        for a in anno:
+            image_name = a['image']
+            # Without an explicit id, the numeric file stem is used.
+            if 'image_id' in a:
+                im_id = a['image_id']
+            else:
+                im_id = int(os.path.splitext(image_name)[0])
+
+            # np.float64 replaces the np.float alias, which was removed in
+            # NumPy 1.24.
+            c = np.array(a['center'], dtype=np.float64)
+            s = np.array([a['scale'], a['scale']], dtype=np.float64)
+
+            # Adjust center/scale slightly to avoid cropping limbs
+            if c[0] != -1:
+                c[1] = c[1] + 15 * s[1]
+                s = s * 1.25
+            c = c - 1
+
+            joints = np.zeros((num_joints, 3), dtype=np.float64)
+            joints_vis = np.zeros((num_joints, 3), dtype=np.float64)
+            if 'joints' in a:
+                joints_ = np.array(a['joints'])
+                joints_[:, 0:2] = joints_[:, 0:2] - 1
+                joints_vis_ = np.array(a['joints_vis'])
+                assert len(joints_) == num_joints, \
+                    'joint num diff: {} vs {}'.format(len(joints_), num_joints)
+
+                joints[:, 0:2] = joints_[:, 0:2]
+                joints_vis[:, 0] = joints_vis_[:]
+                joints_vis[:, 1] = joints_vis_[:]
+
+            gt_db.append({
+                'image_file': os.path.join(self.img_prefix, image_name),
+                'im_id': im_id,
+                'center': c,
+                'scale': s,
+                'joints': joints,
+                'joints_vis': joints_vis
+            })
+        print("number length: {}".format(len(gt_db)))
+        self.db = gt_db

+ 636 - 0
paddlers/models/ppdet/data/source/mot.py

@@ -0,0 +1,636 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+import sys
+import cv2
+import glob
+import numpy as np
+from collections import OrderedDict, defaultdict
+try:
+    from collections.abc import Sequence
+except Exception:
+    from collections import Sequence
+from .dataset import DetDataset, _make_dataset, _is_valid_file
+from paddlers.models.ppdet.core.workspace import register, serializable
+from paddlers.models.ppdet.utils.logger import setup_logger
+logger = setup_logger(__name__)
+
+
+@register
+@serializable
+class MOTDataSet(DetDataset):
+    """
+    Load dataset with MOT format, only support single class MOT.
+
+    Args:
+        dataset_dir (str): root directory for dataset.
+        image_lists (str|list): mot data image lists, multi-source mot dataset.
+        data_fields (list): key name of data dictionary, at least have 'image'.
+        sample_num (int): number of samples to load, -1 means all.
+
+    Notes:
+        MOT datasets root directory following this:
+            dataset/mot
+            |——————image_lists
+            |        |——————caltech.train
+            |        |——————caltech.val
+            |        |——————mot16.train
+            |        |——————mot17.train
+            |        ......
+            |——————Caltech
+            |——————MOT17
+            |——————......
+
+        All the MOT datasets have the following structure:
+            Caltech
+            |——————images
+            |        └——————00001.jpg
+            |        |—————— ...
+            |        └——————0000N.jpg
+            └——————labels_with_ids
+                        └——————00001.txt
+                        |—————— ...
+                        └——————0000N.txt
+            or
+
+            MOT17
+            |——————images
+            |        └——————train
+            |        └——————test
+            └——————labels_with_ids
+                        └——————train
+    """
+
+    def __init__(self,
+                 dataset_dir=None,
+                 image_lists=[],
+                 data_fields=['image'],
+                 sample_num=-1):
+        # NOTE: the list defaults are kept for interface compatibility;
+        # neither of them is mutated by this class.
+        super(MOTDataSet, self).__init__(
+            dataset_dir=dataset_dir,
+            data_fields=data_fields,
+            sample_num=sample_num)
+        self.dataset_dir = dataset_dir
+        self.image_lists = image_lists
+        if isinstance(self.image_lists, str):
+            self.image_lists = [self.image_lists]
+        self.roidbs = None
+        self.cname2cid = None
+
+    def get_anno(self):
+        """Return the ``label_list.txt`` path of the first sub-dataset.
+
+        Returns:
+            str | None: The path, or None when no image list is configured.
+        """
+        if not self.image_lists:
+            return None
+        # only used to get categories and metric
+        # only check first data, but the label_list of all data should be same.
+        first_mot_data = self.image_lists[0].split('.')[0]
+        return os.path.join(self.dataset_dir, first_mot_data,
+                            'label_list.txt')
+
+    def parse_dataset(self):
+        """Read every image list and build ``self.roidbs``.
+
+        Also fills ``img_files``, ``label_files``, ``img_start_index``,
+        ``tid_num``, ``tid_start_index``, ``num_identities_dict``,
+        ``num_imgs_each_data``, ``total_imgs`` and ``cname2cid``.
+
+        Raises:
+            AssertionError: If an image list is missing or no record could
+                be built.
+        """
+        self.img_files = OrderedDict()
+        self.img_start_index = OrderedDict()
+        self.label_files = OrderedDict()
+        self.tid_num = OrderedDict()
+        self.tid_start_index = OrderedDict()
+
+        img_index = 0
+        for data_name in self.image_lists:
+            # check every data image list
+            image_lists_dir = os.path.join(self.dataset_dir, 'image_lists')
+            assert os.path.isdir(image_lists_dir), \
+                "The {} is not a directory.".format(image_lists_dir)
+
+            list_path = os.path.join(image_lists_dir, data_name)
+            assert os.path.exists(list_path), \
+                "The list path {} does not exist.".format(list_path)
+
+            # record img_files, filter out empty ones. Blank lines must be
+            # dropped BEFORE joining with dataset_dir, otherwise they turn
+            # into the (non-empty) dataset directory path.
+            with open(list_path, 'r') as file:
+                rel_paths = [line.strip() for line in file]
+            self.img_files[data_name] = [
+                os.path.join(self.dataset_dir, p) for p in rel_paths if p
+            ]
+            self.img_start_index[data_name] = img_index
+            img_index += len(self.img_files[data_name])
+
+            # record label_files
+            self.label_files[data_name] = [
+                x.replace('images', 'labels_with_ids').replace(
+                    '.png', '.txt').replace('.jpg', '.txt')
+                for x in self.img_files[data_name]
+            ]
+
+        # Number of identities per sub-dataset: largest identity + 1.
+        for data_name, label_paths in self.label_files.items():
+            max_index = -1
+            for lp in label_paths:
+                if not os.path.isfile(lp):
+                    # Missing label files are skipped (with a warning) when
+                    # the records are built below; do not crash on them here.
+                    continue
+                lb = np.loadtxt(lp)
+                if len(lb) < 1:
+                    continue
+                # A single-row file loads as a 1-D array.
+                img_max = lb[1] if lb.ndim < 2 else np.max(lb[:, 1])
+                if img_max > max_index:
+                    max_index = img_max
+            self.tid_num[data_name] = int(max_index + 1)
+
+        # Offset that makes identities unique across sub-datasets.
+        last_index = 0
+        for k, v in self.tid_num.items():
+            self.tid_start_index[k] = last_index
+            last_index += v
+
+        self.num_identities_dict = defaultdict(int)
+        self.num_identities_dict[0] = int(last_index + 1)  # single class
+        self.num_imgs_each_data = [len(x) for x in self.img_files.values()]
+        self.total_imgs = sum(self.num_imgs_each_data)
+
+        logger.info('MOT dataset summary: ')
+        logger.info(self.tid_num)
+        logger.info('Total images: {}'.format(self.total_imgs))
+        logger.info('Image start index: {}'.format(self.img_start_index))
+        logger.info('Total identities: {}'.format(self.num_identities_dict[0]))
+        logger.info('Identity start index: {}'.format(self.tid_start_index))
+
+        records = []
+        cname2cid = mot_label()
+        with_image = 'image' in self.data_fields
+
+        # Flat (dataset, image, label) list; the position in it is the
+        # global image index.
+        samples = [(data_name, img_file, lbl_file)
+                   for data_name in self.img_files
+                   for img_file, lbl_file in zip(self.img_files[data_name],
+                                                 self.label_files[data_name])]
+
+        for img_index, (data_name, img_file, lbl_file) in enumerate(samples):
+            if not os.path.exists(img_file):
+                logger.warning(
+                    'Illegal image file: {}, and it will be ignored'.format(
+                        img_file))
+                continue
+            if not os.path.isfile(lbl_file):
+                logger.warning(
+                    'Illegal label file: {}, and it will be ignored'.format(
+                        lbl_file))
+                continue
+
+            labels = np.loadtxt(lbl_file, dtype=np.float32).reshape(-1, 6)
+            # each row in labels (N, 6) is [gt_class, gt_identity, cx, cy, w, h]
+
+            cx, cy = labels[:, 2], labels[:, 3]
+            w, h = labels[:, 4], labels[:, 5]
+            gt_bbox = np.stack((cx, cy, w, h)).T.astype('float32')
+            gt_class = labels[:, 0:1].astype('int32')
+            gt_score = np.ones((len(labels), 1)).astype('float32')
+            gt_ide = labels[:, 1:2].astype('int32')
+            # Shift valid identities (> -1) by the sub-dataset offset.
+            gt_ide[gt_ide > -1] += self.tid_start_index[data_name]
+
+            mot_rec = {
+                'im_file': img_file,
+                'im_id': img_index,
+            } if with_image else {}
+
+            gt_rec = {
+                'gt_class': gt_class,
+                'gt_score': gt_score,
+                'gt_bbox': gt_bbox,
+                'gt_ide': gt_ide,
+            }
+
+            for k, v in gt_rec.items():
+                if k in self.data_fields:
+                    mot_rec[k] = v
+
+            records.append(mot_rec)
+            # Stop once exactly sample_num records are collected (the old
+            # index-based test loaded one record too many).
+            if 0 < self.sample_num <= len(records):
+                break
+        assert len(records) > 0, 'not found any mot record in %s' % (
+            self.image_lists)
+        self.roidbs, self.cname2cid = records, cname2cid
+
+
+@register
+@serializable
+class MCMOTDataSet(DetDataset):
+    """
+    Load dataset with MOT format, support multi-class MOT.
+
+    Args:
+        dataset_dir (str): root directory for dataset.
+        image_lists (list(str)): mcmot data image lists, multi-source mcmot dataset.
+        data_fields (list): key name of data dictionary, at least have 'image'.
+        label_list (str): file name (inside the first sub-dataset folder)
+            of the category list, one category name per line. When it is
+            None or the file does not exist, the VisDrone 10 classes are used.
+        sample_num (int): number of samples to load, -1 means all.
+
+    Notes:
+        MCMOT datasets root directory following this:
+            dataset/mot
+            |——————image_lists
+            |        |——————visdrone_mcmot.train
+            |        |——————visdrone_mcmot.val
+            visdrone_mcmot
+            |——————images
+            |        └——————train
+            |        └——————val
+            └——————labels_with_ids
+                        └——————train
+    """
+
+    def __init__(self,
+                 dataset_dir=None,
+                 image_lists=[],
+                 data_fields=['image'],
+                 label_list=None,
+                 sample_num=-1):
+        # NOTE: the list defaults are kept for interface compatibility;
+        # neither of them is mutated by this class.
+        super(MCMOTDataSet, self).__init__(
+            dataset_dir=dataset_dir,
+            data_fields=data_fields,
+            sample_num=sample_num)
+        self.dataset_dir = dataset_dir
+        self.image_lists = image_lists
+        if isinstance(self.image_lists, str):
+            self.image_lists = [self.image_lists]
+        self.label_list = label_list
+        self.roidbs = None
+        self.cname2cid = None
+
+    def get_anno(self):
+        """Return the ``label_list.txt`` path of the first sub-dataset.
+
+        Returns:
+            str | None: The path, or None when no image list is configured.
+        """
+        if not self.image_lists:
+            return None
+        # only used to get categories and metric
+        # only check first data, but the label_list of all data should be same.
+        first_mot_data = self.image_lists[0].split('.')[0]
+        return os.path.join(self.dataset_dir, first_mot_data,
+                            'label_list.txt')
+
+    def parse_dataset(self):
+        """Read every image list and build ``self.roidbs``.
+
+        Also fills ``img_files``, ``label_files``, ``img_start_index``,
+        ``tid_num``, ``tid_start_idx_of_cls_ids``, ``num_identities_dict``,
+        ``num_imgs_each_data``, ``total_imgs`` and ``cname2cid``.
+
+        Raises:
+            AssertionError: If an image list is missing or no record could
+                be built.
+        """
+        self.img_files = OrderedDict()
+        self.img_start_index = OrderedDict()
+        self.label_files = OrderedDict()
+        self.tid_num = OrderedDict()
+        self.tid_start_idx_of_cls_ids = defaultdict(dict)  # for MCMOT
+
+        img_index = 0
+        for data_name in self.image_lists:
+            # check every data image list
+            image_lists_dir = os.path.join(self.dataset_dir, 'image_lists')
+            assert os.path.isdir(image_lists_dir), \
+                "The {} is not a directory.".format(image_lists_dir)
+
+            list_path = os.path.join(image_lists_dir, data_name)
+            assert os.path.exists(list_path), \
+                "The list path {} does not exist.".format(list_path)
+
+            # record img_files, filter out empty ones. Blank lines must be
+            # dropped BEFORE joining with dataset_dir, otherwise they turn
+            # into the (non-empty) dataset directory path.
+            with open(list_path, 'r') as file:
+                rel_paths = [line.strip() for line in file]
+            self.img_files[data_name] = [
+                os.path.join(self.dataset_dir, p) for p in rel_paths if p
+            ]
+            self.img_start_index[data_name] = img_index
+            img_index += len(self.img_files[data_name])
+
+            # record label_files
+            self.label_files[data_name] = [
+                x.replace('images', 'labels_with_ids').replace(
+                    '.png', '.txt').replace('.jpg', '.txt')
+                for x in self.img_files[data_name]
+            ]
+
+        for data_name, label_paths in self.label_files.items():
+            # using max_ids_dict rather than max_index
+            max_ids_dict = defaultdict(int)
+            for lp in label_paths:
+                if not os.path.isfile(lp):
+                    # Missing label files are skipped (with a warning) when
+                    # the records are built below; do not crash on them here.
+                    continue
+                lb = np.loadtxt(lp)
+                if len(lb) < 1:
+                    continue
+                for item in lb.reshape(-1, 6):
+                    # item[0]: cls_id, item[1]: track id
+                    cls_id = int(item[0])
+                    # Indexing the defaultdict also registers classes whose
+                    # track ids are all <= 0.
+                    if item[1] > max_ids_dict[cls_id]:
+                        max_ids_dict[cls_id] = int(item[1])
+            # track id number
+            self.tid_num[data_name] = max_ids_dict
+
+        # Per-category offset that separates the sub-datasets' track ids.
+        last_idx_dict = defaultdict(int)
+        for k, v in self.tid_num.items():  # each sub dataset
+            for cls_id, id_num in v.items():  # v is a max_ids_dict
+                self.tid_start_idx_of_cls_ids[k][cls_id] = last_idx_dict[
+                    cls_id]
+                last_idx_dict[cls_id] += id_num
+
+        self.num_identities_dict = defaultdict(int)
+        for k, v in last_idx_dict.items():
+            self.num_identities_dict[k] = int(v)  # total ids of each category
+
+        self.num_imgs_each_data = [len(x) for x in self.img_files.values()]
+        self.total_imgs = sum(self.num_imgs_each_data)
+
+        # cname2cid and cid2cname
+        cname2cid = {}
+        if self.label_list is not None:
+            # if use label_list for multi source mix dataset,
+            # please make sure label_list in the first sub_dataset at least.
+            sub_dataset = self.image_lists[0].split('.')[0]
+            label_path = os.path.join(self.dataset_dir, sub_dataset,
+                                      self.label_list)
+            if not os.path.exists(label_path):
+                logger.info(
+                    "Note: label_list {} does not exists, use VisDrone 10 classes labels as default.".
+                    format(label_path))
+                cname2cid = visdrone_mcmot_label()
+            else:
+                with open(label_path, 'r') as fr:
+                    for label_id, line in enumerate(fr.readlines()):
+                        cname2cid[line.strip()] = label_id
+        else:
+            cname2cid = visdrone_mcmot_label()
+
+        cid2cname = {v: k for k, v in cname2cid.items()}
+
+        logger.info('MCMOT dataset summary: ')
+        logger.info(self.tid_num)
+        logger.info('Total images: {}'.format(self.total_imgs))
+        logger.info('Image start index: {}'.format(self.img_start_index))
+
+        logger.info('Total identities of each category: ')
+        total_IDs_all_cats = 0
+        for k, v in sorted(
+                self.num_identities_dict.items(), key=lambda x: x[0]):
+            # A class id without a name must not abort parsing just because
+            # of a summary log line.
+            logger.info('Category {} [{}] has {} IDs.'.format(
+                k, cid2cname.get(k, 'unknown'), v))
+            total_IDs_all_cats += v
+        logger.info('Total identities of all categories: {}'.format(
+            total_IDs_all_cats))
+
+        logger.info('Identity start index of each category: ')
+        for k, v in self.tid_start_idx_of_cls_ids.items():
+            for cls_id, start_idx in sorted(v.items(), key=lambda x: x[0]):
+                logger.info('Start index of dataset {} category {:d} is {:d}'
+                            .format(k, cls_id, start_idx))
+
+        records = []
+        with_image = 'image' in self.data_fields
+
+        # Flat (dataset, image, label) list; the position in it is the
+        # global image index.
+        samples = [(data_name, img_file, lbl_file)
+                   for data_name in self.img_files
+                   for img_file, lbl_file in zip(self.img_files[data_name],
+                                                 self.label_files[data_name])]
+
+        for img_index, (data_name, img_file, lbl_file) in enumerate(samples):
+            if not os.path.exists(img_file):
+                logger.warning(
+                    'Illegal image file: {}, and it will be ignored'.format(
+                        img_file))
+                continue
+            if not os.path.isfile(lbl_file):
+                logger.warning(
+                    'Illegal label file: {}, and it will be ignored'.format(
+                        lbl_file))
+                continue
+
+            labels = np.loadtxt(lbl_file, dtype=np.float32).reshape(-1, 6)
+            # each row in labels (N, 6) is [gt_class, gt_identity, cx, cy, w, h]
+
+            cx, cy = labels[:, 2], labels[:, 3]
+            w, h = labels[:, 4], labels[:, 5]
+            gt_bbox = np.stack((cx, cy, w, h)).T.astype('float32')
+            gt_class = labels[:, 0:1].astype('int32')
+            gt_score = np.ones((len(labels), 1)).astype('float32')
+            gt_ide = labels[:, 1:2].astype('int32')
+            start_idx_of_cls = self.tid_start_idx_of_cls_ids[data_name]
+            for i in range(len(gt_ide)):
+                # Index the scalar explicitly: int() on a 1-element array
+                # is deprecated in recent NumPy versions.
+                if gt_ide[i, 0] > -1:
+                    cls_id = int(gt_class[i, 0])
+                    gt_ide[i, 0] += start_idx_of_cls[cls_id]
+
+            mot_rec = {
+                'im_file': img_file,
+                'im_id': img_index,
+            } if with_image else {}
+
+            gt_rec = {
+                'gt_class': gt_class,
+                'gt_score': gt_score,
+                'gt_bbox': gt_bbox,
+                'gt_ide': gt_ide,
+            }
+
+            for k, v in gt_rec.items():
+                if k in self.data_fields:
+                    mot_rec[k] = v
+
+            records.append(mot_rec)
+            # Stop once exactly sample_num records are collected (the old
+            # index-based test loaded one record too many).
+            if 0 < self.sample_num <= len(records):
+                break
+        assert len(records) > 0, 'not found any mot record in %s' % (
+            self.image_lists)
+        self.roidbs, self.cname2cid = records, cname2cid
+
+
+@register
+@serializable
+class MOTImageFolder(DetDataset):
+    """
+    Load MOT dataset with MOT format from image folder or video.
+
+    Args:
+        video_file (str): path of the video file, default None.
+        frame_rate (int): frame rate of the video, use cv2 VideoCapture if not set.
+        dataset_dir (str): root directory for dataset.
+        data_root (str): stored on the instance; not used by this class.
+        image_dir (str|list): image folder(s) and/or image file(s).
+        sample_num (int): number of samples to load, -1 means all.
+        keep_ori_im (bool): whether to keep original image, default False.
+            Set True when used during MOT model inference while saving
+            images or video, or used in DeepSORT.
+        **kwargs: accepted and ignored.
+    """
+
+    def __init__(self,
+                 video_file=None,
+                 frame_rate=-1,
+                 dataset_dir=None,
+                 data_root=None,
+                 image_dir=None,
+                 sample_num=-1,
+                 keep_ori_im=False,
+                 **kwargs):
+        super(MOTImageFolder, self).__init__(
+            dataset_dir, image_dir, sample_num=sample_num)
+        self.video_file = video_file
+        self.data_root = data_root
+        self.keep_ori_im = keep_ori_im
+        self._imid2path = {}
+        self.roidbs = None
+        self.frame_rate = frame_rate
+
+    def check_or_download_dataset(self):
+        """No-op: nothing is checked or downloaded for this dataset."""
+        return
+
+    def parse_dataset(self):
+        """Build ``self.roidbs`` from the image folder or the video, once."""
+        if not self.roidbs:
+            if self.video_file is None:
+                self.frame_rate = 30  # set as default if infer image folder
+                self.roidbs = self._load_images()
+            else:
+                self.roidbs = self._load_video_images()
+
+    def _build_records(self, images):
+        """Turn image paths into records and register them in ``_imid2path``.
+
+        Args:
+            images (list[str]): Paths of existing image files.
+
+        Returns:
+            list[dict]: One ``{'im_id', 'im_file'}`` record per image (plus
+            ``'keep_ori_im'`` when enabled), at most ``sample_num`` of them
+            when ``sample_num`` is positive.
+        """
+        records = []
+        for ct, image in enumerate(images):
+            assert image != '' and os.path.isfile(image), \
+                    "Image {} not found".format(image)
+            if self.sample_num > 0 and ct >= self.sample_num:
+                break
+            rec = {'im_id': np.array([ct]), 'im_file': image}
+            if self.keep_ori_im:
+                rec['keep_ori_im'] = 1
+            self._imid2path[ct] = image
+            records.append(rec)
+        assert len(records) > 0, "No image file found"
+        return records
+
+    def _load_video_images(self):
+        """Split ``self.video_file`` into PNG frames and build the records."""
+        if self.frame_rate == -1:
+            # if frame_rate is not set for video, use cv2.VideoCapture
+            cap = cv2.VideoCapture(self.video_file)
+            try:
+                self.frame_rate = int(cap.get(cv2.CAP_PROP_FPS))
+            finally:
+                # Release the capture handle; it was leaked before.
+                cap.release()
+
+        # Strip only the trailing extension; a plain str.replace would also
+        # hit the same text elsewhere in the path.
+        output_path = os.path.splitext(self.video_file)[0]
+        frames_path = video2frames(self.video_file, output_path,
+                                   self.frame_rate)
+        self.video_frames = sorted(
+            glob.glob(os.path.join(frames_path, '*.png')))
+
+        self.video_length = len(self.video_frames)
+        logger.info('Length of the video: {:d} frames.'.format(
+            self.video_length))
+        return self._build_records(self.video_frames)
+
+    def _find_images(self):
+        """Collect image paths from ``self.image_dir``.
+
+        Returns:
+            list[str]: Images found in the given folders plus the given
+            valid image files.
+        """
+        image_dir = self.image_dir
+        # A str is itself a Sequence; without this check a single path
+        # would be iterated character by character.
+        if isinstance(image_dir, str) or not isinstance(image_dir, Sequence):
+            image_dir = [image_dir]
+        images = []
+        for im_dir in image_dir:
+            if os.path.isdir(im_dir):
+                im_dir = os.path.join(self.dataset_dir, im_dir)
+                images.extend(_make_dataset(im_dir))
+            elif os.path.isfile(im_dir) and _is_valid_file(im_dir):
+                images.append(im_dir)
+        return images
+
+    def _load_images(self):
+        """Build the records for the images found by ``_find_images``."""
+        return self._build_records(self._find_images())
+
+    def get_imid2path(self):
+        """Return the mapping from image id to image path."""
+        return self._imid2path
+
+    def set_images(self, images):
+        """Replace the image source and rebuild ``self.roidbs``."""
+        self.image_dir = images
+        self.roidbs = self._load_images()
+
+    def set_video(self, video_file, frame_rate):
+        """Replace the video source and rebuild ``self.roidbs``.
+
+        Raises:
+            AssertionError: If the file is missing or has an unsupported
+                extension.
+        """
+        # update video_file and frame_rate by command line of tools/infer_mot.py
+        self.video_file = video_file
+        self.frame_rate = frame_rate
+        assert os.path.isfile(self.video_file) and _is_valid_video(self.video_file), \
+                "wrong or unsupported file format: {}".format(self.video_file)
+        self.roidbs = self._load_video_images()
+
+
+def _is_valid_video(f, extensions=('.mp4', '.avi', '.mov', '.rmvb', '.flv')):
+    """Tell whether a file name ends with a supported video extension.
+
+    The comparison is case-insensitive.
+
+    Args:
+        f (str): File name or path.
+        extensions (tuple[str]): Accepted lowercase suffixes, each including
+            its leading dot.
+
+    Returns:
+        bool: True when ``f`` ends with one of ``extensions``.
+    """
+    # '.flv' previously lacked its dot, so any name ending in "flv" matched.
+    return f.lower().endswith(extensions)
+
+
+def video2frames(video_path, outpath, frame_rate, **kargs):
+    """Extract the frames of a video into PNG files with ffmpeg.
+
+    Frames are written to ``<outpath>/<video name>/%08d.png``; the folder
+    is created when missing.
+
+    Args:
+        video_path (str): Path of the input video.
+        outpath (str): Directory under which the frame folder is created.
+        frame_rate (int): Value passed to ffmpeg's ``-r`` option.
+        **kargs: Extra ffmpeg options; every ``key value`` pair is appended
+            to the command line after the output pattern.
+
+    Returns:
+        str: The directory that contains the extracted frames.
+
+    Raises:
+        RuntimeError: If ffmpeg cannot be started or exits with an error.
+    """
+    # Local imports keep this block self-contained.
+    import shlex
+    import subprocess
+
+    vid_name = os.path.basename(video_path).split('.')[0]
+    out_full_path = os.path.join(outpath, vid_name)
+    os.makedirs(out_full_path, exist_ok=True)
+
+    # video file name
+    outformat = os.path.join(out_full_path, '%08d.png')
+
+    # Pass an argument list instead of a shell string, so paths containing
+    # spaces or shell metacharacters are handed to ffmpeg verbatim.
+    cmd = [
+        'ffmpeg', '-y', '-loglevel', 'error', '-i', video_path, '-r',
+        str(frame_rate), '-f', 'image2', outformat
+    ]
+    for k, v in kargs.items():
+        # Extra options keep their previous word-splitting behaviour.
+        cmd.extend(shlex.split('{} {}'.format(k, v)))
+
+    try:
+        returncode = subprocess.call(cmd)
+    except OSError as err:
+        # e.g. ffmpeg is not installed; keep the documented exception type.
+        raise RuntimeError('ffmpeg process video: {} error'.format(
+            video_path)) from err
+    if returncode != 0:
+        raise RuntimeError('ffmpeg process video: {} error'.format(video_path))
+
+    sys.stdout.flush()
+    return out_full_path
+
+
+def mot_label():
+    """Return the category-name to class-id mapping for single-class MOT."""
+    return {'person': 0}
+
+
+def visdrone_mcmot_label():
+    """Return the category-name to class-id mapping of the 10 VisDrone classes."""
+    # The position of a name in this tuple is its class id.
+    class_names = (
+        'pedestrian',
+        'people',
+        'bicycle',
+        'car',
+        'van',
+        'truck',
+        'tricycle',
+        'awning-tricycle',
+        'bus',
+        'motor', )
+    return {name: class_id for class_id, name in enumerate(class_names)}

+ 191 - 0
paddlers/models/ppdet/data/source/sniper_coco.py

@@ -0,0 +1,191 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+import cv2
+import json
+import copy
+import numpy as np
+
+try:
+    from collections.abc import Sequence
+except Exception:
+    from collections import Sequence
+
+from paddlers.models.ppdet.core.workspace import register, serializable
+from paddlers.models.ppdet.data.crop_utils.annotation_cropper import AnnoCropper
+from .coco import COCODataSet
+from .dataset import _make_dataset, _is_valid_file
+from paddlers.models.ppdet.utils.logger import setup_logger
+
+logger = setup_logger('sniper_coco_dataset')
+
+
+@register
+@serializable
+class SniperCOCODataSet(COCODataSet):
+    """COCO-style dataset whose records are cut into chips by `AnnoCropper`.
+
+    Args:
+        dataset_dir, image_dir, anno_path, data_fields, sample_num,
+        load_crowd, allow_empty, empty_ratio: forwarded unchanged to
+            `COCODataSet.__init__`.
+        proposals_file (str): optional JSON file holding a list of entries
+            with an `image_id` and a `bbox` given as [x, y, w, h].
+        is_trainset (bool): if True, proposals are parsed and merged into the
+            records and `AnnoCropper.crop_anno_records` is used; otherwise
+            `AnnoCropper.crop_infer_anno_records` is used.
+        image_target_sizes, valid_box_ratio_ranges, chip_target_size,
+        chip_target_stride, use_neg_chip, max_neg_num_per_im, max_per_img,
+        nms_thresh: forwarded unchanged to `AnnoCropper`.
+    """
+
+    def __init__(self,
+                 dataset_dir=None,
+                 image_dir=None,
+                 anno_path=None,
+                 proposals_file=None,
+                 data_fields=['image'],
+                 sample_num=-1,
+                 load_crowd=False,
+                 allow_empty=True,
+                 empty_ratio=1.,
+                 is_trainset=True,
+                 image_target_sizes=[2000, 1000],
+                 valid_box_ratio_ranges=[[-1, 0.1], [0.08, -1]],
+                 chip_target_size=500,
+                 chip_target_stride=200,
+                 use_neg_chip=False,
+                 max_neg_num_per_im=8,
+                 max_per_img=-1,
+                 nms_thresh=0.5):
+        super(SniperCOCODataSet, self).__init__(
+            dataset_dir=dataset_dir,
+            image_dir=image_dir,
+            anno_path=anno_path,
+            data_fields=data_fields,
+            sample_num=sample_num,
+            load_crowd=load_crowd,
+            allow_empty=allow_empty,
+            empty_ratio=empty_ratio)
+        self.proposals_file = proposals_file
+        self.proposals = None
+        self.anno_cropper = None
+        self.is_trainset = is_trainset
+        self.image_target_sizes = image_target_sizes
+        self.valid_box_ratio_ranges = valid_box_ratio_ranges
+        self.chip_target_size = chip_target_size
+        self.chip_target_stride = chip_target_stride
+        self.use_neg_chip = use_neg_chip
+        self.max_neg_num_per_im = max_neg_num_per_im
+        self.max_per_img = max_per_img
+        self.nms_thresh = nms_thresh
+
+    def parse_dataset(self):
+        """Build `roidbs`, keep a copy in `ori_roidbs`, then crop into chips."""
+        # `roidbs` may already have been provided by set_images/set_roidbs.
+        if not hasattr(self, "roidbs"):
+            super(SniperCOCODataSet, self).parse_dataset()
+        if self.is_trainset:
+            self._parse_proposals()
+            self._merge_anno_proposals()
+        # Keep the un-cropped records before they are replaced by chips.
+        self.ori_roidbs = copy.deepcopy(self.roidbs)
+        self.init_anno_cropper()
+        self.roidbs = self.generate_chips_roidbs(self.roidbs, self.is_trainset)
+
+    def set_proposals_file(self, file_path):
+        """Set the proposals JSON file read by `_parse_proposals`."""
+        self.proposals_file = file_path
+
+    def init_anno_cropper(self):
+        """Create `self.anno_cropper` from the cropping settings."""
+        logger.info("Init AnnoCropper...")
+        self.anno_cropper = AnnoCropper(
+            image_target_sizes=self.image_target_sizes,
+            valid_box_ratio_ranges=self.valid_box_ratio_ranges,
+            chip_target_size=self.chip_target_size,
+            chip_target_stride=self.chip_target_stride,
+            use_neg_chip=self.use_neg_chip,
+            max_neg_num_per_im=self.max_neg_num_per_im,
+            max_per_img=self.max_per_img,
+            nms_thresh=self.nms_thresh)
+
+    def generate_chips_roidbs(self, roidbs, is_trainset):
+        """Return the records produced by the cropper for `roidbs`."""
+        if is_trainset:
+            return self.anno_cropper.crop_anno_records(roidbs)
+        return self.anno_cropper.crop_infer_anno_records(roidbs)
+
+    def _parse_proposals(self):
+        """Load `proposals_file` into `self.proposals`.
+
+        The result maps each image id to a list of [x1, y1, x2, y2] boxes,
+        converted from the file's [x, y, w, h] boxes. Nothing happens when
+        no proposals file is set.
+        """
+        if not self.proposals_file:
+            return
+        self.proposals = {}
+        logger.info("Parse proposals file:{}".format(self.proposals_file))
+        with open(self.proposals_file, 'r') as f:
+            proposals = json.load(f)
+        for prop in proposals:
+            x, y, w, h = prop["bbox"]
+            self.proposals.setdefault(prop["image_id"], []).append(
+                [x, y, x + w, y + h])
+
+    def _merge_anno_proposals(self):
+        """Attach a float32 `proposals` array to every record in `roidbs`.
+
+        Records whose image id has no proposals get an empty array.
+        """
+        assert self.roidbs
+        if not self.proposals:
+            return
+        logger.info("merge proposals to annos")
+        for record in self.roidbs:
+            # `im_id` may be a one-element array; `.item()` extracts the
+            # scalar without relying on implicit array-to-int conversion.
+            image_id = int(np.asarray(record["im_id"]).item())
+            if image_id not in self.proposals:
+                logger.info("image id :{} no proposals".format(image_id))
+            # The record dict is updated in place, so no write-back is needed.
+            record["proposals"] = np.array(
+                self.proposals.get(image_id, []), dtype=np.float32)
+
+    def get_ori_roidbs(self):
+        """Return the un-cropped records, or None before `parse_dataset`."""
+        if not hasattr(self, "ori_roidbs"):
+            return None
+        return self.ori_roidbs
+
+    def get_roidbs(self):
+        """Return `roidbs`, parsing the dataset first if it is not set."""
+        if not hasattr(self, "roidbs"):
+            self.parse_dataset()
+        return self.roidbs
+
+    def set_roidbs(self, roidbs):
+        """Replace `roidbs` with the given records."""
+        self.roidbs = roidbs
+
+    def check_or_download_dataset(self):
+        """Do nothing; this dataset performs no check or download here."""
+        return
+
+    def _parse(self):
+        """Collect image paths from `self.image_dir`.
+
+        `image_dir` may be a single path or a sequence of paths. Directories
+        are expanded with `_make_dataset`; files are kept when
+        `_is_valid_file` accepts them; anything else is ignored.
+        """
+        image_dir = self.image_dir
+        # A str is itself a Sequence; without this check a single path would
+        # be iterated character by character.
+        if isinstance(image_dir, str) or not isinstance(image_dir, Sequence):
+            image_dir = [image_dir]
+        images = []
+        for im_dir in image_dir:
+            if os.path.isdir(im_dir):
+                im_dir = os.path.join(self.dataset_dir, im_dir)
+                images.extend(_make_dataset(im_dir))
+            elif os.path.isfile(im_dir) and _is_valid_file(im_dir):
+                images.append(im_dir)
+        return images
+
+    def _load_images(self):
+        """Build one record (`im_id`, `im_file`, `h`, `w`) per image.
+
+        Stops after `sample_num` images when `sample_num` is positive and
+        fills `self._imid2path` with the id-to-path mapping.
+
+        Raises:
+            ValueError: if an image file exists but cannot be decoded.
+        """
+        images = self._parse()
+        ct = 0
+        records = []
+        for image in images:
+            assert image != '' and os.path.isfile(image), \
+                "Image {} not found".format(image)
+            if self.sample_num > 0 and ct >= self.sample_num:
+                break
+            im = cv2.imread(image)
+            # cv2.imread returns None instead of raising on unreadable files.
+            if im is None:
+                raise ValueError("Image {} cannot be read".format(image))
+            h, w = im.shape[:2]
+            rec = {'im_id': np.array([ct]), 'im_file': image, "h": h, "w": w}
+            self._imid2path[ct] = image
+            ct += 1
+            records.append(rec)
+        assert len(records) > 0, "No image file found"
+        return records
+
+    def get_imid2path(self):
+        """Return the image-id to image-path mapping."""
+        return self._imid2path
+
+    def set_images(self, images):
+        """Use `images` as the image source and rebuild `roidbs` from it."""
+        self._imid2path = {}
+        self.image_dir = images
+        self.roidbs = self._load_images()

+ 231 - 0
paddlers/models/ppdet/data/source/voc.py

@@ -0,0 +1,231 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+import numpy as np
+
+import xml.etree.ElementTree as ET
+
+from paddlers.models.ppdet.core.workspace import register, serializable
+
+from .dataset import DetDataset
+
+from paddlers.models.ppdet.utils.logger import setup_logger
+logger = setup_logger(__name__)
+
+
+@register
+@serializable
+class VOCDataSet(DetDataset):
+    """
+    Load dataset with PascalVOC format.
+
+    Notes:
+    Each line of the `anno_path` file must contain an image file path and an
+    xml file path, separated by whitespace; both are joined to `image_dir`.
+
+    Args:
+        dataset_dir (str): root directory for dataset.
+        image_dir (str): directory for images.
+        anno_path (str): voc annotation file path.
+        data_fields (list): key name of data dictionary, at least have 'image'.
+        sample_num (int): number of samples to load, -1 means all.
+        label_list (str): path (relative to `dataset_dir`) of a text file with
+            one category name per line; line order defines the class ids.
+            When empty, the mapping from `pascalvoc_label()` is used.
+        allow_empty (bool): whether to load empty entry. False as default
+        empty_ratio (float): the ratio of empty record number to total
+            record's, if empty_ratio is out of [0. ,1.), do not sample the
+            records and use all the empty entries. 1. as default
+    """
+
+    def __init__(self,
+                 dataset_dir=None,
+                 image_dir=None,
+                 anno_path=None,
+                 data_fields=['image'],
+                 sample_num=-1,
+                 label_list=None,
+                 allow_empty=False,
+                 empty_ratio=1.):
+        super(VOCDataSet, self).__init__(
+            dataset_dir=dataset_dir,
+            image_dir=image_dir,
+            anno_path=anno_path,
+            data_fields=data_fields,
+            sample_num=sample_num)
+        self.label_list = label_list
+        self.allow_empty = allow_empty
+        self.empty_ratio = empty_ratio
+
+    def _sample_empty(self, records, num):
+        """Randomly keep a subset of the empty `records`.
+
+        `num` is the number of non-empty records; the subset size is chosen
+        so that empty records make up about `empty_ratio` of the total.
+        """
+        # if empty_ratio is out of [0. ,1.), do not sample the records
+        if self.empty_ratio < 0. or self.empty_ratio >= 1.:
+            return records
+        import random
+        sample_num = min(
+            int(num * self.empty_ratio / (1 - self.empty_ratio)), len(records))
+        records = random.sample(records, sample_num)
+        return records
+
+    def parse_dataset(self):
+        """Parse the annotation list into `self.roidbs` and `self.cname2cid`.
+
+        Entries with a missing image, a missing xml file or a negative image
+        size are skipped with a warning, as are blank or malformed lines of
+        the annotation list and invalid boxes.
+        """
+        anno_path = os.path.join(self.dataset_dir, self.anno_path)
+        image_dir = os.path.join(self.dataset_dir, self.image_dir)
+
+        # mapping category name to class id
+        # first_class:0, second_class:1, ...
+        records = []
+        empty_records = []
+        ct = 0
+        cname2cid = {}
+        if self.label_list:
+            label_path = os.path.join(self.dataset_dir, self.label_list)
+            if not os.path.exists(label_path):
+                raise ValueError("label_list {} does not exists".format(
+                    label_path))
+            with open(label_path, 'r') as fr:
+                for label_id, line in enumerate(fr):
+                    cname2cid[line.strip()] = label_id
+        else:
+            cname2cid = pascalvoc_label()
+
+        with open(anno_path, 'r') as fr:
+            for line in fr:
+                fields = line.strip().split()
+                # A blank or single-token line cannot be unpacked into an
+                # (image, xml) pair; skip it instead of crashing.
+                if len(fields) < 2:
+                    if fields:
+                        logger.warning(
+                            'Illegal annotation line: {!r}, and it will be '
+                            'ignored'.format(line))
+                    continue
+                img_file, xml_file = [
+                    os.path.join(image_dir, x) for x in fields[:2]
+                ]
+                if not os.path.exists(img_file):
+                    logger.warning(
+                        'Illegal image file: {}, and it will be ignored'.
+                        format(img_file))
+                    continue
+                if not os.path.isfile(xml_file):
+                    logger.warning(
+                        'Illegal xml file: {}, and it will be ignored'.format(
+                            xml_file))
+                    continue
+                tree = ET.parse(xml_file)
+                # Use the xml's own <id> when present, else the running count.
+                id_node = tree.find('id')
+                if id_node is None:
+                    im_id = np.array([ct])
+                else:
+                    im_id = np.array([int(id_node.text)])
+
+                objs = tree.findall('object')
+                size_node = tree.find('size')
+                im_w = float(size_node.find('width').text)
+                im_h = float(size_node.find('height').text)
+                if im_w < 0 or im_h < 0:
+                    logger.warning(
+                        'Illegal width: {} or height: {} in annotation, '
+                        'and {} will be ignored'.format(im_w, im_h, xml_file))
+                    continue
+
+                # Arrays are allocated for all objects and trimmed afterwards
+                # to the `i` boxes that turned out to be valid.
+                num_bbox, i = len(objs), 0
+                gt_bbox = np.zeros((num_bbox, 4), dtype=np.float32)
+                gt_class = np.zeros((num_bbox, 1), dtype=np.int32)
+                gt_score = np.zeros((num_bbox, 1), dtype=np.float32)
+                difficult = np.zeros((num_bbox, 1), dtype=np.int32)
+                for obj in objs:
+                    cname = obj.find('name').text
+
+                    # user dataset may not contain difficult field
+                    _difficult = obj.find('difficult')
+                    _difficult = int(
+                        _difficult.text) if _difficult is not None else 0
+
+                    bndbox = obj.find('bndbox')
+                    # Clip the box to the image.
+                    x1 = max(0, float(bndbox.find('xmin').text))
+                    y1 = max(0, float(bndbox.find('ymin').text))
+                    x2 = min(im_w - 1, float(bndbox.find('xmax').text))
+                    y2 = min(im_h - 1, float(bndbox.find('ymax').text))
+                    if x2 > x1 and y2 > y1:
+                        gt_bbox[i, :] = [x1, y1, x2, y2]
+                        gt_class[i, 0] = cname2cid[cname]
+                        gt_score[i, 0] = 1.
+                        difficult[i, 0] = _difficult
+                        i += 1
+                    else:
+                        logger.warning(
+                            'Found an invalid bbox in annotations: xml_file: {}'
+                            ', x1: {}, y1: {}, x2: {}, y2: {}.'.format(
+                                xml_file, x1, y1, x2, y2))
+                gt_bbox = gt_bbox[:i, :]
+                gt_class = gt_class[:i, :]
+                gt_score = gt_score[:i, :]
+                difficult = difficult[:i, :]
+
+                voc_rec = {
+                    'im_file': img_file,
+                    'im_id': im_id,
+                    'h': im_h,
+                    'w': im_w
+                } if 'image' in self.data_fields else {}
+
+                gt_rec = {
+                    'gt_class': gt_class,
+                    'gt_score': gt_score,
+                    'gt_bbox': gt_bbox,
+                    'difficult': difficult
+                }
+                # Only the ground-truth fields requested in data_fields are kept.
+                for k, v in gt_rec.items():
+                    if k in self.data_fields:
+                        voc_rec[k] = v
+
+                if len(objs) == 0:
+                    empty_records.append(voc_rec)
+                else:
+                    records.append(voc_rec)
+
+                ct += 1
+                if self.sample_num > 0 and ct >= self.sample_num:
+                    break
+        assert ct > 0, 'not found any voc record in %s' % (self.anno_path)
+        logger.debug('{} samples in file {}'.format(ct, anno_path))
+        if self.allow_empty and len(empty_records) > 0:
+            empty_records = self._sample_empty(empty_records, len(records))
+            records += empty_records
+        self.roidbs, self.cname2cid = records, cname2cid
+
+    def get_label_list(self):
+        """Return the label list path joined to `dataset_dir`."""
+        return os.path.join(self.dataset_dir, self.label_list)
+
+
+def pascalvoc_label():
+    """Return the category-name to class-id mapping with 20 classes (0-19)."""
+    # Position in this tuple is the class id.
+    class_names = (
+        'aeroplane', 'bicycle', 'bird', 'boat', 'bottle', 'bus', 'car', 'cat',
+        'chair', 'cow', 'diningtable', 'dog', 'horse', 'motorbike', 'person',
+        'pottedplant', 'sheep', 'sofa', 'train', 'tvmonitor')
+    return {name: class_id for class_id, name in enumerate(class_names)}

+ 180 - 0
paddlers/models/ppdet/data/source/widerface.py

@@ -0,0 +1,180 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+import numpy as np
+
+from paddlers.models.ppdet.core.workspace import register, serializable
+from .dataset import DetDataset
+
+from paddlers.models.ppdet.utils.logger import setup_logger
+logger = setup_logger(__name__)
+
+
+@register
+@serializable
+class WIDERFaceDataSet(DetDataset):
+    """
+    Load WiderFace records with 'anno_path'
+
+    Args:
+        dataset_dir (str): root directory for dataset.
+        image_dir (str): directory for images.
+        anno_path (str): WiderFace annotation data.
+        data_fields (list): key name of data dictionary, at least have 'image'.
+        sample_num (int): number of samples to load, -1 means all.
+        with_lmk (bool): whether to load face landmark keypoint labels.
+    """
+
+    def __init__(self,
+                 dataset_dir=None,
+                 image_dir=None,
+                 anno_path=None,
+                 data_fields=['image'],
+                 sample_num=-1,
+                 with_lmk=False):
+        super(WIDERFaceDataSet, self).__init__(
+            dataset_dir=dataset_dir,
+            image_dir=image_dir,
+            anno_path=anno_path,
+            data_fields=data_fields,
+            sample_num=sample_num,
+            with_lmk=with_lmk)
+        self.anno_path = anno_path
+        self.sample_num = sample_num
+        self.roidbs = None
+        self.cname2cid = None
+        self.with_lmk = with_lmk
+
+    def parse_dataset(self):
+        """Parse the annotation file into `self.roidbs` and `self.cname2cid`.
+
+        Every image entry becomes one record; `gt_class` is all zeros, the id
+        that `widerface_label()` assigns to 'face'.
+        """
+        anno_path = os.path.join(self.dataset_dir, self.anno_path)
+        image_dir = os.path.join(self.dataset_dir, self.image_dir)
+
+        records = []
+        ct = 0
+        file_lists = self._load_file_list(anno_path)
+        cname2cid = widerface_label()
+
+        for item in file_lists:
+            # item[0] is the image name, item[1:] are its boxes.
+            im_fname = item[0]
+            boxes = item[1:]
+            num_box = len(boxes)
+            im_id = np.array([ct])
+            gt_bbox = np.zeros((num_box, 4), dtype=np.float32)
+            gt_class = np.zeros((num_box, 1), dtype=np.int32)
+            gt_lmk_labels = np.zeros((num_box, 10), dtype=np.float32)
+            lmk_ignore_flag = np.zeros((num_box, 1), dtype=np.int32)
+            for box_idx, box in enumerate(boxes):
+                gt_bbox[box_idx] = box[0]
+                if self.with_lmk:
+                    gt_lmk_labels[box_idx] = box[1]
+                    lmk_ignore_flag[box_idx] = box[2]
+            im_fname = os.path.join(image_dir,
+                                    im_fname) if image_dir else im_fname
+            widerface_rec = {
+                'im_file': im_fname,
+                'im_id': im_id,
+            } if 'image' in self.data_fields else {}
+            gt_rec = {
+                'gt_bbox': gt_bbox,
+                'gt_class': gt_class,
+            }
+            # Only the ground-truth fields requested in data_fields are kept.
+            for k, v in gt_rec.items():
+                if k in self.data_fields:
+                    widerface_rec[k] = v
+            if self.with_lmk:
+                widerface_rec['gt_keypoint'] = gt_lmk_labels
+                widerface_rec['keypoint_ignore'] = lmk_ignore_flag
+
+            if len(item) != 0:
+                records.append(widerface_rec)
+
+            ct += 1
+            if self.sample_num > 0 and ct >= self.sample_num:
+                break
+        assert len(records) > 0, 'not found any widerface in %s' % (anno_path)
+        logger.debug('{} samples in file {}'.format(ct, anno_path))
+        self.roidbs, self.cname2cid = records, cname2cid
+
+    def _load_file_list(self, input_txt):
+        """Read the annotation text file into one list per image.
+
+        A line without spaces whose extension is jpg/jpeg/png/bmp starts a new
+        image entry; the following space-separated lines are its boxes.
+
+        Returns:
+            list: entries of the form `[image_name, box, box, ...]`, where
+            each box is `[[xmin, ymin, xmax, ymax]]`, extended with the ten
+            landmark coordinates and an ignore flag when `with_lmk` is set.
+        """
+        with open(input_txt, 'r') as f_dir:
+            lines_input_txt = f_dir.readlines()
+
+        file_dict = {}
+        num_class = 0
+        exts = ['jpg', 'jpeg', 'png', 'bmp']
+        exts += [ext.upper() for ext in exts]
+        for i, raw_line in enumerate(lines_input_txt):
+            line_txt = raw_line.strip('\n\t\r')
+            split_str = line_txt.split(' ')
+            if len(split_str) == 1:
+                img_file_name = os.path.split(split_str[0])[1]
+                split_txt = img_file_name.split('.')
+                if len(split_txt) < 2:
+                    continue
+                elif split_txt[-1] in exts:
+                    if i != 0:
+                        num_class += 1
+                    file_dict[num_class] = [line_txt]
+            else:
+                # Lines this short are skipped as not being box lines.
+                if len(line_txt) <= 6:
+                    continue
+                result_boxs = []
+                xmin = float(split_str[0])
+                ymin = float(split_str[1])
+                w = float(split_str[2])
+                h = float(split_str[3])
+                # Filter out wrong labels
+                if w < 0 or h < 0:
+                    logger.warning('Illegal box with w: {}, h: {} in '
+                                   'img: {}, and it will be ignored'.format(
+                                       w, h, file_dict[num_class][0]))
+                    continue
+                xmin = max(0, xmin)
+                ymin = max(0, ymin)
+                xmax = xmin + w
+                ymax = ymin + h
+                gt_bbox = [xmin, ymin, xmax, ymax]
+                result_boxs.append(gt_bbox)
+                if self.with_lmk:
+                    assert len(split_str) > 18, \
+                        'When `with_lmk=True`, the number of ' \
+                        'space-separated fields per line in the annotation ' \
+                        'file should exceed 18.'
+                    # Landmark (x, y) pairs sit at fields 5-6, 8-9, 11-12,
+                    # 14-15 and 17-18; the field after each pair is not read.
+                    gt_lmk_label = []
+                    for x_pos in (5, 8, 11, 14, 17):
+                        gt_lmk_label.append(float(split_str[x_pos]))
+                        gt_lmk_label.append(float(split_str[x_pos + 1]))
+                    # A first landmark x of -1 gives flag 0, anything else 1.
+                    lmk_ignore_flag = 0 if gt_lmk_label[0] == -1 else 1
+                    result_boxs.append(gt_lmk_label)
+                    result_boxs.append(lmk_ignore_flag)
+                file_dict[num_class].append(result_boxs)
+
+        return list(file_dict.values())
+
+
+def widerface_label():
+    """Return the category-name to class-id mapping: only 'face' (id 0)."""
+    return {'face': 0}

+ 28 - 0
paddlers/models/ppdet/data/transform/__init__.py

@@ -0,0 +1,28 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from . import operators
+from . import batch_operators
+from . import keypoint_operators
+from . import mot_operators
+
+from .operators import *
+from .batch_operators import *
+from .keypoint_operators import *
+from .mot_operators import *
+
+# Public names of this package, collected from the operator modules.
+__all__ = []
+# NOTE(review): `registered_ops` is not defined in this file; presumably it is
+# brought into scope by one of the star imports above - confirm.
+__all__ += registered_ops
+__all__ += keypoint_operators.__all__
+__all__ += mot_operators.__all__

+ 270 - 0
paddlers/models/ppdet/data/transform/atss_assigner.py

@@ -0,0 +1,270 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# The code is based on:
+# https://github.com/open-mmlab/mmdetection/blob/master/mmdet/core/bbox/assigners/atss_assigner.py
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+from paddlers.models.ppdet.utils.logger import setup_logger
+logger = setup_logger(__name__)
+
+
+def bbox_overlaps(bboxes1, bboxes2, mode='iou', is_aligned=False, eps=1e-6):
+    """Calculate overlap between two set of bboxes.
+    If ``is_aligned`` is ``False``, then calculate the overlaps between each
+    bbox of bboxes1 and bboxes2, otherwise the overlaps between each aligned
+    pair of bboxes1 and bboxes2.
+    Args:
+        bboxes1 (np.ndarray): shape (B, m, 4) in <x1, y1, x2, y2> format or
+            empty.
+        bboxes2 (np.ndarray): shape (B, n, 4) in <x1, y1, x2, y2> format or
+            empty. B indicates the batch dim, in shape (B1, B2, ..., Bn).
+            If ``is_aligned`` is ``True``, then m and n must be equal.
+        mode (str): "iou" (intersection over union), "iof" (intersection over
+            foreground, i.e. over the area of bboxes1) or "giou"
+            (generalized intersection over union).
+        is_aligned (bool, optional): If True, then m and n must be equal.
+            Default False.
+        eps (float, optional): Lower bound applied to the denominator for
+            numerical stability. Default 1e-6.
+    Returns:
+        np.ndarray: shape (B, m, n) if ``is_aligned`` is False else
+            shape (B, m)
+    """
+    assert mode in ['iou', 'iof', 'giou'], 'Unsupported mode {}'.format(mode)
+    # Either the boxes are empty or the length of boxes's last dimension is 4
+    assert (bboxes1.shape[-1] == 4 or bboxes1.shape[0] == 0)
+    assert (bboxes2.shape[-1] == 4 or bboxes2.shape[0] == 0)
+
+    # Batch dim must be the same
+    # Batch dim: (B1, B2, ... Bn)
+    assert bboxes1.shape[:-2] == bboxes2.shape[:-2]
+    batch_shape = bboxes1.shape[:-2]
+
+    rows = bboxes1.shape[-2] if bboxes1.shape[0] > 0 else 0
+    cols = bboxes2.shape[-2] if bboxes2.shape[0] > 0 else 0
+    if is_aligned:
+        assert rows == cols
+
+    if rows * cols == 0:
+        # The result holds no elements; only its shape matters, so build it
+        # deterministically instead of calling the random generator.
+        if is_aligned:
+            return np.zeros(batch_shape + (rows, ))
+        else:
+            return np.zeros(batch_shape + (rows, cols))
+
+    area1 = (bboxes1[..., 2] - bboxes1[..., 0]) * (
+        bboxes1[..., 3] - bboxes1[..., 1])
+    area2 = (bboxes2[..., 2] - bboxes2[..., 0]) * (
+        bboxes2[..., 3] - bboxes2[..., 1])
+
+    if is_aligned:
+        lt = np.maximum(bboxes1[..., :2], bboxes2[..., :2])  # [B, rows, 2]
+        rb = np.minimum(bboxes1[..., 2:], bboxes2[..., 2:])  # [B, rows, 2]
+
+        wh = (rb - lt).clip(min=0)  # [B, rows, 2]
+        overlap = wh[..., 0] * wh[..., 1]
+
+        if mode in ['iou', 'giou']:
+            union = area1 + area2 - overlap
+        else:
+            union = area1
+        if mode == 'giou':
+            # corners of the smallest box enclosing each pair
+            enclosed_lt = np.minimum(bboxes1[..., :2], bboxes2[..., :2])
+            enclosed_rb = np.maximum(bboxes1[..., 2:], bboxes2[..., 2:])
+    else:
+        lt = np.maximum(bboxes1[..., :, None, :2],
+                        bboxes2[..., None, :, :2])  # [B, rows, cols, 2]
+        rb = np.minimum(bboxes1[..., :, None, 2:],
+                        bboxes2[..., None, :, 2:])  # [B, rows, cols, 2]
+
+        wh = (rb - lt).clip(min=0)  # [B, rows, cols, 2]
+        overlap = wh[..., 0] * wh[..., 1]
+
+        if mode in ['iou', 'giou']:
+            union = area1[..., None] + area2[..., None, :] - overlap
+        else:
+            union = area1[..., None]
+        if mode == 'giou':
+            # corners of the smallest box enclosing each pair
+            enclosed_lt = np.minimum(bboxes1[..., :, None, :2],
+                                     bboxes2[..., None, :, :2])
+            enclosed_rb = np.maximum(bboxes1[..., :, None, 2:],
+                                     bboxes2[..., None, :, 2:])
+
+    eps = np.array([eps])
+    # Keep the denominator at least eps to avoid division by zero.
+    union = np.maximum(union, eps)
+    ious = overlap / union
+    if mode in ['iou', 'iof']:
+        return ious
+    # calculate gious
+    enclose_wh = (enclosed_rb - enclosed_lt).clip(min=0)
+    enclose_area = enclose_wh[..., 0] * enclose_wh[..., 1]
+    enclose_area = np.maximum(enclose_area, eps)
+    gious = ious - (enclose_area - union) / enclose_area
+    return gious
+
+
+def topk_(input, k, axis=1, largest=True):
+    """Select `k` extreme values of a 2-D array along `axis`.
+
+    Args:
+        input (np.ndarray): 2-D array to select from.
+        k (int): number of elements to select along `axis`; it may equal the
+            length of that axis.
+        axis (int): 0 selects per column, any other value selects per row.
+        largest (bool): select the largest values if True, else the smallest.
+
+    Returns:
+        tuple: `(values, indices)`. For `axis=0` both have shape (k, cols)
+        and the values are ordered ascending when `largest` is False and
+        descending when it is True. Otherwise both have shape (rows, k) and
+        the values are ordered ascending in either case.
+    """
+    # Work on the negated array so that "largest" becomes "smallest".
+    x = -input if largest else input
+    # np.argpartition requires kth < axis length; clamping lets k equal the
+    # axis length without changing the result for smaller k.
+    kth = min(k, x.shape[axis] - 1)
+    if axis == 0:
+        row_index = np.arange(input.shape[1 - axis])
+        topk_index = np.argpartition(x, kth, axis=axis)[0:k, :]
+        topk_data = x[topk_index, row_index]
+
+        topk_index_sort = np.argsort(topk_data, axis=axis)
+        topk_data_sort = topk_data[topk_index_sort, row_index]
+        # Undo the sign flip so the returned values are those of `input`.
+        topk_data_sort = -topk_data_sort if largest else topk_data_sort
+        topk_index_sort = topk_index[0:k, :][topk_index_sort, row_index]
+    else:
+        column_index = np.arange(x.shape[1 - axis])[:, None]
+        topk_index = np.argpartition(x, kth, axis=axis)[:, 0:k]
+        topk_data = x[column_index, topk_index]
+        # Undo the sign flip before sorting.
+        topk_data = -topk_data if largest else topk_data
+        topk_index_sort = np.argsort(topk_data, axis=axis)
+        topk_data_sort = topk_data[column_index, topk_index_sort]
+        topk_index_sort = topk_index[:, 0:k][column_index, topk_index_sort]
+
+    return topk_data_sort, topk_index_sort
+
+
+class ATSSAssigner(object):
+    """Assign a corresponding gt bbox or background to each bbox.
+
+    Each proposals will be assigned with `0` or a positive integer
+    indicating the ground truth index.
+
+    - 0: negative sample, no assigned gt
+    - positive integer: positive sample, index (1-based) of assigned gt
+
+    Args:
+        topk (float): number of bbox selected in each level
+    """
+
+    def __init__(self, topk=9):
+        self.topk = topk
+
+    def __call__(self,
+                 bboxes,
+                 num_level_bboxes,
+                 gt_bboxes,
+                 gt_bboxes_ignore=None,
+                 gt_labels=None):
+        """Assign gt to bboxes.
+        The assignment is done in following steps
+        1. compute iou between all bbox (bbox of all pyramid levels) and gt
+        2. compute center distance between all bbox and gt
+        3. on each pyramid level, for each gt, select k bbox whose center
+           are closest to the gt center, so we total select k*l bbox as
+           candidates for each gt
+        4. get corresponding iou for the these candidates, and compute the
+           mean and std, set mean + std as the iou threshold
+        5. select these candidates whose iou are greater than or equal to
+           the threshold as postive
+        6. limit the positive sample's center in gt
+        Args:
+            bboxes (np.array): Bounding boxes to be assigned, shape(n, 4).
+            num_level_bboxes (List): num of bboxes in each level
+            gt_bboxes (np.array): Groundtruth boxes, shape (k, 4).
+            gt_bboxes_ignore (np.array, optional): Ground truth bboxes that are
+                labelled as `ignored`, e.g., crowd boxes in COCO.
+            gt_labels (np.array, optional): Label of gt_bboxes, shape (k, ).
+        """
+        bboxes = bboxes[:, :4]
+        num_gt, num_bboxes = gt_bboxes.shape[0], bboxes.shape[0]
+
+        # assign 0 by default
+        assigned_gt_inds = np.zeros((num_bboxes, ), dtype=np.int64)
+
+        if num_gt == 0 or num_bboxes == 0:
+            # No ground truth or boxes, return empty assignment
+            max_overlaps = np.zeros((num_bboxes, ))
+            if num_gt == 0:
+                # No truth, assign everything to background
+                assigned_gt_inds[:] = 0
+            if not np.any(gt_labels):
+                assigned_labels = None
+            else:
+                assigned_labels = -np.ones((num_bboxes, ), dtype=np.int64)
+            return assigned_gt_inds, max_overlaps
+
+        # compute iou between all bbox and gt
+        overlaps = bbox_overlaps(bboxes, gt_bboxes)
+        # compute center distance between all bbox and gt
+        gt_cx = (gt_bboxes[:, 0] + gt_bboxes[:, 2]) / 2.0
+        gt_cy = (gt_bboxes[:, 1] + gt_bboxes[:, 3]) / 2.0
+        gt_points = np.stack((gt_cx, gt_cy), axis=1)
+
+        bboxes_cx = (bboxes[:, 0] + bboxes[:, 2]) / 2.0
+        bboxes_cy = (bboxes[:, 1] + bboxes[:, 3]) / 2.0
+        bboxes_points = np.stack((bboxes_cx, bboxes_cy), axis=1)
+
+        distances = np.sqrt(
+            np.power((bboxes_points[:, None, :] - gt_points[None, :, :]), 2)
+            .sum(-1))
+
+        # Selecting candidates based on the center distance
+        candidate_idxs = []
+        start_idx = 0
+        for bboxes_per_level in num_level_bboxes:
+            # on each pyramid level, for each gt,
+            # select k bbox whose center are closest to the gt center
+            end_idx = start_idx + bboxes_per_level
+            distances_per_level = distances[start_idx:end_idx, :]
+            selectable_k = min(self.topk, bboxes_per_level)
+            _, topk_idxs_per_level = topk_(
+                distances_per_level, selectable_k, axis=0, largest=False)
+            candidate_idxs.append(topk_idxs_per_level + start_idx)
+            start_idx = end_idx
+        candidate_idxs = np.concatenate(candidate_idxs, axis=0)
+
+        # get corresponding iou for the these candidates, and compute the
+        # mean and std, set mean + std as the iou threshold
+        candidate_overlaps = overlaps[candidate_idxs, np.arange(num_gt)]
+        overlaps_mean_per_gt = candidate_overlaps.mean(0)
+        overlaps_std_per_gt = candidate_overlaps.std(0)
+        overlaps_thr_per_gt = overlaps_mean_per_gt + overlaps_std_per_gt
+
+        is_pos = candidate_overlaps >= overlaps_thr_per_gt[None, :]
+
+        # limit the positive sample's center in gt
+        for gt_idx in range(num_gt):
+            candidate_idxs[:, gt_idx] += gt_idx * num_bboxes
+        ep_bboxes_cx = np.broadcast_to(
+            bboxes_cx.reshape(1, -1), [num_gt, num_bboxes]).reshape(-1)
+        ep_bboxes_cy = np.broadcast_to(
+            bboxes_cy.reshape(1, -1), [num_gt, num_bboxes]).reshape(-1)
+        candidate_idxs = candidate_idxs.reshape(-1)
+
+        # calculate the left, top, right, bottom distance between positive
+        # bbox center and gt side
+        l_ = ep_bboxes_cx[candidate_idxs].reshape(-1, num_gt) - gt_bboxes[:, 0]
+        t_ = ep_bboxes_cy[candidate_idxs].reshape(-1, num_gt) - gt_bboxes[:, 1]
+        r_ = gt_bboxes[:, 2] - ep_bboxes_cx[candidate_idxs].reshape(-1, num_gt)
+        b_ = gt_bboxes[:, 3] - ep_bboxes_cy[candidate_idxs].reshape(-1, num_gt)
+        is_in_gts = np.stack([l_, t_, r_, b_], axis=1).min(axis=1) > 0.01
+        is_pos = is_pos & is_in_gts
+
+        # if an anchor box is assigned to multiple gts,
+        # the one with the highest IoU will be selected.
+        overlaps_inf = -np.inf * np.ones_like(overlaps).T.reshape(-1)
+        index = candidate_idxs.reshape(-1)[is_pos.reshape(-1)]
+        overlaps_inf[index] = overlaps.T.reshape(-1)[index]
+        overlaps_inf = overlaps_inf.reshape(num_gt, -1).T
+
+        max_overlaps = overlaps_inf.max(axis=1)
+        argmax_overlaps = overlaps_inf.argmax(axis=1)
+        assigned_gt_inds[max_overlaps !=
+                         -np.inf] = argmax_overlaps[max_overlaps !=
+                                                    -np.inf] + 1
+
+        return assigned_gt_inds, max_overlaps

+ 1591 - 0
paddlers/models/ppdet/data/transform/autoaugment_utils.py

@@ -0,0 +1,1591 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# Reference:
+#   https://github.com/tensorflow/tpu/blob/master/models/official/detection/utils/autoaugment_utils.py
+"""AutoAugment util file."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import inspect
+import math
+from PIL import Image, ImageEnhance
+import numpy as np
+import cv2
+from copy import deepcopy
+
+# Largest integer magnitude ("level") that the controller RNN could predict
+# for the augmentation scheme.
+_MAX_LEVEL = 10.
+
+# Placeholder bounding box (all four coordinates are -1) used to seed/pad
+# lists of bounding box coordinates for a few augmentation operations.
+# `_concat_bbox` recognises it through its coordinate sum of -4.
+_INVALID_BOX = [[-1.0, -1.0, -1.0, -1.0]]
+
+
+def policy_v0():
+    """Autoaugment policy that was used in AutoAugment Detection Paper."""
+    # Each tuple is an augmentation operation of the form
+    # (operation, probability, magnitude). Each element in policy is a
+    # sub-policy that will be applied sequentially on the image.
+    policy = [
+        [('TranslateX_BBox', 0.6, 4), ('Equalize', 0.8, 10)],
+        [('TranslateY_Only_BBoxes', 0.2, 2), ('Cutout', 0.8, 8)],
+        [('Sharpness', 0.0, 8), ('ShearX_BBox', 0.4, 0)],
+        [('ShearY_BBox', 1.0, 2), ('TranslateY_Only_BBoxes', 0.6, 6)],
+        [('Rotate_BBox', 0.6, 10), ('Color', 1.0, 6)],
+    ]
+    return policy
+
+
+def policy_v1():
+    """Autoaugment policy that was used in AutoAugment Detection Paper."""
+    # Each tuple is an augmentation operation of the form
+    # (operation, probability, magnitude). Each element in policy is a
+    # sub-policy that will be applied sequentially on the image.
+    policy = [
+        [('TranslateX_BBox', 0.6, 4), ('Equalize', 0.8, 10)],
+        [('TranslateY_Only_BBoxes', 0.2, 2), ('Cutout', 0.8, 8)],
+        [('Sharpness', 0.0, 8), ('ShearX_BBox', 0.4, 0)],
+        [('ShearY_BBox', 1.0, 2), ('TranslateY_Only_BBoxes', 0.6, 6)],
+        [('Rotate_BBox', 0.6, 10), ('Color', 1.0, 6)],
+        [('Color', 0.0, 0), ('ShearX_Only_BBoxes', 0.8, 4)],
+        [('ShearY_Only_BBoxes', 0.8, 2), ('Flip_Only_BBoxes', 0.0, 10)],
+        [('Equalize', 0.6, 10), ('TranslateX_BBox', 0.2, 2)],
+        [('Color', 1.0, 10), ('TranslateY_Only_BBoxes', 0.4, 6)],
+        [('Rotate_BBox', 0.8, 10), ('Contrast', 0.0, 10)],  # ,
+        [('Cutout', 0.2, 2), ('Brightness', 0.8, 10)],
+        [('Color', 1.0, 6), ('Equalize', 1.0, 2)],
+        [('Cutout_Only_BBoxes', 0.4, 6), ('TranslateY_Only_BBoxes', 0.8, 2)],
+        [('Color', 0.2, 8), ('Rotate_BBox', 0.8, 10)],
+        [('Sharpness', 0.4, 4), ('TranslateY_Only_BBoxes', 0.0, 4)],
+        [('Sharpness', 1.0, 4), ('SolarizeAdd', 0.4, 4)],
+        [('Rotate_BBox', 1.0, 8), ('Sharpness', 0.2, 8)],
+        [('ShearY_BBox', 0.6, 10), ('Equalize_Only_BBoxes', 0.6, 8)],
+        [('ShearX_BBox', 0.2, 6), ('TranslateY_Only_BBoxes', 0.2, 10)],
+        [('SolarizeAdd', 0.6, 8), ('Brightness', 0.8, 10)],
+    ]
+    return policy
+
+
+def policy_vtest():
+    """Autoaugment test policy for debugging."""
+    # Each tuple is an augmentation operation of the form
+    # (operation, probability, magnitude). Each element in policy is a
+    # sub-policy that will be applied sequentially on the image.
+    policy = [[('TranslateX_BBox', 1.0, 4), ('Equalize', 1.0, 10)], ]
+    return policy
+
+
+def policy_v2():
+    """Additional policy that performs well on object detection."""
+    # Each tuple is an augmentation operation of the form
+    # (operation, probability, magnitude). Each element in policy is a
+    # sub-policy that will be applied sequentially on the image.
+    policy = [
+        [('Color', 0.0, 6), ('Cutout', 0.6, 8), ('Sharpness', 0.4, 8)],
+        [('Rotate_BBox', 0.4, 8), ('Sharpness', 0.4, 2),
+         ('Rotate_BBox', 0.8, 10)],
+        [('TranslateY_BBox', 1.0, 8), ('AutoContrast', 0.8, 2)],
+        [('AutoContrast', 0.4, 6), ('ShearX_BBox', 0.8, 8),
+         ('Brightness', 0.0, 10)],
+        [('SolarizeAdd', 0.2, 6), ('Contrast', 0.0, 10),
+         ('AutoContrast', 0.6, 0)],
+        [('Cutout', 0.2, 0), ('Solarize', 0.8, 8), ('Color', 1.0, 4)],
+        [('TranslateY_BBox', 0.0, 4), ('Equalize', 0.6, 8),
+         ('Solarize', 0.0, 10)],
+        [('TranslateY_BBox', 0.2, 2), ('ShearY_BBox', 0.8, 8),
+         ('Rotate_BBox', 0.8, 8)],
+        [('Cutout', 0.8, 8), ('Brightness', 0.8, 8), ('Cutout', 0.2, 2)],
+        [('Color', 0.8, 4), ('TranslateY_BBox', 1.0, 6),
+         ('Rotate_BBox', 0.6, 6)],
+        [('Rotate_BBox', 0.6, 10), ('BBox_Cutout', 1.0, 4),
+         ('Cutout', 0.2, 8)],
+        [('Rotate_BBox', 0.0, 0), ('Equalize', 0.6, 6),
+         ('ShearY_BBox', 0.6, 8)],
+        [('Brightness', 0.8, 8), ('AutoContrast', 0.4, 2),
+         ('Brightness', 0.2, 2)],
+        [('TranslateY_BBox', 0.4, 8), ('Solarize', 0.4, 6),
+         ('SolarizeAdd', 0.2, 10)],
+        [('Contrast', 1.0, 10), ('SolarizeAdd', 0.2, 8), ('Equalize', 0.2, 4)],
+    ]
+    return policy
+
+
+def policy_v3():
+    """"Additional policy that performs well on object detection."""
+    # Each tuple is an augmentation operation of the form
+    # (operation, probability, magnitude). Each element in policy is a
+    # sub-policy that will be applied sequentially on the image.
+    policy = [
+        [('Posterize', 0.8, 2), ('TranslateX_BBox', 1.0, 8)],
+        [('BBox_Cutout', 0.2, 10), ('Sharpness', 1.0, 8)],
+        [('Rotate_BBox', 0.6, 8), ('Rotate_BBox', 0.8, 10)],
+        [('Equalize', 0.8, 10), ('AutoContrast', 0.2, 10)],
+        [('SolarizeAdd', 0.2, 2), ('TranslateY_BBox', 0.2, 8)],
+        [('Sharpness', 0.0, 2), ('Color', 0.4, 8)],
+        [('Equalize', 1.0, 8), ('TranslateY_BBox', 1.0, 8)],
+        [('Posterize', 0.6, 2), ('Rotate_BBox', 0.0, 10)],
+        [('AutoContrast', 0.6, 0), ('Rotate_BBox', 1.0, 6)],
+        [('Equalize', 0.0, 4), ('Cutout', 0.8, 10)],
+        [('Brightness', 1.0, 2), ('TranslateY_BBox', 1.0, 6)],
+        [('Contrast', 0.0, 2), ('ShearY_BBox', 0.8, 0)],
+        [('AutoContrast', 0.8, 10), ('Contrast', 0.2, 10)],
+        [('Rotate_BBox', 1.0, 10), ('Cutout', 1.0, 10)],
+        [('SolarizeAdd', 0.8, 6), ('Equalize', 0.8, 8)],
+    ]
+    return policy
+
+
+def _equal(val1, val2, eps=1e-8):
+    return abs(val1 - val2) <= eps
+
+
+def blend(image1, image2, factor):
+    """Blend image1 and image2 using 'factor'.
+
+    Factor can be above 0.0.    A value of 0.0 means only image1 is used.
+    A value of 1.0 means only image2 is used.    A value between 0.0 and
+    1.0 means we linearly interpolate the pixel values between the two
+    images.    A value greater than 1.0 "extrapolates" the difference
+    between the two pixel values, and we clip the results to values
+    between 0 and 255.
+
+    Args:
+        image1: An image Tensor of type uint8.
+        image2: An image Tensor of type uint8.
+        factor: A floating point value above 0.0.
+
+    Returns:
+        A blended image Tensor of type uint8.
+    """
+    if factor == 0.0:
+        return image1
+    if factor == 1.0:
+        return image2
+
+    image1 = image1.astype(np.float32)
+    image2 = image2.astype(np.float32)
+
+    difference = image2 - image1
+    scaled = factor * difference
+
+    # Do addition in float.
+    temp = image1 + scaled
+
+    # Interpolate
+    if factor > 0.0 and factor < 1.0:
+        # Interpolation means we always stay within 0 and 255.
+        return temp.astype(np.uint8)
+
+    # Extrapolate:
+    #
+    # We need to clip and then cast.
+    return np.clip(temp, a_min=0, a_max=255).astype(np.uint8)
+
+
+def cutout(image, pad_size, replace=0):
+    """Apply cutout (https://arxiv.org/abs/1708.04552) to image.
+
+    This operation applies a (2*pad_size x 2*pad_size) mask of zeros to
+    a random location within `img`. The pixel values filled in will be of the
+    value `replace`. The located where the mask will be applied is randomly
+    chosen uniformly over the whole image.
+
+    Args:
+        image: An image Tensor of type uint8.
+        pad_size: Specifies how big the zero mask that will be generated is that
+            is applied to the image. The mask will be of size
+            (2*pad_size x 2*pad_size).
+        replace: What pixel value to fill in the image in the area that has
+            the cutout mask applied to it.
+
+    Returns:
+        An image Tensor that is of type uint8.
+    Example:
+        img = cv2.imread( "/home/vis/gry/train/img_data/test.jpg", cv2.COLOR_BGR2RGB )
+        new_img = cutout(img, pad_size=50, replace=0)
+    """
+    image_height, image_width = image.shape[0], image.shape[1]
+
+    cutout_center_height = np.random.randint(low=0, high=image_height)
+    cutout_center_width = np.random.randint(low=0, high=image_width)
+
+    lower_pad = np.maximum(0, cutout_center_height - pad_size)
+    upper_pad = np.maximum(0, image_height - cutout_center_height - pad_size)
+    left_pad = np.maximum(0, cutout_center_width - pad_size)
+    right_pad = np.maximum(0, image_width - cutout_center_width - pad_size)
+
+    cutout_shape = [
+        image_height - (lower_pad + upper_pad),
+        image_width - (left_pad + right_pad)
+    ]
+    padding_dims = [[lower_pad, upper_pad], [left_pad, right_pad]]
+    mask = np.pad(np.zeros(
+        cutout_shape, dtype=image.dtype),
+                  padding_dims,
+                  'constant',
+                  constant_values=1)
+    mask = np.expand_dims(mask, -1)
+    mask = np.tile(mask, [1, 1, 3])
+    image = np.where(
+        np.equal(mask, 0),
+        np.ones_like(
+            image, dtype=image.dtype) * replace,
+        image)
+    return image.astype(np.uint8)
+
+
+def solarize(image, threshold=128):
+    # For each pixel in the image, select the pixel
+    # if the value is less than the threshold.
+    # Otherwise, subtract 255 from the pixel.
+    return np.where(image < threshold, image, 255 - image)
+
+
+def solarize_add(image, addition=0, threshold=128):
+    # For each pixel in the image less than threshold
+    # we add 'addition' amount to it and then clip the
+    # pixel value to be between 0 and 255. The value
+    # of 'addition' is between -128 and 128.
+    added_image = image.astype(np.int64) + addition
+    added_image = np.clip(added_image, a_min=0, a_max=255).astype(np.uint8)
+    return np.where(image < threshold, added_image, image)
+
+
+def color(image, factor):
+    """use cv2 to deal"""
+    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
+    degenerate = cv2.cvtColor(gray, cv2.COLOR_GRAY2BGR)
+    return blend(degenerate, image, factor)
+
+
+# refer to https://github.com/4uiiurz1/pytorch-auto-augment/blob/024b2eac4140c38df8342f09998e307234cafc80/auto_augment.py#L197
+def contrast(img, factor):
+    img = ImageEnhance.Contrast(Image.fromarray(img)).enhance(factor)
+    return np.array(img)
+
+
+def brightness(image, factor):
+    """Equivalent of PIL Brightness."""
+    degenerate = np.zeros_like(image)
+    return blend(degenerate, image, factor)
+
+
+def posterize(image, bits):
+    """Equivalent of PIL Posterize."""
+    shift = 8 - bits
+    return np.left_shift(np.right_shift(image, shift), shift)
+
+
+def rotate(image, degrees, replace):
+    """Rotates the image by degrees either clockwise or counterclockwise.
+
+    Args:
+        image: An image Tensor of type uint8.
+        degrees: Float, a scalar angle in degrees to rotate all images by. If
+            degrees is positive the image will be rotated clockwise otherwise it will
+            be rotated counterclockwise.
+        replace: A one or three value 1D tensor to fill empty pixels caused by
+            the rotate operation.
+
+    Returns:
+        The rotated version of image.
+    """
+    image = wrap(image)
+    image = Image.fromarray(image)
+    image = image.rotate(degrees)
+    image = np.array(image, dtype=np.uint8)
+    return unwrap(image, replace)
+
+
+def random_shift_bbox(image,
+                      bbox,
+                      pixel_scaling,
+                      replace,
+                      new_min_bbox_coords=None):
+    """Move the bbox and the image content to a slightly new random location.
+
+    Args:
+        image: 3D uint8 Tensor.
+        bbox: 1D Tensor that has 4 elements (min_y, min_x, max_y, max_x)
+            of type float that represents the normalized coordinates between 0 and 1.
+            The potential values for the new min corner of the bbox will be between
+            [old_min - pixel_scaling * bbox_height/2,
+             old_min - pixel_scaling * bbox_height/2].
+        pixel_scaling: A float between 0 and 1 that specifies the pixel range
+            that the new bbox location will be sampled from.
+        replace: A one or three value 1D tensor to fill empty pixels.
+        new_min_bbox_coords: If not None, then this is a tuple that specifies the
+            (min_y, min_x) coordinates of the new bbox. Normally this is randomly
+            specified, but this allows it to be manually set. The coordinates are
+            the absolute coordinates between 0 and image height/width and are int32.
+
+    Returns:
+        The new image that will have the shifted bbox location in it along with
+        the new bbox that contains the new coordinates.
+    """
+    # Obtains image height and width and create helper clip functions.
+    image_height, image_width = image.shape[0], image.shape[1]
+    image_height = float(image_height)
+    image_width = float(image_width)
+
+    def clip_y(val):
+        return np.clip(val, a_min=0, a_max=image_height - 1).astype(np.int32)
+
+    def clip_x(val):
+        return np.clip(val, a_min=0, a_max=image_width - 1).astype(np.int32)
+
+    # Convert bbox to pixel coordinates.
+    min_y = int(image_height * bbox[0])
+    min_x = int(image_width * bbox[1])
+    max_y = clip_y(image_height * bbox[2])
+    max_x = clip_x(image_width * bbox[3])
+
+    bbox_height, bbox_width = (max_y - min_y + 1, max_x - min_x + 1)
+    image_height = int(image_height)
+    image_width = int(image_width)
+
+    # Select the new min/max bbox ranges that are used for sampling the
+    # new min x/y coordinates of the shifted bbox.
+    minval_y = clip_y(min_y - np.int32(pixel_scaling * float(bbox_height) /
+                                       2.0))
+    maxval_y = clip_y(min_y + np.int32(pixel_scaling * float(bbox_height) /
+                                       2.0))
+    minval_x = clip_x(min_x - np.int32(pixel_scaling * float(bbox_width) /
+                                       2.0))
+    maxval_x = clip_x(min_x + np.int32(pixel_scaling * float(bbox_width) /
+                                       2.0))
+
+    # Sample and calculate the new unclipped min/max coordinates of the new bbox.
+    if new_min_bbox_coords is None:
+        unclipped_new_min_y = np.random.randint(
+            low=minval_y, high=maxval_y, dtype=np.int32)
+        unclipped_new_min_x = np.random.randint(
+            low=minval_x, high=maxval_x, dtype=np.int32)
+    else:
+        unclipped_new_min_y, unclipped_new_min_x = (
+            clip_y(new_min_bbox_coords[0]), clip_x(new_min_bbox_coords[1]))
+    unclipped_new_max_y = unclipped_new_min_y + bbox_height - 1
+    unclipped_new_max_x = unclipped_new_min_x + bbox_width - 1
+
+    # Determine if any of the new bbox was shifted outside the current image.
+    # This is used for determining if any of the original bbox content should be
+    # discarded.
+    new_min_y, new_min_x, new_max_y, new_max_x = (
+        clip_y(unclipped_new_min_y), clip_x(unclipped_new_min_x),
+        clip_y(unclipped_new_max_y), clip_x(unclipped_new_max_x))
+    shifted_min_y = (new_min_y - unclipped_new_min_y) + min_y
+    shifted_max_y = max_y - (unclipped_new_max_y - new_max_y)
+    shifted_min_x = (new_min_x - unclipped_new_min_x) + min_x
+    shifted_max_x = max_x - (unclipped_new_max_x - new_max_x)
+
+    # Create the new bbox tensor by converting pixel integer values to floats.
+    new_bbox = np.stack([
+        float(new_min_y) / float(image_height), float(new_min_x) /
+        float(image_width), float(new_max_y) / float(image_height),
+        float(new_max_x) / float(image_width)
+    ])
+
+    # Copy the contents in the bbox and fill the old bbox location
+    # with gray (128).
+    bbox_content = image[shifted_min_y:shifted_max_y + 1, shifted_min_x:
+                         shifted_max_x + 1, :]
+
+    def mask_and_add_image(min_y_, min_x_, max_y_, max_x_, mask,
+                           content_tensor, image_):
+        """Applies mask to bbox region in image then adds content_tensor to it."""
+        mask = np.pad(mask, [[min_y_, (image_height - 1) - max_y_],
+                             [min_x_, (image_width - 1) - max_x_], [0, 0]],
+                      'constant',
+                      constant_values=1)
+
+        content_tensor = np.pad(content_tensor,
+                                [[min_y_, (image_height - 1) - max_y_],
+                                 [min_x_, (image_width - 1) - max_x_], [0, 0]],
+                                'constant',
+                                constant_values=0)
+        return image_ * mask + content_tensor
+
+    # Zero out original bbox location.
+    mask = np.zeros_like(image)[min_y:max_y + 1, min_x:max_x + 1, :]
+    grey_tensor = np.zeros_like(mask) + replace[0]
+    image = mask_and_add_image(min_y, min_x, max_y, max_x, mask, grey_tensor,
+                               image)
+
+    # Fill in bbox content to new bbox location.
+    mask = np.zeros_like(bbox_content)
+    image = mask_and_add_image(new_min_y, new_min_x, new_max_y, new_max_x,
+                               mask, bbox_content, image)
+
+    return image.astype(np.uint8), new_bbox
+
+
+def _clip_bbox(min_y, min_x, max_y, max_x):
+    """Clip bounding box coordinates between 0 and 1.
+
+    Args:
+        min_y: Normalized bbox coordinate of type float between 0 and 1.
+        min_x: Normalized bbox coordinate of type float between 0 and 1.
+        max_y: Normalized bbox coordinate of type float between 0 and 1.
+        max_x: Normalized bbox coordinate of type float between 0 and 1.
+
+    Returns:
+        Clipped coordinate values between 0 and 1.
+    """
+    min_y = np.clip(min_y, a_min=0, a_max=1.0)
+    min_x = np.clip(min_x, a_min=0, a_max=1.0)
+    max_y = np.clip(max_y, a_min=0, a_max=1.0)
+    max_x = np.clip(max_x, a_min=0, a_max=1.0)
+    return min_y, min_x, max_y, max_x
+
+
+def _check_bbox_area(min_y, min_x, max_y, max_x, delta=0.05):
+    """Adjusts bbox coordinates to make sure the area is > 0.
+
+    Args:
+        min_y: Normalized bbox coordinate of type float between 0 and 1.
+        min_x: Normalized bbox coordinate of type float between 0 and 1.
+        max_y: Normalized bbox coordinate of type float between 0 and 1.
+        max_x: Normalized bbox coordinate of type float between 0 and 1.
+        delta: Float, this is used to create a gap of size 2 * delta between
+            bbox min/max coordinates that are the same on the boundary.
+            This prevents the bbox from having an area of zero.
+
+    Returns:
+        Tuple of new bbox coordinates between 0 and 1 that will now have a
+        guaranteed area > 0.
+    """
+    height = max_y - min_y
+    width = max_x - min_x
+
+    def _adjust_bbox_boundaries(min_coord, max_coord):
+        # Make sure max is never 0 and min is never 1.
+        max_coord = np.maximum(max_coord, 0.0 + delta)
+        min_coord = np.minimum(min_coord, 1.0 - delta)
+        return min_coord, max_coord
+
+    if _equal(height, 0):
+        min_y, max_y = _adjust_bbox_boundaries(min_y, max_y)
+
+    if _equal(width, 0):
+        min_x, max_x = _adjust_bbox_boundaries(min_x, max_x)
+
+    return min_y, min_x, max_y, max_x
+
+
+def _scale_bbox_only_op_probability(prob):
+    """Reduce the probability of the bbox-only operation.
+
+    Probability is reduced so that we do not distort the content of too many
+    bounding boxes that are close to each other. The value of 3.0 was a chosen
+    hyper parameter when designing the autoaugment algorithm that we found
+    empirically to work well.
+
+    Args:
+        prob: Float that is the probability of applying the bbox-only operation.
+
+    Returns:
+        Reduced probability.
+    """
+    return prob / 3.0
+
+
+def _apply_bbox_augmentation(image, bbox, augmentation_func, *args):
+    """Applies augmentation_func to the subsection of image indicated by bbox.
+
+    Args:
+        image: 3D uint8 Tensor.
+        bbox: 1D Tensor that has 4 elements (min_y, min_x, max_y, max_x)
+            of type float that represents the normalized coordinates between 0 and 1.
+        augmentation_func: Augmentation function that will be applied to the
+            subsection of image.
+        *args: Additional parameters that will be passed into augmentation_func
+            when it is called.
+
+    Returns:
+        A modified version of image, where the bbox location in the image will
+        have `ugmentation_func applied to it.
+    """
+    image_height = image.shape[0]
+    image_width = image.shape[1]
+
+    min_y = int(image_height * bbox[0])
+    min_x = int(image_width * bbox[1])
+    max_y = int(image_height * bbox[2])
+    max_x = int(image_width * bbox[3])
+
+    # Clip to be sure the max values do not fall out of range.
+    max_y = np.minimum(max_y, image_height - 1)
+    max_x = np.minimum(max_x, image_width - 1)
+
+    # Get the sub-tensor that is the image within the bounding box region.
+    bbox_content = image[min_y:max_y + 1, min_x:max_x + 1, :]
+
+    # Apply the augmentation function to the bbox portion of the image.
+    augmented_bbox_content = augmentation_func(bbox_content, *args)
+
+    # Pad the augmented_bbox_content and the mask to match the shape of original
+    # image.
+    augmented_bbox_content = np.pad(
+        augmented_bbox_content, [[min_y, (image_height - 1) - max_y],
+                                 [min_x, (image_width - 1) - max_x], [0, 0]],
+        'constant',
+        constant_values=1)
+
+    # Create a mask that will be used to zero out a part of the original image.
+    mask_tensor = np.zeros_like(bbox_content)
+
+    mask_tensor = np.pad(mask_tensor,
+                         [[min_y, (image_height - 1) - max_y],
+                          [min_x, (image_width - 1) - max_x], [0, 0]],
+                         'constant',
+                         constant_values=1)
+    # Replace the old bbox content with the new augmented content.
+    image = image * mask_tensor + augmented_bbox_content
+    return image.astype(np.uint8)
+
+
+def _concat_bbox(bbox, bboxes):
+    """Helper function that concates bbox to bboxes along the first dimension."""
+
+    # Note if all elements in bboxes are -1 (_INVALID_BOX), then this means
+    # we discard bboxes and start the bboxes Tensor with the current bbox.
+    bboxes_sum_check = np.sum(bboxes)
+    bbox = np.expand_dims(bbox, 0)
+    # This check will be true when it is an _INVALID_BOX
+    if _equal(bboxes_sum_check, -4):
+        bboxes = bbox
+    else:
+        bboxes = np.concatenate([bboxes, bbox], 0)
+    return bboxes
+
+
+def _apply_bbox_augmentation_wrapper(image, bbox, new_bboxes, prob,
+                                     augmentation_func, func_changes_bbox,
+                                     *args):
+    """Applies _apply_bbox_augmentation with probability prob.
+
+    Args:
+        image: 3D uint8 Tensor.
+        bbox: 1D Tensor that has 4 elements (min_y, min_x, max_y, max_x)
+            of type float that represents the normalized coordinates between 0 and 1.
+        new_bboxes: 2D Tensor that is a list of the bboxes in the image after they
+            have been altered by aug_func. These will only be changed when
+            func_changes_bbox is set to true. Each bbox has 4 elements
+            (min_y, min_x, max_y, max_x) of type float that are the normalized
+            bbox coordinates between 0 and 1.
+        prob: Float that is the probability of applying _apply_bbox_augmentation.
+        augmentation_func: Augmentation function that will be applied to the
+            subsection of image.
+        func_changes_bbox: Boolean. Does augmentation_func return bbox in addition
+            to image.
+        *args: Additional parameters that will be passed into augmentation_func
+            when it is called.
+
+    Returns:
+        A tuple. Fist element is a modified version of image, where the bbox
+        location in the image will have augmentation_func applied to it if it is
+        chosen to be called with probability `prob`. The second element is a
+        Tensor of Tensors of length 4 that will contain the altered bbox after
+        applying augmentation_func.
+    """
+    should_apply_op = (np.random.rand() + prob >= 1)
+    if func_changes_bbox:
+        if should_apply_op:
+            augmented_image, bbox = augmentation_func(image, bbox, *args)
+        else:
+            augmented_image, bbox = (image, bbox)
+    else:
+        if should_apply_op:
+            augmented_image = _apply_bbox_augmentation(
+                image, bbox, augmentation_func, *args)
+        else:
+            augmented_image = image
+    new_bboxes = _concat_bbox(bbox, new_bboxes)
+    return augmented_image.astype(np.uint8), new_bboxes
+
+
+def _apply_multi_bbox_augmentation(image, bboxes, prob, aug_func,
+                                   func_changes_bbox, *args):
+    """Runs aug_func on the image once per bbox, threading the image through.
+
+    Args:
+        image: 3D uint8 Tensor.
+        bboxes: 2D Tensor that is a list of the bboxes in the image. Each bbox
+            has 4 elements (min_y, min_x, max_y, max_x) of type float.
+        prob: Float that is the probability of applying aug_func to a specific
+            bounding box within the image.
+        aug_func: Augmentation function that will be applied to the
+            subsections of image indicated by the bbox values in bboxes.
+        func_changes_bbox: Boolean. Does aug_func return bbox in addition
+            to image.
+        *args: Additional parameters that will be passed into aug_func
+            when it is called.
+
+    Returns:
+        A tuple (image, bboxes). The image has had aug_func offered to every
+        bbox location, each one chosen independently with probability prob.
+        The bboxes are the ones accumulated from aug_func when
+        func_changes_bbox is true, otherwise the bboxes that were looped over.
+    """
+    # Accumulator for the bboxes produced while looping. It starts out as the
+    # dummy _INVALID_BOX; the per-bbox wrapper is expected to drop that
+    # placeholder once a real bbox is appended (handled outside this function).
+    accumulated_bboxes = np.array(_INVALID_BOX)
+
+    # An empty bbox array is swapped for _INVALID_BOX so the loop below still
+    # has a well-formed row to work on. The result will be thrown away.
+    if bboxes.size == 0:
+        bboxes = np.array((_INVALID_BOX))
+
+    assert bboxes.shape[1] == 4, "bboxes.shape[1] must be 4!!!!"
+
+    # The bboxes are intentionally NOT shuffled: they carry no class
+    # information here, so reordering them would break the pairing with
+    # their labels.
+    loop_bboxes = deepcopy(bboxes)
+
+    # Apply the augmentation bbox by bbox; every step receives the image and
+    # the accumulated bboxes produced by the previous step.
+    for bbox_idx in range(bboxes.shape[0]):
+        image, accumulated_bboxes = _apply_bbox_augmentation_wrapper(
+            image, loop_bboxes[bbox_idx], accumulated_bboxes, prob, aug_func,
+            func_changes_bbox, *args)
+
+    # Only hand back the accumulated bboxes when aug_func actually alters them.
+    return image, (accumulated_bboxes if func_changes_bbox else bboxes)
+
+
+def _apply_multi_bbox_augmentation_wrapper(image, bboxes, prob, aug_func,
+                                           func_changes_bbox, *args):
+    """Applies the per-bbox augmentation only when there is at least one bbox.
+
+    The inputs are deep-copied first, so the caller's image and bboxes are
+    never modified in place. With no bboxes the copies are returned as-is.
+    """
+    image_copy = deepcopy(image)
+    bboxes_copy = deepcopy(bboxes)
+    if len(bboxes) == 0:
+        return image_copy, bboxes_copy
+    return _apply_multi_bbox_augmentation(
+        image_copy, bboxes_copy, prob, aug_func, func_changes_bbox, *args)
+
+
+def rotate_only_bboxes(image, bboxes, prob, degrees, replace):
+    """Apply rotate to each bbox in the image with probability prob.
+
+    prob is first rescaled by _scale_bbox_only_op_probability. The op is
+    flagged as not changing the bboxes (func_changes_bbox is False).
+    """
+    bbox_prob = _scale_bbox_only_op_probability(prob)
+    return _apply_multi_bbox_augmentation_wrapper(
+        image, bboxes, bbox_prob, rotate, False, degrees, replace)
+
+
+def shear_x_only_bboxes(image, bboxes, prob, level, replace):
+    """Apply shear_x to each bbox in the image with probability prob.
+
+    prob is first rescaled by _scale_bbox_only_op_probability. The op is
+    flagged as not changing the bboxes (func_changes_bbox is False).
+    """
+    bbox_prob = _scale_bbox_only_op_probability(prob)
+    return _apply_multi_bbox_augmentation_wrapper(
+        image, bboxes, bbox_prob, shear_x, False, level, replace)
+
+
+def shear_y_only_bboxes(image, bboxes, prob, level, replace):
+    """Apply shear_y to each bbox in the image with probability prob.
+
+    prob is first rescaled by _scale_bbox_only_op_probability. The op is
+    flagged as not changing the bboxes (func_changes_bbox is False).
+    """
+    bbox_prob = _scale_bbox_only_op_probability(prob)
+    return _apply_multi_bbox_augmentation_wrapper(
+        image, bboxes, bbox_prob, shear_y, False, level, replace)
+
+
+def translate_x_only_bboxes(image, bboxes, prob, pixels, replace):
+    """Apply translate_x to each bbox in the image with probability prob.
+
+    prob is first rescaled by _scale_bbox_only_op_probability. The op is
+    flagged as not changing the bboxes (func_changes_bbox is False).
+    """
+    bbox_prob = _scale_bbox_only_op_probability(prob)
+    return _apply_multi_bbox_augmentation_wrapper(
+        image, bboxes, bbox_prob, translate_x, False, pixels, replace)
+
+
+def translate_y_only_bboxes(image, bboxes, prob, pixels, replace):
+    """Apply translate_y to each bbox in the image with probability prob.
+
+    prob is first rescaled by _scale_bbox_only_op_probability. The op is
+    flagged as not changing the bboxes (func_changes_bbox is False).
+    """
+    bbox_prob = _scale_bbox_only_op_probability(prob)
+    return _apply_multi_bbox_augmentation_wrapper(
+        image, bboxes, bbox_prob, translate_y, False, pixels, replace)
+
+
+def flip_only_bboxes(image, bboxes, prob):
+    """Apply np.fliplr to each bbox in the image with probability prob.
+
+    prob is first rescaled by _scale_bbox_only_op_probability. The op is
+    flagged as not changing the bboxes (func_changes_bbox is False).
+    """
+    bbox_prob = _scale_bbox_only_op_probability(prob)
+    return _apply_multi_bbox_augmentation_wrapper(
+        image, bboxes, bbox_prob, np.fliplr, False)
+
+
+def solarize_only_bboxes(image, bboxes, prob, threshold):
+    """Apply solarize to each bbox in the image with probability prob.
+
+    prob is first rescaled by _scale_bbox_only_op_probability. The op is
+    flagged as not changing the bboxes (func_changes_bbox is False).
+    """
+    bbox_prob = _scale_bbox_only_op_probability(prob)
+    return _apply_multi_bbox_augmentation_wrapper(
+        image, bboxes, bbox_prob, solarize, False, threshold)
+
+
+def equalize_only_bboxes(image, bboxes, prob):
+    """Apply equalize to each bbox in the image with probability prob.
+
+    prob is first rescaled by _scale_bbox_only_op_probability. The op is
+    flagged as not changing the bboxes (func_changes_bbox is False).
+    """
+    bbox_prob = _scale_bbox_only_op_probability(prob)
+    return _apply_multi_bbox_augmentation_wrapper(
+        image, bboxes, bbox_prob, equalize, False)
+
+
+def cutout_only_bboxes(image, bboxes, prob, pad_size, replace):
+    """Apply cutout to each bbox in the image with probability prob.
+
+    prob is first rescaled by _scale_bbox_only_op_probability. The op is
+    flagged as not changing the bboxes (func_changes_bbox is False).
+    """
+    bbox_prob = _scale_bbox_only_op_probability(prob)
+    return _apply_multi_bbox_augmentation_wrapper(
+        image, bboxes, bbox_prob, cutout, False, pad_size, replace)
+
+
+def _rotate_bbox(bbox, image_height, image_width, degrees):
+    """Rotates the bbox coordinates by degrees about the image center.
+
+    Args:
+        bbox: 1D Tensor that has 4 elements (min_y, min_x, max_y, max_x)
+            of type float that represents the normalized coordinates between 0 and 1.
+        image_height: Int, height of the image.
+        image_width: Int, width of the image.
+        degrees: Float, a scalar angle in degrees to rotate all images by. If
+            degrees is positive the image will be rotated clockwise otherwise it will
+            be rotated counterclockwise.
+
+    Returns:
+        A tensor of the same shape as bbox holding the axis-aligned box that
+        encloses the four rotated corners, in normalized coordinates.
+    """
+    height, width = float(image_height), float(image_width)
+    radians = degrees * (math.pi / 180.0)
+
+    # Move to pixel coordinates centered on the image. Y is negated because
+    # image rows grow downwards, while the rotation below assumes the usual
+    # upward-pointing y axis.
+    top = -int(height * (bbox[0] - 0.5))
+    left = int(width * (bbox[1] - 0.5))
+    bottom = -int(height * (bbox[2] - 0.5))
+    right = int(width * (bbox[3] - 0.5))
+    corners = np.array(
+        [[top, left], [top, right], [bottom, left], [bottom, right]],
+        dtype=np.float32)
+
+    # Rotation acting on (y, x) column vectors; positive radians rotates
+    # clockwise.
+    cos_r, sin_r = math.cos(radians), math.sin(radians)
+    rotation = np.array([[cos_r, sin_r], [-sin_r, cos_r]])
+    rotated = np.matmul(rotation, corners.T).astype(np.int32)
+    rotated_y, rotated_x = rotated[0, :], rotated[1, :]
+
+    # Enclosing box of the rotated corners, converted back to normalized
+    # coordinates (undoing the y negation swaps the roles of min and max).
+    min_y = -(float(np.max(rotated_y)) / height - 0.5)
+    min_x = float(np.min(rotated_x)) / width + 0.5
+    max_y = -(float(np.min(rotated_y)) / height - 0.5)
+    max_x = float(np.max(rotated_x)) / width + 0.5
+
+    # Keep the result inside [0, 1] and let _check_bbox_area validate it.
+    min_y, min_x, max_y, max_x = _clip_bbox(min_y, min_x, max_y, max_x)
+    min_y, min_x, max_y, max_x = _check_bbox_area(min_y, min_x, max_y, max_x)
+    return np.stack([min_y, min_x, max_y, max_x])
+
+
+def rotate_with_bboxes(image, bboxes, degrees, replace):
+    """Rotates the image by degrees and rotates every bbox accordingly.
+
+    Args:
+        image: 3D uint8 Tensor.
+        bboxes: 2D Tensor that is a list of the bboxes in the image. Each bbox
+            has 4 elements (min_y, min_x, max_y, max_x) of type float.
+        degrees: Float, rotation angle in degrees.
+        replace: Fill value forwarded to rotate for the empty pixels.
+
+    Returns:
+        A tuple (rotated image, rotated bboxes).
+    """
+    rotated_image = rotate(image, degrees, replace)
+    image_height, image_width = rotated_image.shape[:2]
+
+    rotated_bboxes = np.zeros_like(bboxes)
+    for idx, bbox in enumerate(bboxes):
+        rotated_bboxes[idx] = _rotate_bbox(bbox, image_height, image_width,
+                                           degrees)
+    return rotated_image, rotated_bboxes
+
+
+def translate_x(image, pixels, replace):
+    """Equivalent of PIL Translate in X dimension."""
+    # The extra channel added by wrap lets unwrap find the pixels that the
+    # transform left empty and fill them with `replace`.
+    wrapped = Image.fromarray(wrap(image))
+    # Affine coefficients (a, b, c, d, e, f): only the x offset c is non-trivial.
+    affine = (1, 0, pixels, 0, 1, 0)
+    shifted = wrapped.transform(wrapped.size, Image.AFFINE, affine)
+    return unwrap(np.array(shifted), replace)
+
+
+def translate_y(image, pixels, replace):
+    """Equivalent of PIL Translate in Y dimension."""
+    # The extra channel added by wrap lets unwrap find the pixels that the
+    # transform left empty and fill them with `replace`.
+    wrapped = Image.fromarray(wrap(image))
+    # Affine coefficients (a, b, c, d, e, f): only the y offset f is non-trivial.
+    affine = (1, 0, 0, 0, 1, pixels)
+    shifted = wrapped.transform(wrapped.size, Image.AFFINE, affine)
+    return unwrap(np.array(shifted), replace)
+
+
+def _shift_bbox(bbox, image_height, image_width, pixels, shift_horizontal):
+    """Shifts the bbox coordinates by pixels.
+
+    Args:
+        bbox: 1D Tensor that has 4 elements (min_y, min_x, max_y, max_x)
+            of type float that represents the normalized coordinates between 0 and 1.
+        image_height: Int, height of the image.
+        image_width: Int, width of the image.
+        pixels: An int. How many pixels to shift the bbox. The value is
+            subtracted from the coordinates of the shifted axis.
+        shift_horizontal: Boolean. If true then shift in X dimension else shift in
+            Y dimension.
+
+    Returns:
+        A tensor of the same shape as bbox, but now with the shifted coordinates.
+    """
+    shift = int(pixels)
+    height, width = float(image_height), float(image_width)
+
+    # Normalized coordinates -> integer pixel positions.
+    min_y, max_y = int(height * bbox[0]), int(height * bbox[2])
+    min_x, max_x = int(width * bbox[1]), int(width * bbox[3])
+
+    # Move only the requested axis, keeping the lower edge >= 0 and the upper
+    # edge within the image extent.
+    if shift_horizontal:
+        min_x = max(0, min_x - shift)
+        max_x = min(image_width, max_x - shift)
+    else:
+        min_y = max(0, min_y - shift)
+        max_y = min(image_height, max_y - shift)
+
+    # Pixel positions -> normalized coordinates.
+    min_y, max_y = float(min_y) / height, float(max_y) / height
+    min_x, max_x = float(min_x) / width, float(max_x) / width
+
+    # Keep the result inside [0, 1] and let _check_bbox_area validate it.
+    min_y, min_x, max_y, max_x = _clip_bbox(min_y, min_x, max_y, max_x)
+    min_y, min_x, max_y, max_x = _check_bbox_area(min_y, min_x, max_y, max_x)
+    return np.stack([min_y, min_x, max_y, max_x])
+
+
+def translate_bbox(image, bboxes, pixels, replace, shift_horizontal):
+    """Translates the image along one axis and shifts the bboxes to match.
+
+    Args:
+        image: 3D uint8 Tensor.
+        bboxes: 2D Tensor that is a list of the bboxes in the image. Each bbox
+            has 4 elements (min_y, min_x, max_y, max_x) of type float with values
+            between [0, 1].
+        pixels: An int. How many pixels to shift the image and bboxes
+        replace: A one or three value 1D tensor to fill empty pixels.
+        shift_horizontal: Boolean. If true then shift in X dimension else shift in
+            Y dimension.
+
+    Returns:
+        A tuple containing a 3D uint8 Tensor that will be the result of translating
+        image by pixels. The second element of the tuple is bboxes, where now
+        the coordinates will be shifted to reflect the shifted image.
+    """
+    translate = translate_x if shift_horizontal else translate_y
+    shifted_image = translate(image, pixels, replace)
+    image_height, image_width = shifted_image.shape[0], shifted_image.shape[1]
+
+    # Work on a copy so the caller's bboxes are left untouched.
+    shifted_bboxes = deepcopy(bboxes)
+    for idx, bbox in enumerate(bboxes):
+        shifted_bboxes[idx] = _shift_bbox(bbox, image_height, image_width,
+                                          pixels, shift_horizontal)
+    return shifted_image.astype(np.uint8), shifted_bboxes
+
+
+def shear_x(image, level, replace):
+    """Equivalent of PIL Shearing in X dimension."""
+    # A shear parallel to the x axis has the 2x2 matrix
+    #   [1  level]
+    #   [0      1]
+    # which is passed to PIL as the affine coefficients (a, b, c, d, e, f).
+    wrapped = Image.fromarray(wrap(image))
+    affine = (1, level, 0, 0, 1, 0)
+    sheared = wrapped.transform(wrapped.size, Image.AFFINE, affine)
+    return unwrap(np.array(sheared), replace)
+
+
+def shear_y(image, level, replace):
+    """Equivalent of PIL Shearing in Y dimension."""
+    # A shear parallel to the y axis has the 2x2 matrix
+    #   [1      0]
+    #   [level  1]
+    # which is passed to PIL as the affine coefficients (a, b, c, d, e, f).
+    wrapped = Image.fromarray(wrap(image))
+    affine = (1, 0, 0, level, 1, 0)
+    sheared = wrapped.transform(wrapped.size, Image.AFFINE, affine)
+    return unwrap(np.array(sheared), replace)
+
+
+def _shear_bbox(bbox, image_height, image_width, level, shear_horizontal):
+    """Shifts the bbox according to how the image was sheared.
+
+    Args:
+        bbox: 1D Tensor that has 4 elements (min_y, min_x, max_y, max_x)
+            of type float that represents the normalized coordinates between 0 and 1.
+        image_height: Int, height of the image.
+        image_width: Int, width of the image.
+        level: Float. How much to shear the image.
+        shear_horizontal: If true then shear in X dimension else shear in
+            the Y dimension.
+
+    Returns:
+        A tensor of the same shape as bbox holding the axis-aligned box that
+        encloses the four sheared corners, in normalized coordinates.
+    """
+    height, width = float(image_height), float(image_width)
+
+    # Normalized coordinates -> integer pixel corners, stored as (y, x) rows.
+    top, bottom = int(height * bbox[0]), int(height * bbox[2])
+    left, right = int(width * bbox[1]), int(width * bbox[3])
+    corners = np.array(
+        [[top, left], [top, right], [bottom, left], [bottom, right]]).astype(
+            np.float32)
+
+    # Matrix acting on (y, x) column vectors that mirrors the image shear.
+    if shear_horizontal:
+        shear_matrix = np.array([[1, 0], [-level, 1]])
+    else:
+        shear_matrix = np.array([[1, -level], [0, 1]])
+    shear_matrix = shear_matrix.astype(np.float32)
+    sheared = np.matmul(shear_matrix, corners.T).astype(np.int32)
+    sheared_y, sheared_x = sheared[0, :], sheared[1, :]
+
+    # Enclosing box of the sheared corners, back in normalized coordinates.
+    min_y = float(np.min(sheared_y)) / height
+    min_x = float(np.min(sheared_x)) / width
+    max_y = float(np.max(sheared_y)) / height
+    max_x = float(np.max(sheared_x)) / width
+
+    # Keep the result inside [0, 1] and let _check_bbox_area validate it.
+    min_y, min_x, max_y, max_x = _clip_bbox(min_y, min_x, max_y, max_x)
+    min_y, min_x, max_y, max_x = _check_bbox_area(min_y, min_x, max_y, max_x)
+    return np.stack([min_y, min_x, max_y, max_x])
+
+
+def shear_with_bboxes(image, bboxes, level, replace, shear_horizontal):
+    """Shears the image along one axis and moves the bboxes to match.
+
+    Args:
+        image: 3D uint8 Tensor.
+        bboxes: 2D Tensor that is a list of the bboxes in the image. Each bbox
+            has 4 elements (min_y, min_x, max_y, max_x) of type float with values
+            between [0, 1].
+        level: Float. How much to shear the image. This value will be between
+            -0.3 to 0.3.
+        replace: A one or three value 1D tensor to fill empty pixels.
+        shear_horizontal: Boolean. If true then shear in X dimension else shear in
+            the Y dimension.
+
+    Returns:
+        A tuple containing a 3D uint8 Tensor that will be the result of shearing
+        image by level. The second element of the tuple is bboxes, where now
+        the coordinates will be shifted to reflect the sheared image.
+    """
+    shear = shear_x if shear_horizontal else shear_y
+    sheared_image = shear(image, level, replace)
+    image_height, image_width = sheared_image.shape[:2]
+
+    # Work on a copy so the caller's bboxes are left untouched.
+    sheared_bboxes = deepcopy(bboxes)
+    for idx, bbox in enumerate(bboxes):
+        sheared_bboxes[idx] = _shear_bbox(bbox, image_height, image_width,
+                                          level, shear_horizontal)
+    return sheared_image.astype(np.uint8), sheared_bboxes
+
+
+def autocontrast(image):
+    """Implements Autocontrast function from PIL.
+
+    Each of the first three channels is stretched independently so that its
+    lowest value maps to 0 and its highest value maps to 255. A channel whose
+    values are all identical is returned unchanged.
+
+    Args:
+        image: A 3D uint8 tensor.
+
+    Returns:
+        The image after it has had autocontrast applied to it and will be of type
+        uint8.
+    """
+
+    def scale_channel(channel):
+        """Scale the 2D channel using the autocontrast rule."""
+        lo = float(np.min(channel))
+        hi = float(np.max(channel))
+        if hi <= lo:
+            # Constant channel: there is no range to stretch.
+            return channel
+
+        # Linear map sending lo -> 0 and hi -> 255.
+        scale = 255.0 / (hi - lo)
+        offset = -lo * scale
+        scaled = channel.astype(np.float32) * scale + offset
+        # Clamp BEFORE the cast: float rounding can push values marginally
+        # outside [0, 255], and casting such values to uint8 is not
+        # well-defined. (The clipped array used to be computed and discarded.)
+        scaled = np.clip(scaled, a_min=0, a_max=255.0)
+        return scaled.astype(np.uint8)
+
+    # Assumes RGB for now. Scales each channel independently
+    # and then stacks the result.
+    return np.stack([scale_channel(image[:, :, c]) for c in range(3)], 2)
+
+
+def sharpness(image, factor):
+    """Implements Sharpness function from PIL.
+
+    The image is smoothed with PIL's SMOOTH kernel and the smoothed version
+    is then blended with the original image according to factor.
+    """
+    # SMOOTH PIL Kernel (weights sum to 1).
+    smooth_kernel = np.array(
+        [[1, 1, 1], [1, 5, 1], [1, 1, 1]], dtype=np.float32) / 13.
+    # Filter in float32 and only afterwards go back to uint8.
+    smoothed = cv2.filter2D(image.astype(np.float32), -1, smooth_kernel)
+    smoothed = smoothed.astype(np.uint8)
+
+    # Blend the smoothed image with the untouched input.
+    return blend(smoothed, image, factor)
+
+
+def equalize(image):
+    """Implements Equalize function from PIL, one channel at a time.
+
+    Args:
+        image: A 3D tensor; only the first three channels are used.
+
+    Returns:
+        A uint8 image with three channels, each histogram-equalized
+        independently.
+    """
+
+    def equalize_channel(channel_index):
+        """Equalize a single channel through a 256-entry lookup table."""
+        values = image[:, :, channel_index].astype(np.int32)
+        # Compute the histogram of the image channel.
+        histo, _ = np.histogram(values, range=[0, 255], bins=256)
+
+        # The step is derived from the non-empty bins only, leaving out the
+        # last non-empty one.
+        occupied = histo[histo != 0]
+        step = (np.sum(occupied) - occupied[-1]) // 255
+
+        # If step is zero, return the original channel.
+        if step == 0:
+            return values.astype(np.uint8)
+
+        # Cumulative sum shifted by step // 2 and normalized by step ...
+        lut = (np.cumsum(histo) + (step // 2)) // step
+        # ... then shifted right by one entry, prepending a 0 ...
+        lut = np.concatenate([[0], lut[:-1]], 0)
+        # ... and clipped into range, as the C code behind image.point does.
+        lut = np.clip(lut, a_min=0, a_max=255).astype(np.uint8)
+        return np.take(lut, values).astype(np.uint8)
+
+    # Assumes RGB for now. Scales each channel independently
+    # and then stacks the result.
+    return np.stack([equalize_channel(c) for c in range(3)], 2)
+
+
+def wrap(image):
+    """Returns 'image' with an extra trailing channel set to all 255s.
+
+    The extra channel marks valid pixels: transforms applied afterwards leave
+    0 there for locations they could not fill, which unwrap looks for.
+    """
+    rows, cols = image.shape[0], image.shape[1]
+    marker_channel = np.ones([rows, cols, 1], image.dtype) * 255
+    return np.concatenate([image, marker_channel], 2).astype(image.dtype)
+
+
+def unwrap(image, replace):
+    """Unwraps an image produced by wrap.
+
+    Every spatial position whose last (fourth) channel is 0 has its pixel
+    overwritten with `replace`. Operations like translate and shear on a
+    wrapped Tensor leave 0s in the locations they could not fill, and those
+    empty pixels should take a chosen fill value rather than pure black.
+
+    Args:
+        image: A 3D Image Tensor with 4 channels.
+        replace: A one or three value 1D tensor to fill empty pixels.
+
+    Returns:
+        image: A 3D uint8 image Tensor with 3 channels.
+    """
+    original_shape = image.shape
+    # Flatten the spatial dimensions: one row per pixel.
+    pixels = np.reshape(image, [-1, original_shape[2]])
+
+    # A pixel is empty when its marker (last) channel is zero.
+    marker = pixels[:, 3]
+
+    # Extend `replace` with a value for the marker channel so that it lines
+    # up with the 4-channel rows.
+    fill_value = np.concatenate([replace, np.ones([1], image.dtype)], 0)
+    fill = np.ones_like(pixels, dtype=image.dtype) * fill_value
+
+    # Broadcast the per-pixel emptiness flag over all channels and fill.
+    is_empty = np.equal(marker, 0)[:, np.newaxis]
+    pixels = np.where(is_empty, fill, pixels)
+
+    # Restore the spatial layout and drop the marker channel.
+    restored = np.reshape(pixels, original_shape)
+    return restored[:, :, :3].astype(np.uint8)
+
+
+def _cutout_inside_bbox(image, bbox, pad_fraction):
+    """Generates cutout mask and the mean pixel value of the bbox.
+
+    A center is sampled uniformly among the pixel positions covered by the
+    bbox and the cutout region is built around it. The center can sit close
+    to the image boundary, so the full cutout region may not fit.
+
+    Args:
+        image: 3D uint8 Tensor.
+        bbox: 1D Tensor that has 4 elements (min_y, min_x, max_y, max_x)
+            of type float that represents the normalized coordinates between 0 and 1.
+        pad_fraction: Float that specifies how large the cutout mask should be in
+            in reference to the size of the original bbox. If pad_fraction is 0.25,
+            then the cutout mask will be of shape
+            (0.25 * bbox height, 0.25 * bbox width).
+
+    Returns:
+        A tuple. First element is a 3-channel mask with the spatial size of
+        image where each element is either 1 or 0; 0 marks the pixels that
+        will have cutout applied. The second element is the per-channel mean
+        of the image pixels inside the bbox.
+    """
+    image_height, image_width = image.shape[0], image.shape[1]
+    # Transform from shape [1, 4] to [4].
+    flat_bbox = np.squeeze(bbox)
+
+    # Normalized coordinates -> integer pixel positions.
+    min_y = int(float(image_height) * flat_bbox[0])
+    min_x = int(float(image_width) * flat_bbox[1])
+    max_y = int(float(image_height) * flat_bbox[2])
+    max_x = int(float(image_width) * flat_bbox[3])
+
+    # Mean pixel value inside the bbox (bounds inclusive), which callers may
+    # use to fill the cutout region.
+    bbox_mean = np.mean(image[min_y:max_y + 1, min_x:max_x + 1], axis=(0, 1))
+
+    # Half extents of the cutout region: it spans pad_size_height * 2 by
+    # pad_size_width * 2 when it lies entirely within the image.
+    pad_size_height = int(pad_fraction * ((max_y - min_y + 1) / 2))
+    pad_size_width = int(pad_fraction * ((max_x - min_x + 1) / 2))
+
+    # Sample the center of the zero region (row first, then column).
+    center_y = np.random.randint(min_y, max_y + 1, dtype=np.int32)
+    center_x = np.random.randint(min_x, max_x + 1, dtype=np.int32)
+
+    # Number of untouched rows/columns on each side of the zero region.
+    lower_pad = np.maximum(0, center_y - pad_size_height)
+    upper_pad = np.maximum(0, image_height - center_y - pad_size_height)
+    left_pad = np.maximum(0, center_x - pad_size_width)
+    right_pad = np.maximum(0, image_width - center_x - pad_size_width)
+
+    zero_region = np.zeros(
+        [
+            image_height - (lower_pad + upper_pad),
+            image_width - (left_pad + right_pad)
+        ],
+        dtype=image.dtype)
+
+    # Surround the zero region with ones to obtain a full-size 2D mask.
+    mask_2d = np.pad(zero_region,
+                     [[lower_pad, upper_pad], [left_pad, right_pad]],
+                     'constant',
+                     constant_values=1)
+
+    # Replicate the 2D mask over three channels.
+    mask = np.tile(np.expand_dims(mask_2d, 2), [1, 1, 3])
+    return mask, bbox_mean
+
+
+def bbox_cutout(image, bboxes, pad_fraction, replace_with_mean):
+    """Applies cutout to the image according to bbox information.
+
+    This is a cutout variant that uses bbox information to make more informed
+    decisions on where to place the cutout mask: one bbox is picked at random
+    and the cutout region is placed around a point inside it.
+
+    Args:
+        image: 3D uint8 Tensor.
+        bboxes: 2D Tensor that is a list of the bboxes in the image. Each bbox
+            has 4 elements (min_y, min_x, max_y, max_x) of type float with values
+            between [0, 1].
+        pad_fraction: Float that specifies how large the cutout mask should be in
+            in reference to the size of the original bbox. If pad_fraction is 0.25,
+            then the cutout mask will be of shape
+            (0.25 * bbox height, 0.25 * bbox width).
+        replace_with_mean: Boolean that specifies what value should be filled in
+            where the cutout mask is applied. If False the value 128 is used
+            for every channel. If True the per-channel mean of the pixels
+            inside the chosen bbox is used instead.
+
+    Returns:
+        A tuple. First element is a tensor of the same shape as image that has
+        cutout applied to it. Second element is the bboxes that were passed in
+        that will be unchanged.
+    """
+    # Without any bbox there is nothing to cut out.
+    if len(bboxes) == 0:
+        return image, bboxes
+
+    # Choose a single bounding box and build the cutout mask for it.
+    chosen_index = np.random.randint(0, bboxes.shape[0], dtype=np.int32)
+    chosen_bbox = np.take(bboxes, chosen_index, axis=0)
+    mask, bbox_mean = _cutout_inside_bbox(image, chosen_bbox, pad_fraction)
+
+    # Fill value: either the mean inside the bbox or a flat 128.
+    fill_value = bbox_mean if replace_with_mean else [128] * 3
+    fill = np.ones_like(image, dtype=image.dtype) * fill_value
+
+    # Where the mask is 0 the image is overwritten with the fill value.
+    cut_image = np.where(np.equal(mask, 0), fill, image).astype(image.dtype)
+    return cut_image, bboxes
+
+
+# Maps the operation names used in augmentation policies to the functions
+# implementing them. _parse_policy_info looks operations up here by name and
+# inspects each function's argument names, so the parameter names of these
+# callables (lambdas included) must stay as they are.
+NAME_TO_FUNC = {
+        'AutoContrast': autocontrast,
+        'Equalize': equalize,
+        'Posterize': posterize,
+        'Solarize': solarize,
+        'SolarizeAdd': solarize_add,
+        'Color': color,
+        'Contrast': contrast,
+        'Brightness': brightness,
+        'Sharpness': sharpness,
+        'Cutout': cutout,
+        'BBox_Cutout': bbox_cutout,
+        'Rotate_BBox': rotate_with_bboxes,
+        # pylint:disable=g-long-lambda
+        'TranslateX_BBox': lambda image, bboxes, pixels, replace: translate_bbox(
+                image, bboxes, pixels, replace, shift_horizontal=True),
+        'TranslateY_BBox': lambda image, bboxes, pixels, replace: translate_bbox(
+                image, bboxes, pixels, replace, shift_horizontal=False),
+        'ShearX_BBox': lambda image, bboxes, level, replace: shear_with_bboxes(
+                image, bboxes, level, replace, shear_horizontal=True),
+        'ShearY_BBox': lambda image, bboxes, level, replace: shear_with_bboxes(
+                image, bboxes, level, replace, shear_horizontal=False),
+        # pylint:enable=g-long-lambda
+        'Rotate_Only_BBoxes': rotate_only_bboxes,
+        'ShearX_Only_BBoxes': shear_x_only_bboxes,
+        'ShearY_Only_BBoxes': shear_y_only_bboxes,
+        'TranslateX_Only_BBoxes': translate_x_only_bboxes,
+        'TranslateY_Only_BBoxes': translate_y_only_bboxes,
+        'Flip_Only_BBoxes': flip_only_bboxes,
+        'Solarize_Only_BBoxes': solarize_only_bboxes,
+        'Equalize_Only_BBoxes': equalize_only_bboxes,
+        'Cutout_Only_BBoxes': cutout_only_bboxes,
+}
+
+
+def _randomly_negate_tensor(tensor):
+    """With 50% prob turn the tensor negative."""
+    # floor(u + 0.5) is 1 exactly when the uniform sample u is >= 0.5; in that
+    # case the value keeps its sign, otherwise it is negated.
+    keep_sign = np.floor(np.random.rand() + 0.5) >= 1
+    if keep_sign:
+        return tensor
+    return -tensor
+
+
+def _rotate_level_to_arg(level):
+    """Converts level to a randomly signed rotation angle of magnitude up to 30."""
+    degrees = (level / _MAX_LEVEL) * 30.
+    return (_randomly_negate_tensor(degrees), )
+
+
+def _shrink_level_to_arg(level):
+    """Converts level to ratio by which we shrink the image content."""
+    # A zero level means "do not shrink the image" (it would also divide by
+    # zero in the formula below).
+    if level == 0:
+        return (1.0, )
+    # Maximum shrinking ratio is 2.9, reached when level equals _MAX_LEVEL.
+    ratio = 2. / (_MAX_LEVEL / level) + 0.9
+    return (ratio, )
+
+
+def _enhance_level_to_arg(level):
+    """Converts level to an enhancement factor in [0.1, 1.9] for level in [0, _MAX_LEVEL]."""
+    factor = (level / _MAX_LEVEL) * 1.8 + 0.1
+    return (factor, )
+
+
+def _shear_level_to_arg(level):
+    """Converts level to a randomly signed shear amount of magnitude up to 0.3."""
+    shear_amount = (level / _MAX_LEVEL) * 0.3
+    # Flip the amount to negative with 50% chance.
+    return (_randomly_negate_tensor(shear_amount), )
+
+
+def _translate_level_to_arg(level, translate_const):
+    """Converts level to a randomly signed shift of magnitude up to translate_const."""
+    shift = (level / _MAX_LEVEL) * float(translate_const)
+    # Flip the shift to negative with 50% chance.
+    return (_randomly_negate_tensor(shift), )
+
+
+def _bbox_cutout_level_to_arg(level, hparams):
+    """Converts level to the (pad_fraction, replace_with_mean) args of bbox_cutout.
+
+    hparams is currently unused: the values it would supply are hard-coded.
+    """
+    # 0.75 stands in for hparams.cutout_max_pad_fraction.
+    pad_fraction = (level / _MAX_LEVEL) * 0.75
+    # False stands in for hparams.cutout_bbox_replace_with_mean.
+    replace_with_mean = False
+    return (pad_fraction, replace_with_mean)
+
+
+def level_to_arg(hparams):
+    """Builds the table mapping op names to their level -> args converters.
+
+    Each converter takes a policy level and returns the tuple of extra
+    positional arguments for the corresponding op. hparams is only forwarded
+    to the BBox_Cutout converter; the other constants that would come from it
+    are hard-coded (see the inline notes).
+    """
+
+    def no_args(level):
+        """Converter for ops that take no level-dependent argument."""
+        return ()
+
+    def scaled_int(upper):
+        """Converter scaling level linearly to an int in [0, upper]."""
+        return lambda level: (int((level / _MAX_LEVEL) * upper), )
+
+    def translate(translate_const):
+        """Converter yielding a randomly signed shift up to translate_const."""
+        return lambda level: _translate_level_to_arg(level, translate_const)
+
+    return {
+        'AutoContrast': no_args,
+        'Equalize': no_args,
+        'Posterize': scaled_int(4),
+        'Solarize': scaled_int(256),
+        'SolarizeAdd': scaled_int(110),
+        'Color': _enhance_level_to_arg,
+        'Contrast': _enhance_level_to_arg,
+        'Brightness': _enhance_level_to_arg,
+        'Sharpness': _enhance_level_to_arg,
+        'Cutout': scaled_int(100),  # hparams.cutout_const=100
+        'BBox_Cutout': lambda level: _bbox_cutout_level_to_arg(level, hparams),
+        'TranslateX_BBox': translate(250),  # hparams.translate_const=250
+        'TranslateY_BBox': translate(250),  # hparams.translate_const=250
+        'ShearX_BBox': _shear_level_to_arg,
+        'ShearY_BBox': _shear_level_to_arg,
+        'Rotate_BBox': _rotate_level_to_arg,
+        'Rotate_Only_BBoxes': _rotate_level_to_arg,
+        'ShearX_Only_BBoxes': _shear_level_to_arg,
+        'ShearY_Only_BBoxes': _shear_level_to_arg,
+        'TranslateX_Only_BBoxes': translate(120),  # hparams.translate_bbox_const
+        'TranslateY_Only_BBoxes': translate(120),  # hparams.translate_bbox_const
+        'Flip_Only_BBoxes': no_args,
+        'Solarize_Only_BBoxes': scaled_int(256),
+        'Equalize_Only_BBoxes': no_args,
+        'Cutout_Only_BBoxes': scaled_int(50),  # hparams.cutout_bbox_const
+    }
+
+
+def bbox_wrapper(func):
+    """Wrap an image-only op so it also accepts and passes through bboxes.
+
+    The returned callable has the signature
+    ``(images, bboxes, *args, **kwargs)``. It calls
+    ``func(images, *args, **kwargs)`` and returns ``(result, bboxes)`` with
+    ``bboxes`` untouched.
+    """
+
+    def wrapper(images, bboxes, *args, **kwargs):
+        augmented = func(images, *args, **kwargs)
+        return (augmented, bboxes)
+
+    return wrapper
+
+
+def _parse_policy_info(name, prob, level, replace_value, augmentation_hparams):
+    """Resolve one policy entry into a ``(func, prob, args)`` triple.
+
+    ``name`` is looked up in ``NAME_TO_FUNC`` and ``level`` is converted to
+    the op's positional arguments through ``level_to_arg``. Based on the
+    op's own parameter names:
+
+    * if it takes ``prob``, ``prob`` is prepended to the arguments (the op
+      handles the randomness itself, e.g. per bounding box);
+    * if it takes ``replace``, that must be its last parameter and
+      ``replace_value`` is appended;
+    * if it does not take ``bboxes``, it is wrapped with ``bbox_wrapper``.
+    """
+    func = NAME_TO_FUNC[name]
+    call_args = list(level_to_arg(augmentation_hparams)[name](level))
+    # pytype:disable=wrong-arg-types
+    param_names = inspect.getfullargspec(func)[0]
+    # pytype:enable=wrong-arg-types
+
+    if 'prob' in param_names:
+        call_args.insert(0, prob)
+
+    if 'replace' in param_names:
+        # Make sure replace is the final argument
+        assert 'replace' == param_names[-1]
+        call_args.append(replace_value)
+
+    if 'bboxes' not in param_names:
+        func = bbox_wrapper(func)
+    return (func, prob, tuple(call_args))
+
+
+def _apply_func_with_prob(func, image, args, prob, bboxes):
+    """Apply `func` to image w/ `args` as input with probability `prob`.
+
+    Args:
+        func: augmentation callable whose second positional parameter is
+            named `bboxes`; called as `func(image, bboxes, *args)` and
+            expected to return an `(image, bboxes)` pair.
+        image: image passed to `func`.
+        args: tuple of extra positional arguments for `func`.
+        prob: probability of applying `func`. Ignored (treated as 1.0) when
+            `func` itself has a `prob` parameter.
+        bboxes: bounding boxes passed to `func`.
+
+    Returns:
+        `(augmented_image, augmented_bboxes)` when the op is applied,
+        otherwise the inputs `(image, bboxes)` unchanged.
+    """
+    assert isinstance(args, tuple)
+    param_names = inspect.getfullargspec(func)[0]
+    assert 'bboxes' == param_names[1]
+
+    # If prob is a function argument, then this randomness is being handled
+    # inside the function, so make sure it is always called.
+    if 'prob' in param_names:
+        prob = 1.0
+
+    # Apply the function with probability `prob`: for u drawn uniformly from
+    # [0, 1), floor(u + prob) >= 1 holds exactly when u >= 1 - prob.
+    # (The previous code added a constant 0.5 here and so ignored `prob`.)
+    should_apply_op = np.floor(np.random.rand() + prob) >= 1
+    if should_apply_op:
+        augmented_image, augmented_bboxes = func(image, bboxes, *args)
+    else:
+        augmented_image, augmented_bboxes = (image, bboxes)
+    return augmented_image, augmented_bboxes
+
+
+def select_and_apply_random_policy(policies, image, bboxes):
+    """Pick one entry of `policies` uniformly at random and apply it.
+
+    `policies` is a sequence of callables taking `(image, bboxes)` and
+    returning the augmented `(image, bboxes)` pair, which is returned.
+    """
+    chosen = np.random.randint(0, len(policies), dtype=np.int32)
+    image, bboxes = policies[chosen](image, bboxes)
+    return (image, bboxes)
+
+
+def build_and_apply_nas_policy(policies, image, bboxes, augmentation_hparams):
+    """Turn named sub-policies into callables and apply one at random.
+
+    Args:
+        policies: list of lists of tuples in the form `(func, prob, level)`.
+            `func` is the string name of the augmentation function, `prob`
+            is the probability of applying it and `level` is the magnitude
+            that gets converted into the function's arguments.
+        image: image the selected policy is applied to.
+        bboxes: bounding boxes passed through (and possibly changed by) the
+            selected policy.
+        augmentation_hparams: hyper-parameters forwarded to
+            `_parse_policy_info` for every operation.
+
+    Returns:
+        A tuple `(augmented_image, augmented_bboxes)` produced by the one
+        randomly selected policy.
+    """
+    # Fill value handed to every op that takes a `replace` argument.
+    replace_value = [128, 128, 128]
+
+    def make_final_policy(tf_policy_):
+        # Chain the parsed operations of one sub-policy into one callable.
+        def final_policy(image_, bboxes_):
+            for func, prob, args in tf_policy_:
+                image_, bboxes_ = _apply_func_with_prob(func, image_, args,
+                                                        prob, bboxes_)
+            return image_, bboxes_
+
+        return final_policy
+
+    # tf_policies are functions that take in an image and bboxes and return
+    # the augmented pair.
+    tf_policies = []
+    for policy in policies:
+        # Link each string name to the correct python function and make sure
+        # the correct arguments are passed into that function.
+        parsed_ops = [
+            _parse_policy_info(
+                *(list(policy_info) + [replace_value, augmentation_hparams]))
+            for policy_info in policy
+        ]
+        tf_policies.append(make_final_policy(parsed_ops))
+
+    augmented_images, augmented_bboxes = select_and_apply_random_policy(
+        tf_policies, image, bboxes)
+    return (augmented_images, augmented_bboxes)
+
+
+# TODO(barretzoph): Add in ArXiv link once paper is out.
+def distort_image_with_autoaugment(image, bboxes, augmentation_name):
+    """Augment `image` and `bboxes` with a named AutoAugment policy.
+
+    Args:
+        image: image to augment; forwarded unchanged to the policy. The
+            upstream notes describe it as [height, width, 3].
+        bboxes: ground truth boxes; forwarded unchanged to the policy. The
+            upstream notes describe them as [N, 4], normalized to [0, 1].
+        augmentation_name: name of the AutoAugment policy to use. The
+            available options are `v0`, `v1`, `v2`, `v3` and `test`.
+            According to the upstream notes, `v0` is the policy used for the
+            paper's results, while `v1`, `v2` and `v3` are additional
+            policies that differ in which operations were searched and in
+            how many operations are applied to a single image (2 vs 3).
+
+    Returns:
+        A tuple containing the augmented versions of `image` and `bboxes`.
+
+    Raises:
+        ValueError: if `augmentation_name` is not one of the options above.
+    """
+    available_policies = {
+        'v0': policy_v0,
+        'v1': policy_v1,
+        'v2': policy_v2,
+        'v3': policy_v3,
+        'test': policy_vtest
+    }
+    policy_builder = available_policies.get(augmentation_name)
+    if policy_builder is None:
+        raise ValueError('Invalid augmentation_name: {}'.format(
+            augmentation_name))
+
+    policy = policy_builder()
+    # No hyper-parameters are supplied; an empty dict is passed through.
+    augmentation_hparams = {}
+    return build_and_apply_nas_policy(policy, image, bboxes,
+                                      augmentation_hparams)

+ 1080 - 0
paddlers/models/ppdet/data/transform/batch_operators.py

@@ -0,0 +1,1080 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import typing
+
+try:
+    from collections.abc import Sequence
+except Exception:
+    from collections import Sequence
+
+import cv2
+import math
+import numpy as np
+from .operators import register_op, BaseOperator, Resize
+from .op_helper import jaccard_overlap, gaussian2D, gaussian_radius, draw_umich_gaussian
+from .atss_assigner import ATSSAssigner
+from scipy import ndimage
+
+from paddlers.models.ppdet.modeling import bbox_utils
+from paddlers.models.ppdet.utils.logger import setup_logger
+from paddlers.models.ppdet.modeling.keypoint_utils import get_affine_transform, affine_transform
+logger = setup_logger(__name__)
+
+# Names exported by a star-import of this module.
+__all__ = [
+    'PadBatch',
+    'BatchRandomResize',
+    'Gt2YoloTarget',
+    'Gt2FCOSTarget',
+    'Gt2TTFTarget',
+    'Gt2Solov2Target',
+    'Gt2SparseRCNNTarget',
+    'PadMaskBatch',
+    'Gt2GFLTarget',
+    'Gt2CenterNetTarget',
+]
+
+
+@register_op
+class PadBatch(BaseOperator):
+    """
+    Pad a batch of samples so they can be divisible by a stride.
+    The layout of each image should be 'CHW'.
+    Args:
+        pad_to_stride (int): If `pad_to_stride > 0`, pad zeros to ensure
+            height and width is divisible by `pad_to_stride`.
+    """
+
+    def __init__(self, pad_to_stride=0):
+        super(PadBatch, self).__init__()
+        self.pad_to_stride = pad_to_stride
+
+    def __call__(self, samples, context=None):
+        """
+        Zero-pad every image in the batch to a common height and width.
+
+        The common size is the per-axis maximum over the batch, rounded up
+        to a multiple of `pad_to_stride` when that is positive. Existing
+        content stays in the top-left corner. `semantic` and `gt_segm`
+        entries, when present, are padded the same way, and `gt_rbox2poly`
+        polygons are converted into a new `gt_rbox` entry.
+
+        Args:
+            samples (list): a batch of sample, each is dict. For multi scale
+                input this is a nested list, of which only the first inner
+                list is padded.
+            context: unused.
+        Returns:
+            The `samples` object passed in; its dicts are updated in place.
+        """
+        coarsest_stride = self.pad_to_stride
+
+        # multi scale input is nested list
+        # `Sequence` is the collections.abc class this module already
+        # imports; it replaces the deprecated `typing.Sequence` alias.
+        if isinstance(samples, Sequence) and len(samples) > 0 and isinstance(
+                samples[0], Sequence):
+            inner_samples = samples[0]
+        else:
+            inner_samples = samples
+
+        # Element-wise maximum of the (C, H, W) shapes across the batch.
+        max_shape = np.array(
+            [data['image'].shape for data in inner_samples]).max(axis=0)
+        if coarsest_stride > 0:
+            max_shape[1] = int(
+                np.ceil(max_shape[1] / coarsest_stride) * coarsest_stride)
+            max_shape[2] = int(
+                np.ceil(max_shape[2] / coarsest_stride) * coarsest_stride)
+
+        for data in inner_samples:
+            im = data['image']
+            im_c, im_h, im_w = im.shape[:]
+            # The padded image is always float32, whatever the input dtype.
+            padding_im = np.zeros(
+                (im_c, max_shape[1], max_shape[2]), dtype=np.float32)
+            padding_im[:, :im_h, :im_w] = im
+            data['image'] = padding_im
+            if 'semantic' in data and data['semantic'] is not None:
+                semantic = data['semantic']
+                padding_sem = np.zeros(
+                    (1, max_shape[1], max_shape[2]), dtype=np.float32)
+                padding_sem[:, :im_h, :im_w] = semantic
+                data['semantic'] = padding_sem
+            if 'gt_segm' in data and data['gt_segm'] is not None:
+                gt_segm = data['gt_segm']
+                padding_segm = np.zeros(
+                    (gt_segm.shape[0], max_shape[1], max_shape[2]),
+                    dtype=np.uint8)
+                padding_segm[:, :im_h, :im_w] = gt_segm
+                data['gt_segm'] = padding_segm
+
+            if 'gt_rbox2poly' in data and data['gt_rbox2poly'] is not None:
+                # poly to rbox
+                polys = data['gt_rbox2poly']
+                rbox = bbox_utils.poly2rbox(polys)
+                data['gt_rbox'] = rbox
+
+        return samples
+
+
+@register_op
+class BatchRandomResize(BaseOperator):
+    """
+    Resize a whole batch with one `Resize` operator whose target size and
+    interpolation method may be drawn at random on every call.
+    Args:
+        target_size (int, list, tuple): image target size. When
+            `random_size` is True it must be a list of candidate sizes.
+        keep_ratio (bool): forwarded to `Resize` as `keep_ratio`
+        interp (int): interpolation method used when `random_interp` is False
+        random_size (bool): whether to pick the target size at random from
+            `target_size` on each call
+        random_interp (bool): whether to pick the interpolation method at
+            random on each call
+    """
+
+    def __init__(self,
+                 target_size,
+                 keep_ratio,
+                 interp=cv2.INTER_NEAREST,
+                 random_size=True,
+                 random_interp=False):
+        super(BatchRandomResize, self).__init__()
+        assert isinstance(target_size, (
+            int, Sequence)), "target_size must be int, list or tuple"
+        if random_size and not isinstance(target_size, list):
+            raise TypeError(
+                "Type of target_size is invalid when random_size is True. Must be List, now is {}".
+                format(type(target_size)))
+
+        self.target_size = target_size
+        self.keep_ratio = keep_ratio
+        self.interp = interp
+        self.random_size = random_size
+        self.random_interp = random_interp
+        # Candidates used when the interpolation method is randomised.
+        self.interps = [
+            cv2.INTER_NEAREST,
+            cv2.INTER_LINEAR,
+            cv2.INTER_AREA,
+            cv2.INTER_CUBIC,
+            cv2.INTER_LANCZOS4,
+        ]
+
+    def __call__(self, samples, context=None):
+        """Draw this call's size/interpolation and resize `samples`."""
+        target_size = self.target_size
+        if self.random_size:
+            target_size = target_size[np.random.choice(len(target_size))]
+
+        interp = self.interp
+        if self.random_interp:
+            interp = np.random.choice(self.interps)
+
+        resizer = Resize(
+            target_size, keep_ratio=self.keep_ratio, interp=interp)
+        return resizer(samples, context=context)
+
+
+@register_op
+class Gt2YoloTarget(BaseOperator):
+    """
+    Generate YOLOv3 targets from ground truth data, this operator is only
+    used in fine grained YOLOv3 loss mode.
+
+    For every sample and every output level `i` a `target{i}` array of shape
+    `(len(mask), 6 + num_classes, grid_h, grid_w)` is stored in the sample.
+    Channels 0-3 hold x, y, w, h, channel 4 the box scale weight, channel 5
+    the objectness score, the remaining channels the one-hot class.
+    """
+
+    def __init__(self,
+                 anchors,
+                 anchor_masks,
+                 downsample_ratios,
+                 num_classes=80,
+                 iou_thresh=1.):
+        super(Gt2YoloTarget, self).__init__()
+        self.anchors = anchors
+        self.anchor_masks = anchor_masks
+        self.downsample_ratios = downsample_ratios
+        self.num_classes = num_classes
+        self.iou_thresh = iou_thresh
+
+    def _best_anchor(self, gw, gh, an_hw):
+        # Index of the anchor with the highest shape-only IoU against a
+        # (gw, gh) box, or -1 when no anchor has a positive IoU.
+        best_iou, best_idx = 0., -1
+        for an_idx in range(an_hw.shape[0]):
+            iou = jaccard_overlap(
+                [0., 0., gw, gh],
+                [0., 0., an_hw[an_idx, 0], an_hw[an_idx, 1]])
+            if iou > best_iou:
+                best_iou, best_idx = iou, an_idx
+        return best_idx
+
+    def _write_target(self, target, slot, anchor_idx, gi, gj, box, cls, score,
+                      grid_w, grid_h, w, h):
+        # Encode one gt box into `target` at anchor slot `slot`, cell
+        # (gj, gi), relative to anchor `anchor_idx`.
+        gx, gy, gw, gh = box
+        # x, y, w, h, scale
+        target[slot, 0, gj, gi] = gx * grid_w - gi
+        target[slot, 1, gj, gi] = gy * grid_h - gj
+        target[slot, 2, gj, gi] = np.log(gw * w / self.anchors[anchor_idx][0])
+        target[slot, 3, gj, gi] = np.log(gh * h / self.anchors[anchor_idx][1])
+        target[slot, 4, gj, gi] = 2.0 - gw * gh
+
+        # objectness record gt_score
+        target[slot, 5, gj, gi] = score
+
+        # classification
+        target[slot, 6 + cls, gj, gi] = 1.
+
+    def __call__(self, samples, context=None):
+        """
+        Add `target{i}` arrays to every sample and drop `gt_class` and
+        `gt_score`.
+
+        Each `gt_bbox` row is read as (gx, gy, gw, gh) and multiplied by the
+        grid size; presumably these are normalized center-format boxes
+        (TODO confirm against the upstream transforms).
+        """
+        assert len(self.anchor_masks) == len(self.downsample_ratios), \
+            "anchor_masks', and 'downsample_ratios' should have same length."
+
+        # All samples are assumed to share the first sample's image size.
+        h, w = samples[0]['image'].shape[1:3]
+        # Anchors expressed as fractions of the image width/height.
+        an_hw = np.array(self.anchors) / np.array([[w, h]])
+        for sample in samples:
+            gt_bbox = sample['gt_bbox']
+            gt_class = sample['gt_class']
+            if 'gt_score' not in sample:
+                sample['gt_score'] = np.ones(
+                    (gt_bbox.shape[0], 1), dtype=np.float32)
+            gt_score = sample['gt_score']
+            levels = zip(self.anchor_masks, self.downsample_ratios)
+            for i, (mask, downsample_ratio) in enumerate(levels):
+                grid_h = int(h / downsample_ratio)
+                grid_w = int(w / downsample_ratio)
+                target = np.zeros(
+                    (len(mask), 6 + self.num_classes, grid_h, grid_w),
+                    dtype=np.float32)
+                for b in range(gt_bbox.shape[0]):
+                    gx, gy, gw, gh = gt_bbox[b, :]
+                    cls = gt_class[b]
+                    score = gt_score[b]
+                    # Skip degenerate boxes and boxes with no score.
+                    if gw <= 0. or gh <= 0. or score <= 0.:
+                        continue
+                    box = (gx, gy, gw, gh)
+
+                    # find best match anchor index
+                    best_idx = self._best_anchor(gw, gh, an_hw)
+
+                    # Grid cell that contains the box centre.
+                    gi = int(gx * grid_w)
+                    gj = int(gy * grid_h)
+
+                    # The gt box is regressed in this layer only if its best
+                    # matching anchor belongs to this layer's anchor mask.
+                    if best_idx in mask:
+                        self._write_target(target,
+                                           mask.index(best_idx), best_idx, gi,
+                                           gj, box, cls, score, grid_w,
+                                           grid_h, w, h)
+
+                    # For non-matched anchors, calculate the target if the iou
+                    # between anchor and gt is larger than iou_thresh
+                    if self.iou_thresh < 1:
+                        for idx, mask_i in enumerate(mask):
+                            if mask_i == best_idx:
+                                continue
+                            iou = jaccard_overlap(
+                                [0., 0., gw, gh],
+                                [0., 0., an_hw[mask_i, 0], an_hw[mask_i, 1]])
+                            # Only fill slots no other box has claimed yet.
+                            if iou > self.iou_thresh and target[idx, 5, gj,
+                                                                gi] == 0.:
+                                self._write_target(target, idx, mask_i, gi,
+                                                   gj, box, cls, score,
+                                                   grid_w, grid_h, w, h)
+                sample['target{}'.format(i)] = target
+
+            # remove useless gt_class and gt_score after target calculated
+            sample.pop('gt_class')
+            sample.pop('gt_score')
+
+        return samples
+
+
+@register_op
+class Gt2FCOSTarget(BaseOperator):
+    """
+    Generate FCOS targets from ground truth data.
+
+    For every sample and every level `lvl` of `downsample_ratios`, the call
+    stores `reg_target{lvl}`, `labels{lvl}` and `centerness{lvl}` arrays
+    shaped as that level's feature grid, then removes `is_crowd`,
+    `difficult`, `gt_class` and `gt_bbox` from the sample.
+    """
+
+    def __init__(self,
+                 object_sizes_boundary,
+                 center_sampling_radius,
+                 downsample_ratios,
+                 norm_reg_targets=False):
+        super(Gt2FCOSTarget, self).__init__()
+        self.center_sampling_radius = center_sampling_radius
+        self.downsample_ratios = downsample_ratios
+        self.INF = np.inf
+        # Close the boundary list with -1 and +inf so that consecutive pairs
+        # form one [low, high] size range per level.
+        self.object_sizes_boundary = [-1] + object_sizes_boundary + [self.INF]
+        object_sizes_of_interest = []
+        for i in range(len(self.object_sizes_boundary) - 1):
+            object_sizes_of_interest.append([
+                self.object_sizes_boundary[i],
+                self.object_sizes_boundary[i + 1]
+            ])
+        self.object_sizes_of_interest = object_sizes_of_interest
+        self.norm_reg_targets = norm_reg_targets
+
+    def _compute_points(self, w, h):
+        """
+        compute the corresponding points in each feature map
+        :param w: image width
+        :param h: image height
+        :return: (points, num_points_each_level); points is the
+            concatenation over all levels of (x, y) locations, and
+            num_points_each_level lists how many points each level has
+        """
+        locations = []
+        for stride in self.downsample_ratios:
+            shift_x = np.arange(0, w, stride).astype(np.float32)
+            shift_y = np.arange(0, h, stride).astype(np.float32)
+            shift_x, shift_y = np.meshgrid(shift_x, shift_y)
+            shift_x = shift_x.flatten()
+            shift_y = shift_y.flatten()
+            # Shift every location by half a stride (integer division).
+            location = np.stack([shift_x, shift_y], axis=1) + stride // 2
+            locations.append(location)
+        num_points_each_level = [len(location) for location in locations]
+        locations = np.concatenate(locations, axis=0)
+        return locations, num_points_each_level
+
+    def _convert_xywh2xyxy(self, gt_bbox, w, h):
+        """
+        convert the bounding box from style xywh to xyxy
+        (not called anywhere inside this class)
+        :param gt_bbox: bounding boxes normalized into [0, 1]
+        :param w: image width
+        :param h: image height
+        :return: bounding boxes in xyxy style
+        """
+        bboxes = gt_bbox.copy()
+        # Scale to pixels, then turn (x, y, w, h) into (x1, y1, x2, y2).
+        bboxes[:, [0, 2]] = bboxes[:, [0, 2]] * w
+        bboxes[:, [1, 3]] = bboxes[:, [1, 3]] * h
+        bboxes[:, 2] = bboxes[:, 0] + bboxes[:, 2]
+        bboxes[:, 3] = bboxes[:, 1] + bboxes[:, 3]
+        return bboxes
+
+    def _check_inside_boxes_limited(self, gt_bbox, xs, ys,
+                                    num_points_each_level):
+        """
+        check if points is within the clipped boxes
+        :param gt_bbox: bounding boxes
+        :param xs: horizontal coordinate of points
+        :param ys: vertical coordinate of points
+        :param num_points_each_level: number of points of each level, in the
+            same order as the rows of xs / ys
+        :return: the mask of points is within gt_box or not
+        """
+        # Repeat the boxes once per point: (num_points, num_boxes, 4).
+        bboxes = np.reshape(
+            gt_bbox, newshape=[1, gt_bbox.shape[0], gt_bbox.shape[1]])
+        bboxes = np.tile(bboxes, reps=[xs.shape[0], 1, 1])
+        ct_x = (bboxes[:, :, 0] + bboxes[:, :, 2]) / 2
+        ct_y = (bboxes[:, :, 1] + bboxes[:, :, 3]) / 2
+        beg = 0
+        clipped_box = bboxes.copy()
+        # For each level, shrink every box to the window of
+        # center_sampling_radius * stride around its centre, never letting
+        # it grow beyond the original box.
+        for lvl, stride in enumerate(self.downsample_ratios):
+            end = beg + num_points_each_level[lvl]
+            stride_exp = self.center_sampling_radius * stride
+            clipped_box[beg:end, :, 0] = np.maximum(
+                bboxes[beg:end, :, 0], ct_x[beg:end, :] - stride_exp)
+            clipped_box[beg:end, :, 1] = np.maximum(
+                bboxes[beg:end, :, 1], ct_y[beg:end, :] - stride_exp)
+            clipped_box[beg:end, :, 2] = np.minimum(
+                bboxes[beg:end, :, 2], ct_x[beg:end, :] + stride_exp)
+            clipped_box[beg:end, :, 3] = np.minimum(
+                bboxes[beg:end, :, 3], ct_y[beg:end, :] + stride_exp)
+            beg = end
+        l_res = xs - clipped_box[:, :, 0]
+        r_res = clipped_box[:, :, 2] - xs
+        t_res = ys - clipped_box[:, :, 1]
+        b_res = clipped_box[:, :, 3] - ys
+        clipped_box_reg_targets = np.stack(
+            [l_res, t_res, r_res, b_res], axis=2)
+        # A point is inside when all four distances are strictly positive.
+        inside_gt_box = np.min(clipped_box_reg_targets, axis=2) > 0
+        return inside_gt_box
+
+    def __call__(self, samples, context=None):
+        """
+        Compute per-level FCOS targets for every sample in `samples`.
+        :param samples: list of sample dicts with 'image', 'gt_bbox' and
+            'gt_class'; gt_bbox columns are used as (x1, y1, x2, y2)
+        :param context: unused
+        :return: the same `samples` list, updated in place
+        """
+        assert len(self.object_sizes_of_interest) == len(self.downsample_ratios), \
+            "object_sizes_of_interest', and 'downsample_ratios' should have same length."
+
+        for sample in samples:
+            im = sample['image']
+            bboxes = sample['gt_bbox']
+            gt_class = sample['gt_class']
+            # calculate the locations
+            h, w = im.shape[1:3]
+            points, num_points_each_level = self._compute_points(w, h)
+            # One [low, high] size range per point, taken from its level.
+            object_scale_exp = []
+            for i, num_pts in enumerate(num_points_each_level):
+                object_scale_exp.append(
+                    np.tile(
+                        np.array([self.object_sizes_of_interest[i]]),
+                        reps=[num_pts, 1]))
+            object_scale_exp = np.concatenate(object_scale_exp, axis=0)
+
+            gt_area = (bboxes[:, 2] - bboxes[:, 0]) * (
+                bboxes[:, 3] - bboxes[:, 1])
+            # Tile point coordinates to (num_points, num_boxes).
+            xs, ys = points[:, 0], points[:, 1]
+            xs = np.reshape(xs, newshape=[xs.shape[0], 1])
+            xs = np.tile(xs, reps=[1, bboxes.shape[0]])
+            ys = np.reshape(ys, newshape=[ys.shape[0], 1])
+            ys = np.tile(ys, reps=[1, bboxes.shape[0]])
+
+            # Distances from each point to the four sides of each box.
+            l_res = xs - bboxes[:, 0]
+            r_res = bboxes[:, 2] - xs
+            t_res = ys - bboxes[:, 1]
+            b_res = bboxes[:, 3] - ys
+            reg_targets = np.stack([l_res, t_res, r_res, b_res], axis=2)
+            if self.center_sampling_radius > 0:
+                is_inside_box = self._check_inside_boxes_limited(
+                    bboxes, xs, ys, num_points_each_level)
+            else:
+                is_inside_box = np.min(reg_targets, axis=2) > 0
+            # check if the targets is inside the corresponding level
+            max_reg_targets = np.max(reg_targets, axis=2)
+            lower_bound = np.tile(
+                np.expand_dims(
+                    object_scale_exp[:, 0], axis=1),
+                reps=[1, max_reg_targets.shape[1]])
+            high_bound = np.tile(
+                np.expand_dims(
+                    object_scale_exp[:, 1], axis=1),
+                reps=[1, max_reg_targets.shape[1]])
+            is_match_current_level = \
+                (max_reg_targets > lower_bound) & \
+                (max_reg_targets < high_bound)
+            # Each point takes the smallest-area box among those it lies in
+            # and whose size matches its level; INF marks "no candidate".
+            points2gtarea = np.tile(
+                np.expand_dims(
+                    gt_area, axis=0), reps=[xs.shape[0], 1])
+            points2gtarea[is_inside_box == 0] = self.INF
+            points2gtarea[is_match_current_level == 0] = self.INF
+            points2min_area = points2gtarea.min(axis=1)
+            points2min_area_ind = points2gtarea.argmin(axis=1)
+            # Class ids are shifted by one so that 0 can mean "no box".
+            labels = gt_class[points2min_area_ind] + 1
+            labels[points2min_area == self.INF] = 0
+            reg_targets = reg_targets[range(xs.shape[0]), points2min_area_ind]
+            # Centerness: sqrt(min(l, r) / max(l, r) * min(t, b) / max(t, b)).
+            ctn_targets = np.sqrt((reg_targets[:, [0, 2]].min(axis=1) / \
+                                  reg_targets[:, [0, 2]].max(axis=1)) * \
+                                  (reg_targets[:, [1, 3]].min(axis=1) / \
+                                   reg_targets[:, [1, 3]].max(axis=1))).astype(np.float32)
+            ctn_targets = np.reshape(
+                ctn_targets, newshape=[ctn_targets.shape[0], 1])
+            ctn_targets[labels <= 0] = 0
+            # NOTE(review): pos_ind and reg_targets_pos are never read below.
+            pos_ind = np.nonzero(labels != 0)
+            reg_targets_pos = reg_targets[pos_ind[0], :]
+            # Cumulative end offsets of each level, used as split points.
+            split_sections = []
+            beg = 0
+            for lvl in range(len(num_points_each_level)):
+                end = beg + num_points_each_level[lvl]
+                split_sections.append(end)
+                beg = end
+            labels_by_level = np.split(labels, split_sections, axis=0)
+            reg_targets_by_level = np.split(
+                reg_targets, split_sections, axis=0)
+            ctn_targets_by_level = np.split(
+                ctn_targets, split_sections, axis=0)
+            for lvl in range(len(self.downsample_ratios)):
+                grid_w = int(np.ceil(w / self.downsample_ratios[lvl]))
+                grid_h = int(np.ceil(h / self.downsample_ratios[lvl]))
+                if self.norm_reg_targets:
+                    # Express the regression distances in units of stride.
+                    sample['reg_target{}'.format(lvl)] = \
+                        np.reshape(
+                            reg_targets_by_level[lvl] / \
+                            self.downsample_ratios[lvl],
+                            newshape=[grid_h, grid_w, 4])
+                else:
+                    sample['reg_target{}'.format(lvl)] = np.reshape(
+                        reg_targets_by_level[lvl],
+                        newshape=[grid_h, grid_w, 4])
+                sample['labels{}'.format(lvl)] = np.reshape(
+                    labels_by_level[lvl], newshape=[grid_h, grid_w, 1])
+                sample['centerness{}'.format(lvl)] = np.reshape(
+                    ctn_targets_by_level[lvl], newshape=[grid_h, grid_w, 1])
+
+            # Drop the raw annotations once the targets are stored.
+            sample.pop('is_crowd', None)
+            sample.pop('difficult', None)
+            sample.pop('gt_class', None)
+            sample.pop('gt_bbox', None)
+        return samples
+
+
+@register_op
+class Gt2GFLTarget(BaseOperator):
+    """
+    Generate GFocal loss targets from ground truth data.
+
+    Args:
+        num_classes (int): number of classes; also the label value given to
+            cells that are not assigned to any ground truth box.
+        downsample_ratios (list): stride of each feature level.
+        grid_cell_scale (int): grid cell side length, in units of stride.
+        cell_offset (int|float): offset added to each cell index before it
+            is multiplied by the stride.
+    """
+
+    def __init__(self,
+                 num_classes=80,
+                 downsample_ratios=[8, 16, 32, 64, 128],
+                 grid_cell_scale=4,
+                 cell_offset=0):
+        super(Gt2GFLTarget, self).__init__()
+        self.num_classes = num_classes
+        # The default list is only read, never mutated, by this class.
+        self.downsample_ratios = downsample_ratios
+        self.grid_cell_scale = grid_cell_scale
+        self.cell_offset = cell_offset
+
+        self.assigner = ATSSAssigner()
+
+    def get_grid_cells(self, featmap_size, scale, stride, offset=0):
+        """
+        Generate grid cells of a feature map for target assignment.
+        Args:
+            featmap_size: Size of a single level feature map, as (h, w).
+            scale: Grid cell scale.
+            stride: Down sample stride of the feature map.
+            offset: Offset of grid cells.
+        return:
+            Grid_cells xyxy position. Size should be [feat_w * feat_h, 4]
+        """
+        cell_size = stride * scale
+        h, w = featmap_size
+        x_range = (np.arange(w, dtype=np.float32) + offset) * stride
+        y_range = (np.arange(h, dtype=np.float32) + offset) * stride
+        x, y = np.meshgrid(x_range, y_range)
+        y = y.flatten()
+        x = x.flatten()
+        # Square cells of side `cell_size` centred on every (x, y).
+        half = 0.5 * cell_size
+        grid_cells = np.stack([x - half, y - half, x + half, y + half],
+                              axis=-1)
+        return grid_cells
+
+    def get_sample(self, assign_gt_inds, gt_bboxes):
+        """
+        Split an assignment result into positive and negative cells.
+        Args:
+            assign_gt_inds: per-cell assignment; values > 0 are 1-based
+                indices into `gt_bboxes`, 0 marks a negative cell.
+            gt_bboxes: ground truth boxes, [N, 4] (a single flat box of 4
+                values is also accepted).
+        return:
+            (pos_inds, neg_inds, pos_gt_bboxes, pos_assigned_gt_inds)
+        """
+        pos_inds = np.unique(np.nonzero(assign_gt_inds > 0)[0])
+        neg_inds = np.unique(np.nonzero(assign_gt_inds == 0)[0])
+        pos_assigned_gt_inds = assign_gt_inds[pos_inds] - 1
+
+        if gt_bboxes.size == 0:
+            # hack for index error case
+            assert pos_assigned_gt_inds.size == 0
+            pos_gt_bboxes = np.empty_like(gt_bboxes).reshape(-1, 4)
+        else:
+            if len(gt_bboxes.shape) < 2:
+                # `reshape` returns the 2-D array. The former
+                # `resize(-1, 4)` works in place, returns None and rejects
+                # negative dimensions, so this branch could never succeed.
+                gt_bboxes = gt_bboxes.reshape(-1, 4)
+            pos_gt_bboxes = gt_bboxes[pos_assigned_gt_inds, :]
+        return pos_inds, neg_inds, pos_gt_bboxes, pos_assigned_gt_inds
+
+    def __call__(self, samples, context=None):
+        """
+        Assign ground truth boxes to grid cells for every sample.
+
+        Adds `grid_cells`, `labels`, `label_weights`, `bbox_targets` and
+        `pos_num` to each sample and removes `is_crowd`, `difficult`,
+        `gt_class`, `gt_bbox` and `gt_score`. Returns `samples`.
+        """
+        assert len(samples) > 0
+        # get grid cells of image; every sample is assumed to share the
+        # first sample's image size
+        h, w = samples[0]['image'].shape[1:3]
+        multi_level_grid_cells = []
+        for stride in self.downsample_ratios:
+            featmap_size = (int(math.ceil(h / stride)),
+                            int(math.ceil(w / stride)))
+            multi_level_grid_cells.append(
+                self.get_grid_cells(featmap_size, self.grid_cell_scale, stride,
+                                    self.cell_offset))
+        # pixel cell number of multi-level feature maps
+        num_level_cells = [
+            grid_cells.shape[0] for grid_cells in multi_level_grid_cells
+        ]
+        # target assign on all images
+        for sample in samples:
+            # concat all level cells to a single array; done per sample so
+            # that each sample owns its own `grid_cells` array
+            grid_cells = np.concatenate(multi_level_grid_cells)
+            gt_bboxes = sample['gt_bbox']
+            gt_labels = sample['gt_class'].squeeze()
+            if gt_labels.size == 1:
+                # squeeze() collapsed a single label to 0-d; restore 1-d
+                gt_labels = np.array([gt_labels]).astype(np.int32)
+            gt_bboxes_ignore = None
+            assign_gt_inds, _ = self.assigner(grid_cells, num_level_cells,
+                                              gt_bboxes, gt_bboxes_ignore,
+                                              gt_labels)
+            pos_inds, neg_inds, pos_gt_bboxes, pos_assigned_gt_inds = self.get_sample(
+                assign_gt_inds, gt_bboxes)
+
+            num_cells = grid_cells.shape[0]
+            bbox_targets = np.zeros_like(grid_cells)
+            # Unassigned cells keep the label value `num_classes`.
+            labels = np.ones([num_cells], dtype=np.int64) * self.num_classes
+            label_weights = np.zeros([num_cells], dtype=np.float32)
+
+            if len(pos_inds) > 0:
+                bbox_targets[pos_inds, :] = pos_gt_bboxes
+                if not np.any(gt_labels):
+                    # every ground truth label is 0
+                    labels[pos_inds] = 0
+                else:
+                    labels[pos_inds] = gt_labels[pos_assigned_gt_inds]
+
+                label_weights[pos_inds] = 1.0
+            if len(neg_inds) > 0:
+                label_weights[neg_inds] = 1.0
+            sample['grid_cells'] = grid_cells
+            sample['labels'] = labels
+            sample['label_weights'] = label_weights
+            sample['bbox_targets'] = bbox_targets
+            # At least 1, even when there are no positive cells.
+            sample['pos_num'] = max(pos_inds.size, 1)
+            sample.pop('is_crowd', None)
+            sample.pop('difficult', None)
+            sample.pop('gt_class', None)
+            sample.pop('gt_bbox', None)
+            sample.pop('gt_score', None)
+        return samples
+
+
+@register_op
+class Gt2TTFTarget(BaseOperator):
+    __shared__ = ['num_classes']
+    """
+    Gt2TTFTarget
+    Generate TTFNet targets by ground truth data
+
+    Args:
+        num_classes(int): the number of classes.
+        down_ratio(int): the down ratio from images to heatmap, 4 by default.
+        alpha(float): the alpha parameter to generate gaussian target.
+            0.54 by default.
+    """
+
+    def __init__(self, num_classes=80, down_ratio=4, alpha=0.54):
+        super(Gt2TTFTarget, self).__init__()
+        self.down_ratio = down_ratio
+        self.num_classes = num_classes
+        self.alpha = alpha
+
+    def __call__(self, samples, context=None):
+        output_size = samples[0]['image'].shape[1]
+        feat_size = output_size // self.down_ratio
+        for sample in samples:
+            heatmap = np.zeros(
+                (self.num_classes, feat_size, feat_size), dtype='float32')
+            box_target = np.ones(
+                (4, feat_size, feat_size), dtype='float32') * -1
+            reg_weight = np.zeros((1, feat_size, feat_size), dtype='float32')
+
+            gt_bbox = sample['gt_bbox']
+            gt_class = sample['gt_class']
+
+            bbox_w = gt_bbox[:, 2] - gt_bbox[:, 0] + 1
+            bbox_h = gt_bbox[:, 3] - gt_bbox[:, 1] + 1
+            area = bbox_w * bbox_h
+            boxes_areas_log = np.log(area)
+            boxes_ind = np.argsort(boxes_areas_log, axis=0)[::-1]
+            boxes_area_topk_log = boxes_areas_log[boxes_ind]
+            gt_bbox = gt_bbox[boxes_ind]
+            gt_class = gt_class[boxes_ind]
+
+            feat_gt_bbox = gt_bbox / self.down_ratio
+            feat_gt_bbox = np.clip(feat_gt_bbox, 0, feat_size - 1)
+            feat_hs, feat_ws = (feat_gt_bbox[:, 3] - feat_gt_bbox[:, 1],
+                                feat_gt_bbox[:, 2] - feat_gt_bbox[:, 0])
+
+            ct_inds = np.stack(
+                [(gt_bbox[:, 0] + gt_bbox[:, 2]) / 2,
+                 (gt_bbox[:, 1] + gt_bbox[:, 3]) / 2],
+                axis=1) / self.down_ratio
+
+            h_radiuses_alpha = (feat_hs / 2. * self.alpha).astype('int32')
+            w_radiuses_alpha = (feat_ws / 2. * self.alpha).astype('int32')
+
+            for k in range(len(gt_bbox)):
+                cls_id = gt_class[k]
+                fake_heatmap = np.zeros(
+                    (feat_size, feat_size), dtype='float32')
+                self.draw_truncate_gaussian(fake_heatmap, ct_inds[k],
+                                            h_radiuses_alpha[k],
+                                            w_radiuses_alpha[k])
+
+                heatmap[cls_id] = np.maximum(heatmap[cls_id], fake_heatmap)
+                box_target_inds = fake_heatmap > 0
+                box_target[:, box_target_inds] = gt_bbox[k][:, None]
+
+                local_heatmap = fake_heatmap[box_target_inds]
+                ct_div = np.sum(local_heatmap)
+                local_heatmap *= boxes_area_topk_log[k]
+                reg_weight[0, box_target_inds] = local_heatmap / ct_div
+            sample['ttf_heatmap'] = heatmap
+            sample['ttf_box_target'] = box_target
+            sample['ttf_reg_weight'] = reg_weight
+            sample.pop('is_crowd', None)
+            sample.pop('difficult', None)
+            sample.pop('gt_class', None)
+            sample.pop('gt_bbox', None)
+            sample.pop('gt_score', None)
+        return samples
+
+    def draw_truncate_gaussian(self, heatmap, center, h_radius, w_radius):
+        h, w = 2 * h_radius + 1, 2 * w_radius + 1
+        sigma_x = w / 6
+        sigma_y = h / 6
+        gaussian = gaussian2D((h, w), sigma_x, sigma_y)
+
+        x, y = int(center[0]), int(center[1])
+
+        height, width = heatmap.shape[0:2]
+
+        left, right = min(x, w_radius), min(width - x, w_radius + 1)
+        top, bottom = min(y, h_radius), min(height - y, h_radius + 1)
+
+        masked_heatmap = heatmap[y - top:y + bottom, x - left:x + right]
+        masked_gaussian = gaussian[h_radius - top:h_radius + bottom, w_radius -
+                                   left:w_radius + right]
+        if min(masked_gaussian.shape) > 0 and min(masked_heatmap.shape) > 0:
+            heatmap[y - top:y + bottom, x - left:x + right] = np.maximum(
+                masked_heatmap, masked_gaussian)
+        return heatmap
+
+
+@register_op
+class Gt2Solov2Target(BaseOperator):
+    """Assign mask target and labels in SOLOv2 network.
+    The code of this function is based on:
+        https://github.com/WXinlong/SOLO/blob/master/mmdet/models/anchor_heads/solov2_head.py#L271
+    Args:
+        num_grids (list): The list of feature map grids size.
+        scale_ranges (list): The list of mask boundary range.
+        coord_sigma (float): The coefficient of coordinate area length.
+        sampling_ratio (float): The ratio of down sampling.
+    """
+
+    def __init__(self,
+                 num_grids=[40, 36, 24, 16, 12],
+                 scale_ranges=[[1, 96], [48, 192], [96, 384], [192, 768],
+                               [384, 2048]],
+                 coord_sigma=0.2,
+                 sampling_ratio=4.0):
+        super(Gt2Solov2Target, self).__init__()
+        self.num_grids = num_grids
+        self.scale_ranges = scale_ranges
+        self.coord_sigma = coord_sigma
+        self.sampling_ratio = sampling_ratio
+
+    def _scale_size(self, im, scale):
+        h, w = im.shape[:2]
+        new_size = (int(w * float(scale) + 0.5), int(h * float(scale) + 0.5))
+        resized_img = cv2.resize(
+            im, None, None, fx=scale, fy=scale, interpolation=cv2.INTER_LINEAR)
+        return resized_img
+
+    def __call__(self, samples, context=None):
+        sample_id = 0
+        max_ins_num = [0] * len(self.num_grids)
+        for sample in samples:
+            gt_bboxes_raw = sample['gt_bbox']
+            gt_labels_raw = sample['gt_class'] + 1
+            im_c, im_h, im_w = sample['image'].shape[:]
+            gt_masks_raw = sample['gt_segm'].astype(np.uint8)
+            mask_feat_size = [
+                int(im_h / self.sampling_ratio),
+                int(im_w / self.sampling_ratio)
+            ]
+            gt_areas = np.sqrt((gt_bboxes_raw[:, 2] - gt_bboxes_raw[:, 0]) *
+                               (gt_bboxes_raw[:, 3] - gt_bboxes_raw[:, 1]))
+            ins_ind_label_list = []
+            idx = 0
+            for (lower_bound, upper_bound), num_grid \
+                    in zip(self.scale_ranges, self.num_grids):
+
+                hit_indices = ((gt_areas >= lower_bound) &
+                               (gt_areas <= upper_bound)).nonzero()[0]
+                num_ins = len(hit_indices)
+
+                ins_label = []
+                grid_order = []
+                cate_label = np.zeros([num_grid, num_grid], dtype=np.int64)
+                ins_ind_label = np.zeros([num_grid**2], dtype=np.bool)
+
+                if num_ins == 0:
+                    ins_label = np.zeros(
+                        [1, mask_feat_size[0], mask_feat_size[1]],
+                        dtype=np.uint8)
+                    ins_ind_label_list.append(ins_ind_label)
+                    sample['cate_label{}'.format(idx)] = cate_label.flatten()
+                    sample['ins_label{}'.format(idx)] = ins_label
+                    sample['grid_order{}'.format(idx)] = np.asarray(
+                        [sample_id * num_grid * num_grid + 0], dtype=np.int32)
+                    idx += 1
+                    continue
+                gt_bboxes = gt_bboxes_raw[hit_indices]
+                gt_labels = gt_labels_raw[hit_indices]
+                gt_masks = gt_masks_raw[hit_indices, ...]
+
+                half_ws = 0.5 * (
+                    gt_bboxes[:, 2] - gt_bboxes[:, 0]) * self.coord_sigma
+                half_hs = 0.5 * (
+                    gt_bboxes[:, 3] - gt_bboxes[:, 1]) * self.coord_sigma
+
+                for seg_mask, gt_label, half_h, half_w in zip(
+                        gt_masks, gt_labels, half_hs, half_ws):
+                    if seg_mask.sum() == 0:
+                        continue
+                    # mass center
+                    upsampled_size = (mask_feat_size[0] * 4,
+                                      mask_feat_size[1] * 4)
+                    center_h, center_w = ndimage.measurements.center_of_mass(
+                        seg_mask)
+                    coord_w = int(
+                        (center_w / upsampled_size[1]) // (1. / num_grid))
+                    coord_h = int(
+                        (center_h / upsampled_size[0]) // (1. / num_grid))
+
+                    # left, top, right, down
+                    top_box = max(0,
+                                  int(((center_h - half_h) / upsampled_size[0])
+                                      // (1. / num_grid)))
+                    down_box = min(
+                        num_grid - 1,
+                        int(((center_h + half_h) / upsampled_size[0]) //
+                            (1. / num_grid)))
+                    left_box = max(
+                        0,
+                        int(((center_w - half_w) / upsampled_size[1]) //
+                            (1. / num_grid)))
+                    right_box = min(num_grid - 1,
+                                    int(((center_w + half_w) /
+                                         upsampled_size[1]) //
+                                        (1. / num_grid)))
+
+                    top = max(top_box, coord_h - 1)
+                    down = min(down_box, coord_h + 1)
+                    left = max(coord_w - 1, left_box)
+                    right = min(right_box, coord_w + 1)
+
+                    cate_label[top:(down + 1), left:(right + 1)] = gt_label
+                    seg_mask = self._scale_size(
+                        seg_mask, scale=1. / self.sampling_ratio)
+                    for i in range(top, down + 1):
+                        for j in range(left, right + 1):
+                            label = int(i * num_grid + j)
+                            cur_ins_label = np.zeros(
+                                [mask_feat_size[0], mask_feat_size[1]],
+                                dtype=np.uint8)
+                            cur_ins_label[:seg_mask.shape[0], :seg_mask.shape[
+                                1]] = seg_mask
+                            ins_label.append(cur_ins_label)
+                            ins_ind_label[label] = True
+                            grid_order.append(sample_id * num_grid * num_grid +
+                                              label)
+                if ins_label == []:
+                    ins_label = np.zeros(
+                        [1, mask_feat_size[0], mask_feat_size[1]],
+                        dtype=np.uint8)
+                    ins_ind_label_list.append(ins_ind_label)
+                    sample['cate_label{}'.format(idx)] = cate_label.flatten()
+                    sample['ins_label{}'.format(idx)] = ins_label
+                    sample['grid_order{}'.format(idx)] = np.asarray(
+                        [sample_id * num_grid * num_grid + 0], dtype=np.int32)
+                else:
+                    ins_label = np.stack(ins_label, axis=0)
+                    ins_ind_label_list.append(ins_ind_label)
+                    sample['cate_label{}'.format(idx)] = cate_label.flatten()
+                    sample['ins_label{}'.format(idx)] = ins_label
+                    sample['grid_order{}'.format(idx)] = np.asarray(
+                        grid_order, dtype=np.int32)
+                    assert len(grid_order) > 0
+                max_ins_num[idx] = max(
+                    max_ins_num[idx],
+                    sample['ins_label{}'.format(idx)].shape[0])
+                idx += 1
+            ins_ind_labels = np.concatenate([
+                ins_ind_labels_level_img
+                for ins_ind_labels_level_img in ins_ind_label_list
+            ])
+            fg_num = np.sum(ins_ind_labels)
+            sample['fg_num'] = fg_num
+            sample_id += 1
+
+            sample.pop('is_crowd')
+            sample.pop('gt_class')
+            sample.pop('gt_bbox')
+            sample.pop('gt_poly')
+            sample.pop('gt_segm')
+
+        # padding batch
+        for data in samples:
+            for idx in range(len(self.num_grids)):
+                gt_ins_data = np.zeros(
+                    [
+                        max_ins_num[idx],
+                        data['ins_label{}'.format(idx)].shape[1],
+                        data['ins_label{}'.format(idx)].shape[2]
+                    ],
+                    dtype=np.uint8)
+                gt_ins_data[0:data['ins_label{}'.format(idx)].shape[
+                    0], :, :] = data['ins_label{}'.format(idx)]
+                gt_grid_order = np.zeros([max_ins_num[idx]], dtype=np.int32)
+                gt_grid_order[0:data['grid_order{}'.format(idx)].shape[
+                    0]] = data['grid_order{}'.format(idx)]
+                data['ins_label{}'.format(idx)] = gt_ins_data
+                data['grid_order{}'.format(idx)] = gt_grid_order
+
+        return samples
+
+
+@register_op
+class Gt2SparseRCNNTarget(BaseOperator):
+    '''
+    Generate SparseRCNN targets by groud truth data
+    '''
+
+    def __init__(self):
+        super(Gt2SparseRCNNTarget, self).__init__()
+
+    def __call__(self, samples, context=None):
+        for sample in samples:
+            im = sample["image"]
+            h, w = im.shape[1:3]
+            img_whwh = np.array([w, h, w, h], dtype=np.int32)
+            sample["img_whwh"] = img_whwh
+            if "scale_factor" in sample:
+                sample["scale_factor_wh"] = np.array(
+                    [sample["scale_factor"][1], sample["scale_factor"][0]],
+                    dtype=np.float32)
+            else:
+                sample["scale_factor_wh"] = np.array(
+                    [1.0, 1.0], dtype=np.float32)
+
+        return samples
+
+
+@register_op
+class PadMaskBatch(BaseOperator):
+    """
+    Pad a batch of samples so they can be divisible by a stride.
+    The layout of each image should be 'CHW'.
+    Args:
+        pad_to_stride (int): If `pad_to_stride > 0`, pad zeros to ensure
+            height and width is divisible by `pad_to_stride`.
+        return_pad_mask (bool): If `return_pad_mask = True`, return
+            `pad_mask` for transformer.
+    """
+
+    def __init__(self, pad_to_stride=0, return_pad_mask=False):
+        super(PadMaskBatch, self).__init__()
+        self.pad_to_stride = pad_to_stride
+        self.return_pad_mask = return_pad_mask
+
+    def __call__(self, samples, context=None):
+        """
+        Args:
+            samples (list): a batch of sample, each is dict.
+        """
+        coarsest_stride = self.pad_to_stride
+
+        max_shape = np.array([data['image'].shape for data in samples]).max(
+            axis=0)
+        if coarsest_stride > 0:
+            max_shape[1] = int(
+                np.ceil(max_shape[1] / coarsest_stride) * coarsest_stride)
+            max_shape[2] = int(
+                np.ceil(max_shape[2] / coarsest_stride) * coarsest_stride)
+
+        for data in samples:
+            im = data['image']
+            im_c, im_h, im_w = im.shape[:]
+            padding_im = np.zeros(
+                (im_c, max_shape[1], max_shape[2]), dtype=np.float32)
+            padding_im[:, :im_h, :im_w] = im
+            data['image'] = padding_im
+            if 'semantic' in data and data['semantic'] is not None:
+                semantic = data['semantic']
+                padding_sem = np.zeros(
+                    (1, max_shape[1], max_shape[2]), dtype=np.float32)
+                padding_sem[:, :im_h, :im_w] = semantic
+                data['semantic'] = padding_sem
+            if 'gt_segm' in data and data['gt_segm'] is not None:
+                gt_segm = data['gt_segm']
+                padding_segm = np.zeros(
+                    (gt_segm.shape[0], max_shape[1], max_shape[2]),
+                    dtype=np.uint8)
+                padding_segm[:, :im_h, :im_w] = gt_segm
+                data['gt_segm'] = padding_segm
+            if self.return_pad_mask:
+                padding_mask = np.zeros(
+                    (max_shape[1], max_shape[2]), dtype=np.float32)
+                padding_mask[:im_h, :im_w] = 1.
+                data['pad_mask'] = padding_mask
+
+            if 'gt_rbox2poly' in data and data['gt_rbox2poly'] is not None:
+                # ploy to rbox
+                polys = data['gt_rbox2poly']
+                rbox = bbox_utils.poly2rbox(polys)
+                data['gt_rbox'] = rbox
+
+        return samples
+
+
+@register_op
+class Gt2CenterNetTarget(BaseOperator):
+    """Gt2CenterNetTarget
+    Genterate CenterNet targets by ground-truth
+    Args:
+        down_ratio (int): The down sample ratio between output feature and
+                          input image.
+        num_classes (int): The number of classes, 80 by default.
+        max_objs (int): The maximum objects detected, 128 by default.
+    """
+
+    def __init__(self, down_ratio, num_classes=80, max_objs=128):
+        super(Gt2CenterNetTarget, self).__init__()
+        self.down_ratio = down_ratio
+        self.num_classes = num_classes
+        self.max_objs = max_objs
+
+    def __call__(self, sample, context=None):
+        input_h, input_w = sample['image'].shape[1:]
+        output_h = input_h // self.down_ratio
+        output_w = input_w // self.down_ratio
+        num_classes = self.num_classes
+        c = sample['center']
+        s = sample['scale']
+        gt_bbox = sample['gt_bbox']
+        gt_class = sample['gt_class']
+
+        hm = np.zeros((num_classes, output_h, output_w), dtype=np.float32)
+        wh = np.zeros((self.max_objs, 2), dtype=np.float32)
+        dense_wh = np.zeros((2, output_h, output_w), dtype=np.float32)
+        reg = np.zeros((self.max_objs, 2), dtype=np.float32)
+        ind = np.zeros((self.max_objs), dtype=np.int64)
+        reg_mask = np.zeros((self.max_objs), dtype=np.int32)
+        cat_spec_wh = np.zeros(
+            (self.max_objs, num_classes * 2), dtype=np.float32)
+        cat_spec_mask = np.zeros(
+            (self.max_objs, num_classes * 2), dtype=np.int32)
+
+        trans_output = get_affine_transform(c, [s, s], 0, [output_w, output_h])
+
+        gt_det = []
+        for i, (bbox, cls) in enumerate(zip(gt_bbox, gt_class)):
+            cls = int(cls)
+            bbox[:2] = affine_transform(bbox[:2], trans_output)
+            bbox[2:] = affine_transform(bbox[2:], trans_output)
+            bbox[[0, 2]] = np.clip(bbox[[0, 2]], 0, output_w - 1)
+            bbox[[1, 3]] = np.clip(bbox[[1, 3]], 0, output_h - 1)
+            h, w = bbox[3] - bbox[1], bbox[2] - bbox[0]
+            if h > 0 and w > 0:
+                radius = gaussian_radius((math.ceil(h), math.ceil(w)), 0.7)
+                radius = max(0, int(radius))
+                ct = np.array(
+                    [(bbox[0] + bbox[2]) / 2, (bbox[1] + bbox[3]) / 2],
+                    dtype=np.float32)
+                ct_int = ct.astype(np.int32)
+                draw_umich_gaussian(hm[cls], ct_int, radius)
+                wh[i] = 1. * w, 1. * h
+                ind[i] = ct_int[1] * output_w + ct_int[0]
+                reg[i] = ct - ct_int
+                reg_mask[i] = 1
+                cat_spec_wh[i, cls * 2:cls * 2 + 2] = wh[i]
+                cat_spec_mask[i, cls * 2:cls * 2 + 2] = 1
+                gt_det.append([
+                    ct[0] - w / 2, ct[1] - h / 2, ct[0] + w / 2, ct[1] + h / 2,
+                    1, cls
+                ])
+
+        sample.pop('gt_bbox', None)
+        sample.pop('gt_class', None)
+        sample.pop('center', None)
+        sample.pop('scale', None)
+        sample.pop('is_crowd', None)
+        sample.pop('difficult', None)
+        sample['heatmap'] = hm
+        sample['index_mask'] = reg_mask
+        sample['index'] = ind
+        sample['size'] = wh
+        sample['offset'] = reg
+        return sample

+ 86 - 0
paddlers/models/ppdet/data/transform/gridmask_utils.py

@@ -0,0 +1,86 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# The code is based on:
+# https://github.com/dvlab-research/GridMask/blob/master/detection_grid/maskrcnn_benchmark/data/transforms/grid.py
+
+from __future__ import absolute_import
+from __future__ import print_function
+from __future__ import division
+
+import numpy as np
+from PIL import Image
+
+
+class Gridmask(object):
+    def __init__(self,
+                 use_h=True,
+                 use_w=True,
+                 rotate=1,
+                 offset=False,
+                 ratio=0.5,
+                 mode=1,
+                 prob=0.7,
+                 upper_iter=360000):
+        super(Gridmask, self).__init__()
+        self.use_h = use_h
+        self.use_w = use_w
+        self.rotate = rotate
+        self.offset = offset
+        self.ratio = ratio
+        self.mode = mode
+        self.prob = prob
+        self.st_prob = prob
+        self.upper_iter = upper_iter
+
+    def __call__(self, x, curr_iter):
+        self.prob = self.st_prob * min(1, 1.0 * curr_iter / self.upper_iter)
+        if np.random.rand() > self.prob:
+            return x
+        h, w, _ = x.shape
+        hh = int(1.5 * h)
+        ww = int(1.5 * w)
+        d = np.random.randint(2, h)
+        self.l = min(max(int(d * self.ratio + 0.5), 1), d - 1)
+        mask = np.ones((hh, ww), np.float32)
+        st_h = np.random.randint(d)
+        st_w = np.random.randint(d)
+        if self.use_h:
+            for i in range(hh // d):
+                s = d * i + st_h
+                t = min(s + self.l, hh)
+                mask[s:t, :] *= 0
+        if self.use_w:
+            for i in range(ww // d):
+                s = d * i + st_w
+                t = min(s + self.l, ww)
+                mask[:, s:t] *= 0
+
+        r = np.random.randint(self.rotate)
+        mask = Image.fromarray(np.uint8(mask))
+        mask = mask.rotate(r)
+        mask = np.asarray(mask)
+        mask = mask[(hh - h) // 2:(hh - h) // 2 + h, (ww - w) // 2:(ww - w) //
+                    2 + w].astype(np.float32)
+
+        if self.mode == 1:
+            mask = 1 - mask
+        mask = np.expand_dims(mask, axis=-1)
+        if self.offset:
+            offset = (2 * (np.random.rand(h, w) - 0.5)).astype(np.float32)
+            x = (x * mask + offset * (1 - mask)).astype(x.dtype)
+        else:
+            x = (x * mask).astype(x.dtype)
+
+        return x

+ 868 - 0
paddlers/models/ppdet/data/transform/keypoint_operators.py

@@ -0,0 +1,868 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# function:
+#    operators to process sample,
+#    eg: decode/resize/crop image
+
+from __future__ import absolute_import
+
+try:
+    from collections.abc import Sequence
+except Exception:
+    from collections import Sequence
+
+import cv2
+import numpy as np
+import math
+import copy
+
+from ...modeling.keypoint_utils import get_affine_mat_kernel, warp_affine_joints, get_affine_transform, affine_transform, get_warp_matrix
+from paddlers.models.ppdet.core.workspace import serializable
+from paddlers.models.ppdet.utils.logger import setup_logger
+logger = setup_logger(__name__)
+
+# Module-level list, created empty here.
+# NOTE(review): nothing in the visible part of this file appends to it -
+# confirm whether it is still used.
+registered_ops = []
+
+# Names exported by `from ... import *`.
+__all__ = [
+    'RandomAffine',
+    'KeyPointFlip',
+    'TagGenerate',
+    'ToHeatmaps',
+    'NormalizePermute',
+    'EvalAffine',
+    'RandomFlipHalfBodyTransform',
+    'TopDownAffine',
+    'ToHeatmapsTopDown',
+    'ToHeatmapsTopDown_DARK',
+    'ToHeatmapsTopDown_UDP',
+    'TopDownEvalAffine',
+    'AugmentationbyInformantionDropping',
+]
+
+
+def register_keypointop(cls):
+    return serializable(cls)
+
+
+@register_keypointop
+class KeyPointFlip(object):
+    """Get the fliped image by flip_prob. flip the coords also
+    the left coords and right coords should exchange while flip, for the right keypoint will be left keypoint after image fliped
+
+    Args:
+        flip_permutation (list[17]): the left-right exchange order list corresponding to [0,1,2,...,16]
+        hmsize (list[2]): output heatmap's shape list of different scale outputs of higherhrnet
+        flip_prob (float): the ratio whether to flip the image
+        records(dict): the dict contained the image, mask and coords
+
+    Returns:
+        records(dict): contain the image, mask and coords after tranformed
+
+    """
+
+    def __init__(self, flip_permutation, hmsize, flip_prob=0.5):
+        super(KeyPointFlip, self).__init__()
+        assert isinstance(flip_permutation, Sequence)
+        self.flip_permutation = flip_permutation
+        self.flip_prob = flip_prob
+        self.hmsize = hmsize
+
+    def __call__(self, records):
+        image = records['image']
+        kpts_lst = records['joints']
+        mask_lst = records['mask']
+        flip = np.random.random() < self.flip_prob
+        if flip:
+            image = image[:, ::-1]
+            for idx, hmsize in enumerate(self.hmsize):
+                if len(mask_lst) > idx:
+                    mask_lst[idx] = mask_lst[idx][:, ::-1]
+                if kpts_lst[idx].ndim == 3:
+                    kpts_lst[idx] = kpts_lst[idx][:, self.flip_permutation]
+                else:
+                    kpts_lst[idx] = kpts_lst[idx][self.flip_permutation]
+                kpts_lst[idx][..., 0] = hmsize - kpts_lst[idx][..., 0]
+                kpts_lst[idx] = kpts_lst[idx].astype(np.int64)
+                kpts_lst[idx][kpts_lst[idx][..., 0] >= hmsize, 2] = 0
+                kpts_lst[idx][kpts_lst[idx][..., 1] >= hmsize, 2] = 0
+                kpts_lst[idx][kpts_lst[idx][..., 0] < 0, 2] = 0
+                kpts_lst[idx][kpts_lst[idx][..., 1] < 0, 2] = 0
+        records['image'] = image
+        records['joints'] = kpts_lst
+        records['mask'] = mask_lst
+        return records
+
+
+@register_keypointop
+class RandomAffine(object):
+    """apply affine transform to image, mask and coords
+    to achieve the rotate, scale and shift effect for training image
+
+    Args:
+        max_degree (float): the max abslute rotate degree to apply, transform range is [-max_degree, max_degree]
+        max_scale (list[2]): the scale range to apply, transform range is [min, max]
+        max_shift (float): the max abslute shift ratio to apply, transform range is [-max_shift*imagesize, max_shift*imagesize]
+        hmsize (list[2]): output heatmap's shape list of different scale outputs of higherhrnet
+        trainsize (int): the standard length used to train, the 'scale_type' of [h,w] will be resize to trainsize for standard
+        scale_type (str): the length of [h,w] to used for trainsize, chosed between 'short' and 'long'
+        records(dict): the dict contained the image, mask and coords
+
+    Returns:
+        records(dict): contain the image, mask and coords after tranformed
+
+    """
+
+    def __init__(self,
+                 max_degree=30,
+                 scale=[0.75, 1.5],
+                 max_shift=0.2,
+                 hmsize=[128, 256],
+                 trainsize=512,
+                 scale_type='short'):
+        super(RandomAffine, self).__init__()
+        self.max_degree = max_degree
+        self.min_scale = scale[0]
+        self.max_scale = scale[1]
+        self.max_shift = max_shift
+        self.hmsize = hmsize
+        self.trainsize = trainsize
+        self.scale_type = scale_type
+
+    def _get_affine_matrix(self, center, scale, res, rot=0):
+        """Generate transformation matrix."""
+        h = scale
+        t = np.zeros((3, 3), dtype=np.float32)
+        t[0, 0] = float(res[1]) / h
+        t[1, 1] = float(res[0]) / h
+        t[0, 2] = res[1] * (-float(center[0]) / h + .5)
+        t[1, 2] = res[0] * (-float(center[1]) / h + .5)
+        t[2, 2] = 1
+        if rot != 0:
+            rot = -rot  # To match direction of rotation from cropping
+            rot_mat = np.zeros((3, 3), dtype=np.float32)
+            rot_rad = rot * np.pi / 180
+            sn, cs = np.sin(rot_rad), np.cos(rot_rad)
+            rot_mat[0, :2] = [cs, -sn]
+            rot_mat[1, :2] = [sn, cs]
+            rot_mat[2, 2] = 1
+            # Need to rotate around center
+            t_mat = np.eye(3)
+            t_mat[0, 2] = -res[1] / 2
+            t_mat[1, 2] = -res[0] / 2
+            t_inv = t_mat.copy()
+            t_inv[:2, 2] *= -1
+            t = np.dot(t_inv, np.dot(rot_mat, np.dot(t_mat, t)))
+        return t
+
+    def __call__(self, records):
+        """Apply one random rotate/scale/shift to the image, mask and joints.
+
+        Args:
+            records (dict): must hold 'image', 'joints' and 'mask'. The
+                joints' first two channels are multiplied by the image
+                (w, h) in place, and a non-None mask is multiplied by 255 in
+                place. 'mask' may be None.
+
+        Returns:
+            records (dict): 'image' is the warped trainsize x trainsize
+                image; 'joints' and 'mask' become lists with one entry per
+                value in ``self.hmsize`` (mask entries are None when no mask
+                was given).
+
+        Raises:
+            ValueError: if ``self.scale_type`` is neither 'long' nor 'short'.
+        """
+        image = records['image']
+        keypoints = records['joints']
+        heatmap_mask = records['mask']
+
+        # The random draws below are kept in their original order so seeded
+        # runs stay reproducible.
+        degree = (np.random.random() * 2 - 1) * self.max_degree
+        shape = np.array(image.shape[:2][::-1])  # (w, h)
+        center = shape / 2
+
+        aug_scale = np.random.random() * (self.max_scale - self.min_scale
+                                          ) + self.min_scale
+        if self.scale_type == 'long':
+            scale = max(shape[0], shape[1]) / 1.0
+        elif self.scale_type == 'short':
+            scale = min(shape[0], shape[1]) / 1.0
+        else:
+            raise ValueError('Unknown scale type: {}'.format(self.scale_type))
+        roi_size = aug_scale * scale
+        dx = int(0)
+        dy = int(0)
+        if self.max_shift > 0:
+            dx = np.random.randint(-self.max_shift * roi_size,
+                                   self.max_shift * roi_size)
+            dy = np.random.randint(-self.max_shift * roi_size,
+                                   self.max_shift * roi_size)
+
+        center += np.array([dx, dy])
+
+        keypoints[..., :2] *= shape
+        # Only scale the mask when there is one; the loop below already
+        # treats a missing mask as optional.
+        if heatmap_mask is not None:
+            heatmap_mask *= 255
+        kpts_lst = []
+        mask_lst = []
+
+        image_affine_mat = self._get_affine_matrix(
+            center, roi_size, (self.trainsize, self.trainsize), degree)[:2]
+        image = cv2.warpAffine(
+            image,
+            image_affine_mat, (self.trainsize, self.trainsize),
+            flags=cv2.INTER_LINEAR)
+        for hmsize in self.hmsize:
+            kpts = copy.deepcopy(keypoints)
+            mask_affine_mat = self._get_affine_matrix(
+                center, roi_size, (hmsize, hmsize), degree)[:2]
+            mask = None
+            if heatmap_mask is not None:
+                mask = cv2.warpAffine(heatmap_mask, mask_affine_mat,
+                                      (hmsize, hmsize))
+                # Binarise again after interpolation.
+                mask = ((mask / 255) > 0.5).astype(np.float32)
+            kpts[..., 0:2] = warp_affine_joints(kpts[..., 0:2].copy(),
+                                                mask_affine_mat)
+            # Zero the third channel of joints warped outside the heatmap.
+            kpts[np.trunc(kpts[..., 0]) >= hmsize, 2] = 0
+            kpts[np.trunc(kpts[..., 1]) >= hmsize, 2] = 0
+            kpts[np.trunc(kpts[..., 0]) < 0, 2] = 0
+            kpts[np.trunc(kpts[..., 1]) < 0, 2] = 0
+            kpts_lst.append(kpts)
+            mask_lst.append(mask)
+        records['image'] = image
+        records['joints'] = kpts_lst
+        records['mask'] = mask_lst
+        return records
+
+
+@register_keypointop
+class EvalAffine(object):
+    """Affine-resize the image (and mask, if present) for evaluation.
+
+    The warp matrix and the output size both come from
+    ``get_affine_mat_kernel``, called with the image height, width and
+    ``size``.
+
+    Args:
+        size (int): the standard length used to train; the 'short' side of
+            [h, w] is resized to it
+        stride (int): stored on the instance; not read by ``__call__``
+        records(dict): the dict containing the image, mask and coords
+
+    Returns:
+        records(dict): holds the resized image and mask; any 'joints' entry
+            is removed
+
+    """
+
+    def __init__(self, size, stride=64):
+        super(EvalAffine, self).__init__()
+        self.size, self.stride = size, stride
+
+    def __call__(self, records):
+        image = records['image']
+        if 'mask' in records:
+            mask = records['mask']
+        else:
+            mask = None
+        height, width, _ = image.shape
+        trans, size_resized = get_affine_mat_kernel(
+            height, width, self.size, inv=False)
+        image_resized = cv2.warpAffine(image, trans, size_resized)
+        if mask is not None:
+            records['mask'] = cv2.warpAffine(mask, trans, size_resized)
+        # Ground-truth joints are not carried through evaluation.
+        if 'joints' in records:
+            del records['joints']
+        records['image'] = image_resized
+        return records
+
+
+@register_keypointop
+class NormalizePermute(object):
+    """Normalize an HWC image channel-wise and transpose it to CHW.
+
+    Args:
+        mean (list): per-channel mean subtracted from the image
+        std (list): per-channel std the image is divided by
+        is_scale (bool): whether to divide the image by 255 first
+        records(dict): the dict containing the image
+
+    Returns:
+        records(dict): 'image' replaced by the float32 CHW normalized image
+
+    """
+
+    def __init__(self,
+                 mean=[123.675, 116.28, 103.53],
+                 std=[58.395, 57.120, 57.375],
+                 is_scale=True):
+        super(NormalizePermute, self).__init__()
+        self.mean = mean
+        self.std = std
+        self.is_scale = is_scale
+
+    def __call__(self, records):
+        # astype returns a new array, so the caller's image is not modified.
+        image = records['image'].astype(np.float32)
+        if self.is_scale:
+            image /= 255.
+        image = image.transpose((2, 0, 1))
+        mean = np.array(self.mean, dtype=np.float32)
+        invstd = 1. / np.array(self.std, dtype=np.float32)
+        # Each `channel` is a view into `image`, so the in-place operators
+        # update `image` itself. zip stops at the shorter of the channel
+        # count and the mean/std length.
+        for channel, m, s in zip(image, mean, invstd):
+            channel -= m
+            channel *= s
+        records['image'] = image
+        return records
+
+
+@register_keypointop
+class TagGenerate(object):
+    """Record gt coords for the AE loss to sample coord values in tagmaps.
+
+    Args:
+        num_joints (int): number of keypoints in the training dataset
+        max_people (int): maximum number of people supported when sampling
+            for the AE loss
+        records(dict): the dict containing the image, mask and coords
+
+    Returns:
+        records(dict): gains 'tagmap' with the gt coords; 'joints' is removed
+
+    """
+
+    def __init__(self, num_joints, max_people=30):
+        super(TagGenerate, self).__init__()
+        self.num_joints = num_joints
+        self.max_people = max_people
+
+    def __call__(self, records):
+        # Only the first entry of the joints list is used.
+        kpts = records['joints'][0]
+        tagmap = np.zeros(
+            (self.max_people, self.num_joints, 4), dtype=np.int64)
+        inds = np.where(kpts[..., 2] > 0)
+        person_ids, joint_ids = inds[0], inds[1]
+        visible = kpts[inds]
+        # tagmap is [p, j, 4]; the last dim is (joint index, y, x, valid flag)
+        tagmap[person_ids, joint_ids, 3] = 1
+        tagmap[person_ids, joint_ids, 2] = visible[..., 0]  # x
+        tagmap[person_ids, joint_ids, 1] = visible[..., 1]  # y
+        tagmap[person_ids, joint_ids, 0] = joint_ids
+        records['tagmap'] = tagmap
+        del records['joints']
+        return records
+
+
+@register_keypointop
+class ToHeatmaps(object):
+    """Generate the Gaussian keypoint heatmaps for the heatmap loss.
+
+    Args:
+        num_joints (int): number of keypoints in the training dataset
+        hmsize (list[2]): side lengths of the square output heatmaps, one per
+            output scale of HigherHRNet
+        sigma (float): std of the generated Gaussian kernel; defaults to
+            ``hmsize[0] // 64``
+        records(dict): the dict containing the image, mask and coords
+
+    Returns:
+        records(dict): gains 'heatmap_gt{k}x' and 'mask_{k}x' for each scale
+            k (1-based); the 'mask' entry is removed
+
+    """
+
+    def __init__(self, num_joints, hmsize, sigma=None):
+        super(ToHeatmaps, self).__init__()
+        self.num_joints = num_joints
+        self.hmsize = np.array(hmsize)
+        if sigma is None:
+            sigma = hmsize[0] // 64
+        self.sigma = sigma
+
+        # Precompute one Gaussian patch of side 6 * sigma + 3 centred at
+        # (3 * sigma + 1, 3 * sigma + 1); it is pasted at every keypoint.
+        r = 6 * sigma + 3
+        x = np.arange(0, r, 1, np.float32)
+        y = x[:, None]
+        x0, y0 = 3 * sigma + 1, 3 * sigma + 1
+        self.gaussian = np.exp(-((x - x0)**2 + (y - y0)**2) / (2 * sigma**2))
+
+    def __call__(self, records):
+        kpts_lst = records['joints']
+        mask_lst = records['mask']
+        for idx, hmsize in enumerate(self.hmsize):
+            mask = mask_lst[idx]
+            kpts = kpts_lst[idx]
+            heatmaps = np.zeros((self.num_joints, hmsize, hmsize))
+            inds = np.where(kpts[..., 2] > 0)
+            visible = kpts[inds].astype(np.int64)[..., :2]
+            # Patch corners in heatmap coordinates. They are used as slice
+            # bounds below, so force an integer dtype: with a float sigma
+            # they would otherwise be floats and slicing would fail.
+            ul = np.round(visible - 3 * self.sigma - 1).astype(np.int64)
+            br = np.round(visible + 3 * self.sigma + 2).astype(np.int64)
+            # Source (patch) and destination (heatmap) ranges after clipping
+            # the patch to the heatmap borders.
+            sul = np.maximum(0, -ul)
+            sbr = np.minimum(hmsize, br) - ul
+            dul = np.clip(ul, 0, hmsize - 1)
+            dbr = np.clip(br, 0, hmsize)
+            for i in range(len(visible)):
+                if visible[i][0] < 0 or visible[i][1] < 0 or visible[i][
+                        0] >= hmsize or visible[i][1] >= hmsize:
+                    continue
+                dx1, dy1 = dul[i]
+                dx2, dy2 = dbr[i]
+                sx1, sy1 = sul[i]
+                sx2, sy2 = sbr[i]
+                # Keep the element-wise maximum where patches overlap.
+                heatmaps[inds[1][i], dy1:dy2, dx1:dx2] = np.maximum(
+                    self.gaussian[sy1:sy2, sx1:sx2],
+                    heatmaps[inds[1][i], dy1:dy2, dx1:dx2])
+            records['heatmap_gt{}x'.format(idx + 1)] = heatmaps
+            records['mask_{}x'.format(idx + 1)] = mask
+        del records['mask']
+        return records
+
+
+@register_keypointop
+class RandomFlipHalfBodyTransform(object):
+    """Apply data augmentation to the image and coords,
+    producing the flip, scale, rotate and half-body transform parameters
+    for a training image.
+
+    Only the horizontal flip is applied to the pixels here; the sampled
+    center, scale and rotation are written back into ``records``.
+
+    Args:
+        trainsize (list):[w, h], Image target size
+        upper_body_ids (list): The upper body joint ids
+        flip_pairs (list): The left-right joints exchange order list
+        pixel_std (int): The pixel std of the scale
+        scale (float): The scale factor to transform the image
+        rot (int): The rotate factor to transform the image
+        num_joints_half_body (int): The joints threshold of the half body transform
+        prob_half_body (float): The threshold of the half body transform
+        flip (bool): Whether to flip the image
+        rot_prob (float): Probability of sampling a non-zero rotation
+
+    Returns:
+        records(dict): contains the image and coords after being transformed
+
+    """
+
+    def __init__(self,
+                 trainsize,
+                 upper_body_ids,
+                 flip_pairs,
+                 pixel_std,
+                 scale=0.35,
+                 rot=40,
+                 num_joints_half_body=8,
+                 prob_half_body=0.3,
+                 flip=True,
+                 rot_prob=0.6):
+        super(RandomFlipHalfBodyTransform, self).__init__()
+        self.trainsize = trainsize
+        self.upper_body_ids = upper_body_ids
+        self.flip_pairs = flip_pairs
+        self.pixel_std = pixel_std
+        self.scale = scale
+        self.rot = rot
+        self.num_joints_half_body = num_joints_half_body
+        self.prob_half_body = prob_half_body
+        self.flip = flip
+        # Width / height ratio of the training input.
+        self.aspect_ratio = trainsize[0] * 1.0 / trainsize[1]
+        self.rot_prob = rot_prob
+
+    def halfbody_transform(self, joints, joints_vis):
+        """Compute a center and scale enclosing only the upper- or lower-body joints.
+
+        Returns:
+            (center, scale), or (None, None) when fewer than two joints are
+            selected.
+        """
+        # Split the visible joints into upper- and lower-body groups.
+        upper_joints = []
+        lower_joints = []
+        for joint_id in range(joints.shape[0]):
+            if joints_vis[joint_id][0] > 0:
+                if joint_id in self.upper_body_ids:
+                    upper_joints.append(joints[joint_id])
+                else:
+                    lower_joints.append(joints[joint_id])
+        # NOTE(review): np.random.randn() samples a standard normal, so this
+        # test is true more often than half the time; if a fair coin was
+        # intended np.random.rand() would be needed - confirm before changing.
+        if np.random.randn() < 0.5 and len(upper_joints) > 2:
+            selected_joints = upper_joints
+        else:
+            selected_joints = lower_joints if len(
+                lower_joints) > 2 else upper_joints
+        if len(selected_joints) < 2:
+            return None, None
+        selected_joints = np.array(selected_joints, dtype=np.float32)
+        center = selected_joints.mean(axis=0)[:2]
+        # Axis-aligned bounding box of the selected joints.
+        left_top = np.amin(selected_joints, axis=0)
+        right_bottom = np.amax(selected_joints, axis=0)
+        w = right_bottom[0] - left_top[0]
+        h = right_bottom[1] - left_top[1]
+        # Grow the shorter side so the box matches the training aspect ratio.
+        if w > self.aspect_ratio * h:
+            h = w * 1.0 / self.aspect_ratio
+        elif w < self.aspect_ratio * h:
+            w = h * self.aspect_ratio
+        scale = np.array(
+            [w * 1.0 / self.pixel_std, h * 1.0 / self.pixel_std],
+            dtype=np.float32)
+        # Enlarge the box by a fixed factor of 1.5.
+        scale = scale * 1.5
+
+        return center, scale
+
+    def flip_joints(self, joints, joints_vis, width, matched_parts):
+        """Mirror joint x coordinates and swap each left/right pair.
+
+        ``joints`` and ``joints_vis`` are modified in place. The returned
+        joints are multiplied element-wise by ``joints_vis``.
+        """
+        joints[:, 0] = width - joints[:, 0] - 1
+        for pair in matched_parts:
+            # The .copy() keeps the old first row alive while it is being
+            # overwritten by the first assignment of the swap.
+            joints[pair[0], :], joints[pair[1], :] = \
+                joints[pair[1], :], joints[pair[0], :].copy()
+            joints_vis[pair[0], :], joints_vis[pair[1], :] = \
+                joints_vis[pair[1], :], joints_vis[pair[0], :].copy()
+
+        return joints * joints_vis, joints_vis
+
+    def __call__(self, records):
+        """Sample the augmentation parameters and write them into ``records``.
+
+        Reads 'image', 'joints', 'joints_vis', 'center' and 'scale'; writes
+        those keys back together with 'rotate'.
+        """
+        image = records['image']
+        joints = records['joints']
+        joints_vis = records['joints_vis']
+        c = records['center']
+        s = records['scale']
+        r = 0
+        # Half-body transform: only tried when enough joints are visible,
+        # and then only with probability prob_half_body.
+        if (np.sum(joints_vis[:, 0]) > self.num_joints_half_body and
+                np.random.rand() < self.prob_half_body):
+            c_half_body, s_half_body = self.halfbody_transform(joints,
+                                                               joints_vis)
+            if c_half_body is not None and s_half_body is not None:
+                c, s = c_half_body, s_half_body
+        sf = self.scale
+        rf = self.rot
+        # Scale factor: normal draw around 1, clipped to [1 - sf, 1 + sf].
+        s = s * np.clip(np.random.randn() * sf + 1, 1 - sf, 1 + sf)
+        # Rotation: normal draw clipped to [-2 * rf, 2 * rf], used with
+        # probability rot_prob; otherwise 0.
+        r = np.clip(np.random.randn() * rf, -rf * 2,
+                    rf * 2) if np.random.random() <= self.rot_prob else 0
+
+        if self.flip and np.random.random() <= 0.5:
+            # Horizontal flip of the image, the joints and the center x.
+            image = image[:, ::-1, :]
+            joints, joints_vis = self.flip_joints(
+                joints, joints_vis, image.shape[1], self.flip_pairs)
+            c[0] = image.shape[1] - c[0] - 1
+        records['image'] = image
+        records['joints'] = joints
+        records['joints_vis'] = joints_vis
+        records['center'] = c
+        records['scale'] = s
+        records['rotate'] = r
+
+        return records
+
+
+@register_keypointop
+class AugmentationbyInformantionDropping(object):
+    """AID: Augmentation by Informantion Dropping. Please refer
+        to https://arxiv.org/abs/2008.07139
+
+    Args:
+        trainsize (list): training input size; only ``trainsize[0]`` is read,
+            to scale the cutout offset and radius
+        prob_cutout (float): The probability of the Cutout augmentation.
+        offset_factor (float): Offset factor of cutout center.
+        num_patch (int): Number of patches to be cutout.
+        records(dict): the dict containing the image and coords
+
+    Returns:
+        records (dict): contains the image and coords after being transformed
+
+    """
+
+    def __init__(self,
+                 trainsize,
+                 prob_cutout=0.0,
+                 offset_factor=0.2,
+                 num_patch=1):
+        self.prob_cutout = prob_cutout
+        self.offset_factor = offset_factor
+        self.num_patch = num_patch
+        self.trainsize = trainsize
+
+    def _cutout(self, img, joints, joints_vis):
+        """Zero out ``num_patch`` discs, each centred near a random visible joint.
+
+        The image is returned unchanged when no joint is visible.
+        """
+        # The set of visible joints does not change between patches.
+        vis_idx, _ = np.where(joints_vis > 0)
+        if vis_idx.size == 0:
+            # np.random.choice raises ValueError on an empty array; with
+            # nothing to occlude there is nothing to do.
+            return img
+
+        height, width, _ = img.shape
+        img = img.reshape((height * width, -1))
+        # Flattened x / y coordinate of every pixel.
+        feat_x_int = np.arange(0, width)
+        feat_y_int = np.arange(0, height)
+        feat_x_int, feat_y_int = np.meshgrid(feat_x_int, feat_y_int)
+        feat_x_int = feat_x_int.reshape((-1, ))
+        feat_y_int = feat_y_int.reshape((-1, ))
+        for _ in range(self.num_patch):
+            occlusion_joint_id = np.random.choice(vis_idx)
+            center = joints[occlusion_joint_id, 0:2]
+            # Jitter the disc centre around the chosen joint.
+            offset = np.random.randn(2) * self.trainsize[
+                0] * self.offset_factor
+            center = center + offset
+            radius = np.random.uniform(0.1, 0.2) * self.trainsize[0]
+            x_offset = (center[0] - feat_x_int) / radius
+            y_offset = (center[1] - feat_y_int) / radius
+            dis = x_offset**2 + y_offset**2
+            # Pixels within one radius of the centre are blanked.
+            keep_pos = np.where((dis <= 1) & (dis >= 0))[0]
+            img[keep_pos, :] = 0
+        img = img.reshape((height, width, -1))
+        return img
+
+    def __call__(self, records):
+        img = records['image']
+        joints = records['joints']
+        joints_vis = records['joints_vis']
+        if np.random.rand() < self.prob_cutout:
+            img = self._cutout(img, joints, joints_vis)
+        records['image'] = img
+        return records
+
+
+@register_keypointop
+class TopDownAffine(object):
+    """Warp the image and its joints with one affine transform.
+
+    Args:
+        trainsize (list): [w, h], the standard size used to train
+        use_udp (bool): whether to use Unbiased Data Processing.
+        records(dict): the dict containing the image and coords
+
+    Returns:
+        records (dict): contains the image and coords after being transformed
+
+    """
+
+    def __init__(self, trainsize, use_udp=False):
+        self.use_udp = use_udp
+        self.trainsize = trainsize
+
+    def __call__(self, records):
+        image = records['image']
+        joints = records['joints']
+        joints_vis = records['joints_vis']
+        if "rotate" in records:
+            rot = records['rotate']
+        else:
+            rot = 0
+
+        # The two modes differ only in how the matrix is built and how the
+        # joints are mapped; the image warp is the same.
+        if self.use_udp:
+            trans = get_warp_matrix(
+                rot, records['center'] * 2.0,
+                [self.trainsize[0] - 1.0, self.trainsize[1] - 1.0],
+                records['scale'] * 200.0)
+        else:
+            trans = get_affine_transform(records['center'], records['scale'] *
+                                         200, rot, self.trainsize)
+        out_size = (int(self.trainsize[0]), int(self.trainsize[1]))
+        image = cv2.warpAffine(
+            image, trans, out_size, flags=cv2.INTER_LINEAR)
+
+        if self.use_udp:
+            # UDP maps every joint in one call.
+            joints[:, 0:2] = warp_affine_joints(joints[:, 0:2].copy(), trans)
+        else:
+            # Otherwise only joints flagged visible are mapped.
+            for joint_id in range(joints.shape[0]):
+                if joints_vis[joint_id, 0] > 0.0:
+                    joints[joint_id, 0:2] = affine_transform(
+                        joints[joint_id, 0:2], trans)
+
+        records['image'] = image
+        records['joints'] = joints
+
+        return records
+
+
+@register_keypointop
+class TopDownEvalAffine(object):
+    """Warp the whole image to the training size for evaluation.
+
+    The center and scale are taken from ``records['im_shape']`` reversed,
+    and no rotation is applied.
+
+    Args:
+        trainsize (list): [w, h], the standard size used to train
+        use_udp (bool): whether to use Unbiased Data Processing.
+        records(dict): the dict containing the image and coords
+
+    Returns:
+        records (dict): contains the image after being transformed
+
+    """
+
+    def __init__(self, trainsize, use_udp=False):
+        self.use_udp = use_udp
+        self.trainsize = trainsize
+
+    def __call__(self, records):
+        image = records['image']
+        rot = 0
+        imshape = records['im_shape'][::-1]
+        center = imshape / 2.
+        scale = imshape
+
+        # Only the construction of the matrix depends on the UDP flag.
+        if self.use_udp:
+            trans = get_warp_matrix(
+                rot, center * 2.0,
+                [self.trainsize[0] - 1.0, self.trainsize[1] - 1.0], scale)
+        else:
+            trans = get_affine_transform(center, scale, rot, self.trainsize)
+        out_size = (int(self.trainsize[0]), int(self.trainsize[1]))
+        records['image'] = cv2.warpAffine(
+            image, trans, out_size, flags=cv2.INTER_LINEAR)
+
+        return records
+
+
+@register_keypointop
+class ToHeatmapsTopDown(object):
+    """Generate the Gaussian keypoint heatmaps for the heatmap loss.
+
+    Args:
+        hmsize (list): [w, h] output heatmap's size
+        sigma (float): the std of the generated Gaussian kernel
+        records(dict): the dict containing the image and coords
+
+    Returns:
+        records (dict): gains 'target' (the heatmaps) and 'target_weight';
+            'joints' and 'joints_vis' are removed
+
+    """
+
+    def __init__(self, hmsize, sigma):
+        super(ToHeatmapsTopDown, self).__init__()
+        self.hmsize = np.array(hmsize)
+        self.sigma = sigma
+
+    def __call__(self, records):
+        """refer to
+            https://github.com/leoxiaobin/deep-high-resolution-net.pytorch
+            Copyright (c) Microsoft, under the MIT License.
+        """
+        joints = records['joints']
+        joints_vis = records['joints_vis']
+        num_joints = joints.shape[0]
+        image_size = np.array(
+            [records['image'].shape[1], records['image'].shape[0]])
+        target_weight = np.ones((num_joints, 1), dtype=np.float32)
+        target_weight[:, 0] = joints_vis[:, 0]
+        target = np.zeros(
+            (num_joints, self.hmsize[1], self.hmsize[0]), dtype=np.float32)
+        tmp_size = self.sigma * 3
+        feat_stride = image_size / self.hmsize
+
+        # The Gaussian patch depends only on sigma, so build it once instead
+        # of once per joint.
+        size = 2 * tmp_size + 1
+        x = np.arange(0, size, 1, np.float32)
+        y = x[:, np.newaxis]
+        x0 = y0 = size // 2
+        # The gaussian is not normalized, we want the center value to equal 1
+        g = np.exp(-((x - x0)**2 + (y - y0)**2) / (2 * self.sigma**2))
+
+        for joint_id in range(num_joints):
+            mu_x = int(joints[joint_id][0] + 0.5) / feat_stride[0]
+            mu_y = int(joints[joint_id][1] + 0.5) / feat_stride[1]
+            # Check that any part of the gaussian is in-bounds
+            ul = [int(mu_x - tmp_size), int(mu_y - tmp_size)]
+            br = [int(mu_x + tmp_size + 1), int(mu_y + tmp_size + 1)]
+            if ul[0] >= self.hmsize[0] or ul[1] >= self.hmsize[1] or br[
+                    0] < 0 or br[1] < 0:
+                # If not, give the joint zero weight and leave its heatmap
+                # all zeros
+                target_weight[joint_id] = 0
+                continue
+
+            # Usable gaussian range
+            g_x = max(0, -ul[0]), min(br[0], self.hmsize[0]) - ul[0]
+            g_y = max(0, -ul[1]), min(br[1], self.hmsize[1]) - ul[1]
+            # Image range
+            img_x = max(0, ul[0]), min(br[0], self.hmsize[0])
+            img_y = max(0, ul[1]), min(br[1], self.hmsize[1])
+
+            v = target_weight[joint_id]
+            if v > 0.5:
+                target[joint_id][img_y[0]:img_y[1], img_x[0]:img_x[1]] = g[g_y[
+                    0]:g_y[1], g_x[0]:g_x[1]]
+        records['target'] = target
+        records['target_weight'] = target_weight
+        del records['joints'], records['joints_vis']
+
+        return records
+
+
+@register_keypointop
+class ToHeatmapsTopDown_DARK(object):
+    """Generate the Gaussian keypoint heatmaps for the heatmap loss.
+
+    Each in-bounds visible joint gets a Gaussian evaluated over the whole
+    heatmap at the joint's fractional heatmap position.
+
+    Args:
+        hmsize (list): [w, h] output heatmap's size
+        sigma (float): the std of the generated Gaussian kernel
+        records(dict): the dict containing the image and coords
+
+    Returns:
+        records (dict): gains 'target' (the heatmaps) and 'target_weight';
+            'joints' and 'joints_vis' are removed
+
+    """
+
+    def __init__(self, hmsize, sigma):
+        super(ToHeatmapsTopDown_DARK, self).__init__()
+        self.hmsize = np.array(hmsize)
+        self.sigma = sigma
+
+    def __call__(self, records):
+        joints = records['joints']
+        joints_vis = records['joints_vis']
+        num_joints = joints.shape[0]
+        image_size = np.array(
+            [records['image'].shape[1], records['image'].shape[0]])
+        target_weight = np.ones((num_joints, 1), dtype=np.float32)
+        target_weight[:, 0] = joints_vis[:, 0]
+        target = np.zeros(
+            (num_joints, self.hmsize[1], self.hmsize[0]), dtype=np.float32)
+        tmp_size = self.sigma * 3
+        feat_stride = image_size / self.hmsize
+
+        # Heatmap coordinate grids are the same for every joint, so build
+        # them once outside the loop.
+        x = np.arange(0, self.hmsize[0], 1, np.float32)
+        y = np.arange(0, self.hmsize[1], 1, np.float32)
+        y = y[:, np.newaxis]
+
+        for joint_id in range(num_joints):
+            mu_x = joints[joint_id][0] / feat_stride[0]
+            mu_y = joints[joint_id][1] / feat_stride[1]
+            # Check that any part of the gaussian is in-bounds
+            ul = [int(mu_x - tmp_size), int(mu_y - tmp_size)]
+            br = [int(mu_x + tmp_size + 1), int(mu_y + tmp_size + 1)]
+            if ul[0] >= self.hmsize[0] or ul[1] >= self.hmsize[1] or br[
+                    0] < 0 or br[1] < 0:
+                # If not, give the joint zero weight and leave its heatmap
+                # all zeros
+                target_weight[joint_id] = 0
+                continue
+
+            v = target_weight[joint_id]
+            if v > 0.5:
+                target[joint_id] = np.exp(-(
+                    (x - mu_x)**2 + (y - mu_y)**2) / (2 * self.sigma**2))
+        records['target'] = target
+        records['target_weight'] = target_weight
+        del records['joints'], records['joints_vis']
+
+        return records
+
+
+@register_keypointop
+class ToHeatmapsTopDown_UDP(object):
+    """This code is based on:
+        https://github.com/HuangJunJie2017/UDP-Pose/blob/master/deep-high-resolution-net.pytorch/lib/dataset/JointsDataset.py
+
+        Generates the Gaussian keypoint heatmaps for the heatmap loss.
+        ref: Huang et al. The Devil is in the Details: Delving into Unbiased Data Processing
+        for Human Pose Estimation (CVPR 2020).
+
+    Args:
+        hmsize (list): [w, h] output heatmap's size
+        sigma (float): the std of the generated Gaussian kernel
+        records(dict): the dict containing the image and coords
+
+    Returns:
+        records (dict): gains 'target' (the heatmaps) and 'target_weight';
+            'joints' and 'joints_vis' are removed
+    """
+
+    def __init__(self, hmsize, sigma):
+        super(ToHeatmapsTopDown_UDP, self).__init__()
+        self.hmsize = np.array(hmsize)
+        self.sigma = sigma
+
+    def __call__(self, records):
+        joints = records['joints']
+        joints_vis = records['joints_vis']
+        num_joints = joints.shape[0]
+        # (width, height) of the input image.
+        image_size = np.array(
+            [records['image'].shape[1], records['image'].shape[0]])
+        # Per-joint weight, initialised from the first visibility column.
+        target_weight = np.ones((num_joints, 1), dtype=np.float32)
+        target_weight[:, 0] = joints_vis[:, 0]
+        target = np.zeros(
+            (num_joints, self.hmsize[1], self.hmsize[0]), dtype=np.float32)
+        tmp_size = self.sigma * 3
+        # Coordinate grids of the Gaussian patch, shared by all joints.
+        size = 2 * tmp_size + 1
+        x = np.arange(0, size, 1, np.float32)
+        y = x[:, None]
+        # Image-to-heatmap stride computed from (size - 1) on both sides.
+        feat_stride = (image_size - 1.0) / (self.hmsize - 1.0)
+        for joint_id in range(num_joints):
+            # Joint position on the heatmap, rounded to the nearest cell.
+            mu_x = int(joints[joint_id][0] / feat_stride[0] + 0.5)
+            mu_y = int(joints[joint_id][1] / feat_stride[1] + 0.5)
+            # Check that any part of the gaussian is in-bounds
+            ul = [int(mu_x - tmp_size), int(mu_y - tmp_size)]
+            br = [int(mu_x + tmp_size + 1), int(mu_y + tmp_size + 1)]
+            if ul[0] >= self.hmsize[0] or ul[1] >= self.hmsize[1] or br[
+                    0] < 0 or br[1] < 0:
+                # If not, give the joint zero weight and leave its heatmap
+                # all zeros
+                target_weight[joint_id] = 0
+                continue
+
+            # Un-rounded joint position; its offset from the rounded cell
+            # shifts the Gaussian centre inside the patch.
+            mu_x_ac = joints[joint_id][0] / feat_stride[0]
+            mu_y_ac = joints[joint_id][1] / feat_stride[1]
+            x0 = y0 = size // 2
+            x0 += mu_x_ac - mu_x
+            y0 += mu_y_ac - mu_y
+            g = np.exp(-((x - x0)**2 + (y - y0)**2) / (2 * self.sigma**2))
+            # Usable gaussian range
+            g_x = max(0, -ul[0]), min(br[0], self.hmsize[0]) - ul[0]
+            g_y = max(0, -ul[1]), min(br[1], self.hmsize[1]) - ul[1]
+            # Image range
+            img_x = max(0, ul[0]), min(br[0], self.hmsize[0])
+            img_y = max(0, ul[1]), min(br[1], self.hmsize[1])
+
+            # Only joints with weight above 0.5 get a heatmap written.
+            v = target_weight[joint_id]
+            if v > 0.5:
+                target[joint_id][img_y[0]:img_y[1], img_x[0]:img_x[1]] = g[g_y[
+                    0]:g_y[1], g_x[0]:g_x[1]]
+        records['target'] = target
+        records['target_weight'] = target_weight
+        del records['joints'], records['joints_vis']
+
+        return records

+ 628 - 0
paddlers/models/ppdet/data/transform/mot_operators.py

@@ -0,0 +1,628 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+try:
+    from collections.abc import Sequence
+except Exception:
+    from collections import Sequence
+from numbers import Integral
+
+import cv2
+import copy
+import numpy as np
+import random
+import math
+
+from .operators import BaseOperator, register_op
+from .batch_operators import Gt2TTFTarget
+from paddlers.models.ppdet.modeling.bbox_utils import bbox_iou_np_expand
+from paddlers.models.ppdet.utils.logger import setup_logger
+from .op_helper import gaussian_radius
+logger = setup_logger(__name__)
+
+__all__ = [
+    'RGBReverse', 'LetterBoxResize', 'MOTRandomAffine', 'Gt2JDETargetThres',
+    'Gt2JDETargetMax', 'Gt2FairMOTTarget'
+]
+
+
+@register_op
+class RGBReverse(BaseOperator):
+    """Swap the channel order of the sample image (RGB <-> BGR).
+
+    The upstream note on this operator says it is sensitive to
+    MOTRandomAffine.
+    """
+
+    def __init__(self):
+        super(RGBReverse, self).__init__()
+
+    def apply(self, sample, context=None):
+        """Reverse the last axis of ``sample['image']``.
+
+        Args:
+            sample (dict): record holding a 3-D ``'image'`` array.
+            context: unused.
+
+        Returns:
+            dict: the same ``sample`` with a contiguous, channel-reversed
+                image stored under ``'image'``.
+        """
+        reversed_channels = sample['image'][:, :, ::-1]
+        # The reversed slice is a negative-stride view; store a contiguous copy.
+        sample['image'] = np.ascontiguousarray(reversed_channels)
+        return sample
+
+
+@register_op
+class LetterBoxResize(BaseOperator):
+    def __init__(self, target_size):
+        """
+        Letterbox the image to ``target_size`` and convert normalized xywh
+        boxes to pixel xyxy boxes
+        ([x_center, y_center, width, height] -> [x0, y0, x1, y1]).
+
+        Args:
+            target_size (int|list|tuple): image target size. An integer is
+                expanded to ``[target_size, target_size]``; a sequence is
+                unpacked as (height, width) in ``apply``.
+
+        Raises:
+            TypeError: if ``target_size`` is neither an integer nor a sequence.
+        """
+        super(LetterBoxResize, self).__init__()
+        size_is_int = isinstance(target_size, Integral)
+        if not (size_is_int or isinstance(target_size, Sequence)):
+            raise TypeError(
+                "Type of target_size is invalid. Must be Integer or List or Tuple, now is {}".
+                format(type(target_size)))
+        if size_is_int:
+            self.target_size = [target_size, target_size]
+        else:
+            self.target_size = target_size
+
+    def apply_image(self, img, height, width, color=(127.5, 127.5, 127.5)):
+        """Resize ``img`` keeping its aspect ratio, then pad with ``color``.
+
+        Args:
+            img (np.ndarray): image whose first two axes are (height, width).
+            height (int): target height.
+            width (int): target width.
+            color (tuple): constant border value used for the padding.
+
+        Returns:
+            tuple: (padded image, resize ratio, padw, padh) where padw/padh
+                are the un-rounded half paddings along x and y.
+        """
+        src_h = img.shape[0]
+        src_w = img.shape[1]
+        ratio = min(float(height) / src_h, float(width) / src_w)
+        # cv2.resize expects the size as (width, height)
+        resized_wh = (round(src_w * ratio), round(src_h * ratio))
+        padw = (width - resized_wh[0]) / 2
+        padh = (height - resized_wh[1]) / 2
+        # The -/+ 0.1 nudge sends an odd leftover pixel to the bottom/right.
+        top = round(padh - 0.1)
+        bottom = round(padh + 0.1)
+        left = round(padw - 0.1)
+        right = round(padw + 0.1)
+
+        resized = cv2.resize(img, resized_wh, interpolation=cv2.INTER_AREA)
+        padded = cv2.copyMakeBorder(
+            resized,
+            top,
+            bottom,
+            left,
+            right,
+            cv2.BORDER_CONSTANT,
+            value=color)
+        return padded, ratio, padw, padh
+
+    def apply_bbox(self, bbox0, h, w, ratio, padw, padh):
+        """Map [x_center, y_center, width, height] rows to padded-image xyxy.
+
+        Each coordinate is scaled by ``ratio * w`` (x) or ``ratio * h`` (y)
+        and shifted by the matching padding. ``bbox0`` is not modified.
+        """
+        scale_x = ratio * w
+        scale_y = ratio * h
+        half_w = bbox0[:, 2] / 2
+        half_h = bbox0[:, 3] / 2
+
+        converted = bbox0.copy()
+        converted[:, 0] = scale_x * (bbox0[:, 0] - half_w) + padw
+        converted[:, 1] = scale_y * (bbox0[:, 1] - half_h) + padh
+        converted[:, 2] = scale_x * (bbox0[:, 0] + half_w) + padw
+        converted[:, 3] = scale_y * (bbox0[:, 1] + half_h) + padh
+        return converted
+
+    def apply(self, sample, context=None):
+        """Letterbox ``sample['image']`` and update the dependent fields.
+
+        Writes 'image', 'im_shape' (rounded h * ratio, w * ratio) and
+        'scale_factor' ([ratio, ratio]); converts 'gt_bbox' when it is
+        present and non-empty.
+
+        Raises:
+            TypeError: if the image is not a numpy array.
+            PIL.UnidentifiedImageError: if the image is not 3-dimensional.
+        """
+        im = sample['image']
+        h, w = sample['im_shape']
+        if not isinstance(im, np.ndarray):
+            raise TypeError("{}: image type is not numpy.".format(self))
+        if im.ndim != 3:
+            from PIL import UnidentifiedImageError
+            raise UnidentifiedImageError(
+                '{}: image is not 3-dimensional.'.format(self))
+
+        # resize + pad the pixels
+        target_h, target_w = self.target_size
+        img, ratio, padw, padh = self.apply_image(
+            im, height=target_h, width=target_w)
+
+        sample['image'] = img
+        scaled_hw = (round(h * ratio), round(w * ratio))
+        sample['im_shape'] = np.asarray(scaled_hw, dtype=np.float32)
+        sample['scale_factor'] = np.asarray([ratio, ratio], dtype=np.float32)
+
+        # move the boxes into the padded image frame
+        has_boxes = 'gt_bbox' in sample and len(sample['gt_bbox']) > 0
+        if has_boxes:
+            sample['gt_bbox'] = self.apply_bbox(sample['gt_bbox'], h, w, ratio,
+                                                padw, padh)
+        return sample
+
+
+@register_op
+class MOTRandomAffine(BaseOperator):
+    """
+    Affine transform to image and coords to achieve the rotate, scale and
+    shift effect for training image.
+
+    Args:
+        degrees (list[2]): the rotate range in degrees, sampled uniformly
+            from [min, max]
+        translate (list[2]): maximum translation fractions; the x shift is
+            drawn from +/- translate[0] and the y shift from +/- translate[1]
+        scale (list[2]): the scale range to apply, sampled uniformly from
+            [min, max]
+        shear (list[2]): the shear range in degrees, sampled uniformly from
+            [min, max] independently for x and y
+        borderValue (list[3]): value used in case of a constant border when
+            applying the perspective transformation
+        reject_outside (bool): clip warped bounding boxes to the image bounds
+            before the size/area filter is applied
+
+    Returns:
+        records(dict): contain the image and coords after being transformed
+
+    """
+
+    def __init__(self,
+                 degrees=(-5, 5),
+                 translate=(0.10, 0.10),
+                 scale=(0.50, 1.20),
+                 shear=(-2, 2),
+                 borderValue=(127.5, 127.5, 127.5),
+                 reject_outside=True):
+        super(MOTRandomAffine, self).__init__()
+        self.degrees = degrees
+        self.translate = translate
+        self.scale = scale
+        self.shear = shear
+        self.borderValue = borderValue
+        self.reject_outside = reject_outside
+
+    def apply(self, sample, context=None):
+        """Warp ``sample['image']`` with a random affine matrix and remap boxes.
+
+        Args:
+            sample (dict): must hold 'image' (axis 0 is height, axis 1 is
+                width). When 'gt_bbox' is present and non-empty its rows are
+                read as [x1, y1, x2, y2] and 'gt_class' must be present too;
+                'difficult', 'gt_ide' and 'is_crowd' are filtered if present.
+            context: unused.
+
+        Returns:
+            dict: always the same ``sample`` object.
+                * no boxes: only 'image' is replaced by the warped image
+                  (this path used to return None);
+                * at least one box survives the filter: 'image' is warped and
+                  the per-box fields keep only the surviving rows;
+                * every box is rejected: the sample is returned untouched
+                  (original image and boxes).
+        """
+        # https://medium.com/uruvideo/dataset-augmentation-with-random-homographies-a8f4b44830d4
+        border = 0  # width of added border (optional)
+
+        img = sample['image']
+        height, width = img.shape[0], img.shape[1]
+
+        # Rotation and Scale
+        R = np.eye(3)
+        a = random.random() * (self.degrees[1] - self.degrees[0]
+                               ) + self.degrees[0]
+        s = random.random() * (self.scale[1] - self.scale[0]) + self.scale[0]
+        R[:2] = cv2.getRotationMatrix2D(
+            angle=a, center=(width / 2, height / 2), scale=s)
+
+        # Translation
+        # NOTE(review): the x shift is scaled by height and the y shift by
+        # width. Kept as-is so existing augmentation statistics do not change;
+        # confirm whether this is intended.
+        T = np.eye(3)
+        T[0, 2] = (
+            random.random() * 2 - 1
+        ) * self.translate[0] * height + border  # x translation (pixels)
+        T[1, 2] = (
+            random.random() * 2 - 1
+        ) * self.translate[1] * width + border  # y translation (pixels)
+
+        # Shear
+        S = np.eye(3)
+        S[0, 1] = math.tan((random.random() *
+                            (self.shear[1] - self.shear[0]) + self.shear[0]) *
+                           math.pi / 180)  # x shear (deg)
+        S[1, 0] = math.tan((random.random() *
+                            (self.shear[1] - self.shear[0]) + self.shear[0]) *
+                           math.pi / 180)  # y shear (deg)
+
+        M = S @ T @ R  # Combined rotation matrix. ORDER IS IMPORTANT HERE!!
+        imw = cv2.warpPerspective(
+            img,
+            M,
+            dsize=(width, height),
+            flags=cv2.INTER_LINEAR,
+            borderValue=self.borderValue)  # BGR order borderValue
+
+        has_boxes = 'gt_bbox' in sample and len(sample['gt_bbox']) > 0
+        if not has_boxes:
+            # Nothing to filter, so the warp can be applied unconditionally.
+            # Without this return the method fell off the end and handed
+            # None to the caller.
+            sample['image'] = imw
+            return sample
+
+        targets = sample['gt_bbox']
+        n = targets.shape[0]
+        points = targets.copy()
+        area0 = (points[:, 2] - points[:, 0]) * (points[:, 3] - points[:, 1])
+
+        # warp the four corners of every box
+        xy = np.ones((n * 4, 3))
+        xy[:, :2] = points[:, [0, 1, 2, 3, 0, 3, 2, 1]].reshape(
+            n * 4, 2)  # x1y1, x2y2, x1y2, x2y1
+        xy = (xy @ M.T)[:, :2].reshape(n, 8)
+
+        # axis-aligned hull of the warped corners
+        x = xy[:, [0, 2, 4, 6]]
+        y = xy[:, [1, 3, 5, 7]]
+        xy = np.concatenate(
+            (x.min(1), y.min(1), x.max(1), y.max(1))).reshape(4, n).T
+
+        # apply angle-based reduction: shrink the hull around its centre
+        radians = a * math.pi / 180
+        reduction = max(abs(math.sin(radians)), abs(math.cos(radians)))**0.5
+        x = (xy[:, 2] + xy[:, 0]) / 2
+        y = (xy[:, 3] + xy[:, 1]) / 2
+        w = (xy[:, 2] - xy[:, 0]) * reduction
+        h = (xy[:, 3] - xy[:, 1]) * reduction
+        xy = np.concatenate(
+            (x - w / 2, y - h / 2, x + w / 2, y + h / 2)).reshape(4, n).T
+
+        # clip warped boxes to the image
+        if self.reject_outside:
+            np.clip(xy[:, 0], 0, width, out=xy[:, 0])
+            np.clip(xy[:, 2], 0, width, out=xy[:, 2])
+            np.clip(xy[:, 1], 0, height, out=xy[:, 1])
+            np.clip(xy[:, 3], 0, height, out=xy[:, 3])
+        w = xy[:, 2] - xy[:, 0]
+        h = xy[:, 3] - xy[:, 1]
+        area = w * h
+        ar = np.maximum(w / (h + 1e-16), h / (w + 1e-16))
+        # keep boxes that are big enough, kept >10% of their original area
+        # and are not extremely elongated
+        keep = (w > 4) & (h > 4) & (area / (area0 + 1e-16) > 0.1) & (ar < 10)
+
+        if not keep.any():
+            # Every box was rejected: return the untouched sample so the
+            # image and its original boxes stay consistent.
+            return sample
+
+        sample['gt_bbox'] = xy[keep].astype(sample['gt_bbox'].dtype)
+        sample['gt_class'] = sample['gt_class'][keep]
+        for key in ('difficult', 'gt_ide', 'is_crowd'):
+            if key in sample:
+                sample[key] = sample[key][keep]
+        sample['image'] = imw
+        return sample
+
+
+@register_op
+class Gt2JDETargetThres(BaseOperator):
+    # NOTE: the string below follows the `__shared__` assignment, so Python
+    # does not treat it as the class docstring (the class `__doc__` stays
+    # unset by it); it is kept as descriptive text only.
+    __shared__ = ['num_classes']
+    """
+    Generate JDE targets by groud truth data when training
+    Args:
+        anchors (list): anchors of JDE model
+        anchor_masks (list): anchor_masks of JDE model
+        downsample_ratios (list): downsample ratios of JDE model
+        ide_thresh (float): thresh of identity, higher is groud truth
+        fg_thresh (float): thresh of foreground, higher is foreground
+        bg_thresh (float): thresh of background, lower is background
+        num_classes (int): number of classes
+    """
+
+    def __init__(self,
+                 anchors,
+                 anchor_masks,
+                 downsample_ratios,
+                 ide_thresh=0.5,
+                 fg_thresh=0.5,
+                 bg_thresh=0.4,
+                 num_classes=1):
+        super(Gt2JDETargetThres, self).__init__()
+        self.anchors = anchors
+        self.anchor_masks = anchor_masks
+        self.downsample_ratios = downsample_ratios
+        self.ide_thresh = ide_thresh
+        self.fg_thresh = fg_thresh
+        self.bg_thresh = bg_thresh
+        self.num_classes = num_classes
+
+    def generate_anchor(self, nGh, nGw, anchor_hw):
+        """Build a dense anchor tensor for an ``nGh`` x ``nGw`` grid.
+
+        Returns an array of shape [nA, 4, nGh, nGw] (nA = len(anchor_hw)):
+        channels 0-1 hold the grid column and row index of each cell,
+        channels 2-3 hold the two values of each ``anchor_hw`` row broadcast
+        over the grid.
+        """
+        nA = len(anchor_hw)
+        # default 'xy' indexing: both outputs have shape [nGw, nGh]
+        yy, xx = np.meshgrid(np.arange(nGh), np.arange(nGw))
+
+        # after the transpose xx.T[r, c] == c and yy.T[r, c] == r
+        mesh = np.stack([xx.T, yy.T], axis=0)  # [2, nGh, nGw]
+        mesh = np.repeat(mesh[None, :], nA, axis=0)  # [nA, 2, nGh, nGw]
+
+        # [nA, 2] -> [nA, 2, 1, 1] -> tiled to [nA, 2, nGh, nGw]
+        anchor_offset_mesh = anchor_hw[:, :, None][:, :, :, None]
+        anchor_offset_mesh = np.repeat(anchor_offset_mesh, nGh, axis=-2)
+        anchor_offset_mesh = np.repeat(anchor_offset_mesh, nGw, axis=-1)
+
+        anchor_mesh = np.concatenate(
+            [mesh, anchor_offset_mesh], axis=1)  # [nA, 4, nGh, nGw]
+        return anchor_mesh
+
+    def encode_delta(self, gt_box_list, fg_anchor_list):
+        """Encode ground-truth boxes relative to their matched anchors.
+
+        Both inputs are [N, 4] arrays read column-wise as (x, y, w, h).
+        Returns an [N, 4] array of (dx, dy, dw, dh): centre offsets divided
+        by the anchor size and the log of the size ratios.
+        """
+        px, py, pw, ph = fg_anchor_list[:, 0], fg_anchor_list[:,1], \
+                        fg_anchor_list[:, 2], fg_anchor_list[:,3]
+        gx, gy, gw, gh = gt_box_list[:, 0], gt_box_list[:, 1], \
+                        gt_box_list[:, 2], gt_box_list[:, 3]
+        dx = (gx - px) / pw
+        dy = (gy - py) / ph
+        dw = np.log(gw / pw)
+        dh = np.log(gh / ph)
+        return np.stack([dx, dy, dw, dh], axis=1)
+
+    def pad_box(self, sample, num_max):
+        """Zero-pad the per-box fields of ``sample`` to ``num_max`` rows.
+
+        'gt_bbox' becomes a float32 [num_max, 4] array. 'gt_score',
+        'difficult', 'is_crowd' and 'gt_ide', when present, become 1-D arrays
+        of length ``num_max`` filled from column 0 of the original field.
+        The sample is modified in place and returned.
+        """
+        assert 'gt_bbox' in sample
+        bbox = sample['gt_bbox']
+        gt_num = len(bbox)
+        pad_bbox = np.zeros((num_max, 4), dtype=np.float32)
+        if gt_num > 0:
+            pad_bbox[:gt_num, :] = bbox[:gt_num, :]
+        sample['gt_bbox'] = pad_bbox
+        if 'gt_score' in sample:
+            pad_score = np.zeros((num_max, ), dtype=np.float32)
+            if gt_num > 0:
+                pad_score[:gt_num] = sample['gt_score'][:gt_num, 0]
+            sample['gt_score'] = pad_score
+        if 'difficult' in sample:
+            pad_diff = np.zeros((num_max, ), dtype=np.int32)
+            if gt_num > 0:
+                pad_diff[:gt_num] = sample['difficult'][:gt_num, 0]
+            sample['difficult'] = pad_diff
+        if 'is_crowd' in sample:
+            pad_crowd = np.zeros((num_max, ), dtype=np.int32)
+            if gt_num > 0:
+                pad_crowd[:gt_num] = sample['is_crowd'][:gt_num, 0]
+            sample['is_crowd'] = pad_crowd
+        if 'gt_ide' in sample:
+            pad_ide = np.zeros((num_max, ), dtype=np.int32)
+            if gt_num > 0:
+                pad_ide[:gt_num] = sample['gt_ide'][:gt_num, 0]
+            sample['gt_ide'] = pad_ide
+        return sample
+
+    def __call__(self, samples, context=None):
+        """Attach per-scale targets to every sample of a batch.
+
+        For each (anchors, downsample_ratio) pair with index ``i`` the keys
+        'tbox{i}' ([nA, nGh, nGw, 4] box deltas), 'tconf{i}' ([nA, nGh, nGw],
+        1 foreground / 0 background / -1 in between) and 'tide{i}'
+        ([nA, nGh, nGw, 1], -1 where no identity is assigned) are written.
+        Afterwards 'gt_class' is popped (KeyError if missing) and the per-box
+        fields are padded to the largest box count in the batch.
+
+        Returns:
+            list: the same ``samples`` list, modified in place.
+        """
+        assert len(self.anchor_masks) == len(self.downsample_ratios), \
+            "anchor_masks', and 'downsample_ratios' should have same length."
+        # assumes images are laid out C x H x W and that every sample shares
+        # the first sample's size -- TODO confirm
+        h, w = samples[0]['image'].shape[1:3]
+
+        # largest number of ground-truth boxes in the batch (padding target)
+        num_max = 0
+        for sample in samples:
+            num_max = max(num_max, len(sample['gt_bbox']))
+
+        for sample in samples:
+            gt_bbox = sample['gt_bbox']
+            gt_ide = sample['gt_ide']
+            for i, (anchor_hw, downsample_ratio
+                    ) in enumerate(zip(self.anchors, self.downsample_ratios)):
+                # express the anchors in grid-cell units for this scale
+                anchor_hw = np.array(
+                    anchor_hw, dtype=np.float32) / downsample_ratio
+                nA = len(anchor_hw)
+                nGh, nGw = int(h / downsample_ratio), int(w / downsample_ratio)
+                tbox = np.zeros((nA, nGh, nGw, 4), dtype=np.float32)
+                tconf = np.zeros((nA, nGh, nGw), dtype=np.float32)
+                tid = -np.ones((nA, nGh, nGw, 1), dtype=np.float32)
+
+                # presumably gt_bbox rows are normalized [cx, cy, w, h];
+                # scale them to grid units -- verify against the reader
+                gxy, gwh = gt_bbox[:, 0:2].copy(), gt_bbox[:, 2:4].copy()
+                gxy[:, 0] = gxy[:, 0] * nGw
+                gxy[:, 1] = gxy[:, 1] * nGh
+                gwh[:, 0] = gwh[:, 0] * nGw
+                gwh[:, 1] = gwh[:, 1] * nGh
+                # keep centres inside the grid
+                gxy[:, 0] = np.clip(gxy[:, 0], 0, nGw - 1)
+                gxy[:, 1] = np.clip(gxy[:, 1], 0, nGh - 1)
+                tboxes = np.concatenate([gxy, gwh], axis=1)
+
+                anchor_mesh = self.generate_anchor(nGh, nGw, anchor_hw)
+
+                # flatten to one row per (anchor, row, col)
+                anchor_list = np.transpose(anchor_mesh,
+                                           (0, 2, 3, 1)).reshape(-1, 4)
+                # presumably a pairwise IoU matrix [num_anchors, num_gt]
+                # (helper is defined outside this file) -- TODO confirm
+                iou_pdist = bbox_iou_np_expand(
+                    anchor_list, tboxes, x1y1x2y2=False)
+
+                # best-matching gt (and its IoU) for every anchor position
+                # NOTE(review): if a sample has zero boxes the reduction over
+                # axis 1 would be over an empty axis -- confirm callers never
+                # pass empty samples
+                iou_max = np.max(iou_pdist, axis=1)
+                max_gt_index = np.argmax(iou_pdist, axis=1)
+
+                iou_map = iou_max.reshape(nA, nGh, nGw)
+                gt_index_map = max_gt_index.reshape(nA, nGh, nGw)
+
+                id_index = iou_map > self.ide_thresh
+                fg_index = iou_map > self.fg_thresh
+                bg_index = iou_map < self.bg_thresh
+                # strictly between bg_thresh and fg_thresh
+                ign_index = (iou_map < self.fg_thresh) * (
+                    iou_map > self.bg_thresh)
+                tconf[fg_index] = 1
+                tconf[bg_index] = 0
+                tconf[ign_index] = -1
+
+                gt_index = gt_index_map[fg_index]
+                gt_box_list = tboxes[gt_index]
+                gt_id_list = gt_ide[gt_index_map[id_index]]
+
+                if np.sum(fg_index) > 0:
+                    tid[id_index] = gt_id_list
+
+                    fg_anchor_list = anchor_list.reshape(nA, nGh, nGw,
+                                                         4)[fg_index]
+                    delta_target = self.encode_delta(gt_box_list,
+                                                     fg_anchor_list)
+                    tbox[fg_index] = delta_target
+
+                sample['tbox{}'.format(i)] = tbox
+                sample['tconf{}'.format(i)] = tconf
+                sample['tide{}'.format(i)] = tid
+            sample.pop('gt_class')
+            # pad_box mutates the dict in place; the rebinding is incidental
+            sample = self.pad_box(sample, num_max)
+        return samples
+
+
+@register_op
+class Gt2JDETargetMax(BaseOperator):
+    """
+    Generate JDE targets by ground truth data when evaluating
+    Args:
+        anchors (list): anchors of JDE model
+        anchor_masks (list): anchor_masks of JDE model
+        downsample_ratios (list): downsample ratios of JDE model
+        max_iou_thresh (float): iou thresh for high quality anchor
+        num_classes (int): number of classes
+    """
+    __shared__ = ['num_classes']
+
+    def __init__(self,
+                 anchors,
+                 anchor_masks,
+                 downsample_ratios,
+                 max_iou_thresh=0.60,
+                 num_classes=1):
+        super(Gt2JDETargetMax, self).__init__()
+        self.anchors = anchors
+        self.anchor_masks = anchor_masks
+        self.downsample_ratios = downsample_ratios
+        self.max_iou_thresh = max_iou_thresh
+        self.num_classes = num_classes
+
+    def __call__(self, samples, context=None):
+        """Attach per-scale targets to every sample of a batch.
+
+        For each (anchors, downsample_ratio) pair with index ``i`` the keys
+        'tbox{i}' ([nA, nGh, nGw, 4]), 'tconf{i}' ([nA, nGh, nGw]) and
+        'tide{i}' ([nA, nGh, nGw, 1], -1 where unassigned) are written. Each
+        ground-truth box is matched to its best anchor by width/height IoU
+        and kept only if that IoU exceeds ``max_iou_thresh``.
+
+        Args:
+            samples (list[dict]): batch of records holding 'image',
+                'gt_bbox' and 'gt_ide'.
+            context: unused.
+
+        Returns:
+            list[dict]: the same ``samples`` list, modified in place. The
+                method used to return None, which broke callers that rebind
+                the result of the operator.
+        """
+        assert len(self.anchor_masks) == len(self.downsample_ratios), \
+            "anchor_masks', and 'downsample_ratios' should have same length."
+        # assumes images are laid out C x H x W and that every sample shares
+        # the first sample's size -- TODO confirm
+        h, w = samples[0]['image'].shape[1:3]
+        for sample in samples:
+            gt_bbox = sample['gt_bbox']
+            gt_ide = sample['gt_ide']
+            for i, (anchor_hw, downsample_ratio
+                    ) in enumerate(zip(self.anchors, self.downsample_ratios)):
+                # express the anchors in grid-cell units for this scale
+                anchor_hw = np.array(
+                    anchor_hw, dtype=np.float32) / downsample_ratio
+                nA = len(anchor_hw)
+                nGh, nGw = int(h / downsample_ratio), int(w / downsample_ratio)
+                tbox = np.zeros((nA, nGh, nGw, 4), dtype=np.float32)
+                tconf = np.zeros((nA, nGh, nGw), dtype=np.float32)
+                tid = -np.ones((nA, nGh, nGw, 1), dtype=np.float32)
+
+                # presumably gt_bbox rows are normalized [cx, cy, w, h];
+                # scale them to grid units -- verify against the reader
+                gxy, gwh = gt_bbox[:, 0:2].copy(), gt_bbox[:, 2:4].copy()
+                gxy[:, 0] = gxy[:, 0] * nGw
+                gxy[:, 1] = gxy[:, 1] * nGh
+                gwh[:, 0] = gwh[:, 0] * nGw
+                gwh[:, 1] = gwh[:, 1] * nGh
+                # integer grid cell of every box centre, clipped to the grid
+                gi = np.clip(gxy[:, 0], 0, nGw - 1).astype(int)
+                gj = np.clip(gxy[:, 1], 0, nGh - 1).astype(int)
+
+                # iou of targets-anchors (using wh only)
+                box1 = gwh  # [N, 2]
+                box2 = anchor_hw[:, None, :]  # [nA, 1, 2]
+                inter_area = np.minimum(box1, box2).prod(2)  # [nA, N]
+                iou = inter_area / (
+                    box1.prod(1) + box2.prod(2) - inter_area + 1e-16)
+
+                # Select best iou_pred and anchor
+                iou_best = iou.max(0)  # best anchor [0-2] for each target
+                a = np.argmax(iou, axis=0)
+
+                # Select best unique target-anchor combinations
+                iou_order = np.argsort(-iou_best)  # best to worst
+
+                # Unique anchor selection: columns are ordered best-first, so
+                # the first occurrence of a (cell, anchor) triple is the
+                # target with the highest IoU for it
+                u = np.stack((gi, gj, a), 0)[:, iou_order]
+                _, first_unique = np.unique(u, axis=1, return_index=True)
+                mask = iou_order[first_unique]
+                # best anchor must share significant commonality (iou) with target
+                # TODO: examine arbitrary threshold
+                idx = mask[iou_best[mask] > self.max_iou_thresh]
+
+                if len(idx) > 0:
+                    a_i, gj_i, gi_i = a[idx], gj[idx], gi[idx]
+                    t_box = gt_bbox[idx]
+                    t_id = gt_ide[idx]
+                    if len(t_box.shape) == 1:
+                        t_box = t_box.reshape(1, 4)
+
+                    gxy, gwh = t_box[:, 0:2].copy(), t_box[:, 2:4].copy()
+                    gxy[:, 0] = gxy[:, 0] * nGw
+                    gxy[:, 1] = gxy[:, 1] * nGh
+                    gwh[:, 0] = gwh[:, 0] * nGw
+                    gwh[:, 1] = gwh[:, 1] * nGh
+
+                    # XY coordinates: fractional offset inside the grid cell
+                    tbox[:, :, :, 0:2][a_i, gj_i, gi_i] = gxy - gxy.astype(int)
+                    # Width and height in yolo method
+                    tbox[:, :, :, 2:4][a_i, gj_i, gi_i] = np.log(
+                        gwh / anchor_hw[a_i])
+                    tconf[a_i, gj_i, gi_i] = 1
+                    tid[a_i, gj_i, gi_i] = t_id
+
+                sample['tbox{}'.format(i)] = tbox
+                sample['tconf{}'.format(i)] = tconf
+                sample['tide{}'.format(i)] = tid
+        return samples
+
+
+# NOTE(review): unlike the other operators in this file this class carries no
+# @register_op decorator -- confirm whether that is intended.
+class Gt2FairMOTTarget(Gt2TTFTarget):
+    # NOTE: the string below follows the `__shared__` assignment, so Python
+    # does not treat it as the class docstring; it is descriptive text only.
+    __shared__ = ['num_classes']
+    """
+    Generate FairMOT targets by ground truth data.
+    Difference between Gt2FairMOTTarget and Gt2TTFTarget are:
+        1. the gaussian kernal radius to generate a heatmap.
+        2. the targets needed during traing.
+
+    Args:
+        num_classes(int): the number of classes.
+        down_ratio(int): the down ratio from images to heatmap, 4 by default.
+        max_objs(int): the maximum number of ground truth objects in a image, 500 by default.
+    """
+
+    def __init__(self, num_classes=1, down_ratio=4, max_objs=500):
+        # super() is anchored at Gt2TTFTarget, so the lookup starts after it
+        # in the MRO and Gt2TTFTarget.__init__ itself is not executed.
+        super(Gt2TTFTarget, self).__init__()
+        self.down_ratio = down_ratio
+        self.num_classes = num_classes
+        self.max_objs = max_objs
+
+    def __call__(self, samples, context=None):
+        """Replace the ground-truth fields of every sample with FairMOT targets.
+
+        Writes 'heatmap', 'index', 'offset', 'size', 'index_mask', 'reid' and
+        'bbox_xys' (plus 'cls_id_map' and 'cls_tr_ids' when num_classes > 1),
+        then removes 'is_crowd', 'difficult', 'gt_class', 'gt_bbox',
+        'gt_score' and 'gt_ide' if present.
+
+        Returns:
+            list: the same ``samples`` list, modified in place.
+        """
+        for b_id, sample in enumerate(samples):
+            # assumes the image is laid out C x H x W -- TODO confirm
+            output_h = sample['image'].shape[1] // self.down_ratio
+            output_w = sample['image'].shape[2] // self.down_ratio
+
+            # fixed-size target buffers; rows beyond the number of boxes stay 0
+            heatmap = np.zeros(
+                (self.num_classes, output_h, output_w), dtype='float32')
+            bbox_size = np.zeros((self.max_objs, 4), dtype=np.float32)
+            center_offset = np.zeros((self.max_objs, 2), dtype=np.float32)
+            index = np.zeros((self.max_objs, ), dtype=np.int64)
+            index_mask = np.zeros((self.max_objs, ), dtype=np.int32)
+            reid = np.zeros((self.max_objs, ), dtype=np.int64)
+            bbox_xys = np.zeros((self.max_objs, 4), dtype=np.float32)
+            if self.num_classes > 1:
+                # each category corresponds to a set of track ids
+                cls_tr_ids = np.zeros(
+                    (self.num_classes, output_h, output_w), dtype=np.int64)
+                cls_id_map = np.full((output_h, output_w), -1, dtype=np.int64)
+
+            gt_bbox = sample['gt_bbox']
+            gt_class = sample['gt_class']
+            gt_ide = sample['gt_ide']
+
+            # NOTE(review): k indexes the max_objs-sized buffers directly, so
+            # more than max_objs boxes would index out of range -- confirm the
+            # reader caps the box count
+            for k in range(len(gt_bbox)):
+                cls_id = gt_class[k][0]
+                # if gt_bbox is an ndarray this is a view, so the scaling and
+                # clipping below also modify sample['gt_bbox'] (popped later)
+                bbox = gt_bbox[k]
+                ide = gt_ide[k][0]
+                # presumably boxes are normalized [cx, cy, w, h]; scale them
+                # to heatmap units -- verify against the reader
+                bbox[[0, 2]] = bbox[[0, 2]] * output_w
+                bbox[[1, 3]] = bbox[[1, 3]] * output_h
+                # corner form of the box computed BEFORE the centre is clipped
+                bbox_amodal = copy.deepcopy(bbox)
+                bbox_amodal[0] = bbox_amodal[0] - bbox_amodal[2] / 2.
+                bbox_amodal[1] = bbox_amodal[1] - bbox_amodal[3] / 2.
+                bbox_amodal[2] = bbox_amodal[0] + bbox_amodal[2]
+                bbox_amodal[3] = bbox_amodal[1] + bbox_amodal[3]
+                # keep the centre inside the heatmap
+                bbox[0] = np.clip(bbox[0], 0, output_w - 1)
+                bbox[1] = np.clip(bbox[1], 0, output_h - 1)
+                h = bbox[3]
+                w = bbox[2]
+
+                # corner form of the box computed AFTER the centre is clipped
+                bbox_xy = copy.deepcopy(bbox)
+                bbox_xy[0] = bbox_xy[0] - bbox_xy[2] / 2
+                bbox_xy[1] = bbox_xy[1] - bbox_xy[3] / 2
+                bbox_xy[2] = bbox_xy[0] + bbox_xy[2]
+                bbox_xy[3] = bbox_xy[1] + bbox_xy[3]
+
+                if h > 0 and w > 0:
+                    radius = gaussian_radius((math.ceil(h), math.ceil(w)), 0.7)
+                    radius = max(0, int(radius))
+                    ct = np.array([bbox[0], bbox[1]], dtype=np.float32)
+                    ct_int = ct.astype(np.int32)
+                    # draw_truncate_gaussian is not defined in this class;
+                    # presumably inherited from Gt2TTFTarget
+                    self.draw_truncate_gaussian(heatmap[cls_id], ct_int,
+                                                radius, radius)
+                    # distances from the centre to the left, top, right and
+                    # bottom edges of the unclipped box
+                    bbox_size[k] = ct[0] - bbox_amodal[0], ct[1] - bbox_amodal[1], \
+                            bbox_amodal[2] - ct[0], bbox_amodal[3] - ct[1]
+
+                    # row-major flat index of the centre cell in the heatmap
+                    index[k] = ct_int[1] * output_w + ct_int[0]
+                    # sub-cell remainder lost by the integer truncation
+                    center_offset[k] = ct - ct_int
+                    index_mask[k] = 1
+                    reid[k] = ide
+                    bbox_xys[k] = bbox_xy
+                    if self.num_classes > 1:
+                        cls_id_map[ct_int[1], ct_int[0]] = cls_id
+                        cls_tr_ids[cls_id][ct_int[1]][ct_int[0]] = ide - 1
+                        # track id start from 0
+
+            sample['heatmap'] = heatmap
+            sample['index'] = index
+            sample['offset'] = center_offset
+            sample['size'] = bbox_size
+            sample['index_mask'] = index_mask
+            sample['reid'] = reid
+            if self.num_classes > 1:
+                sample['cls_id_map'] = cls_id_map
+                sample['cls_tr_ids'] = cls_tr_ids
+            sample['bbox_xys'] = bbox_xys
+            # drop the raw annotations now that the targets are built
+            sample.pop('is_crowd', None)
+            sample.pop('difficult', None)
+            sample.pop('gt_class', None)
+            sample.pop('gt_bbox', None)
+            sample.pop('gt_score', None)
+            sample.pop('gt_ide', None)
+        return samples

+ 498 - 0
paddlers/models/ppdet/data/transform/op_helper.py

@@ -0,0 +1,498 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# this file contains helper methods for BBOX processing
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+import random
+import math
+import cv2
+
+
+def meet_emit_constraint(src_bbox, sample_bbox):
+    """Tell whether the center of ``src_bbox`` falls inside ``sample_bbox``.
+
+    Both boxes are indexed as [x0, y0, x1, y1]. A center lying exactly on
+    the border of ``sample_bbox`` counts as inside.
+
+    Returns:
+        bool: True when the center point is within the sample box.
+    """
+    mid_x = (src_bbox[2] + src_bbox[0]) / 2
+    mid_y = (src_bbox[3] + src_bbox[1]) / 2
+    # Horizontal test first, then the vertical one.
+    if not (mid_x >= sample_bbox[0] and mid_x <= sample_bbox[2]):
+        return False
+    if not (mid_y >= sample_bbox[1] and mid_y <= sample_bbox[3]):
+        return False
+    return True
+
+
+def clip_bbox(src_bbox):
+    """Clamp the first four entries of ``src_bbox`` to [0.0, 1.0] in place.
+
+    Returns:
+        The same ``src_bbox`` object that was passed in, after mutation.
+    """
+    for pos in range(4):
+        src_bbox[pos] = max(min(src_bbox[pos], 1.0), 0.0)
+    return src_bbox
+
+
+def bbox_area(src_bbox):
+    """Return the area of an [x0, y0, x1, y1] box.
+
+    A box whose x1 < x0 or y1 < y0 is treated as empty and yields 0.
+    """
+    inverted = src_bbox[2] < src_bbox[0] or src_bbox[3] < src_bbox[1]
+    if inverted:
+        return 0.
+    return (src_bbox[2] - src_bbox[0]) * (src_bbox[3] - src_bbox[1])
+
+
+def is_overlap(object_bbox, sample_bbox):
+    """Tell whether two [x0, y0, x1, y1] boxes share a region of positive size.
+
+    Boxes that merely touch along an edge or at a corner do not overlap.
+    """
+    disjoint = (object_bbox[0] >= sample_bbox[2] or
+                object_bbox[2] <= sample_bbox[0] or
+                object_bbox[1] >= sample_bbox[3] or
+                object_bbox[3] <= sample_bbox[1])
+    return not disjoint
+
+
+def filter_and_process(sample_bbox,
+                       bboxes,
+                       labels,
+                       scores=None,
+                       keypoints=None):
+    """Keep the boxes that belong to ``sample_bbox`` and re-express them in it.
+
+    A box is kept when its center lies inside ``sample_bbox``, it overlaps
+    ``sample_bbox``, and its area is still positive after being converted to
+    coordinates relative to ``sample_bbox`` and clipped to [0, 1].
+
+    Args:
+        sample_bbox: crop box, indexed as [x0, y0, x1, y1].
+        bboxes: sequence of boxes, each indexed as [x0, y0, x1, y1].
+        labels: per-box labels; element ``[i][0]`` is copied for kept boxes.
+        scores: optional per-box scores; element ``[i][0]`` is copied.
+        keypoints: optional pair ``(coords, ignore)``. ``coords[i]`` is a flat
+            sequence alternating x, y values and is rewritten IN PLACE.
+
+    Returns:
+        ``(bboxes, labels, scores)`` as numpy arrays, plus a trailing
+        ``(keypoints, ignore)`` tuple when ``keypoints`` was given. ``scores``
+        is an empty array when no scores were supplied.
+    """
+    kept_boxes, kept_labels, kept_scores = [], [], []
+    kept_kps, kept_kp_ignore = [], []
+    for idx, box in enumerate(bboxes):
+        obj_bbox = [box[0], box[1], box[2], box[3]]
+        if not meet_emit_constraint(obj_bbox, sample_bbox):
+            continue
+        if not is_overlap(obj_bbox, sample_bbox):
+            continue
+        sample_width = sample_bbox[2] - sample_bbox[0]
+        sample_height = sample_bbox[3] - sample_bbox[1]
+        # Shift to the crop origin and scale by the crop size, then clip.
+        rel_bbox = clip_bbox([
+            (obj_bbox[0] - sample_bbox[0]) / sample_width,
+            (obj_bbox[1] - sample_bbox[1]) / sample_height,
+            (obj_bbox[2] - sample_bbox[0]) / sample_width,
+            (obj_bbox[3] - sample_bbox[1]) / sample_height,
+        ])
+        if not bbox_area(rel_bbox) > 0:
+            continue
+        kept_boxes.append(rel_bbox)
+        kept_labels.append([labels[idx][0]])
+        if scores is not None:
+            kept_scores.append([scores[idx][0]])
+        if keypoints is not None:
+            kps = keypoints[0][idx]
+            for j in range(len(kps)):
+                # Even slots hold x values, odd slots hold y values.
+                if j % 2:
+                    origin, extent = sample_bbox[1], sample_height
+                else:
+                    origin, extent = sample_bbox[0], sample_width
+                kps[j] = (kps[j] - origin) / extent
+                kps[j] = max(min(kps[j], 1.0), 0.0)
+            kept_kps.append(kps)
+            kept_kp_ignore.append(keypoints[1][idx])
+
+    bboxes = np.array(kept_boxes)
+    labels = np.array(kept_labels)
+    scores = np.array(kept_scores)
+    if keypoints is None:
+        return bboxes, labels, scores
+    return bboxes, labels, scores, (np.array(kept_kps),
+                                    np.array(kept_kp_ignore))
+
+
+def bbox_area_sampling(bboxes, labels, scores, target_size, min_size):
+    """Drop boxes whose area, scaled by ``target_size``, is below ``min_size``**2.
+
+    Args:
+        bboxes: iterable of boxes indexed as [x0, y0, x1, y1].
+        labels: per-box labels, indexed in parallel with ``bboxes``.
+        scores: numpy array of per-box scores, or None / empty to skip them.
+        target_size: factor applied to each box width and height.
+        min_size: minimum side length; boxes with scaled area smaller than
+            ``min_size * min_size`` are removed.
+
+    Returns:
+        Tuple ``(bboxes, labels, scores)`` of numpy arrays with the survivors.
+    """
+    kept_boxes, kept_labels, kept_scores = [], [], []
+    for idx, box in enumerate(bboxes):
+        box_w = float((box[2] - box[0]) * target_size)
+        box_h = float((box[3] - box[1]) * target_size)
+        if box_w * box_h < float(min_size * min_size):
+            continue
+        kept_boxes.append(box)
+        kept_labels.append(labels[idx])
+        if scores is not None and scores.size != 0:
+            kept_scores.append(scores[idx])
+    return np.array(kept_boxes), np.array(kept_labels), np.array(kept_scores)
+
+
+def generate_sample_bbox(sampler):
+    """Draw a random crop box in normalized [0, 1] coordinates.
+
+    Reads ``sampler[2]``/``sampler[3]`` as the scale range and
+    ``sampler[4]``/``sampler[5]`` as the aspect-ratio range. The ratio is
+    clamped to [scale**2, 1 / scale**2] so that neither side exceeds 1.
+
+    Returns:
+        list: [xmin, ymin, xmax, ymax].
+    """
+    scale = np.random.uniform(sampler[2], sampler[3])
+    ratio = np.random.uniform(sampler[4], sampler[5])
+    ratio = min(max(ratio, (scale**2.0)), 1 / (scale**2.0))
+    box_w = scale * (ratio**0.5)
+    box_h = scale / (ratio**0.5)
+    # Pick the top-left corner so that the box stays inside the unit square.
+    left = np.random.uniform(0, 1 - box_w)
+    top = np.random.uniform(0, 1 - box_h)
+    return [left, top, left + box_w, top + box_h]
+
+
+def generate_sample_bbox_square(sampler, image_width, image_height):
+    """Draw a random crop box that is square when measured in pixels.
+
+    Scale and aspect ratio are drawn as in ``generate_sample_bbox`` (from
+    ``sampler[2:4]`` and ``sampler[4:6]``); one normalized side is then
+    recomputed from the other using the image aspect ratio.
+
+    Returns:
+        list: normalized [xmin, ymin, xmax, ymax].
+    """
+    scale = np.random.uniform(sampler[2], sampler[3])
+    ratio = np.random.uniform(sampler[4], sampler[5])
+    ratio = min(max(ratio, (scale**2.0)), 1 / (scale**2.0))
+    box_w = scale * (ratio**0.5)
+    box_h = scale / (ratio**0.5)
+    if image_height < image_width:
+        # Landscape image: derive the width from the height.
+        box_w = box_h * image_height / image_width
+    else:
+        box_h = box_w * image_width / image_height
+    left = np.random.uniform(0, 1 - box_w)
+    top = np.random.uniform(0, 1 - box_h)
+    return [left, top, left + box_w, top + box_h]
+
+
+def data_anchor_sampling(bbox_labels, image_width, image_height, scale_array,
+                         resize_width):
+    """Sample a crop box around one randomly chosen ground-truth box.
+
+    One entry of ``bbox_labels`` is picked at random, a target scale is drawn
+    from ``scale_array`` based on that box's pixel area, and a crop whose
+    pixel width and height are both ``sample_bbox_size`` is positioned at a
+    random offset.
+
+    Args:
+        bbox_labels: sequence of boxes; elements 0..3 of each entry are read
+            as normalized xmin, ymin, xmax, ymax.
+        image_width: image width in pixels.
+        image_height: image height in pixels.
+        scale_array: sequence of candidate scales; their squares are compared
+            with the chosen box's pixel area.
+        resize_width: factor used when converting the chosen scale into the
+            crop size (``wid * resize_width / scale_choose``).
+
+    Returns:
+        list: normalized [xmin, ymin, xmax, ymax] of the crop (not clipped,
+        so values may fall outside [0, 1]), or the integer 0 when
+        ``bbox_labels`` is empty.
+    """
+    num_gt = len(bbox_labels)
+    # np.random.randint range: [low, high)
+    rand_idx = np.random.randint(0, num_gt) if num_gt != 0 else 0
+
+    if num_gt != 0:
+        norm_xmin = bbox_labels[rand_idx][0]
+        norm_ymin = bbox_labels[rand_idx][1]
+        norm_xmax = bbox_labels[rand_idx][2]
+        norm_ymax = bbox_labels[rand_idx][3]
+
+        # Convert the chosen box from normalized to pixel units.
+        xmin = norm_xmin * image_width
+        ymin = norm_ymin * image_height
+        wid = image_width * (norm_xmax - norm_xmin)
+        hei = image_height * (norm_ymax - norm_ymin)
+        range_size = 0
+
+        # Find the first pair of adjacent scales whose squares bracket the
+        # box area; range_size is the index of the upper one.
+        area = wid * hei
+        for scale_ind in range(0, len(scale_array) - 1):
+            if area > scale_array[scale_ind] ** 2 and area < \
+                    scale_array[scale_ind + 1] ** 2:
+                range_size = scale_ind + 1
+                break
+
+        # Boxes larger than the second-to-last scale override the result of
+        # the loop above with index len(scale_array) - 2.
+        if area > scale_array[len(scale_array) - 2]**2:
+            range_size = len(scale_array) - 2
+
+        scale_choose = 0.0
+        if range_size == 0:
+            rand_idx_size = 0
+        else:
+            # np.random.randint range: [low, high)
+            rng_rand_size = np.random.randint(0, range_size + 1)
+            # The modulo is a no-op: the draw is already in [0, range_size].
+            rand_idx_size = rng_rand_size % (range_size + 1)
+
+        # NOTE: scale_choose is drawn with the stdlib `random` module while
+        # every other draw here uses np.random, so seeding only one of the
+        # two generators does not make this function reproducible.
+        if rand_idx_size == range_size:
+            # Upper bound is additionally capped at 2 * sqrt(box area).
+            min_resize_val = scale_array[rand_idx_size] / 2.0
+            max_resize_val = min(2.0 * scale_array[rand_idx_size],
+                                 2 * math.sqrt(wid * hei))
+            scale_choose = random.uniform(min_resize_val, max_resize_val)
+        else:
+            min_resize_val = scale_array[rand_idx_size] / 2.0
+            max_resize_val = 2.0 * scale_array[rand_idx_size]
+            scale_choose = random.uniform(min_resize_val, max_resize_val)
+
+        # Side length (in pixels) of the crop.
+        sample_bbox_size = wid * resize_width / scale_choose
+
+        w_off_orig = 0.0
+        h_off_orig = 0.0
+        if sample_bbox_size < max(image_height, image_width):
+            # Crop is smaller than the longer image side: draw each offset
+            # between the two positions at which one crop edge coincides
+            # with the matching edge of the chosen box.
+            if wid <= sample_bbox_size:
+                w_off_orig = np.random.uniform(xmin + wid - sample_bbox_size,
+                                               xmin)
+            else:
+                w_off_orig = np.random.uniform(xmin,
+                                               xmin + wid - sample_bbox_size)
+
+            if hei <= sample_bbox_size:
+                h_off_orig = np.random.uniform(ymin + hei - sample_bbox_size,
+                                               ymin)
+            else:
+                h_off_orig = np.random.uniform(ymin,
+                                               ymin + hei - sample_bbox_size)
+
+        else:
+            # Crop is at least as large as the longer image side: offsets
+            # are drawn between (image size - crop size) and 0.
+            w_off_orig = np.random.uniform(image_width - sample_bbox_size, 0.0)
+            h_off_orig = np.random.uniform(image_height - sample_bbox_size,
+                                           0.0)
+
+        w_off_orig = math.floor(w_off_orig)
+        h_off_orig = math.floor(h_off_orig)
+
+        # Figure out top left coordinates.
+        w_off = float(w_off_orig / image_width)
+        h_off = float(h_off_orig / image_height)
+
+        sampled_bbox = [
+            w_off, h_off, w_off + float(sample_bbox_size / image_width),
+            h_off + float(sample_bbox_size / image_height)
+        ]
+        return sampled_bbox
+    else:
+        # No ground-truth boxes: callers receive 0 instead of a box.
+        return 0
+
+
+def jaccard_overlap(sample_bbox, object_bbox):
+    """Return the intersection-over-union of two [x0, y0, x1, y1] boxes.
+
+    Boxes that do not overlap (including boxes that only touch) yield the
+    integer 0.
+    """
+    apart = (sample_bbox[0] >= object_bbox[2] or
+             sample_bbox[2] <= object_bbox[0] or
+             sample_bbox[1] >= object_bbox[3] or
+             sample_bbox[3] <= object_bbox[1])
+    if apart:
+        return 0
+    inter_w = min(sample_bbox[2], object_bbox[2]) - max(sample_bbox[0],
+                                                         object_bbox[0])
+    inter_h = min(sample_bbox[3], object_bbox[3]) - max(sample_bbox[1],
+                                                         object_bbox[1])
+    inter_area = inter_w * inter_h
+    union_area = bbox_area(sample_bbox) + bbox_area(object_bbox) - inter_area
+    return inter_area / union_area
+
+
+def intersect_bbox(bbox1, bbox2):
+    """Return the intersection of two [x0, y0, x1, y1] boxes.
+
+    When the boxes are strictly separated along either axis the result is
+    [0.0, 0.0, 0.0, 0.0].
+    """
+    separated = (bbox2[0] > bbox1[2] or bbox2[2] < bbox1[0] or
+                 bbox2[1] > bbox1[3] or bbox2[3] < bbox1[1])
+    if separated:
+        return [0.0, 0.0, 0.0, 0.0]
+    return [
+        max(bbox1[0], bbox2[0]),
+        max(bbox1[1], bbox2[1]),
+        min(bbox1[2], bbox2[2]),
+        min(bbox1[3], bbox2[3]),
+    ]
+
+
+def bbox_coverage(bbox1, bbox2):
+    """Return the fraction of ``bbox1``'s area that is covered by ``bbox2``.
+
+    Yields 0. when the two boxes have no intersection of positive area.
+    """
+    shared_area = bbox_area(intersect_bbox(bbox1, bbox2))
+    if not shared_area > 0:
+        return 0.
+    return shared_area / bbox_area(bbox1)
+
+
+def satisfy_sample_constraint(sampler,
+                              sample_bbox,
+                              gt_bboxes,
+                              satisfy_all=False):
+    """Check ``sample_bbox`` against the IoU limits stored in ``sampler``.
+
+    ``sampler[6]`` is the minimum and ``sampler[7]`` the maximum allowed
+    jaccard overlap; a value of 0 disables that limit, and when both are 0
+    the sample is accepted unconditionally.
+
+    Args:
+        sampler: sequence holding the limits at positions 6 and 7.
+        sample_bbox: candidate box, [x0, y0, x1, y1].
+        gt_bboxes: ground-truth boxes, each indexed as [x0, y0, x1, y1].
+        satisfy_all: if True every ground-truth box must pass; otherwise a
+            single passing box is enough.
+
+    Returns:
+        True/False; with ``satisfy_all`` the value comes from ``np.all``.
+    """
+    if sampler[6] == 0 and sampler[7] == 0:
+        return True
+    verdicts = []
+    for gt_box in gt_bboxes:
+        object_bbox = [gt_box[0], gt_box[1], gt_box[2], gt_box[3]]
+        overlap = jaccard_overlap(sample_bbox, object_bbox)
+        if sampler[6] != 0 and overlap < sampler[6]:
+            passed = False
+        elif sampler[7] != 0 and overlap > sampler[7]:
+            passed = False
+        else:
+            passed = True
+        verdicts.append(passed)
+        # In "any" mode the first passing box settles the answer.
+        if passed and not satisfy_all:
+            return True
+
+    return np.all(verdicts) if satisfy_all else False
+
+
+def satisfy_sample_constraint_coverage(sampler, sample_bbox, gt_bboxes):
+    """Check ``sample_bbox`` against IoU and object-coverage limits.
+
+    ``sampler[6]``/``sampler[7]`` are the min/max jaccard overlap and
+    ``sampler[8]``/``sampler[9]`` the min/max object coverage. A limit of 0
+    is disabled; when all four are 0 the sample is accepted unconditionally.
+
+    Args:
+        sampler: sequence holding the limits at positions 6..9.
+        sample_bbox: candidate box, [x0, y0, x1, y1].
+        gt_bboxes: ground-truth boxes, each indexed as [x0, y0, x1, y1].
+
+    Returns:
+        bool: True when some ground-truth box passes the enabled checks
+        (see the review note in the loop for a subtlety).
+    """
+    if sampler[6] == 0 and sampler[7] == 0:
+        has_jaccard_overlap = False
+    else:
+        has_jaccard_overlap = True
+    if sampler[8] == 0 and sampler[9] == 0:
+        has_object_coverage = False
+    else:
+        has_object_coverage = True
+
+    if not has_jaccard_overlap and not has_object_coverage:
+        return True
+    found = False
+    for i in range(len(gt_bboxes)):
+        object_bbox = [
+            gt_bboxes[i][0], gt_bboxes[i][1], gt_bboxes[i][2], gt_bboxes[i][3]
+        ]
+        if has_jaccard_overlap:
+            overlap = jaccard_overlap(sample_bbox, object_bbox)
+            if sampler[6] != 0 and \
+                    overlap < sampler[6]:
+                continue
+            if sampler[7] != 0 and \
+                    overlap > sampler[7]:
+                continue
+            found = True
+        # NOTE(review): `found` is already True here if the jaccard check
+        # passed, and it is not reset when a coverage check below hits
+        # `continue`. A box that passes IoU but fails coverage therefore
+        # still makes the final `return found` yield True. Confirm whether
+        # this is intended before changing it.
+        if has_object_coverage:
+            # Fraction of the object's area that lies inside the sample box.
+            object_coverage = bbox_coverage(object_bbox, sample_bbox)
+            if sampler[8] != 0 and \
+                    object_coverage < sampler[8]:
+                continue
+            if sampler[9] != 0 and \
+                    object_coverage > sampler[9]:
+                continue
+            found = True
+        if found:
+            return True
+    return found
+
+
+def crop_image_sampling(img, sample_bbox, image_width, image_height,
+                        target_size):
+    """Crop ``img`` to ``sample_bbox`` with zero padding, then resize it.
+
+    The sample box may extend beyond the image; the part that lies outside
+    stays zero in the output canvas.
+
+    Args:
+        img: source image array, indexed as [row, column, ...]; it is copied
+            into a 3-channel canvas (assumes an H x W x 3 array -- TODO
+            confirm against callers).
+        sample_bbox: normalized [xmin, ymin, xmax, ymax]; not clipped.
+        image_width: image width in pixels, used to de-normalize x values.
+        image_height: image height in pixels, used to de-normalize y values.
+        target_size: side length of the square output.
+
+    Returns:
+        The crop resized to ``(target_size, target_size)`` with
+        ``cv2.INTER_AREA`` interpolation.
+    """
+    # no clipping here
+    xmin = int(sample_bbox[0] * image_width)
+    xmax = int(sample_bbox[2] * image_width)
+    ymin = int(sample_bbox[1] * image_height)
+    ymax = int(sample_bbox[3] * image_height)
+
+    w_off = xmin
+    h_off = ymin
+    width = xmax - xmin
+    height = ymax - ymin
+    # "cross" is the part of the sample box that lies inside the image.
+    cross_xmin = max(0.0, float(w_off))
+    cross_ymin = max(0.0, float(h_off))
+    cross_xmax = min(float(w_off + width - 1.0), float(image_width))
+    cross_ymax = min(float(h_off + height - 1.0), float(image_height))
+    cross_width = cross_xmax - cross_xmin
+    cross_height = cross_ymax - cross_ymin
+
+    # "roi" is where that part lands inside the output canvas: shifted by
+    # however far the sample box starts before the image origin.
+    roi_xmin = 0 if w_off >= 0 else abs(w_off)
+    roi_ymin = 0 if h_off >= 0 else abs(h_off)
+    roi_width = cross_width
+    roi_height = cross_height
+
+    roi_y1 = int(roi_ymin)
+    roi_y2 = int(roi_ymin + roi_height)
+    roi_x1 = int(roi_xmin)
+    roi_x2 = int(roi_xmin + roi_width)
+
+    cross_y1 = int(cross_ymin)
+    cross_y2 = int(cross_ymin + cross_height)
+    cross_x1 = int(cross_xmin)
+    cross_x2 = int(cross_xmin + cross_width)
+
+    # Zero-filled canvas of the full sample size; only the roi is filled.
+    sample_img = np.zeros((height, width, 3))
+    sample_img[roi_y1: roi_y2, roi_x1: roi_x2] = \
+        img[cross_y1: cross_y2, cross_x1: cross_x2]
+
+    sample_img = cv2.resize(
+        sample_img, (target_size, target_size), interpolation=cv2.INTER_AREA)
+
+    return sample_img
+
+
+def is_poly(segm):
+    """Return True when ``segm`` is a list, False when it is a dict.
+
+    Any other type trips an assertion.
+    """
+    supported = isinstance(segm, (list, dict))
+    assert supported, "Invalid segm type: {}".format(type(segm))
+    if isinstance(segm, list):
+        return True
+    return False
+
+
+def gaussian_radius(bbox_size, min_overlap):
+    """Return the smallest of three candidate radii for a box.
+
+    Each candidate comes from one quadratic whose coefficients are built
+    from the box height/width and ``min_overlap``.
+
+    Args:
+        bbox_size: ``(height, width)`` of the box.
+        min_overlap: overlap threshold used in the three quadratics.
+
+    Returns:
+        The minimum of the three candidate radii.
+    """
+    height, width = bbox_size
+    size_sum = height + width
+
+    # Candidate 1.
+    a1 = 1
+    c1 = width * height * (1 - min_overlap) / (1 + min_overlap)
+    root1 = np.sqrt(size_sum**2 - 4 * a1 * c1)
+    candidate1 = (size_sum + root1) / (2 * a1)
+
+    # Candidate 2.
+    a2 = 4
+    b2 = 2 * size_sum
+    c2 = (1 - min_overlap) * width * height
+    root2 = np.sqrt(b2**2 - 4 * a2 * c2)
+    candidate2 = (b2 + root2) / 2
+
+    # Candidate 3.
+    a3 = 4 * min_overlap
+    b3 = -2 * min_overlap * size_sum
+    c3 = (min_overlap - 1) * width * height
+    root3 = np.sqrt(b3**2 - 4 * a3 * c3)
+    candidate3 = (b3 + root3) / 2
+
+    return min(candidate1, candidate2, candidate3)
+
+
+def draw_gaussian(heatmap, center, radius, k=1, delte=6):
+    """Stamp a 2D gaussian onto ``heatmap`` in place, keeping the maximum.
+
+    Args:
+        heatmap: 2D array (rows, columns) that is updated in place.
+        center: ``(x, y)`` integer position of the peak.
+        radius: integer radius; the kernel spans ``2 * radius + 1`` cells.
+        k: factor applied to the kernel before merging.
+        delte: divisor turning the kernel diameter into sigma.
+
+    Returns:
+        None.
+    """
+    diameter = 2 * radius + 1
+    sigma = diameter / delte
+    kernel = gaussian2D((diameter, diameter), sigma_x=sigma, sigma_y=sigma)
+
+    cx, cy = center
+    rows, cols = heatmap.shape[0:2]
+
+    # Extent of the kernel that fits inside the heatmap on each side.
+    left = min(cx, radius)
+    right = min(cols - cx, radius + 1)
+    top = min(cy, radius)
+    bottom = min(rows - cy, radius + 1)
+
+    target = heatmap[cy - top:cy + bottom, cx - left:cx + right]
+    patch = kernel[radius - top:radius + bottom, radius - left:radius + right]
+    np.maximum(target, patch * k, out=target)
+
+
+def gaussian2D(shape, sigma_x=1, sigma_y=1):
+    """Build a 2D gaussian kernel centered in an array of size ``shape``.
+
+    Args:
+        shape: ``(rows, columns)`` of the kernel.
+        sigma_x: standard deviation along columns.
+        sigma_y: standard deviation along rows.
+
+    Returns:
+        Array with peak value 1 at the center; entries smaller than machine
+        epsilon times the peak are set to 0.
+    """
+    half_rows, half_cols = [(extent - 1.) / 2. for extent in shape]
+    ys, xs = np.ogrid[-half_rows:half_rows + 1, -half_cols:half_cols + 1]
+
+    exponent = xs * xs / (2 * sigma_x * sigma_x) + ys * ys / (2 * sigma_y *
+                                                              sigma_y)
+    kernel = np.exp(-exponent)
+    negligible = kernel < np.finfo(kernel.dtype).eps * kernel.max()
+    kernel[negligible] = 0
+    return kernel
+
+
+def draw_umich_gaussian(heatmap, center, radius, k=1):
+    """
+    Stamp a gaussian with sigma = diameter / 6 onto ``heatmap`` in place and
+    return the heatmap; refer to
+    https://github.com/xingyizhou/CenterNet/blob/master/src/lib/utils/image.py#L126
+
+    Nothing is written when the clipped patch is empty.
+    """
+    diameter = 2 * radius + 1
+    kernel = gaussian2D(
+        (diameter, diameter), sigma_x=diameter / 6, sigma_y=diameter / 6)
+
+    cx = int(center[0])
+    cy = int(center[1])
+    rows, cols = heatmap.shape[0:2]
+
+    # Extent of the kernel that fits inside the heatmap on each side.
+    left = min(cx, radius)
+    right = min(cols - cx, radius + 1)
+    top = min(cy, radius)
+    bottom = min(rows - cy, radius + 1)
+
+    target = heatmap[cy - top:cy + bottom, cx - left:cx + right]
+    patch = kernel[radius - top:radius + bottom, radius - left:radius + right]
+    if min(patch.shape) > 0 and min(target.shape) > 0:
+        np.maximum(target, patch * k, out=target)
+    return heatmap
+
+
+def get_border(border, size):
+    """Halve ``border`` until it is smaller than half of ``size``.
+
+    Returns ``border // i`` for the first power of two ``i`` such that
+    ``size - border // i > border // i``.
+    """
+    divisor = 1
+    while True:
+        shrunk = border // divisor
+        if not (size - shrunk <= shrunk):
+            return shrunk
+        divisor *= 2

+ 3025 - 0
paddlers/models/ppdet/data/transform/operators.py

@@ -0,0 +1,3025 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# function:
+#    operators to process sample,
+#    eg: decode/resize/crop image
+
+from __future__ import absolute_import
+from __future__ import print_function
+from __future__ import division
+
+try:
+    from collections.abc import Sequence
+except Exception:
+    from collections import Sequence
+
+from numbers import Number, Integral
+
+import uuid
+import random
+import math
+import numpy as np
+import os
+import copy
+import logging
+import cv2
+from PIL import Image, ImageDraw
+import pickle
+import threading
+MUTEX = threading.Lock()
+
+from paddlers.models.ppdet.core.workspace import serializable
+from paddlers.models.ppdet.modeling import bbox_utils
+from ..reader import Compose
+
+from .op_helper import (satisfy_sample_constraint, filter_and_process,
+                        generate_sample_bbox, clip_bbox, data_anchor_sampling,
+                        satisfy_sample_constraint_coverage,
+                        crop_image_sampling, generate_sample_bbox_square,
+                        bbox_area_sampling, is_poly, get_border)
+
+from paddlers.models.ppdet.utils.logger import setup_logger
+from paddlers.models.ppdet.modeling.keypoint_utils import get_affine_transform, affine_transform
+logger = setup_logger(__name__)
+
+registered_ops = []
+
+
+def register_op(cls):
+    """Class decorator that registers an operator class.
+
+    The class name is appended to ``registered_ops`` and the class is
+    attached to ``BaseOperator`` as an attribute of the same name.
+
+    Raises:
+        KeyError: if ``BaseOperator`` already has an attribute of that name.
+
+    Returns:
+        The result of ``serializable(cls)``.
+    """
+    op_name = cls.__name__
+    registered_ops.append(op_name)
+    if hasattr(BaseOperator, op_name):
+        raise KeyError("The {} class has been registered.".format(op_name))
+    setattr(BaseOperator, op_name, cls)
+    return serializable(cls)
+
+
+class BboxError(ValueError):
+    """ValueError subclass reserved for bounding-box related failures."""
+
+
+class ImageError(ValueError):
+    """ValueError subclass raised for invalid image data."""
+
+
+class BaseOperator(object):
+    """Base class of all sample operators.
+
+    Subclasses override ``apply``; calling the operator dispatches to
+    ``apply`` for a single sample or for every element of a sequence.
+    """
+
+    def __init__(self, name=None):
+        # Default to the concrete class name; a short uuid suffix keeps the
+        # identifier of each instance distinct.
+        prefix = self.__class__.__name__ if name is None else name
+        self._id = prefix + '_' + str(uuid.uuid4())[-6:]
+
+    def apply(self, sample, context=None):
+        """ Process a sample.
+        Args:
+            sample (dict): a dict of sample, eg: {'image':xx, 'label': xxx}
+            context (dict): info about this sample processing
+        Returns:
+            result (dict): a processed sample
+        """
+        return sample
+
+    def __call__(self, sample, context=None):
+        """ Process a sample, or every sample of a sequence in place.
+        Args:
+            sample (dict|Sequence): one sample dict, eg:
+                {'image':xx, 'label': xxx}, or a sequence of them
+            context (dict): info about this sample processing
+        Returns:
+            result: the processed sample, or the same sequence object with
+                each element replaced by its processed version
+        """
+        if not isinstance(sample, Sequence):
+            return self.apply(sample, context)
+        for pos, item in enumerate(sample):
+            sample[pos] = self.apply(item, context)
+        return sample
+
+    def __str__(self):
+        return str(self._id)
+
+
+@register_op
+class Decode(BaseOperator):
+    def __init__(self):
+        """ Decode the encoded image bytes of a sample into an RGB numpy array
+        """
+        super(Decode, self).__init__()
+
+    def apply(self, sample, context=None):
+        """ Decode sample['image'], reading sample['im_file'] first when the
+        sample carries no 'image' entry yet.
+
+        Also sets 'h', 'w' (a warning is logged when existing values differ
+        from the decoded image), 'im_shape' and 'scale_factor', plus
+        'ori_image' (BGR) when sample['keep_ori_im'] is truthy.
+        """
+        if 'image' not in sample:
+            with open(sample['im_file'], 'rb') as f:
+                sample['image'] = f.read()
+            sample.pop('im_file')
+
+        encoded = np.frombuffer(sample['image'], dtype='uint8')
+        bgr = cv2.imdecode(encoded, 1)  # BGR mode, but need RGB mode
+        if sample.get('keep_ori_im', False):
+            sample['ori_image'] = bgr
+        rgb = cv2.cvtColor(bgr, cv2.COLOR_BGR2RGB)
+        sample['image'] = rgb
+
+        real_h = rgb.shape[0]
+        real_w = rgb.shape[1]
+
+        # Height: fill it in when missing, overwrite (with a warning) when
+        # the annotation disagrees with the decoded image.
+        stale_h = 'h' in sample and sample['h'] != real_h
+        if stale_h:
+            logger.warning(
+                "The actual image height: {} is not equal to the "
+                "height: {} in annotation, and update sample['h'] by actual "
+                "image height.".format(real_h, sample['h']))
+        if stale_h or 'h' not in sample:
+            sample['h'] = real_h
+
+        # Width: same policy as for the height.
+        stale_w = 'w' in sample and sample['w'] != real_w
+        if stale_w:
+            logger.warning(
+                "The actual image width: {} is not equal to the "
+                "width: {} in annotation, and update sample['w'] by actual "
+                "image width.".format(real_w, sample['w']))
+        if stale_w or 'w' not in sample:
+            sample['w'] = real_w
+
+        sample['im_shape'] = np.array(rgb.shape[:2], dtype=np.float32)
+        sample['scale_factor'] = np.array([1., 1.], dtype=np.float32)
+        return sample
+
+
+def _make_dirs(dirname):
+    """Create directory ``dirname`` unless it already exists.
+
+    Only the last path component is created; missing parents are not.
+    """
+    try:
+        import pathlib as path_module
+    except ImportError:
+        import pathlib2 as path_module
+    target = path_module.Path(dirname)
+    target.mkdir(exist_ok=True)
+
+
+@register_op
+class DecodeCache(BaseOperator):
+    def __init__(self, cache_root=None):
+        '''Decode images and optionally cache the decoded arrays on disk.
+
+        Args:
+            cache_root (str|None): directory for the pickled arrays; caching
+                is disabled when it is None.
+        '''
+        super(DecodeCache, self).__init__()
+
+        self.use_cache = cache_root is not None
+        self.cache_root = cache_root
+
+        if cache_root is not None:
+            _make_dirs(cache_root)
+
+    def apply(self, sample, context=None):
+        '''Load the image from the cache when present, otherwise decode it
+        (and store it in the cache). Sets 'image', 'h', 'w', 'im_shape' and
+        'scale_factor', and removes 'im_file' from the sample.
+        '''
+        cached_file = None
+        if self.use_cache:
+            cached_file = self.cache_path(self.cache_root, sample['im_file'])
+
+        if cached_file is not None and os.path.exists(cached_file):
+            im = self.load(cached_file)
+        else:
+            if 'image' not in sample:
+                with open(sample['im_file'], 'rb') as f:
+                    sample['image'] = f.read()
+
+            encoded = np.frombuffer(sample['image'], dtype='uint8')
+            im = cv2.imdecode(encoded, 1)  # BGR mode, but need RGB mode
+            if sample.get('keep_ori_im', False):
+                sample['ori_image'] = im
+            im = cv2.cvtColor(im, cv2.COLOR_BGR2RGB)
+
+            # Re-check existence: the file may have appeared since the first
+            # test above.
+            if cached_file is not None and not os.path.exists(cached_file):
+                self.dump(im, cached_file)
+
+        sample['image'] = im
+        sample['h'] = im.shape[0]
+        sample['w'] = im.shape[1]
+
+        sample['im_shape'] = np.array(im.shape[:2], dtype=np.float32)
+        sample['scale_factor'] = np.array([1., 1.], dtype=np.float32)
+
+        sample.pop('im_file')
+
+        return sample
+
+    @staticmethod
+    def cache_path(dir_oot, im_file):
+        '''Cache file for ``im_file``: its base name plus '.pkl' under
+        ``dir_oot``.'''
+        file_name = os.path.basename(im_file) + '.pkl'
+        return os.path.join(dir_oot, file_name)
+
+    @staticmethod
+    def load(path):
+        '''Unpickle and return the object stored at ``path``.'''
+        # NOTE: pickle.load can execute arbitrary code; the cache directory
+        # must only contain files written by this operator.
+        with open(path, 'rb') as f:
+            return pickle.load(f)
+
+    @staticmethod
+    def dump(obj, path):
+        '''Pickle ``obj`` to ``path`` while holding the module-level MUTEX;
+        failures are logged as warnings instead of being raised.'''
+        with MUTEX:
+            try:
+                with open(path, 'wb') as f:
+                    pickle.dump(obj, f)
+            except Exception as e:
+                logger.warning('dump {} occurs exception {}'.format(path,
+                                                                    str(e)))
+
+
+@register_op
+class SniperDecodeCrop(BaseOperator):
+    """Decode a sample's image and crop it to the sample's 'chip' box."""
+
+    def __init__(self):
+        super(SniperDecodeCrop, self).__init__()
+
+    def __call__(self, sample, context=None):
+        """Decode sample['image'] (reading sample['im_file'] when needed),
+        crop it to sample['chip'] = (x1, y1, x2, y2) clamped to the image,
+        and refresh 'h', 'w', 'im_shape' and 'scale_factor'.
+        """
+        if 'image' not in sample:
+            with open(sample['im_file'], 'rb') as f:
+                sample['image'] = f.read()
+            sample.pop('im_file')
+
+        encoded = np.frombuffer(sample['image'], dtype='uint8')
+        # BGR mode, but need RGB mode
+        decoded = cv2.imdecode(encoded, cv2.IMREAD_COLOR)
+        if sample.get('keep_ori_im', False):
+            sample['ori_image'] = decoded
+        decoded = cv2.cvtColor(decoded, cv2.COLOR_BGR2RGB)
+
+        x1, y1, x2, y2 = [int(coord) for coord in sample['chip']]
+        # Clamp the chip to the image bounds before slicing.
+        row_start = max(y1, 0)
+        row_stop = min(y2, decoded.shape[0])
+        col_start = max(x1, 0)
+        col_stop = min(x2, decoded.shape[1])
+        im = decoded[row_start:row_stop, col_start:col_stop, :]
+
+        sample['image'] = im
+        # sample['im_info'] = [h, w, 1.0]
+        sample['h'] = im.shape[0]
+        sample['w'] = im.shape[1]
+
+        sample['im_shape'] = np.array(im.shape[:2], dtype=np.float32)
+        sample['scale_factor'] = np.array([1., 1.], dtype=np.float32)
+        return sample
+
+
+@register_op
+class Permute(BaseOperator):
+    def __init__(self):
+        """
+        Reorder the image axes from (H, W, C) to (C, H, W)
+        """
+        super(Permute, self).__init__()
+
+    def apply(self, sample, context=None):
+        """Replace sample['image'] by its (2, 0, 1) axis transpose."""
+        sample['image'] = sample['image'].transpose((2, 0, 1))
+        return sample
+
+
+@register_op
+class Lighting(BaseOperator):
+    """
+    Add a random offset, built from eigenvalues and eigenvectors, to the image
+    Args:
+        eigval (list): eigenvalues
+        eigvec (list): eigenvectors
+        alphastd (float): standard deviation of the random weights,
+            0.1 by default
+    """
+
+    def __init__(self, eigval, eigvec, alphastd=0.1):
+        super(Lighting, self).__init__()
+        self.alphastd = alphastd
+        self.eigval = np.array(eigval).astype('float32')
+        self.eigvec = np.array(eigvec).astype('float32')
+
+    def apply(self, sample, context=None):
+        # One normally distributed weight per eigenvalue.
+        weights = np.random.normal(scale=self.alphastd, size=(3, ))
+        offset = np.dot(self.eigvec, self.eigval * weights)
+        # In-place add: the image array stored in the sample is modified.
+        sample['image'] += offset
+        return sample
+
+
+@register_op
+class RandomErasingImage(BaseOperator):
+    def __init__(self, prob=0.5, lower=0.02, higher=0.4, aspect_ratio=0.3):
+        """
+        Random Erasing Data Augmentation, see https://arxiv.org/abs/1708.04896
+        Args:
+            prob (float): probability to carry out random erasing
+            lower (float): lower limit of the erasing area ratio
+            higher (float): upper limit of the erasing area ratio
+            aspect_ratio (float): aspect ratio of the erasing region
+        """
+        super(RandomErasingImage, self).__init__()
+        self.prob = prob
+        self.lower = lower
+        self.higher = higher
+        self.aspect_ratio = aspect_ratio
+
+    def apply(self, sample, context=None):
+        """Zero out a random rectangle inside each ground-truth box.
+
+        Args:
+            sample (dict): must hold 'image' (3-dimensional numpy array) and
+                'gt_bbox' (array whose rows are x1, y1, x2, y2).
+            context (dict): unused; accepted because BaseOperator.__call__
+                always passes it to apply().
+        Returns:
+            result (dict): the sample, with 'image' erased in place.
+        Raises:
+            TypeError: if the image is not a numpy array.
+            ImageError: if the image is not 3-dimensional.
+        """
+        gt_bbox = sample['gt_bbox']
+        im = sample['image']
+        if not isinstance(im, np.ndarray):
+            raise TypeError("{}: image is not a numpy array.".format(self))
+        if len(im.shape) != 3:
+            raise ImageError("{}: image is not 3-dimensional.".format(self))
+
+        for idx in range(gt_bbox.shape[0]):
+            # Each box is erased independently with probability self.prob.
+            if self.prob <= np.random.rand():
+                continue
+
+            x1, y1, x2, y2 = gt_bbox[idx, :]
+            w_bbox = x2 - x1
+            h_bbox = y2 - y1
+            area = w_bbox * h_bbox
+
+            target_area = random.uniform(self.lower, self.higher) * area
+            aspect_ratio = random.uniform(self.aspect_ratio,
+                                          1 / self.aspect_ratio)
+
+            h = int(round(math.sqrt(target_area * aspect_ratio)))
+            w = int(round(math.sqrt(target_area / aspect_ratio)))
+
+            # Only erase when the rectangle fits strictly inside the box.
+            if w < w_bbox and h < h_bbox:
+                off_y1 = random.randint(0, int(h_bbox - h))
+                off_x1 = random.randint(0, int(w_bbox - w))
+                im[int(y1 + off_y1):int(y1 + off_y1 + h), int(x1 + off_x1):int(
+                    x1 + off_x1 + w), :] = 0
+        sample['image'] = im
+        return sample
+
+
+@register_op
+class NormalizeImage(BaseOperator):
+    def __init__(self,
+                 mean=[0.485, 0.456, 0.406],
+                 std=[1, 1, 1],
+                 is_scale=True):
+        """
+        Args:
+            mean (list): per-channel value subtracted from the image
+            std (list): per-channel value the image is divided by
+            is_scale (bool): whether to divide the image by 255 first
+        Raises:
+            TypeError: if mean/std are not lists or is_scale is not a bool
+            ValueError: if the product of the std entries is 0
+        """
+        super(NormalizeImage, self).__init__()
+        self.mean = mean
+        self.std = std
+        self.is_scale = is_scale
+        types_ok = (isinstance(self.mean, list) and
+                    isinstance(self.std, list) and
+                    isinstance(self.is_scale, bool))
+        if not types_ok:
+            raise TypeError("{}: input type is invalid.".format(self))
+        from functools import reduce
+        std_product = reduce(lambda x, y: x * y, self.std)
+        if std_product == 0:
+            raise ValueError('{}: std is invalid!'.format(self))
+
+    def apply(self, sample, context=None):
+        """Normalize the image.
+        Steps:
+            1.(optional) Scale the image to [0,1]
+            2. Subtract mean from each pixel, then divide it by std
+        """
+        pixels = sample['image'].astype(np.float32, copy=False)
+        # Add two leading axes so the per-channel values broadcast over the
+        # first two image axes.
+        mean = np.array(self.mean)[np.newaxis, np.newaxis, :]
+        std = np.array(self.std)[np.newaxis, np.newaxis, :]
+
+        if self.is_scale:
+            pixels = pixels / 255.0
+
+        pixels -= mean
+        pixels /= std
+
+        sample['image'] = pixels
+        return sample
+
+
+@register_op
+class GridMask(BaseOperator):
+    def __init__(self,
+                 use_h=True,
+                 use_w=True,
+                 rotate=1,
+                 offset=False,
+                 ratio=0.5,
+                 mode=1,
+                 prob=0.7,
+                 upper_iter=360000):
+        """
+        GridMask Data Augmentation, see https://arxiv.org/abs/2001.04086
+        Args:
+            use_h (bool): whether to mask vertically
+            use_w (bool): whether to mask horizontally
+            rotate (float): angle for the mask to rotate
+            offset (float): mask offset
+            ratio (float): mask ratio
+            mode (int): gridmask mode
+            prob (float): max probability to carry out gridmask
+            upper_iter (int): suggested to be equal to global max_iter
+        """
+        super(GridMask, self).__init__()
+        self.use_h = use_h
+        self.use_w = use_w
+        self.rotate = rotate
+        self.offset = offset
+        self.ratio = ratio
+        self.mode = mode
+        self.prob = prob
+        self.upper_iter = upper_iter
+
+        # Imported here rather than at module level, as in the original.
+        from .gridmask_utils import Gridmask
+        mask_options = dict(
+            rotate=rotate,
+            offset=offset,
+            ratio=ratio,
+            mode=mode,
+            prob=prob,
+            upper_iter=upper_iter)
+        self.gridmask_op = Gridmask(use_h, use_w, **mask_options)
+
+    def apply(self, sample, context=None):
+        """Run the gridmask op on sample['image'], passing the current
+        iteration taken from sample['curr_iter']."""
+        image = sample['image']
+        curr_iter = sample['curr_iter']
+        sample['image'] = self.gridmask_op(image, curr_iter)
+        return sample
+
+
+@register_op
+class RandomDistort(BaseOperator):
+    """Random color distortion.
+    Args:
+        hue (list): hue settings. in [lower, upper, probability] format.
+        saturation (list): saturation settings. in [lower, upper, probability] format.
+        contrast (list): contrast settings. in [lower, upper, probability] format.
+        brightness (list): brightness settings. in [lower, upper, probability] format.
+        random_apply (bool): whether to apply in random (yolo) or fixed (SSD)
+            order.
+        count (int): the number of doing distrot
+        random_channel (bool): whether to swap channels randomly
+    """
+
+    def __init__(self,
+                 hue=[-18, 18, 0.5],
+                 saturation=[0.5, 1.5, 0.5],
+                 contrast=[0.5, 1.5, 0.5],
+                 brightness=[0.5, 1.5, 0.5],
+                 random_apply=True,
+                 count=4,
+                 random_channel=False):
+        super(RandomDistort, self).__init__()
+        self.hue = hue
+        self.saturation = saturation
+        self.contrast = contrast
+        self.brightness = brightness
+        self.random_apply = random_apply
+        self.count = count
+        self.random_channel = random_channel
+
+    def apply_hue(self, img):
+        low, high, prob = self.hue
+        if np.random.uniform(0., 1.) < prob:
+            return img
+
+        img = img.astype(np.float32)
+        # it works, but result differ from HSV version
+        delta = np.random.uniform(low, high)
+        u = np.cos(delta * np.pi)
+        w = np.sin(delta * np.pi)
+        bt = np.array([[1.0, 0.0, 0.0], [0.0, u, -w], [0.0, w, u]])
+        tyiq = np.array([[0.299, 0.587, 0.114], [0.596, -0.274, -0.321],
+                         [0.211, -0.523, 0.311]])
+        ityiq = np.array([[1.0, 0.956, 0.621], [1.0, -0.272, -0.647],
+                          [1.0, -1.107, 1.705]])
+        t = np.dot(np.dot(ityiq, bt), tyiq).T
+        img = np.dot(img, t)
+        return img
+
+    def apply_saturation(self, img):
+        low, high, prob = self.saturation
+        if np.random.uniform(0., 1.) < prob:
+            return img
+        delta = np.random.uniform(low, high)
+        img = img.astype(np.float32)
+        # it works, but result differ from HSV version
+        gray = img * np.array([[[0.299, 0.587, 0.114]]], dtype=np.float32)
+        gray = gray.sum(axis=2, keepdims=True)
+        gray *= (1.0 - delta)
+        img *= delta
+        img += gray
+        return img
+
+    def apply_contrast(self, img):
+        low, high, prob = self.contrast
+        if np.random.uniform(0., 1.) < prob:
+            return img
+        delta = np.random.uniform(low, high)
+        img = img.astype(np.float32)
+        img *= delta
+        return img
+
+    def apply_brightness(self, img):
+        low, high, prob = self.brightness
+        if np.random.uniform(0., 1.) < prob:
+            return img
+        delta = np.random.uniform(low, high)
+        img = img.astype(np.float32)
+        img += delta
+        return img
+
+    def apply(self, sample, context=None):
+        img = sample['image']
+        if self.random_apply:
+            functions = [
+                self.apply_brightness, self.apply_contrast,
+                self.apply_saturation, self.apply_hue
+            ]
+            distortions = np.random.permutation(functions)[:self.count]
+            for func in distortions:
+                img = func(img)
+            sample['image'] = img
+            return sample
+
+        img = self.apply_brightness(img)
+        mode = np.random.randint(0, 2)
+
+        if mode:
+            img = self.apply_contrast(img)
+
+        img = self.apply_saturation(img)
+        img = self.apply_hue(img)
+
+        if not mode:
+            img = self.apply_contrast(img)
+
+        if self.random_channel:
+            if np.random.randint(0, 2):
+                img = img[..., np.random.permutation(3)]
+        sample['image'] = img
+        return sample
+
+
+@register_op
+class AutoAugment(BaseOperator):
+    def __init__(self, autoaug_type="v1"):
+        """
+        Args:
+            autoaug_type (str): autoaug type, support v0, v1, v2, v3, test
+        """
+        super(AutoAugment, self).__init__()
+        self.autoaug_type = autoaug_type
+
+    def apply(self, sample, context=None):
+        """
+        Learning Data Augmentation Strategies for Object Detection, see https://arxiv.org/abs/1906.11172
+        """
+        im = sample['image']
+        gt_bbox = sample['gt_bbox']
+        if not isinstance(im, np.ndarray):
+            raise TypeError("{}: image is not a numpy array.".format(self))
+        if len(im.shape) != 3:
+            raise ImageError("{}: image is not 3-dimensional.".format(self))
+        if len(gt_bbox) == 0:
+            return sample
+
+        height, width, _ = im.shape
+        norm_gt_bbox = np.ones_like(gt_bbox, dtype=np.float32)
+        norm_gt_bbox[:, 0] = gt_bbox[:, 1] / float(height)
+        norm_gt_bbox[:, 1] = gt_bbox[:, 0] / float(width)
+        norm_gt_bbox[:, 2] = gt_bbox[:, 3] / float(height)
+        norm_gt_bbox[:, 3] = gt_bbox[:, 2] / float(width)
+
+        from .autoaugment_utils import distort_image_with_autoaugment
+        im, norm_gt_bbox = distort_image_with_autoaugment(im, norm_gt_bbox,
+                                                          self.autoaug_type)
+
+        gt_bbox[:, 0] = norm_gt_bbox[:, 1] * float(width)
+        gt_bbox[:, 1] = norm_gt_bbox[:, 0] * float(height)
+        gt_bbox[:, 2] = norm_gt_bbox[:, 3] * float(width)
+        gt_bbox[:, 3] = norm_gt_bbox[:, 2] * float(height)
+
+        sample['image'] = im
+        sample['gt_bbox'] = gt_bbox
+        return sample
+
+
+@register_op
+class RandomFlip(BaseOperator):
+    def __init__(self, prob=0.5):
+        """
+        Args:
+            prob (float): the probability of flipping image
+        """
+        super(RandomFlip, self).__init__()
+        self.prob = prob
+        if not (isinstance(self.prob, float)):
+            raise TypeError("{}: input type is invalid.".format(self))
+
+    def apply_segm(self, segms, height, width):
+        def _flip_poly(poly, width):
+            flipped_poly = np.array(poly)
+            flipped_poly[0::2] = width - np.array(poly[0::2])
+            return flipped_poly.tolist()
+
+        def _flip_rle(rle, height, width):
+            if 'counts' in rle and type(rle['counts']) == list:
+                rle = mask_util.frPyObjects(rle, height, width)
+            mask = mask_util.decode(rle)
+            mask = mask[:, ::-1]
+            rle = mask_util.encode(np.array(mask, order='F', dtype=np.uint8))
+            return rle
+
+        flipped_segms = []
+        for segm in segms:
+            if is_poly(segm):
+                # Polygon format
+                flipped_segms.append(
+                    [_flip_poly(poly, width) for poly in segm])
+            else:
+                # RLE format
+                import pycocotools.mask as mask_util
+                flipped_segms.append(_flip_rle(segm, height, width))
+        return flipped_segms
+
+    def apply_keypoint(self, gt_keypoint, width):
+        for i in range(gt_keypoint.shape[1]):
+            if i % 2 == 0:
+                old_x = gt_keypoint[:, i].copy()
+                gt_keypoint[:, i] = width - old_x
+        return gt_keypoint
+
+    def apply_image(self, image):
+        return image[:, ::-1, :]
+
+    def apply_bbox(self, bbox, width):
+        oldx1 = bbox[:, 0].copy()
+        oldx2 = bbox[:, 2].copy()
+        bbox[:, 0] = width - oldx2
+        bbox[:, 2] = width - oldx1
+        return bbox
+
+    def apply_rbox(self, bbox, width):
+        oldx1 = bbox[:, 0].copy()
+        oldx2 = bbox[:, 2].copy()
+        oldx3 = bbox[:, 4].copy()
+        oldx4 = bbox[:, 6].copy()
+        bbox[:, 0] = width - oldx1
+        bbox[:, 2] = width - oldx2
+        bbox[:, 4] = width - oldx3
+        bbox[:, 6] = width - oldx4
+        bbox = [bbox_utils.get_best_begin_point_single(e) for e in bbox]
+        return bbox
+
+    def apply(self, sample, context=None):
+        """Filp the image and bounding box.
+        Operators:
+            1. Flip the image numpy.
+            2. Transform the bboxes' x coordinates.
+              (Must judge whether the coordinates are normalized!)
+            3. Transform the segmentations' x coordinates.
+              (Must judge whether the coordinates are normalized!)
+        Output:
+            sample: the image, bounding box and segmentation part
+                    in sample are flipped.
+        """
+        if np.random.uniform(0, 1) < self.prob:
+            im = sample['image']
+            height, width = im.shape[:2]
+            im = self.apply_image(im)
+            if 'gt_bbox' in sample and len(sample['gt_bbox']) > 0:
+                sample['gt_bbox'] = self.apply_bbox(sample['gt_bbox'], width)
+            if 'gt_poly' in sample and len(sample['gt_poly']) > 0:
+                sample['gt_poly'] = self.apply_segm(sample['gt_poly'], height,
+                                                    width)
+            if 'gt_keypoint' in sample and len(sample['gt_keypoint']) > 0:
+                sample['gt_keypoint'] = self.apply_keypoint(
+                    sample['gt_keypoint'], width)
+
+            if 'semantic' in sample and sample['semantic']:
+                sample['semantic'] = sample['semantic'][:, ::-1]
+
+            if 'gt_segm' in sample and sample['gt_segm'].any():
+                sample['gt_segm'] = sample['gt_segm'][:, :, ::-1]
+
+            if 'gt_rbox2poly' in sample and sample['gt_rbox2poly'].any():
+                sample['gt_rbox2poly'] = self.apply_rbox(
+                    sample['gt_rbox2poly'], width)
+
+            sample['flipped'] = True
+            sample['image'] = im
+        return sample
+
+
+@register_op
+class Resize(BaseOperator):
+    def __init__(self, target_size, keep_ratio, interp=cv2.INTER_LINEAR):
+        """
+        Resize image to target size. if keep_ratio is True,
+        resize the image's long side to the maximum of target_size
+        if keep_ratio is False, resize the image to target size(h, w)
+        Args:
+            target_size (int|list): image target size
+            keep_ratio (bool): whether keep_ratio or not, default true
+            interp (int): the interpolation method
+        """
+        super(Resize, self).__init__()
+        self.keep_ratio = keep_ratio
+        self.interp = interp
+        if not isinstance(target_size, (Integral, Sequence)):
+            raise TypeError(
+                "Type of target_size is invalid. Must be Integer or List or Tuple, now is {}".
+                format(type(target_size)))
+        if isinstance(target_size, Integral):
+            target_size = [target_size, target_size]
+        self.target_size = target_size
+
+    def apply_image(self, image, scale):
+        im_scale_x, im_scale_y = scale
+
+        return cv2.resize(
+            image,
+            None,
+            None,
+            fx=im_scale_x,
+            fy=im_scale_y,
+            interpolation=self.interp)
+
+    def apply_bbox(self, bbox, scale, size):
+        im_scale_x, im_scale_y = scale
+        resize_w, resize_h = size
+        bbox[:, 0::2] *= im_scale_x
+        bbox[:, 1::2] *= im_scale_y
+        bbox[:, 0::2] = np.clip(bbox[:, 0::2], 0, resize_w)
+        bbox[:, 1::2] = np.clip(bbox[:, 1::2], 0, resize_h)
+        return bbox
+
+    def apply_segm(self, segms, im_size, scale):
+        def _resize_poly(poly, im_scale_x, im_scale_y):
+            resized_poly = np.array(poly).astype('float32')
+            resized_poly[0::2] *= im_scale_x
+            resized_poly[1::2] *= im_scale_y
+            return resized_poly.tolist()
+
+        def _resize_rle(rle, im_h, im_w, im_scale_x, im_scale_y):
+            if 'counts' in rle and type(rle['counts']) == list:
+                rle = mask_util.frPyObjects(rle, im_h, im_w)
+
+            mask = mask_util.decode(rle)
+            mask = cv2.resize(
+                mask,
+                None,
+                None,
+                fx=im_scale_x,
+                fy=im_scale_y,
+                interpolation=self.interp)
+            rle = mask_util.encode(np.array(mask, order='F', dtype=np.uint8))
+            return rle
+
+        im_h, im_w = im_size
+        im_scale_x, im_scale_y = scale
+        resized_segms = []
+        for segm in segms:
+            if is_poly(segm):
+                # Polygon format
+                resized_segms.append([
+                    _resize_poly(poly, im_scale_x, im_scale_y) for poly in segm
+                ])
+            else:
+                # RLE format
+                import pycocotools.mask as mask_util
+                resized_segms.append(
+                    _resize_rle(segm, im_h, im_w, im_scale_x, im_scale_y))
+
+        return resized_segms
+
+    def apply(self, sample, context=None):
+        """ Resize the image numpy.
+        """
+        im = sample['image']
+        if not isinstance(im, np.ndarray):
+            raise TypeError("{}: image type is not numpy.".format(self))
+        if len(im.shape) != 3:
+            raise ImageError('{}: image is not 3-dimensional.'.format(self))
+
+        # apply image
+        im_shape = im.shape
+        if self.keep_ratio:
+
+            im_size_min = np.min(im_shape[0:2])
+            im_size_max = np.max(im_shape[0:2])
+
+            target_size_min = np.min(self.target_size)
+            target_size_max = np.max(self.target_size)
+
+            im_scale = min(target_size_min / im_size_min,
+                           target_size_max / im_size_max)
+
+            resize_h = im_scale * float(im_shape[0])
+            resize_w = im_scale * float(im_shape[1])
+
+            im_scale_x = im_scale
+            im_scale_y = im_scale
+        else:
+            resize_h, resize_w = self.target_size
+            im_scale_y = resize_h / im_shape[0]
+            im_scale_x = resize_w / im_shape[1]
+
+        im = self.apply_image(sample['image'], [im_scale_x, im_scale_y])
+        sample['image'] = im
+        sample['im_shape'] = np.asarray([resize_h, resize_w], dtype=np.float32)
+        if 'scale_factor' in sample:
+            scale_factor = sample['scale_factor']
+            sample['scale_factor'] = np.asarray(
+                [scale_factor[0] * im_scale_y, scale_factor[1] * im_scale_x],
+                dtype=np.float32)
+        else:
+            sample['scale_factor'] = np.asarray(
+                [im_scale_y, im_scale_x], dtype=np.float32)
+
+        # apply bbox
+        if 'gt_bbox' in sample and len(sample['gt_bbox']) > 0:
+            sample['gt_bbox'] = self.apply_bbox(sample['gt_bbox'],
+                                                [im_scale_x, im_scale_y],
+                                                [resize_w, resize_h])
+
+        # apply rbox
+        if 'gt_rbox2poly' in sample:
+            if np.array(sample['gt_rbox2poly']).shape[1] != 8:
+                logger.warning(
+                    "gt_rbox2poly's length shoule be 8, but actually is {}".
+                    format(len(sample['gt_rbox2poly'])))
+            sample['gt_rbox2poly'] = self.apply_bbox(sample['gt_rbox2poly'],
+                                                     [im_scale_x, im_scale_y],
+                                                     [resize_w, resize_h])
+
+        # apply polygon
+        if 'gt_poly' in sample and len(sample['gt_poly']) > 0:
+            sample['gt_poly'] = self.apply_segm(
+                sample['gt_poly'], im_shape[:2], [im_scale_x, im_scale_y])
+
+        # apply semantic
+        if 'semantic' in sample and sample['semantic']:
+            semantic = sample['semantic']
+            semantic = cv2.resize(
+                semantic.astype('float32'),
+                None,
+                None,
+                fx=im_scale_x,
+                fy=im_scale_y,
+                interpolation=self.interp)
+            semantic = np.asarray(semantic).astype('int32')
+            semantic = np.expand_dims(semantic, 0)
+            sample['semantic'] = semantic
+
+        # apply gt_segm
+        if 'gt_segm' in sample and len(sample['gt_segm']) > 0:
+            masks = [
+                cv2.resize(
+                    gt_segm,
+                    None,
+                    None,
+                    fx=im_scale_x,
+                    fy=im_scale_y,
+                    interpolation=cv2.INTER_NEAREST)
+                for gt_segm in sample['gt_segm']
+            ]
+            sample['gt_segm'] = np.asarray(masks).astype(np.uint8)
+
+        return sample
+
+
+@register_op
+class MultiscaleTestResize(BaseOperator):
+    def __init__(self,
+                 origin_target_size=[800, 1333],
+                 target_size=[],
+                 interp=cv2.INTER_LINEAR,
+                 use_flip=True):
+        """
+        Rescale image to the each size in target size, and capped at max_size.
+        Args:
+            origin_target_size (list): origin target size of image
+            target_size (list): A list of target sizes of image.
+            interp (int): the interpolation method.
+            use_flip (bool): whether use flip augmentation.
+        """
+        super(MultiscaleTestResize, self).__init__()
+        self.interp = interp
+        self.use_flip = use_flip
+
+        if not isinstance(target_size, Sequence):
+            raise TypeError(
+                "Type of target_size is invalid. Must be List or Tuple, now is {}".
+                format(type(target_size)))
+        self.target_size = target_size
+
+        if not isinstance(origin_target_size, Sequence):
+            raise TypeError(
+                "Type of origin_target_size is invalid. Must be List or Tuple, now is {}".
+                format(type(origin_target_size)))
+
+        self.origin_target_size = origin_target_size
+
+    def apply(self, sample, context=None):
+        """ Resize the image numpy for multi-scale test.
+        """
+        samples = []
+        resizer = Resize(
+            self.origin_target_size, keep_ratio=True, interp=self.interp)
+        samples.append(resizer(sample.copy(), context))
+        if self.use_flip:
+            flipper = RandomFlip(1.1)
+            samples.append(flipper(sample.copy(), context=context))
+
+        for size in self.target_size:
+            resizer = Resize(size, keep_ratio=True, interp=self.interp)
+            samples.append(resizer(sample.copy(), context))
+
+        return samples
+
+
+@register_op
+class RandomResize(BaseOperator):
+    def __init__(self,
+                 target_size,
+                 keep_ratio=True,
+                 interp=cv2.INTER_LINEAR,
+                 random_size=True,
+                 random_interp=False):
+        """
+        Resize image to target size randomly. random target_size and interpolation method
+        Args:
+            target_size (int, list, tuple): image target size, if random size is True, must be list or tuple
+            keep_ratio (bool): whether keep_raio or not, default true
+            interp (int): the interpolation method
+            random_size (bool): whether random select target size of image
+            random_interp (bool): whether random select interpolation method
+        """
+        super(RandomResize, self).__init__()
+        self.keep_ratio = keep_ratio
+        self.interp = interp
+        self.interps = [
+            cv2.INTER_NEAREST,
+            cv2.INTER_LINEAR,
+            cv2.INTER_AREA,
+            cv2.INTER_CUBIC,
+            cv2.INTER_LANCZOS4,
+        ]
+        assert isinstance(target_size, (
+            Integral, Sequence)), "target_size must be Integer, List or Tuple"
+        if random_size and not isinstance(target_size, Sequence):
+            raise TypeError(
+                "Type of target_size is invalid when random_size is True. Must be List or Tuple, now is {}".
+                format(type(target_size)))
+        self.target_size = target_size
+        self.random_size = random_size
+        self.random_interp = random_interp
+
+    def apply(self, sample, context=None):
+        """ Resize the image numpy.
+        """
+        if self.random_size:
+            target_size = random.choice(self.target_size)
+        else:
+            target_size = self.target_size
+
+        if self.random_interp:
+            interp = random.choice(self.interps)
+        else:
+            interp = self.interp
+
+        resizer = Resize(target_size, self.keep_ratio, interp)
+        return resizer(sample, context=context)
+
+
+@register_op
+class RandomExpand(BaseOperator):
+    """Random expand the canvas.
+    Args:
+        ratio (float): maximum expansion ratio.
+        prob (float): probability to expand.
+        fill_value (list): color value used to fill the canvas. in RGB order.
+    """
+
+    def __init__(self, ratio=4., prob=0.5, fill_value=(127.5, 127.5, 127.5)):
+        super(RandomExpand, self).__init__()
+        assert ratio > 1.01, "expand ratio must be larger than 1.01"
+        self.ratio = ratio
+        self.prob = prob
+        assert isinstance(fill_value, (Number, Sequence)), \
+            "fill value must be either float or sequence"
+        if isinstance(fill_value, Number):
+            fill_value = (fill_value, ) * 3
+        if not isinstance(fill_value, tuple):
+            fill_value = tuple(fill_value)
+        self.fill_value = fill_value
+
+    def apply(self, sample, context=None):
+        if np.random.uniform(0., 1.) < self.prob:
+            return sample
+
+        im = sample['image']
+        height, width = im.shape[:2]
+        ratio = np.random.uniform(1., self.ratio)
+        h = int(height * ratio)
+        w = int(width * ratio)
+        if not h > height or not w > width:
+            return sample
+        y = np.random.randint(0, h - height)
+        x = np.random.randint(0, w - width)
+        offsets, size = [x, y], [h, w]
+
+        pad = Pad(size,
+                  pad_mode=-1,
+                  offsets=offsets,
+                  fill_value=self.fill_value)
+
+        return pad(sample, context=context)
+
+
+@register_op
+class CropWithSampling(BaseOperator):
+    def __init__(self, batch_sampler, satisfy_all=False, avoid_no_bbox=True):
+        """
+        Args:
+            batch_sampler (list): Multiple sets of different
+                                  parameters for cropping.
+            satisfy_all (bool): whether all boxes must satisfy.
+            e.g.[[1, 1, 1.0, 1.0, 1.0, 1.0, 0.0, 1.0],
+                 [1, 50, 0.3, 1.0, 0.5, 2.0, 0.1, 1.0],
+                 [1, 50, 0.3, 1.0, 0.5, 2.0, 0.3, 1.0],
+                 [1, 50, 0.3, 1.0, 0.5, 2.0, 0.5, 1.0],
+                 [1, 50, 0.3, 1.0, 0.5, 2.0, 0.7, 1.0],
+                 [1, 50, 0.3, 1.0, 0.5, 2.0, 0.9, 1.0],
+                 [1, 50, 0.3, 1.0, 0.5, 2.0, 0.0, 1.0]]
+           [max sample, max trial, min scale, max scale,
+            min aspect ratio, max aspect ratio,
+            min overlap, max overlap]
+            avoid_no_bbox (bool): whether to to avoid the
+                                  situation where the box does not appear.
+        """
+        super(CropWithSampling, self).__init__()
+        self.batch_sampler = batch_sampler
+        self.satisfy_all = satisfy_all
+        self.avoid_no_bbox = avoid_no_bbox
+
+    def apply(self, sample, context):
+        """
+        Crop the image and modify bounding box.
+        Operators:
+            1. Scale the image width and height.
+            2. Crop the image according to a radom sample.
+            3. Rescale the bounding box.
+            4. Determine if the new bbox is satisfied in the new image.
+        Returns:
+            sample: the image, bounding box are replaced.
+        """
+        assert 'image' in sample, "image data not found"
+        im = sample['image']
+        gt_bbox = sample['gt_bbox']
+        gt_class = sample['gt_class']
+        im_height, im_width = im.shape[:2]
+        gt_score = None
+        if 'gt_score' in sample:
+            gt_score = sample['gt_score']
+        sampled_bbox = []
+        gt_bbox = gt_bbox.tolist()
+        for sampler in self.batch_sampler:
+            found = 0
+            for i in range(sampler[1]):
+                if found >= sampler[0]:
+                    break
+                sample_bbox = generate_sample_bbox(sampler)
+                if satisfy_sample_constraint(sampler, sample_bbox, gt_bbox,
+                                             self.satisfy_all):
+                    sampled_bbox.append(sample_bbox)
+                    found = found + 1
+        im = np.array(im)
+        while sampled_bbox:
+            idx = int(np.random.uniform(0, len(sampled_bbox)))
+            sample_bbox = sampled_bbox.pop(idx)
+            sample_bbox = clip_bbox(sample_bbox)
+            crop_bbox, crop_class, crop_score = \
+                filter_and_process(sample_bbox, gt_bbox, gt_class, scores=gt_score)
+            if self.avoid_no_bbox:
+                if len(crop_bbox) < 1:
+                    continue
+            xmin = int(sample_bbox[0] * im_width)
+            xmax = int(sample_bbox[2] * im_width)
+            ymin = int(sample_bbox[1] * im_height)
+            ymax = int(sample_bbox[3] * im_height)
+            im = im[ymin:ymax, xmin:xmax]
+            sample['image'] = im
+            sample['gt_bbox'] = crop_bbox
+            sample['gt_class'] = crop_class
+            sample['gt_score'] = crop_score
+            return sample
+        return sample
+
+
+@register_op
+class CropWithDataAchorSampling(BaseOperator):
+    def __init__(self,
+                 batch_sampler,
+                 anchor_sampler=None,
+                 target_size=None,
+                 das_anchor_scales=[16, 32, 64, 128],
+                 sampling_prob=0.5,
+                 min_size=8.,
+                 avoid_no_bbox=True):
+        """
+        Args:
+            anchor_sampler (list): anchor_sampling sets of different
+                                  parameters for cropping.
+            batch_sampler (list): Multiple sets of different
+                                  parameters for cropping.
+              e.g.[[1, 10, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.2, 0.0]]
+                  [[1, 50, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 1.0, 0.0],
+                   [1, 50, 0.3, 1.0, 1.0, 1.0, 0.0, 0.0, 1.0, 0.0],
+                   [1, 50, 0.3, 1.0, 1.0, 1.0, 0.0, 0.0, 1.0, 0.0],
+                   [1, 50, 0.3, 1.0, 1.0, 1.0, 0.0, 0.0, 1.0, 0.0],
+                   [1, 50, 0.3, 1.0, 1.0, 1.0, 0.0, 0.0, 1.0, 0.0]]
+              [max sample, max trial, min scale, max scale,
+               min aspect ratio, max aspect ratio,
+               min overlap, max overlap, min coverage, max coverage]
+            target_size (int): target image size.
+            das_anchor_scales (list[float]): a list of anchor scales in data
+                anchor smapling.
+            min_size (float): minimum size of sampled bbox.
+            avoid_no_bbox (bool): whether to to avoid the
+                                  situation where the box does not appear.
+        """
+        super(CropWithDataAchorSampling, self).__init__()
+        self.anchor_sampler = anchor_sampler
+        self.batch_sampler = batch_sampler
+        self.target_size = target_size
+        self.sampling_prob = sampling_prob
+        self.min_size = min_size
+        self.avoid_no_bbox = avoid_no_bbox
+        self.das_anchor_scales = np.array(das_anchor_scales)
+
+    def apply(self, sample, context):
+        """
+        Crop the image and modify bounding box.
+        Operators:
+            1. Scale the image width and height.
+            2. Crop the image according to a radom sample.
+            3. Rescale the bounding box.
+            4. Determine if the new bbox is satisfied in the new image.
+        Returns:
+            sample: the image, bounding box are replaced.
+        """
+        assert 'image' in sample, "image data not found"
+        im = sample['image']
+        gt_bbox = sample['gt_bbox']
+        gt_class = sample['gt_class']
+        image_height, image_width = im.shape[:2]
+        gt_bbox[:, 0] /= image_width
+        gt_bbox[:, 1] /= image_height
+        gt_bbox[:, 2] /= image_width
+        gt_bbox[:, 3] /= image_height
+        gt_score = None
+        if 'gt_score' in sample:
+            gt_score = sample['gt_score']
+        sampled_bbox = []
+        gt_bbox = gt_bbox.tolist()
+
+        prob = np.random.uniform(0., 1.)
+        if prob > self.sampling_prob:  # anchor sampling
+            assert self.anchor_sampler
+            for sampler in self.anchor_sampler:
+                found = 0
+                for i in range(sampler[1]):
+                    if found >= sampler[0]:
+                        break
+                    sample_bbox = data_anchor_sampling(
+                        gt_bbox, image_width, image_height,
+                        self.das_anchor_scales, self.target_size)
+                    if sample_bbox == 0:
+                        break
+                    if satisfy_sample_constraint_coverage(sampler, sample_bbox,
+                                                          gt_bbox):
+                        sampled_bbox.append(sample_bbox)
+                        found = found + 1
+            im = np.array(im)
+            while sampled_bbox:
+                idx = int(np.random.uniform(0, len(sampled_bbox)))
+                sample_bbox = sampled_bbox.pop(idx)
+
+                if 'gt_keypoint' in sample.keys():
+                    keypoints = (sample['gt_keypoint'],
+                                 sample['keypoint_ignore'])
+                    crop_bbox, crop_class, crop_score, gt_keypoints = \
+                        filter_and_process(sample_bbox, gt_bbox, gt_class,
+                                scores=gt_score,
+                                keypoints=keypoints)
+                else:
+                    crop_bbox, crop_class, crop_score = filter_and_process(
+                        sample_bbox, gt_bbox, gt_class, scores=gt_score)
+                crop_bbox, crop_class, crop_score = bbox_area_sampling(
+                    crop_bbox, crop_class, crop_score, self.target_size,
+                    self.min_size)
+
+                if self.avoid_no_bbox:
+                    if len(crop_bbox) < 1:
+                        continue
+                im = crop_image_sampling(im, sample_bbox, image_width,
+                                         image_height, self.target_size)
+                height, width = im.shape[:2]
+                crop_bbox[:, 0] *= width
+                crop_bbox[:, 1] *= height
+                crop_bbox[:, 2] *= width
+                crop_bbox[:, 3] *= height
+                sample['image'] = im
+                sample['gt_bbox'] = crop_bbox
+                sample['gt_class'] = crop_class
+                if 'gt_score' in sample:
+                    sample['gt_score'] = crop_score
+                if 'gt_keypoint' in sample.keys():
+                    sample['gt_keypoint'] = gt_keypoints[0]
+                    sample['keypoint_ignore'] = gt_keypoints[1]
+                return sample
+            return sample
+
+        else:
+            for sampler in self.batch_sampler:
+                found = 0
+                for i in range(sampler[1]):
+                    if found >= sampler[0]:
+                        break
+                    sample_bbox = generate_sample_bbox_square(
+                        sampler, image_width, image_height)
+                    if satisfy_sample_constraint_coverage(sampler, sample_bbox,
+                                                          gt_bbox):
+                        sampled_bbox.append(sample_bbox)
+                        found = found + 1
+            im = np.array(im)
+            while sampled_bbox:
+                idx = int(np.random.uniform(0, len(sampled_bbox)))
+                sample_bbox = sampled_bbox.pop(idx)
+                sample_bbox = clip_bbox(sample_bbox)
+
+                if 'gt_keypoint' in sample.keys():
+                    keypoints = (sample['gt_keypoint'],
+                                 sample['keypoint_ignore'])
+                    crop_bbox, crop_class, crop_score, gt_keypoints = \
+                        filter_and_process(sample_bbox, gt_bbox, gt_class,
+                                scores=gt_score,
+                                keypoints=keypoints)
+                else:
+                    crop_bbox, crop_class, crop_score = filter_and_process(
+                        sample_bbox, gt_bbox, gt_class, scores=gt_score)
+                # sampling bbox according the bbox area
+                crop_bbox, crop_class, crop_score = bbox_area_sampling(
+                    crop_bbox, crop_class, crop_score, self.target_size,
+                    self.min_size)
+
+                if self.avoid_no_bbox:
+                    if len(crop_bbox) < 1:
+                        continue
+                xmin = int(sample_bbox[0] * image_width)
+                xmax = int(sample_bbox[2] * image_width)
+                ymin = int(sample_bbox[1] * image_height)
+                ymax = int(sample_bbox[3] * image_height)
+                im = im[ymin:ymax, xmin:xmax]
+                height, width = im.shape[:2]
+                crop_bbox[:, 0] *= width
+                crop_bbox[:, 1] *= height
+                crop_bbox[:, 2] *= width
+                crop_bbox[:, 3] *= height
+                sample['image'] = im
+                sample['gt_bbox'] = crop_bbox
+                sample['gt_class'] = crop_class
+                if 'gt_score' in sample:
+                    sample['gt_score'] = crop_score
+                if 'gt_keypoint' in sample.keys():
+                    sample['gt_keypoint'] = gt_keypoints[0]
+                    sample['keypoint_ignore'] = gt_keypoints[1]
+                return sample
+            return sample
+
+
+@register_op
+class RandomCrop(BaseOperator):
+    """Random crop image and bboxes.
+
+    Args:
+        aspect_ratio (list): aspect ratio of cropped region,
+            in [min, max] format. When None, the height and width scales are
+            drawn independently from `scaling` and crops whose height/width
+            ratio falls outside [0.5, 2.0] are rejected.
+        thresholds (list): iou thresholds for decide a valid bbox crop.
+        scaling (list): ratio between a cropped region and the original image,
+            in [min, max] format.
+        num_attempts (int): number of tries before giving up.
+        allow_no_crop (bool): allow return without actually cropping them.
+        cover_all_box (bool): ensure all bboxes are covered in the final crop.
+        is_mask_crop(bool): whether crop the segmentation.
+    """
+
+    def __init__(self,
+                 aspect_ratio=[.5, 2.],
+                 thresholds=[.0, .1, .3, .5, .7, .9],
+                 scaling=[.3, 1.],
+                 num_attempts=50,
+                 allow_no_crop=True,
+                 cover_all_box=False,
+                 is_mask_crop=False):
+        super(RandomCrop, self).__init__()
+        self.aspect_ratio = aspect_ratio
+        self.thresholds = thresholds
+        self.scaling = scaling
+        self.num_attempts = num_attempts
+        self.allow_no_crop = allow_no_crop
+        self.cover_all_box = cover_all_box
+        self.is_mask_crop = is_mask_crop
+
+    def crop_segms(self, segms, valid_ids, crop, height, width):
+        """Crop the segmentations selected by `valid_ids` to the `crop` box.
+
+        Args:
+            segms (list): per-instance segmentations; each entry is either
+                a list of flattened polygons or an RLE dict (decided by
+                `is_poly`).
+            valid_ids: indices into `segms` of the instances to keep.
+            crop: crop box as [xmin, ymin, xmax, ymax].
+            height (int): height of the uncropped image.
+            width (int): width of the uncropped image.
+
+        Returns:
+            list: one entry per valid id. Polygon instances yield a list of
+                flattened coordinate lists relative to the crop origin (the
+                list is empty when nothing is left inside the crop); RLE
+                instances yield the re-encoded cropped mask.
+        """
+
+        def _geoms(geom):
+            # Multi-part geometries are no longer iterable in Shapely 2.0;
+            # `.geoms` is available on old and new releases alike. Single
+            # geometries have no `.geoms` and are wrapped in a list.
+            return list(getattr(geom, 'geoms', [geom]))
+
+        def _flatten(polygon, xmin, ymin):
+            # Drop the closing vertex, flatten to [x0, y0, x1, y1, ...] and
+            # shift into the crop's coordinate frame.
+            coords = np.array(polygon.exterior.coords[:-1]).reshape(-1)
+            coords[0::2] -= xmin
+            coords[1::2] -= ymin
+            return coords.tolist()
+
+        def _crop_poly(segm, crop):
+            # Imported lazily so shapely is only needed for polygon masks.
+            import shapely.ops
+            from shapely.geometry import MultiPolygon, Polygon
+            logging.getLogger("shapely").setLevel(logging.WARNING)
+
+            xmin, ymin, xmax, ymax = crop
+            crop_coord = [xmin, ymin, xmin, ymax, xmax, ymax, xmax, ymin]
+            crop_p = Polygon(np.array(crop_coord).reshape(4, 2))
+
+            crop_segm = list()
+            for poly in segm:
+                poly = np.array(poly).reshape(len(poly) // 2, 2)
+                polygon = Polygon(poly)
+                if not polygon.is_valid:
+                    # Rebuild a self-intersecting outline as a set of
+                    # simple polygons.
+                    exterior = polygon.exterior
+                    multi_lines = exterior.intersection(exterior)
+                    polygons = shapely.ops.polygonize(multi_lines)
+                    polygon = MultiPolygon(list(polygons))
+                for per_polygon in _geoms(polygon):
+                    inter = per_polygon.intersection(crop_p)
+                    if inter.is_empty:
+                        continue
+                    # Only polygonal pieces are kept; points and lines
+                    # produced by the intersection are dropped.
+                    for part in _geoms(inter):
+                        if isinstance(part, Polygon):
+                            crop_segm.append(_flatten(part, xmin, ymin))
+            return crop_segm
+
+        def _crop_rle(rle, crop, height, width):
+            # Imported lazily so pycocotools is only needed for RLE masks.
+            import pycocotools.mask as mask_util
+            if 'counts' in rle and isinstance(rle['counts'], list):
+                # Uncompressed RLE has to be converted before decoding.
+                rle = mask_util.frPyObjects(rle, height, width)
+            mask = mask_util.decode(rle)
+            mask = mask[crop[1]:crop[3], crop[0]:crop[2]]
+            rle = mask_util.encode(np.array(mask, order='F', dtype=np.uint8))
+            return rle
+
+        crop_segms = []
+        for idx in valid_ids:
+            segm = segms[idx]
+            if is_poly(segm):
+                # Polygon format
+                crop_segms.append(_crop_poly(segm, crop))
+            else:
+                # RLE format
+                crop_segms.append(_crop_rle(segm, crop, height, width))
+        return crop_segms
+
+    def apply(self, sample, context=None):
+        """Crop `sample` in place and return it.
+
+        The sample is returned unchanged when it has no boxes, when the
+        'no_crop' choice is drawn, or when no acceptable crop is found.
+        """
+        if 'gt_bbox' in sample and len(sample['gt_bbox']) == 0:
+            return sample
+
+        h, w = sample['image'].shape[:2]
+        gt_bbox = sample['gt_bbox']
+
+        # NOTE Original method attempts to generate one candidate for each
+        # threshold then randomly sample one from the resulting list.
+        # Here a short circuit approach is taken, i.e., randomly choose a
+        # threshold and attempt to find a valid crop, and simply return the
+        # first one found.
+        # The probability is not exactly the same, kinda resembling the
+        # "Monty Hall" problem. Actually carrying out the attempts will affect
+        # observability (just like opening doors in the "Monty Hall" game).
+        thresholds = list(self.thresholds)
+        if self.allow_no_crop:
+            thresholds.append('no_crop')
+        np.random.shuffle(thresholds)
+
+        for thresh in thresholds:
+            if thresh == 'no_crop':
+                return sample
+
+            found = False
+            for _ in range(self.num_attempts):
+                scale = np.random.uniform(*self.scaling)
+                if self.aspect_ratio is not None:
+                    min_ar, max_ar = self.aspect_ratio
+                    # Bounding the ratio by scale**2 and scale**-2 keeps
+                    # both h_scale and w_scale at or below 1.
+                    aspect_ratio = np.random.uniform(
+                        max(min_ar, scale**2), min(max_ar, scale**-2))
+                    h_scale = scale / np.sqrt(aspect_ratio)
+                    w_scale = scale * np.sqrt(aspect_ratio)
+                else:
+                    h_scale = np.random.uniform(*self.scaling)
+                    w_scale = np.random.uniform(*self.scaling)
+                crop_h = h * h_scale
+                crop_w = w * w_scale
+                if self.aspect_ratio is None:
+                    if crop_h / crop_w < 0.5 or crop_h / crop_w > 2.0:
+                        continue
+
+                crop_h = int(crop_h)
+                crop_w = int(crop_w)
+                # `randint` excludes its upper bound, so add 1: the last
+                # valid offset becomes reachable and a crop spanning the
+                # whole dimension no longer raises ValueError.
+                crop_y = np.random.randint(0, h - crop_h + 1)
+                crop_x = np.random.randint(0, w - crop_w + 1)
+                crop_box = [crop_x, crop_y, crop_x + crop_w, crop_y + crop_h]
+                iou = self._iou_matrix(
+                    gt_bbox, np.array(
+                        [crop_box], dtype=np.float32))
+                if iou.max() < thresh:
+                    continue
+
+                if self.cover_all_box and iou.min() < thresh:
+                    continue
+
+                cropped_box, valid_ids = self._crop_box_with_center_constraint(
+                    gt_bbox, np.array(
+                        crop_box, dtype=np.float32))
+                if valid_ids.size > 0:
+                    found = True
+                    break
+
+            if found:
+                if self.is_mask_crop and 'gt_poly' in sample and len(sample[
+                        'gt_poly']) > 0:
+                    crop_polys = self.crop_segms(
+                        sample['gt_poly'],
+                        valid_ids,
+                        np.array(
+                            crop_box, dtype=np.int64),
+                        h,
+                        w)
+                    if [] in crop_polys:
+                        # Instances whose polygons vanished inside the crop
+                        # are removed from every per-instance field.
+                        delete_id = list()
+                        valid_polys = list()
+                        for pos, crop_poly in enumerate(crop_polys):
+                            if crop_poly == []:
+                                delete_id.append(pos)
+                            else:
+                                valid_polys.append(crop_poly)
+                        valid_ids = np.delete(valid_ids, delete_id)
+                        if len(valid_polys) == 0:
+                            return sample
+                        sample['gt_poly'] = valid_polys
+                    else:
+                        sample['gt_poly'] = crop_polys
+
+                if 'gt_segm' in sample:
+                    sample['gt_segm'] = self._crop_segm(sample['gt_segm'],
+                                                        crop_box)
+                    sample['gt_segm'] = np.take(
+                        sample['gt_segm'], valid_ids, axis=0)
+
+                sample['image'] = self._crop_image(sample['image'], crop_box)
+                sample['gt_bbox'] = np.take(cropped_box, valid_ids, axis=0)
+                sample['gt_class'] = np.take(
+                    sample['gt_class'], valid_ids, axis=0)
+                if 'gt_score' in sample:
+                    sample['gt_score'] = np.take(
+                        sample['gt_score'], valid_ids, axis=0)
+
+                if 'is_crowd' in sample:
+                    sample['is_crowd'] = np.take(
+                        sample['is_crowd'], valid_ids, axis=0)
+                return sample
+
+        return sample
+
+    def _iou_matrix(self, a, b):
+        """Return the pairwise IoU between boxes `a` (N, 4) and `b` (M, 4)."""
+        tl_i = np.maximum(a[:, np.newaxis, :2], b[:, :2])
+        br_i = np.minimum(a[:, np.newaxis, 2:], b[:, 2:])
+
+        # The boolean factor zeroes the area of non-overlapping pairs.
+        area_i = np.prod(br_i - tl_i, axis=2) * (tl_i < br_i).all(axis=2)
+        area_a = np.prod(a[:, 2:] - a[:, :2], axis=1)
+        area_b = np.prod(b[:, 2:] - b[:, :2], axis=1)
+        area_o = (area_a[:, np.newaxis] + area_b - area_i)
+        # The epsilon avoids a division by zero for degenerate boxes.
+        return area_i / (area_o + 1e-10)
+
+    def _crop_box_with_center_constraint(self, box, crop):
+        """Clip `box` to `crop` and report which boxes survive.
+
+        Returns:
+            tuple: the clipped boxes relative to the crop origin (same
+                length as `box`) and the indices of the boxes whose centre
+                lies inside the crop and whose clipped extent is non-empty.
+        """
+        cropped_box = box.copy()
+
+        cropped_box[:, :2] = np.maximum(box[:, :2], crop[:2])
+        cropped_box[:, 2:] = np.minimum(box[:, 2:], crop[2:])
+        cropped_box[:, :2] -= crop[:2]
+        cropped_box[:, 2:] -= crop[:2]
+
+        centers = (box[:, :2] + box[:, 2:]) / 2
+        valid = np.logical_and(crop[:2] <= centers,
+                               centers < crop[2:]).all(axis=1)
+        valid = np.logical_and(
+            valid, (cropped_box[:, :2] < cropped_box[:, 2:]).all(axis=1))
+
+        return cropped_box, np.where(valid)[0]
+
+    def _crop_image(self, img, crop):
+        """Slice an (H, W, C) image to the [x1, y1, x2, y2] crop box."""
+        x1, y1, x2, y2 = crop
+        return img[y1:y2, x1:x2, :]
+
+    def _crop_segm(self, segm, crop):
+        """Slice an (N, H, W) mask stack to the [x1, y1, x2, y2] crop box."""
+        x1, y1, x2, y2 = crop
+        return segm[:, y1:y2, x1:x2]
+
+
+@register_op
+class RandomScaledCrop(BaseOperator):
+    """Resize image and bbox based on long side (with optional random scaling),
+       then crop or pad image to target size.
+    Args:
+        target_dim (int): target size.
+        scale_range (list): random scale range.
+        interp (int): interpolation method, default to `cv2.INTER_LINEAR`.
+    """
+
+    def __init__(self,
+                 target_dim=512,
+                 scale_range=[.1, 2.],
+                 interp=cv2.INTER_LINEAR):
+        super(RandomScaledCrop, self).__init__()
+        self.target_dim = target_dim
+        self.scale_range = scale_range
+        self.interp = interp
+
+    def apply(self, sample, context=None):
+        """Rescale, then crop or zero-pad `sample` to a square of `target_dim`.
+
+        Updates 'image', 'im_shape' and 'scale_factor' and, when boxes are
+        present, 'gt_bbox' plus the per-box fields 'gt_class', 'gt_score'
+        and 'is_crowd' (boxes with an area of at most 1 pixel after
+        clipping are dropped).
+        """
+        img = sample['image']
+        h, w = img.shape[:2]
+        random_scale = np.random.uniform(*self.scale_range)
+        dim = self.target_dim
+        random_dim = int(dim * random_scale)
+        dim_max = max(h, w)
+        scale = random_dim / dim_max
+        # cv2.resize and array slicing both require integer sizes.
+        resize_w = max(1, int(round(w * scale)))
+        resize_h = max(1, int(round(h * scale)))
+        # A random offset is only drawn when the resized image is larger
+        # than the target; otherwise the image is anchored at the origin.
+        offset_x = int(max(0, np.random.uniform(0., resize_w - dim)))
+        offset_y = int(max(0, np.random.uniform(0., resize_h - dim)))
+
+        img = cv2.resize(img, (resize_w, resize_h), interpolation=self.interp)
+        img = np.array(img)
+        patch = img[offset_y:offset_y + dim, offset_x:offset_x + dim]
+        # The canvas follows the channel layout of the resized image
+        # instead of assuming three channels.
+        canvas = np.zeros((dim, dim) + img.shape[2:], dtype=img.dtype)
+        canvas[:patch.shape[0], :patch.shape[1]] = patch
+        sample['image'] = canvas
+        sample['im_shape'] = np.asarray([resize_h, resize_w], dtype=np.float32)
+        scale_factor = sample['scale_factor']
+        sample['scale_factor'] = np.asarray(
+            [scale_factor[0] * scale, scale_factor[1] * scale],
+            dtype=np.float32)
+
+        if 'gt_bbox' in sample and len(sample['gt_bbox']) > 0:
+            scale_array = np.array([scale, scale] * 2, dtype=np.float32)
+            shift_array = np.array([offset_x, offset_y] * 2, dtype=np.float32)
+            boxes = sample['gt_bbox'] * scale_array - shift_array
+            boxes = np.clip(boxes, 0, dim - 1)
+            # filter boxes with no area
+            area = np.prod(boxes[..., 2:] - boxes[..., :2], axis=1)
+            valid = (area > 1.).nonzero()[0]
+            sample['gt_bbox'] = boxes[valid]
+            # Keep every per-box field aligned with the surviving boxes.
+            for key in ('gt_class', 'gt_score', 'is_crowd'):
+                if key in sample:
+                    sample[key] = sample[key][valid]
+
+        return sample
+
+
+@register_op
+class Cutmix(BaseOperator):
+    def __init__(self, alpha=1.5, beta=1.5):
+        """
+        CutMix augmentation (https://arxiv.org/abs/1905.04899): paste a
+        rectangular patch of a second image into the first one and merge
+        their gt_bbox/gt_class/gt_score.
+        Args:
+             alpha (float): alpha parameter of the beta distribution
+             beta (float): beta parameter of the beta distribution
+        Raises:
+             ValueError: if alpha or beta is not positive.
+        """
+        super(Cutmix, self).__init__()
+        self.alpha = alpha
+        self.beta = beta
+        if self.alpha <= 0.0:
+            raise ValueError("alpha shold be positive in {}".format(self))
+        if self.beta <= 0.0:
+            raise ValueError("beta shold be positive in {}".format(self))
+
+    def apply_image(self, img1, img2, factor):
+        """Paste a random patch of `img2` into `img1`.
+
+        Both images are zero-padded (bottom/right) to a common float32
+        canvas; the patch covers roughly a `1 - factor` share of it.
+        """
+        canvas_h = max(img1.shape[0], img2.shape[0])
+        canvas_w = max(img1.shape[1], img2.shape[1])
+        cut_ratio = np.sqrt(1. - factor)
+
+        patch_w = np.int32(canvas_w * cut_ratio)
+        patch_h = np.int32(canvas_h * cut_ratio)
+
+        # Patch centre, drawn uniformly over the canvas (x first, then y).
+        center_x = np.random.randint(canvas_w)
+        center_y = np.random.randint(canvas_h)
+
+        half_w = patch_w // 2
+        half_h = patch_h // 2
+        left = np.clip(center_x - half_w, 0, canvas_w - 1)
+        top = np.clip(center_y - half_h, 0, canvas_h - 1)
+        right = np.clip(center_x + half_w, 0, canvas_w - 1)
+        bottom = np.clip(center_y + half_h, 0, canvas_h - 1)
+
+        def _to_canvas(img):
+            # Top-left aligned copy of `img` on a zero float32 canvas.
+            padded = np.zeros((canvas_h, canvas_w, img.shape[2]), 'float32')
+            padded[:img.shape[0], :img.shape[1], :] = img.astype('float32')
+            return padded
+
+        target = _to_canvas(img1)
+        donor = _to_canvas(img2)
+        target[top:bottom, left:right, :] = donor[top:bottom, left:right, :]
+        return target
+
+    def __call__(self, sample, context=None):
+        """Mix a pair of samples; anything that is not a sequence passes through."""
+        if not isinstance(sample, Sequence):
+            return sample
+
+        assert len(sample) == 2, 'cutmix need two samples'
+
+        factor = np.random.beta(self.alpha, self.beta)
+        factor = min(1.0, factor)
+        factor = max(0.0, factor)
+        first, second = sample[0], sample[1]
+        # Degenerate factors return one of the inputs untouched.
+        if factor >= 1.0:
+            return first
+        if factor <= 0.0:
+            return second
+
+        def _stack(key):
+            return np.concatenate((first[key], second[key]), axis=0)
+
+        mixed_image = self.apply_image(first['image'], second['image'],
+                                       factor)
+        boxes = _stack('gt_bbox')
+        classes = _stack('gt_class')
+        # Boxes of the first image are weighted by `factor`, those of the
+        # second by its complement.
+        scores = np.concatenate(
+            (np.ones_like(first['gt_class']) * factor,
+             np.ones_like(second['gt_class']) * (1. - factor)),
+            axis=0)
+
+        result = copy.deepcopy(first)
+        result['image'] = mixed_image
+        result['gt_bbox'] = boxes
+        result['gt_score'] = scores
+        result['gt_class'] = classes
+        for key in ('is_crowd', 'difficult'):
+            if key in first:
+                result[key] = _stack(key)
+        return result
+
+
+@register_op
+class Mixup(BaseOperator):
+    def __init__(self, alpha=1.5, beta=1.5):
+        """ Blend two images and merge their gt_bbox/gt_class/gt_score
+        Args:
+            alpha (float): alpha parameter of the beta distribution
+            beta (float): beta parameter of the beta distribution
+        Raises:
+            ValueError: if alpha or beta is not positive.
+        """
+        super(Mixup, self).__init__()
+        self.alpha = alpha
+        self.beta = beta
+        if self.alpha <= 0.0:
+            raise ValueError("alpha shold be positive in {}".format(self))
+        if self.beta <= 0.0:
+            raise ValueError("beta shold be positive in {}".format(self))
+
+    def apply_image(self, img1, img2, factor):
+        """Return `factor * img1 + (1 - factor) * img2` as a uint8 image.
+
+        The images are top-left aligned on a zero canvas large enough to
+        hold both.
+        """
+        canvas_h = max(img1.shape[0], img2.shape[0])
+        canvas_w = max(img1.shape[1], img2.shape[1])
+        blended = np.zeros((canvas_h, canvas_w, img1.shape[2]), 'float32')
+        h1, w1 = img1.shape[0], img1.shape[1]
+        h2, w2 = img2.shape[0], img2.shape[1]
+        blended[:h1, :w1, :] = img1.astype('float32') * factor
+        blended[:h2, :w2, :] += img2.astype('float32') * (1.0 - factor)
+        return blended.astype('uint8')
+
+    def __call__(self, sample, context=None):
+        """Mix a pair of samples; anything that is not a sequence passes through."""
+        if not isinstance(sample, Sequence):
+            return sample
+
+        assert len(sample) == 2, 'mixup need two samples'
+
+        factor = np.random.beta(self.alpha, self.beta)
+        factor = min(1.0, factor)
+        factor = max(0.0, factor)
+        first, second = sample[0], sample[1]
+        # Degenerate factors return one of the inputs untouched.
+        if factor >= 1.0:
+            return first
+        if factor <= 0.0:
+            return second
+
+        def _stack(key):
+            return np.concatenate((first[key], second[key]), axis=0)
+
+        blended = self.apply_image(first['image'], second['image'], factor)
+        result = copy.deepcopy(first)
+        result['image'] = blended
+        # apply bbox and score
+        if 'gt_bbox' in first:
+            result['gt_bbox'] = _stack('gt_bbox')
+        if 'gt_class' in first:
+            result['gt_class'] = _stack('gt_class')
+            # Boxes of the first image are weighted by `factor`, those of
+            # the second by its complement.
+            result['gt_score'] = np.concatenate(
+                (np.ones_like(first['gt_class']) * factor,
+                 np.ones_like(second['gt_class']) * (1. - factor)),
+                axis=0)
+        # Remaining per-box fields are concatenated when the first sample
+        # carries them.
+        for key in ('is_crowd', 'difficult', 'gt_ide'):
+            if key in first:
+                result[key] = _stack(key)
+        return result
+
+
+@register_op
+class NormalizeBox(BaseOperator):
+    """Divide box (and keypoint) coordinates by the image width and height."""
+
+    def __init__(self):
+        super(NormalizeBox, self).__init__()
+
+    def apply(self, sample, context=None):
+        """Normalize 'gt_bbox' and, when present, 'gt_keypoint' in place.
+
+        x coordinates are divided by the image width and y coordinates by
+        the image height, both taken from `sample['image']`.
+
+        Args:
+            sample (dict): must contain 'image' and 'gt_bbox'.
+            context: unused; defaults to None like the other operators.
+
+        Returns:
+            dict: the same `sample` object.
+        """
+        im = sample['image']
+        gt_bbox = sample['gt_bbox']
+        # Only the first two axes matter, so 2-D images are accepted too.
+        height, width = im.shape[:2]
+        if gt_bbox.shape[0] > 0:
+            gt_bbox[:, [0, 2]] = gt_bbox[:, [0, 2]] / width
+            gt_bbox[:, [1, 3]] = gt_bbox[:, [1, 3]] / height
+        sample['gt_bbox'] = gt_bbox
+
+        if 'gt_keypoint' in sample.keys():
+            gt_keypoint = sample['gt_keypoint']
+            # Columns alternate x, y, x, y, ...
+            gt_keypoint[:, 0::2] = gt_keypoint[:, 0::2] / width
+            gt_keypoint[:, 1::2] = gt_keypoint[:, 1::2] / height
+            sample['gt_keypoint'] = gt_keypoint
+
+        return sample
+
+
+@register_op
+class BboxXYXY2XYWH(BaseOperator):
+    """
+    Convert boxes from corner format (x1, y1, x2, y2) to
+    (center x, center y, width, height), modifying 'gt_bbox' in place.
+    """
+
+    def __init__(self):
+        super(BboxXYXY2XYWH, self).__init__()
+
+    def apply(self, sample, context=None):
+        assert 'gt_bbox' in sample
+        boxes = sample['gt_bbox']
+        top_left = boxes[:, :2]
+        # Width/height first; the centre is the top-left corner plus half
+        # of that extent.
+        extent = boxes[:, 2:4] - top_left
+        center = top_left + extent / 2.
+        boxes[:, 2:4] = extent
+        boxes[:, :2] = center
+        sample['gt_bbox'] = boxes
+        return sample
+
+
+@register_op
+class PadBox(BaseOperator):
+    def __init__(self, num_max_boxes=50):
+        """
+        Zero-pad (or truncate) the per-box fields of a sample so that each
+        holds exactly num_max_boxes entries.
+        Args:
+            num_max_boxes (int): the max number of bboxes
+        """
+        self.num_max_boxes = num_max_boxes
+        super(PadBox, self).__init__()
+
+    def apply(self, sample, context=None):
+        assert 'gt_bbox' in sample
+        boxes = sample['gt_bbox']
+        capacity = self.num_max_boxes
+        kept = min(capacity, len(boxes))
+
+        padded_boxes = np.zeros((capacity, 4), dtype=np.float32)
+        if kept > 0:
+            padded_boxes[:kept, :] = boxes[:kept, :]
+        sample['gt_bbox'] = padded_boxes
+
+        # Optional per-box fields and the dtype each is padded with. Only
+        # column 0 of each field is kept, so the output is one-dimensional.
+        per_box_fields = (
+            ('gt_class', np.int32),
+            ('gt_score', np.float32),
+            ('difficult', np.int32),
+            ('is_crowd', np.int32),
+            ('gt_ide', np.int32), )
+        for key, dtype in per_box_fields:
+            if key not in sample:
+                continue
+            padded = np.zeros((capacity, ), dtype=dtype)
+            if kept > 0:
+                padded[:kept] = sample[key][:kept, 0]
+            sample[key] = padded
+        return sample
+
+
+@register_op
+class DebugVisibleImage(BaseOperator):
+    """
+    In debug mode, visualize images according to `gt_box`.
+    (Currently only supported when not cropping and flipping image.)
+
+    Args:
+        output_dir (str): directory the rendered images are written to;
+            created if it does not exist.
+        is_normalized (bool): whether box and keypoint coordinates are
+            normalized and have to be scaled by the sample's 'w' / 'h'
+            before drawing.
+    """
+
+    def __init__(self, output_dir='output/debug', is_normalized=False):
+        super(DebugVisibleImage, self).__init__()
+        # Validate before touching the filesystem.
+        if not isinstance(is_normalized, bool):
+            raise TypeError("{}: input type is invalid.".format(self))
+        self.is_normalized = is_normalized
+        self.output_dir = output_dir
+        if not os.path.isdir(output_dir):
+            os.makedirs(output_dir)
+
+    @staticmethod
+    def _text_size(draw, text):
+        """Return (width, height) of `text` drawn with the default font."""
+        # `textsize` was removed in Pillow 10; `textbbox` is the
+        # replacement. The old call is preferred while it exists because
+        # early `textbbox` releases reject the default bitmap font.
+        if hasattr(draw, 'textsize'):
+            return draw.textsize(text)
+        left, top, right, bottom = draw.textbbox((0, 0), text)
+        return right - left, bottom - top
+
+    def apply(self, sample, context=None):
+        """Draw boxes, labels and keypoints and save the image as a JPEG.
+
+        The sample itself is returned unmodified: coordinates are scaled
+        on private copies so later operators still see the original
+        values.
+        """
+        image = Image.fromarray(sample['image'].astype(np.uint8))
+        out_file_name = '{:012d}.jpg'.format(sample['im_id'][0])
+        width = sample['w']
+        height = sample['h']
+        # Float copy: de-normalizing must not leak back into the sample.
+        gt_bbox = np.array(sample['gt_bbox'], dtype=np.float32)
+        gt_class = sample['gt_class']
+        if self.is_normalized and gt_bbox.shape[0] > 0:
+            gt_bbox[:, [0, 2]] *= width
+            gt_bbox[:, [1, 3]] *= height
+
+        draw = ImageDraw.Draw(image)
+        for i in range(gt_bbox.shape[0]):
+            xmin, ymin, xmax, ymax = gt_bbox[i].tolist()
+            draw.line(
+                [(xmin, ymin), (xmin, ymax), (xmax, ymax), (xmax, ymin),
+                 (xmin, ymin)],
+                width=2,
+                fill='green')
+            # draw label
+            text = str(gt_class[i][0])
+            tw, th = self._text_size(draw, text)
+            draw.rectangle(
+                [(xmin + 1, ymin - th), (xmin + tw + 1, ymin)], fill='green')
+            draw.text((xmin + 1, ymin - th), text, fill=(255, 255, 255))
+
+        if 'gt_keypoint' in sample.keys():
+            # Float copy for the same reason as the boxes above.
+            gt_keypoint = np.array(sample['gt_keypoint'], dtype=np.float32)
+            if self.is_normalized:
+                # Columns alternate x, y, x, y, ...
+                gt_keypoint[:, 0::2] *= width
+                gt_keypoint[:, 1::2] *= height
+            for keypoint in gt_keypoint:
+                for j in range(keypoint.shape[0] // 2):
+                    # Plain Python ints: round() on a NumPy scalar may
+                    # return an int, which has no `.astype`.
+                    x1 = int(round(float(keypoint[2 * j])))
+                    y1 = int(round(float(keypoint[2 * j + 1])))
+                    draw.ellipse(
+                        (x1, y1, x1 + 5, y1 + 5),
+                        fill='green',
+                        outline='green')
+        save_path = os.path.join(self.output_dir, out_file_name)
+        image.save(save_path, quality=95)
+        return sample
+
+
+@register_op
+class Pad(BaseOperator):
+    def __init__(self,
+                 size=None,
+                 size_divisor=32,
+                 pad_mode=0,
+                 offsets=None,
+                 fill_value=(127.5, 127.5, 127.5)):
+        """
+        Pad image to a specified size or multiple of size_divisor.
+        Args:
+            size (int, Sequence): image target size, if None, pad to multiple of size_divisor, default None
+            size_divisor (int): size divisor, default 32
+            pad_mode (int): pad mode, currently only supports four modes [-1, 0, 1, 2]. if -1, use specified offsets
+                if 0, only pad to right and bottom. if 1, pad according to center. if 2, only pad left and top
+            offsets (list): [offset_x, offset_y], specify offset while padding, only supported pad_mode=-1
+            fill_value (bool): rgb value of pad area, default (127.5, 127.5, 127.5)
+        """
+        super(Pad, self).__init__()
+
+        if not isinstance(size, (int, Sequence)):
+            raise TypeError(
+                "Type of target_size is invalid when random_size is True. \
+                            Must be List, now is {}".format(type(size)))
+
+        if isinstance(size, int):
+            size = [size, size]
+
+        assert pad_mode in [
+            -1, 0, 1, 2
+        ], 'currently only supports four modes [-1, 0, 1, 2]'
+        if pad_mode == -1:
+            assert offsets, 'if pad_mode is -1, offsets should not be None'
+
+        self.size = size
+        self.size_divisor = size_divisor
+        self.pad_mode = pad_mode
+        self.fill_value = fill_value
+        self.offsets = offsets
+
+    def apply_segm(self, segms, offsets, im_size, size):
+        """Shift segmentation annotations onto the padded canvas.
+
+        Args:
+            segms (list): per-object segmentations; each entry is either a
+                list of flat polygons or an RLE dict.
+            offsets (list): [offset_x, offset_y] of the image inside the canvas.
+            im_size (list): [height, width] of the image before padding.
+            size (list): [height, width] of the padded canvas.
+
+        Returns:
+            list: the shifted segmentations, in the same order as `segms`.
+        """
+        shift_x, shift_y = offsets
+        src_h, src_w = im_size
+        dst_h, dst_w = size
+
+        def _shift_polygon(points):
+            # Flat [x0, y0, x1, y1, ...] layout: even slots get the x
+            # offset, odd slots the y offset.
+            moved = np.array(points)
+            moved[0::2] += shift_x
+            moved[1::2] += shift_y
+            return moved.tolist()
+
+        def _shift_rle(rle):
+            # `mask_util` is bound by the lazy import in the loop below
+            # before this helper is ever called.
+            if 'counts' in rle and type(rle['counts']) == list:
+                rle = mask_util.frPyObjects(rle, src_h, src_w)
+            decoded = mask_util.decode(rle)
+            padded = np.full((dst_h, dst_w), 0).astype(decoded.dtype)
+            padded[shift_y:shift_y + src_h, shift_x:shift_x + src_w] = decoded
+            return mask_util.encode(
+                np.array(
+                    padded, order='F', dtype=np.uint8))
+
+        shifted = []
+        for segm in segms:
+            if not is_poly(segm):
+                # RLE format
+                import pycocotools.mask as mask_util
+                shifted.append(_shift_rle(segm))
+                continue
+            # Polygon format
+            shifted.append([_shift_polygon(poly) for poly in segm])
+        return shifted
+
+    def apply_bbox(self, bbox, offsets):
+        """Translate boxes by the padding offsets.
+
+        Args:
+            bbox (np.ndarray): boxes with four coordinates in the last axis.
+            offsets (list): [offset_x, offset_y].
+
+        Returns:
+            np.ndarray: a new array holding the shifted boxes.
+        """
+        # Repeating the [x, y] list yields [x, y, x, y], which lines up
+        # with the four box coordinates.
+        shift = np.array(offsets * 2, dtype=np.float32)
+        return bbox + shift
+
+    def apply_keypoint(self, keypoints, offsets):
+        """Translate keypoints by the padding offsets.
+
+        Args:
+            keypoints (np.ndarray): one row per object, laid out as
+                alternating values that receive the x and y offsets.
+            offsets (list): [offset_x, offset_y].
+
+        Returns:
+            np.ndarray: a new array holding the shifted keypoints.
+        """
+        # The [x, y] offset list is repeated once per keypoint in a row.
+        num_points = len(keypoints[0]) // 2
+        shift = np.array(offsets * num_points, dtype=np.float32)
+        return keypoints + shift
+
+    def apply_image(self, image, offsets, im_size, size):
+        """Paste `image` onto a canvas filled with `self.fill_value`.
+
+        Args:
+            image (np.ndarray): source image with three channels.
+            offsets (list): [offset_x, offset_y] of the paste position.
+            im_size (list): [height, width] of `image`.
+            size (list): [height, width] of the canvas.
+
+        Returns:
+            np.ndarray: float32 canvas of shape (height, width, 3).
+        """
+        left, top = offsets
+        src_h, src_w = im_size
+        dst_h, dst_w = size
+
+        fill = np.array(self.fill_value, dtype=np.float32)
+        canvas = np.ones((dst_h, dst_w, 3), dtype=np.float32)
+        canvas *= fill
+
+        pixels = image.astype(np.float32)
+        canvas[top:top + src_h, left:left + src_w, :] = pixels
+        return canvas
+
+    def apply(self, sample, context=None):
+        """Pad the image in `sample` and move its annotations with it.
+
+        The target size is `self.size` when it is set; otherwise it is the
+        image size rounded up to the next multiple of `self.size_divisor`.
+
+        Args:
+            sample (dict): must contain 'image'. 'gt_bbox', 'gt_poly' and
+                'gt_keypoint' are shifted when present and non-empty.
+            context: unused.
+
+        Returns:
+            dict: the same `sample` object, updated in place.
+
+        Raises:
+            AssertionError: if `self.size` is smaller than the image in
+                either dimension.
+        """
+        im = sample['image']
+        im_h, im_w = im.shape[:2]
+        if self.size:
+            h, w = self.size
+            # `<=` rather than `<`: an image that already has the target
+            # size is valid input and is returned unchanged just below.
+            assert (
+                im_h <= h and im_w <= w
+            ), '(h, w) of target size should be greater than (im_h, im_w)'
+        else:
+            # np.ceil returns a float, but the canvas shape and the slice
+            # bounds used in apply_image/apply_segm must be integers.
+            h = int(np.ceil(im_h / self.size_divisor) * self.size_divisor)
+            w = int(np.ceil(im_w / self.size_divisor) * self.size_divisor)
+
+        if h == im_h and w == im_w:
+            return sample
+
+        if self.pad_mode == -1:
+            # user supplied offsets
+            offset_x, offset_y = self.offsets
+        elif self.pad_mode == 0:
+            # pad right and bottom only
+            offset_y, offset_x = 0, 0
+        elif self.pad_mode == 1:
+            # keep the image centred
+            offset_y, offset_x = (h - im_h) // 2, (w - im_w) // 2
+        else:
+            # pad left and top only
+            offset_y, offset_x = h - im_h, w - im_w
+
+        offsets, im_size, size = [offset_x, offset_y], [im_h, im_w], [h, w]
+
+        sample['image'] = self.apply_image(im, offsets, im_size, size)
+
+        # With zero offsets the annotations keep their coordinates.
+        if self.pad_mode == 0:
+            return sample
+        if 'gt_bbox' in sample and len(sample['gt_bbox']) > 0:
+            sample['gt_bbox'] = self.apply_bbox(sample['gt_bbox'], offsets)
+
+        if 'gt_poly' in sample and len(sample['gt_poly']) > 0:
+            sample['gt_poly'] = self.apply_segm(sample['gt_poly'], offsets,
+                                                im_size, size)
+
+        if 'gt_keypoint' in sample and len(sample['gt_keypoint']) > 0:
+            sample['gt_keypoint'] = self.apply_keypoint(sample['gt_keypoint'],
+                                                        offsets)
+
+        return sample
+
+
+@register_op
+class Poly2Mask(BaseOperator):
+    """
+    Rasterize the `gt_poly` annotations of a sample into `gt_segm` masks.
+    """
+
+    def __init__(self):
+        super(Poly2Mask, self).__init__()
+        # Imported lazily so pycocotools is only required when this op is
+        # actually constructed.
+        import pycocotools.mask as maskUtils
+        self.maskutils = maskUtils
+
+    def _poly2mask(self, mask_ann, img_h, img_w):
+        """Decode one annotation (polygon list, uncompressed RLE or RLE)."""
+        codec = self.maskutils
+        if isinstance(mask_ann, list):
+            # A single object may be made of several polygon parts; they
+            # are merged into one RLE before decoding.
+            rle = codec.merge(codec.frPyObjects(mask_ann, img_h, img_w))
+        elif isinstance(mask_ann['counts'], list):
+            # Uncompressed RLE: needs encoding first.
+            rle = codec.frPyObjects(mask_ann, img_h, img_w)
+        else:
+            # Already a compressed RLE.
+            rle = mask_ann
+        return codec.decode(rle)
+
+    def apply(self, sample, context=None):
+        """Write `sample['gt_segm']` as a uint8 stack of decoded masks."""
+        assert 'gt_poly' in sample
+        height = sample['h']
+        width = sample['w']
+        decoded = []
+        for annotation in sample['gt_poly']:
+            decoded.append(self._poly2mask(annotation, height, width))
+        sample['gt_segm'] = np.asarray(decoded).astype(np.uint8)
+        return sample
+
+
+@register_op
+class Rbox2Poly(BaseOperator):
+    """
+    Derive `gt_bbox` and `gt_rbox2poly` from the rotated boxes in `gt_rbox`.
+    """
+
+    def __init__(self):
+        super(Rbox2Poly, self).__init__()
+
+    def apply(self, sample, context=None):
+        """Fill `gt_bbox` (x1, y1, x2, y2) and `gt_rbox2poly` on `sample`.
+
+        `gt_bbox` is computed from the first four columns only (centre and
+        size); the fifth column is consumed by `bbox_utils.rbox2poly_np`.
+        """
+        assert 'gt_rbox' in sample
+        assert sample['gt_rbox'].shape[1] == 5
+        rboxes = sample['gt_rbox']
+
+        center_x, center_y = rboxes[:, 0], rboxes[:, 1]
+        half_w = rboxes[:, 2] / 2.0
+        half_h = rboxes[:, 3] / 2.0
+
+        corners = [
+            center_x - half_w,
+            center_y - half_h,
+            center_x + half_w,
+            center_y + half_h,
+        ]
+        sample['gt_bbox'] = np.stack(corners, axis=1)
+        sample['gt_rbox2poly'] = bbox_utils.rbox2poly_np(rboxes)
+        return sample
+
+
+@register_op
+class AugmentHSV(BaseOperator):
+    def __init__(self, fraction=0.50, is_bgr=True):
+        """
+        Randomly rescale the saturation and value channels of an image.
+        Args:
+            fraction (float): maximum relative change of each channel.
+                Default: 0.5.
+            is_bgr (bool): whether the image is in BGR order (otherwise it
+                is treated as RGB). Default: True.
+        """
+        super(AugmentHSV, self).__init__()
+        self.fraction = fraction
+        self.is_bgr = is_bgr
+
+    def _scale_channel(self, channel):
+        """Multiply `channel` in place by a random gain around 1."""
+        gain = (random.random() * 2 - 1) * self.fraction + 1
+        channel *= gain
+        # Only a gain above 1 can push values past the upper bound.
+        if gain > 1:
+            np.clip(channel, a_min=0, a_max=255, out=channel)
+        return channel
+
+    def apply(self, sample, context=None):
+        """Jitter saturation and value of `sample['image']` in place."""
+        img = sample['image']
+        if self.is_bgr:
+            to_hsv, from_hsv = cv2.COLOR_BGR2HSV, cv2.COLOR_HSV2BGR
+        else:
+            to_hsv, from_hsv = cv2.COLOR_RGB2HSV, cv2.COLOR_HSV2RGB
+
+        img_hsv = cv2.cvtColor(img, to_hsv)
+        saturation = img_hsv[:, :, 1].astype(np.float32)
+        value = img_hsv[:, :, 2].astype(np.float32)
+
+        # Saturation draws its random gain first, value second.
+        saturation = self._scale_channel(saturation)
+        value = self._scale_channel(value)
+
+        img_hsv[:, :, 1] = saturation.astype(np.uint8)
+        img_hsv[:, :, 2] = value.astype(np.uint8)
+
+        # The converted pixels are written back into the input buffer.
+        cv2.cvtColor(img_hsv, from_hsv, dst=img)
+
+        sample['image'] = img
+        return sample
+
+
+@register_op
+class Norm2PixelBbox(BaseOperator):
+    """
+    Scale bounding boxes whose coordinates are normalized to [0, 1] up to
+    pixel coordinates of the sample image.
+    """
+
+    def __init__(self):
+        super(Norm2PixelBbox, self).__init__()
+
+    def apply(self, sample, context=None):
+        """Multiply x columns by image width and y columns by image height.
+
+        The `gt_bbox` array is modified in place.
+        """
+        assert 'gt_bbox' in sample
+        boxes = sample['gt_bbox']
+        img_h, img_w = sample['image'].shape[:2]
+        x_cols = boxes[:, 0::2]
+        boxes[:, 0::2] = x_cols * img_w
+        y_cols = boxes[:, 1::2]
+        boxes[:, 1::2] = y_cols * img_h
+        sample['gt_bbox'] = boxes
+        return sample
+
+
+@register_op
+class BboxCXCYWH2XYXY(BaseOperator):
+    """
+    Convert boxes from centre/size form to corner form.
+    [center_x, center_y, width, height] -> [x0, y0, x1, y1]
+    """
+
+    def __init__(self):
+        super(BboxCXCYWH2XYXY, self).__init__()
+
+    def apply(self, sample, context=None):
+        """Replace `sample['gt_bbox']` with a converted copy.
+
+        Columns beyond the first four, if any, are carried over unchanged.
+        """
+        assert 'gt_bbox' in sample
+        source = sample['gt_bbox']
+        centers = source[:, :2]
+        half_sizes = source[:, 2:4] / 2.
+
+        converted = source.copy()
+        converted[:, :2] = centers - half_sizes
+        converted[:, 2:4] = centers + half_sizes
+
+        sample['gt_bbox'] = converted
+        return sample
+
+
+@register_op
+class RandomResizeCrop(BaseOperator):
+    """Random resize and crop image and bboxes.
+    Args:
+        resizes (list): resize image to one of resizes. if keep_ratio is True and mode is
+        'long', resize the image's long side to the maximum of target_size, if keep_ratio is
+        True and mode is 'short', resize the image's short side to the minimum of target_size.
+        cropsizes (list): crop sizes after resize, [(min_crop_1, max_crop_1), ...]
+        mode (str): resize mode, `long` or `short`. Details see resizes.
+        prob (float): probability of this op.
+        keep_ratio (bool): whether keep_ratio or not, default true
+        interp (int): the interpolation method
+        thresholds (list): iou thresholds for decide a valid bbox crop.
+        num_attempts (int): number of tries before giving up.
+        allow_no_crop (bool): allow return without actually cropping them.
+        cover_all_box (bool): ensure all bboxes are covered in the final crop.
+        is_mask_crop(bool): whether crop the segmentation.
+    """
+
+    def __init__(
+            self,
+            resizes,
+            cropsizes,
+            prob=0.5,
+            mode='short',
+            keep_ratio=True,
+            interp=cv2.INTER_LINEAR,
+            num_attempts=3,
+            cover_all_box=False,
+            allow_no_crop=False,
+            thresholds=[0.3, 0.5, 0.7],
+            is_mask_crop=False, ):
+        super(RandomResizeCrop, self).__init__()
+
+        self.resizes = resizes
+        self.cropsizes = cropsizes
+        self.prob = prob
+        self.mode = mode
+
+        # The helper ops are never applied directly: the static methods
+        # below borrow their settings and helper methods.
+        self.resizer = Resize(0, keep_ratio=keep_ratio, interp=interp)
+        self.croper = RandomCrop(
+            num_attempts=num_attempts,
+            cover_all_box=cover_all_box,
+            thresholds=thresholds,
+            allow_no_crop=allow_no_crop,
+            is_mask_crop=is_mask_crop)
+
+    def _format_size(self, size):
+        """Expand a single integer size to a (size, size) pair."""
+        if isinstance(size, Integral):
+            size = (size, size)
+        return size
+
+    def apply(self, sample, context=None):
+        """With probability `prob`, resize then randomly crop `sample`."""
+        if random.random() < self.prob:
+            _resize = self._format_size(random.choice(self.resizes))
+            _cropsize = self._format_size(random.choice(self.cropsizes))
+            sample = self._resize(
+                self.resizer,
+                sample,
+                size=_resize,
+                mode=self.mode,
+                context=context)
+            sample = self._random_crop(
+                self.croper, sample, size=_cropsize, context=context)
+        return sample
+
+    @staticmethod
+    def _random_crop(croper, sample, size, context=None):
+        """Crop `sample` to a random window that keeps at least one box.
+
+        Args:
+            croper: op providing `thresholds`, `num_attempts`,
+                `cover_all_box`, `is_mask_crop` and the crop helpers.
+            sample (dict): sample holding 'image' and box annotations.
+            size (tuple): the smaller entry is the minimum and the larger
+                entry the maximum crop side length.
+            context: unused.
+
+        Returns:
+            dict: the cropped sample, or the unchanged sample when there
+            are no boxes or no valid crop was found.
+        """
+        # Without boxes there is nothing to steer the crop; this also
+        # covers samples that carry no 'gt_bbox' key at all.
+        if 'gt_bbox' not in sample or len(sample['gt_bbox']) == 0:
+            return sample
+
+        self = croper
+        h, w = sample['image'].shape[:2]
+        gt_bbox = sample['gt_bbox']
+        cropsize = size
+        min_crop = min(cropsize)
+        max_crop = max(cropsize)
+
+        thresholds = list(self.thresholds)
+        np.random.shuffle(thresholds)
+
+        for thresh in thresholds:
+            found = False
+            for _ in range(self.num_attempts):
+
+                # Clamp the lower bound to the image side so an image
+                # smaller than min_crop does not make randint raise.
+                crop_h = random.randint(min(min_crop, h), min(h, max_crop))
+                crop_w = random.randint(min(min_crop, w), min(w, max_crop))
+
+                crop_y = random.randint(0, h - crop_h)
+                crop_x = random.randint(0, w - crop_w)
+
+                crop_box = [crop_x, crop_y, crop_x + crop_w, crop_y + crop_h]
+                iou = self._iou_matrix(
+                    gt_bbox, np.array(
+                        [crop_box], dtype=np.float32))
+                if iou.max() < thresh:
+                    continue
+
+                if self.cover_all_box and iou.min() < thresh:
+                    continue
+
+                cropped_box, valid_ids = self._crop_box_with_center_constraint(
+                    gt_bbox, np.array(
+                        crop_box, dtype=np.float32))
+                if valid_ids.size > 0:
+                    found = True
+                    break
+
+            if found:
+                if self.is_mask_crop and 'gt_poly' in sample and len(sample[
+                        'gt_poly']) > 0:
+                    crop_polys = self.crop_segms(
+                        sample['gt_poly'],
+                        valid_ids,
+                        np.array(
+                            crop_box, dtype=np.int64),
+                        h,
+                        w)
+                    if [] in crop_polys:
+                        # Drop objects whose polygons vanished in the crop.
+                        delete_id = list()
+                        valid_polys = list()
+                        for id, crop_poly in enumerate(crop_polys):
+                            if crop_poly == []:
+                                delete_id.append(id)
+                            else:
+                                valid_polys.append(crop_poly)
+                        valid_ids = np.delete(valid_ids, delete_id)
+                        if len(valid_polys) == 0:
+                            return sample
+                        sample['gt_poly'] = valid_polys
+                    else:
+                        sample['gt_poly'] = crop_polys
+
+                if 'gt_segm' in sample:
+                    sample['gt_segm'] = self._crop_segm(sample['gt_segm'],
+                                                        crop_box)
+                    sample['gt_segm'] = np.take(
+                        sample['gt_segm'], valid_ids, axis=0)
+
+                sample['image'] = self._crop_image(sample['image'], crop_box)
+                sample['gt_bbox'] = np.take(cropped_box, valid_ids, axis=0)
+                sample['gt_class'] = np.take(
+                    sample['gt_class'], valid_ids, axis=0)
+                if 'gt_score' in sample:
+                    sample['gt_score'] = np.take(
+                        sample['gt_score'], valid_ids, axis=0)
+
+                if 'is_crowd' in sample:
+                    sample['is_crowd'] = np.take(
+                        sample['is_crowd'], valid_ids, axis=0)
+                return sample
+
+        return sample
+
+    @staticmethod
+    def _resize(resizer, sample, size, mode='short', context=None):
+        """Resize the image and every annotation in `sample`.
+
+        Args:
+            resizer: op providing `keep_ratio`, `interp` and the
+                `apply_image` / `apply_bbox` / `apply_segm` helpers.
+            sample (dict): sample holding 'image' and optional annotations.
+            size (tuple): target size; read as (height, width) when
+                `keep_ratio` is off.
+            mode (str): 'long' picks the smaller of the two candidate
+                scales, anything else picks the larger one.
+            context: unused.
+
+        Returns:
+            dict: the same `sample`, updated in place.
+
+        Raises:
+            TypeError: if the image is not a numpy array.
+            ImageError: if the image is not 3-dimensional.
+        """
+        self = resizer
+        im = sample['image']
+        target_size = size
+
+        if not isinstance(im, np.ndarray):
+            raise TypeError("{}: image type is not numpy.".format(self))
+        if len(im.shape) != 3:
+            raise ImageError('{}: image is not 3-dimensional.'.format(self))
+
+        # apply image
+        im_shape = im.shape
+        if self.keep_ratio:
+
+            im_size_min = np.min(im_shape[0:2])
+            im_size_max = np.max(im_shape[0:2])
+
+            target_size_min = np.min(target_size)
+            target_size_max = np.max(target_size)
+
+            if mode == 'long':
+                im_scale = min(target_size_min / im_size_min,
+                               target_size_max / im_size_max)
+            else:
+                im_scale = max(target_size_min / im_size_min,
+                               target_size_max / im_size_max)
+
+            resize_h = im_scale * float(im_shape[0])
+            resize_w = im_scale * float(im_shape[1])
+
+            im_scale_x = im_scale
+            im_scale_y = im_scale
+        else:
+            resize_h, resize_w = target_size
+            im_scale_y = resize_h / im_shape[0]
+            im_scale_x = resize_w / im_shape[1]
+
+        im = self.apply_image(sample['image'], [im_scale_x, im_scale_y])
+        sample['image'] = im
+        sample['im_shape'] = np.asarray([resize_h, resize_w], dtype=np.float32)
+        # Scale factors accumulate across successive resizes.
+        if 'scale_factor' in sample:
+            scale_factor = sample['scale_factor']
+            sample['scale_factor'] = np.asarray(
+                [scale_factor[0] * im_scale_y, scale_factor[1] * im_scale_x],
+                dtype=np.float32)
+        else:
+            sample['scale_factor'] = np.asarray(
+                [im_scale_y, im_scale_x], dtype=np.float32)
+
+        # apply bbox
+        if 'gt_bbox' in sample and len(sample['gt_bbox']) > 0:
+            sample['gt_bbox'] = self.apply_bbox(sample['gt_bbox'],
+                                                [im_scale_x, im_scale_y],
+                                                [resize_w, resize_h])
+
+        # apply rbox
+        if 'gt_rbox2poly' in sample:
+            poly_dim = np.array(sample['gt_rbox2poly']).shape[1]
+            if poly_dim != 8:
+                # Report the offending per-box length (not the number of
+                # boxes) and use the non-deprecated logging call.
+                logger.warning(
+                    "gt_rbox2poly's length shoule be 8, but actually is {}".
+                    format(poly_dim))
+            sample['gt_rbox2poly'] = self.apply_bbox(sample['gt_rbox2poly'],
+                                                     [im_scale_x, im_scale_y],
+                                                     [resize_w, resize_h])
+
+        # apply polygon
+        if 'gt_poly' in sample and len(sample['gt_poly']) > 0:
+            sample['gt_poly'] = self.apply_segm(
+                sample['gt_poly'], im_shape[:2], [im_scale_x, im_scale_y])
+
+        # apply semantic
+        # A plain truth test on a multi-element ndarray raises ValueError,
+        # so check the type and size explicitly.
+        semantic = sample.get('semantic')
+        if isinstance(semantic, np.ndarray) and semantic.size > 0:
+            semantic = cv2.resize(
+                semantic.astype('float32'),
+                None,
+                None,
+                fx=im_scale_x,
+                fy=im_scale_y,
+                interpolation=self.interp)
+            semantic = np.asarray(semantic).astype('int32')
+            semantic = np.expand_dims(semantic, 0)
+            sample['semantic'] = semantic
+
+        # apply gt_segm
+        if 'gt_segm' in sample and len(sample['gt_segm']) > 0:
+            # Nearest-neighbour keeps the masks binary.
+            masks = [
+                cv2.resize(
+                    gt_segm,
+                    None,
+                    None,
+                    fx=im_scale_x,
+                    fy=im_scale_y,
+                    interpolation=cv2.INTER_NEAREST)
+                for gt_segm in sample['gt_segm']
+            ]
+            sample['gt_segm'] = np.asarray(masks).astype(np.uint8)
+
+        return sample
+
+
+@register_op
+class RandomSelect(BaseOperator):
+    """
+    Apply one of two transform pipelines, picked at random per sample:
+    `transforms1` with probability `p`, otherwise `transforms2`.
+
+    The code is based on https://github.com/facebookresearch/detr/blob/main/datasets/transforms.py
+
+    """
+
+    def __init__(self, transforms1, transforms2, p=0.5):
+        super(RandomSelect, self).__init__()
+        # Each transform list is wrapped into a single callable pipeline.
+        self.transforms1 = Compose(transforms1)
+        self.transforms2 = Compose(transforms2)
+        self.p = p
+
+    def apply(self, sample, context=None):
+        """Run the randomly selected pipeline on `sample`."""
+        use_first = random.random() < self.p
+        pipeline = self.transforms1 if use_first else self.transforms2
+        return pipeline(sample)
+
+
+@register_op
+class RandomShortSideResize(BaseOperator):
+    def __init__(self,
+                 short_side_sizes,
+                 max_size=None,
+                 interp=cv2.INTER_LINEAR,
+                 random_interp=False):
+        """
+        Resize the image randomly according to the short side. If max_size is not None,
+        the long side is scaled according to max_size. The whole process will be keep ratio.
+        Args:
+            short_side_sizes (list|tuple): Image target short side size.
+            max_size (int): The size of the longest side of image after resize.
+            interp (int): The interpolation method.
+            random_interp (bool): Whether random select interpolation method.
+        """
+        super(RandomShortSideResize, self).__init__()
+
+        assert isinstance(short_side_sizes,
+                          Sequence), "short_side_sizes must be List or Tuple"
+
+        self.short_side_sizes = short_side_sizes
+        self.max_size = max_size
+        self.interp = interp
+        self.random_interp = random_interp
+        # Candidates used when random_interp is enabled.
+        self.interps = [
+            cv2.INTER_NEAREST,
+            cv2.INTER_LINEAR,
+            cv2.INTER_AREA,
+            cv2.INTER_CUBIC,
+            cv2.INTER_LANCZOS4,
+        ]
+
+    def get_size_with_aspect_ratio(self, image_shape, size, max_size=None):
+        """Compute the output (width, height) for a short side of `size`.
+
+        Args:
+            image_shape (tuple): (height, width) of the input image.
+            size (int): requested length of the short side.
+            max_size (int|None): if given, `size` is reduced so the long
+                side does not exceed it.
+
+        Returns:
+            tuple: (width, height) of the resized image.
+        """
+        h, w = image_shape
+        if max_size is not None:
+            min_original_size = float(min((w, h)))
+            max_original_size = float(max((w, h)))
+            if max_original_size / min_original_size * size > max_size:
+                size = int(
+                    round(max_size * min_original_size / max_original_size))
+
+        # Short side already matches: keep the original size.
+        if (w <= h and w == size) or (h <= w and h == size):
+            return (w, h)
+
+        if w < h:
+            ow = size
+            oh = int(size * h / w)
+        else:
+            oh = size
+            ow = int(size * w / h)
+
+        return (ow, oh)
+
+    def resize(self,
+               sample,
+               target_size,
+               max_size=None,
+               interp=cv2.INTER_LINEAR):
+        """Resize the image and every annotation in `sample`.
+
+        Args:
+            sample (dict): sample holding 'image' and optional annotations.
+            target_size (int): requested short-side length.
+            max_size (int|None): upper bound for the long side.
+            interp (int): interpolation used for the image.
+
+        Returns:
+            dict: the same `sample`, updated in place.
+
+        Raises:
+            TypeError: if the image is not a numpy array.
+            ImageError: if the image is not 3-dimensional.
+        """
+        im = sample['image']
+        if not isinstance(im, np.ndarray):
+            raise TypeError("{}: image type is not numpy.".format(self))
+        if len(im.shape) != 3:
+            raise ImageError('{}: image is not 3-dimensional.'.format(self))
+
+        # From here on target_size is a (width, height) pair.
+        target_size = self.get_size_with_aspect_ratio(im.shape[:2],
+                                                      target_size, max_size)
+        im_scale_y, im_scale_x = target_size[1] / im.shape[0], target_size[
+            0] / im.shape[1]
+
+        sample['image'] = cv2.resize(im, target_size, interpolation=interp)
+        sample['im_shape'] = np.asarray(target_size[::-1], dtype=np.float32)
+        # Scale factors accumulate across successive resizes.
+        if 'scale_factor' in sample:
+            scale_factor = sample['scale_factor']
+            sample['scale_factor'] = np.asarray(
+                [scale_factor[0] * im_scale_y, scale_factor[1] * im_scale_x],
+                dtype=np.float32)
+        else:
+            sample['scale_factor'] = np.asarray(
+                [im_scale_y, im_scale_x], dtype=np.float32)
+
+        # apply bbox
+        if 'gt_bbox' in sample and len(sample['gt_bbox']) > 0:
+            sample['gt_bbox'] = self.apply_bbox(
+                sample['gt_bbox'], [im_scale_x, im_scale_y], target_size)
+        # apply polygon
+        if 'gt_poly' in sample and len(sample['gt_poly']) > 0:
+            sample['gt_poly'] = self.apply_segm(
+                sample['gt_poly'], im.shape[:2], [im_scale_x, im_scale_y])
+        # apply semantic
+        # A plain truth test on a multi-element ndarray raises ValueError,
+        # so check the type and size explicitly.
+        semantic = sample.get('semantic')
+        if isinstance(semantic, np.ndarray) and semantic.size > 0:
+            semantic = cv2.resize(
+                semantic.astype('float32'),
+                target_size,
+                interpolation=self.interp)
+            semantic = np.asarray(semantic).astype('int32')
+            semantic = np.expand_dims(semantic, 0)
+            sample['semantic'] = semantic
+        # apply gt_segm
+        if 'gt_segm' in sample and len(sample['gt_segm']) > 0:
+            # Nearest-neighbour keeps the masks binary.
+            masks = [
+                cv2.resize(
+                    gt_segm, target_size, interpolation=cv2.INTER_NEAREST)
+                for gt_segm in sample['gt_segm']
+            ]
+            sample['gt_segm'] = np.asarray(masks).astype(np.uint8)
+        return sample
+
+    def apply_bbox(self, bbox, scale, size):
+        """Scale boxes in place and clip them to the resized image.
+
+        Args:
+            bbox (np.ndarray): boxes with alternating x/y columns.
+            scale (list): [scale_x, scale_y].
+            size (tuple): (width, height) of the resized image.
+
+        Returns:
+            np.ndarray: the boxes as float32.
+        """
+        im_scale_x, im_scale_y = scale
+        resize_w, resize_h = size
+        bbox[:, 0::2] *= im_scale_x
+        bbox[:, 1::2] *= im_scale_y
+        bbox[:, 0::2] = np.clip(bbox[:, 0::2], 0, resize_w)
+        bbox[:, 1::2] = np.clip(bbox[:, 1::2], 0, resize_h)
+        return bbox.astype('float32')
+
+    def apply_segm(self, segms, im_size, scale):
+        """Scale polygon or RLE segmentations.
+
+        Args:
+            segms (list): per-object segmentations; each entry is either a
+                list of flat polygons or an RLE dict.
+            im_size (tuple): (height, width) of the image before resizing.
+            scale (list): [scale_x, scale_y].
+
+        Returns:
+            list: the resized segmentations, in the same order as `segms`.
+        """
+
+        def _resize_poly(poly, im_scale_x, im_scale_y):
+            resized_poly = np.array(poly).astype('float32')
+            resized_poly[0::2] *= im_scale_x
+            resized_poly[1::2] *= im_scale_y
+            return resized_poly.tolist()
+
+        def _resize_rle(rle, im_h, im_w, im_scale_x, im_scale_y):
+            # `mask_util` is bound by the lazy import in the loop below
+            # before this helper is ever called.
+            if 'counts' in rle and isinstance(rle['counts'], list):
+                rle = mask_util.frPyObjects(rle, im_h, im_w)
+
+            mask = mask_util.decode(rle)
+            mask = cv2.resize(
+                mask,
+                None,
+                None,
+                fx=im_scale_x,
+                fy=im_scale_y,
+                interpolation=self.interp)
+            rle = mask_util.encode(np.array(mask, order='F', dtype=np.uint8))
+            return rle
+
+        im_h, im_w = im_size
+        im_scale_x, im_scale_y = scale
+        resized_segms = []
+        for segm in segms:
+            if is_poly(segm):
+                # Polygon format
+                resized_segms.append([
+                    _resize_poly(poly, im_scale_x, im_scale_y) for poly in segm
+                ])
+            else:
+                # RLE format
+                import pycocotools.mask as mask_util
+                resized_segms.append(
+                    _resize_rle(segm, im_h, im_w, im_scale_x, im_scale_y))
+
+        return resized_segms
+
+    def apply(self, sample, context=None):
+        """Resize `sample` to a randomly chosen short-side length."""
+        target_size = random.choice(self.short_side_sizes)
+        interp = random.choice(
+            self.interps) if self.random_interp else self.interp
+
+        return self.resize(sample, target_size, self.max_size, interp)
+
+
+@register_op
+class RandomSizeCrop(BaseOperator):
+    """
+    Cut the image randomly according to `min_size` and `max_size`
+    """
+
+    def __init__(self, min_size, max_size):
+        super(RandomSizeCrop, self).__init__()
+        self.min_size = min_size
+        self.max_size = max_size
+
+        # Imported lazily and kept on the instance for use in crop().
+        from paddle.vision.transforms.functional import crop as paddle_crop
+        self.paddle_crop = paddle_crop
+
+    @staticmethod
+    def get_crop_params(img_shape, output_size):
+        """Get parameters for ``crop`` for a random crop.
+        Args:
+            img_shape (list|tuple): Image's height and width.
+            output_size (list|tuple): Expected output size of the crop.
+        Returns:
+            tuple: params (i, j, h, w) to be passed to ``crop`` for random crop.
+        Raises:
+            ValueError: if the requested crop exceeds the image by more
+                than one pixel in either dimension.
+        """
+        h, w = img_shape
+        th, tw = output_size
+
+        if h + 1 < th or w + 1 < tw:
+            raise ValueError(
+                "Required crop size {} is larger then input image size {}".
+                format((th, tw), (h, w)))
+
+        if w == tw and h == th:
+            return 0, 0, h, w
+
+        # random.randint includes its upper bound, so the largest valid
+        # start is `h - th` (the previous `+ 1` let the window overrun the
+        # image by one pixel). The clamp covers the th == h + 1 case that
+        # the check above lets through.
+        i = random.randint(0, max(h - th, 0))
+        j = random.randint(0, max(w - tw, 0))
+        return i, j, th, tw
+
+    def crop(self, sample, region):
+        """Crop the image and all annotations of `sample` to `region`.
+
+        Args:
+            sample (dict): sample holding 'image' and optional annotations.
+            region (tuple): (top, left, height, width) of the crop window.
+
+        Returns:
+            dict: the same `sample`, updated in place. Objects whose box
+            has no area left after cropping are removed.
+        """
+        image_shape = sample['image'].shape[:2]
+        sample['image'] = self.paddle_crop(sample['image'], *region)
+
+        keep_index = None
+        # apply bbox
+        if 'gt_bbox' in sample and len(sample['gt_bbox']) > 0:
+            sample['gt_bbox'] = self.apply_bbox(sample['gt_bbox'], region)
+            bbox = sample['gt_bbox'].reshape([-1, 2, 2])
+            area = (bbox[:, 1, :] - bbox[:, 0, :]).prod(axis=1)
+            keep_index = np.where(area > 0)[0]
+            sample['gt_bbox'] = sample['gt_bbox'][keep_index] if len(
+                keep_index) > 0 else np.zeros(
+                    [0, 4], dtype=np.float32)
+            sample['gt_class'] = sample['gt_class'][keep_index] if len(
+                keep_index) > 0 else np.zeros(
+                    [0, 1], dtype=np.float32)
+            if 'gt_score' in sample:
+                sample['gt_score'] = sample['gt_score'][keep_index] if len(
+                    keep_index) > 0 else np.zeros(
+                        [0, 1], dtype=np.float32)
+            if 'is_crowd' in sample:
+                sample['is_crowd'] = sample['is_crowd'][keep_index] if len(
+                    keep_index) > 0 else np.zeros(
+                        [0, 1], dtype=np.float32)
+
+        # apply polygon
+        if 'gt_poly' in sample and len(sample['gt_poly']) > 0:
+            sample['gt_poly'] = self.apply_segm(sample['gt_poly'], region,
+                                                image_shape)
+            if keep_index is not None:
+                # apply_segm returns a Python list, which cannot be
+                # indexed with a NumPy index array.
+                sample['gt_poly'] = [
+                    sample['gt_poly'][idx] for idx in keep_index
+                ]
+        # apply gt_segm
+        if 'gt_segm' in sample and len(sample['gt_segm']) > 0:
+            i, j, h, w = region
+            sample['gt_segm'] = sample['gt_segm'][:, i:i + h, j:j + w]
+            if keep_index is not None:
+                sample['gt_segm'] = sample['gt_segm'][keep_index]
+
+        return sample
+
+    def apply_bbox(self, bbox, region):
+        """Shift boxes into crop coordinates and clip them to the window.
+
+        Args:
+            bbox (np.ndarray): boxes as (x1, y1, x2, y2) rows.
+            region (tuple): (top, left, height, width) of the crop window.
+
+        Returns:
+            np.ndarray: float32 array of shape (N, 4).
+        """
+        i, j, h, w = region
+        region_size = np.asarray([w, h])
+        crop_bbox = bbox - np.asarray([j, i, j, i])
+        crop_bbox = np.minimum(crop_bbox.reshape([-1, 2, 2]), region_size)
+        crop_bbox = crop_bbox.clip(min=0)
+        return crop_bbox.reshape([-1, 4]).astype('float32')
+
+    def apply_segm(self, segms, region, image_shape):
+        """Crop polygon or RLE segmentations to `region`.
+
+        Args:
+            segms (list): per-object segmentations; each entry is either a
+                list of flat polygons or an RLE dict.
+            region (tuple): (top, left, height, width) of the crop window.
+            image_shape (tuple): (height, width) of the uncropped image.
+
+        Returns:
+            list: one cropped segmentation per input entry; a polygon
+            entry may become an empty list.
+        """
+
+        def _crop_poly(segm, crop):
+            # Polygon/MultiPolygon/GeometryCollection/shapely are bound by
+            # the lazy imports in the loop below before this is called.
+            xmin, ymin, xmax, ymax = crop
+            crop_coord = [xmin, ymin, xmin, ymax, xmax, ymax, xmax, ymin]
+            crop_p = np.array(crop_coord).reshape(4, 2)
+            crop_p = Polygon(crop_p)
+
+            crop_segm = list()
+            for poly in segm:
+                poly = np.array(poly).reshape(len(poly) // 2, 2)
+                polygon = Polygon(poly)
+                if not polygon.is_valid:
+                    # Rebuild an invalid polygon from its own outline.
+                    exterior = polygon.exterior
+                    multi_lines = exterior.intersection(exterior)
+                    polygons = list(shapely.ops.polygonize(multi_lines))
+                    polygon = MultiPolygon(polygons)
+                # Multi-part geometries are walked through `.geoms`, which
+                # works on Shapely 1.x and 2.x (direct iteration was
+                # removed in 2.0). No copy is needed: intersection()
+                # returns a new geometry.
+                if isinstance(polygon, MultiPolygon):
+                    pieces = list(polygon.geoms)
+                else:
+                    pieces = [polygon]
+                for per_polygon in pieces:
+                    inter = per_polygon.intersection(crop_p)
+                    if inter.is_empty:
+                        continue
+                    if isinstance(inter, (MultiPolygon, GeometryCollection)):
+                        for part in inter.geoms:
+                            if not isinstance(part, Polygon):
+                                continue
+                            part = np.squeeze(
+                                np.array(part.exterior.coords[:-1]).reshape(
+                                    1, -1))
+                            part[0::2] -= xmin
+                            part[1::2] -= ymin
+                            crop_segm.append(part.tolist())
+                    elif isinstance(inter, Polygon):
+                        crop_poly = np.squeeze(
+                            np.array(inter.exterior.coords[:-1]).reshape(1,
+                                                                         -1))
+                        crop_poly[0::2] -= xmin
+                        crop_poly[1::2] -= ymin
+                        crop_segm.append(crop_poly.tolist())
+                    else:
+                        continue
+            return crop_segm
+
+        def _crop_rle(rle, crop, height, width):
+            # `mask_util` is bound by the lazy import in the loop below.
+            if 'counts' in rle and isinstance(rle['counts'], list):
+                rle = mask_util.frPyObjects(rle, height, width)
+            mask = mask_util.decode(rle)
+            mask = mask[crop[1]:crop[3], crop[0]:crop[2]]
+            rle = mask_util.encode(np.array(mask, order='F', dtype=np.uint8))
+            return rle
+
+        i, j, h, w = region
+        # (xmin, ymin, xmax, ymax) form of the crop window.
+        crop = [j, i, j + w, i + h]
+        height, width = image_shape
+        crop_segms = []
+        for segm in segms:
+            if is_poly(segm):
+                import shapely.ops
+                from shapely.geometry import Polygon, MultiPolygon, GeometryCollection
+                # Polygon format
+                crop_segms.append(_crop_poly(segm, crop))
+            else:
+                # RLE format
+                import pycocotools.mask as mask_util
+                crop_segms.append(_crop_rle(segm, crop, height, width))
+        return crop_segms
+
+    def apply(self, sample, context=None):
+        """Crop `sample` to a random window.
+
+        Each side length is drawn between `min_size` and the smaller of
+        `max_size` and the corresponding image side.
+        """
+        h = random.randint(self.min_size,
+                           min(sample['image'].shape[0], self.max_size))
+        w = random.randint(self.min_size,
+                           min(sample['image'].shape[1], self.max_size))
+
+        region = self.get_crop_params(sample['image'].shape[:2], [h, w])
+        return self.crop(sample, region)
+
+
+@register_op
+class WarpAffine(BaseOperator):
+    def __init__(self,
+                 keep_res=False,
+                 pad=31,
+                 input_h=512,
+                 input_w=512,
+                 scale=0.4,
+                 shift=0.1):
+        """WarpAffine
+        Warp the image with an affine transform onto a fixed-size canvas.
+
+        The code is based on https://github.com/xingyizhou/CenterNet/blob/master/src/lib/datasets/sample/ctdet.py
+
+        Args:
+            keep_res (bool): derive the output size from the image size
+                and `pad` instead of using `input_h` / `input_w`.
+            pad (int): bit mask OR-ed into the image sides when `keep_res`
+                is on.
+            input_h (int): output height when `keep_res` is off.
+            input_w (int): output width when `keep_res` is off.
+            scale (float): stored on the instance; not read by `apply`.
+            shift (float): stored on the instance; not read by `apply`.
+        """
+        super(WarpAffine, self).__init__()
+        self.keep_res = keep_res
+        self.pad = pad
+        self.input_h = input_h
+        self.input_w = input_w
+        self.scale = scale
+        self.shift = shift
+
+    def apply(self, sample, context=None):
+        """Replace `sample['image']` with its warped version.
+
+        A sample whose 'gt_bbox' is present but empty is returned as is.
+        """
+        img = cv2.cvtColor(sample['image'], cv2.COLOR_RGB2BGR)
+        # NOTE(review): on this early return the converted image is
+        # discarded and the sample keeps its original, unwarped image --
+        # confirm this is intended.
+        if 'gt_bbox' in sample and len(sample['gt_bbox']) == 0:
+            return sample
+
+        h, w = img.shape[:2]
+
+        if not self.keep_res:
+            s = max(h, w) * 1.0
+            input_h, input_w = self.input_h, self.input_w
+            c = np.array([w / 2., h / 2.], dtype=np.float32)
+        else:
+            # `(x | pad) + 1` rounds up to a multiple of pad + 1 when
+            # pad + 1 is a power of two (as with the default 31).
+            input_h = (h | self.pad) + 1
+            input_w = (w | self.pad) + 1
+            s = np.array([input_w, input_h], dtype=np.float32)
+            c = np.array([w // 2, h // 2], dtype=np.float32)
+
+        out_size = [input_w, input_h]
+        trans_input = get_affine_transform(c, s, 0, out_size)
+        # NOTE(review): resizing to (w, h) keeps the current size.
+        img = cv2.resize(img, (w, h))
+        warped = cv2.warpAffine(
+            img, trans_input, (input_w, input_h), flags=cv2.INTER_LINEAR)
+        sample['image'] = warped
+        return sample
+
+
+@register_op
+class FlipWarpAffine(BaseOperator):
+    def __init__(self,
+                 keep_res=False,
+                 pad=31,
+                 input_h=512,
+                 input_w=512,
+                 not_rand_crop=False,
+                 scale=0.4,
+                 shift=0.1,
+                 flip=0.5,
+                 is_scale=True,
+                 use_random=True):
+        """FlipWarpAffine
+        1. Random Crop
+        2. Flip the image horizontal
+        3. Warp affine the image
+
+        Args:
+            keep_res (bool): If True, the output size is derived from the
+                image size as ``(dim | pad) + 1``; otherwise the fixed
+                ``input_h`` x ``input_w`` is used.
+            pad (int): Value OR-ed into the image size when ``keep_res``
+                is True.
+            input_h (int): Output height when ``keep_res`` is False.
+            input_w (int): Output width when ``keep_res`` is False.
+            not_rand_crop (bool): If False, the scale is multiplied by a
+                factor chosen from ``np.arange(0.6, 1.4, 0.1)`` and the
+                center is drawn inside the borders given by
+                ``get_border(128, ...)``. If True, center and scale are
+                jittered with clipped Gaussian noise controlled by
+                ``shift`` and ``scale``.
+            scale (float): Scale jitter strength (``not_rand_crop`` branch).
+            shift (float): Center jitter strength (``not_rand_crop`` branch).
+            flip (float): Probability of a horizontal flip.
+            is_scale (bool): If True, the output image is cast to float32
+                and divided by 255.
+            use_random (bool): If False, no random crop, jitter or flip is
+                applied.
+        """
+        super(FlipWarpAffine, self).__init__()
+        self.keep_res = keep_res
+        self.pad = pad
+        self.input_h = input_h
+        self.input_w = input_w
+        self.not_rand_crop = not_rand_crop
+        self.scale = scale
+        self.shift = shift
+        self.flip = flip
+        self.is_scale = is_scale
+        self.use_random = use_random
+
+    def apply(self, sample, context=None):
+        """Randomly crop/flip and affine-warp ``sample['image']``.
+
+        Samples that carry an empty ``gt_bbox`` are returned untouched.
+        Otherwise ``sample['image']`` is replaced by the warped image and
+        ``sample['center']`` / ``sample['scale']`` record the transform
+        parameters. On a flip the x coordinates of ``sample['gt_bbox']``
+        are mirrored in place.
+        """
+        # Check the early-exit condition first so that no image work is
+        # done for samples that are returned unchanged anyway.
+        if 'gt_bbox' in sample and len(sample['gt_bbox']) == 0:
+            return sample
+
+        img = cv2.cvtColor(sample['image'], cv2.COLOR_RGB2BGR)
+        h, w = img.shape[:2]
+
+        if self.keep_res:
+            input_h = (h | self.pad) + 1
+            input_w = (w | self.pad) + 1
+            s = np.array([input_w, input_h], dtype=np.float32)
+            c = np.array([w // 2, h // 2], dtype=np.float32)
+
+        else:
+            s = max(h, w) * 1.0
+            input_h, input_w = self.input_h, self.input_w
+            c = np.array([w / 2., h / 2.], dtype=np.float32)
+
+        if self.use_random:
+            gt_bbox = sample['gt_bbox']
+            if not self.not_rand_crop:
+                # Random crop: rescale, then pick a center inside borders.
+                s = s * np.random.choice(np.arange(0.6, 1.4, 0.1))
+                w_border = get_border(128, w)
+                h_border = get_border(128, h)
+                c[0] = np.random.randint(low=w_border, high=w - w_border)
+                c[1] = np.random.randint(low=h_border, high=h - h_border)
+            else:
+                # Jitter center and scale with Gaussian noise clipped to
+                # +/- 2 * shift and [1 - scale, 1 + scale] respectively.
+                sf = self.scale
+                cf = self.shift
+                c[0] += s * np.clip(np.random.randn() * cf, -2 * cf, 2 * cf)
+                c[1] += s * np.clip(np.random.randn() * cf, -2 * cf, 2 * cf)
+                s = s * np.clip(np.random.randn() * sf + 1, 1 - sf, 1 + sf)
+
+            if np.random.random() < self.flip:
+                img = img[:, ::-1, :]
+                c[0] = w - c[0] - 1
+                # Copy first: x1 and x2 swap roles after mirroring.
+                oldx1 = gt_bbox[:, 0].copy()
+                oldx2 = gt_bbox[:, 2].copy()
+                gt_bbox[:, 0] = w - oldx2 - 1
+                gt_bbox[:, 2] = w - oldx1 - 1
+            sample['gt_bbox'] = gt_bbox
+
+        trans_input = get_affine_transform(c, s, 0, [input_w, input_h])
+        if not self.use_random:
+            img = cv2.resize(img, (w, h))
+        inp = cv2.warpAffine(
+            img, trans_input, (input_w, input_h), flags=cv2.INTER_LINEAR)
+        if self.is_scale:
+            inp = (inp.astype(np.float32) / 255.)
+        sample['image'] = inp
+        sample['center'] = c
+        sample['scale'] = s
+        return sample
+
+
+@register_op
+class CenterRandColor(BaseOperator):
+    """Random color for CenterNet series models.
+
+    Brightness, contrast and saturation distortions are applied in a random
+    order. The image is modified in place, so it must have a floating-point
+    dtype that supports in-place multiplication by a float.
+
+    Args:
+        saturation (float): saturation settings.
+        contrast (float): contrast settings.
+        brightness (float): brightness settings.
+    """
+
+    def __init__(self, saturation=0.4, contrast=0.4, brightness=0.4):
+        super(CenterRandColor, self).__init__()
+        self.saturation = saturation
+        self.contrast = contrast
+        self.brightness = brightness
+
+    def apply_saturation(self, img, img_gray):
+        """Blend ``img`` with its gray version using a random weight."""
+        alpha = 1. + np.random.uniform(
+            low=-self.saturation, high=self.saturation)
+        self._blend(alpha, img, img_gray[:, :, None])
+        return img
+
+    def apply_contrast(self, img, img_gray):
+        """Blend ``img`` with the mean gray level using a random weight."""
+        alpha = 1. + np.random.uniform(low=-self.contrast, high=self.contrast)
+        img_mean = img_gray.mean()
+        self._blend(alpha, img, img_mean)
+        return img
+
+    def apply_brightness(self, img, img_gray):
+        """Scale ``img`` in place by a random factor; ``img_gray`` is unused."""
+        alpha = 1 + np.random.uniform(
+            low=-self.brightness, high=self.brightness)
+        img *= alpha
+        return img
+
+    def _blend(self, alpha, img, img_mean):
+        """Set ``img`` to ``alpha * img + (1 - alpha) * img_mean`` in place.
+
+        ``img_mean`` is left untouched. It may be a view of the shared gray
+        image (see ``apply_saturation``); scaling it in place would corrupt
+        the gray image that a later ``apply_contrast`` call takes its mean
+        from, making the result depend on the distortion order.
+        """
+        img *= alpha
+        img += img_mean * (1 - alpha)
+
+    def __call__(self, sample, context=None):
+        """Apply the three distortions to ``sample['image']`` in random order."""
+        img = sample['image']
+        # The gray image is computed once, before any distortion is applied.
+        img_gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
+        functions = [
+            self.apply_brightness,
+            self.apply_contrast,
+            self.apply_saturation,
+        ]
+        distortions = np.random.permutation(functions)
+        for func in distortions:
+            img = func(img, img_gray)
+        sample['image'] = img
+        return sample

+ 30 - 0
paddlers/models/ppdet/engine/__init__.py

@@ -0,0 +1,30 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from . import trainer
+from .trainer import *
+
+from . import callbacks
+from .callbacks import *
+
+from . import env
+from .env import *
+
+__all__ = trainer.__all__ \
+        + callbacks.__all__ \
+        + env.__all__
+
+from . import tracker
+from .tracker import *
+__all__ = __all__ + tracker.__all__

+ 340 - 0
paddlers/models/ppdet/engine/callbacks.py

@@ -0,0 +1,340 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import os
+import sys
+import datetime
+import six
+import copy
+import json
+
+import paddle
+import paddle.distributed as dist
+
+from paddlers.models.ppdet.utils.checkpoint import save_model
+from paddlers.models.ppdet.metrics import get_infer_results
+
+from paddlers.models.ppdet.utils.logger import setup_logger
+logger = setup_logger('ppdet.engine')
+
+__all__ = [
+    'Callback', 'ComposeCallback', 'LogPrinter', 'Checkpointer',
+    'VisualDLWriter', 'SniperProposalsGenerator'
+]
+
+
+class Callback(object):
+    """Base class for engine callbacks.
+
+    Every hook receives a ``status`` mapping and does nothing by default;
+    subclasses override the hooks they need.
+    """
+
+    def __init__(self, model):
+        self.model = model
+
+    def on_step_begin(self, status):
+        """No-op by default."""
+
+    def on_step_end(self, status):
+        """No-op by default."""
+
+    def on_epoch_begin(self, status):
+        """No-op by default."""
+
+    def on_epoch_end(self, status):
+        """No-op by default."""
+
+    def on_train_begin(self, status):
+        """No-op by default."""
+
+    def on_train_end(self, status):
+        """No-op by default."""
+
+
+class ComposeCallback(object):
+    """Forward each hook call to a list of callbacks, in list order.
+
+    ``None`` entries in ``callbacks`` are dropped; every remaining entry
+    must be a ``Callback`` instance.
+    """
+
+    def __init__(self, callbacks):
+        active = list(filter(lambda cb: cb is not None, callbacks))
+        for cb in active:
+            assert isinstance(
+                cb, Callback), "callback should be subclass of Callback"
+        self._callbacks = active
+
+    def _notify(self, hook_name, status):
+        # Look the hook up on each callback and invoke it with ``status``.
+        for cb in self._callbacks:
+            getattr(cb, hook_name)(status)
+
+    def on_step_begin(self, status):
+        self._notify('on_step_begin', status)
+
+    def on_step_end(self, status):
+        self._notify('on_step_end', status)
+
+    def on_epoch_begin(self, status):
+        self._notify('on_epoch_begin', status)
+
+    def on_epoch_end(self, status):
+        self._notify('on_epoch_end', status)
+
+    def on_train_begin(self, status):
+        self._notify('on_train_begin', status)
+
+    def on_train_end(self, status):
+        self._notify('on_train_end', status)
+
+
+class LogPrinter(Callback):
+    """Callback that writes training progress and evaluation summaries to
+    the module logger.
+
+    Nothing is logged on ranks other than 0 when more than one process is
+    running.
+    """
+
+    def __init__(self, model):
+        super(LogPrinter, self).__init__(model)
+
+    def on_step_end(self, status):
+        """Log a progress line every ``cfg.log_iter`` training steps, or an
+        iteration counter every 100 evaluation steps."""
+        if dist.get_world_size() >= 2 and dist.get_rank() != 0:
+            return
+
+        mode = status['mode']
+        if mode == 'train':
+            epoch_id = status['epoch_id']
+            step_id = status['step_id']
+            steps_per_epoch = status['steps_per_epoch']
+            training_staus = status['training_staus']
+            batch_time = status['batch_time']
+            data_time = status['data_time']
+
+            cfg = self.model.cfg
+            epoches = cfg.epoch
+            reader_name = '{}Reader'.format(mode.capitalize())
+            batch_size = cfg[reader_name]['batch_size']
+
+            logs = training_staus.log()
+            # Pad the step counter to the width of the per-epoch step count.
+            step_width = len(str(steps_per_epoch))
+            space_fmt = ':' + str(step_width) + 'd'
+            if step_id % cfg.log_iter == 0:
+                remaining = (epoches - epoch_id) * steps_per_epoch - step_id
+                eta_sec = remaining * batch_time.global_avg
+                eta_str = str(datetime.timedelta(seconds=int(eta_sec)))
+                ips = float(batch_size) / batch_time.avg
+                pieces = (
+                    'Epoch: [{}]',
+                    '[{' + space_fmt + '}/{}]',
+                    'learning_rate: {lr:.6f}',
+                    '{meters}',
+                    'eta: {eta}',
+                    'batch_cost: {btime}',
+                    'data_cost: {dtime}',
+                    'ips: {ips:.4f} images/s', )
+                message = ' '.join(pieces).format(
+                    epoch_id,
+                    step_id,
+                    steps_per_epoch,
+                    lr=status['learning_rate'],
+                    meters=logs,
+                    eta=eta_str,
+                    btime=str(batch_time),
+                    dtime=str(data_time),
+                    ips=ips)
+                logger.info(message)
+        elif mode == 'eval':
+            step_id = status['step_id']
+            if step_id % 100 == 0:
+                logger.info("Eval iter: {}".format(step_id))
+
+    def on_epoch_end(self, status):
+        """Log the sample count and throughput after an evaluation pass."""
+        if dist.get_world_size() >= 2 and dist.get_rank() != 0:
+            return
+        if status['mode'] != 'eval':
+            return
+
+        sample_num = status['sample_num']
+        cost_time = status['cost_time']
+        fps = sample_num / cost_time
+        logger.info('Total sample number: {}, averge FPS: {}'.format(
+            sample_num, fps))
+
+
+class Checkpointer(Callback):
+    """Callback that saves model weights at the end of an epoch.
+
+    During training a snapshot is written every ``cfg.snapshot_epoch``
+    epochs and at the final epoch (named ``model_final``). During
+    evaluation, when ``status['save_best_model']`` is set, the weights are
+    saved as ``best_model`` whenever a metric beats the best value so far.
+    """
+
+    def __init__(self, model):
+        super(Checkpointer, self).__init__(model)
+        self.best_ap = 0.
+        self.save_dir = os.path.join(self.model.cfg.save_dir,
+                                     self.model.cfg.filename)
+        # Save the student network when the model wraps one.
+        if hasattr(self.model.model, 'student_model'):
+            self.weight = self.model.model.student_model
+        else:
+            self.weight = self.model.model
+
+    def on_epoch_end(self, status):
+        """Save a snapshot or the best model, on rank 0 only."""
+        # Checkpointer only performed during training
+        mode = status['mode']
+        epoch_id = status['epoch_id']
+        weight = None
+        save_name = None
+        if dist.get_world_size() < 2 or dist.get_rank() == 0:
+            if mode == 'train':
+                end_epoch = self.model.cfg.epoch
+                if (
+                        epoch_id + 1
+                ) % self.model.cfg.snapshot_epoch == 0 or epoch_id == end_epoch - 1:
+                    save_name = str(
+                        epoch_id
+                    ) if epoch_id != end_epoch - 1 else "model_final"
+                    weight = self.weight
+            elif mode == 'eval':
+                if 'save_best_model' in status and status['save_best_model']:
+                    for metric in self.model._metrics:
+                        map_res = metric.get_results()
+                        # Prefer bbox results, then keypoint, then mask.
+                        if 'bbox' in map_res:
+                            key = 'bbox'
+                        elif 'keypoint' in map_res:
+                            key = 'keypoint'
+                        else:
+                            key = 'mask'
+                        if key not in map_res:
+                            logger.warning("Evaluation results empty, this may be due to " \
+                                        "training iterations being too few or not " \
+                                        "loading the correct weights.")
+                            return
+                        if map_res[key][0] > self.best_ap:
+                            self.best_ap = map_res[key][0]
+                            save_name = 'best_model'
+                            weight = self.weight
+                        logger.info("Best test {} ap is {:0.3f}.".format(
+                            key, self.best_ap))
+            # Explicit None check: the decision to save must not depend on
+            # the truthiness of the model object itself.
+            if weight is not None:
+                save_model(weight, self.model.optimizer, self.save_dir,
+                           save_name, epoch_id + 1)
+
+
+class WiferFaceEval(Callback):
+    """Evaluation-only callback: updates every metric with the network at
+    the start of the epoch, then exits the process."""
+
+    def __init__(self, model):
+        super(WiferFaceEval, self).__init__(model)
+
+    def on_epoch_begin(self, status):
+        engine = self.model
+        assert engine.mode == 'eval', \
+            "WiferFaceEval can only be set during evaluation"
+        for evaluator in engine._metrics:
+            evaluator.update(engine.model)
+        # The process terminates here; nothing after this hook runs.
+        sys.exit()
+
+
+class VisualDLWriter(Callback):
+    """
+    Use VisualDL to log data or image
+
+    Scalars are written during training and evaluation, images during
+    testing. Only rank 0 writes when more than one process is running.
+    """
+
+    def __init__(self, model):
+        super(VisualDLWriter, self).__init__(model)
+
+        assert six.PY3, "VisualDL requires Python >= 3.5"
+        try:
+            from visualdl import LogWriter
+        except Exception as e:
+            logger.error('visualdl not found, plaese install visualdl. '
+                         'for example: `pip install visualdl`.')
+            raise e
+        log_dir = model.cfg.get('vdl_log_dir', 'vdl_log_dir/scalar')
+        self.vdl_writer = LogWriter(log_dir)
+        # Independent step counters for each kind of record.
+        self.vdl_loss_step = 0
+        self.vdl_mAP_step = 0
+        self.vdl_image_step = 0
+        self.vdl_image_frame = 0
+
+    def on_step_end(self, status):
+        """Write loss scalars (train) or original/result images (test)."""
+        mode = status['mode']
+        if dist.get_world_size() >= 2 and dist.get_rank() != 0:
+            return
+
+        writer = self.vdl_writer
+        if mode == 'train':
+            training_staus = status['training_staus']
+            for loss_name, loss_value in training_staus.get().items():
+                writer.add_scalar(loss_name, loss_value, self.vdl_loss_step)
+                self.vdl_loss_step += 1
+        elif mode == 'test':
+            ori_image = status['original_image']
+            result_image = status['result_image']
+            frame = self.vdl_image_frame
+            writer.add_image("original/frame_{}".format(frame), ori_image,
+                             self.vdl_image_step)
+            writer.add_image("result/frame_{}".format(frame), result_image,
+                             self.vdl_image_step)
+            self.vdl_image_step += 1
+            # each frame can display ten pictures at most.
+            if self.vdl_image_step % 10 == 0:
+                self.vdl_image_step = 0
+                self.vdl_image_frame += 1
+
+    def on_epoch_end(self, status):
+        """Write the first value of every metric result after evaluation."""
+        mode = status['mode']
+        if dist.get_world_size() >= 2 and dist.get_rank() != 0:
+            return
+        if mode != 'eval':
+            return
+
+        for metric in self.model._metrics:
+            for key, map_value in metric.get_results().items():
+                tag = "{}-mAP".format(key)
+                self.vdl_writer.add_scalar(tag, map_value[0],
+                                           self.vdl_mAP_step)
+        self.vdl_mAP_step += 1
+
+
+class SniperProposalsGenerator(Callback):
+    """Callback that runs inference on cropped chips when training ends and
+    dumps the resulting boxes as JSON to ``cfg.proposals_path``."""
+
+    def __init__(self, model):
+        super(SniperProposalsGenerator, self).__init__(model)
+        ori_dataset = self.model.dataset
+        self.dataset = self._create_new_dataset(ori_dataset)
+        self.loader = self.model.loader
+        self.cfg = self.model.cfg
+        self.infer_model = self.model.model
+
+    def _create_new_dataset(self, ori_dataset):
+        """Return a deep copy of ``ori_dataset`` whose roidbs are replaced by
+        the inference crops produced by its annotation cropper."""
+        dataset = copy.deepcopy(ori_dataset)
+        # init anno_cropper
+        dataset.init_anno_cropper()
+        # generate infer roidbs
+        ori_roidbs = dataset.get_ori_roidbs()
+        roidbs = dataset.anno_cropper.crop_infer_anno_records(ori_roidbs)
+        # set new roidbs
+        dataset.set_roidbs(roidbs)
+
+        return dataset
+
+    def _eval_with_loader(self, loader):
+        """Run the model over ``loader`` without gradients and return one
+        output dict per batch, with tensor values converted via ``numpy()``."""
+        results = []
+        with paddle.no_grad():
+            self.infer_model.eval()
+            for data in loader:
+                outs = self.infer_model(data)
+                # Carry the image metadata along with the predictions.
+                for key in ['im_shape', 'scale_factor', 'im_id']:
+                    outs[key] = data[key]
+                for key, value in outs.items():
+                    if hasattr(value, 'numpy'):
+                        outs[key] = value.numpy()
+
+                results.append(outs)
+
+        return results
+
+    def on_train_end(self, status):
+        """Generate proposals with the trained model and save them as JSON."""
+        self.loader.dataset = self.dataset
+        results = self._eval_with_loader(self.loader)
+        results = self.dataset.anno_cropper.aggregate_chips_detections(results)
+        # sniper
+        proposals = []
+        clsid2catid = {v: k for k, v in self.dataset.catid2clsid.items()}
+        for outs in results:
+            batch_res = get_infer_results(outs, clsid2catid)
+            bbox_num = outs['bbox_num']
+            start = 0
+            for i, im_id in enumerate(outs['im_id']):
+                end = start + bbox_num[i]
+                bbox_res = batch_res['bbox'][start:end] \
+                    if 'bbox' in batch_res else None
+                if bbox_res:
+                    proposals += bbox_res
+                # Advance to the next image's slice; without this every
+                # image would be sliced from offset 0.
+                start = end
+        logger.info("save proposals in {}".format(self.cfg.proposals_path))
+        with open(self.cfg.proposals_path, 'w') as f:
+            json.dump(proposals, f)

+ 50 - 0
paddlers/models/ppdet/engine/env.py

@@ -0,0 +1,50 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import os
+import random
+import numpy as np
+
+import paddle
+from paddle.distributed import fleet
+
+__all__ = ['init_parallel_env', 'set_random_seed', 'init_fleet_env']
+
+
+def init_fleet_env(find_unused_parameters=False):
+    """Initialise fleet in collective mode.
+
+    Args:
+        find_unused_parameters (bool): Value assigned to the distributed
+            strategy's ``find_unused_parameters`` attribute.
+    """
+    dist_strategy = fleet.DistributedStrategy()
+    dist_strategy.find_unused_parameters = find_unused_parameters
+    fleet.init(strategy=dist_strategy, is_collective=True)
+
+
+def init_parallel_env():
+    """Seed the per-process RNGs and initialise Paddle's parallel environment.
+
+    When both ``PADDLE_TRAINER_ID`` and ``PADDLE_TRAINERS_NUM`` are set,
+    Python's ``random`` and NumPy are seeded with ``99 + trainer id`` so
+    each trainer gets a different seed.
+    """
+    environ = os.environ
+    if 'PADDLE_TRAINER_ID' in environ and 'PADDLE_TRAINERS_NUM' in environ:
+        local_seed = 99 + int(environ['PADDLE_TRAINER_ID'])
+        random.seed(local_seed)
+        np.random.seed(local_seed)
+
+    paddle.distributed.init_parallel_env()
+
+
+def set_random_seed(seed):
+    """Seed Paddle, Python's ``random`` and NumPy, in that order, with ``seed``."""
+    for seed_fn in (paddle.seed, random.seed, np.random.seed):
+        seed_fn(seed)

+ 177 - 0
paddlers/models/ppdet/engine/export_utils.py

@@ -0,0 +1,177 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import os
+import yaml
+from collections import OrderedDict
+
+import paddle
+from paddlers.models.ppdet.data.source.category import get_categories
+
+from paddlers.models.ppdet.utils.logger import setup_logger
+logger = setup_logger('ppdet.engine')
+
+# Global dictionary
+# Maps an architecture-name fragment to the value exported as
+# `min_subgraph_size` in the inference config. _dump_infer_config walks the
+# entries in order and uses the first key that is a substring of
+# config['architecture'], so entry order matters.
+# NOTE(review): "TRT" presumably refers to TensorRT -- confirm.
+TRT_MIN_SUBGRAPH = {
+    'YOLO': 3,
+    'SSD': 60,
+    'RCNN': 40,
+    'RetinaNet': 40,
+    'S2ANet': 80,
+    'EfficientDet': 40,
+    'Face': 3,
+    'TTFNet': 60,
+    'FCOS': 16,
+    'SOLOv2': 60,
+    'HigherHRNet': 3,
+    'HRNet': 3,
+    'DeepSORT': 3,
+    'JDE': 10,
+    'FairMOT': 5,
+    'GFL': 16,
+    'PicoDet': 3,
+    'CenterNet': 5,
+    'TOOD': 5,
+}
+
+# Architectures exported with the 'keypoint_arch' label set.
+KEYPOINT_ARCH = ['HigherHRNet', 'TopDownHRNet']
+# Architectures exported with the 'mot_arch' label set, a tracker section
+# and the TestMOTReader / TestMOTDataset configs.
+MOT_ARCH = ['DeepSORT', 'JDE', 'FairMOT']
+
+
+def _prune_input_spec(input_spec, program, targets):
+    """Drop the input specs whose variables do not survive pruning.
+
+    Args:
+        input_spec (list): One-element list holding a name -> spec mapping.
+        program: Static program; it is cloned before pruning, so the
+            caller's object is not modified.
+        targets: Target variables passed to ``program._prune``.
+
+    Returns:
+        list: One-element list holding the name -> spec mapping restricted
+        to variables found in the pruned program's global block.
+    """
+    # try to prune static program to figure out pruned input spec
+    # so we perform following operations in static mode
+    paddle.enable_static()
+    try:
+        pruned_input_spec = [{}]
+        program = program.clone()
+        program = program._prune(targets=targets)
+        global_block = program.global_block()
+        for name, spec in input_spec[0].items():
+            try:
+                global_block.var(name)
+            except Exception:
+                # Best effort: a failed lookup means the variable is not
+                # part of the pruned program, so its spec is left out.
+                continue
+            pruned_input_spec[0][name] = spec
+    finally:
+        # Always leave static mode, even when cloning or pruning fails.
+        paddle.disable_static()
+    return pruned_input_spec
+
+
+def _parse_reader(reader_cfg, dataset_cfg, metric, arch, image_shape):
+    """Build the preprocess op list and label list for the inference config.
+
+    The first entry of ``reader_cfg['sample_transforms']`` is skipped. Every
+    other transform becomes a ``{'type': name, **params}`` dict, except
+    ``NormalizeImage`` when ``fuse_normalize`` is set. A ``Resize``
+    transform gets its ``target_size`` overwritten (in the config itself)
+    with ``image_shape[1:]`` unless ``image_shape[1]`` is -1. A ``PadBatch``
+    batch transform is exported as ``PadStride``.
+
+    Returns:
+        tuple: ``(preprocess_list, label_list)``.
+    """
+    anno_file = dataset_cfg.get_anno()
+    clsid2catid, catid2name = get_categories(metric, anno_file, arch)
+    label_list = list(map(str, catid2name.values()))
+
+    preprocess_list = []
+    fuse_normalize = reader_cfg.get('fuse_normalize', False)
+    for transform in reader_cfg['sample_transforms'][1:]:
+        for op_name, op_args in transform.items():
+            if op_name == 'Resize' and int(image_shape[1]) != -1:
+                op_args['target_size'] = image_shape[1:]
+            if fuse_normalize and op_name == 'NormalizeImage':
+                continue
+            entry = {'type': op_name}
+            entry.update(op_args)
+            preprocess_list.append(entry)
+
+    for batch_op in reader_cfg.get('batch_transforms', None) or ():
+        for op_name, op_args in batch_op.items():
+            if op_name != 'PadBatch':
+                continue
+            # for deploy/infer, use PadStride(stride) instead PadBatch(pad_to_stride)
+            preprocess_list.append({
+                'type': 'PadStride',
+                'stride': op_args['pad_to_stride']
+            })
+            break
+
+    return preprocess_list, label_list
+
+
+def _parse_tracker(tracker_cfg):
+    """Return the items of ``tracker_cfg`` copied into a new plain dict."""
+    return {name: value for name, value in tracker_cfg.items()}
+
+
+def _dump_infer_config(config, path, image_shape, model):
+    """Write the inference configuration for an exported model to ``path``.
+
+    Args:
+        config: Model configuration; read by key (``architecture``,
+            ``metric``, reader/dataset/tracker sections).
+        path (str): Destination YAML file.
+        image_shape: Input shape; index 2 equal to -1 marks a dynamic shape
+            and ``image_shape[1:]`` is forwarded to ``_parse_reader``.
+        model: Not used by this function.
+
+    If the architecture matches no key of ``TRT_MIN_SUBGRAPH`` an error is
+    logged and the process is terminated with ``os._exit(0)``.
+    """
+    arch_state = False
+    from paddlers.models.ppdet.core.config.yaml_helpers import setup_orderdict
+    setup_orderdict()
+    use_dynamic_shape = True if image_shape[2] == -1 else False
+    infer_cfg = OrderedDict({
+        'mode': 'fluid',
+        'draw_threshold': 0.5,
+        'metric': config['metric'],
+        'use_dynamic_shape': use_dynamic_shape
+    })
+    infer_arch = config['architecture']
+
+    if infer_arch in MOT_ARCH:
+        if infer_arch == 'DeepSORT':
+            tracker_cfg = config['DeepSORTTracker']
+        else:
+            tracker_cfg = config['JDETracker']
+        infer_cfg['tracker'] = _parse_tracker(tracker_cfg)
+
+    # First TRT_MIN_SUBGRAPH key contained in the architecture name wins.
+    for arch, min_subgraph_size in TRT_MIN_SUBGRAPH.items():
+        if arch in infer_arch:
+            infer_cfg['arch'] = arch
+            infer_cfg['min_subgraph_size'] = min_subgraph_size
+            arch_state = True
+            break
+    if not arch_state:
+        logger.error(
+            'Architecture: {} is not supported for exporting model now.\n'.
+            format(infer_arch) +
+            'Please set TRT_MIN_SUBGRAPH in ppdet/engine/export_utils.py')
+        os._exit(0)
+    if 'mask_head' in config[config['architecture']] and config[config[
+            'architecture']]['mask_head']:
+        infer_cfg['mask'] = True
+    label_arch = 'detection_arch'
+    if infer_arch in KEYPOINT_ARCH:
+        label_arch = 'keypoint_arch'
+
+    if infer_arch in MOT_ARCH:
+        label_arch = 'mot_arch'
+        reader_cfg = config['TestMOTReader']
+        dataset_cfg = config['TestMOTDataset']
+    else:
+        reader_cfg = config['TestReader']
+        dataset_cfg = config['TestDataset']
+
+    infer_cfg['Preprocess'], infer_cfg['label_list'] = _parse_reader(
+        reader_cfg, dataset_cfg, config['metric'], label_arch, image_shape[1:])
+
+    if infer_arch == 'PicoDet':
+        infer_cfg['NMS'] = config['PicoHead']['nms']
+        # In order to speed up the prediction, the threshold of nms
+        # is adjusted here, which can be changed in infer_cfg.yml
+        # infer_cfg['NMS'] refers to the same object, so the exported
+        # file carries the adjusted thresholds.
+        config['PicoHead']['nms']["score_threshold"] = 0.3
+        config['PicoHead']['nms']["nms_threshold"] = 0.5
+        infer_cfg['fpn_stride'] = config['PicoHead']['fpn_stride']
+
+    # Use a context manager so the file is flushed and closed
+    # deterministically instead of leaking the handle.
+    with open(path, 'w') as f:
+        yaml.dump(infer_cfg, f)
+    logger.info("Export inference config file to {}".format(
+        os.path.join(path)))

+ 538 - 0
paddlers/models/ppdet/engine/tracker.py

@@ -0,0 +1,538 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import os
+import cv2
+import glob
+import re
+import paddle
+import numpy as np
+import os.path as osp
+from collections import defaultdict
+
+from paddlers.models.ppdet.core.workspace import create
+from paddlers.models.ppdet.utils.checkpoint import load_weight, load_pretrain_weight
+from paddlers.models.ppdet.modeling.mot.utils import Detection, get_crops, scale_coords, clip_box
+from paddlers.models.ppdet.modeling.mot.utils import MOTTimer, load_det_results, write_mot_results, save_vis_results
+
+from paddlers.models.ppdet.metrics import Metric, MOTMetric, KITTIMOTMetric
+from paddlers.models.ppdet.metrics import MCMOTMetric
+
+from .callbacks import Callback, ComposeCallback
+
+from paddlers.models.ppdet.utils.logger import setup_logger
+logger = setup_logger(__name__)
+
+__all__ = ['Tracker']
+
+
+class Tracker(object):
+    def __init__(self, cfg, mode='eval'):
+        """Build the MOT model and bookkeeping state used for tracking.
+
+        Args:
+            cfg: Config object. It is indexed with '<Mode>MOTDataset' and
+                its ``architecture`` attribute names the model to create.
+            mode (str): 'test' or 'eval' (compared case-insensitively).
+        """
+        self.cfg = cfg
+        normalized_mode = mode.lower()
+        assert normalized_mode in ['test', 'eval'], \
+                "mode should be 'test' or 'eval'"
+        self.mode = normalized_mode
+        self.optimizer = None
+
+        # the MOT dataset entry is looked up by the capitalized mode name
+        dataset_key = '{}MOTDataset'.format(normalized_mode.capitalize())
+        self.dataset = cfg[dataset_key]
+
+        # the model is instantiated from the configured architecture name
+        self.model = create(cfg.architecture)
+
+        self.start_epoch = 0
+        self.status = {}
+
+        # default callbacks first, then default metrics (reset right away)
+        self._init_callbacks()
+        self._init_metrics()
+        self._reset_metrics()
+
+    def _init_callbacks(self):
+        """Start with an empty callback list and no composed callback."""
+        self._compose_callback = None
+        self._callbacks = []
+
+    def _init_metrics(self):
+        """Create the default metric list for the configured metric type.
+
+        In 'test' mode the list is left empty. Otherwise ``cfg.metric``
+        selects MOTMetric, MCMOTMetric or KITTIMOTMetric; any other value
+        logs a warning and yields an empty list.
+        """
+        if self.mode == 'test':
+            self._metrics = []
+            return
+
+        metric_type = self.cfg.metric
+        if metric_type == 'MOT':
+            selected = [MOTMetric()]
+        elif metric_type == 'MCMOT':
+            selected = [MCMOTMetric(self.cfg.num_classes)]
+        elif metric_type == 'KITTI':
+            selected = [KITTIMOTMetric()]
+        else:
+            logger.warning("Metric not support for metric type {}".format(
+                metric_type))
+            selected = []
+        self._metrics = selected
+
+    def _reset_metrics(self):
+        """Call ``reset()`` on every registered metric."""
+        for registered in self._metrics:
+            registered.reset()
+
+    def register_callbacks(self, callbacks):
+        """Append user callbacks and rebuild the composed callback.
+
+        Args:
+            callbacks: Iterable of Callback instances; ``None`` entries
+                are dropped before validation.
+
+        Raises:
+            AssertionError: If any remaining entry is not a Callback.
+        """
+        callbacks = [h for h in list(callbacks) if h is not None]
+        for c in callbacks:
+            # message now names the type that is actually being checked
+            assert isinstance(c, Callback), \
+                    "callbacks should be instances of subclass of Callback"
+        self._callbacks.extend(callbacks)
+        self._compose_callback = ComposeCallback(self._callbacks)
+
+    def register_metrics(self, metrics):
+        """Append user metrics to the registered metric list.
+
+        Args:
+            metrics: Iterable of Metric instances; ``None`` entries are
+                dropped before validation.
+
+        Raises:
+            AssertionError: If any remaining entry is not a Metric.
+        """
+        metrics = [m for m in list(metrics) if m is not None]
+        for m in metrics:
+            assert isinstance(m, Metric), \
+                    "metrics should be instances of subclass of Metric"
+        self._metrics.extend(metrics)
+
+    def load_weights_jde(self, weights):
+        """Load ``weights`` into the whole model via ``load_weight``."""
+        model, optimizer = self.model, self.optimizer
+        load_weight(model, weights, optimizer)
+
+    def load_weights_sde(self, det_weights, reid_weights):
+        """Load detector and ReID weights for a separate-detector model.
+
+        When the model has no (truthy) ``detector``, only the ReID weights
+        are loaded and ``self.optimizer`` is forwarded to ``load_weight``.
+        """
+        if not self.model.detector:
+            load_weight(self.model.reid, reid_weights, self.optimizer)
+            return
+        load_weight(self.model.detector, det_weights)
+        load_weight(self.model.reid, reid_weights)
+
+    def _eval_seq_jde(self,
+                      dataloader,
+                      save_dir=None,
+                      show_image=False,
+                      frame_rate=30,
+                      draw_threshold=0):
+        """Run the joint detection/embedding model over one sequence.
+
+        Args:
+            dataloader: Iterable yielding one batch per frame.
+            save_dir: Directory created if missing and forwarded to
+                ``save_vis_results``; falsy skips the creation.
+            show_image: Forwarded to ``save_vis_results``.
+            frame_rate: Used to scale ``tracker.max_time_lost``.
+            draw_threshold: NOTE(review): accepted but never read in this
+                method -- confirm whether score filtering was intended.
+
+        Returns:
+            Tuple ``(results, frame_id, timer.average_time, timer.calls)``
+            where ``results[cls_id]`` is a list of
+            ``(frame_id + 1, tlwhs, scores, ids)`` tuples.
+        """
+        if save_dir:
+            if not os.path.exists(save_dir): os.makedirs(save_dir)
+        tracker = self.model.tracker
+        # scale the lost-track budget relative to a 30 fps baseline
+        tracker.max_time_lost = int(frame_rate / 30.0 * tracker.track_buffer)
+
+        timer = MOTTimer()
+        frame_id = 0
+        self.status['mode'] = 'track'
+        self.model.eval()
+        results = defaultdict(list)  # support single class and multi classes
+
+        for step_id, data in enumerate(dataloader):
+            self.status['step_id'] = step_id
+            # progress log every 40 frames; max() guards the division
+            if frame_id % 40 == 0:
+                logger.info('Processing frame {} ({:.2f} fps)'.format(
+                    frame_id, 1. / max(1e-5, timer.average_time)))
+            # forward
+            timer.tic()
+            pred_dets, pred_embs = self.model(data)
+
+            pred_dets, pred_embs = pred_dets.numpy(), pred_embs.numpy()
+            # the update result is indexed by class id below
+            online_targets_dict = self.model.tracker.update(pred_dets,
+                                                            pred_embs)
+            online_tlwhs = defaultdict(list)
+            online_scores = defaultdict(list)
+            online_ids = defaultdict(list)
+            for cls_id in range(self.cfg.num_classes):
+                online_targets = online_targets_dict[cls_id]
+                for t in online_targets:
+                    tlwh = t.tlwh
+                    tid = t.track_id
+                    tscore = t.score
+                    # drop boxes whose w*h does not exceed min_box_area
+                    if tlwh[2] * tlwh[3] <= tracker.min_box_area: continue
+                    # drop boxes whose w/h exceeds vertical_ratio (if > 0)
+                    if tracker.vertical_ratio > 0 and tlwh[2] / tlwh[
+                            3] > tracker.vertical_ratio:
+                        continue
+                    online_tlwhs[cls_id].append(tlwh)
+                    online_ids[cls_id].append(tid)
+                    online_scores[cls_id].append(tscore)
+                # save results
+                results[cls_id].append(
+                    (frame_id + 1, online_tlwhs[cls_id], online_scores[cls_id],
+                     online_ids[cls_id]))
+
+            # timing stops before visualization is written
+            timer.toc()
+            save_vis_results(data, frame_id, online_ids, online_tlwhs,
+                             online_scores, timer.average_time, show_image,
+                             save_dir, self.cfg.num_classes)
+            frame_id += 1
+
+        return results, frame_id, timer.average_time, timer.calls
+
+    def _eval_seq_sde(self,
+                      dataloader,
+                      save_dir=None,
+                      show_image=False,
+                      frame_rate=30,
+                      seq_name='',
+                      scaled=False,
+                      det_file='',
+                      draw_threshold=0):
+        """Run separate detection + ReID tracking over one sequence.
+
+        Detections come either from ``self.model.detector`` or, when the
+        model has no detector, from ``det_file`` via ``load_det_results``.
+
+        Args:
+            dataloader: Iterable yielding one batch per frame.
+            save_dir: Directory created if missing and forwarded to
+                ``save_vis_results``; falsy skips the creation.
+            show_image: Forwarded to ``save_vis_results``.
+            frame_rate: NOTE(review): not read in this method.
+            seq_name: NOTE(review): not read in this method.
+            scaled: When False, detector boxes go through ``scale_coords``;
+                when True they are used as returned.
+            det_file: Detection results file, used only without a detector.
+            draw_threshold: Tracks with a score below it are skipped.
+
+        Returns:
+            Tuple ``(results, frame_id, timer.average_time, timer.calls)``
+            where ``results[0]`` is a list of
+            ``(frame_id + 1, tlwhs, scores, ids)`` tuples.
+        """
+        if save_dir:
+            if not os.path.exists(save_dir): os.makedirs(save_dir)
+        use_detector = False if not self.model.detector else True
+
+        timer = MOTTimer()
+        results = defaultdict(list)
+        frame_id = 0
+        self.status['mode'] = 'track'
+        self.model.eval()
+        self.model.reid.eval()
+        if not use_detector:
+            dets_list = load_det_results(det_file, len(dataloader))
+            logger.info('Finish loading detection results file {}.'.format(
+                det_file))
+
+        for step_id, data in enumerate(dataloader):
+            self.status['step_id'] = step_id
+            if frame_id % 40 == 0:
+                logger.info('Processing frame {} ({:.2f} fps)'.format(
+                    frame_id, 1. / max(1e-5, timer.average_time)))
+
+            ori_image = data['ori_image']  # [bs, H, W, 3]
+            ori_image_shape = data['ori_image'].shape[1:3]
+            # ori_image_shape: [H, W]
+
+            input_shape = data['image'].shape[2:]
+            # input_shape: [h, w], before data transforms, set in model config
+
+            im_shape = data['im_shape'][0].numpy()
+            # im_shape: [new_h, new_w], after data transforms
+            scale_factor = data['scale_factor'][0].numpy()
+
+            empty_detections = False
+            # when it has no detected bboxes, will not inference reid model
+            # and if visualize, use original image instead
+
+            # forward
+            timer.tic()
+            if not use_detector:
+                # detections for this frame come from the preloaded file
+                dets = dets_list[frame_id]
+                bbox_tlwh = np.array(dets['bbox'], dtype='float32')
+                if bbox_tlwh.shape[0] > 0:
+                    # detector outputs: pred_cls_ids, pred_scores, pred_bboxes
+                    pred_cls_ids = np.array(dets['cls_id'], dtype='float32')
+                    pred_scores = np.array(dets['score'], dtype='float32')
+                    # convert (x, y, w, h) to (x1, y1, x2, y2)
+                    pred_bboxes = np.concatenate(
+                        (bbox_tlwh[:, 0:2],
+                         bbox_tlwh[:, 2:4] + bbox_tlwh[:, 0:2]),
+                        axis=1)
+                else:
+                    logger.warning(
+                        'Frame {} has not object, try to modify score threshold.'.
+                        format(frame_id))
+                    empty_detections = True
+            else:
+                outs = self.model.detector(data)
+                outs['bbox'] = outs['bbox'].numpy()
+                outs['bbox_num'] = outs['bbox_num'].numpy()
+
+                if outs['bbox_num'] > 0 and empty_detections == False:
+                    # detector outputs: pred_cls_ids, pred_scores, pred_bboxes
+                    pred_cls_ids = outs['bbox'][:, 0:1]
+                    pred_scores = outs['bbox'][:, 1:2]
+                    if not scaled:
+                        # Note: scaled=False only in JDE YOLOv3 or other detectors
+                        # with LetterBoxResize and JDEBBoxPostProcess.
+                        #
+                        # 'scaled' means whether the coords after detector outputs
+                        # have been scaled back to the original image, set True
+                        # in general detector, set False in JDE YOLOv3.
+                        pred_bboxes = scale_coords(outs['bbox'][:, 2:],
+                                                   input_shape, im_shape,
+                                                   scale_factor)
+                    else:
+                        pred_bboxes = outs['bbox'][:, 2:]
+                else:
+                    logger.warning(
+                        'Frame {} has not detected object, try to modify score threshold.'.
+                        format(frame_id))
+                    empty_detections = True
+
+            if not empty_detections:
+                # clip boxes to the original image; keep_idx selects survivors
+                pred_xyxys, keep_idx = clip_box(pred_bboxes, ori_image_shape)
+                if len(keep_idx[0]) == 0:
+                    logger.warning(
+                        'Frame {} has not detected object left after clip_box.'.
+                        format(frame_id))
+                    empty_detections = True
+
+            if empty_detections:
+                timer.toc()
+                # if visualize, use original image instead
+                online_ids, online_tlwhs, online_scores = None, None, None
+                save_vis_results(data, frame_id, online_ids, online_tlwhs,
+                                 online_scores, timer.average_time, show_image,
+                                 save_dir, self.cfg.num_classes)
+                frame_id += 1
+                # thus will not inference reid model
+                # NOTE: nothing is appended to results for this frame
+                continue
+
+            pred_scores = pred_scores[keep_idx[0]]
+            pred_cls_ids = pred_cls_ids[keep_idx[0]]
+            # convert (x1, y1, x2, y2) back to (x, y, w, h), with a +1 on w/h
+            pred_tlwhs = np.concatenate(
+                (pred_xyxys[:, 0:2],
+                 pred_xyxys[:, 2:4] - pred_xyxys[:, 0:2] + 1),
+                axis=1)
+            pred_dets = np.concatenate(
+                (pred_tlwhs, pred_scores, pred_cls_ids), axis=1)
+
+            tracker = self.model.tracker
+            crops = get_crops(
+                pred_xyxys,
+                ori_image,
+                w=tracker.input_size[0],
+                h=tracker.input_size[1])
+            crops = paddle.to_tensor(crops)
+
+            # the model reads the crops from the batch dict to embed them
+            data.update({'crops': crops})
+            pred_embs = self.model(data).numpy()
+
+            tracker.predict()
+            online_targets = tracker.update(pred_dets, pred_embs)
+
+            online_tlwhs, online_scores, online_ids = [], [], []
+            for t in online_targets:
+                # keep only confirmed tracks updated within the last step
+                if not t.is_confirmed() or t.time_since_update > 1:
+                    continue
+                tlwh = t.to_tlwh()
+                tscore = t.score
+                tid = t.track_id
+                if tscore < draw_threshold: continue
+                if tlwh[2] * tlwh[3] <= tracker.min_box_area: continue
+                if tracker.vertical_ratio > 0 and tlwh[2] / tlwh[
+                        3] > tracker.vertical_ratio:
+                    continue
+                online_tlwhs.append(tlwh)
+                online_scores.append(tscore)
+                online_ids.append(tid)
+            timer.toc()
+
+            # save results
+            results[0].append(
+                (frame_id + 1, online_tlwhs, online_scores, online_ids))
+            save_vis_results(data, frame_id, online_ids, online_tlwhs,
+                             online_scores, timer.average_time, show_image,
+                             save_dir, self.cfg.num_classes)
+            frame_id += 1
+
+        return results, frame_id, timer.average_time, timer.calls
+
+    def mot_evaluate(self,
+                     data_root,
+                     seqs,
+                     output_dir,
+                     data_type='mot',
+                     model_type='JDE',
+                     save_images=False,
+                     save_videos=False,
+                     show_image=False,
+                     scaled=False,
+                     det_results_dir=''):
+        """Track every sequence in ``seqs`` and update/log the metrics.
+
+        Args:
+            data_root: Directory containing one sub-directory per sequence.
+            seqs: Iterable of sequence names under ``data_root``.
+            output_dir: Root for 'mot_results' (text results) and, when
+                images or videos are saved, 'mot_outputs'.
+            data_type: 'mot', 'mcmot' or 'kitti'.
+            model_type: 'JDE', 'DeepSORT' or 'FairMOT'.
+            save_images: Enables the per-sequence visualization directory.
+            save_videos: Also assembles the saved frames with ffmpeg.
+            show_image: Forwarded to the per-sequence evaluation.
+            scaled: Forwarded to the DeepSORT path only.
+            det_results_dir: Directory holding '<seq>.txt' detection files
+                for the DeepSORT path.
+        """
+        if not os.path.exists(output_dir): os.makedirs(output_dir)
+        result_root = os.path.join(output_dir, 'mot_results')
+        if not os.path.exists(result_root): os.makedirs(result_root)
+        assert data_type in ['mot', 'mcmot', 'kitti'], \
+            "data_type should be 'mot', 'mcmot' or 'kitti'"
+        assert model_type in ['JDE', 'DeepSORT', 'FairMOT'], \
+            "model_type should be 'JDE', 'DeepSORT' or 'FairMOT'"
+
+        # run tracking
+        n_frame = 0
+        timer_avgs, timer_calls = [], []
+        for seq in seqs:
+            infer_dir = os.path.join(data_root, seq)
+            if not os.path.exists(infer_dir) or not os.path.isdir(infer_dir):
+                logger.warning("Seq {} error, {} has no images.".format(
+                    seq, infer_dir))
+                continue
+            # frames live in an 'img1' sub-directory when it exists
+            if os.path.exists(os.path.join(infer_dir, 'img1')):
+                infer_dir = os.path.join(infer_dir, 'img1')
+
+            frame_rate = 30
+            seqinfo = os.path.join(data_root, seq, 'seqinfo.ini')
+            if os.path.exists(seqinfo):
+                # read inside a context manager so the handle is closed
+                with open(seqinfo) as seqinfo_file:
+                    meta_info = seqinfo_file.read()
+                # value sits between 'frameRate=' and the 'seqLength' line
+                frame_rate = int(meta_info[meta_info.find('frameRate') + 10:
+                                           meta_info.find('\nseqLength')])
+
+            save_dir = os.path.join(
+                output_dir, 'mot_outputs',
+                seq) if save_images or save_videos else None
+            logger.info('start seq: {}'.format(seq))
+
+            self.dataset.set_images(self.get_infer_images(infer_dir))
+            dataloader = create('EvalMOTReader')(self.dataset, 0)
+
+            result_filename = os.path.join(result_root, '{}.txt'.format(seq))
+
+            with paddle.no_grad():
+                if model_type in ['JDE', 'FairMOT']:
+                    results, nf, ta, tc = self._eval_seq_jde(
+                        dataloader,
+                        save_dir=save_dir,
+                        show_image=show_image,
+                        frame_rate=frame_rate)
+                elif model_type in ['DeepSORT']:
+                    results, nf, ta, tc = self._eval_seq_sde(
+                        dataloader,
+                        save_dir=save_dir,
+                        show_image=show_image,
+                        frame_rate=frame_rate,
+                        seq_name=seq,
+                        scaled=scaled,
+                        det_file=os.path.join(det_results_dir,
+                                              '{}.txt'.format(seq)))
+                else:
+                    raise ValueError(model_type)
+
+            write_mot_results(result_filename, results, data_type,
+                              self.cfg.num_classes)
+            n_frame += nf
+            timer_avgs.append(ta)
+            timer_calls.append(tc)
+
+            if save_videos:
+                output_video_path = os.path.join(save_dir, '..',
+                                                 '{}_vis.mp4'.format(seq))
+                # NOTE(review): shell string built from paths; paths with
+                # spaces or shell metacharacters will break this command
+                cmd_str = 'ffmpeg -f image2 -i {}/%05d.jpg {}'.format(
+                    save_dir, output_video_path)
+                os.system(cmd_str)
+                logger.info('Save video in {}.'.format(output_video_path))
+
+            logger.info('Evaluate seq: {}'.format(seq))
+            # update metrics
+            for metric in self._metrics:
+                metric.update(data_root, seq, data_type, result_root,
+                              result_filename)
+
+        # total time is the call-weighted sum of per-sequence averages
+        timer_avgs = np.asarray(timer_avgs)
+        timer_calls = np.asarray(timer_calls)
+        all_time = np.dot(timer_avgs, timer_calls)
+        avg_time = all_time / np.sum(timer_calls)
+        logger.info('Time elapsed: {:.2f} seconds, FPS: {:.2f}'.format(
+            all_time, 1.0 / avg_time))
+
+        # accumulate metric to log out
+        for metric in self._metrics:
+            metric.accumulate()
+            metric.log()
+        # reset metric states for metric may performed multiple times
+        self._reset_metrics()
+
+    def get_infer_images(self, infer_dir):
+        """Return the sorted image paths found directly in ``infer_dir``.
+
+        Files are matched by extension (jpg, jpeg, png, bmp, in lower and
+        upper case). Asserts that the directory exists and that at least
+        one image was found.
+        """
+        assert infer_dir is None or os.path.isdir(infer_dir), \
+            "{} is not a directory".format(infer_dir)
+        assert os.path.isdir(infer_dir), \
+            "infer_dir {} is not a directory".format(infer_dir)
+        lower_exts = ['jpg', 'jpeg', 'png', 'bmp']
+        all_exts = lower_exts + [suffix.upper() for suffix in lower_exts]
+        # a set removes paths matched by more than one pattern
+        found = set()
+        for suffix in all_exts:
+            found.update(glob.glob('{}/*.{}'.format(infer_dir, suffix)))
+        images = sorted(found)
+        assert len(images) > 0, "no image found in {}".format(infer_dir)
+        logger.info("Found {} inference images in total.".format(len(images)))
+        return images
+
+    def mot_predict_seq(self,
+                        video_file,
+                        frame_rate,
+                        image_dir,
+                        output_dir,
+                        data_type='mot',
+                        model_type='JDE',
+                        save_images=False,
+                        save_videos=True,
+                        show_image=False,
+                        scaled=False,
+                        det_results_dir='',
+                        draw_threshold=0.5):
+        """Track a single video file or image folder and write the results.
+
+        Args:
+            video_file: Path to a video, or None to use ``image_dir``.
+            frame_rate: Frame rate; -1 means use ``self.dataset.frame_rate``.
+            image_dir: Directory of frames, used when ``video_file`` is falsy.
+            output_dir: Root for 'mot_results' and 'mot_outputs'.
+            data_type: 'mot', 'mcmot' or 'kitti'.
+            model_type: 'JDE', 'DeepSORT' or 'FairMOT'.
+            save_images: Enables the visualization directory.
+            save_videos: Also assembles the saved frames with ffmpeg.
+            show_image: Forwarded to the per-sequence evaluation.
+            scaled: Forwarded to the DeepSORT path only.
+            det_results_dir: Directory holding '<seq>.txt' detection files
+                for the DeepSORT path.
+            draw_threshold: Forwarded to the per-sequence evaluation.
+
+        Raises:
+            ValueError: If neither input is usable or ``model_type`` is not
+                handled.
+        """
+        assert video_file is not None or image_dir is not None, \
+            "--video_file or --image_dir should be set."
+        assert video_file is None or os.path.isfile(video_file), \
+                "{} is not a file".format(video_file)
+        assert image_dir is None or os.path.isdir(image_dir), \
+                "{} is not a directory".format(image_dir)
+
+        if not os.path.exists(output_dir): os.makedirs(output_dir)
+        result_root = os.path.join(output_dir, 'mot_results')
+        if not os.path.exists(result_root): os.makedirs(result_root)
+        assert data_type in ['mot', 'mcmot', 'kitti'], \
+            "data_type should be 'mot', 'mcmot' or 'kitti'"
+        assert model_type in ['JDE', 'DeepSORT', 'FairMOT'], \
+            "model_type should be 'JDE', 'DeepSORT' or 'FairMOT'"
+
+        # run tracking
+        if video_file:
+            seq = os.path.basename(video_file).split('.')[0]
+            self.dataset.set_video(video_file, frame_rate)
+            logger.info('Starting tracking video {}'.format(video_file))
+        elif image_dir:
+            # normpath strips a trailing separator, which would otherwise
+            # yield an empty sequence name (and a result file named '.txt')
+            seq = os.path.basename(os.path.normpath(image_dir)).split('.')[0]
+            if os.path.exists(os.path.join(image_dir, 'img1')):
+                image_dir = os.path.join(image_dir, 'img1')
+            images = [
+                '{}/{}'.format(image_dir, x) for x in os.listdir(image_dir)
+            ]
+            images.sort()
+            self.dataset.set_images(images)
+            logger.info('Starting tracking folder {}, found {} images'.format(
+                image_dir, len(images)))
+        else:
+            raise ValueError('--video_file or --image_dir should be set.')
+
+        save_dir = os.path.join(output_dir, 'mot_outputs',
+                                seq) if save_images or save_videos else None
+
+        dataloader = create('TestMOTReader')(self.dataset, 0)
+        result_filename = os.path.join(result_root, '{}.txt'.format(seq))
+        if frame_rate == -1:
+            frame_rate = self.dataset.frame_rate
+
+        with paddle.no_grad():
+            if model_type in ['JDE', 'FairMOT']:
+                results, nf, ta, tc = self._eval_seq_jde(
+                    dataloader,
+                    save_dir=save_dir,
+                    show_image=show_image,
+                    frame_rate=frame_rate,
+                    draw_threshold=draw_threshold)
+            elif model_type in ['DeepSORT']:
+                results, nf, ta, tc = self._eval_seq_sde(
+                    dataloader,
+                    save_dir=save_dir,
+                    show_image=show_image,
+                    frame_rate=frame_rate,
+                    seq_name=seq,
+                    scaled=scaled,
+                    det_file=os.path.join(det_results_dir,
+                                          '{}.txt'.format(seq)),
+                    draw_threshold=draw_threshold)
+            else:
+                raise ValueError(model_type)
+
+        if save_videos:
+            output_video_path = os.path.join(save_dir, '..',
+                                             '{}_vis.mp4'.format(seq))
+            # NOTE(review): shell string built from paths; paths with
+            # spaces or shell metacharacters will break this command
+            cmd_str = 'ffmpeg -f image2 -i {}/%05d.jpg {}'.format(
+                save_dir, output_video_path)
+            os.system(cmd_str)
+            logger.info('Save video in {}'.format(output_video_path))
+
+        write_mot_results(result_filename, results, data_type,
+                          self.cfg.num_classes)

+ 742 - 0
paddlers/models/ppdet/engine/trainer.py

@@ -0,0 +1,742 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import os
+import sys
+import copy
+import time
+
+import numpy as np
+import typing
+from PIL import Image, ImageOps
+
+import paddle
+import paddle.distributed as dist
+from paddle.distributed import fleet
+from paddle import amp
+from paddle.static import InputSpec
+from paddlers.models.ppdet.optimizer import ModelEMA
+
+from paddlers.models.ppdet.core.workspace import create
+from paddlers.models.ppdet.modeling.architectures.meta_arch import BaseArch
+from paddlers.models.ppdet.utils.checkpoint import load_weight, load_pretrain_weight
+from paddlers.models.ppdet.utils.visualizer import visualize_results, save_result
+from paddlers.models.ppdet.metrics import Metric, COCOMetric, VOCMetric, WiderFaceMetric, get_infer_results, KeyPointTopDownCOCOEval, KeyPointTopDownMPIIEval
+from paddlers.models.ppdet.metrics import RBoxMetric, JDEDetMetric, SNIPERCOCOMetric
+from paddlers.models.ppdet.data.source.sniper_coco import SniperCOCODataSet
+from paddlers.models.ppdet.data.source.category import get_categories
+from paddlers.models.ppdet.utils import stats
+from paddlers.models.ppdet.utils import profiler
+
+from .callbacks import Callback, ComposeCallback, LogPrinter, Checkpointer, WiferFaceEval, VisualDLWriter, SniperProposalsGenerator
+from .export_utils import _dump_infer_config, _prune_input_spec
+
+from paddlers.models.ppdet.utils.logger import setup_logger
+logger = setup_logger('ppdet.engine')
+
+__all__ = ['Trainer']
+
+MOT_ARCH = ['DeepSORT', 'JDE', 'FairMOT']
+
+
+class Trainer(object):
+    def __init__(self, cfg, mode='train'):
+        """Build dataset, loader, model and optimizer for the given mode.
+
+        Args:
+            cfg: Config object; read both by key (``cfg[...]``) and by
+                attribute (``cfg.architecture``, ``cfg.worker_num``, ...).
+            mode (str): 'train', 'eval' or 'test' (case-insensitive).
+
+        Notes:
+            The unstructured pruner needs ``steps_per_epoch``, which only
+            exists in train mode, so it is created in train mode only.
+        """
+        self.cfg = cfg
+        assert mode.lower() in ['train', 'eval', 'test'], \
+                "mode should be 'train', 'eval' or 'test'"
+        self.mode = mode.lower()
+        self.optimizer = None
+        self.is_loaded_weights = False
+
+        # build data loader
+        if cfg.architecture in MOT_ARCH and self.mode in ['eval', 'test']:
+            self.dataset = cfg['{}MOTDataset'.format(self.mode.capitalize())]
+        else:
+            self.dataset = cfg['{}Dataset'.format(self.mode.capitalize())]
+
+        if cfg.architecture == 'DeepSORT' and self.mode == 'train':
+            logger.error('DeepSORT has no need of training on mot dataset.')
+            sys.exit(1)
+
+        if self.mode == 'train':
+            self.loader = create('{}Reader'.format(self.mode.capitalize()))(
+                self.dataset, cfg.worker_num)
+
+        if cfg.architecture == 'JDE' and self.mode == 'train':
+            cfg['JDEEmbeddingHead'][
+                'num_identities'] = self.dataset.num_identities_dict[0]
+            # JDE only support single class MOT now.
+
+        if cfg.architecture == 'FairMOT' and self.mode == 'train':
+            cfg['FairMOTEmbeddingHead'][
+                'num_identities_dict'] = self.dataset.num_identities_dict
+            # FairMOT support single class and multi-class MOT now.
+
+        # build model
+        if 'model' not in self.cfg:
+            self.model = create(cfg.architecture)
+        else:
+            # a pre-built model supplied through the config is used as-is
+            self.model = self.cfg.model
+            self.is_loaded_weights = True
+
+        #normalize params for deploy
+        self.model.load_meanstd(cfg['TestReader']['sample_transforms'])
+
+        self.use_ema = ('use_ema' in cfg and cfg['use_ema'])
+        if self.use_ema:
+            ema_decay = self.cfg.get('ema_decay', 0.9998)
+            cycle_epoch = self.cfg.get('cycle_epoch', -1)
+            self.ema = ModelEMA(
+                self.model,
+                decay=ema_decay,
+                use_thres_step=True,
+                cycle_epoch=cycle_epoch)
+
+        # EvalDataset build with BatchSampler to evaluate in single device
+        # TODO: multi-device evaluate
+        if self.mode == 'eval':
+            self._eval_batch_sampler = paddle.io.BatchSampler(
+                self.dataset, batch_size=self.cfg.EvalReader['batch_size'])
+            reader_name = '{}Reader'.format(self.mode.capitalize())
+            # If metric is VOC, need to be set collate_batch=False.
+            if cfg.metric == 'VOC':
+                cfg[reader_name]['collate_batch'] = False
+            self.loader = create(reader_name)(self.dataset, cfg.worker_num,
+                                              self._eval_batch_sampler)
+        # TestDataset build after user set images, skip loader creation here
+
+        # build optimizer in train mode
+        if self.mode == 'train':
+            steps_per_epoch = len(self.loader)
+            self.lr = create('LearningRate')(steps_per_epoch)
+            self.optimizer = create('OptimizerBuilder')(self.lr, self.model)
+
+            # The pruner depends on steps_per_epoch, which is only defined
+            # in train mode; creating it outside this branch raised a
+            # NameError in eval/test mode when 'unstructured_prune' was set.
+            if self.cfg.get('unstructured_prune'):
+                self.pruner = create('UnstructuredPruner')(self.model,
+                                                           steps_per_epoch)
+
+        self._nranks = dist.get_world_size()
+        self._local_rank = dist.get_rank()
+
+        self.status = {}
+
+        self.start_epoch = 0
+        self.end_epoch = 0 if 'epoch' not in cfg else cfg.epoch
+
+        # initial default callbacks
+        self._init_callbacks()
+
+        # initial default metrics
+        self._init_metrics()
+        self._reset_metrics()
+
+    def _init_callbacks(self):
+        """Install the default callbacks for the current mode.
+
+        train: LogPrinter and Checkpointer, plus VisualDLWriter when
+        'use_vdl' is set and SniperProposalsGenerator when
+        'save_proposals' is set. eval: LogPrinter, plus WiferFaceEval for
+        the 'WiderFace' metric. test: VisualDLWriter only when 'use_vdl'
+        is set. Any other case leaves the list empty and the composed
+        callback as None.
+        """
+        cfg = self.cfg
+        mode = self.mode
+
+        # guard clause for the "no callbacks" case
+        if mode not in ('train', 'eval') and not (
+                mode == 'test' and cfg.get('use_vdl', False)):
+            self._callbacks = []
+            self._compose_callback = None
+            return
+
+        if mode == 'train':
+            self._callbacks = [LogPrinter(self), Checkpointer(self)]
+            optional = (('use_vdl', VisualDLWriter),
+                        ('save_proposals', SniperProposalsGenerator))
+            for flag, callback_cls in optional:
+                if cfg.get(flag, False):
+                    self._callbacks.append(callback_cls(self))
+        elif mode == 'eval':
+            self._callbacks = [LogPrinter(self)]
+            if cfg.metric == 'WiderFace':
+                self._callbacks.append(WiferFaceEval(self))
+        else:
+            self._callbacks = [VisualDLWriter(self)]
+        self._compose_callback = ComposeCallback(self._callbacks)
+
+    def _init_metrics(self, validate=False):
+        if self.mode == 'test' or (self.mode == 'train' and not validate):
+            self._metrics = []
+            return
+        classwise = self.cfg['classwise'] if 'classwise' in self.cfg else False
+        if self.cfg.metric == 'COCO' or self.cfg.metric == "SNIPERCOCO":
+            # TODO: bias should be unified
+            bias = self.cfg['bias'] if 'bias' in self.cfg else 0
+            output_eval = self.cfg['output_eval'] \
+                if 'output_eval' in self.cfg else None
+            save_prediction_only = self.cfg.get('save_prediction_only', False)
+
+            # pass clsid2catid info to metric instance to avoid multiple loading
+            # annotation file
+            clsid2catid = {v: k for k, v in self.dataset.catid2clsid.items()} \
+                                if self.mode == 'eval' else None
+
+            # when do validation in train, annotation file should be get from
+            # EvalReader instead of self.dataset(which is TrainReader)
+            anno_file = self.dataset.get_anno()
+            dataset = self.dataset
+            if self.mode == 'train' and validate:
+                eval_dataset = self.cfg['EvalDataset']
+                eval_dataset.check_or_download_dataset()
+                anno_file = eval_dataset.get_anno()
+                dataset = eval_dataset
+
+            IouType = self.cfg['IouType'] if 'IouType' in self.cfg else 'bbox'
+            if self.cfg.metric == "COCO":
+                self._metrics = [
+                    COCOMetric(
+                        anno_file=anno_file,
+                        clsid2catid=clsid2catid,
+                        classwise=classwise,
+                        output_eval=output_eval,
+                        bias=bias,
+                        IouType=IouType,
+                        save_prediction_only=save_prediction_only)
+                ]
+            elif self.cfg.metric == "SNIPERCOCO":  # sniper
+                self._metrics = [
+                    SNIPERCOCOMetric(
+                        anno_file=anno_file,
+                        dataset=dataset,
+                        clsid2catid=clsid2catid,
+                        classwise=classwise,
+                        output_eval=output_eval,
+                        bias=bias,
+                        IouType=IouType,
+                        save_prediction_only=save_prediction_only)
+                ]
+        elif self.cfg.metric == 'RBOX':
+            # TODO: bias should be unified
+            bias = self.cfg['bias'] if 'bias' in self.cfg else 0
+            output_eval = self.cfg['output_eval'] \
+                if 'output_eval' in self.cfg else None
+            save_prediction_only = self.cfg.get('save_prediction_only', False)
+
+            # pass clsid2catid info to metric instance to avoid multiple loading
+            # annotation file
+            clsid2catid = {v: k for k, v in self.dataset.catid2clsid.items()} \
+                                if self.mode == 'eval' else None
+
+            # when do validation in train, annotation file should be get from
+            # EvalReader instead of self.dataset(which is TrainReader)
+            anno_file = self.dataset.get_anno()
+            if self.mode == 'train' and validate:
+                eval_dataset = self.cfg['EvalDataset']
+                eval_dataset.check_or_download_dataset()
+                anno_file = eval_dataset.get_anno()
+
+            self._metrics = [
+                RBoxMetric(
+                    anno_file=anno_file,
+                    clsid2catid=clsid2catid,
+                    classwise=classwise,
+                    output_eval=output_eval,
+                    bias=bias,
+                    save_prediction_only=save_prediction_only)
+            ]
+        elif self.cfg.metric == 'VOC':
+            self._metrics = [
+                VOCMetric(
+                    label_list=self.dataset.get_label_list(),
+                    class_num=self.cfg.num_classes,
+                    map_type=self.cfg.map_type,
+                    classwise=classwise)
+            ]
+        elif self.cfg.metric == 'WiderFace':
+            multi_scale = self.cfg.multi_scale_eval if 'multi_scale_eval' in self.cfg else True
+            self._metrics = [
+                WiderFaceMetric(
+                    image_dir=os.path.join(self.dataset.dataset_dir,
+                                           self.dataset.image_dir),
+                    anno_file=self.dataset.get_anno(),
+                    multi_scale=multi_scale)
+            ]
+        elif self.cfg.metric == 'KeyPointTopDownCOCOEval':
+            eval_dataset = self.cfg['EvalDataset']
+            eval_dataset.check_or_download_dataset()
+            anno_file = eval_dataset.get_anno()
+            save_prediction_only = self.cfg.get('save_prediction_only', False)
+            self._metrics = [
+                KeyPointTopDownCOCOEval(
+                    anno_file,
+                    len(eval_dataset),
+                    self.cfg.num_joints,
+                    self.cfg.save_dir,
+                    save_prediction_only=save_prediction_only)
+            ]
+        elif self.cfg.metric == 'KeyPointTopDownMPIIEval':
+            eval_dataset = self.cfg['EvalDataset']
+            eval_dataset.check_or_download_dataset()
+            anno_file = eval_dataset.get_anno()
+            save_prediction_only = self.cfg.get('save_prediction_only', False)
+            self._metrics = [
+                KeyPointTopDownMPIIEval(
+                    anno_file,
+                    len(eval_dataset),
+                    self.cfg.num_joints,
+                    self.cfg.save_dir,
+                    save_prediction_only=save_prediction_only)
+            ]
+        elif self.cfg.metric == 'MOTDet':
+            self._metrics = [JDEDetMetric(), ]
+        else:
+            logger.warning("Metric not support for metric type {}".format(
+                self.cfg.metric))
+            self._metrics = []
+
+    def _reset_metrics(self):
+        for metric in self._metrics:
+            metric.reset()
+
+    def register_callbacks(self, callbacks):
+        callbacks = [c for c in list(callbacks) if c is not None]
+        for c in callbacks:
+            assert isinstance(c, Callback), \
+                    "metrics shoule be instances of subclass of Metric"
+        self._callbacks.extend(callbacks)
+        self._compose_callback = ComposeCallback(self._callbacks)
+
+    def register_metrics(self, metrics):
+        metrics = [m for m in list(metrics) if m is not None]
+        for m in metrics:
+            assert isinstance(m, Metric), \
+                    "metrics shoule be instances of subclass of Metric"
+        self._metrics.extend(metrics)
+
+    def load_weights(self, weights):
+        if self.is_loaded_weights:
+            return
+        self.start_epoch = 0
+        load_pretrain_weight(self.model, weights)
+        logger.debug("Load weights {} to start training".format(weights))
+
+    def load_weights_sde(self, det_weights, reid_weights):
+        if self.model.detector:
+            load_weight(self.model.detector, det_weights)
+            load_weight(self.model.reid, reid_weights)
+        else:
+            load_weight(self.model.reid, reid_weights)
+
+    def resume_weights(self, weights):
+        # support Distill resume weights
+        if hasattr(self.model, 'student_model'):
+            self.start_epoch = load_weight(self.model.student_model, weights,
+                                           self.optimizer)
+        else:
+            self.start_epoch = load_weight(self.model, weights, self.optimizer)
+        logger.debug("Resume weights of epoch {}".format(self.start_epoch))
+
+    def train(self, validate=False):
+        """
+        Run the training loop from ``self.start_epoch`` up to ``self.cfg.epoch``.
+
+        Args:
+            validate (bool): When True, evaluation on ``self.cfg.EvalDataset``
+                is run during training. It happens only when
+                ``self._nranks < 2`` or ``self._local_rank == 0``, and only on
+                epochs where ``(epoch_id + 1) % self.cfg.snapshot_epoch == 0``
+                or ``epoch_id == self.end_epoch - 1``.
+
+        Raises:
+            AssertionError: If ``self.mode`` is not ``'train'``.
+        """
+        assert self.mode == 'train', "Model not in 'train' mode"
+        # Guards the one-time metric re-initialization done for validation.
+        Init_mark = False
+
+        # Convert to synchronized batch norm when norm_type is unset or
+        # 'sync_bn', GPU is used and more than one rank takes part.
+        sync_bn = (
+            getattr(self.cfg, 'norm_type', None) in [None, 'sync_bn'] and
+            self.cfg.use_gpu and self._nranks > 1)
+        if sync_bn:
+            self.model = BaseArch.convert_sync_batchnorm(self.model)
+
+        # `model` is the (possibly wrapped) network used for forward/backward;
+        # `self.model` stays the unwrapped one used for EMA and state dicts.
+        model = self.model
+        if self.cfg.get('fleet', False):
+            model = fleet.distributed_model(model)
+            self.optimizer = fleet.distributed_optimizer(self.optimizer)
+        elif self._nranks > 1:
+            find_unused_parameters = self.cfg[
+                'find_unused_parameters'] if 'find_unused_parameters' in self.cfg else False
+            model = paddle.DataParallel(
+                self.model, find_unused_parameters=find_unused_parameters)
+
+        # initial fp16
+        # `scaler` only exists when fp16 is enabled; the step loop below uses
+        # it under the same condition.
+        if self.cfg.get('fp16', False):
+            scaler = amp.GradScaler(
+                enable=self.cfg.use_gpu, init_loss_scaling=1024)
+
+        self.status.update({
+            'epoch_id': self.start_epoch,
+            'step_id': 0,
+            'steps_per_epoch': len(self.loader)
+        })
+
+        self.status['batch_time'] = stats.SmoothedValue(
+            self.cfg.log_iter, fmt='{avg:.4f}')
+        self.status['data_time'] = stats.SmoothedValue(
+            self.cfg.log_iter, fmt='{avg:.4f}')
+        # NOTE(review): the 'training_staus' key spelling is a runtime key;
+        # presumably callbacks read it under this name, verify before renaming.
+        self.status['training_staus'] = stats.TrainingStats(self.cfg.log_iter)
+
+        if self.cfg.get('print_flops', False):
+            flops_loader = create('{}Reader'.format(self.mode.capitalize()))(
+                self.dataset, self.cfg.worker_num)
+            self._flops(flops_loader)
+        profiler_options = self.cfg.get('profiler_options', None)
+
+        self._compose_callback.on_train_begin(self.status)
+
+        for epoch_id in range(self.start_epoch, self.cfg.epoch):
+            self.status['mode'] = 'train'
+            self.status['epoch_id'] = epoch_id
+            self._compose_callback.on_epoch_begin(self.status)
+            self.loader.dataset.set_epoch(epoch_id)
+            model.train()
+            # iter_tic marks the end of the previous step, so data_time
+            # measures the wait for the next batch and batch_time the
+            # whole step.
+            iter_tic = time.time()
+            for step_id, data in enumerate(self.loader):
+                self.status['data_time'].update(time.time() - iter_tic)
+                self.status['step_id'] = step_id
+                profiler.add_profiler_step(profiler_options)
+                self._compose_callback.on_step_begin(self.status)
+                data['epoch_id'] = epoch_id
+
+                if self.cfg.get('fp16', False):
+                    with amp.auto_cast(enable=self.cfg.use_gpu):
+                        # model forward
+                        outputs = model(data)
+                        loss = outputs['loss']
+
+                    # model backward
+                    scaled_loss = scaler.scale(loss)
+                    scaled_loss.backward()
+                    # in dygraph mode, optimizer.minimize is equal to optimizer.step
+                    scaler.minimize(self.optimizer, scaled_loss)
+                else:
+                    # model forward
+                    outputs = model(data)
+                    loss = outputs['loss']
+                    # model backward
+                    loss.backward()
+                    self.optimizer.step()
+                # The learning rate is read before the scheduler advances, so
+                # the logged value is the one used for this step.
+                curr_lr = self.optimizer.get_lr()
+                self.lr.step()
+                if self.cfg.get('unstructured_prune'):
+                    self.pruner.step()
+                self.optimizer.clear_grad()
+                self.status['learning_rate'] = curr_lr
+
+                # Training statistics are only accumulated on a single rank.
+                if self._nranks < 2 or self._local_rank == 0:
+                    self.status['training_staus'].update(outputs)
+
+                self.status['batch_time'].update(time.time() - iter_tic)
+                self._compose_callback.on_step_end(self.status)
+                if self.use_ema:
+                    self.ema.update(self.model)
+                iter_tic = time.time()
+
+            # apply ema weight on model
+            # `weight` keeps a copy of the raw parameters so they can be
+            # restored at the end of this epoch.
+            if self.use_ema:
+                weight = copy.deepcopy(self.model.state_dict())
+                self.model.set_dict(self.ema.apply())
+            if self.cfg.get('unstructured_prune'):
+                self.pruner.update_params()
+
+            self._compose_callback.on_epoch_end(self.status)
+
+            if validate and (self._nranks < 2 or self._local_rank == 0) \
+                    and ((epoch_id + 1) % self.cfg.snapshot_epoch == 0 \
+                             or epoch_id == self.end_epoch - 1):
+                # The evaluation loader is built lazily on the first
+                # validation and cached on the instance.
+                if not hasattr(self, '_eval_loader'):
+                    # build evaluation dataset and loader
+                    self._eval_dataset = self.cfg.EvalDataset
+                    self._eval_batch_sampler = \
+                        paddle.io.BatchSampler(
+                            self._eval_dataset,
+                            batch_size=self.cfg.EvalReader['batch_size'])
+                    # If metric is VOC, need to be set collate_batch=False.
+                    if self.cfg.metric == 'VOC':
+                        self.cfg['EvalReader']['collate_batch'] = False
+                    self._eval_loader = create('EvalReader')(
+                        self._eval_dataset,
+                        self.cfg.worker_num,
+                        batch_sampler=self._eval_batch_sampler)
+                # if validation in training is enabled, metrics should be re-init
+                # Init_mark makes sure this code will only execute once
+                if validate and Init_mark == False:
+                    Init_mark = True
+                    self._init_metrics(validate=validate)
+                    self._reset_metrics()
+                with paddle.no_grad():
+                    self.status['save_best_model'] = True
+                    self._eval_with_loader(self._eval_loader)
+
+            # restore origin weight on model
+            if self.use_ema:
+                self.model.set_dict(weight)
+
+        self._compose_callback.on_train_end(self.status)
+
+    def _eval_with_loader(self, loader):
+        sample_num = 0
+        tic = time.time()
+        self._compose_callback.on_epoch_begin(self.status)
+        self.status['mode'] = 'eval'
+        self.model.eval()
+        if self.cfg.get('print_flops', False):
+            flops_loader = create('{}Reader'.format(self.mode.capitalize()))(
+                self.dataset, self.cfg.worker_num, self._eval_batch_sampler)
+            self._flops(flops_loader)
+        for step_id, data in enumerate(loader):
+            self.status['step_id'] = step_id
+            self._compose_callback.on_step_begin(self.status)
+            # forward
+            outs = self.model(data)
+
+            # update metrics
+            for metric in self._metrics:
+                metric.update(data, outs)
+
+            # multi-scale inputs: all inputs have same im_id
+            if isinstance(data, typing.Sequence):
+                sample_num += data[0]['im_id'].numpy().shape[0]
+            else:
+                sample_num += data['im_id'].numpy().shape[0]
+            self._compose_callback.on_step_end(self.status)
+
+        self.status['sample_num'] = sample_num
+        self.status['cost_time'] = time.time() - tic
+
+        # accumulate metric to log out
+        for metric in self._metrics:
+            metric.accumulate()
+            metric.log()
+        self._compose_callback.on_epoch_end(self.status)
+        # reset metric states for metric may performed multiple times
+        self._reset_metrics()
+
+    def evaluate(self):
+        with paddle.no_grad():
+            self._eval_with_loader(self.loader)
+
+    def predict(self,
+                images,
+                draw_threshold=0.5,
+                output_dir='output',
+                save_txt=False):
+        self.dataset.set_images(images)
+        loader = create('TestReader')(self.dataset, 0)
+
+        imid2path = self.dataset.get_imid2path()
+
+        anno_file = self.dataset.get_anno()
+        clsid2catid, catid2name = get_categories(
+            self.cfg.metric, anno_file=anno_file)
+
+        # Run Infer
+        self.status['mode'] = 'test'
+        self.model.eval()
+        if self.cfg.get('print_flops', False):
+            flops_loader = create('TestReader')(self.dataset, 0)
+            self._flops(flops_loader)
+        results = []
+        for step_id, data in enumerate(loader):
+            self.status['step_id'] = step_id
+            # forward
+            outs = self.model(data)
+
+            for key in ['im_shape', 'scale_factor', 'im_id']:
+                if isinstance(data, typing.Sequence):
+                    outs[key] = data[0][key]
+                else:
+                    outs[key] = data[key]
+            for key, value in outs.items():
+                if hasattr(value, 'numpy'):
+                    outs[key] = value.numpy()
+            results.append(outs)
+        # sniper
+        if type(self.dataset) == SniperCOCODataSet:
+            results = self.dataset.anno_cropper.aggregate_chips_detections(
+                results)
+
+        for outs in results:
+            batch_res = get_infer_results(outs, clsid2catid)
+            bbox_num = outs['bbox_num']
+
+            start = 0
+            for i, im_id in enumerate(outs['im_id']):
+                image_path = imid2path[int(im_id)]
+                image = Image.open(image_path).convert('RGB')
+                image = ImageOps.exif_transpose(image)
+                self.status['original_image'] = np.array(image.copy())
+
+                end = start + bbox_num[i]
+                bbox_res = batch_res['bbox'][start:end] \
+                        if 'bbox' in batch_res else None
+                mask_res = batch_res['mask'][start:end] \
+                        if 'mask' in batch_res else None
+                segm_res = batch_res['segm'][start:end] \
+                        if 'segm' in batch_res else None
+                keypoint_res = batch_res['keypoint'][start:end] \
+                        if 'keypoint' in batch_res else None
+                image = visualize_results(
+                    image, bbox_res, mask_res, segm_res, keypoint_res,
+                    int(im_id), catid2name, draw_threshold)
+                self.status['result_image'] = np.array(image.copy())
+                if self._compose_callback:
+                    self._compose_callback.on_step_end(self.status)
+                # save image with detection
+                save_name = self._get_save_image_name(output_dir, image_path)
+                logger.info("Detection bbox results save in {}".format(
+                    save_name))
+                image.save(save_name, quality=95)
+                if save_txt:
+                    save_path = os.path.splitext(save_name)[0] + '.txt'
+                    results = {}
+                    results["im_id"] = im_id
+                    if bbox_res:
+                        results["bbox_res"] = bbox_res
+                    if keypoint_res:
+                        results["keypoint_res"] = keypoint_res
+                    save_result(save_path, results, catid2name, draw_threshold)
+                start = end
+
+    def _get_save_image_name(self, output_dir, image_path):
+        """
+        Get save image name from source image path.
+        """
+        if not os.path.exists(output_dir):
+            os.makedirs(output_dir)
+        image_name = os.path.split(image_path)[-1]
+        name, ext = os.path.splitext(image_name)
+        return os.path.join(output_dir, "{}".format(name)) + ext
+
+    def _get_infer_cfg_and_input_spec(self, save_dir, prune_input=True):
+        """
+        Dump ``infer_cfg.yml`` into ``save_dir`` and build the export inputs.
+
+        Args:
+            save_dir (str): Directory that receives ``infer_cfg.yml``.
+            prune_input (bool): When True, the model is converted with
+                ``paddle.jit.to_static`` and the input spec is pruned with
+                ``_prune_input_spec``; when False the full spec is returned.
+
+        Returns:
+            tuple: ``(static_model, pruned_input_spec)``; ``static_model`` is
+            None when ``prune_input`` is False.
+        """
+        image_shape = None
+        im_shape = [None, 2]
+        scale_factor = [None, 2]
+        if self.cfg.architecture in MOT_ARCH:
+            test_reader_name = 'TestMOTReader'
+        else:
+            test_reader_name = 'TestReader'
+        if 'inputs_def' in self.cfg[test_reader_name]:
+            inputs_def = self.cfg[test_reader_name]['inputs_def']
+            image_shape = inputs_def.get('image_shape', None)
+        # set image_shape=[None, 3, -1, -1] as default
+        if image_shape is None:
+            image_shape = [None, 3, -1, -1]
+
+        # A 3-element shape has no batch dimension, so one is prepended;
+        # otherwise the first entry is reused as the batch size of the
+        # im_shape and scale_factor inputs.
+        if len(image_shape) == 3:
+            image_shape = [None] + image_shape
+        else:
+            im_shape = [image_shape[0], 2]
+            scale_factor = [image_shape[0], 2]
+
+        if hasattr(self.model, 'deploy'):
+            self.model.deploy = True
+        # NOTE(review): this reads 'TestReader' even when test_reader_name is
+        # 'TestMOTReader' -- confirm that is intended for MOT architectures.
+        if hasattr(self.model, 'fuse_norm'):
+            self.model.fuse_norm = self.cfg['TestReader'].get('fuse_normalize',
+                                                              False)
+
+        # Save infer cfg
+        _dump_infer_config(self.cfg,
+                           os.path.join(save_dir, 'infer_cfg.yml'),
+                           image_shape, self.model)
+
+        input_spec = [{
+            "image": InputSpec(
+                shape=image_shape, name='image'),
+            "im_shape": InputSpec(
+                shape=im_shape, name='im_shape'),
+            "scale_factor": InputSpec(
+                shape=scale_factor, name='scale_factor')
+        }]
+        # DeepSORT gets an additional 'crops' input with a fixed shape.
+        if self.cfg.architecture == 'DeepSORT':
+            input_spec[0].update({
+                "crops": InputSpec(
+                    shape=[None, 3, 192, 64], name='crops')
+            })
+        if prune_input:
+            static_model = paddle.jit.to_static(
+                self.model, input_spec=input_spec)
+            # NOTE: dy2st do not pruned program, but jit.save will prune program
+            # input spec, prune input spec here and save with pruned input spec
+            pruned_input_spec = _prune_input_spec(
+                input_spec, static_model.forward.main_program,
+                static_model.forward.outputs)
+        else:
+            static_model = None
+            pruned_input_spec = input_spec
+
+        # TODO: Hard code, delete it when support prune input_spec.
+        # For PicoDet the spec is replaced by an image-only one, regardless
+        # of what was computed above.
+        if self.cfg.architecture == 'PicoDet':
+            pruned_input_spec = [{
+                "image": InputSpec(
+                    shape=image_shape, name='image')
+            }]
+
+        return static_model, pruned_input_spec
+
+    def export(self, output_dir='output_inference'):
+        self.model.eval()
+        model_name = os.path.splitext(os.path.split(self.cfg.filename)[-1])[0]
+        save_dir = os.path.join(output_dir, model_name)
+        if not os.path.exists(save_dir):
+            os.makedirs(save_dir)
+
+        static_model, pruned_input_spec = self._get_infer_cfg_and_input_spec(
+            save_dir)
+
+        # dy2st and save model
+        if 'slim' not in self.cfg or self.cfg['slim_type'] != 'QAT':
+            paddle.jit.save(
+                static_model,
+                os.path.join(save_dir, 'model'),
+                input_spec=pruned_input_spec)
+        else:
+            self.cfg.slim.save_quantized_model(
+                self.model,
+                os.path.join(save_dir, 'model'),
+                input_spec=pruned_input_spec)
+        logger.info("Export model and saved in {}".format(save_dir))
+
+    def post_quant(self, output_dir='output_inference'):
+        model_name = os.path.splitext(os.path.split(self.cfg.filename)[-1])[0]
+        save_dir = os.path.join(output_dir, model_name)
+        if not os.path.exists(save_dir):
+            os.makedirs(save_dir)
+
+        for idx, data in enumerate(self.loader):
+            self.model(data)
+            if idx == int(self.cfg.get('quant_batch_num', 10)):
+                break
+
+        # TODO: support prune input_spec
+        _, pruned_input_spec = self._get_infer_cfg_and_input_spec(
+            save_dir, prune_input=False)
+
+        self.cfg.slim.save_quantized_model(
+            self.model,
+            os.path.join(save_dir, 'model'),
+            input_spec=pruned_input_spec)
+        logger.info("Export Post-Quant model and saved in {}".format(save_dir))
+
+    def _flops(self, loader):
+        self.model.eval()
+        try:
+            import paddleslim
+        except Exception as e:
+            logger.warning(
+                'Unable to calculate flops, please install paddleslim, for example: `pip install paddleslim`'
+            )
+            return
+
+        from paddleslim.analysis import dygraph_flops as flops
+        input_data = None
+        for data in loader:
+            input_data = data
+            break
+
+        input_spec = [{
+            "image": input_data['image'][0].unsqueeze(0),
+            "im_shape": input_data['im_shape'][0].unsqueeze(0),
+            "scale_factor": input_data['scale_factor'][0].unsqueeze(0)
+        }]
+        flops = flops(self.model, input_spec) / (1000**3)
+        logger.info(" Model FLOPs : {:.6f}G. (image shape is {})".format(
+            flops, input_data['image'][0].unsqueeze(0).shape))

+ 29 - 0
paddlers/models/ppdet/metrics/__init__.py

@@ -0,0 +1,29 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from . import metrics
+from . import keypoint_metrics
+
+from .metrics import *
+from .keypoint_metrics import *
+
+__all__ = metrics.__all__ + keypoint_metrics.__all__
+
+from . import mot_metrics
+from .mot_metrics import *
+__all__ = metrics.__all__ + mot_metrics.__all__
+
+from . import mcmot_metrics
+from .mcmot_metrics import *
+__all__ = metrics.__all__ + mcmot_metrics.__all__

+ 184 - 0
paddlers/models/ppdet/metrics/coco_utils.py

@@ -0,0 +1,184 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import os
+import sys
+import numpy as np
+import itertools
+
+from paddlers.models.ppdet.metrics.json_results import get_det_res, get_det_poly_res, get_seg_res, get_solov2_segm_res, get_keypoint_res
+from paddlers.models.ppdet.metrics.map_utils import draw_pr_curve
+
+from paddlers.models.ppdet.utils.logger import setup_logger
+logger = setup_logger(__name__)
+
+
+def get_infer_results(outs, catid, bias=0):
+    """
+    Get result at the stage of inference.
+    The output format is dictionary containing bbox or mask result.
+
+    For example, bbox result is a list and each element contains
+    image_id, category_id, bbox and score.
+    """
+    if outs is None or len(outs) == 0:
+        raise ValueError(
+            'The number of valid detection result if zero. Please use reasonable model and check input data.'
+        )
+
+    im_id = outs['im_id']
+
+    infer_res = {}
+    if 'bbox' in outs:
+        if len(outs['bbox']) > 0 and len(outs['bbox'][0]) > 6:
+            infer_res['bbox'] = get_det_poly_res(
+                outs['bbox'], outs['bbox_num'], im_id, catid, bias=bias)
+        else:
+            infer_res['bbox'] = get_det_res(
+                outs['bbox'], outs['bbox_num'], im_id, catid, bias=bias)
+
+    if 'mask' in outs:
+        # mask post process
+        infer_res['mask'] = get_seg_res(outs['mask'], outs['bbox'],
+                                        outs['bbox_num'], im_id, catid)
+
+    if 'segm' in outs:
+        infer_res['segm'] = get_solov2_segm_res(outs, im_id, catid)
+
+    if 'keypoint' in outs:
+        infer_res['keypoint'] = get_keypoint_res(outs, im_id)
+        outs['bbox_num'] = [len(infer_res['keypoint'])]
+
+    return infer_res
+
+
+def cocoapi_eval(jsonfile,
+                 style,
+                 coco_gt=None,
+                 anno_file=None,
+                 max_dets=(100, 300, 1000),
+                 classwise=False,
+                 sigmas=None,
+                 use_area=True):
+    """
+    Args:
+        jsonfile (str): Evaluation json file, eg: bbox.json, mask.json.
+        style (str): COCOeval style, can be `bbox` , `segm` , `proposal`, `keypoints` and `keypoints_crowd`.
+        coco_gt (str): Whether to load COCOAPI through anno_file,
+                 eg: coco_gt = COCO(anno_file)
+        anno_file (str): COCO annotations file.
+        max_dets (tuple): COCO evaluation maxDets.
+        classwise (bool): Whether per-category AP and draw P-R Curve or not.
+        sigmas (nparray): keypoint labelling sigmas.
+        use_area (bool): If gt annotations (eg. CrowdPose, AIC)
+                         do not have 'area', please set use_area=False.
+    """
+    assert coco_gt != None or anno_file != None
+    if style == 'keypoints_crowd':
+        #please install xtcocotools==1.6
+        from xtcocotools.coco import COCO
+        from xtcocotools.cocoeval import COCOeval
+    else:
+        from pycocotools.coco import COCO
+        from pycocotools.cocoeval import COCOeval
+
+    if coco_gt == None:
+        coco_gt = COCO(anno_file)
+    logger.info("Start evaluate...")
+    coco_dt = coco_gt.loadRes(jsonfile)
+    if style == 'proposal':
+        coco_eval = COCOeval(coco_gt, coco_dt, 'bbox')
+        coco_eval.params.useCats = 0
+        coco_eval.params.maxDets = list(max_dets)
+    elif style == 'keypoints_crowd':
+        coco_eval = COCOeval(coco_gt, coco_dt, style, sigmas, use_area)
+    else:
+        coco_eval = COCOeval(coco_gt, coco_dt, style)
+    coco_eval.evaluate()
+    coco_eval.accumulate()
+    coco_eval.summarize()
+    if classwise:
+        # Compute per-category AP and PR curve
+        try:
+            from terminaltables import AsciiTable
+        except Exception as e:
+            logger.error(
+                'terminaltables not found, plaese install terminaltables. '
+                'for example: `pip install terminaltables`.')
+            raise e
+        precisions = coco_eval.eval['precision']
+        cat_ids = coco_gt.getCatIds()
+        # precision: (iou, recall, cls, area range, max dets)
+        assert len(cat_ids) == precisions.shape[2]
+        results_per_category = []
+        for idx, catId in enumerate(cat_ids):
+            # area range index 0: all area ranges
+            # max dets index -1: typically 100 per image
+            nm = coco_gt.loadCats(catId)[0]
+            precision = precisions[:, :, idx, 0, -1]
+            precision = precision[precision > -1]
+            if precision.size:
+                ap = np.mean(precision)
+            else:
+                ap = float('nan')
+            results_per_category.append(
+                (str(nm["name"]), '{:0.3f}'.format(float(ap))))
+            pr_array = precisions[0, :, idx, 0, 2]
+            recall_array = np.arange(0.0, 1.01, 0.01)
+            draw_pr_curve(
+                pr_array,
+                recall_array,
+                out_dir=style + '_pr_curve',
+                file_name='{}_precision_recall_curve.jpg'.format(nm["name"]))
+
+        num_columns = min(6, len(results_per_category) * 2)
+        results_flatten = list(itertools.chain(*results_per_category))
+        headers = ['category', 'AP'] * (num_columns // 2)
+        results_2d = itertools.zip_longest(
+            *[results_flatten[i::num_columns] for i in range(num_columns)])
+        table_data = [headers]
+        table_data += [result for result in results_2d]
+        table = AsciiTable(table_data)
+        logger.info('Per-category of {} AP: \n{}'.format(style, table.table))
+        logger.info("per-category PR curve has output to {} folder.".format(
+            style + '_pr_curve'))
+    # flush coco evaluation result
+    sys.stdout.flush()
+    return coco_eval.stats
+
+
+def json_eval_results(metric, json_directory, dataset):
+    """
+    cocoapi eval with already exists proposal.json, bbox.json or mask.json
+    """
+    assert metric == 'COCO'
+    anno_file = dataset.get_anno()
+    json_file_list = ['proposal.json', 'bbox.json', 'mask.json']
+    if json_directory:
+        assert os.path.exists(
+            json_directory), "The json directory:{} does not exist".format(
+                json_directory)
+        for k, v in enumerate(json_file_list):
+            json_file_list[k] = os.path.join(str(json_directory), v)
+
+    coco_eval_style = ['proposal', 'bbox', 'segm']
+    for i, v_json in enumerate(json_file_list):
+        if os.path.exists(v_json):
+            cocoapi_eval(v_json, coco_eval_style[i], anno_file=anno_file)
+        else:
+            logger.info("{} not exists!".format(v_json))

+ 149 - 0
paddlers/models/ppdet/metrics/json_results.py

@@ -0,0 +1,149 @@
+#   Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import six
+import numpy as np
+
+
+def get_det_res(bboxes, bbox_nums, image_id, label_to_cat_id_map, bias=0):
+    """
+    Convert flat detection rows into COCO-style result dicts.
+
+    Args:
+        bboxes: Indexable rows whose ``tolist()`` yields
+            [class index, score, xmin, ymin, xmax, ymax]. The rows of all
+            images are stored back to back.
+        bbox_nums: Number of rows belonging to each image, in order.
+        image_id: ``image_id[i][0]`` is the id of the i-th image.
+        label_to_cat_id_map: Maps a class index to a category id.
+        bias: Value added to both the width and the height. Default 0.
+
+    Returns:
+        list[dict]: One dict per kept row with the keys 'image_id',
+        'category_id', 'bbox' ([xmin, ymin, w, h]) and 'score'. Rows with a
+        negative class index are dropped.
+    """
+    records = []
+    cursor = 0  # position inside the flat ``bboxes`` sequence
+    for img_idx in range(len(bbox_nums)):
+        img_id = int(image_id[img_idx][0])
+        for _ in range(bbox_nums[img_idx]):
+            cls_idx, score, xmin, ymin, xmax, ymax = bboxes[cursor].tolist()
+            cursor += 1
+            if int(cls_idx) < 0:
+                continue
+            records.append({
+                'image_id': img_id,
+                'category_id': label_to_cat_id_map[int(cls_idx)],
+                'bbox': [xmin, ymin, xmax - xmin + bias, ymax - ymin + bias],
+                'score': score
+            })
+    return records
+
+
+def get_det_poly_res(bboxes, bbox_nums, image_id, label_to_cat_id_map, bias=0):
+    """
+    Convert flat quadrilateral detections into COCO-style result dicts.
+
+    Args:
+        bboxes: Indexable rows whose ``tolist()`` yields
+            [class index, score, x1, y1, x2, y2, x3, y3, x4, y4]. The rows
+            of all images are stored back to back.
+        bbox_nums: Number of rows belonging to each image, in order.
+        image_id: ``image_id[i][0]`` is the id of the i-th image.
+        label_to_cat_id_map: Maps a class index to a category id.
+        bias: Accepted for signature parity with ``get_det_res``; not used
+            here.
+
+    Returns:
+        list[dict]: One dict per kept row with the keys 'image_id',
+        'category_id', 'bbox' (the eight corner coordinates) and 'score'.
+        Rows with a negative class index are dropped.
+    """
+    records = []
+    cursor = 0  # position inside the flat ``bboxes`` sequence
+    for img_idx in range(len(bbox_nums)):
+        img_id = int(image_id[img_idx][0])
+        for _ in range(bbox_nums[img_idx]):
+            row = bboxes[cursor].tolist()
+            cursor += 1
+            cls_idx, score, x1, y1, x2, y2, x3, y3, x4, y4 = row
+            if int(cls_idx) < 0:
+                continue
+            records.append({
+                'image_id': img_id,
+                'category_id': label_to_cat_id_map[int(cls_idx)],
+                'bbox': [x1, y1, x2, y2, x3, y3, x4, y4],
+                'score': score
+            })
+    return records
+
+
+def get_seg_res(masks, bboxes, mask_nums, image_id, label_to_cat_id_map):
+    """
+    Convert instance masks into COCO-style RLE segmentation results.
+
+    Args:
+        masks: Indexable masks, one per detection, stored back to back for
+            all images; each is cast to ``uint8`` before encoding.
+        bboxes: Indexable rows aligned with ``masks``; element 0 is the class
+            index and element 1 is the score.
+        mask_nums: Number of masks belonging to each image, in order.
+        image_id: ``image_id[i][0]`` is the id of the i-th image.
+        label_to_cat_id_map: Maps a class index to a category id.
+
+    Returns:
+        list[dict]: One dict per kept mask with the keys 'image_id',
+        'category_id', 'segmentation' (RLE dict) and 'score'. Entries whose
+        class index equals -1 are dropped.
+    """
+    import pycocotools.mask as mask_util
+    records = []
+    cursor = 0  # position inside the flat ``masks`` / ``bboxes`` sequences
+    for img_idx in range(len(mask_nums)):
+        img_id = int(image_id[img_idx][0])
+        for _ in range(mask_nums[img_idx]):
+            binary_mask = masks[cursor].astype(np.uint8)
+            score = float(bboxes[cursor][1])
+            label = int(bboxes[cursor][0])
+            cursor += 1
+            if label == -1:
+                continue
+            cat_id = label_to_cat_id_map[label]
+            # The encoder is fed a Fortran-ordered (H, W, 1) uint8 array.
+            fortran_mask = np.array(
+                binary_mask[:, :, None], order="F", dtype="uint8")
+            rle = mask_util.encode(fortran_mask)[0]
+            # On Python 3 the counts are decoded to text.
+            if six.PY3 and 'counts' in rle:
+                rle['counts'] = rle['counts'].decode("utf8")
+            records.append({
+                'image_id': img_id,
+                'category_id': cat_id,
+                'segmentation': rle,
+                'score': score
+            })
+    return records
+
+
+def get_solov2_segm_res(results, image_id, num_id_to_cat_id_map):
+    """
+    Convert the segmentation output of one image into COCO-style results.
+
+    Args:
+        results (dict): Holds 'segm' (stack of masks whose first axis
+            indexes the instances), 'cate_label' and 'cate_score'
+            (per-instance class index and score).
+        image_id: ``image_id[0][0]`` is used as the image id of every mask.
+        num_id_to_cat_id_map: Maps a class index to a category id.
+
+    Returns:
+        list[dict] | None: Dicts with the keys 'image_id', 'category_id',
+        'segmentation' (RLE dict) and 'score', or None when 'segm' is None
+        or holds no mask.
+    """
+    segms = results['segm']
+    # Guard first: calling ``astype`` on None would raise before the check
+    # could ever run.
+    if segms is None or segms.shape[0] == 0:
+        return None
+    import pycocotools.mask as mask_util
+    segms = segms.astype(np.uint8)
+    clsid_labels = results['cate_label']
+    clsid_scores = results['cate_score']
+    lengths = segms.shape[0]
+    im_id = int(image_id[0][0])
+    segm_res = []
+    # NOTE(review): the last mask is intentionally not emitted (behavior
+    # kept as is); presumably the producer appends a placeholder entry to
+    # avoid empty outputs -- confirm against the model head.
+    for i in range(lengths - 1):
+        clsid = int(clsid_labels[i])
+        catid = num_id_to_cat_id_map[clsid]
+        score = float(clsid_scores[i])
+        mask = segms[i]
+        segm = mask_util.encode(np.array(mask[:, :, np.newaxis], order='F'))[0]
+        segm['counts'] = segm['counts'].decode('utf8')
+        segm_res.append({
+            'image_id': im_id,
+            'category_id': catid,
+            'segmentation': segm,
+            'score': score
+        })
+    return segm_res
+
+
+def get_keypoint_res(results, im_id):
+    """
+    Build COCO-style keypoint annotations from per-image predictions.
+
+    Args:
+        results (dict): ``results['keypoint'][i]`` is a ``(kpts, scores)``
+            pair for the i-th image; the two are iterated in lockstep, one
+            entry per instance.
+        im_id: Array whose first axis indexes images; ``im_id[i].item()`` is
+            the image id.
+
+    Returns:
+        list[dict]: One dict per instance with the keys 'image_id',
+        'category_id' (always 1), 'keypoints' (flattened values), 'score',
+        'area' and 'bbox' ([x0, y0, w, h]). The box is the extent of every
+        third flattened value starting at offsets 0 (x) and 1 (y).
+    """
+    annotations = []
+    predictions = results['keypoint']
+    for sample_idx in range(im_id.shape[0]):
+        image_id = im_id[sample_idx].item()
+        kpts, scores = predictions[sample_idx]
+        for kpt, score in zip(kpts, scores):
+            flat = kpt.flatten()
+            xs = flat[0::3]
+            ys = flat[1::3]
+            x0, x1 = np.min(xs).item(), np.max(xs).item()
+            y0, y1 = np.min(ys).item(), np.max(ys).item()
+            annotations.append({
+                'image_id': image_id,
+                'category_id': 1,  # XXX hard code
+                'keypoints': flat.tolist(),
+                'score': float(score),
+                'area': (x1 - x0) * (y1 - y0),
+                'bbox': [x0, y0, x1 - x0, y1 - y0]
+            })
+    return annotations

+ 401 - 0
paddlers/models/ppdet/metrics/keypoint_metrics.py

@@ -0,0 +1,401 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+import json
+from collections import defaultdict, OrderedDict
+import numpy as np
+from pycocotools.coco import COCO
+from pycocotools.cocoeval import COCOeval
+from ..modeling.keypoint_utils import oks_nms
+from scipy.io import loadmat, savemat
+from paddlers.models.ppdet.utils.logger import setup_logger
+logger = setup_logger(__name__)
+
+__all__ = ['KeyPointTopDownCOCOEval', 'KeyPointTopDownMPIIEval']
+
+
+class KeyPointTopDownCOCOEval(object):
+    """Collect top-down keypoint predictions and score them with COCOeval.
+
+    refer to
+        https://github.com/leoxiaobin/deep-high-resolution-net.pytorch
+        Copyright (c) Microsoft, under the MIT License.
+
+    Args:
+        anno_file (str): Annotation file handed to ``pycocotools`` ``COCO``.
+        num_samples (int): Number of rows preallocated for predictions.
+        num_joints (int): Number of keypoints per sample.
+        output_eval (str): Directory that receives
+            ``keypoints_results.json``.
+        iou_type (str): Stored on the instance. Default 'keypoints'.
+        in_vis_thre (float): Only joints scoring above this value take part
+            in rescoring. Default 0.2.
+        oks_thre (float): Threshold passed to ``oks_nms``. Default 0.9.
+        save_prediction_only (bool): When True, only the result file is
+            written and the evaluation is skipped. Default False.
+    """
+
+    def __init__(self,
+                 anno_file,
+                 num_samples,
+                 num_joints,
+                 output_eval,
+                 iou_type='keypoints',
+                 in_vis_thre=0.2,
+                 oks_thre=0.9,
+                 save_prediction_only=False):
+        super(KeyPointTopDownCOCOEval, self).__init__()
+        self.coco = COCO(anno_file)
+        self.num_samples = num_samples
+        self.num_joints = num_joints
+        self.iou_type = iou_type
+        self.in_vis_thre = in_vis_thre
+        self.oks_thre = oks_thre
+        self.output_eval = output_eval
+        self.res_file = os.path.join(output_eval, "keypoints_results.json")
+        self.save_prediction_only = save_prediction_only
+        self.reset()
+
+    def reset(self):
+        """Drop all accumulated predictions and evaluation results."""
+        self.results = {
+            'all_preds': np.zeros(
+                (self.num_samples, self.num_joints, 3), dtype=np.float32),
+            'all_boxes': np.zeros((self.num_samples, 6)),
+            'image_path': []
+        }
+        self.eval_results = {}
+        self.idx = 0
+
+    def update(self, inputs, outputs):
+        """Store the predictions of one batch.
+
+        Args:
+            inputs (dict): Needs 'image' (only its first dimension is read
+                as the batch size) plus 'center', 'scale', 'score' and
+                'im_id', each exposing ``numpy()``.
+            outputs (dict): ``outputs['keypoint'][0]`` is a pair whose first
+                element holds the keypoints of the batch.
+        """
+        kpts, _ = outputs['keypoint'][0]
+
+        num_images = inputs['image'].shape[0]
+        rows = slice(self.idx, self.idx + num_images)
+        scale = inputs['scale'].numpy()
+        boxes = self.results['all_boxes']
+        self.results['all_preds'][rows, :, 0:3] = kpts[:, :, 0:3]
+        # Box columns: center (0:2), scale (2:4), prod(scale * 200) (4),
+        # score (5).
+        boxes[rows, 0:2] = inputs['center'].numpy()[:, 0:2]
+        boxes[rows, 2:4] = scale[:, 0:2]
+        boxes[rows, 4] = np.prod(scale * 200, 1)
+        boxes[rows, 5] = np.squeeze(inputs['score'].numpy())
+        self.results['image_path'].extend(inputs['im_id'].numpy())
+
+        self.idx += num_images
+
+    def _write_coco_keypoint_results(self, keypoints):
+        """Dump ``keypoints`` (per-image lists of people) to the result file."""
+        data_pack = {
+            'cat_id': 1,
+            'cls': 'person',
+            'ann_type': 'keypoints',
+            'keypoints': keypoints
+        }
+        results = self._coco_keypoint_results_one_category_kernel(data_pack)
+        # exist_ok avoids the race between an existence check and creation.
+        os.makedirs(self.output_eval, exist_ok=True)
+        with open(self.res_file, 'w') as f:
+            json.dump(results, f, sort_keys=True, indent=4)
+            logger.info(f'The keypoint result is saved to {self.res_file}.')
+        try:
+            # Re-read the file to check that it parses; the context manager
+            # closes the handle even when parsing fails.
+            with open(self.res_file) as f:
+                json.load(f)
+        except Exception:
+            # Best-effort repair: force the last line to close the list.
+            with open(self.res_file, 'r') as f:
+                content = list(f)
+            content[-1] = ']'
+            with open(self.res_file, 'w') as f:
+                f.writelines(content)
+
+    def _coco_keypoint_results_one_category_kernel(self, data_pack):
+        """Flatten per-image people into a list of COCO result dicts.
+
+        Args:
+            data_pack (dict): Needs 'cat_id' and 'keypoints'; the latter is
+                a sequence of per-image lists of dicts with the keys
+                'keypoints', 'image', 'score', 'center' and 'scale'.
+
+        Returns:
+            list[dict]: One dict per person; images without people are
+            skipped.
+        """
+        cat_id = data_pack['cat_id']
+        cat_results = []
+
+        for img_kpts in data_pack['keypoints']:
+            if len(img_kpts) == 0:
+                continue
+
+            # One row per person with all keypoint values flattened.
+            _key_points = np.array([person['keypoints'] for person in img_kpts])
+            _key_points = _key_points.reshape(_key_points.shape[0], -1)
+
+            cat_results.extend({
+                'image_id': person['image'],
+                'category_id': cat_id,
+                'keypoints': _key_points[k].tolist(),
+                'score': person['score'],
+                'center': list(person['center']),
+                'scale': list(person['scale'])
+            } for k, person in enumerate(img_kpts))
+
+        return cat_results
+
+    def get_final_results(self, preds, all_boxes, img_path):
+        """Group predictions per image, rescore them, run OKS NMS and save.
+
+        Args:
+            preds: Array indexed as [sample, joint, 3]; the third value of a
+                joint is used as its score.
+            all_boxes: Rows aligned with ``preds``; columns 0:2 center,
+                2:4 scale, 4 area, 5 score.
+            img_path: Per-sample image ids (converted with ``int``).
+        """
+        # image x person x (keypoints)
+        kpts = defaultdict(list)
+        for idx, kpt in enumerate(preds):
+            image = int(img_path[idx])
+            kpts[image].append({
+                'keypoints': kpt,
+                'center': all_boxes[idx][0:2],
+                'scale': all_boxes[idx][2:4],
+                'area': all_boxes[idx][4],
+                'score': all_boxes[idx][5],
+                'image': image
+            })
+
+        # rescoring and oks nms
+        num_joints = preds.shape[1]
+        in_vis_thre = self.in_vis_thre
+        oks_thre = self.oks_thre
+        oks_nmsed_kpts = []
+        for img_kpts in kpts.values():
+            for n_p in img_kpts:
+                box_score = n_p['score']
+                kpt_score = 0
+                valid_num = 0
+                for n_jt in range(0, num_joints):
+                    t_s = n_p['keypoints'][n_jt][2]
+                    if t_s > in_vis_thre:
+                        kpt_score = kpt_score + t_s
+                        valid_num = valid_num + 1
+                if valid_num != 0:
+                    kpt_score = kpt_score / valid_num
+                # rescoring: mean score of the confident joints times the
+                # box score
+                n_p['score'] = kpt_score * box_score
+
+            keep = oks_nms(list(img_kpts), oks_thre)
+
+            # An empty ``keep`` leaves the image's people untouched.
+            if len(keep) == 0:
+                oks_nmsed_kpts.append(img_kpts)
+            else:
+                oks_nmsed_kpts.append([img_kpts[_keep] for _keep in keep])
+
+        self._write_coco_keypoint_results(oks_nmsed_kpts)
+
+    def accumulate(self):
+        """Write the result file and, unless disabled, run COCOeval on it."""
+        self.get_final_results(self.results['all_preds'],
+                               self.results['all_boxes'],
+                               self.results['image_path'])
+        if self.save_prediction_only:
+            logger.info(f'The keypoint result is saved to {self.res_file} '
+                        'and do not evaluate the mAP.')
+            return
+        coco_dt = self.coco.loadRes(self.res_file)
+        coco_eval = COCOeval(self.coco, coco_dt, 'keypoints')
+        coco_eval.params.useSegm = None
+        coco_eval.evaluate()
+        coco_eval.accumulate()
+        coco_eval.summarize()
+
+        self.eval_results['keypoint'] = list(coco_eval.stats)
+
+    def log(self):
+        """Print the stored statistics as a small text table."""
+        if self.save_prediction_only:
+            return
+        stats_names = [
+            'AP', 'Ap .5', 'AP .75', 'AP (M)', 'AP (L)', 'AR', 'AR .5',
+            'AR .75', 'AR (M)', 'AR (L)'
+        ]
+        num_values = len(stats_names)
+        print(' '.join(['| {}'.format(name) for name in stats_names]) + ' |')
+        print('|---' * (num_values + 1) + '|')
+
+        print(' '.join([
+            '| {:.3f}'.format(value) for value in self.eval_results['keypoint']
+        ]) + ' |')
+
+    def get_results(self):
+        """Return the dict of evaluation results gathered so far."""
+        return self.eval_results
+
+
+class KeyPointTopDownMPIIEval(object):
+    """Collect top-down keypoint predictions and compute PCKh for MPII.
+
+    Args:
+        anno_file (str): Annotation file; ``mpii_gt_val.mat`` is read from
+            the same directory during evaluation.
+        num_samples: Accepted but not used by this class.
+        num_joints: Accepted but not used by this class.
+        output_eval (str): Directory that receives
+            ``keypoints_results.json``.
+        oks_thre (float): Accepted but not used by this class.
+        save_prediction_only (bool): When True, only the result file is
+            written and the evaluation is skipped. Default False.
+    """
+
+    def __init__(self,
+                 anno_file,
+                 num_samples,
+                 num_joints,
+                 output_eval,
+                 oks_thre=0.9,
+                 save_prediction_only=False):
+        super(KeyPointTopDownMPIIEval, self).__init__()
+        self.ann_file = anno_file
+        self.res_file = os.path.join(output_eval, "keypoints_results.json")
+        self.save_prediction_only = save_prediction_only
+        self.reset()
+
+    def reset(self):
+        """Drop all accumulated predictions and evaluation results."""
+        self.results = []
+        self.eval_results = {}
+        self.idx = 0
+
+    def update(self, inputs, outputs):
+        """Store the predictions of one batch.
+
+        Args:
+            inputs (dict): Needs 'image' (only its first dimension is read
+                as the batch size), 'center', 'scale' and 'score' (each
+                exposing ``numpy()``) and 'image_file'.
+            outputs (dict): ``outputs['keypoint'][0]`` is a pair whose first
+                element holds the keypoints of the batch.
+        """
+        kpts, _ = outputs['keypoint'][0]
+
+        num_images = inputs['image'].shape[0]
+        scale = inputs['scale'].numpy()
+        # Box columns: center (0:2), scale (2:4), prod(scale * 200) (4),
+        # score (5).
+        boxes = np.zeros((num_images, 6))
+        boxes[:, 0:2] = inputs['center'].numpy()[:, 0:2]
+        boxes[:, 2:4] = scale[:, 0:2]
+        boxes[:, 4] = np.prod(scale * 200, 1)
+        boxes[:, 5] = np.squeeze(inputs['score'].numpy())
+
+        self.results.append({
+            'preds': kpts[:, :, 0:3],
+            'boxes': boxes,
+            'image_path': inputs['image_file']
+        })
+
+    def accumulate(self):
+        """Write the result file and, unless disabled, compute PCKh."""
+        self._mpii_keypoint_results_save()
+        if self.save_prediction_only:
+            logger.info(f'The keypoint result is saved to {self.res_file} '
+                        'and do not evaluate the mAP.')
+            return
+
+        self.eval_results = self.evaluate(self.results)
+
+    def _mpii_keypoint_results_save(self):
+        """Dump one JSON record per stored sample to the result file."""
+        results = []
+        for res in self.results:
+            # ``res`` is a dict, so its own length is the number of keys,
+            # not the batch size; iterate over the predictions instead so
+            # that every sample of the batch is saved exactly once.
+            for k in range(len(res['preds'])):
+                results.append({
+                    'preds': res['preds'][k].tolist(),
+                    'boxes': res['boxes'][k].tolist(),
+                    'image_path': res['image_path'][k],
+                })
+        with open(self.res_file, 'w') as f:
+            json.dump(results, f, sort_keys=True, indent=4)
+        logger.info(f'The keypoint result is saved to {self.res_file}.')
+
+    def log(self):
+        """Print every evaluation result as ``name : value``."""
+        if self.save_prediction_only:
+            return
+        for item, value in self.eval_results.items():
+            print("{} : {}".format(item, value))
+
+    def get_results(self):
+        """Return the evaluation results gathered so far."""
+        return self.eval_results
+
+    def evaluate(self, outputs, savepath=None):
+        """Evaluate PCKh for MPII dataset. refer to
+        https://github.com/leoxiaobin/deep-high-resolution-net.pytorch
+        Copyright (c) Microsoft, under the MIT License.
+
+        Args:
+            outputs(list(preds, boxes)):
+
+                * preds (np.ndarray[N,K,3]): The first two dimensions are
+                  coordinates, score is the third dimension of the array.
+                * boxes (np.ndarray[N,6]): [center[0], center[1], scale[0]
+                  , scale[1],area, score]
+            savepath (str, optional): When given, the shifted predictions
+                are also saved as ``pred.mat`` in this directory.
+
+        Returns:
+            dict: PCKh for each joint
+        """
+
+        # Gather the per-sample predictions of all batches into one array.
+        kpts = []
+        for output in outputs:
+            batch_preds = output['preds']
+            for i in range(batch_preds.shape[0]):
+                kpts.append(batch_preds[i])
+
+        preds = np.stack(kpts)
+
+        # convert 0-based index to 1-based index,
+        # and get the first two dimensions.
+        preds = preds[..., :2] + 1.0
+
+        if savepath is not None:
+            pred_file = os.path.join(savepath, 'pred.mat')
+            savemat(pred_file, mdict={'preds': preds})
+
+        SC_BIAS = 0.6
+        threshold = 0.5
+
+        gt_file = os.path.join(
+            os.path.dirname(self.ann_file), 'mpii_gt_val.mat')
+        gt_dict = loadmat(gt_file)
+        dataset_joints = gt_dict['dataset_joints']
+        jnt_missing = gt_dict['jnt_missing']
+        pos_gt_src = gt_dict['pos_gt_src']
+        headboxes_src = gt_dict['headboxes_src']
+
+        # Move the sample axis last before comparing with the ground truth.
+        pos_pred_src = np.transpose(preds, [1, 2, 0])
+
+        head = np.where(dataset_joints == 'head')[1][0]
+        lsho = np.where(dataset_joints == 'lsho')[1][0]
+        lelb = np.where(dataset_joints == 'lelb')[1][0]
+        lwri = np.where(dataset_joints == 'lwri')[1][0]
+        lhip = np.where(dataset_joints == 'lhip')[1][0]
+        lkne = np.where(dataset_joints == 'lkne')[1][0]
+        lank = np.where(dataset_joints == 'lank')[1][0]
+
+        rsho = np.where(dataset_joints == 'rsho')[1][0]
+        relb = np.where(dataset_joints == 'relb')[1][0]
+        rwri = np.where(dataset_joints == 'rwri')[1][0]
+        rkne = np.where(dataset_joints == 'rkne')[1][0]
+        rank = np.where(dataset_joints == 'rank')[1][0]
+        rhip = np.where(dataset_joints == 'rhip')[1][0]
+
+        jnt_visible = 1 - jnt_missing
+        uv_error = pos_pred_src - pos_gt_src
+        uv_err = np.linalg.norm(uv_error, axis=1)
+        # Errors are normalized by the head box diagonal times SC_BIAS.
+        headsizes = headboxes_src[1, :, :] - headboxes_src[0, :, :]
+        headsizes = np.linalg.norm(headsizes, axis=0)
+        headsizes *= SC_BIAS
+        scale = headsizes * np.ones((len(uv_err), 1), dtype=np.float32)
+        scaled_uv_err = uv_err / scale
+        scaled_uv_err = scaled_uv_err * jnt_visible
+        jnt_count = np.sum(jnt_visible, axis=1)
+        less_than_threshold = (scaled_uv_err <= threshold) * jnt_visible
+        PCKh = 100. * np.sum(less_than_threshold, axis=1) / jnt_count
+
+        # PCKh for every threshold from 0 to 0.5 in steps of 0.01
+        rng = np.arange(0, 0.5 + 0.01, 0.01)
+        pckAll = np.zeros((len(rng), 16), dtype=np.float32)
+
+        for r, thr in enumerate(rng):
+            less_than_threshold = (scaled_uv_err <= thr) * jnt_visible
+            pckAll[r, :] = 100. * np.sum(less_than_threshold,
+                                         axis=1) / jnt_count
+
+        # Joints 6 and 7 are masked out of the averaged figures.
+        PCKh = np.ma.array(PCKh, mask=False)
+        PCKh.mask[6:8] = True
+
+        jnt_count = np.ma.array(jnt_count, mask=False)
+        jnt_count.mask[6:8] = True
+        jnt_ratio = jnt_count / np.sum(jnt_count).astype(np.float64)
+
+        # NOTE(review): row 11 of pckAll belongs to threshold 0.11 while the
+        # entry is labelled 'PCKh@0.1' -- confirm this is intended.
+        name_value = [  #noqa
+            ('Head', PCKh[head]),
+            ('Shoulder', 0.5 * (PCKh[lsho] + PCKh[rsho])),
+            ('Elbow', 0.5 * (PCKh[lelb] + PCKh[relb])),
+            ('Wrist', 0.5 * (PCKh[lwri] + PCKh[rwri])),
+            ('Hip', 0.5 * (PCKh[lhip] + PCKh[rhip])),
+            ('Knee', 0.5 * (PCKh[lkne] + PCKh[rkne])),
+            ('Ankle', 0.5 * (PCKh[lank] + PCKh[rank])),
+            ('PCKh', np.sum(PCKh * jnt_ratio)),
+            ('PCKh@0.1', np.sum(pckAll[11, :] * jnt_ratio))
+        ]
+        name_value = OrderedDict(name_value)
+
+        return name_value
+
+    def _sort_and_unique_bboxes(self, kpts, key='bbox_id'):
+        """sort kpts and remove the repeated ones."""
+        kpts = sorted(kpts, key=lambda x: x[key])
+        num = len(kpts)
+        # Walk backwards so deletions do not shift the unvisited entries.
+        for i in range(num - 1, 0, -1):
+            if kpts[i][key] == kpts[i - 1][key]:
+                del kpts[i]
+
+        return kpts

+ 444 - 0
paddlers/models/ppdet/metrics/map_utils.py

@@ -0,0 +1,444 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+from __future__ import unicode_literals
+
+import os
+import sys
+import numpy as np
+import itertools
+import paddle
+from paddlers.models.ppdet.modeling.bbox_utils import poly2rbox, rbox2poly_np
+
+from paddlers.models.ppdet.utils.logger import setup_logger
+logger = setup_logger(__name__)
+
+__all__ = [
+    'draw_pr_curve',
+    'bbox_area',
+    'jaccard_overlap',
+    'prune_zero_padding',
+    'DetectionMAP',
+    'ap_per_class',
+    'compute_ap',
+]
+
+
+def draw_pr_curve(precision,
+                  recall,
+                  iou=0.5,
+                  out_dir='pr_curve',
+                  file_name='precision_recall_curve.jpg'):
+    """
+    Plot a precision/recall curve and save it as an image.
+
+    Args:
+        precision: Values plotted on the y axis.
+        recall: Values plotted on the x axis.
+        iou (float): Only shown in the figure title. Default 0.5.
+        out_dir (str): Output directory; created when it does not exist.
+        file_name (str): Name of the image file inside ``out_dir``.
+
+    Raises:
+        Exception: Whatever the matplotlib import raised, after logging a
+            hint on how to install it.
+    """
+    output_path = os.path.join(out_dir, file_name)
+    if not os.path.exists(out_dir):
+        os.makedirs(out_dir)
+    try:
+        import matplotlib.pyplot as pyplot
+    except Exception as e:
+        logger.error('Matplotlib not found, plaese install matplotlib.'
+                     'for example: `pip install matplotlib`.')
+        raise e
+    # Clear the current axes, then draw on the figure named 'P-R Curve'.
+    pyplot.cla()
+    pyplot.figure('P-R Curve')
+    pyplot.title('Precision/Recall Curve(IoU={})'.format(iou))
+    pyplot.xlabel('Recall')
+    pyplot.ylabel('Precision')
+    pyplot.grid(True)
+    pyplot.plot(recall, precision)
+    pyplot.savefig(output_path)
+
+
+def bbox_area(bbox, is_bbox_normalized):
+    """
+    Return the area of a box given as [xmin, ymin, xmax, ymax].
+
+    Each side length gets 1 added when ``is_bbox_normalized`` is falsy and
+    0 when it is truthy.
+    """
+    offset = 1. - float(is_bbox_normalized)
+    xmin, ymin, xmax, ymax = bbox[0], bbox[1], bbox[2], bbox[3]
+    return (xmax - xmin + offset) * (ymax - ymin + offset)
+
+
+def jaccard_overlap(pred, gt, is_bbox_normalized=False):
+    """
+    Return the intersection-over-union of two [xmin, ymin, xmax, ymax] boxes.
+
+    Boxes that do not overlap, or that merely touch along an edge, yield 0.
+    Areas are computed with ``bbox_area`` using ``is_bbox_normalized``.
+    """
+    if (pred[0] >= gt[2] or pred[2] <= gt[0] or pred[1] >= gt[3] or
+            pred[3] <= gt[1]):
+        return 0.
+    inter_box = [
+        max(pred[0], gt[0]), max(pred[1], gt[1]), min(pred[2], gt[2]),
+        min(pred[3], gt[3])
+    ]
+    inter_size = bbox_area(inter_box, is_bbox_normalized)
+    pred_size = bbox_area(pred, is_bbox_normalized)
+    gt_size = bbox_area(gt, is_bbox_normalized)
+    return float(inter_size) / (pred_size + gt_size - inter_size)
+
+
+def calc_rbox_iou(pred, gt_rbox):
+    """
+    Return the IoU between a predicted quadrilateral and a rotated box.
+
+    Args:
+        pred: Eight values, reshaped to four (x, y) points.
+        gt_rbox: Five values describing the rotated ground-truth box, as
+            accepted by ``rbox2poly_np``.
+
+    The axis-aligned hulls are compared first; when they do not overlap
+    that result is returned directly and the custom ``rbox_iou`` op is never
+    imported. If the op cannot be imported, the process exits with -1.
+    """
+    # Cheap pre-check on the axis-aligned hulls of both shapes.
+    pred_pts = np.array(pred, np.float32).reshape(-1, 8).reshape(-1, 2)
+    gt_pts = rbox2poly_np(np.array(gt_rbox).reshape(-1, 5))[0].reshape(-1, 2)
+    pred_rect = [
+        np.min(pred_pts[:, 0]), np.min(pred_pts[:, 1]),
+        np.max(pred_pts[:, 0]), np.max(pred_pts[:, 1])
+    ]
+    gt_rect = [
+        np.min(gt_pts[:, 0]), np.min(gt_pts[:, 1]), np.max(gt_pts[:, 0]),
+        np.max(gt_pts[:, 1])
+    ]
+    hull_iou = jaccard_overlap(pred_rect, gt_rect, False)
+    if hull_iou <= 0:
+        return hull_iou
+
+    # Exact rotated IoU through the custom op.
+    pred_poly = np.array(pred_pts.reshape(-1, 8), np.float32).reshape(-1, 8)
+    pred_rbox = poly2rbox(pred_poly).reshape(-1, 5)
+    try:
+        from rbox_iou_ops import rbox_iou
+    except Exception as e:
+        print("import custom_ops error, try install rbox_iou_ops " \
+                  "following ppdet/ext_op/README.md", e)
+        sys.stdout.flush()
+        sys.exit(-1)
+    gt_arr = np.array(gt_rbox, np.float32).reshape(-1, 5)
+    pd_gt_rbox = paddle.to_tensor(gt_arr, dtype='float32')
+    pd_pred_rbox = paddle.to_tensor(pred_rbox, dtype='float32')
+    return rbox_iou(pd_gt_rbox, pd_pred_rbox).numpy()[0][0]
+
+
+def prune_zero_padding(gt_box, gt_label, difficult=None):
+    """
+    Cut the ground truth at the first box whose first four values are all 0.
+
+    Returns:
+        tuple: ``gt_box``, ``gt_label`` and ``difficult`` truncated to the
+        rows before that box; the third element is None when ``difficult``
+        is None.
+    """
+    valid_cnt = 0
+    for row in range(len(gt_box)):
+        if all(gt_box[row, col] == 0 for col in range(4)):
+            break
+        valid_cnt += 1
+    kept_difficult = None
+    if difficult is not None:
+        kept_difficult = difficult[:valid_cnt]
+    return (gt_box[:valid_cnt], gt_label[:valid_cnt], kept_difficult)
+
+
+class DetectionMAP(object):
+    """
+    Calculate detection mean average precision.
+    Currently support two types: 11point and integral
+
+    Args:
+        class_num (int): The class number.
+        overlap_thresh (float): The threshold of overlap
+            ratio between prediction bounding box and
+            ground truth bounding box for deciding
+            true/false positive. Default 0.5.
+        map_type (str): Calculation method of mean average
+            precision, currently support '11point' and
+            'integral'. Default '11point'.
+        is_bbox_normalized (bool): Whether bounding boxes
+            is normalized to range[0, 1]. Default False.
+        evaluate_difficult (bool): Whether to evaluate
+            difficult bounding boxes. Default False.
+        catid2name (dict): Mapping between category id and category name.
+        classwise (bool): Whether per-category AP and draw
+            P-R Curve or not.
+    """
+
+    def __init__(self,
+                 class_num,
+                 overlap_thresh=0.5,
+                 map_type='11point',
+                 is_bbox_normalized=False,
+                 evaluate_difficult=False,
+                 catid2name=None,
+                 classwise=False):
+        """
+        Store the evaluation settings and start from empty statistics.
+
+        When ``catid2name`` is None (its default) the class names fall back
+        to the stringified class indices instead of failing on
+        ``None.values()``.
+        """
+        self.class_num = class_num
+        self.overlap_thresh = overlap_thresh
+        assert map_type in ['11point', 'integral'], \
+                "map_type currently only support '11point' "\
+                "and 'integral'"
+        self.map_type = map_type
+        self.is_bbox_normalized = is_bbox_normalized
+        self.evaluate_difficult = evaluate_difficult
+        self.classwise = classwise
+        if catid2name is None:
+            self.classes = [str(idx) for idx in range(class_num)]
+        else:
+            self.classes = list(catid2name.values())
+        self.reset()
+
+    def update(self, bbox, score, label, gt_box, gt_label, difficult=None):
+        """
+        Update the metric statistics with the predictions and ground truth
+        of one image.
+
+        Args:
+            bbox: Predicted boxes, iterated in lockstep with ``score`` and
+                ``label``.
+            score: Confidence of each prediction.
+            label: Class index of each prediction.
+            gt_box: Ground-truth boxes; a box of length 5 is compared with
+                ``calc_rbox_iou``, any other with ``jaccard_overlap``.
+            gt_label: Class index of each ground-truth box.
+            difficult: Per-box difficult flags; treated as all zeros when
+                None.
+        """
+        if difficult is None:
+            difficult = np.zeros_like(gt_label)
+
+        # record class gt count
+        # (difficult boxes are only counted when evaluate_difficult is set)
+        for gtl, diff in zip(gt_label, difficult):
+            if self.evaluate_difficult or int(diff) == 0:
+                self.class_gt_counts[int(np.array(gtl))] += 1
+
+        # record class score positive
+        # visited[i] becomes True once ground-truth box i has been matched
+        visited = [False] * len(gt_label)
+        for b, s, l in zip(bbox, score, label):
+            pred = b.tolist() if isinstance(b, np.ndarray) else b
+            max_idx = -1
+            max_overlap = -1.0
+            # Find the best-overlapping ground-truth box of the same class.
+            for i, gl in enumerate(gt_label):
+                if int(gl) == int(l):
+                    if len(gt_box[i]) == 5:
+                        overlap = calc_rbox_iou(pred, gt_box[i])
+                    else:
+                        overlap = jaccard_overlap(pred, gt_box[i],
+                                                  self.is_bbox_normalized)
+                    if overlap > max_overlap:
+                        max_overlap = overlap
+                        max_idx = i
+
+            if max_overlap > self.overlap_thresh:
+                # A match with a difficult box that is not evaluated records
+                # nothing at all (neither true nor false positive).
+                if self.evaluate_difficult or \
+                        int(np.array(difficult[max_idx])) == 0:
+                    if not visited[max_idx]:
+                        # First match of this box: true positive.
+                        self.class_score_poss[int(l)].append([s, 1.0])
+                        visited[max_idx] = True
+                    else:
+                        # Box already matched: duplicate, false positive.
+                        self.class_score_poss[int(l)].append([s, 0.0])
+            else:
+                # Insufficient overlap (or no box of this class): false
+                # positive.
+                self.class_score_poss[int(l)].append([s, 0.0])
+
+    def reset(self):
+        """
+        Clear every accumulated statistic and set mAP back to 0.
+        """
+        num_classes = self.class_num
+        self.mAP = 0.0
+        self.class_gt_counts = [0 for _ in range(num_classes)]
+        self.class_score_poss = [[] for _ in range(num_classes)]
+
+    def accumulate(self):
+        """
+        Accumulate metric results and calculate mAP.
+
+        Classes without ground truth are ignored. Classes with ground truth
+        but no prediction count towards the average with an AP of 0 and get
+        no entry in ``self.eval_results``. Every entry of
+        ``self.eval_results`` holds the keys 'class', 'ap', 'precision' and
+        'recall'.
+        """
+        mAP = 0.
+        valid_cnt = 0
+        eval_results = []
+        # ``cls_idx`` is the position of the class in the statistics lists;
+        # it is used to look up the class name so that skipped classes do
+        # not shift the names of the classes that follow them.
+        for cls_idx, (score_pos, count) in enumerate(
+                zip(self.class_score_poss, self.class_gt_counts)):
+            if count == 0:
+                continue
+            if len(score_pos) == 0:
+                valid_cnt += 1
+                continue
+
+            accum_tp_list, accum_fp_list = \
+                    self._get_tp_fp_accum(score_pos)
+            precision = []
+            recall = []
+            for ac_tp, ac_fp in zip(accum_tp_list, accum_fp_list):
+                precision.append(float(ac_tp) / (ac_tp + ac_fp))
+                recall.append(float(ac_tp) / count)
+
+            one_class_ap = 0.0
+            if self.map_type == '11point':
+                # Best precision at recall levels 1.0, 0.9, ..., 0.0,
+                # scanning the curve from its end.
+                max_precisions = [0.] * 11
+                start_idx = len(precision) - 1
+                for j in range(10, -1, -1):
+                    for i in range(start_idx, -1, -1):
+                        if recall[i] < float(j) / 10.:
+                            start_idx = i
+                            if j > 0:
+                                max_precisions[j - 1] = max_precisions[j]
+                                break
+                        else:
+                            if max_precisions[j] < precision[i]:
+                                max_precisions[j] = precision[i]
+                one_class_ap = sum(max_precisions) / 11.
+                mAP += one_class_ap
+                valid_cnt += 1
+            elif self.map_type == 'integral':
+                # Sum precision times recall increment over the curve.
+                prev_recall = 0.
+                for i in range(len(precision)):
+                    recall_gap = abs(recall[i] - prev_recall)
+                    if recall_gap > 1e-6:
+                        one_class_ap += precision[i] * recall_gap
+                        prev_recall = recall[i]
+                mAP += one_class_ap
+                valid_cnt += 1
+            else:
+                logger.error("Unspported mAP type {}".format(self.map_type))
+                sys.exit(1)
+            eval_results.append({
+                'class': self.classes[cls_idx],
+                'ap': one_class_ap,
+                'precision': precision,
+                'recall': recall,
+            })
+        self.eval_results = eval_results
+        self.mAP = mAP / float(valid_cnt) if valid_cnt > 0 else mAP
+
+    def get_map(self):
+        """
+        Return the mAP; with ``classwise`` set, also log a per-category AP
+        table and write one PR curve image per category to ``voc_pr_curve``.
+        """
+        if self.mAP is None:
+            logger.error("mAP is not calculated.")
+        if not self.classwise:
+            return self.mAP
+
+        # Compute per-category AP and PR curve
+        try:
+            from terminaltables import AsciiTable
+        except Exception as e:
+            logger.error(
+                'terminaltables not found, plaese install terminaltables. '
+                'for example: `pip install terminaltables`.')
+            raise e
+        results_per_category = []
+        for eval_result in self.eval_results:
+            cls_name = eval_result['class']
+            ap_text = '{:0.3f}'.format(float(eval_result['ap']))
+            results_per_category.append((str(cls_name), ap_text))
+            draw_pr_curve(
+                eval_result['precision'],
+                eval_result['recall'],
+                out_dir='voc_pr_curve',
+                file_name='{}_precision_recall_curve.jpg'.format(cls_name))
+
+        # Lay the (category, AP) pairs out in at most three pairs per row.
+        num_columns = min(6, len(results_per_category) * 2)
+        results_flatten = list(itertools.chain(*results_per_category))
+        headers = ['category', 'AP'] * (num_columns // 2)
+        columns = [
+            results_flatten[i::num_columns] for i in range(num_columns)
+        ]
+        table_data = [headers]
+        table_data.extend(itertools.zip_longest(*columns))
+        table = AsciiTable(table_data)
+        logger.info('Per-category of VOC AP: \n{}'.format(table.table))
+        logger.info(
+            "per-category PR curve has output to voc_pr_curve folder.")
+        return self.mAP
+
+    def _get_tp_fp_accum(self, score_pos_list):
+        """
+        Calculate accumulating true/false positive results from
+        [score, pos] records
+        """
+        sorted_list = sorted(score_pos_list, key=lambda s: s[0], reverse=True)
+        accum_tp = 0
+        accum_fp = 0
+        accum_tp_list = []
+        accum_fp_list = []
+        for (score, pos) in sorted_list:
+            accum_tp += int(pos)
+            accum_tp_list.append(accum_tp)
+            accum_fp += 1 - int(pos)
+            accum_fp_list.append(accum_fp)
+        return accum_tp_list, accum_fp_list
+
+
+def ap_per_class(tp, conf, pred_cls, target_cls):
+    """
+    Computes the average precision, given the recall and precision curves.
+    Method originally from https://github.com/rafaelpadilla/Object-Detection-Metrics.
+
+    Args:
+        tp (list): True positives.
+        conf (list): Objectness value from 0-1.
+        pred_cls (list): Predicted object classes.
+        target_cls (list): Target object classes.
+    """
+    tp, conf, pred_cls, target_cls = np.array(tp), np.array(conf), np.array(
+        pred_cls), np.array(target_cls)
+
+    # Sort by objectness
+    i = np.argsort(-conf)
+    tp, conf, pred_cls = tp[i], conf[i], pred_cls[i]
+
+    # Find unique classes
+    unique_classes = np.unique(np.concatenate((pred_cls, target_cls), 0))
+
+    # Create Precision-Recall curve and compute AP for each class
+    ap, p, r = [], [], []
+    for c in unique_classes:
+        i = pred_cls == c
+        n_gt = sum(target_cls == c)  # Number of ground truth objects
+        n_p = sum(i)  # Number of predicted objects
+
+        if (n_p == 0) and (n_gt == 0):
+            continue
+        elif (n_p == 0) or (n_gt == 0):
+            ap.append(0)
+            r.append(0)
+            p.append(0)
+        else:
+            # Accumulate FPs and TPs
+            fpc = np.cumsum(1 - tp[i])
+            tpc = np.cumsum(tp[i])
+
+            # Recall
+            recall_curve = tpc / (n_gt + 1e-16)
+            r.append(tpc[-1] / (n_gt + 1e-16))
+
+            # Precision
+            precision_curve = tpc / (tpc + fpc)
+            p.append(tpc[-1] / (tpc[-1] + fpc[-1]))
+
+            # AP from recall-precision curve
+            ap.append(compute_ap(recall_curve, precision_curve))
+
+    return np.array(ap), unique_classes.astype('int32'), np.array(r), np.array(
+        p)
+
+
+def compute_ap(recall, precision):
+    """
+    Computes the average precision, given the recall and precision curves.
+    Code originally from https://github.com/rbgirshick/py-faster-rcnn.
+
+    Args:
+        recall (list): The recall curve.
+        precision (list): The precision curve.
+
+    Returns:
+        The average precision as computed in py-faster-rcnn.
+    """
+    # correct AP calculation
+    # first append sentinel values at the end
+    mrec = np.concatenate(([0.], recall, [1.]))
+    mpre = np.concatenate(([0.], precision, [0.]))
+
+    # compute the precision envelope
+    for i in range(mpre.size - 1, 0, -1):
+        mpre[i - 1] = np.maximum(mpre[i - 1], mpre[i])
+
+    # to calculate area under PR curve, look for points
+    # where X axis (recall) changes value
+    i = np.where(mrec[1:] != mrec[:-1])[0]
+
+    # and sum (\Delta recall) * prec
+    ap = np.sum((mrec[i + 1] - mrec[i]) * mpre[i + 1])
+    return ap

+ 470 - 0
paddlers/models/ppdet/metrics/mcmot_metrics.py

@@ -0,0 +1,470 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import os
+import copy
+import sys
+import math
+from collections import defaultdict
+from motmetrics.math_util import quiet_divide
+
+import numpy as np
+import pandas as pd
+
+import paddle
+import paddle.nn.functional as F
+from .metrics import Metric
+import motmetrics as mm
+import openpyxl
+metrics = mm.metrics.motchallenge_metrics
+mh = mm.metrics.create()
+from paddlers.models.ppdet.utils.logger import setup_logger
+logger = setup_logger(__name__)
+
+__all__ = ['MCMOTEvaluator', 'MCMOTMetric']
+
+# Metric names computed for every accumulator: parse_accs_metrics passes this
+# list as ``metrics`` to MCMOTEvaluator.get_summary.
+METRICS_LIST = [
+    'num_frames', 'num_matches', 'num_switches', 'num_transfer', 'num_ascend',
+    'num_migrate', 'num_false_positives', 'num_misses', 'num_detections',
+    'num_objects', 'num_predictions', 'num_unique_objects', 'mostly_tracked',
+    'partially_tracked', 'mostly_lost', 'num_fragmentations', 'motp', 'mota',
+    'precision', 'recall', 'idfp', 'idfn', 'idtp', 'idp', 'idr', 'idf1'
+]
+
+# Metric name -> column title, handed to ``mm.io.render_summary`` as
+# ``namemap`` when a summary table is printed.
+NAME_MAP = {
+    'num_frames': 'num_frames',
+    'num_matches': 'num_matches',
+    'num_switches': 'IDs',
+    'num_transfer': 'IDt',
+    'num_ascend': 'IDa',
+    'num_migrate': 'IDm',
+    'num_false_positives': 'FP',
+    'num_misses': 'FN',
+    'num_detections': 'num_detections',
+    'num_objects': 'num_objects',
+    'num_predictions': 'num_predictions',
+    'num_unique_objects': 'GT',
+    'mostly_tracked': 'MT',
+    'partially_tracked': 'partially_tracked',
+    'mostly_lost': 'ML',
+    'num_fragmentations': 'FM',
+    'motp': 'MOTP',
+    'mota': 'MOTA',
+    'precision': 'Prcn',
+    'recall': 'Rcll',
+    'idfp': 'idfp',
+    'idfn': 'idfn',
+    'idtp': 'idtp',
+    'idp': 'IDP',
+    'idr': 'IDR',
+    'idf1': 'IDF1'
+}
+
+
+def parse_accs_metrics(seq_acc, index_name, verbose=False):
+    """
+    Parse the evaluation indicators of multiple MOTAccumulator
+    """
+    mh = mm.metrics.create()
+    summary = MCMOTEvaluator.get_summary(seq_acc, index_name, METRICS_LIST)
+    summary.loc['OVERALL', 'motp'] = (summary['motp'] * summary['num_detections']).sum() / \
+                                     summary.loc['OVERALL', 'num_detections']
+    if verbose:
+        strsummary = mm.io.render_summary(
+            summary, formatters=mh.formatters, namemap=NAME_MAP)
+        print(strsummary)
+
+    return summary
+
+
+def seqs_overall_metrics(summary_df, verbose=False):
+    """
+    Calculate overall metrics for multiple sequences
+    """
+    add_col = [
+        'num_frames', 'num_matches', 'num_switches', 'num_transfer',
+        'num_ascend', 'num_migrate', 'num_false_positives', 'num_misses',
+        'num_detections', 'num_objects', 'num_predictions',
+        'num_unique_objects', 'mostly_tracked', 'partially_tracked',
+        'mostly_lost', 'num_fragmentations', 'idfp', 'idfn', 'idtp'
+    ]
+    calc_col = ['motp', 'mota', 'precision', 'recall', 'idp', 'idr', 'idf1']
+    calc_df = summary_df.copy()
+
+    overall_dic = {}
+    for col in add_col:
+        overall_dic[col] = calc_df[col].sum()
+
+    for col in calc_col:
+        overall_dic[col] = getattr(MCMOTMetricOverall, col + '_overall')(
+            calc_df, overall_dic)
+
+    overall_df = pd.DataFrame(overall_dic, index=['overall_calc'])
+    calc_df = pd.concat([calc_df, overall_df])
+
+    if verbose:
+        mh = mm.metrics.create()
+        str_calc_df = mm.io.render_summary(
+            calc_df, formatters=mh.formatters, namemap=NAME_MAP)
+        print(str_calc_df)
+
+    return calc_df
+
+
+class MCMOTMetricOverall(object):
+    def motp_overall(summary_df, overall_dic):
+        motp = quiet_divide((summary_df['motp'] *
+                             summary_df['num_detections']).sum(),
+                            overall_dic['num_detections'])
+        return motp
+
+    def mota_overall(summary_df, overall_dic):
+        del summary_df
+        mota = 1. - quiet_divide(
+            (overall_dic['num_misses'] + overall_dic['num_switches'] +
+             overall_dic['num_false_positives']), overall_dic['num_objects'])
+        return mota
+
+    def precision_overall(summary_df, overall_dic):
+        del summary_df
+        precision = quiet_divide(overall_dic['num_detections'], (
+            overall_dic['num_false_positives'] + overall_dic['num_detections']
+        ))
+        return precision
+
+    def recall_overall(summary_df, overall_dic):
+        del summary_df
+        recall = quiet_divide(overall_dic['num_detections'],
+                              overall_dic['num_objects'])
+        return recall
+
+    def idp_overall(summary_df, overall_dic):
+        del summary_df
+        idp = quiet_divide(overall_dic['idtp'],
+                           (overall_dic['idtp'] + overall_dic['idfp']))
+        return idp
+
+    def idr_overall(summary_df, overall_dic):
+        del summary_df
+        idr = quiet_divide(overall_dic['idtp'],
+                           (overall_dic['idtp'] + overall_dic['idfn']))
+        return idr
+
+    def idf1_overall(summary_df, overall_dic):
+        del summary_df
+        idf1 = quiet_divide(2. * overall_dic['idtp'], (
+            overall_dic['num_objects'] + overall_dic['num_predictions']))
+        return idf1
+
+
+def read_mcmot_results_union(filename, is_gt, is_ignore):
+    results_dict = dict()
+    if os.path.isfile(filename):
+        all_result = np.loadtxt(filename, delimiter=',')
+        if all_result.shape[0] == 0 or all_result.shape[1] < 7:
+            return results_dict
+        if is_ignore:
+            return results_dict
+        if is_gt:
+            # only for test use
+            all_result = all_result[all_result[:, 7] != 0]
+            all_result[:, 7] = all_result[:, 7] - 1
+
+        if all_result.shape[0] == 0:
+            return results_dict
+
+        class_unique = np.unique(all_result[:, 7])
+
+        last_max_id = 0
+        result_cls_list = []
+        for cls in class_unique:
+            result_cls_split = all_result[all_result[:, 7] == cls]
+            result_cls_split[:, 1] = result_cls_split[:, 1] + last_max_id
+            # make sure track id different between every category
+            last_max_id = max(np.unique(result_cls_split[:, 1])) + 1
+            result_cls_list.append(result_cls_split)
+
+        results_con = np.concatenate(result_cls_list)
+
+        for line in range(len(results_con)):
+            linelist = results_con[line]
+            fid = int(linelist[0])
+            if fid < 1:
+                continue
+            results_dict.setdefault(fid, list())
+
+            if is_gt:
+                score = 1
+            else:
+                score = float(linelist[6])
+
+            tlwh = tuple(map(float, linelist[2:6]))
+            target_id = int(linelist[1])
+            cls = int(linelist[7])
+
+            results_dict[fid].append((tlwh, target_id, cls, score))
+
+        return results_dict
+
+
+def read_mcmot_results(filename, is_gt, is_ignore):
+    results_dict = dict()
+    if os.path.isfile(filename):
+        with open(filename, 'r') as f:
+            for line in f.readlines():
+                linelist = line.strip().split(',')
+                if len(linelist) < 7:
+                    continue
+                fid = int(linelist[0])
+                if fid < 1:
+                    continue
+                cid = int(linelist[7])
+                if is_gt:
+                    score = 1
+                    # only for test use
+                    cid -= 1
+                else:
+                    score = float(linelist[6])
+
+                cls_result_dict = results_dict.setdefault(cid, dict())
+                cls_result_dict.setdefault(fid, list())
+
+                tlwh = tuple(map(float, linelist[2:6]))
+                target_id = int(linelist[1])
+                cls_result_dict[fid].append((tlwh, target_id, score))
+    return results_dict
+
+
+def read_results(filename,
+                 data_type,
+                 is_gt=False,
+                 is_ignore=False,
+                 multi_class=False,
+                 union=False):
+    if data_type in ['mcmot', 'lab']:
+        if multi_class:
+            if union:
+                # The results are evaluated by union all the categories.
+                # Track IDs between different categories cannot be duplicate.
+                read_fun = read_mcmot_results_union
+            else:
+                # The results are evaluated separately by category.
+                read_fun = read_mcmot_results
+        else:
+            raise ValueError('multi_class: {}, MCMOT should have cls_id.'.
+                             format(multi_class))
+    else:
+        raise ValueError('Unknown data type: {}'.format(data_type))
+
+    return read_fun(filename, is_gt, is_ignore)
+
+
+def unzip_objs(objs):
+    if len(objs) > 0:
+        tlwhs, ids, scores = zip(*objs)
+    else:
+        tlwhs, ids, scores = [], [], []
+    tlwhs = np.asarray(tlwhs, dtype=float).reshape(-1, 4)
+    return tlwhs, ids, scores
+
+
+def unzip_objs_cls(objs):
+    if len(objs) > 0:
+        tlwhs, ids, cls, scores = zip(*objs)
+    else:
+        tlwhs, ids, cls, scores = [], [], [], []
+    tlwhs = np.asarray(tlwhs, dtype=float).reshape(-1, 4)
+    ids = np.array(ids)
+    cls = np.array(cls)
+    scores = np.array(scores)
+    return tlwhs, ids, cls, scores
+
+
+class MCMOTEvaluator(object):
+    def __init__(self, data_root, seq_name, data_type, num_classes):
+        self.data_root = data_root
+        self.seq_name = seq_name
+        self.data_type = data_type
+        self.num_classes = num_classes
+
+        self.load_annotations()
+        self.reset_accumulator()
+
+        self.class_accs = []
+
+    def load_annotations(self):
+        assert self.data_type == 'mcmot'
+        self.gt_filename = os.path.join(self.data_root, '../', '../',
+                                        'sequences',
+                                        '{}.txt'.format(self.seq_name))
+
+    def reset_accumulator(self):
+        import motmetrics as mm
+        mm.lap.default_solver = 'lap'
+        self.acc = mm.MOTAccumulator(auto_id=True)
+
+    def eval_frame_dict(self, trk_objs, gt_objs, rtn_events=False,
+                        union=False):
+        import motmetrics as mm
+        mm.lap.default_solver = 'lap'
+        if union:
+            trk_tlwhs, trk_ids, trk_cls = unzip_objs_cls(trk_objs)[:3]
+            gt_tlwhs, gt_ids, gt_cls = unzip_objs_cls(gt_objs)[:3]
+
+            # get distance matrix
+            iou_distance = mm.distances.iou_matrix(
+                gt_tlwhs, trk_tlwhs, max_iou=0.5)
+
+            # Set the distance between objects of different categories to nan
+            gt_cls_len = len(gt_cls)
+            trk_cls_len = len(trk_cls)
+            # When the number of GT or Trk is 0, iou_distance dimension is (0,0)
+            if gt_cls_len != 0 and trk_cls_len != 0:
+                gt_cls = gt_cls.reshape(gt_cls_len, 1)
+                gt_cls = np.repeat(gt_cls, trk_cls_len, axis=1)
+                trk_cls = trk_cls.reshape(1, trk_cls_len)
+                trk_cls = np.repeat(trk_cls, gt_cls_len, axis=0)
+                iou_distance = np.where(gt_cls == trk_cls, iou_distance,
+                                        np.nan)
+
+        else:
+            trk_tlwhs, trk_ids = unzip_objs(trk_objs)[:2]
+            gt_tlwhs, gt_ids = unzip_objs(gt_objs)[:2]
+
+            # get distance matrix
+            iou_distance = mm.distances.iou_matrix(
+                gt_tlwhs, trk_tlwhs, max_iou=0.5)
+
+        self.acc.update(gt_ids, trk_ids, iou_distance)
+
+        if rtn_events and iou_distance.size > 0 and hasattr(self.acc,
+                                                            'mot_events'):
+            events = self.acc.mot_events  # only supported by https://github.com/longcw/py-motmetrics
+        else:
+            events = None
+        return events
+
+    def eval_file(self, result_filename):
+        # evaluation of each category
+        gt_frame_dict = read_results(
+            self.gt_filename,
+            self.data_type,
+            is_gt=True,
+            multi_class=True,
+            union=False)
+        result_frame_dict = read_results(
+            result_filename,
+            self.data_type,
+            is_gt=False,
+            multi_class=True,
+            union=False)
+
+        for cid in range(self.num_classes):
+            self.reset_accumulator()
+            cls_result_frame_dict = result_frame_dict.setdefault(cid, dict())
+            cls_gt_frame_dict = gt_frame_dict.setdefault(cid, dict())
+
+            # only labeled frames will be evaluated
+            frames = sorted(list(set(cls_gt_frame_dict.keys())))
+
+            for frame_id in frames:
+                trk_objs = cls_result_frame_dict.get(frame_id, [])
+                gt_objs = cls_gt_frame_dict.get(frame_id, [])
+                self.eval_frame_dict(trk_objs, gt_objs, rtn_events=False)
+
+            self.class_accs.append(self.acc)
+
+        return self.class_accs
+
+    @staticmethod
+    def get_summary(accs,
+                    names,
+                    metrics=('mota', 'num_switches', 'idp', 'idr', 'idf1',
+                             'precision', 'recall')):
+        import motmetrics as mm
+        mm.lap.default_solver = 'lap'
+
+        names = copy.deepcopy(names)
+        if metrics is None:
+            metrics = mm.metrics.motchallenge_metrics
+        metrics = copy.deepcopy(metrics)
+
+        mh = mm.metrics.create()
+        summary = mh.compute_many(
+            accs, metrics=metrics, names=names, generate_overall=True)
+
+        return summary
+
+    @staticmethod
+    def save_summary(summary, filename):
+        import pandas as pd
+        writer = pd.ExcelWriter(filename)
+        summary.to_excel(writer)
+        writer.save()
+
+
+class MCMOTMetric(Metric):
+    def __init__(self, num_classes, save_summary=False):
+        self.num_classes = num_classes
+        self.save_summary = save_summary
+        self.MCMOTEvaluator = MCMOTEvaluator
+        self.result_root = None
+        self.reset()
+
+        self.seqs_overall = defaultdict(list)
+
+    def reset(self):
+        self.accs = []
+        self.seqs = []
+
+    def update(self, data_root, seq, data_type, result_root, result_filename):
+        evaluator = self.MCMOTEvaluator(data_root, seq, data_type,
+                                        self.num_classes)
+        seq_acc = evaluator.eval_file(result_filename)
+        self.accs.append(seq_acc)
+        self.seqs.append(seq)
+        self.result_root = result_root
+
+        cls_index_name = [
+            '{}_{}'.format(seq, i) for i in range(self.num_classes)
+        ]
+        summary = parse_accs_metrics(seq_acc, cls_index_name)
+        summary.rename(
+            index={'OVERALL': '{}_OVERALL'.format(seq)}, inplace=True)
+        for row in range(len(summary)):
+            self.seqs_overall[row].append(summary.iloc[row:row + 1])
+
+    def accumulate(self):
+        self.cls_summary_list = []
+        for row in range(self.num_classes):
+            seqs_cls_df = pd.concat(self.seqs_overall[row])
+            seqs_cls_summary = seqs_overall_metrics(seqs_cls_df)
+            cls_summary_overall = seqs_cls_summary.iloc[-1:].copy()
+            cls_summary_overall.rename(
+                index={'overall_calc': 'overall_calc_{}'.format(row)},
+                inplace=True)
+            self.cls_summary_list.append(cls_summary_overall)
+
+    def log(self):
+        seqs_summary = seqs_overall_metrics(
+            pd.concat(self.seqs_overall[self.num_classes]), verbose=True)
+        class_summary = seqs_overall_metrics(
+            pd.concat(self.cls_summary_list), verbose=True)
+
+    def get_results(self):
+        return 1

+ 434 - 0
paddlers/models/ppdet/metrics/metrics.py

@@ -0,0 +1,434 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import os
+import sys
+import json
+import paddle
+import numpy as np
+import typing
+
+from .map_utils import prune_zero_padding, DetectionMAP
+from .coco_utils import get_infer_results, cocoapi_eval
+from .widerface_utils import face_eval_run
+from paddlers.models.ppdet.data.source.category import get_categories
+
+from paddlers.models.ppdet.utils.logger import setup_logger
+logger = setup_logger(__name__)
+
+__all__ = [
+    'Metric', 'COCOMetric', 'VOCMetric', 'WiderFaceMetric',
+    'get_infer_results', 'RBoxMetric', 'SNIPERCOCOMetric'
+]
+
+COCO_SIGMAS = np.array([
+    .26, .25, .25, .35, .35, .79, .79, .72, .72, .62, .62, 1.07, 1.07, .87,
+    .87, .89, .89
+]) / 10.0
+CROWD_SIGMAS = np.array(
+    [.79, .79, .72, .72, .62, .62, 1.07, 1.07, .87, .87, .89, .89, .79,
+     .79]) / 10.0
+
+
+class Metric(paddle.metric.Metric):
+    def name(self):
+        return self.__class__.__name__
+
+    def reset(self):
+        pass
+
+    def accumulate(self):
+        pass
+
+    # paddle.metric.Metric defined :metch:`update`, :meth:`accumulate`
+    # :metch:`reset`, in ppdet, we also need following 2 methods:
+
+    # abstract method for logging metric results
+    def log(self):
+        pass
+
+    # abstract method for getting metric results
+    def get_results(self):
+        pass
+
+
+class COCOMetric(Metric):
+    """
+    Metric that collects inference results and evaluates them with
+    ``cocoapi_eval``.
+
+    Args:
+        anno_file (str): Path of the annotation file; must be an existing
+            file.
+        **kwargs: Optional settings read with ``kwargs.get``:
+            ``clsid2catid`` (default: taken from ``get_categories``),
+            ``classwise`` (False), ``output_eval`` (None), ``bias`` (0),
+            ``save_prediction_only`` (False) and ``IouType`` ('bbox').
+    """
+
+    def __init__(self, anno_file, **kwargs):
+        assert os.path.isfile(anno_file), \
+                "anno_file {} not a file".format(anno_file)
+        self.anno_file = anno_file
+        self.clsid2catid = kwargs.get('clsid2catid', None)
+        if self.clsid2catid is None:
+            self.clsid2catid, _ = get_categories('COCO', anno_file)
+        self.classwise = kwargs.get('classwise', False)
+        self.output_eval = kwargs.get('output_eval', None)
+        # TODO: bias should be unified
+        self.bias = kwargs.get('bias', 0)
+        self.save_prediction_only = kwargs.get('save_prediction_only', False)
+        self.iou_type = kwargs.get('IouType', 'bbox')
+        self.reset()
+
+    def reset(self):
+        """Empty the collected results and the evaluation results."""
+        # only bbox and mask evaluation support currently
+        # NOTE(review): the comment above looks outdated; 'segm' and
+        # 'keypoint' results are also collected and evaluated below.
+        self.results = {'bbox': [], 'mask': [], 'segm': [], 'keypoint': []}
+        self.eval_results = {}
+
+    def update(self, inputs, outputs):
+        """
+        Convert one batch of outputs with ``get_infer_results`` and append
+        each result type that is present to ``self.results``.
+        """
+        outs = {}
+        # outputs Tensor -> numpy.ndarray
+        for k, v in outputs.items():
+            outs[k] = v.numpy() if isinstance(v, paddle.Tensor) else v
+
+        # multi-scale inputs: all inputs have same im_id
+        if isinstance(inputs, typing.Sequence):
+            im_id = inputs[0]['im_id']
+        else:
+            im_id = inputs['im_id']
+        outs['im_id'] = im_id.numpy() if isinstance(im_id,
+                                                    paddle.Tensor) else im_id
+
+        infer_results = get_infer_results(
+            outs, self.clsid2catid, bias=self.bias)
+        self.results['bbox'] += infer_results[
+            'bbox'] if 'bbox' in infer_results else []
+        self.results['mask'] += infer_results[
+            'mask'] if 'mask' in infer_results else []
+        self.results['segm'] += infer_results[
+            'segm'] if 'segm' in infer_results else []
+        self.results['keypoint'] += infer_results[
+            'keypoint'] if 'keypoint' in infer_results else []
+
+    def accumulate(self):
+        """
+        Dump each non-empty result list to ``<type>.json`` (inside
+        ``self.output_eval`` when it is set) and, unless
+        ``self.save_prediction_only`` is set, evaluate the dumped file with
+        ``cocoapi_eval`` and store the statistics in ``self.eval_results``.
+        """
+        if len(self.results['bbox']) > 0:
+            output = "bbox.json"
+            if self.output_eval:
+                output = os.path.join(self.output_eval, output)
+            with open(output, 'w') as f:
+                json.dump(self.results['bbox'], f)
+                # The message names the bare file name even when the file
+                # was written under output_eval.
+                logger.info('The bbox result is saved to bbox.json.')
+
+            if self.save_prediction_only:
+                logger.info('The bbox result is saved to {} and do not '
+                            'evaluate the mAP.'.format(output))
+            else:
+                bbox_stats = cocoapi_eval(
+                    output,
+                    'bbox',
+                    anno_file=self.anno_file,
+                    classwise=self.classwise)
+                self.eval_results['bbox'] = bbox_stats
+                sys.stdout.flush()
+
+        if len(self.results['mask']) > 0:
+            output = "mask.json"
+            if self.output_eval:
+                output = os.path.join(self.output_eval, output)
+            with open(output, 'w') as f:
+                json.dump(self.results['mask'], f)
+                logger.info('The mask result is saved to mask.json.')
+
+            if self.save_prediction_only:
+                logger.info('The mask result is saved to {} and do not '
+                            'evaluate the mAP.'.format(output))
+            else:
+                # Mask results are evaluated with the 'segm' style.
+                seg_stats = cocoapi_eval(
+                    output,
+                    'segm',
+                    anno_file=self.anno_file,
+                    classwise=self.classwise)
+                self.eval_results['mask'] = seg_stats
+                sys.stdout.flush()
+
+        if len(self.results['segm']) > 0:
+            output = "segm.json"
+            if self.output_eval:
+                output = os.path.join(self.output_eval, output)
+            with open(output, 'w') as f:
+                json.dump(self.results['segm'], f)
+                logger.info('The segm result is saved to segm.json.')
+
+            if self.save_prediction_only:
+                logger.info('The segm result is saved to {} and do not '
+                            'evaluate the mAP.'.format(output))
+            else:
+                seg_stats = cocoapi_eval(
+                    output,
+                    'segm',
+                    anno_file=self.anno_file,
+                    classwise=self.classwise)
+                # Stored under the 'mask' key, so this replaces the
+                # statistics of the mask stanza above if both ran.
+                self.eval_results['mask'] = seg_stats
+                sys.stdout.flush()
+
+        if len(self.results['keypoint']) > 0:
+            output = "keypoint.json"
+            if self.output_eval:
+                output = os.path.join(self.output_eval, output)
+            with open(output, 'w') as f:
+                json.dump(self.results['keypoint'], f)
+                logger.info('The keypoint result is saved to keypoint.json.')
+
+            if self.save_prediction_only:
+                logger.info('The keypoint result is saved to {} and do not '
+                            'evaluate the mAP.'.format(output))
+            else:
+                # Defaults are the COCO keypoint settings; the
+                # 'keypoints_crowd' IoU type switches the style and sigmas
+                # and turns use_area off.
+                style = 'keypoints'
+                use_area = True
+                sigmas = COCO_SIGMAS
+                if self.iou_type == 'keypoints_crowd':
+                    style = 'keypoints_crowd'
+                    use_area = False
+                    sigmas = CROWD_SIGMAS
+                keypoint_stats = cocoapi_eval(
+                    output,
+                    style,
+                    anno_file=self.anno_file,
+                    classwise=self.classwise,
+                    sigmas=sigmas,
+                    use_area=use_area)
+                self.eval_results['keypoint'] = keypoint_stats
+                sys.stdout.flush()
+
+    def log(self):
+        """No-op; this method itself logs nothing."""
+        pass
+
+    def get_results(self):
+        """Return the dict of evaluation statistics filled by accumulate."""
+        return self.eval_results
+
+
+class VOCMetric(Metric):
+    def __init__(self,
+                 label_list,
+                 class_num=20,
+                 overlap_thresh=0.5,
+                 map_type='11point',
+                 is_bbox_normalized=False,
+                 evaluate_difficult=False,
+                 classwise=False):
+        assert os.path.isfile(label_list), \
+                "label_list {} not a file".format(label_list)
+        self.clsid2catid, self.catid2name = get_categories('VOC', label_list)
+
+        self.overlap_thresh = overlap_thresh
+        self.map_type = map_type
+        self.evaluate_difficult = evaluate_difficult
+        self.detection_map = DetectionMAP(
+            class_num=class_num,
+            overlap_thresh=overlap_thresh,
+            map_type=map_type,
+            is_bbox_normalized=is_bbox_normalized,
+            evaluate_difficult=evaluate_difficult,
+            catid2name=self.catid2name,
+            classwise=classwise)
+
+        self.reset()
+
+    def reset(self):
+        self.detection_map.reset()
+
+    def update(self, inputs, outputs):
+        """Feed one batch of predictions and ground truths to the mAP accumulator.
+
+        Args:
+            inputs (dict): Batch data. Reads ``gt_bbox`` and ``gt_class``,
+                ``difficult`` (only when ``self.evaluate_difficult`` is
+                false) and, when present, ``scale_factor``.
+            outputs (dict): Model outputs. Reads ``bbox`` (column 0 is used as
+                the label, column 1 as the score, the remaining columns as
+                the box) and ``bbox_num`` (number of predictions per image).
+        """
+        pred = outputs['bbox'].numpy()
+        pred_boxes = pred[:, 2:]
+        pred_scores = pred[:, 1]
+        pred_labels = pred[:, 0]
+        num_per_image = outputs['bbox_num'].numpy()
+
+        # Skip the batch when the box slice has shape (1, 1)
+        # (presumably a placeholder for "no detections" -- TODO confirm).
+        if pred_boxes is None or pred_boxes.shape == (1, 1):
+            return
+
+        gt_boxes = inputs['gt_bbox']
+        gt_labels = inputs['gt_class']
+        if self.evaluate_difficult:
+            difficults = None
+        else:
+            difficults = inputs['difficult']
+
+        if 'scale_factor' in inputs:
+            scale_factor = inputs['scale_factor'].numpy()
+        else:
+            # No scale information supplied: use a factor of one per image.
+            scale_factor = np.ones((gt_boxes.shape[0], 2)).astype('float32')
+
+        start = 0
+        for img_idx in range(len(gt_boxes)):
+            gt_box = gt_boxes[img_idx].numpy()
+            scale_h, scale_w = scale_factor[img_idx]
+            # Divide the ground-truth box by the per-image scale factor.
+            gt_box = gt_box / np.array([scale_w, scale_h, scale_w, scale_h])
+            gt_label = gt_labels[img_idx].numpy()
+            if difficults is None:
+                difficult = None
+            else:
+                difficult = difficults[img_idx].numpy()
+
+            # Predictions of all images are concatenated; take this image's
+            # slice using the running offset.
+            stop = start + num_per_image[img_idx]
+            img_boxes = pred_boxes[start:stop]
+            img_scores = pred_scores[start:stop]
+            img_labels = pred_labels[start:stop]
+
+            gt_box, gt_label, difficult = prune_zero_padding(gt_box, gt_label,
+                                                             difficult)
+            self.detection_map.update(img_boxes, img_scores, img_labels,
+                                      gt_box, gt_label, difficult)
+            start = stop
+
+    def accumulate(self):
+        """Log a progress message and accumulate ``self.detection_map``."""
+        logger.info("Accumulating evaluatation results...")
+        self.detection_map.accumulate()
+
+    def log(self):
+        """Log the current mAP as a percentage with threshold and map type."""
+        map_percent = 100. * self.detection_map.get_map()
+        message = "mAP({:.2f}, {}) = {:.2f}%".format(
+            self.overlap_thresh, self.map_type, map_percent)
+        logger.info(message)
+
+    def get_results(self):
+        """Return the current mAP wrapped as ``{'bbox': [mAP]}``."""
+        current_map = self.detection_map.get_map()
+        return {'bbox': [current_map]}
+
+
+class WiderFaceMetric(Metric):
+    """Metric wrapper that delegates WIDER FACE evaluation to ``face_eval_run``."""
+
+    def __init__(self, image_dir, anno_file, multi_scale=True):
+        """Store the evaluation settings and load the 'widerface' categories.
+
+        Args:
+            image_dir: Forwarded to ``face_eval_run`` on ``update``.
+            anno_file: Forwarded to ``face_eval_run`` on ``update``.
+            multi_scale (bool): Forwarded to ``face_eval_run`` as
+                ``multi_scale``.
+        """
+        self.image_dir = image_dir
+        self.anno_file = anno_file
+        self.multi_scale = multi_scale
+        categories = get_categories('widerface')
+        self.clsid2catid, self.catid2name = categories
+
+    def update(self, model):
+        """Run the face evaluation for ``model`` with the stored settings."""
+        eval_kwargs = dict(
+            pred_dir='output/pred',
+            eval_mode='widerface',
+            multi_scale=self.multi_scale)
+        face_eval_run(model, self.image_dir, self.anno_file, **eval_kwargs)
+
+
+class RBoxMetric(Metric):
+    """mAP metric driven by a JSON annotation file with COCO-style keys.
+
+    Detections are converted with ``get_infer_results``, collected in
+    ``self.result_bbox`` (dumped to ``bbox.json`` on ``accumulate``) and scored
+    with a ``DetectionMAP`` instance.
+    """
+
+    def __init__(self, anno_file, **kwargs):
+        """Load the annotation file and build the mAP accumulator.
+
+        Args:
+            anno_file (str): Path to a JSON file providing ``categories`` and
+                ``annotations`` entries.
+            **kwargs: Optional settings read here: ``classwise``,
+                ``output_eval``, ``bias``, ``save_prediction_only``,
+                ``IouType``, ``overlap_thresh``, ``map_type`` and
+                ``evaluate_difficult``.
+
+        Raises:
+            AssertionError: If ``anno_file`` is not an existing file.
+        """
+        assert os.path.isfile(anno_file), \
+                "anno_file {} not a file".format(anno_file)
+        assert os.path.exists(anno_file), "anno_file {} not exists".format(
+            anno_file)
+        self.anno_file = anno_file
+        # Use a context manager so the file handle is closed right away.
+        with open(self.anno_file) as anno_fp:
+            self.gt_anno = json.load(anno_fp)
+        cats = self.gt_anno['categories']
+        self.clsid2catid = {i: cat['id'] for i, cat in enumerate(cats)}
+        self.catid2clsid = {cat['id']: i for i, cat in enumerate(cats)}
+        self.catid2name = {cat['id']: cat['name'] for cat in cats}
+        self.classwise = kwargs.get('classwise', False)
+        self.output_eval = kwargs.get('output_eval', None)
+        # TODO: bias should be unified
+        self.bias = kwargs.get('bias', 0)
+        self.save_prediction_only = kwargs.get('save_prediction_only', False)
+        self.iou_type = kwargs.get('IouType', 'bbox')
+        self.overlap_thresh = kwargs.get('overlap_thresh', 0.5)
+        self.map_type = kwargs.get('map_type', '11point')
+        self.evaluate_difficult = kwargs.get('evaluate_difficult', False)
+        class_num = len(self.catid2name)
+        self.detection_map = DetectionMAP(
+            class_num=class_num,
+            overlap_thresh=self.overlap_thresh,
+            map_type=self.map_type,
+            is_bbox_normalized=False,
+            evaluate_difficult=self.evaluate_difficult,
+            catid2name=self.catid2name,
+            classwise=self.classwise)
+
+        self.reset()
+
+    def reset(self):
+        """Drop the collected detections and reset the mAP accumulator."""
+        self.result_bbox = []
+        self.detection_map.reset()
+
+    def update(self, inputs, outputs):
+        """Record the detections of one call and score them against the GT.
+
+        Args:
+            inputs (dict): Batch data; only ``im_id`` is read.
+            outputs (dict): Model outputs; ``paddle.Tensor`` values are
+                converted to numpy arrays before being handed to
+                ``get_infer_results``.
+        """
+        outs = {}
+        # outputs Tensor -> numpy.ndarray
+        for k, v in outputs.items():
+            outs[k] = v.numpy() if isinstance(v, paddle.Tensor) else v
+
+        im_id = inputs['im_id']
+        outs['im_id'] = im_id.numpy() if isinstance(im_id,
+                                                    paddle.Tensor) else im_id
+
+        infer_results = get_infer_results(
+            outs, self.clsid2catid, bias=self.bias)
+        batch_bbox = infer_results['bbox'] if 'bbox' in infer_results else []
+        self.result_bbox += batch_bbox
+
+        # Score only the detections produced by this call. Passing the whole
+        # accumulated history would re-submit earlier detections against the
+        # ground truth of the current image on every update.
+        bbox = [b['bbox'] for b in batch_bbox]
+        score = [b['score'] for b in batch_bbox]
+        label = [self.catid2clsid[b['category_id']] for b in batch_bbox]
+
+        # NOTE(review): the comparison below is made against the whole
+        # ``im_id`` value, which presumably requires one image per call --
+        # confirm against the evaluation data loader.
+        gt_annos = [
+            e for e in self.gt_anno['annotations']
+            if e['image_id'] == outs['im_id']
+        ]
+        gt_box = [e['bbox'] for e in gt_annos]
+        gt_label = [self.catid2clsid[e['category_id']] for e in gt_annos]
+        self.detection_map.update(bbox, score, label, gt_box, gt_label)
+
+    def accumulate(self):
+        """Dump the collected detections to JSON and accumulate the mAP.
+
+        Nothing happens when no detection was collected. When
+        ``save_prediction_only`` is set, the mAP accumulation is skipped.
+        """
+        if len(self.result_bbox) > 0:
+            output = "bbox.json"
+            if self.output_eval:
+                output = os.path.join(self.output_eval, output)
+            with open(output, 'w') as f:
+                json.dump(self.result_bbox, f)
+                logger.info('The bbox result is saved to bbox.json.')
+
+            if self.save_prediction_only:
+                logger.info('The bbox result is saved to {} and do not '
+                            'evaluate the mAP.'.format(output))
+            else:
+                logger.info("Accumulating evaluatation results...")
+                self.detection_map.accumulate()
+
+    def log(self):
+        """Log the current mAP as a percentage."""
+        map_stat = 100. * self.detection_map.get_map()
+        logger.info("mAP({:.2f}, {}) = {:.2f}%".format(
+            self.overlap_thresh, self.map_type, map_stat))
+
+    def get_results(self):
+        """Return the current mAP wrapped as ``{'bbox': [mAP]}``."""
+        return {'bbox': [self.detection_map.get_map()]}
+
+
+class SNIPERCOCOMetric(COCOMetric):
+    """COCO metric that buffers per-chip outputs and merges them on accumulate."""
+
+    def __init__(self, anno_file, **kwargs):
+        """Initialise the base metric and keep the dataset from ``kwargs``.
+
+        Args:
+            anno_file: Forwarded to ``COCOMetric.__init__``.
+            **kwargs: Forwarded to ``COCOMetric.__init__``; must contain
+                ``dataset``.
+        """
+        super(SNIPERCOCOMetric, self).__init__(anno_file, **kwargs)
+        self.dataset = kwargs["dataset"]
+        self.chip_results = []
+
+    def reset(self):
+        """Clear the result containers and the buffered chip outputs."""
+        # only bbox and mask evaluation support currently
+        self.results = {'bbox': [], 'mask': [], 'segm': [], 'keypoint': []}
+        self.eval_results = {}
+        self.chip_results = []
+
+    def update(self, inputs, outputs):
+        """Buffer the outputs of one call together with its ``im_id``."""
+        # outputs Tensor -> numpy.ndarray
+        outs = {
+            key: (value.numpy()
+                  if isinstance(value, paddle.Tensor) else value)
+            for key, value in outputs.items()
+        }
+
+        im_id = inputs['im_id']
+        if isinstance(im_id, paddle.Tensor):
+            im_id = im_id.numpy()
+        outs['im_id'] = im_id
+
+        self.chip_results.append(outs)
+
+    def accumulate(self):
+        """Aggregate the buffered chip outputs, then run the base accumulate."""
+        cropper = self.dataset.anno_cropper
+        merged = cropper.aggregate_chips_detections(self.chip_results)
+        for outs in merged:
+            infer_results = get_infer_results(
+                outs, self.clsid2catid, bias=self.bias)
+            new_bboxes = infer_results[
+                'bbox'] if 'bbox' in infer_results else []
+            self.results['bbox'] += new_bboxes
+
+        super(SNIPERCOCOMetric, self).accumulate()

+ 1236 - 0
paddlers/models/ppdet/metrics/mot_metrics.py

@@ -0,0 +1,1236 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import os
+import copy
+import sys
+import math
+from collections import defaultdict
+import numpy as np
+import paddle
+import paddle.nn.functional as F
+from paddlers.models.ppdet.modeling.bbox_utils import bbox_iou_np_expand
+from .map_utils import ap_per_class
+from .metrics import Metric
+from .munkres import Munkres
+
+from paddlers.models.ppdet.utils.logger import setup_logger
+logger = setup_logger(__name__)
+
+__all__ = ['MOTEvaluator', 'MOTMetric', 'JDEDetMetric', 'KITTIMOTMetric']
+
+
+def read_mot_results(filename, is_gt=False, is_ignore=False):
+    """Parse a comma-separated MOT result / ground-truth file.
+
+    Each kept line contributes ``(tlwh, target_id, score)`` to the list stored
+    under its frame id, where ``tlwh`` is the tuple of floats from columns
+    2-5, ``target_id`` is column 1 and ``score`` depends on the mode.
+
+    Args:
+        filename (str): Path of the file to read. A path that is not an
+            existing file yields an empty dict.
+        is_gt (bool): Ground-truth mode. Keeps only lines whose column 6 is
+            non-zero and whose column 7 is in ``valid_labels``; score is 1.
+        is_ignore (bool): Ignore-region mode (used when ``is_gt`` is false).
+            Only files whose name contains 'MOT15-', 'MOT16-', 'MOT17-' or
+            'MOT20-' contribute; a line is kept when column 7 is in
+            ``ignore_labels`` or column 8 is negative; score is 1.
+
+    Returns:
+        dict: frame id -> list of ``(tlwh, target_id, score)``. A frame id is
+        present as soon as one of its lines has at least 7 columns and a
+        frame id >= 1, even if all of its lines are filtered out afterwards.
+    """
+    valid_labels = {1}
+    ignore_labels = {2, 7, 8, 12}  # only in motchallenge datasets like 'MOT16'
+    results_dict = dict()
+    if not os.path.isfile(filename):
+        return results_dict
+
+    # The file-name test does not depend on the line; evaluate it once.
+    is_motchallenge = is_ignore and any(
+        tag in filename for tag in ('MOT16-', 'MOT17-', 'MOT15-', 'MOT20-'))
+
+    with open(filename, 'r') as f:
+        # Iterate lazily instead of materialising the file with readlines().
+        for line in f:
+            linelist = line.split(',')
+            if len(linelist) < 7:
+                continue
+            fid = int(linelist[0])
+            if fid < 1:
+                continue
+            results_dict.setdefault(fid, [])
+
+            if is_gt:
+                label = int(float(linelist[7]))
+                mark = int(float(linelist[6]))
+                if mark == 0 or label not in valid_labels:
+                    continue
+                score = 1
+            elif is_ignore:
+                if not is_motchallenge:
+                    continue
+                label = int(float(linelist[7]))
+                vis_ratio = float(linelist[8])
+                if label not in ignore_labels and vis_ratio >= 0:
+                    continue
+                score = 1
+            else:
+                score = float(linelist[6])
+
+            tlwh = tuple(map(float, linelist[2:6]))
+            target_id = int(linelist[1])
+
+            results_dict[fid].append((tlwh, target_id, score))
+    return results_dict
+
+
+"""
+MOT dataset label list, see in https://motchallenge.net
+labels={'ped', ...			    % 1
+        'person_on_vhcl', ...	% 2
+        'car', ...				% 3
+        'bicycle', ...			% 4
+        'mbike', ...			% 5
+        'non_mot_vhcl', ...		% 6
+        'static_person', ...	% 7
+        'distractor', ...		% 8
+        'occluder', ...			% 9
+        'occluder_on_grnd', ...	% 10
+        'occluder_full', ...	% 11
+        'reflection', ...		% 12
+        'crowd' ...			    % 13
+};
+"""
+
+
+def unzip_objs(objs):
+    """Split a list of ``(tlwh, id, score)`` triples into three sequences.
+
+    Returns:
+        tuple: ``(tlwhs, ids, scores)`` where ``tlwhs`` is a float array
+        reshaped to ``(-1, 4)``; ``ids`` and ``scores`` are tuples, or empty
+        lists when ``objs`` is empty.
+    """
+    if len(objs) == 0:
+        boxes, ids, scores = [], [], []
+    else:
+        boxes, ids, scores = zip(*objs)
+    boxes = np.asarray(boxes, dtype=float).reshape(-1, 4)
+    return boxes, ids, scores
+
+
+class MOTEvaluator(object):
+    """Evaluate tracking results of one sequence with ``motmetrics``.
+
+    Ground truth is read from ``<data_root>/<seq_name>/gt/gt.txt``.
+    """
+
+    def __init__(self, data_root, seq_name, data_type):
+        """Store the sequence location, load its annotations and reset state.
+
+        Args:
+            data_root (str): Directory containing the sequence folders.
+            seq_name (str): Name of the sequence folder.
+            data_type (str): Must be ``'mot'``.
+        """
+        self.data_root = data_root
+        self.seq_name = seq_name
+        self.data_type = data_type
+
+        self.load_annotations()
+        self.reset_accumulator()
+
+    def load_annotations(self):
+        """Read the ground-truth boxes and the ignore regions of the sequence."""
+        assert self.data_type == 'mot'
+        gt_filename = os.path.join(self.data_root, self.seq_name, 'gt',
+                                   'gt.txt')
+        self.gt_frame_dict = read_mot_results(gt_filename, is_gt=True)
+        self.gt_ignore_frame_dict = read_mot_results(
+            gt_filename, is_ignore=True)
+
+    def reset_accumulator(self):
+        """Create a fresh ``motmetrics`` accumulator in ``self.acc``."""
+        import motmetrics as mm
+        mm.lap.default_solver = 'lap'
+        self.acc = mm.MOTAccumulator(auto_id=True)
+
+    def eval_frame(self, frame_id, trk_tlwhs, trk_ids, rtn_events=False):
+        """Update the accumulator with the tracker output of one frame.
+
+        Args:
+            frame_id (int): Frame id used to look up the ground truth.
+            trk_tlwhs: Tracker boxes of the frame (copied before use).
+            trk_ids: Tracker ids of the frame (copied before use).
+            rtn_events (bool): When true, return ``self.acc.last_mot_events``
+                if the accumulator provides it and the distance matrix is
+                non-empty.
+
+        Returns:
+            The events described above, otherwise ``None``.
+        """
+        import motmetrics as mm
+        mm.lap.default_solver = 'lap'
+        # results
+        trk_tlwhs = np.copy(trk_tlwhs)
+        trk_ids = np.copy(trk_ids)
+
+        # gts
+        gt_objs = self.gt_frame_dict.get(frame_id, [])
+        gt_tlwhs, gt_ids = unzip_objs(gt_objs)[:2]
+
+        # ignore boxes
+        ignore_objs = self.gt_ignore_frame_dict.get(frame_id, [])
+        ignore_tlwhs = unzip_objs(ignore_objs)[0]
+
+        # remove ignored results
+        keep = np.ones(len(trk_tlwhs), dtype=bool)
+        iou_distance = mm.distances.iou_matrix(
+            ignore_tlwhs, trk_tlwhs, max_iou=0.5)
+        if len(iou_distance) > 0:
+            match_is, match_js = mm.lap.linear_sum_assignment(iou_distance)
+            # Both index vectors are converted to int arrays once here.
+            match_is, match_js = map(lambda a: np.asarray(a, dtype=int), [match_is, match_js])
+            match_ious = iou_distance[match_is, match_js]
+
+            # Drop tracker boxes whose assigned distance is not NaN.
+            match_js = match_js[np.logical_not(np.isnan(match_ious))]
+            keep[match_js] = False
+            trk_tlwhs = trk_tlwhs[keep]
+            trk_ids = trk_ids[keep]
+
+        # get distance matrix
+        iou_distance = mm.distances.iou_matrix(
+            gt_tlwhs, trk_tlwhs, max_iou=0.5)
+
+        # acc
+        self.acc.update(gt_ids, trk_ids, iou_distance)
+
+        if rtn_events and iou_distance.size > 0 and hasattr(self.acc,
+                                                            'last_mot_events'):
+            events = self.acc.last_mot_events  # only supported by https://github.com/longcw/py-motmetrics
+        else:
+            events = None
+        return events
+
+    def eval_file(self, filename):
+        """Evaluate every frame found in a result file.
+
+        The accumulator is reset first.
+
+        Args:
+            filename (str): Tracking result file read by ``read_mot_results``.
+
+        Returns:
+            The updated accumulator ``self.acc``.
+        """
+        self.reset_accumulator()
+
+        result_frame_dict = read_mot_results(filename, is_gt=False)
+        frames = sorted(list(set(result_frame_dict.keys())))
+        for frame_id in frames:
+            trk_objs = result_frame_dict.get(frame_id, [])
+            trk_tlwhs, trk_ids = unzip_objs(trk_objs)[:2]
+            self.eval_frame(frame_id, trk_tlwhs, trk_ids, rtn_events=False)
+
+        return self.acc
+
+    @staticmethod
+    def get_summary(accs,
+                    names,
+                    metrics=('mota', 'num_switches', 'idp', 'idr', 'idf1',
+                             'precision', 'recall')):
+        """Compute a ``motmetrics`` summary over several accumulators.
+
+        Args:
+            accs: Accumulators to summarise.
+            names: One name per accumulator (deep-copied before use).
+            metrics: Metric names; ``None`` selects
+                ``mm.metrics.motchallenge_metrics``.
+
+        Returns:
+            The result of ``compute_many`` with ``generate_overall=True``.
+        """
+        import motmetrics as mm
+        mm.lap.default_solver = 'lap'
+        names = copy.deepcopy(names)
+        if metrics is None:
+            metrics = mm.metrics.motchallenge_metrics
+        metrics = copy.deepcopy(metrics)
+
+        mh = mm.metrics.create()
+        summary = mh.compute_many(
+            accs, metrics=metrics, names=names, generate_overall=True)
+        return summary
+
+    @staticmethod
+    def save_summary(summary, filename):
+        """Write ``summary`` to the Excel file ``filename``."""
+        import pandas as pd
+        # ``ExcelWriter.save()`` was removed in pandas 2.0; the context
+        # manager writes and closes the workbook, also when to_excel raises.
+        with pd.ExcelWriter(filename) as writer:
+            summary.to_excel(writer)
+
+
+class MOTMetric(Metric):
+    """Collect per-sequence MOT accumulators and render a summary table."""
+
+    def __init__(self, save_summary=False):
+        """
+        Args:
+            save_summary (bool): When true, ``accumulate`` also writes
+                ``summary.xlsx`` into the last ``result_root`` given to
+                ``update``.
+        """
+        self.save_summary = save_summary
+        self.MOTEvaluator = MOTEvaluator
+        self.result_root = None
+        self.reset()
+
+    def reset(self):
+        """Forget all accumulators and sequence names collected so far."""
+        self.accs = []
+        self.seqs = []
+
+    def update(self, data_root, seq, data_type, result_root, result_filename):
+        """Evaluate one sequence's result file and record its accumulator."""
+        seq_evaluator = self.MOTEvaluator(data_root, seq, data_type)
+        seq_acc = seq_evaluator.eval_file(result_filename)
+        self.accs.append(seq_acc)
+        self.seqs.append(seq)
+        self.result_root = result_root
+
+    def accumulate(self):
+        """Build the summary, render it to text and optionally save it."""
+        import motmetrics as mm
+        import openpyxl
+        metrics = mm.metrics.motchallenge_metrics
+        mh = mm.metrics.create()
+        summary = self.MOTEvaluator.get_summary(self.accs, self.seqs, metrics)
+        self.strsummary = mm.io.render_summary(
+            summary,
+            formatters=mh.formatters,
+            namemap=mm.io.motchallenge_metric_names)
+        if not self.save_summary:
+            return
+        summary_path = os.path.join(self.result_root, 'summary.xlsx')
+        self.MOTEvaluator.save_summary(summary, summary_path)
+
+    def log(self):
+        """Print the rendered summary."""
+        print(self.strsummary)
+
+    def get_results(self):
+        """Return the rendered summary string."""
+        return self.strsummary
+
+
+class JDEDetMetric(Metric):
+    """Running detection AP for class 0, averaged over ``update`` calls."""
+
+    # Note this detection AP metric is different from COCOMetric or VOCMetric,
+    # and the bboxes coordinates are not scaled to the original image
+    def __init__(self, overlap_thresh=0.5):
+        """
+        Args:
+            overlap_thresh (float): IoU a prediction must exceed to count as
+                correct.
+        """
+        self.overlap_thresh = overlap_thresh
+        self.reset()
+
+    def reset(self):
+        """Zero the AP sum and the AP count."""
+        self.AP_accum = np.zeros(1)
+        self.AP_accum_count = np.zeros(1)
+
+    def update(self, inputs, outputs):
+        """Add the AP of one call to the running sums.
+
+        Args:
+            inputs (dict): Reads ``gt_bbox`` and ``gt_class``; only the first
+                entry along the leading axis is used.
+            outputs (dict): Reads ``bbox`` (column 1 is the score, columns 2+
+                the box) and ``bbox_num``.
+        """
+        pred = outputs['bbox']
+        bboxes = pred[:, 2:].numpy()
+        scores = pred[:, 1].numpy()
+        # The next two values are extracted but not used further below.
+        labels = pred[:, 0].numpy()
+        bbox_lengths = outputs['bbox_num'].numpy()
+        # A single all-zero box is skipped (presumably a "no detection"
+        # placeholder -- TODO confirm).
+        if bboxes.shape[0] == 1 and bboxes.sum() == 0.0:
+            return
+
+        gt_boxes = inputs['gt_bbox'].numpy()[0]
+        gt_labels = inputs['gt_class'].numpy()[0]
+        if gt_labels.shape[0] == 0:
+            return
+
+        # Every prediction is treated as class 0.
+        obj_pred = 0
+        correct, detected = [], []
+        for row in bboxes:
+            # Compute iou with target boxes
+            iou = bbox_iou_np_expand(
+                row.reshape(1, 4), gt_boxes, x1y1x2y2=True)[0]
+            # Extract index of largest overlap
+            best_i = np.argmax(iou)
+            # Correct only if the overlap exceeds the threshold, the class
+            # matches and this ground-truth box was not matched before.
+            is_hit = (iou[best_i] > self.overlap_thresh and
+                      obj_pred == gt_labels[best_i] and
+                      best_i not in detected)
+            if is_hit:
+                correct.append(1)
+                detected.append(best_i)
+            else:
+                correct.append(0)
+
+        # Compute Average Precision (AP) per class
+        target_cls = list(gt_labels.T[0])
+        AP, AP_class, R, P = ap_per_class(
+            tp=correct,
+            conf=scores,
+            pred_cls=np.zeros_like(scores),
+            target_cls=target_cls)
+        self.AP_accum_count += np.bincount(AP_class, minlength=1)
+        self.AP_accum += np.bincount(AP_class, minlength=1, weights=AP)
+
+    def accumulate(self):
+        """Compute the mean AP from the running sums into ``self.map_stat``."""
+        logger.info("Accumulating evaluatation results...")
+        ap_sum = self.AP_accum[0]
+        ap_count = self.AP_accum_count[0]
+        # The tiny epsilon avoids a division by zero when nothing was added.
+        self.map_stat = ap_sum / (ap_count + 1E-16)
+
+    def log(self):
+        """Log the accumulated mAP as a percentage."""
+        map_percent = 100. * self.map_stat
+        logger.info("mAP({:.2f}) = {:.2f}%".format(self.overlap_thresh,
+                                                   map_percent))
+
+    def get_results(self):
+        """Return the accumulated mAP."""
+        return self.map_stat
+
+
+"""
+The following code is borrowed from https://github.com/xingyizhou/CenterTrack/blob/master/src/tools/eval_kitti_track/evaluate_tracking.py
+"""
+
+
+class tData:
+    """
+        Plain record holding the fields of one KITTI-format detection.
+    """
+
+    def __init__(self,
+                 frame=-1,
+                 obj_type="unset",
+                 truncation=-1,
+                 occlusion=-1,
+                 obs_angle=-10,
+                 x1=-1,
+                 y1=-1,
+                 x2=-1,
+                 y2=-1,
+                 w=-1,
+                 h=-1,
+                 l=-1,
+                 X=-1000,
+                 Y=-1000,
+                 Z=-1000,
+                 yaw=-10,
+                 score=-1000,
+                 track_id=-1):
+        """
+            Store every argument as an attribute of the same name and add the
+            bookkeeping attributes ``ignored``, ``valid`` and ``tracker``.
+        """
+        # NOTE: the assignment order below defines the line order of __str__.
+        self.frame, self.track_id, self.obj_type = frame, track_id, obj_type
+        self.truncation, self.occlusion = truncation, occlusion
+        self.obs_angle = obs_angle
+        self.x1, self.y1, self.x2, self.y2 = x1, y1, x2, y2
+        self.w, self.h, self.l = w, h, l
+        self.X, self.Y, self.Z = X, Y, Z
+        self.yaw = yaw
+        self.score = score
+        self.ignored = False
+        self.valid = False
+        self.tracker = -1
+
+    def __str__(self):
+        """Return one ``name: value`` line per attribute."""
+        rendered = []
+        for name, value in vars(self).items():
+            rendered.append("%s: %s" % (name, value))
+        return '\n'.join(rendered)
+
+
+class KITTIEvaluation(object):
+    """ KITTI tracking statistics (CLEAR MOT, id-switches, fragments, ML/PT/MT, precision/recall)
+             MOTA	- Multi-object tracking accuracy in [0,100]
+             MOTP	- Multi-object tracking precision in [0,100] (3D) / [td,100] (2D)
+             MOTAL	- Multi-object tracking accuracy in [0,100] with log10(id-switches)
+
+             id-switches - number of id switches
+             fragments   - number of fragmentations
+
+             MT, PT, ML	- number of mostly tracked, partially tracked and mostly lost trajectories
+
+             recall	        - recall = percentage of detected targets
+             precision	    - precision = percentage of correctly detected targets
+             FAR		    - number of false alarms per frame
+             falsepositives - number of false positives (FP)
+             missed         - number of missed targets (FN)
+    """
+    def __init__(self, result_path, gt_path, min_overlap=0.5, max_truncation = 0,\
+                min_height = 25, max_occlusion = 2, cls="car",\
+                n_frames=None, seqs=None, n_sequences=0):
+        """
+            Store the evaluation settings and zero all statistics.
+
+            Args:
+                result_path: directory holding the tracker result files.
+                gt_path: ground truth is read from ``<gt_path>/../labels``.
+                min_overlap: minimum bounding box overlap for 3rd party metrics.
+                max_truncation: maximum truncation of an object for evaluation.
+                min_height: minimum height of an object for evaluation.
+                max_occlusion: maximum occlusion of an object for evaluation.
+                cls: class to evaluate, i.e. pedestrian or car.
+                n_frames: number of frames per sequence; ``None`` means an
+                    empty list.
+                seqs: sequence names; ``None`` means an empty list.
+                n_sequences: number of sequences.
+        """
+        # get number of sequences and
+        # get number of frames per sequence from test mapping
+        # (created while extracting the benchmark)
+        self.gt_path = os.path.join(gt_path, "../labels")
+        # Use a fresh list per instance: a mutable default in the signature
+        # would be shared by every instance created without these arguments.
+        self.n_frames = [] if n_frames is None else n_frames
+        self.sequence_name = [] if seqs is None else seqs
+        self.n_sequences = n_sequences
+
+        self.cls = cls  # class to evaluate, i.e. pedestrian or car
+
+        self.result_path = result_path
+
+        # statistics and numbers for evaluation
+        self.n_gt = 0  # number of ground truth detections minus ignored false negatives and true positives
+        self.n_igt = 0  # number of ignored ground truth detections
+        self.n_gts = [
+        ]  # number of ground truth detections minus ignored false negatives and true positives PER SEQUENCE
+        self.n_igts = [
+        ]  # number of ground ignored truth detections PER SEQUENCE
+        self.n_gt_trajectories = 0
+        self.n_gt_seq = []
+        self.n_tr = 0  # number of tracker detections minus ignored tracker detections
+        self.n_trs = [
+        ]  # number of tracker detections minus ignored tracker detections PER SEQUENCE
+        self.n_itr = 0  # number of ignored tracker detections
+        self.n_itrs = []  # number of ignored tracker detections PER SEQUENCE
+        self.n_igttr = 0  # number of ignored ground truth detections where the corresponding associated tracker detection is also ignored
+        self.n_tr_trajectories = 0
+        self.n_tr_seq = []
+        self.MOTA = 0
+        self.MOTP = 0
+        self.MOTAL = 0
+        self.MODA = 0
+        self.MODP = 0
+        self.MODP_t = []
+        self.recall = 0
+        self.precision = 0
+        self.F1 = 0
+        self.FAR = 0
+        self.total_cost = 0
+        self.itp = 0  # number of ignored true positives
+        self.itps = []  # number of ignored true positives PER SEQUENCE
+        self.tp = 0  # number of true positives including ignored true positives!
+        self.tps = [
+        ]  # number of true positives including ignored true positives PER SEQUENCE
+        self.fn = 0  # number of false negatives WITHOUT ignored false negatives
+        self.fns = [
+        ]  # number of false negatives WITHOUT ignored false negatives PER SEQUENCE
+        self.ifn = 0  # number of ignored false negatives
+        self.ifns = []  # number of ignored false negatives PER SEQUENCE
+        self.fp = 0  # number of false positives
+        # a bit tricky, the number of ignored false negatives and ignored true positives
+        # is subtracted, but if both tracker detection and ground truth detection
+        # are ignored this number is added again to avoid double counting
+        self.fps = []  # above PER SEQUENCE
+        self.mme = 0
+        self.fragments = 0
+        self.id_switches = 0
+        self.MT = 0
+        self.PT = 0
+        self.ML = 0
+
+        self.min_overlap = min_overlap  # minimum bounding box overlap for 3rd party metrics
+        self.max_truncation = max_truncation  # maximum truncation of an object for evaluation
+        self.max_occlusion = max_occlusion  # maximum occlusion of an object for evaluation
+        self.min_height = min_height  # minimum height of an object for evaluation
+        self.n_sample_points = 500
+
+        # this should be enough to hold all groundtruth trajectories
+        # is expanded if necessary and reduced in any case
+        self.gt_trajectories = [[] for x in range(self.n_sequences)]
+        self.ign_trajectories = [[] for x in range(self.n_sequences)]
+
+    def loadGroundtruth(self):
+        """
+            Load the ground truth from ``self.gt_path``.
+            Returns False when an IOError occurs, True otherwise.
+        """
+        try:
+            self._loadData(
+                self.gt_path, cls=self.cls, loading_groundtruth=True)
+        except IOError:
+            return False
+        else:
+            return True
+
+    def loadTracker(self):
+        """
+            Load the tracker results from ``self.result_path``.
+            Returns False when an IOError occurs or when the loader reports
+            a falsy result, True otherwise.
+        """
+        try:
+            loaded = self._loadData(
+                self.result_path, cls=self.cls, loading_groundtruth=False)
+        except IOError:
+            return False
+        return True if loaded else False
+
+    def _loadData(self,
+                  root_dir,
+                  cls,
+                  min_score=-1000,
+                  loading_groundtruth=False):
+        """
+            Generic loader for ground truth and tracking data.
+            Use loadGroundtruth() or loadTracker() to load this data.
+            Loads detections in KITTI format from textfiles.
+
+            Args:
+                root_dir: directory containing one ``<sequence>.txt`` file per
+                    entry of ``self.sequence_name``.
+                cls: class to load; "car" also loads "van", "pedestrian" also
+                    loads "person_sitting"; "dontcare" is always loaded.
+                min_score: accepted but not used by this method.
+                loading_groundtruth: True stores the data as ground truth
+                    (split into objects and DontCare areas), False stores it
+                    as tracker output.
+
+            Returns:
+                True on success. False if a tracker file repeats a
+                (frame, track id) pair or contains no trajectory. None if a
+                tracker line has neither 17 nor 18 fields.
+
+            Raises:
+                IOError: if a sequence file cannot be opened.
+        """
+        # construct objectDetections object to hold detection data
+        t_data = tData()
+        eval_2d = True
+        eval_3d = True
+
+        # classes that should be loaded (ignored neighboring classes);
+        # this only depends on ``cls`` so it is built once, not per line
+        cls_lower = cls.lower()
+        if "car" in cls_lower:
+            classes = ["car", "van"]
+        elif "pedestrian" in cls_lower:
+            classes = ["pedestrian", "person_sitting"]
+        else:
+            classes = [cls_lower]
+        classes += ["dontcare"]
+
+        seq_data = []
+        n_trajectories = 0
+        n_trajectories_seq = []
+        for seq, s_name in enumerate(self.sequence_name):
+            filename = os.path.join(root_dir, "%s.txt" % s_name)
+
+            f_data = [
+                [] for _ in range(self.n_frames[seq])
+            ]  # current set has only 1059 entries, sufficient length is checked anyway
+            seen_ids = set()
+            n_in_seq = 0
+            id_frame_cache = set()
+            # the context manager closes the file on the early returns below
+            with open(filename, "r") as seq_file:
+                for line in seq_file:
+                    # KITTI tracking benchmark data format:
+                    # (frame,tracklet_id,objectType,truncation,occlusion,alpha,x1,y1,x2,y2,h,w,l,X,Y,Z,ry)
+                    line = line.strip()
+                    fields = line.split(" ")
+                    obj_type = fields[2].lower()
+                    if not any(s for s in classes if s in obj_type):
+                        continue
+                    # get fields from table
+                    t_data.frame = int(float(fields[0]))  # frame
+                    t_data.track_id = int(float(fields[1]))  # id
+                    t_data.obj_type = obj_type  # object type [car, pedestrian, cyclist, ...]
+                    t_data.truncation = int(
+                        float(fields[3]))  # truncation [-1,0,1,2]
+                    t_data.occlusion = int(
+                        float(fields[4]))  # occlusion  [-1,0,1,2]
+                    t_data.obs_angle = float(
+                        fields[5])  # observation angle [rad]
+                    t_data.x1 = float(fields[6])  # left   [px]
+                    t_data.y1 = float(fields[7])  # top    [px]
+                    t_data.x2 = float(fields[8])  # right  [px]
+                    t_data.y2 = float(fields[9])  # bottom [px]
+                    t_data.h = float(fields[10])  # height [m]
+                    t_data.w = float(fields[11])  # width  [m]
+                    t_data.l = float(fields[12])  # length [m]
+                    t_data.X = float(fields[13])  # X [m]
+                    t_data.Y = float(fields[14])  # Y [m]
+                    t_data.Z = float(fields[15])  # Z [m]
+                    t_data.yaw = float(fields[16])  # yaw angle [rad]
+                    if not loading_groundtruth:
+                        if len(fields) == 17:
+                            t_data.score = -1
+                        elif len(fields) == 18:
+                            t_data.score = float(
+                                fields[17])  # detection score
+                        else:
+                            logger.info("file is not in KITTI format")
+                            return
+
+                    # do not consider objects marked as invalid
+                    # (value comparison: ``is`` must not be used with ints)
+                    if t_data.track_id == -1 and t_data.obj_type != "dontcare":
+                        continue
+
+                    idx = t_data.frame
+                    # check if length for frame data is sufficient
+                    if idx >= len(f_data):
+                        print("extend f_data", idx, len(f_data))
+                        # "+ 1" so that index ``idx`` itself becomes valid
+                        f_data += [
+                            []
+                            for _ in range(max(500, idx - len(f_data) + 1))
+                        ]
+
+                    id_frame = (t_data.frame, t_data.track_id)
+                    if id_frame in id_frame_cache and not loading_groundtruth:
+                        logger.info(
+                            "track ids are not unique for sequence %d: frame %d"
+                            % (seq, t_data.frame))
+                        logger.info(
+                            "track id %d occured at least twice for this frame"
+                            % t_data.track_id)
+                        logger.info("Exiting...")
+                        #continue # this allows to evaluate non-unique result files
+                        return False
+                    id_frame_cache.add(id_frame)
+                    try:
+                        # ``t_data`` is reused for every line, so store a copy
+                        f_data[t_data.frame].append(copy.copy(t_data))
+                    except Exception:
+                        print(len(f_data), idx)
+                        raise
+
+                    if t_data.track_id not in seen_ids and t_data.obj_type != "dontcare":
+                        seen_ids.add(t_data.track_id)
+                        n_trajectories += 1
+                        n_in_seq += 1
+
+                    # check if uploaded data provides information for 2D and 3D evaluation
+                    if not loading_groundtruth and eval_2d is True and (
+                            t_data.x1 == -1 or t_data.x2 == -1 or
+                            t_data.y1 == -1 or t_data.y2 == -1):
+                        eval_2d = False
+                    if not loading_groundtruth and eval_3d is True and (
+                            t_data.X == -1000 or t_data.Y == -1000 or
+                            t_data.Z == -1000):
+                        eval_3d = False
+
+            # only add existing frames
+            n_trajectories_seq.append(n_in_seq)
+            seq_data.append(f_data)
+
+        if not loading_groundtruth:
+            self.tracker = seq_data
+            self.n_tr_trajectories = n_trajectories
+            self.eval_2d = eval_2d
+            self.eval_3d = eval_3d
+            self.n_tr_seq = n_trajectories_seq
+            if self.n_tr_trajectories == 0:
+                return False
+        else:
+            # split ground truth and DontCare areas
+            self.dcareas = []
+            self.groundtruth = []
+            for seq_gt in seq_data:
+                s_g, s_dc = [], []
+                for all_gt in seq_gt:
+                    g, dc = [], []
+                    for gg in all_gt:
+                        if gg.obj_type == "dontcare":
+                            dc.append(gg)
+                        else:
+                            g.append(gg)
+                    s_g.append(g)
+                    s_dc.append(dc)
+                self.dcareas.append(s_dc)
+                self.groundtruth.append(s_g)
+            self.n_gt_seq = n_trajectories_seq
+            self.n_gt_trajectories = n_trajectories
+        return True
+
+    def boxoverlap(self, a, b, criterion="union"):
+        """
+            Overlap of the boxes a and b, read from their x1/y1/x2/y2 attributes.
+            criterion 'union': intersection / (area(a) + area(b) - intersection).
+            criterion 'a': intersection / area(a), where b should be a dontcare area.
+            Boxes without a positive intersection yield 0. before the criterion
+            is inspected; any other criterion raises TypeError.
+        """
+        inter_w = min(a.x2, b.x2) - max(a.x1, b.x1)
+        inter_h = min(a.y2, b.y2) - max(a.y1, b.y1)
+        if not (inter_w > 0. and inter_h > 0.):
+            return 0.
+
+        inter = inter_w * inter_h
+        area_a = (a.x2 - a.x1) * (a.y2 - a.y1)
+        area_b = (b.x2 - b.x1) * (b.y2 - b.y1)
+
+        mode = criterion.lower()
+        if mode == "union":
+            # intersection over union overlap
+            return inter / float(area_a + area_b - inter)
+        if mode == "a":
+            return float(inter) / float(area_a)
+        raise TypeError("Unkown type for criterion")
+
+    def compute3rdPartyMetrics(self):
+        """
+            Computes the metrics defined in
+                - Stiefelhagen 2008: Evaluating Multiple Object Tracking Performance: The CLEAR MOT Metrics
+                  MOTA, MOTAL, MOTP
+                - Nevatia 2008: Global Data Association for Multi-Object Tracking Using Network Flows
+                  MT/PT/ML
+
+            Ground truth and tracker detections are associated frame by frame
+            with the Hungarian method; the resulting counters and scores are
+            accumulated on self (tp, fp, fn, MOTA, MOTP, MT, ...).
+
+            Returns True on completion; raises NameError when one of the
+            per-frame sanity checks on the counters fails.
+        """
+        # construct Munkres object for Hungarian Method association
+        hm = Munkres()
+        max_cost = 1e9
+
+        # go through all frames and associate ground truth and tracker results
+        # groundtruth and tracker contain lists for every single frame containing lists of KITTI format detections
+        fr, ids = 0, 0
+        for seq_idx in range(len(self.groundtruth)):
+            seq_gt = self.groundtruth[seq_idx]
+            seq_dc = self.dcareas[seq_idx]  # don't care areas
+            seq_tracker = self.tracker[seq_idx]
+            seq_trajectories = defaultdict(list)
+            seq_ignored = defaultdict(list)
+
+            # statistics over the current sequence, check the corresponding
+            # variable comments in __init__ to get their meaning
+            seqtp = 0
+            seqitp = 0
+            seqfn = 0
+            seqifn = 0
+            seqfp = 0
+            seqigt = 0
+            seqitr = 0
+
+            last_ids = [[], []]
+            n_gts = 0
+            n_trs = 0
+
+            for f in range(len(seq_gt)):
+                g = seq_gt[f]
+                dc = seq_dc[f]
+
+                t = seq_tracker[f]
+                # counting total number of ground truth and tracker objects
+                self.n_gt += len(g)
+                self.n_tr += len(t)
+
+                n_gts += len(g)
+                n_trs += len(t)
+
+                # use hungarian method to associate, using boxoverlap 0..1 as cost
+                # build cost matrix
+                cost_matrix = []
+                this_ids = [[], []]
+                for gg in g:
+                    # save current ids
+                    this_ids[0].append(gg.track_id)
+                    this_ids[1].append(-1)
+                    gg.tracker = -1
+                    gg.id_switch = 0
+                    gg.fragmentation = 0
+                    cost_row = []
+                    for tt in t:
+                        # overlap == 1 is cost ==0
+                        c = 1 - self.boxoverlap(gg, tt)
+                        # gating for boxoverlap
+                        if c <= self.min_overlap:
+                            cost_row.append(c)
+                        else:
+                            cost_row.append(max_cost)  # = 1e9
+                    cost_matrix.append(cost_row)
+                    # all ground truth trajectories are initially not associated
+                    # extend groundtruth trajectories lists (merge lists)
+                    seq_trajectories[gg.track_id].append(-1)
+                    seq_ignored[gg.track_id].append(False)
+
+                # compare by value: identity ("is") on ints only happens to
+                # work for CPython's cached small integers
+                if len(g) == 0:
+                    cost_matrix = [[]]
+                # associate
+                association_matrix = hm.compute(cost_matrix)
+
+                # tmp variables for sanity checks and MODP computation
+                tmptp = 0
+                tmpfp = 0
+                tmpfn = 0
+                tmpc = 0  # this will sum up the overlaps for all true positives
+                tmpcs = [0] * len(
+                    g)  # this will save the overlaps for all true positives
+                # the reason is that some true positives might be ignored
+                # later such that the corresponding overlaps can
+                # be subtracted from tmpc for MODP computation
+
+                # mapping for tracker ids and ground truth ids
+                for row, col in association_matrix:
+                    # apply gating on boxoverlap
+                    c = cost_matrix[row][col]
+                    if c < max_cost:
+                        g[row].tracker = t[col].track_id
+                        this_ids[1][row] = t[col].track_id
+                        t[col].valid = True
+                        g[row].distance = c
+                        self.total_cost += 1 - c
+                        tmpc += 1 - c
+                        tmpcs[row] = 1 - c
+                        seq_trajectories[g[row].track_id][-1] = t[col].track_id
+
+                        # true positives are only valid associations
+                        self.tp += 1
+                        tmptp += 1
+                    else:
+                        g[row].tracker = -1
+                        self.fn += 1
+                        tmpfn += 1
+
+                # associate tracker and DontCare areas
+                # ignore tracker in neighboring classes
+                nignoredtracker = 0  # number of ignored tracker detections
+                ignoredtrackers = dict()  # will associate the track_id with -1
+                # if it is not ignored and 1 if it is
+                # ignored;
+                # this is used to avoid double counting ignored
+                # cases, see the next loop
+
+                for tt in t:
+                    ignoredtrackers[tt.track_id] = -1
+                    # ignore detection if it belongs to a neighboring class or is
+                    # smaller or equal to the minimum height
+
+                    tt_height = abs(tt.y1 - tt.y2)
+                    if ((self.cls == "car" and tt.obj_type == "van") or
+                        (self.cls == "pedestrian" and
+                         tt.obj_type == "person_sitting") or
+                            tt_height <= self.min_height) and not tt.valid:
+                        nignoredtracker += 1
+                        tt.ignored = True
+                        ignoredtrackers[tt.track_id] = 1
+                        continue
+                    for d in dc:
+                        overlap = self.boxoverlap(tt, d, "a")
+                        if overlap > 0.5 and not tt.valid:
+                            tt.ignored = True
+                            nignoredtracker += 1
+                            ignoredtrackers[tt.track_id] = 1
+                            break
+
+                # check for ignored FN/TP (truncation or neighboring object class)
+                ignoredfn = 0  # the number of ignored false negatives
+                nignoredtp = 0  # the number of ignored true positives
+                nignoredpairs = 0  # the number of ignored pairs, i.e. a true positive
+                # which is ignored but where the associated tracker
+                # detection has already been ignored
+
+                gi = 0
+                for gg in g:
+                    if gg.tracker < 0:
+                        if gg.occlusion>self.max_occlusion or gg.truncation>self.max_truncation\
+                                or (self.cls=="car" and gg.obj_type=="van") or (self.cls=="pedestrian" and gg.obj_type=="person_sitting"):
+                            seq_ignored[gg.track_id][-1] = True
+                            gg.ignored = True
+                            ignoredfn += 1
+
+                    elif gg.tracker >= 0:
+                        if gg.occlusion>self.max_occlusion or gg.truncation>self.max_truncation\
+                                or (self.cls=="car" and gg.obj_type=="van") or (self.cls=="pedestrian" and gg.obj_type=="person_sitting"):
+
+                            seq_ignored[gg.track_id][-1] = True
+                            gg.ignored = True
+                            nignoredtp += 1
+
+                            # if the associated tracker detection is already ignored,
+                            # we want to avoid double counting ignored detections
+                            if ignoredtrackers[gg.tracker] > 0:
+                                nignoredpairs += 1
+
+                            # for computing MODP, the overlaps from ignored detections
+                            # are subtracted
+                            tmpc -= tmpcs[gi]
+                    gi += 1
+
+                # the below might be confusing, check the comments in __init__
+                # to see what the individual statistics represent
+
+                # correct TP by number of ignored TP due to truncation
+                # ignored TP are shown as tracked in visualization
+                tmptp -= nignoredtp
+
+                # count the number of ignored true positives
+                self.itp += nignoredtp
+
+                # adjust the number of ground truth objects considered
+                self.n_gt -= (ignoredfn + nignoredtp)
+
+                # count the number of ignored ground truth objects
+                self.n_igt += ignoredfn + nignoredtp
+
+                # count the number of ignored tracker objects
+                self.n_itr += nignoredtracker
+
+                # count the number of ignored pairs, i.e. associated tracker and
+                # ground truth objects that are both ignored
+                self.n_igttr += nignoredpairs
+
+                # false negatives = associated gt bboxes exceeding association threshold + non-associated gt bboxes
+                tmpfn += len(g) - len(association_matrix) - ignoredfn
+                self.fn += len(g) - len(association_matrix) - ignoredfn
+                self.ifn += ignoredfn
+
+                # false positives = tracker bboxes - associated tracker bboxes
+                # mismatches (mme_t)
+                tmpfp += len(
+                    t) - tmptp - nignoredtracker - nignoredtp + nignoredpairs
+                self.fp += len(
+                    t) - tmptp - nignoredtracker - nignoredtp + nignoredpairs
+
+                # update sequence data
+                seqtp += tmptp
+                seqitp += nignoredtp
+                seqfp += tmpfp
+                seqfn += tmpfn
+                seqifn += ignoredfn
+                seqigt += ignoredfn + nignoredtp
+                seqitr += nignoredtracker
+
+                # sanity checks
+                # - the number of true positives minus ignored true positives
+                #   should be greater or equal to 0
+                # - the number of false negatives should be greater or equal to 0
+                # - the number of false positives needs to be greater or equal to 0
+                #   otherwise ignored detections might be counted double
+                # - the number of counted true positives (plus ignored ones)
+                #   and the number of counted false negatives (plus ignored ones)
+                #   should match the total number of ground truth objects
+                # - the number of counted true positives (plus ignored ones)
+                #   and the number of counted false positives
+                #   plus the number of ignored tracker detections should
+                #   match the total number of tracker detections; note that
+                #   nignoredpairs is subtracted here to avoid double counting
+                #   of ignored detections in nignoredtp and nignoredtracker
+                if tmptp < 0:
+                    print(tmptp, nignoredtp)
+                    raise NameError("Something went wrong! TP is negative")
+                if tmpfn < 0:
+                    print(tmpfn,
+                          len(g),
+                          len(association_matrix), ignoredfn, nignoredpairs)
+                    raise NameError("Something went wrong! FN is negative")
+                if tmpfp < 0:
+                    print(tmpfp,
+                          len(t), tmptp, nignoredtracker, nignoredtp,
+                          nignoredpairs)
+                    raise NameError("Something went wrong! FP is negative")
+                # value comparison (!=), not identity: equal counts above the
+                # small-int cache are distinct objects and would trip the check
+                if tmptp + tmpfn != len(g) - ignoredfn - nignoredtp:
+                    print("seqidx", seq_idx)
+                    print("frame ", f)
+                    print("TP    ", tmptp)
+                    print("FN    ", tmpfn)
+                    print("FP    ", tmpfp)
+                    print("nGT   ", len(g))
+                    print("nAss  ", len(association_matrix))
+                    print("ign GT", ignoredfn)
+                    print("ign TP", nignoredtp)
+                    raise NameError(
+                        "Something went wrong! nGroundtruth is not TP+FN")
+                if tmptp + tmpfp + nignoredtp + nignoredtracker - nignoredpairs != len(
+                        t):
+                    print(seq_idx, f, len(t), tmptp, tmpfp)
+                    print(len(association_matrix), association_matrix)
+                    raise NameError(
+                        "Something went wrong! nTracker is not TP+FP")
+
+                # check for id switches or fragmentations
+                for i, tt in enumerate(this_ids[0]):
+                    if tt in last_ids[0]:
+                        idx = last_ids[0].index(tt)
+                        tid = this_ids[1][i]
+                        lid = last_ids[1][idx]
+                        if tid != lid and lid != -1 and tid != -1:
+                            if g[i].truncation < self.max_truncation:
+                                g[i].id_switch = 1
+                                ids += 1
+                        if tid != lid and lid != -1:
+                            if g[i].truncation < self.max_truncation:
+                                g[i].fragmentation = 1
+                                fr += 1
+
+                # save current index
+                last_ids = this_ids
+                # compute MODP_t
+                MODP_t = 1
+                if tmptp != 0:
+                    MODP_t = tmpc / float(tmptp)
+                self.MODP_t.append(MODP_t)
+
+            # store the trajectories gathered for the current sequence
+            self.gt_trajectories[seq_idx] = seq_trajectories
+            self.ign_trajectories[seq_idx] = seq_ignored
+
+            # gather statistics for "per sequence" statistics.
+            self.n_gts.append(n_gts)
+            self.n_trs.append(n_trs)
+            self.tps.append(seqtp)
+            self.itps.append(seqitp)
+            self.fps.append(seqfp)
+            self.fns.append(seqfn)
+            self.ifns.append(seqifn)
+            self.n_igts.append(seqigt)
+            self.n_itrs.append(seqitr)
+
+        # compute MT/PT/ML, fragments, idswitches for all groundtruth trajectories
+        n_ignored_tr_total = 0
+        for seq_trajectories, seq_ignored in zip(self.gt_trajectories,
+                                                 self.ign_trajectories):
+            if len(seq_trajectories) == 0:
+                continue
+            for g, ign_g in zip(seq_trajectories.values(),
+                                seq_ignored.values()):
+                # all frames of this gt trajectory are ignored
+                if all(ign_g):
+                    n_ignored_tr_total += 1
+                    continue
+                # all frames of this gt trajectory are not assigned to any detections
+                if all([this == -1 for this in g]):
+                    self.ML += 1
+                    continue
+                # compute tracked frames in trajectory
+                last_id = g[0]
+                # first detection (necessary to be in gt_trajectories) is always tracked
+                tracked = 1 if g[0] >= 0 else 0
+                lgt = 0 if ign_g[0] else 1
+                for f in range(1, len(g)):
+                    if ign_g[f]:
+                        last_id = -1
+                        continue
+                    lgt += 1
+                    if last_id != g[f] and last_id != -1 and g[f] != -1 and g[
+                            f - 1] != -1:
+                        self.id_switches += 1
+                    if f < len(g) - 1 and g[f - 1] != g[
+                            f] and last_id != -1 and g[f] != -1 and g[f +
+                                                                      1] != -1:
+                        self.fragments += 1
+                    if g[f] != -1:
+                        tracked += 1
+                        last_id = g[f]
+                # handle last frame; tracked state is handled in for loop (g[f]!=-1)
+                # (f is only read when len(g) > 1, i.e. when the loop above ran)
+                if len(g) > 1 and g[f - 1] != g[f] and last_id != -1 and g[
+                        f] != -1 and not ign_g[f]:
+                    self.fragments += 1
+
+                # compute MT/PT/ML
+                tracking_ratio = tracked / float(len(g) - sum(ign_g))
+                if tracking_ratio > 0.8:
+                    self.MT += 1
+                elif tracking_ratio < 0.2:
+                    self.ML += 1
+                else:  # 0.2 <= tracking_ratio <= 0.8
+                    self.PT += 1
+
+        if (self.n_gt_trajectories - n_ignored_tr_total) == 0:
+            self.MT = 0.
+            self.PT = 0.
+            self.ML = 0.
+        else:
+            self.MT /= float(self.n_gt_trajectories - n_ignored_tr_total)
+            self.PT /= float(self.n_gt_trajectories - n_ignored_tr_total)
+            self.ML /= float(self.n_gt_trajectories - n_ignored_tr_total)
+
+        # precision/recall etc.
+        if (self.fp + self.tp) == 0 or (self.tp + self.fn) == 0:
+            self.recall = 0.
+            self.precision = 0.
+        else:
+            self.recall = self.tp / float(self.tp + self.fn)
+            self.precision = self.tp / float(self.fp + self.tp)
+        if (self.recall + self.precision) == 0:
+            self.F1 = 0.
+        else:
+            self.F1 = 2. * (self.precision * self.recall) / (
+                self.precision + self.recall)
+        if sum(self.n_frames) == 0:
+            self.FAR = "n/a"
+        else:
+            self.FAR = self.fp / float(sum(self.n_frames))
+
+        # compute CLEARMOT
+        if self.n_gt == 0:
+            self.MOTA = -float("inf")
+            self.MODA = -float("inf")
+        else:
+            self.MOTA = 1 - (self.fn + self.fp + self.id_switches
+                             ) / float(self.n_gt)
+            self.MODA = 1 - (self.fn + self.fp) / float(self.n_gt)
+        if self.tp == 0:
+            self.MOTP = float("inf")
+        else:
+            self.MOTP = self.total_cost / float(self.tp)
+        if self.n_gt != 0:
+            if self.id_switches == 0:
+                self.MOTAL = 1 - (self.fn + self.fp + self.id_switches
+                                  ) / float(self.n_gt)
+            else:
+                self.MOTAL = 1 - (self.fn + self.fp +
+                                  math.log10(self.id_switches)
+                                  ) / float(self.n_gt)
+        else:
+            self.MOTAL = -float("inf")
+        if sum(self.n_frames) == 0:
+            self.MODP = "n/a"
+        else:
+            self.MODP = sum(self.MODP_t) / float(sum(self.n_frames))
+        return True
+
+    def createSummary(self):
+        """
+            Build the textual evaluation summary, one printEntry row per
+            statistic, separated by newlines.
+
+            Side effect kept from the previous implementation: when
+            self.n_gt is non-zero, self.fp, self.fn and self.id_switches are
+            overwritten with their ratios w.r.t. self.n_gt, so the method is
+            meant to be called once after compute3rdPartyMetrics. When
+            self.n_gt is zero the ratios are reported as "n/a" instead of
+            raising ZeroDivisionError, and the counters are left untouched.
+
+            Returns the summary as a single string.
+        """
+        entry = self.printEntry
+        rows = [
+            "tracking evaluation summary".center(80, "="),
+            entry("Multiple Object Tracking Accuracy (MOTA)", self.MOTA),
+            entry("Multiple Object Tracking Precision (MOTP)", self.MOTP),
+            entry("Multiple Object Tracking Accuracy (MOTAL)", self.MOTAL),
+            entry("Multiple Object Detection Accuracy (MODA)", self.MODA),
+            entry("Multiple Object Detection Precision (MODP)", self.MODP),
+            "",
+            entry("Recall", self.recall),
+            entry("Precision", self.precision),
+            entry("F1", self.F1),
+            entry("False Alarm Rate", self.FAR),
+            "",
+            entry("Mostly Tracked", self.MT),
+            entry("Partly Tracked", self.PT),
+            entry("Mostly Lost", self.ML),
+            "",
+            # absolute counts, reported before they are turned into ratios
+            entry("True Positives", self.tp),
+            entry("Ignored True Positives", self.itp),
+            entry("False Positives", self.fp),
+            entry("False Negatives", self.fn),
+            entry("ID-switches", self.id_switches),
+        ]
+
+        if self.n_gt == 0:
+            # no considered ground truth objects: ratios are undefined
+            fp_ratio = fn_ratio = ids_ratio = "n/a"
+        else:
+            fp_ratio = self.fp = self.fp / self.n_gt
+            fn_ratio = self.fn = self.fn / self.n_gt
+            ids_ratio = self.id_switches = self.id_switches / self.n_gt
+
+        rows += [
+            entry("False Positives Ratio", fp_ratio),
+            entry("False Negatives Ratio", fn_ratio),
+            entry("Ignored False Negatives Ratio", self.ifn),
+            entry("Missed Targets", fn_ratio),
+            entry("ID-switches", ids_ratio),
+            entry("Fragmentations", self.fragments),
+            "",
+            entry("Ground Truth Objects (Total)", self.n_gt + self.n_igt),
+            entry("Ignored Ground Truth Objects", self.n_igt),
+            entry("Ground Truth Trajectories", self.n_gt_trajectories),
+            "",
+            entry("Tracker Objects (Total)", self.n_tr),
+            entry("Ignored Tracker Objects", self.n_itr),
+            entry("Tracker Trajectories", self.n_tr_trajectories),
+            "=" * 80,
+        ]
+        return "\n".join(rows)
+
+    def printEntry(self, key, val, width=(70, 10)):
+        """
+            Format one table row: key left-justified to width[0] columns,
+            followed by val right-aligned in width[1] columns.
+
+            Exact ints are rendered with %d and exact floats with %f; any
+            other value (including subclasses such as bool) is rendered
+            with %s.
+        """
+        label = key.ljust(width[0])
+        kind = type(val)
+        if kind == int:
+            return label + ("%%%dd" % width[1]) % val
+        if kind == float:
+            return label + ("%%%df" % (width[1])) % val
+        return label + ("%s" % val).rjust(width[1])
+
+    def saveToStats(self, save_summary):
+        """
+            Create the evaluation summary and optionally write it to
+            "summary_<cls>.txt" inside self.result_path.
+
+            save_summary: when truthy, the summary is written to disk.
+
+            Returns the summary string in either case.
+        """
+        summary = self.createSummary()
+        if save_summary:
+            filename = os.path.join(self.result_path,
+                                    "summary_%s.txt" % self.cls)
+            # the context manager closes the file even if the write fails
+            with open(filename, "w") as dump:
+                dump.write(summary)
+        return summary
+
+
+class KITTIMOTMetric(Metric):
+    """
+    Metric wrapper that collects KITTI tracking sequences through update()
+    and evaluates them with KITTIEvaluation in accumulate().
+
+    Args:
+        save_summary (bool): forwarded to the evaluator's saveToStats to
+            decide whether the summary is also written to disk.
+    """
+
+    def __init__(self, save_summary=True):
+        self.save_summary = save_summary
+        self.MOTEvaluator = KITTIEvaluation
+        self.result_root = None
+        # set by update(); initialised here so the attribute always exists
+        self.gt_path = None
+        self.reset()
+
+    def reset(self):
+        """Forget all registered sequences and the last summary."""
+        self.seqs = []
+        self.n_sequences = 0
+        self.n_frames = []
+        self.strsummary = ''
+
+    @staticmethod
+    def _max_frame_id(path, current_max=0):
+        """Return the largest first-column integer in `path`, at least current_max."""
+        with open(path, "r") as f:
+            for line in f:
+                first_field = line.strip().split(" ")[0]
+                # tolerate blank lines (e.g. a trailing newline at EOF)
+                if not first_field:
+                    continue
+                current_max = max(current_max, int(first_field))
+        return current_max
+
+    def update(self, data_root, seq, data_type, result_root, result_filename):
+        """
+        Register one sequence for evaluation.
+
+        The number of frames of the sequence is taken as the largest frame
+        index found in either the label file
+        '<data_root>/../labels/<seq>.txt' or `result_filename`, plus one.
+        """
+        assert data_type == 'kitti', "data_type should 'kitti'"
+        self.result_root = result_root
+        self.gt_path = data_root
+        gt_path = '{}/../labels/{}.txt'.format(data_root, seq)
+        max_frame = self._max_frame_id(gt_path)
+        max_frame = self._max_frame_id(result_filename, max_frame)
+        self.n_frames.append(max_frame + 1)
+        self.seqs.append(seq)
+        self.n_sequences += 1
+
+    def accumulate(self):
+        """
+        Run the evaluation over all registered sequences and store the
+        resulting summary in self.strsummary.
+
+        Returns None when the tracker results cannot be loaded or after a
+        normal run, and False when ground truth and results do not cover
+        the same number of sequences. Raises ValueError when the ground
+        truth cannot be loaded.
+        """
+        logger.info("Processing Result for KITTI Tracking Benchmark")
+        e = self.MOTEvaluator(
+            result_path=self.result_root,
+            gt_path=self.gt_path,
+            n_frames=self.n_frames,
+            seqs=self.seqs,
+            n_sequences=self.n_sequences)
+        try:
+            if not e.loadTracker():
+                return
+            logger.info("Loading Results - Success")
+            # the evaluated class lives on the evaluator; the previous code
+            # referenced an undefined name here and always hit the handler
+            logger.info("Evaluate Object Class: %s" % e.cls.upper())
+        except Exception as err:
+            # best effort, as before: report the problem and carry on
+            logger.info("Caught exception while loading result data: %s" %
+                        err)
+        if not e.loadGroundtruth():
+            raise ValueError("Ground truth not found.")
+        logger.info("Loading Groundtruth - Success")
+        # sanity checks (compare the counts by value, not by identity)
+        if len(e.groundtruth) != len(e.tracker):
+            logger.info(
+                "The uploaded data does not provide results for every sequence."
+            )
+            return False
+        logger.info("Loaded %d Sequences." % len(e.groundtruth))
+        logger.info("Start Evaluation...")
+
+        if e.compute3rdPartyMetrics():
+            self.strsummary = e.saveToStats(self.save_summary)
+        else:
+            logger.info(
+                "There seem to be no true positives or false positives at all in the submitted data."
+            )
+
+    def log(self):
+        """Print the last computed summary."""
+        print(self.strsummary)
+
+    def get_results(self):
+        """Return the last computed summary string."""
+        return self.strsummary

+ 428 - 0
paddlers/models/ppdet/metrics/munkres.py

@@ -0,0 +1,428 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+This code is borrowed from https://github.com/xingyizhou/CenterTrack/blob/master/src/tools/eval_kitti_track/munkres.py
+"""
+
+import sys
+
+__all__ = ['Munkres', 'make_cost_matrix']
+
+
+class Munkres:
+    """
+    Solve the classical assignment problem with the Munkres (Hungarian)
+    algorithm.
+
+    Typical use: ``Munkres().compute(cost_matrix)`` returns the list of
+    ``(row, column)`` pairs of a minimum-total-cost assignment.
+    """
+
+    def __init__(self):
+        """Create a new instance with empty working state."""
+        self.C = None  # padded, square working copy of the cost matrix
+        self.row_covered = []
+        self.col_covered = []
+        self.n = 0  # side length of the padded matrix
+        self.original_length = 0  # row count of the caller's matrix
+        self.original_width = 0  # column count of the caller's matrix
+        self.Z0_r = 0  # row of the uncovered primed zero found in step 4
+        self.Z0_c = 0  # column of that zero
+        self.marked = None  # per cell: 0 = unmarked, 1 = starred, 2 = primed
+        self.path = None  # scratch storage for the alternating path (step 5)
+
+    def make_cost_matrix(profit_matrix, inversion_function):
+        """
+        **DEPRECATED**
+
+        Please use the module function ``make_cost_matrix()``.
+        """
+        # Computed inline: the former ``import munkres`` resolved to a
+        # third-party package that may not be installed, not to this module.
+        return [[inversion_function(value) for value in row]
+                for row in profit_matrix]
+
+    make_cost_matrix = staticmethod(make_cost_matrix)
+
+    def pad_matrix(self, matrix, pad_value=0):
+        """
+        Pad a possibly non-square matrix to make it square.
+
+        :Parameters:
+            matrix : list of lists
+                matrix to pad
+
+            pad_value : int
+                value to use to pad the matrix
+
+        :rtype: list of lists
+        :return: a new, possibly padded, matrix
+        """
+        max_columns = 0
+        for row in matrix:
+            max_columns = max(max_columns, len(row))
+
+        # Side length of the square result.
+        size = max(max_columns, len(matrix))
+
+        new_matrix = []
+        for row in matrix:
+            new_row = list(row)
+            if size > len(new_row):
+                # Row too short. Pad it.
+                new_row += [pad_value] * (size - len(new_row))
+            new_matrix.append(new_row)
+
+        while len(new_matrix) < size:
+            new_matrix.append([pad_value] * size)
+
+        return new_matrix
+
+    def compute(self, cost_matrix):
+        """
+        Compute the indexes for the lowest-cost pairings between rows and
+        columns in the database. Returns a list of (row, column) tuples
+        that can be used to traverse the matrix.
+
+        :Parameters:
+            cost_matrix : list of lists
+                The cost matrix. If this cost matrix is not square, it
+                will be padded with zeros, via a call to ``pad_matrix()``.
+                (This method does *not* modify the caller's matrix. It
+                operates on a copy of the matrix.)
+
+                **WARNING**: This code handles square and rectangular
+                matrices. It does *not* handle irregular matrices.
+
+        :rtype: list
+        :return: A list of ``(row, column)`` tuples that describe the lowest
+                 cost path through the matrix. An empty matrix yields an
+                 empty list.
+
+        """
+        if not cost_matrix:
+            # Nothing to assign; also avoids indexing cost_matrix[0] below.
+            return []
+
+        self.C = self.pad_matrix(cost_matrix)
+        self.n = len(self.C)
+        self.original_length = len(cost_matrix)
+        self.original_width = len(cost_matrix[0])
+        self.row_covered = [False for i in range(self.n)]
+        self.col_covered = [False for i in range(self.n)]
+        self.Z0_r = 0
+        self.Z0_c = 0
+        self.path = self.__make_matrix(self.n * 2, 0)
+        self.marked = self.__make_matrix(self.n, 0)
+
+        steps = {
+            1: self.__step1,
+            2: self.__step2,
+            3: self.__step3,
+            4: self.__step4,
+            5: self.__step5,
+            6: self.__step6
+        }
+
+        # Each step returns the number of the next step; any number without
+        # an entry (step 3 returns 7) means the assignment is complete.
+        # Looping on membership keeps a KeyError raised *inside* a step from
+        # being mistaken for normal termination.
+        step = 1
+        while step in steps:
+            step = steps[step]()
+
+        # Look for the starred cells, ignoring rows/columns added by padding.
+        results = []
+        for i in range(self.original_length):
+            for j in range(self.original_width):
+                if self.marked[i][j] == 1:
+                    results.append((i, j))
+
+        return results
+
+    def __copy_matrix(self, matrix):
+        """Return an exact copy of the supplied matrix"""
+        # Imported locally: the module itself does not import ``copy``.
+        import copy
+        return copy.deepcopy(matrix)
+
+    def __make_matrix(self, n, val):
+        """Create an *n*x*n* matrix, populating it with the specific value."""
+        return [[val for _ in range(n)] for _ in range(n)]
+
+    def __step1(self):
+        """
+        For each row of the matrix, find the smallest element and
+        subtract it from every element in its row. Go to Step 2.
+        """
+        n = self.n
+        for row in self.C:
+            minval = min(row)
+            # Subtract the row minimum from every element in the row.
+            for j in range(n):
+                row[j] -= minval
+
+        return 2
+
+    def __step2(self):
+        """
+        Find a zero (Z) in the resulting matrix. If there is no starred
+        zero in its row or column, star Z. Repeat for each element in the
+        matrix. Go to Step 3.
+        """
+        n = self.n
+        for i in range(n):
+            for j in range(n):
+                if (self.C[i][j] == 0) and \
+                   (not self.col_covered[j]) and \
+                   (not self.row_covered[i]):
+                    self.marked[i][j] = 1
+                    # The covers serve here only as "already has a star"
+                    # flags; they are reset right after the scan.
+                    self.col_covered[j] = True
+                    self.row_covered[i] = True
+
+        self.__clear_covers()
+        return 3
+
+    def __step3(self):
+        """
+        Cover each column containing a starred zero. If K columns are
+        covered, the starred zeros describe a complete set of unique
+        assignments. In this case, Go to DONE, otherwise, Go to Step 4.
+        """
+        n = self.n
+        count = 0
+        for i in range(n):
+            for j in range(n):
+                if self.marked[i][j] == 1:
+                    self.col_covered[j] = True
+                    count += 1
+
+        if count >= n:
+            step = 7  # done
+        else:
+            step = 4
+
+        return step
+
+    def __step4(self):
+        """
+        Find a noncovered zero and prime it. If there is no starred zero
+        in the row containing this primed zero, Go to Step 5. Otherwise,
+        cover this row and uncover the column containing the starred
+        zero. Continue in this manner until there are no uncovered zeros
+        left. Save the smallest uncovered value and Go to Step 6.
+        """
+        step = 0
+        done = False
+        while not done:
+            (row, col) = self.__find_a_zero()
+            if row < 0:
+                # No uncovered zero left: the matrix has to be adjusted.
+                done = True
+                step = 6
+            else:
+                self.marked[row][col] = 2
+                star_col = self.__find_star_in_row(row)
+                if star_col >= 0:
+                    col = star_col
+                    self.row_covered[row] = True
+                    self.col_covered[col] = False
+                else:
+                    done = True
+                    self.Z0_r = row
+                    self.Z0_c = col
+                    step = 5
+
+        return step
+
+    def __step5(self):
+        """
+        Construct a series of alternating primed and starred zeros as
+        follows. Let Z0 represent the uncovered primed zero found in Step 4.
+        Let Z1 denote the starred zero in the column of Z0 (if any).
+        Let Z2 denote the primed zero in the row of Z1 (there will always
+        be one). Continue until the series terminates at a primed zero
+        that has no starred zero in its column. Unstar each starred zero
+        of the series, star each primed zero of the series, erase all
+        primes and uncover every line in the matrix. Return to Step 3
+        """
+        count = 0
+        path = self.path
+        path[count][0] = self.Z0_r
+        path[count][1] = self.Z0_c
+        done = False
+        while not done:
+            row = self.__find_star_in_col(path[count][1])
+            if row >= 0:
+                count += 1
+                path[count][0] = row
+                path[count][1] = path[count - 1][1]
+            else:
+                done = True
+
+            if not done:
+                col = self.__find_prime_in_row(path[count][0])
+                count += 1
+                path[count][0] = path[count - 1][0]
+                path[count][1] = col
+
+        self.__convert_path(path, count)
+        self.__clear_covers()
+        self.__erase_primes()
+        return 3
+
+    def __step6(self):
+        """
+        Add the value found in Step 4 to every element of each covered
+        row, and subtract it from every element of each uncovered column.
+        Return to Step 4 without altering any stars, primes, or covered
+        lines.
+        """
+        minval = self.__find_smallest()
+        for i in range(self.n):
+            for j in range(self.n):
+                if self.row_covered[i]:
+                    self.C[i][j] += minval
+                if not self.col_covered[j]:
+                    self.C[i][j] -= minval
+        return 4
+
+    def __find_smallest(self):
+        """Find the smallest uncovered value in the matrix."""
+        # Infinity rather than a fixed 2e9 sentinel, so costs above 2e9 are
+        # still handled.
+        minval = float("inf")
+        for i in range(self.n):
+            for j in range(self.n):
+                if (not self.row_covered[i]) and (not self.col_covered[j]):
+                    if minval > self.C[i][j]:
+                        minval = self.C[i][j]
+        return minval
+
+    def __find_a_zero(self):
+        """
+        Find an uncovered element with value 0. Returns ``(row, col)``, or
+        ``(-1, -1)`` when there is none.
+
+        The scan stops after the first row that contains an uncovered zero,
+        but always walks that row to its end, so the *last* uncovered zero
+        of that row is the one reported.
+        """
+        row = -1
+        col = -1
+        i = 0
+        n = self.n
+        done = False
+
+        while not done:
+            j = 0
+            while True:
+                if (self.C[i][j] == 0) and \
+                   (not self.row_covered[i]) and \
+                   (not self.col_covered[j]):
+                    row = i
+                    col = j
+                    done = True
+                j += 1
+                if j >= n:
+                    break
+            i += 1
+            if i >= n:
+                done = True
+
+        return (row, col)
+
+    def __find_star_in_row(self, row):
+        """
+        Find the first starred element in the specified row. Returns
+        the column index, or -1 if no starred element was found.
+        """
+        col = -1
+        for j in range(self.n):
+            if self.marked[row][j] == 1:
+                col = j
+                break
+
+        return col
+
+    def __find_star_in_col(self, col):
+        """
+        Find the first starred element in the specified column. Returns
+        the row index, or -1 if no starred element was found.
+        """
+        row = -1
+        for i in range(self.n):
+            if self.marked[i][col] == 1:
+                row = i
+                break
+
+        return row
+
+    def __find_prime_in_row(self, row):
+        """
+        Find the first primed element in the specified row. Returns
+        the column index, or -1 if no primed element was found.
+        """
+        col = -1
+        for j in range(self.n):
+            if self.marked[row][j] == 2:
+                col = j
+                break
+
+        return col
+
+    def __convert_path(self, path, count):
+        """
+        Flip the marks along the first ``count + 1`` cells of ``path``:
+        starred cells become unmarked, every other cell becomes starred.
+        """
+        for i in range(count + 1):
+            r, c = path[i][0], path[i][1]
+            if self.marked[r][c] == 1:
+                self.marked[r][c] = 0
+            else:
+                self.marked[r][c] = 1
+
+    def __clear_covers(self):
+        """Clear all covered matrix cells"""
+        for i in range(self.n):
+            self.row_covered[i] = False
+            self.col_covered[i] = False
+
+    def __erase_primes(self):
+        """Erase all prime markings"""
+        for i in range(self.n):
+            for j in range(self.n):
+                if self.marked[i][j] == 2:
+                    self.marked[i][j] = 0
+
+
+def make_cost_matrix(profit_matrix, inversion_function):
+    """
+    Build a cost matrix from a profit matrix.
+
+    Every entry of ``profit_matrix`` is passed through
+    ``inversion_function``, which takes one numeric argument and returns
+    the number to use as the corresponding cost.
+
+    For example:
+
+    .. python::
+
+        cost_matrix = make_cost_matrix(matrix, lambda x : sys.maxsize - x)
+
+    :Parameters:
+        profit_matrix : list of lists
+            The matrix to convert from a profit to a cost matrix
+
+        inversion_function : function
+            The function to use to invert each entry in the profit matrix
+
+    :rtype: list of lists
+    :return: The converted matrix
+    """
+    return [[inversion_function(entry) for entry in profit_row]
+            for profit_row in profit_matrix]

+ 393 - 0
paddlers/models/ppdet/metrics/widerface_utils.py

@@ -0,0 +1,393 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import os
+import cv2
+import numpy as np
+from collections import OrderedDict
+
+import paddle
+
+from paddlers.models.ppdet.utils.logger import setup_logger
+logger = setup_logger(__name__)
+
+__all__ = ['face_eval_run', 'lmk2out']
+
+
+def face_eval_run(model,
+                  image_dir,
+                  gt_file,
+                  pred_dir='output/pred',
+                  eval_mode='widerface',
+                  multi_scale=False):
+    """
+    Run face detection over every image named in ``gt_file`` and save the
+    predictions under ``pred_dir``.
+
+    Args:
+        model: detector passed through to ``detect_face`` and the
+            multi-scale helpers.
+        image_dir (str): directory the image names are joined onto.
+        gt_file (str): annotation file; parsed as repeated records of an
+            image-name line, a face-count line and that many further lines.
+        pred_dir (str): output directory for the prediction files.
+        eval_mode (str): ``'widerface'`` writes one file per image;
+            ``'fddb'`` appends ``.jpg`` to each name and writes one file.
+        multi_scale (bool): merge plain, flipped, multi-scale and pyramid
+            detections with ``bbox_vote`` instead of a single pass.
+    """
+    # load ground truth files
+    with open(gt_file, 'r') as gt_handle:
+        annotation_lines = gt_handle.readlines()
+
+    image_names = []
+    cursor = 0
+    while cursor < len(annotation_lines):
+        image_names.append(annotation_lines[cursor].strip('\n\t').split()[0])
+        face_count = int(annotation_lines[cursor + 1].strip('\n\t').split()[0])
+        # Skip the name line, the count line and the box lines.
+        cursor += 2 + face_count
+    logger.info('The ground truth file load {} images'.format(
+        len(image_names)))
+
+    dets_dist = OrderedDict()
+    for iter_id, im_path in enumerate(image_names):
+        image_path = os.path.join(image_dir, im_path)
+        if eval_mode == 'fddb':
+            image_path = image_path + '.jpg'
+        assert os.path.exists(image_path)
+        image = cv2.cvtColor(cv2.imread(image_path), cv2.COLOR_BGR2RGB)
+
+        if multi_scale:
+            shrink, max_shrink = get_shrink(image.shape[0], image.shape[1])
+            det_plain = detect_face(model, image, shrink)
+            det_flipped = flip_test(model, image, shrink)
+            det_small, det_big = multi_scale_test(model, image, max_shrink)
+            det_pyramid = multi_scale_test_pyramid(model, image, max_shrink)
+            stacked = np.row_stack((det_plain, det_flipped, det_small,
+                                    det_big, det_pyramid))
+            dets = bbox_vote(stacked)
+        else:
+            dets = detect_face(model, image, 1)
+
+        if eval_mode == 'widerface':
+            save_widerface_bboxes(image_path, dets, pred_dir)
+        else:
+            dets_dist[im_path] = dets
+
+        if iter_id % 100 == 0:
+            logger.info('Test iter {}'.format(iter_id))
+
+    if eval_mode == 'fddb':
+        save_fddb_bboxes(dets_dist, pred_dir)
+    logger.info("Finish evaluation.")
+
+
+def detect_face(model, image, shrink):
+    """
+    Run ``model`` on ``image`` rescaled by ``shrink`` and return its boxes.
+
+    Args:
+        model: callable detector; called with a dict holding ``image``,
+            ``im_shape`` and ``scale_factor`` tensors and expected to return
+            a dict with a ``'bbox'`` tensor.
+        image (np.ndarray): image indexed as height, width (, channel).
+        shrink (float): rescale factor; 1 leaves the image untouched.
+
+    Returns:
+        np.ndarray: one row per detection laid out as
+        xmin, ymin, xmax, ymax, score; a single all-zero row when the model
+        output holds only one element.
+    """
+    height, width = image.shape[0], image.shape[1]
+    if shrink != 1:
+        height, width = int(height * shrink), int(width * shrink)
+        image = cv2.resize(image, (width, height))
+
+    net_input = face_img_process(image)
+    shape_arr = np.asarray([[height, width]])
+    scale_arr = np.asarray([[shrink, shrink]])
+    data = {
+        "image": paddle.to_tensor(
+            net_input, dtype='float32'),
+        "im_shape": paddle.to_tensor(
+            shape_arr, dtype='float32'),
+        "scale_factor": paddle.to_tensor(
+            scale_arr, dtype='float32')
+    }
+    model.eval()
+    bbox = model(data)['bbox'].numpy()
+    if np.prod(bbox.shape) == 1:
+        logger.info("No face detected")
+        return np.array([[0, 0, 0, 0, 0]])
+
+    # Model columns: 1 = score, 2..5 = xmin, ymin, xmax, ymax.
+    return np.column_stack(
+        (bbox[:, 2], bbox[:, 3], bbox[:, 4], bbox[:, 5], bbox[:, 1]))
+
+
+def flip_test(model, image, shrink):
+    """
+    Detect faces on the horizontally flipped image and map the boxes back
+    to the coordinates of the unflipped image.
+
+    Returns:
+        np.ndarray: rows of xmin, ymin, xmax, ymax, score.
+    """
+    flipped = cv2.flip(image, 1)
+    det_f = detect_face(model, flipped, shrink)
+    img_width = image.shape[1]
+
+    det_t = np.zeros(det_f.shape)
+    # Mirroring swaps the roles of xmin and xmax.
+    det_t[:, 0] = img_width - det_f[:, 2]
+    det_t[:, 2] = img_width - det_f[:, 0]
+    # The y coordinates and the score carry over unchanged.
+    for column in (1, 3, 4):
+        det_t[:, column] = det_f[:, column]
+    return det_t
+
+
+def multi_scale_test(model, image, max_shrink):
+    """
+    Detect faces on one shrunken and one or more enlarged copies of the image.
+
+    Returns:
+        tuple: ``(det_s, det_b)`` -- detections from the shrunken scale and
+        from the larger scale(s), each filtered by box size.
+    """
+
+    def _box_sides(boxes):
+        # Width and height of every box, using the +1 pixel convention.
+        return (boxes[:, 2] - boxes[:, 0] + 1, boxes[:, 3] - boxes[:, 1] + 1)
+
+    def _keep_big(boxes):
+        widths, heights = _box_sides(boxes)
+        return boxes[np.where(np.maximum(widths, heights) > 30)[0], :]
+
+    def _keep_small(boxes):
+        widths, heights = _box_sides(boxes)
+        return boxes[np.where(np.minimum(widths, heights) < 100)[0], :]
+
+    # Shrink detecting is only used to detect big faces
+    if max_shrink >= 0.75:
+        st = 0.5
+    else:
+        st = 0.5 * max_shrink
+    det_s = _keep_big(detect_face(model, image, st))
+
+    # Enlarge one times
+    if max_shrink > 1:
+        bt = min(2, max_shrink)
+    else:
+        bt = (st + max_shrink) / 2
+    det_b = detect_face(model, image, bt)
+
+    # Enlarge small image x times for small faces
+    if max_shrink > 2:
+        bt *= 2
+        while bt < max_shrink:
+            det_b = np.row_stack((det_b, detect_face(model, image, bt)))
+            bt *= 2
+        det_b = np.row_stack((det_b, detect_face(model, image, max_shrink)))
+
+    if bt > 1:
+        # Enlarged images are only used to detect small faces.
+        det_b = _keep_small(det_b)
+    else:
+        # Shrinked images are only used to detect big faces.
+        det_b = _keep_big(det_b)
+    return det_s, det_b
+
+
+def multi_scale_test_pyramid(model, image, max_shrink):
+    """
+    Detect faces over a fixed pyramid of scales (0.25, then 0.75, 1.25,
+    1.5 and 1.75 where they do not exceed ``max_shrink``).
+
+    Returns:
+        np.ndarray: the size-filtered detections of all scales, stacked.
+    """
+
+    def _keep_big(boxes):
+        longest = np.maximum(boxes[:, 2] - boxes[:, 0] + 1,
+                             boxes[:, 3] - boxes[:, 1] + 1)
+        return boxes[np.where(longest > 30)[0], :]
+
+    def _keep_small(boxes):
+        shortest = np.minimum(boxes[:, 2] - boxes[:, 0] + 1,
+                              boxes[:, 3] - boxes[:, 1] + 1)
+        return boxes[np.where(shortest < 100)[0], :]
+
+    # Use image pyramids to detect faces
+    det_b = _keep_big(detect_face(model, image, 0.25))
+
+    for scale in [0.75, 1.25, 1.5, 1.75]:
+        if scale <= max_shrink:
+            det_temp = detect_face(model, image, scale)
+            if scale > 1:
+                # Enlarged images are only used to detect small faces.
+                det_temp = _keep_small(det_temp)
+            else:
+                # Shrinked images are only used to detect big faces.
+                det_temp = _keep_big(det_temp)
+            det_b = np.row_stack((det_b, det_temp))
+    return det_b
+
+
+def to_chw(image):
+    """
+    Move the channel axis of an HWC image to the front (CHW).
+
+    Args:
+        image (np.array): an image with HWC layout.
+
+    Returns:
+        np.array: the CHW view of a 3-D input; inputs of any other rank are
+        returned as they are.
+    """
+    if len(image.shape) != 3:
+        return image
+    # HWC to CHW
+    return np.transpose(image, (2, 0, 1))
+
+
+def face_img_process(image,
+                     mean=[104., 117., 123.],
+                     std=[127.502231, 127.502231, 127.502231]):
+    """
+    Turn an image into a normalised float32 batch of one.
+
+    The image is converted with ``to_chw``, cast to float32, has ``mean``
+    subtracted and is divided by ``std`` (both applied along the first
+    axis), then gets a leading batch axis.
+
+    Args:
+        image: array-like image.
+        mean (list): value subtracted along the first axis after ``to_chw``.
+        std (list): divisor applied along the first axis after ``to_chw``.
+
+    Returns:
+        np.ndarray: the processed image with a leading axis of length 1.
+    """
+    chw = to_chw(np.array(image)).astype('float32')
+    mean_col = np.array(mean)[:, np.newaxis, np.newaxis].astype('float32')
+    std_col = np.array(std)[:, np.newaxis, np.newaxis].astype('float32')
+    chw -= mean_col
+    chw /= std_col
+    return np.array([chw])
+
+
+def get_shrink(height, width):
+    """
+    Derive the base and the maximum rescale factor for an image.
+
+    Args:
+        height (int): image height.
+        width (int): image width.
+
+    Returns:
+        tuple: ``(shrink, max_shrink)``; ``shrink`` equals ``max_shrink``
+        when that is below 1 and is 1 otherwise.
+    """
+    pixels = height * width
+    # avoid out of memory
+    max_shrink_v1 = (0x7fffffff / 577.0 / pixels)**0.5
+    max_shrink_v2 = ((678 * 1024 * 2.0 * 2.0) / pixels)**0.5
+
+    def get_round(x, loc):
+        # Cut the decimal text of x down to `loc` digits after the point,
+        # but only when it has at least three of them.
+        text = str(x)
+        if '.' not in text:
+            return None
+        whole, frac = text.split('.')
+        if len(frac) < 3:
+            return x
+        return float(whole + '.' + frac[0:loc])
+
+    max_shrink = get_round(min(max_shrink_v1, max_shrink_v2), 2) - 0.3
+    # Larger factors are reduced by a further, growing margin; factors at or
+    # below 0.1 are clamped to 0.1.
+    if 1.5 <= max_shrink < 2:
+        max_shrink = max_shrink - 0.1
+    elif 2 <= max_shrink < 3:
+        max_shrink = max_shrink - 0.2
+    elif 3 <= max_shrink < 4:
+        max_shrink = max_shrink - 0.3
+    elif 4 <= max_shrink < 5:
+        max_shrink = max_shrink - 0.4
+    elif max_shrink >= 5:
+        max_shrink = max_shrink - 0.5
+    elif max_shrink <= 0.1:
+        max_shrink = 0.1
+
+    if max_shrink < 1:
+        shrink = max_shrink
+    else:
+        shrink = 1
+    return shrink, max_shrink
+
+
+def bbox_vote(det):
+    """
+    Merge overlapping detections by score-weighted box voting.
+
+    Boxes are visited in descending score order. Every box whose IoU with
+    the current best box is at least 0.3 is merged with it: the coordinates
+    are averaged with the scores as weights and the highest score is kept.
+    A box that overlaps nothing else is dropped, unless it is the last box
+    left.
+
+    Args:
+        det (np.ndarray): rows of xmin, ymin, xmax, ymax, score.
+
+    Returns:
+        np.ndarray: at most the first 750 merged rows, restricted to those
+        with a score of at least 0.01; shape ``(0, 5)`` when ``det`` is empty.
+    """
+    order = det[:, 4].ravel().argsort()[::-1]
+    det = det[order, :]
+
+    # Results are collected in a list and stacked once at the end, instead
+    # of using a bare `except:` to detect the first assignment.
+    merged = []
+    if det.shape[0] == 0:
+        # Placeholder row; its score is below the 0.01 threshold applied
+        # below, so it only keeps the result two-dimensional.
+        merged.append(np.array([[10, 10, 20, 20, 0.002]]))
+    while det.shape[0] > 0:
+        # IOU
+        area = (det[:, 2] - det[:, 0] + 1) * (det[:, 3] - det[:, 1] + 1)
+        xx1 = np.maximum(det[0, 0], det[:, 0])
+        yy1 = np.maximum(det[0, 1], det[:, 1])
+        xx2 = np.minimum(det[0, 2], det[:, 2])
+        yy2 = np.minimum(det[0, 3], det[:, 3])
+        w = np.maximum(0.0, xx2 - xx1 + 1)
+        h = np.maximum(0.0, yy2 - yy1 + 1)
+        inter = w * h
+        o = inter / (area[0] + area[:] - inter)
+
+        # nms
+        merge_index = np.where(o >= 0.3)[0]
+        if merge_index.shape[0] == 0:
+            # A degenerate lead box (zero area -> NaN IoU) matches nothing,
+            # not even itself; consume it anyway so the loop always ends.
+            merge_index = np.array([0])
+        det_accu = det[merge_index, :]
+        det = np.delete(det, merge_index, 0)
+        if merge_index.shape[0] <= 1:
+            if det.shape[0] == 0:
+                merged.append(det_accu)
+            continue
+        det_accu[:, 0:4] = det_accu[:, 0:4] * np.tile(det_accu[:, -1:], (1, 4))
+        max_score = np.max(det_accu[:, 4])
+        det_accu_sum = np.zeros((1, 5))
+        det_accu_sum[:, 0:4] = np.sum(det_accu[:, 0:4],
+                                      axis=0) / np.sum(det_accu[:, -1:])
+        det_accu_sum[:, 4] = max_score
+        merged.append(det_accu_sum)
+
+    dets = np.vstack(merged)
+    dets = dets[0:750, :]
+    keep_index = np.where(dets[:, 4] >= 0.01)[0]
+    dets = dets[keep_index, :]
+    return dets
+
+
+def save_widerface_bboxes(image_path, bboxes_scores, output_dir):
+    """
+    Write the detections of one image to
+    ``<output_dir>/<image_class>/<image_name without last 4 chars>.txt``.
+
+    The file holds ``<image_class>/<image_name>``, the number of boxes and
+    then one ``xmin ymin width height score`` line per box.
+
+    Args:
+        image_path (str): '/'-separated path; its last two components are
+            used as the image class and the image name.
+        bboxes_scores (np.ndarray): rows of xmin, ymin, xmax, ymax, score.
+        output_dir (str): root directory for the prediction files.
+    """
+    image_name = image_path.split('/')[-1]
+    image_class = image_path.split('/')[-2]
+    odir = os.path.join(output_dir, image_class)
+    # exist_ok avoids the check-then-create race of a separate exists() test.
+    os.makedirs(odir, exist_ok=True)
+
+    ofname = os.path.join(odir, '%s.txt' % (image_name[:-4]))
+    # The context manager closes the file even if a write fails.
+    with open(ofname, 'w') as f:
+        f.write('{:s}\n'.format(image_class + '/' + image_name))
+        f.write('{:d}\n'.format(bboxes_scores.shape[0]))
+        for box_score in bboxes_scores:
+            xmin, ymin, xmax, ymax, score = box_score
+            f.write('{:.1f} {:.1f} {:.1f} {:.1f} {:.3f}\n'.format(xmin, ymin, (
+                xmax - xmin + 1), (ymax - ymin + 1), score))
+    logger.info("The predicted result is saved as {}".format(ofname))
+
+
+def save_fddb_bboxes(bboxes_scores,
+                     output_dir,
+                     output_fname='pred_fddb_res.txt'):
+    """
+    Write the detections of all images to a single prediction file.
+
+    For every image the file holds the image path, the number of boxes and
+    then one ``xmin ymin width height score`` line per box.
+
+    Args:
+        bboxes_scores (dict): maps an image path to an array with rows of
+            xmin, ymin, xmax, ymax, score.
+        output_dir (str): directory for the prediction file.
+        output_fname (str): name of the prediction file.
+
+    Returns:
+        str: path of the written file.
+    """
+    os.makedirs(output_dir, exist_ok=True)
+    predict_file = os.path.join(output_dir, output_fname)
+    # `with` closes (and therefore flushes) the file; it used to be left open.
+    with open(predict_file, 'w') as f:
+        # items(), not the Python 2 only iteritems().
+        for image_path, dets in bboxes_scores.items():
+            f.write('{:s}\n'.format(image_path))
+            f.write('{:d}\n'.format(dets.shape[0]))
+            for box_score in dets:
+                xmin, ymin, xmax, ymax, score = box_score
+                width, height = xmax - xmin, ymax - ymin
+                f.write('{:.1f} {:.1f} {:.1f} {:.1f} {:.3f}\n'
+                        .format(xmin, ymin, width, height, score))
+    logger.info("The predicted result is saved as {}".format(predict_file))
+    return predict_file
+
+
+def lmk2out(results, is_bbox_normalized=False):
+    """
+    Decode predicted face landmarks into a flat list of result dicts.
+
+    Args:
+        results: iterable of dicts that provide `bbox`, `im_id`,
+                 `face_index`, `prior_boxes` and `landmark`;
+                 if is_bbox_normalized=True, also need `im_shape`.
+        is_bbox_normalized: whether or not landmark is normalized. When
+                 True the decoded landmarks are scaled by the image width
+                 (even positions) and height (odd positions).
+
+    Returns:
+        list of dicts with the keys `image_id`, `landmark` (array of 10
+        values) and `score`.
+    """
+    xywh_res = []
+    for t in results:
+        bboxes = t['bbox'][0]
+        # Test for None first: reading `.shape` of None would raise.
+        if bboxes is None or bboxes.shape == (1, 1):
+            continue
+        lengths = t['bbox'][1][0]
+        im_ids = np.array(t['im_id'][0]).flatten()
+        face_index = t['face_index'][0]
+        prior = np.reshape(t['prior_boxes'][0], (-1, 4))
+        predictlmk = np.reshape(t['landmark'][0], (-1, 10))
+
+        k = 0  # running index into bboxes across all images of the batch
+        for a in range(len(lengths)):
+            num = lengths[a]
+            im_id = int(im_ids[a])
+            for i in range(num):
+                score = bboxes[k][1]
+                # NOTE(review): face_index is indexed with the per-image
+                # counter `i` while bboxes uses the batch-wide `k`; confirm
+                # this is intended for batches with more than one image.
+                theindex = face_index[i][0]
+                me_prior = prior[theindex, :]
+                lmk_pred = predictlmk[theindex, :]
+                prior_w = me_prior[2] - me_prior[0]
+                prior_h = me_prior[3] - me_prior[1]
+                prior_w_center = (me_prior[2] + me_prior[0]) / 2
+                prior_h_center = (me_prior[3] + me_prior[1]) / 2
+                lmk_decode = np.zeros((10))
+                # Even positions are decoded against the prior's width and
+                # x centre, odd positions against its height and y centre.
+                for j in [0, 2, 4, 6, 8]:
+                    lmk_decode[j] = lmk_pred[
+                        j] * 0.1 * prior_w + prior_w_center
+                for j in [1, 3, 5, 7, 9]:
+                    lmk_decode[j] = lmk_pred[
+                        j] * 0.1 * prior_h + prior_h_center
+                if is_bbox_normalized:
+                    # `im_shape` is only read here, so it is only required
+                    # when the landmarks are normalized.
+                    im_shape = t['im_shape'][0][a].tolist()
+                    image_h, image_w = int(im_shape[0]), int(im_shape[1])
+                    lmk_decode = lmk_decode * np.array([
+                        image_w, image_h, image_w, image_h, image_w, image_h,
+                        image_w, image_h, image_w, image_h
+                    ])
+                lmk_res = {
+                    'image_id': im_id,
+                    'landmark': lmk_decode,
+                    'score': score,
+                }
+                xywh_res.append(lmk_res)
+                k += 1
+    return xywh_res

+ 18 - 0
paddlers/models/ppdet/model_zoo/__init__.py

@@ -0,0 +1,18 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from . import model_zoo
+from .model_zoo import *
+
+__all__ = model_zoo.__all__

+ 84 - 0
paddlers/models/ppdet/model_zoo/model_zoo.py

@@ -0,0 +1,84 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os.path as osp
+import pkg_resources
+
+try:
+    from collections.abc import Sequence
+except:
+    from collections import Sequence
+
+from paddlers.models.ppdet.core.workspace import load_config, create
+from paddlers.models.ppdet.utils.checkpoint import load_weight
+from paddlers.models.ppdet.utils.download import get_config_path
+
+from paddlers.models.ppdet.utils.logger import setup_logger
+logger = setup_logger(__name__)
+
+__all__ = [
+    'list_model', 'get_config_file', 'get_weights_url', 'get_model',
+    'MODEL_ZOO_FILENAME'
+]
+
+MODEL_ZOO_FILENAME = 'MODEL_ZOO'
+
+
+def list_model(filters=None):
+    """
+    Log the names listed in the MODEL_ZOO resource file that match every
+    filter.
+
+    Args:
+        filters (str|Sequence[str]|None): substring(s) a model name must
+            contain; a single non-sequence value is treated as one filter.
+            None or an empty sequence lists every model.
+
+    Raises:
+        ValueError: if filters were given and no model name matches them.
+    """
+    # Resolve the resource against the package this module actually lives
+    # in; the hard-coded 'ppdet.model_zoo' does not match the
+    # `paddlers.models.ppdet` package used by this file's imports.
+    model_zoo_file = pkg_resources.resource_filename(__package__,
+                                                     MODEL_ZOO_FILENAME)
+    with open(model_zoo_file) as f:
+        model_names = f.read().splitlines()
+
+    if filters is None:
+        filters = []
+    elif isinstance(filters, str) or not isinstance(filters, Sequence):
+        filters = [filters]
+
+    # keep only the names that contain every filter
+    model_names = [
+        name for name in model_names
+        if all(name.find(flt) >= 0 for flt in filters)
+    ]
+    if len(model_names) == 0 and len(filters) > 0:
+        raise ValueError("no model found, please check filters seeting, "
+                         "filters can be set as following kinds:\n"
+                         "\tDataset: coco, voc ...\n"
+                         "\tArchitecture: yolo, rcnn, ssd ...\n"
+                         "\tBackbone: resnet, vgg, darknet ...\n")
+
+    model_str = "Available Models:\n"
+    for model_name in model_names:
+        model_str += "\t{}\n".format(model_name)
+    logger.info(model_str)
+
+
+# models and configs are saved on bcebos under the dygraph directory
+def get_config_file(model_name):
+    """Resolve the ``ppdet://configs/<model_name>.yml`` URL via ``get_config_path``."""
+    config_url = "ppdet://configs/{}.yml".format(model_name)
+    return get_config_path(config_url)
+
+
+def get_weights_url(model_name):
+    """Return the ``ppdet://models/`` URL for the last path component of ``model_name``."""
+    _, weights_name = osp.split(model_name)
+    return "ppdet://models/{}.pdparams".format(weights_name)
+
+
+def get_model(model_name, pretrained=True):
+    """
+    Build the model described by the config of ``model_name``.
+
+    Args:
+        model_name (str): name used to look up the config file and the
+            weights URL.
+        pretrained (bool): load the weights from ``get_weights_url`` into
+            the created model.
+
+    Returns:
+        The object created for ``cfg.architecture``.
+    """
+    cfg = load_config(get_config_file(model_name))
+    model = create(cfg.architecture)
+    if not pretrained:
+        return model
+
+    load_weight(model, get_weights_url(model_name))
+    return model

+ 45 - 0
paddlers/models/ppdet/modeling/__init__.py

@@ -0,0 +1,45 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import warnings
+warnings.filterwarnings(
+    action='ignore', category=DeprecationWarning, module='ops')
+
+from . import ops
+from . import backbones
+from . import necks
+from . import proposal_generator
+from . import heads
+from . import losses
+from . import architectures
+from . import post_process
+from . import layers
+from . import reid
+from . import mot
+from . import transformers
+from . import assigners
+
+from .ops import *
+from .backbones import *
+from .necks import *
+from .proposal_generator import *
+from .heads import *
+from .losses import *
+from .architectures import *
+from .post_process import *
+from .layers import *
+from .reid import *
+from .mot import *
+from .transformers import *
+from .assigners import *

+ 51 - 0
paddlers/models/ppdet/modeling/architectures/__init__.py

@@ -0,0 +1,51 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+from . import meta_arch
+from . import faster_rcnn
+from . import mask_rcnn
+from . import yolo
+from . import cascade_rcnn
+from . import ssd
+from . import fcos
+from . import solov2
+from . import ttfnet
+from . import s2anet
+from . import keypoint_hrhrnet
+from . import keypoint_hrnet
+from . import jde
+from . import deepsort
+from . import fairmot
+from . import centernet
+from . import gfl
+from . import picodet
+from . import detr
+from . import sparse_rcnn
+from . import tood
+
+from .meta_arch import *
+from .faster_rcnn import *
+from .mask_rcnn import *
+from .yolo import *
+from .cascade_rcnn import *
+from .ssd import *
+from .fcos import *
+from .solov2 import *
+from .ttfnet import *
+from .s2anet import *
+from .keypoint_hrhrnet import *
+from .keypoint_hrnet import *
+from .jde import *
+from .deepsort import *
+from .fairmot import *
+from .centernet import *
+from .blazeface import *
+from .gfl import *
+from .picodet import *
+from .detr import *
+from .sparse_rcnn import *
+from .tood import *

+ 91 - 0
paddlers/models/ppdet/modeling/architectures/blazeface.py

@@ -0,0 +1,91 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from paddlers.models.ppdet.core.workspace import register, create
+from .meta_arch import BaseArch
+
+__all__ = ['BlazeFace']
+
+
+@register
+class BlazeFace(BaseArch):
+    """
+    BlazeFace: Sub-millisecond Neural Face Detection on Mobile GPUs,
+               see https://arxiv.org/abs/1907.05047
+
+    Args:
+        backbone (nn.Layer): backbone instance
+        neck (nn.Layer): neck instance
+        blaze_head (nn.Layer): `blazeHead` instance
+        post_process (object): `BBoxPostProcess` instance
+    """
+
+    __category__ = 'architecture'
+    __inject__ = ['post_process']
+
+    def __init__(self, backbone, blaze_head, neck, post_process):
+        super(BlazeFace, self).__init__()
+        self.backbone = backbone
+        self.neck = neck
+        self.blaze_head = blaze_head
+        self.post_process = post_process
+
+    @classmethod
+    def from_config(cls, cfg, *args, **kwargs):
+        # backbone
+        backbone = create(cfg['backbone'])
+        # fpn
+        kwargs = {'input_shape': backbone.out_shape}
+        neck = create(cfg['neck'], **kwargs)
+        # head
+        kwargs = {'input_shape': neck.out_shape}
+        blaze_head = create(cfg['blaze_head'], **kwargs)
+
+        return {
+            'backbone': backbone,
+            'neck': neck,
+            'blaze_head': blaze_head,
+        }
+
+    def _forward(self):
+        # Backbone
+        body_feats = self.backbone(self.inputs)
+        # neck
+        neck_feats = self.neck(body_feats)
+        # blaze Head
+        if self.training:
+            return self.blaze_head(neck_feats, self.inputs['image'],
+                                   self.inputs['gt_bbox'],
+                                   self.inputs['gt_class'])
+        else:
+            preds, anchors = self.blaze_head(neck_feats, self.inputs['image'])
+            bbox, bbox_num = self.post_process(preds, anchors,
+                                               self.inputs['im_shape'],
+                                               self.inputs['scale_factor'])
+            return bbox, bbox_num
+
+    def get_loss(self, ):
+        return {"loss": self._forward()}
+
+    def get_pred(self):
+        bbox_pred, bbox_num = self._forward()
+        output = {
+            "bbox": bbox_pred,
+            "bbox_num": bbox_num,
+        }
+        return output

+ 144 - 0
paddlers/models/ppdet/modeling/architectures/cascade_rcnn.py

@@ -0,0 +1,144 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import paddle
+from paddlers.models.ppdet.core.workspace import register, create
+from .meta_arch import BaseArch
+
+__all__ = ['CascadeRCNN']
+
+
+@register
+class CascadeRCNN(BaseArch):
+    """
+    Cascade R-CNN network, see https://arxiv.org/abs/1712.00726
+
+    Args:
+        backbone (object): backbone instance
+        rpn_head (object): `RPNHead` instance
+        bbox_head (object): `BBoxHead` instance
+        bbox_post_process (object): `BBoxPostProcess` instance
+        neck (object): 'FPN' instance
+        mask_head (object): `MaskHead` instance
+        mask_post_process (object): `MaskPostProcess` instance
+    """
+    __category__ = 'architecture'
+    __inject__ = [
+        'bbox_post_process',
+        'mask_post_process',
+    ]
+
+    def __init__(self,
+                 backbone,
+                 rpn_head,
+                 bbox_head,
+                 bbox_post_process,
+                 neck=None,
+                 mask_head=None,
+                 mask_post_process=None):
+        super(CascadeRCNN, self).__init__()
+        self.backbone = backbone
+        self.rpn_head = rpn_head
+        self.bbox_head = bbox_head
+        self.bbox_post_process = bbox_post_process
+        self.neck = neck
+        self.mask_head = mask_head
+        self.mask_post_process = mask_post_process
+        self.with_mask = mask_head is not None
+
+    @classmethod
+    def from_config(cls, cfg, *args, **kwargs):
+        backbone = create(cfg['backbone'])
+        kwargs = {'input_shape': backbone.out_shape}
+        neck = cfg['neck'] and create(cfg['neck'], **kwargs)
+
+        out_shape = neck and neck.out_shape or backbone.out_shape
+        kwargs = {'input_shape': out_shape}
+        rpn_head = create(cfg['rpn_head'], **kwargs)
+        bbox_head = create(cfg['bbox_head'], **kwargs)
+
+        out_shape = neck and out_shape or bbox_head.get_head().out_shape
+        kwargs = {'input_shape': out_shape}
+        mask_head = cfg['mask_head'] and create(cfg['mask_head'], **kwargs)
+        return {
+            'backbone': backbone,
+            'neck': neck,
+            "rpn_head": rpn_head,
+            "bbox_head": bbox_head,
+            "mask_head": mask_head,
+        }
+
+    def _forward(self):
+        body_feats = self.backbone(self.inputs)
+        if self.neck is not None:
+            body_feats = self.neck(body_feats)
+
+        if self.training:
+            rois, rois_num, rpn_loss = self.rpn_head(body_feats, self.inputs)
+            bbox_loss, bbox_feat = self.bbox_head(body_feats, rois, rois_num,
+                                                  self.inputs)
+            rois, rois_num = self.bbox_head.get_assigned_rois()
+            bbox_targets = self.bbox_head.get_assigned_targets()
+            if self.with_mask:
+                mask_loss = self.mask_head(body_feats, rois, rois_num,
+                                           self.inputs, bbox_targets,
+                                           bbox_feat)
+                return rpn_loss, bbox_loss, mask_loss
+            else:
+                return rpn_loss, bbox_loss, {}
+        else:
+            rois, rois_num, _ = self.rpn_head(body_feats, self.inputs)
+            preds, _ = self.bbox_head(body_feats, rois, rois_num, self.inputs)
+            refined_rois = self.bbox_head.get_refined_rois()
+
+            im_shape = self.inputs['im_shape']
+            scale_factor = self.inputs['scale_factor']
+
+            bbox, bbox_num = self.bbox_post_process(
+                preds, (refined_rois, rois_num), im_shape, scale_factor)
+            # rescale the prediction back to origin image
+            bbox_pred = self.bbox_post_process.get_pred(bbox, bbox_num,
+                                                        im_shape, scale_factor)
+            if not self.with_mask:
+                return bbox_pred, bbox_num, None
+            mask_out = self.mask_head(body_feats, bbox, bbox_num, self.inputs)
+            origin_shape = self.bbox_post_process.get_origin_shape()
+            mask_pred = self.mask_post_process(mask_out[:, 0, :, :], bbox_pred,
+                                               bbox_num, origin_shape)
+            return bbox_pred, bbox_num, mask_pred
+
+    def get_loss(self, ):
+        rpn_loss, bbox_loss, mask_loss = self._forward()
+        loss = {}
+        loss.update(rpn_loss)
+        loss.update(bbox_loss)
+        if self.with_mask:
+            loss.update(mask_loss)
+        total_loss = paddle.add_n(list(loss.values()))
+        loss.update({'loss': total_loss})
+        return loss
+
+    def get_pred(self):
+        bbox_pred, bbox_num, mask_pred = self._forward()
+        output = {
+            'bbox': bbox_pred,
+            'bbox_num': bbox_num,
+        }
+        if self.with_mask:
+            output.update({'mask': mask_pred})
+        return output

+ 108 - 0
paddlers/models/ppdet/modeling/architectures/centernet.py

@@ -0,0 +1,108 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from paddlers.models.ppdet.core.workspace import register, create
+from .meta_arch import BaseArch
+
+__all__ = ['CenterNet']
+
+
+@register
+class CenterNet(BaseArch):
+    """
+    CenterNet network, see http://arxiv.org/abs/1904.07850
+
+    Args:
+        backbone (object): backbone instance
+        neck (object): FPN instance, default use 'CenterNetDLAFPN'
+        head (object): 'CenterNetHead' instance
+        post_process (object): 'CenterNetPostProcess' instance
+        for_mot (bool): whether return other features used in tracking model
+
+    """
+    __category__ = 'architecture'
+    __inject__ = ['post_process']
+    __shared__ = ['for_mot']
+
+    def __init__(self,
+                 backbone,
+                 neck='CenterNetDLAFPN',
+                 head='CenterNetHead',
+                 post_process='CenterNetPostProcess',
+                 for_mot=False):
+        super(CenterNet, self).__init__()
+        self.backbone = backbone
+        self.neck = neck
+        self.head = head
+        self.post_process = post_process
+        self.for_mot = for_mot
+
+    @classmethod
+    def from_config(cls, cfg, *args, **kwargs):
+        backbone = create(cfg['backbone'])
+
+        kwargs = {'input_shape': backbone.out_shape}
+        neck = cfg['neck'] and create(cfg['neck'], **kwargs)
+
+        out_shape = neck and neck.out_shape or backbone.out_shape
+        kwargs = {'input_shape': out_shape}
+        head = create(cfg['head'], **kwargs)
+
+        return {'backbone': backbone, 'neck': neck, "head": head}
+
+    def _forward(self):
+        neck_feat = self.backbone(self.inputs)
+        if self.neck is not None:
+            neck_feat = self.neck(neck_feat)
+        head_out = self.head(neck_feat, self.inputs)
+        if self.for_mot:
+            head_out.update({'neck_feat': neck_feat})
+        elif self.training:
+            head_out['loss'] = head_out.pop('det_loss')
+        return head_out
+
+    def get_pred(self):
+        head_out = self._forward()
+        if self.for_mot:
+            bbox, bbox_inds, topk_clses = self.post_process(
+                head_out['heatmap'],
+                head_out['size'],
+                head_out['offset'],
+                im_shape=self.inputs['im_shape'],
+                scale_factor=self.inputs['scale_factor'])
+            output = {
+                "bbox": bbox,
+                "bbox_inds": bbox_inds,
+                "topk_clses": topk_clses,
+                "neck_feat": head_out['neck_feat']
+            }
+        else:
+            bbox, bbox_num, _ = self.post_process(
+                head_out['heatmap'],
+                head_out['size'],
+                head_out['offset'],
+                im_shape=self.inputs['im_shape'],
+                scale_factor=self.inputs['scale_factor'])
+            output = {
+                "bbox": bbox,
+                "bbox_num": bbox_num,
+            }
+        return output
+
+    def get_loss(self):
+        return self._forward()

+ 69 - 0
paddlers/models/ppdet/modeling/architectures/deepsort.py

@@ -0,0 +1,69 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import paddle
+from paddlers.models.ppdet.core.workspace import register, create
+from .meta_arch import BaseArch
+from paddlers.models.ppdet.modeling.mot.utils import Detection, get_crops, scale_coords, clip_box
+
+__all__ = ['DeepSORT']
+
+
+@register
+class DeepSORT(BaseArch):
+    """
+    DeepSORT network, see https://arxiv.org/abs/1703.07402
+
+    Args:
+        detector (object): detector model instance
+        reid (object): reid model instance
+        tracker (object): tracker instance
+    """
+    __category__ = 'architecture'
+
+    def __init__(self,
+                 detector='YOLOv3',
+                 reid='PCBPyramid',
+                 tracker='DeepSORTTracker'):
+        super(DeepSORT, self).__init__()
+        self.detector = detector
+        self.reid = reid
+        self.tracker = tracker
+
+    @classmethod
+    def from_config(cls, cfg, *args, **kwargs):
+        if cfg['detector'] != 'None':
+            detector = create(cfg['detector'])
+        else:
+            detector = None
+        reid = create(cfg['reid'])
+        tracker = create(cfg['tracker'])
+
+        return {
+            "detector": detector,
+            "reid": reid,
+            "tracker": tracker,
+        }
+
+    def _forward(self):
+        crops = self.inputs['crops']
+        features = self.reid(crops)
+        return features
+
+    def get_pred(self):
+        return self._forward()

+ 93 - 0
paddlers/models/ppdet/modeling/architectures/detr.py

@@ -0,0 +1,93 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import paddle
+from .meta_arch import BaseArch
+from paddlers.models.ppdet.core.workspace import register, create
+
+__all__ = ['DETR']
+
+
+@register
+class DETR(BaseArch):
+    __category__ = 'architecture'
+    __inject__ = ['post_process']
+
+    def __init__(self,
+                 backbone,
+                 transformer,
+                 detr_head,
+                 post_process='DETRBBoxPostProcess'):
+        super(DETR, self).__init__()
+        self.backbone = backbone
+        self.transformer = transformer
+        self.detr_head = detr_head
+        self.post_process = post_process
+
+    @classmethod
+    def from_config(cls, cfg, *args, **kwargs):
+        # backbone
+        backbone = create(cfg['backbone'])
+        # transformer
+        kwargs = {'input_shape': backbone.out_shape}
+        transformer = create(cfg['transformer'], **kwargs)
+        # head
+        kwargs = {
+            'hidden_dim': transformer.hidden_dim,
+            'nhead': transformer.nhead,
+            'input_shape': backbone.out_shape
+        }
+        detr_head = create(cfg['detr_head'], **kwargs)
+
+        return {
+            'backbone': backbone,
+            'transformer': transformer,
+            "detr_head": detr_head,
+        }
+
+    def _forward(self):
+        # Backbone
+        body_feats = self.backbone(self.inputs)
+
+        # Transformer
+        out_transformer = self.transformer(body_feats, self.inputs['pad_mask'])
+
+        # DETR Head
+        if self.training:
+            return self.detr_head(out_transformer, body_feats, self.inputs)
+        else:
+            preds = self.detr_head(out_transformer, body_feats)
+            bbox, bbox_num = self.post_process(preds, self.inputs['im_shape'],
+                                               self.inputs['scale_factor'])
+            return bbox, bbox_num
+
+    def get_loss(self, ):
+        losses = self._forward()
+        losses.update({
+            'loss':
+            paddle.add_n([v for k, v in losses.items() if 'log' not in k])
+        })
+        return losses
+
+    def get_pred(self):
+        bbox_pred, bbox_num = self._forward()
+        output = {
+            "bbox": bbox_pred,
+            "bbox_num": bbox_num,
+        }
+        return output

+ 100 - 0
paddlers/models/ppdet/modeling/architectures/fairmot.py

@@ -0,0 +1,100 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import paddle
+from paddlers.models.ppdet.core.workspace import register, create
+from .meta_arch import BaseArch
+
+__all__ = ['FairMOT']
+
+
+@register
+class FairMOT(BaseArch):
+    """
+    FairMOT network, see http://arxiv.org/abs/2004.01888
+
+    Args:
+        detector (object): 'CenterNet' instance
+        reid (object): 'FairMOTEmbeddingHead' instance
+        tracker (object): 'JDETracker' instance
+        loss (object): 'FairMOTLoss' instance
+
+    """
+
+    __category__ = 'architecture'
+    __inject__ = ['loss']
+
+    def __init__(self,
+                 detector='CenterNet',
+                 reid='FairMOTEmbeddingHead',
+                 tracker='JDETracker',
+                 loss='FairMOTLoss'):
+        super(FairMOT, self).__init__()
+        self.detector = detector
+        self.reid = reid
+        self.tracker = tracker
+        self.loss = loss
+
+    @classmethod
+    def from_config(cls, cfg, *args, **kwargs):
+        detector = create(cfg['detector'])
+        detector_out_shape = detector.neck and detector.neck.out_shape or detector.backbone.out_shape
+
+        kwargs = {'input_shape': detector_out_shape}
+        reid = create(cfg['reid'], **kwargs)
+        loss = create(cfg['loss'])
+        tracker = create(cfg['tracker'])
+
+        return {
+            'detector': detector,
+            'reid': reid,
+            'loss': loss,
+            'tracker': tracker
+        }
+
+    def _forward(self):
+        loss = dict()
+        # det_outs keys:
+        # train: neck_feat, det_loss, heatmap_loss, size_loss, offset_loss (optional: iou_loss)
+        # eval/infer: neck_feat, bbox, bbox_inds
+        det_outs = self.detector(self.inputs)
+        neck_feat = det_outs['neck_feat']
+        if self.training:
+            reid_loss = self.reid(neck_feat, self.inputs)
+
+            det_loss = det_outs['det_loss']
+            loss = self.loss(det_loss, reid_loss)
+            for k, v in det_outs.items():
+                if 'loss' not in k:
+                    continue
+                loss.update({k: v})
+            loss.update({'reid_loss': reid_loss})
+            return loss
+        else:
+            pred_dets, pred_embs = self.reid(
+                neck_feat, self.inputs, det_outs['bbox'],
+                det_outs['bbox_inds'], det_outs['topk_clses'])
+            return pred_dets, pred_embs
+
+    def get_pred(self):
+        output = self._forward()
+        return output
+
+    def get_loss(self):
+        loss = self._forward()
+        return loss

+ 106 - 0
paddlers/models/ppdet/modeling/architectures/faster_rcnn.py

@@ -0,0 +1,106 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import paddle
+from paddlers.models.ppdet.core.workspace import register, create
+from .meta_arch import BaseArch
+
+__all__ = ['FasterRCNN']
+
+
+@register
+class FasterRCNN(BaseArch):
+    """
+    Faster R-CNN network, see https://arxiv.org/abs/1506.01497
+
+    Args:
+        backbone (object): backbone instance
+        rpn_head (object): `RPNHead` instance
+        bbox_head (object): `BBoxHead` instance
+        bbox_post_process (object): `BBoxPostProcess` instance
+        neck (object): 'FPN' instance
+    """
+    __category__ = 'architecture'
+    __inject__ = ['bbox_post_process']
+
+    def __init__(self,
+                 backbone,
+                 rpn_head,
+                 bbox_head,
+                 bbox_post_process,
+                 neck=None):
+        super(FasterRCNN, self).__init__()
+        self.backbone = backbone
+        self.neck = neck
+        self.rpn_head = rpn_head
+        self.bbox_head = bbox_head
+        self.bbox_post_process = bbox_post_process
+
+    @classmethod
+    def from_config(cls, cfg, *args, **kwargs):
+        backbone = create(cfg['backbone'])
+        kwargs = {'input_shape': backbone.out_shape}
+        neck = cfg['neck'] and create(cfg['neck'], **kwargs)
+
+        out_shape = neck and neck.out_shape or backbone.out_shape
+        kwargs = {'input_shape': out_shape}
+        rpn_head = create(cfg['rpn_head'], **kwargs)
+        bbox_head = create(cfg['bbox_head'], **kwargs)
+        return {
+            'backbone': backbone,
+            'neck': neck,
+            "rpn_head": rpn_head,
+            "bbox_head": bbox_head,
+        }
+
+    def _forward(self):
+        body_feats = self.backbone(self.inputs)
+        if self.neck is not None:
+            body_feats = self.neck(body_feats)
+        if self.training:
+            rois, rois_num, rpn_loss = self.rpn_head(body_feats, self.inputs)
+            bbox_loss, _ = self.bbox_head(body_feats, rois, rois_num,
+                                          self.inputs)
+            return rpn_loss, bbox_loss
+        else:
+            rois, rois_num, _ = self.rpn_head(body_feats, self.inputs)
+            preds, _ = self.bbox_head(body_feats, rois, rois_num, None)
+
+            im_shape = self.inputs['im_shape']
+            scale_factor = self.inputs['scale_factor']
+            bbox, bbox_num = self.bbox_post_process(preds, (rois, rois_num),
+                                                    im_shape, scale_factor)
+
+            # rescale the prediction back to origin image
+            bbox_pred = self.bbox_post_process.get_pred(bbox, bbox_num,
+                                                        im_shape, scale_factor)
+            return bbox_pred, bbox_num
+
+    def get_loss(self, ):
+        rpn_loss, bbox_loss = self._forward()
+        loss = {}
+        loss.update(rpn_loss)
+        loss.update(bbox_loss)
+        total_loss = paddle.add_n(list(loss.values()))
+        loss.update({'loss': total_loss})
+        return loss
+
+    def get_pred(self):
+        bbox_pred, bbox_num = self._forward()
+        output = {'bbox': bbox_pred, 'bbox_num': bbox_num}
+        return output

+ 105 - 0
paddlers/models/ppdet/modeling/architectures/fcos.py

@@ -0,0 +1,105 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import paddle
+from paddlers.models.ppdet.core.workspace import register, create
+from .meta_arch import BaseArch
+
+__all__ = ['FCOS']
+
+
+@register
+class FCOS(BaseArch):
+    """
+    FCOS network, see https://arxiv.org/abs/1904.01355
+
+    Args:
+        backbone (object): backbone instance
+        neck (object): 'FPN' instance
+        fcos_head (object): 'FCOSHead' instance
+        post_process (object): 'FCOSPostProcess' instance
+    """
+
+    __category__ = 'architecture'
+    __inject__ = ['fcos_post_process']
+
+    def __init__(self,
+                 backbone,
+                 neck,
+                 fcos_head='FCOSHead',
+                 fcos_post_process='FCOSPostProcess'):
+        super(FCOS, self).__init__()
+        self.backbone = backbone
+        self.neck = neck
+        self.fcos_head = fcos_head
+        self.fcos_post_process = fcos_post_process
+
+    @classmethod
+    def from_config(cls, cfg, *args, **kwargs):
+        backbone = create(cfg['backbone'])
+
+        kwargs = {'input_shape': backbone.out_shape}
+        neck = create(cfg['neck'], **kwargs)
+
+        kwargs = {'input_shape': neck.out_shape}
+        fcos_head = create(cfg['fcos_head'], **kwargs)
+
+        return {
+            'backbone': backbone,
+            'neck': neck,
+            "fcos_head": fcos_head,
+        }
+
+    def _forward(self):
+        body_feats = self.backbone(self.inputs)
+        fpn_feats = self.neck(body_feats)
+        fcos_head_outs = self.fcos_head(fpn_feats, self.training)
+        if not self.training:
+            scale_factor = self.inputs['scale_factor']
+            bboxes = self.fcos_post_process(fcos_head_outs, scale_factor)
+            return bboxes
+        else:
+            return fcos_head_outs
+
+    def get_loss(self, ):
+        loss = {}
+        tag_labels, tag_bboxes, tag_centerness = [], [], []
+        for i in range(len(self.fcos_head.fpn_stride)):
+            # labels, reg_target, centerness
+            k_lbl = 'labels{}'.format(i)
+            if k_lbl in self.inputs:
+                tag_labels.append(self.inputs[k_lbl])
+            k_box = 'reg_target{}'.format(i)
+            if k_box in self.inputs:
+                tag_bboxes.append(self.inputs[k_box])
+            k_ctn = 'centerness{}'.format(i)
+            if k_ctn in self.inputs:
+                tag_centerness.append(self.inputs[k_ctn])
+
+        fcos_head_outs = self._forward()
+        loss_fcos = self.fcos_head.get_loss(fcos_head_outs, tag_labels,
+                                            tag_bboxes, tag_centerness)
+        loss.update(loss_fcos)
+        total_loss = paddle.add_n(list(loss.values()))
+        loss.update({'loss': total_loss})
+        return loss
+
+    def get_pred(self):
+        bbox_pred, bbox_num = self._forward()
+        output = {'bbox': bbox_pred, 'bbox_num': bbox_num}
+        return output

+ 87 - 0
paddlers/models/ppdet/modeling/architectures/gfl.py

@@ -0,0 +1,87 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import paddle
+from paddlers.models.ppdet.core.workspace import register, create
+from .meta_arch import BaseArch
+
+__all__ = ['GFL']
+
+
+@register
+class GFL(BaseArch):
+    """
+    Generalized Focal Loss detector, see https://arxiv.org/abs/2006.04388
+
+    Args:
+        backbone (object): backbone instance
+        neck (object): 'FPN' instance
+        head (object): 'GFLHead' instance
+    """
+
+    __category__ = 'architecture'
+
+    def __init__(self, backbone, neck, head='GFLHead'):
+        super(GFL, self).__init__()
+        # Assigned left to right, i.e. backbone, then neck, then head.
+        self.backbone, self.neck, self.head = backbone, neck, head
+
+    @classmethod
+    def from_config(cls, cfg, *args, **kwargs):
+        """Build the sub-modules, chaining each ``out_shape`` into the next."""
+        backbone = create(cfg['backbone'])
+        neck = create(cfg['neck'], input_shape=backbone.out_shape)
+        head = create(cfg['head'], input_shape=neck.out_shape)
+        return dict(backbone=backbone, neck=neck, head=head)
+
+    def _forward(self):
+        """Run backbone -> neck -> head.
+
+        Returns:
+            The head outputs while training; otherwise the
+            ``(bboxes, bbox_num)`` pair returned by ``head.post_process``.
+        """
+        head_outputs = self.head(self.neck(self.backbone(self.inputs)))
+        if self.training:
+            return head_outputs
+
+        shape = self.inputs['im_shape']
+        scale = self.inputs['scale_factor']
+        boxes, box_counts = self.head.post_process(head_outputs, shape, scale)
+        return boxes, box_counts
+
+    def get_loss(self, ):
+        """Return the head losses plus a ``'loss'`` entry holding their sum."""
+        head_outputs = self._forward()
+        loss = dict(self.head.get_loss(head_outputs, self.inputs))
+        loss['loss'] = paddle.add_n(list(loss.values()))
+        return loss
+
+    def get_pred(self):
+        """Return the inference result as ``{'bbox': ..., 'bbox_num': ...}``."""
+        boxes, box_counts = self._forward()
+        return {'bbox': boxes, 'bbox_num': box_counts}

+ 111 - 0
paddlers/models/ppdet/modeling/architectures/jde.py

@@ -0,0 +1,111 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from paddlers.models.ppdet.core.workspace import register, create
+from .meta_arch import BaseArch
+
+__all__ = ['JDE']
+
+
+@register
+class JDE(BaseArch):
+    """
+    JDE network, see https://arxiv.org/abs/1909.12605v1
+
+    Args:
+        detector (object): detector model instance
+        reid (object): reid model instance
+        tracker (object): tracker instance
+        metric (str): 'MOTDet' for training and detection evaluation, 'ReID'
+            for ReID embedding evaluation, or 'MOT' for multi object tracking
+            evaluation.
+    """
+    # NOTE: the docstring must be the first statement of the class body,
+    # otherwise it is a plain string expression and ``JDE.__doc__`` is None.
+    __category__ = 'architecture'
+    __shared__ = ['metric']
+
+    def __init__(self,
+                 detector='YOLOv3',
+                 reid='JDEEmbeddingHead',
+                 tracker='JDETracker',
+                 metric='MOT'):
+        super(JDE, self).__init__()
+        self.detector = detector
+        self.reid = reid
+        self.tracker = tracker
+        self.metric = metric
+
+    @classmethod
+    def from_config(cls, cfg, *args, **kwargs):
+        """Build detector, reid head and tracker from ``cfg``.
+
+        The reid head receives the detector neck's ``out_shape`` as its
+        ``input_shape``.
+        """
+        detector = create(cfg['detector'])
+        kwargs = {'input_shape': detector.neck.out_shape}
+
+        reid = create(cfg['reid'], **kwargs)
+
+        tracker = create(cfg['tracker'])
+
+        return {
+            "detector": detector,
+            "reid": reid,
+            "tracker": tracker,
+        }
+
+    def _forward(self):
+        """Run the detector, then the reid head according to mode and metric.
+
+        Returns:
+            * training: the losses returned by ``self.reid``;
+            * eval with ``metric == 'MOTDet'``: a dict with ``'bbox'`` and
+              ``'bbox_num'`` taken from the detector outputs;
+            * eval with ``metric == 'MOT'``: the ``(pred_dets, pred_embs)``
+              pair returned by ``self.reid``.
+
+        Raises:
+            ValueError: in eval mode when ``self.metric`` is neither
+                ``'MOTDet'`` nor ``'MOT'``.
+        """
+        det_outs = self.detector(self.inputs)
+
+        if self.training:
+            emb_feats = det_outs['emb_feats']
+            loss_confs = det_outs['det_losses']['loss_confs']
+            loss_boxes = det_outs['det_losses']['loss_boxes']
+            jde_losses = self.reid(
+                emb_feats,
+                self.inputs,
+                loss_confs=loss_confs,
+                loss_boxes=loss_boxes)
+            return jde_losses
+        else:
+            if self.metric == 'MOTDet':
+                det_results = {
+                    'bbox': det_outs['bbox'],
+                    'bbox_num': det_outs['bbox_num'],
+                }
+                return det_results
+
+            elif self.metric == 'MOT':
+                emb_feats = det_outs['emb_feats']
+                bboxes = det_outs['bbox']
+                boxes_idx = det_outs['boxes_idx']
+                nms_keep_idx = det_outs['nms_keep_idx']
+
+                pred_dets, pred_embs = self.reid(
+                    emb_feats,
+                    self.inputs,
+                    bboxes=bboxes,
+                    boxes_idx=boxes_idx,
+                    nms_keep_idx=nms_keep_idx)
+                return pred_dets, pred_embs
+
+            else:
+                raise ValueError(
+                    "Unknown metric {} for multi object tracking.".format(
+                        self.metric))
+
+    def get_loss(self):
+        """Return the training losses (delegates to ``_forward``)."""
+        return self._forward()
+
+    def get_pred(self):
+        """Return the inference outputs (delegates to ``_forward``)."""
+        return self._forward()

+ 287 - 0
paddlers/models/ppdet/modeling/architectures/keypoint_hrhrnet.py

@@ -0,0 +1,287 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from scipy.optimize import linear_sum_assignment
+from collections import abc, defaultdict
+import numpy as np
+import paddle
+
+from paddlers.models.ppdet.core.workspace import register, create, serializable
+from .meta_arch import BaseArch
+from .. import layers as L
+from ..keypoint_utils import transpred
+
+__all__ = ['HigherHRNet']
+
+
+@register
+class HigherHRNet(BaseArch):
+    __category__ = 'architecture'
+
+    def __init__(self,
+                 backbone='HRNet',
+                 hrhrnet_head='HrHRNetHead',
+                 post_process='HrHRNetPostProcess',
+                 eval_flip=True,
+                 flip_perm=None,
+                 max_num_people=30):
+        """
+        HigherHRNet network, see https://arxiv.org/abs/1908.10357;
+        HigherHRNet+swahr, see https://arxiv.org/abs/2012.15175
+
+        Args:
+            backbone (nn.Layer): backbone instance
+            hrhrnet_head (nn.Layer): keypoint_head instance
+            post_process (object): post-processing callable applied to the
+                top-k outputs in eval mode
+            eval_flip (bool): when True (and not deploying), the horizontally
+                flipped image is also evaluated and both results are merged
+            flip_perm (list|None): channel indices used to re-order the
+                outputs of the flipped image; required when flip evaluation
+                is actually performed
+            max_num_people (int): number of top responses kept per joint in
+                `get_topk`
+        """
+        super(HigherHRNet, self).__init__()
+        self.backbone = backbone
+        self.hrhrnet_head = hrhrnet_head
+        self.post_process = post_process
+        self.flip = eval_flip
+        # `flip_perm` defaults to None; converting None to a tensor fails, so
+        # only convert when a permutation was actually supplied.
+        self.flip_perm = (paddle.to_tensor(flip_perm)
+                          if flip_perm is not None else None)
+        self.deploy = False
+        self.interpolate = L.Upsample(2, mode='bilinear')
+        self.pool = L.MaxPool(5, 1, 2)
+        self.max_num_people = max_num_people
+
+    @classmethod
+    def from_config(cls, cfg, *args, **kwargs):
+        # backbone
+        backbone = create(cfg['backbone'])
+        # head
+        kwargs = {'input_shape': backbone.out_shape}
+        hrhrnet_head = create(cfg['hrhrnet_head'], **kwargs)
+        post_process = create(cfg['post_process'])
+
+        return {
+            'backbone': backbone,
+            "hrhrnet_head": hrhrnet_head,
+            "post_process": post_process,
+        }
+
+    def _forward(self):
+        """Run the network.
+
+        Returns:
+            * training: whatever ``hrhrnet_head(body_feats, inputs)`` returns;
+            * deploy: ``[heatmap, tagmap, heat_k, inds_k]`` from `get_topk`;
+            * eval: a one-element list ``[[kpts, scores]]``.
+
+        Raises:
+            ValueError: when flip evaluation is requested but no `flip_perm`
+                was given at construction time.
+        """
+        if self.flip and not self.training and not self.deploy:
+            if self.flip_perm is None:
+                raise ValueError(
+                    "flip_perm must be provided when eval_flip is enabled.")
+            # Stack the image and its horizontal flip along the batch axis.
+            self.inputs['image'] = paddle.concat(
+                (self.inputs['image'], paddle.flip(self.inputs['image'], [3])))
+        body_feats = self.backbone(self.inputs)
+
+        if self.training:
+            return self.hrhrnet_head(body_feats, self.inputs)
+        else:
+            outputs = self.hrhrnet_head(body_feats)
+
+            if self.flip and not self.deploy:
+                # Separate original / flipped halves, then undo the flip and
+                # re-order the channels of the flipped half.
+                outputs = [paddle.split(o, 2) for o in outputs]
+                output_rflip = [
+                    paddle.flip(paddle.gather(o[1], self.flip_perm, 1), [3])
+                    for o in outputs
+                ]
+                output1 = [o[0] for o in outputs]
+                heatmap = (output1[0] + output_rflip[0]) / 2.
+                tagmaps = [output1[1], output_rflip[1]]
+                outputs = [heatmap] + tagmaps
+            outputs = self.get_topk(outputs)
+
+            if self.deploy:
+                return outputs
+
+            res_lst = []
+            h = self.inputs['im_shape'][0, 0].numpy().item()
+            w = self.inputs['im_shape'][0, 1].numpy().item()
+            kpts, scores = self.post_process(*outputs, h, w)
+            res_lst.append([kpts, scores])
+            return res_lst
+
+    def get_loss(self):
+        """Return the training losses (delegates to ``_forward``)."""
+        return self._forward()
+
+    def get_pred(self):
+        """Return the eval result wrapped as ``{'keypoint': res_lst}``."""
+        outputs = {}
+        res_lst = self._forward()
+        outputs['keypoint'] = res_lst
+        return outputs
+
+    def get_topk(self, outputs):
+        """Upsample the head outputs and pick the top responses per joint.
+
+        Returns:
+            list: ``[heatmap, tagmap, heat_k, inds_k]``.
+        """
+        # resize to image size
+        outputs = [self.interpolate(x) for x in outputs]
+        if len(outputs) == 3:
+            tagmap = paddle.concat(
+                (outputs[1].unsqueeze(4), outputs[2].unsqueeze(4)), axis=4)
+        else:
+            tagmap = outputs[1].unsqueeze(4)
+
+        heatmap = outputs[0]
+        # The batch dimension is fixed to 1 here.
+        N, J = 1, self.hrhrnet_head.num_joints
+        heatmap_maxpool = self.pool(heatmap)
+        # topk: keep only positions equal to their max-pooled value
+        maxmap = heatmap * (heatmap == heatmap_maxpool)
+        maxmap = maxmap.reshape([N, J, -1])
+        heat_k, inds_k = maxmap.topk(self.max_num_people, axis=2)
+
+        outputs = [heatmap, tagmap, heat_k, inds_k]
+        return outputs
+
+
+@register
+@serializable
+class HrHRNetPostProcess(object):
+    '''
+    HrHRNet postprocess contain:
+        1) get topk keypoints in the output heatmap
+        2) sample the tagmap's value corresponding to each of the topk coordinate
+        3) match different joints to combine to some people with Hungarian algorithm
+        4) adjust the coordinate by +-0.25 to decrease error std
+        5) salvage missing joints by check positivity of heatmap - tagdiff_norm
+    Args:
+        max_num_people (int): max number of people support in postprocess
+        heat_thresh (float): value of topk below this threshold will be ignored
+        tag_thresh (float): coord's value sampled in tagmap below this threshold belong to same people for init
+
+    Call args:
+        heatmap, tagmap, heat_k, inds_k: the outputs of the model's top-k step
+        original_height, original_width (float): the original image size
+    '''
+
+    def __init__(self, max_num_people=30, heat_thresh=0.1, tag_thresh=1.):
+        self.max_num_people = max_num_people
+        self.heat_thresh = heat_thresh
+        self.tag_thresh = tag_thresh
+
+    def lerp(self, j, y, x, heatmap):
+        """Return (y, x) sub-pixel offsets for joints `j` at positions (y, x).
+
+        Each offset is 0.5 plus or minus 0.25, the sign depending on which of
+        the two neighbouring heatmap values (clipped to the map) is larger.
+        """
+        H, W = heatmap.shape[-2:]
+        left = np.clip(x - 1, 0, W - 1)
+        right = np.clip(x + 1, 0, W - 1)
+        up = np.clip(y - 1, 0, H - 1)
+        down = np.clip(y + 1, 0, H - 1)
+        offset_y = np.where(heatmap[j, down, x] > heatmap[j, up, x], 0.25,
+                            -0.25)
+        offset_x = np.where(heatmap[j, y, right] > heatmap[j, y, left], 0.25,
+                            -0.25)
+        return offset_y + 0.5, offset_x + 0.5
+
+    def __call__(self, heatmap, tagmap, heat_k, inds_k, original_height,
+                 original_width):
+        """Group the top-k joint candidates into poses.
+
+        Returns:
+            tuple: ``(pose_kpts, mean_score)`` where ``pose_kpts`` has shape
+            ``[k, J, 3]`` (two coordinates and a score) and ``mean_score`` is
+            the per-pose mean of the joint scores before salvaging.
+        """
+
+        N, J, H, W = heatmap.shape
+        assert N == 1, "only support batch size 1"
+        heatmap = heatmap[0].cpu().detach().numpy()
+        tagmap = tagmap[0].cpu().detach().numpy()
+        heats = heat_k[0].cpu().detach().numpy()
+        inds_np = inds_k[0].cpu().detach().numpy()
+        # Flat indices -> (row, col) positions on the heatmap.
+        y = inds_np // W
+        x = inds_np % W
+        tags = tagmap[np.arange(J)[None, :].repeat(self.max_num_people),
+                      y.flatten(), x.flatten()].reshape(J, -1,
+                                                        tagmap.shape[-1])
+        coords = np.stack((y, x), axis=2)
+        # threshold
+        mask = heats > self.heat_thresh
+        # cluster
+        cluster = defaultdict(lambda: {
+            'coords': np.zeros((J, 2), dtype=np.float32),
+            'scores': np.zeros(J, dtype=np.float32),
+            'tags': []
+        })
+        for jid, m in enumerate(mask):
+            num_valid = m.sum()
+            if num_valid == 0:
+                continue
+            valid_inds = np.where(m)[0]
+            valid_tags = tags[jid, m, :]
+            if len(cluster) == 0:  # initialize
+                for i in valid_inds:
+                    tag = tags[jid, i]
+                    key = tag[0]
+                    cluster[key]['tags'].append(tag)
+                    cluster[key]['scores'][jid] = heats[jid, i]
+                    cluster[key]['coords'][jid] = coords[jid, i]
+                continue
+            candidates = list(cluster.keys())[:self.max_num_people]
+            centroids = [
+                np.mean(
+                    cluster[k]['tags'], axis=0) for k in candidates
+            ]
+            num_clusters = len(centroids)
+            # shape is (num_valid, num_clusters, tag_dim)
+            dist = valid_tags[:, None, :] - np.array(centroids)[None, ...]
+            l2_dist = np.linalg.norm(dist, ord=2, axis=2)
+            # modulate dist with heat value, see `use_detection_val`
+            cost = np.round(l2_dist) * 100 - heats[jid, m, None]
+            # pad the cost matrix, otherwise new pose are ignored
+            if num_valid > num_clusters:
+                cost = np.pad(cost, ((0, 0), (0, num_valid - num_clusters)),
+                              'constant',
+                              constant_values=((0, 0), (0, 1e-10)))
+            rows, cols = linear_sum_assignment(cost)
+            # `row`/`col` deliberately do not reuse the names `y`/`x` above.
+            for row, col in zip(rows, cols):
+                tag = tags[jid, row]
+                if row < num_valid and col < num_clusters and \
+                   l2_dist[row, col] < self.tag_thresh:
+                    key = candidates[col]  # merge to cluster
+                else:
+                    key = tag[0]  # initialize new cluster
+                cluster[key]['tags'].append(tag)
+                cluster[key]['scores'][jid] = heats[jid, row]
+                cluster[key]['coords'][jid] = coords[jid, row]
+
+        # Poses can own different numbers of tags, so keep the tags as a
+        # plain list: stacking ragged lists with np.array fails on recent
+        # NumPy. Only per-pose access is needed below.
+        pose_tags = [cluster[k]['tags'] for k in cluster]
+        # shape is [k, J, 2] and [k, J]
+        pose_coords = np.array([cluster[k]['coords'] for k in cluster])
+        pose_scores = np.array([cluster[k]['scores'] for k in cluster])
+        valid = pose_scores > 0
+
+        pose_kpts = np.zeros((pose_scores.shape[0], J, 3), dtype=np.float32)
+        if valid.sum() == 0:
+            # NOTE(review): the second value is the keypoint array, not a
+            # per-pose score vector as in the normal return - confirm that
+            # callers expect this.
+            return pose_kpts, pose_kpts
+
+        # refine coords
+        valid_coords = pose_coords[valid].astype(np.int32)
+        y = valid_coords[..., 0].flatten()
+        x = valid_coords[..., 1].flatten()
+        _, j = np.nonzero(valid)
+        offsets = self.lerp(j, y, x, heatmap)
+        pose_coords[valid, 0] += offsets[0]
+        pose_coords[valid, 1] += offsets[1]
+
+        # mean score before salvage
+        mean_score = pose_scores.mean(axis=1)
+        pose_kpts[valid, 2] = pose_scores[valid]
+
+        # salvage missing joints
+        for pid in range(len(pose_coords)):
+            tag_mean = np.array(pose_tags[pid]).mean(axis=0)
+            norm = np.sum((tagmap - tag_mean)**2, axis=3)**0.5
+            score = heatmap - np.round(norm)  # (J, H, W)
+            flat_score = score.reshape(J, -1)
+            max_inds = np.argmax(flat_score, axis=1)
+            max_scores = np.max(flat_score, axis=1)
+            # Joints this pose lacks but for which a positive response exists.
+            salvage_joints = (pose_scores[pid] == 0) & (max_scores > 0)
+            if salvage_joints.sum() == 0:
+                continue
+            y = max_inds[salvage_joints] // W
+            x = max_inds[salvage_joints] % W
+            offsets = self.lerp(salvage_joints.nonzero()[0], y, x, heatmap)
+            y = y.astype(np.float32) + offsets[0]
+            x = x.astype(np.float32) + offsets[1]
+            pose_coords[pid][salvage_joints, 0] = y
+            pose_coords[pid][salvage_joints, 1] = x
+            pose_kpts[pid][salvage_joints, 2] = max_scores[salvage_joints]
+        # Coordinates are stored as (y, x); reverse to (x, y) for transpred.
+        pose_kpts[..., :2] = transpred(pose_coords[..., :2][..., ::-1],
+                                       original_height, original_width,
+                                       min(H, W))
+        return pose_kpts, mean_score

+ 267 - 0
paddlers/models/ppdet/modeling/architectures/keypoint_hrnet.py

@@ -0,0 +1,267 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import paddle
+import numpy as np
+import math
+import cv2
+from paddlers.models.ppdet.core.workspace import register, create
+from .meta_arch import BaseArch
+from ..keypoint_utils import transform_preds
+from .. import layers as L
+
+__all__ = ['TopDownHRNet']
+
+
+@register
+class TopDownHRNet(BaseArch):
+    __category__ = 'architecture'
+    __inject__ = ['loss']
+
+    def __init__(self,
+                 width,
+                 num_joints,
+                 backbone='HRNet',
+                 loss='KeyPointMSELoss',
+                 post_process='HRNetPostProcess',
+                 flip_perm=None,
+                 flip=True,
+                 shift_heatmap=True,
+                 use_dark=True):
+        """
+        Top-down HRNet keypoint model, see https://arxiv.org/abs/1902.09212
+
+        Args:
+            width, num_joints: first two arguments of the final `L.Conv2d`
+            backbone (nn.Layer): backbone instance
+            loss (object): loss callable used in training
+            post_process: accepted but not read here; an `HRNetPostProcess`
+                built from `use_dark` is always used
+            flip_perm (list): The left-right joints exchange order list
+            flip (bool): Whether to also evaluate the flipped image
+            shift_heatmap (bool): Whether to shift the flipped heatmap by one
+                column before averaging
+            use_dark (bool): Whether to use DARK in post processing
+        """
+        super(TopDownHRNet, self).__init__()
+        self.backbone = backbone
+        self.post_process = HRNetPostProcess(use_dark)
+        self.loss = loss
+        self.flip_perm, self.flip = flip_perm, flip
+        self.final_conv = L.Conv2d(width, num_joints, 1, 1, 0, bias=True)
+        self.shift_heatmap = shift_heatmap
+        self.deploy = False
+
+    @classmethod
+    def from_config(cls, cfg, *args, **kwargs):
+        """Create the backbone from ``cfg``; nothing else is built here."""
+        return {'backbone': create(cfg['backbone']), }
+
+    def _forward(self):
+        """Run the network.
+
+        Returns:
+            * training: the value of ``self.loss(heatmaps, inputs)``;
+            * deploy: ``(heatmaps, max_idx)`` with the per-joint argmax over
+              the flattened spatial dimensions;
+            * eval: the output of ``self.post_process``.
+        """
+        heatmaps = self.final_conv(self.backbone(self.inputs)[0])
+
+        if self.training:
+            return self.loss(heatmaps, self.inputs)
+
+        if self.deploy:
+            shp = heatmaps.shape
+            flat = heatmaps.reshape((shp[0], shp[1], shp[2] * shp[3]))
+            return heatmaps, paddle.argmax(flat, axis=-1)
+
+        if self.flip:
+            # Second pass on the horizontally flipped image.
+            self.inputs['image'] = self.inputs['image'].flip([3])
+            flipped = self.final_conv(self.backbone(self.inputs)[0])
+            flipped = self.flip_back(flipped.numpy(), self.flip_perm)
+            flipped = paddle.to_tensor(flipped.copy())
+            if self.shift_heatmap:
+                flipped[:, :, :, 1:] = flipped.clone()[:, :, :, 0:-1]
+            heatmaps = (heatmaps + flipped) * 0.5
+
+        # Fall back to values derived from `im_shape` when `center` / `scale`
+        # are not part of the inputs.
+        if 'im_shape' in self.inputs:
+            imshape = (self.inputs['im_shape'].numpy())[:, ::-1]
+        else:
+            imshape = None
+        if 'center' in self.inputs:
+            center = self.inputs['center'].numpy()
+        else:
+            center = np.round(imshape / 2.)
+        if 'scale' in self.inputs:
+            scale = self.inputs['scale'].numpy()
+        else:
+            scale = imshape / 200.
+        return self.post_process(heatmaps, center, scale)
+
+    def get_loss(self):
+        """Return the training loss (delegates to ``_forward``)."""
+        return self._forward()
+
+    def get_pred(self):
+        """Return the eval result wrapped as ``{'keypoint': ...}``."""
+        return {'keypoint': self._forward()}
+
+    def flip_back(self, output_flipped, matched_parts):
+        """Reverse the last axis and swap the channels of each joint pair."""
+        assert output_flipped.ndim == 4,\
+                'output_flipped should be [batch_size, num_joints, height, width]'
+
+        restored = output_flipped[:, :, :, ::-1]
+
+        for pair in matched_parts:
+            first, second = pair[0], pair[1]
+            saved = restored[:, first, :, :].copy()
+            restored[:, first, :, :] = restored[:, second, :, :]
+            restored[:, second, :, :] = saved
+
+        return restored
+
+
+class HRNetPostProcess(object):
+    """Turn predicted heatmaps into keypoint coordinates and confidences.
+
+    Args:
+        use_dark (bool): refine coordinates with `dark_postprocess` when
+            True, otherwise with a quarter-pixel shift.
+    """
+
+    def __init__(self, use_dark=True):
+        self.use_dark = use_dark
+
+    def get_max_preds(self, heatmaps):
+        '''get predictions from score maps
+
+        Args:
+            heatmaps: numpy.ndarray([batch_size, num_joints, height, width])
+
+        Returns:
+            preds: numpy.ndarray([batch_size, num_joints, 2]), keypoints coords
+            maxvals: numpy.ndarray([batch_size, num_joints, 1]), the maximum confidence of the keypoints
+        '''
+        assert isinstance(heatmaps,
+                          np.ndarray), 'heatmaps should be numpy.ndarray'
+        assert heatmaps.ndim == 4, 'batch_images should be 4-ndim'
+
+        batch_size = heatmaps.shape[0]
+        num_joints = heatmaps.shape[1]
+        width = heatmaps.shape[3]
+        heatmaps_reshaped = heatmaps.reshape((batch_size, num_joints, -1))
+        idx = np.argmax(heatmaps_reshaped, 2)
+        maxvals = np.amax(heatmaps_reshaped, 2)
+
+        maxvals = maxvals.reshape((batch_size, num_joints, 1))
+        idx = idx.reshape((batch_size, num_joints, 1))
+
+        preds = np.tile(idx, (1, 1, 2)).astype(np.float32)
+
+        # Flat index -> (x, y)
+        preds[:, :, 0] = (preds[:, :, 0]) % width
+        preds[:, :, 1] = np.floor((preds[:, :, 1]) / width)
+
+        # Zero the coordinates of joints whose maximum is not positive.
+        pred_mask = np.tile(np.greater(maxvals, 0.0), (1, 1, 2))
+        pred_mask = pred_mask.astype(np.float32)
+
+        preds *= pred_mask
+
+        return preds, maxvals
+
+    def gaussian_blur(self, heatmap, kernel):
+        """Blur every heatmap in place and restore its original maximum.
+
+        Args:
+            heatmap (numpy.ndarray): [batch_size, num_joints, height, width],
+                modified in place
+            kernel (int): size of the square Gaussian kernel
+
+        Returns:
+            numpy.ndarray: the same `heatmap` array
+        """
+        border = (kernel - 1) // 2
+        batch_size = heatmap.shape[0]
+        num_joints = heatmap.shape[1]
+        height = heatmap.shape[2]
+        width = heatmap.shape[3]
+        for i in range(batch_size):
+            for j in range(num_joints):
+                origin_max = np.max(heatmap[i, j])
+                dr = np.zeros((height + 2 * border, width + 2 * border))
+                # Explicit end indices: `border:-border` would be an empty
+                # slice when border == 0 (kernel == 1).
+                dr[border:border + height, border:border + width] = heatmap[
+                    i, j].copy()
+                dr = cv2.GaussianBlur(dr, (kernel, kernel), 0)
+                heatmap[i, j] = dr[border:border + height, border:border +
+                                   width].copy()
+                blurred_max = np.max(heatmap[i, j])
+                # Skip the rescale when the blurred maximum is zero to avoid
+                # a division by zero that would fill the map with NaN/inf.
+                if blurred_max != 0:
+                    heatmap[i, j] *= origin_max / blurred_max
+        return heatmap
+
+    def dark_parse(self, hm, coord):
+        """Shift `coord` by ``-inverse(Hessian) . gradient`` of `hm` at `coord`.
+
+        The shift is only applied when the point is at least two pixels away
+        from the border and the Hessian determinant is non-zero. `coord` is
+        updated in place and returned.
+        """
+        heatmap_height = hm.shape[0]
+        heatmap_width = hm.shape[1]
+        px = int(coord[0])
+        py = int(coord[1])
+        if 1 < px < heatmap_width - 2 and 1 < py < heatmap_height - 2:
+            dx = 0.5 * (hm[py][px + 1] - hm[py][px - 1])
+            dy = 0.5 * (hm[py + 1][px] - hm[py - 1][px])
+            dxx = 0.25 * (hm[py][px + 2] - 2 * hm[py][px] + hm[py][px - 2])
+            dxy = 0.25 * (hm[py+1][px+1] - hm[py-1][px+1] - hm[py+1][px-1] \
+                + hm[py-1][px-1])
+            dyy = 0.25 * (
+                hm[py + 2 * 1][px] - 2 * hm[py][px] + hm[py - 2 * 1][px])
+            # Plain ndarrays instead of the discouraged np.matrix class.
+            derivative = np.array([dx, dy])
+            hessian = np.array([[dxx, dxy], [dxy, dyy]])
+            if dxx * dyy - dxy**2 != 0:
+                offset = -np.dot(np.linalg.inv(hessian), derivative)
+                coord += offset
+        return coord
+
+    def dark_postprocess(self, hm, coords, kernelsize):
+        '''DARK postprocessing, Zhang et al. Distribution-Aware Coordinate
+        Representation for Human Pose Estimation (CVPR 2020).
+        '''
+
+        hm = self.gaussian_blur(hm, kernelsize)
+        # Clamp before the log so non-positive values do not produce -inf/NaN.
+        hm = np.maximum(hm, 1e-10)
+        hm = np.log(hm)
+        for n in range(coords.shape[0]):
+            for p in range(coords.shape[1]):
+                coords[n, p] = self.dark_parse(hm[n][p], coords[n][p])
+        return coords
+
+    def get_final_preds(self, heatmaps, center, scale, kernelsize=3):
+        """the highest heatvalue location with a quarter offset in the
+        direction from the highest response to the second highest response.
+
+        Args:
+            heatmaps (numpy.ndarray): The predicted heatmaps
+            center (numpy.ndarray): The boxes center
+            scale (numpy.ndarray): The scale factor
+            kernelsize (int): Gaussian kernel size used by DARK
+
+        Returns:
+            preds: numpy.ndarray([batch_size, num_joints, 2]), keypoints coords
+            maxvals: numpy.ndarray([batch_size, num_joints, 1]), the maximum confidence of the keypoints
+        """
+        coords, maxvals = self.get_max_preds(heatmaps)
+
+        heatmap_height = heatmaps.shape[2]
+        heatmap_width = heatmaps.shape[3]
+
+        if self.use_dark:
+            coords = self.dark_postprocess(heatmaps, coords, kernelsize)
+        else:
+            for n in range(coords.shape[0]):
+                for p in range(coords.shape[1]):
+                    hm = heatmaps[n][p]
+                    px = int(math.floor(coords[n][p][0] + 0.5))
+                    py = int(math.floor(coords[n][p][1] + 0.5))
+                    if 1 < px < heatmap_width - 1 and 1 < py < heatmap_height - 1:
+                        diff = np.array([
+                            hm[py][px + 1] - hm[py][px - 1],
+                            hm[py + 1][px] - hm[py - 1][px]
+                        ])
+                        coords[n][p] += np.sign(diff) * .25
+        preds = coords.copy()
+
+        # Transform back
+        for i in range(coords.shape[0]):
+            preds[i] = transform_preds(coords[i], center[i], scale[i],
+                                       [heatmap_width, heatmap_height])
+
+        return preds, maxvals
+
+    def __call__(self, output, center, scale):
+        """Return ``[[keypoints, mean_confidence]]`` for a batch of heatmaps.
+
+        ``keypoints`` is the concatenation of coordinates and confidence along
+        the last axis; ``mean_confidence`` averages the confidences over the
+        joints.
+        """
+        preds, maxvals = self.get_final_preds(output.numpy(), center, scale)
+        outputs = [[
+            np.concatenate(
+                (preds, maxvals), axis=-1), np.mean(
+                    maxvals, axis=1)
+        ]]
+        return outputs

+ 135 - 0
paddlers/models/ppdet/modeling/architectures/mask_rcnn.py

@@ -0,0 +1,135 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import paddle
+from paddlers.models.ppdet.core.workspace import register, create
+from .meta_arch import BaseArch
+
+__all__ = ['MaskRCNN']
+
+
+@register
+class MaskRCNN(BaseArch):
+    """
+    Mask R-CNN network, see https://arxiv.org/abs/1703.06870
+
+    Args:
+        backbone (object): backbone instance
+        rpn_head (object): `RPNHead` instance
+        bbox_head (object): `BBoxHead` instance
+        mask_head (object): `MaskHead` instance
+        bbox_post_process (object): `BBoxPostProcess` instance
+        mask_post_process (object): `MaskPostProcess` instance
+        neck (object): 'FPN' instance
+    """
+
+    __category__ = 'architecture'
+    __inject__ = [
+        'bbox_post_process',
+        'mask_post_process',
+    ]
+
+    def __init__(self,
+                 backbone,
+                 rpn_head,
+                 bbox_head,
+                 mask_head,
+                 bbox_post_process,
+                 mask_post_process,
+                 neck=None):
+        super(MaskRCNN, self).__init__()
+        self.backbone = backbone
+        self.neck = neck
+        self.rpn_head = rpn_head
+        self.bbox_head = bbox_head
+        self.mask_head = mask_head
+
+        self.bbox_post_process = bbox_post_process
+        self.mask_post_process = mask_post_process
+
+    @classmethod
+    def from_config(cls, cfg, *args, **kwargs):
+        """Build backbone, optional neck and the three heads from ``cfg``."""
+        backbone = create(cfg['backbone'])
+        kwargs = {'input_shape': backbone.out_shape}
+        # The neck is optional: a falsy config entry is kept as-is.
+        neck = cfg['neck'] and create(cfg['neck'], **kwargs)
+
+        out_shape = neck and neck.out_shape or backbone.out_shape
+        kwargs = {'input_shape': out_shape}
+        rpn_head = create(cfg['rpn_head'], **kwargs)
+        bbox_head = create(cfg['bbox_head'], **kwargs)
+
+        # Without a neck the mask head takes the bbox head's output shape.
+        out_shape = neck and out_shape or bbox_head.get_head().out_shape
+        kwargs = {'input_shape': out_shape}
+        mask_head = create(cfg['mask_head'], **kwargs)
+        return {
+            'backbone': backbone,
+            'neck': neck,
+            "rpn_head": rpn_head,
+            "bbox_head": bbox_head,
+            "mask_head": mask_head,
+        }
+
+    def _forward(self):
+        """Run the network.
+
+        Returns:
+            * training: ``(rpn_loss, bbox_loss, mask_loss)``;
+            * eval: ``(bbox_pred, bbox_num, mask_pred)``.
+        """
+        body_feats = self.backbone(self.inputs)
+        if self.neck is not None:
+            body_feats = self.neck(body_feats)
+
+        if self.training:
+            rois, rois_num, rpn_loss = self.rpn_head(body_feats, self.inputs)
+            bbox_loss, bbox_feat = self.bbox_head(body_feats, rois, rois_num,
+                                                  self.inputs)
+            rois, rois_num = self.bbox_head.get_assigned_rois()
+            bbox_targets = self.bbox_head.get_assigned_targets()
+            # Mask Head needs bbox_feat in Mask RCNN
+            mask_loss = self.mask_head(body_feats, rois, rois_num, self.inputs,
+                                       bbox_targets, bbox_feat)
+            return rpn_loss, bbox_loss, mask_loss
+        else:
+            rois, rois_num, _ = self.rpn_head(body_feats, self.inputs)
+            preds, feat_func = self.bbox_head(body_feats, rois, rois_num, None)
+
+            im_shape = self.inputs['im_shape']
+            scale_factor = self.inputs['scale_factor']
+
+            bbox, bbox_num = self.bbox_post_process(preds, (rois, rois_num),
+                                                    im_shape, scale_factor)
+            mask_out = self.mask_head(
+                body_feats, bbox, bbox_num, self.inputs, feat_func=feat_func)
+
+            # rescale the prediction back to origin image
+            bbox_pred = self.bbox_post_process.get_pred(bbox, bbox_num,
+                                                        im_shape, scale_factor)
+            origin_shape = self.bbox_post_process.get_origin_shape()
+            mask_pred = self.mask_post_process(mask_out[:, 0, :, :], bbox_pred,
+                                               bbox_num, origin_shape)
+            return bbox_pred, bbox_num, mask_pred
+
+    def get_loss(self, ):
+        """Merge the three loss dicts and add their sum under ``'loss'``."""
+        # Unpack in the order `_forward` returns them in training mode.
+        rpn_loss, bbox_loss, mask_loss = self._forward()
+        loss = {}
+        loss.update(rpn_loss)
+        loss.update(bbox_loss)
+        loss.update(mask_loss)
+        total_loss = paddle.add_n(list(loss.values()))
+        loss.update({'loss': total_loss})
+        return loss
+
+    def get_pred(self):
+        """Return the eval result as a dict with bbox, bbox_num and mask."""
+        bbox_pred, bbox_num, mask_pred = self._forward()
+        output = {'bbox': bbox_pred, 'bbox_num': bbox_num, 'mask': mask_pred}
+        return output

+ 141 - 0
paddlers/models/ppdet/modeling/architectures/meta_arch.py

@@ -0,0 +1,141 @@
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+import paddle
+import paddle.nn as nn
+import typing
+
+from paddlers.models.ppdet.core.workspace import register
+from paddlers.models.ppdet.modeling.post_process import nms
+
+__all__ = ['BaseArch']
+
+
+@register
+class BaseArch(nn.Layer):
+    """Base class shared by the detection architectures in this package.
+
+    ``forward`` stores the incoming batch on ``self.inputs``, calls
+    ``model_arch`` and then dispatches to ``get_loss`` (training mode) or
+    ``get_pred`` (evaluation mode); subclasses provide those hooks.
+
+    Args:
+        data_format (str): 'NCHW' (default) or 'NHWC'. With 'NHWC' the input
+            image is transposed with permutation [0, 2, 3, 1] in ``forward``.
+    """
+
+    def __init__(self, data_format='NCHW'):
+        super(BaseArch, self).__init__()
+        self.data_format = data_format
+        self.inputs = {}
+        # When True, forward() normalizes the image itself using the
+        # scale/mean/std prepared by load_meanstd().
+        self.fuse_norm = False
+
+    def load_meanstd(self, cfg_transform):
+        """Set ``self.scale``, ``self.mean`` and ``self.std`` from a config.
+
+        Defaults are scale 1.0, mean [0.485, 0.456, 0.406] and std
+        [0.229, 0.224, 0.225]. They are overridden by the first entry of
+        ``cfg_transform`` that contains a 'NormalizeImage' key.
+
+        Args:
+            cfg_transform (iterable): transform entries. An entry is used when
+                ``'NormalizeImage' in entry``; it must then provide 'mean' and
+                'std' under that key, and may provide 'is_scale' (default
+                True, which selects a scale of 1/255).
+        """
+        self.scale = 1.
+        self.mean = paddle.to_tensor([0.485, 0.456, 0.406]).reshape(
+            (1, 3, 1, 1))
+        self.std = paddle.to_tensor([0.229, 0.224, 0.225]).reshape(
+            (1, 3, 1, 1))
+        for item in cfg_transform:
+            if 'NormalizeImage' in item:
+                norm_cfg = item['NormalizeImage']
+                self.mean = paddle.to_tensor(norm_cfg['mean']).reshape(
+                    (1, 3, 1, 1))
+                self.std = paddle.to_tensor(norm_cfg['std']).reshape(
+                    (1, 3, 1, 1))
+                if norm_cfg.get('is_scale', True):
+                    self.scale = 1. / 255.
+                break
+        if self.data_format == 'NHWC':
+            # Pass the shape as one tuple, like the reshape calls above,
+            # instead of as four separate positional arguments.
+            self.mean = self.mean.reshape((1, 1, 1, 3))
+            self.std = self.std.reshape((1, 1, 1, 3))
+
+    def _set_inputs(self, inputs):
+        """Store ``inputs`` on ``self.inputs``, normalizing if ``fuse_norm``.
+
+        With ``fuse_norm`` enabled only 'image', 'im_shape' and
+        'scale_factor' are written into the existing ``self.inputs`` dict and
+        the image becomes ``(image * scale - mean) / std``; this requires
+        ``load_meanstd`` to have been called first. Otherwise ``self.inputs``
+        is simply rebound to ``inputs``.
+        """
+        if self.fuse_norm:
+            image = inputs['image']
+            self.inputs['image'] = (image * self.scale - self.mean) / self.std
+            self.inputs['im_shape'] = inputs['im_shape']
+            self.inputs['scale_factor'] = inputs['scale_factor']
+        else:
+            self.inputs = inputs
+
+    def forward(self, inputs):
+        """Run the architecture on one batch.
+
+        Args:
+            inputs: a mapping with at least an 'image' entry, or, in
+                evaluation mode, a sequence of such mappings for
+                multi-scale testing.
+
+        Returns:
+            The result of ``get_loss()`` in training mode. In evaluation mode
+            the result of ``get_pred()``, or the merged predictions when more
+            than one scale was supplied.
+        """
+        if self.data_format == 'NHWC':
+            image = inputs['image']
+            inputs['image'] = paddle.transpose(image, [0, 2, 3, 1])
+
+        self._set_inputs(inputs)
+        self.model_arch()
+
+        if self.training:
+            return self.get_loss()
+
+        # multi-scale input
+        if isinstance(inputs, typing.Sequence):
+            inputs_list = list(inputs)
+        else:
+            inputs_list = [inputs]
+
+        outs = []
+        for inp in inputs_list:
+            # Go through _set_inputs so that fuse_norm normalization is also
+            # applied to the inputs seen by get_pred(); assigning the raw
+            # batch here would silently drop it.
+            self._set_inputs(inp)
+            outs.append(self.get_pred())
+
+        # multi-scale test
+        if len(outs) > 1:
+            return self.merge_multi_scale_predictions(outs)
+        return outs[0]
+
+    def merge_multi_scale_predictions(self, outs):
+        """Merge per-scale predictions with class-wise NMS.
+
+        Args:
+            outs (list): prediction dicts, each with a 'bbox' tensor whose
+                column 0 is compared against the class index and whose
+                remaining columns are handed to ``nms``.
+
+        Returns:
+            dict: 'bbox' (tensor with 6 columns, at most ``keep_top_k`` rows
+                chosen as the highest values of column 1) and 'bbox_num'
+                (one-element tensor with the row count).
+
+        Raises:
+            Exception: if the architecture is not CascadeRCNN, FasterRCNN or
+                MaskRCNN.
+        """
+        if self.__class__.__name__ not in ('CascadeRCNN', 'FasterRCNN',
+                                           'MaskRCNN'):
+            raise Exception(
+                "Multi scale test only supports CascadeRCNN, FasterRCNN and MaskRCNN for now"
+            )
+        num_classes = self.bbox_head.num_classes
+        keep_top_k = self.bbox_post_process.nms.keep_top_k
+        nms_threshold = self.bbox_post_process.nms.nms_threshold
+
+        final_boxes = []
+        all_scale_outs = paddle.concat([o['bbox'] for o in outs]).numpy()
+        for c in range(num_classes):
+            idxs = all_scale_outs[:, 0] == c
+            if np.count_nonzero(idxs) == 0:
+                continue
+            r = nms(all_scale_outs[idxs, 1:], nms_threshold)
+            final_boxes.append(
+                np.concatenate([np.full((r.shape[0], 1), c), r], 1))
+
+        if final_boxes:
+            out = np.concatenate(final_boxes)
+            # Stable ascending sort on column 1, then keep the last
+            # keep_top_k rows (same selection as sorting the rows in Python).
+            order = np.argsort(out[:, 1], kind='stable')
+            out = out[order][-keep_top_k:].reshape((-1, 6))
+        else:
+            # No class produced any box: return an empty result instead of
+            # failing inside np.concatenate on an empty list.
+            out = np.zeros((0, 6), dtype=all_scale_outs.dtype)
+
+        return {
+            'bbox': paddle.to_tensor(out),
+            'bbox_num': paddle.to_tensor(np.array([out.shape[0], ]))
+        }
+
+    def build_inputs(self, data, input_def):
+        """Map each name in ``input_def`` to the item of ``data`` at the same index."""
+        return {k: data[i] for i, k in enumerate(input_def)}
+
+    def model_arch(self, ):
+        """Hook called by ``forward`` before loss/prediction; no-op by default."""
+        pass
+
+    def get_loss(self, ):
+        """Return the training losses; must be overridden by subclasses."""
+        raise NotImplementedError("Should implement get_loss method!")
+
+    def get_pred(self, ):
+        """Return the predictions; must be overridden by subclasses."""
+        raise NotImplementedError("Should implement get_pred method!")
+
+    @classmethod
+    def convert_sync_batchnorm(cls, layer):
+        """Recursively convert sub-layers marked with ``norm_type == 'sync_bn'``.
+
+        A layer whose ``norm_type`` attribute equals 'sync_bn' is passed to
+        ``nn.SyncBatchNorm.convert_sync_batchnorm``; any other layer has each
+        of its named children replaced by the converted child.
+
+        Returns:
+            The converted layer (the same object when it was not itself
+            marked as 'sync_bn').
+        """
+        if getattr(layer, 'norm_type', None) == 'sync_bn':
+            return nn.SyncBatchNorm.convert_sync_batchnorm(layer)
+
+        for name, sublayer in layer.named_children():
+            layer.add_sublayer(name, cls.convert_sync_batchnorm(sublayer))
+        return layer

+ 91 - 0
paddlers/models/ppdet/modeling/architectures/picodet.py

@@ -0,0 +1,91 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import paddle
+from paddlers.models.ppdet.core.workspace import register, create
+from .meta_arch import BaseArch
+
+__all__ = ['PicoDet']
+
+
+@register
+class PicoDet(BaseArch):
+    """
+    PicoDet detector assembled from a backbone, a neck and a head.
+    Background on Generalized Focal Loss: https://arxiv.org/abs/2006.04388
+
+    Args:
+        backbone (object): backbone instance
+        neck (object): neck instance, e.g. an 'FPN'
+        head (object): 'PicoHead' instance
+    """
+
+    __category__ = 'architecture'
+
+    def __init__(self, backbone, neck, head='PicoHead'):
+        super(PicoDet, self).__init__()
+        self.backbone = backbone
+        self.neck = neck
+        self.head = head
+        # When set, the head is called in deploy mode and get_pred() returns
+        # the raw head outputs under the 'picodet' key.
+        self.deploy = False
+
+    @classmethod
+    def from_config(cls, cfg, *args, **kwargs):
+        """Create backbone, neck and head from ``cfg``, chaining ``out_shape``."""
+        backbone = create(cfg['backbone'])
+        neck = create(cfg['neck'], input_shape=backbone.out_shape)
+        head = create(cfg['head'], input_shape=neck.out_shape)
+        return dict(backbone=backbone, neck=neck, head=head)
+
+    def _forward(self):
+        """Run backbone -> neck -> head.
+
+        Returns:
+            tuple: ``(head_outs, None)`` when training or deploying, otherwise
+                the ``(bboxes, bbox_num)`` pair from ``head.post_process``.
+        """
+        head_outs = self.head(
+            self.neck(self.backbone(self.inputs)), self.deploy)
+        if not (self.training or self.deploy):
+            bboxes, bbox_num = self.head.post_process(
+                head_outs, self.inputs['im_shape'],
+                self.inputs['scale_factor'])
+            return bboxes, bbox_num
+        return head_outs, None
+
+    def get_loss(self, ):
+        """Return the head losses plus their sum under the 'loss' key."""
+        head_outs, _ = self._forward()
+        loss = {}
+        loss.update(self.head.get_loss(head_outs, self.inputs))
+        loss.update({'loss': paddle.add_n(list(loss.values()))})
+        return loss
+
+    def get_pred(self):
+        """Return predictions as a dict (raw head outputs in deploy mode)."""
+        if not self.deploy:
+            bbox_pred, bbox_num = self._forward()
+            return {'bbox': bbox_pred, 'bbox_num': bbox_num}
+        return {'picodet': self._forward()[0]}

+ 102 - 0
paddlers/models/ppdet/modeling/architectures/s2anet.py

@@ -0,0 +1,102 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import paddle
+from paddlers.models.ppdet.core.workspace import register, create
+from .meta_arch import BaseArch
+
+__all__ = ['S2ANet']
+
+
+@register
+class S2ANet(BaseArch):
+    __category__ = 'architecture'
+    __inject__ = [
+        's2anet_head',
+        's2anet_bbox_post_process',
+    ]
+
+    def __init__(self, backbone, neck, s2anet_head, s2anet_bbox_post_process):
+        """
+        S2ANet detector, see https://arxiv.org/pdf/2008.09397.pdf
+
+        Args:
+            backbone (object): backbone instance
+            neck (object): `FPN` instance; may be None, in which case the
+                backbone features go straight to the head
+            s2anet_head (object): `S2ANetHead` instance
+            s2anet_bbox_post_process (object): `S2ANetBBoxPostProcess` instance
+        """
+        super(S2ANet, self).__init__()
+        self.backbone = backbone
+        self.neck = neck
+        self.s2anet_head = s2anet_head
+        self.s2anet_bbox_post_process = s2anet_bbox_post_process
+
+    @classmethod
+    def from_config(cls, cfg, *args, **kwargs):
+        """Create the sub-modules from ``cfg``; the neck entry may be falsy."""
+        backbone = create(cfg['backbone'])
+
+        # A falsy neck config is passed through unchanged (no neck created).
+        neck_cfg = cfg['neck']
+        if neck_cfg:
+            neck = create(neck_cfg, input_shape=backbone.out_shape)
+        else:
+            neck = neck_cfg
+
+        # Use the neck's output shape when there is one, else the backbone's.
+        out_shape = (neck.out_shape if neck else None) or backbone.out_shape
+        shape_kwargs = {'input_shape': out_shape}
+        s2anet_head = create(cfg['s2anet_head'], **shape_kwargs)
+        s2anet_bbox_post_process = create(cfg['s2anet_bbox_post_process'],
+                                          **shape_kwargs)
+
+        return dict(
+            backbone=backbone,
+            neck=neck,
+            s2anet_head=s2anet_head,
+            s2anet_bbox_post_process=s2anet_bbox_post_process)
+
+    def _forward(self):
+        """Run the network.
+
+        Returns:
+            dict: in training mode the head's losses plus their sum under
+                'loss'; otherwise a dict with 'bbox' and 'bbox_num'.
+        """
+        feats = self.backbone(self.inputs)
+        if self.neck is not None:
+            feats = self.neck(feats)
+        self.s2anet_head(feats)
+
+        if self.training:
+            loss = self.s2anet_head.get_loss(self.inputs)
+            loss.update({'loss': paddle.add_n(list(loss.values()))})
+            return loss
+
+        im_shape = self.inputs['im_shape']
+        scale_factor = self.inputs['scale_factor']
+        post = self.s2anet_bbox_post_process
+        pred_scores, pred_bboxes = self.s2anet_head.get_prediction(
+            post.nms_pre)
+
+        # post_process
+        pred_bboxes, bbox_num = post(pred_scores, pred_bboxes)
+        # rescale the prediction back to origin image
+        pred_bboxes = post.get_pred(pred_bboxes, bbox_num, im_shape,
+                                    scale_factor)
+        return {'bbox': pred_bboxes, 'bbox_num': bbox_num}
+
+    def get_loss(self, ):
+        """Return the loss dict produced by ``_forward`` in training mode."""
+        return self._forward()
+
+    def get_pred(self):
+        """Return the prediction dict produced by ``_forward``."""
+        return self._forward()

+ 110 - 0
paddlers/models/ppdet/modeling/architectures/solov2.py

@@ -0,0 +1,110 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import paddle
+
+from paddlers.models.ppdet.core.workspace import register, create
+from .meta_arch import BaseArch
+
+__all__ = ['SOLOv2']
+
+
+@register
+class SOLOv2(BaseArch):
+    """
+    SOLOv2 instance segmentation network, see https://arxiv.org/abs/2003.10152
+
+    Args:
+        backbone (object): a backbone instance
+        solov2_head (object): a `SOLOv2Head` instance
+        mask_head (object): a `SOLOv2MaskHead` instance
+        neck (object): neck of network, such as feature pyramid network instance
+    """
+
+    __category__ = 'architecture'
+
+    def __init__(self, backbone, solov2_head, mask_head, neck=None):
+        super(SOLOv2, self).__init__()
+        self.backbone = backbone
+        self.neck = neck
+        self.solov2_head = solov2_head
+        self.mask_head = mask_head
+
+    @classmethod
+    def from_config(cls, cfg, *args, **kwargs):
+        """Create backbone, neck and both heads from ``cfg``."""
+        backbone = create(cfg['backbone'])
+        neck = create(cfg['neck'], input_shape=backbone.out_shape)
+
+        # Both heads are built from the neck's output shape.
+        head_kwargs = {'input_shape': neck.out_shape}
+        solov2_head = create(cfg['solov2_head'], **head_kwargs)
+        mask_head = create(cfg['mask_head'], **head_kwargs)
+
+        return dict(
+            backbone=backbone,
+            neck=neck,
+            solov2_head=solov2_head,
+            mask_head=mask_head)
+
+    def model_arch(self):
+        """Compute and cache the mask, category and kernel predictions."""
+        feats = self.neck(self.backbone(self.inputs))
+        self.seg_pred = self.mask_head(feats)
+        self.cate_pred_list, self.kernel_pred_list = self.solov2_head(feats)
+
+    def get_loss(self, ):
+        """Collect the per-level targets present in the inputs and compute losses.
+
+        Returns:
+            dict: the losses from ``solov2_head.get_loss`` plus their sum
+                under the 'loss' key.
+        """
+        # get gt_ins_labels, gt_cate_labels, etc.
+        gt_ins_labels, gt_cate_labels, gt_grid_orders = [], [], []
+        fg_num = self.inputs['fg_num']
+        targets = (('ins_label{}', gt_ins_labels),
+                   ('cate_label{}', gt_cate_labels),
+                   ('grid_order{}', gt_grid_orders))
+        for level in range(len(self.solov2_head.seg_num_grids)):
+            for template, bucket in targets:
+                key = template.format(level)
+                # Levels without an entry in the inputs are simply skipped.
+                if key in self.inputs:
+                    bucket.append(self.inputs[key])
+
+        loss = {}
+        loss.update(
+            self.solov2_head.get_loss(
+                self.cate_pred_list, self.kernel_pred_list, self.seg_pred,
+                gt_ins_labels, gt_cate_labels, gt_grid_orders, fg_num))
+        loss.update({'loss': paddle.add_n(list(loss.values()))})
+        return loss
+
+    def get_pred(self):
+        """Turn the cached predictions into a dict of masks, labels and scores."""
+        prediction = self.solov2_head.get_prediction(
+            self.cate_pred_list, self.kernel_pred_list, self.seg_pred,
+            self.inputs['im_shape'], self.inputs['scale_factor'])
+        seg_masks, cate_labels, cate_scores, bbox_num = prediction
+        return {
+            "segm": seg_masks,
+            "bbox_num": bbox_num,
+            'cate_label': cate_labels,
+            'cate_score': cate_scores
+        }

+ 99 - 0
paddlers/models/ppdet/modeling/architectures/sparse_rcnn.py

@@ -0,0 +1,99 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from paddlers.models.ppdet.core.workspace import register, create
+from .meta_arch import BaseArch
+
+__all__ = ["SparseRCNN"]
+
+
+@register
+class SparseRCNN(BaseArch):
+    """Detector assembled from a backbone, a neck, a head and a post-processor.
+
+    Args:
+        backbone (object): backbone instance
+        neck (object): neck instance
+        head (object): head instance (default name "SparsercnnHead")
+        postprocess (object): injected post-process instance
+            (default name "SparsePostProcess")
+    """
+
+    __category__ = 'architecture'
+    __inject__ = ["postprocess"]
+
+    def __init__(self,
+                 backbone,
+                 neck,
+                 head="SparsercnnHead",
+                 postprocess="SparsePostProcess"):
+        super(SparseRCNN, self).__init__()
+        self.backbone = backbone
+        self.neck = neck
+        self.head = head
+        self.postprocess = postprocess
+
+    @classmethod
+    def from_config(cls, cfg, *args, **kwargs):
+        """Create backbone, neck and head; the head receives ``roi_input_shape``."""
+        backbone = create(cfg['backbone'])
+        neck = create(cfg['neck'], input_shape=backbone.out_shape)
+        head = create(cfg['head'], roi_input_shape=neck.out_shape)
+        return dict(backbone=backbone, neck=neck, head=head)
+
+    def _forward(self):
+        """Return the raw head outputs (training) or post-processed boxes."""
+        fpn_feats = self.neck(self.backbone(self.inputs))
+        head_outs = self.head(fpn_feats, self.inputs["img_whwh"])
+
+        if self.training:
+            return head_outs
+
+        bboxes = self.postprocess(
+            head_outs["pred_logits"], head_outs["pred_boxes"],
+            self.inputs["scale_factor_wh"], self.inputs["img_whwh"])
+        return bboxes
+
+    def get_loss(self):
+        """Build per-image targets and compute the head losses.
+
+        Returns:
+            dict: the head's losses, with 'loss' set to the sum of every
+                entry except 'acc', and 'acc' re-attached afterwards.
+        """
+        gt_classes = self.inputs["gt_class"]
+        gt_boxes = self.inputs["gt_bbox"]
+        whwh_batch = self.inputs["img_whwh"]
+
+        targets = []
+        for idx in range(len(gt_classes)):
+            boxes = gt_boxes[idx]
+            labels = gt_classes[idx].squeeze(-1)
+            img_whwh = whwh_batch[idx]
+            # Repeat the image's whwh row once per ground-truth box.
+            repeated_whwh = img_whwh.unsqueeze(0).tile(
+                [int(boxes.shape[0]), 1])
+            targets.append({
+                "boxes": boxes,
+                "labels": labels,
+                "img_whwh": img_whwh,
+                "img_whwh_tgt": repeated_whwh
+            })
+
+        loss_dict = self.head.get_loss(self._forward(), targets)
+        # 'acc' is taken out so that it does not contribute to the total.
+        acc = loss_dict.pop("acc")
+        total_loss = sum(loss_dict.values())
+        loss_dict.update({"loss": total_loss, "acc": acc})
+        return loss_dict
+
+    def get_pred(self):
+        """Return the post-processed result as a 'bbox' / 'bbox_num' dict."""
+        bbox_pred, bbox_num = self._forward()
+        return {'bbox': bbox_pred, 'bbox_num': bbox_num}

+ 93 - 0
paddlers/models/ppdet/modeling/architectures/ssd.py

@@ -0,0 +1,93 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from paddlers.models.ppdet.core.workspace import register, create
+from .meta_arch import BaseArch
+
+__all__ = ['SSD']
+
+
+@register
+class SSD(BaseArch):
+    """
+    Single Shot MultiBox Detector, see https://arxiv.org/abs/1512.02325
+
+    Args:
+        backbone (nn.Layer): backbone instance
+        ssd_head (nn.Layer): `SSDHead` instance
+        post_process (object): `BBoxPostProcess` instance
+        r34_backbone (bool): if True the backbone must be a ResNet of depth
+            34, and two of its convolution strides are reset to [1, 1]
+    """
+
+    __category__ = 'architecture'
+    __inject__ = ['post_process']
+
+    def __init__(self, backbone, ssd_head, post_process, r34_backbone=False):
+        super(SSD, self).__init__()
+        self.backbone = backbone
+        self.ssd_head = ssd_head
+        self.post_process = post_process
+        self.r34_backbone = r34_backbone
+        if self.r34_backbone:
+            from paddlers.models.ppdet.modeling.backbones.resnet import ResNet
+            assert isinstance(self.backbone, ResNet) and \
+                   self.backbone.depth == 34, \
+                "If you set r34_backbone=True, please use ResNet-34 as backbone."
+            # Reset the stride of the first block of res_layers[2], on both
+            # its branch2a convolution and its shortcut convolution.
+            first_block = self.backbone.res_layers[2].blocks[0]
+            first_block.branch2a.conv._stride = [1, 1]
+            first_block.short.conv._stride = [1, 1]
+
+    @classmethod
+    def from_config(cls, cfg, *args, **kwargs):
+        """Create the backbone and an SSD head sized from its ``out_shape``."""
+        backbone = create(cfg['backbone'])
+        ssd_head = create(cfg['ssd_head'], input_shape=backbone.out_shape)
+        return dict(backbone=backbone, ssd_head=ssd_head)
+
+    def _forward(self):
+        """Return the head's training output, or ``(bbox, bbox_num)`` at inference."""
+        # Backbone
+        body_feats = self.backbone(self.inputs)
+
+        # SSD Head
+        if not self.training:
+            preds, anchors = self.ssd_head(body_feats, self.inputs['image'])
+            bbox, bbox_num = self.post_process(preds, anchors,
+                                               self.inputs['im_shape'],
+                                               self.inputs['scale_factor'])
+            return bbox, bbox_num
+
+        return self.ssd_head(body_feats, self.inputs['image'],
+                             self.inputs['gt_bbox'], self.inputs['gt_class'])
+
+    def get_loss(self, ):
+        """Wrap the training output of ``_forward`` under the 'loss' key."""
+        return {"loss": self._forward()}
+
+    def get_pred(self):
+        """Return the inference result as a 'bbox' / 'bbox_num' dict."""
+        bbox_pred, bbox_num = self._forward()
+        return {
+            "bbox": bbox_pred,
+            "bbox_num": bbox_num,
+        }

+ 78 - 0
paddlers/models/ppdet/modeling/architectures/tood.py

@@ -0,0 +1,78 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from paddlers.models.ppdet.core.workspace import register, create
+from .meta_arch import BaseArch
+
+__all__ = ['TOOD']
+
+
+@register
+class TOOD(BaseArch):
+    """
+    TOOD: Task-aligned One-stage Object Detection, see https://arxiv.org/abs/2108.07755
+
+    Args:
+        backbone (nn.Layer): backbone instance
+        neck (nn.Layer): 'FPN' instance
+        head (nn.Layer): 'TOODHead' instance
+    """
+
+    __category__ = 'architecture'
+
+    def __init__(self, backbone, neck, head):
+        super(TOOD, self).__init__()
+        self.backbone = backbone
+        self.neck = neck
+        self.head = head
+
+    @classmethod
+    def from_config(cls, cfg, *args, **kwargs):
+        """Create backbone, neck and head from ``cfg``, chaining ``out_shape``."""
+        backbone = create(cfg['backbone'])
+        neck = create(cfg['neck'], input_shape=backbone.out_shape)
+        head = create(cfg['head'], input_shape=neck.out_shape)
+        return dict(backbone=backbone, neck=neck, head=head)
+
+    def _forward(self):
+        """Return the head's loss in training, else ``(bboxes, bbox_num)``."""
+        head_outs = self.head(self.neck(self.backbone(self.inputs)))
+        if self.training:
+            return self.head.get_loss(head_outs, self.inputs)
+
+        bboxes, bbox_num = self.head.post_process(
+            head_outs, self.inputs['im_shape'], self.inputs['scale_factor'])
+        return bboxes, bbox_num
+
+    def get_loss(self):
+        """Return the loss computed by ``_forward`` in training mode."""
+        return self._forward()
+
+    def get_pred(self):
+        """Return the inference result as a 'bbox' / 'bbox_num' dict."""
+        bbox_pred, bbox_num = self._forward()
+        return {'bbox': bbox_pred, 'bbox_num': bbox_num}

+ 98 - 0
paddlers/models/ppdet/modeling/architectures/ttfnet.py

@@ -0,0 +1,98 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import paddle
+from paddlers.models.ppdet.core.workspace import register, create
+from .meta_arch import BaseArch
+
+__all__ = ['TTFNet']
+
+
+@register
+class TTFNet(BaseArch):
+    """
+    TTFNet detector, see https://arxiv.org/abs/1909.00700
+
+    Args:
+        backbone (object): backbone instance
+        neck (object): 'TTFFPN' instance
+        ttf_head (object): 'TTFHead' instance
+        post_process (object): 'BBoxPostProcess' instance
+    """
+
+    __category__ = 'architecture'
+    __inject__ = ['post_process']
+
+    def __init__(self,
+                 backbone='DarkNet',
+                 neck='TTFFPN',
+                 ttf_head='TTFHead',
+                 post_process='BBoxPostProcess'):
+        super(TTFNet, self).__init__()
+        self.backbone = backbone
+        self.neck = neck
+        self.ttf_head = ttf_head
+        self.post_process = post_process
+
+    @classmethod
+    def from_config(cls, cfg, *args, **kwargs):
+        """Create backbone, neck and head from ``cfg``, chaining ``out_shape``."""
+        backbone = create(cfg['backbone'])
+        neck = create(cfg['neck'], input_shape=backbone.out_shape)
+        ttf_head = create(cfg['ttf_head'], input_shape=neck.out_shape)
+        return dict(backbone=backbone, neck=neck, ttf_head=ttf_head)
+
+    def _forward(self):
+        """Return the head's ``(hm, wh)`` in training, else ``(bbox, bbox_num)``."""
+        hm, wh = self.ttf_head(self.neck(self.backbone(self.inputs)))
+        if not self.training:
+            bbox, bbox_num = self.post_process(hm, wh, self.inputs['im_shape'],
+                                               self.inputs['scale_factor'])
+            return bbox, bbox_num
+        return hm, wh
+
+    def get_loss(self, ):
+        """Return the head losses plus their sum under the 'loss' key."""
+        # The three targets are read from the inputs before the forward pass.
+        heatmap = self.inputs['ttf_heatmap']
+        box_target = self.inputs['ttf_box_target']
+        reg_weight = self.inputs['ttf_reg_weight']
+        hm, wh = self._forward()
+
+        loss = {}
+        loss.update(
+            self.ttf_head.get_loss(hm, wh, heatmap, box_target, reg_weight))
+        loss.update({'loss': paddle.add_n(list(loss.values()))})
+        return loss
+
+    def get_pred(self):
+        """Return the inference result as a 'bbox' / 'bbox_num' dict."""
+        bbox_pred, bbox_num = self._forward()
+        return {
+            "bbox": bbox_pred,
+            "bbox_num": bbox_num,
+        }

+ 124 - 0
paddlers/models/ppdet/modeling/architectures/yolo.py

@@ -0,0 +1,124 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from paddlers.models.ppdet.core.workspace import register, create
+from .meta_arch import BaseArch
+from ..post_process import JDEBBoxPostProcess
+
+__all__ = ['YOLOv3']
+
+
+@register
+class YOLOv3(BaseArch):
+    __category__ = 'architecture'
+    __shared__ = ['data_format']
+    __inject__ = ['post_process']
+
+    def __init__(self,
+                 backbone='DarkNet',
+                 neck='YOLOv3FPN',
+                 yolo_head='YOLOv3Head',
+                 post_process='BBoxPostProcess',
+                 data_format='NCHW',
+                 for_mot=False):
+        """
+        YOLOv3 network, see https://arxiv.org/abs/1804.02767
+
+        Args:
+            backbone (nn.Layer): backbone instance
+            neck (nn.Layer): neck instance
+            yolo_head (nn.Layer): anchor_head instance
+            post_process (object): `BBoxPostProcess` instance
+            data_format (str): data format, NCHW or NHWC
+            for_mot (bool): whether return other features for multi-object tracking
+                models, default False in pure object detection models.
+        """
+        super(YOLOv3, self).__init__(data_format=data_format)
+        self.backbone = backbone
+        self.neck = neck
+        self.yolo_head = yolo_head
+        self.post_process = post_process
+        self.for_mot = for_mot
+        # A JDEBBoxPostProcess is called without image shape arguments and
+        # returns four values instead of two (see _forward).
+        self.return_idx = isinstance(post_process, JDEBBoxPostProcess)
+
+    @classmethod
+    def from_config(cls, cfg, *args, **kwargs):
+        """Create backbone, neck and head from ``cfg``, chaining ``out_shape``."""
+        # backbone
+        backbone = create(cfg['backbone'])
+
+        # fpn
+        kwargs = {'input_shape': backbone.out_shape}
+        neck = create(cfg['neck'], **kwargs)
+
+        # head
+        kwargs = {'input_shape': neck.out_shape}
+        yolo_head = create(cfg['yolo_head'], **kwargs)
+
+        return {
+            'backbone': backbone,
+            'neck': neck,
+            "yolo_head": yolo_head,
+        }
+
+    def _forward(self):
+        """Run the network.
+
+        Returns:
+            In training mode the head's losses, or with ``for_mot`` a dict
+            with 'det_losses' and 'emb_feats'. In evaluation mode a dict with
+            'bbox' and 'bbox_num', plus 'boxes_idx', 'nms_keep_idx' and
+            'emb_feats' when ``for_mot`` is set.
+        """
+        body_feats = self.backbone(self.inputs)
+        neck_feats = self.neck(body_feats, self.for_mot)
+
+        if isinstance(neck_feats, dict):
+            # The neck is only expected to return a dict in tracking mode.
+            assert self.for_mot, \
+                "the neck returned a dict but for_mot is not enabled"
+            emb_feats = neck_feats['emb_feats']
+            neck_feats = neck_feats['yolo_feats']
+
+        if self.training:
+            yolo_losses = self.yolo_head(neck_feats, self.inputs)
+
+            if self.for_mot:
+                return {'det_losses': yolo_losses, 'emb_feats': emb_feats}
+            else:
+                return yolo_losses
+
+        else:
+            yolo_head_outs = self.yolo_head(neck_feats)
+
+            if self.for_mot:
+                boxes_idx, bbox, bbox_num, nms_keep_idx = self.post_process(
+                    yolo_head_outs, self.yolo_head.mask_anchors)
+                output = {
+                    'bbox': bbox,
+                    'bbox_num': bbox_num,
+                    'boxes_idx': boxes_idx,
+                    'nms_keep_idx': nms_keep_idx,
+                    'emb_feats': emb_feats,
+                }
+            else:
+                if self.return_idx:
+                    # JDE-style post-process: keep only boxes and counts.
+                    _, bbox, bbox_num, _ = self.post_process(
+                        yolo_head_outs, self.yolo_head.mask_anchors)
+                else:
+                    bbox, bbox_num = self.post_process(
+                        yolo_head_outs, self.yolo_head.mask_anchors,
+                        self.inputs['im_shape'], self.inputs['scale_factor'])
+                output = {'bbox': bbox, 'bbox_num': bbox_num}
+
+            return output
+
+    def get_loss(self):
+        """Return the training output of ``_forward``."""
+        return self._forward()
+
+    def get_pred(self):
+        """Return the evaluation output of ``_forward``."""
+        return self._forward()

+ 23 - 0
paddlers/models/ppdet/modeling/assigners/__init__.py

@@ -0,0 +1,23 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from . import utils
+from . import task_aligned_assigner
+from . import atss_assigner
+from . import simota_assigner
+
+from .utils import *
+from .task_aligned_assigner import *
+from .atss_assigner import *
+from .simota_assigner import *

+ 211 - 0
paddlers/models/ppdet/modeling/assigners/atss_assigner.py

@@ -0,0 +1,211 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+import paddle
+import paddle.nn as nn
+import paddle.nn.functional as F
+
+from paddlers.models.ppdet.core.workspace import register
+from ..ops import iou_similarity
+from ..bbox_utils import bbox_center
+from .utils import (pad_gt, check_points_inside_bboxes, compute_max_iou_anchor,
+                    compute_max_iou_gt)
+
+
+@register
+class ATSSAssigner(nn.Layer):
+    """Bridging the Gap Between Anchor-based and Anchor-free Detection
+     via Adaptive Training Sample Selection
+    """
+    __shared__ = ['num_classes']
+
+    def __init__(self,
+                 topk=9,
+                 num_classes=80,
+                 force_gt_matching=False,
+                 eps=1e-9):
+        super(ATSSAssigner, self).__init__()
+        self.topk = topk
+        self.num_classes = num_classes
+        self.force_gt_matching = force_gt_matching
+        self.eps = eps
+
+    def _gather_topk_pyramid(self, gt2anchor_distances, num_anchors_list,
+                             pad_gt_mask):
+        pad_gt_mask = pad_gt_mask.tile([1, 1, self.topk]).astype(paddle.bool)
+        gt2anchor_distances_list = paddle.split(
+            gt2anchor_distances, num_anchors_list, axis=-1)
+        num_anchors_index = np.cumsum(num_anchors_list).tolist()
+        num_anchors_index = [0, ] + num_anchors_index[:-1]
+        is_in_topk_list = []
+        topk_idxs_list = []
+        for distances, anchors_index in zip(gt2anchor_distances_list,
+                                            num_anchors_index):
+            num_anchors = distances.shape[-1]
+            topk_metrics, topk_idxs = paddle.topk(
+                distances, self.topk, axis=-1, largest=False)
+            topk_idxs_list.append(topk_idxs + anchors_index)
+            topk_idxs = paddle.where(pad_gt_mask, topk_idxs,
+                                     paddle.zeros_like(topk_idxs))
+            is_in_topk = F.one_hot(topk_idxs, num_anchors).sum(axis=-2)
+            is_in_topk = paddle.where(is_in_topk > 1,
+                                      paddle.zeros_like(is_in_topk),
+                                      is_in_topk)
+            is_in_topk_list.append(
+                is_in_topk.astype(gt2anchor_distances.dtype))
+        is_in_topk_list = paddle.concat(is_in_topk_list, axis=-1)
+        topk_idxs_list = paddle.concat(topk_idxs_list, axis=-1)
+        return is_in_topk_list, topk_idxs_list
+
+    @paddle.no_grad()
+    def forward(self,
+                anchor_bboxes,
+                num_anchors_list,
+                gt_labels,
+                gt_bboxes,
+                bg_index,
+                gt_scores=None):
+        r"""This code is based on
+            https://github.com/fcjian/TOOD/blob/master/mmdet/core/bbox/assigners/atss_assigner.py
+
+        The assignment is done in following steps
+        1. compute iou between all bbox (bbox of all pyramid levels) and gt
+        2. compute center distance between all bbox and gt
+        3. on each pyramid level, for each gt, select k bbox whose center
+           are closest to the gt center, so we total select k*l bbox as
+           candidates for each gt
+        4. get corresponding iou for the these candidates, and compute the
+           mean and std, set mean + std as the iou threshold
+        5. select these candidates whose iou are greater than or equal to
+           the threshold as positive
+        6. limit the positive sample's center in gt
+        7. if an anchor box is assigned to multiple gts, the one with the
+           highest iou will be selected.
+        Args:
+            anchor_bboxes (Tensor, float32): pre-defined anchors, shape(L, 4),
+                    "xmin, xmax, ymin, ymax" format
+            num_anchors_list (List): num of anchors in each level
+            gt_labels (Tensor|List[Tensor], int64): Label of gt_bboxes, shape(B, n, 1)
+            gt_bboxes (Tensor|List[Tensor], float32): Ground truth bboxes, shape(B, n, 4)
+            bg_index (int): background index
+            gt_scores (Tensor|List[Tensor]|None, float32) Score of gt_bboxes,
+                    shape(B, n, 1), if None, then it will initialize with one_hot label
+        Returns:
+            assigned_labels (Tensor): (B, L)
+            assigned_bboxes (Tensor): (B, L, 4)
+            assigned_scores (Tensor): (B, L, C)
+        """
+        gt_labels, gt_bboxes, pad_gt_scores, pad_gt_mask = pad_gt(
+            gt_labels, gt_bboxes, gt_scores)
+        assert gt_labels.ndim == gt_bboxes.ndim and \
+               gt_bboxes.ndim == 3
+
+        num_anchors, _ = anchor_bboxes.shape
+        batch_size, num_max_boxes, _ = gt_bboxes.shape
+
+        # negative batch
+        if num_max_boxes == 0:
+            assigned_labels = paddle.full([batch_size, num_anchors], bg_index)
+            assigned_bboxes = paddle.zeros([batch_size, num_anchors, 4])
+            assigned_scores = paddle.zeros(
+                [batch_size, num_anchors, self.num_classes])
+            return assigned_labels, assigned_bboxes, assigned_scores
+
+        # 1. compute iou between gt and anchor bbox, [B, n, L]
+        ious = iou_similarity(gt_bboxes.reshape([-1, 4]), anchor_bboxes)
+        ious = ious.reshape([batch_size, -1, num_anchors])
+
+        # 2. compute center distance between all anchors and gt, [B, n, L]
+        gt_centers = bbox_center(gt_bboxes.reshape([-1, 4])).unsqueeze(1)
+        anchor_centers = bbox_center(anchor_bboxes)
+        gt2anchor_distances = (gt_centers - anchor_centers.unsqueeze(0)) \
+            .norm(2, axis=-1).reshape([batch_size, -1, num_anchors])
+
+        # 3. on each pyramid level, selecting topk closest candidates
+        # based on the center distance, [B, n, L]
+        is_in_topk, topk_idxs = self._gather_topk_pyramid(
+            gt2anchor_distances, num_anchors_list, pad_gt_mask)
+
+        # 4. get corresponding iou for the these candidates, and compute the
+        # mean and std, 5. set mean + std as the iou threshold
+        iou_candidates = ious * is_in_topk
+        iou_threshold = paddle.index_sample(
+            iou_candidates.flatten(stop_axis=-2),
+            topk_idxs.flatten(stop_axis=-2))
+        iou_threshold = iou_threshold.reshape([batch_size, num_max_boxes, -1])
+        iou_threshold = iou_threshold.mean(axis=-1, keepdim=True) + \
+                        iou_threshold.std(axis=-1, keepdim=True)
+        is_in_topk = paddle.where(
+            iou_candidates > iou_threshold.tile([1, 1, num_anchors]),
+            is_in_topk, paddle.zeros_like(is_in_topk))
+
+        # 6. check the positive sample's center in gt, [B, n, L]
+        is_in_gts = check_points_inside_bboxes(anchor_centers, gt_bboxes)
+
+        # select positive sample, [B, n, L]
+        mask_positive = is_in_topk * is_in_gts * pad_gt_mask
+
+        # 7. if an anchor box is assigned to multiple gts,
+        # the one with the highest iou will be selected.
+        mask_positive_sum = mask_positive.sum(axis=-2)
+        if mask_positive_sum.max() > 1:
+            mask_multiple_gts = (mask_positive_sum.unsqueeze(1) > 1).tile(
+                [1, num_max_boxes, 1])
+            is_max_iou = compute_max_iou_anchor(ious)
+            mask_positive = paddle.where(mask_multiple_gts, is_max_iou,
+                                         mask_positive)
+            mask_positive_sum = mask_positive.sum(axis=-2)
+        # 8. make sure every gt_bbox matches the anchor
+        if self.force_gt_matching:
+            is_max_iou = compute_max_iou_gt(ious) * pad_gt_mask
+            mask_max_iou = (is_max_iou.sum(-2, keepdim=True) == 1).tile(
+                [1, num_max_boxes, 1])
+            mask_positive = paddle.where(mask_max_iou, is_max_iou,
+                                         mask_positive)
+            mask_positive_sum = mask_positive.sum(axis=-2)
+        assigned_gt_index = mask_positive.argmax(axis=-2)
+        assert mask_positive_sum.max() == 1, \
+            ("one anchor just assign one gt, but received not equals 1. "
+             "Received: %f" % mask_positive_sum.max().item())
+
+        # assigned target
+        batch_ind = paddle.arange(
+            end=batch_size, dtype=gt_labels.dtype).unsqueeze(-1)
+        assigned_gt_index = assigned_gt_index + batch_ind * num_max_boxes
+        assigned_labels = paddle.gather(
+            gt_labels.flatten(), assigned_gt_index.flatten(), axis=0)
+        assigned_labels = assigned_labels.reshape([batch_size, num_anchors])
+        assigned_labels = paddle.where(
+            mask_positive_sum > 0, assigned_labels,
+            paddle.full_like(assigned_labels, bg_index))
+
+        assigned_bboxes = paddle.gather(
+            gt_bboxes.reshape([-1, 4]), assigned_gt_index.flatten(), axis=0)
+        assigned_bboxes = assigned_bboxes.reshape([batch_size, num_anchors, 4])
+
+        assigned_scores = F.one_hot(assigned_labels, self.num_classes)
+        if gt_scores is not None:
+            gather_scores = paddle.gather(
+                pad_gt_scores.flatten(), assigned_gt_index.flatten(), axis=0)
+            gather_scores = gather_scores.reshape([batch_size, num_anchors])
+            gather_scores = paddle.where(mask_positive_sum > 0, gather_scores,
+                                         paddle.zeros_like(gather_scores))
+            assigned_scores *= gather_scores.unsqueeze(-1)
+
+        return assigned_labels, assigned_bboxes, assigned_scores

+ 262 - 0
paddlers/models/ppdet/modeling/assigners/simota_assigner.py

@@ -0,0 +1,262 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# The code is based on:
+# https://github.com/open-mmlab/mmdetection/blob/master/mmdet/core/bbox/assigners/sim_ota_assigner.py
+
+import paddle
+import numpy as np
+import paddle.nn.functional as F
+
+from paddlers.models.ppdet.modeling.losses.varifocal_loss import varifocal_loss
+from paddlers.models.ppdet.modeling.bbox_utils import batch_bbox_overlaps
+from paddlers.models.ppdet.core.workspace import register
+
+
+@register
+class SimOTAAssigner(object):
+    """Computes matching between predictions and ground truth.
+    Args:
+        center_radius (int | float, optional): Ground truth center size
+            to judge whether a prior is in center. Default 2.5.
+        candidate_topk (int, optional): The candidate top-k which used to
+            get top-k ious to calculate dynamic-k. Default 10.
+        iou_weight (int | float, optional): The scale factor for regression
+            iou cost. Default 3.0.
+        cls_weight (int | float, optional): The scale factor for classification
+            cost. Default 1.0.
+        num_classes (int): The num_classes of dataset.
+        use_vfl (int): Whether to use varifocal_loss when calculating the cost matrix.
+    """
+    __shared__ = ['num_classes']
+
+    def __init__(self,
+                 center_radius=2.5,
+                 candidate_topk=10,
+                 iou_weight=3.0,
+                 cls_weight=1.0,
+                 num_classes=80,
+                 use_vfl=True):
+        self.center_radius = center_radius
+        self.candidate_topk = candidate_topk
+        self.iou_weight = iou_weight
+        self.cls_weight = cls_weight
+        self.num_classes = num_classes
+        self.use_vfl = use_vfl
+
+    def get_in_gt_and_in_center_info(self, flatten_center_and_stride,
+                                     gt_bboxes):
+        num_gt = gt_bboxes.shape[0]
+
+        flatten_x = flatten_center_and_stride[:, 0].unsqueeze(1).tile(
+            [1, num_gt])
+        flatten_y = flatten_center_and_stride[:, 1].unsqueeze(1).tile(
+            [1, num_gt])
+        flatten_stride_x = flatten_center_and_stride[:, 2].unsqueeze(1).tile(
+            [1, num_gt])
+        flatten_stride_y = flatten_center_and_stride[:, 3].unsqueeze(1).tile(
+            [1, num_gt])
+
+        # is prior centers in gt bboxes, shape: [n_center, n_gt]
+        l_ = flatten_x - gt_bboxes[:, 0]
+        t_ = flatten_y - gt_bboxes[:, 1]
+        r_ = gt_bboxes[:, 2] - flatten_x
+        b_ = gt_bboxes[:, 3] - flatten_y
+
+        deltas = paddle.stack([l_, t_, r_, b_], axis=1)
+        is_in_gts = deltas.min(axis=1) > 0
+        is_in_gts_all = is_in_gts.sum(axis=1) > 0
+
+        # is prior centers in gt centers
+        gt_center_xs = (gt_bboxes[:, 0] + gt_bboxes[:, 2]) / 2.0
+        gt_center_ys = (gt_bboxes[:, 1] + gt_bboxes[:, 3]) / 2.0
+        ct_bound_l = gt_center_xs - self.center_radius * flatten_stride_x
+        ct_bound_t = gt_center_ys - self.center_radius * flatten_stride_y
+        ct_bound_r = gt_center_xs + self.center_radius * flatten_stride_x
+        ct_bound_b = gt_center_ys + self.center_radius * flatten_stride_y
+
+        cl_ = flatten_x - ct_bound_l
+        ct_ = flatten_y - ct_bound_t
+        cr_ = ct_bound_r - flatten_x
+        cb_ = ct_bound_b - flatten_y
+
+        ct_deltas = paddle.stack([cl_, ct_, cr_, cb_], axis=1)
+        is_in_cts = ct_deltas.min(axis=1) > 0
+        is_in_cts_all = is_in_cts.sum(axis=1) > 0
+
+        # in any of gts or gt centers, shape: [n_center]
+        is_in_gts_or_centers_all = paddle.logical_or(is_in_gts_all,
+                                                     is_in_cts_all)
+
+        is_in_gts_or_centers_all_inds = paddle.nonzero(
+            is_in_gts_or_centers_all).squeeze(1)
+
+        # both in gts and gt centers, shape: [num_fg, num_gt]
+        is_in_gts_and_centers = paddle.logical_and(
+            paddle.gather(
+                is_in_gts.cast('int'), is_in_gts_or_centers_all_inds,
+                axis=0).cast('bool'),
+            paddle.gather(
+                is_in_cts.cast('int'), is_in_gts_or_centers_all_inds,
+                axis=0).cast('bool'))
+        return is_in_gts_or_centers_all, is_in_gts_or_centers_all_inds, is_in_gts_and_centers
+
+    def dynamic_k_matching(self, cost_matrix, pairwise_ious, num_gt):
+        match_matrix = np.zeros_like(cost_matrix.numpy())
+        # select candidate topk ious for dynamic-k calculation
+        topk_ious, _ = paddle.topk(pairwise_ious, self.candidate_topk, axis=0)
+        # calculate dynamic k for each gt
+        dynamic_ks = paddle.clip(topk_ious.sum(0).cast('int'), min=1)
+        for gt_idx in range(num_gt):
+            _, pos_idx = paddle.topk(
+                cost_matrix[:, gt_idx], k=dynamic_ks[gt_idx], largest=False)
+            match_matrix[:, gt_idx][pos_idx.numpy()] = 1.0
+
+        del topk_ious, dynamic_ks, pos_idx
+
+        # match points more than two gts
+        extra_match_gts_mask = match_matrix.sum(1) > 1
+        if extra_match_gts_mask.sum() > 0:
+            cost_matrix = cost_matrix.numpy()
+            cost_argmin = np.argmin(
+                cost_matrix[extra_match_gts_mask, :], axis=1)
+            match_matrix[extra_match_gts_mask, :] *= 0.0
+            match_matrix[extra_match_gts_mask, cost_argmin] = 1.0
+        # get foreground mask
+        match_fg_mask_inmatrix = match_matrix.sum(1) > 0
+        match_gt_inds_to_fg = match_matrix[match_fg_mask_inmatrix, :].argmax(1)
+
+        return match_gt_inds_to_fg, match_fg_mask_inmatrix
+
+    def get_sample(self, assign_gt_inds, gt_bboxes):
+        pos_inds = np.unique(np.nonzero(assign_gt_inds > 0)[0])
+        neg_inds = np.unique(np.nonzero(assign_gt_inds == 0)[0])
+        pos_assigned_gt_inds = assign_gt_inds[pos_inds] - 1
+
+        if gt_bboxes.size == 0:
+            # hack for index error case
+            assert pos_assigned_gt_inds.size == 0
+            pos_gt_bboxes = np.empty_like(gt_bboxes).reshape(-1, 4)
+        else:
+            if len(gt_bboxes.shape) < 2:
+                gt_bboxes = gt_bboxes.resize(-1, 4)
+            pos_gt_bboxes = gt_bboxes[pos_assigned_gt_inds, :]
+        return pos_inds, neg_inds, pos_gt_bboxes, pos_assigned_gt_inds
+
+    def __call__(self,
+                 flatten_cls_pred_scores,
+                 flatten_center_and_stride,
+                 flatten_bboxes,
+                 gt_bboxes,
+                 gt_labels,
+                 eps=1e-7):
+        """Assign gt to priors using SimOTA.
+        TODO: add comment.
+        Returns:
+            assign_result: The assigned result.
+        """
+        num_gt = gt_bboxes.shape[0]
+        num_bboxes = flatten_bboxes.shape[0]
+
+        if num_gt == 0 or num_bboxes == 0:
+            # No ground truth or boxes
+            label = np.ones([num_bboxes], dtype=np.int64) * self.num_classes
+            label_weight = np.ones([num_bboxes], dtype=np.float32)
+            bbox_target = np.zeros_like(flatten_center_and_stride)
+            return 0, label, label_weight, bbox_target
+
+        is_in_gts_or_centers_all, is_in_gts_or_centers_all_inds, is_in_boxes_and_center = self.get_in_gt_and_in_center_info(
+            flatten_center_and_stride, gt_bboxes)
+
+        # bboxes and scores to calculate matrix
+        valid_flatten_bboxes = flatten_bboxes[is_in_gts_or_centers_all_inds]
+        valid_cls_pred_scores = flatten_cls_pred_scores[
+            is_in_gts_or_centers_all_inds]
+        num_valid_bboxes = valid_flatten_bboxes.shape[0]
+
+        pairwise_ious = batch_bbox_overlaps(valid_flatten_bboxes,
+                                            gt_bboxes)  # [num_points,num_gts]
+        if self.use_vfl:
+            gt_vfl_labels = gt_labels.squeeze(-1).unsqueeze(0).tile(
+                [num_valid_bboxes, 1]).reshape([-1])
+            valid_pred_scores = valid_cls_pred_scores.unsqueeze(1).tile(
+                [1, num_gt, 1]).reshape([-1, self.num_classes])
+            vfl_score = np.zeros(valid_pred_scores.shape)
+            vfl_score[np.arange(0, vfl_score.shape[0]), gt_vfl_labels.numpy(
+            )] = pairwise_ious.reshape([-1])
+            vfl_score = paddle.to_tensor(vfl_score)
+            losses_vfl = varifocal_loss(
+                valid_pred_scores, vfl_score,
+                use_sigmoid=False).reshape([num_valid_bboxes, num_gt])
+            losses_giou = batch_bbox_overlaps(
+                valid_flatten_bboxes, gt_bboxes, mode='giou')
+            cost_matrix = (
+                losses_vfl * self.cls_weight + losses_giou * self.iou_weight +
+                paddle.logical_not(is_in_boxes_and_center).cast('float32') *
+                100000000)
+        else:
+            iou_cost = -paddle.log(pairwise_ious + eps)
+            gt_onehot_label = (F.one_hot(
+                gt_labels.squeeze(-1).cast(paddle.int64),
+                flatten_cls_pred_scores.shape[-1]).cast('float32').unsqueeze(0)
+                               .tile([num_valid_bboxes, 1, 1]))
+
+            valid_pred_scores = valid_cls_pred_scores.unsqueeze(1).tile(
+                [1, num_gt, 1])
+            cls_cost = F.binary_cross_entropy(
+                valid_pred_scores, gt_onehot_label, reduction='none').sum(-1)
+
+            cost_matrix = (
+                cls_cost * self.cls_weight + iou_cost * self.iou_weight +
+                paddle.logical_not(is_in_boxes_and_center).cast('float32') *
+                100000000)
+
+        match_gt_inds_to_fg, match_fg_mask_inmatrix = \
+            self.dynamic_k_matching(
+                cost_matrix, pairwise_ious, num_gt)
+
+        # sample and assign results
+        assigned_gt_inds = np.zeros([num_bboxes], dtype=np.int64)
+        match_fg_mask_inall = np.zeros_like(assigned_gt_inds)
+        match_fg_mask_inall[is_in_gts_or_centers_all.numpy(
+        )] = match_fg_mask_inmatrix
+
+        assigned_gt_inds[match_fg_mask_inall.astype(
+            np.bool)] = match_gt_inds_to_fg + 1
+
+        pos_inds, neg_inds, pos_gt_bboxes, pos_assigned_gt_inds \
+            = self.get_sample(assigned_gt_inds, gt_bboxes.numpy())
+
+        bbox_target = np.zeros_like(flatten_bboxes)
+        bbox_weight = np.zeros_like(flatten_bboxes)
+        label = np.ones([num_bboxes], dtype=np.int64) * self.num_classes
+        label_weight = np.zeros([num_bboxes], dtype=np.float32)
+
+        if len(pos_inds) > 0:
+            gt_labels = gt_labels.numpy()
+            pos_bbox_targets = pos_gt_bboxes
+            bbox_target[pos_inds, :] = pos_bbox_targets
+            bbox_weight[pos_inds, :] = 1.0
+            if not np.any(gt_labels):
+                label[pos_inds] = 0
+            else:
+                label[pos_inds] = gt_labels.squeeze(-1)[pos_assigned_gt_inds]
+
+            label_weight[pos_inds] = 1.0
+        if len(neg_inds) > 0:
+            label_weight[neg_inds] = 1.0
+
+        pos_num = max(pos_inds.size, 1)
+
+        return pos_num, label, label_weight, bbox_target

+ 158 - 0
paddlers/models/ppdet/modeling/assigners/task_aligned_assigner.py

@@ -0,0 +1,158 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import paddle
+import paddle.nn as nn
+import paddle.nn.functional as F
+
+from paddlers.models.ppdet.core.workspace import register
+from ..bbox_utils import iou_similarity
+from .utils import (pad_gt, gather_topk_anchors, check_points_inside_bboxes,
+                    compute_max_iou_anchor)
+
+
+@register
+class TaskAlignedAssigner(nn.Layer):
+    """TOOD: Task-aligned One-stage Object Detection
+    """
+
+    def __init__(self, topk=13, alpha=1.0, beta=6.0, eps=1e-9):
+        super(TaskAlignedAssigner, self).__init__()
+        self.topk = topk
+        self.alpha = alpha
+        self.beta = beta
+        self.eps = eps
+
+    @paddle.no_grad()
+    def forward(self,
+                pred_scores,
+                pred_bboxes,
+                anchor_points,
+                gt_labels,
+                gt_bboxes,
+                bg_index,
+                gt_scores=None):
+        r"""This code is based on
+            https://github.com/fcjian/TOOD/blob/master/mmdet/core/bbox/assigners/task_aligned_assigner.py
+
+        The assignment is done in following steps
+        1. compute alignment metric between all bbox (bbox of all pyramid levels) and gt
+        2. select top-k bbox as candidates for each gt
+        3. limit the positive sample's center in gt (because the anchor-free detector
+           only can predict positive distance)
+        4. if an anchor box is assigned to multiple gts, the one with the
+           highest iou will be selected.
+        Args:
+            pred_scores (Tensor, float32): predicted class probability, shape(B, L, C)
+            pred_bboxes (Tensor, float32): predicted bounding boxes, shape(B, L, 4)
+            anchor_points (Tensor, float32): pre-defined anchors, shape(L, 2), "cxcy" format
+            gt_labels (Tensor|List[Tensor], int64): Label of gt_bboxes, shape(B, n, 1)
+            gt_bboxes (Tensor|List[Tensor], float32): Ground truth bboxes, shape(B, n, 4)
+            bg_index (int): background index
+            gt_scores (Tensor|List[Tensor]|None, float32) Score of gt_bboxes,
+                    shape(B, n, 1), if None, then it will initialize with one_hot label
+        Returns:
+            assigned_labels (Tensor): (B, L)
+            assigned_bboxes (Tensor): (B, L, 4)
+            assigned_scores (Tensor): (B, L, C)
+        """
+        assert pred_scores.ndim == pred_bboxes.ndim
+
+        gt_labels, gt_bboxes, pad_gt_scores, pad_gt_mask = pad_gt(
+            gt_labels, gt_bboxes, gt_scores)
+        assert gt_labels.ndim == gt_bboxes.ndim and \
+               gt_bboxes.ndim == 3
+
+        batch_size, num_anchors, num_classes = pred_scores.shape
+        _, num_max_boxes, _ = gt_bboxes.shape
+
+        # negative batch
+        if num_max_boxes == 0:
+            assigned_labels = paddle.full([batch_size, num_anchors], bg_index)
+            assigned_bboxes = paddle.zeros([batch_size, num_anchors, 4])
+            assigned_scores = paddle.zeros(
+                [batch_size, num_anchors, num_classes])
+            return assigned_labels, assigned_bboxes, assigned_scores
+
+        # compute iou between gt and pred bbox, [B, n, L]
+        ious = iou_similarity(gt_bboxes, pred_bboxes)
+        # gather pred bboxes class score
+        pred_scores = pred_scores.transpose([0, 2, 1])
+        batch_ind = paddle.arange(
+            end=batch_size, dtype=gt_labels.dtype).unsqueeze(-1)
+        gt_labels_ind = paddle.stack(
+            [batch_ind.tile([1, num_max_boxes]), gt_labels.squeeze(-1)],
+            axis=-1)
+        bbox_cls_scores = paddle.gather_nd(pred_scores, gt_labels_ind)
+        # compute alignment metrics, [B, n, L]
+        alignment_metrics = bbox_cls_scores.pow(self.alpha) * ious.pow(
+            self.beta)
+
+        # check the positive sample's center in gt, [B, n, L]
+        is_in_gts = check_points_inside_bboxes(anchor_points, gt_bboxes)
+
+        # select topk largest alignment metrics pred bbox as candidates
+        # for each gt, [B, n, L]
+        is_in_topk = gather_topk_anchors(
+            alignment_metrics * is_in_gts,
+            self.topk,
+            topk_mask=pad_gt_mask.tile([1, 1, self.topk]).astype(paddle.bool))
+
+        # select positive sample, [B, n, L]
+        mask_positive = is_in_topk * is_in_gts * pad_gt_mask
+
+        # if an anchor box is assigned to multiple gts,
+        # the one with the highest iou will be selected, [B, n, L]
+        mask_positive_sum = mask_positive.sum(axis=-2)
+        if mask_positive_sum.max() > 1:
+            mask_multiple_gts = (mask_positive_sum.unsqueeze(1) > 1).tile(
+                [1, num_max_boxes, 1])
+            is_max_iou = compute_max_iou_anchor(ious)
+            mask_positive = paddle.where(mask_multiple_gts, is_max_iou,
+                                         mask_positive)
+            mask_positive_sum = mask_positive.sum(axis=-2)
+        assigned_gt_index = mask_positive.argmax(axis=-2)
+        assert mask_positive_sum.max() == 1, \
+            ("one anchor just assign one gt, but received not equals 1. "
+             "Received: %f" % mask_positive_sum.max().item())
+
+        # assigned target
+        assigned_gt_index = assigned_gt_index + batch_ind * num_max_boxes
+        assigned_labels = paddle.gather(
+            gt_labels.flatten(), assigned_gt_index.flatten(), axis=0)
+        assigned_labels = assigned_labels.reshape([batch_size, num_anchors])
+        assigned_labels = paddle.where(
+            mask_positive_sum > 0, assigned_labels,
+            paddle.full_like(assigned_labels, bg_index))
+
+        assigned_bboxes = paddle.gather(
+            gt_bboxes.reshape([-1, 4]), assigned_gt_index.flatten(), axis=0)
+        assigned_bboxes = assigned_bboxes.reshape([batch_size, num_anchors, 4])
+
+        assigned_scores = F.one_hot(assigned_labels, num_classes)
+        # rescale alignment metrics
+        alignment_metrics *= mask_positive
+        max_metrics_per_instance = alignment_metrics.max(axis=-1, keepdim=True)
+        max_ious_per_instance = (ious * mask_positive).max(axis=-1,
+                                                           keepdim=True)
+        alignment_metrics = alignment_metrics / (
+            max_metrics_per_instance + self.eps) * max_ious_per_instance
+        alignment_metrics = alignment_metrics.max(-2).unsqueeze(-1)
+        assigned_scores = assigned_scores * alignment_metrics
+
+        return assigned_labels, assigned_bboxes, assigned_scores

+ 195 - 0
paddlers/models/ppdet/modeling/assigners/utils.py

@@ -0,0 +1,195 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import paddle
+import paddle.nn.functional as F
+
+__all__ = [
+    'pad_gt', 'gather_topk_anchors', 'check_points_inside_bboxes',
+    'compute_max_iou_anchor', 'compute_max_iou_gt',
+    'generate_anchors_for_grid_cell'
+]
+
+
+def pad_gt(gt_labels, gt_bboxes, gt_scores=None):
+    r""" Pad 0 in gt_labels and gt_bboxes.
+    Args:
+        gt_labels (Tensor|List[Tensor], int64): Label of gt_bboxes,
+            shape is [B, n, 1] or [[n_1, 1], [n_2, 1], ...], here n = sum(n_i)
+        gt_bboxes (Tensor|List[Tensor], float32): Ground truth bboxes,
+            shape is [B, n, 4] or [[n_1, 4], [n_2, 4], ...], here n = sum(n_i)
+        gt_scores (Tensor|List[Tensor]|None, float32): Score of gt_bboxes,
+            shape is [B, n, 1] or [[n_1, 4], [n_2, 4], ...], here n = sum(n_i)
+    Returns:
+        pad_gt_labels (Tensor, int64): shape[B, n, 1]
+        pad_gt_bboxes (Tensor, float32): shape[B, n, 4]
+        pad_gt_scores (Tensor, float32): shape[B, n, 1]
+        pad_gt_mask (Tensor, float32): shape[B, n, 1], 1 means bbox, 0 means no bbox
+    """
+    if isinstance(gt_labels, paddle.Tensor) and isinstance(gt_bboxes,
+                                                           paddle.Tensor):
+        assert gt_labels.ndim == gt_bboxes.ndim and \
+               gt_bboxes.ndim == 3
+        pad_gt_mask = (
+            gt_bboxes.sum(axis=-1, keepdim=True) > 0).astype(gt_bboxes.dtype)
+        if gt_scores is None:
+            gt_scores = pad_gt_mask.clone()
+        assert gt_labels.ndim == gt_scores.ndim
+
+        return gt_labels, gt_bboxes, gt_scores, pad_gt_mask
+    elif isinstance(gt_labels, list) and isinstance(gt_bboxes, list):
+        assert len(gt_labels) == len(gt_bboxes), \
+            'The number of `gt_labels` and `gt_bboxes` is not equal. '
+        num_max_boxes = max([len(a) for a in gt_bboxes])
+        batch_size = len(gt_bboxes)
+        # pad label and bbox
+        pad_gt_labels = paddle.zeros(
+            [batch_size, num_max_boxes, 1], dtype=gt_labels[0].dtype)
+        pad_gt_bboxes = paddle.zeros(
+            [batch_size, num_max_boxes, 4], dtype=gt_bboxes[0].dtype)
+        pad_gt_scores = paddle.zeros(
+            [batch_size, num_max_boxes, 1], dtype=gt_bboxes[0].dtype)
+        pad_gt_mask = paddle.zeros(
+            [batch_size, num_max_boxes, 1], dtype=gt_bboxes[0].dtype)
+        for i, (label, bbox) in enumerate(zip(gt_labels, gt_bboxes)):
+            if len(label) > 0 and len(bbox) > 0:
+                pad_gt_labels[i, :len(label)] = label
+                pad_gt_bboxes[i, :len(bbox)] = bbox
+                pad_gt_mask[i, :len(bbox)] = 1.
+                if gt_scores is not None:
+                    pad_gt_scores[i, :len(gt_scores[i])] = gt_scores[i]
+        if gt_scores is None:
+            pad_gt_scores = pad_gt_mask.clone()
+        return pad_gt_labels, pad_gt_bboxes, pad_gt_scores, pad_gt_mask
+    else:
+        raise ValueError('The input `gt_labels` or `gt_bboxes` is invalid! ')
+
+
+def gather_topk_anchors(metrics, topk, largest=True, topk_mask=None, eps=1e-9):
+    r"""
+    Args:
+        metrics (Tensor, float32): shape[B, n, L], n: num_gts, L: num_anchors
+        topk (int): The number of top elements to look for along the axis.
+        largest (bool) : largest is a flag, if set to true,
+            algorithm will sort by descending order, otherwise sort by
+            ascending order. Default: True
+        topk_mask (Tensor, bool|None): shape[B, n, topk], ignore bbox mask,
+            Default: None
+        eps (float): Default: 1e-9
+    Returns:
+        is_in_topk (Tensor, float32): shape[B, n, L], value=1. means selected
+    """
+    num_anchors = metrics.shape[-1]
+    topk_metrics, topk_idxs = paddle.topk(
+        metrics, topk, axis=-1, largest=largest)
+    if topk_mask is None:
+        topk_mask = (topk_metrics.max(axis=-1, keepdim=True) > eps).tile(
+            [1, 1, topk])
+    topk_idxs = paddle.where(topk_mask, topk_idxs,
+                             paddle.zeros_like(topk_idxs))
+    is_in_topk = F.one_hot(topk_idxs, num_anchors).sum(axis=-2)
+    is_in_topk = paddle.where(is_in_topk > 1,
+                              paddle.zeros_like(is_in_topk), is_in_topk)
+    return is_in_topk.astype(metrics.dtype)
+
+
+def check_points_inside_bboxes(points, bboxes, eps=1e-9):
+    r"""
+    Args:
+        points (Tensor, float32): shape[L, 2], "xy" format, L: num_anchors
+        bboxes (Tensor, float32): shape[B, n, 4], "xmin, ymin, xmax, ymax" format
+        eps (float): Default: 1e-9
+    Returns:
+        is_in_bboxes (Tensor, float32): shape[B, n, L], value=1. means selected
+    """
+    points = points.unsqueeze([0, 1])
+    x, y = points.chunk(2, axis=-1)
+    xmin, ymin, xmax, ymax = bboxes.unsqueeze(2).chunk(4, axis=-1)
+    l = x - xmin
+    t = y - ymin
+    r = xmax - x
+    b = ymax - y
+    bbox_ltrb = paddle.concat([l, t, r, b], axis=-1)
+    return (bbox_ltrb.min(axis=-1) > eps).astype(bboxes.dtype)
+
+
+def compute_max_iou_anchor(ious):
+    r"""
+    For each anchor, find the GT with the largest IOU.
+    Args:
+        ious (Tensor, float32): shape[B, n, L], n: num_gts, L: num_anchors
+    Returns:
+        is_max_iou (Tensor, float32): shape[B, n, L], value=1. means selected
+    """
+    num_max_boxes = ious.shape[-2]
+    max_iou_index = ious.argmax(axis=-2)
+    is_max_iou = F.one_hot(max_iou_index, num_max_boxes).transpose([0, 2, 1])
+    return is_max_iou.astype(ious.dtype)
+
+
+def compute_max_iou_gt(ious):
+    r"""
+    For each GT, find the anchor with the largest IOU.
+    Args:
+        ious (Tensor, float32): shape[B, n, L], n: num_gts, L: num_anchors
+    Returns:
+        is_max_iou (Tensor, float32): shape[B, n, L], value=1. means selected
+    """
+    num_anchors = ious.shape[-1]
+    max_iou_index = ious.argmax(axis=-1)
+    is_max_iou = F.one_hot(max_iou_index, num_anchors)
+    return is_max_iou.astype(ious.dtype)
+
+
+def generate_anchors_for_grid_cell(feats,
+                                   fpn_strides,
+                                   grid_cell_size=5.0,
+                                   grid_cell_offset=0.5):
+    r"""
+    Like ATSS, generate anchors based on grid size.
+    Args:
+        feats (List[Tensor]): shape[s, (b, c, h, w)]
+        fpn_strides (tuple|list): shape[s], stride for each scale feature
+        grid_cell_size (float): anchor size
+        grid_cell_offset (float): The range is between 0 and 1.
+    Returns:
+        anchors (List[Tensor]): shape[s, (l, 4)]
+        num_anchors_list (List[int]): shape[s]
+        stride_tensor_list (List[Tensor]): shape[s, (l, 1)]
+    """
+    assert len(feats) == len(fpn_strides)
+    anchors = []
+    num_anchors_list = []
+    stride_tensor_list = []
+    for feat, stride in zip(feats, fpn_strides):
+        _, _, h, w = feat.shape
+        cell_half_size = grid_cell_size * stride * 0.5
+        shift_x = (paddle.arange(end=w) + grid_cell_offset) * stride
+        shift_y = (paddle.arange(end=h) + grid_cell_offset) * stride
+        shift_y, shift_x = paddle.meshgrid(shift_y, shift_x)
+        anchor = paddle.stack(
+            [
+                shift_x - cell_half_size, shift_y - cell_half_size,
+                shift_x + cell_half_size, shift_y + cell_half_size
+            ],
+            axis=-1).astype(feat.dtype)
+        anchors.append(anchor.reshape([-1, 4]))
+        num_anchors_list.append(len(anchors[-1]))
+        stride_tensor_list.append(
+            paddle.full([num_anchors_list[-1], 1], stride))
+    return anchors, num_anchors_list, stride_tensor_list

+ 49 - 0
paddlers/models/ppdet/modeling/backbones/__init__.py

@@ -0,0 +1,49 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserve.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from . import vgg
+from . import resnet
+from . import darknet
+from . import mobilenet_v1
+from . import mobilenet_v3
+from . import hrnet
+from . import lite_hrnet
+from . import blazenet
+from . import ghostnet
+from . import senet
+from . import res2net
+from . import dla
+from . import shufflenet_v2
+from . import swin_transformer
+from . import lcnet
+from . import hardnet
+from . import esnet
+
+from .vgg import *
+from .resnet import *
+from .darknet import *
+from .mobilenet_v1 import *
+from .mobilenet_v3 import *
+from .hrnet import *
+from .lite_hrnet import *
+from .blazenet import *
+from .ghostnet import *
+from .senet import *
+from .res2net import *
+from .dla import *
+from .shufflenet_v2 import *
+from .swin_transformer import *
+from .lcnet import *
+from .hardnet import *
+from .esnet import *

+ 320 - 0
paddlers/models/ppdet/modeling/backbones/blazenet.py

@@ -0,0 +1,320 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserve.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import paddle
+import paddle.nn as nn
+import paddle.nn.functional as F
+from paddle import ParamAttr
+from paddle.nn.initializer import KaimingNormal
+from paddlers.models.ppdet.core.workspace import register, serializable
+from ..shape_spec import ShapeSpec
+
+__all__ = ['BlazeNet']
+
+
+def hard_swish(x):
+    return x * F.relu6(x + 3) / 6.
+
+
+class ConvBNLayer(nn.Layer):
+    def __init__(self,
+                 in_channels,
+                 out_channels,
+                 kernel_size,
+                 stride,
+                 padding,
+                 num_groups=1,
+                 act='relu',
+                 conv_lr=0.1,
+                 conv_decay=0.,
+                 norm_decay=0.,
+                 norm_type='bn',
+                 name=None):
+        super(ConvBNLayer, self).__init__()
+        self.act = act
+        self._conv = nn.Conv2D(
+            in_channels,
+            out_channels,
+            kernel_size=kernel_size,
+            stride=stride,
+            padding=padding,
+            groups=num_groups,
+            weight_attr=ParamAttr(
+                learning_rate=conv_lr, initializer=KaimingNormal()),
+            bias_attr=False)
+
+        if norm_type in ['bn', 'sync_bn']:
+            self._batch_norm = nn.BatchNorm2D(out_channels)
+
+    def forward(self, x):
+        x = self._conv(x)
+        x = self._batch_norm(x)
+        if self.act == "relu":
+            x = F.relu(x)
+        elif self.act == "relu6":
+            x = F.relu6(x)
+        elif self.act == 'leaky':
+            x = F.leaky_relu(x)
+        elif self.act == 'hard_swish':
+            x = hard_swish(x)
+        return x
+
+
+class BlazeBlock(nn.Layer):
+    def __init__(self,
+                 in_channels,
+                 out_channels1,
+                 out_channels2,
+                 double_channels=None,
+                 stride=1,
+                 use_5x5kernel=True,
+                 act='relu',
+                 name=None):
+        super(BlazeBlock, self).__init__()
+        assert stride in [1, 2]
+        self.use_pool = not stride == 1
+        self.use_double_block = double_channels is not None
+        self.conv_dw = []
+        if use_5x5kernel:
+            self.conv_dw.append(
+                self.add_sublayer(
+                    name + "1_dw",
+                    ConvBNLayer(
+                        in_channels=in_channels,
+                        out_channels=out_channels1,
+                        kernel_size=5,
+                        stride=stride,
+                        padding=2,
+                        num_groups=out_channels1,
+                        name=name + "1_dw")))
+        else:
+            self.conv_dw.append(
+                self.add_sublayer(
+                    name + "1_dw_1",
+                    ConvBNLayer(
+                        in_channels=in_channels,
+                        out_channels=out_channels1,
+                        kernel_size=3,
+                        stride=1,
+                        padding=1,
+                        num_groups=out_channels1,
+                        name=name + "1_dw_1")))
+            self.conv_dw.append(
+                self.add_sublayer(
+                    name + "1_dw_2",
+                    ConvBNLayer(
+                        in_channels=out_channels1,
+                        out_channels=out_channels1,
+                        kernel_size=3,
+                        stride=stride,
+                        padding=1,
+                        num_groups=out_channels1,
+                        name=name + "1_dw_2")))
+        self.act = act if self.use_double_block else None
+        self.conv_pw = ConvBNLayer(
+            in_channels=out_channels1,
+            out_channels=out_channels2,
+            kernel_size=1,
+            stride=1,
+            padding=0,
+            act=self.act,
+            name=name + "1_sep")
+        if self.use_double_block:
+            self.conv_dw2 = []
+            if use_5x5kernel:
+                self.conv_dw2.append(
+                    self.add_sublayer(
+                        name + "2_dw",
+                        ConvBNLayer(
+                            in_channels=out_channels2,
+                            out_channels=out_channels2,
+                            kernel_size=5,
+                            stride=1,
+                            padding=2,
+                            num_groups=out_channels2,
+                            name=name + "2_dw")))
+            else:
+                self.conv_dw2.append(
+                    self.add_sublayer(
+                        name + "2_dw_1",
+                        ConvBNLayer(
+                            in_channels=out_channels2,
+                            out_channels=out_channels2,
+                            kernel_size=3,
+                            stride=1,
+                            padding=1,
+                            num_groups=out_channels2,
+                            name=name + "1_dw_1")))
+                self.conv_dw2.append(
+                    self.add_sublayer(
+                        name + "2_dw_2",
+                        ConvBNLayer(
+                            in_channels=out_channels2,
+                            out_channels=out_channels2,
+                            kernel_size=3,
+                            stride=1,
+                            padding=1,
+                            num_groups=out_channels2,
+                            name=name + "2_dw_2")))
+            self.conv_pw2 = ConvBNLayer(
+                in_channels=out_channels2,
+                out_channels=double_channels,
+                kernel_size=1,
+                stride=1,
+                padding=0,
+                name=name + "2_sep")
+        # shortcut
+        if self.use_pool:
+            shortcut_channel = double_channels or out_channels2
+            self._shortcut = []
+            self._shortcut.append(
+                self.add_sublayer(
+                    name + '_shortcut_pool',
+                    nn.MaxPool2D(
+                        kernel_size=stride, stride=stride, ceil_mode=True)))
+            self._shortcut.append(
+                self.add_sublayer(
+                    name + '_shortcut_conv',
+                    ConvBNLayer(
+                        in_channels=in_channels,
+                        out_channels=shortcut_channel,
+                        kernel_size=1,
+                        stride=1,
+                        padding=0,
+                        name="shortcut" + name)))
+
+    def forward(self, x):
+        y = x
+        for conv_dw_block in self.conv_dw:
+            y = conv_dw_block(y)
+        y = self.conv_pw(y)
+        if self.use_double_block:
+            for conv_dw2_block in self.conv_dw2:
+                y = conv_dw2_block(y)
+            y = self.conv_pw2(y)
+        if self.use_pool:
+            for shortcut in self._shortcut:
+                x = shortcut(x)
+        return F.relu(paddle.add(x, y))
+
+
+@register
+@serializable
+class BlazeNet(nn.Layer):
+    """
+    BlazeFace, see https://arxiv.org/abs/1907.05047
+
+    Args:
+        blaze_filters (list): number of filter for each blaze block.
+        double_blaze_filters (list): number of filter for each double_blaze block.
+        use_5x5kernel (bool): whether or not filter size is 5x5 in depth-wise conv.
+    """
+
+    def __init__(self,
+                 blaze_filters=[[24, 24], [24, 24], [24, 48, 2], [48, 48],
+                                [48, 48]],
+                 double_blaze_filters=[[48, 24, 96, 2], [96, 24, 96],
+                                       [96, 24, 96], [96, 24, 96, 2],
+                                       [96, 24, 96], [96, 24, 96]],
+                 use_5x5kernel=True,
+                 act=None):
+        super(BlazeNet, self).__init__()
+        conv1_num_filters = blaze_filters[0][0]
+        self.conv1 = ConvBNLayer(
+            in_channels=3,
+            out_channels=conv1_num_filters,
+            kernel_size=3,
+            stride=2,
+            padding=1,
+            name="conv1")
+        in_channels = conv1_num_filters
+        self.blaze_block = []
+        self._out_channels = []
+        for k, v in enumerate(blaze_filters):
+            assert len(v) in [2, 3], \
+                "blaze_filters {} not in [2, 3]"
+            if len(v) == 2:
+                self.blaze_block.append(
+                    self.add_sublayer(
+                        'blaze_{}'.format(k),
+                        BlazeBlock(
+                            in_channels,
+                            v[0],
+                            v[1],
+                            use_5x5kernel=use_5x5kernel,
+                            act=act,
+                            name='blaze_{}'.format(k))))
+            elif len(v) == 3:
+                self.blaze_block.append(
+                    self.add_sublayer(
+                        'blaze_{}'.format(k),
+                        BlazeBlock(
+                            in_channels,
+                            v[0],
+                            v[1],
+                            stride=v[2],
+                            use_5x5kernel=use_5x5kernel,
+                            act=act,
+                            name='blaze_{}'.format(k))))
+            in_channels = v[1]
+
+        for k, v in enumerate(double_blaze_filters):
+            assert len(v) in [3, 4], \
+                "blaze_filters {} not in [3, 4]"
+            if len(v) == 3:
+                self.blaze_block.append(
+                    self.add_sublayer(
+                        'double_blaze_{}'.format(k),
+                        BlazeBlock(
+                            in_channels,
+                            v[0],
+                            v[1],
+                            double_channels=v[2],
+                            use_5x5kernel=use_5x5kernel,
+                            act=act,
+                            name='double_blaze_{}'.format(k))))
+            elif len(v) == 4:
+                self.blaze_block.append(
+                    self.add_sublayer(
+                        'double_blaze_{}'.format(k),
+                        BlazeBlock(
+                            in_channels,
+                            v[0],
+                            v[1],
+                            double_channels=v[2],
+                            stride=v[3],
+                            use_5x5kernel=use_5x5kernel,
+                            act=act,
+                            name='double_blaze_{}'.format(k))))
+            in_channels = v[2]
+            self._out_channels.append(in_channels)
+
+    def forward(self, inputs):
+        outs = []
+        y = self.conv1(inputs['image'])
+        for block in self.blaze_block:
+            y = block(y)
+            outs.append(y)
+        return [outs[-4], outs[-1]]
+
+    @property
+    def out_shape(self):
+        return [
+            ShapeSpec(channels=c)
+            for c in [self._out_channels[-4], self._out_channels[-1]]
+        ]

+ 340 - 0
paddlers/models/ppdet/modeling/backbones/darknet.py

@@ -0,0 +1,340 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle
+import paddle.nn as nn
+import paddle.nn.functional as F
+
+from paddlers.models.ppdet.core.workspace import register, serializable
+from paddlers.models.ppdet.modeling.ops import batch_norm, mish
+from ..shape_spec import ShapeSpec
+
+__all__ = ['DarkNet', 'ConvBNLayer']
+
+
+class ConvBNLayer(nn.Layer):
+    def __init__(self,
+                 ch_in,
+                 ch_out,
+                 filter_size=3,
+                 stride=1,
+                 groups=1,
+                 padding=0,
+                 norm_type='bn',
+                 norm_decay=0.,
+                 act="leaky",
+                 freeze_norm=False,
+                 data_format='NCHW',
+                 name=''):
+        """
+        conv + bn + activation layer
+
+        Args:
+            ch_in (int): input channel
+            ch_out (int): output channel
+            filter_size (int): filter size, default 3
+            stride (int): stride, default 1
+            groups (int): number of groups of conv layer, default 1
+            padding (int): padding size, default 0
+            norm_type (str): batch norm type, default bn
+            norm_decay (str): decay for weight and bias of batch norm layer, default 0.
+            act (str): activation function type, default 'leaky', which means leaky_relu
+            freeze_norm (bool): whether to freeze norm, default False
+            data_format (str): data format, NCHW or NHWC
+        """
+        super(ConvBNLayer, self).__init__()
+
+        self.conv = nn.Conv2D(
+            in_channels=ch_in,
+            out_channels=ch_out,
+            kernel_size=filter_size,
+            stride=stride,
+            padding=padding,
+            groups=groups,
+            data_format=data_format,
+            bias_attr=False)
+        self.batch_norm = batch_norm(
+            ch_out,
+            norm_type=norm_type,
+            norm_decay=norm_decay,
+            freeze_norm=freeze_norm,
+            data_format=data_format)
+        self.act = act
+
+    def forward(self, inputs):
+        out = self.conv(inputs)
+        out = self.batch_norm(out)
+        if self.act == 'leaky':
+            out = F.leaky_relu(out, 0.1)
+        elif self.act == 'mish':
+            out = mish(out)
+        return out
+
+
+class DownSample(nn.Layer):
+    def __init__(self,
+                 ch_in,
+                 ch_out,
+                 filter_size=3,
+                 stride=2,
+                 padding=1,
+                 norm_type='bn',
+                 norm_decay=0.,
+                 freeze_norm=False,
+                 data_format='NCHW'):
+        """
+        downsample layer
+
+        Args:
+            ch_in (int): input channel
+            ch_out (int): output channel
+            filter_size (int): filter size, default 3
+            stride (int): stride, default 2
+            padding (int): padding size, default 1
+            norm_type (str): batch norm type, default bn
+            norm_decay (str): decay for weight and bias of batch norm layer, default 0.
+            freeze_norm (bool): whether to freeze norm, default False
+            data_format (str): data format, NCHW or NHWC
+        """
+
+        super(DownSample, self).__init__()
+
+        self.conv_bn_layer = ConvBNLayer(
+            ch_in=ch_in,
+            ch_out=ch_out,
+            filter_size=filter_size,
+            stride=stride,
+            padding=padding,
+            norm_type=norm_type,
+            norm_decay=norm_decay,
+            freeze_norm=freeze_norm,
+            data_format=data_format)
+        self.ch_out = ch_out
+
+    def forward(self, inputs):
+        out = self.conv_bn_layer(inputs)
+        return out
+
+
+class BasicBlock(nn.Layer):
+    def __init__(self,
+                 ch_in,
+                 ch_out,
+                 norm_type='bn',
+                 norm_decay=0.,
+                 freeze_norm=False,
+                 data_format='NCHW'):
+        """
+        BasicBlock layer of DarkNet
+
+        Args:
+            ch_in (int): input channel
+            ch_out (int): output channel
+            norm_type (str): batch norm type, default bn
+            norm_decay (str): decay for weight and bias of batch norm layer, default 0.
+            freeze_norm (bool): whether to freeze norm, default False
+            data_format (str): data format, NCHW or NHWC
+        """
+
+        super(BasicBlock, self).__init__()
+
+        self.conv1 = ConvBNLayer(
+            ch_in=ch_in,
+            ch_out=ch_out,
+            filter_size=1,
+            stride=1,
+            padding=0,
+            norm_type=norm_type,
+            norm_decay=norm_decay,
+            freeze_norm=freeze_norm,
+            data_format=data_format)
+        self.conv2 = ConvBNLayer(
+            ch_in=ch_out,
+            ch_out=ch_out * 2,
+            filter_size=3,
+            stride=1,
+            padding=1,
+            norm_type=norm_type,
+            norm_decay=norm_decay,
+            freeze_norm=freeze_norm,
+            data_format=data_format)
+
+    def forward(self, inputs):
+        conv1 = self.conv1(inputs)
+        conv2 = self.conv2(conv1)
+        out = paddle.add(x=inputs, y=conv2)
+        return out
+
+
+class Blocks(nn.Layer):
+    def __init__(self,
+                 ch_in,
+                 ch_out,
+                 count,
+                 norm_type='bn',
+                 norm_decay=0.,
+                 freeze_norm=False,
+                 name=None,
+                 data_format='NCHW'):
+        """
+        Blocks layer, which consist of some BaickBlock layers
+
+        Args:
+            ch_in (int): input channel
+            ch_out (int): output channel
+            count (int): number of BasicBlock layer
+            norm_type (str): batch norm type, default bn
+            norm_decay (str): decay for weight and bias of batch norm layer, default 0.
+            freeze_norm (bool): whether to freeze norm, default False
+            name (str): layer name
+            data_format (str): data format, NCHW or NHWC
+        """
+        super(Blocks, self).__init__()
+
+        self.basicblock0 = BasicBlock(
+            ch_in,
+            ch_out,
+            norm_type=norm_type,
+            norm_decay=norm_decay,
+            freeze_norm=freeze_norm,
+            data_format=data_format)
+        self.res_out_list = []
+        for i in range(1, count):
+            block_name = '{}.{}'.format(name, i)
+            res_out = self.add_sublayer(
+                block_name,
+                BasicBlock(
+                    ch_out * 2,
+                    ch_out,
+                    norm_type=norm_type,
+                    norm_decay=norm_decay,
+                    freeze_norm=freeze_norm,
+                    data_format=data_format))
+            self.res_out_list.append(res_out)
+        self.ch_out = ch_out
+
+    def forward(self, inputs):
+        y = self.basicblock0(inputs)
+        for basic_block_i in self.res_out_list:
+            y = basic_block_i(y)
+        return y
+
+
+# Maps a network depth to its per-stage block counts. DarkNet slices this
+# list with `num_stages` and passes each entry to `Blocks` as `count`.
+# The parentheses around the list are redundant: the value is a plain list.
+DarkNet_cfg = {53: ([1, 2, 8, 8, 4])}
+
+
+@register
+@serializable
+class DarkNet(nn.Layer):
+    __shared__ = ['norm_type', 'data_format']
+
+    def __init__(self,
+                 depth=53,
+                 freeze_at=-1,
+                 return_idx=[2, 3, 4],
+                 num_stages=5,
+                 norm_type='bn',
+                 norm_decay=0.,
+                 freeze_norm=False,
+                 data_format='NCHW'):
+        """
+        Darknet, see https://pjreddie.com/darknet/yolo/
+
+        Args:
+            depth (int): depth of network
+            freeze_at (int): freeze the backbone at which stage
+            filter_size (int): filter size, default 3
+            return_idx (list): index of stages whose feature maps are returned
+            norm_type (str): batch norm type, default bn
+            norm_decay (str): decay for weight and bias of batch norm layer, default 0.
+            data_format (str): data format, NCHW or NHWC
+        """
+        super(DarkNet, self).__init__()
+        self.depth = depth
+        self.freeze_at = freeze_at
+        self.return_idx = return_idx
+        self.num_stages = num_stages
+        self.stages = DarkNet_cfg[self.depth][0:num_stages]
+
+        self.conv0 = ConvBNLayer(
+            ch_in=3,
+            ch_out=32,
+            filter_size=3,
+            stride=1,
+            padding=1,
+            norm_type=norm_type,
+            norm_decay=norm_decay,
+            freeze_norm=freeze_norm,
+            data_format=data_format)
+
+        self.downsample0 = DownSample(
+            ch_in=32,
+            ch_out=32 * 2,
+            norm_type=norm_type,
+            norm_decay=norm_decay,
+            freeze_norm=freeze_norm,
+            data_format=data_format)
+
+        self._out_channels = []
+        self.darknet_conv_block_list = []
+        self.downsample_list = []
+        ch_in = [64, 128, 256, 512, 1024]
+        for i, stage in enumerate(self.stages):
+            name = 'stage.{}'.format(i)
+            conv_block = self.add_sublayer(
+                name,
+                Blocks(
+                    int(ch_in[i]),
+                    32 * (2**i),
+                    stage,
+                    norm_type=norm_type,
+                    norm_decay=norm_decay,
+                    freeze_norm=freeze_norm,
+                    data_format=data_format,
+                    name=name))
+            self.darknet_conv_block_list.append(conv_block)
+            if i in return_idx:
+                self._out_channels.append(64 * (2**i))
+        for i in range(num_stages - 1):
+            down_name = 'stage.{}.downsample'.format(i)
+            downsample = self.add_sublayer(
+                down_name,
+                DownSample(
+                    ch_in=32 * (2**(i + 1)),
+                    ch_out=32 * (2**(i + 2)),
+                    norm_type=norm_type,
+                    norm_decay=norm_decay,
+                    freeze_norm=freeze_norm,
+                    data_format=data_format))
+            self.downsample_list.append(downsample)
+
+    def forward(self, inputs):
+        x = inputs['image']
+
+        out = self.conv0(x)
+        out = self.downsample0(out)
+        blocks = []
+        for i, conv_block_i in enumerate(self.darknet_conv_block_list):
+            out = conv_block_i(out)
+            if i == self.freeze_at:
+                out.stop_gradient = True
+            if i in self.return_idx:
+                blocks.append(out)
+            if i < self.num_stages - 1:
+                out = self.downsample_list[i](out)
+        return blocks
+
+    @property
+    def out_shape(self):
+        return [ShapeSpec(channels=c) for c in self._out_channels]

+ 244 - 0
paddlers/models/ppdet/modeling/backbones/dla.py

@@ -0,0 +1,244 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle
+import paddle.nn as nn
+import paddle.nn.functional as F
+from paddlers.models.ppdet.core.workspace import register, serializable
+from paddlers.models.ppdet.modeling.layers import ConvNormLayer
+from ..shape_spec import ShapeSpec
+
+DLA_cfg = {34: ([1, 1, 1, 2, 2, 1], [16, 32, 64, 128, 256, 512])}
+
+
+class BasicBlock(nn.Layer):
+    """
+    Residual block made of two 3x3 ConvNormLayer units.
+
+    Args:
+        ch_in (int): input channels of the first conv.
+        ch_out (int): output channels of both convs.
+        stride (int): stride of the first conv; the second conv uses stride 1.
+    """
+
+    def __init__(self, ch_in, ch_out, stride=1):
+        super(BasicBlock, self).__init__()
+        self.conv1 = ConvNormLayer(
+            ch_in,
+            ch_out,
+            filter_size=3,
+            stride=stride,
+            bias_on=False,
+            norm_decay=None)
+        self.conv2 = ConvNormLayer(
+            ch_out,
+            ch_out,
+            filter_size=3,
+            stride=1,
+            bias_on=False,
+            norm_decay=None)
+
+    def forward(self, inputs, residual=None):
+        """
+        Apply conv1 -> ReLU -> conv2, add the residual, then ReLU.
+
+        Args:
+            inputs (Tensor): block input.
+            residual (Tensor|None): tensor added after conv2; when None the
+                block input itself is used.
+        """
+        if residual is None:
+            residual = inputs
+
+        out = self.conv1(inputs)
+        out = F.relu(out)
+
+        out = self.conv2(out)
+
+        out = paddle.add(x=out, y=residual)
+        out = F.relu(out)
+
+        return out
+
+
+class Root(nn.Layer):
+    """
+    Aggregation node: concatenates its inputs on the channel axis and fuses
+    them with a 1x1 ConvNormLayer.
+
+    Args:
+        ch_in (int): total channels of the concatenated inputs.
+        ch_out (int): output channels.
+        kernel_size (int): accepted but not used in this class; the conv is
+            always built with ``filter_size=1``.
+        residual (bool): whether to add the first input to the conv output.
+    """
+
+    def __init__(self, ch_in, ch_out, kernel_size, residual):
+        super(Root, self).__init__()
+        self.conv = ConvNormLayer(
+            ch_in,
+            ch_out,
+            filter_size=1,
+            stride=1,
+            bias_on=False,
+            norm_decay=None)
+        self.residual = residual
+
+    def forward(self, inputs):
+        """
+        Args:
+            inputs (list[Tensor]): tensors to aggregate, concatenated on axis 1.
+
+        Returns:
+            Tensor: ReLU of the fused result (plus ``inputs[0]`` when
+                ``self.residual`` is set).
+        """
+        children = inputs
+        out = self.conv(paddle.concat(inputs, axis=1))
+        if self.residual:
+            out = paddle.add(x=out, y=children[0])
+        out = F.relu(out)
+
+        return out
+
+
+class Tree(nn.Layer):
+    """
+    Recursive aggregation tree used by DLA.
+
+    A level-1 tree holds two ``block`` layers and a ``Root`` that fuses their
+    outputs. A deeper tree holds two sub-trees of ``level - 1`` and has no
+    root of its own; it forwards the collected ``children`` to its second
+    sub-tree.
+
+    Args:
+        level (int): depth of the tree; 1 builds the leaf form.
+        block (callable): layer class called as ``block(ch_in, ch_out, stride)``.
+        ch_in (int): input channels.
+        ch_out (int): output channels.
+        stride (int): stride of the first sub-tree/block; when > 1 a max-pool
+            with the same stride is also created for the bypass path.
+        level_root (bool): whether the (downsampled) input is appended to the
+            tensors aggregated by the root.
+        root_dim (int): input channels of the root; 0 means ``2 * ch_out``.
+        root_kernel_size (int): forwarded to ``Root`` and to the sub-trees.
+        root_residual (bool): forwarded to ``Root`` and to the sub-trees.
+    """
+
+    def __init__(self,
+                 level,
+                 block,
+                 ch_in,
+                 ch_out,
+                 stride=1,
+                 level_root=False,
+                 root_dim=0,
+                 root_kernel_size=1,
+                 root_residual=False):
+        super(Tree, self).__init__()
+        # Default root input: the outputs of the two sub-trees.
+        if root_dim == 0:
+            root_dim = 2 * ch_out
+        # With level_root the input (ch_in channels) is aggregated as well.
+        if level_root:
+            root_dim += ch_in
+        if level == 1:
+            self.tree1 = block(ch_in, ch_out, stride)
+            self.tree2 = block(ch_out, ch_out, 1)
+        else:
+            self.tree1 = Tree(
+                level - 1,
+                block,
+                ch_in,
+                ch_out,
+                stride,
+                root_dim=0,
+                root_kernel_size=root_kernel_size,
+                root_residual=root_residual)
+            # tree2 additionally receives tree1's output as a child, hence
+            # the extra ch_out channels in its root_dim.
+            self.tree2 = Tree(
+                level - 1,
+                block,
+                ch_out,
+                ch_out,
+                1,
+                root_dim=root_dim + ch_out,
+                root_kernel_size=root_kernel_size,
+                root_residual=root_residual)
+
+        # Only leaf trees own a root.
+        if level == 1:
+            self.root = Root(root_dim, ch_out, root_kernel_size, root_residual)
+        self.level_root = level_root
+        self.root_dim = root_dim
+        self.downsample = None
+        self.project = None
+        self.level = level
+        if stride > 1:
+            self.downsample = nn.MaxPool2D(stride, stride=stride)
+        # 1x1 projection so the bypass tensor matches ch_out.
+        if ch_in != ch_out:
+            self.project = ConvNormLayer(
+                ch_in,
+                ch_out,
+                filter_size=1,
+                stride=1,
+                bias_on=False,
+                norm_decay=None)
+
+    def forward(self, x, residual=None, children=None):
+        """
+        Args:
+            x (Tensor): input feature map.
+            residual (Tensor|None): ignored; the residual is always recomputed
+                from ``x`` below.
+            children (list[Tensor]|None): tensors collected by enclosing trees
+                for aggregation; this list is appended to in place.
+
+        Returns:
+            Tensor: output of the root (level 1) or of the second sub-tree.
+        """
+        children = [] if children is None else children
+        bottom = self.downsample(x) if self.downsample else x
+        residual = self.project(bottom) if self.project else bottom
+        if self.level_root:
+            children.append(bottom)
+        x1 = self.tree1(x, residual)
+        if self.level == 1:
+            x2 = self.tree2(x1)
+            x = self.root([x2, x1] + children)
+        else:
+            children.append(x1)
+            x = self.tree2(x1, children=children)
+        return x
+
+
+@register
+@serializable
+class DLA(nn.Layer):
+    """
+    DLA, see https://arxiv.org/pdf/1707.06484.pdf
+
+    Args:
+        depth (int): DLA depth, should be 34.
+        residual_root (bool): whether to use a residual connection in the root block
+
+    """
+
+    def __init__(self, depth=34, residual_root=False):
+        super(DLA, self).__init__()
+        # DLA_cfg only has an entry for depth 34; other values raise KeyError here.
+        levels, channels = DLA_cfg[depth]
+        if depth == 34:
+            block = BasicBlock
+        self.channels = channels
+        # Stem: 7x7 conv with stride 1 followed by ReLU.
+        self.base_layer = nn.Sequential(
+            ConvNormLayer(
+                3,
+                channels[0],
+                filter_size=7,
+                stride=1,
+                bias_on=False,
+                norm_decay=None),
+            nn.ReLU())
+        # level0 and level1 are plain conv stacks; level2..level5 are trees.
+        self.level0 = self._make_conv_level(channels[0], channels[0],
+                                            levels[0])
+        self.level1 = self._make_conv_level(
+            channels[0], channels[1], levels[1], stride=2)
+        self.level2 = Tree(
+            levels[2],
+            block,
+            channels[1],
+            channels[2],
+            2,
+            level_root=False,
+            root_residual=residual_root)
+        self.level3 = Tree(
+            levels[3],
+            block,
+            channels[2],
+            channels[3],
+            2,
+            level_root=True,
+            root_residual=residual_root)
+        self.level4 = Tree(
+            levels[4],
+            block,
+            channels[3],
+            channels[4],
+            2,
+            level_root=True,
+            root_residual=residual_root)
+        self.level5 = Tree(
+            levels[5],
+            block,
+            channels[4],
+            channels[5],
+            2,
+            level_root=True,
+            root_residual=residual_root)
+
+    def _make_conv_level(self, ch_in, ch_out, conv_num, stride=1):
+        """
+        Build ``conv_num`` 3x3 ConvNormLayer + ReLU pairs as one Sequential.
+
+        Only the first conv uses ``stride``; the following ones use stride 1
+        and take ``ch_out`` input channels.
+        """
+        modules = []
+        for i in range(conv_num):
+            modules.extend([
+                ConvNormLayer(
+                    ch_in,
+                    ch_out,
+                    filter_size=3,
+                    stride=stride if i == 0 else 1,
+                    bias_on=False,
+                    norm_decay=None), nn.ReLU()
+            ])
+            ch_in = ch_out
+        return nn.Sequential(*modules)
+
+    @property
+    def out_shape(self):
+        """Return one ShapeSpec per level (level0..level5)."""
+        return [ShapeSpec(channels=self.channels[i]) for i in range(6)]
+
+    def forward(self, inputs):
+        """
+        Args:
+            inputs (dict): input batch; only ``inputs['image']`` is read here.
+
+        Returns:
+            list: outputs of level0 through level5, in that order.
+        """
+        outs = []
+        im = inputs['image']
+        feats = self.base_layer(im)
+        for i in range(6):
+            feats = getattr(self, 'level{}'.format(i))(feats)
+            outs.append(feats)
+
+        return outs

+ 290 - 0
paddlers/models/ppdet/modeling/backbones/esnet.py

@@ -0,0 +1,290 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import paddle
+import paddle.nn as nn
+import paddle.nn.functional as F
+from paddle import ParamAttr
+from paddle.nn import Conv2D, MaxPool2D, AdaptiveAvgPool2D
+from paddle.nn.initializer import KaimingNormal
+from paddle.regularizer import L2Decay
+
+from paddlers.models.ppdet.core.workspace import register, serializable
+from numbers import Integral
+from ..shape_spec import ShapeSpec
+from paddlers.models.ppdet.modeling.ops import channel_shuffle
+from paddlers.models.ppdet.modeling.backbones.shufflenet_v2 import ConvBNLayer
+
+__all__ = ['ESNet']
+
+
+def make_divisible(v, divisor=16, min_value=None):
+    """
+    Round ``v`` to a nearby multiple of ``divisor``.
+
+    Args:
+        v (int|float): value to round.
+        divisor (int): the result is built from multiples of this value.
+        min_value (int|None): lower bound of the result; defaults to ``divisor``.
+
+    Returns:
+        int: the rounded value, never below ``min_value``; one extra
+            ``divisor`` is added when rounding would lose more than 10% of ``v``.
+    """
+    if min_value is None:
+        min_value = divisor
+    # Adding divisor / 2 before the floor division rounds to the nearest multiple.
+    new_v = max(min_value, int(v + divisor / 2) // divisor * divisor)
+    if new_v < 0.9 * v:
+        new_v += divisor
+    return new_v
+
+
+class SEModule(nn.Layer):
+    """
+    Squeeze-and-excitation style channel gate.
+
+    Args:
+        channel (int): number of input/output channels.
+        reduction (int): the hidden 1x1 conv has ``channel // reduction`` channels.
+    """
+
+    def __init__(self, channel, reduction=4):
+        super(SEModule, self).__init__()
+        self.avg_pool = AdaptiveAvgPool2D(1)
+        self.conv1 = Conv2D(
+            in_channels=channel,
+            out_channels=channel // reduction,
+            kernel_size=1,
+            stride=1,
+            padding=0,
+            weight_attr=ParamAttr(),
+            bias_attr=ParamAttr())
+        self.conv2 = Conv2D(
+            in_channels=channel // reduction,
+            out_channels=channel,
+            kernel_size=1,
+            stride=1,
+            padding=0,
+            weight_attr=ParamAttr(),
+            bias_attr=ParamAttr())
+
+    def forward(self, inputs):
+        """Scale ``inputs`` by a gate computed as pool -> conv1 -> ReLU -> conv2 -> hardsigmoid."""
+        outputs = self.avg_pool(inputs)
+        outputs = self.conv1(outputs)
+        outputs = F.relu(outputs)
+        outputs = self.conv2(outputs)
+        outputs = F.hardsigmoid(outputs)
+        return paddle.multiply(x=inputs, y=outputs)
+
+
+class InvertedResidual(nn.Layer):
+    """
+    ESNet unit that processes one half of the channels and passes the other
+    half through unchanged.
+
+    Args:
+        in_channels (int): input channels; half of them enter the conv branch.
+        mid_channels (int): width of the branch after concatenating the
+            pointwise and depthwise outputs (each ``mid_channels // 2``).
+        out_channels (int): output channels; the branch produces
+            ``out_channels // 2`` of them.
+        stride (int): stride of the depthwise conv. Its output is concatenated
+            with its own input in ``forward``, so the spatial size presumably
+            has to stay the same (ESNet passes 1) -- TODO confirm.
+        act (str): activation for the pointwise and linear convs.
+    """
+
+    def __init__(self,
+                 in_channels,
+                 mid_channels,
+                 out_channels,
+                 stride,
+                 act="relu"):
+        super(InvertedResidual, self).__init__()
+        self._conv_pw = ConvBNLayer(
+            in_channels=in_channels // 2,
+            out_channels=mid_channels // 2,
+            kernel_size=1,
+            stride=1,
+            padding=0,
+            groups=1,
+            act=act)
+        self._conv_dw = ConvBNLayer(
+            in_channels=mid_channels // 2,
+            out_channels=mid_channels // 2,
+            kernel_size=3,
+            stride=stride,
+            padding=1,
+            groups=mid_channels // 2,
+            act=None)
+        self._se = SEModule(mid_channels)
+
+        self._conv_linear = ConvBNLayer(
+            in_channels=mid_channels,
+            out_channels=out_channels // 2,
+            kernel_size=1,
+            stride=1,
+            padding=0,
+            groups=1,
+            act=act)
+
+    def forward(self, inputs):
+        """Split channels in half, transform the second half, re-join and shuffle."""
+        x1, x2 = paddle.split(
+            inputs,
+            num_or_sections=[inputs.shape[1] // 2, inputs.shape[1] // 2],
+            axis=1)
+        x2 = self._conv_pw(x2)
+        x3 = self._conv_dw(x2)
+        # Pointwise and depthwise outputs are both kept and fed to the SE gate.
+        x3 = paddle.concat([x2, x3], axis=1)
+        x3 = self._se(x3)
+        x3 = self._conv_linear(x3)
+        out = paddle.concat([x1, x3], axis=1)
+        return channel_shuffle(out, 2)
+
+
+class InvertedResidualDS(nn.Layer):
+    """
+    ESNet unit with two branches that both consume the full input.
+
+    Branch 1 is depthwise -> 1x1 linear; branch 2 is 1x1 pointwise ->
+    depthwise -> SE -> 1x1 linear. Each branch yields ``out_channels // 2``
+    channels; the concatenation is refined by a depthwise and a pointwise
+    conv with hard_swish.
+
+    Args:
+        in_channels (int): input channels.
+        mid_channels (int): branch 2 works on ``mid_channels // 2`` channels.
+        out_channels (int): output channels.
+        stride (int): stride of the depthwise conv in both branches.
+        act (str): activation for the pointwise/linear convs of the branches.
+    """
+
+    def __init__(self,
+                 in_channels,
+                 mid_channels,
+                 out_channels,
+                 stride,
+                 act="relu"):
+        super(InvertedResidualDS, self).__init__()
+
+        # branch1
+        self._conv_dw_1 = ConvBNLayer(
+            in_channels=in_channels,
+            out_channels=in_channels,
+            kernel_size=3,
+            stride=stride,
+            padding=1,
+            groups=in_channels,
+            act=None)
+        self._conv_linear_1 = ConvBNLayer(
+            in_channels=in_channels,
+            out_channels=out_channels // 2,
+            kernel_size=1,
+            stride=1,
+            padding=0,
+            groups=1,
+            act=act)
+        # branch2
+        self._conv_pw_2 = ConvBNLayer(
+            in_channels=in_channels,
+            out_channels=mid_channels // 2,
+            kernel_size=1,
+            stride=1,
+            padding=0,
+            groups=1,
+            act=act)
+        self._conv_dw_2 = ConvBNLayer(
+            in_channels=mid_channels // 2,
+            out_channels=mid_channels // 2,
+            kernel_size=3,
+            stride=stride,
+            padding=1,
+            groups=mid_channels // 2,
+            act=None)
+        self._se = SEModule(mid_channels // 2)
+        self._conv_linear_2 = ConvBNLayer(
+            in_channels=mid_channels // 2,
+            out_channels=out_channels // 2,
+            kernel_size=1,
+            stride=1,
+            padding=0,
+            groups=1,
+            act=act)
+        # Fusion convs applied to the concatenated branches.
+        self._conv_dw_mv1 = ConvBNLayer(
+            in_channels=out_channels,
+            out_channels=out_channels,
+            kernel_size=3,
+            stride=1,
+            padding=1,
+            groups=out_channels,
+            act="hard_swish")
+        self._conv_pw_mv1 = ConvBNLayer(
+            in_channels=out_channels,
+            out_channels=out_channels,
+            kernel_size=1,
+            stride=1,
+            padding=0,
+            groups=1,
+            act="hard_swish")
+
+    def forward(self, inputs):
+        """Run both branches on ``inputs``, concatenate on axis 1 and fuse."""
+        x1 = self._conv_dw_1(inputs)
+        x1 = self._conv_linear_1(x1)
+        x2 = self._conv_pw_2(inputs)
+        x2 = self._conv_dw_2(x2)
+        x2 = self._se(x2)
+        x2 = self._conv_linear_2(x2)
+        out = paddle.concat([x1, x2], axis=1)
+        out = self._conv_dw_mv1(out)
+        out = self._conv_pw_mv1(out)
+
+        return out
+
+
+@register
+@serializable
+class ESNet(nn.Layer):
+    """
+    ESNet backbone built from InvertedResidualDS / InvertedResidual blocks.
+
+    Args:
+        scale (float): multiplier applied to the 128/256/512 stage widths
+            (rounded with ``make_divisible``).
+        act (str): activation passed to the stem conv and to every block.
+        feature_maps (list[int]|int): indices of the blocks whose outputs are
+            returned. The stem counts as index 1, so the k-th block (0-based)
+            has index ``k + 2``.
+        channel_ratio (list[float]): per-block multiplier for the mid channels,
+            indexed by the running block counter (3 + 7 + 3 = 13 blocks).
+    """
+
+    def __init__(self,
+                 scale=1.0,
+                 act="hard_swish",
+                 feature_maps=[4, 11, 14],
+                 channel_ratio=[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]):
+        super(ESNet, self).__init__()
+        self.scale = scale
+        if isinstance(feature_maps, Integral):
+            feature_maps = [feature_maps]
+        self.feature_maps = feature_maps
+        stage_repeats = [3, 7, 3]
+
+        # Only entries 1..4 are read below; the leading -1 and trailing 1024
+        # are not used in this constructor.
+        stage_out_channels = [
+            -1, 24, make_divisible(128 * scale), make_divisible(256 * scale),
+            make_divisible(512 * scale), 1024
+        ]
+
+        self._out_channels = []
+        self._feature_idx = 0
+        # 1. conv1
+        self._conv1 = ConvBNLayer(
+            in_channels=3,
+            out_channels=stage_out_channels[1],
+            kernel_size=3,
+            stride=2,
+            padding=1,
+            act=act)
+        self._max_pool = MaxPool2D(kernel_size=3, stride=2, padding=1)
+        self._feature_idx += 1
+
+        # 2. bottleneck sequences
+        self._block_list = []
+        arch_idx = 0
+        for stage_id, num_repeat in enumerate(stage_repeats):
+            for i in range(num_repeat):
+                channels_scales = channel_ratio[arch_idx]
+                mid_c = make_divisible(
+                    int(stage_out_channels[stage_id + 2] * channels_scales),
+                    divisor=8)
+                # The first block of each stage downsamples (stride 2) and
+                # changes the channel count; the rest keep shape.
+                if i == 0:
+                    block = self.add_sublayer(
+                        name=str(stage_id + 2) + '_' + str(i + 1),
+                        sublayer=InvertedResidualDS(
+                            in_channels=stage_out_channels[stage_id + 1],
+                            mid_channels=mid_c,
+                            out_channels=stage_out_channels[stage_id + 2],
+                            stride=2,
+                            act=act))
+                else:
+                    block = self.add_sublayer(
+                        name=str(stage_id + 2) + '_' + str(i + 1),
+                        sublayer=InvertedResidual(
+                            in_channels=stage_out_channels[stage_id + 2],
+                            mid_channels=mid_c,
+                            out_channels=stage_out_channels[stage_id + 2],
+                            stride=1,
+                            act=act))
+                self._block_list.append(block)
+                arch_idx += 1
+                self._feature_idx += 1
+                self._update_out_channels(stage_out_channels[stage_id + 2],
+                                          self._feature_idx, self.feature_maps)
+
+    def _update_out_channels(self, channel, feature_idx, feature_maps):
+        """Record ``channel`` as an output width when ``feature_idx`` is selected."""
+        if feature_idx in feature_maps:
+            self._out_channels.append(channel)
+
+    def forward(self, inputs):
+        """
+        Args:
+            inputs (dict): input batch; only ``inputs['image']`` is read here.
+
+        Returns:
+            list: outputs of the blocks selected by ``self.feature_maps``.
+        """
+        y = self._conv1(inputs['image'])
+        y = self._max_pool(y)
+        outs = []
+        for i, inv in enumerate(self._block_list):
+            y = inv(y)
+            # Block i carries feature index i + 2 (same numbering as __init__).
+            if i + 2 in self.feature_maps:
+                outs.append(y)
+
+        return outs
+
+    @property
+    def out_shape(self):
+        """Return one ShapeSpec per selected output."""
+        return [ShapeSpec(channels=c) for c in self._out_channels]

+ 470 - 0
paddlers/models/ppdet/modeling/backbones/ghostnet.py

@@ -0,0 +1,470 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import math
+import paddle
+from paddle import ParamAttr
+import paddle.nn as nn
+import paddle.nn.functional as F
+from paddle.nn import AdaptiveAvgPool2D, Linear
+from paddle.nn.initializer import Uniform
+
+from paddlers.models.ppdet.core.workspace import register, serializable
+from numbers import Integral
+from ..shape_spec import ShapeSpec
+from .mobilenet_v3 import make_divisible, ConvBNLayer
+
+__all__ = ['GhostNet']
+
+
+class ExtraBlockDW(nn.Layer):
+    """
+    Extra block: 1x1 conv -> 3x3 grouped conv (with ``stride``) -> 1x1 conv,
+    each followed by relu6.
+
+    Args:
+        in_c (int): input channels.
+        ch_1 (int): channels after the first 1x1 conv; also the group count
+            of the 3x3 conv.
+        ch_2 (int): output channels.
+        stride (int): stride of the 3x3 conv.
+        lr_mult (float): learning-rate multiplier passed to every conv.
+        conv_decay, norm_type, norm_decay, freeze_norm: forwarded unchanged to
+            every ConvBNLayer.
+        name (str): prefix for the sublayer names. Must be a string even
+            though the default is None, because suffixes are appended to it.
+    """
+
+    def __init__(self,
+                 in_c,
+                 ch_1,
+                 ch_2,
+                 stride,
+                 lr_mult,
+                 conv_decay=0.,
+                 norm_type='bn',
+                 norm_decay=0.,
+                 freeze_norm=False,
+                 name=None):
+        super(ExtraBlockDW, self).__init__()
+        self.pointwise_conv = ConvBNLayer(
+            in_c=in_c,
+            out_c=ch_1,
+            filter_size=1,
+            stride=1,
+            padding=0,
+            act='relu6',
+            lr_mult=lr_mult,
+            conv_decay=conv_decay,
+            norm_type=norm_type,
+            norm_decay=norm_decay,
+            freeze_norm=freeze_norm,
+            name=name + "_extra1")
+        self.depthwise_conv = ConvBNLayer(
+            in_c=ch_1,
+            out_c=ch_2,
+            filter_size=3,
+            stride=stride,
+            padding=1,  #
+            num_groups=int(ch_1),
+            act='relu6',
+            lr_mult=lr_mult,
+            conv_decay=conv_decay,
+            norm_type=norm_type,
+            norm_decay=norm_decay,
+            freeze_norm=freeze_norm,
+            name=name + "_extra2_dw")
+        self.normal_conv = ConvBNLayer(
+            in_c=ch_2,
+            out_c=ch_2,
+            filter_size=1,
+            stride=1,
+            padding=0,
+            act='relu6',
+            lr_mult=lr_mult,
+            conv_decay=conv_decay,
+            norm_type=norm_type,
+            norm_decay=norm_decay,
+            freeze_norm=freeze_norm,
+            name=name + "_extra2_sep")
+
+    def forward(self, inputs):
+        """Apply the three convs in sequence."""
+        x = self.pointwise_conv(inputs)
+        x = self.depthwise_conv(x)
+        x = self.normal_conv(x)
+        return x
+
+
+class SEBlock(nn.Layer):
+    """
+    Channel gate built from two Linear layers.
+
+    Args:
+        num_channels (int): channels of the input feature map.
+        lr_mult (float): learning-rate multiplier for both Linear layers.
+        reduction_ratio (int): hidden width is ``num_channels // reduction_ratio``.
+        name (str|None): accepted but not used in this class.
+    """
+
+    def __init__(self, num_channels, lr_mult, reduction_ratio=4, name=None):
+        super(SEBlock, self).__init__()
+        self.pool2d_gap = AdaptiveAvgPool2D(1)
+        self._num_channels = num_channels
+        # Uniform init bound is 1 / sqrt(fan_in) for each Linear layer.
+        stdv = 1.0 / math.sqrt(num_channels * 1.0)
+        med_ch = num_channels // reduction_ratio
+        self.squeeze = Linear(
+            num_channels,
+            med_ch,
+            weight_attr=ParamAttr(
+                learning_rate=lr_mult, initializer=Uniform(-stdv, stdv)),
+            bias_attr=ParamAttr(learning_rate=lr_mult))
+        stdv = 1.0 / math.sqrt(med_ch * 1.0)
+        self.excitation = Linear(
+            med_ch,
+            num_channels,
+            weight_attr=ParamAttr(
+                learning_rate=lr_mult, initializer=Uniform(-stdv, stdv)),
+            bias_attr=ParamAttr(learning_rate=lr_mult))
+
+    def forward(self, inputs):
+        """Scale ``inputs`` channel-wise by a gate clipped to [0, 1]."""
+        pool = self.pool2d_gap(inputs)
+        # Drop the two pooled spatial axes so the Linear layers see (N, C).
+        pool = paddle.squeeze(pool, axis=[2, 3])
+        squeeze = self.squeeze(pool)
+        squeeze = F.relu(squeeze)
+        excitation = self.excitation(squeeze)
+        excitation = paddle.clip(x=excitation, min=0, max=1)
+        # Restore the spatial axes for broadcasting against the input.
+        excitation = paddle.unsqueeze(excitation, axis=[2, 3])
+        out = paddle.multiply(inputs, excitation)
+        return out
+
+
+class GhostModule(nn.Layer):
+    """
+    Ghost module: a primary conv plus a cheap grouped conv applied to the
+    primary output; both results are concatenated on the channel axis.
+
+    Args:
+        in_channels (int): input channels.
+        output_channels (int): target width. The primary conv produces
+            ``ceil(output_channels / ratio)`` channels and the cheap conv
+            ``(ratio - 1)`` times that many; the result has the sum of both.
+        kernel_size (int): kernel of the primary conv.
+        ratio (int): ratio between the target width and the primary width.
+        dw_size (int): kernel of the cheap grouped conv.
+        stride (int): stride of the primary conv.
+        relu (bool): use relu after both convs when True, no activation otherwise.
+        lr_mult, conv_decay, norm_type, norm_decay, freeze_norm: forwarded
+            unchanged to both ConvBNLayer instances.
+        name (str): prefix for the sublayer names. Must be a string even
+            though the default is None, because suffixes are appended to it.
+    """
+
+    def __init__(self,
+                 in_channels,
+                 output_channels,
+                 kernel_size=1,
+                 ratio=2,
+                 dw_size=3,
+                 stride=1,
+                 relu=True,
+                 lr_mult=1.,
+                 conv_decay=0.,
+                 norm_type='bn',
+                 norm_decay=0.,
+                 freeze_norm=False,
+                 name=None):
+        super(GhostModule, self).__init__()
+        init_channels = int(math.ceil(output_channels / ratio))
+        new_channels = int(init_channels * (ratio - 1))
+        self.primary_conv = ConvBNLayer(
+            in_c=in_channels,
+            out_c=init_channels,
+            filter_size=kernel_size,
+            stride=stride,
+            padding=int((kernel_size - 1) // 2),
+            num_groups=1,
+            act="relu" if relu else None,
+            lr_mult=lr_mult,
+            conv_decay=conv_decay,
+            norm_type=norm_type,
+            norm_decay=norm_decay,
+            freeze_norm=freeze_norm,
+            name=name + "_primary_conv")
+        self.cheap_operation = ConvBNLayer(
+            in_c=init_channels,
+            out_c=new_channels,
+            filter_size=dw_size,
+            stride=1,
+            padding=int((dw_size - 1) // 2),
+            num_groups=init_channels,
+            act="relu" if relu else None,
+            lr_mult=lr_mult,
+            conv_decay=conv_decay,
+            norm_type=norm_type,
+            norm_decay=norm_decay,
+            freeze_norm=freeze_norm,
+            name=name + "_cheap_operation")
+
+    def forward(self, inputs):
+        """Return the primary features concatenated with the cheap features."""
+        x = self.primary_conv(inputs)
+        y = self.cheap_operation(x)
+        out = paddle.concat([x, y], axis=1)
+        return out
+
+
+class GhostBottleneck(nn.Layer):
+    """
+    Ghost bottleneck: GhostModule (expand) -> optional stride-2 depthwise conv
+    -> optional SE block -> GhostModule (project), plus a shortcut.
+
+    Args:
+        in_channels (int): input channels.
+        hidden_dim (int): width of the expanded features.
+        output_channels (int): output channels.
+        kernel_size (int): kernel of the depthwise convs (main path and shortcut).
+        stride (int): the main-path depthwise conv is only created when this is 2.
+        use_se (bool|int): whether to insert the SE block.
+        lr_mult (float): learning-rate multiplier for all sublayers.
+        conv_decay, norm_type, norm_decay, freeze_norm: forwarded unchanged to
+            the sublayers.
+        return_list (bool): when True ``forward`` returns ``[expanded, output]``
+            instead of only the output.
+        name (str): prefix for the sublayer names. Must be a string even
+            though the default is None, because suffixes are appended to it.
+    """
+
+    def __init__(self,
+                 in_channels,
+                 hidden_dim,
+                 output_channels,
+                 kernel_size,
+                 stride,
+                 use_se,
+                 lr_mult,
+                 conv_decay=0.,
+                 norm_type='bn',
+                 norm_decay=0.,
+                 freeze_norm=False,
+                 return_list=False,
+                 name=None):
+        super(GhostBottleneck, self).__init__()
+        self._stride = stride
+        self._use_se = use_se
+        self._num_channels = in_channels
+        self._output_channels = output_channels
+        self.return_list = return_list
+
+        self.ghost_module_1 = GhostModule(
+            in_channels=in_channels,
+            output_channels=hidden_dim,
+            kernel_size=1,
+            stride=1,
+            relu=True,
+            lr_mult=lr_mult,
+            conv_decay=conv_decay,
+            norm_type=norm_type,
+            norm_decay=norm_decay,
+            freeze_norm=freeze_norm,
+            name=name + "_ghost_module_1")
+        if stride == 2:
+            self.depthwise_conv = ConvBNLayer(
+                in_c=hidden_dim,
+                out_c=hidden_dim,
+                filter_size=kernel_size,
+                stride=stride,
+                padding=int((kernel_size - 1) // 2),
+                num_groups=hidden_dim,
+                act=None,
+                lr_mult=lr_mult,
+                conv_decay=conv_decay,
+                norm_type=norm_type,
+                norm_decay=norm_decay,
+                freeze_norm=freeze_norm,
+                name=name +
+                "_depthwise_depthwise"  # looks strange due to an old typo, will be fixed later.
+            )
+        if use_se:
+            self.se_block = SEBlock(hidden_dim, lr_mult, name=name + "_se")
+        self.ghost_module_2 = GhostModule(
+            in_channels=hidden_dim,
+            output_channels=output_channels,
+            kernel_size=1,
+            relu=False,
+            lr_mult=lr_mult,
+            conv_decay=conv_decay,
+            norm_type=norm_type,
+            norm_decay=norm_decay,
+            freeze_norm=freeze_norm,
+            name=name + "_ghost_module_2")
+        # A learned shortcut is needed whenever the identity would not match
+        # the main path in resolution or channel count.
+        if stride != 1 or in_channels != output_channels:
+            self.shortcut_depthwise = ConvBNLayer(
+                in_c=in_channels,
+                out_c=in_channels,
+                filter_size=kernel_size,
+                stride=stride,
+                padding=int((kernel_size - 1) // 2),
+                num_groups=in_channels,
+                act=None,
+                lr_mult=lr_mult,
+                conv_decay=conv_decay,
+                norm_type=norm_type,
+                norm_decay=norm_decay,
+                freeze_norm=freeze_norm,
+                name=name +
+                "_shortcut_depthwise_depthwise"  # looks strange due to an old typo, will be fixed later.
+            )
+            self.shortcut_conv = ConvBNLayer(
+                in_c=in_channels,
+                out_c=output_channels,
+                filter_size=1,
+                stride=1,
+                padding=0,
+                num_groups=1,
+                act=None,
+                lr_mult=lr_mult,
+                conv_decay=conv_decay,
+                norm_type=norm_type,
+                norm_decay=norm_decay,
+                freeze_norm=freeze_norm,
+                name=name + "_shortcut_conv")
+
+    def forward(self, inputs):
+        """
+        Returns:
+            Tensor|list: the block output, or ``[y, x]`` when
+                ``self.return_list`` is set, where ``y`` is the output of the
+                first GhostModule and ``x`` the block output.
+        """
+        y = self.ghost_module_1(inputs)
+        x = y
+        if self._stride == 2:
+            x = self.depthwise_conv(x)
+        if self._use_se:
+            x = self.se_block(x)
+        x = self.ghost_module_2(x)
+
+        # Mirrors the condition under which the shortcut layers were created.
+        if self._stride == 1 and self._num_channels == self._output_channels:
+            shortcut = inputs
+        else:
+            shortcut = self.shortcut_depthwise(inputs)
+            shortcut = self.shortcut_conv(shortcut)
+        x = paddle.add(x=x, y=shortcut)
+
+        if self.return_list:
+            return [y, x]
+        else:
+            return x
+
+
+@register
+@serializable
+class GhostNet(nn.Layer):
+    """
+    GhostNet backbone: a stem conv, 16 GhostBottleneck blocks and optional
+    extra blocks appended after them.
+
+    Args:
+        scale (float): width multiplier applied to every configured channel
+            count (rounded with ``make_divisible(..., 4)``).
+        feature_maps (list[int]|int): indices of the layers whose outputs are
+            returned. Bottleneck i (0-based) has index ``i + 2``; extra blocks
+            continue the same numbering.
+        with_extra_blocks (bool): whether to append the extra blocks.
+        extra_block_filters (list[list[int]]): ``[ch_1, ch_2]`` pair for each
+            ExtraBlockDW.
+        lr_mult_list (list[float]): learning-rate multipliers; every three
+            consecutive bottlenecks share one entry, clamped to the last entry.
+        conv_decay (float): forwarded to every conv layer.
+        norm_type (str): forwarded to every conv layer; 'sync_bn' is rejected
+            together with ``freeze_norm``.
+        norm_decay (float): forwarded to every conv layer.
+        freeze_norm (bool): forwarded to every conv layer.
+    """
+    __shared__ = ['norm_type']
+
+    def __init__(self,
+                 scale=1.3,
+                 feature_maps=[6, 12, 15],
+                 with_extra_blocks=False,
+                 extra_block_filters=[[256, 512], [128, 256], [128, 256],
+                                      [64, 128]],
+                 lr_mult_list=[1.0, 1.0, 1.0, 1.0, 1.0],
+                 conv_decay=0.,
+                 norm_type='bn',
+                 norm_decay=0.0,
+                 freeze_norm=False):
+        super(GhostNet, self).__init__()
+        if isinstance(feature_maps, Integral):
+            feature_maps = [feature_maps]
+        if norm_type == 'sync_bn' and freeze_norm:
+            raise ValueError(
+                "The norm_type should not be sync_bn when freeze_norm is True")
+        self.feature_maps = feature_maps
+        self.with_extra_blocks = with_extra_blocks
+        self.extra_block_filters = extra_block_filters
+
+        inplanes = 16
+        # Each row is unpacked below as
+        # (kernel size, expansion size, output channels, use SE, stride).
+        self.cfgs = [
+            # k, t, c, SE, s
+            [3, 16, 16, 0, 1],
+            [3, 48, 24, 0, 2],
+            [3, 72, 24, 0, 1],
+            [5, 72, 40, 1, 2],
+            [5, 120, 40, 1, 1],
+            [3, 240, 80, 0, 2],
+            [3, 200, 80, 0, 1],
+            [3, 184, 80, 0, 1],
+            [3, 184, 80, 0, 1],
+            [3, 480, 112, 1, 1],
+            [3, 672, 112, 1, 1],
+            [5, 672, 160, 1, 2],  # SSDLite output
+            [5, 960, 160, 0, 1],
+            [5, 960, 160, 1, 1],
+            [5, 960, 160, 0, 1],
+            [5, 960, 160, 1, 1]
+        ]
+        self.scale = scale
+        conv1_out_ch = int(make_divisible(inplanes * self.scale, 4))
+        self.conv1 = ConvBNLayer(
+            in_c=3,
+            out_c=conv1_out_ch,
+            filter_size=3,
+            stride=2,
+            padding=1,
+            num_groups=1,
+            act="relu",
+            lr_mult=1.,
+            conv_decay=conv_decay,
+            norm_type=norm_type,
+            norm_decay=norm_decay,
+            freeze_norm=freeze_norm,
+            name="conv1")
+
+        # build inverted residual blocks
+        self._out_channels = []
+        self.ghost_bottleneck_list = []
+        idx = 0
+        inplanes = conv1_out_ch
+        for k, exp_size, c, use_se, s in self.cfgs:
+            lr_idx = min(idx // 3, len(lr_mult_list) - 1)
+            lr_mult = lr_mult_list[lr_idx]
+
+            # for SSD/SSDLite, first head input is after ResidualUnit expand_conv
+            return_list = self.with_extra_blocks and idx + 2 in self.feature_maps
+
+            ghost_bottleneck = self.add_sublayer(
+                "_ghostbottleneck_" + str(idx),
+                sublayer=GhostBottleneck(
+                    in_channels=inplanes,
+                    hidden_dim=int(make_divisible(exp_size * self.scale, 4)),
+                    output_channels=int(make_divisible(c * self.scale, 4)),
+                    kernel_size=k,
+                    stride=s,
+                    use_se=use_se,
+                    lr_mult=lr_mult,
+                    conv_decay=conv_decay,
+                    norm_type=norm_type,
+                    norm_decay=norm_decay,
+                    freeze_norm=freeze_norm,
+                    return_list=return_list,
+                    name="_ghostbottleneck_" + str(idx)))
+            self.ghost_bottleneck_list.append(ghost_bottleneck)
+            inplanes = int(make_divisible(c * self.scale, 4))
+            idx += 1
+            # idx was just incremented, so idx + 1 is this block's feature
+            # index (position + 2). With return_list the exported tensor is
+            # the expanded one, hence the hidden width is recorded instead.
+            self._update_out_channels(
+                int(make_divisible(exp_size * self.scale, 4))
+                if return_list else inplanes, idx + 1, feature_maps)
+
+        if self.with_extra_blocks:
+            self.extra_block_list = []
+            extra_out_c = int(make_divisible(self.scale * self.cfgs[-1][1], 4))
+            lr_idx = min(idx // 3, len(lr_mult_list) - 1)
+            lr_mult = lr_mult_list[lr_idx]
+
+            # First extra layer: 1x1 conv widening to the last expansion size.
+            conv_extra = self.add_sublayer(
+                "conv" + str(idx + 2),
+                sublayer=ConvBNLayer(
+                    in_c=inplanes,
+                    out_c=extra_out_c,
+                    filter_size=1,
+                    stride=1,
+                    padding=0,
+                    num_groups=1,
+                    act="relu6",
+                    lr_mult=lr_mult,
+                    conv_decay=conv_decay,
+                    norm_type=norm_type,
+                    norm_decay=norm_decay,
+                    freeze_norm=freeze_norm,
+                    name="conv" + str(idx + 2)))
+            self.extra_block_list.append(conv_extra)
+            idx += 1
+            self._update_out_channels(extra_out_c, idx + 1, feature_maps)
+
+            # Remaining extra layers: stride-2 ExtraBlockDW blocks chained on
+            # the previous block's output width.
+            for j, block_filter in enumerate(self.extra_block_filters):
+                in_c = extra_out_c if j == 0 else self.extra_block_filters[
+                    j - 1][1]
+                conv_extra = self.add_sublayer(
+                    "conv" + str(idx + 2),
+                    sublayer=ExtraBlockDW(
+                        in_c,
+                        block_filter[0],
+                        block_filter[1],
+                        stride=2,
+                        lr_mult=lr_mult,
+                        conv_decay=conv_decay,
+                        norm_type=norm_type,
+                        norm_decay=norm_decay,
+                        freeze_norm=freeze_norm,
+                        name='conv' + str(idx + 2)))
+                self.extra_block_list.append(conv_extra)
+                idx += 1
+                self._update_out_channels(block_filter[1], idx + 1,
+                                          feature_maps)
+
+    def _update_out_channels(self, channel, feature_idx, feature_maps):
+        """Record ``channel`` as an output width when ``feature_idx`` is selected."""
+        if feature_idx in feature_maps:
+            self._out_channels.append(channel)
+
+    def forward(self, inputs):
+        """
+        Args:
+            inputs (dict): input batch; only ``inputs['image']`` is read here.
+
+        Returns:
+            list: outputs of the layers selected by ``self.feature_maps``.
+        """
+        x = self.conv1(inputs['image'])
+        outs = []
+        for idx, ghost_bottleneck in enumerate(self.ghost_bottleneck_list):
+            x = ghost_bottleneck(x)
+            if idx + 2 in self.feature_maps:
+                # A block built with return_list yields [expanded, output]:
+                # export the expanded tensor and continue with the output.
+                if isinstance(x, list):
+                    outs.append(x[0])
+                    x = x[1]
+                else:
+                    outs.append(x)
+
+        if not self.with_extra_blocks:
+            return outs
+
+        for i, block in enumerate(self.extra_block_list):
+            # Extra blocks continue the numbering after the bottlenecks.
+            idx = i + len(self.ghost_bottleneck_list)
+            x = block(x)
+            if idx + 2 in self.feature_maps:
+                outs.append(x)
+        return outs
+
+    @property
+    def out_shape(self):
+        """Return one ShapeSpec per selected output."""
+        return [ShapeSpec(channels=c) for c in self._out_channels]

+ 224 - 0
paddlers/models/ppdet/modeling/backbones/hardnet.py

@@ -0,0 +1,224 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle
+import paddle.nn as nn
+from paddlers.models.ppdet.core.workspace import register
+from ..shape_spec import ShapeSpec
+
+__all__ = ['HarDNet']
+
+
+def ConvLayer(in_channels,
+              out_channels,
+              kernel_size=3,
+              stride=1,
+              bias_attr=False):
+    layer = nn.Sequential(
+        ('conv', nn.Conv2D(
+            in_channels,
+            out_channels,
+            kernel_size=kernel_size,
+            stride=stride,
+            padding=kernel_size // 2,
+            groups=1,
+            bias_attr=bias_attr)), ('norm', nn.BatchNorm2D(out_channels)),
+        ('relu', nn.ReLU6()))
+    return layer
+
+
+def DWConvLayer(in_channels,
+                out_channels,
+                kernel_size=3,
+                stride=1,
+                bias_attr=False):
+    layer = nn.Sequential(
+        ('dwconv', nn.Conv2D(
+            in_channels,
+            out_channels,
+            kernel_size=kernel_size,
+            stride=stride,
+            padding=1,
+            groups=out_channels,
+            bias_attr=bias_attr)), ('norm', nn.BatchNorm2D(out_channels)))
+    return layer
+
+
+def CombConvLayer(in_channels, out_channels, kernel_size=1, stride=1):
+    layer = nn.Sequential(
+        ('layer1', ConvLayer(
+            in_channels, out_channels, kernel_size=kernel_size)),
+        ('layer2', DWConvLayer(
+            out_channels, out_channels, stride=stride)))
+    return layer
+
+
+class HarDBlock(nn.Layer):
+    def __init__(self,
+                 in_channels,
+                 growth_rate,
+                 grmul,
+                 n_layers,
+                 keepBase=False,
+                 residual_out=False,
+                 dwconv=False):
+        super().__init__()
+        self.keepBase = keepBase
+        self.links = []
+        layers_ = []
+        self.out_channels = 0
+        for i in range(n_layers):
+            outch, inch, link = self.get_link(i + 1, in_channels, growth_rate,
+                                              grmul)
+            self.links.append(link)
+            if dwconv:
+                layers_.append(CombConvLayer(inch, outch))
+            else:
+                layers_.append(ConvLayer(inch, outch))
+
+            if (i % 2 == 0) or (i == n_layers - 1):
+                self.out_channels += outch
+        self.layers = nn.LayerList(layers_)
+
+    def get_out_ch(self):
+        return self.out_channels
+
+    def get_link(self, layer, base_ch, growth_rate, grmul):
+        if layer == 0:
+            return base_ch, 0, []
+        out_channels = growth_rate
+
+        link = []
+        for i in range(10):
+            dv = 2**i
+            if layer % dv == 0:
+                k = layer - dv
+                link.append(k)
+                if i > 0:
+                    out_channels *= grmul
+
+        out_channels = int(int(out_channels + 1) / 2) * 2
+        in_channels = 0
+
+        for i in link:
+            ch, _, _ = self.get_link(i, base_ch, growth_rate, grmul)
+            in_channels += ch
+
+        return out_channels, in_channels, link
+
+    def forward(self, x):
+        layers_ = [x]
+
+        for layer in range(len(self.layers)):
+            link = self.links[layer]
+            tin = []
+            for i in link:
+                tin.append(layers_[i])
+            if len(tin) > 1:
+                x = paddle.concat(tin, 1)
+            else:
+                x = tin[0]
+            out = self.layers[layer](x)
+            layers_.append(out)
+
+        t = len(layers_)
+        out_ = []
+        for i in range(t):
+            if (i == 0 and self.keepBase) or (i == t - 1) or (i % 2 == 1):
+                out_.append(layers_[i])
+        out = paddle.concat(out_, 1)
+
+        return out
+
+
+@register
+class HarDNet(nn.Layer):
+    def __init__(self, depth_wise=False, return_idx=[1, 3, 8, 13], arch=85):
+        super(HarDNet, self).__init__()
+        assert arch in [39, 68, 85], "HarDNet-{} not support.".format(arch)
+        if arch == 85:
+            first_ch = [48, 96]
+            second_kernel = 3
+            ch_list = [192, 256, 320, 480, 720]
+            grmul = 1.7
+            gr = [24, 24, 28, 36, 48]
+            n_layers = [8, 16, 16, 16, 16]
+        elif arch == 68:
+            first_ch = [32, 64]
+            second_kernel = 3
+            ch_list = [128, 256, 320, 640]
+            grmul = 1.7
+            gr = [14, 16, 20, 40]
+            n_layers = [8, 16, 16, 16]
+
+        self.return_idx = return_idx
+        self._out_channels = [96, 214, 458, 784]
+
+        avg_pool = True
+        if depth_wise:
+            second_kernel = 1
+            avg_pool = False
+
+        blks = len(n_layers)
+        self.base = nn.LayerList([])
+
+        # First Layer: Standard Conv3x3, Stride=2
+        self.base.append(
+            ConvLayer(
+                in_channels=3,
+                out_channels=first_ch[0],
+                kernel_size=3,
+                stride=2,
+                bias_attr=False))
+
+        # Second Layer
+        self.base.append(
+            ConvLayer(
+                first_ch[0], first_ch[1], kernel_size=second_kernel))
+
+        # Avgpooling or DWConv3x3 downsampling
+        if avg_pool:
+            self.base.append(nn.AvgPool2D(kernel_size=3, stride=2, padding=1))
+        else:
+            self.base.append(DWConvLayer(first_ch[1], first_ch[1], stride=2))
+
+        # Build all HarDNet blocks
+        ch = first_ch[1]
+        for i in range(blks):
+            blk = HarDBlock(ch, gr[i], grmul, n_layers[i], dwconv=depth_wise)
+            ch = blk.out_channels
+            self.base.append(blk)
+
+            if i != blks - 1:
+                self.base.append(ConvLayer(ch, ch_list[i], kernel_size=1))
+            ch = ch_list[i]
+            if i == 0:
+                self.base.append(
+                    nn.AvgPool2D(
+                        kernel_size=2, stride=2, ceil_mode=True))
+            elif i != blks - 1 and i != 1 and i != 3:
+                self.base.append(nn.AvgPool2D(kernel_size=2, stride=2))
+
+    def forward(self, inputs):
+        x = inputs['image']
+        outs = []
+        for i, layer in enumerate(self.base):
+            x = layer(x)
+            if i in self.return_idx:
+                outs.append(x)
+        return outs
+
+    @property
+    def out_shape(self):
+        return [ShapeSpec(channels=self._out_channels[i]) for i in range(4)]

+ 727 - 0
paddlers/models/ppdet/modeling/backbones/hrnet.py

@@ -0,0 +1,727 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle
+import paddle.nn as nn
+import paddle.nn.functional as F
+from paddle.nn import AdaptiveAvgPool2D, Linear
+from paddle.regularizer import L2Decay
+from paddle import ParamAttr
+from paddle.nn.initializer import Normal, Uniform
+from numbers import Integral
+import math
+
+from paddlers.models.ppdet.core.workspace import register
+from ..shape_spec import ShapeSpec
+
+__all__ = ['HRNet']
+
+
+class ConvNormLayer(nn.Layer):
+    def __init__(self,
+                 ch_in,
+                 ch_out,
+                 filter_size,
+                 stride=1,
+                 norm_type='bn',
+                 norm_groups=32,
+                 use_dcn=False,
+                 norm_decay=0.,
+                 freeze_norm=False,
+                 act=None,
+                 name=None):
+        super(ConvNormLayer, self).__init__()
+        assert norm_type in ['bn', 'sync_bn', 'gn']
+
+        self.act = act
+        self.conv = nn.Conv2D(
+            in_channels=ch_in,
+            out_channels=ch_out,
+            kernel_size=filter_size,
+            stride=stride,
+            padding=(filter_size - 1) // 2,
+            groups=1,
+            weight_attr=ParamAttr(initializer=Normal(
+                mean=0., std=0.01)),
+            bias_attr=False)
+
+        norm_lr = 0. if freeze_norm else 1.
+
+        param_attr = ParamAttr(
+            learning_rate=norm_lr, regularizer=L2Decay(norm_decay))
+        bias_attr = ParamAttr(
+            learning_rate=norm_lr, regularizer=L2Decay(norm_decay))
+        global_stats = True if freeze_norm else None
+        if norm_type in ['bn', 'sync_bn']:
+            self.norm = nn.BatchNorm2D(
+                ch_out,
+                weight_attr=param_attr,
+                bias_attr=bias_attr,
+                use_global_stats=global_stats)
+        elif norm_type == 'gn':
+            self.norm = nn.GroupNorm(
+                num_groups=norm_groups,
+                num_channels=ch_out,
+                weight_attr=param_attr,
+                bias_attr=bias_attr)
+        norm_params = self.norm.parameters()
+        if freeze_norm:
+            for param in norm_params:
+                param.stop_gradient = True
+
+    def forward(self, inputs):
+        out = self.conv(inputs)
+        out = self.norm(out)
+
+        if self.act == 'relu':
+            out = F.relu(out)
+        return out
+
+
+class Layer1(nn.Layer):
+    def __init__(self,
+                 num_channels,
+                 has_se=False,
+                 norm_decay=0.,
+                 freeze_norm=True,
+                 name=None):
+        super(Layer1, self).__init__()
+
+        self.bottleneck_block_list = []
+
+        for i in range(4):
+            bottleneck_block = self.add_sublayer(
+                "block_{}_{}".format(name, i + 1),
+                BottleneckBlock(
+                    num_channels=num_channels if i == 0 else 256,
+                    num_filters=64,
+                    has_se=has_se,
+                    stride=1,
+                    downsample=True if i == 0 else False,
+                    norm_decay=norm_decay,
+                    freeze_norm=freeze_norm,
+                    name=name + '_' + str(i + 1)))
+            self.bottleneck_block_list.append(bottleneck_block)
+
+    def forward(self, input):
+        conv = input
+        for block_func in self.bottleneck_block_list:
+            conv = block_func(conv)
+        return conv
+
+
+class TransitionLayer(nn.Layer):
+    def __init__(self,
+                 in_channels,
+                 out_channels,
+                 norm_decay=0.,
+                 freeze_norm=True,
+                 name=None):
+        super(TransitionLayer, self).__init__()
+
+        num_in = len(in_channels)
+        num_out = len(out_channels)
+        out = []
+        self.conv_bn_func_list = []
+        for i in range(num_out):
+            residual = None
+            if i < num_in:
+                if in_channels[i] != out_channels[i]:
+                    residual = self.add_sublayer(
+                        "transition_{}_layer_{}".format(name, i + 1),
+                        ConvNormLayer(
+                            ch_in=in_channels[i],
+                            ch_out=out_channels[i],
+                            filter_size=3,
+                            norm_decay=norm_decay,
+                            freeze_norm=freeze_norm,
+                            act='relu',
+                            name=name + '_layer_' + str(i + 1)))
+            else:
+                residual = self.add_sublayer(
+                    "transition_{}_layer_{}".format(name, i + 1),
+                    ConvNormLayer(
+                        ch_in=in_channels[-1],
+                        ch_out=out_channels[i],
+                        filter_size=3,
+                        stride=2,
+                        norm_decay=norm_decay,
+                        freeze_norm=freeze_norm,
+                        act='relu',
+                        name=name + '_layer_' + str(i + 1)))
+            self.conv_bn_func_list.append(residual)
+
+    def forward(self, input):
+        outs = []
+        for idx, conv_bn_func in enumerate(self.conv_bn_func_list):
+            if conv_bn_func is None:
+                outs.append(input[idx])
+            else:
+                if idx < len(input):
+                    outs.append(conv_bn_func(input[idx]))
+                else:
+                    outs.append(conv_bn_func(input[-1]))
+        return outs
+
+
+class Branches(nn.Layer):
+    def __init__(self,
+                 block_num,
+                 in_channels,
+                 out_channels,
+                 has_se=False,
+                 norm_decay=0.,
+                 freeze_norm=True,
+                 name=None):
+        super(Branches, self).__init__()
+
+        self.basic_block_list = []
+        for i in range(len(out_channels)):
+            self.basic_block_list.append([])
+            for j in range(block_num):
+                in_ch = in_channels[i] if j == 0 else out_channels[i]
+                basic_block_func = self.add_sublayer(
+                    "bb_{}_branch_layer_{}_{}".format(name, i + 1, j + 1),
+                    BasicBlock(
+                        num_channels=in_ch,
+                        num_filters=out_channels[i],
+                        has_se=has_se,
+                        norm_decay=norm_decay,
+                        freeze_norm=freeze_norm,
+                        name=name + '_branch_layer_' + str(i + 1) + '_' +
+                        str(j + 1)))
+                self.basic_block_list[i].append(basic_block_func)
+
+    def forward(self, inputs):
+        outs = []
+        for idx, input in enumerate(inputs):
+            conv = input
+            basic_block_list = self.basic_block_list[idx]
+            for basic_block_func in basic_block_list:
+                conv = basic_block_func(conv)
+            outs.append(conv)
+        return outs
+
+
+class BottleneckBlock(nn.Layer):
+    def __init__(self,
+                 num_channels,
+                 num_filters,
+                 has_se,
+                 stride=1,
+                 downsample=False,
+                 norm_decay=0.,
+                 freeze_norm=True,
+                 name=None):
+        super(BottleneckBlock, self).__init__()
+
+        self.has_se = has_se
+        self.downsample = downsample
+
+        self.conv1 = ConvNormLayer(
+            ch_in=num_channels,
+            ch_out=num_filters,
+            filter_size=1,
+            norm_decay=norm_decay,
+            freeze_norm=freeze_norm,
+            act="relu",
+            name=name + "_conv1")
+        self.conv2 = ConvNormLayer(
+            ch_in=num_filters,
+            ch_out=num_filters,
+            filter_size=3,
+            stride=stride,
+            norm_decay=norm_decay,
+            freeze_norm=freeze_norm,
+            act="relu",
+            name=name + "_conv2")
+        self.conv3 = ConvNormLayer(
+            ch_in=num_filters,
+            ch_out=num_filters * 4,
+            filter_size=1,
+            norm_decay=norm_decay,
+            freeze_norm=freeze_norm,
+            act=None,
+            name=name + "_conv3")
+
+        if self.downsample:
+            self.conv_down = ConvNormLayer(
+                ch_in=num_channels,
+                ch_out=num_filters * 4,
+                filter_size=1,
+                norm_decay=norm_decay,
+                freeze_norm=freeze_norm,
+                act=None,
+                name=name + "_downsample")
+
+        if self.has_se:
+            self.se = SELayer(
+                num_channels=num_filters * 4,
+                num_filters=num_filters * 4,
+                reduction_ratio=16,
+                name='fc' + name)
+
+    def forward(self, input):
+        residual = input
+        conv1 = self.conv1(input)
+        conv2 = self.conv2(conv1)
+        conv3 = self.conv3(conv2)
+
+        if self.downsample:
+            residual = self.conv_down(input)
+
+        if self.has_se:
+            conv3 = self.se(conv3)
+
+        y = paddle.add(x=residual, y=conv3)
+        y = F.relu(y)
+        return y
+
+
+class BasicBlock(nn.Layer):
+    def __init__(self,
+                 num_channels,
+                 num_filters,
+                 stride=1,
+                 has_se=False,
+                 downsample=False,
+                 norm_decay=0.,
+                 freeze_norm=True,
+                 name=None):
+        super(BasicBlock, self).__init__()
+
+        self.has_se = has_se
+        self.downsample = downsample
+        self.conv1 = ConvNormLayer(
+            ch_in=num_channels,
+            ch_out=num_filters,
+            filter_size=3,
+            norm_decay=norm_decay,
+            freeze_norm=freeze_norm,
+            stride=stride,
+            act="relu",
+            name=name + "_conv1")
+        self.conv2 = ConvNormLayer(
+            ch_in=num_filters,
+            ch_out=num_filters,
+            filter_size=3,
+            norm_decay=norm_decay,
+            freeze_norm=freeze_norm,
+            stride=1,
+            act=None,
+            name=name + "_conv2")
+
+        if self.downsample:
+            self.conv_down = ConvNormLayer(
+                ch_in=num_channels,
+                ch_out=num_filters * 4,
+                filter_size=1,
+                norm_decay=norm_decay,
+                freeze_norm=freeze_norm,
+                act=None,
+                name=name + "_downsample")
+
+        if self.has_se:
+            self.se = SELayer(
+                num_channels=num_filters,
+                num_filters=num_filters,
+                reduction_ratio=16,
+                name='fc' + name)
+
+    def forward(self, input):
+        residual = input
+        conv1 = self.conv1(input)
+        conv2 = self.conv2(conv1)
+
+        if self.downsample:
+            residual = self.conv_down(input)
+
+        if self.has_se:
+            conv2 = self.se(conv2)
+
+        y = paddle.add(x=residual, y=conv2)
+        y = F.relu(y)
+        return y
+
+
+class SELayer(nn.Layer):
+    def __init__(self, num_channels, num_filters, reduction_ratio, name=None):
+        super(SELayer, self).__init__()
+
+        self.pool2d_gap = AdaptiveAvgPool2D(1)
+
+        self._num_channels = num_channels
+
+        med_ch = int(num_channels / reduction_ratio)
+        stdv = 1.0 / math.sqrt(num_channels * 1.0)
+        self.squeeze = Linear(
+            num_channels,
+            med_ch,
+            weight_attr=ParamAttr(initializer=Uniform(-stdv, stdv)))
+
+        stdv = 1.0 / math.sqrt(med_ch * 1.0)
+        self.excitation = Linear(
+            med_ch,
+            num_filters,
+            weight_attr=ParamAttr(initializer=Uniform(-stdv, stdv)))
+
+    def forward(self, input):
+        pool = self.pool2d_gap(input)
+        pool = paddle.squeeze(pool, axis=[2, 3])
+        squeeze = self.squeeze(pool)
+        squeeze = F.relu(squeeze)
+        excitation = self.excitation(squeeze)
+        excitation = F.sigmoid(excitation)
+        excitation = paddle.unsqueeze(excitation, axis=[2, 3])
+        out = input * excitation
+        return out
+
+
+class Stage(nn.Layer):
+    def __init__(self,
+                 num_channels,
+                 num_modules,
+                 num_filters,
+                 has_se=False,
+                 norm_decay=0.,
+                 freeze_norm=True,
+                 multi_scale_output=True,
+                 name=None):
+        super(Stage, self).__init__()
+
+        self._num_modules = num_modules
+        self.stage_func_list = []
+        for i in range(num_modules):
+            if i == num_modules - 1 and not multi_scale_output:
+                stage_func = self.add_sublayer(
+                    "stage_{}_{}".format(name, i + 1),
+                    HighResolutionModule(
+                        num_channels=num_channels,
+                        num_filters=num_filters,
+                        has_se=has_se,
+                        norm_decay=norm_decay,
+                        freeze_norm=freeze_norm,
+                        multi_scale_output=False,
+                        name=name + '_' + str(i + 1)))
+            else:
+                stage_func = self.add_sublayer(
+                    "stage_{}_{}".format(name, i + 1),
+                    HighResolutionModule(
+                        num_channels=num_channels,
+                        num_filters=num_filters,
+                        has_se=has_se,
+                        norm_decay=norm_decay,
+                        freeze_norm=freeze_norm,
+                        name=name + '_' + str(i + 1)))
+
+            self.stage_func_list.append(stage_func)
+
+    def forward(self, input):
+        out = input
+        for idx in range(self._num_modules):
+            out = self.stage_func_list[idx](out)
+        return out
+
+
+class HighResolutionModule(nn.Layer):
+    def __init__(self,
+                 num_channels,
+                 num_filters,
+                 has_se=False,
+                 multi_scale_output=True,
+                 norm_decay=0.,
+                 freeze_norm=True,
+                 name=None):
+        super(HighResolutionModule, self).__init__()
+        self.branches_func = Branches(
+            block_num=4,
+            in_channels=num_channels,
+            out_channels=num_filters,
+            has_se=has_se,
+            norm_decay=norm_decay,
+            freeze_norm=freeze_norm,
+            name=name)
+
+        self.fuse_func = FuseLayers(
+            in_channels=num_filters,
+            out_channels=num_filters,
+            multi_scale_output=multi_scale_output,
+            norm_decay=norm_decay,
+            freeze_norm=freeze_norm,
+            name=name)
+
+    def forward(self, input):
+        out = self.branches_func(input)
+        out = self.fuse_func(out)
+        return out
+
+
+class FuseLayers(nn.Layer):
+    """Fuse features across branches.
+
+    For every output branch ``i`` the result is ``relu`` of the sum of
+    ``input[i]`` and every other branch ``j`` adapted to branch ``i``:
+    a 1x1 conv followed by interpolation when ``j > i``, or a chain of
+    ``i - j`` stride-2 3x3 convs when ``j < i``.
+
+    The adapters live in the flat ``residual_func_list``; ``forward`` walks
+    it with a running index, so it must visit (i, j, k) in exactly the order
+    used by ``__init__``.
+    """
+
+    def __init__(self,
+                 in_channels,
+                 out_channels,
+                 multi_scale_output=True,
+                 norm_decay=0.,
+                 freeze_norm=True,
+                 name=None):
+        super(FuseLayers, self).__init__()
+
+        # Number of output branches: all of them, or only branch 0.
+        self._actual_ch = len(in_channels) if multi_scale_output else 1
+        self._in_channels = in_channels
+
+        self.residual_func_list = []
+        for i in range(self._actual_ch):
+            for j in range(len(in_channels)):
+                residual_func = None
+                if j > i:
+                    # Source branch has a higher index: 1x1 conv to the
+                    # target width; forward() interpolates it afterwards.
+                    residual_func = self.add_sublayer(
+                        "residual_{}_layer_{}_{}".format(name, i + 1, j + 1),
+                        ConvNormLayer(
+                            ch_in=in_channels[j],
+                            ch_out=out_channels[i],
+                            filter_size=1,
+                            stride=1,
+                            act=None,
+                            norm_decay=norm_decay,
+                            freeze_norm=freeze_norm,
+                            name=name + '_layer_' + str(i + 1) + '_' +
+                            str(j + 1)))
+                    self.residual_func_list.append(residual_func)
+                elif j < i:
+                    # Source branch has a lower index: i - j stride-2 convs.
+                    pre_num_filters = in_channels[j]
+                    for k in range(i - j):
+                        if k == i - j - 1:
+                            # Last conv of the chain switches to the target
+                            # width and has no activation.
+                            residual_func = self.add_sublayer(
+                                "residual_{}_layer_{}_{}_{}".format(
+                                    name, i + 1, j + 1, k + 1),
+                                ConvNormLayer(
+                                    ch_in=pre_num_filters,
+                                    ch_out=out_channels[i],
+                                    filter_size=3,
+                                    stride=2,
+                                    norm_decay=norm_decay,
+                                    freeze_norm=freeze_norm,
+                                    act=None,
+                                    name=name + '_layer_' + str(i + 1) + '_' +
+                                    str(j + 1) + '_' + str(k + 1)))
+                            pre_num_filters = out_channels[i]
+                        else:
+                            # Intermediate convs keep the source width and
+                            # use ReLU.
+                            residual_func = self.add_sublayer(
+                                "residual_{}_layer_{}_{}_{}".format(
+                                    name, i + 1, j + 1, k + 1),
+                                ConvNormLayer(
+                                    ch_in=pre_num_filters,
+                                    ch_out=out_channels[j],
+                                    filter_size=3,
+                                    stride=2,
+                                    norm_decay=norm_decay,
+                                    freeze_norm=freeze_norm,
+                                    act="relu",
+                                    name=name + '_layer_' + str(i + 1) + '_' +
+                                    str(j + 1) + '_' + str(k + 1)))
+                            pre_num_filters = out_channels[j]
+                        self.residual_func_list.append(residual_func)
+
+    def forward(self, input):
+        """Return one fused tensor per output branch."""
+        outs = []
+        # Running position in residual_func_list; advances in the same
+        # (i, j, k) order in which __init__ appended the adapters.
+        residual_func_idx = 0
+        for i in range(self._actual_ch):
+            residual = input[i]
+            for j in range(len(self._in_channels)):
+                if j > i:
+                    y = self.residual_func_list[residual_func_idx](input[j])
+                    residual_func_idx += 1
+                    # Scale by 2 for every branch step between j and i.
+                    y = F.interpolate(y, scale_factor=2**(j - i))
+                    residual = paddle.add(x=residual, y=y)
+                elif j < i:
+                    y = input[j]
+                    for k in range(i - j):
+                        y = self.residual_func_list[residual_func_idx](y)
+                        residual_func_idx += 1
+
+                    residual = paddle.add(x=residual, y=y)
+            residual = F.relu(residual)
+            outs.append(residual)
+
+        return outs
+
+
+@register
+class HRNet(nn.Layer):
+    """
+    HRNet, see https://arxiv.org/abs/1908.07919
+
+    Args:
+        width (int): the width of HRNet
+        has_se (bool): whether to add SE block for each stage
+        freeze_at (int): the stage to freeze
+        freeze_norm (bool): whether to freeze norm in HRNet
+        norm_decay (float): weight decay for normalization layer weights
+        return_idx (List): the stage to return
+        upsample (bool): whether to upsample and concat the backbone feats
+    """
+
+    def __init__(self,
+                 width=18,
+                 has_se=False,
+                 freeze_at=0,
+                 freeze_norm=True,
+                 norm_decay=0.,
+                 return_idx=[0, 1, 2, 3],
+                 upsample=False):
+        super(HRNet, self).__init__()
+
+        self.width = width
+        self.has_se = has_se
+        if isinstance(return_idx, Integral):
+            return_idx = [return_idx]
+
+        assert len(return_idx) > 0, "need one or more return index"
+        self.freeze_at = freeze_at
+        self.return_idx = return_idx
+        self.upsample = upsample
+
+        self.channels = {
+            18: [[18, 36], [18, 36, 72], [18, 36, 72, 144]],
+            30: [[30, 60], [30, 60, 120], [30, 60, 120, 240]],
+            32: [[32, 64], [32, 64, 128], [32, 64, 128, 256]],
+            40: [[40, 80], [40, 80, 160], [40, 80, 160, 320]],
+            44: [[44, 88], [44, 88, 176], [44, 88, 176, 352]],
+            48: [[48, 96], [48, 96, 192], [48, 96, 192, 384]],
+            60: [[60, 120], [60, 120, 240], [60, 120, 240, 480]],
+            64: [[64, 128], [64, 128, 256], [64, 128, 256, 512]]
+        }
+
+        channels_2, channels_3, channels_4 = self.channels[width]
+        num_modules_2, num_modules_3, num_modules_4 = 1, 4, 3
+        self._out_channels = [sum(channels_4)] if self.upsample else channels_4
+        self._out_strides = [4] if self.upsample else [4, 8, 16, 32]
+
+        self.conv_layer1_1 = ConvNormLayer(
+            ch_in=3,
+            ch_out=64,
+            filter_size=3,
+            stride=2,
+            norm_decay=norm_decay,
+            freeze_norm=freeze_norm,
+            act='relu',
+            name="layer1_1")
+
+        self.conv_layer1_2 = ConvNormLayer(
+            ch_in=64,
+            ch_out=64,
+            filter_size=3,
+            stride=2,
+            norm_decay=norm_decay,
+            freeze_norm=freeze_norm,
+            act='relu',
+            name="layer1_2")
+
+        self.la1 = Layer1(
+            num_channels=64,
+            has_se=has_se,
+            norm_decay=norm_decay,
+            freeze_norm=freeze_norm,
+            name="layer2")
+
+        self.tr1 = TransitionLayer(
+            in_channels=[256],
+            out_channels=channels_2,
+            norm_decay=norm_decay,
+            freeze_norm=freeze_norm,
+            name="tr1")
+
+        self.st2 = Stage(
+            num_channels=channels_2,
+            num_modules=num_modules_2,
+            num_filters=channels_2,
+            has_se=self.has_se,
+            norm_decay=norm_decay,
+            freeze_norm=freeze_norm,
+            name="st2")
+
+        self.tr2 = TransitionLayer(
+            in_channels=channels_2,
+            out_channels=channels_3,
+            norm_decay=norm_decay,
+            freeze_norm=freeze_norm,
+            name="tr2")
+
+        self.st3 = Stage(
+            num_channels=channels_3,
+            num_modules=num_modules_3,
+            num_filters=channels_3,
+            has_se=self.has_se,
+            norm_decay=norm_decay,
+            freeze_norm=freeze_norm,
+            name="st3")
+
+        self.tr3 = TransitionLayer(
+            in_channels=channels_3,
+            out_channels=channels_4,
+            norm_decay=norm_decay,
+            freeze_norm=freeze_norm,
+            name="tr3")
+        self.st4 = Stage(
+            num_channels=channels_4,
+            num_modules=num_modules_4,
+            num_filters=channels_4,
+            has_se=self.has_se,
+            norm_decay=norm_decay,
+            freeze_norm=freeze_norm,
+            multi_scale_output=len(return_idx) > 1,
+            name="st4")
+
+    def forward(self, inputs):
+        x = inputs['image']
+        conv1 = self.conv_layer1_1(x)
+        conv2 = self.conv_layer1_2(conv1)
+
+        la1 = self.la1(conv2)
+        tr1 = self.tr1([la1])
+        st2 = self.st2(tr1)
+        tr2 = self.tr2(st2)
+
+        st3 = self.st3(tr2)
+        tr3 = self.tr3(st3)
+
+        st4 = self.st4(tr3)
+
+        if self.upsample:
+            # Upsampling
+            x0_h, x0_w = st4[0].shape[2:4]
+            x1 = F.upsample(st4[1], size=(x0_h, x0_w), mode='bilinear')
+            x2 = F.upsample(st4[2], size=(x0_h, x0_w), mode='bilinear')
+            x3 = F.upsample(st4[3], size=(x0_h, x0_w), mode='bilinear')
+            x = paddle.concat([st4[0], x1, x2, x3], 1)
+            return x
+
+        res = []
+        for i, layer in enumerate(st4):
+            if i == self.freeze_at:
+                layer.stop_gradient = True
+            if i in self.return_idx:
+                res.append(layer)
+
+        return res
+
+    @property
+    def out_shape(self):
+        if self.upsample:
+            self.return_idx = [0]
+        return [
+            ShapeSpec(
+                channels=self._out_channels[i], stride=self._out_strides[i])
+            for i in self.return_idx
+        ]

+ 259 - 0
paddlers/models/ppdet/modeling/backbones/lcnet.py

@@ -0,0 +1,259 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import paddle
+import paddle.nn as nn
+from paddle import ParamAttr
+from paddle.nn import AdaptiveAvgPool2D, Conv2D
+from paddle.regularizer import L2Decay
+from paddle.nn.initializer import KaimingNormal
+
+from paddlers.models.ppdet.core.workspace import register, serializable
+from numbers import Integral
+from ..shape_spec import ShapeSpec
+
+__all__ = ['LCNet']
+
+# Per-stage block table used to build LCNet. Each row is
+# [kernel_size, in_channels, out_channels, stride, use_se]; the channel
+# counts are unscaled and get multiplied by `scale` inside LCNet.
+NET_CONFIG = {
+    "blocks2": [
+        [3, 16, 32, 1, False],
+    ],
+    "blocks3": [
+        [3, 32, 64, 2, False],
+        [3, 64, 64, 1, False],
+    ],
+    "blocks4": [
+        [3, 64, 128, 2, False],
+        [3, 128, 128, 1, False],
+    ],
+    "blocks5": [
+        [3, 128, 256, 2, False],
+        [5, 256, 256, 1, False],
+        [5, 256, 256, 1, False],
+        [5, 256, 256, 1, False],
+        [5, 256, 256, 1, False],
+        [5, 256, 256, 1, False],
+    ],
+    "blocks6": [
+        [5, 256, 512, 2, True],
+        [5, 512, 512, 1, True],
+    ],
+}
+
+
+def make_divisible(v, divisor=8, min_value=None):
+    """Round ``v`` to a multiple of ``divisor``, never below ``min_value``.
+
+    ``v`` is rounded to the nearest multiple of ``divisor`` (halves round
+    up) and clamped to at least ``min_value`` (``divisor`` when omitted).
+    If the result ends up below 90% of ``v``, one more ``divisor`` is added.
+    """
+    lower_bound = divisor if min_value is None else min_value
+    nearest_multiple = int(v + divisor / 2) // divisor * divisor
+    result = max(lower_bound, nearest_multiple)
+    # Do not let rounding shrink the value by more than 10%.
+    if result < 0.9 * v:
+        result += divisor
+    return result
+
+
+class ConvBNLayer(nn.Layer):
+    """Bias-free Conv2D followed by BatchNorm2D and Hardswish.
+
+    Args:
+        num_channels: input channel count of the convolution.
+        filter_size: kernel size; padding is ``(filter_size - 1) // 2``.
+        num_filters: output channel count (also the BatchNorm width).
+        stride: convolution stride.
+        num_groups: convolution groups.
+    """
+
+    def __init__(self,
+                 num_channels,
+                 filter_size,
+                 num_filters,
+                 stride,
+                 num_groups=1):
+        super().__init__()
+
+        pad = (filter_size - 1) // 2
+        conv_weight_attr = ParamAttr(initializer=KaimingNormal())
+        self.conv = Conv2D(
+            in_channels=num_channels,
+            out_channels=num_filters,
+            kernel_size=filter_size,
+            stride=stride,
+            padding=pad,
+            groups=num_groups,
+            weight_attr=conv_weight_attr,
+            bias_attr=False)
+
+        # BatchNorm scale and shift use an L2 regularizer of coefficient 0.
+        bn_weight_attr = ParamAttr(regularizer=L2Decay(0.0))
+        bn_bias_attr = ParamAttr(regularizer=L2Decay(0.0))
+        self.bn = nn.BatchNorm2D(
+            num_filters, weight_attr=bn_weight_attr, bias_attr=bn_bias_attr)
+        self.hardswish = nn.Hardswish()
+
+    def forward(self, x):
+        """Apply conv, batch norm and hardswish in that order."""
+        return self.hardswish(self.bn(self.conv(x)))
+
+
+class DepthwiseSeparable(nn.Layer):
+    """Depthwise conv, optional SE module, then 1x1 pointwise conv.
+
+    Args:
+        num_channels: input channels (also the depthwise output width).
+        num_filters: output channels of the pointwise conv.
+        stride: stride of the depthwise conv.
+        dw_size: kernel size of the depthwise conv.
+        use_se: whether to insert an ``SEModule`` between the two convs.
+    """
+
+    def __init__(self,
+                 num_channels,
+                 num_filters,
+                 stride,
+                 dw_size=3,
+                 use_se=False):
+        super().__init__()
+        self.use_se = use_se
+
+        # groups == channels makes this a depthwise convolution.
+        self.dw_conv = ConvBNLayer(
+            num_channels=num_channels,
+            filter_size=dw_size,
+            num_filters=num_channels,
+            stride=stride,
+            num_groups=num_channels)
+        if use_se:
+            self.se = SEModule(num_channels)
+        self.pw_conv = ConvBNLayer(
+            num_channels=num_channels,
+            filter_size=1,
+            num_filters=num_filters,
+            stride=1)
+
+    def forward(self, x):
+        """Run depthwise conv, SE (if enabled) and pointwise conv."""
+        out = self.dw_conv(x)
+        if self.use_se:
+            out = self.se(out)
+        return self.pw_conv(out)
+
+
+class SEModule(nn.Layer):
+    """Squeeze-and-excitation style channel gate.
+
+    The input is average-pooled to 1x1, passed through two 1x1 convs
+    (``channel -> channel // reduction -> channel``) with ReLU and
+    Hardsigmoid, and the result multiplies the original input.
+    """
+
+    def __init__(self, channel, reduction=4):
+        super().__init__()
+        squeezed = channel // reduction
+
+        self.avg_pool = AdaptiveAvgPool2D(1)
+        self.conv1 = Conv2D(
+            in_channels=channel,
+            out_channels=squeezed,
+            kernel_size=1,
+            stride=1,
+            padding=0)
+        self.relu = nn.ReLU()
+        self.conv2 = Conv2D(
+            in_channels=squeezed,
+            out_channels=channel,
+            kernel_size=1,
+            stride=1,
+            padding=0)
+        self.hardsigmoid = nn.Hardsigmoid()
+
+    def forward(self, x):
+        """Return ``x`` multiplied by its computed gate."""
+        gate = self.avg_pool(x)
+        gate = self.relu(self.conv1(gate))
+        gate = self.hardsigmoid(self.conv2(gate))
+        return paddle.multiply(x=x, y=gate)
+
+
+@register
+@serializable
+class LCNet(nn.Layer):
+    """LCNet backbone assembled from the stage table in ``NET_CONFIG``.
+
+    Args:
+        scale (float): multiplier applied to every channel count before it
+            is rounded with ``make_divisible``.
+        feature_maps (list): stage outputs to return; the values 2, 3, 4
+            and 5 select the outputs of blocks3, blocks4, blocks5 and
+            blocks6 respectively.
+    """
+
+    def __init__(self, scale=1.0, feature_maps=[3, 4, 5]):
+        super().__init__()
+        self.scale = scale
+        self.feature_maps = feature_maps
+
+        self.conv1 = ConvBNLayer(
+            num_channels=3,
+            filter_size=3,
+            num_filters=make_divisible(16 * scale),
+            stride=2)
+
+        # Stages are created in the same order in which forward() runs them.
+        self.blocks2 = self._build_stage("blocks2", scale)
+        self.blocks3 = self._build_stage("blocks3", scale)
+        self.blocks4 = self._build_stage("blocks4", scale)
+        self.blocks5 = self._build_stage("blocks5", scale)
+        self.blocks6 = self._build_stage("blocks6", scale)
+
+        # Output width of each candidate stage is the scaled out_channels
+        # of its last block; position p corresponds to feature map p + 2.
+        candidate_channels = [
+            make_divisible(NET_CONFIG[key][-1][2] * scale)
+            for key in ("blocks3", "blocks4", "blocks5", "blocks6")
+        ]
+        self._out_channels = [
+            ch for pos, ch in enumerate(candidate_channels)
+            if pos + 2 in feature_maps
+        ]
+
+    @staticmethod
+    def _build_stage(key, scale):
+        """Build the ``nn.Sequential`` of blocks listed under ``key``."""
+        blocks = []
+        for k, in_c, out_c, s, se in NET_CONFIG[key]:
+            blocks.append(
+                DepthwiseSeparable(
+                    num_channels=make_divisible(in_c * scale),
+                    num_filters=make_divisible(out_c * scale),
+                    dw_size=k,
+                    stride=s,
+                    use_se=se))
+        return nn.Sequential(*blocks)
+
+    def forward(self, inputs):
+        """Return the selected stage outputs for ``inputs['image']``."""
+        x = self.blocks2(self.conv1(inputs['image']))
+
+        stage_outs = []
+        for stage in (self.blocks3, self.blocks4, self.blocks5,
+                      self.blocks6):
+            x = stage(x)
+            stage_outs.append(x)
+
+        return [
+            feat for pos, feat in enumerate(stage_outs)
+            if pos + 2 in self.feature_maps
+        ]
+
+    @property
+    def out_shape(self):
+        """One ``ShapeSpec`` (channels only) per returned feature map."""
+        return [ShapeSpec(channels=ch) for ch in self._out_channels]

+ 886 - 0
paddlers/models/ppdet/modeling/backbones/lite_hrnet.py

@@ -0,0 +1,886 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+This code is based on
+https://github.com/HRNet/Lite-HRNet/blob/hrnet/models/backbones/litehrnet.py
+"""
+
+import paddle
+import paddle.nn as nn
+import paddle.nn.functional as F
+
+from numbers import Integral
+from paddle import ParamAttr
+from paddle.regularizer import L2Decay
+from paddle.nn.initializer import Normal, Constant
+from paddlers.models.ppdet.core.workspace import register
+from paddlers.models.ppdet.modeling.shape_spec import ShapeSpec
+from paddlers.models.ppdet.modeling.ops import channel_shuffle
+from .. import layers as L
+
+__all__ = ['LiteHRNet']
+
+
+class ConvNormLayer(nn.Layer):
+    """Conv2D with an optional norm layer and optional activation.
+
+    Args:
+        ch_in: input channels.
+        ch_out: output channels.
+        filter_size: kernel size; padding is ``(filter_size - 1) // 2``.
+        stride: convolution stride.
+        groups: convolution groups.
+        norm_type: ``None`` (no norm, conv has a bias) or one of
+            ``'bn'``, ``'sync_bn'``, ``'gn'`` (conv has no bias).
+        norm_groups: number of groups when ``norm_type`` is ``'gn'``.
+        norm_decay: L2 decay coefficient for the norm parameters.
+        freeze_norm: if true, norm parameters get learning rate 0 and
+            ``stop_gradient``; batch norm also uses global statistics.
+        act: ``'relu'``, ``'sigmoid'``; any other value applies nothing.
+    """
+
+    def __init__(self,
+                 ch_in,
+                 ch_out,
+                 filter_size,
+                 stride=1,
+                 groups=1,
+                 norm_type=None,
+                 norm_groups=32,
+                 norm_decay=0.,
+                 freeze_norm=False,
+                 act=None):
+        super().__init__()
+        self.act = act
+
+        if norm_type is None:
+            # Without a norm layer the convolution carries its own bias.
+            self.norm = None
+            use_conv_bias = True
+        else:
+            assert norm_type in ['bn', 'sync_bn', 'gn'], \
+                "norm_type should be one of ['bn', 'sync_bn', 'gn'], but got {}".format(norm_type)
+            norm_lr = 0. if freeze_norm else 1.
+            scale_attr = ParamAttr(
+                initializer=Constant(1.0),
+                learning_rate=norm_lr,
+                regularizer=L2Decay(norm_decay))
+            shift_attr = ParamAttr(
+                learning_rate=norm_lr, regularizer=L2Decay(norm_decay))
+
+            if norm_type == 'gn':
+                self.norm = nn.GroupNorm(
+                    num_groups=norm_groups,
+                    num_channels=ch_out,
+                    weight_attr=scale_attr,
+                    bias_attr=shift_attr)
+            else:
+                # 'bn' and 'sync_bn' both map to BatchNorm2D here.
+                self.norm = nn.BatchNorm2D(
+                    ch_out,
+                    weight_attr=scale_attr,
+                    bias_attr=shift_attr,
+                    use_global_stats=True if freeze_norm else None)
+
+            if freeze_norm:
+                for param in self.norm.parameters():
+                    param.stop_gradient = True
+            use_conv_bias = False
+
+        self.conv = nn.Conv2D(
+            in_channels=ch_in,
+            out_channels=ch_out,
+            kernel_size=filter_size,
+            stride=stride,
+            padding=(filter_size - 1) // 2,
+            groups=groups,
+            weight_attr=ParamAttr(initializer=Normal(
+                mean=0., std=0.001)),
+            bias_attr=use_conv_bias)
+
+    def forward(self, inputs):
+        """Apply conv, then norm (if any), then the activation (if any)."""
+        out = self.conv(inputs)
+        if self.norm is not None:
+            out = self.norm(out)
+
+        if self.act == 'relu':
+            return F.relu(out)
+        if self.act == 'sigmoid':
+            return F.sigmoid(out)
+        return out
+
+
+class DepthWiseSeparableConvNormLayer(nn.Layer):
+    """Depthwise ``ConvNormLayer`` followed by a 1x1 pointwise one.
+
+    The depthwise conv keeps ``ch_in`` channels (groups == ``ch_in``) and
+    applies ``filter_size``/``stride``; the pointwise conv maps ``ch_in``
+    to ``ch_out``. Norm type and activation are set per conv through the
+    ``dw_*`` / ``pw_*`` arguments.
+    """
+
+    def __init__(self,
+                 ch_in,
+                 ch_out,
+                 filter_size,
+                 stride=1,
+                 dw_norm_type=None,
+                 pw_norm_type=None,
+                 norm_decay=0.,
+                 freeze_norm=False,
+                 dw_act=None,
+                 pw_act=None):
+        super().__init__()
+        # Settings shared by both convolutions.
+        shared = dict(norm_decay=norm_decay, freeze_norm=freeze_norm)
+
+        self.depthwise_conv = ConvNormLayer(
+            ch_in=ch_in,
+            ch_out=ch_in,
+            filter_size=filter_size,
+            stride=stride,
+            groups=ch_in,
+            norm_type=dw_norm_type,
+            act=dw_act,
+            **shared)
+        self.pointwise_conv = ConvNormLayer(
+            ch_in=ch_in,
+            ch_out=ch_out,
+            filter_size=1,
+            stride=1,
+            norm_type=pw_norm_type,
+            act=pw_act,
+            **shared)
+
+    def forward(self, x):
+        """Run the depthwise conv and then the pointwise conv."""
+        return self.pointwise_conv(self.depthwise_conv(x))
+
+
+class CrossResolutionWeightingModule(nn.Layer):
+    """Compute per-branch weights from all branches pooled together.
+
+    Args:
+        channels: list with the channel count of every branch.
+        ratio: channel reduction factor of the bottleneck 1x1 conv.
+        norm_type: norm type passed to both ``ConvNormLayer`` instances.
+        freeze_norm: forwarded to ``ConvNormLayer``.
+        norm_decay: forwarded to ``ConvNormLayer``.
+    """
+
+    def __init__(self,
+                 channels,
+                 ratio=16,
+                 norm_type='bn',
+                 freeze_norm=False,
+                 norm_decay=0.):
+        super().__init__()
+        self.channels = channels
+        total_channel = sum(channels)
+        reduced_channel = total_channel // ratio
+        shared = dict(
+            filter_size=1,
+            stride=1,
+            norm_type=norm_type,
+            freeze_norm=freeze_norm,
+            norm_decay=norm_decay)
+
+        self.conv1 = ConvNormLayer(
+            ch_in=total_channel, ch_out=reduced_channel, act='relu', **shared)
+        self.conv2 = ConvNormLayer(
+            ch_in=reduced_channel,
+            ch_out=total_channel,
+            act='sigmoid',
+            **shared)
+
+    def forward(self, x):
+        """Weight every tensor in the list ``x`` and return the new list."""
+        # Pool all but the last branch to the last branch's spatial size.
+        smallest_hw = x[-1].shape[-2:]
+        pooled = [F.adaptive_avg_pool2d(feat, smallest_hw) for feat in x[:-1]]
+        pooled.append(x[-1])
+
+        weights = paddle.concat(pooled, 1)
+        weights = self.conv2(self.conv1(weights))
+        per_branch = paddle.split(weights, self.channels, 1)
+
+        # Resize each weight map back to its branch and apply it.
+        weighted = []
+        for feat, weight in zip(x, per_branch):
+            resized = F.interpolate(weight, feat.shape[-2:], mode='nearest')
+            weighted.append(feat * resized)
+        return weighted
+
+
+class SpatialWeightingModule(nn.Layer):
+    """Gate the input with weights computed from its global average.
+
+    The input is pooled to 1x1 and passed through two 1x1
+    ``ConvNormLayer`` instances (``in_channel -> in_channel // ratio ->
+    in_channel``, ReLU then sigmoid). No ``norm_type`` is given, so these
+    convs have a bias and no norm layer.
+    """
+
+    def __init__(self, in_channel, ratio=16, freeze_norm=False, norm_decay=0.):
+        super().__init__()
+        reduced_channel = in_channel // ratio
+        shared = dict(
+            filter_size=1,
+            stride=1,
+            freeze_norm=freeze_norm,
+            norm_decay=norm_decay)
+
+        self.global_avgpooling = nn.AdaptiveAvgPool2D(1)
+        self.conv1 = ConvNormLayer(
+            ch_in=in_channel, ch_out=reduced_channel, act='relu', **shared)
+        self.conv2 = ConvNormLayer(
+            ch_in=reduced_channel, ch_out=in_channel, act='sigmoid', **shared)
+
+    def forward(self, x):
+        """Return ``x`` multiplied by its computed gate."""
+        gate = self.global_avgpooling(x)
+        gate = self.conv2(self.conv1(gate))
+        return x * gate
+
+
+class ConditionalChannelWeightingBlock(nn.Layer):
+    """Multi-branch block that weights half of each branch's channels.
+
+    Each branch is split in two along the channel axis. The second halves
+    go through cross-resolution weighting, a depthwise 3x3 conv and
+    spatial weighting; they are then concatenated with the untouched
+    first halves and channel-shuffled.
+
+    Args:
+        in_channels: list with the channel count of every branch.
+        stride: stride of the depthwise convs; must be 1 or 2.
+        reduce_ratio: reduction ratio of the cross-resolution weighting.
+        norm_type: norm type for the weighting and depthwise convs.
+        freeze_norm: forwarded to the sub-layers.
+        norm_decay: forwarded to the sub-layers.
+    """
+
+    def __init__(self,
+                 in_channels,
+                 stride,
+                 reduce_ratio,
+                 norm_type='bn',
+                 freeze_norm=False,
+                 norm_decay=0.):
+        super().__init__()
+        assert stride in [1, 2]
+        # Only half of every branch's channels is processed.
+        half_channels = [ch // 2 for ch in in_channels]
+
+        self.cross_resolution_weighting = CrossResolutionWeightingModule(
+            half_channels,
+            ratio=reduce_ratio,
+            norm_type=norm_type,
+            freeze_norm=freeze_norm,
+            norm_decay=norm_decay)
+
+        dw_layers = []
+        for ch in half_channels:
+            dw_layers.append(
+                ConvNormLayer(
+                    ch,
+                    ch,
+                    filter_size=3,
+                    stride=stride,
+                    groups=ch,
+                    norm_type=norm_type,
+                    freeze_norm=freeze_norm,
+                    norm_decay=norm_decay))
+        self.depthwise_convs = nn.LayerList(dw_layers)
+
+        sw_layers = []
+        for ch in half_channels:
+            sw_layers.append(
+                SpatialWeightingModule(
+                    ch,
+                    ratio=4,
+                    freeze_norm=freeze_norm,
+                    norm_decay=norm_decay))
+        self.spatial_weighting = nn.LayerList(sw_layers)
+
+    def forward(self, x):
+        """Process the list of branch tensors ``x``; returns a new list."""
+        halves = [feat.chunk(2, axis=1) for feat in x]
+        untouched = [pair[0] for pair in halves]
+        processed = [pair[1] for pair in halves]
+
+        processed = self.cross_resolution_weighting(processed)
+        processed = [
+            conv(feat) for feat, conv in zip(processed, self.depthwise_convs)
+        ]
+        processed = [
+            gate(feat) for feat, gate in zip(processed, self.spatial_weighting)
+        ]
+
+        merged = []
+        for first, second in zip(untouched, processed):
+            joined = paddle.concat([first, second], axis=1)
+            merged.append(channel_shuffle(joined, groups=2))
+        return merged
+
+
+class ShuffleUnit(nn.Layer):
+    """Two-branch shuffle unit.
+
+    With ``stride == 1`` the input is split in half along channels; the
+    first half passes through unchanged and the second goes through
+    ``branch2``. With ``stride > 1`` the whole input feeds both
+    ``branch1`` and ``branch2``. The two results are concatenated and
+    channel-shuffled.
+
+    Args:
+        in_channel: input channels; must equal ``2 * (out_channel // 2)``
+            when ``stride == 1``.
+        out_channel: output channels; each branch yields half of it.
+        stride: stride of the depthwise convs.
+        norm_type: norm type for every ``ConvNormLayer``.
+        freeze_norm: forwarded to ``ConvNormLayer``.
+        norm_decay: forwarded to ``ConvNormLayer``.
+    """
+
+    def __init__(self,
+                 in_channel,
+                 out_channel,
+                 stride,
+                 norm_type='bn',
+                 freeze_norm=False,
+                 norm_decay=0.):
+        super().__init__()
+        branch_channel = out_channel // 2
+        self.stride = stride
+        if self.stride == 1:
+            assert in_channel == branch_channel * 2, \
+                "when stride=1, in_channel {} should equal to branch_channel*2 {}".format(in_channel, branch_channel * 2)
+
+        norm_cfg = dict(
+            norm_type=norm_type,
+            freeze_norm=freeze_norm,
+            norm_decay=norm_decay)
+
+        if stride > 1:
+            # Downsampling shortcut: depthwise 3x3, then 1x1 with ReLU.
+            self.branch1 = nn.Sequential(
+                ConvNormLayer(
+                    ch_in=in_channel,
+                    ch_out=in_channel,
+                    filter_size=3,
+                    stride=self.stride,
+                    groups=in_channel,
+                    **norm_cfg),
+                ConvNormLayer(
+                    ch_in=in_channel,
+                    ch_out=branch_channel,
+                    filter_size=1,
+                    stride=1,
+                    act='relu',
+                    **norm_cfg), )
+
+        # branch2 sees half the channels at stride 1, all of them otherwise.
+        branch2_in = branch_channel if stride == 1 else in_channel
+        self.branch2 = nn.Sequential(
+            ConvNormLayer(
+                ch_in=branch2_in,
+                ch_out=branch_channel,
+                filter_size=1,
+                stride=1,
+                act='relu',
+                **norm_cfg),
+            ConvNormLayer(
+                ch_in=branch_channel,
+                ch_out=branch_channel,
+                filter_size=3,
+                stride=self.stride,
+                groups=branch_channel,
+                **norm_cfg),
+            ConvNormLayer(
+                ch_in=branch_channel,
+                ch_out=branch_channel,
+                filter_size=1,
+                stride=1,
+                act='relu',
+                **norm_cfg), )
+
+    def forward(self, x):
+        """Apply the unit to ``x`` and return the shuffled result."""
+        if self.stride > 1:
+            left = self.branch1(x)
+            right = self.branch2(x)
+        else:
+            left, right = x.chunk(2, axis=1)
+            right = self.branch2(right)
+        merged = paddle.concat([left, right], axis=1)
+        return channel_shuffle(merged, groups=2)
+
+
+class IterativeHead(nn.Layer):
+    """Fuse branches iteratively, starting from the last one.
+
+    ``in_channels`` is stored reversed. Projection ``i`` maps reversed
+    branch ``i`` to the channel count of reversed branch ``i + 1``; the
+    final projection keeps its channel count.
+
+    Args:
+        in_channels: list with the channel count of every branch.
+        norm_type: norm type for both convs of every projection.
+        freeze_norm: forwarded to the projections.
+        norm_decay: forwarded to the projections.
+    """
+
+    def __init__(self,
+                 in_channels,
+                 norm_type='bn',
+                 freeze_norm=False,
+                 norm_decay=0.):
+        super().__init__()
+        num_branches = len(in_channels)
+        self.in_channels = in_channels[::-1]
+
+        projects = []
+        for i in range(num_branches):
+            is_last = i == num_branches - 1
+            # Every projection but the last changes the channel count to
+            # that of the next (reversed) branch.
+            out_ch = self.in_channels[i] if is_last else self.in_channels[i +
+                                                                          1]
+            projects.append(
+                DepthWiseSeparableConvNormLayer(
+                    ch_in=self.in_channels[i],
+                    ch_out=out_ch,
+                    filter_size=3,
+                    stride=1,
+                    dw_act=None,
+                    pw_act='relu',
+                    dw_norm_type=norm_type,
+                    pw_norm_type=norm_type,
+                    freeze_norm=freeze_norm,
+                    norm_decay=norm_decay))
+        self.projects = nn.LayerList(projects)
+
+    def forward(self, x):
+        """Return the fused branches in the same order as the input list."""
+        reversed_feats = x[::-1]
+        fused = []
+        previous = None
+        for level, feat in enumerate(reversed_feats):
+            if previous is not None:
+                # Resize the previous result to this branch and add it.
+                previous = F.interpolate(
+                    previous,
+                    size=feat.shape[-2:],
+                    mode='bilinear',
+                    align_corners=True)
+                feat = feat + previous
+            feat = self.projects[level](feat)
+            fused.append(feat)
+            previous = feat
+
+        return fused[::-1]
+
+
+class Stem(nn.Layer):
+    """Network stem: a stride-2 conv followed by a two-branch block.
+
+    After ``conv1`` the tensor is split in half along channels. One half
+    goes through ``branch1`` (depthwise stride-2 conv, then 1x1 conv); the
+    other goes through expand (1x1), depthwise stride-2 and linear (1x1)
+    convs. The halves are concatenated and channel-shuffled.
+
+    Args:
+        in_channel: channels of the input tensor.
+        stem_channel: output channels of ``conv1``.
+        out_channel: channels of the stem output.
+        expand_ratio: ``mid_channel = round(stem_channel * expand_ratio)``.
+        norm_type: norm type for every ``ConvNormLayer``.
+        freeze_norm: forwarded to ``ConvNormLayer``.
+        norm_decay: forwarded to ``ConvNormLayer``.
+    """
+
+    def __init__(self,
+                 in_channel,
+                 stem_channel,
+                 out_channel,
+                 expand_ratio,
+                 norm_type='bn',
+                 freeze_norm=False,
+                 norm_decay=0.):
+        super().__init__()
+        norm_cfg = dict(
+            norm_type=norm_type,
+            freeze_norm=freeze_norm,
+            norm_decay=norm_decay)
+
+        self.conv1 = ConvNormLayer(
+            in_channel,
+            stem_channel,
+            filter_size=3,
+            stride=2,
+            act='relu',
+            **norm_cfg)
+
+        mid_channel = int(round(stem_channel * expand_ratio))
+        branch_channel = stem_channel // 2
+        # Channel split between the two branches depends on whether the
+        # stem keeps its width.
+        if stem_channel == out_channel:
+            inc_channel = out_channel - branch_channel
+            linear_channel = branch_channel
+        else:
+            inc_channel = out_channel - stem_channel
+            linear_channel = stem_channel
+
+        self.branch1 = nn.Sequential(
+            ConvNormLayer(
+                ch_in=branch_channel,
+                ch_out=branch_channel,
+                filter_size=3,
+                stride=2,
+                groups=branch_channel,
+                **norm_cfg),
+            ConvNormLayer(
+                ch_in=branch_channel,
+                ch_out=inc_channel,
+                filter_size=1,
+                stride=1,
+                act='relu',
+                **norm_cfg), )
+        self.expand_conv = ConvNormLayer(
+            ch_in=branch_channel,
+            ch_out=mid_channel,
+            filter_size=1,
+            stride=1,
+            act='relu',
+            **norm_cfg)
+        self.depthwise_conv = ConvNormLayer(
+            ch_in=mid_channel,
+            ch_out=mid_channel,
+            filter_size=3,
+            stride=2,
+            groups=mid_channel,
+            **norm_cfg)
+        self.linear_conv = ConvNormLayer(
+            ch_in=mid_channel,
+            ch_out=linear_channel,
+            filter_size=1,
+            stride=1,
+            act='relu',
+            **norm_cfg)
+
+    def forward(self, x):
+        """Apply the stem to ``x`` and return the shuffled result."""
+        stem_out = self.conv1(x)
+        left, right = stem_out.chunk(2, axis=1)
+
+        left = self.branch1(left)
+        right = self.linear_conv(self.depthwise_conv(self.expand_conv(right)))
+
+        merged = paddle.concat([left, right], axis=1)
+        return channel_shuffle(merged, groups=2)
+
+
+class LiteHRNetModule(nn.Layer):
+    """One multi-branch Lite-HRNet module with optional branch fusion.
+
+    Args:
+        num_branches: number of branches; must equal
+            ``len(in_channels)``.
+        num_blocks: number of blocks stacked in the module (per branch
+            for ``'NAIVE'``).
+        in_channels: list with the channel count of every branch.
+        reduce_ratio: reduction ratio used by the ``'LITE'`` blocks.
+        module_type: ``'LITE'`` (``ConditionalChannelWeightingBlock``) or
+            ``'NAIVE'`` (``ShuffleUnit`` per branch).
+        multiscale_output: when fusing, produce one output per branch
+            instead of only the first; when not fusing, a false value
+            makes ``forward`` return only the first branch.
+        with_fuse: whether to build and apply the fuse layers.
+        norm_type: norm type passed to the weighting blocks / shuffle
+            units. The fuse layers always use ``nn.BatchNorm2D``.
+        freeze_norm: forwarded to the blocks.
+        norm_decay: forwarded to the blocks.
+    """
+
+    def __init__(self,
+                 num_branches,
+                 num_blocks,
+                 in_channels,
+                 reduce_ratio,
+                 module_type,
+                 multiscale_output=False,
+                 with_fuse=True,
+                 norm_type='bn',
+                 freeze_norm=False,
+                 norm_decay=0.):
+        super(LiteHRNetModule, self).__init__()
+        assert num_branches == len(in_channels),\
+            "num_branches {} should equal to num_in_channels {}".format(num_branches, len(in_channels))
+        assert module_type in [
+            'LITE', 'NAIVE'
+        ], "module_type should be one of ['LITE', 'NAIVE']"
+        self.num_branches = num_branches
+        self.in_channels = in_channels
+        self.multiscale_output = multiscale_output
+        self.with_fuse = with_fuse
+        # Honor the caller's choice; this used to be hard-coded to 'bn',
+        # which silently discarded the `norm_type` argument.
+        self.norm_type = norm_type
+        self.module_type = module_type
+
+        if self.module_type == 'LITE':
+            self.layers = self._make_weighting_blocks(
+                num_blocks,
+                reduce_ratio,
+                freeze_norm=freeze_norm,
+                norm_decay=norm_decay)
+        elif self.module_type == 'NAIVE':
+            self.layers = self._make_naive_branches(
+                num_branches,
+                num_blocks,
+                freeze_norm=freeze_norm,
+                norm_decay=norm_decay)
+
+        if self.with_fuse:
+            self.fuse_layers = self._make_fuse_layers(
+                freeze_norm=freeze_norm, norm_decay=norm_decay)
+            self.relu = nn.ReLU()
+
+    def _make_weighting_blocks(self,
+                               num_blocks,
+                               reduce_ratio,
+                               stride=1,
+                               freeze_norm=False,
+                               norm_decay=0.):
+        """Stack ``num_blocks`` conditional channel weighting blocks."""
+        layers = []
+        for i in range(num_blocks):
+            layers.append(
+                ConditionalChannelWeightingBlock(
+                    self.in_channels,
+                    stride=stride,
+                    reduce_ratio=reduce_ratio,
+                    norm_type=self.norm_type,
+                    freeze_norm=freeze_norm,
+                    norm_decay=norm_decay))
+        return nn.Sequential(*layers)
+
+    def _make_naive_branches(self,
+                             num_branches,
+                             num_blocks,
+                             freeze_norm=False,
+                             norm_decay=0.):
+        """Build one stack of stride-1 ``ShuffleUnit`` blocks per branch."""
+        branches = []
+        for branch_idx in range(num_branches):
+            layers = []
+            for i in range(num_blocks):
+                layers.append(
+                    ShuffleUnit(
+                        self.in_channels[branch_idx],
+                        self.in_channels[branch_idx],
+                        stride=1,
+                        norm_type=self.norm_type,
+                        freeze_norm=freeze_norm,
+                        norm_decay=norm_decay))
+            branches.append(nn.Sequential(*layers))
+        return nn.LayerList(branches)
+
+    def _make_fuse_layers(self, freeze_norm=False, norm_decay=0.):
+        """Build the layers that map branch ``j`` onto output branch ``i``.
+
+        Returns ``None`` for a single-branch module. Otherwise entry
+        ``[i][j]`` is: a 1x1 conv + BatchNorm + nearest upsample by
+        ``2**(j - i)`` when ``j > i``; ``None`` when ``j == i``; a chain of
+        ``i - j`` stride-2 downsampling stages when ``j < i``.
+        ``freeze_norm`` and ``norm_decay`` are accepted but not used here.
+        """
+        if self.num_branches == 1:
+            return None
+        fuse_layers = []
+        num_out_branches = self.num_branches if self.multiscale_output else 1
+        for i in range(num_out_branches):
+            fuse_layer = []
+            for j in range(self.num_branches):
+                if j > i:
+                    fuse_layer.append(
+                        nn.Sequential(
+                            L.Conv2d(
+                                self.in_channels[j],
+                                self.in_channels[i],
+                                kernel_size=1,
+                                stride=1,
+                                padding=0,
+                                bias=False, ),
+                            nn.BatchNorm2D(self.in_channels[i]),
+                            nn.Upsample(
+                                scale_factor=2**(j - i), mode='nearest')))
+                elif j == i:
+                    fuse_layer.append(None)
+                else:
+                    conv_downsamples = []
+                    for k in range(i - j):
+                        if k == i - j - 1:
+                            # Last stage switches to branch i's channel
+                            # count and has no trailing ReLU.
+                            conv_downsamples.append(
+                                nn.Sequential(
+                                    L.Conv2d(
+                                        self.in_channels[j],
+                                        self.in_channels[j],
+                                        kernel_size=3,
+                                        stride=2,
+                                        padding=1,
+                                        groups=self.in_channels[j],
+                                        bias=False, ),
+                                    nn.BatchNorm2D(self.in_channels[j]),
+                                    L.Conv2d(
+                                        self.in_channels[j],
+                                        self.in_channels[i],
+                                        kernel_size=1,
+                                        stride=1,
+                                        padding=0,
+                                        bias=False, ),
+                                    nn.BatchNorm2D(self.in_channels[i])))
+                        else:
+                            # Intermediate stages keep branch j's channel
+                            # count and end with a ReLU.
+                            conv_downsamples.append(
+                                nn.Sequential(
+                                    L.Conv2d(
+                                        self.in_channels[j],
+                                        self.in_channels[j],
+                                        kernel_size=3,
+                                        stride=2,
+                                        padding=1,
+                                        groups=self.in_channels[j],
+                                        bias=False, ),
+                                    nn.BatchNorm2D(self.in_channels[j]),
+                                    L.Conv2d(
+                                        self.in_channels[j],
+                                        self.in_channels[j],
+                                        kernel_size=1,
+                                        stride=1,
+                                        padding=0,
+                                        bias=False, ),
+                                    nn.BatchNorm2D(self.in_channels[j]),
+                                    nn.ReLU()))
+
+                    fuse_layer.append(nn.Sequential(*conv_downsamples))
+            fuse_layers.append(nn.LayerList(fuse_layer))
+
+        return nn.LayerList(fuse_layers)
+
+    def forward(self, x):
+        """Run the module on the list of branch tensors ``x``.
+
+        Returns a list of tensors: the fused outputs when ``with_fuse`` is
+        set, only the first branch when fusing is off and
+        ``multiscale_output`` is false, and all branches otherwise. For
+        ``'NAIVE'`` modules the input list ``x`` is updated in place.
+        """
+        if self.num_branches == 1:
+            return [self.layers[0](x[0])]
+        if self.module_type == 'LITE':
+            out = self.layers(x)
+        elif self.module_type == 'NAIVE':
+            for i in range(self.num_branches):
+                x[i] = self.layers[i](x[i])
+            out = x
+        if self.with_fuse:
+            out_fuse = []
+            for i in range(len(self.fuse_layers)):
+                y = out[0] if i == 0 else self.fuse_layers[i][0](out[0])
+                for j in range(self.num_branches):
+                    # NOTE(review): for j == 0 the accumulator is added to
+                    # itself instead of being skipped. Kept unchanged since
+                    # altering it would change the module's outputs --
+                    # confirm against the reference implementation and any
+                    # pretrained weights before touching it.
+                    if j == 0:
+                        y += y
+                    elif i == j:
+                        y += out[j]
+                    else:
+                        y += self.fuse_layers[i][j](out[j])
+                    if i == 0:
+                        out[i] = y
+                out_fuse.append(self.relu(y))
+            out = out_fuse
+        elif not self.multiscale_output:
+            out = [out[0]]
+        return out
+
+
+@register
+class LiteHRNet(nn.Layer):  # backbone: stem -> 3 x (transition, stage) -> IterativeHead
+    """
+    @inproceedings{Yulitehrnet21,
+    title={Lite-HRNet: A Lightweight High-Resolution Network},
+        author={Yu, Changqian and Xiao, Bin and Gao, Changxin and Yuan, Lu and Zhang, Lei and Sang, Nong and Wang, Jingdong},
+        booktitle={CVPR},year={2021}
+    }
+    Args:
+        network_type (str): the network_type should be one of ["lite_18", "lite_30", "naive", "wider_naive"],
+            "naive": Simply combining the shuffle block in ShuffleNet and the highresolution design pattern in HRNet.
+            "wider_naive": Naive network with wider channels in each block.
+            "lite_18": Lite-HRNet-18, which replaces the pointwise convolution in a shuffle block by conditional channel weighting.
+            "lite_30": Lite-HRNet-30, with more blocks compared with Lite-HRNet-18.
+        freeze_at (int): the stage to freeze
+        freeze_norm (bool): whether to freeze norm in HRNet
+        norm_decay (float): weight decay for normalization layer weights
+        return_idx (List): the stage to return
+    """
+
+    def __init__(self,
+                 network_type,
+                 freeze_at=0,
+                 freeze_norm=True,
+                 norm_decay=0.,
+                 return_idx=[0, 1, 2, 3]):  # NOTE(review): mutable default; stored on self but not mutated in this class
+        super(LiteHRNet, self).__init__()
+        if isinstance(return_idx, Integral):
+            return_idx = [return_idx]  # a bare int is accepted and wrapped into a list
+        assert network_type in ["lite_18", "lite_30", "naive", "wider_naive"], \
+            "the network_type should be one of [lite_18, lite_30, naive, wider_naive]"
+        assert len(return_idx) > 0, "need one or more return index"
+        self.freeze_at = freeze_at
+        self.freeze_norm = freeze_norm
+        self.norm_decay = norm_decay
+        self.return_idx = return_idx
+        self.norm_type = 'bn'
+
+        self.module_configs = {  # per-variant settings; every list holds one entry per stage (3 stages)
+            "lite_18": {
+                "num_modules": [2, 4, 2],
+                "num_branches": [2, 3, 4],
+                "num_blocks": [2, 2, 2],
+                "module_type": ["LITE", "LITE", "LITE"],
+                "reduce_ratios": [8, 8, 8],
+                "num_channels": [[40, 80], [40, 80, 160], [40, 80, 160, 320]],
+            },
+            "lite_30": {
+                "num_modules": [3, 8, 3],
+                "num_branches": [2, 3, 4],
+                "num_blocks": [2, 2, 2],
+                "module_type": ["LITE", "LITE", "LITE"],
+                "reduce_ratios": [8, 8, 8],
+                "num_channels": [[40, 80], [40, 80, 160], [40, 80, 160, 320]],
+            },
+            "naive": {
+                "num_modules": [2, 4, 2],
+                "num_branches": [2, 3, 4],
+                "num_blocks": [2, 2, 2],
+                "module_type": ["NAIVE", "NAIVE", "NAIVE"],
+                "reduce_ratios": [1, 1, 1],
+                "num_channels": [[30, 60], [30, 60, 120], [30, 60, 120, 240]],
+            },
+            "wider_naive": {
+                "num_modules": [2, 4, 2],
+                "num_branches": [2, 3, 4],
+                "num_blocks": [2, 2, 2],
+                "module_type": ["NAIVE", "NAIVE", "NAIVE"],
+                "reduce_ratios": [1, 1, 1],
+                "num_channels": [[40, 80], [40, 80, 160], [40, 80, 160, 320]],
+            },
+        }
+
+        self.stages_config = self.module_configs[network_type]
+
+        self.stem = Stem(3, 32, 32, 1)  # NOTE(review): Stem is defined outside this class; argument meaning not visible here
+        num_channels_pre_layer = [32]  # channel list handed to the first transition
+        for stage_idx in range(3):
+            num_channels = self.stages_config["num_channels"][stage_idx]
+            setattr(self, 'transition{}'.format(stage_idx),  # registered as transition0 / transition1 / transition2
+                    self._make_transition_layer(num_channels_pre_layer,
+                                                num_channels, self.freeze_norm,
+                                                self.norm_decay))
+            stage, num_channels_pre_layer = self._make_stage(  # also yields the channel list for the next transition
+                self.stages_config, stage_idx, num_channels, True,
+                self.freeze_norm, self.norm_decay)
+            setattr(self, 'stage{}'.format(stage_idx), stage)  # registered as stage0 / stage1 / stage2
+        self.head_layer = IterativeHead(num_channels_pre_layer, 'bn',
+                                        self.freeze_norm, self.norm_decay)
+
+    def _make_transition_layer(self,  # builds one adapter (or None) per branch of the coming stage
+                               num_channels_pre_layer,
+                               num_channels_cur_layer,
+                               freeze_norm=False,  # not read anywhere in this method's body
+                               norm_decay=0.):  # not read anywhere in this method's body
+        num_branches_pre = len(num_channels_pre_layer)
+        num_branches_cur = len(num_channels_cur_layer)
+        transition_layers = []
+        for i in range(num_branches_cur):
+            if i < num_branches_pre:  # branch i already exists in the previous stage
+                if num_channels_cur_layer[i] != num_channels_pre_layer[i]:  # width changes: 3x3 grouped conv + 1x1 conv
+                    transition_layers.append(
+                        nn.Sequential(
+                            L.Conv2d(
+                                num_channels_pre_layer[i],
+                                num_channels_pre_layer[i],
+                                kernel_size=3,
+                                stride=1,
+                                padding=1,
+                                groups=num_channels_pre_layer[i],
+                                bias=False),
+                            nn.BatchNorm2D(num_channels_pre_layer[i]),
+                            L.Conv2d(
+                                num_channels_pre_layer[i],
+                                num_channels_cur_layer[i],
+                                kernel_size=1,
+                                stride=1,
+                                padding=0,
+                                bias=False, ),
+                            nn.BatchNorm2D(num_channels_cur_layer[i]),
+                            nn.ReLU()))
+                else:
+                    transition_layers.append(None)  # None makes forward() pass the input through unchanged
+            else:  # new branch: built from the last previous branch using stride-2 blocks
+                conv_downsamples = []
+                for j in range(i + 1 - num_branches_pre):  # one stride-2 block per step beyond the previous branch count
+                    conv_downsamples.append(
+                        nn.Sequential(
+                            L.Conv2d(
+                                num_channels_pre_layer[-1],
+                                num_channels_pre_layer[-1],
+                                groups=num_channels_pre_layer[-1],
+                                kernel_size=3,
+                                stride=2,
+                                padding=1,
+                                bias=False, ),
+                            nn.BatchNorm2D(num_channels_pre_layer[-1]),
+                            L.Conv2d(
+                                num_channels_pre_layer[-1],
+                                num_channels_cur_layer[i]
+                                if j == i - num_branches_pre else  # only the final block switches to the new width
+                                num_channels_pre_layer[-1],
+                                kernel_size=1,
+                                stride=1,
+                                padding=0,
+                                bias=False, ),
+                            nn.BatchNorm2D(num_channels_cur_layer[i]
+                                           if j == i - num_branches_pre else
+                                           num_channels_pre_layer[-1]),
+                            nn.ReLU()))
+                transition_layers.append(nn.Sequential(*conv_downsamples))
+        return nn.LayerList(transition_layers)
+
+    def _make_stage(self,  # stacks num_modules LiteHRNetModule instances; returns (nn.Sequential, channel list)
+                    stages_config,
+                    stage_idx,
+                    in_channels,
+                    multiscale_output,
+                    freeze_norm=False,
+                    norm_decay=0.):
+        num_modules = stages_config["num_modules"][stage_idx]
+        num_branches = stages_config["num_branches"][stage_idx]
+        num_blocks = stages_config["num_blocks"][stage_idx]
+        reduce_ratio = stages_config['reduce_ratios'][stage_idx]
+        module_type = stages_config['module_type'][stage_idx]
+
+        modules = []
+        for i in range(num_modules):
+            if not multiscale_output and i == num_modules - 1:  # only the last module may drop multiscale output
+                reset_multiscale_output = False
+            else:
+                reset_multiscale_output = True
+            modules.append(
+                LiteHRNetModule(
+                    num_branches,
+                    num_blocks,
+                    in_channels,
+                    reduce_ratio,
+                    module_type,
+                    multiscale_output=reset_multiscale_output,
+                    with_fuse=True,
+                    freeze_norm=freeze_norm,
+                    norm_decay=norm_decay))
+            in_channels = modules[-1].in_channels  # the next module consumes the widths this one reports
+        return nn.Sequential(*modules), in_channels
+
+    def forward(self, inputs):  # inputs is a mapping; only inputs['image'] is read
+        x = inputs['image']
+        x = self.stem(x)
+        y_list = [x]
+        for stage_idx in range(3):
+            x_list = []
+            transition = getattr(self, 'transition{}'.format(stage_idx))
+            for j in range(self.stages_config["num_branches"][stage_idx]):
+                if transition[j] is not None:
+                    if j >= len(y_list):  # branch is new in this stage: derive it from the last available output
+                        x_list.append(transition[j](y_list[-1]))
+                    else:
+                        x_list.append(transition[j](y_list[j]))
+                else:
+                    x_list.append(y_list[j])  # no adapter needed: reuse the previous output as-is
+            y_list = getattr(self, 'stage{}'.format(stage_idx))(x_list)
+        x = self.head_layer(y_list)
+        res = []
+        for i, layer in enumerate(x):
+            if i == self.freeze_at:  # only the output at exactly this index gets stop_gradient set
+                layer.stop_gradient = True
+            if i in self.return_idx:
+                res.append(layer)
+        return res
+
+    @property
+    def out_shape(self):  # NOTE(review): _out_channels / _out_strides are never assigned in this class - confirm they are set elsewhere
+        return [
+            ShapeSpec(
+                channels=self._out_channels[i], stride=self._out_strides[i])
+            for i in self.return_idx
+        ]

+ 411 - 0
paddlers/models/ppdet/modeling/backbones/mobilenet_v1.py

@@ -0,0 +1,411 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserve.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import paddle.nn as nn
+import paddle.nn.functional as F
+from paddle import ParamAttr
+from paddle.regularizer import L2Decay
+from paddle.nn.initializer import KaimingNormal
+from paddlers.models.ppdet.core.workspace import register, serializable
+from numbers import Integral
+from ..shape_spec import ShapeSpec
+
+__all__ = ['MobileNet']
+
+
+class ConvBNLayer(nn.Layer):
+    def __init__(self,
+                 in_channels,
+                 out_channels,
+                 kernel_size,
+                 stride,
+                 padding,
+                 num_groups=1,
+                 act='relu',
+                 conv_lr=1.,
+                 conv_decay=0.,
+                 norm_decay=0.,
+                 norm_type='bn',
+                 name=None):
+        super(ConvBNLayer, self).__init__()
+        self.act = act
+        self._conv = nn.Conv2D(
+            in_channels,
+            out_channels,
+            kernel_size=kernel_size,
+            stride=stride,
+            padding=padding,
+            groups=num_groups,
+            weight_attr=ParamAttr(
+                learning_rate=conv_lr,
+                initializer=KaimingNormal(),
+                regularizer=L2Decay(conv_decay)),
+            bias_attr=False)
+
+        param_attr = ParamAttr(regularizer=L2Decay(norm_decay))
+        bias_attr = ParamAttr(regularizer=L2Decay(norm_decay))
+        if norm_type in ['sync_bn', 'bn']:
+            self._batch_norm = nn.BatchNorm2D(
+                out_channels, weight_attr=param_attr, bias_attr=bias_attr)
+
+    def forward(self, x):
+        x = self._conv(x)
+        x = self._batch_norm(x)
+        if self.act == "relu":
+            x = F.relu(x)
+        elif self.act == "relu6":
+            x = F.relu6(x)
+        return x
+
+
+class DepthwiseSeparable(nn.Layer):
+    def __init__(self,
+                 in_channels,
+                 out_channels1,
+                 out_channels2,
+                 num_groups,
+                 stride,
+                 scale,
+                 conv_lr=1.,
+                 conv_decay=0.,
+                 norm_decay=0.,
+                 norm_type='bn',
+                 name=None):
+        super(DepthwiseSeparable, self).__init__()
+
+        self._depthwise_conv = ConvBNLayer(
+            in_channels,
+            int(out_channels1 * scale),
+            kernel_size=3,
+            stride=stride,
+            padding=1,
+            num_groups=int(num_groups * scale),
+            conv_lr=conv_lr,
+            conv_decay=conv_decay,
+            norm_decay=norm_decay,
+            norm_type=norm_type,
+            name=name + "_dw")
+
+        self._pointwise_conv = ConvBNLayer(
+            int(out_channels1 * scale),
+            int(out_channels2 * scale),
+            kernel_size=1,
+            stride=1,
+            padding=0,
+            conv_lr=conv_lr,
+            conv_decay=conv_decay,
+            norm_decay=norm_decay,
+            norm_type=norm_type,
+            name=name + "_sep")
+
+    def forward(self, x):
+        x = self._depthwise_conv(x)
+        x = self._pointwise_conv(x)
+        return x
+
+
+class ExtraBlock(nn.Layer):
+    def __init__(self,
+                 in_channels,
+                 out_channels1,
+                 out_channels2,
+                 num_groups=1,
+                 stride=2,
+                 conv_lr=1.,
+                 conv_decay=0.,
+                 norm_decay=0.,
+                 norm_type='bn',
+                 name=None):
+        super(ExtraBlock, self).__init__()
+
+        self.pointwise_conv = ConvBNLayer(
+            in_channels,
+            int(out_channels1),
+            kernel_size=1,
+            stride=1,
+            padding=0,
+            num_groups=int(num_groups),
+            act='relu6',
+            conv_lr=conv_lr,
+            conv_decay=conv_decay,
+            norm_decay=norm_decay,
+            norm_type=norm_type,
+            name=name + "_extra1")
+
+        self.normal_conv = ConvBNLayer(
+            int(out_channels1),
+            int(out_channels2),
+            kernel_size=3,
+            stride=stride,
+            padding=1,
+            num_groups=int(num_groups),
+            act='relu6',
+            conv_lr=conv_lr,
+            conv_decay=conv_decay,
+            norm_decay=norm_decay,
+            norm_type=norm_type,
+            name=name + "_extra2")
+
+    def forward(self, x):
+        x = self.pointwise_conv(x)
+        x = self.normal_conv(x)
+        return x
+
+
+@register
+@serializable
+class MobileNet(nn.Layer):
+    __shared__ = ['norm_type']
+
+    def __init__(self,
+                 norm_type='bn',
+                 norm_decay=0.,
+                 conv_decay=0.,
+                 scale=1,
+                 conv_learning_rate=1.0,
+                 feature_maps=[4, 6, 13],
+                 with_extra_blocks=False,
+                 extra_block_filters=[[256, 512], [128, 256], [128, 256],
+                                      [64, 128]]):
+        super(MobileNet, self).__init__()
+        if isinstance(feature_maps, Integral):
+            feature_maps = [feature_maps]
+        self.feature_maps = feature_maps
+        self.with_extra_blocks = with_extra_blocks
+        self.extra_block_filters = extra_block_filters
+
+        self._out_channels = []
+
+        self.conv1 = ConvBNLayer(
+            in_channels=3,
+            out_channels=int(32 * scale),
+            kernel_size=3,
+            stride=2,
+            padding=1,
+            conv_lr=conv_learning_rate,
+            conv_decay=conv_decay,
+            norm_decay=norm_decay,
+            norm_type=norm_type,
+            name="conv1")
+
+        self.dwsl = []
+        dws21 = self.add_sublayer(
+            "conv2_1",
+            sublayer=DepthwiseSeparable(
+                in_channels=int(32 * scale),
+                out_channels1=32,
+                out_channels2=64,
+                num_groups=32,
+                stride=1,
+                scale=scale,
+                conv_lr=conv_learning_rate,
+                conv_decay=conv_decay,
+                norm_decay=norm_decay,
+                norm_type=norm_type,
+                name="conv2_1"))
+        self.dwsl.append(dws21)
+        self._update_out_channels(
+            int(64 * scale), len(self.dwsl), feature_maps)
+        dws22 = self.add_sublayer(
+            "conv2_2",
+            sublayer=DepthwiseSeparable(
+                in_channels=int(64 * scale),
+                out_channels1=64,
+                out_channels2=128,
+                num_groups=64,
+                stride=2,
+                scale=scale,
+                conv_lr=conv_learning_rate,
+                conv_decay=conv_decay,
+                norm_decay=norm_decay,
+                norm_type=norm_type,
+                name="conv2_2"))
+        self.dwsl.append(dws22)
+        self._update_out_channels(
+            int(128 * scale), len(self.dwsl), feature_maps)
+        # 1/4
+        dws31 = self.add_sublayer(
+            "conv3_1",
+            sublayer=DepthwiseSeparable(
+                in_channels=int(128 * scale),
+                out_channels1=128,
+                out_channels2=128,
+                num_groups=128,
+                stride=1,
+                scale=scale,
+                conv_lr=conv_learning_rate,
+                conv_decay=conv_decay,
+                norm_decay=norm_decay,
+                norm_type=norm_type,
+                name="conv3_1"))
+        self.dwsl.append(dws31)
+        self._update_out_channels(
+            int(128 * scale), len(self.dwsl), feature_maps)
+        dws32 = self.add_sublayer(
+            "conv3_2",
+            sublayer=DepthwiseSeparable(
+                in_channels=int(128 * scale),
+                out_channels1=128,
+                out_channels2=256,
+                num_groups=128,
+                stride=2,
+                scale=scale,
+                conv_lr=conv_learning_rate,
+                conv_decay=conv_decay,
+                norm_decay=norm_decay,
+                norm_type=norm_type,
+                name="conv3_2"))
+        self.dwsl.append(dws32)
+        self._update_out_channels(
+            int(256 * scale), len(self.dwsl), feature_maps)
+        # 1/8
+        dws41 = self.add_sublayer(
+            "conv4_1",
+            sublayer=DepthwiseSeparable(
+                in_channels=int(256 * scale),
+                out_channels1=256,
+                out_channels2=256,
+                num_groups=256,
+                stride=1,
+                scale=scale,
+                conv_lr=conv_learning_rate,
+                conv_decay=conv_decay,
+                norm_decay=norm_decay,
+                norm_type=norm_type,
+                name="conv4_1"))
+        self.dwsl.append(dws41)
+        self._update_out_channels(
+            int(256 * scale), len(self.dwsl), feature_maps)
+        dws42 = self.add_sublayer(
+            "conv4_2",
+            sublayer=DepthwiseSeparable(
+                in_channels=int(256 * scale),
+                out_channels1=256,
+                out_channels2=512,
+                num_groups=256,
+                stride=2,
+                scale=scale,
+                conv_lr=conv_learning_rate,
+                conv_decay=conv_decay,
+                norm_decay=norm_decay,
+                norm_type=norm_type,
+                name="conv4_2"))
+        self.dwsl.append(dws42)
+        self._update_out_channels(
+            int(512 * scale), len(self.dwsl), feature_maps)
+        # 1/16
+        for i in range(5):
+            tmp = self.add_sublayer(
+                "conv5_" + str(i + 1),
+                sublayer=DepthwiseSeparable(
+                    in_channels=int(512 * scale),
+                    out_channels1=512,
+                    out_channels2=512,
+                    num_groups=512,
+                    stride=1,
+                    scale=scale,
+                    conv_lr=conv_learning_rate,
+                    conv_decay=conv_decay,
+                    norm_decay=norm_decay,
+                    norm_type=norm_type,
+                    name="conv5_" + str(i + 1)))
+            self.dwsl.append(tmp)
+            self._update_out_channels(
+                int(512 * scale), len(self.dwsl), feature_maps)
+        dws56 = self.add_sublayer(
+            "conv5_6",
+            sublayer=DepthwiseSeparable(
+                in_channels=int(512 * scale),
+                out_channels1=512,
+                out_channels2=1024,
+                num_groups=512,
+                stride=2,
+                scale=scale,
+                conv_lr=conv_learning_rate,
+                conv_decay=conv_decay,
+                norm_decay=norm_decay,
+                norm_type=norm_type,
+                name="conv5_6"))
+        self.dwsl.append(dws56)
+        self._update_out_channels(
+            int(1024 * scale), len(self.dwsl), feature_maps)
+        # 1/32
+        dws6 = self.add_sublayer(
+            "conv6",
+            sublayer=DepthwiseSeparable(
+                in_channels=int(1024 * scale),
+                out_channels1=1024,
+                out_channels2=1024,
+                num_groups=1024,
+                stride=1,
+                scale=scale,
+                conv_lr=conv_learning_rate,
+                conv_decay=conv_decay,
+                norm_decay=norm_decay,
+                norm_type=norm_type,
+                name="conv6"))
+        self.dwsl.append(dws6)
+        self._update_out_channels(
+            int(1024 * scale), len(self.dwsl), feature_maps)
+
+        if self.with_extra_blocks:
+            self.extra_blocks = []
+            for i, block_filter in enumerate(self.extra_block_filters):
+                in_c = 1024 if i == 0 else self.extra_block_filters[i - 1][1]
+                conv_extra = self.add_sublayer(
+                    "conv7_" + str(i + 1),
+                    sublayer=ExtraBlock(
+                        in_c,
+                        block_filter[0],
+                        block_filter[1],
+                        conv_lr=conv_learning_rate,
+                        conv_decay=conv_decay,
+                        norm_decay=norm_decay,
+                        norm_type=norm_type,
+                        name="conv7_" + str(i + 1)))
+                self.extra_blocks.append(conv_extra)
+                self._update_out_channels(
+                    block_filter[1],
+                    len(self.dwsl) + len(self.extra_blocks), feature_maps)
+
+    def _update_out_channels(self, channel, feature_idx, feature_maps):
+        if feature_idx in feature_maps:
+            self._out_channels.append(channel)
+
+    def forward(self, inputs):
+        outs = []
+        y = self.conv1(inputs['image'])
+        for i, block in enumerate(self.dwsl):
+            y = block(y)
+            if i + 1 in self.feature_maps:
+                outs.append(y)
+
+        if not self.with_extra_blocks:
+            return outs
+
+        y = outs[-1]
+        for i, block in enumerate(self.extra_blocks):
+            idx = i + len(self.dwsl)
+            y = block(y)
+            if idx + 1 in self.feature_maps:
+                outs.append(y)
+        return outs
+
+    @property
+    def out_shape(self):
+        return [ShapeSpec(channels=c) for c in self._out_channels]

+ 479 - 0
paddlers/models/ppdet/modeling/backbones/mobilenet_v3.py

@@ -0,0 +1,479 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserve.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import paddle
+import paddle.nn as nn
+import paddle.nn.functional as F
+from paddle import ParamAttr
+from paddle.regularizer import L2Decay
+from paddlers.models.ppdet.core.workspace import register, serializable
+from numbers import Integral
+from ..shape_spec import ShapeSpec
+
+__all__ = ['MobileNetV3']
+
+
+def make_divisible(v, divisor=8, min_value=None):
+    if min_value is None:
+        min_value = divisor
+    new_v = max(min_value, int(v + divisor / 2) // divisor * divisor)
+    if new_v < 0.9 * v:
+        new_v += divisor
+    return new_v
+
+
+class ConvBNLayer(nn.Layer):
+    def __init__(self,
+                 in_c,
+                 out_c,
+                 filter_size,
+                 stride,
+                 padding,
+                 num_groups=1,
+                 act=None,
+                 lr_mult=1.,
+                 conv_decay=0.,
+                 norm_type='bn',
+                 norm_decay=0.,
+                 freeze_norm=False,
+                 name=""):
+        super(ConvBNLayer, self).__init__()
+        self.act = act
+        self.conv = nn.Conv2D(
+            in_channels=in_c,
+            out_channels=out_c,
+            kernel_size=filter_size,
+            stride=stride,
+            padding=padding,
+            groups=num_groups,
+            weight_attr=ParamAttr(
+                learning_rate=lr_mult, regularizer=L2Decay(conv_decay)),
+            bias_attr=False)
+
+        norm_lr = 0. if freeze_norm else lr_mult
+        param_attr = ParamAttr(
+            learning_rate=norm_lr,
+            regularizer=L2Decay(norm_decay),
+            trainable=False if freeze_norm else True)
+        bias_attr = ParamAttr(
+            learning_rate=norm_lr,
+            regularizer=L2Decay(norm_decay),
+            trainable=False if freeze_norm else True)
+        global_stats = True if freeze_norm else None
+        if norm_type in ['sync_bn', 'bn']:
+            self.bn = nn.BatchNorm2D(
+                out_c,
+                weight_attr=param_attr,
+                bias_attr=bias_attr,
+                use_global_stats=global_stats)
+        norm_params = self.bn.parameters()
+        if freeze_norm:
+            for param in norm_params:
+                param.stop_gradient = True
+
+    def forward(self, x):
+        x = self.conv(x)
+        x = self.bn(x)
+        if self.act is not None:
+            if self.act == "relu":
+                x = F.relu(x)
+            elif self.act == "relu6":
+                x = F.relu6(x)
+            elif self.act == "hard_swish":
+                x = F.hardswish(x)
+            else:
+                raise NotImplementedError(
+                    "The activation function is selected incorrectly.")
+        return x
+
+
+class ResidualUnit(nn.Layer):
+    def __init__(self,
+                 in_c,
+                 mid_c,
+                 out_c,
+                 filter_size,
+                 stride,
+                 use_se,
+                 lr_mult,
+                 conv_decay=0.,
+                 norm_type='bn',
+                 norm_decay=0.,
+                 freeze_norm=False,
+                 act=None,
+                 return_list=False,
+                 name=''):
+        super(ResidualUnit, self).__init__()
+        self.if_shortcut = stride == 1 and in_c == out_c
+        self.use_se = use_se
+        self.return_list = return_list
+
+        self.expand_conv = ConvBNLayer(
+            in_c=in_c,
+            out_c=mid_c,
+            filter_size=1,
+            stride=1,
+            padding=0,
+            act=act,
+            lr_mult=lr_mult,
+            conv_decay=conv_decay,
+            norm_type=norm_type,
+            norm_decay=norm_decay,
+            freeze_norm=freeze_norm,
+            name=name + "_expand")
+        self.bottleneck_conv = ConvBNLayer(
+            in_c=mid_c,
+            out_c=mid_c,
+            filter_size=filter_size,
+            stride=stride,
+            padding=int((filter_size - 1) // 2),
+            num_groups=mid_c,
+            act=act,
+            lr_mult=lr_mult,
+            conv_decay=conv_decay,
+            norm_type=norm_type,
+            norm_decay=norm_decay,
+            freeze_norm=freeze_norm,
+            name=name + "_depthwise")
+        if self.use_se:
+            self.mid_se = SEModule(
+                mid_c, lr_mult, conv_decay, name=name + "_se")
+        self.linear_conv = ConvBNLayer(
+            in_c=mid_c,
+            out_c=out_c,
+            filter_size=1,
+            stride=1,
+            padding=0,
+            act=None,
+            lr_mult=lr_mult,
+            conv_decay=conv_decay,
+            norm_type=norm_type,
+            norm_decay=norm_decay,
+            freeze_norm=freeze_norm,
+            name=name + "_linear")
+
+    def forward(self, inputs):
+        y = self.expand_conv(inputs)
+        x = self.bottleneck_conv(y)
+        if self.use_se:
+            x = self.mid_se(x)
+        x = self.linear_conv(x)
+        if self.if_shortcut:
+            x = paddle.add(inputs, x)
+        if self.return_list:
+            return [y, x]
+        else:
+            return x
+
+
+class SEModule(nn.Layer):
+    """Squeeze-and-excitation gate that rescales its input.
+
+    The input is adaptively average-pooled to a 1x1 map, passed through two
+    1x1 convolutions (the first shrinks the channel count to
+    ``channel // reduction``, the second restores it to ``channel``) and the
+    resulting hard-sigmoid gate is multiplied with the original input.
+
+    Args:
+        channel: Channel count of the input; also the output channel count
+            of the second convolution.
+        lr_mult: Learning-rate multiplier set on the weights and biases of
+            both convolutions.
+        conv_decay: L2 regularization coefficient set on the weights and
+            biases of both convolutions.
+        reduction: Divisor applied to ``channel`` to get the hidden width.
+        name: Accepted for signature compatibility; not used in this class.
+    """
+
+    def __init__(self, channel, lr_mult, conv_decay, reduction=4, name=""):
+        super(SEModule, self).__init__()
+        self.avg_pool = nn.AdaptiveAvgPool2D(1)
+        # Hidden (squeezed) channel count; integer division truncates.
+        mid_channels = int(channel // reduction)
+        self.conv1 = nn.Conv2D(
+            in_channels=channel,
+            out_channels=mid_channels,
+            kernel_size=1,
+            stride=1,
+            padding=0,
+            weight_attr=ParamAttr(
+                learning_rate=lr_mult, regularizer=L2Decay(conv_decay)),
+            bias_attr=ParamAttr(
+                learning_rate=lr_mult, regularizer=L2Decay(conv_decay)))
+        self.conv2 = nn.Conv2D(
+            in_channels=mid_channels,
+            out_channels=channel,
+            kernel_size=1,
+            stride=1,
+            padding=0,
+            weight_attr=ParamAttr(
+                learning_rate=lr_mult, regularizer=L2Decay(conv_decay)),
+            bias_attr=ParamAttr(
+                learning_rate=lr_mult, regularizer=L2Decay(conv_decay)))
+
+    def forward(self, inputs):
+        """Return ``inputs`` multiplied by the learned gate."""
+        outputs = self.avg_pool(inputs)
+        outputs = self.conv1(outputs)
+        outputs = F.relu(outputs)
+        outputs = self.conv2(outputs)
+        # The gate uses hard-sigmoid with slope 0.2 and offset 0.5.
+        outputs = F.hardsigmoid(outputs, slope=0.2, offset=0.5)
+        return paddle.multiply(x=inputs, y=outputs)
+
+
+class ExtraBlockDW(nn.Layer):
+    """Extra feature block: 1x1 conv -> 3x3 grouped conv -> 1x1 conv.
+
+    All three layers are ``ConvBNLayer`` instances using ``relu6`` and
+    ``'SAME'`` padding. Only the 3x3 convolution applies ``stride``, and it
+    is built with ``num_groups=ch_1``.
+
+    Args:
+        in_c: Input channel count.
+        ch_1: Output channels of the first 1x1 convolution (and the group
+            count of the 3x3 convolution).
+        ch_2: Output channels of the 3x3 and the final 1x1 convolution.
+        stride: Stride of the 3x3 convolution.
+        lr_mult: Learning-rate multiplier forwarded to every layer.
+        conv_decay: Forwarded to every ``ConvBNLayer``.
+        norm_type: Forwarded to every ``ConvBNLayer``.
+        norm_decay: Forwarded to every ``ConvBNLayer``.
+        freeze_norm: Forwarded to every ``ConvBNLayer``.
+        name: Prefix for the sublayer names; ``None`` is treated as an
+            empty prefix.
+    """
+
+    def __init__(self,
+                 in_c,
+                 ch_1,
+                 ch_2,
+                 stride,
+                 lr_mult,
+                 conv_decay=0.,
+                 norm_type='bn',
+                 norm_decay=0.,
+                 freeze_norm=False,
+                 name=None):
+        super(ExtraBlockDW, self).__init__()
+        # The default ``name=None`` used to raise TypeError in the string
+        # concatenations below; fall back to an empty prefix instead.
+        prefix = "" if name is None else name
+        self.pointwise_conv = ConvBNLayer(
+            in_c=in_c,
+            out_c=ch_1,
+            filter_size=1,
+            stride=1,
+            padding='SAME',
+            act='relu6',
+            lr_mult=lr_mult,
+            conv_decay=conv_decay,
+            norm_type=norm_type,
+            norm_decay=norm_decay,
+            freeze_norm=freeze_norm,
+            name=prefix + "_extra1")
+        self.depthwise_conv = ConvBNLayer(
+            in_c=ch_1,
+            out_c=ch_2,
+            filter_size=3,
+            stride=stride,
+            padding='SAME',
+            num_groups=int(ch_1),
+            act='relu6',
+            lr_mult=lr_mult,
+            conv_decay=conv_decay,
+            norm_type=norm_type,
+            norm_decay=norm_decay,
+            freeze_norm=freeze_norm,
+            name=prefix + "_extra2_dw")
+        self.normal_conv = ConvBNLayer(
+            in_c=ch_2,
+            out_c=ch_2,
+            filter_size=1,
+            stride=1,
+            padding='SAME',
+            act='relu6',
+            lr_mult=lr_mult,
+            conv_decay=conv_decay,
+            norm_type=norm_type,
+            norm_decay=norm_decay,
+            freeze_norm=freeze_norm,
+            name=prefix + "_extra2_sep")
+
+    def forward(self, inputs):
+        """Apply the three convolutions in sequence and return the result."""
+        x = self.pointwise_conv(inputs)
+        x = self.depthwise_conv(x)
+        x = self.normal_conv(x)
+        return x
+
+
+@register
+@serializable
+class MobileNetV3(nn.Layer):
+    """MobileNetV3 backbone returning a list of intermediate feature maps.
+
+    The network is a stem convolution (``conv1``) followed by one
+    ``ResidualUnit`` per row of ``self.cfg`` and, optionally, a 1x1
+    convolution plus a chain of ``ExtraBlockDW`` layers. Layers are numbered
+    from 2: the residual unit at position ``i`` of ``block_list`` has
+    feature index ``i + 2`` and the extra layers continue that numbering.
+
+    Args:
+        scale: Multiplier applied to the stem and block channel counts
+            before they are passed through ``make_divisible``.
+        model_name: ``"large"`` or ``"small"``; selects the block table.
+        feature_maps: Feature indices whose outputs are returned. A single
+            integer is wrapped into a list.
+        with_extra_blocks: Whether to append the extra layers. When True,
+            residual units selected by ``feature_maps`` also expose the
+            output of their expand convolution.
+        extra_block_filters: ``[ch_1, ch_2]`` pair for each ``ExtraBlockDW``.
+        lr_mult_list: Learning-rate multipliers. The stem uses entry 0 and
+            block ``i`` uses entry ``min(i // 3, len(lr_mult_list) - 1)``.
+        conv_decay: Forwarded to every constructed layer.
+        multiplier: When not 1.0, rescales selected channel entries of the
+            last three rows of the block table.
+        norm_type: Forwarded to every constructed layer.
+        norm_decay: Forwarded to every constructed layer.
+        freeze_norm: Forwarded to every constructed layer.
+
+    Raises:
+        ValueError: If ``norm_type`` is ``'sync_bn'`` and ``freeze_norm`` is
+            True.
+        NotImplementedError: If ``model_name`` is neither ``"large"`` nor
+            ``"small"``.
+    """
+    __shared__ = ['norm_type']
+
+    def __init__(self,
+                 scale=1.0,
+                 model_name="large",
+                 feature_maps=[6, 12, 15],
+                 with_extra_blocks=False,
+                 extra_block_filters=[[256, 512], [128, 256], [128, 256],
+                                      [64, 128]],
+                 lr_mult_list=[1.0, 1.0, 1.0, 1.0, 1.0],
+                 conv_decay=0.0,
+                 multiplier=1.0,
+                 norm_type='bn',
+                 norm_decay=0.0,
+                 freeze_norm=False):
+        super(MobileNetV3, self).__init__()
+        if isinstance(feature_maps, Integral):
+            feature_maps = [feature_maps]
+        if norm_type == 'sync_bn' and freeze_norm:
+            raise ValueError(
+                "The norm_type should not be sync_bn when freeze_norm is True")
+        self.feature_maps = feature_maps
+        self.with_extra_blocks = with_extra_blocks
+        self.extra_block_filters = extra_block_filters
+
+        inplanes = 16
+        # Each row is unpacked below as (k, exp, c, se, nl, s) and mapped to
+        # ResidualUnit's filter_size, mid_c, out_c, use_se, act and stride.
+        if model_name == "large":
+            self.cfg = [
+                # k, exp, c,  se,     nl,  s,
+                [3, 16, 16, False, "relu", 1],
+                [3, 64, 24, False, "relu", 2],
+                [3, 72, 24, False, "relu", 1],
+                [5, 72, 40, True, "relu", 2],  # RCNN output
+                [5, 120, 40, True, "relu", 1],
+                [5, 120, 40, True, "relu", 1],  # YOLOv3 output
+                [3, 240, 80, False, "hard_swish", 2],  # RCNN output
+                [3, 200, 80, False, "hard_swish", 1],
+                [3, 184, 80, False, "hard_swish", 1],
+                [3, 184, 80, False, "hard_swish", 1],
+                [3, 480, 112, True, "hard_swish", 1],
+                [3, 672, 112, True, "hard_swish", 1],  # YOLOv3 output
+                [5, 672, 160, True, "hard_swish",
+                 2],  # SSD/SSDLite/RCNN output
+                [5, 960, 160, True, "hard_swish", 1],
+                [5, 960, 160, True, "hard_swish", 1],  # YOLOv3 output
+            ]
+        elif model_name == "small":
+            self.cfg = [
+                # k, exp, c,  se,     nl,  s,
+                [3, 16, 16, True, "relu", 2],
+                [3, 72, 24, False, "relu", 2],  # RCNN output
+                [3, 88, 24, False, "relu", 1],  # YOLOv3 output
+                [5, 96, 40, True, "hard_swish", 2],  # RCNN output
+                [5, 240, 40, True, "hard_swish", 1],
+                [5, 240, 40, True, "hard_swish", 1],
+                [5, 120, 48, True, "hard_swish", 1],
+                [5, 144, 48, True, "hard_swish", 1],  # YOLOv3 output
+                [5, 288, 96, True, "hard_swish", 2],  # SSD/SSDLite/RCNN output
+                [5, 576, 96, True, "hard_swish", 1],
+                [5, 576, 96, True, "hard_swish", 1],  # YOLOv3 output
+            ]
+        else:
+            raise NotImplementedError(
+                "mode[{}_model] is not implemented!".format(model_name))
+
+        # Rescale the output channels of the third-from-last row and both
+        # the expansion and output channels of the last two rows.
+        if multiplier != 1.0:
+            self.cfg[-3][2] = int(self.cfg[-3][2] * multiplier)
+            self.cfg[-2][1] = int(self.cfg[-2][1] * multiplier)
+            self.cfg[-2][2] = int(self.cfg[-2][2] * multiplier)
+            self.cfg[-1][1] = int(self.cfg[-1][1] * multiplier)
+            self.cfg[-1][2] = int(self.cfg[-1][2] * multiplier)
+
+        self.conv1 = ConvBNLayer(
+            in_c=3,
+            out_c=make_divisible(inplanes * scale),
+            filter_size=3,
+            stride=2,
+            padding=1,
+            num_groups=1,
+            act="hard_swish",
+            lr_mult=lr_mult_list[0],
+            conv_decay=conv_decay,
+            norm_type=norm_type,
+            norm_decay=norm_decay,
+            freeze_norm=freeze_norm,
+            name="conv1")
+
+        # Channel counts of the returned feature maps, in network order.
+        self._out_channels = []
+        self.block_list = []
+        # ``i`` counts the layers built so far; the layer being built is
+        # registered as "conv<i + 2>".
+        i = 0
+        inplanes = make_divisible(inplanes * scale)
+        for (k, exp, c, se, nl, s) in self.cfg:
+            lr_idx = min(i // 3, len(lr_mult_list) - 1)
+            lr_mult = lr_mult_list[lr_idx]
+
+            # for SSD/SSDLite, first head input is after ResidualUnit expand_conv
+            return_list = self.with_extra_blocks and i + 2 in self.feature_maps
+
+            block = self.add_sublayer(
+                "conv" + str(i + 2),
+                sublayer=ResidualUnit(
+                    in_c=inplanes,
+                    mid_c=make_divisible(scale * exp),
+                    out_c=make_divisible(scale * c),
+                    filter_size=k,
+                    stride=s,
+                    use_se=se,
+                    act=nl,
+                    lr_mult=lr_mult,
+                    conv_decay=conv_decay,
+                    norm_type=norm_type,
+                    norm_decay=norm_decay,
+                    freeze_norm=freeze_norm,
+                    return_list=return_list,
+                    name="conv" + str(i + 2)))
+            self.block_list.append(block)
+            inplanes = make_divisible(scale * c)
+            i += 1
+            # ``i`` was just incremented, so ``i + 1`` is this block's
+            # feature index. When the block returns a list, the exposed
+            # feature is the expand-conv output, hence the expanded width.
+            self._update_out_channels(
+                make_divisible(scale * exp)
+                if return_list else inplanes, i + 1, feature_maps)
+
+        if self.with_extra_blocks:
+            self.extra_block_list = []
+            extra_out_c = make_divisible(scale * self.cfg[-1][1])
+            # All extra layers share one learning-rate multiplier.
+            lr_idx = min(i // 3, len(lr_mult_list) - 1)
+            lr_mult = lr_mult_list[lr_idx]
+
+            conv_extra = self.add_sublayer(
+                "conv" + str(i + 2),
+                sublayer=ConvBNLayer(
+                    in_c=inplanes,
+                    out_c=extra_out_c,
+                    filter_size=1,
+                    stride=1,
+                    padding=0,
+                    num_groups=1,
+                    act="hard_swish",
+                    lr_mult=lr_mult,
+                    conv_decay=conv_decay,
+                    norm_type=norm_type,
+                    norm_decay=norm_decay,
+                    freeze_norm=freeze_norm,
+                    name="conv" + str(i + 2)))
+            self.extra_block_list.append(conv_extra)
+            i += 1
+            self._update_out_channels(extra_out_c, i + 1, feature_maps)
+
+            for j, block_filter in enumerate(self.extra_block_filters):
+                # Each extra block consumes the previous block's ch_2.
+                in_c = extra_out_c if j == 0 else self.extra_block_filters[
+                    j - 1][1]
+                conv_extra = self.add_sublayer(
+                    "conv" + str(i + 2),
+                    sublayer=ExtraBlockDW(
+                        in_c,
+                        block_filter[0],
+                        block_filter[1],
+                        stride=2,
+                        lr_mult=lr_mult,
+                        conv_decay=conv_decay,
+                        norm_type=norm_type,
+                        norm_decay=norm_decay,
+                        freeze_norm=freeze_norm,
+                        name='conv' + str(i + 2)))
+                self.extra_block_list.append(conv_extra)
+                i += 1
+                self._update_out_channels(block_filter[1], i + 1, feature_maps)
+
+    def _update_out_channels(self, channel, feature_idx, feature_maps):
+        """Record ``channel`` if ``feature_idx`` is a requested feature."""
+        if feature_idx in feature_maps:
+            self._out_channels.append(channel)
+
+    def forward(self, inputs):
+        """Run the backbone on ``inputs['image']``.
+
+        Returns:
+            A list with one entry per layer whose feature index is in
+            ``self.feature_maps``, in network order.
+        """
+        x = self.conv1(inputs['image'])
+        outs = []
+        for idx, block in enumerate(self.block_list):
+            x = block(x)
+            if idx + 2 in self.feature_maps:
+                # A list result is [expand_conv output, block output]: the
+                # first is exposed as the feature, the second is fed onward.
+                if isinstance(x, list):
+                    outs.append(x[0])
+                    x = x[1]
+                else:
+                    outs.append(x)
+
+        if not self.with_extra_blocks:
+            return outs
+
+        for i, block in enumerate(self.extra_block_list):
+            # Extra layers continue the numbering after the residual units.
+            idx = i + len(self.block_list)
+            x = block(x)
+            if idx + 2 in self.feature_maps:
+                outs.append(x)
+        return outs
+
+    @property
+    def out_shape(self):
+        """One ``ShapeSpec`` per recorded output channel count."""
+        return [ShapeSpec(channels=c) for c in self._out_channels]

+ 69 - 0
paddlers/models/ppdet/modeling/backbones/name_adapter.py

@@ -0,0 +1,69 @@
+class NameAdapter(object):
+    """Map backbone layer names onto the names used by pretrained weights.
+
+    The wrapped model's ``_model_type`` attribute selects the naming
+    scheme; ``'SEResNeXt'`` uses its own convention for most names.
+    """
+
+    def __init__(self, model):
+        super(NameAdapter, self).__init__()
+        self.model = model
+
+    @property
+    def model_type(self):
+        """The model's ``_model_type`` attribute, or ``''`` if absent."""
+        return getattr(self.model, '_model_type', '')
+
+    @property
+    def variant(self):
+        """The model's ``variant`` attribute, or ``''`` if absent."""
+        return getattr(self.model, 'variant', '')
+
+    def fix_conv_norm_name(self, name):
+        """Return the norm-layer name that belongs to conv layer ``name``."""
+        default_name = "bn_" + name if name == "conv1" else "bn" + name[3:]
+        # the naming rule is same as pretrained weight
+        if self.model_type == 'SEResNeXt':
+            return name + "_bn"
+        return default_name
+
+    def fix_shortcut_name(self, name):
+        """Return the shortcut layer name for ``name``."""
+        if self.model_type != 'SEResNeXt':
+            return name
+        return 'conv' + name + '_prj'
+
+    def fix_bottleneck_name(self, name):
+        """Return (conv1, conv2, conv3, shortcut) names for a bottleneck."""
+        if self.model_type != 'SEResNeXt':
+            return (name + "_branch2a", name + "_branch2b",
+                    name + "_branch2c", name + "_branch1")
+        first = 'conv' + name + '_x1'
+        second = 'conv' + name + '_x2'
+        third = 'conv' + name + '_x3'
+        return first, second, third, name
+
+    def fix_basicblock_name(self, name):
+        """Return (conv1, conv2, shortcut) names for a basic block."""
+        if self.model_type != 'SEResNeXt':
+            return name + "_branch2a", name + "_branch2b", name + "_branch1"
+        first = 'conv' + name + '_x1'
+        second = 'conv' + name + '_x2'
+        return first, second, name
+
+    def fix_layer_warp_name(self, stage_num, count, i):
+        """Return the name of block ``i`` in stage ``stage_num``.
+
+        Stage 4 with more than 10 blocks is named "a", "b1", "b2", ...;
+        every other stage uses consecutive letters starting at "a".
+        """
+        stage_prefix = 'res' + str(stage_num)
+        long_stage4 = count > 10 and stage_num == 4
+        if not long_stage4:
+            layer_name = stage_prefix + chr(ord("a") + i)
+        elif i == 0:
+            layer_name = stage_prefix + "a"
+        else:
+            layer_name = stage_prefix + "b" + str(i)
+        if self.model_type == 'SEResNeXt':
+            layer_name = str(stage_num + 2) + '_' + str(i + 1)
+        return layer_name
+
+    def fix_c1_stage_name(self):
+        """Return the name of the stem (C1) stage."""
+        if self.model_type == 'ResNeXt':
+            return "res_conv1"
+        return "conv1"

+ 358 - 0
paddlers/models/ppdet/modeling/backbones/res2net.py

@@ -0,0 +1,358 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from numbers import Integral
+
+import paddle
+import paddle.nn as nn
+import paddle.nn.functional as F
+from paddlers.models.ppdet.core.workspace import register, serializable
+from ..shape_spec import ShapeSpec
+from .resnet import ConvNormLayer
+
+__all__ = ['Res2Net', 'Res2NetC5']
+
+# Number of bottleneck blocks in each of the four stages (res2..res5),
+# keyed by network depth.
+Res2Net_cfg = {
+    50: [3, 4, 6, 3],
+    101: [3, 4, 23, 3],
+    152: [3, 8, 36, 3],
+    200: [3, 12, 48, 3]
+}
+
+
+class BottleNeck(nn.Layer):
+    """Res2Net bottleneck block.
+
+    ``branch2a`` (1x1 conv) produces ``width * scales`` channels that are
+    split into ``scales`` groups along axis 1. The first ``scales - 1``
+    groups each go through their own 3x3 conv in ``branch2b``; the last
+    group is passed through unchanged or average-pooled. The groups are
+    concatenated, projected by ``branch2c`` (1x1 conv), added to the
+    shortcut and passed through ReLU.
+
+    Args:
+        ch_in: Input channel count.
+        ch_out: Output channel count.
+        stride: Stride of the block.
+        shortcut: If True the input itself is the shortcut; otherwise a
+            projection layer (``branch1``) is built and used.
+        width: Channel count of each split group.
+        scales: Number of groups the ``branch2a`` output is split into.
+        variant: ``'a'`` puts the stride on ``branch2a``, any other value
+            puts it on the ``branch2b`` convs; ``'d'`` with ``stride == 2``
+            builds the projection shortcut as average-pool + 1x1 conv.
+        groups: Group count of the ``branch2b`` convolutions.
+        lr: Learning-rate value forwarded to every ``ConvNormLayer``.
+        norm_type: Forwarded to every ``ConvNormLayer``.
+        norm_decay: Forwarded to every ``ConvNormLayer``.
+        freeze_norm: Forwarded to every ``ConvNormLayer``.
+        dcn_v2: Forwarded to the ``branch2b`` convolutions only.
+    """
+
+    def __init__(self,
+                 ch_in,
+                 ch_out,
+                 stride,
+                 shortcut,
+                 width,
+                 scales=4,
+                 variant='b',
+                 groups=1,
+                 lr=1.0,
+                 norm_type='bn',
+                 norm_decay=0.,
+                 freeze_norm=True,
+                 dcn_v2=False):
+        super(BottleNeck, self).__init__()
+
+        self.shortcut = shortcut
+        self.scales = scales
+        self.stride = stride
+        # A projection shortcut is only needed when the identity cannot be
+        # used directly.
+        if not shortcut:
+            if variant == 'd' and stride == 2:
+                # Downsample with average pooling, then project with a
+                # stride-1 1x1 conv.
+                self.branch1 = nn.Sequential()
+                self.branch1.add_sublayer(
+                    'pool',
+                    nn.AvgPool2D(
+                        kernel_size=2, stride=2, padding=0, ceil_mode=True))
+                self.branch1.add_sublayer(
+                    'conv',
+                    ConvNormLayer(
+                        ch_in=ch_in,
+                        ch_out=ch_out,
+                        filter_size=1,
+                        stride=1,
+                        norm_type=norm_type,
+                        norm_decay=norm_decay,
+                        freeze_norm=freeze_norm,
+                        lr=lr))
+            else:
+                self.branch1 = ConvNormLayer(
+                    ch_in=ch_in,
+                    ch_out=ch_out,
+                    filter_size=1,
+                    stride=stride,
+                    norm_type=norm_type,
+                    norm_decay=norm_decay,
+                    freeze_norm=freeze_norm,
+                    lr=lr)
+
+        self.branch2a = ConvNormLayer(
+            ch_in=ch_in,
+            ch_out=width * scales,
+            filter_size=1,
+            stride=stride if variant == 'a' else 1,
+            groups=1,
+            act='relu',
+            norm_type=norm_type,
+            norm_decay=norm_decay,
+            freeze_norm=freeze_norm,
+            lr=lr)
+
+        # One 3x3 conv per split group except the last one.
+        self.branch2b = nn.LayerList([
+            ConvNormLayer(
+                ch_in=width,
+                ch_out=width,
+                filter_size=3,
+                stride=1 if variant == 'a' else stride,
+                groups=groups,
+                act='relu',
+                norm_type=norm_type,
+                norm_decay=norm_decay,
+                freeze_norm=freeze_norm,
+                lr=lr,
+                dcn_v2=dcn_v2) for _ in range(self.scales - 1)
+        ])
+
+        self.branch2c = ConvNormLayer(
+            ch_in=width * scales,
+            ch_out=ch_out,
+            filter_size=1,
+            stride=1,
+            groups=1,
+            norm_type=norm_type,
+            norm_decay=norm_decay,
+            freeze_norm=freeze_norm,
+            lr=lr)
+
+    def forward(self, inputs):
+        """Apply the block to ``inputs`` and return the activated sum."""
+
+        out = self.branch2a(inputs)
+        feature_split = paddle.split(out, self.scales, 1)
+        out_split = []
+        for i in range(self.scales - 1):
+            # The first group, and every group of a stride-2 block, is
+            # convolved on its own; otherwise the previous group's output
+            # is added to the group before its conv.
+            if i == 0 or self.stride == 2:
+                out_split.append(self.branch2b[i](feature_split[i]))
+            else:
+                out_split.append(self.branch2b[i](paddle.add(feature_split[i],
+                                                             out_split[-1])))
+        # The last group skips the 3x3 convs; when the block is strided it
+        # is average-pooled with the block's stride instead.
+        if self.stride == 1:
+            out_split.append(feature_split[-1])
+        else:
+            out_split.append(
+                F.avg_pool2d(feature_split[-1], 3, self.stride, 1))
+        out = self.branch2c(paddle.concat(out_split, 1))
+
+        if self.shortcut:
+            short = inputs
+        else:
+            short = self.branch1(inputs)
+
+        out = paddle.add(out, short)
+        out = F.relu(out)
+
+        return out
+
+
+class Blocks(nn.Layer):
+    """One Res2Net stage: ``count`` ``BottleNeck`` blocks run in sequence.
+
+    The first block takes ``ch_in`` channels and uses a projection shortcut;
+    it has stride 2 unless ``stage_num`` is 2. Every later block takes
+    ``ch_out`` channels, has stride 1 and uses an identity shortcut. The
+    per-group width passed to each block is
+    ``width * 2 ** (stage_num - 2)``.
+
+    Args:
+        ch_in: Input channel count of the stage.
+        ch_out: Output channel count of every block.
+        count: Number of blocks in the stage.
+        stage_num: Stage number; controls the first block's stride and the
+            width scaling.
+        width: Base per-group width.
+        scales, variant, groups, lr, norm_type, norm_decay, freeze_norm,
+        dcn_v2: Forwarded unchanged to every ``BottleNeck``.
+    """
+
+    def __init__(self,
+                 ch_in,
+                 ch_out,
+                 count,
+                 stage_num,
+                 width,
+                 scales=4,
+                 variant='b',
+                 groups=1,
+                 lr=1.0,
+                 norm_type='bn',
+                 norm_decay=0.,
+                 freeze_norm=True,
+                 dcn_v2=False):
+        super(Blocks, self).__init__()
+
+        self.blocks = nn.Sequential()
+        for i in range(count):
+            # Sublayers are registered under their index as a string.
+            self.blocks.add_sublayer(
+                str(i),
+                BottleNeck(
+                    ch_in=ch_in if i == 0 else ch_out,
+                    ch_out=ch_out,
+                    stride=2 if i == 0 and stage_num != 2 else 1,
+                    shortcut=False if i == 0 else True,
+                    width=width * (2**(stage_num - 2)),
+                    scales=scales,
+                    variant=variant,
+                    groups=groups,
+                    lr=lr,
+                    norm_type=norm_type,
+                    norm_decay=norm_decay,
+                    freeze_norm=freeze_norm,
+                    dcn_v2=dcn_v2))
+
+    def forward(self, inputs):
+        """Run ``inputs`` through all blocks of the stage."""
+        return self.blocks(inputs)
+
+
+@register
+@serializable
+class Res2Net(nn.Layer):
+    """
+    Res2Net, see https://arxiv.org/abs/1904.01169
+    Args:
+        depth (int): Res2Net depth, should be 50, 101, 152, 200.
+        width (int): Res2Net width
+        scales (int): Res2Net scale
+        variant (str): Res2Net variant, supports 'a', 'b', 'c', 'd' currently
+        lr_mult_list (list): learning rate ratio of different resnet stages(2,3,4,5),
+                             a lower learning rate ratio is needed for a pretrained
+                             model obtained by distillation (default [1.0, 1.0, 1.0, 1.0]).
+        groups (int): The groups number of the Conv Layer.
+        norm_type (str): normalization type, 'bn' or 'sync_bn'
+        norm_decay (float): weight decay for normalization layer weights
+        freeze_norm (bool): freeze normalization layers
+        freeze_at (int): index of the stage whose output has its gradient stopped
+        return_idx (list): index of stages whose feature maps are returned,
+                           index 0 stands for res2
+        dcn_v2_stages (list): index of stages that use deformable conv v2
+        num_stages (int): number of stages created
+
+    """
+    __shared__ = ['norm_type']
+
+    def __init__(self,
+                 depth=50,
+                 width=26,
+                 scales=4,
+                 variant='b',
+                 lr_mult_list=[1.0, 1.0, 1.0, 1.0],
+                 groups=1,
+                 norm_type='bn',
+                 norm_decay=0.,
+                 freeze_norm=True,
+                 freeze_at=0,
+                 return_idx=[0, 1, 2, 3],
+                 dcn_v2_stages=[-1],
+                 num_stages=4):
+        super(Res2Net, self).__init__()
+
+        self._model_type = 'Res2Net' if groups == 1 else 'Res2NeXt'
+
+        # The message previously left its "{}" placeholder unformatted.
+        assert depth in [50, 101, 152, 200], \
+            "depth {} not in [50, 101, 152, 200]".format(depth)
+        assert variant in ['a', 'b', 'c', 'd'], "invalid Res2Net variant"
+        assert num_stages >= 1 and num_stages <= 4
+
+        self.depth = depth
+        self.variant = variant
+        self.norm_type = norm_type
+        self.norm_decay = norm_decay
+        self.freeze_norm = freeze_norm
+        self.freeze_at = freeze_at
+        # A single integer is accepted and wrapped into a list.
+        if isinstance(return_idx, Integral):
+            return_idx = [return_idx]
+        assert max(return_idx) < num_stages, \
+            'the maximum return index must smaller than num_stages, ' \
+            'but received maximum return index is {} and num_stages ' \
+            'is {}'.format(max(return_idx), num_stages)
+        self.return_idx = return_idx
+        self.num_stages = num_stages
+        assert len(lr_mult_list) == 4, \
+            "lr_mult_list length must be 4 but got {}".format(len(lr_mult_list))
+        if isinstance(dcn_v2_stages, Integral):
+            dcn_v2_stages = [dcn_v2_stages]
+        assert max(dcn_v2_stages) < num_stages
+        self.dcn_v2_stages = dcn_v2_stages
+
+        block_nums = Res2Net_cfg[depth]
+
+        # C1 stage
+        # Each entry is (in channels, out channels, kernel size, stride,
+        # sublayer name); variants 'c' and 'd' use three 3x3 convs instead
+        # of a single 7x7 conv.
+        if self.variant in ['c', 'd']:
+            conv_def = [
+                [3, 32, 3, 2, "conv1_1"],
+                [32, 32, 3, 1, "conv1_2"],
+                [32, 64, 3, 1, "conv1_3"],
+            ]
+        else:
+            conv_def = [[3, 64, 7, 2, "conv1"]]
+        self.res1 = nn.Sequential()
+        for (c_in, c_out, k, s, _name) in conv_def:
+            self.res1.add_sublayer(
+                _name,
+                ConvNormLayer(
+                    ch_in=c_in,
+                    ch_out=c_out,
+                    filter_size=k,
+                    stride=s,
+                    groups=1,
+                    act='relu',
+                    norm_type=norm_type,
+                    norm_decay=norm_decay,
+                    freeze_norm=freeze_norm,
+                    lr=1.0))
+
+        # Per-stage input channels, output channels and output strides,
+        # indexed by stage position (0 stands for res2).
+        self._in_channels = [64, 256, 512, 1024]
+        self._out_channels = [256, 512, 1024, 2048]
+        self._out_strides = [4, 8, 16, 32]
+
+        # C2-C5 stages
+        # NOTE(review): ``variant`` is not forwarded to Blocks, so every
+        # stage is built with Blocks' default variant ('b') and
+        # ``self.variant`` only affects the C1 stem above. Left unchanged
+        # because forwarding it would alter the shortcut sublayer structure
+        # that existing pretrained weights may rely on -- confirm intent.
+        self.res_layers = []
+        for i in range(num_stages):
+            lr_mult = lr_mult_list[i]
+            stage_num = i + 2
+            self.res_layers.append(
+                self.add_sublayer(
+                    "res{}".format(stage_num),
+                    Blocks(
+                        self._in_channels[i],
+                        self._out_channels[i],
+                        count=block_nums[i],
+                        stage_num=stage_num,
+                        width=width,
+                        scales=scales,
+                        groups=groups,
+                        lr=lr_mult,
+                        norm_type=norm_type,
+                        norm_decay=norm_decay,
+                        freeze_norm=freeze_norm,
+                        dcn_v2=(i in self.dcn_v2_stages))))
+
+    @property
+    def out_shape(self):
+        """One ``ShapeSpec`` (channels, stride) per entry of ``return_idx``."""
+        return [
+            ShapeSpec(
+                channels=self._out_channels[i], stride=self._out_strides[i])
+            for i in self.return_idx
+        ]
+
+    def forward(self, inputs):
+        """Run the backbone on ``inputs['image']``.
+
+        Returns:
+            A list with the outputs of the stages whose index is in
+            ``self.return_idx``, in stage order.
+        """
+        x = inputs['image']
+        res1 = self.res1(x)
+        x = F.max_pool2d(res1, kernel_size=3, stride=2, padding=1)
+        outs = []
+        for idx, stage in enumerate(self.res_layers):
+            x = stage(x)
+            # Stop gradients at the output of the ``freeze_at`` stage.
+            if idx == self.freeze_at:
+                x.stop_gradient = True
+            if idx in self.return_idx:
+                outs.append(x)
+        return outs
+
+
+@register
+class Res2NetC5(nn.Layer):
+    """Stand-alone stage-5 Res2Net head (1024 -> 2048 channels, 3 blocks).
+
+    Args:
+        depth: Accepted but not used by this class.
+        width: Base per-group width forwarded to ``Blocks``.
+        scales: Forwarded to ``Blocks``.
+        variant: Forwarded to ``Blocks``.
+    """
+
+    def __init__(self, depth=50, width=26, scales=4, variant='b'):
+        super(Res2NetC5, self).__init__()
+        feat_in, feat_out = [1024, 2048]
+        self.res5 = Blocks(
+            feat_in,
+            feat_out,
+            count=3,
+            stage_num=5,
+            width=width,
+            scales=scales,
+            variant=variant)
+        self.feat_out = feat_out
+
+    @property
+    def out_shape(self):
+        """A single ``ShapeSpec`` with ``feat_out`` channels and stride 32."""
+        return [ShapeSpec(
+            channels=self.feat_out,
+            stride=32, )]
+
+    def forward(self, roi_feat, stage=0):
+        """Apply the stage to ``roi_feat``; ``stage`` is not used."""
+        y = self.res5(roi_feat)
+        return y

+ 609 - 0
paddlers/models/ppdet/modeling/backbones/resnet.py

@@ -0,0 +1,609 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import math
+from numbers import Integral
+
+import paddle
+import paddle.nn as nn
+import paddle.nn.functional as F
+from paddlers.models.ppdet.core.workspace import register, serializable
+from paddle.regularizer import L2Decay
+from paddle.nn.initializer import Uniform
+from paddle import ParamAttr
+from paddle.nn.initializer import Constant
+from paddle.vision.ops import DeformConv2D
+from .name_adapter import NameAdapter
+from ..shape_spec import ShapeSpec
+
+__all__ = ['ResNet', 'Res5Head', 'Blocks', 'BasicBlock', 'BottleNeck']
+
+# Per-depth lists of four integers -- presumably the number of blocks in
+# each ResNet stage (the code that reads this table is outside this
+# excerpt; confirm against its consumer).
+ResNet_cfg = {
+    18: [2, 2, 2, 2],
+    34: [3, 4, 6, 3],
+    50: [3, 4, 6, 3],
+    101: [3, 4, 23, 3],
+    152: [3, 8, 36, 3],
+}
+
+
+class ConvNormLayer(nn.Layer):
+    """Convolution followed by batch normalization and an optional activation.
+
+    Args:
+        ch_in: Input channel count.
+        ch_out: Output channel count.
+        filter_size: Kernel size; padding is ``(filter_size - 1) // 2``.
+        stride: Convolution stride.
+        groups: Convolution group count.
+        act: Name of a function in ``paddle.nn.functional`` applied after
+            the norm, or a falsy value for no activation.
+        norm_type: ``'bn'`` or ``'sync_bn'``; both build ``nn.BatchNorm2D``
+            here.
+        norm_decay: L2 regularization coefficient of the norm parameters.
+        freeze_norm: If True, the norm parameters get learning rate 0, are
+            marked non-trainable with gradients stopped, and the norm uses
+            global statistics.
+        lr: Learning rate set on the convolution weight (and on the norm
+            parameters when they are not frozen).
+        dcn_v2: If True, use ``DeformConv2D`` with offsets and a mask
+            predicted by an extra convolution.
+    """
+
+    def __init__(self,
+                 ch_in,
+                 ch_out,
+                 filter_size,
+                 stride,
+                 groups=1,
+                 act=None,
+                 norm_type='bn',
+                 norm_decay=0.,
+                 freeze_norm=True,
+                 lr=1.0,
+                 dcn_v2=False):
+        super(ConvNormLayer, self).__init__()
+        assert norm_type in ['bn', 'sync_bn']
+        self.norm_type = norm_type
+        self.act = act
+        self.dcn_v2 = dcn_v2
+
+        if not self.dcn_v2:
+            self.conv = nn.Conv2D(
+                in_channels=ch_in,
+                out_channels=ch_out,
+                kernel_size=filter_size,
+                stride=stride,
+                padding=(filter_size - 1) // 2,
+                groups=groups,
+                weight_attr=ParamAttr(learning_rate=lr),
+                bias_attr=False)
+        else:
+            # conv_offset emits 3 * k^2 channels: the first 2 * k^2 are
+            # used as offsets and the remaining k^2 as the mask (see the
+            # split in forward).
+            self.offset_channel = 2 * filter_size**2
+            self.mask_channel = filter_size**2
+
+            # Weights and bias of the offset/mask predictor start at zero.
+            self.conv_offset = nn.Conv2D(
+                in_channels=ch_in,
+                out_channels=3 * filter_size**2,
+                kernel_size=filter_size,
+                stride=stride,
+                padding=(filter_size - 1) // 2,
+                weight_attr=ParamAttr(initializer=Constant(0.)),
+                bias_attr=ParamAttr(initializer=Constant(0.)))
+            self.conv = DeformConv2D(
+                in_channels=ch_in,
+                out_channels=ch_out,
+                kernel_size=filter_size,
+                stride=stride,
+                padding=(filter_size - 1) // 2,
+                dilation=1,
+                groups=groups,
+                weight_attr=ParamAttr(learning_rate=lr),
+                bias_attr=False)
+
+        # Frozen norm layers get a zero learning rate and are not trainable.
+        norm_lr = 0. if freeze_norm else lr
+        param_attr = ParamAttr(
+            learning_rate=norm_lr,
+            regularizer=L2Decay(norm_decay),
+            trainable=False if freeze_norm else True)
+        bias_attr = ParamAttr(
+            learning_rate=norm_lr,
+            regularizer=L2Decay(norm_decay),
+            trainable=False if freeze_norm else True)
+
+        global_stats = True if freeze_norm else None
+        if norm_type in ['sync_bn', 'bn']:
+            self.norm = nn.BatchNorm2D(
+                ch_out,
+                weight_attr=param_attr,
+                bias_attr=bias_attr,
+                use_global_stats=global_stats)
+        norm_params = self.norm.parameters()
+
+        if freeze_norm:
+            for param in norm_params:
+                param.stop_gradient = True
+
+    def forward(self, inputs):
+        """Apply convolution, normalization and the optional activation."""
+        if not self.dcn_v2:
+            out = self.conv(inputs)
+        else:
+            offset_mask = self.conv_offset(inputs)
+            offset, mask = paddle.split(
+                offset_mask,
+                num_or_sections=[self.offset_channel, self.mask_channel],
+                axis=1)
+            mask = F.sigmoid(mask)
+            out = self.conv(inputs, offset, mask=mask)
+
+        if self.norm_type in ['bn', 'sync_bn']:
+            out = self.norm(out)
+        if self.act:
+            # ``act`` is looked up by name in paddle.nn.functional.
+            out = getattr(F, self.act)(out)
+        return out
+
+
+class SELayer(nn.Layer):
+    """Squeeze-and-excitation layer built from two linear layers.
+
+    The input is adaptively average-pooled to 1x1, reduced to
+    ``ch // reduction_ratio`` features, expanded back to ``ch`` and passed
+    through a sigmoid; the result multiplies the input.
+
+    Args:
+        ch: Channel count of the input.
+        reduction_ratio: Divisor applied to ``ch`` for the hidden width.
+    """
+
+    def __init__(self, ch, reduction_ratio=16):
+        super(SELayer, self).__init__()
+        self.pool = nn.AdaptiveAvgPool2D(1)
+        # Each linear layer's weights are drawn uniformly from
+        # [-1/sqrt(fan_in), 1/sqrt(fan_in)].
+        stdv = 1.0 / math.sqrt(ch)
+        c_ = ch // reduction_ratio
+        self.squeeze = nn.Linear(
+            ch,
+            c_,
+            weight_attr=paddle.ParamAttr(initializer=Uniform(-stdv, stdv)),
+            bias_attr=True)
+
+        stdv = 1.0 / math.sqrt(c_)
+        self.extract = nn.Linear(
+            c_,
+            ch,
+            weight_attr=paddle.ParamAttr(initializer=Uniform(-stdv, stdv)),
+            bias_attr=True)
+
+    def forward(self, inputs):
+        """Return ``inputs`` multiplied by the learned sigmoid gate."""
+        out = self.pool(inputs)
+        # Drop axes 2 and 3 so the linear layers see a 2-D tensor.
+        out = paddle.squeeze(out, axis=[2, 3])
+        out = self.squeeze(out)
+        out = F.relu(out)
+        out = self.extract(out)
+        out = F.sigmoid(out)
+        # Restore axes 2 and 3 before multiplying with the input.
+        out = paddle.unsqueeze(out, axis=[2, 3])
+        scale = out * inputs
+        return scale
+
+
+class BasicBlock(nn.Layer):
+    """Two-convolution residual block.
+
+    ``branch2a`` (3x3 conv with ReLU, carrying the stride) is followed by
+    ``branch2b`` (3x3 conv without activation), an optional ``SELayer``,
+    the shortcut addition and a final ReLU.
+
+    Args:
+        ch_in: Input channel count.
+        ch_out: Output channel count.
+        stride: Stride of ``branch2a`` (and of the projection shortcut).
+        shortcut: If True the input itself is the shortcut; otherwise a
+            projection layer (``short``) is built and used.
+        variant: ``'d'`` with ``stride == 2`` builds the projection
+            shortcut as average-pool + 1x1 conv.
+        groups: Must be 1.
+        base_width: Must be 64.
+        lr: Learning-rate value forwarded to every ``ConvNormLayer``.
+        norm_type: Forwarded to every ``ConvNormLayer``.
+        norm_decay: Forwarded to every ``ConvNormLayer``.
+        freeze_norm: Forwarded to every ``ConvNormLayer``.
+        dcn_v2: Forwarded to ``branch2b`` only.
+        std_senet: If True, an ``SELayer`` is applied after ``branch2b``.
+    """
+
+    # Class-level channel expansion factor; not read inside this class.
+    expansion = 1
+
+    def __init__(self,
+                 ch_in,
+                 ch_out,
+                 stride,
+                 shortcut,
+                 variant='b',
+                 groups=1,
+                 base_width=64,
+                 lr=1.0,
+                 norm_type='bn',
+                 norm_decay=0.,
+                 freeze_norm=True,
+                 dcn_v2=False,
+                 std_senet=False):
+        super(BasicBlock, self).__init__()
+        assert groups == 1 and base_width == 64, 'BasicBlock only supports groups=1 and base_width=64'
+
+        self.shortcut = shortcut
+        if not shortcut:
+            if variant == 'd' and stride == 2:
+                # Downsample with average pooling, then project with a
+                # stride-1 1x1 conv.
+                self.short = nn.Sequential()
+                self.short.add_sublayer(
+                    'pool',
+                    nn.AvgPool2D(
+                        kernel_size=2, stride=2, padding=0, ceil_mode=True))
+                self.short.add_sublayer(
+                    'conv',
+                    ConvNormLayer(
+                        ch_in=ch_in,
+                        ch_out=ch_out,
+                        filter_size=1,
+                        stride=1,
+                        norm_type=norm_type,
+                        norm_decay=norm_decay,
+                        freeze_norm=freeze_norm,
+                        lr=lr))
+            else:
+                self.short = ConvNormLayer(
+                    ch_in=ch_in,
+                    ch_out=ch_out,
+                    filter_size=1,
+                    stride=stride,
+                    norm_type=norm_type,
+                    norm_decay=norm_decay,
+                    freeze_norm=freeze_norm,
+                    lr=lr)
+
+        self.branch2a = ConvNormLayer(
+            ch_in=ch_in,
+            ch_out=ch_out,
+            filter_size=3,
+            stride=stride,
+            act='relu',
+            norm_type=norm_type,
+            norm_decay=norm_decay,
+            freeze_norm=freeze_norm,
+            lr=lr)
+
+        self.branch2b = ConvNormLayer(
+            ch_in=ch_out,
+            ch_out=ch_out,
+            filter_size=3,
+            stride=1,
+            act=None,
+            norm_type=norm_type,
+            norm_decay=norm_decay,
+            freeze_norm=freeze_norm,
+            lr=lr,
+            dcn_v2=dcn_v2)
+
+        self.std_senet = std_senet
+        if self.std_senet:
+            self.se = SELayer(ch_out)
+
+    def forward(self, inputs):
+        """Apply the block to ``inputs`` and return the activated sum."""
+        out = self.branch2a(inputs)
+        out = self.branch2b(out)
+        if self.std_senet:
+            out = self.se(out)
+
+        if self.shortcut:
+            short = inputs
+        else:
+            short = self.short(inputs)
+
+        out = paddle.add(x=out, y=short)
+        out = F.relu(out)
+
+        return out
+
+
+class BottleNeck(nn.Layer):
+
+    expansion = 4
+
+    def __init__(self,
+                 ch_in,
+                 ch_out,
+                 stride,
+                 shortcut,
+                 variant='b',
+                 groups=1,
+                 base_width=4,
+                 lr=1.0,
+                 norm_type='bn',
+                 norm_decay=0.,
+                 freeze_norm=True,
+                 dcn_v2=False,
+                 std_senet=False):
+        super(BottleNeck, self).__init__()
+        if variant == 'a':
+            stride1, stride2 = stride, 1
+        else:
+            stride1, stride2 = 1, stride
+
+        # ResNeXt
+        width = int(ch_out * (base_width / 64.)) * groups
+
+        self.shortcut = shortcut
+        if not shortcut:
+            if variant == 'd' and stride == 2:
+                self.short = nn.Sequential()
+                self.short.add_sublayer(
+                    'pool',
+                    nn.AvgPool2D(
+                        kernel_size=2, stride=2, padding=0, ceil_mode=True))
+                self.short.add_sublayer(
+                    'conv',
+                    ConvNormLayer(
+                        ch_in=ch_in,
+                        ch_out=ch_out * self.expansion,
+                        filter_size=1,
+                        stride=1,
+                        norm_type=norm_type,
+                        norm_decay=norm_decay,
+                        freeze_norm=freeze_norm,
+                        lr=lr))
+            else:
+                self.short = ConvNormLayer(
+                    ch_in=ch_in,
+                    ch_out=ch_out * self.expansion,
+                    filter_size=1,
+                    stride=stride,
+                    norm_type=norm_type,
+                    norm_decay=norm_decay,
+                    freeze_norm=freeze_norm,
+                    lr=lr)
+
+        self.branch2a = ConvNormLayer(
+            ch_in=ch_in,
+            ch_out=width,
+            filter_size=1,
+            stride=stride1,
+            groups=1,
+            act='relu',
+            norm_type=norm_type,
+            norm_decay=norm_decay,
+            freeze_norm=freeze_norm,
+            lr=lr)
+
+        self.branch2b = ConvNormLayer(
+            ch_in=width,
+            ch_out=width,
+            filter_size=3,
+            stride=stride2,
+            groups=groups,
+            act='relu',
+            norm_type=norm_type,
+            norm_decay=norm_decay,
+            freeze_norm=freeze_norm,
+            lr=lr,
+            dcn_v2=dcn_v2)
+
+        self.branch2c = ConvNormLayer(
+            ch_in=width,
+            ch_out=ch_out * self.expansion,
+            filter_size=1,
+            stride=1,
+            groups=1,
+            norm_type=norm_type,
+            norm_decay=norm_decay,
+            freeze_norm=freeze_norm,
+            lr=lr)
+
+        self.std_senet = std_senet
+        if self.std_senet:
+            self.se = SELayer(ch_out * self.expansion)
+
+    def forward(self, inputs):
+
+        out = self.branch2a(inputs)
+        out = self.branch2b(out)
+        out = self.branch2c(out)
+
+        if self.std_senet:
+            out = self.se(out)
+
+        if self.shortcut:
+            short = inputs
+        else:
+            short = self.short(inputs)
+
+        out = paddle.add(x=out, y=short)
+        out = F.relu(out)
+
+        return out
+
+
+class Blocks(nn.Layer):
+    def __init__(self,
+                 block,
+                 ch_in,
+                 ch_out,
+                 count,
+                 name_adapter,
+                 stage_num,
+                 variant='b',
+                 groups=1,
+                 base_width=64,
+                 lr=1.0,
+                 norm_type='bn',
+                 norm_decay=0.,
+                 freeze_norm=True,
+                 dcn_v2=False,
+                 std_senet=False):
+        super(Blocks, self).__init__()
+
+        self.blocks = []
+        for i in range(count):
+            conv_name = name_adapter.fix_layer_warp_name(stage_num, count, i)
+            layer = self.add_sublayer(
+                conv_name,
+                block(
+                    ch_in=ch_in,
+                    ch_out=ch_out,
+                    stride=2 if i == 0 and stage_num != 2 else 1,
+                    shortcut=False if i == 0 else True,
+                    variant=variant,
+                    groups=groups,
+                    base_width=base_width,
+                    lr=lr,
+                    norm_type=norm_type,
+                    norm_decay=norm_decay,
+                    freeze_norm=freeze_norm,
+                    dcn_v2=dcn_v2,
+                    std_senet=std_senet))
+            self.blocks.append(layer)
+            if i == 0:
+                ch_in = ch_out * block.expansion
+
+    def forward(self, inputs):
+        block_out = inputs
+        for block in self.blocks:
+            block_out = block(block_out)
+        return block_out
+
+
+@register
+@serializable
+class ResNet(nn.Layer):
+    __shared__ = ['norm_type']
+
+    def __init__(self,
+                 depth=50,
+                 ch_in=64,
+                 variant='b',
+                 lr_mult_list=[1.0, 1.0, 1.0, 1.0],
+                 groups=1,
+                 base_width=64,
+                 norm_type='bn',
+                 norm_decay=0,
+                 freeze_norm=True,
+                 freeze_at=0,
+                 return_idx=[0, 1, 2, 3],
+                 dcn_v2_stages=[-1],
+                 num_stages=4,
+                 std_senet=False):
+        """
+        Residual Network, see https://arxiv.org/abs/1512.03385
+
+        Args:
+            depth (int): ResNet depth, should be 18, 34, 50, 101, 152.
+            ch_in (int): output channel of first stage, default 64
+            variant (str): ResNet variant, supports 'a', 'b', 'c', 'd' currently
+            lr_mult_list (list): learning rate ratio of different resnet stages(2,3,4,5),
+                                 lower learning rate ratio is need for pretrained model
+                                 got using distillation(default as [1.0, 1.0, 1.0, 1.0]).
+            groups (int): group convolution cardinality
+            base_width (int): base width of each group convolution
+            norm_type (str): normalization type, 'bn', 'sync_bn' or 'affine_channel'
+            norm_decay (float): weight decay for normalization layer weights
+            freeze_norm (bool): freeze normalization layers
+            freeze_at (int): freeze the backbone at which stage
+            return_idx (list): index of the stages whose feature maps are returned
+            dcn_v2_stages (list): index of stages who select deformable conv v2
+            num_stages (int): total num of stages
+            std_senet (bool): whether use senet, default True
+        """
+        super(ResNet, self).__init__()
+        self._model_type = 'ResNet' if groups == 1 else 'ResNeXt'
+        assert num_stages >= 1 and num_stages <= 4
+        self.depth = depth
+        self.variant = variant
+        self.groups = groups
+        self.base_width = base_width
+        self.norm_type = norm_type
+        self.norm_decay = norm_decay
+        self.freeze_norm = freeze_norm
+        self.freeze_at = freeze_at
+        if isinstance(return_idx, Integral):
+            return_idx = [return_idx]
+        assert max(return_idx) < num_stages, \
+            'the maximum return index must smaller than num_stages, ' \
+            'but received maximum return index is {} and num_stages ' \
+            'is {}'.format(max(return_idx), num_stages)
+        self.return_idx = return_idx
+        self.num_stages = num_stages
+        assert len(lr_mult_list) == 4, \
+            "lr_mult_list length must be 4 but got {}".format(len(lr_mult_list))
+        if isinstance(dcn_v2_stages, Integral):
+            dcn_v2_stages = [dcn_v2_stages]
+        assert max(dcn_v2_stages) < num_stages
+
+        if isinstance(dcn_v2_stages, Integral):
+            dcn_v2_stages = [dcn_v2_stages]
+        assert max(dcn_v2_stages) < num_stages
+        self.dcn_v2_stages = dcn_v2_stages
+
+        block_nums = ResNet_cfg[depth]
+        na = NameAdapter(self)
+
+        conv1_name = na.fix_c1_stage_name()
+        if variant in ['c', 'd']:
+            conv_def = [
+                [3, ch_in // 2, 3, 2, "conv1_1"],
+                [ch_in // 2, ch_in // 2, 3, 1, "conv1_2"],
+                [ch_in // 2, ch_in, 3, 1, "conv1_3"],
+            ]
+        else:
+            conv_def = [[3, ch_in, 7, 2, conv1_name]]
+        self.conv1 = nn.Sequential()
+        for (c_in, c_out, k, s, _name) in conv_def:
+            self.conv1.add_sublayer(
+                _name,
+                ConvNormLayer(
+                    ch_in=c_in,
+                    ch_out=c_out,
+                    filter_size=k,
+                    stride=s,
+                    groups=1,
+                    act='relu',
+                    norm_type=norm_type,
+                    norm_decay=norm_decay,
+                    freeze_norm=freeze_norm,
+                    lr=1.0))
+
+        self.ch_in = ch_in
+        ch_out_list = [64, 128, 256, 512]
+        block = BottleNeck if depth >= 50 else BasicBlock
+
+        self._out_channels = [block.expansion * v for v in ch_out_list]
+        self._out_strides = [4, 8, 16, 32]
+
+        self.res_layers = []
+        for i in range(num_stages):
+            lr_mult = lr_mult_list[i]
+            stage_num = i + 2
+            res_name = "res{}".format(stage_num)
+            res_layer = self.add_sublayer(
+                res_name,
+                Blocks(
+                    block,
+                    self.ch_in,
+                    ch_out_list[i],
+                    count=block_nums[i],
+                    name_adapter=na,
+                    stage_num=stage_num,
+                    variant=variant,
+                    groups=groups,
+                    base_width=base_width,
+                    lr=lr_mult,
+                    norm_type=norm_type,
+                    norm_decay=norm_decay,
+                    freeze_norm=freeze_norm,
+                    dcn_v2=(i in self.dcn_v2_stages),
+                    std_senet=std_senet))
+            self.res_layers.append(res_layer)
+            self.ch_in = self._out_channels[i]
+
+        if freeze_at >= 0:
+            self._freeze_parameters(self.conv1)
+            for i in range(min(freeze_at + 1, num_stages)):
+                self._freeze_parameters(self.res_layers[i])
+
+    def _freeze_parameters(self, m):
+        for p in m.parameters():
+            p.stop_gradient = True
+
+    @property
+    def out_shape(self):
+        return [
+            ShapeSpec(
+                channels=self._out_channels[i], stride=self._out_strides[i])
+            for i in self.return_idx
+        ]
+
+    def forward(self, inputs):
+        x = inputs['image']
+        conv1 = self.conv1(x)
+        x = F.max_pool2d(conv1, kernel_size=3, stride=2, padding=1)
+        outs = []
+        for idx, stage in enumerate(self.res_layers):
+            x = stage(x)
+            if idx in self.return_idx:
+                outs.append(x)
+        return outs
+
+
+@register
+class Res5Head(nn.Layer):
+    def __init__(self, depth=50):
+        super(Res5Head, self).__init__()
+        feat_in, feat_out = [1024, 512]
+        if depth < 50:
+            feat_in = 256
+        na = NameAdapter(self)
+        block = BottleNeck if depth >= 50 else BasicBlock
+        self.res5 = Blocks(
+            block, feat_in, feat_out, count=3, name_adapter=na, stage_num=5)
+        self.feat_out = feat_out if depth < 50 else feat_out * 4
+
+    @property
+    def out_shape(self):
+        return [ShapeSpec(
+            channels=self.feat_out,
+            stride=16, )]
+
+    def forward(self, roi_feat, stage=0):
+        y = self.res5(roi_feat)
+        return y

+ 139 - 0
paddlers/models/ppdet/modeling/backbones/senet.py

@@ -0,0 +1,139 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle.nn as nn
+
+from paddlers.models.ppdet.core.workspace import register, serializable
+from ..shape_spec import ShapeSpec
+from .name_adapter import NameAdapter
+from .resnet import ResNet, Blocks, BasicBlock, BottleNeck
+
+__all__ = ['SENet', 'SERes5Head']
+
+
+@register
+@serializable
+class SENet(ResNet):
+    __shared__ = ['norm_type']
+
+    def __init__(self,
+                 depth=50,
+                 variant='b',
+                 lr_mult_list=[1.0, 1.0, 1.0, 1.0],
+                 groups=1,
+                 base_width=64,
+                 norm_type='bn',
+                 norm_decay=0,
+                 freeze_norm=True,
+                 freeze_at=0,
+                 return_idx=[0, 1, 2, 3],
+                 dcn_v2_stages=[-1],
+                 std_senet=True,
+                 num_stages=4):
+        """
+        Squeeze-and-Excitation Networks, see https://arxiv.org/abs/1709.01507
+
+        Args:
+            depth (int): SENet depth, should be 50, 101, 152
+            variant (str): ResNet variant, supports 'a', 'b', 'c', 'd' currently
+            lr_mult_list (list): learning rate ratio of different resnet stages(2,3,4,5),
+                                 lower learning rate ratio is need for pretrained model
+                                 got using distillation(default as [1.0, 1.0, 1.0, 1.0]).
+            groups (int): group convolution cardinality
+            base_width (int): base width of each group convolution
+            norm_type (str): normalization type, 'bn', 'sync_bn' or 'affine_channel'
+            norm_decay (float): weight decay for normalization layer weights
+            freeze_norm (bool): freeze normalization layers
+            freeze_at (int): freeze the backbone at which stage
+            return_idx (list): index of the stages whose feature maps are returned
+            dcn_v2_stages (list): index of stages who select deformable conv v2
+            std_senet (bool): whether use senet, default True
+            num_stages (int): total num of stages
+        """
+
+        super(SENet, self).__init__(
+            depth=depth,
+            variant=variant,
+            lr_mult_list=lr_mult_list,
+            ch_in=128,
+            groups=groups,
+            base_width=base_width,
+            norm_type=norm_type,
+            norm_decay=norm_decay,
+            freeze_norm=freeze_norm,
+            freeze_at=freeze_at,
+            return_idx=return_idx,
+            dcn_v2_stages=dcn_v2_stages,
+            std_senet=std_senet,
+            num_stages=num_stages)
+
+
+@register
+class SERes5Head(nn.Layer):
+    def __init__(self,
+                 depth=50,
+                 variant='b',
+                 lr_mult=1.0,
+                 groups=1,
+                 base_width=64,
+                 norm_type='bn',
+                 norm_decay=0,
+                 dcn_v2=False,
+                 freeze_norm=False,
+                 std_senet=True):
+        """
+        SERes5Head layer
+
+        Args:
+            depth (int): SENet depth, should be 50, 101, 152
+            variant (str): ResNet variant, supports 'a', 'b', 'c', 'd' currently
+            lr_mult (list): learning rate ratio of SERes5Head, default as 1.0.
+            groups (int): group convolution cardinality
+            base_width (int): base width of each group convolution
+            norm_type (str): normalization type, 'bn', 'sync_bn' or 'affine_channel'
+            norm_decay (float): weight decay for normalization layer weights
+            dcn_v2_stages (list): index of stages who select deformable conv v2
+            std_senet (bool): whether use senet, default True
+
+        """
+        super(SERes5Head, self).__init__()
+        ch_out = 512
+        ch_in = 256 if depth < 50 else 1024
+        na = NameAdapter(self)
+        block = BottleNeck if depth >= 50 else BasicBlock
+        self.res5 = Blocks(
+            block,
+            ch_in,
+            ch_out,
+            count=3,
+            name_adapter=na,
+            stage_num=5,
+            variant=variant,
+            groups=groups,
+            base_width=base_width,
+            lr=lr_mult,
+            norm_type=norm_type,
+            norm_decay=norm_decay,
+            freeze_norm=freeze_norm,
+            dcn_v2=dcn_v2,
+            std_senet=std_senet)
+        self.ch_out = ch_out * block.expansion
+
+    @property
+    def out_shape(self):
+        return [ShapeSpec(
+            channels=self.ch_out,
+            stride=16, )]
+
+    def forward(self, roi_feat):
+        y = self.res5(roi_feat)
+        return y

+ 251 - 0
paddlers/models/ppdet/modeling/backbones/shufflenet_v2.py

@@ -0,0 +1,251 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserve.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import paddle
+import paddle.nn as nn
+from paddle import ParamAttr
+import paddle.nn.functional as F
+from paddle.nn import Conv2D, MaxPool2D, AdaptiveAvgPool2D, BatchNorm2D
+from paddle.nn.initializer import KaimingNormal
+from paddle.regularizer import L2Decay
+
+from paddlers.models.ppdet.core.workspace import register, serializable
+from numbers import Integral
+from ..shape_spec import ShapeSpec
+from paddlers.models.ppdet.modeling.ops import channel_shuffle
+
+__all__ = ['ShuffleNetV2']
+
+
+class ConvBNLayer(nn.Layer):
+    def __init__(self,
+                 in_channels,
+                 out_channels,
+                 kernel_size,
+                 stride,
+                 padding,
+                 groups=1,
+                 act=None):
+        super(ConvBNLayer, self).__init__()
+        self._conv = Conv2D(
+            in_channels=in_channels,
+            out_channels=out_channels,
+            kernel_size=kernel_size,
+            stride=stride,
+            padding=padding,
+            groups=groups,
+            weight_attr=ParamAttr(initializer=KaimingNormal()),
+            bias_attr=False)
+
+        self._batch_norm = BatchNorm2D(
+            out_channels,
+            weight_attr=ParamAttr(regularizer=L2Decay(0.0)),
+            bias_attr=ParamAttr(regularizer=L2Decay(0.0)))
+        if act == "hard_swish":
+            act = 'hardswish'
+        self.act = act
+
+    def forward(self, inputs):
+        y = self._conv(inputs)
+        y = self._batch_norm(y)
+        if self.act:
+            y = getattr(F, self.act)(y)
+        return y
+
+
+class InvertedResidual(nn.Layer):
+    def __init__(self, in_channels, out_channels, stride, act="relu"):
+        super(InvertedResidual, self).__init__()
+        self._conv_pw = ConvBNLayer(
+            in_channels=in_channels // 2,
+            out_channels=out_channels // 2,
+            kernel_size=1,
+            stride=1,
+            padding=0,
+            groups=1,
+            act=act)
+        self._conv_dw = ConvBNLayer(
+            in_channels=out_channels // 2,
+            out_channels=out_channels // 2,
+            kernel_size=3,
+            stride=stride,
+            padding=1,
+            groups=out_channels // 2,
+            act=None)
+        self._conv_linear = ConvBNLayer(
+            in_channels=out_channels // 2,
+            out_channels=out_channels // 2,
+            kernel_size=1,
+            stride=1,
+            padding=0,
+            groups=1,
+            act=act)
+
+    def forward(self, inputs):
+        x1, x2 = paddle.split(
+            inputs,
+            num_or_sections=[inputs.shape[1] // 2, inputs.shape[1] // 2],
+            axis=1)
+        x2 = self._conv_pw(x2)
+        x2 = self._conv_dw(x2)
+        x2 = self._conv_linear(x2)
+        out = paddle.concat([x1, x2], axis=1)
+        return channel_shuffle(out, 2)
+
+
+class InvertedResidualDS(nn.Layer):
+    def __init__(self, in_channels, out_channels, stride, act="relu"):
+        super(InvertedResidualDS, self).__init__()
+
+        # branch1
+        self._conv_dw_1 = ConvBNLayer(
+            in_channels=in_channels,
+            out_channels=in_channels,
+            kernel_size=3,
+            stride=stride,
+            padding=1,
+            groups=in_channels,
+            act=None)
+        self._conv_linear_1 = ConvBNLayer(
+            in_channels=in_channels,
+            out_channels=out_channels // 2,
+            kernel_size=1,
+            stride=1,
+            padding=0,
+            groups=1,
+            act=act)
+        # branch2
+        self._conv_pw_2 = ConvBNLayer(
+            in_channels=in_channels,
+            out_channels=out_channels // 2,
+            kernel_size=1,
+            stride=1,
+            padding=0,
+            groups=1,
+            act=act)
+        self._conv_dw_2 = ConvBNLayer(
+            in_channels=out_channels // 2,
+            out_channels=out_channels // 2,
+            kernel_size=3,
+            stride=stride,
+            padding=1,
+            groups=out_channels // 2,
+            act=None)
+        self._conv_linear_2 = ConvBNLayer(
+            in_channels=out_channels // 2,
+            out_channels=out_channels // 2,
+            kernel_size=1,
+            stride=1,
+            padding=0,
+            groups=1,
+            act=act)
+
+    def forward(self, inputs):
+        x1 = self._conv_dw_1(inputs)
+        x1 = self._conv_linear_1(x1)
+        x2 = self._conv_pw_2(inputs)
+        x2 = self._conv_dw_2(x2)
+        x2 = self._conv_linear_2(x2)
+        out = paddle.concat([x1, x2], axis=1)
+
+        return channel_shuffle(out, 2)
+
+
+@register
+@serializable
+class ShuffleNetV2(nn.Layer):
+    def __init__(self, scale=1.0, act="relu", feature_maps=[5, 13, 17]):
+        super(ShuffleNetV2, self).__init__()
+        self.scale = scale
+        if isinstance(feature_maps, Integral):
+            feature_maps = [feature_maps]
+        self.feature_maps = feature_maps
+        stage_repeats = [4, 8, 4]
+
+        if scale == 0.25:
+            stage_out_channels = [-1, 24, 24, 48, 96, 512]
+        elif scale == 0.33:
+            stage_out_channels = [-1, 24, 32, 64, 128, 512]
+        elif scale == 0.5:
+            stage_out_channels = [-1, 24, 48, 96, 192, 1024]
+        elif scale == 1.0:
+            stage_out_channels = [-1, 24, 116, 232, 464, 1024]
+        elif scale == 1.5:
+            stage_out_channels = [-1, 24, 176, 352, 704, 1024]
+        elif scale == 2.0:
+            stage_out_channels = [-1, 24, 224, 488, 976, 2048]
+        else:
+            raise NotImplementedError("This scale size:[" + str(scale) +
+                                      "] is not implemented!")
+
+        self._out_channels = []
+        self._feature_idx = 0
+        # 1. conv1
+        self._conv1 = ConvBNLayer(
+            in_channels=3,
+            out_channels=stage_out_channels[1],
+            kernel_size=3,
+            stride=2,
+            padding=1,
+            act=act)
+        self._max_pool = MaxPool2D(kernel_size=3, stride=2, padding=1)
+        self._feature_idx += 1
+
+        # 2. bottleneck sequences
+        self._block_list = []
+        for stage_id, num_repeat in enumerate(stage_repeats):
+            for i in range(num_repeat):
+                if i == 0:
+                    block = self.add_sublayer(
+                        name=str(stage_id + 2) + '_' + str(i + 1),
+                        sublayer=InvertedResidualDS(
+                            in_channels=stage_out_channels[stage_id + 1],
+                            out_channels=stage_out_channels[stage_id + 2],
+                            stride=2,
+                            act=act))
+                else:
+                    block = self.add_sublayer(
+                        name=str(stage_id + 2) + '_' + str(i + 1),
+                        sublayer=InvertedResidual(
+                            in_channels=stage_out_channels[stage_id + 2],
+                            out_channels=stage_out_channels[stage_id + 2],
+                            stride=1,
+                            act=act))
+                self._block_list.append(block)
+                self._feature_idx += 1
+                self._update_out_channels(stage_out_channels[stage_id + 2],
+                                          self._feature_idx, self.feature_maps)
+
+    def _update_out_channels(self, channel, feature_idx, feature_maps):
+        if feature_idx in feature_maps:
+            self._out_channels.append(channel)
+
+    def forward(self, inputs):
+        y = self._conv1(inputs['image'])
+        y = self._max_pool(y)
+        outs = []
+        for i, inv in enumerate(self._block_list):
+            y = inv(y)
+            if i + 2 in self.feature_maps:
+                outs.append(y)
+
+        return outs
+
+    @property
+    def out_shape(self):
+        return [ShapeSpec(channels=c) for c in self._out_channels]

Неке датотеке нису приказане због велике количине промена