Browse Source

【Hackathon + No.150】Add Rotated Box Detection Functionality (#100)

Asthestarsfalll 2 years ago
parent
commit
c6f15f726f
100 changed files with 8559 additions and 909 deletions
  1. + 3 - 1  docs/apis/data_cn.md
  2. + 3 - 1  docs/apis/data_en.md
  3. + 3 - 1  docs/apis/train_cn.md
  4. + 3 - 1  docs/apis/train_en.md
  5. + 3 - 2  docs/intro/data_prep_cn.md
  6. + 3 - 2  docs/intro/data_prep_en.md
  7. + 1 - 0  docs/intro/model_cons_params_cn.md
  8. + 1 - 0  docs/intro/model_cons_params_en.md
  9. + 1 - 0  docs/intro/model_zoo_cn.md
  10. + 1 - 0  docs/intro/model_zoo_en.md
  11. + 9 - 0  docs/quick_start_cn.md
  12. + 9 - 0  docs/quick_start_en.md
  13. + 31 - 5  paddlers/datasets/base.py
  14. + 3 - 2  paddlers/datasets/cd_dataset.py
  15. + 4 - 2  paddlers/datasets/clas_dataset.py
  16. + 49 - 29  paddlers/datasets/coco.py
  17. + 3 - 2  paddlers/datasets/res_dataset.py
  18. + 3 - 2  paddlers/datasets/seg_dataset.py
  19. + 5 - 2  paddlers/datasets/voc.py
  20. + 9 - 0  paddlers/models/ppdet/core/workspace.py
  21. + 1 - 1  paddlers/models/ppdet/data/crop_utils/__init__.py
  22. + 91 - 53  paddlers/models/ppdet/data/crop_utils/annotation_cropper.py
  23. + 10 - 6  paddlers/models/ppdet/data/crop_utils/chip_box_utils.py
  24. + 309 - 0  paddlers/models/ppdet/data/reader.py
  25. + 1 - 0  paddlers/models/ppdet/data/source/__init__.py
  26. + 3 - 0  paddlers/models/ppdet/data/source/category.py
  27. + 237 - 3  paddlers/models/ppdet/data/source/coco.py
  28. + 9 - 1  paddlers/models/ppdet/data/source/dataset.py
  29. + 84 - 29  paddlers/models/ppdet/data/source/keypoint_coco.py
  30. + 380 - 0  paddlers/models/ppdet/data/source/pose3d_cmb.py
  31. + 2 - 0  paddlers/models/ppdet/data/transform/__init__.py
  32. + 159 - 7  paddlers/models/ppdet/data/transform/atss_assigner.py
  33. + 359 - 42  paddlers/models/ppdet/data/transform/batch_operators.py
  34. + 832 - 85  paddlers/models/ppdet/data/transform/keypoint_operators.py
  35. + 296 - 0  paddlers/models/ppdet/data/transform/keypoints_3d_operators.py
  36. + 500 - 71  paddlers/models/ppdet/data/transform/operators.py
  37. + 7 - 0  paddlers/models/ppdet/engine/__init__.py
  38. + 111 - 47  paddlers/models/ppdet/engine/callbacks.py
  39. + 54 - 6  paddlers/models/ppdet/engine/export_utils.py
  40. + 107 - 10  paddlers/models/ppdet/engine/tracker.py
  41. + 147 - 30  paddlers/models/ppdet/engine/trainer.py
  42. + 42 - 0  paddlers/models/ppdet/engine/trainer_cot.py
  43. + 475 - 0  paddlers/models/ppdet/engine/trainer_ssod.py
  44. + 18 - 17  paddlers/models/ppdet/ext_op/csrc/matched_rbox_iou/matched_rbox_iou.cc
  45. + 9 - 14  paddlers/models/ppdet/ext_op/csrc/matched_rbox_iou/matched_rbox_iou.cu
  46. + 121 - 0  paddlers/models/ppdet/ext_op/csrc/nms_rotated/nms_rotated.cc
  47. + 96 - 0  paddlers/models/ppdet/ext_op/csrc/nms_rotated/nms_rotated.cu
  48. + 95 - 0  paddlers/models/ppdet/ext_op/csrc/rbox_iou/rbox_iou.cc
  49. + 6 - 11  paddlers/models/ppdet/ext_op/csrc/rbox_iou/rbox_iou.cu
  50. + 0 - 97  paddlers/models/ppdet/ext_op/csrc/rbox_iou/rbox_iou_op.cc
  51. + 12 - 4  paddlers/models/ppdet/ext_op/csrc/rbox_iou/rbox_iou_utils.h
  52. + 1 - 1  paddlers/models/ppdet/hash.txt
  53. + 2 - 1  paddlers/models/ppdet/metrics/__init__.py
  54. + 6 - 2  paddlers/models/ppdet/metrics/coco_utils.py
  55. + 16 - 0  paddlers/models/ppdet/metrics/json_results.py
  56. + 1 - 1  paddlers/models/ppdet/metrics/metrics.py
  57. + 200 - 0  paddlers/models/ppdet/metrics/pose3d_metrics.py
  58. + 2 - 0  paddlers/models/ppdet/modeling/__init__.py
  59. + 11 - 0  paddlers/models/ppdet/modeling/architectures/__init__.py
  60. + 35 - 9  paddlers/models/ppdet/modeling/architectures/blazeface.py
  61. + 1 - 1  paddlers/models/ppdet/modeling/architectures/cascade_rcnn.py
  62. + 11 - 16  paddlers/models/ppdet/modeling/architectures/centernet.py
  63. + 176 - 0  paddlers/models/ppdet/modeling/architectures/centertrack.py
  64. + 13 - 5  paddlers/models/ppdet/modeling/architectures/detr.py
  65. + 61 - 5  paddlers/models/ppdet/modeling/architectures/faster_rcnn.py
  66. + 30 - 39  paddlers/models/ppdet/modeling/architectures/fcos.py
  67. + 207 - 6  paddlers/models/ppdet/modeling/architectures/keypoint_hrnet.py
  68. + 217 - 0  paddlers/models/ppdet/modeling/architectures/keypoint_petr.py
  69. + 22 - 5  paddlers/models/ppdet/modeling/architectures/mask_rcnn.py
  70. + 2 - 1  paddlers/models/ppdet/modeling/architectures/meta_arch.py
  71. + 114 - 0  paddlers/models/ppdet/modeling/architectures/pose3d_metro.py
  72. + 260 - 0  paddlers/models/ppdet/modeling/architectures/ppyoloe.py
  73. + 104 - 0  paddlers/models/ppdet/modeling/architectures/queryinst.py
  74. + 18 - 2  paddlers/models/ppdet/modeling/architectures/retinanet.py
  75. + 3 - 3  paddlers/models/ppdet/modeling/architectures/sparse_rcnn.py
  76. + 35 - 9  paddlers/models/ppdet/modeling/architectures/ssd.py
  77. + 28 - 5  paddlers/models/ppdet/modeling/architectures/yolo.py
  78. + 88 - 0  paddlers/models/ppdet/modeling/architectures/yolof.py
  79. + 10 - 0  paddlers/models/ppdet/modeling/assigners/__init__.py
  80. + 16 - 6  paddlers/models/ppdet/modeling/assigners/atss_assigner.py
  81. + 227 - 0  paddlers/models/ppdet/modeling/assigners/fcosr_assigner.py
  82. + 316 - 0  paddlers/models/ppdet/modeling/assigners/hungarian_assigner.py
  83. + 275 - 0  paddlers/models/ppdet/modeling/assigners/pose_utils.py
  84. + 164 - 0  paddlers/models/ppdet/modeling/assigners/rotated_task_aligned_assigner.py
  85. + 1 - 1  paddlers/models/ppdet/modeling/assigners/simota_assigner.py
  86. + 38 - 4  paddlers/models/ppdet/modeling/assigners/task_aligned_assigner.py
  87. + 182 - 0  paddlers/models/ppdet/modeling/assigners/task_aligned_assigner_cr.py
  88. + 93 - 0  paddlers/models/ppdet/modeling/assigners/uniform_assigner.py
  89. + 8 - 3  paddlers/models/ppdet/modeling/assigners/utils.py
  90. + 2 - 0  paddlers/models/ppdet/modeling/backbones/__init__.py
  91. + 49 - 9  paddlers/models/ppdet/modeling/backbones/dla.py
  92. + 144 - 2  paddlers/models/ppdet/modeling/backbones/hrnet.py
  93. + 5 - 0  paddlers/models/ppdet/modeling/backbones/lite_hrnet.py
  94. + 30 - 30  paddlers/models/ppdet/modeling/backbones/resnet.py
  95. + 381 - 0  paddlers/models/ppdet/modeling/backbones/trans_encoder.py
  96. + 29 - 11  paddlers/models/ppdet/modeling/backbones/vision_transformer.py
  97. + 99 - 94  paddlers/models/ppdet/modeling/bbox_utils.py
  98. + 13 - 0  paddlers/models/ppdet/modeling/heads/__init__.py
  99. + 51 - 12  paddlers/models/ppdet/modeling/heads/bbox_head.py
  100. + 42 - 40  paddlers/models/ppdet/modeling/heads/centernet_head.py

+ 3 - 1
docs/apis/data_cn.md

@@ -57,13 +57,14 @@
 |-------|----|--------|-----|
 |`data_dir`|`str`|数据集存放目录。||
 |`image_dir`|`str`|输入图像存放目录。||
-|`ann_path`|`str`|[COCO格式](https://cocodataset.org/#home)标注文件路径。||
+|`anno_path`|`str`|[COCO格式](https://cocodataset.org/#home)标注文件路径。||
 |`transforms`|`paddlers.transforms.Compose`|对输入数据应用的数据变换算子。||
 |`label_list`|`str` \| `None`|label list文件。label list是一个文本文件,其中每一行包含一个类别的名称。|`None`|
 |`num_workers`|`int` \| `str`|加载数据时使用的辅助进程数。若设置为`'auto'`,则按照如下规则确定使用进程数:当CPU核心数大于16时,使用8个数据读取辅助进程;否则,使用CPU核心数一半数量的辅助进程。|`'auto'`|
 |`shuffle`|`bool`|是否随机打乱数据集中的样本。|`False`|
 |`allow_empty`|`bool`|是否向数据集中添加负样本。|`False`|
 |`empty_ratio`|`float`|负样本占比,仅当`allow_empty`为`True`时生效。若`empty_ratio`为负值或大于等于1,则保留所有生成的负样本。|`1.0`|
+|`batch_transforms`|`paddlers.transforms.BatchCompose`|对输入数据应用的批数据变换算子。||
 
 ### VOC格式目标检测数据集`VOCDetDataset`
 
@@ -81,6 +82,7 @@
 |`shuffle`|`bool`|是否随机打乱数据集中的样本。|`False`|
 |`allow_empty`|`bool`|是否向数据集中添加负样本。|`False`|
 |`empty_ratio`|`float`|负样本占比,仅当`allow_empty`为`True`时生效。若`empty_ratio`为负值或大于等于1,则保留所有生成的负样本。|`1.0`|
+|`batch_transforms`|`paddlers.transforms.BatchCompose`|对输入数据应用的批数据变换算子。||
 
 `VOCDetDataset`对file list的要求如下:
 

+ 3 - 1
docs/apis/data_en.md

@@ -57,13 +57,14 @@ The initialization parameter list is as follows:
 |-------|----|--------|-----|
 |`data_dir`|`str`|Directory that stores the dataset.||
 |`image_dir`|`str`|Directory of input images.||
-|`ann_path`|`str`|[COCO Format](https://cocodataset.org/#home)label file path.||
+|`anno_path`|`str`|[COCO Format](https://cocodataset.org/#home)label file path.||
 |`transforms`|`paddlers.transforms.Compose`|Data transformation operators applied to input data.||
 |`label_list`|`str` \| `None`|Label list path. Label list is a text file, in which each line contains the name of class.|`None`|
 |`num_workers`|`int` \| `str`|Number of auxiliary processes used when loading data. If it is set to `'auto'`, use the following rules to determine the number of processes to use: When the number of CPU cores is greater than 16, 8 data read auxiliary processes are used; otherwise, the number of auxiliary processes is set to half the counts of CPU cores.|`'auto'`|
 |`shuffle`|`bool`|Whether to randomly shuffle the samples in the dataset.|`False`|
 |`allow_empty`|`bool`|Whether to add negative samples to the dataset.|`False`|
 |`empty_ratio`|`float`|Negative sample ratio. Take effect only if `allow_empty` is `True`. If `empty_ratio` is negative or greater than or equal to 1, all negative samples generated are retained.|`1.0`|
+|`batch_transforms`|`paddlers.transforms.BatchCompose`|Data batch transformation operators applied to input data.||
 
 ### VOC Format Object Detection Dataset `VOCDetDataset`
 
@@ -81,6 +82,7 @@ The initialization parameter list is as follows:
 |`shuffle`|`bool`|Whether to randomly shuffle the samples in the dataset.|`False`|
 |`allow_empty`|`bool`|Whether to add negative samples to the dataset.|`False`|
 |`empty_ratio`|`float`|Negative sample ratio. Takes effect only if `allow_empty` is `True`. If `empty_ratio` is negative or greater than or equal to `1`, all negative samples generated will be retained.|`1.0`|
+|`batch_transforms`|`paddlers.transforms.BatchCompose`|Data batch transformation operators applied to input data.||
 
 The requirements of `VOCDetDataset` for the file list are as follows:
 

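For reference, here is a minimal sketch of constructing a COCO-format detection dataset with the renamed `anno_path` argument and the new `batch_transforms` argument documented above. The directory layout, the sample-level operators, and the `BatchPad` operator name are illustrative assumptions rather than part of this PR; only `Compose`, `BatchCompose`, and the `COCODetDataset` signature follow the code changed here.

```python
# Minimal sketch (assumed paths and operator names) of the new
# `batch_transforms` argument together with the renamed `anno_path`.
import paddlers as pdrs
import paddlers.transforms as T

sample_transforms = T.Compose([
    T.DecodeImg(),  # decode image files into arrays
    T.Normalize(
        mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])

# Batch-level operators run on the whole list of samples inside the collate
# function (see the BaseDataset.batch_collate_fn change in this PR).
# `T.BatchPad` is a placeholder name used only for illustration; substitute a
# real batch operator shipped with PaddleRS.
batch_transforms = T.BatchCompose([T.BatchPad()])

train_dataset = pdrs.datasets.COCODetDataset(
    data_dir='dataset/',                  # illustrative paths
    image_dir='images',
    anno_path='annotations/train.json',   # renamed from `ann_path`
    transforms=sample_transforms,
    label_list=None,                      # optional as of this PR
    batch_transforms=batch_transforms,    # a BatchCompose or a plain list
    shuffle=True)
```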
+ 3 - 1
docs/apis/train_cn.md

@@ -166,6 +166,7 @@ def train(self,
           warmup_start_lr=0.0,
           lr_decay_epochs=(216, 243),
           lr_decay_gamma=0.1,
+          cosine_decay_num_epochs=1000,
           metric=None,
           use_ema=False,
           early_stop=False,
@@ -196,7 +197,8 @@ def train(self,
 |`warmup_start_lr`|`int`|默认优化器warm-up阶段使用的初始学习率。|`0`|
 |`lr_decay_epochs`|`list` \| `tuple`|默认优化器学习率衰减的milestones,以epoch计。即,在第几个epoch执行学习率的衰减。|`(216, 243)`|
 |`lr_decay_gamma`|`float`|学习率衰减系数,适用于默认优化器。|`0.1`|
-|`metric`|`str` \| `None`|评价指标,可以为`'VOC'`、`COCO`或`None`。若为`None`,则根据数据集格式自动确定使用的评价指标。|`None`|
+|`cosine_decay_num_epochs`|`int`|使用余弦退火学习率调度器时计算退火周期的参数。|`1000`|
+|`metric`|`str` \| `None`|评价指标,可以为`'VOC'`、`'COCO'`、`'RBOX'`或`None`。若为`None`,则根据数据集格式自动确定使用的评价指标。|`None`|
 |`use_ema`|`bool`|是否启用[指数滑动平均策略](https://github.com/PaddlePaddle/PaddleRS/blob/develop/paddlers/models/ppdet/optimizer.py)更新模型权重参数。|`False`|
 |`early_stop`|`bool`|训练过程是否启用早停策略。|`False`|
 |`early_stop_patience`|`int`|启用早停策略时的`patience`参数(参见[`EarlyStop`](https://github.com/PaddlePaddle/PaddleRS/blob/develop/paddlers/utils/utils.py))。|`5`|

+ 3 - 1
docs/apis/train_en.md

@@ -166,6 +166,7 @@ def train(self,
           warmup_start_lr=0.0,
           lr_decay_epochs=(216, 243),
           lr_decay_gamma=0.1,
+          cosine_decay_num_epochs=1000,
           metric=None,
           use_ema=False,
           early_stop=False,
@@ -196,7 +197,8 @@ The meaning of each parameter is as follows:
 |`warmup_start_lr`|`int`|Default initial learning rate used in the warm-up phase of the optimizer.|`0`|
 |`lr_decay_epochs`|`list` \| `tuple`|Milestones of learning rate decline of the default optimizer, in terms of epochs. That is, which epoch the decay of the learning rate occurs.|`(216, 243)`|
 |`lr_decay_gamma`|`float`|Learning rate attenuation coefficient, for default optimizer.|`0.1`|
-|`metric`|`str` \| `None`|Evaluation metrics, which can be `'VOC'`, `COCO`, or `None`. If `None`, the evaluation metrics will be automatically determined according to the format of the dataset.|`None`|
+|`cosine_decay_num_epochs`|`int`|Parameter to determine the annealing cycle when a cosine annealing learning rate scheduler is used.|`1000`|
+|`metric`|`str` \| `None`|Evaluation metrics, which can be `'VOC'`, `'COCO'`, `'RBOX'`, or `None`. If `None`, the evaluation metrics will be automatically determined according to the format of the dataset.|`None`|
 |`use_ema`|`bool`|Whether to enable [exponential moving average strategy](https://github.com/PaddlePaddle/PaddleRS/blob/develop/paddlers/models/ppdet/optimizer.py) to update model weights.|`False`|
 |`early_stop`|`bool`|Whether to enable the early stopping policy during training.|`False`|
 |`early_stop_patience`|`int`|`patience` parameter when the early stopping policy is enabled. Please refer to [`EarlyStop`](https://github.com/PaddlePaddle/PaddleRS/blob/develop/paddlers/utils/utils.py) for more details.|`5`|
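To show the two new arguments in context, here is a hedged sketch of a training call. The trainer class name (`FCOSR`), its module path, and the dataset variables are assumptions based on the model zoo entry added in this PR; only `cosine_decay_num_epochs` and `metric='RBOX'` come from the table above.

```python
# Sketch only: the trainer class and datasets are assumed; the two new
# keyword arguments (`cosine_decay_num_epochs`, `metric='RBOX'`) are the
# ones documented in this PR.
import paddlers as pdrs

model = pdrs.tasks.det.FCOSR(num_classes=15)  # assumed rotated detector

model.train(
    num_epochs=36,
    train_dataset=train_dataset,    # built as in the dataset sketch earlier
    eval_dataset=eval_dataset,
    train_batch_size=4,
    save_dir='output/fcosr',
    cosine_decay_num_epochs=36,     # annealing period of the cosine LR scheduler
    metric='RBOX')                  # evaluate with rotated-box mAP
```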

+ 3 - 2
docs/intro/data_prep_cn.md

@@ -9,5 +9,6 @@
 | 变化检测 | LEVIR-CD | https://justchenhao.github.io/LEVIR/ | [prepare_levircd.py](https://github.com/PaddlePaddle/PaddleRS/blob/develop/tools/prepare_dataset/prepare_levircd.py) |
 | 变化检测 | Season-varying | https://paperswithcode.com/dataset/cdd-dataset-season-varying | [prepare_svcd.py](https://github.com/PaddlePaddle/PaddleRS/blob/develop/tools/prepare_dataset/prepare_svcd.py) |
 | 场景分类 | UC Merced | http://weegee.vision.ucmerced.edu/datasets/landuse.html | [prepare_ucmerced.py](https://github.com/PaddlePaddle/PaddleRS/blob/develop/tools/prepare_dataset/prepare_ucmerced.py) |
-| 目标检测 | RSOD | https://github.com/RSIA-LIESMARS-WHU/RSOD-Dataset- | [prepare_rsod](https://github.com/PaddlePaddle/PaddleRS/blob/develop/tools/prepare_dataset/prepare_rsod.py) |
-| 图像分割 | iSAID | https://captain-whu.github.io/iSAID/ | [prepare_isaid](https://github.com/PaddlePaddle/PaddleRS/blob/develop/tools/prepare_dataset/prepare_isaid.py) |
+| 目标检测 | DOTA | https://captain-whu.github.io/DOTA/ | [prepare_dota.py](https://github.com/PaddlePaddle/PaddleRS/blob/develop/tools/prepare_dataset/prepare_dota.py) |
+| 目标检测 | RSOD | https://github.com/RSIA-LIESMARS-WHU/RSOD-Dataset- | [prepare_rsod.py](https://github.com/PaddlePaddle/PaddleRS/blob/develop/tools/prepare_dataset/prepare_rsod.py) |
+| 图像分割 | iSAID | https://captain-whu.github.io/iSAID/ | [prepare_isaid.py](https://github.com/PaddlePaddle/PaddleRS/blob/develop/tools/prepare_dataset/prepare_isaid.py) |

+ 3 - 2
docs/intro/data_prep_en.md

@@ -9,5 +9,6 @@
 | Change Detection | LEVIR-CD | https://justchenhao.github.io/LEVIR/ | [prepare_levircd.py](https://github.com/PaddlePaddle/PaddleRS/blob/develop/tools/prepare_dataset/prepare_levircd.py) |
 | Change Detection | Season-varying | https://paperswithcode.com/dataset/cdd-dataset-season-varying | [prepare_svcd.py](https://github.com/PaddlePaddle/PaddleRS/blob/develop/tools/prepare_dataset/prepare_svcd.py) |
 | Scene Classification | UC Merced | http://weegee.vision.ucmerced.edu/datasets/landuse.html | [prepare_ucmerced.py](https://github.com/PaddlePaddle/PaddleRS/blob/develop/tools/prepare_dataset/prepare_ucmerced.py) |
-| Object Detection | RSOD | https://github.com/RSIA-LIESMARS-WHU/RSOD-Dataset- | [prepare_rsod](https://github.com/PaddlePaddle/PaddleRS/blob/develop/tools/prepare_dataset/prepare_rsod.py) |
-| Image Segmentation | iSAID | https://captain-whu.github.io/iSAID/ | [prepare_isaid](https://github.com/PaddlePaddle/PaddleRS/blob/develop/tools/prepare_dataset/prepare_isaid.py) |
+| Object Detection | DOTA | https://captain-whu.github.io/DOTA/ | [prepare_dota.py](https://github.com/PaddlePaddle/PaddleRS/blob/develop/tools/prepare_dataset/prepare_dota.py) |
+| Object Detection | RSOD | https://github.com/RSIA-LIESMARS-WHU/RSOD-Dataset- | [prepare_rsod.py](https://github.com/PaddlePaddle/PaddleRS/blob/develop/tools/prepare_dataset/prepare_rsod.py) |
+| Image Segmentation | iSAID | https://captain-whu.github.io/iSAID/ | [prepare_isaid.py](https://github.com/PaddlePaddle/PaddleRS/blob/develop/tools/prepare_dataset/prepare_isaid.py) |

+ 1 - 0
docs/intro/model_cons_params_cn.md

@@ -449,6 +449,7 @@
 
 | 参数名 | 描述                            | 默认值 |
 | --- |-------------------------------| --- |
+| `rotate (bool)` | 表示是否执行旋转目标检测 | `False` |
 | `num_classes (int)` | 目标类别数量                        | `80` |
 | `backbone (str)` | 骨干网络名称                | `'MobileNetV1'` |
 | `anchors (list[list[int]])` | 预定义锚框的大小                       | `[[10, 13], [16, 30], [33, 23], [30, 61], [62, 45], [59, 119], [116, 90], [156, 198], [373, 326]]` |

+ 1 - 0
docs/intro/model_cons_params_en.md

@@ -443,6 +443,7 @@ The YOLOv3 implementation based on PaddlePaddle.
 
 | Parameter Name | Description                                                                                                                 | Default Value |
 | --- |-----------------------------------------------------------------------------------------------------------------------------| --- |
+| `rotate (bool)` | If True, the model performs rotated object detection | `False` |
 | `num_classes (int)` | Number of target classes                                                                                                    | `80` |
 | `backbone (str)` | Backbone network to use                                                                                      | `'MobileNetV1'` |
 | `anchors (list[list[int]])` | Sizes of predefined anchor boxes                                                                                                   | `[[10, 13], [16, 30], [33, 23], [30, 61], [62, 45 ], [59, 119], [116, 90], [156, 198], [373, 326]]` |
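A one-line illustration of the new flag, assuming it belongs to the constructor documented in this table (the module path below is an assumption): enabling rotated detection is a matter of passing `rotate=True`.

```python
import paddlers as pdrs

# `rotate=True` switches the detector documented above to oriented-box
# prediction; the other arguments keep the defaults listed in the table.
model = pdrs.tasks.det.YOLOv3(
    num_classes=15,
    backbone='MobileNetV1',
    rotate=True)
```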

+ 1 - 0
docs/intro/model_zoo_cn.md

@@ -33,6 +33,7 @@ PaddleRS目前已支持的全部模型如下(标注\*的为遥感专用模型
 | 图像复原 | NAFNet | 是 |
 | 图像复原 | SwinIR | 是 |
 | 目标检测 | Faster R-CNN | 否 |
+| 目标检测 | FCOSR | 否 |
 | 目标检测 | PP-YOLO | 否 |
 | 目标检测 | PP-YOLO Tiny | 否 |
 | 目标检测 | PP-YOLOv2 | 否 |

+ 1 - 0
docs/intro/model_zoo_en.md

@@ -33,6 +33,7 @@ All models currently supported by PaddleRS are listed below (those marked \* are
 | Image Restoration | SwinIR | Yes |
 | Image Restoration | NAFNet | Yes |
 | Object Detection | Faster R-CNN | No |
+| Object Detection | FCOSR | No |
 | Object Detection | PP-YOLO | No |
 | Object Detection | PP-YOLO Tiny | No |
 | Object Detection | PP-YOLOv2 | No |

+ 9 - 0
docs/quick_start_cn.md

@@ -53,6 +53,15 @@ Windows用户可以在[此站点](https://www.lfd.uci.edu/~gohlke/pythonlibs/#gd
 pip install GDAL‑3.3.3‑cp39‑cp39‑win_amd64.whl
 ```
 
+4. (可选)安装ext_op
+
+PaddleRS支持旋转目标检测,在使用之前需要安装`ext_op`外部自定义库,安装方式如下:
+```shell
+cd paddlers/models/ppdet/ext_op
+python setup.py install
+```
+
+
 除了采用上述安装步骤以外,PaddleRS也提供Docker安装方式。具体步骤如下:
 
 1. 从dockerhub拉取镜像:

+ 9 - 0
docs/quick_start_en.md

@@ -46,6 +46,15 @@ Windows users can download GDAL wheels from [this site](https://www.lfd.uci.edu/
 pip install GDAL‑3.3.3‑cp39‑cp39‑win_amd64.whl
 ```
 
+4. (Optional) Install ext_op
+
+PaddleRS supports rotated object detection, which requires the `ext_op` external custom operator library to be installed first. You can install `ext_op` as follows:
+
+```shell
+cd paddlers/models/ppdet/ext_op
+python setup.py install
+```
+
 We also provide a docker image for installation:
 
 1. Pull from dockerhub:

+ 31 - 5
paddlers/datasets/base.py

@@ -18,7 +18,8 @@ from paddle.io import Dataset
 from paddle.fluid.dataloader.collate import default_collate_fn
 
 from paddlers.utils import get_num_workers
-from paddlers.transforms import construct_sample_from_dict, Compose
+import paddlers.utils.logging as logging
+from paddlers.transforms import construct_sample_from_dict, Compose, BatchCompose
 
 
 class BaseDataset(Dataset):
@@ -26,7 +27,13 @@ class BaseDataset(Dataset):
     _KEYS_TO_DISCARD = None
     _collate_trans_info = False
 
-    def __init__(self, data_dir, label_list, transforms, num_workers, shuffle):
+    def __init__(self,
+                 data_dir,
+                 label_list,
+                 transforms,
+                 num_workers,
+                 shuffle,
+                 batch_transforms=None):
         super(BaseDataset, self).__init__()
 
         self.data_dir = data_dir
@@ -37,6 +44,8 @@ class BaseDataset(Dataset):
 
         self.num_workers = get_num_workers(num_workers)
         self.shuffle = shuffle
+        self.batch_transforms = None
+        self.build_collate_fn(batch_transforms)
 
     def __getitem__(self, idx):
         sample = construct_sample_from_dict(self.file_list[idx])
@@ -59,8 +68,25 @@ class BaseDataset(Dataset):
             for key in self._KEYS_TO_DISCARD:
                 for s, _ in batch:
                     s.pop(key, None)
+
+        samples = [s[0] for s in batch]
+
+        if self.batch_transforms:
+            samples = self.batch_transforms(samples)
+
         if self._collate_trans_info:
-            return default_collate_fn(
-                [s[0] for s in batch]), [s[1] for s in batch]
+            return default_collate_fn(samples), [s[1] for s in batch]
         else:
-            return default_collate_fn([s[0] for s in batch])
+            return default_collate_fn(samples)
+
+    def build_collate_fn(self, batch_transforms, collate_fn_constructor=None):
+        if self.batch_transforms is not None and batch_transforms:
+            logging.warning(
+                "The initial `batch_transforms` will be overwritten.")
+        if batch_transforms is not None:
+            batch_transforms = copy.deepcopy(batch_transforms)
+            if isinstance(batch_transforms, list):
+                batch_transforms = BatchCompose(batch_transforms)
+            self.batch_transforms = batch_transforms
+        if collate_fn_constructor:
+            self.collate_fn = collate_fn_constructor(self)
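To make the new collate path concrete, the toy below (plain NumPy, not PaddleRS code) mirrors what `batch_collate_fn` now does: the batch-level transform sees the whole list of samples before anything is stacked. The padding transform and the sample dicts are invented for illustration.

```python
import numpy as np

def pad_to_max_width(samples):
    """Toy batch transform: pad every image in the batch to the widest one."""
    max_w = max(s['image'].shape[1] for s in samples)
    for s in samples:
        w = s['image'].shape[1]
        s['image'] = np.pad(s['image'], ((0, 0), (0, max_w - w), (0, 0)))
    return samples

def batch_collate_fn(batch, batch_transforms=None):
    samples = list(batch)
    if batch_transforms:                      # BatchCompose-like callable
        samples = batch_transforms(samples)   # transforms the whole batch at once
    return {k: np.stack([s[k] for s in samples]) for k in samples[0]}

batch = [{'image': np.zeros((8, 6, 3))},
         {'image': np.zeros((8, 10, 3))}]
out = batch_collate_fn(batch, batch_transforms=pad_to_max_width)
print(out['image'].shape)  # (2, 8, 10, 3)
```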

+ 3 - 2
paddlers/datasets/cd_dataset.py

@@ -55,9 +55,10 @@ class CDDataset(BaseDataset):
                  num_workers='auto',
                  shuffle=False,
                  with_seg_labels=False,
-                 binarize_labels=False):
+                 binarize_labels=False,
+                 batch_transforms=None):
         super(CDDataset, self).__init__(data_dir, label_list, transforms,
-                                        num_workers, shuffle)
+                                        num_workers, shuffle, batch_transforms)
 
         DELIMETER = ' '
 

+ 4 - 2
paddlers/datasets/clas_dataset.py

@@ -42,9 +42,11 @@ class ClasDataset(BaseDataset):
                  transforms,
                  label_list=None,
                  num_workers='auto',
-                 shuffle=False):
+                 shuffle=False,
+                 batch_transforms=None):
         super(ClasDataset, self).__init__(data_dir, label_list, transforms,
-                                          num_workers, shuffle)
+                                          num_workers, shuffle,
+                                          batch_transforms)
         self.file_list = list()
         self.labels = list()
 

+ 49 - 29
paddlers/datasets/coco.py

@@ -17,7 +17,7 @@ import copy
 import os
 import os.path as osp
 import random
-from collections import OrderedDict
+from collections import OrderedDict, defaultdict
 
 import numpy as np
 
@@ -34,7 +34,7 @@ class COCODetDataset(BaseDataset):
     Args:
         data_dir (str): Root directory of the dataset.
         image_dir (str): Directory that contains the images.
-        ann_path (str): Path to COCO annotations.
+        anno_path (str): Path to COCO annotations.
         transforms (paddlers.transforms.Compose|list): Data preprocessing and data augmentation operators to apply.
         label_list (str|None, optional): Path of the file that contains the category names. Defaults to None.
         num_workers (int|str, optional): Number of processes used for data loading. If `num_workers` is 'auto',
@@ -45,6 +45,7 @@ class COCODetDataset(BaseDataset):
         allow_empty (bool, optional): Whether to add negative samples. Defaults to False.
         empty_ratio (float, optional): Ratio of negative samples. If `empty_ratio` is smaller than 0 or not less 
             than 1, keep all generated negative samples. Defaults to 1.0.
+        batch_transforms (paddlers.transforms.BatchCompose|list): Batch transformation operators to apply.
     """
 
     def __init__(self,
@@ -52,11 +53,12 @@ class COCODetDataset(BaseDataset):
                  image_dir,
                  anno_path,
                  transforms,
-                 label_list,
+                 label_list=None,
                  num_workers='auto',
                  shuffle=False,
                  allow_empty=False,
-                 empty_ratio=1.):
+                 empty_ratio=1.,
+                 batch_transforms=None):
         # matplotlib.use() must be called *before* pylab, matplotlib.pyplot,
         # or matplotlib.backends is imported for the first time.
         import matplotlib
@@ -64,7 +66,8 @@ class COCODetDataset(BaseDataset):
         from pycocotools.coco import COCO
 
         super(COCODetDataset, self).__init__(data_dir, label_list, transforms,
-                                             num_workers, shuffle)
+                                             num_workers, shuffle,
+                                             batch_transforms)
 
         self.data_fields = None
         self.num_max_boxes = 50
@@ -83,33 +86,31 @@ class COCODetDataset(BaseDataset):
         self.file_list = list()
         neg_file_list = list()
         self.labels = list()
+        self.anno_path = anno_path
 
-        annotations = dict()
-        annotations['images'] = list()
-        annotations['categories'] = list()
-        annotations['annotations'] = list()
+        annotations = defaultdict(list)
 
         cname2cid = OrderedDict()
         label_id = 0
-        with open(label_list, 'r', encoding=get_encoding(label_list)) as f:
-            for line in f.readlines():
-                cname2cid[line.strip()] = label_id
-                label_id += 1
-                self.labels.append(line.strip())
-
-        for k, v in cname2cid.items():
-            annotations['categories'].append({
-                'supercategory': 'component',
-                'id': v + 1,
-                'name': k
-            })
+        if label_list:
+            with open(label_list, 'r', encoding=get_encoding(label_list)) as f:
+                for line in f.readlines():
+                    cname2cid[line.strip()] = label_id
+                    label_id += 1
+                    self.labels.append(line.strip())
+
+            for k, v in cname2cid.items():
+                annotations['categories'].append({
+                    'supercategory': 'component',
+                    'id': v + 1,
+                    'name': k
+                })
 
         anno_path = norm_path(os.path.join(self.data_dir, anno_path))
         image_dir = norm_path(os.path.join(self.data_dir, image_dir))
 
         assert anno_path.endswith('.json'), \
             'invalid coco annotation file: ' + anno_path
-        from pycocotools.coco import COCO
         coco = COCO(anno_path)
         img_ids = coco.getImgIds()
         img_ids.sort()
@@ -155,7 +156,8 @@ class COCODetDataset(BaseDataset):
             gt_classes = []
             gt_bboxs = []
             gt_scores = []
-            difficults = []
+            gt_poly = []
+            difficulties = []
 
             for inst in instances:
                 # Check gt bbox
@@ -182,12 +184,21 @@ class COCODetDataset(BaseDataset):
                         'area: {} x1: {}, y1: {}, x2: {}, y2: {}.'.format(
                             img_id, float(inst['area']), x1, y1, x2, y2))
 
+                if 'segmentation' in inst and inst['iscrowd']:
+                    gt_poly.append([0.0 for _ in range(8)])
+                elif 'segmentation' in inst and inst['segmentation']:
+                    if not np.array(
+                            inst['segmentation'],
+                            dtype=object).size > 0 and not self.allow_empty:
+                        continue
+                    else:
+                        gt_poly.append(inst['segmentation'])
+
                 is_crowds.append([inst['iscrowd']])
-                gt_classes.append([inst['category_id']])
+                gt_classes.append([catid2clsid[inst['category_id']]])
                 gt_bboxs.append(inst['clean_bbox'])
                 gt_scores.append([1.])
-                difficults.append([0])
-
+                difficulties.append(inst.get('difficult', 0.))
                 annotations['annotations'].append({
                     'iscrowd': inst['iscrowd'],
                     'image_id': int(inst['image_id']),
@@ -195,18 +206,21 @@ class COCODetDataset(BaseDataset):
                     'area': inst['area'],
                     'category_id': inst['category_id'],
                     'id': inst['id'],
-                    'difficult': 0
+                    'difficult': inst.get('difficult', 0.)
                 })
+                if gt_poly:
+                    annotations['annotations'][-1]['gt_poly'] = gt_poly[-1]
 
             label_info = {
                 'is_crowd': np.array(is_crowds),
                 'gt_class': np.array(gt_classes),
                 'gt_bbox': np.array(gt_bboxs).astype(np.float32),
                 'gt_score': np.array(gt_scores).astype(np.float32),
-                'difficult': np.array(difficults),
+                'difficult': np.array(difficulties),
+                'gt_poly': np.array(gt_poly),
             }
 
-            if label_info['gt_bbox'].size > 0:
+            if label_info['gt_bbox'].size > 0 or label_info['gt_poly'].size > 0:
                 self.file_list.append({ ** im_info, ** label_info})
                 annotations['images'].append({
                     'height': im_h,
@@ -259,6 +273,7 @@ class COCODetDataset(BaseDataset):
                 DecodeImg(to_rgb=False)(sample),
                 DecodeImg(to_rgb=False)(sample_mix)
             ])
+
         sample['trans_info'] = []
         sample, trans_info = self.transforms(sample)
         return sample, trans_info
@@ -266,6 +281,11 @@ class COCODetDataset(BaseDataset):
     def __len__(self):
         return self.num_samples
 
+    def get_anno_path(self):
+        if self.anno_path:
+            return norm_path(os.path.join(self.data_dir, self.anno_path))
+        return None
+
     def set_epoch(self, epoch_id):
         self._epoch = epoch_id
 

+ 3 - 2
paddlers/datasets/res_dataset.py

@@ -45,9 +45,10 @@ class ResDataset(BaseDataset):
                  transforms,
                  num_workers='auto',
                  shuffle=False,
-                 sr_factor=None):
+                 sr_factor=None,
+                 batch_transforms=None):
         super(ResDataset, self).__init__(data_dir, None, transforms,
-                                         num_workers, shuffle)
+                                         num_workers, shuffle, batch_transforms)
         self.file_list = list()
 
         with open(file_list, encoding=get_encoding(file_list)) as f:

+ 3 - 2
paddlers/datasets/seg_dataset.py

@@ -43,9 +43,10 @@ class SegDataset(BaseDataset):
                  transforms,
                  label_list=None,
                  num_workers='auto',
-                 shuffle=False):
+                 shuffle=False,
+                 batch_transforms=None):
         super(SegDataset, self).__init__(data_dir, label_list, transforms,
-                                         num_workers, shuffle)
+                                         num_workers, shuffle, batch_transforms)
         self.file_list = list()
         self.labels = list()
 

+ 5 - 2
paddlers/datasets/voc.py

@@ -46,6 +46,7 @@ class VOCDetDataset(BaseDataset):
         allow_empty (bool, optional): Whether to add negative samples. Defaults to False.
         empty_ratio (float, optional): Ratio of negative samples. If `empty_ratio` is smaller than 0 or not less 
             than 1, keep all generated negative samples. Defaults to 1.0.
+        batch_transforms (paddlers.transforms.BatchCompose|list): Batch transformation operators to apply.
     """
 
     def __init__(self,
@@ -56,14 +57,16 @@ class VOCDetDataset(BaseDataset):
                  num_workers='auto',
                  shuffle=False,
                  allow_empty=False,
-                 empty_ratio=1.):
+                 empty_ratio=1.,
+                 batch_transforms=None):
         # matplotlib.use() must be called *before* pylab, matplotlib.pyplot,
         # or matplotlib.backends is imported for the first time.
         import matplotlib
         matplotlib.use('Agg')
         from pycocotools.coco import COCO
         super(VOCDetDataset, self).__init__(data_dir, label_list, transforms,
-                                            num_workers, shuffle)
+                                            num_workers, shuffle,
+                                            batch_transforms)
 
         self.data_fields = None
         self.num_max_boxes = 50

+ 9 - 0
paddlers/models/ppdet/core/workspace.py

@@ -67,6 +67,15 @@ class AttrDict(dict):
             return self[key]
         raise AttributeError("object has no attribute '{}'".format(key))
 
+    def __setattr__(self, key, value):
+        self[key] = value
+
+    def copy(self):
+        new_dict = AttrDict()
+        for k, v in self.items():
+            new_dict.update({k: v})
+        return new_dict
+
 
 global_config = AttrDict()
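A self-contained sketch of the `AttrDict` behavior after this change; the class body restates the two added methods together with the pre-existing `__getattr__`, so the snippet runs on its own.

```python
class AttrDict(dict):
    def __getattr__(self, key):
        if key in self:
            return self[key]
        raise AttributeError("object has no attribute '{}'".format(key))

    def __setattr__(self, key, value):
        # New in this PR: attribute assignment writes into the dict itself.
        self[key] = value

    def copy(self):
        # New in this PR: copy() preserves the AttrDict type.
        new_dict = AttrDict()
        for k, v in self.items():
            new_dict.update({k: v})
        return new_dict

cfg = AttrDict(lr=0.01)
cfg.use_ema = True          # goes through __setattr__, so it lands in the dict
print(cfg['use_ema'])       # True
cfg2 = cfg.copy()
print(type(cfg2).__name__)  # AttrDict (plain dict.copy() would return dict)
```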
 

+ 1 - 1
paddlers/models/ppdet/data/crop_utils/__init__.py

@@ -10,4 +10,4 @@
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
-# limitations under the License.
+# limitations under the License. 

+ 91 - 53
paddlers/models/ppdet/data/crop_utils/annotation_cropper.py

@@ -27,14 +27,15 @@ from .chip_box_utils import intersection_over_box
 
 
 class AnnoCropper(object):
-    def __init__(self, image_target_sizes: List[int],
+    def __init__(self,
+                 image_target_sizes: List[int],
                  valid_box_ratio_ranges: List[List[float]],
-                 chip_target_size: int, chip_target_stride: int,
-                 use_neg_chip: bool = False,
-                 max_neg_num_per_im: int = 8,
-                 max_per_img: int = -1,
-                 nms_thresh: int = 0.5
-                 ):
+                 chip_target_size: int,
+                 chip_target_stride: int,
+                 use_neg_chip: bool=False,
+                 max_neg_num_per_im: int=8,
+                 max_per_img: int=-1,
+                 nms_thresh: int=0.5):
         """
         Generate chips by chip_target_size and chip_target_stride.
         These two parameters just like kernel_size and stride in cnn.
@@ -117,7 +118,8 @@ class AnnoCropper(object):
         self.chip_records = []
         self._global_chip_id = 1
         for r in records:
-            self._cur_im_pos_chips = []  # element: (chip, boxes_idx), chip is [x1, y1, x2, y2], boxes_ids is List[int]
+            self._cur_im_pos_chips = [
+            ]  # element: (chip, boxes_idx), chip is [x1, y1, x2, y2], boxes_ids is List[int]
             self._cur_im_neg_chips = []  # element: (chip, neg_box_num)
             for scale_i in range(self.scale_num):
                 self._get_current_scale_parameters(scale_i, r)
@@ -126,12 +128,16 @@ class AnnoCropper(object):
                 chips = self._create_chips(r['h'], r['w'], self._cur_scale)
 
                 # # dict: chipid->[box_id, ...]
-                pos_chip2boxes_idx = self._get_valid_boxes_and_pos_chips(r['gt_bbox'], chips)
+                pos_chip2boxes_idx = self._get_valid_boxes_and_pos_chips(
+                    r['gt_bbox'], chips)
 
                 # dict: chipid->neg_box_num
-                neg_chip2box_num = self._get_neg_boxes_and_chips(chips, list(pos_chip2boxes_idx.keys()), r.get('proposals', None))
+                neg_chip2box_num = self._get_neg_boxes_and_chips(
+                    chips,
+                    list(pos_chip2boxes_idx.keys()), r.get('proposals', None))
 
-                self._add_to_cur_im_chips(chips, pos_chip2boxes_idx, neg_chip2box_num)
+                self._add_to_cur_im_chips(chips, pos_chip2boxes_idx,
+                                          neg_chip2box_num)
 
             cur_image_records = self._trans_all_chips2annotations(r)
             self.chip_records.extend(cur_image_records)
@@ -147,7 +153,7 @@ class AnnoCropper(object):
 
         for neg_chipid, neg_box_num in neg_chip2box_num.items():
             chip = np.array(chips[neg_chipid])
-            self._cur_im_neg_chips.append((chip,  neg_box_num))
+            self._cur_im_neg_chips.append((chip, neg_box_num))
 
     def _trans_all_chips2annotations(self, r):
         gt_bbox = r['gt_bbox']
@@ -156,20 +162,24 @@ class AnnoCropper(object):
         gt_class = r['gt_class']
         # gt_poly = r['gt_poly']   # [None]xN
         # remaining keys: im_id, h, w
-        chip_records = self._trans_pos_chips2annotations(im_file, gt_bbox, is_crowd, gt_class)
+        chip_records = self._trans_pos_chips2annotations(im_file, gt_bbox,
+                                                         is_crowd, gt_class)
 
         if not self.use_neg_chip:
             return chip_records
 
         sampled_neg_chips = self._sample_neg_chips()
-        neg_chip_records = self._trans_neg_chips2annotations(im_file, sampled_neg_chips)
+        neg_chip_records = self._trans_neg_chips2annotations(im_file,
+                                                             sampled_neg_chips)
         chip_records.extend(neg_chip_records)
         return chip_records
 
-    def _trans_pos_chips2annotations(self, im_file, gt_bbox, is_crowd, gt_class):
+    def _trans_pos_chips2annotations(self, im_file, gt_bbox, is_crowd,
+                                     gt_class):
         chip_records = []
         for chip, boxes_idx in self._cur_im_pos_chips:
-            chip_bbox, final_boxes_idx = transform_chip_box(gt_bbox, boxes_idx, chip)
+            chip_bbox, final_boxes_idx = transform_chip_box(gt_bbox, boxes_idx,
+                                                            chip)
             x1, y1, x2, y2 = chip
             chip_h = y2 - y1
             chip_w = x2 - x1
@@ -197,12 +207,15 @@ class AnnoCropper(object):
             return self._cur_im_neg_chips
 
         candidate_num = int(sample_num * 1.5)
-        candidate_neg_chips = sorted(self._cur_im_neg_chips, key=lambda x: -x[1])[:candidate_num]
+        candidate_neg_chips = sorted(
+            self._cur_im_neg_chips, key=lambda x: -x[1])[:candidate_num]
         random.shuffle(candidate_neg_chips)
         sampled_neg_chips = candidate_neg_chips[:sample_num]
         return sampled_neg_chips
 
-    def _trans_neg_chips2annotations(self, im_file: str, sampled_neg_chips: List[Tuple]):
+    def _trans_neg_chips2annotations(self,
+                                     im_file: str,
+                                     sampled_neg_chips: List[Tuple]):
         chip_records = []
         for chip, neg_box_num in sampled_neg_chips:
             x1, y1, x2, y2 = chip
@@ -213,9 +226,12 @@ class AnnoCropper(object):
                 'im_id': np.array([self._global_chip_id]),
                 'h': chip_h,
                 'w': chip_w,
-                'gt_bbox': np.zeros((0, 4), dtype=np.float32),
-                'is_crowd': np.zeros((0, 1), dtype=np.int32),
-                'gt_class': np.zeros((0, 1), dtype=np.int32),
+                'gt_bbox': np.zeros(
+                    (0, 4), dtype=np.float32),
+                'is_crowd': np.zeros(
+                    (0, 1), dtype=np.int32),
+                'gt_class': np.zeros(
+                    (0, 1), dtype=np.int32),
                 # 'gt_poly': [],
                 'chip': chip
             }
@@ -247,7 +263,8 @@ class AnnoCropper(object):
 
         assert chip_size >= stride
         chip_overlap = chip_size - stride
-        if (width - chip_overlap) % stride > min_chip_location_diff:  # 不能被stride整除的部分比较大,则保留
+        if (width - chip_overlap
+            ) % stride > min_chip_location_diff:  # 不能被stride整除的部分比较大,则保留
             w_steps = max(1, int(math.ceil((width - chip_overlap) / stride)))
         else:  # 不能被stride整除的部分比较小,则丢弃
             w_steps = max(1, int(math.floor((width - chip_overlap) / stride)))
@@ -267,9 +284,10 @@ class AnnoCropper(object):
 
         # check  chip size
         for item in chips:
-            if item[2] - item[0] > chip_size * 1.1 or item[3] - item[1] > chip_size * 1.1:
+            if item[2] - item[0] > chip_size * 1.1 or item[3] - item[
+                    1] > chip_size * 1.1:
                 raise ValueError(item)
-        chips = np.array(chips, dtype=np.float)
+        chips = np.array(chips, dtype=np.float32)
 
         raw_size_chips = chips / scale
         return raw_size_chips
@@ -279,12 +297,15 @@ class AnnoCropper(object):
         im_size = self._cur_im_size
         scale = self._cur_scale
         #   Nx4            N
-        valid_boxes, valid_boxes_idx = self._validate_boxes(valid_ratio_range, im_size, gt_bbox, scale)
+        valid_boxes, valid_boxes_idx = self._validate_boxes(
+            valid_ratio_range, im_size, gt_bbox, scale)
         # dict: chipid->[box_id, ...]
-        pos_chip2boxes_idx = self._find_pos_chips(chips, valid_boxes, valid_boxes_idx)
+        pos_chip2boxes_idx = self._find_pos_chips(chips, valid_boxes,
+                                                  valid_boxes_idx)
         return pos_chip2boxes_idx
 
-    def _validate_boxes(self, valid_ratio_range: List[float],
+    def _validate_boxes(self,
+                        valid_ratio_range: List[float],
                         im_size: int,
                         gt_boxes: 'np.array of Nx4',
                         scale: float):
@@ -299,20 +320,26 @@ class AnnoCropper(object):
         target_mins = mins * scale
 
         low = valid_ratio_range[0] if valid_ratio_range[0] > 0 else 0
-        high = valid_ratio_range[1] if valid_ratio_range[1] > 0 else np.finfo(np.float).max
+        high = valid_ratio_range[1] if valid_ratio_range[1] > 0 else np.finfo(
+            np.float32).max
 
-        valid_boxes_idx = np.nonzero((low <= box_ratio) & (box_ratio < high) & (target_mins >= 2))[0]
+        valid_boxes_idx = np.nonzero((low <= box_ratio) & (box_ratio < high) & (
+            target_mins >= 2))[0]
         valid_boxes = gt_boxes[valid_boxes_idx]
         return valid_boxes, valid_boxes_idx
 
-    def _find_pos_chips(self, chips: 'Cx4', valid_boxes: 'Bx4', valid_boxes_idx: 'B'):
+    def _find_pos_chips(self,
+                        chips: 'Cx4',
+                        valid_boxes: 'Bx4',
+                        valid_boxes_idx: 'B'):
         """
         :return: pos_chip2boxes_idx, dict: chipid->[box_id, ...]
         """
         iob = intersection_over_box(chips, valid_boxes)  # overlap, CxB
 
         iob_threshold_to_find_chips = 1.
-        pos_chip_ids, _ = self._find_chips_to_cover_overlaped_boxes(iob, iob_threshold_to_find_chips)
+        pos_chip_ids, _ = self._find_chips_to_cover_overlaped_boxes(
+            iob, iob_threshold_to_find_chips)
         pos_chip_ids = set(pos_chip_ids)
 
         iob_threshold_to_assign_box = 0.5
@@ -323,7 +350,8 @@ class AnnoCropper(object):
     def _find_chips_to_cover_overlaped_boxes(self, iob, overlap_threshold):
         return find_chips_to_cover_overlaped_boxes(iob, overlap_threshold)
 
-    def _assign_boxes_to_pos_chips(self, iob, overlap_threshold, pos_chip_ids, valid_boxes_idx):
+    def _assign_boxes_to_pos_chips(self, iob, overlap_threshold, pos_chip_ids,
+                                   valid_boxes_idx):
         chip_ids, box_ids = np.nonzero(iob >= overlap_threshold)
         pos_chip2boxes_idx = defaultdict(list)
         for chip_id, box_id in zip(chip_ids, box_ids):
@@ -333,7 +361,10 @@ class AnnoCropper(object):
             pos_chip2boxes_idx[chip_id].append(raw_gt_box_idx)
         return pos_chip2boxes_idx
 
-    def _get_neg_boxes_and_chips(self, chips: 'Cx4', pos_chip_ids: 'D', proposals: 'Px4'):
+    def _get_neg_boxes_and_chips(self,
+                                 chips: 'Cx4',
+                                 pos_chip_ids: 'D',
+                                 proposals: 'Px4'):
         """
         :param chips:
         :param pos_chip_ids:
@@ -351,12 +382,16 @@ class AnnoCropper(object):
         im_size = self._cur_im_size
         scale = self._cur_scale
 
-        valid_props, _ = self._validate_boxes(valid_ratio_range, im_size, proposals, scale)
+        valid_props, _ = self._validate_boxes(valid_ratio_range, im_size,
+                                              proposals, scale)
         neg_boxes = self._find_neg_boxes(chips, pos_chip_ids, valid_props)
         neg_chip2box_num = self._find_neg_chips(chips, pos_chip_ids, neg_boxes)
         return neg_chip2box_num
 
-    def _find_neg_boxes(self, chips: 'Cx4', pos_chip_ids: 'D', valid_props: 'Px4'):
+    def _find_neg_boxes(self,
+                        chips: 'Cx4',
+                        pos_chip_ids: 'D',
+                        valid_props: 'Px4'):
         """
         :return: neg_boxes: Nx4
         """
@@ -370,7 +405,8 @@ class AnnoCropper(object):
         neg_boxes = valid_props[non_overlap_props_idx]
         return neg_boxes
 
-    def _find_neg_chips(self, chips: 'Cx4', pos_chip_ids: 'D', neg_boxes: 'Nx4'):
+    def _find_neg_chips(self, chips: 'Cx4', pos_chip_ids: 'D',
+                        neg_boxes: 'Nx4'):
         """
         :return: neg_chip2box_num, dict: chipid->neg_box_num
         """
@@ -469,31 +505,37 @@ class AnnoCropper(object):
         for result in results:
             bbox_locs = result['bbox']
             bbox_nums = result['bbox_num']
-            if len(bbox_locs) == 1 and bbox_locs[0][0] == -1:  # current batch has no detections
+            if len(bbox_locs) == 1 and bbox_locs[0][
+                    0] == -1:  # current batch has no detections
                 # bbox_locs = array([[-1.]], dtype=float32); bbox_nums = [[1]]
                 # MultiClassNMS output: If there is no detected boxes for all images, lod will be set to {1} and Out only contains one value which is -1.
                 continue
-            im_ids = result['im_id'] # replace with range(len(bbox_nums))
+            im_ids = result['im_id']  # replace with range(len(bbox_nums))
 
             last_bbox_num = 0
             for idx, im_id in enumerate(im_ids):
 
                 cur_bbox_len = bbox_nums[idx]
-                bboxes = bbox_locs[last_bbox_num: last_bbox_num + cur_bbox_len]
+                bboxes = bbox_locs[last_bbox_num:last_bbox_num + cur_bbox_len]
                 last_bbox_num += cur_bbox_len
                 # box: [num_id, score, xmin, ymin, xmax, ymax]
                 if len(bboxes) == 0:  # current image has no detections
                     continue
 
-                chip_rec = records[int(im_id) - 1]  # im_id starts from 1, type is np.int64
+                chip_rec = records[int(im_id) -
+                                   1]  # im_id starts from 1, type is np.int64
                 image_size = max(chip_rec["ori_im_h"], chip_rec["ori_im_w"])
 
-                bboxes = transform_chip_boxes2image_boxes(bboxes, chip_rec["chip"], chip_rec["ori_im_h"], chip_rec["ori_im_w"])
+                bboxes = transform_chip_boxes2image_boxes(
+                    bboxes, chip_rec["chip"], chip_rec["ori_im_h"],
+                    chip_rec["ori_im_w"])
 
                 scale_i = chip_rec["scale_i"]
-                cur_scale = self._get_current_scale(self.target_sizes[scale_i], image_size)
-                _, valid_boxes_idx = self._validate_boxes(self.valid_box_ratio_ranges[scale_i], image_size,
-                                                                    bboxes[:, 2:], cur_scale)
+                cur_scale = self._get_current_scale(self.target_sizes[scale_i],
+                                                    image_size)
+                _, valid_boxes_idx = self._validate_boxes(
+                    self.valid_box_ratio_ranges[scale_i], image_size,
+                    bboxes[:, 2:], cur_scale)
                 ori_img_id = self._global_chip_id2img_id[int(im_id)]
 
                 img_id2bbox[ori_img_id].append(bboxes[valid_boxes_idx])
@@ -507,7 +549,8 @@ class AnnoCropper(object):
         nms_thresh = self.nms_thresh
 
         for img_id in img_id2bbox:
-            box = img_id2bbox[img_id]  # list of np.array of shape [N, 6], 6 is [label, score, x1, y1, x2, y2]
+            box = img_id2bbox[
+                img_id]  # list of np.array of shape [N, 6], 6 is [label, score, x1, y1, x2, y2]
             box = np.concatenate(box, axis=0)
             nms_dets = nms(box, nms_thresh)
             if max_per_img > 0:
@@ -525,18 +568,13 @@ class AnnoCropper(object):
         results = []
         for img_id in im_ids:  # output by original im_id order
             if len(img_id2bbox[img_id]) == 0:
-                bbox = np.array([[-1.,  0.,  0.,  0.,  0.,  0.]])  # edge case: no detections
+                bbox = np.array(
+                    [[-1., 0., 0., 0., 0., 0.]])  # edge case: no detections
                 bbox_num = np.array([0])
             else:
                 # np.array of shape [N, 6], 6 is [label, score, x1, y1, x2, y2]
                 bbox = img_id2bbox[img_id]
                 bbox_num = np.array([len(bbox)])
-            res = dict(
-                im_id=np.array([[img_id]]),
-                bbox=bbox,
-                bbox_num=bbox_num
-            )
+            res = dict(im_id=np.array([[img_id]]), bbox=bbox, bbox_num=bbox_num)
             results.append(res)
         return results
-
-

+ 10 - 6
paddlers/models/ppdet/data/crop_utils/chip_box_utils.py

@@ -33,8 +33,10 @@ def intersection_over_box(chips, boxes):
 
     box_area = bbox_area(boxes)  # B
 
-    inter_x2y2 = np.minimum(np.expand_dims(chips, 1)[:, :, 2:], boxes[:, 2:])  # CxBX2
-    inter_x1y1 = np.maximum(np.expand_dims(chips, 1)[:, :, :2], boxes[:, :2])  # CxBx2
+    inter_x2y2 = np.minimum(np.expand_dims(chips, 1)[:, :, 2:],
+                            boxes[:, 2:])  # CxBX2
+    inter_x1y1 = np.maximum(np.expand_dims(chips, 1)[:, :, :2],
+                            boxes[:, :2])  # CxBx2
     inter_wh = inter_x2y2 - inter_x1y1
     inter_wh = np.clip(inter_wh, a_min=0, a_max=None)
     inter_area = inter_wh[:, :, 0] * inter_wh[:, :, 1]  # CxB
@@ -81,8 +83,9 @@ def transform_chip_box(gt_bbox: 'Gx4', boxes_idx: 'B', chip: '4'):
 def find_chips_to_cover_overlaped_boxes(iob, overlap_threshold):
     chip_ids, box_ids = np.nonzero(iob >= overlap_threshold)
     chip_id2overlap_box_num = np.bincount(chip_ids)  # 1d array
-    chip_id2overlap_box_num = np.pad(chip_id2overlap_box_num, (0, len(iob) - len(chip_id2overlap_box_num)),
-                                     constant_values=0)
+    chip_id2overlap_box_num = np.pad(
+        chip_id2overlap_box_num, (0, len(iob) - len(chip_id2overlap_box_num)),
+        constant_values=0)
 
     chosen_chip_ids = []
     while len(box_ids) > 0:
@@ -92,7 +95,8 @@ def find_chips_to_cover_overlaped_boxes(iob, overlap_threshold):
         chosen_chip_ids.append(max_count_chip_id)
 
         box_ids_in_cur_chip = box_ids[chip_ids == max_count_chip_id]
-        ids_not_in_cur_boxes_mask = np.logical_not(np.isin(box_ids, box_ids_in_cur_chip))
+        ids_not_in_cur_boxes_mask = np.logical_not(
+            np.isin(box_ids, box_ids_in_cur_chip))
         chip_ids = chip_ids[ids_not_in_cur_boxes_mask]
         box_ids = box_ids[ids_not_in_cur_boxes_mask]
     return chosen_chip_ids, chip_id2overlap_box_num
@@ -124,7 +128,7 @@ def nms(dets, thresh):
     order = scores.argsort()[::-1]
 
     ndets = dets.shape[0]
-    suppressed = np.zeros((ndets), dtype=np.int)
+    suppressed = np.zeros((ndets), dtype=np.int32)
 
     # nominal indices
     # _i, _j
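As a quick sanity check of the reformatted `intersection_over_box`, the standalone restatement below (with a tiny epsilon added only to guard against zero-area boxes in this sketch) reproduces the chip/box overlap ratio on toy inputs.

```python
import numpy as np

def bbox_area(boxes):
    return (boxes[:, 2] - boxes[:, 0]) * (boxes[:, 3] - boxes[:, 1])

def intersection_over_box(chips, boxes):
    """Intersection area divided by *box* area, shape (num_chips, num_boxes)."""
    box_area = bbox_area(boxes)                                   # B
    inter_x2y2 = np.minimum(np.expand_dims(chips, 1)[:, :, 2:],
                            boxes[:, 2:])                         # CxBx2
    inter_x1y1 = np.maximum(np.expand_dims(chips, 1)[:, :, :2],
                            boxes[:, :2])                         # CxBx2
    inter_wh = np.clip(inter_x2y2 - inter_x1y1, a_min=0, a_max=None)
    inter_area = inter_wh[:, :, 0] * inter_wh[:, :, 1]            # CxB
    return inter_area / np.maximum(box_area, 1e-12)               # epsilon for this sketch

chips = np.array([[0., 0., 100., 100.]])
boxes = np.array([[50., 50., 150., 150.],
                  [10., 10., 20., 20.]])
print(intersection_over_box(chips, boxes))  # [[0.25 1.  ]]
```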

+ 309 - 0
paddlers/models/ppdet/data/reader.py

@@ -12,6 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+import copy
 import os
 import traceback
 import six
@@ -21,6 +22,10 @@ if sys.version_info >= (3, 0):
 else:
     pass
 import numpy as np
+import paddle
+import paddle.nn.functional as F
+
+from copy import deepcopy
 
 from paddle.io import DataLoader, DistributedBatchSampler
 from .utils import default_collate_fn
@@ -300,3 +305,307 @@ class TestMOTReader(BaseDataLoader):
         super(TestMOTReader, self).__init__(sample_transforms, batch_transforms,
                                             batch_size, shuffle, drop_last,
                                             num_classes, **kwargs)
+
+
+# For Semi-Supervised Object Detection (SSOD)
+class Compose_SSOD(object):
+    def __init__(self, base_transforms, weak_aug, strong_aug, num_classes=80):
+        self.base_transforms = base_transforms
+        self.base_transforms_cls = []
+        for t in self.base_transforms:
+            for k, v in t.items():
+                op_cls = getattr(transform, k)
+                f = op_cls(**v)
+                if hasattr(f, 'num_classes'):
+                    f.num_classes = num_classes
+                self.base_transforms_cls.append(f)
+
+        self.weak_augs = weak_aug
+        self.weak_augs_cls = []
+        for t in self.weak_augs:
+            for k, v in t.items():
+                op_cls = getattr(transform, k)
+                f = op_cls(**v)
+                if hasattr(f, 'num_classes'):
+                    f.num_classes = num_classes
+                self.weak_augs_cls.append(f)
+
+        self.strong_augs = strong_aug
+        self.strong_augs_cls = []
+        for t in self.strong_augs:
+            for k, v in t.items():
+                op_cls = getattr(transform, k)
+                f = op_cls(**v)
+                if hasattr(f, 'num_classes'):
+                    f.num_classes = num_classes
+                self.strong_augs_cls.append(f)
+
+    def __call__(self, data):
+        for f in self.base_transforms_cls:
+            try:
+                data = f(data)
+            except Exception as e:
+                stack_info = traceback.format_exc()
+                logger.warning("fail to map sample transform [{}] "
+                               "with error: {} and stack:\n{}".format(
+                                   f, e, str(stack_info)))
+                raise e
+
+        weak_data = deepcopy(data)
+        strong_data = deepcopy(data)
+        for f in self.weak_augs_cls:
+            try:
+                weak_data = f(weak_data)
+            except Exception as e:
+                stack_info = traceback.format_exc()
+                logger.warning("fail to map weak aug [{}] "
+                               "with error: {} and stack:\n{}".format(
+                                   f, e, str(stack_info)))
+                raise e
+
+        for f in self.strong_augs_cls:
+            try:
+                strong_data = f(strong_data)
+            except Exception as e:
+                stack_info = traceback.format_exc()
+                logger.warning("fail to map strong aug [{}] "
+                               "with error: {} and stack:\n{}".format(
+                                   f, e, str(stack_info)))
+                raise e
+
+        weak_data['strong_aug'] = strong_data
+        return weak_data
+
+
+class BatchCompose_SSOD(Compose):
+    def __init__(self, transforms, num_classes=80, collate_batch=True):
+        super(BatchCompose_SSOD, self).__init__(transforms, num_classes)
+        self.collate_batch = collate_batch
+
+    def __call__(self, data):
+        # split strong_data from data(weak_data)
+        strong_data = []
+        for sample in data:
+            strong_data.append(sample['strong_aug'])
+            sample.pop('strong_aug')
+
+        for f in self.transforms_cls:
+            try:
+                data = f(data)
+                strong_data = f(strong_data)
+            except Exception as e:
+                stack_info = traceback.format_exc()
+                logger.warning("fail to map batch transform [{}] "
+                               "with error: {} and stack:\n{}".format(
+                                   f, e, str(stack_info)))
+                raise e
+
+        # remove keys which is not needed by model
+        extra_key = ['h', 'w', 'flipped']
+        for k in extra_key:
+            for sample in data:
+                if k in sample:
+                    sample.pop(k)
+            for sample in strong_data:
+                if k in sample:
+                    sample.pop(k)
+
+        # batch data; if a user-defined batch function is needed,
+        # use it here
+        if self.collate_batch:
+            batch_data = default_collate_fn(data)
+            strong_batch_data = default_collate_fn(strong_data)
+            return batch_data, strong_batch_data
+        else:
+            batch_data = {}
+            for k in data[0].keys():
+                tmp_data = []
+                for i in range(len(data)):
+                    tmp_data.append(data[i][k])
+                if 'gt_' not in k and 'is_crowd' not in k and 'difficult' not in k:
+                    tmp_data = np.stack(tmp_data, axis=0)
+                batch_data[k] = tmp_data
+
+            strong_batch_data = {}
+            for k in strong_data[0].keys():
+                tmp_data = []
+                for i in range(len(strong_data)):
+                    tmp_data.append(strong_data[i][k])
+                if 'gt_' not in k and 'is_crowd' not in k and 'difficult' not in k:
+                    tmp_data = np.stack(tmp_data, axis=0)
+                strong_batch_data[k] = tmp_data
+
+        return batch_data, strong_batch_data
+
+
+class CombineSSODLoader(object):
+    def __init__(self, label_loader, unlabel_loader):
+        self.label_loader = label_loader
+        self.unlabel_loader = unlabel_loader
+
+    def __iter__(self):
+        while True:
+            try:
+                label_samples = next(self.label_loader_iter)
+            except (StopIteration, AttributeError):
+                self.label_loader_iter = iter(self.label_loader)
+                label_samples = next(self.label_loader_iter)
+
+            try:
+                unlabel_samples = next(self.unlabel_loader_iter)
+            except (StopIteration, AttributeError):
+                self.unlabel_loader_iter = iter(self.unlabel_loader)
+                unlabel_samples = next(self.unlabel_loader_iter)
+
+            yield (
+                label_samples[0],  # sup weak
+                label_samples[1],  # sup strong
+                unlabel_samples[0],  # unsup weak
+                unlabel_samples[1]  # unsup strong
+            )
+
+    def __call__(self):
+        return self.__iter__()
+
+
+class BaseSemiDataLoader(object):
+    def __init__(self,
+                 sample_transforms=[],
+                 weak_aug=[],
+                 strong_aug=[],
+                 sup_batch_transforms=[],
+                 unsup_batch_transforms=[],
+                 sup_batch_size=1,
+                 unsup_batch_size=1,
+                 shuffle=True,
+                 drop_last=True,
+                 num_classes=80,
+                 collate_batch=True,
+                 use_shared_memory=False,
+                 **kwargs):
+        # sup transforms
+        self._sample_transforms_label = Compose_SSOD(
+            sample_transforms, weak_aug, strong_aug, num_classes=num_classes)
+        self._batch_transforms_label = BatchCompose_SSOD(
+            sup_batch_transforms, num_classes, collate_batch)
+        self.batch_size_label = sup_batch_size
+
+        # unsup transforms
+        self._sample_transforms_unlabel = Compose_SSOD(
+            sample_transforms, weak_aug, strong_aug, num_classes=num_classes)
+        self._batch_transforms_unlabel = BatchCompose_SSOD(
+            unsup_batch_transforms, num_classes, collate_batch)
+        self.batch_size_unlabel = unsup_batch_size
+
+        # common
+        self.shuffle = shuffle
+        self.drop_last = drop_last
+        self.use_shared_memory = use_shared_memory
+        self.kwargs = kwargs
+
+    def __call__(self,
+                 dataset_label,
+                 dataset_unlabel,
+                 worker_num,
+                 batch_sampler_label=None,
+                 batch_sampler_unlabel=None,
+                 return_list=False):
+        # sup dataset 
+        self.dataset_label = dataset_label
+        self.dataset_label.check_or_download_dataset()
+        self.dataset_label.parse_dataset()
+        self.dataset_label.set_transform(self._sample_transforms_label)
+        self.dataset_label.set_kwargs(**self.kwargs)
+        if batch_sampler_label is None:
+            self._batch_sampler_label = DistributedBatchSampler(
+                self.dataset_label,
+                batch_size=self.batch_size_label,
+                shuffle=self.shuffle,
+                drop_last=self.drop_last)
+        else:
+            self._batch_sampler_label = batch_sampler_label
+
+        # unsup dataset
+        self.dataset_unlabel = dataset_unlabel
+        self.dataset_unlabel.length = len(self.dataset_label)
+        self.dataset_unlabel.check_or_download_dataset()
+        self.dataset_unlabel.parse_dataset()
+        self.dataset_unlabel.set_transform(self._sample_transforms_unlabel)
+        self.dataset_unlabel.set_kwargs(**self.kwargs)
+        if batch_sampler_unlabel is None:
+            self._batch_sampler_unlabel = DistributedBatchSampler(
+                self.dataset_unlabel,
+                batch_size=self.batch_size_unlabel,
+                shuffle=self.shuffle,
+                drop_last=self.drop_last)
+        else:
+            self._batch_sampler_unlabel = batch_sampler_unlabel
+
+        # DataLoader does not start sub-processes on Windows and macOS,
+        # so shared memory is not needed there
+        use_shared_memory = self.use_shared_memory and \
+                            sys.platform not in ['win32', 'darwin']
+        # check whether the shared memory size is larger than 1G (1024M)
+        if use_shared_memory:
+            shm_size = _get_shared_memory_size_in_M()
+            if shm_size is not None and shm_size < 1024.:
+                logger.warning("Shared memory size is less than 1G, "
+                               "disable shared_memory in DataLoader")
+                use_shared_memory = False
+
+        self.dataloader_label = DataLoader(
+            dataset=self.dataset_label,
+            batch_sampler=self._batch_sampler_label,
+            collate_fn=self._batch_transforms_label,
+            num_workers=worker_num,
+            return_list=return_list,
+            use_shared_memory=use_shared_memory)
+
+        self.dataloader_unlabel = DataLoader(
+            dataset=self.dataset_unlabel,
+            batch_sampler=self._batch_sampler_unlabel,
+            collate_fn=self._batch_transforms_unlabel,
+            num_workers=worker_num,
+            return_list=return_list,
+            use_shared_memory=use_shared_memory)
+
+        self.dataloader = CombineSSODLoader(self.dataloader_label,
+                                            self.dataloader_unlabel)
+        self.loader = iter(self.dataloader)
+        return self
+
+    def __len__(self):
+        return len(self._batch_sampler_label)
+
+    def __iter__(self):
+        return self
+
+    def __next__(self):
+        return next(self.loader)
+
+    def next(self):
+        # python2 compatibility
+        return self.__next__()
+
+
+@register
+class SemiTrainReader(BaseSemiDataLoader):
+    __shared__ = ['num_classes']
+
+    def __init__(self,
+                 sample_transforms=[],
+                 weak_aug=[],
+                 strong_aug=[],
+                 sup_batch_transforms=[],
+                 unsup_batch_transforms=[],
+                 sup_batch_size=1,
+                 unsup_batch_size=1,
+                 shuffle=True,
+                 drop_last=True,
+                 num_classes=80,
+                 collate_batch=True,
+                 **kwargs):
+        super(SemiTrainReader, self).__init__(
+            sample_transforms, weak_aug, strong_aug, sup_batch_transforms,
+            unsup_batch_transforms, sup_batch_size, unsup_batch_size, shuffle,
+            drop_last, num_classes, collate_batch, **kwargs)
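
The reader classes above chain three pieces: Compose_SSOD builds a weak and a strong view of every sample, BatchCompose_SSOD collates them into paired batches, and CombineSSODLoader interleaves the labeled and unlabeled loaders so that each training step sees (sup weak, sup strong, unsup weak, unsup strong). A minimal, framework-free sketch of that interleaving behaviour, assuming `sup_batches` and `unsup_batches` are small in-memory stand-ins for the two DataLoaders (these names are illustrative, not part of the patch):

def combine_ssod(label_loader, unlabel_loader):
    # cycle both loaders forever, restarting each one independently when it
    # is exhausted, as CombineSSODLoader.__iter__ does above
    label_iter, unlabel_iter = iter(label_loader), iter(unlabel_loader)
    while True:
        try:
            label = next(label_iter)
        except StopIteration:
            label_iter = iter(label_loader)
            label = next(label_iter)
        try:
            unlabel = next(unlabel_iter)
        except StopIteration:
            unlabel_iter = iter(unlabel_loader)
            unlabel = next(unlabel_iter)
        # (sup weak, sup strong, unsup weak, unsup strong)
        yield label[0], label[1], unlabel[0], unlabel[1]

sup_batches = [('sup_weak_0', 'sup_strong_0'), ('sup_weak_1', 'sup_strong_1')]
unsup_batches = [('unsup_weak_0', 'unsup_strong_0')]
it = combine_ssod(sup_batches, unsup_batches)
print(next(it))  # ('sup_weak_0', 'sup_strong_0', 'unsup_weak_0', 'unsup_strong_0')
print(next(it))  # the shorter unlabeled loader is restarted transparently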

+ 1 - 0
paddlers/models/ppdet/data/source/__init__.py

@@ -28,3 +28,4 @@ from .keypoint_coco import *
 from .mot import *
 from .sniper_coco import SniperCOCODataSet
 from .dataset import ImageFolder
+from .pose3d_cmb import *

+ 3 - 0
paddlers/models/ppdet/data/source/category.py

@@ -118,6 +118,9 @@ def get_categories(metric_type, anno_file=None, arch=None):
     ) == 'keypointtopdownmpiieval':
         return (None, {'id': 'keypoint'})
 
+    elif metric_type.lower() == 'pose3deval':
+        return (None, {'id': 'pose3d'})
+
     elif metric_type.lower() in ['mot', 'motdet', 'reid']:
         if anno_file and os.path.isfile(anno_file):
             cats = []

+ 237 - 3
paddlers/models/ppdet/data/source/coco.py

@@ -13,6 +13,11 @@
 # limitations under the License.
 
 import os
+import copy
+try:
+    from collections.abc import Sequence
+except Exception:
+    from collections import Sequence
 import numpy as np
 from paddlers.models.ppdet.core.workspace import register, serializable
 from .dataset import DetDataset
@@ -20,6 +25,8 @@ from .dataset import DetDataset
 from paddlers.models.ppdet.utils.logger import setup_logger
 logger = setup_logger(__name__)
 
+__all__ = ['COCODataSet', 'SlicedCOCODataSet', 'SemiCOCODataSet']
+
 
 @register
 @serializable
@@ -170,8 +177,10 @@ class COCODataSet(DetDataset):
                 gt_class = np.zeros((num_bbox, 1), dtype=np.int32)
                 is_crowd = np.zeros((num_bbox, 1), dtype=np.int32)
                 gt_poly = [None] * num_bbox
+                gt_track_id = -np.ones((num_bbox, 1), dtype=np.int32)
 
                 has_segmentation = False
+                has_track_id = False
                 for i, box in enumerate(bboxes):
                     catid = box['category_id']
                     gt_class[i][0] = self.catid2clsid[catid]
@@ -181,8 +190,9 @@ class COCODataSet(DetDataset):
                     if 'segmentation' in box and box['iscrowd'] == 1:
                         gt_poly[i] = [[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]]
                     elif 'segmentation' in box and box['segmentation']:
-                        if not np.array(box['segmentation']
-                                        ).size > 0 and not self.allow_empty:
+                        if not np.array(
+                                box['segmentation'],
+                                dtype=object).size > 0 and not self.allow_empty:
                             bboxes.pop(i)
                             gt_poly.pop(i)
                             np.delete(is_crowd, i)
@@ -192,6 +202,10 @@ class COCODataSet(DetDataset):
                             gt_poly[i] = box['segmentation']
                         has_segmentation = True
 
+                    if 'track_id' in box:
+                        gt_track_id[i][0] = box['track_id']
+                        has_track_id = True
+
                 if has_segmentation and not any(
                         gt_poly) and not self.allow_empty:
                     continue
@@ -202,6 +216,8 @@ class COCODataSet(DetDataset):
                     'gt_bbox': gt_bbox,
                     'gt_poly': gt_poly,
                 }
+                if has_track_id:
+                    gt_rec.update({'gt_track_id': gt_track_id})
 
                 for k, v in gt_rec.items():
                     if k in self.data_fields:
@@ -223,7 +239,8 @@ class COCODataSet(DetDataset):
             if self.sample_num > 0 and ct >= self.sample_num:
                 break
         assert ct > 0, 'not found any coco record in %s' % (anno_path)
-        logger.debug('{} samples in file {}'.format(ct, anno_path))
+        logger.info('Load [{} samples valid, {} samples invalid] in file {}.'.
+                    format(ct, len(img_ids) - ct, anno_path))
         if self.allow_empty and len(empty_records) > 0:
             empty_records = self._sample_empty(empty_records, len(records))
             records += empty_records
@@ -351,3 +368,220 @@ class SlicedCOCODataSet(COCODataSet):
             empty_records = self._sample_empty(empty_records, len(records))
             records += empty_records
         self.roidbs = records
+
+
+@register
+@serializable
+class SemiCOCODataSet(COCODataSet):
+    """Semi-COCODataSet used for supervised and unsupervised dataSet"""
+
+    def __init__(self,
+                 dataset_dir=None,
+                 image_dir=None,
+                 anno_path=None,
+                 data_fields=['image'],
+                 sample_num=-1,
+                 load_crowd=False,
+                 allow_empty=False,
+                 empty_ratio=1.,
+                 repeat=1,
+                 supervised=True):
+        super(SemiCOCODataSet, self).__init__(
+            dataset_dir, image_dir, anno_path, data_fields, sample_num,
+            load_crowd, allow_empty, empty_ratio, repeat)
+        self.supervised = supervised
+        self.length = -1  # default -1 means use all samples
+
+    def parse_dataset(self):
+        anno_path = os.path.join(self.dataset_dir, self.anno_path)
+        image_dir = os.path.join(self.dataset_dir, self.image_dir)
+
+        assert anno_path.endswith('.json'), \
+            'invalid coco annotation file: ' + anno_path
+        from pycocotools.coco import COCO
+        coco = COCO(anno_path)
+        img_ids = coco.getImgIds()
+        img_ids.sort()
+        cat_ids = coco.getCatIds()
+        records = []
+        empty_records = []
+        ct = 0
+
+        self.catid2clsid = dict({catid: i for i, catid in enumerate(cat_ids)})
+        self.cname2cid = dict({
+            coco.loadCats(catid)[0]['name']: clsid
+            for catid, clsid in self.catid2clsid.items()
+        })
+
+        if 'annotations' not in coco.dataset or not self.supervised:
+            self.load_image_only = True
+            logger.warning('Annotation file: {} does not contain ground truth, '
+                           'loading image information only.'.format(anno_path))
+
+        for img_id in img_ids:
+            img_anno = coco.loadImgs([img_id])[0]
+            im_fname = img_anno['file_name']
+            im_w = float(img_anno['width'])
+            im_h = float(img_anno['height'])
+
+            im_path = os.path.join(image_dir,
+                                   im_fname) if image_dir else im_fname
+            is_empty = False
+            if not os.path.exists(im_path):
+                logger.warning('Illegal image file: {}, and it will be '
+                               'ignored'.format(im_path))
+                continue
+
+            if im_w < 0 or im_h < 0:
+                logger.warning('Illegal width: {} or height: {} in annotation, '
+                               'and im_id: {} will be ignored'.format(
+                                   im_w, im_h, img_id))
+                continue
+
+            coco_rec = {
+                'im_file': im_path,
+                'im_id': np.array([img_id]),
+                'h': im_h,
+                'w': im_w,
+            } if 'image' in self.data_fields else {}
+
+            if not self.load_image_only:
+                ins_anno_ids = coco.getAnnIds(
+                    imgIds=[img_id], iscrowd=None if self.load_crowd else False)
+                instances = coco.loadAnns(ins_anno_ids)
+
+                bboxes = []
+                is_rbox_anno = False
+                for inst in instances:
+                    # check gt bbox
+                    if inst.get('ignore', False):
+                        continue
+                    if 'bbox' not in inst.keys():
+                        continue
+                    else:
+                        if not any(np.array(inst['bbox'])):
+                            continue
+
+                    x1, y1, box_w, box_h = inst['bbox']
+                    x2 = x1 + box_w
+                    y2 = y1 + box_h
+                    eps = 1e-5
+                    if inst['area'] > 0 and x2 - x1 > eps and y2 - y1 > eps:
+                        inst['clean_bbox'] = [
+                            round(float(x), 3) for x in [x1, y1, x2, y2]
+                        ]
+                        bboxes.append(inst)
+                    else:
+                        logger.warning(
+                            'Found an invalid bbox in annotations: im_id: {}, '
+                            'area: {} x1: {}, y1: {}, x2: {}, y2: {}.'.format(
+                                img_id, float(inst['area']), x1, y1, x2, y2))
+
+                num_bbox = len(bboxes)
+                if num_bbox <= 0 and not self.allow_empty:
+                    continue
+                elif num_bbox <= 0:
+                    is_empty = True
+
+                gt_bbox = np.zeros((num_bbox, 4), dtype=np.float32)
+                gt_class = np.zeros((num_bbox, 1), dtype=np.int32)
+                is_crowd = np.zeros((num_bbox, 1), dtype=np.int32)
+                gt_poly = [None] * num_bbox
+
+                has_segmentation = False
+                for i, box in enumerate(bboxes):
+                    catid = box['category_id']
+                    gt_class[i][0] = self.catid2clsid[catid]
+                    gt_bbox[i, :] = box['clean_bbox']
+                    is_crowd[i][0] = box['iscrowd']
+                    # check RLE format 
+                    if 'segmentation' in box and box['iscrowd'] == 1:
+                        gt_poly[i] = [[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]]
+                    elif 'segmentation' in box and box['segmentation']:
+                        if not np.array(
+                                box['segmentation'],
+                                dtype=object).size > 0 and not self.allow_empty:
+                            bboxes.pop(i)
+                            gt_poly.pop(i)
+                            np.delete(is_crowd, i)
+                            np.delete(gt_class, i)
+                            np.delete(gt_bbox, i)
+                        else:
+                            gt_poly[i] = box['segmentation']
+                        has_segmentation = True
+
+                if has_segmentation and not any(
+                        gt_poly) and not self.allow_empty:
+                    continue
+
+                gt_rec = {
+                    'is_crowd': is_crowd,
+                    'gt_class': gt_class,
+                    'gt_bbox': gt_bbox,
+                    'gt_poly': gt_poly,
+                }
+
+                for k, v in gt_rec.items():
+                    if k in self.data_fields:
+                        coco_rec[k] = v
+
+                # TODO: remove load_semantic
+                if self.load_semantic and 'semantic' in self.data_fields:
+                    seg_path = os.path.join(self.dataset_dir, 'stuffthingmaps',
+                                            'train2017', im_fname[:-3] + 'png')
+                    coco_rec.update({'semantic': seg_path})
+
+            logger.debug('Load file: {}, im_id: {}, h: {}, w: {}.'.format(
+                im_path, img_id, im_h, im_w))
+            if is_empty:
+                empty_records.append(coco_rec)
+            else:
+                records.append(coco_rec)
+            ct += 1
+            if self.sample_num > 0 and ct >= self.sample_num:
+                break
+        assert ct > 0, 'not found any coco record in %s' % (anno_path)
+        logger.info('Load [{} samples valid, {} samples invalid] in file {}.'.
+                    format(ct, len(img_ids) - ct, anno_path))
+        if self.allow_empty and len(empty_records) > 0:
+            empty_records = self._sample_empty(empty_records, len(records))
+            records += empty_records
+        self.roidbs = records
+
+        if self.supervised:
+            logger.info(f'Use {len(self.roidbs)} sup_samples data as LABELED')
+        else:
+            if self.length > 0:  # unsup length will be decided by sup length
+                all_roidbs = self.roidbs.copy()
+                selected_idxs = [
+                    np.random.choice(len(all_roidbs))
+                    for _ in range(self.length)
+                ]
+                self.roidbs = [all_roidbs[i] for i in selected_idxs]
+            logger.info(
+                f'Use {len(self.roidbs)} unsup_samples data as UNLABELED')
+
+    def __getitem__(self, idx):
+        n = len(self.roidbs)
+        if self.repeat > 1:
+            idx %= n
+        # data batch
+        roidb = copy.deepcopy(self.roidbs[idx])
+        if self.mixup_epoch == 0 or self._epoch < self.mixup_epoch:
+            idx = np.random.randint(n)
+            roidb = [roidb, copy.deepcopy(self.roidbs[idx])]
+        elif self.cutmix_epoch == 0 or self._epoch < self.cutmix_epoch:
+            idx = np.random.randint(n)
+            roidb = [roidb, copy.deepcopy(self.roidbs[idx])]
+        elif self.mosaic_epoch == 0 or self._epoch < self.mosaic_epoch:
+            roidb = [roidb, ] + [
+                copy.deepcopy(self.roidbs[np.random.randint(n)])
+                for _ in range(4)
+            ]
+        if isinstance(roidb, Sequence):
+            for r in roidb:
+                r['curr_iter'] = self._curr_iter
+        else:
+            roidb['curr_iter'] = self._curr_iter
+        self._curr_iter += 1
+
+        return self.transform(roidb)
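
SemiCOCODataSet above serves both halves of semi-supervised training: with supervised=True it parses boxes and classes exactly like COCODataSet, while with supervised=False it keeps image records only, and BaseSemiDataLoader later pins its length to the labeled split via dataset_unlabel.length. A hedged construction sketch (the annotation paths are placeholders, not files shipped with this patch):

from paddlers.models.ppdet.data.source.coco import SemiCOCODataSet

# labeled split: annotations are parsed as in COCODataSet
sup_set = SemiCOCODataSet(
    dataset_dir='dataset/coco',                 # placeholder path
    image_dir='train2017',
    anno_path='semi_annos/instances_train2017_labeled.json',
    supervised=True)

# unlabeled split: only image records are kept; its effective length is
# later tied to len(sup_set) by BaseSemiDataLoader
unsup_set = SemiCOCODataSet(
    dataset_dir='dataset/coco',
    image_dir='train2017',
    anno_path='semi_annos/instances_train2017_unlabeled.json',
    supervised=False)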

+ 9 - 1
paddlers/models/ppdet/data/source/dataset.py

@@ -86,6 +86,12 @@ class DetDataset(Dataset):
                 copy.deepcopy(self.roidbs[np.random.randint(n)])
                 for _ in range(4)
             ]
+        elif self.pre_img_epoch == 0 or self._epoch < self.pre_img_epoch:
+            # Add previous image as input, only used in CenterTrack
+            idx_pre_img = idx - 1
+            if idx_pre_img < 0:
+                idx_pre_img = idx + 1
+            roidb = [roidb, ] + [copy.deepcopy(self.roidbs[idx_pre_img])]
         if isinstance(roidb, Sequence):
             for r in roidb:
                 r['curr_iter'] = self._curr_iter
@@ -103,6 +109,7 @@ class DetDataset(Dataset):
         self.mixup_epoch = kwargs.get('mixup_epoch', -1)
         self.cutmix_epoch = kwargs.get('cutmix_epoch', -1)
         self.mosaic_epoch = kwargs.get('mosaic_epoch', -1)
+        self.pre_img_epoch = kwargs.get('pre_img_epoch', -1)
 
     def set_transform(self, transform):
         self.transform = transform
@@ -254,7 +261,8 @@ class ImageFolder(DetDataset):
                 records.append(rec)
             ct_sub += sub_img_num
             ct += 1
-        print('{} samples and slice to {} sub_samples'.format(ct, ct_sub))
+        logger.info('{} samples and slice to {} sub_samples.'.format(ct,
+                                                                     ct_sub))
         self.roidbs = records
 
     def get_label_list(self):
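
The pre_img_epoch hook added to DetDataset above pairs every sample with its previous frame, which CenterTrack uses as an extra input. A standalone sketch of the pairing rule, including the fallback applied at the start of a sequence (pair_with_previous is an illustrative helper, not part of the patch):

def pair_with_previous(roidbs, idx):
    # the first frame has no predecessor, so the next frame is used instead,
    # mirroring the idx_pre_img fallback in DetDataset.__getitem__ above
    idx_pre = idx - 1
    if idx_pre < 0:
        idx_pre = idx + 1
    return [roidbs[idx], roidbs[idx_pre]]

frames = ['frame0', 'frame1', 'frame2']
assert pair_with_previous(frames, 0) == ['frame0', 'frame1']
assert pair_with_previous(frames, 2) == ['frame2', 'frame1']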

+ 84 - 29
paddlers/models/ppdet/data/source/keypoint_coco.py

@@ -80,7 +80,8 @@ class KeypointBottomUpBaseDataset(DetDataset):
         records = copy.deepcopy(self._get_imganno(idx))
         records['image'] = cv2.imread(records['image_file'])
         records['image'] = cv2.cvtColor(records['image'], cv2.COLOR_BGR2RGB)
-        records['mask'] = (records['mask'] + 0).astype('uint8')
+        if 'mask' in records:
+            records['mask'] = (records['mask'] + 0).astype('uint8')
         records = self.transform(records)
         return records
 
@@ -135,24 +136,37 @@ class KeypointBottomUpCocoDataset(KeypointBottomUpBaseDataset):
                  num_joints,
                  transform=[],
                  shard=[0, 1],
-                 test_mode=False):
+                 test_mode=False,
+                 return_mask=True,
+                 return_bbox=True,
+                 return_area=True,
+                 return_class=True):
         super().__init__(dataset_dir, image_dir, anno_path, num_joints,
                          transform, shard, test_mode)
 
         self.ann_file = os.path.join(dataset_dir, anno_path)
         self.shard = shard
         self.test_mode = test_mode
+        self.return_mask = return_mask
+        self.return_bbox = return_bbox
+        self.return_area = return_area
+        self.return_class = return_class
 
     def parse_dataset(self):
         self.coco = COCO(self.ann_file)
 
         self.img_ids = self.coco.getImgIds()
         if not self.test_mode:
-            self.img_ids = [
-                img_id for img_id in self.img_ids
-                if len(self.coco.getAnnIds(
-                    imgIds=img_id, iscrowd=None)) > 0
-            ]
+            self.img_ids_tmp = []
+            for img_id in self.img_ids:
+                ann_ids = self.coco.getAnnIds(imgIds=img_id)
+                anno = self.coco.loadAnns(ann_ids)
+                anno = [obj for obj in anno if obj['iscrowd'] == 0]
+                if len(anno) == 0:
+                    continue
+                self.img_ids_tmp.append(img_id)
+            self.img_ids = self.img_ids_tmp
+
         blocknum = int(len(self.img_ids) / self.shard[1])
         self.img_ids = self.img_ids[(blocknum * self.shard[0]):(blocknum * (
             self.shard[0] + 1))]
@@ -199,21 +213,31 @@ class KeypointBottomUpCocoDataset(KeypointBottomUpBaseDataset):
         ann_ids = coco.getAnnIds(imgIds=img_id)
         anno = coco.loadAnns(ann_ids)
 
-        mask = self._get_mask(anno, idx)
         anno = [
             obj for obj in anno
-            if obj['iscrowd'] == 0 or obj['num_keypoints'] > 0
+            if obj['iscrowd'] == 0 and obj['num_keypoints'] > 0
         ]
 
+        db_rec = {}
         joints, orgsize = self._get_joints(anno, idx)
+        db_rec['gt_joints'] = joints
+        db_rec['im_shape'] = orgsize
+
+        if self.return_bbox:
+            db_rec['gt_bbox'] = self._get_bboxs(anno, idx)
+
+        if self.return_class:
+            db_rec['gt_class'] = self._get_labels(anno, idx)
+
+        if self.return_area:
+            db_rec['gt_areas'] = self._get_areas(anno, idx)
+
+        if self.return_mask:
+            db_rec['mask'] = self._get_mask(anno, idx)
 
-        db_rec = {}
         db_rec['im_id'] = img_id
         db_rec['image_file'] = os.path.join(self.img_prefix,
                                             self.id2name[img_id])
-        db_rec['mask'] = mask
-        db_rec['joints'] = joints
-        db_rec['im_shape'] = orgsize
 
         return db_rec
 
@@ -229,12 +253,41 @@ class KeypointBottomUpCocoDataset(KeypointBottomUpBaseDataset):
                 np.array(obj['keypoints']).reshape([-1, 3])
 
         img_info = self.coco.loadImgs(self.img_ids[idx])[0]
-        joints[..., 0] /= img_info['width']
-        joints[..., 1] /= img_info['height']
-        orgsize = np.array([img_info['height'], img_info['width']])
+        orgsize = np.array([img_info['height'], img_info['width'], 1])
 
         return joints, orgsize
 
+    def _get_bboxs(self, anno, idx):
+        num_people = len(anno)
+        gt_bboxes = np.zeros((num_people, 4), dtype=np.float32)
+
+        for idx, obj in enumerate(anno):
+            if 'bbox' in obj:
+                gt_bboxes[idx, :] = obj['bbox']
+
+        gt_bboxes[:, 2] += gt_bboxes[:, 0]
+        gt_bboxes[:, 3] += gt_bboxes[:, 1]
+        return gt_bboxes
+
+    def _get_labels(self, anno, idx):
+        num_people = len(anno)
+        gt_labels = np.zeros((num_people, 1), dtype=np.float32)
+
+        for idx, obj in enumerate(anno):
+            if 'category_id' in obj:
+                catid = obj['category_id']
+                gt_labels[idx, 0] = self.catid2clsid[catid]
+        return gt_labels
+
+    def _get_areas(self, anno, idx):
+        num_people = len(anno)
+        gt_areas = np.zeros((num_people, ), dtype=np.float32)
+
+        for idx, obj in enumerate(anno):
+            if 'area' in obj:
+                gt_areas[idx, ] = obj['area']
+        return gt_areas
+
     def _get_mask(self, anno, idx):
         """Get ignore masks to mask out losses."""
         coco = self.coco
@@ -487,9 +540,9 @@ class KeypointTopDownCocoDataset(KeypointTopDownBaseDataset):
                     continue
 
                 joints = np.zeros(
-                    (self.ann_info['num_joints'], 3), dtype=np.float)
+                    (self.ann_info['num_joints'], 3), dtype=np.float32)
                 joints_vis = np.zeros(
-                    (self.ann_info['num_joints'], 3), dtype=np.float)
+                    (self.ann_info['num_joints'], 3), dtype=np.float32)
                 for ipt in range(self.ann_info['num_joints']):
                     joints[ipt, 0] = obj['keypoints'][ipt * 3 + 0]
                     joints[ipt, 1] = obj['keypoints'][ipt * 3 + 1]
@@ -506,7 +559,7 @@ class KeypointTopDownCocoDataset(KeypointTopDownBaseDataset):
                     'image_file': os.path.join(self.img_prefix, file_name),
                     'center': center,
                     'scale': scale,
-                    'joints': joints,
+                    'gt_joints': joints,
                     'joints_vis': joints_vis,
                     'im_id': im_id,
                 })
@@ -560,16 +613,17 @@ class KeypointTopDownCocoDataset(KeypointTopDownBaseDataset):
                 continue
 
             center, scale = self._box2cs(box)
-            joints = np.zeros((self.ann_info['num_joints'], 3), dtype=np.float)
+            joints = np.zeros(
+                (self.ann_info['num_joints'], 3), dtype=np.float32)
             joints_vis = np.ones(
-                (self.ann_info['num_joints'], 3), dtype=np.float)
+                (self.ann_info['num_joints'], 3), dtype=np.float32)
             kpt_db.append({
                 'image_file': img_name,
                 'im_id': im_id,
                 'center': center,
                 'scale': scale,
                 'score': score,
-                'joints': joints,
+                'gt_joints': joints,
                 'joints_vis': joints_vis,
             })
 
@@ -633,8 +687,8 @@ class KeypointTopDownMPIIDataset(KeypointTopDownBaseDataset):
             im_id = a['image_id'] if 'image_id' in a else int(
                 os.path.splitext(image_name)[0])
 
-            c = np.array(a['center'], dtype=np.float)
-            s = np.array([a['scale'], a['scale']], dtype=np.float)
+            c = np.array(a['center'], dtype=np.float32)
+            s = np.array([a['scale'], a['scale']], dtype=np.float32)
 
             # Adjust center/scale slightly to avoid cropping limbs
             if c[0] != -1:
@@ -642,11 +696,12 @@ class KeypointTopDownMPIIDataset(KeypointTopDownBaseDataset):
                 s = s * 1.25
             c = c - 1
 
-            joints = np.zeros((self.ann_info['num_joints'], 3), dtype=np.float)
+            joints = np.zeros(
+                (self.ann_info['num_joints'], 3), dtype=np.float32)
             joints_vis = np.zeros(
-                (self.ann_info['num_joints'], 3), dtype=np.float)
-            if 'joints' in a:
-                joints_ = np.array(a['joints'])
+                (self.ann_info['num_joints'], 3), dtype=np.float32)
+            if 'gt_joints' in a:
+                joints_ = np.array(a['gt_joints'])
                 joints_[:, 0:2] = joints_[:, 0:2] - 1
                 joints_vis_ = np.array(a['joints_vis'])
                 assert len(joints_) == self.ann_info[
@@ -662,7 +717,7 @@ class KeypointTopDownMPIIDataset(KeypointTopDownBaseDataset):
                 'im_id': im_id,
                 'center': c,
                 'scale': s,
-                'joints': joints,
+                'gt_joints': joints,
                 'joints_vis': joints_vis
             })
         print("number length: {}".format(len(gt_db)))

+ 380 - 0
paddlers/models/ppdet/data/source/pose3d_cmb.py

@@ -0,0 +1,380 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. 
+#   
+# Licensed under the Apache License, Version 2.0 (the "License");   
+# you may not use this file except in compliance with the License.  
+# You may obtain a copy of the License at   
+#   
+#     http://www.apache.org/licenses/LICENSE-2.0    
+#   
+# Unless required by applicable law or agreed to in writing, software   
+# distributed under the License is distributed on an "AS IS" BASIS, 
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.  
+# See the License for the specific language governing permissions and   
+# limitations under the License.
+
+import os
+import cv2
+import numpy as np
+import json
+import copy
+import pycocotools
+from pycocotools.coco import COCO
+from .dataset import DetDataset
+from paddlers.models.ppdet.core.workspace import register, serializable
+from paddle.io import Dataset
+
+
+@serializable
+class Pose3DDataset(DetDataset):
+    """Pose3D Dataset class. 
+
+    Args:
+        dataset_dir (str): Root path to the dataset.
+        anno_list (list of str): each element is a relative path to an annotation file.
+        image_dirs (list of str): each element is a relative path to a directory where images are held.
+        transform (composed(operators)): A sequence of data transforms.
+        test_mode (bool): Store True when building test or
+            validation dataset. Default: False.
+        24 joints order:
+        0-2: 'R_Ankle', 'R_Knee', 'R_Hip', 
+        3-5:'L_Hip', 'L_Knee', 'L_Ankle', 
+        6-8:'R_Wrist', 'R_Elbow', 'R_Shoulder', 
+        9-11:'L_Shoulder','L_Elbow','L_Wrist',
+        12-14:'Neck','Top_of_Head','Pelvis',
+        15-18:'Thorax','Spine','Jaw','Head',
+        19-23:'Nose','L_Eye','R_Eye','L_Ear','R_Ear'
+    """
+
+    def __init__(self,
+                 dataset_dir,
+                 image_dirs,
+                 anno_list,
+                 transform=[],
+                 num_joints=24,
+                 test_mode=False):
+        super().__init__(dataset_dir, image_dirs, anno_list)
+        self.image_info = {}
+        self.ann_info = {}
+        self.num_joints = num_joints
+
+        self.transform = transform
+        self.test_mode = test_mode
+
+        self.img_ids = []
+        self.dataset_dir = dataset_dir
+        self.image_dirs = image_dirs
+        self.anno_list = anno_list
+
+    def get_mask(self, mvm_percent=0.3):
+        num_joints = self.num_joints
+        mjm_mask = np.ones((num_joints, 1)).astype(np.float32)
+        if self.test_mode == False:
+            pb = np.random.random_sample()
+            masked_num = int(
+                pb * mvm_percent *
+                num_joints)  # at most x% of the joints could be masked
+            indices = np.random.choice(
+                np.arange(num_joints), replace=False, size=masked_num)
+            mjm_mask[indices, :] = 0.0
+        # return mjm_mask
+
+        num_joints = 10
+        mvm_mask = np.ones((num_joints, 1)).astype(np.float32)
+        if self.test_mode == False:
+            num_vertices = num_joints
+            pb = np.random.random_sample()
+            masked_num = int(
+                pb * mvm_percent *
+                num_vertices)  # at most x% of the vertices could be masked
+            indices = np.random.choice(
+                np.arange(num_vertices), replace=False, size=masked_num)
+            mvm_mask[indices, :] = 0.0
+
+        mjm_mask = np.concatenate([mjm_mask, mvm_mask], axis=0)
+        return mjm_mask
+
+    def filterjoints(self, x):
+        if self.num_joints == 24:
+            return x
+        elif self.num_joints == 14:
+            return x[[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 18], :]
+        elif self.num_joints == 17:
+            return x[
+                [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 14, 15, 18, 19], :]
+        else:
+            raise ValueError(
+                "unsupported joint numbers, only [24 or 17 or 14] is supported!")
+
+    def parse_dataset(self):
+        print("Loading annotations..., please wait")
+        self.annos = []
+        im_id = 0
+        self.human36m_num = 0
+        for idx, annof in enumerate(self.anno_list):
+            img_prefix = os.path.join(self.dataset_dir, self.image_dirs[idx])
+            dataf = os.path.join(self.dataset_dir, annof)
+            with open(dataf, 'r') as rf:
+                anno_data = json.load(rf)
+                annos = anno_data['data']
+                new_annos = []
+                print("{} has annos numbers: {}".format(dataf, len(annos)))
+                for anno in annos:
+                    new_anno = {}
+                    new_anno['im_id'] = im_id
+                    im_id += 1
+                    imagename = anno['imageName']
+                    if imagename.startswith("COCO_train2014_"):
+                        imagename = imagename[len("COCO_train2014_"):]
+                    elif imagename.startswith("COCO_val2014_"):
+                        imagename = imagename[len("COCO_val2014_"):]
+                    imagename = os.path.join(img_prefix, imagename)
+                    if not os.path.exists(imagename):
+                        if "train2017" in imagename:
+                            imagename = imagename.replace("train2017",
+                                                          "val2017")
+                            if not os.path.exists(imagename):
+                                print("cannot find imagepath:{}".format(
+                                    imagename))
+                                continue
+                        else:
+                            print("cannot find imagepath:{}".format(imagename))
+                            continue
+                    new_anno['imageName'] = imagename
+                    if 'human3.6m' in imagename:
+                        self.human36m_num += 1
+                    new_anno['bbox_center'] = anno['bbox_center']
+                    new_anno['bbox_scale'] = anno['bbox_scale']
+                    new_anno['joints_2d'] = np.array(anno[
+                        'gt_keypoint_2d']).astype(np.float32)
+                    if new_anno['joints_2d'].shape[0] == 49:
+                        #if the joints_2d is in SPIN format(which generated by eft), choose the last 24 public joints
+                        #for detail please refer: https://github.com/nkolot/SPIN/blob/master/constants.py
+                        new_anno['joints_2d'] = new_anno['joints_2d'][25:]
+                    new_anno['joints_3d'] = np.array(anno[
+                        'pose3d'])[:, :3].astype(np.float32)
+                    new_anno['mjm_mask'] = self.get_mask()
+                    if 'has_3d_joints' not in anno:
+                        new_anno['has_3d_joints'] = int(1)
+                        new_anno['has_2d_joints'] = int(1)
+                    else:
+                        new_anno['has_3d_joints'] = int(anno['has_3d_joints'])
+                        new_anno['has_2d_joints'] = int(anno['has_2d_joints'])
+                    new_anno['joints_2d'] = self.filterjoints(new_anno[
+                        'joints_2d'])
+                    self.annos.append(new_anno)
+                del annos
+
+    def get_temp_num(self):
+        """get temporal data number, like human3.6m"""
+        return self.human36m_num
+
+    def __len__(self):
+        """Get dataset length."""
+        return len(self.annos)
+
+    def _get_imganno(self, idx):
+        """Get anno for a single image."""
+        return self.annos[idx]
+
+    def __getitem__(self, idx):
+        """Prepare image for training given the index."""
+        records = copy.deepcopy(self._get_imganno(idx))
+        imgpath = records['imageName']
+        assert os.path.exists(imgpath), "cannot find image {}".format(imgpath)
+        records['image'] = cv2.imread(imgpath)
+        records['image'] = cv2.cvtColor(records['image'], cv2.COLOR_BGR2RGB)
+        records = self.transform(records)
+        return records
+
+    def check_or_download_dataset(self):
+        alldatafind = True
+        for image_dir in self.image_dirs:
+            image_dir = os.path.join(self.dataset_dir, image_dir)
+            if not os.path.isdir(image_dir):
+                print("dataset [{}] is not found".format(image_dir))
+                alldatafind = False
+        if not alldatafind:
+            raise ValueError(
+                "Some dataset is not valid and cannot download automatically now, please prepare the dataset first"
+            )
+
+
+@register
+@serializable
+class Keypoint3DMultiFramesDataset(Dataset):
+    """24 keypoints 3D dataset for pose estimation. 
+
+    each item is a list of images
+
+    The dataset loads raw features and apply specified transforms
+    to return a dict containing the image tensors and other information.
+
+    Args:
+        dataset_dir (str): Root path to the dataset.
+        image_dir (str): Path to a directory where images are held.
+    """
+
+    def __init__(
+            self,
+            dataset_dir,  # root directory of the dataset
+            image_dir,  # image directory
+            p3d_dir,  # directory holding the 3D keypoint files
+            json_path,
+            img_size,  # target image size after resizing
+            num_frames,  # length of each frame sequence
+            anno_path=None, ):
+
+        self.dataset_dir = dataset_dir
+        self.image_dir = image_dir
+        self.p3d_dir = p3d_dir
+        self.json_path = json_path
+        self.img_size = img_size
+        self.num_frames = num_frames
+        self.anno_path = anno_path
+
+        self.data_labels, self.mf_inds = self._generate_multi_frames_list()
+
+    def _generate_multi_frames_list(self):
+        act_list = os.listdir(self.dataset_dir)  # list of actions
+        count = 0
+        mf_list = []
+        annos_dict = {'images': [], 'annotations': [], 'act_inds': []}
+        for act in act_list:  # generate a frame sequence for each action
+            if '.' in act:
+                continue
+
+            json_path = os.path.join(self.dataset_dir, act, self.json_path)
+            with open(json_path, 'r') as j:
+                annos = json.load(j)
+            length = len(annos['images'])
+            for k, v in annos.items():
+                if k in annos_dict:
+                    annos_dict[k].extend(v)
+            annos_dict['act_inds'].extend([act] * length)
+
+            mf = [[i + j + count for j in range(self.num_frames)]
+                  for i in range(0, length - self.num_frames + 1)]
+            mf_list.extend(mf)
+            count += length
+
+        print("total data number:", len(mf_list))
+        return annos_dict, mf_list
+
+    def __call__(self, *args, **kwargs):
+        return self
+
+    def __getitem__(self, index):  # fetch one consecutive frame sequence
+        inds = self.mf_inds[
+            index]  # e.g. [568, 569, 570, 571, 572, 573], length is num_frames
+
+        images = self.data_labels['images']  # all images
+        annots = self.data_labels['annotations']  # all annots
+
+        act = self.data_labels['act_inds'][inds[0]]  # action name (folder name)
+
+        kps3d_list = []
+        kps3d_vis_list = []
+        names = []
+
+        h, w = 0, 0
+        for ind in inds:  # one image
+            height = float(images[ind]['height'])
+            width = float(images[ind]['width'])
+            name = images[ind]['file_name']  # image file name, with extension
+
+            kps3d_name = name.split('.')[0] + '.obj'
+            kps3d_path = os.path.join(self.dataset_dir, act, self.p3d_dir,
+                                      kps3d_name)
+
+            joints, joints_vis = self.kps3d_process(kps3d_path)
+            joints_vis = np.array(joints_vis, dtype=np.float32)
+
+            kps3d_list.append(joints)
+            kps3d_vis_list.append(joints_vis)
+            names.append(name)
+
+        kps3d = np.array(kps3d_list)  # (6, 24, 3), i.e. (num_frames, joints_num, 3)
+        kps3d_vis = np.array(kps3d_vis_list)
+
+        # read image
+        imgs = []
+        for name in names:
+            img_path = os.path.join(self.dataset_dir, act, self.image_dir, name)
+
+            image = cv2.imread(img_path, cv2.IMREAD_COLOR |
+                               cv2.IMREAD_IGNORE_ORIENTATION)
+            image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
+
+            imgs.append(np.expand_dims(image, axis=0))
+
+        imgs = np.concatenate(imgs, axis=0)
+        imgs = imgs.astype(
+            np.float32)  # (6, 1080, 1920, 3), i.e. (num_frames, h, w, c)
+
+        # note: at this point the images and annotations are mirrored
+        records = {
+            'kps3d': kps3d,
+            'kps3d_vis': kps3d_vis,
+            "image": imgs,
+            'act': act,
+            'names': names,
+            'im_id': index
+        }
+
+        return self.transform(records)
+
+    def kps3d_process(self, kps3d_path):
+        count = 0
+        kps = []
+        kps_vis = []
+
+        with open(kps3d_path, 'r') as f:
+            lines = f.readlines()
+            for line in lines:
+                if line[0] == 'v':
+                    kps.append([])
+                    line = line.strip('\n').split(' ')[1:]
+                    for kp in line:
+                        kps[-1].append(float(kp))
+                    count += 1
+
+                    kps_vis.append([1, 1, 1])
+
+        kps = np.array(kps)  # 52,3
+        kps_vis = np.array(kps_vis)
+
+        kps *= 10  # scale points
+        kps -= kps[[0], :]  # set root point to zero
+
+        kps = np.concatenate((kps[0:23], kps[[37]]), axis=0)  # 24,3
+
+        kps *= 10
+
+        kps_vis = np.concatenate((kps_vis[0:23], kps_vis[[37]]), axis=0)  # 24,3
+
+        return kps, kps_vis
+
+    def __len__(self):
+        return len(self.mf_inds)
+
+    def get_anno(self):
+        if self.anno_path is None:
+            return
+        return os.path.join(self.dataset_dir, self.anno_path)
+
+    def check_or_download_dataset(self):
+        return
+
+    def parse_dataset(self, ):
+        return
+
+    def set_transform(self, transform):
+        self.transform = transform
+
+    def set_epoch(self, epoch_id):
+        self._epoch = epoch_id
+
+    def set_kwargs(self, **kwargs):
+        self.mixup_epoch = kwargs.get('mixup_epoch', -1)
+        self.cutmix_epoch = kwargs.get('cutmix_epoch', -1)
+        self.mosaic_epoch = kwargs.get('mosaic_epoch', -1)
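
Pose3DDataset.get_mask above hides a random fraction of the joints during training, a simple masked-joint augmentation. A framework-free sketch of the same rule (random_joint_mask is an illustrative name, not part of the patch):

import numpy as np

def random_joint_mask(num_joints=24, mvm_percent=0.3, training=True):
    # at most mvm_percent of the joints are zeroed out, the exact number
    # drawn uniformly per sample, as in Pose3DDataset.get_mask above
    mask = np.ones((num_joints, 1), dtype=np.float32)
    if training:
        pb = np.random.random_sample()
        masked_num = int(pb * mvm_percent * num_joints)
        indices = np.random.choice(
            np.arange(num_joints), replace=False, size=masked_num)
        mask[indices, :] = 0.0
    return mask

print(random_joint_mask().squeeze())  # 24 values, a few of them 0.0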

+ 2 - 0
paddlers/models/ppdet/data/transform/__init__.py

@@ -17,12 +17,14 @@ from . import batch_operators
 from . import keypoint_operators
 from . import mot_operators
 from . import rotated_operators
+from . import keypoints_3d_operators
 
 from .operators import *
 from .batch_operators import *
 from .keypoint_operators import *
 from .mot_operators import *
 from .rotated_operators import *
+from .keypoints_3d_operators import *
 
 __all__ = []
 __all__ += registered_ops

+ 159 - 7
paddlers/models/ppdet/data/transform/atss_assigner.py

@@ -43,7 +43,8 @@ def bbox_overlaps(bboxes1, bboxes2, mode='iou', is_aligned=False, eps=1e-6):
     Returns:
         Tensor: shape (m, n) if ``is_aligned `` is False else shape (m,)
     """
-    assert mode in ['iou', 'iof', 'giou'], 'Unsupported mode {}'.format(mode)
+    assert mode in ['iou', 'iof', 'giou', 'diou'], 'Unsupported mode {}'.format(
+        mode)
     # Either the boxes are empty or the length of boxes's last dimenstion is 4
     assert (bboxes1.shape[-1] == 4 or bboxes1.shape[0] == 0)
     assert (bboxes2.shape[-1] == 4 or bboxes2.shape[0] == 0)
@@ -83,6 +84,13 @@ def bbox_overlaps(bboxes1, bboxes2, mode='iou', is_aligned=False, eps=1e-6):
         if mode == 'giou':
             enclosed_lt = np.minimum(bboxes1[..., :2], bboxes2[..., :2])
             enclosed_rb = np.maximum(bboxes1[..., 2:], bboxes2[..., 2:])
+        if mode == 'diou':
+            enclosed_lt = np.minimum(bboxes1[..., :2], bboxes2[..., :2])
+            enclosed_rb = np.maximum(bboxes1[..., 2:], bboxes2[..., 2:])
+            b1_x1, b1_y1 = bboxes1[..., 0], bboxes1[..., 1]
+            b1_x2, b1_y2 = bboxes1[..., 2], bboxes1[..., 3]
+            b2_x1, b2_y1 = bboxes2[..., 0], bboxes2[..., 1]
+            b2_x2, b2_y2 = bboxes2[..., 2], bboxes2[..., 3]
     else:
         lt = np.maximum(bboxes1[..., :, None, :2],
                         bboxes2[..., None, :, :2])  # [B, rows, cols, 2]
@@ -101,6 +109,15 @@ def bbox_overlaps(bboxes1, bboxes2, mode='iou', is_aligned=False, eps=1e-6):
                                      bboxes2[..., None, :, :2])
             enclosed_rb = np.maximum(bboxes1[..., :, None, 2:],
                                      bboxes2[..., None, :, 2:])
+        if mode == 'diou':
+            enclosed_lt = np.minimum(bboxes1[..., :, None, :2],
+                                     bboxes2[..., None, :, :2])
+            enclosed_rb = np.maximum(bboxes1[..., :, None, 2:],
+                                     bboxes2[..., None, :, 2:])
+            b1_x1, b1_y1 = bboxes1[..., :, None, 0], bboxes1[..., :, None, 1]
+            b1_x2, b1_y2 = bboxes1[..., :, None, 2], bboxes1[..., :, None, 3]
+            b2_x1, b2_y1 = bboxes2[..., None, :, 0], bboxes2[..., None, :, 1]
+            b2_x2, b2_y2 = bboxes2[..., None, :, 2], bboxes2[..., None, :, 3]
 
     eps = np.array([eps])
     union = np.maximum(union, eps)
@@ -108,18 +125,32 @@ def bbox_overlaps(bboxes1, bboxes2, mode='iou', is_aligned=False, eps=1e-6):
     if mode in ['iou', 'iof']:
         return ious
     # calculate gious
-    enclose_wh = (enclosed_rb - enclosed_lt).clip(min=0)
-    enclose_area = enclose_wh[..., 0] * enclose_wh[..., 1]
-    enclose_area = np.maximum(enclose_area, eps)
-    gious = ious - (enclose_area - union) / enclose_area
-    return gious
+    if mode in ['giou']:
+        enclose_wh = (enclosed_rb - enclosed_lt).clip(min=0)
+        enclose_area = enclose_wh[..., 0] * enclose_wh[..., 1]
+        enclose_area = np.maximum(enclose_area, eps)
+        gious = ious - (enclose_area - union) / enclose_area
+        return gious
+    if mode in ['diou']:
+        left = ((b2_x1 + b2_x2) - (b1_x1 + b1_x2))**2 / 4
+        right = ((b2_y1 + b2_y2) - (b1_y1 + b1_y2))**2 / 4
+        rho2 = left + right
+        enclose_wh = (enclosed_rb - enclosed_lt).clip(min=0)
+        enclose_c = enclose_wh[..., 0]**2 + enclose_wh[..., 1]**2
+        enclose_c = np.maximum(enclose_c, eps)
+        dious = ious - rho2 / enclose_c
+        return dious
 
 
 def topk_(input, k, axis=1, largest=True):
     x = -input if largest else input
     if axis == 0:
         row_index = np.arange(input.shape[1 - axis])
-        topk_index = np.argpartition(x, k, axis=axis)[0:k, :]
+        if k == x.shape[0]:  # argpartition requires index < len(input)
+            topk_index = np.argpartition(x, k - 1, axis=axis)[0:k, :]
+        else:
+            topk_index = np.argpartition(x, k, axis=axis)[0:k, :]
+
         topk_data = x[topk_index, row_index]
 
         topk_index_sort = np.argsort(topk_data, axis=axis)
@@ -267,3 +298,124 @@ class ATSSAssigner(object):
                          -np.inf] = argmax_overlaps[max_overlaps != -np.inf] + 1
 
         return assigned_gt_inds, max_overlaps
+
+    def get_vlr_region(self,
+                       bboxes,
+                       num_level_bboxes,
+                       gt_bboxes,
+                       gt_bboxes_ignore=None,
+                       gt_labels=None):
+        """get vlr region for ld distillation.
+        Args:
+            bboxes (np.array): Bounding boxes to be assigned, shape(n, 4).
+            num_level_bboxes (List): num of bboxes in each level
+            gt_bboxes (np.array): Groundtruth boxes, shape (k, 4).
+            gt_bboxes_ignore (np.array, optional): Ground truth bboxes that are
+                labelled as `ignored`, e.g., crowd boxes in COCO.
+            gt_labels (np.array, optional): Label of gt_bboxes, shape (k, ).
+        """
+        bboxes = bboxes[:, :4]
+
+        num_gt, num_bboxes = gt_bboxes.shape[0], bboxes.shape[0]
+
+        # compute iou between all bbox and gt
+        overlaps = bbox_overlaps(bboxes, gt_bboxes)
+
+        # compute diou between all bbox and gt
+        diou = bbox_overlaps(bboxes, gt_bboxes, mode='diou')
+
+        # assign 0 by default
+        assigned_gt_inds = np.zeros((num_bboxes, ), dtype=np.int64)
+
+        vlr_region_iou = (assigned_gt_inds + 0).astype(np.float32)
+
+        if num_gt == 0 or num_bboxes == 0:
+            # No ground truth or boxes, return empty assignment
+            max_overlaps = np.zeros((num_bboxes, ))
+            if num_gt == 0:
+                # No truth, assign everything to background
+                assigned_gt_inds[:] = 0
+            if not np.any(gt_labels):
+                assigned_labels = None
+            else:
+                assigned_labels = -np.ones((num_bboxes, ), dtype=np.int64)
+            return assigned_gt_inds, max_overlaps
+
+        # compute center distance between all bbox and gt
+        gt_cx = (gt_bboxes[:, 0] + gt_bboxes[:, 2]) / 2.0
+        gt_cy = (gt_bboxes[:, 1] + gt_bboxes[:, 3]) / 2.0
+        gt_points = np.stack((gt_cx, gt_cy), axis=1)
+
+        bboxes_cx = (bboxes[:, 0] + bboxes[:, 2]) / 2.0
+        bboxes_cy = (bboxes[:, 1] + bboxes[:, 3]) / 2.0
+        bboxes_points = np.stack((bboxes_cx, bboxes_cy), axis=1)
+
+        distances = np.sqrt(
+            np.power((bboxes_points[:, None, :] - gt_points[None, :, :]), 2)
+            .sum(-1))
+
+        # Selecting candidates based on the center distance
+        candidate_idxs = []
+        candidate_idxs_t = []
+        start_idx = 0
+        for bboxes_per_level in num_level_bboxes:
+            # on each pyramid level, for each gt,
+            # select k bbox whose center are closest to the gt center
+            end_idx = start_idx + bboxes_per_level
+            distances_per_level = distances[start_idx:end_idx, :]
+            selectable_t = min(self.topk, bboxes_per_level)
+            selectable_k = bboxes_per_level  #k for all
+            _, topt_idxs_per_level = topk_(
+                distances_per_level, selectable_t, axis=0, largest=False)
+            _, topk_idxs_per_level = topk_(
+                distances_per_level, selectable_k, axis=0, largest=False)
+            candidate_idxs_t.append(topt_idxs_per_level + start_idx)
+            candidate_idxs.append(topk_idxs_per_level + start_idx)
+            start_idx = end_idx
+
+        candidate_idxs_t = np.concatenate(candidate_idxs_t, axis=0)
+        candidate_idxs = np.concatenate(candidate_idxs, axis=0)
+
+        # get corresponding iou for the these candidates, and compute the
+        # mean and std, set mean + std as the iou threshold
+        candidate_overlaps_t = overlaps[candidate_idxs_t, np.arange(num_gt)]
+
+        # compute tdiou
+        t_diou = diou[candidate_idxs, np.arange(num_gt)]
+
+        overlaps_mean_per_gt = candidate_overlaps_t.mean(0)
+        overlaps_std_per_gt = candidate_overlaps_t.std(
+            0, ddof=1)  # NOTE: use Bessel correction
+        overlaps_thr_per_gt = overlaps_mean_per_gt + overlaps_std_per_gt
+
+        # compute region        
+        is_pos = (t_diou < overlaps_thr_per_gt[None, :]) & (
+            t_diou >= 0.25 * overlaps_thr_per_gt[None, :])
+
+        # limit the positive sample's center in gt
+        for gt_idx in range(num_gt):
+            candidate_idxs[:, gt_idx] += gt_idx * num_bboxes
+
+        candidate_idxs = candidate_idxs.reshape(-1)
+
+        # if an anchor box is assigned to multiple gts,
+        # the one with the highest IoU will be selected.
+        overlaps_inf = -np.inf * np.ones_like(overlaps).T.reshape(-1)
+        index = candidate_idxs.reshape(-1)[is_pos.reshape(-1)]
+
+        overlaps_inf[index] = overlaps.T.reshape(-1)[index]
+        overlaps_inf = overlaps_inf.reshape(num_gt, -1).T
+
+        max_overlaps = overlaps_inf.max(axis=1)
+        argmax_overlaps = overlaps_inf.argmax(axis=1)
+
+        overlaps_inf = -np.inf * np.ones_like(overlaps).T.reshape(-1)
+        overlaps_inf = overlaps_inf.reshape(num_gt, -1).T
+
+        assigned_gt_inds[max_overlaps !=
+                         -np.inf] = argmax_overlaps[max_overlaps != -np.inf] + 1
+
+        vlr_region_iou[max_overlaps !=
+                       -np.inf] = max_overlaps[max_overlaps != -np.inf] + 0
+
+        return vlr_region_iou
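
The new 'diou' mode of bbox_overlaps, which get_vlr_region builds on, computes DIoU = IoU - rho^2 / c^2, where rho is the distance between the box centers and c the diagonal of the smallest enclosing box. A standalone numeric check of that formula for a single box pair (the helper name is illustrative, not part of the patch):

import numpy as np

def diou_single(b1, b2, eps=1e-6):
    # plain IoU
    lt = np.maximum(b1[:2], b2[:2])
    rb = np.minimum(b1[2:], b2[2:])
    wh = np.clip(rb - lt, 0, None)
    inter = wh[0] * wh[1]
    area1 = (b1[2] - b1[0]) * (b1[3] - b1[1])
    area2 = (b2[2] - b2[0]) * (b2[3] - b2[1])
    iou = inter / max(area1 + area2 - inter, eps)
    # squared center distance (rho^2) and squared enclosing diagonal (c^2)
    rho2 = ((b1[0] + b1[2] - b2[0] - b2[2])**2 +
            (b1[1] + b1[3] - b2[1] - b2[3])**2) / 4
    enc_wh = np.maximum(b1[2:], b2[2:]) - np.minimum(b1[:2], b2[:2])
    c2 = max(enc_wh[0]**2 + enc_wh[1]**2, eps)
    return iou - rho2 / c2

print(diou_single(np.array([0., 0., 2., 2.]), np.array([1., 1., 3., 3.])))
# ~0.0317: IoU = 1/7, rho^2 = 2, c^2 = 18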

+ 359 - 42
paddlers/models/ppdet/data/transform/batch_operators.py

@@ -24,6 +24,7 @@ except Exception:
     from collections import Sequence
 
 import cv2
+import copy
 import math
 import numpy as np
 from .operators import register_op, BaseOperator, Resize
@@ -43,10 +44,11 @@ __all__ = [
     'Gt2FCOSTarget',
     'Gt2TTFTarget',
     'Gt2Solov2Target',
-    'Gt2SparseRCNNTarget',
+    'Gt2SparseTarget',
     'PadMaskBatch',
     'Gt2GFLTarget',
     'Gt2CenterNetTarget',
+    'Gt2CenterTrackTarget',
     'PadGT',
     'PadRGT',
 ]
@@ -169,6 +171,7 @@ class BatchRandomResize(BaseOperator):
 
 @register_op
 class Gt2YoloTarget(BaseOperator):
+    __shared__ = ['num_classes']
     """
     Generate YOLOv3 targets by groud truth data, this operator is only used in
     fine grained YOLOv3 loss mode
@@ -292,7 +295,9 @@ class Gt2FCOSTarget(BaseOperator):
                  object_sizes_boundary,
                  center_sampling_radius,
                  downsample_ratios,
-                 norm_reg_targets=False):
+                 num_shift=0.5,
+                 multiply_strides_reg_targets=False,
+                 norm_reg_targets=True):
         super(Gt2FCOSTarget, self).__init__()
         self.center_sampling_radius = center_sampling_radius
         self.downsample_ratios = downsample_ratios
@@ -304,6 +309,8 @@ class Gt2FCOSTarget(BaseOperator):
                 self.object_sizes_boundary[i], self.object_sizes_boundary[i + 1]
             ])
         self.object_sizes_of_interest = object_sizes_of_interest
+        self.num_shift = num_shift
+        self.multiply_strides_reg_targets = multiply_strides_reg_targets
         self.norm_reg_targets = norm_reg_targets
 
     def _compute_points(self, w, h):
@@ -320,7 +327,8 @@ class Gt2FCOSTarget(BaseOperator):
             shift_x, shift_y = np.meshgrid(shift_x, shift_y)
             shift_x = shift_x.flatten()
             shift_y = shift_y.flatten()
-            location = np.stack([shift_x, shift_y], axis=1) + stride // 2
+            location = np.stack(
+                [shift_x, shift_y], axis=1) + stride * self.num_shift
             locations.append(location)
         num_points_each_level = [len(location) for location in locations]
         locations = np.concatenate(locations, axis=0)
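As an aside, a small sketch (hypothetical stride and feature size) of the point grid built above: with the new num_shift=0.5 the points sit at the centers of each stride cell, matching the previous stride // 2 behavior.

import numpy as np

stride, w, h = 8, 32, 16     # hypothetical stride and input size
num_shift = 0.5
shift_x = np.arange(0, w, stride).astype(np.float32)
shift_y = np.arange(0, h, stride).astype(np.float32)
sx, sy = np.meshgrid(shift_x, shift_y)
locations = np.stack([sx.flatten(), sy.flatten()], axis=1) + stride * num_shift
print(locations[:4])         # [[ 4.  4.] [12.  4.] [20.  4.] [28.  4.]]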
@@ -459,11 +467,16 @@ class Gt2FCOSTarget(BaseOperator):
                 grid_w = int(np.ceil(w / self.downsample_ratios[lvl]))
                 grid_h = int(np.ceil(h / self.downsample_ratios[lvl]))
                 if self.norm_reg_targets:
-                    sample['reg_target{}'.format(lvl)] = \
-                        np.reshape(
-                            reg_targets_by_level[lvl] / \
-                            self.downsample_ratios[lvl],
+                    if self.multiply_strides_reg_targets:
+                        sample['reg_target{}'.format(lvl)] = np.reshape(
+                            reg_targets_by_level[lvl],
                             newshape=[grid_h, grid_w, 4])
+                    else:
+                        sample['reg_target{}'.format(lvl)] = \
+                            np.reshape(
+                                reg_targets_by_level[lvl] / \
+                                self.downsample_ratios[lvl],
+                                newshape=[grid_h, grid_w, 4])
                 else:
                     sample['reg_target{}'.format(lvl)] = np.reshape(
                         reg_targets_by_level[lvl],
@@ -482,6 +495,7 @@ class Gt2FCOSTarget(BaseOperator):
 
 @register_op
 class Gt2GFLTarget(BaseOperator):
+    __shared__ = ['num_classes']
     """
     Generate GFocal loss targets by ground truth data
     """
@@ -490,12 +504,14 @@ class Gt2GFLTarget(BaseOperator):
                  num_classes=80,
                  downsample_ratios=[8, 16, 32, 64, 128],
                  grid_cell_scale=4,
-                 cell_offset=0):
+                 cell_offset=0,
+                 compute_vlr_region=False):
         super(Gt2GFLTarget, self).__init__()
         self.num_classes = num_classes
         self.downsample_ratios = downsample_ratios
         self.grid_cell_scale = grid_cell_scale
         self.cell_offset = cell_offset
+        self.compute_vlr_region = compute_vlr_region
 
         self.assigner = ATSSAssigner()
 
@@ -574,6 +590,13 @@ class Gt2GFLTarget(BaseOperator):
             assign_gt_inds, _ = self.assigner(grid_cells, num_level_cells,
                                               gt_bboxes, gt_bboxes_ignore,
                                               gt_labels)
+
+            if self.compute_vlr_region:
+                vlr_region = self.assigner.get_vlr_region(
+                    grid_cells, num_level_cells, gt_bboxes, gt_bboxes_ignore,
+                    gt_labels)
+                sample['vlr_regions'] = vlr_region
+
             pos_inds, neg_inds, pos_gt_bboxes, pos_assigned_gt_inds = self.get_sample(
                 assign_gt_inds, gt_bboxes)
 
@@ -766,7 +789,7 @@ class Gt2Solov2Target(BaseOperator):
                 ins_label = []
                 grid_order = []
                 cate_label = np.zeros([num_grid, num_grid], dtype=np.int64)
-                ins_ind_label = np.zeros([num_grid**2], dtype=np.bool)
+                ins_ind_label = np.zeros([num_grid**2], dtype=np.bool_)
 
                 if num_ins == 0:
                     ins_label = np.zeros(
@@ -893,27 +916,33 @@ class Gt2Solov2Target(BaseOperator):
 
 
 @register_op
-class Gt2SparseRCNNTarget(BaseOperator):
-    '''
-    Generate SparseRCNN targets by groud truth data
-    '''
-
-    def __init__(self):
-        super(Gt2SparseRCNNTarget, self).__init__()
+class Gt2SparseTarget(BaseOperator):
+    def __init__(self, use_padding_shape=False):
+        super(Gt2SparseTarget, self).__init__()
+        self.use_padding_shape = use_padding_shape
 
     def __call__(self, samples, context=None):
         for sample in samples:
-            im = sample["image"]
-            h, w = im.shape[1:3]
-            img_whwh = np.array([w, h, w, h], dtype=np.int32)
-            sample["img_whwh"] = img_whwh
-            if "scale_factor" in sample:
-                sample["scale_factor_wh"] = np.array(
-                    [sample["scale_factor"][1], sample["scale_factor"][0]],
-                    dtype=np.float32)
+            ori_h, ori_w = sample['h'], sample['w']
+            if self.use_padding_shape:
+                h, w = sample["image"].shape[1:3]
+                if "scale_factor" in sample:
+                    sf_w, sf_h = sample["scale_factor"][1], sample[
+                        "scale_factor"][0]
+                    sample["scale_factor_whwh"] = np.array(
+                        [sf_w, sf_h, sf_w, sf_h], dtype=np.float32)
+                else:
+                    sample["scale_factor_whwh"] = np.array(
+                        [1.0, 1.0, 1.0, 1.0], dtype=np.float32)
             else:
-                sample["scale_factor_wh"] = np.array(
-                    [1.0, 1.0], dtype=np.float32)
+                h, w = round(sample['im_shape'][0]), round(sample['im_shape'][
+                    1])
+                sample["scale_factor_whwh"] = np.array(
+                    [w / ori_w, h / ori_h, w / ori_w, h / ori_h],
+                    dtype=np.float32)
+
+            sample["img_whwh"] = np.array([w, h, w, h], dtype=np.float32)
+            sample["ori_shape"] = np.array([ori_h, ori_w], dtype=np.int32)
 
         return samples
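For illustration, a hedged sketch (hypothetical sizes) of the non-padding branch above: the scale factors are the ratio of the resized 'im_shape' to the original shape, repeated as (w, h, w, h).

import numpy as np

ori_h, ori_w = 480, 640      # hypothetical original image size
h, w = 768, 1024             # hypothetical resized 'im_shape'
scale_factor_whwh = np.array(
    [w / ori_w, h / ori_h, w / ori_w, h / ori_h], dtype=np.float32)
img_whwh = np.array([w, h, w, h], dtype=np.float32)
print(scale_factor_whwh)     # [1.6 1.6 1.6 1.6]
print(img_whwh)              # [1024.  768. 1024.  768.]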
 
@@ -981,6 +1010,7 @@ class PadMaskBatch(BaseOperator):
 
 @register_op
 class Gt2CenterNetTarget(BaseOperator):
+    __shared__ = ['num_classes']
     """Gt2CenterNetTarget
     Generate CenterNet targets by ground truth
     Args:
@@ -990,40 +1020,39 @@ class Gt2CenterNetTarget(BaseOperator):
         max_objs (int): The maximum objects detected, 128 by default.
     """
 
-    def __init__(self, down_ratio, num_classes=80, max_objs=128):
+    def __init__(self, num_classes=80, down_ratio=4, max_objs=128):
         super(Gt2CenterNetTarget, self).__init__()
+        self.nc = num_classes
         self.down_ratio = down_ratio
-        self.num_classes = num_classes
         self.max_objs = max_objs
 
     def __call__(self, sample, context=None):
         input_h, input_w = sample['image'].shape[1:]
         output_h = input_h // self.down_ratio
         output_w = input_w // self.down_ratio
-        num_classes = self.num_classes
-        c = sample['center']
-        s = sample['scale']
         gt_bbox = sample['gt_bbox']
         gt_class = sample['gt_class']
 
-        hm = np.zeros((num_classes, output_h, output_w), dtype=np.float32)
+        hm = np.zeros((self.nc, output_h, output_w), dtype=np.float32)
         wh = np.zeros((self.max_objs, 2), dtype=np.float32)
-        dense_wh = np.zeros((2, output_h, output_w), dtype=np.float32)
         reg = np.zeros((self.max_objs, 2), dtype=np.float32)
         ind = np.zeros((self.max_objs), dtype=np.int64)
         reg_mask = np.zeros((self.max_objs), dtype=np.int32)
-        cat_spec_wh = np.zeros(
-            (self.max_objs, num_classes * 2), dtype=np.float32)
-        cat_spec_mask = np.zeros(
-            (self.max_objs, num_classes * 2), dtype=np.int32)
+        cat_spec_wh = np.zeros((self.max_objs, self.nc * 2), dtype=np.float32)
+        cat_spec_mask = np.zeros((self.max_objs, self.nc * 2), dtype=np.int32)
 
-        trans_output = get_affine_transform(c, [s, s], 0, [output_w, output_h])
+        trans_output = get_affine_transform(
+            center=sample['center'],
+            input_size=[sample['scale'], sample['scale']],
+            rot=0,
+            output_size=[output_w, output_h])
 
         gt_det = []
         for i, (bbox, cls) in enumerate(zip(gt_bbox, gt_class)):
             cls = int(cls)
             bbox[:2] = affine_transform(bbox[:2], trans_output)
             bbox[2:] = affine_transform(bbox[2:], trans_output)
+            bbox_amodal = copy.deepcopy(bbox)
             bbox[[0, 2]] = np.clip(bbox[[0, 2]], 0, output_w - 1)
             bbox[[1, 3]] = np.clip(bbox[[1, 3]], 0, output_h - 1)
             h, w = bbox[3] - bbox[1], bbox[2] - bbox[0]
@@ -1034,10 +1063,12 @@ class Gt2CenterNetTarget(BaseOperator):
                     [(bbox[0] + bbox[2]) / 2, (bbox[1] + bbox[3]) / 2],
                     dtype=np.float32)
                 ct_int = ct.astype(np.int32)
+
+                # get hm,wh,reg,ind,ind_mask
                 draw_umich_gaussian(hm[cls], ct_int, radius)
                 wh[i] = 1. * w, 1. * h
-                ind[i] = ct_int[1] * output_w + ct_int[0]
                 reg[i] = ct - ct_int
+                ind[i] = ct_int[1] * output_w + ct_int[0]
                 reg_mask[i] = 1
                 cat_spec_wh[i, cls * 2:cls * 2 + 2] = wh[i]
                 cat_spec_mask[i, cls * 2:cls * 2 + 2] = 1
@@ -1052,9 +1083,10 @@ class Gt2CenterNetTarget(BaseOperator):
         sample.pop('scale', None)
         sample.pop('is_crowd', None)
         sample.pop('difficult', None)
-        sample['heatmap'] = hm
-        sample['index_mask'] = reg_mask
+
         sample['index'] = ind
+        sample['index_mask'] = reg_mask
+        sample['heatmap'] = hm
         sample['size'] = wh
         sample['offset'] = reg
         return sample
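For reference, an illustrative sketch (hypothetical box and grid size, not the operator itself) of the center targets built above: the integer center gives the flattened 'index', the fractional remainder the 'offset', and the box width/height go into 'size'.

import numpy as np

output_w = 128                                   # hypothetical output grid width
bbox = np.array([10.2, 20.6, 50.8, 60.4])        # x1, y1, x2, y2 on the output grid
ct = np.array([(bbox[0] + bbox[2]) / 2, (bbox[1] + bbox[3]) / 2], dtype=np.float32)
ct_int = ct.astype(np.int32)
index = ct_int[1] * output_w + ct_int[0]         # flattened heatmap position
offset = ct - ct_int                             # sub-pixel center offset
size = np.array([bbox[2] - bbox[0], bbox[3] - bbox[1]])
print(index, offset, size)                       # index=5150, offset=[0.5, 0.5]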
@@ -1070,13 +1102,115 @@ class PadGT(BaseOperator):
                                 1 means bbox, 0 means no bbox.
     """
 
-    def __init__(self, return_gt_mask=True):
+    def __init__(self, return_gt_mask=True, pad_img=False, minimum_gtnum=0):
         super(PadGT, self).__init__()
         self.return_gt_mask = return_gt_mask
+        self.pad_img = pad_img
+        self.minimum_gtnum = minimum_gtnum
+
+    def _impad(self,
+               img: np.ndarray,
+               *,
+               shape=None,
+               padding=None,
+               pad_val=0,
+               padding_mode='constant') -> np.ndarray:
+        """Pad the given image to a certain shape or pad on all sides with
+        specified padding mode and padding value.
+
+        Args:
+            img (ndarray): Image to be padded.
+            shape (tuple[int]): Expected padding shape (h, w). Default: None.
+            padding (int or tuple[int]): Padding on each border. If a single int is
+                provided this is used to pad all borders. If tuple of length 2 is
+                provided this is the padding on left/right and top/bottom
+                respectively. If a tuple of length 4 is provided this is the
+                padding for the left, top, right and bottom borders respectively.
+                Default: None. Note that `shape` and `padding` cannot both be
+                set.
+            pad_val (Number | Sequence[Number]): Values to be filled in padding
+                areas when padding_mode is 'constant'. Default: 0.
+            padding_mode (str): Type of padding. Should be: constant, edge,
+                reflect or symmetric. Default: constant.
+                - constant: pads with a constant value, this value is specified
+                with pad_val.
+                - edge: pads with the last value at the edge of the image.
+                - reflect: pads with reflection of image without repeating the last
+                value on the edge. For example, padding [1, 2, 3, 4] with 2
+                elements on both sides in reflect mode will result in
+                [3, 2, 1, 2, 3, 4, 3, 2].
+                - symmetric: pads with reflection of image repeating the last value
+                on the edge. For example, padding [1, 2, 3, 4] with 2 elements on
+                both sides in symmetric mode will result in
+                [2, 1, 1, 2, 3, 4, 4, 3]
+
+        Returns:
+            ndarray: The padded image.
+        """
+
+        assert (shape is not None) ^ (padding is not None)
+        if shape is not None:
+            width = max(shape[1] - img.shape[1], 0)
+            height = max(shape[0] - img.shape[0], 0)
+            padding = (0, 0, int(width), int(height))
+
+        # check pad_val
+        import numbers
+        if isinstance(pad_val, tuple):
+            assert len(pad_val) == img.shape[-1]
+        elif not isinstance(pad_val, numbers.Number):
+            raise TypeError('pad_val must be an int or a tuple. '
+                            f'But received {type(pad_val)}')
+
+        # check padding
+        if isinstance(padding, tuple) and len(padding) in [2, 4]:
+            if len(padding) == 2:
+                padding = (padding[0], padding[1], padding[0], padding[1])
+        elif isinstance(padding, numbers.Number):
+            padding = (padding, padding, padding, padding)
+        else:
+            raise ValueError('Padding must be an int or a 2- or 4-element tuple. '
+                            f'But received {padding}')
+
+        # check padding mode
+        assert padding_mode in ['constant', 'edge', 'reflect', 'symmetric']
+
+        border_type = {
+            'constant': cv2.BORDER_CONSTANT,
+            'edge': cv2.BORDER_REPLICATE,
+            'reflect': cv2.BORDER_REFLECT_101,
+            'symmetric': cv2.BORDER_REFLECT
+        }
+        img = cv2.copyMakeBorder(
+            img,
+            padding[1],
+            padding[3],
+            padding[0],
+            padding[2],
+            border_type[padding_mode],
+            value=pad_val)
+
+        return img
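As a usage illustration (hypothetical sizes, not part of the patch), padding an HWC image up to a common (h, w) shape pads only the right/bottom borders, exactly as the shape branch above does:

import numpy as np
import cv2

img = np.zeros((480, 600, 3), dtype=np.uint8)    # hypothetical image
target_h, target_w = 512, 640                    # e.g. the batch-wise max shape
pad_right = max(target_w - img.shape[1], 0)
pad_bottom = max(target_h - img.shape[0], 0)
padded = cv2.copyMakeBorder(img, 0, pad_bottom, 0, pad_right,
                            cv2.BORDER_CONSTANT, value=0)
print(padded.shape)                              # (512, 640, 3)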
+
+    def checkmaxshape(self, samples):
+        maxh, maxw = 0, 0
+        for sample in samples:
+            h, w = sample['im_shape']
+            if h > maxh:
+                maxh = h
+            if w > maxw:
+                maxw = w
+        return (maxh, maxw)
 
     def __call__(self, samples, context=None):
         num_max_boxes = max([len(s['gt_bbox']) for s in samples])
+        num_max_boxes = max(self.minimum_gtnum, num_max_boxes)
+        if self.pad_img:
+            maxshape = self.checkmaxshape(samples)
         for sample in samples:
+            if self.pad_img:
+                img = sample['image']
+                padimg = self._impad(img, shape=maxshape)
+                sample['image'] = padimg
             if self.return_gt_mask:
                 sample['pad_gt_mask'] = np.zeros(
                     (num_max_boxes, 1), dtype=np.float32)
@@ -1110,6 +1244,17 @@ class PadGT(BaseOperator):
                 if num_gt > 0:
                     pad_diff[:num_gt] = sample['difficult']
                 sample['difficult'] = pad_diff
+            if 'gt_joints' in sample:
+                num_joints = sample['gt_joints'].shape[1]
+                pad_gt_joints = np.zeros((num_max_boxes, num_joints, 3), dtype=np.float32)
+                if num_gt > 0:
+                    pad_gt_joints[:num_gt] = sample['gt_joints']
+                sample['gt_joints'] = pad_gt_joints
+            if 'gt_areas' in sample:
+                pad_gt_areas = np.zeros((num_max_boxes, 1), dtype=np.float32)
+                if num_gt > 0:
+                    pad_gt_areas[:num_gt, 0] = sample['gt_areas']
+                sample['gt_areas'] = pad_gt_areas
         return samples
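For reference, a minimal sketch (hypothetical boxes) of the gt padding above: boxes are padded to a common length and 'pad_gt_mask' marks which rows are real (1) versus padding (0).

import numpy as np

gt_bbox = np.array([[0., 0., 10., 10.],
                    [5., 5., 20., 20.]], dtype=np.float32)
num_max_boxes = 4                                # e.g. the maximum over the batch
pad_bbox = np.zeros((num_max_boxes, 4), dtype=np.float32)
pad_bbox[:len(gt_bbox)] = gt_bbox
pad_gt_mask = np.zeros((num_max_boxes, 1), dtype=np.float32)
pad_gt_mask[:len(gt_bbox)] = 1.0
print(pad_gt_mask.ravel())                       # [1. 1. 0. 0.]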
 
 
@@ -1165,3 +1310,175 @@ class PadRGT(BaseOperator):
                                num_gt)
 
         return samples
+
+
+@register_op
+class Gt2CenterTrackTarget(BaseOperator):
+    __shared__ = ['num_classes']
+    """Gt2CenterTrackTarget
+    Generate CenterTrack targets by ground truth
+    Args:
+        num_classes (int): The number of classes, 1 by default.
+        down_ratio (int): The downsample ratio between the output feature map
+                          and the input image.
+        max_objs (int): The maximum number of objects detected, 256 by default.
+    """
+
+    def __init__(self,
+                 num_classes=1,
+                 down_ratio=4,
+                 max_objs=256,
+                 hm_disturb=0.05,
+                 lost_disturb=0.4,
+                 fp_disturb=0.1,
+                 pre_hm=True,
+                 add_tracking=True,
+                 add_ltrb_amodal=True):
+        super(Gt2CenterTrackTarget, self).__init__()
+        self.nc = num_classes
+        self.down_ratio = down_ratio
+        self.max_objs = max_objs
+
+        self.hm_disturb = hm_disturb
+        self.lost_disturb = lost_disturb
+        self.fp_disturb = fp_disturb
+        self.pre_hm = pre_hm
+        self.add_tracking = add_tracking
+        self.add_ltrb_amodal = add_ltrb_amodal
+
+    def _get_pre_dets(self, input_h, input_w, trans_input_pre, gt_bbox_pre,
+                      gt_class_pre, gt_track_id_pre):
+        hm_h, hm_w = input_h, input_w
+        return_hm = self.pre_hm
+        pre_hm = np.zeros(
+            (1, hm_h, hm_w), dtype=np.float32) if return_hm else None
+        pre_cts, track_ids = [], []
+
+        for i, (
+                bbox, cls, track_id
+        ) in enumerate(zip(gt_bbox_pre, gt_class_pre, gt_track_id_pre)):
+            cls = int(cls)
+            bbox[:2] = affine_transform(bbox[:2], trans_input_pre)
+            bbox[2:] = affine_transform(bbox[2:], trans_input_pre)
+            bbox[[0, 2]] = np.clip(bbox[[0, 2]], 0, hm_w - 1)
+            bbox[[1, 3]] = np.clip(bbox[[1, 3]], 0, hm_h - 1)
+            h, w = bbox[3] - bbox[1], bbox[2] - bbox[0]
+            max_rad = 1
+            if (h > 0 and w > 0):
+                radius = gaussian_radius((math.ceil(h), math.ceil(w)), 0.7)
+                radius = max(0, int(radius))
+                max_rad = max(max_rad, radius)
+                ct = np.array(
+                    [(bbox[0] + bbox[2]) / 2, (bbox[1] + bbox[3]) / 2],
+                    dtype=np.float32)
+                ct0 = ct.copy()
+                conf = 1
+
+                ct[0] = ct[0] + np.random.randn() * self.hm_disturb * w
+                ct[1] = ct[1] + np.random.randn() * self.hm_disturb * h
+                conf = 1 if np.random.rand() > self.lost_disturb else 0
+
+                ct_int = ct.astype(np.int32)
+                if conf == 0:
+                    pre_cts.append(ct / self.down_ratio)
+                else:
+                    pre_cts.append(ct0 / self.down_ratio)
+
+                track_ids.append(track_id)
+                if return_hm:
+                    draw_umich_gaussian(pre_hm[0], ct_int, radius, k=conf)
+
+                if np.random.rand() < self.fp_disturb and return_hm:
+                    ct2 = ct0.copy()
+                    # Hard code heatmap disturb ratio, haven't tried other numbers.
+                    ct2[0] = ct2[0] + np.random.randn() * 0.05 * w
+                    ct2[1] = ct2[1] + np.random.randn() * 0.05 * h
+                    ct2_int = ct2.astype(np.int32)
+                    draw_umich_gaussian(pre_hm[0], ct2_int, radius, k=conf)
+        return pre_hm, pre_cts, track_ids
+
+    def __call__(self, sample, context=None):
+        input_h, input_w = sample['image'].shape[1:]
+        output_h = input_h // self.down_ratio
+        output_w = input_w // self.down_ratio
+        gt_bbox = sample['gt_bbox']
+        gt_class = sample['gt_class']
+
+        # init
+        hm = np.zeros((self.nc, output_h, output_w), dtype=np.float32)
+        wh = np.zeros((self.max_objs, 2), dtype=np.float32)
+        reg = np.zeros((self.max_objs, 2), dtype=np.float32)
+        ind = np.zeros((self.max_objs), dtype=np.int64)
+        reg_mask = np.zeros((self.max_objs), dtype=np.int32)
+        if self.add_tracking:
+            tr = np.zeros((self.max_objs, 2), dtype=np.float32)
+        if self.add_ltrb_amodal:
+            ltrb_amodal = np.zeros((self.max_objs, 4), dtype=np.float32)
+
+        trans_output = get_affine_transform(
+            center=sample['center'],
+            input_size=[sample['scale'], sample['scale']],
+            rot=0,
+            output_size=[output_w, output_h])
+
+        pre_hm, pre_cts, track_ids = self._get_pre_dets(
+            input_h, input_w, sample['trans_input'], sample['pre_gt_bbox'],
+            sample['pre_gt_class'], sample['pre_gt_track_id'])
+
+        for i, (bbox, cls) in enumerate(zip(gt_bbox, gt_class)):
+            cls = int(cls)
+            rect = np.array(
+                [[bbox[0], bbox[1]], [bbox[0], bbox[3]], [bbox[2], bbox[3]],
+                 [bbox[2], bbox[1]]],
+                dtype=np.float32)
+            for t in range(4):
+                rect[t] = affine_transform(rect[t], trans_output)
+                bbox[:2] = rect[:, 0].min(), rect[:, 1].min()
+                bbox[2:] = rect[:, 0].max(), rect[:, 1].max()
+
+            bbox_amodal = copy.deepcopy(bbox)
+            bbox[[0, 2]] = np.clip(bbox[[0, 2]], 0, output_w - 1)
+            bbox[[1, 3]] = np.clip(bbox[[1, 3]], 0, output_h - 1)
+
+            h, w = bbox[3] - bbox[1], bbox[2] - bbox[0]
+            if h > 0 and w > 0:
+                radius = gaussian_radius((math.ceil(h), math.ceil(w)), 0.7)
+                radius = max(0, int(radius))
+                ct = np.array(
+                    [(bbox[0] + bbox[2]) / 2, (bbox[1] + bbox[3]) / 2],
+                    dtype=np.float32)
+                ct_int = ct.astype(np.int32)
+
+                # get hm,wh,reg,ind,ind_mask
+                draw_umich_gaussian(hm[cls], ct_int, radius)
+                wh[i] = 1. * w, 1. * h
+                reg[i] = ct - ct_int
+                ind[i] = ct_int[1] * output_w + ct_int[0]
+                reg_mask[i] = 1
+                if self.add_tracking:
+                    if sample['gt_track_id'][i] in track_ids:
+                        pre_ct = pre_cts[track_ids.index(sample['gt_track_id'][
+                            i])]
+                        tr[i] = pre_ct - ct_int
+
+                if self.add_ltrb_amodal:
+                    ltrb_amodal[i] = \
+                        bbox_amodal[0] - ct_int[0], bbox_amodal[1] - ct_int[1], \
+                        bbox_amodal[2] - ct_int[0], bbox_amodal[3] - ct_int[1]
+
+        new_sample = {'image': sample['image']}
+        new_sample['index'] = ind
+        new_sample['index_mask'] = reg_mask
+        new_sample['heatmap'] = hm
+        new_sample['size'] = wh
+        new_sample['offset'] = reg
+        if self.add_tracking:
+            new_sample['tracking'] = tr
+        if self.add_ltrb_amodal:
+            new_sample['ltrb_amodal'] = ltrb_amodal
+
+        new_sample['pre_image'] = sample['pre_image']
+        new_sample['pre_hm'] = pre_hm
+
+        del sample
+        return new_sample
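As an illustration (hypothetical centers and box), the 'tracking' offset above points from the current integer center back to the down-sampled center of the same track id in the previous frame, while 'ltrb_amodal' stores the unclipped box relative to that integer center:

import numpy as np

ct_int = np.array([40, 30], dtype=np.int32)        # current-frame integer center
pre_ct = np.array([37.5, 29.0], dtype=np.float32)  # previous-frame center / down_ratio
tracking = pre_ct - ct_int                         # [-2.5, -1.0]
bbox_amodal = np.array([20., 10., 60., 50.])       # hypothetical unclipped box
ltrb_amodal = np.array([bbox_amodal[0] - ct_int[0], bbox_amodal[1] - ct_int[1],
                        bbox_amodal[2] - ct_int[0], bbox_amodal[3] - ct_int[1]])
print(tracking, ltrb_amodal)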

+ 832 - 85
paddlers/models/ppdet/data/transform/keypoint_operators.py

@@ -36,19 +36,12 @@ logger = setup_logger(__name__)
 registered_ops = []
 
 __all__ = [
-    'RandomAffine',
-    'KeyPointFlip',
-    'TagGenerate',
-    'ToHeatmaps',
-    'NormalizePermute',
-    'EvalAffine',
-    'RandomFlipHalfBodyTransform',
-    'TopDownAffine',
-    'ToHeatmapsTopDown',
-    'ToHeatmapsTopDown_DARK',
-    'ToHeatmapsTopDown_UDP',
-    'TopDownEvalAffine',
-    'AugmentationbyInformantionDropping',
+    'RandomAffine', 'KeyPointFlip', 'TagGenerate', 'ToHeatmaps',
+    'NormalizePermute', 'EvalAffine', 'RandomFlipHalfBodyTransform',
+    'TopDownAffine', 'ToHeatmapsTopDown', 'ToHeatmapsTopDown_DARK',
+    'ToHeatmapsTopDown_UDP', 'TopDownEvalAffine',
+    'AugmentationbyInformantionDropping', 'SinglePoseAffine', 'NoiseJitter',
+    'FlipPose', 'PETR_Resize'
 ]
 
 
@@ -72,38 +65,77 @@ class KeyPointFlip(object):
 
     """
 
-    def __init__(self, flip_permutation, hmsize, flip_prob=0.5):
+    def __init__(self, flip_permutation, hmsize=None, flip_prob=0.5):
         super(KeyPointFlip, self).__init__()
         assert isinstance(flip_permutation, Sequence)
         self.flip_permutation = flip_permutation
         self.flip_prob = flip_prob
         self.hmsize = hmsize
 
-    def __call__(self, records):
-        image = records['image']
-        kpts_lst = records['joints']
-        mask_lst = records['mask']
-        flip = np.random.random() < self.flip_prob
-        if flip:
-            image = image[:, ::-1]
-            for idx, hmsize in enumerate(self.hmsize):
-                if len(mask_lst) > idx:
-                    mask_lst[idx] = mask_lst[idx][:, ::-1]
+    def _flipjoints(self, records, sizelst):
+        '''
+        records['gt_joints'] is Sequence in higherhrnet
+        '''
+        if not ('gt_joints' in records and len(records['gt_joints']) > 0):
+            return records
+
+        kpts_lst = records['gt_joints']
+        if isinstance(kpts_lst, Sequence):
+            for idx, hmsize in enumerate(sizelst):
                 if kpts_lst[idx].ndim == 3:
                     kpts_lst[idx] = kpts_lst[idx][:, self.flip_permutation]
                 else:
                     kpts_lst[idx] = kpts_lst[idx][self.flip_permutation]
                 kpts_lst[idx][..., 0] = hmsize - kpts_lst[idx][..., 0]
-                kpts_lst[idx] = kpts_lst[idx].astype(np.int64)
-                kpts_lst[idx][kpts_lst[idx][..., 0] >= hmsize, 2] = 0
-                kpts_lst[idx][kpts_lst[idx][..., 1] >= hmsize, 2] = 0
-                kpts_lst[idx][kpts_lst[idx][..., 0] < 0, 2] = 0
-                kpts_lst[idx][kpts_lst[idx][..., 1] < 0, 2] = 0
-        records['image'] = image
-        records['joints'] = kpts_lst
+        else:
+            hmsize = sizelst[0]
+            if kpts_lst.ndim == 3:
+                kpts_lst = kpts_lst[:, self.flip_permutation]
+            else:
+                kpts_lst = kpts_lst[self.flip_permutation]
+            kpts_lst[..., 0] = hmsize - kpts_lst[..., 0]
+
+        records['gt_joints'] = kpts_lst
+        return records
+
+    def _flipmask(self, records, sizelst):
+        if not 'mask' in records:
+            return records
+
+        mask_lst = records['mask']
+        for idx, hmsize in enumerate(sizelst):
+            if len(mask_lst) > idx:
+                mask_lst[idx] = mask_lst[idx][:, ::-1]
         records['mask'] = mask_lst
         return records
 
+    def _flipbbox(self, records, sizelst):
+        if not 'gt_bbox' in records:
+            return records
+
+        bboxes = records['gt_bbox']
+        hmsize = sizelst[0]
+        bboxes[:, 0::2] = hmsize - bboxes[:, 0::2][:, ::-1]
+        bboxes[:, 0::2] = np.clip(bboxes[:, 0::2], 0, hmsize)
+        records['gt_bbox'] = bboxes
+        return records
+
+    def __call__(self, records):
+        flip = np.random.random() < self.flip_prob
+        if flip:
+            image = records['image']
+            image = image[:, ::-1]
+            records['image'] = image
+            if self.hmsize is None:
+                sizelst = [image.shape[1]]
+            else:
+                sizelst = self.hmsize
+            self._flipjoints(records, sizelst)
+            self._flipmask(records, sizelst)
+            self._flipbbox(records, sizelst)
+
+        return records
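For reference, a small sketch (hypothetical width and permutation) of the horizontal flip above: left/right joints are swapped via the flip permutation and x-coordinates are mirrored; boxes mirror x and swap x1/x2 so that x1 <= x2 still holds.

import numpy as np

width = 100                                        # hypothetical image/heatmap width
flip_permutation = [1, 0, 2]                       # hypothetical left/right swap
joints = np.array([[10., 5., 1.], [90., 5., 1.], [50., 8., 1.]])
flipped = joints[flip_permutation].copy()
flipped[:, 0] = width - flipped[:, 0]

bbox = np.array([[20., 10., 60., 40.]])
bbox[:, 0::2] = width - bbox[:, 0::2][:, ::-1]     # mirrored box stays x1 <= x2
print(flipped)
print(bbox)                                        # [[40. 10. 80. 40.]]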
+
 
 @register_keypointop
 class RandomAffine(object):
@@ -115,7 +147,7 @@ class RandomAffine(object):
         max_scale (list[2]): the scale range to apply, transform range is [min, max]
         max_shift (float): the max absolute shift ratio to apply, transform range is [-max_shift*imagesize, max_shift*imagesize]
         hmsize (list[2]): output heatmap's shape list of different scale outputs of higherhrnet
-        trainsize (int): the standard length used to train, the 'scale_type' of [h,w] will be resize to trainsize for standard
+        trainsize (list[2]): the standard length used to train, the 'scale_type' of [h,w] will be resized to trainsize for standard
         scale_type (str): the length of [h,w] to use for trainsize, chosen between 'short' and 'long'
         records(dict): the dict contained the image, mask and coords
 
@@ -128,9 +160,10 @@ class RandomAffine(object):
                  max_degree=30,
                  scale=[0.75, 1.5],
                  max_shift=0.2,
-                 hmsize=[128, 256],
-                 trainsize=512,
-                 scale_type='short'):
+                 hmsize=None,
+                 trainsize=[512, 512],
+                 scale_type='short',
+                 boldervalue=[114, 114, 114]):
         super(RandomAffine, self).__init__()
         self.max_degree = max_degree
         self.min_scale = scale[0]
@@ -139,8 +172,9 @@ class RandomAffine(object):
         self.hmsize = hmsize
         self.trainsize = trainsize
         self.scale_type = scale_type
+        self.boldervalue = boldervalue
 
-    def _get_affine_matrix(self, center, scale, res, rot=0):
+    def _get_affine_matrix_old(self, center, scale, res, rot=0):
         """Generate transformation matrix."""
         h = scale
         t = np.zeros((3, 3), dtype=np.float32)
@@ -166,21 +200,94 @@ class RandomAffine(object):
             t = np.dot(t_inv, np.dot(rot_mat, np.dot(t_mat, t)))
         return t
 
+    def _get_affine_matrix(self, center, scale, res, rot=0):
+        """Generate transformation matrix."""
+        w, h = scale
+        t = np.zeros((3, 3), dtype=np.float32)
+        t[0, 0] = float(res[0]) / w
+        t[1, 1] = float(res[1]) / h
+        t[0, 2] = res[0] * (-float(center[0]) / w + .5)
+        t[1, 2] = res[1] * (-float(center[1]) / h + .5)
+        t[2, 2] = 1
+        if rot != 0:
+            rot = -rot  # To match direction of rotation from cropping
+            rot_mat = np.zeros((3, 3), dtype=np.float32)
+            rot_rad = rot * np.pi / 180
+            sn, cs = np.sin(rot_rad), np.cos(rot_rad)
+            rot_mat[0, :2] = [cs, -sn]
+            rot_mat[1, :2] = [sn, cs]
+            rot_mat[2, 2] = 1
+            # Need to rotate around center
+            t_mat = np.eye(3)
+            t_mat[0, 2] = -res[0] / 2
+            t_mat[1, 2] = -res[1] / 2
+            t_inv = t_mat.copy()
+            t_inv[:2, 2] *= -1
+            t = np.dot(t_inv, np.dot(rot_mat, np.dot(t_mat, t)))
+        return t
+
+    def _affine_joints_mask(self,
+                            degree,
+                            center,
+                            roi_size,
+                            dsize,
+                            keypoints=None,
+                            heatmap_mask=None,
+                            gt_bbox=None):
+        kpts = None
+        mask = None
+        bbox = None
+        mask_affine_mat = self._get_affine_matrix(center, roi_size, dsize,
+                                                  degree)[:2]
+        if heatmap_mask is not None:
+            mask = cv2.warpAffine(heatmap_mask, mask_affine_mat, dsize)
+            mask = ((mask / 255) > 0.5).astype(np.float32)
+        if keypoints is not None:
+            kpts = copy.deepcopy(keypoints)
+            kpts[..., 0:2] = warp_affine_joints(kpts[..., 0:2].copy(),
+                                                mask_affine_mat)
+            kpts[(kpts[..., 0]) > dsize[0], :] = 0
+            kpts[(kpts[..., 1]) > dsize[1], :] = 0
+            kpts[(kpts[..., 0]) < 0, :] = 0
+            kpts[(kpts[..., 1]) < 0, :] = 0
+        if gt_bbox is not None:
+            temp_bbox = gt_bbox[:, [0, 3, 2, 1]]
+            cat_bbox = np.concatenate((gt_bbox, temp_bbox), axis=-1)
+            gt_bbox_warped = warp_affine_joints(cat_bbox, mask_affine_mat)
+            bbox = np.zeros_like(gt_bbox)
+            bbox[:, 0] = gt_bbox_warped[:, 0::2].min(1).clip(0, dsize[0])
+            bbox[:, 2] = gt_bbox_warped[:, 0::2].max(1).clip(0, dsize[0])
+            bbox[:, 1] = gt_bbox_warped[:, 1::2].min(1).clip(0, dsize[1])
+            bbox[:, 3] = gt_bbox_warped[:, 1::2].max(1).clip(0, dsize[1])
+        return kpts, mask, bbox
+
     def __call__(self, records):
         image = records['image']
-        keypoints = records['joints']
-        heatmap_mask = records['mask']
+        shape = np.array(image.shape[:2][::-1])
+        keypoints = None
+        heatmap_mask = None
+        gt_bbox = None
+        if 'gt_joints' in records:
+            keypoints = records['gt_joints']
+
+        if 'mask' in records:
+            heatmap_mask = records['mask']
+            heatmap_mask *= 255
+
+        if 'gt_bbox' in records:
+            gt_bbox = records['gt_bbox']
 
         degree = (np.random.random() * 2 - 1) * self.max_degree
-        shape = np.array(image.shape[:2][::-1])
         center = np.array(np.array(shape) / 2)
 
         aug_scale = np.random.random() * (self.max_scale - self.min_scale
                                           ) + self.min_scale
         if self.scale_type == 'long':
-            scale = max(shape[0], shape[1]) / 1.0
+            scale = np.array([max(shape[0], shape[1]) / 1.0] * 2)
         elif self.scale_type == 'short':
-            scale = min(shape[0], shape[1]) / 1.0
+            scale = np.array([min(shape[0], shape[1]) / 1.0] * 2)
+        elif self.scale_type == 'wh':
+            scale = shape
         else:
             raise ValueError('Unknown scale type: {}'.format(self.scale_type))
         roi_size = aug_scale * scale
@@ -188,44 +295,55 @@ class RandomAffine(object):
         dy = int(0)
         if self.max_shift > 0:
 
-            dx = np.random.randint(-self.max_shift * roi_size,
-                                   self.max_shift * roi_size)
-            dy = np.random.randint(-self.max_shift * roi_size,
-                                   self.max_shift * roi_size)
+            dx = np.random.randint(-self.max_shift * roi_size[0],
+                                   self.max_shift * roi_size[0])
+            dy = np.random.randint(-self.max_shift * roi_size[1],
+                                   self.max_shift * roi_size[1])
 
         center += np.array([dx, dy])
         input_size = 2 * center
+        if self.trainsize != -1:
+            dsize = self.trainsize
+            imgshape = (dsize)
+        else:
+            dsize = scale
+            imgshape = (shape.tolist())
 
-        keypoints[..., :2] *= shape
-        heatmap_mask *= 255
-        kpts_lst = []
-        mask_lst = []
-
-        image_affine_mat = self._get_affine_matrix(
-            center, roi_size, (self.trainsize, self.trainsize), degree)[:2]
+        image_affine_mat = self._get_affine_matrix(center, roi_size, dsize,
+                                                   degree)[:2]
         image = cv2.warpAffine(
             image,
-            image_affine_mat, (self.trainsize, self.trainsize),
-            flags=cv2.INTER_LINEAR)
+            image_affine_mat,
+            imgshape,
+            flags=cv2.INTER_LINEAR,
+            borderValue=self.boldervalue)
+
+        if self.hmsize is None:
+            kpts, mask, gt_bbox = self._affine_joints_mask(
+                degree, center, roi_size, dsize, keypoints, heatmap_mask,
+                gt_bbox)
+            records['image'] = image
+            if kpts is not None: records['gt_joints'] = kpts
+            if mask is not None: records['mask'] = mask
+            if gt_bbox is not None: records['gt_bbox'] = gt_bbox
+            return records
+
+        kpts_lst = []
+        mask_lst = []
         for hmsize in self.hmsize:
-            kpts = copy.deepcopy(keypoints)
-            mask_affine_mat = self._get_affine_matrix(
-                center, roi_size, (hmsize, hmsize), degree)[:2]
-            if heatmap_mask is not None:
-                mask = cv2.warpAffine(heatmap_mask, mask_affine_mat,
-                                      (hmsize, hmsize))
-                mask = ((mask / 255) > 0.5).astype(np.float32)
-            kpts[..., 0:2] = warp_affine_joints(kpts[..., 0:2].copy(),
-                                                mask_affine_mat)
-            kpts[np.trunc(kpts[..., 0]) >= hmsize, 2] = 0
-            kpts[np.trunc(kpts[..., 1]) >= hmsize, 2] = 0
-            kpts[np.trunc(kpts[..., 0]) < 0, 2] = 0
-            kpts[np.trunc(kpts[..., 1]) < 0, 2] = 0
+            kpts, mask, gt_bbox = self._affine_joints_mask(
+                degree, center, roi_size, [hmsize, hmsize], keypoints,
+                heatmap_mask, gt_bbox)
             kpts_lst.append(kpts)
             mask_lst.append(mask)
         records['image'] = image
-        records['joints'] = kpts_lst
-        records['mask'] = mask_lst
+
+        if 'gt_joints' in records:
+            records['gt_joints'] = kpts_lst
+        if 'mask' in records:
+            records['mask'] = mask_lst
+        if 'gt_bbox' in records:
+            records['gt_bbox'] = gt_bbox
         return records
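For reference, a hedged standalone sketch (hypothetical sizes) of the (w, h)-scale affine matrix used above: scale to the output resolution, re-center, then optionally rotate around the output center.

import numpy as np

def affine_matrix(center, scale_wh, res_wh, rot_deg=0.0):
    w, h = scale_wh
    t = np.array([[res_wh[0] / w, 0., res_wh[0] * (-center[0] / w + 0.5)],
                  [0., res_wh[1] / h, res_wh[1] * (-center[1] / h + 0.5)],
                  [0., 0., 1.]], dtype=np.float32)
    if rot_deg != 0:
        r = np.deg2rad(-rot_deg)                  # match the cropping direction
        rot = np.array([[np.cos(r), -np.sin(r), 0.],
                        [np.sin(r), np.cos(r), 0.],
                        [0., 0., 1.]], dtype=np.float32)
        shift, unshift = np.eye(3), np.eye(3)
        shift[:2, 2] = [-res_wh[0] / 2, -res_wh[1] / 2]
        unshift[:2, 2] = [res_wh[0] / 2, res_wh[1] / 2]
        t = unshift @ rot @ shift @ t
    return t

print(affine_matrix([320, 240], [640, 480], [512, 512], rot_deg=15))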
 
 
@@ -258,9 +376,10 @@ class EvalAffine(object):
         if mask is not None:
             mask = cv2.warpAffine(mask, trans, size_resized)
             records['mask'] = mask
-        if 'joints' in records:
-            del records['joints']
+        if 'gt_joints' in records:
+            del records['gt_joints']
         records['image'] = image_resized
+        records['scale_factor'] = self.size / min(h, w)
         return records
 
 
@@ -310,7 +429,7 @@ class TagGenerate(object):
         self.num_joints = num_joints
 
     def __call__(self, records):
-        kpts_lst = records['joints']
+        kpts_lst = records['gt_joints']
         kpts = kpts_lst[0]
         tagmap = np.zeros((self.max_people, self.num_joints, 4), dtype=np.int64)
         inds = np.where(kpts[..., 2] > 0)
@@ -322,7 +441,7 @@ class TagGenerate(object):
         tagmap[p, j, 2] = visible[..., 0]  # x
         tagmap[p, j, 3] = 1
         records['tagmap'] = tagmap
-        del records['joints']
+        del records['gt_joints']
         return records
 
 
@@ -356,7 +475,7 @@ class ToHeatmaps(object):
         self.gaussian = np.exp(-((x - x0)**2 + (y - y0)**2) / (2 * sigma**2))
 
     def __call__(self, records):
-        kpts_lst = records['joints']
+        kpts_lst = records['gt_joints']
         mask_lst = records['mask']
         for idx, hmsize in enumerate(self.hmsize):
             mask = mask_lst[idx]
@@ -477,7 +596,7 @@ class RandomFlipHalfBodyTransform(object):
 
     def __call__(self, records):
         image = records['image']
-        joints = records['joints']
+        joints = records['gt_joints']
         joints_vis = records['joints_vis']
         c = records['center']
         s = records['scale']
@@ -500,7 +619,7 @@ class RandomFlipHalfBodyTransform(object):
                 joints, joints_vis, image.shape[1], self.flip_pairs)
             c[0] = image.shape[1] - c[0] - 1
         records['image'] = image
-        records['joints'] = joints
+        records['gt_joints'] = joints
         records['joints_vis'] = joints_vis
         records['center'] = c
         records['scale'] = s
@@ -560,7 +679,7 @@ class AugmentationbyInformantionDropping(object):
 
     def __call__(self, records):
         img = records['image']
-        joints = records['joints']
+        joints = records['gt_joints']
         joints_vis = records['joints_vis']
         if np.random.rand() < self.prob_cutout:
             img = self._cutout(img, joints, joints_vis)
@@ -588,7 +707,7 @@ class TopDownAffine(object):
 
     def __call__(self, records):
         image = records['image']
-        joints = records['joints']
+        joints = records['gt_joints']
         joints_vis = records['joints_vis']
         rot = records['rotate'] if "rotate" in records else 0
         if self.use_udp:
@@ -613,8 +732,171 @@ class TopDownAffine(object):
                     joints[i, 0:2] = affine_transform(joints[i, 0:2], trans)
 
         records['image'] = image
-        records['joints'] = joints
+        records['gt_joints'] = joints
+
+        return records
+
+
+@register_keypointop
+class SinglePoseAffine(object):
+    """apply affine transform to image and coords
+
+    Args:
+        trainsize (list): [w, h], the standard size used to train
+        use_udp (bool): whether to use Unbiased Data Processing.
+        records(dict): the dict contained the image and coords
+
+    Returns:
+        records (dict): contain the image and coords after being transformed
+
+    """
+
+    def __init__(self,
+                 trainsize,
+                 rotate=[1.0, 30],
+                 scale=[1.0, 0.25],
+                 use_udp=False):
+        self.trainsize = trainsize
+        self.use_udp = use_udp
+        self.rot_prob = rotate[0]
+        self.rot_range = rotate[1]
+        self.scale_prob = scale[0]
+        self.scale_ratio = scale[1]
+
+    def __call__(self, records):
+        image = records['image']
+        if 'joints_2d' in records:
+            joints = records['joints_2d'] if 'joints_2d' in records else None
+            joints_vis = records[
+                'joints_vis'] if 'joints_vis' in records else np.ones(
+                    (len(joints), 1))
+        rot = 0
+        s = 1.
+        if np.random.random() < self.rot_prob:
+            rot = np.clip(np.random.randn() * self.rot_range,
+                          -self.rot_range * 2, self.rot_range * 2)
+        if np.random.random() < self.scale_prob:
+            s = np.clip(np.random.randn() * self.scale_ratio + 1,
+                        1 - self.scale_ratio, 1 + self.scale_ratio)
+
+        if self.use_udp:
+            trans = get_warp_matrix(
+                rot,
+                np.array(records['bbox_center']) * 2.0,
+                [self.trainsize[0] - 1.0, self.trainsize[1] - 1.0],
+                records['bbox_scale'] * 200.0 * s)
+            image = cv2.warpAffine(
+                image,
+                trans, (int(self.trainsize[0]), int(self.trainsize[1])),
+                flags=cv2.INTER_LINEAR)
+            if 'joints_2d' in records:
+                joints[:, 0:2] = warp_affine_joints(joints[:, 0:2].copy(),
+                                                    trans)
+        else:
+            trans = get_affine_transform(
+                np.array(records['bbox_center']),
+                records['bbox_scale'] * s * 200, rot, self.trainsize)
+            image = cv2.warpAffine(
+                image,
+                trans, (int(self.trainsize[0]), int(self.trainsize[1])),
+                flags=cv2.INTER_LINEAR)
+            if 'joints_2d' in records:
+                for i in range(len(joints)):
+                    if joints_vis[i, 0] > 0.0:
+                        joints[i, 0:2] = affine_transform(joints[i, 0:2], trans)
+
+        if 'joints_3d' in records:
+            pose3d = records['joints_3d']
+            if not rot == 0:
+                trans_3djoints = np.eye(3)
+                rot_rad = -rot * np.pi / 180
+                sn, cs = np.sin(rot_rad), np.cos(rot_rad)
+                trans_3djoints[0, :2] = [cs, -sn]
+                trans_3djoints[1, :2] = [sn, cs]
+                pose3d[:, :3] = np.einsum('ij,kj->ki', trans_3djoints,
+                                          pose3d[:, :3])
+                records['joints_3d'] = pose3d
+
+        records['image'] = image
+        if 'joints_2d' in records:
+            records['joints_2d'] = joints
+
+        return records
+
+
+@register_keypointop
+class NoiseJitter(object):
+    """apply NoiseJitter to image
+
+    Args:
+        noise_factor (float): the noise factor ratio used to generate the jitter
+
+    Returns:
+        records (dict): contain the image and coords after being transformed
+
+    """
 
+    def __init__(self, noise_factor=0.4):
+        self.noise_factor = noise_factor
+
+    def __call__(self, records):
+        self.pn = np.random.uniform(1 - self.noise_factor,
+                                    1 + self.noise_factor, 3)
+        rgb_img = records['image']
+        rgb_img[:, :, 0] = np.minimum(
+            255.0, np.maximum(0.0, rgb_img[:, :, 0] * self.pn[0]))
+        rgb_img[:, :, 1] = np.minimum(
+            255.0, np.maximum(0.0, rgb_img[:, :, 1] * self.pn[1]))
+        rgb_img[:, :, 2] = np.minimum(
+            255.0, np.maximum(0.0, rgb_img[:, :, 2] * self.pn[2]))
+        records['image'] = rgb_img
+        return records
+
+
+@register_keypointop
+class FlipPose(object):
+    """random apply flip to image
+
+    Args:
+        noise_factor (float): the noise factor ratio used to generate the jitter
+
+    Returns:
+        records (dict): contain the image and coords after tranformed
+
+    """
+
+    def __init__(self, flip_prob=0.5, img_res=224, num_joints=14):
+        self.flip_prob = flip_prob
+        self.img_res = img_res
+        if num_joints == 24:
+            self.perm = [
+                5, 4, 3, 2, 1, 0, 11, 10, 9, 8, 7, 6, 12, 13, 14, 15, 16, 17,
+                18, 19, 21, 20, 23, 22
+            ]
+        elif num_joints == 14:
+            self.perm = [5, 4, 3, 2, 1, 0, 11, 10, 9, 8, 7, 6, 12, 13]
+        else:
+            print("error num_joints in flip :{}".format(num_joints))
+
+    def __call__(self, records):
+
+        if np.random.random() < self.flip_prob:
+            img = records['image']
+            img = np.fliplr(img)
+
+            if 'joints_2d' in records:
+                joints_2d = records['joints_2d']
+                joints_2d = joints_2d[self.perm]
+                joints_2d[:, 0] = self.img_res - joints_2d[:, 0]
+                records['joints_2d'] = joints_2d
+
+            if 'joints_3d' in records:
+                joints_3d = records['joints_3d']
+                joints_3d = joints_3d[self.perm]
+                joints_3d[:, 0] = -joints_3d[:, 0]
+                records['joints_3d'] = joints_3d
+
+            records['image'] = img
         return records
 
 
@@ -686,7 +968,7 @@ class ToHeatmapsTopDown(object):
             https://github.com/leoxiaobin/deep-high-resolution-net.pytorch
             Copyright (c) Microsoft, under the MIT License.
         """
-        joints = records['joints']
+        joints = records['gt_joints']
         joints_vis = records['joints_vis']
         num_joints = joints.shape[0]
         image_size = np.array(
@@ -729,7 +1011,7 @@ class ToHeatmapsTopDown(object):
                     0]:g_y[1], g_x[0]:g_x[1]]
         records['target'] = target
         records['target_weight'] = target_weight
-        del records['joints'], records['joints_vis']
+        del records['gt_joints'], records['joints_vis']
 
         return records
 
@@ -754,7 +1036,7 @@ class ToHeatmapsTopDown_DARK(object):
         self.sigma = sigma
 
     def __call__(self, records):
-        joints = records['joints']
+        joints = records['gt_joints']
         joints_vis = records['joints_vis']
         num_joints = joints.shape[0]
         image_size = np.array(
@@ -787,7 +1069,7 @@ class ToHeatmapsTopDown_DARK(object):
                     (x - mu_x)**2 + (y - mu_y)**2) / (2 * self.sigma**2))
         records['target'] = target
         records['target_weight'] = target_weight
-        del records['joints'], records['joints_vis']
+        del records['gt_joints'], records['joints_vis']
 
         return records
 
@@ -816,7 +1098,7 @@ class ToHeatmapsTopDown_UDP(object):
         self.sigma = sigma
 
     def __call__(self, records):
-        joints = records['joints']
+        joints = records['gt_joints']
         joints_vis = records['joints_vis']
         num_joints = joints.shape[0]
         image_size = np.array(
@@ -861,6 +1143,471 @@ class ToHeatmapsTopDown_UDP(object):
                     0]:g_y[1], g_x[0]:g_x[1]]
         records['target'] = target
         records['target_weight'] = target_weight
-        del records['joints'], records['joints_vis']
+        del records['gt_joints'], records['joints_vis']
 
         return records
+
+
+from typing import Optional, Tuple, Union, List
+import numbers
+
+
+def _scale_size(
+        size: Tuple[int, int],
+        scale: Union[float, int, tuple], ) -> Tuple[int, int]:
+    """Rescale a size by a ratio.
+
+    Args:
+        size (tuple[int]): (w, h).
+        scale (float | tuple(float)): Scaling factor.
+
+    Returns:
+        tuple[int]: scaled size.
+    """
+    if isinstance(scale, (float, int)):
+        scale = (scale, scale)
+    w, h = size
+    return int(w * float(scale[0]) + 0.5), int(h * float(scale[1]) + 0.5)
+
+
+def rescale_size(old_size: tuple,
+                 scale: Union[float, int, tuple],
+                 return_scale: bool=False) -> tuple:
+    """Calculate the new size to be rescaled to.
+
+    Args:
+        old_size (tuple[int]): The old size (w, h) of image.
+        scale (float | tuple[int]): The scaling factor or maximum size.
+            If it is a float number, then the image will be rescaled by this
+            factor, else if it is a tuple of 2 integers, then the image will
+            be rescaled as large as possible within the scale.
+        return_scale (bool): Whether to return the scaling factor besides the
+            rescaled image size.
+
+    Returns:
+        tuple[int]: The new rescaled image size.
+    """
+    w, h = old_size
+    if isinstance(scale, (float, int)):
+        if scale <= 0:
+            raise ValueError(f'Invalid scale {scale}, must be positive.')
+        scale_factor = scale
+    elif isinstance(scale, list):
+        max_long_edge = max(scale)
+        max_short_edge = min(scale)
+        scale_factor = min(max_long_edge / max(h, w),
+                           max_short_edge / min(h, w))
+    else:
+        raise TypeError(
+            f'Scale must be a number or a list of int, but got {type(scale)}')
+
+    new_size = _scale_size((w, h), scale_factor)
+
+    if return_scale:
+        return new_size, scale_factor
+    else:
+        return new_size
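For reference, a hedged sketch (hypothetical sizes) of the [long, short] branch above: the scale factor is the tightest fit that keeps both edges within the given bounds.

old_w, old_h = 1333, 800                           # hypothetical (w, h)
bound = [1000, 600]                                # hypothetical edge bounds
scale_factor = min(max(bound) / max(old_h, old_w),
                   min(bound) / min(old_h, old_w))
new_size = (int(old_w * scale_factor + 0.5), int(old_h * scale_factor + 0.5))
print(scale_factor, new_size)                      # 0.75 (1000, 600)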
+
+
+def imrescale(img: np.ndarray,
+              scale: Union[float, Tuple[int, int]],
+              return_scale: bool=False,
+              interpolation: str='bilinear',
+              backend: Optional[str]=None) -> Union[np.ndarray, Tuple[
+                  np.ndarray, float]]:
+    """Resize image while keeping the aspect ratio.
+
+    Args:
+        img (ndarray): The input image.
+        scale (float | tuple[int]): The scaling factor or maximum size.
+            If it is a float number, then the image will be rescaled by this
+            factor, else if it is a tuple of 2 integers, then the image will
+            be rescaled as large as possible within the scale.
+        return_scale (bool): Whether to return the scaling factor besides the
+            rescaled image.
+        interpolation (str): Same as :func:`resize`.
+        backend (str | None): Same as :func:`resize`.
+
+    Returns:
+        ndarray: The rescaled image.
+    """
+    h, w = img.shape[:2]
+    new_size, scale_factor = rescale_size((w, h), scale, return_scale=True)
+    rescaled_img = imresize(
+        img, new_size, interpolation=interpolation, backend=backend)
+    if return_scale:
+        return rescaled_img, scale_factor
+    else:
+        return rescaled_img
+
+
+def imresize(
+        img: np.ndarray,
+        size: Tuple[int, int],
+        return_scale: bool=False,
+        interpolation: str='bilinear',
+        out: Optional[np.ndarray]=None,
+        backend: Optional[str]=None,
+        interp=cv2.INTER_LINEAR, ) -> Union[Tuple[np.ndarray, float, float],
+                                            np.ndarray]:
+    """Resize image to a given size.
+
+    Args:
+        img (ndarray): The input image.
+        size (tuple[int]): Target size (w, h).
+        return_scale (bool): Whether to return `w_scale` and `h_scale`.
+        interpolation (str): Interpolation method, accepted values are
+            "nearest", "bilinear", "bicubic", "area", "lanczos" for 'cv2'
+            backend, "nearest", "bilinear" for 'pillow' backend.
+        out (ndarray): The output destination.
+        backend (str | None): The image resize backend type. Options are `cv2`,
+            `pillow`, `None`. If backend is None, the global imread_backend
+            specified by ``mmcv.use_backend()`` will be used. Default: None.
+
+    Returns:
+        tuple | ndarray: (`resized_img`, `w_scale`, `h_scale`) or
+        `resized_img`.
+    """
+    h, w = img.shape[:2]
+    if backend is None:
+        backend = imread_backend
+    if backend not in ['cv2', 'pillow']:
+        raise ValueError(f'backend: {backend} is not supported for resize. '
+                         f"Supported backends are 'cv2', 'pillow'")
+
+    if backend == 'pillow':
+        assert img.dtype == np.uint8, 'Pillow backend only support uint8 type'
+        pil_image = Image.fromarray(img)
+        pil_image = pil_image.resize(size, pillow_interp_codes[interpolation])
+        resized_img = np.array(pil_image)
+    else:
+        resized_img = cv2.resize(img, size, dst=out, interpolation=interp)
+    if not return_scale:
+        return resized_img
+    else:
+        w_scale = size[0] / w
+        h_scale = size[1] / h
+        return resized_img, w_scale, h_scale
+
+
+class PETR_Resize:
+    """Resize images & bbox & mask.
+
+    This transform resizes the input image to some scale. Bboxes and masks are
+    then resized with the same scale factor. If the input dict contains the key
+    "scale", then the scale in the input dict is used, otherwise the specified
+    scale in the init method is used. If the input dict contains the key
+    "scale_factor" (if MultiScaleFlipAug does not give img_scale but
+    scale_factor), the actual scale will be computed by image shape and
+    scale_factor.
+
+    `img_scale` can either be a tuple (single-scale) or a list of tuple
+    (multi-scale). There are 3 multiscale modes:
+
+    - ``ratio_range is not None``: randomly sample a ratio from the ratio \
+      range and multiply it with the image scale.
+    - ``ratio_range is None`` and ``multiscale_mode == "range"``: randomly \
+      sample a scale from the multiscale range.
+    - ``ratio_range is None`` and ``multiscale_mode == "value"``: randomly \
+      sample a scale from multiple scales.
+
+    Args:
+        img_scale (tuple or list[tuple]): Images scales for resizing.
+        multiscale_mode (str): Either "range" or "value".
+        ratio_range (tuple[float]): (min_ratio, max_ratio)
+        keep_ratio (bool): Whether to keep the aspect ratio when resizing the
+            image.
+        bbox_clip_border (bool, optional): Whether to clip the objects outside
+            the border of the image. In some dataset like MOT17, the gt bboxes
+            are allowed to cross the border of images. Therefore, we don't
+            need to clip the gt bboxes in these cases. Defaults to True.
+        backend (str): Image resize backend, choices are 'cv2' and 'pillow'.
+            These two backends generate slightly different results. Defaults
+            to 'cv2'.
+        interpolation (str): Interpolation method, accepted values are
+            "nearest", "bilinear", "bicubic", "area", "lanczos" for 'cv2'
+            backend, "nearest", "bilinear" for 'pillow' backend.
+        override (bool, optional): Whether to override `scale` and
+            `scale_factor` so as to call resize twice. Default False. If True,
+            after the first resizing, the existing `scale` and `scale_factor`
+            will be ignored so the second resizing can be allowed.
+            This option is a work-around for multiple times of resize in DETR.
+            Defaults to False.
+    """
+
+    def __init__(self,
+                 img_scale=None,
+                 multiscale_mode='range',
+                 ratio_range=None,
+                 keep_ratio=True,
+                 bbox_clip_border=True,
+                 backend='cv2',
+                 interpolation='bilinear',
+                 override=False,
+                 keypoint_clip_border=True):
+        if img_scale is None:
+            self.img_scale = None
+        else:
+            if isinstance(img_scale, list):
+                self.img_scale = img_scale
+            else:
+                self.img_scale = [img_scale]
+            assert isinstance(self.img_scale, list)
+
+        if ratio_range is not None:
+            # mode 1: given a scale and a range of image ratio
+            assert len(self.img_scale) == 1
+        else:
+            # mode 2: given multiple scales or a range of scales
+            assert multiscale_mode in ['value', 'range']
+
+        self.backend = backend
+        self.multiscale_mode = multiscale_mode
+        self.ratio_range = ratio_range
+        self.keep_ratio = keep_ratio
+        # TODO: refactor the override option in Resize
+        self.interpolation = interpolation
+        self.override = override
+        self.bbox_clip_border = bbox_clip_border
+        self.keypoint_clip_border = keypoint_clip_border
+
+    @staticmethod
+    def random_select(img_scales):
+        """Randomly select an img_scale from given candidates.
+
+        Args:
+            img_scales (list[tuple]): Image scales for selection.
+
+        Returns:
+            (tuple, int): Returns a tuple ``(img_scale, scale_idx)``, \
+                where ``img_scale`` is the selected image scale and \
+                ``scale_idx`` is the selected index in the given candidates.
+        """
+
+        assert isinstance(img_scales, list)
+        scale_idx = np.random.randint(len(img_scales))
+        img_scale = img_scales[scale_idx]
+        return img_scale, scale_idx
+
+    @staticmethod
+    def random_sample(img_scales):
+        """Randomly sample an img_scale when ``multiscale_mode=='range'``.
+
+        Args:
+            img_scales (list[tuple]): Image scale range for sampling.
+                There must be two tuples in img_scales, which specify the lower
+                and upper bound of image scales.
+
+        Returns:
+            (tuple, None): Returns a tuple ``(img_scale, None)``, where \
+                ``img_scale`` is the sampled scale and None is just a placeholder \
+                to be consistent with :func:`random_select`.
+        """
+
+        assert isinstance(img_scales, list) and len(img_scales) == 2
+        img_scale_long = [max(s) for s in img_scales]
+        img_scale_short = [min(s) for s in img_scales]
+        long_edge = np.random.randint(
+            min(img_scale_long), max(img_scale_long) + 1)
+        short_edge = np.random.randint(
+            min(img_scale_short), max(img_scale_short) + 1)
+        img_scale = (long_edge, short_edge)
+        return img_scale, None
+
+    @staticmethod
+    def random_sample_ratio(img_scale, ratio_range):
+        """Randomly sample an img_scale when ``ratio_range`` is specified.
+
+        A ratio will be randomly sampled from the range specified by
+        ``ratio_range``. It is then multiplied with ``img_scale`` to
+        generate the sampled scale.
+
+        Args:
+            img_scale (list): Image scale base to multiply with the ratio.
+            ratio_range (tuple[float]): The minimum and maximum ratio to scale
+                the ``img_scale``.
+
+        Returns:
+            (tuple, None): Returns a tuple ``(scale, None)``, where \
+                ``scale`` is the sampled ratio multiplied with ``img_scale`` and \
+                None is just a placeholder to be consistent with \
+                :func:`random_select`.
+        """
+
+        assert isinstance(img_scale, list) and len(img_scale) == 2
+        min_ratio, max_ratio = ratio_range
+        assert min_ratio <= max_ratio
+        ratio = np.random.random_sample() * (max_ratio - min_ratio) + min_ratio
+        scale = int(img_scale[0] * ratio), int(img_scale[1] * ratio)
+        return scale, None
+
+    def _random_scale(self, results):
+        """Randomly sample an img_scale according to ``ratio_range`` and
+        ``multiscale_mode``.
+
+        If ``ratio_range`` is specified, a ratio will be sampled and be
+        multiplied with ``img_scale``.
+        If multiple scales are specified by ``img_scale``, a scale will be
+        sampled according to ``multiscale_mode``.
+        Otherwise, a single scale will be used.
+
+        Args:
+            results (dict): Result dict from :obj:`dataset`.
+
+        Returns:
+            dict: Two new keys ``scale`` and ``scale_idx`` are added into \
+                ``results``, which would be used by subsequent pipelines.
+        """
+
+        if self.ratio_range is not None:
+            scale, scale_idx = self.random_sample_ratio(self.img_scale[0],
+                                                        self.ratio_range)
+        elif len(self.img_scale) == 1:
+            scale, scale_idx = self.img_scale[0], 0
+        elif self.multiscale_mode == 'range':
+            scale, scale_idx = self.random_sample(self.img_scale)
+        elif self.multiscale_mode == 'value':
+            scale, scale_idx = self.random_select(self.img_scale)
+        else:
+            raise NotImplementedError
+        results['scale'] = scale
+        results['scale_idx'] = scale_idx
+
+    def _resize_img(self, results):
+        """Resize images with ``results['scale']``."""
+        for key in ['image'] if 'image' in results else []:
+            if self.keep_ratio:
+                img, scale_factor = imrescale(
+                    results[key],
+                    results['scale'],
+                    return_scale=True,
+                    interpolation=self.interpolation,
+                    backend=self.backend)
+                # w_scale and h_scale have a minor difference;
+                # a real fix should be done in imrescale in the future
+                new_h, new_w = img.shape[:2]
+                h, w = results[key].shape[:2]
+                w_scale = new_w / w
+                h_scale = new_h / h
+            else:
+                img, w_scale, h_scale = imresize(
+                    results[key],
+                    results['scale'],
+                    return_scale=True,
+                    interpolation=self.interpolation,
+                    backend=self.backend)
+
+            scale_factor = np.array(
+                [w_scale, h_scale, w_scale, h_scale], dtype=np.float32)
+            results['im_shape'] = np.array(img.shape)
+            # in case that there is no padding
+            results['pad_shape'] = img.shape
+            results['scale_factor'] = scale_factor
+            results['keep_ratio'] = self.keep_ratio
+            # img_pad = self.impad(img, shape=results['scale'])
+            results[key] = img
+
+    def _resize_bboxes(self, results):
+        """Resize bounding boxes with ``results['scale_factor']``."""
+        for key in ['gt_bbox'] if 'gt_bbox' in results else []:
+            bboxes = results[key] * results['scale_factor']
+            if self.bbox_clip_border:
+                img_shape = results['im_shape']
+                bboxes[:, 0::2] = np.clip(bboxes[:, 0::2], 0, img_shape[1])
+                bboxes[:, 1::2] = np.clip(bboxes[:, 1::2], 0, img_shape[0])
+            results[key] = bboxes
+
+    def _resize_masks(self, results):
+        """Resize masks with ``results['scale']``"""
+        for key in ['mask'] if 'mask' in results else []:
+            if results[key] is None:
+                continue
+            if self.keep_ratio:
+                results[key] = results[key].rescale(results['scale'])
+            else:
+                results[key] = results[key].resize(results['im_shape'][:2])
+
+    def _resize_seg(self, results):
+        """Resize semantic segmentation map with ``results['scale']``."""
+        for key in ['seg'] if 'seg' in results else []:
+            if self.keep_ratio:
+                gt_seg = imrescale(
+                    results[key],
+                    results['scale'],
+                    interpolation='nearest',
+                    backend=self.backend)
+            else:
+                gt_seg = imresize(
+                    results[key],
+                    results['scale'],
+                    interpolation='nearest',
+                    backend=self.backend)
+            results[key] = gt_seg
+
+    def _resize_keypoints(self, results):
+        """Resize keypoints with ``results['scale_factor']``."""
+        for key in ['gt_joints'] if 'gt_joints' in results else []:
+            keypoints = results[key].copy()
+            keypoints[..., 0] = keypoints[..., 0] * results['scale_factor'][0]
+            keypoints[..., 1] = keypoints[..., 1] * results['scale_factor'][1]
+            if self.keypoint_clip_border:
+                img_shape = results['im_shape']
+                keypoints[..., 0] = np.clip(keypoints[..., 0], 0, img_shape[1])
+                keypoints[..., 1] = np.clip(keypoints[..., 1], 0, img_shape[0])
+            results[key] = keypoints
+
+    def _resize_areas(self, results):
+        """Resize mask areas with ``results['scale_factor']``."""
+        for key in ['gt_areas'] if 'gt_areas' in results else []:
+            areas = results[key].copy()
+            areas = areas * results['scale_factor'][0] * results[
+                'scale_factor'][1]
+            results[key] = areas
+
+    def __call__(self, results):
+        """Call function to resize images, bounding boxes, masks, semantic
+        segmentation map.
+
+        Args:
+            results (dict): Result dict from loading pipeline.
+
+        Returns:
+            dict: Resized results, 'im_shape', 'pad_shape', 'scale_factor', \
+                'keep_ratio' keys are added into result dict.
+        """
+        if 'scale' not in results:
+            if 'scale_factor' in results:
+                img_shape = results['image'].shape[:2]
+                scale_factor = results['scale_factor'][0]
+                # assert isinstance(scale_factor, float)
+                results['scale'] = [int(x * scale_factor)
+                                    for x in img_shape][::-1]
+            else:
+                self._random_scale(results)
+        else:
+            if not self.override:
+                assert 'scale_factor' not in results, (
+                    'scale and scale_factor cannot be both set.')
+            else:
+                results.pop('scale')
+                if 'scale_factor' in results:
+                    results.pop('scale_factor')
+                self._random_scale(results)
+
+        self._resize_img(results)
+        self._resize_bboxes(results)
+        self._resize_masks(results)
+        self._resize_seg(results)
+        self._resize_keypoints(results)
+        self._resize_areas(results)
+        return results
+
+    def __repr__(self):
+        repr_str = self.__class__.__name__
+        repr_str += f'(img_scale={self.img_scale}, '
+        repr_str += f'multiscale_mode={self.multiscale_mode}, '
+        repr_str += f'ratio_range={self.ratio_range}, '
+        repr_str += f'keep_ratio={self.keep_ratio}, '
+        repr_str += f'bbox_clip_border={self.bbox_clip_border}, '
+        repr_str += f'keypoint_clip_border={self.keypoint_clip_border})'
+        return repr_str

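For orientation, here is a rough usage sketch of the three scale-selection modes described in the PETR_Resize docstring above. It assumes the imrescale/imresize helpers used by the class are available in the same module; the input arrays and scale values are illustrative only, not taken from the commit.

import numpy as np

results = {
    'image': np.zeros((720, 1280, 3), dtype=np.uint8),
    'gt_bbox': np.array([[10., 20., 200., 300.]], dtype=np.float32),
}

# mode 1: a single base scale plus a ratio range (img_scale holds one [w, h] pair)
t_ratio = PETR_Resize(img_scale=[[1333, 800]], ratio_range=(0.8, 1.2))
# mode 2: sample the long and short edges between two scales
t_range = PETR_Resize(img_scale=[(1333, 640), (1333, 800)], multiscale_mode='range')
# mode 3: pick one scale from the candidate list
t_value = PETR_Resize(img_scale=[(1333, 640), (1333, 800)], multiscale_mode='value')

out = t_ratio(dict(results))
# 'scale', 'scale_factor', 'im_shape', 'pad_shape' and 'keep_ratio' are now set in out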
+ 296 - 0
paddlers/models/ppdet/data/transform/keypoints_3d_operators.py

@@ -0,0 +1,296 @@
+# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import absolute_import
+
+try:
+    from collections.abc import Sequence
+except Exception:
+    from collections import Sequence
+import cv2
+import numpy as np
+import math
+import copy
+import random
+import uuid
+from numbers import Number, Integral
+
+from ...modeling.keypoint_utils import get_affine_mat_kernel, warp_affine_joints, get_affine_transform, affine_transform, get_warp_matrix
+from paddlers.models.ppdet.core.workspace import serializable
+from paddlers.models.ppdet.utils.logger import setup_logger
+logger = setup_logger(__name__)
+
+registered_ops = []
+
+__all__ = [
+    'CropAndFlipImages', 'PermuteImages', 'RandomFlipHalfBody3DTransformImages'
+]
+
+import matplotlib.pyplot as plt
+from PIL import Image, ImageDraw
+from mpl_toolkits.mplot3d import Axes3D
+
+
+def register_keypointop(cls):
+    return serializable(cls)
+
+
+def register_op(cls):
+    registered_ops.append(cls.__name__)
+    if not hasattr(BaseOperator, cls.__name__):
+        setattr(BaseOperator, cls.__name__, cls)
+    else:
+        raise KeyError("The {} class has been registered.".format(cls.__name__))
+    return serializable(cls)
+
+
+class BaseOperator(object):
+    def __init__(self, name=None):
+        if name is None:
+            name = self.__class__.__name__
+        self._id = name + '_' + str(uuid.uuid4())[-6:]
+
+    def apply(self, sample, context=None):
+        """ Process a sample.
+        Args:
+            sample (dict): a dict of sample, eg: {'image':xx, 'label': xxx}
+            context (dict): info about this sample processing
+        Returns:
+            result (dict): a processed sample
+        """
+        return sample
+
+    def __call__(self, sample, context=None):
+        """ Process a sample.
+        Args:
+            sample (dict): a dict of sample, eg: {'image':xx, 'label': xxx}
+            context (dict): info about this sample processing
+        Returns:
+            result (dict): a processed sample
+        """
+        if isinstance(sample, Sequence):  # for batch_size
+            for i in range(len(sample)):
+                sample[i] = self.apply(sample[i], context)
+        else:
+            # image.shape changed
+            sample = self.apply(sample, context)
+        return sample
+
+    def __str__(self):
+        return str(self._id)
+
+
+@register_keypointop
+class CropAndFlipImages(object):
+    """Horizontally flip and crop all images."""
+
+    def __init__(self, crop_range, flip_pairs=None):
+        super(CropAndFlipImages, self).__init__()
+        self.crop_range = crop_range
+        self.flip_pairs = flip_pairs
+
+    def __call__(self, records):  # tuple
+        images = records["image"]
+        images = images[:, :, ::-1, :]
+        images = images[:, :, self.crop_range[0]:self.crop_range[1]]
+        records["image"] = images
+
+        if "kps2d" in records.keys():
+            kps2d = records["kps2d"]
+
+            width, height = images.shape[2], images.shape[1]
+            kps2d = np.array(kps2d)
+            kps2d[:, :, 0] = kps2d[:, :, 0] - self.crop_range[0]
+
+            for pair in self.flip_pairs:
+                kps2d[:, pair[0], :], kps2d[:,pair[1], :] = \
+                    kps2d[:,pair[1], :], kps2d[:,pair[0], :].copy()
+
+            records["kps2d"] = kps2d
+
+        return records
+
+
+@register_op
+class PermuteImages(BaseOperator):
+    def __init__(self):
+        """
+        Change the image layout to (batch_size, C, H, W), e.g. (6, 3, 1080, 1920)
+        """
+        super(PermuteImages, self).__init__()
+
+    def apply(self, sample, context=None):
+        images = sample["image"]
+        images = images.transpose((0, 3, 1, 2))
+
+        sample["image"] = images
+
+        return sample
+
+
+@register_keypointop
+class RandomFlipHalfBody3DTransformImages(object):
+    """Apply data augmentation to the images and coordinates
+    to achieve the flip, scale, rotation and half-body transform effects for training images.
+    Args:
+        trainsize (list):[w, h], Image target size
+        upper_body_ids (list): The upper body joint ids
+        flip_pairs (list): The left-right joints exchange order list
+        pixel_std (int): The pixel std of the scale
+        scale (float): The scale factor to transform the image
+        rot (int): The rotate factor to transform the image
+        num_joints_half_body (int): The joints threshold of the half body transform
+        prob_half_body (float): The threshold of the half body transform
+        flip (bool): Whether to flip the image
+    Returns:
+        records (dict): contains the image and coords after being transformed
+    """
+
+    def __init__(self,
+                 trainsize,
+                 upper_body_ids,
+                 flip_pairs,
+                 pixel_std,
+                 scale=0.35,
+                 rot=40,
+                 num_joints_half_body=8,
+                 prob_half_body=0.3,
+                 flip=True,
+                 rot_prob=0.6,
+                 do_occlusion=False):
+        super(RandomFlipHalfBody3DTransformImages, self).__init__()
+        self.trainsize = trainsize
+        self.upper_body_ids = upper_body_ids
+        self.flip_pairs = flip_pairs
+        self.pixel_std = pixel_std
+        self.scale = scale
+        self.rot = rot
+        self.num_joints_half_body = num_joints_half_body
+        self.prob_half_body = prob_half_body
+        self.flip = flip
+        self.aspect_ratio = trainsize[0] * 1.0 / trainsize[1]
+        self.rot_prob = rot_prob
+        self.do_occlusion = do_occlusion
+
+    def halfbody_transform(self, joints, joints_vis):
+        upper_joints = []
+        lower_joints = []
+        for joint_id in range(joints.shape[0]):
+            if joints_vis[joint_id][0] > 0:
+                if joint_id in self.upper_body_ids:
+                    upper_joints.append(joints[joint_id])
+                else:
+                    lower_joints.append(joints[joint_id])
+        if np.random.randn() < 0.5 and len(upper_joints) > 2:
+            selected_joints = upper_joints
+        else:
+            selected_joints = lower_joints if len(
+                lower_joints) > 2 else upper_joints
+        if len(selected_joints) < 2:
+            return None, None
+        selected_joints = np.array(selected_joints, dtype=np.float32)
+        center = selected_joints.mean(axis=0)[:2]
+        left_top = np.amin(selected_joints, axis=0)
+        right_bottom = np.amax(selected_joints, axis=0)
+        w = right_bottom[0] - left_top[0]
+        h = right_bottom[1] - left_top[1]
+        if w > self.aspect_ratio * h:
+            h = w * 1.0 / self.aspect_ratio
+        elif w < self.aspect_ratio * h:
+            w = h * self.aspect_ratio
+        scale = np.array(
+            [w * 1.0 / self.pixel_std, h * 1.0 / self.pixel_std],
+            dtype=np.float32)
+        scale = scale * 1.5
+
+        return center, scale
+
+    def flip_joints(self, joints, joints_vis, width, matched_parts, kps2d=None):
+        # joints: (6, 24, 3),(num_frames, num_joints, 3)
+
+        joints[:, :, 0] = width - joints[:, :, 0] - 1  # x
+        if kps2d is not None:
+            kps2d[:, :, 0] = width - kps2d[:, :, 0] - 1
+
+        for pair in matched_parts:
+            joints[:, pair[0], :], joints[:,pair[1], :] = \
+                joints[:,pair[1], :], joints[:,pair[0], :].copy()
+
+            joints_vis[:,pair[0], :], joints_vis[:,pair[1], :] = \
+                joints_vis[:,pair[1], :], joints_vis[:,pair[0], :].copy()
+
+            if kps2d is not None:
+                kps2d[:, pair[0], :], kps2d[:,pair[1], :] = \
+                    kps2d[:,pair[1], :], kps2d[:,pair[0], :].copy()
+
+        # move to zero
+        joints -= joints[:, [0], :]  # (batch_size, 24, 3),numpy.ndarray
+
+        return joints, joints_vis, kps2d
+
+    def __call__(self, records):
+        images = records[
+            'image']  # kps3d, kps3d_vis, images; images.shape = (num_frames, height, width, 3)
+
+        joints = records['kps3d']
+        joints_vis = records['kps3d_vis']
+
+        kps2d = None
+        if 'kps2d' in records.keys():
+            kps2d = records['kps2d']
+
+        if self.flip and np.random.random() <= 0.5:
+            images = images[:, :, ::-1, :]  # flip images horizontally (6, 1080, 810, 3)
+            joints, joints_vis, kps2d = self.flip_joints(
+                joints, joints_vis, images.shape[2], self.flip_pairs,
+                kps2d)  # flip keypoints left-right symmetrically
+        occlusion = False
+        if self.do_occlusion and random.random() <= 0.5:  # random occlusion
+            height = images[0].shape[0]
+            width = images[0].shape[1]
+            occlusion = True
+            while True:
+                area_min = 0.0
+                area_max = 0.2
+                synth_area = (random.random() *
+                              (area_max - area_min) + area_min) * width * height
+
+                ratio_min = 0.3
+                ratio_max = 1 / 0.3
+                synth_ratio = (random.random() *
+                               (ratio_max - ratio_min) + ratio_min)
+
+                synth_h = math.sqrt(synth_area * synth_ratio)
+                synth_w = math.sqrt(synth_area / synth_ratio)
+                synth_xmin = random.random() * (width - synth_w - 1)
+                synth_ymin = random.random() * (height - synth_h - 1)
+
+                if synth_xmin >= 0 and synth_ymin >= 0 and synth_xmin + synth_w < width and synth_ymin + synth_h < height:
+                    xmin = int(synth_xmin)
+                    ymin = int(synth_ymin)
+                    w = int(synth_w)
+                    h = int(synth_h)
+
+                    mask = np.random.rand(h, w, 3) * 255
+                    images[:, ymin:ymin + h, xmin:xmin + w, :] = mask[
+                        None, :, :, :]
+                    break
+
+        records['image'] = images
+        records['kps3d'] = joints
+        records['kps3d_vis'] = joints_vis
+        if kps2d is not None:
+            records['kps2d'] = kps2d
+
+        return records

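A rough sketch of how the three operators above can be chained on a short clip of frames; the shapes follow the inline comments, while flip_pairs, upper_body_ids and the crop range are placeholder values rather than the dataset's real configuration.

import numpy as np

records = {
    'image': np.zeros((6, 1080, 1920, 3), dtype=np.float32),   # (num_frames, H, W, 3)
    'kps3d': np.zeros((6, 24, 3), dtype=np.float32),
    'kps3d_vis': np.ones((6, 24, 3), dtype=np.float32),
}

flip_pairs = [[1, 2], [4, 5], [7, 8]]        # illustrative left/right joint pairs
crop = CropAndFlipImages(crop_range=(555, 1365), flip_pairs=flip_pairs)
augment = RandomFlipHalfBody3DTransformImages(
    trainsize=[810, 1080], upper_body_ids=list(range(12)),
    flip_pairs=flip_pairs, pixel_std=200)
permute = PermuteImages()

records = crop(records)     # image: (6, 1080, 810, 3), mirrored then cropped in width
records = augment(records)  # random horizontal flip (and optional occlusion) of frames and joints
records = permute(records)  # image: (6, 3, 1080, 810), channel-first for the network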
File diff suppressed because it is too large
+ 500 - 71
paddlers/models/ppdet/data/transform/operators.py


+ 7 - 0
paddlers/models/ppdet/engine/__init__.py

@@ -15,6 +15,9 @@
 from . import trainer
 from .trainer import *
 
+from . import trainer_cot
+from .trainer_cot import *
+
 from . import callbacks
 from .callbacks import *
 
@@ -28,3 +31,7 @@ __all__ = trainer.__all__ \
 from . import tracker
 from .tracker import *
 __all__ = __all__ + tracker.__all__
+
+from . import trainer_ssod
+from .trainer_ssod import *
+__all__ = __all__ + trainer_ssod.__all__

+ 111 - 47
paddlers/models/ppdet/engine/callbacks.py

@@ -152,15 +152,14 @@ class LogPrinter(Callback):
             if mode == 'eval':
                 sample_num = status['sample_num']
                 cost_time = status['cost_time']
-                logger.info('Total sample number: {}, averge FPS: {}'.format(
+                logger.info('Total sample number: {}, average FPS: {}'.format(
                     sample_num, sample_num / cost_time))
 
 
 class Checkpointer(Callback):
     def __init__(self, model):
         super(Checkpointer, self).__init__(model)
-        cfg = self.model.cfg
-        self.best_ap = 0.
+        self.best_ap = -1000.
         self.save_dir = os.path.join(self.model.cfg.save_dir,
                                      self.model.cfg.filename)
         if hasattr(self.model.model, 'student_model'):
@@ -187,7 +186,11 @@ class Checkpointer(Callback):
                 if 'save_best_model' in status and status['save_best_model']:
                     for metric in self.model._metrics:
                         map_res = metric.get_results()
-                        if 'bbox' in map_res:
+                        eval_func = "ap"
+                        if 'pose3d' in map_res:
+                            key = 'pose3d'
+                            eval_func = "mpjpe"
+                        elif 'bbox' in map_res:
                             key = 'bbox'
                         elif 'keypoint' in map_res:
                             key = 'keypoint'
@@ -202,18 +205,36 @@ class Checkpointer(Callback):
                             self.best_ap = map_res[key][0]
                             save_name = 'best_model'
                             weight = self.weight.state_dict()
-                        logger.info("Best test {} ap is {:0.3f}.".format(
-                            key, self.best_ap))
+                        logger.info("Best test {} {} is {:0.3f}.".format(
+                            key, eval_func, abs(self.best_ap)))
             if weight:
                 if self.model.use_ema:
-                    # save model and ema_model
-                    save_model(
-                        status['weight'],
-                        self.model.optimizer,
-                        self.save_dir,
-                        save_name,
-                        epoch_id + 1,
-                        ema_model=weight)
+                    exchange_save_model = status.get('exchange_save_model',
+                                                     False)
+                    if not exchange_save_model:
+                        # save model and ema_model
+                        save_model(
+                            status['weight'],
+                            self.model.optimizer,
+                            self.save_dir,
+                            save_name,
+                            epoch_id + 1,
+                            ema_model=weight)
+                    else:
+                        # save model (student model) and ema_model (teacher model);
+                        # in DenseTeacher SSOD the teacher model usually scores higher,
+                        # so the two are exchanged when saving pdparams
+                        student_model = status['weight']  # model
+                        teacher_model = weight  # ema_model
+                        save_model(
+                            teacher_model,
+                            self.model.optimizer,
+                            self.save_dir,
+                            save_name,
+                            epoch_id + 1,
+                            ema_model=student_model)
+                        del teacher_model
+                        del student_model
                 else:
                     save_model(weight, self.model.optimizer, self.save_dir,
                                save_name, epoch_id + 1)
@@ -288,6 +309,7 @@ class VisualDLWriter(Callback):
                                                    self.vdl_mAP_step)
                 self.vdl_mAP_step += 1
 
+
 class WandbCallback(Callback):
     def __init__(self, model):
         super(WandbCallback, self).__init__(model)
@@ -307,10 +329,8 @@ class WandbCallback(Callback):
             self.wandb_params = {}
         for k, v in model.cfg.items():
             if k.startswith("wandb_"):
-                self.wandb_params.update({
-                    k.lstrip("wandb_"): v
-                })
-        
+                self.wandb_params.update({k.lstrip("wandb_"): v})
+
         self._run = None
         if dist.get_world_size() < 2 or dist.get_rank() == 0:
             _ = self.run
@@ -318,37 +338,50 @@ class WandbCallback(Callback):
             self.run.define_metric("epoch")
             self.run.define_metric("eval/*", step_metric="epoch")
 
-        self.best_ap = 0
-    
+        self.best_ap = -1000.
+        self.fps = []
+
     @property
     def run(self):
         if self._run is None:
             if self.wandb.run is not None:
-                logger.info("There is an ongoing wandb run which will be used"
-                        "for logging. Please use `wandb.finish()` to end that"
-                        "if the behaviour is not intended")
+                logger.info(
+                    "There is an ongoing wandb run which will be used "
+                    "for logging. Please use `wandb.finish()` to end that "
+                    "if the behaviour is not intended")
                 self._run = self.wandb.run
             else:
                 self._run = self.wandb.init(**self.wandb_params)
         return self._run
-    
+
     def save_model(self,
-                optimizer,
-                save_dir,
-                save_name,
-                last_epoch,
-                ema_model=None,
-                ap=None, 
-                tags=None):
+                   optimizer,
+                   save_dir,
+                   save_name,
+                   last_epoch,
+                   ema_model=None,
+                   ap=None,
+                   fps=None,
+                   tags=None):
         if dist.get_world_size() < 2 or dist.get_rank() == 0:
             model_path = os.path.join(save_dir, save_name)
             metadata = {}
             metadata["last_epoch"] = last_epoch
             if ap:
                 metadata["ap"] = ap
+
+            if fps:
+                metadata["fps"] = fps
+
             if ema_model is None:
-                ema_artifact = self.wandb.Artifact(name="ema_model-{}".format(self.run.id), type="model", metadata=metadata)
-                model_artifact = self.wandb.Artifact(name="model-{}".format(self.run.id), type="model", metadata=metadata)
+                ema_artifact = self.wandb.Artifact(
+                    name="ema_model-{}".format(self.run.id),
+                    type="model",
+                    metadata=metadata)
+                model_artifact = self.wandb.Artifact(
+                    name="model-{}".format(self.run.id),
+                    type="model",
+                    metadata=metadata)
 
                 ema_artifact.add_file(model_path + ".pdema", name="model_ema")
                 model_artifact.add_file(model_path + ".pdparams", name="model")
@@ -356,10 +389,13 @@ class WandbCallback(Callback):
                 self.run.log_artifact(ema_artifact, aliases=tags)
                 self.run.log_artifact(model_artifact, aliases=tags)
             else:
-                model_artifact = self.wandb.Artifact(name="model-{}".format(self.run.id), type="model", metadata=metadata)
+                model_artifact = self.wandb.Artifact(
+                    name="model-{}".format(self.run.id),
+                    type="model",
+                    metadata=metadata)
                 model_artifact.add_file(model_path + ".pdparams", name="model")
                 self.run.log_artifact(model_artifact, aliases=tags)
-    
+
     def on_step_end(self, status):
 
         mode = status['mode']
@@ -368,22 +404,41 @@ class WandbCallback(Callback):
                 training_status = status['training_staus'].get()
                 for k, v in training_status.items():
                     training_status[k] = float(v)
-                metrics = {
-                    "train/" + k: v for k,v in training_status.items()
-                }
+
+                # calculate ips, data_cost, batch_cost
+                batch_time = status['batch_time']
+                data_time = status['data_time']
+                batch_size = self.model.cfg['{}Reader'.format(mode.capitalize(
+                ))]['batch_size']
+
+                ips = float(batch_size) / float(batch_time.avg)
+                data_cost = float(data_time.avg)
+                batch_cost = float(batch_time.avg)
+
+                metrics = {"train/" + k: v for k, v in training_status.items()}
+
+                metrics["train/ips"] = ips
+                metrics["train/data_cost"] = data_cost
+                metrics["train/batch_cost"] = batch_cost
+
+                self.fps.append(ips)
                 self.run.log(metrics)
-    
+
     def on_epoch_end(self, status):
         mode = status['mode']
         epoch_id = status['epoch_id']
         save_name = None
         if dist.get_world_size() < 2 or dist.get_rank() == 0:
             if mode == 'train':
+                fps = sum(self.fps) / len(self.fps)
+                self.fps = []
+
                 end_epoch = self.model.cfg.epoch
                 if (
                         epoch_id + 1
                 ) % self.model.cfg.snapshot_epoch == 0 or epoch_id == end_epoch - 1:
-                    save_name = str(epoch_id) if epoch_id != end_epoch - 1 else "model_final"
+                    save_name = str(
+                        epoch_id) if epoch_id != end_epoch - 1 else "model_final"
                     tags = ["latest", "epoch_{}".format(epoch_id)]
                     self.save_model(
                         self.model.optimizer,
@@ -391,20 +446,29 @@ class WandbCallback(Callback):
                         save_name,
                         epoch_id + 1,
                         self.model.use_ema,
-                        tags=tags
-                    )
+                        fps=fps,
+                        tags=tags)
             if mode == 'eval':
+                sample_num = status['sample_num']
+                cost_time = status['cost_time']
+
+                fps = sample_num / cost_time
+
                 merged_dict = {}
                 for metric in self.model._metrics:
                     for key, map_value in metric.get_results().items():
                         merged_dict["eval/{}-mAP".format(key)] = map_value[0]
                 merged_dict["epoch"] = status["epoch_id"]
+                merged_dict["eval/fps"] = sample_num / cost_time
+
                 self.run.log(merged_dict)
 
                 if 'save_best_model' in status and status['save_best_model']:
                     for metric in self.model._metrics:
                         map_res = metric.get_results()
-                        if 'bbox' in map_res:
+                        if 'pose3d' in map_res:
+                            key = 'pose3d'
+                        elif 'bbox' in map_res:
                             key = 'bbox'
                         elif 'keypoint' in map_res:
                             key = 'keypoint'
@@ -426,10 +490,10 @@ class WandbCallback(Callback):
                                 save_name,
                                 last_epoch=epoch_id + 1,
                                 ema_model=self.model.use_ema,
-                                ap=self.best_ap,
-                                tags=tags
-                            )
-    
+                                ap=abs(self.best_ap),
+                                fps=fps,
+                                tags=tags)
+
     def on_train_end(self, status):
         self.run.finish()
 

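The throughput values added to WandbCallback above are plain averages; a quick numeric sketch of the bookkeeping (the figures below are made up for illustration):

batch_size = 8          # cfg['TrainReader']['batch_size']
batch_time_avg = 0.25   # smoothed seconds per iteration (batch_time.avg)
data_time_avg = 0.05    # smoothed seconds spent loading data (data_time.avg)

ips = batch_size / batch_time_avg   # 32.0 images/s, logged as train/ips
batch_cost = batch_time_avg         # 0.25 s, logged as train/batch_cost
data_cost = data_time_avg           # 0.05 s, logged as train/data_cost
# the per-epoch train FPS is the mean of the collected ips values,
# while the eval FPS is sample_num / cost_time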
+ 54 - 6
paddlers/models/ppdet/engine/export_utils.py

@@ -29,6 +29,7 @@ logger = setup_logger('ppdet.engine')
 # Global dictionary
 TRT_MIN_SUBGRAPH = {
     'YOLO': 3,
+    'PPYOLOE': 3,
     'SSD': 60,
     'RCNN': 40,
     'RetinaNet': 40,
@@ -42,6 +43,7 @@ TRT_MIN_SUBGRAPH = {
     'HRNet': 3,
     'DeepSORT': 3,
     'ByteTrack': 10,
+    'CenterTrack': 5,
     'JDE': 10,
     'FairMOT': 5,
     'GFL': 16,
@@ -49,10 +51,46 @@ TRT_MIN_SUBGRAPH = {
     'CenterNet': 5,
     'TOOD': 5,
     'YOLOX': 8,
+    'YOLOF': 40,
+    'METRO_Body': 3,
+    'DETR': 3,
 }
 
 KEYPOINT_ARCH = ['HigherHRNet', 'TopDownHRNet']
-MOT_ARCH = ['DeepSORT', 'JDE', 'FairMOT', 'ByteTrack']
+MOT_ARCH = ['JDE', 'FairMOT', 'DeepSORT', 'ByteTrack', 'CenterTrack']
+
+TO_STATIC_SPEC = {
+    'yolov3_darknet53_270e_coco': [{
+        'im_id': paddle.static.InputSpec(
+            name='im_id', shape=[-1, 1], dtype='float32'),
+        'is_crowd': paddle.static.InputSpec(
+            name='is_crowd', shape=[-1, 50], dtype='float32'),
+        'gt_bbox': paddle.static.InputSpec(
+            name='gt_bbox', shape=[-1, 50, 4], dtype='float32'),
+        'curr_iter': paddle.static.InputSpec(
+            name='curr_iter', shape=[-1], dtype='float32'),
+        'image': paddle.static.InputSpec(
+            name='image', shape=[-1, 3, -1, -1], dtype='float32'),
+        'im_shape': paddle.static.InputSpec(
+            name='im_shape', shape=[-1, 2], dtype='float32'),
+        'scale_factor': paddle.static.InputSpec(
+            name='scale_factor', shape=[-1, 2], dtype='float32'),
+        'target0': paddle.static.InputSpec(
+            name='target0', shape=[-1, 3, 86, -1, -1], dtype='float32'),
+        'target1': paddle.static.InputSpec(
+            name='target1', shape=[-1, 3, 86, -1, -1], dtype='float32'),
+        'target2': paddle.static.InputSpec(
+            name='target2', shape=[-1, 3, 86, -1, -1], dtype='float32'),
+    }],
+}
+
+
+def apply_to_static(config, model):
+    filename = config.get('filename', None)
+    spec = TO_STATIC_SPEC.get(filename, None)
+    model = paddle.jit.to_static(model, input_spec=spec)
+    logger.info("Successfully applied @to_static with specs: {}".format(spec))
+    return model
 
 
 def _prune_input_spec(input_spec, program, targets):
@@ -140,10 +178,11 @@ def _dump_infer_config(config, path, image_shape, model):
         infer_cfg['export_onnx'] = True
         infer_cfg['export_eb'] = export_eb
 
-
     if infer_arch in MOT_ARCH:
         if infer_arch == 'DeepSORT':
             tracker_cfg = config['DeepSORTTracker']
+        elif infer_arch == 'CenterTrack':
+            tracker_cfg = config['CenterTracker']
         else:
             tracker_cfg = config['JDETracker']
         infer_cfg['tracker'] = _parse_tracker(tracker_cfg)
@@ -155,7 +194,10 @@ def _dump_infer_config(config, path, image_shape, model):
             arch_state = True
             break
 
-    if infer_arch == 'YOLOX':
+    if infer_arch == 'PPYOLOEWithAuxHead':
+        infer_arch = 'PPYOLOE'
+
+    if infer_arch in ['PPYOLOE', 'YOLOX', 'YOLOF']:
         infer_cfg['arch'] = infer_arch
         infer_cfg['min_subgraph_size'] = TRT_MIN_SUBGRAPH[infer_arch]
         arch_state = True
@@ -174,9 +216,15 @@ def _dump_infer_config(config, path, image_shape, model):
         label_arch = 'keypoint_arch'
 
     if infer_arch in MOT_ARCH:
-        label_arch = 'mot_arch'
-        reader_cfg = config['TestMOTReader']
-        dataset_cfg = config['TestMOTDataset']
+        if config['metric'] in ['COCO', 'VOC']:
+            # MOT model runs as a detector
+            reader_cfg = config['TestReader']
+            dataset_cfg = config['TestDataset']
+        else:
+            # 'metric' in ['MOT', 'MCMOT', 'KITTI']
+            label_arch = 'mot_arch'
+            reader_cfg = config['TestMOTReader']
+            dataset_cfg = config['TestMOTDataset']
     else:
         reader_cfg = config['TestReader']
         dataset_cfg = config['TestDataset']

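The new apply_to_static helper is used by the trainer further below (model = apply_to_static(self.cfg, model) when the config enables to_static). A minimal standalone sketch of the same lookup-then-convert flow, using a toy layer instead of a detector and a hypothetical config filename:

import paddle

model = paddle.nn.Linear(4, 2)              # stand-in for the detection model
filename = 'my_custom_config'               # not present in TO_STATIC_SPEC
spec = TO_STATIC_SPEC.get(filename, None)   # None -> input specs are inferred on the first call
static_model = paddle.jit.to_static(model, input_spec=spec)
out = static_model(paddle.randn([1, 4]))    # the program is traced on this first forward pass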
+ 107 - 10
paddlers/models/ppdet/engine/tracker.py

@@ -29,9 +29,11 @@ from paddlers.models.ppdet.core.workspace import create
 from paddlers.models.ppdet.utils.checkpoint import load_weight, load_pretrain_weight
 from paddlers.models.ppdet.modeling.mot.utils import Detection, get_crops, scale_coords, clip_box
 from paddlers.models.ppdet.modeling.mot.utils import MOTTimer, load_det_results, write_mot_results, save_vis_results
-from paddlers.models.ppdet.modeling.mot.tracker import JDETracker, DeepSORTTracker, OCSORTTracker
+from paddlers.models.ppdet.modeling.mot.tracker import JDETracker, CenterTracker
+from paddlers.models.ppdet.modeling.mot.tracker import DeepSORTTracker, OCSORTTracker, BOTSORTTracker
 from paddlers.models.ppdet.modeling.architectures import YOLOX
 from paddlers.models.ppdet.metrics import Metric, MOTMetric, KITTIMOTMetric, MCMOTMetric
+from paddlers.models.ppdet.data.source.category import get_categories
 import paddlers.models.ppdet.utils.stats as stats
 
 from .callbacks import Callback, ComposeCallback
@@ -39,9 +41,9 @@ from .callbacks import Callback, ComposeCallback
 from paddlers.models.ppdet.utils.logger import setup_logger
 logger = setup_logger(__name__)
 
-MOT_ARCH = ['DeepSORT', 'JDE', 'FairMOT', 'ByteTrack']
-MOT_ARCH_JDE = ['JDE', 'FairMOT']
-MOT_ARCH_SDE = ['DeepSORT', 'ByteTrack']
+MOT_ARCH = ['JDE', 'FairMOT', 'DeepSORT', 'ByteTrack', 'CenterTrack']
+MOT_ARCH_JDE = MOT_ARCH[:2]
+MOT_ARCH_SDE = MOT_ARCH[2:4]
 MOT_DATA_TYPE = ['mot', 'mcmot', 'kitti']
 
 __all__ = ['Tracker']
@@ -67,6 +69,13 @@ class Tracker(object):
                     m._epsilon = 1e-3  # for amp(fp16)
                     m._momentum = 0.97  # 0.03 in pytorch
 
+        anno_file = self.dataset.get_anno()
+        clsid2catid, catid2name = get_categories(
+            self.cfg.metric, anno_file=anno_file)
+        self.ids2names = []
+        for k, v in catid2name.items():
+            self.ids2names.append(v)
+
         self.status = {}
         self.start_epoch = 0
 
@@ -130,6 +139,53 @@ class Tracker(object):
         else:
             load_weight(self.model.reid, reid_weights)
 
+    def _eval_seq_centertrack(self,
+                              dataloader,
+                              save_dir=None,
+                              show_image=False,
+                              frame_rate=30,
+                              draw_threshold=0):
+        assert isinstance(self.model.tracker, CenterTracker)
+        if save_dir:
+            if not os.path.exists(save_dir): os.makedirs(save_dir)
+        tracker = self.model.tracker
+
+        timer = MOTTimer()
+        frame_id = 0
+        self.status['mode'] = 'track'
+        self.model.eval()
+        results = defaultdict(list)  # only support single class now
+
+        for step_id, data in enumerate(tqdm(dataloader)):
+            self.status['step_id'] = step_id
+            if step_id == 0:
+                self.model.reset_tracking()
+
+            # forward
+            timer.tic()
+            pred_ret = self.model(data)
+
+            online_targets = tracker.update(pred_ret)
+            online_tlwhs, online_scores, online_ids = [], [], []
+            for t in online_targets:
+                bbox = t['bbox']
+                tlwh = [bbox[0], bbox[1], bbox[2] - bbox[0], bbox[3] - bbox[1]]
+                tscore = float(t['score'])
+                tid = int(t['tracking_id'])
+                if tlwh[2] * tlwh[3] > 0:
+                    online_tlwhs.append(tlwh)
+                    online_ids.append(tid)
+                    online_scores.append(tscore)
+            timer.toc()
+            # save results
+            results[0].append(
+                (frame_id + 1, online_tlwhs, online_scores, online_ids))
+            save_vis_results(data, frame_id, online_ids, online_tlwhs,
+                             online_scores, timer.average_time, show_image,
+                             save_dir, self.cfg.num_classes, self.ids2names)
+            frame_id += 1
+        return results, frame_id, timer.average_time, timer.calls
+
     def _eval_seq_jde(self,
                       dataloader,
                       save_dir=None,
@@ -180,7 +236,7 @@ class Tracker(object):
             timer.toc()
             save_vis_results(data, frame_id, online_ids, online_tlwhs,
                              online_scores, timer.average_time, show_image,
-                             save_dir, self.cfg.num_classes)
+                             save_dir, self.cfg.num_classes, self.ids2names)
             frame_id += 1
 
         return results, frame_id, timer.average_time, timer.calls
@@ -197,7 +253,11 @@ class Tracker(object):
         if save_dir:
             if not os.path.exists(save_dir): os.makedirs(save_dir)
         use_detector = False if not self.model.detector else True
-        use_reid = False if not self.model.reid else True
+        use_reid = hasattr(self.model, 'reid')
+        if use_reid and self.model.reid is not None:
+            use_reid = True
+        else:
+            use_reid = False
 
         timer = MOTTimer()
         results = defaultdict(list)
@@ -290,7 +350,7 @@ class Tracker(object):
                 online_ids, online_tlwhs, online_scores = None, None, None
                 save_vis_results(data, frame_id, online_ids, online_tlwhs,
                                  online_scores, timer.average_time, show_image,
-                                 save_dir, self.cfg.num_classes)
+                                 save_dir, self.cfg.num_classes, self.ids2names)
                 frame_id += 1
                 # thus the reid model will not be run
                 continue
@@ -338,7 +398,7 @@ class Tracker(object):
                     (frame_id + 1, online_tlwhs, online_scores, online_ids))
                 save_vis_results(data, frame_id, online_ids, online_tlwhs,
                                  online_scores, timer.average_time, show_image,
-                                 save_dir, self.cfg.num_classes)
+                                 save_dir, self.cfg.num_classes, self.ids2names)
 
             elif isinstance(tracker, JDETracker):
                 # trick hyperparams only used for MOTChallenge (MOT17, MOT20) Test-set
@@ -369,7 +429,8 @@ class Tracker(object):
                 timer.toc()
                 save_vis_results(data, frame_id, online_ids, online_tlwhs,
                                  online_scores, timer.average_time, show_image,
-                                 save_dir, self.cfg.num_classes)
+                                 save_dir, self.cfg.num_classes, self.ids2names)
+
             elif isinstance(tracker, OCSORTTracker):
                 # OC_SORT Tracker
                 online_targets = tracker.update(pred_dets_old, pred_embs)
@@ -390,7 +451,31 @@ class Tracker(object):
                     (frame_id + 1, online_tlwhs, online_scores, online_ids))
                 save_vis_results(data, frame_id, online_ids, online_tlwhs,
                                  online_scores, timer.average_time, show_image,
-                                 save_dir, self.cfg.num_classes)
+                                 save_dir, self.cfg.num_classes, self.ids2names)
+
+            elif isinstance(tracker, BOTSORTTracker):
+                # BOTSORT Tracker
+                online_targets = tracker.update(
+                    pred_dets_old, img=ori_image.numpy())
+                online_tlwhs = []
+                online_ids = []
+                online_scores = []
+                for t in online_targets:
+                    tlwh = t.tlwh
+                    tid = t.track_id
+                    tscore = t.score
+                    if tlwh[2] * tlwh[3] > 0:
+                        online_tlwhs.append(tlwh)
+                        online_ids.append(tid)
+                        online_scores.append(tscore)
+                timer.toc()
+                # save results
+                results[0].append(
+                    (frame_id + 1, online_tlwhs, online_scores, online_ids))
+                save_vis_results(data, frame_id, online_ids, online_tlwhs,
+                                 online_scores, timer.average_time, show_image,
+                                 save_dir, self.cfg.num_classes, self.ids2names)
+
             else:
                 raise ValueError(tracker)
             frame_id += 1
@@ -461,6 +546,12 @@ class Tracker(object):
                         scaled=scaled,
                         det_file=os.path.join(det_results_dir,
                                               '{}.txt'.format(seq)))
+                elif model_type == 'CenterTrack':
+                    results, nf, ta, tc = self._eval_seq_centertrack(
+                        dataloader,
+                        save_dir=save_dir,
+                        show_image=show_image,
+                        frame_rate=frame_rate)
                 else:
                     raise ValueError(model_type)
 
@@ -587,6 +678,12 @@ class Tracker(object):
                     det_file=os.path.join(det_results_dir,
                                           '{}.txt'.format(seq)),
                     draw_threshold=draw_threshold)
+            elif model_type == 'CenterTrack':
+                results, nf, ta, tc = self._eval_seq_centertrack(
+                    dataloader,
+                    save_dir=save_dir,
+                    show_image=show_image,
+                    frame_rate=frame_rate)
             else:
                 raise ValueError(model_type)
 

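Each per-frame entry appended to results in the tracking loops above shares the same layout regardless of the tracker; a sketch of one entry with made-up values:

frame_result = (
    1,                              # 1-based frame id
    [[100.0, 150.0, 40.0, 80.0]],   # online_tlwhs: one [x, y, w, h] box per track
    [0.92],                         # online_scores
    [3],                            # online_ids
)
results[0].append(frame_result)     # key 0 because only single-class tracking is supported here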
+ 147 - 30
paddlers/models/ppdet/engine/trainer.py

@@ -38,7 +38,7 @@ from paddlers.models.ppdet.optimizer import ModelEMA
 from paddlers.models.ppdet.core.workspace import create
 from paddlers.models.ppdet.utils.checkpoint import load_weight, load_pretrain_weight
 from paddlers.models.ppdet.utils.visualizer import visualize_results, save_result
-from paddlers.models.ppdet.metrics import Metric, COCOMetric, VOCMetric, WiderFaceMetric, get_infer_results, KeyPointTopDownCOCOEval, KeyPointTopDownMPIIEval
+from paddlers.models.ppdet.metrics import Metric, COCOMetric, VOCMetric, WiderFaceMetric, get_infer_results, KeyPointTopDownCOCOEval, KeyPointTopDownMPIIEval, Pose3DEval
 from paddlers.models.ppdet.metrics import RBoxMetric, JDEDetMetric, SNIPERCOCOMetric
 from paddlers.models.ppdet.data.source.sniper_coco import SniperCOCODataSet
 from paddlers.models.ppdet.data.source.category import get_categories
@@ -48,7 +48,7 @@ from paddlers.models.ppdet.utils import profiler
 from paddlers.models.ppdet.modeling.post_process import multiclass_nms
 
 from .callbacks import Callback, ComposeCallback, LogPrinter, Checkpointer, WiferFaceEval, VisualDLWriter, SniperProposalsGenerator, WandbCallback
-from .export_utils import _dump_infer_config, _prune_input_spec
+from .export_utils import _dump_infer_config, _prune_input_spec, apply_to_static
 
 from paddle.distributed.fleet.utils.hybrid_parallel_util import fused_allreduce_gradients
 
@@ -57,12 +57,12 @@ logger = setup_logger('ppdet.engine')
 
 __all__ = ['Trainer']
 
-MOT_ARCH = ['DeepSORT', 'JDE', 'FairMOT', 'ByteTrack']
+MOT_ARCH = ['JDE', 'FairMOT', 'DeepSORT', 'ByteTrack', 'CenterTrack']
 
 
 class Trainer(object):
     def __init__(self, cfg, mode='train'):
-        self.cfg = cfg
+        self.cfg = cfg.copy()
         assert mode.lower() in ['train', 'eval', 'test'], \
                 "mode should be 'train', 'eval' or 'test'"
         self.mode = mode.lower()
@@ -72,10 +72,14 @@ class Trainer(object):
         self.amp_level = self.cfg.get('amp_level', 'O1')
         self.custom_white_list = self.cfg.get('custom_white_list', None)
         self.custom_black_list = self.cfg.get('custom_black_list', None)
+        if 'slim' in cfg and cfg['slim_type'] == 'PTQ':
+            self.cfg['TestDataset'] = create('TestDataset')()
 
         # build data loader
         capital_mode = self.mode.capitalize()
-        if cfg.architecture in MOT_ARCH and self.mode in ['eval', 'test']:
+        if cfg.architecture in MOT_ARCH and self.mode in [
+                'eval', 'test'
+        ] and cfg.metric not in ['COCO', 'VOC']:
             self.dataset = self.cfg['{}MOTDataset'.format(
                 capital_mode)] = create('{}MOTDataset'.format(capital_mode))()
         else:
@@ -95,12 +99,12 @@ class Trainer(object):
                 self.dataset, cfg.worker_num)
 
         if cfg.architecture == 'JDE' and self.mode == 'train':
-            cfg['JDEEmbeddingHead'][
+            self.cfg['JDEEmbeddingHead'][
                 'num_identities'] = self.dataset.num_identities_dict[0]
             # JDE only support single class MOT now.
 
         if cfg.architecture == 'FairMOT' and self.mode == 'train':
-            cfg['FairMOTEmbeddingHead'][
+            self.cfg['FairMOTEmbeddingHead'][
                 'num_identities_dict'] = self.dataset.num_identities_dict
             # FairMOT support single class and multi-class MOT now.
 
@@ -136,17 +140,30 @@ class Trainer(object):
         if self.mode == 'eval':
             if cfg.architecture == 'FairMOT':
                 self.loader = create('EvalMOTReader')(self.dataset, 0)
+            elif cfg.architecture == "METRO_Body":
+                reader_name = '{}Reader'.format(self.mode.capitalize())
+                self.loader = create(reader_name)(self.dataset, cfg.worker_num)
             else:
                 self._eval_batch_sampler = paddle.io.BatchSampler(
                     self.dataset, batch_size=self.cfg.EvalReader['batch_size'])
                 reader_name = '{}Reader'.format(self.mode.capitalize())
                 # If metric is VOC, need to be set collate_batch=False.
                 if cfg.metric == 'VOC':
-                    cfg[reader_name]['collate_batch'] = False
+                    self.cfg[reader_name]['collate_batch'] = False
                 self.loader = create(reader_name)(self.dataset, cfg.worker_num,
                                                   self._eval_batch_sampler)
         # TestDataset build after user set images, skip loader creation here
 
+        # get Params
+        print_params = self.cfg.get('print_params', False)
+        if print_params:
+            params = sum([
+                p.numel() for n, p in self.model.named_parameters()
+                if all([x not in n for x in ['_mean', '_variance', 'aux_']])
+            ])  # exclude BatchNorm running status
+            logger.info('Model Params : {} M.'.format((params / 1e6).numpy()[
+                0]))
+
         # build optimizer in train mode
         if self.mode == 'train':
             steps_per_epoch = len(self.loader)
@@ -172,12 +189,14 @@ class Trainer(object):
             ema_decay_type = self.cfg.get('ema_decay_type', 'threshold')
             cycle_epoch = self.cfg.get('cycle_epoch', -1)
             ema_black_list = self.cfg.get('ema_black_list', None)
+            ema_filter_no_grad = self.cfg.get('ema_filter_no_grad', False)
             self.ema = ModelEMA(
                 self.model,
                 decay=ema_decay,
                 ema_decay_type=ema_decay_type,
                 cycle_epoch=cycle_epoch,
-                ema_black_list=ema_black_list)
+                ema_black_list=ema_black_list,
+                ema_filter_no_grad=ema_filter_no_grad)
 
         self._nranks = dist.get_world_size()
         self._local_rank = dist.get_rank()
@@ -342,6 +361,13 @@ class Trainer(object):
                     self.cfg.save_dir,
                     save_prediction_only=save_prediction_only)
             ]
+        elif self.cfg.metric == 'Pose3DEval':
+            save_prediction_only = self.cfg.get('save_prediction_only', False)
+            self._metrics = [
+                Pose3DEval(
+                    self.cfg.save_dir,
+                    save_prediction_only=save_prediction_only)
+            ]
         elif self.cfg.metric == 'MOTDet':
             self._metrics = [JDEDetMetric(), ]
         else:
@@ -378,7 +404,8 @@ class Trainer(object):
     def load_weights_sde(self, det_weights, reid_weights):
         if self.model.detector:
             load_weight(self.model.detector, det_weights)
-            load_weight(self.model.reid, reid_weights)
+            if self.model.reid:
+                load_weight(self.model.reid, reid_weights)
         else:
             load_weight(self.model.reid, reid_weights)
 
@@ -400,15 +427,19 @@ class Trainer(object):
                 "EvalDataset")()
 
         model = self.model
-        sync_bn = (getattr(self.cfg, 'norm_type', None) == 'sync_bn' and
-                   self.cfg.use_gpu and self._nranks > 1)
+        if self.cfg.get('to_static', False):
+            model = apply_to_static(self.cfg, model)
+        sync_bn = (
+            getattr(self.cfg, 'norm_type', None) == 'sync_bn' and
+            (self.cfg.use_gpu or self.cfg.use_npu or self.cfg.use_mlu) and
+            self._nranks > 1)
         if sync_bn:
             model = paddle.nn.SyncBatchNorm.convert_sync_batchnorm(model)
 
         # enable auto mixed precision mode
         if self.use_amp:
             scaler = paddle.amp.GradScaler(
-                enable=self.cfg.use_gpu or self.cfg.use_npu,
+                enable=self.cfg.use_gpu or self.cfg.use_npu or self.cfg.use_mlu,
                 init_loss_scaling=self.cfg.get('init_loss_scaling', 1024))
         # get distributed model
         if self.cfg.get('fleet', False):
@@ -463,7 +494,8 @@ class Trainer(object):
                             DataParallel) and use_fused_allreduce_gradients:
                         with model.no_sync():
                             with paddle.amp.auto_cast(
-                                    enable=self.cfg.use_gpu,
+                                    enable=self.cfg.use_gpu or
+                                    self.cfg.use_npu or self.cfg.use_mlu,
                                     custom_white_list=self.custom_white_list,
                                     custom_black_list=self.custom_black_list,
                                     level=self.amp_level):
@@ -477,7 +509,8 @@ class Trainer(object):
                             list(model.parameters()), None)
                     else:
                         with paddle.amp.auto_cast(
-                                enable=self.cfg.use_gpu,
+                                enable=self.cfg.use_gpu or self.cfg.use_npu or
+                                self.cfg.use_mlu,
                                 custom_white_list=self.custom_white_list,
                                 custom_black_list=self.custom_black_list,
                                 level=self.amp_level):
@@ -527,7 +560,7 @@ class Trainer(object):
             if self.cfg.get('unstructured_prune'):
                 self.pruner.update_params()
 
-            is_snapshot = (self._nranks < 2 or self._local_rank == 0) \
+            is_snapshot = (self._nranks < 2 or (self._local_rank == 0 or self.cfg.metric == "Pose3DEval")) \
                        and ((epoch_id + 1) % self.cfg.snapshot_epoch == 0 or epoch_id == self.end_epoch - 1)
             if is_snapshot and self.use_ema:
                 # apply ema weight on model
@@ -548,10 +581,14 @@ class Trainer(object):
                     # If metric is VOC, need to be set collate_batch=False.
                     if self.cfg.metric == 'VOC':
                         self.cfg['EvalReader']['collate_batch'] = False
-                    self._eval_loader = create('EvalReader')(
-                        self._eval_dataset,
-                        self.cfg.worker_num,
-                        batch_sampler=self._eval_batch_sampler)
+                    if self.cfg.metric == "Pose3DEval":
+                        self._eval_loader = create('EvalReader')(
+                            self._eval_dataset, self.cfg.worker_num)
+                    else:
+                        self._eval_loader = create('EvalReader')(
+                            self._eval_dataset,
+                            self.cfg.worker_num,
+                            batch_sampler=self._eval_batch_sampler)
                 # if validation in training is enabled, metrics should be re-init
                 # Init_mark makes sure this code will only execute once
                 if validate and Init_mark == False:
@@ -575,6 +612,7 @@ class Trainer(object):
         tic = time.time()
         self._compose_callback.on_epoch_begin(self.status)
         self.status['mode'] = 'eval'
+
         self.model.eval()
         if self.cfg.get('print_flops', False):
             flops_loader = create('{}Reader'.format(self.mode.capitalize()))(
@@ -586,7 +624,8 @@ class Trainer(object):
             # forward
             if self.use_amp:
                 with paddle.amp.auto_cast(
-                        enable=self.cfg.use_gpu,
+                        enable=self.cfg.use_gpu or self.cfg.use_npu or
+                        self.cfg.use_mlu,
                         custom_white_list=self.custom_white_list,
                         custom_black_list=self.custom_black_list,
                         level=self.amp_level):
@@ -617,6 +656,15 @@ class Trainer(object):
         self._reset_metrics()
 
     def evaluate(self):
+        # get distributed model
+        if self.cfg.get('fleet', False):
+            self.model = fleet.distributed_model(self.model)
+            self.optimizer = fleet.distributed_optimizer(self.optimizer)
+        elif self._nranks > 1:
+            find_unused_parameters = self.cfg[
+                'find_unused_parameters'] if 'find_unused_parameters' in self.cfg else False
+            self.model = paddle.DataParallel(
+                self.model, find_unused_parameters=find_unused_parameters)
         with paddle.no_grad():
             self._eval_with_loader(self.loader)
 
@@ -644,7 +692,8 @@ class Trainer(object):
             # forward
             if self.use_amp:
                 with paddle.amp.auto_cast(
-                        enable=self.cfg.use_gpu,
+                        enable=self.cfg.use_gpu or self.cfg.use_npu or
+                        self.cfg.use_mlu,
                         custom_white_list=self.custom_white_list,
                         custom_black_list=self.custom_black_list,
                         level=self.amp_level):
@@ -722,11 +771,51 @@ class Trainer(object):
                       output_dir='output',
                       save_results=False,
                       visualize=True):
+        if not os.path.exists(output_dir):
+            os.makedirs(output_dir)
+
         self.dataset.set_slice_images(images, slice_size, overlap_ratio)
         loader = create('TestReader')(self.dataset, 0)
-
         imid2path = self.dataset.get_imid2path()
 
+        def setup_metrics_for_loader():
+            # remember the current metrics and related cfg entries so they can be restored below
+            metrics = copy.deepcopy(self._metrics)
+            mode = self.mode
+            save_prediction_only = self.cfg[
+                'save_prediction_only'] if 'save_prediction_only' in self.cfg else None
+            output_eval = self.cfg[
+                'output_eval'] if 'output_eval' in self.cfg else None
+
+            # modify
+            self.mode = '_test'
+            self.cfg['save_prediction_only'] = True
+            self.cfg['output_eval'] = output_dir
+            self.cfg['imid2path'] = imid2path
+            self._init_metrics()
+
+            # restore
+            self.mode = mode
+            self.cfg.pop('save_prediction_only')
+            if save_prediction_only is not None:
+                self.cfg['save_prediction_only'] = save_prediction_only
+
+            self.cfg.pop('output_eval')
+            if output_eval is not None:
+                self.cfg['output_eval'] = output_eval
+
+            self.cfg.pop('imid2path')
+
+            _metrics = copy.deepcopy(self._metrics)
+            self._metrics = metrics
+
+            return _metrics
+
+        if save_results:
+            metrics = setup_metrics_for_loader()
+        else:
+            metrics = []
+
         anno_file = self.dataset.get_anno()
         clsid2catid, catid2name = get_categories(
             self.cfg.metric, anno_file=anno_file)
@@ -772,6 +861,9 @@ class Trainer(object):
                 merged_bboxs = []
                 data['im_id'] = data['ori_im_id']
 
+                for _m in metrics:
+                    _m.update(data, merged_results)
+
                 for key in ['im_shape', 'scale_factor', 'im_id']:
                     if isinstance(data, typing.Sequence):
                         merged_results[key] = data[0][key]
@@ -782,23 +874,36 @@ class Trainer(object):
                         merged_results[key] = value.numpy()
                 results.append(merged_results)
 
+        for _m in metrics:
+            _m.accumulate()
+            _m.reset()
+
         if visualize:
             for outs in results:
                 batch_res = get_infer_results(outs, clsid2catid)
                 bbox_num = outs['bbox_num']
+
                 start = 0
                 for i, im_id in enumerate(outs['im_id']):
                     image_path = imid2path[int(im_id)]
                     image = Image.open(image_path).convert('RGB')
                     image = ImageOps.exif_transpose(image)
                     self.status['original_image'] = np.array(image.copy())
+
                     end = start + bbox_num[i]
                     bbox_res = batch_res['bbox'][start:end] \
                             if 'bbox' in batch_res else None
-                    mask_res, segm_res, keypoint_res = None, None, None
+                    mask_res = batch_res['mask'][start:end] \
+                            if 'mask' in batch_res else None
+                    segm_res = batch_res['segm'][start:end] \
+                            if 'segm' in batch_res else None
+                    keypoint_res = batch_res['keypoint'][start:end] \
+                            if 'keypoint' in batch_res else None
+                    pose3d_res = batch_res['pose3d'][start:end] \
+                            if 'pose3d' in batch_res else None
                     image = visualize_results(
                         image, bbox_res, mask_res, segm_res, keypoint_res,
-                        int(im_id), catid2name, draw_threshold)
+                        pose3d_res, int(im_id), catid2name, draw_threshold)
                     self.status['result_image'] = np.array(image.copy())
                     if self._compose_callback:
                         self._compose_callback.on_step_end(self.status)
@@ -808,6 +913,7 @@ class Trainer(object):
                     logger.info("Detection bbox results save in {}".format(
                         save_name))
                     image.save(save_name, quality=95)
+
                     start = end
 
     def predict(self,
@@ -921,9 +1027,11 @@ class Trainer(object):
                             if 'segm' in batch_res else None
                     keypoint_res = batch_res['keypoint'][start:end] \
                             if 'keypoint' in batch_res else None
+                    pose3d_res = batch_res['pose3d'][start:end] \
+                            if 'pose3d' in batch_res else None
                     image = visualize_results(
                         image, bbox_res, mask_res, segm_res, keypoint_res,
-                        int(im_id), catid2name, draw_threshold)
+                        pose3d_res, int(im_id), catid2name, draw_threshold)
                     self.status['result_image'] = np.array(image.copy())
                     if self._compose_callback:
                         self._compose_callback.on_step_end(self.status)
@@ -935,6 +1043,7 @@ class Trainer(object):
                     image.save(save_name, quality=95)
 
                     start = end
+        return results
 
     def _get_save_image_name(self, output_dir, image_path):
         """
@@ -976,6 +1085,10 @@ class Trainer(object):
                 if hasattr(layer, 'convert_to_deploy'):
                     layer.convert_to_deploy()
 
+        if hasattr(self.cfg, 'export') and 'fuse_conv_bn' in self.cfg[
+                'export'] and self.cfg['export']['fuse_conv_bn']:
+            self.model = fuse_conv_bn(self.model)
+
         export_post_process = self.cfg['export'].get(
             'post_process', False) if hasattr(self.cfg, 'export') else True
         export_nms = self.cfg['export'].get('nms', False) if hasattr(
@@ -1045,12 +1158,12 @@ class Trainer(object):
         return static_model, pruned_input_spec
 
     def export(self, output_dir='output_inference'):
+        if hasattr(self.model, 'aux_neck'):
+            self.model.__delattr__('aux_neck')
+        if hasattr(self.model, 'aux_head'):
+            self.model.__delattr__('aux_head')
         self.model.eval()
 
-        if hasattr(self.cfg, 'export') and 'fuse_conv_bn' in self.cfg[
-                'export'] and self.cfg['export']['fuse_conv_bn']:
-            self.model = fuse_conv_bn(self.model)
-
         model_name = os.path.splitext(os.path.split(self.cfg.filename)[-1])[0]
         save_dir = os.path.join(output_dir, model_name)
         if not os.path.exists(save_dir):
@@ -1095,6 +1208,10 @@ class Trainer(object):
         logger.info("Export Post-Quant model and saved in {}".format(save_dir))
 
     def _flops(self, loader):
+        if hasattr(self.model, 'aux_neck'):
+            self.model.__delattr__('aux_neck')
+        if hasattr(self.model, 'aux_head'):
+            self.model.__delattr__('aux_head')
         self.model.eval()
         try:
             import paddleslim
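The trainer.py hunks above thread NPU/MLU support through every auto_cast call, wire up the Pose3DEval metric, and extend the sliced-inference path with optional per-slice metric collection (setup_metrics_for_loader) and pose3d visualization. A minimal usage sketch for that sliced-inference path, assuming the modified method is Trainer.slice_predict and that the config carries a trained weights entry (both names are assumptions, not confirmed by this diff):

    trainer = Trainer(cfg, mode='test')
    trainer.load_weights(cfg.weights)      # weights entry assumed to exist in the config
    trainer.slice_predict(                 # method name assumed from the hunk above
        images=['demo/P0072.png'],         # illustrative image path
        slice_size=[640, 640],
        overlap_ratio=[0.25, 0.25],
        output_dir='output',
        save_results=True,                 # triggers setup_metrics_for_loader()
        visualize=True)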

+ 42 - 0
paddlers/models/ppdet/engine/trainer_cot.py

@@ -0,0 +1,42 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from paddlers.models.ppdet.core.workspace import create
+from paddlers.models.ppdet.utils.logger import setup_logger
+logger = setup_logger('ppdet.engine')
+
+from . import Trainer
+__all__ = ['TrainerCot']
+
+class TrainerCot(Trainer):
+    """
+    Trainer for label-cotuning
+    calculate the relationship between base_classes and novel_classes
+    """
+    def __init__(self, cfg, mode='train'):
+        super(TrainerCot, self).__init__(cfg, mode)
+        self.cotuning_init()
+
+    def cotuning_init(self):    
+        num_classes_novel = self.cfg['num_classes']
+
+        self.load_weights(self.cfg.pretrain_weights)
+
+        self.model.eval()
+        relationship = self.model.relationship_learning(self.loader, num_classes_novel)
+    
+        self.model.init_cot_head(relationship)
+        self.optimizer = create('OptimizerBuilder')(self.lr, self.model)
+
+
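For context, a hypothetical way to drive the co-tuning trainer defined above; the config file name is illustrative, and the presence of pretrain_weights and num_classes keys is assumed rather than shown in this diff:

    from paddlers.models.ppdet.core.workspace import load_config

    cfg = load_config('configs/faster_rcnn_r50_fpn_cot.yml')  # illustrative path
    trainer = TrainerCot(cfg, mode='train')   # cotuning_init() runs inside __init__
    trainer.train(validate=True)

Note that cotuning_init() first loads the pretrained weights, estimates the base-to-novel class relationship from the training loader, and only then re-creates the optimizer so the newly initialized co-tuning head parameters are registered with it.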

+ 475 - 0
paddlers/models/ppdet/engine/trainer_ssod.py

@@ -0,0 +1,475 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import copy
+import time
+import typing
+import numpy as np
+
+import paddle
+import paddle.nn as nn
+import paddle.distributed as dist
+from paddle.distributed import fleet
+from paddlers.models.ppdet.optimizer import ModelEMA, SimpleModelEMA
+
+from paddlers.models.ppdet.core.workspace import create
+from paddlers.models.ppdet.utils.checkpoint import load_weight, load_pretrain_weight
+import paddlers.models.ppdet.utils.stats as stats
+from paddlers.models.ppdet.utils import profiler
+from paddlers.models.ppdet.modeling.ssod.utils import align_weak_strong_shape
+from .trainer import Trainer
+
+from paddlers.models.ppdet.utils.logger import setup_logger
+logger = setup_logger('ppdet.engine')
+
+__all__ = ['Trainer_DenseTeacher']
+
+
+class Trainer_DenseTeacher(Trainer):
+    def __init__(self, cfg, mode='train'):
+        self.cfg = cfg
+        assert mode.lower() in ['train', 'eval', 'test'], \
+                "mode should be 'train', 'eval' or 'test'"
+        self.mode = mode.lower()
+        self.optimizer = None
+        self.is_loaded_weights = False
+        self.use_amp = self.cfg.get('amp', False)
+        self.amp_level = self.cfg.get('amp_level', 'O1')
+        self.custom_white_list = self.cfg.get('custom_white_list', None)
+        self.custom_black_list = self.cfg.get('custom_black_list', None)
+
+        # build data loader
+        capital_mode = self.mode.capitalize()
+        self.dataset = self.cfg['{}Dataset'.format(capital_mode)] = create(
+            '{}Dataset'.format(capital_mode))()
+
+        if self.mode == 'train':
+            self.dataset_unlabel = self.cfg['UnsupTrainDataset'] = create(
+                'UnsupTrainDataset')
+            self.loader = create('SemiTrainReader')(
+                self.dataset, self.dataset_unlabel, cfg.worker_num)
+
+        # build model
+        if 'model' not in self.cfg:
+            self.model = create(cfg.architecture)
+        else:
+            self.model = self.cfg.model
+            self.is_loaded_weights = True
+
+        # EvalDataset is built with a BatchSampler so that evaluation runs on a single device
+        # TODO: support multi-device evaluation
+        if self.mode == 'eval':
+            self._eval_batch_sampler = paddle.io.BatchSampler(
+                self.dataset, batch_size=self.cfg.EvalReader['batch_size'])
+            # If metric is VOC, need to be set collate_batch=False.
+            if cfg.metric == 'VOC':
+                cfg['EvalReader']['collate_batch'] = False
+            self.loader = create('EvalReader')(self.dataset, cfg.worker_num,
+                                               self._eval_batch_sampler)
+        # TestDataset is built after the user sets images, so loader creation is skipped here
+
+        # build optimizer in train mode
+        if self.mode == 'train':
+            steps_per_epoch = len(self.loader)
+            if steps_per_epoch < 1:
+                logger.warning(
+                    "Number of samples in the dataset is less than the batch size; please set a smaller batch_size in TrainReader."
+                )
+            self.lr = create('LearningRate')(steps_per_epoch)
+            self.optimizer = create('OptimizerBuilder')(self.lr, self.model)
+
+            # Unstructured pruner is only enabled in the train mode.
+            if self.cfg.get('unstructured_prune'):
+                self.pruner = create('UnstructuredPruner')(self.model,
+                                                           steps_per_epoch)
+        if self.use_amp and self.amp_level == 'O2':
+            self.model, self.optimizer = paddle.amp.decorate(
+                models=self.model,
+                optimizers=self.optimizer,
+                level=self.amp_level)
+
+        self.use_ema = ('use_ema' in cfg and cfg['use_ema'])
+        if self.use_ema:
+            ema_decay = self.cfg.get('ema_decay', 0.9998)
+            ema_decay_type = self.cfg.get('ema_decay_type', 'threshold')
+            cycle_epoch = self.cfg.get('cycle_epoch', -1)
+            ema_black_list = self.cfg.get('ema_black_list', None)
+            self.ema = ModelEMA(
+                self.model,
+                decay=ema_decay,
+                ema_decay_type=ema_decay_type,
+                cycle_epoch=cycle_epoch,
+                ema_black_list=ema_black_list)
+            self.ema_start_iters = self.cfg.get('ema_start_iters', 0)
+
+        # simple_ema for SSOD
+        self.use_simple_ema = ('use_simple_ema' in cfg and
+                               cfg['use_simple_ema'])
+        if self.use_simple_ema:
+            self.use_ema = True
+            ema_decay = self.cfg.get('ema_decay', 0.9996)
+            self.ema = SimpleModelEMA(self.model, decay=ema_decay)
+            self.ema_start_iters = self.cfg.get('ema_start_iters', 0)
+
+        self._nranks = dist.get_world_size()
+        self._local_rank = dist.get_rank()
+
+        self.status = {}
+
+        self.start_epoch = 0
+        self.end_epoch = 0 if 'epoch' not in cfg else cfg.epoch
+
+        # initial default callbacks
+        self._init_callbacks()
+
+        # initial default metrics
+        self._init_metrics()
+        self._reset_metrics()
+
+    def load_weights(self, weights):
+        if self.is_loaded_weights:
+            return
+        self.start_epoch = 0
+        load_pretrain_weight(self.model, weights)
+        load_pretrain_weight(self.ema.model, weights)
+        logger.info("Load weights {} to start training for teacher and student".
+                    format(weights))
+
+    def resume_weights(self, weights, exchange=True):
+        # support Distill resume weights
+        if hasattr(self.model, 'student_model'):
+            self.start_epoch = load_weight(self.model.student_model, weights,
+                                           self.optimizer, exchange)
+        else:
+            self.start_epoch = load_weight(self.model, weights, self.optimizer,
+                                           self.ema
+                                           if self.use_ema else None, exchange)
+        logger.debug("Resume weights of epoch {}".format(self.start_epoch))
+
+    def train(self, validate=False):
+        self.semi_start_iters = self.cfg.get('semi_start_iters', 5000)
+        Init_mark = False
+        if validate:
+            self.cfg['EvalDataset'] = self.cfg.EvalDataset = create(
+                "EvalDataset")()
+
+        sync_bn = (getattr(self.cfg, 'norm_type', None) == 'sync_bn' and
+                   self.cfg.use_gpu and self._nranks > 1)
+        if sync_bn:
+            self.model = paddle.nn.SyncBatchNorm.convert_sync_batchnorm(
+                self.model)
+
+        if self.cfg.get('fleet', False):
+            self.model = fleet.distributed_model(self.model)
+            self.optimizer = fleet.distributed_optimizer(self.optimizer)
+        elif self._nranks > 1:
+            find_unused_parameters = self.cfg[
+                'find_unused_parameters'] if 'find_unused_parameters' in self.cfg else False
+            self.model = paddle.DataParallel(
+                self.model, find_unused_parameters=find_unused_parameters)
+            self.ema.model = paddle.DataParallel(
+                self.ema.model, find_unused_parameters=find_unused_parameters)
+
+        self.status.update({
+            'epoch_id': self.start_epoch,
+            'step_id': 0,
+            'steps_per_epoch': len(self.loader),
+            'exchange_save_model': True,
+        })
+        # Note: exchange_save_model
+        # in DenseTeacher SSOD the teacher model usually scores higher than the student, so the two are exchanged when saving pdparams
+
+        self.status['batch_time'] = stats.SmoothedValue(
+            self.cfg.log_iter, fmt='{avg:.4f}')
+        self.status['data_time'] = stats.SmoothedValue(
+            self.cfg.log_iter, fmt='{avg:.4f}')
+        self.status['training_staus'] = stats.TrainingStats(self.cfg.log_iter)
+
+        if self.cfg.get('print_flops', False):
+            flops_loader = create('{}Reader'.format(self.mode.capitalize()))(
+                self.dataset, self.cfg.worker_num)
+            self._flops(flops_loader)
+        profiler_options = self.cfg.get('profiler_options', None)
+        self._compose_callback.on_train_begin(self.status)
+
+        train_cfg = self.cfg.DenseTeacher['train_cfg']
+        concat_sup_data = train_cfg.get('concat_sup_data', True)
+
+        for param in self.ema.model.parameters():
+            param.stop_gradient = True
+
+        for epoch_id in range(self.start_epoch, self.cfg.epoch):
+            self.status['mode'] = 'train'
+            self.status['epoch_id'] = epoch_id
+            self._compose_callback.on_epoch_begin(self.status)
+            self.loader.dataset_label.set_epoch(epoch_id)
+            self.loader.dataset_unlabel.set_epoch(epoch_id)
+            iter_tic = time.time()
+            loss_dict = {
+                'loss': paddle.to_tensor([0]),
+                'loss_sup_sum': paddle.to_tensor([0]),
+                'loss_unsup_sum': paddle.to_tensor([0]),
+                'fg_sum': paddle.to_tensor([0]),
+            }
+            if self._nranks > 1:
+                for k in self.model._layers.get_loss_keys():
+                    loss_dict.update({k: paddle.to_tensor([0.])})
+                for k in self.model._layers.get_loss_keys():
+                    loss_dict.update({'distill_' + k: paddle.to_tensor([0.])})
+            else:
+                for k in self.model.get_loss_keys():
+                    loss_dict.update({k: paddle.to_tensor([0.])})
+                for k in self.model.get_loss_keys():
+                    loss_dict.update({'distill_' + k: paddle.to_tensor([0.])})
+
+            # Note: `for step_id, data in enumerate(self.loader)` is avoided here because enumerate over this loader is buggy; iterate by index instead
+            for step_id in range(len(self.loader)):
+                data = next(self.loader)
+
+                self.model.train()
+                self.ema.model.eval()
+                data_sup_w, data_sup_s, data_unsup_w, data_unsup_s = data
+
+                self.status['data_time'].update(time.time() - iter_tic)
+                self.status['step_id'] = step_id
+                profiler.add_profiler_step(profiler_options)
+                self._compose_callback.on_step_begin(self.status)
+
+                if data_sup_w['image'].shape != data_sup_s['image'].shape:
+                    data_sup_w, data_sup_s = align_weak_strong_shape(data_sup_w,
+                                                                     data_sup_s)
+
+                data_sup_w['epoch_id'] = epoch_id
+                data_sup_s['epoch_id'] = epoch_id
+                if concat_sup_data:
+                    for k, v in data_sup_s.items():
+                        if k in ['epoch_id']:
+                            continue
+                        data_sup_s[k] = paddle.concat([v, data_sup_w[k]])
+                    loss_dict_sup = self.model(data_sup_s)
+                else:
+                    loss_dict_sup_w = self.model(data_sup_w)
+                    loss_dict_sup = self.model(data_sup_s)
+                    for k, v in loss_dict_sup_w.items():
+                        loss_dict_sup[k] = (loss_dict_sup[k] + v) * 0.5
+
+                losses_sup = loss_dict_sup['loss'] * train_cfg['sup_weight']
+                losses_sup.backward()
+
+                losses = losses_sup.detach()
+                loss_dict.update(loss_dict_sup)
+                loss_dict.update({'loss_sup_sum': loss_dict['loss']})
+
+                curr_iter = len(self.loader) * epoch_id + step_id
+                st_iter = self.semi_start_iters
+                if curr_iter == st_iter:
+                    logger.info("***" * 30)
+                    logger.info('Semi starting ...')
+                    logger.info("***" * 30)
+                if curr_iter > st_iter:
+                    unsup_weight = train_cfg['unsup_weight']
+                    if train_cfg['suppress'] == 'linear':
+                        tar_iter = st_iter * 2
+                        if curr_iter <= tar_iter:
+                            unsup_weight *= (curr_iter - st_iter) / st_iter
+                    elif train_cfg['suppress'] == 'exp':
+                        tar_iter = st_iter + 2000
+                        if curr_iter <= tar_iter:
+                            scale = np.exp((curr_iter - tar_iter) / 1000)
+                            unsup_weight *= scale
+                    elif train_cfg['suppress'] == 'step':
+                        tar_iter = st_iter * 2
+                        if curr_iter <= tar_iter:
+                            unsup_weight *= 0.25
+                    else:
+                        raise ValueError('Unknown suppress mode: {}'.format(train_cfg['suppress']))
+
+                    if data_unsup_w['image'].shape != data_unsup_s[
+                            'image'].shape:
+                        data_unsup_w, data_unsup_s = align_weak_strong_shape(
+                            data_unsup_w, data_unsup_s)
+
+                    data_unsup_w['epoch_id'] = epoch_id
+                    data_unsup_s['epoch_id'] = epoch_id
+
+                    data_unsup_s['get_data'] = True
+                    student_preds = self.model(data_unsup_s)
+
+                    with paddle.no_grad():
+                        data_unsup_w['is_teacher'] = True
+                        teacher_preds = self.ema.model(data_unsup_w)
+
+                    train_cfg['curr_iter'] = curr_iter
+                    train_cfg['st_iter'] = st_iter
+                    if self._nranks > 1:
+                        loss_dict_unsup = self.model._layers.get_ssod_loss(
+                            student_preds, teacher_preds, train_cfg)
+                    else:
+                        loss_dict_unsup = self.model.get_ssod_loss(
+                            student_preds, teacher_preds, train_cfg)
+
+                    fg_num = loss_dict_unsup["fg_sum"]
+                    del loss_dict_unsup["fg_sum"]
+                    distill_weights = train_cfg['loss_weight']
+                    loss_dict_unsup = {
+                        k: v * distill_weights[k]
+                        for k, v in loss_dict_unsup.items()
+                    }
+
+                    losses_unsup = sum([
+                        metrics_value
+                        for metrics_value in loss_dict_unsup.values()
+                    ]) * unsup_weight
+                    losses_unsup.backward()
+
+                    loss_dict.update(loss_dict_unsup)
+                    loss_dict.update({'loss_unsup_sum': losses_unsup})
+                    losses += losses_unsup.detach()
+                    loss_dict.update({"fg_sum": fg_num})
+                    loss_dict['loss'] = losses
+
+                self.optimizer.step()
+                curr_lr = self.optimizer.get_lr()
+                self.lr.step()
+                self.optimizer.clear_grad()
+                self.status['learning_rate'] = curr_lr
+                if self._nranks < 2 or self._local_rank == 0:
+                    self.status['training_staus'].update(loss_dict)
+
+                self.status['batch_time'].update(time.time() - iter_tic)
+                self._compose_callback.on_step_end(self.status)
+                # Note: ema_start_iters
+                if self.use_ema and curr_iter == self.ema_start_iters:
+                    logger.info("***" * 30)
+                    logger.info('EMA starting ...')
+                    logger.info("***" * 30)
+                    self.ema.update(self.model, decay=0)
+                elif self.use_ema and curr_iter > self.ema_start_iters:
+                    self.ema.update(self.model)
+                iter_tic = time.time()
+
+            is_snapshot = (self._nranks < 2 or self._local_rank == 0) \
+                       and ((epoch_id + 1) % self.cfg.snapshot_epoch == 0 or epoch_id == self.end_epoch - 1)
+            if is_snapshot and self.use_ema:
+                # apply ema weight on model
+                weight = copy.deepcopy(self.ema.model.state_dict())
+                for k, v in weight.items():
+                    if paddle.is_floating_point(v):
+                        weight[k].stop_gradient = True
+                self.status['weight'] = weight
+
+            self._compose_callback.on_epoch_end(self.status)
+
+            if validate and is_snapshot:
+                if not hasattr(self, '_eval_loader'):
+                    # build evaluation dataset and loader
+                    self._eval_dataset = self.cfg.EvalDataset
+                    self._eval_batch_sampler = \
+                        paddle.io.BatchSampler(
+                            self._eval_dataset,
+                            batch_size=self.cfg.EvalReader['batch_size'])
+                    # If metric is VOC, need to be set collate_batch=False.
+                    if self.cfg.metric == 'VOC':
+                        self.cfg['EvalReader']['collate_batch'] = False
+                    self._eval_loader = create('EvalReader')(
+                        self._eval_dataset,
+                        self.cfg.worker_num,
+                        batch_sampler=self._eval_batch_sampler)
+                # if validation in training is enabled, metrics should be re-init
+                # Init_mark makes sure this code will only execute once
+                if validate and Init_mark == False:
+                    Init_mark = True
+                    self._init_metrics(validate=validate)
+                    self._reset_metrics()
+
+                with paddle.no_grad():
+                    self.status['save_best_model'] = True
+                    self._eval_with_loader(self._eval_loader)
+
+            if is_snapshot and self.use_ema:
+                self.status.pop('weight')
+
+        self._compose_callback.on_train_end(self.status)
+
+    def evaluate(self):
+        # get distributed model
+        if self.cfg.get('fleet', False):
+            self.model = fleet.distributed_model(self.model)
+            self.optimizer = fleet.distributed_optimizer(self.optimizer)
+        elif self._nranks > 1:
+            find_unused_parameters = self.cfg[
+                'find_unused_parameters'] if 'find_unused_parameters' in self.cfg else False
+            self.model = paddle.DataParallel(
+                self.model, find_unused_parameters=find_unused_parameters)
+        with paddle.no_grad():
+            self._eval_with_loader(self.loader)
+
+    def _eval_with_loader(self, loader):
+        sample_num = 0
+        tic = time.time()
+        self._compose_callback.on_epoch_begin(self.status)
+        self.status['mode'] = 'eval'
+
+        test_cfg = self.cfg.DenseTeacher['test_cfg']
+        if test_cfg['inference_on'] == 'teacher':
+            logger.info("***** teacher model evaluating *****")
+            eval_model = self.ema.model
+        else:
+            logger.info("***** student model evaluating *****")
+            eval_model = self.model
+
+        eval_model.eval()
+        if self.cfg.get('print_flops', False):
+            flops_loader = create('{}Reader'.format(self.mode.capitalize()))(
+                self.dataset, self.cfg.worker_num, self._eval_batch_sampler)
+            self._flops(flops_loader)
+        for step_id, data in enumerate(loader):
+            self.status['step_id'] = step_id
+            self._compose_callback.on_step_begin(self.status)
+            # forward
+            if self.use_amp:
+                with paddle.amp.auto_cast(
+                        enable=self.cfg.use_gpu or self.cfg.use_mlu,
+                        custom_white_list=self.custom_white_list,
+                        custom_black_list=self.custom_black_list,
+                        level=self.amp_level):
+                    outs = eval_model(data)
+            else:
+                outs = eval_model(data)
+
+            # update metrics
+            for metric in self._metrics:
+                metric.update(data, outs)
+
+            # multi-scale inputs: all inputs have same im_id
+            if isinstance(data, typing.Sequence):
+                sample_num += data[0]['im_id'].numpy().shape[0]
+            else:
+                sample_num += data['im_id'].numpy().shape[0]
+            self._compose_callback.on_step_end(self.status)
+
+        self.status['sample_num'] = sample_num
+        self.status['cost_time'] = time.time() - tic
+
+        # accumulate metric to log out
+        for metric in self._metrics:
+            metric.accumulate()
+            metric.log()
+        self._compose_callback.on_epoch_end(self.status)
+        # reset metric states for metric may performed multiple times
+        self._reset_metrics()
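In the training loop above, the unsupervised loss weight is only applied after semi_start_iters and is warmed up according to the configured suppress schedule. A standalone restatement of that schedule, for readability only (the real code applies it inline and only when curr_iter > st_iter):

    import numpy as np

    def unsup_weight_at(curr_iter, st_iter, base_weight, suppress='linear'):
        # Mirrors the warm-up logic of Trainer_DenseTeacher.train(); assumes curr_iter > st_iter.
        weight = base_weight
        if suppress == 'linear':
            # ramp linearly from 0 to base_weight between st_iter and 2 * st_iter
            if curr_iter <= st_iter * 2:
                weight *= (curr_iter - st_iter) / st_iter
        elif suppress == 'exp':
            # exponential ramp over the 2000 iterations following st_iter
            if curr_iter <= st_iter + 2000:
                weight *= np.exp((curr_iter - (st_iter + 2000)) / 1000)
        elif suppress == 'step':
            # constant 0.25 * base_weight until 2 * st_iter
            if curr_iter <= st_iter * 2:
                weight *= 0.25
        else:
            raise ValueError('unknown suppress mode: {}'.format(suppress))
        return weight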

+ 18 - 17
paddlers/models/ppdet/ext_op/csrc/rbox_iou/matched_rbox_iou_op.cc → paddlers/models/ppdet/ext_op/csrc/matched_rbox_iou/matched_rbox_iou.cc

@@ -13,14 +13,14 @@
 // limitations under the License.
 //
 // The code is based on
-// https://github.com/csuhan/s2anet/blob/master/mmdet/ops/box_iou_rotated
+// https://github.com/facebookresearch/detectron2/blob/main/detectron2/layers/csrc/box_iou_rotated/
 
+#include "../rbox_iou/rbox_iou_utils.h"
 #include "paddle/extension.h"
-#include "rbox_iou_op.h"
 
 template <typename T>
 void matched_rbox_iou_cpu_kernel(const int rbox_num, const T *rbox1_data_ptr,
-                            const T *rbox2_data_ptr, T *output_data_ptr) {
+                                 const T *rbox2_data_ptr, T *output_data_ptr) {
 
   int i;
   for (i = 0; i < rbox_num; i++) {
@@ -30,42 +30,43 @@ void matched_rbox_iou_cpu_kernel(const int rbox_num, const T *rbox1_data_ptr,
 }
 
 #define CHECK_INPUT_CPU(x)                                                     \
-  PD_CHECK(x.place() == paddle::PlaceType::kCPU, #x " must be a CPU Tensor.")
+  PD_CHECK(x.is_cpu(), #x " must be a CPU Tensor.")
 
-std::vector<paddle::Tensor> MatchedRboxIouCPUForward(const paddle::Tensor &rbox1,
-                                                 const paddle::Tensor &rbox2) {
+std::vector<paddle::Tensor>
+MatchedRboxIouCPUForward(const paddle::Tensor &rbox1,
+                         const paddle::Tensor &rbox2) {
   CHECK_INPUT_CPU(rbox1);
   CHECK_INPUT_CPU(rbox2);
   PD_CHECK(rbox1.shape()[0] == rbox2.shape()[0], "inputs must be same dim");
 
   auto rbox_num = rbox1.shape()[0];
-  auto output = paddle::Tensor(paddle::PlaceType::kCPU, {rbox_num});
+  auto output = paddle::empty({rbox_num}, rbox1.dtype(), paddle::CPUPlace());
 
-  PD_DISPATCH_FLOATING_TYPES(rbox1.type(), "rotated_iou_cpu_kernel", ([&] {
+  PD_DISPATCH_FLOATING_TYPES(rbox1.type(), "matched_rbox_iou_cpu_kernel", ([&] {
                                matched_rbox_iou_cpu_kernel<data_t>(
                                    rbox_num, rbox1.data<data_t>(),
-                                   rbox2.data<data_t>(),
-                                   output.mutable_data<data_t>());
+                                   rbox2.data<data_t>(), output.data<data_t>());
                              }));
 
   return {output};
 }
 
 #ifdef PADDLE_WITH_CUDA
-std::vector<paddle::Tensor> MatchedRboxIouCUDAForward(const paddle::Tensor &rbox1,
-                                                  const paddle::Tensor &rbox2);
+std::vector<paddle::Tensor>
+MatchedRboxIouCUDAForward(const paddle::Tensor &rbox1,
+                          const paddle::Tensor &rbox2);
 #endif
 
 #define CHECK_INPUT_SAME(x1, x2)                                               \
   PD_CHECK(x1.place() == x2.place(), "inputs must be on the same place.")
 
 std::vector<paddle::Tensor> MatchedRboxIouForward(const paddle::Tensor &rbox1,
-                                              const paddle::Tensor &rbox2) {
+                                                  const paddle::Tensor &rbox2) {
   CHECK_INPUT_SAME(rbox1, rbox2);
-  if (rbox1.place() == paddle::PlaceType::kCPU) {
+  if (rbox1.is_cpu()) {
     return MatchedRboxIouCPUForward(rbox1, rbox2);
 #ifdef PADDLE_WITH_CUDA
-  } else if (rbox1.place() == paddle::PlaceType::kGPU) {
+  } else if (rbox1.is_gpu()) {
     return MatchedRboxIouCUDAForward(rbox1, rbox2);
 #endif
   }
@@ -73,12 +74,12 @@ std::vector<paddle::Tensor> MatchedRboxIouForward(const paddle::Tensor &rbox1,
 
 std::vector<std::vector<int64_t>>
 MatchedRboxIouInferShape(std::vector<int64_t> rbox1_shape,
-                     std::vector<int64_t> rbox2_shape) {
+                         std::vector<int64_t> rbox2_shape) {
   return {{rbox1_shape[0]}};
 }
 
 std::vector<paddle::DataType> MatchedRboxIouInferDtype(paddle::DataType t1,
-                                                   paddle::DataType t2) {
+                                                       paddle::DataType t2) {
   return {t1};
 }
 

+ 9 - 14
paddlers/models/ppdet/ext_op/csrc/rbox_iou/matched_rbox_iou_op.cu → paddlers/models/ppdet/ext_op/csrc/matched_rbox_iou/matched_rbox_iou.cu

@@ -13,21 +13,15 @@
 // limitations under the License.
 //
 // The code is based on
-// https://github.com/csuhan/s2anet/blob/master/mmdet/ops/box_iou_rotated
+// https://github.com/facebookresearch/detectron2/blob/main/detectron2/layers/csrc/box_iou_rotated/
 
+#include "../rbox_iou/rbox_iou_utils.h"
 #include "paddle/extension.h"
-#include "rbox_iou_op.h"
-
-/**
-   Computes ceil(a / b)
-*/
-
-static inline int CeilDiv(const int a, const int b) { return (a + b - 1) / b; }
 
 template <typename T>
 __global__ void
 matched_rbox_iou_cuda_kernel(const int rbox_num, const T *rbox1_data_ptr,
-                        const T *rbox2_data_ptr, T *output_data_ptr) {
+                             const T *rbox2_data_ptr, T *output_data_ptr) {
   for (int tid = blockIdx.x * blockDim.x + threadIdx.x; tid < rbox_num;
        tid += blockDim.x * gridDim.x) {
     output_data_ptr[tid] =
@@ -36,17 +30,18 @@ matched_rbox_iou_cuda_kernel(const int rbox_num, const T *rbox1_data_ptr,
 }
 
 #define CHECK_INPUT_GPU(x)                                                     \
-  PD_CHECK(x.place() == paddle::PlaceType::kGPU, #x " must be a GPU Tensor.")
+  PD_CHECK(x.is_gpu(), #x " must be a GPU Tensor.")
 
-std::vector<paddle::Tensor> MatchedRboxIouCUDAForward(const paddle::Tensor &rbox1,
-                                                  const paddle::Tensor &rbox2) {
+std::vector<paddle::Tensor>
+MatchedRboxIouCUDAForward(const paddle::Tensor &rbox1,
+                          const paddle::Tensor &rbox2) {
   CHECK_INPUT_GPU(rbox1);
   CHECK_INPUT_GPU(rbox2);
   PD_CHECK(rbox1.shape()[0] == rbox2.shape()[0], "inputs must be same dim");
 
   auto rbox_num = rbox1.shape()[0];
 
-  auto output = paddle::Tensor(paddle::PlaceType::kGPU, {rbox_num});
+  auto output = paddle::empty({rbox_num}, rbox1.dtype(), paddle::GPUPlace());
 
   const int thread_per_block = 512;
   const int block_per_grid = CeilDiv(rbox_num, thread_per_block);
@@ -56,7 +51,7 @@ std::vector<paddle::Tensor> MatchedRboxIouCUDAForward(const paddle::Tensor &rbox
         matched_rbox_iou_cuda_kernel<
             data_t><<<block_per_grid, thread_per_block, 0, rbox1.stream()>>>(
             rbox_num, rbox1.data<data_t>(), rbox2.data<data_t>(),
-            output.mutable_data<data_t>());
+            output.data<data_t>());
       }));
 
   return {output};
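Both files above implement the matched (element-wise) variant of rotated-box IoU: the two inputs must contain the same number of boxes in [x_ctr, y_ctr, w, h, angle] layout, and the output is a vector with one IoU per row pair, whereas rbox_iou below produces the full pairwise matrix. A shape sketch, assuming the compiled extension is importable as ext_op (the actual import path depends on how csrc/ is built and installed):

    import paddle
    # from ext_op import matched_rbox_iou, rbox_iou   # assumed import after building csrc/

    rbox1 = paddle.rand([100, 5])   # each row: [x_ctr, y_ctr, w, h, angle]
    rbox2 = paddle.rand([100, 5])   # matched variant requires the same number of rows
    # matched_rbox_iou(rbox1, rbox2).shape -> [100]       (IoU of row i with row i)
    # rbox_iou(rbox1, rbox2).shape         -> [100, 100]  (IoU of every pair)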

+ 121 - 0
paddlers/models/ppdet/ext_op/csrc/nms_rotated/nms_rotated.cc

@@ -0,0 +1,121 @@
+//   Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "../rbox_iou/rbox_iou_utils.h"
+#include "paddle/extension.h"
+
+template <typename T>
+void nms_rotated_cpu_kernel(const T *boxes_data, const float threshold,
+                            const int64_t num_boxes, int64_t *num_keep_boxes,
+                            int64_t *output_data) {
+
+  int num_masks = CeilDiv(num_boxes, 64);
+  std::vector<int64_t> masks(num_masks, 0);
+  for (int64_t i = 0; i < num_boxes; ++i) {
+    if (masks[i / 64] & 1ULL << (i % 64))
+      continue;
+    T box_1[5];
+    for (int k = 0; k < 5; ++k) {
+      box_1[k] = boxes_data[i * 5 + k];
+    }
+    for (int64_t j = i + 1; j < num_boxes; ++j) {
+      if (masks[j / 64] & 1ULL << (j % 64))
+        continue;
+      T box_2[5];
+      for (int k = 0; k < 5; ++k) {
+        box_2[k] = boxes_data[j * 5 + k];
+      }
+      if (rbox_iou_single<T>(box_1, box_2) > threshold) {
+        masks[j / 64] |= 1ULL << (j % 64);
+      }
+    }
+  }
+  int64_t output_data_idx = 0;
+  for (int64_t i = 0; i < num_boxes; ++i) {
+    if (masks[i / 64] & 1ULL << (i % 64))
+      continue;
+    output_data[output_data_idx++] = i;
+  }
+  *num_keep_boxes = output_data_idx;
+  for (; output_data_idx < num_boxes; ++output_data_idx) {
+    output_data[output_data_idx] = 0;
+  }
+}
+
+#define CHECK_INPUT_CPU(x)                                                     \
+  PD_CHECK(x.is_cpu(), #x " must be a CPU Tensor.")
+
+std::vector<paddle::Tensor> NMSRotatedCPUForward(const paddle::Tensor &boxes,
+                                                 const paddle::Tensor &scores,
+                                                 float threshold) {
+  CHECK_INPUT_CPU(boxes);
+  CHECK_INPUT_CPU(scores);
+
+  auto num_boxes = boxes.shape()[0];
+
+  auto order_t =
+      std::get<1>(paddle::argsort(scores, /* axis=*/0, /* descending=*/true));
+  auto boxes_sorted = paddle::gather(boxes, order_t, /* axis=*/0);
+
+  auto keep =
+      paddle::empty({num_boxes}, paddle::DataType::INT64, paddle::CPUPlace());
+  int64_t num_keep_boxes = 0;
+
+  PD_DISPATCH_FLOATING_TYPES(boxes.type(), "nms_rotated_cpu_kernel", ([&] {
+                               nms_rotated_cpu_kernel<data_t>(
+                                   boxes_sorted.data<data_t>(), threshold,
+                                   num_boxes, &num_keep_boxes,
+                                   keep.data<int64_t>());
+                             }));
+
+  keep = keep.slice(0, num_keep_boxes);
+  return {paddle::gather(order_t, keep, /* axis=*/0)};
+}
+
+#ifdef PADDLE_WITH_CUDA
+std::vector<paddle::Tensor> NMSRotatedCUDAForward(const paddle::Tensor &boxes,
+                                                  const paddle::Tensor &scores,
+                                                  float threshold);
+#endif
+
+std::vector<paddle::Tensor> NMSRotatedForward(const paddle::Tensor &boxes,
+                                              const paddle::Tensor &scores,
+                                              float threshold) {
+  if (boxes.is_cpu()) {
+    return NMSRotatedCPUForward(boxes, scores, threshold);
+#ifdef PADDLE_WITH_CUDA
+  } else if (boxes.is_gpu()) {
+    return NMSRotatedCUDAForward(boxes, scores, threshold);
+#endif
+  }
+}
+
+std::vector<std::vector<int64_t>>
+NMSRotatedInferShape(std::vector<int64_t> boxes_shape,
+                     std::vector<int64_t> scores_shape) {
+  return {{-1}};
+}
+
+std::vector<paddle::DataType> NMSRotatedInferDtype(paddle::DataType t1,
+                                                   paddle::DataType t2) {
+  return {paddle::DataType::INT64};
+}
+
+PD_BUILD_OP(nms_rotated)
+    .Inputs({"Boxes", "Scores"})
+    .Outputs({"Output"})
+    .Attrs({"threshold: float"})
+    .SetKernelFn(PD_KERNEL(NMSRotatedForward))
+    .SetInferShapeFn(PD_INFER_SHAPE(NMSRotatedInferShape))
+    .SetInferDtypeFn(PD_INFER_DTYPE(NMSRotatedInferDtype));
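The CPU kernel above is classic greedy NMS over rotated boxes: boxes are first sorted by score (via paddle::argsort/gather), each surviving box then suppresses every lower-scoring box whose rotated IoU exceeds the threshold, and the kept indices are mapped back through the sort order. A pure-Python restatement of that logic, illustrative only (rbox_iou_single stands in for the C++ helper of the same name):

    def nms_rotated_reference(boxes, scores, threshold, rbox_iou_single):
        # boxes: sequence of [x_ctr, y_ctr, w, h, angle]; scores: sequence of floats
        order = sorted(range(len(scores)), key=lambda i: scores[i], reverse=True)
        keep, suppressed = [], set()
        for pos, i in enumerate(order):
            if i in suppressed:
                continue
            keep.append(i)                                  # highest-scoring survivor
            for j in order[pos + 1:]:
                if j in suppressed:
                    continue
                if rbox_iou_single(boxes[i], boxes[j]) > threshold:
                    suppressed.add(j)
        return keep   # indices into the original boxes, in descending-score order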

+ 96 - 0
paddlers/models/ppdet/ext_op/csrc/nms_rotated/nms_rotated.cu

@@ -0,0 +1,96 @@
+//   Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "../rbox_iou/rbox_iou_utils.h"
+#include "paddle/extension.h"
+
+static const int64_t threadsPerBlock = sizeof(int64_t) * 8;
+
+template <typename T>
+__global__ void
+nms_rotated_cuda_kernel(const T *boxes_data, const float threshold,
+                        const int64_t num_boxes, int64_t *masks) {
+  auto raw_start = blockIdx.y;
+  auto col_start = blockIdx.x;
+  if (raw_start > col_start)
+    return;
+  const int raw_last_storage =
+      min(num_boxes - raw_start * threadsPerBlock, threadsPerBlock);
+  const int col_last_storage =
+      min(num_boxes - col_start * threadsPerBlock, threadsPerBlock);
+  if (threadIdx.x < raw_last_storage) {
+    int64_t mask = 0;
+    auto current_box_idx = raw_start * threadsPerBlock + threadIdx.x;
+    const T *current_box = boxes_data + current_box_idx * 5;
+    for (int i = 0; i < col_last_storage; ++i) {
+      const T *target_box = boxes_data + (col_start * threadsPerBlock + i) * 5;
+      if (rbox_iou_single<T>(current_box, target_box) > threshold) {
+        mask |= 1ULL << i;
+      }
+    }
+    const int blocks_per_line = CeilDiv(num_boxes, threadsPerBlock);
+    masks[current_box_idx * blocks_per_line + col_start] = mask;
+  }
+}
+
+#define CHECK_INPUT_GPU(x)                                                     \
+  PD_CHECK(x.is_gpu(), #x " must be a GPU Tensor.")
+
+std::vector<paddle::Tensor> NMSRotatedCUDAForward(const paddle::Tensor &boxes,
+                                                  const paddle::Tensor &scores,
+                                                  float threshold) {
+  CHECK_INPUT_GPU(boxes);
+  CHECK_INPUT_GPU(scores);
+
+  auto num_boxes = boxes.shape()[0];
+  auto order_t =
+      std::get<1>(paddle::argsort(scores, /* axis=*/0, /* descending=*/true));
+  auto boxes_sorted = paddle::gather(boxes, order_t, /* axis=*/0);
+
+  const auto blocks_per_line = CeilDiv(num_boxes, threadsPerBlock);
+  dim3 block(threadsPerBlock);
+  dim3 grid(blocks_per_line, blocks_per_line);
+  auto mask_dev = paddle::empty({num_boxes * blocks_per_line},
+                                paddle::DataType::INT64, paddle::GPUPlace());
+
+  PD_DISPATCH_FLOATING_TYPES(
+      boxes.type(), "nms_rotated_cuda_kernel", ([&] {
+        nms_rotated_cuda_kernel<data_t><<<grid, block, 0, boxes.stream()>>>(
+            boxes_sorted.data<data_t>(), threshold, num_boxes,
+            mask_dev.data<int64_t>());
+      }));
+
+  auto mask_host = mask_dev.copy_to(paddle::CPUPlace(), true);
+  auto keep_host =
+      paddle::empty({num_boxes}, paddle::DataType::INT64, paddle::CPUPlace());
+  int64_t *keep_host_ptr = keep_host.data<int64_t>();
+  int64_t *mask_host_ptr = mask_host.data<int64_t>();
+  std::vector<int64_t> remv(blocks_per_line);
+  int64_t last_box_num = 0;
+  for (int64_t i = 0; i < num_boxes; ++i) {
+    auto remv_element_id = i / threadsPerBlock;
+    auto remv_bit_id = i % threadsPerBlock;
+    if (!(remv[remv_element_id] & 1ULL << remv_bit_id)) {
+      keep_host_ptr[last_box_num++] = i;
+      int64_t *current_mask = mask_host_ptr + i * blocks_per_line;
+      for (auto j = remv_element_id; j < blocks_per_line; ++j) {
+        remv[j] |= current_mask[j];
+      }
+    }
+  }
+
+  keep_host = keep_host.slice(0, last_box_num);
+  auto keep_dev = keep_host.copy_to(paddle::GPUPlace(), true);
+  return {paddle::gather(order_t, keep_dev, /* axis=*/0)};
+}
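The CUDA path computes the same keep set in two stages: each (row-tile, col-tile) block compares up to 64 sorted boxes against up to 64 others and packs the suppression decisions into a single int64, and the final greedy pass over the resulting mask matrix runs on the host. A compact restatement of the mask layout used above (comments only, no new behavior):

    # masks is laid out as [num_boxes, blocks_per_line] int64 values.
    # Bit j of masks[i][t] == 1 means box (t * 64 + j) overlaps box i with
    # IoU > threshold, i.e. box i suppresses it if box i itself is kept.
    # The host loop walks boxes in descending-score order, keeps a box whose
    # bit is still clear in `remv`, and ORs that box's mask row into `remv`.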

+ 95 - 0
paddlers/models/ppdet/ext_op/csrc/rbox_iou/rbox_iou.cc

@@ -0,0 +1,95 @@
+//   Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+// The code is based on
+// https://github.com/facebookresearch/detectron2/blob/main/detectron2/layers/csrc/box_iou_rotated/
+
+#include "paddle/extension.h"
+#include "rbox_iou_utils.h"
+
+template <typename T>
+void rbox_iou_cpu_kernel(const int rbox1_num, const int rbox2_num,
+                         const T *rbox1_data_ptr, const T *rbox2_data_ptr,
+                         T *output_data_ptr) {
+
+  int i, j;
+  for (i = 0; i < rbox1_num; i++) {
+    for (j = 0; j < rbox2_num; j++) {
+      int offset = i * rbox2_num + j;
+      output_data_ptr[offset] =
+          rbox_iou_single<T>(rbox1_data_ptr + i * 5, rbox2_data_ptr + j * 5);
+    }
+  }
+}
+
+#define CHECK_INPUT_CPU(x)                                                     \
+  PD_CHECK(x.is_cpu(), #x " must be a CPU Tensor.")
+
+std::vector<paddle::Tensor> RboxIouCPUForward(const paddle::Tensor &rbox1,
+                                              const paddle::Tensor &rbox2) {
+  CHECK_INPUT_CPU(rbox1);
+  CHECK_INPUT_CPU(rbox2);
+
+  auto rbox1_num = rbox1.shape()[0];
+  auto rbox2_num = rbox2.shape()[0];
+
+  auto output =
+      paddle::empty({rbox1_num, rbox2_num}, rbox1.dtype(), paddle::CPUPlace());
+
+  PD_DISPATCH_FLOATING_TYPES(rbox1.type(), "rbox_iou_cpu_kernel", ([&] {
+                               rbox_iou_cpu_kernel<data_t>(
+                                   rbox1_num, rbox2_num, rbox1.data<data_t>(),
+                                   rbox2.data<data_t>(), output.data<data_t>());
+                             }));
+
+  return {output};
+}
+
+#ifdef PADDLE_WITH_CUDA
+std::vector<paddle::Tensor> RboxIouCUDAForward(const paddle::Tensor &rbox1,
+                                               const paddle::Tensor &rbox2);
+#endif
+
+#define CHECK_INPUT_SAME(x1, x2)                                               \
+  PD_CHECK(x1.place() == x2.place(), "inputs must be on the same place.")
+
+std::vector<paddle::Tensor> RboxIouForward(const paddle::Tensor &rbox1,
+                                           const paddle::Tensor &rbox2) {
+  CHECK_INPUT_SAME(rbox1, rbox2);
+  if (rbox1.is_cpu()) {
+    return RboxIouCPUForward(rbox1, rbox2);
+#ifdef PADDLE_WITH_CUDA
+  } else if (rbox1.is_gpu()) {
+    return RboxIouCUDAForward(rbox1, rbox2);
+#endif
+  }
+}
+
+std::vector<std::vector<int64_t>>
+RboxIouInferShape(std::vector<int64_t> rbox1_shape,
+                  std::vector<int64_t> rbox2_shape) {
+  return {{rbox1_shape[0], rbox2_shape[0]}};
+}
+
+std::vector<paddle::DataType> RboxIouInferDtype(paddle::DataType t1,
+                                                paddle::DataType t2) {
+  return {t1};
+}
+
+PD_BUILD_OP(rbox_iou)
+    .Inputs({"RBox1", "RBox2"})
+    .Outputs({"Output"})
+    .SetKernelFn(PD_KERNEL(RboxIouForward))
+    .SetInferShapeFn(PD_INFER_SHAPE(RboxIouInferShape))
+    .SetInferDtypeFn(PD_INFER_DTYPE(RboxIouInferDtype));
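A hypothetical build-and-call sketch for the custom ops registered above. The op names come from the PD_BUILD_OP declarations; compiling them through paddle.utils.cpp_extension.load is an assumption about how the csrc/ sources might be built, not something this diff sets up:

    import paddle
    from paddle.utils.cpp_extension import load

    ext = load(
        name='rbox_ops',
        sources=['csrc/rbox_iou/rbox_iou.cc',
                 'csrc/nms_rotated/nms_rotated.cc'])   # add the .cu files when building with CUDA

    paddle.set_device('cpu')        # match the CPU-only build in this sketch
    boxes = paddle.rand([50, 5])    # [x_ctr, y_ctr, w, h, angle]
    scores = paddle.rand([50])
    iou = ext.rbox_iou(boxes, boxes)              # [50, 50] pairwise rotated IoU
    keep = ext.nms_rotated(boxes, scores, 0.1)    # kept indices, ordered by score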

+ 6 - 11
paddlers/models/ppdet/ext_op/csrc/rbox_iou/rbox_iou_op.cu → paddlers/models/ppdet/ext_op/csrc/rbox_iou/rbox_iou.cu

@@ -13,21 +13,15 @@
 // limitations under the License.
 //
 // The code is based on
-// https://github.com/csuhan/s2anet/blob/master/mmdet/ops/box_iou_rotated
+// https://github.com/facebookresearch/detectron2/blob/main/detectron2/layers/csrc/box_iou_rotated/
 
 #include "paddle/extension.h"
-#include "rbox_iou_op.h"
+#include "rbox_iou_utils.h"
 
 // 2D block with 32 * 16 = 512 threads per block
 const int BLOCK_DIM_X = 32;
 const int BLOCK_DIM_Y = 16;
 
-/**
-   Computes ceil(a / b)
-*/
-
-static inline int CeilDiv(const int a, const int b) { return (a + b - 1) / b; }
-
 template <typename T>
 __global__ void rbox_iou_cuda_kernel(const int rbox1_num, const int rbox2_num,
                                      const T *rbox1_data_ptr,
@@ -85,7 +79,7 @@ __global__ void rbox_iou_cuda_kernel(const int rbox1_num, const int rbox2_num,
 }
 
 #define CHECK_INPUT_GPU(x)                                                     \
-  PD_CHECK(x.place() == paddle::PlaceType::kGPU, #x " must be a GPU Tensor.")
+  PD_CHECK(x.is_gpu(), #x " must be a GPU Tensor.")
 
 std::vector<paddle::Tensor> RboxIouCUDAForward(const paddle::Tensor &rbox1,
                                                const paddle::Tensor &rbox2) {
@@ -95,7 +89,8 @@ std::vector<paddle::Tensor> RboxIouCUDAForward(const paddle::Tensor &rbox1,
   auto rbox1_num = rbox1.shape()[0];
   auto rbox2_num = rbox2.shape()[0];
 
-  auto output = paddle::Tensor(paddle::PlaceType::kGPU, {rbox1_num, rbox2_num});
+  auto output =
+      paddle::empty({rbox1_num, rbox2_num}, rbox1.dtype(), paddle::GPUPlace());
 
   const int blocks_x = CeilDiv(rbox1_num, BLOCK_DIM_X);
   const int blocks_y = CeilDiv(rbox2_num, BLOCK_DIM_Y);
@@ -107,7 +102,7 @@ std::vector<paddle::Tensor> RboxIouCUDAForward(const paddle::Tensor &rbox1,
       rbox1.type(), "rbox_iou_cuda_kernel", ([&] {
         rbox_iou_cuda_kernel<data_t><<<blocks, threads, 0, rbox1.stream()>>>(
             rbox1_num, rbox2_num, rbox1.data<data_t>(), rbox2.data<data_t>(),
-            output.mutable_data<data_t>());
+            output.data<data_t>());
       }));
 
   return {output};

+ 0 - 97
paddlers/models/ppdet/ext_op/csrc/rbox_iou/rbox_iou_op.cc

@@ -1,97 +0,0 @@
-//   Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-// The code is based on https://github.com/csuhan/s2anet/blob/master/mmdet/ops/box_iou_rotated
-
-#include "rbox_iou_op.h"
-#include "paddle/extension.h"
-
-
-template <typename T>
-void rbox_iou_cpu_kernel(
-    const int rbox1_num,
-    const int rbox2_num,
-    const T* rbox1_data_ptr,
-    const T* rbox2_data_ptr,
-    T* output_data_ptr) {
-
-    int i, j;
-    for (i = 0; i < rbox1_num; i++) {
-        for (j = 0; j < rbox2_num; j++) {
-		int offset = i * rbox2_num + j;
-		output_data_ptr[offset] = rbox_iou_single<T>(rbox1_data_ptr + i * 5, rbox2_data_ptr + j * 5);
-        }
-    }
-}
-
-
-#define CHECK_INPUT_CPU(x) PD_CHECK(x.place() == paddle::PlaceType::kCPU, #x " must be a CPU Tensor.")
-
-std::vector<paddle::Tensor> RboxIouCPUForward(const paddle::Tensor& rbox1, const paddle::Tensor& rbox2) {
-    CHECK_INPUT_CPU(rbox1);
-    CHECK_INPUT_CPU(rbox2);
-
-    auto rbox1_num = rbox1.shape()[0];
-    auto rbox2_num = rbox2.shape()[0];
-
-    auto output = paddle::Tensor(paddle::PlaceType::kCPU, {rbox1_num, rbox2_num});
-
-    PD_DISPATCH_FLOATING_TYPES(
-        rbox1.type(),
-        "rbox_iou_cpu_kernel",
-        ([&] {
-            rbox_iou_cpu_kernel<data_t>(
-                rbox1_num,
-                rbox2_num,
-                rbox1.data<data_t>(),
-                rbox2.data<data_t>(),
-                output.mutable_data<data_t>());
-        }));
-    
-    return {output};
-}
-
-
-#ifdef PADDLE_WITH_CUDA
-std::vector<paddle::Tensor> RboxIouCUDAForward(const paddle::Tensor& rbox1, const paddle::Tensor& rbox2);
-#endif
-
-
-#define CHECK_INPUT_SAME(x1, x2) PD_CHECK(x1.place() == x2.place(), "input must be smae pacle.")
-
-std::vector<paddle::Tensor> RboxIouForward(const paddle::Tensor& rbox1, const paddle::Tensor& rbox2) {
-    CHECK_INPUT_SAME(rbox1, rbox2);
-    if (rbox1.place() == paddle::PlaceType::kCPU) {
-        return RboxIouCPUForward(rbox1, rbox2);
-#ifdef PADDLE_WITH_CUDA
-    } else if (rbox1.place() == paddle::PlaceType::kGPU) {
-        return RboxIouCUDAForward(rbox1, rbox2);
-#endif
-    }
-}
-
-std::vector<std::vector<int64_t>> InferShape(std::vector<int64_t> rbox1_shape, std::vector<int64_t> rbox2_shape) {
-    return {{rbox1_shape[0], rbox2_shape[0]}};
-}
-
-std::vector<paddle::DataType> InferDtype(paddle::DataType t1, paddle::DataType t2) {
-    return {t1};
-}
-
-PD_BUILD_OP(rbox_iou)
-    .Inputs({"RBOX1", "RBOX2"})
-    .Outputs({"Output"})
-    .SetKernelFn(PD_KERNEL(RboxIouForward))
-    .SetInferShapeFn(PD_INFER_SHAPE(InferShape))
-    .SetInferDtypeFn(PD_INFER_DTYPE(InferDtype));

+ 12 - 4
paddlers/models/ppdet/ext_op/csrc/rbox_iou/rbox_iou_op.h → paddlers/models/ppdet/ext_op/csrc/rbox_iou/rbox_iou_utils.h

@@ -13,7 +13,7 @@
 // limitations under the License.
 //
 // The code is based on
-// https://github.com/csuhan/s2anet/blob/master/mmdet/ops/box_iou_rotated
+// https://github.com/facebookresearch/detectron2/blob/main/detectron2/layers/csrc/box_iou_rotated/
 
 #pragma once
 
@@ -336,13 +336,21 @@ HOST_DEVICE_INLINE T rbox_iou_single(T const *const box1_raw,
   box2.h = box2_raw[3];
   box2.a = box2_raw[4];
 
-  const T area1 = box1.w * box1.h;
-  const T area2 = box2.w * box2.h;
-  if (area1 < 1e-14 || area2 < 1e-14) {
+  if (box1.w < 1e-2 || box1.h < 1e-2 || box2.w < 1e-2 || box2.h < 1e-2) {
     return 0.f;
   }
+  const T area1 = box1.w * box1.h;
+  const T area2 = box2.w * box2.h;
 
   const T intersection = rboxes_intersection<T>(box1, box2);
   const T iou = intersection / (area1 + area2 - intersection);
   return iou;
 }
+
+/**
+   Computes ceil(a / b)
+*/
+
+HOST_DEVICE inline int CeilDiv(const int a, const int b) {
+  return (a + b - 1) / b;
+}
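
As a sanity check for the IoU math above, a small pure-Python reference (not from the commit) can be written with shapely. The early return for degenerate sizes mirrors the new 1e-2 width/height guard; the angle unit (radians below) is an assumption that must match the kernel's convention.

    from shapely.geometry import box as shapely_box
    from shapely import affinity

    def rbox_iou_ref(b1, b2, eps=1e-2):
        # b = (cx, cy, w, h, angle); tiny boxes return 0, as in rbox_iou_single.
        (cx1, cy1, w1, h1, a1), (cx2, cy2, w2, h2, a2) = b1, b2
        if min(w1, h1, w2, h2) < eps:
            return 0.0
        def to_poly(cx, cy, w, h, a):
            p = shapely_box(-w / 2, -h / 2, w / 2, h / 2)
            p = affinity.rotate(p, a, use_radians=True)
            return affinity.translate(p, cx, cy)
        p1 = to_poly(cx1, cy1, w1, h1, a1)
        p2 = to_poly(cx2, cy2, w2, h2, a2)
        inter = p1.intersection(p2).area
        return inter / (p1.area + p2.area - inter)

    print(rbox_iou_ref((0, 0, 2, 2, 0), (1, 0, 2, 2, 0)))  # ~0.3333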

+ 1 - 1
paddlers/models/ppdet/hash.txt

@@ -1 +1 @@
-e3f8dd16bffca04060ec1edc388c5a618e15bbf8
+00fe2a1c35603b6fb37b73265aecf6282e5e2ad4

+ 2 - 1
paddlers/models/ppdet/metrics/__init__.py

@@ -17,6 +17,7 @@ from . import keypoint_metrics
 
 from .metrics import *
 from .keypoint_metrics import *
+from .pose3d_metrics import *
 
 __all__ = metrics.__all__ + keypoint_metrics.__all__
 
@@ -26,4 +27,4 @@ __all__ = metrics.__all__ + mot_metrics.__all__
 
 from . import mcmot_metrics
 from .mcmot_metrics import *
-__all__ = metrics.__all__ + mcmot_metrics.__all__
+__all__ = metrics.__all__ + mcmot_metrics.__all__ 

+ 6 - 2
paddlers/models/ppdet/metrics/coco_utils.py

@@ -21,7 +21,7 @@ import sys
 import numpy as np
 import itertools
 
-from paddlers.models.ppdet.metrics.json_results import get_det_res, get_det_poly_res, get_seg_res, get_solov2_segm_res, get_keypoint_res
+from paddlers.models.ppdet.metrics.json_results import get_det_res, get_det_poly_res, get_seg_res, get_solov2_segm_res, get_keypoint_res, get_pose3d_res
 from paddlers.models.ppdet.metrics.map_utils import draw_pr_curve
 
 from paddlers.models.ppdet.utils.logger import setup_logger
@@ -64,6 +64,10 @@ def get_infer_results(outs, catid, bias=0):
         infer_res['keypoint'] = get_keypoint_res(outs, im_id)
         outs['bbox_num'] = [len(infer_res['keypoint'])]
 
+    if 'pose3d' in outs:
+        infer_res['pose3d'] = get_pose3d_res(outs, im_id)
+        outs['bbox_num'] = [len(infer_res['pose3d'])]
+
     return infer_res
 
 
@@ -150,7 +154,7 @@ def cocoapi_eval(jsonfile,
         results_flatten = list(itertools.chain(*results_per_category))
         headers = ['category', 'AP'] * (num_columns // 2)
         results_2d = itertools.zip_longest(
-            *[results_flatten[i::num_columns] for i in range(num_columns)])
+            * [results_flatten[i::num_columns] for i in range(num_columns)])
         table_data = [headers]
         table_data += [result for result in results_2d]
         table = AsciiTable(table_data)

+ 16 - 0
paddlers/models/ppdet/metrics/json_results.py

@@ -157,3 +157,19 @@ def get_keypoint_res(results, im_id):
             ann['bbox'] = [x0, y0, x1 - x0, y1 - y0]
             anns.append(ann)
     return anns
+
+
+def get_pose3d_res(results, im_id):
+    anns = []
+    preds = results['pose3d']
+    for idx in range(im_id.shape[0]):
+        image_id = im_id[idx].item()
+        pose3d = preds[idx]
+        ann = {
+            'image_id': image_id,
+            'category_id': 1,  # XXX hard code
+            'pose3d': pose3d.tolist(),
+            'score': float(1.)
+        }
+        anns.append(ann)
+    return anns
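
A quick, hedged usage sketch of the new helper with made-up shapes (2 samples, 24 joints each):

    import numpy as np
    from paddlers.models.ppdet.metrics.json_results import get_pose3d_res

    results = {'pose3d': np.zeros((2, 24, 3), dtype=np.float32)}
    im_id = np.array([[101], [102]])
    anns = get_pose3d_res(results, im_id)
    print(anns[0]['image_id'], len(anns[0]['pose3d']))  # 101 24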

+ 1 - 1
paddlers/models/ppdet/metrics/metrics.py

@@ -350,7 +350,7 @@ class WiderFaceMetric(Metric):
 class RBoxMetric(Metric):
     def __init__(self, anno_file, **kwargs):
         self.anno_file = anno_file
-        self.clsid2catid, self.catid2name = get_categories('COCO', anno_file)
+        self.clsid2catid, self.catid2name = get_categories('RBOX', anno_file)
         self.catid2clsid = {v: k for k, v in self.clsid2catid.items()}
         self.classwise = kwargs.get('classwise', False)
         self.output_eval = kwargs.get('output_eval', None)

+ 200 - 0
paddlers/models/ppdet/metrics/pose3d_metrics.py

@@ -0,0 +1,200 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. 
+#   
+# Licensed under the Apache License, Version 2.0 (the "License");   
+# you may not use this file except in compliance with the License.  
+# You may obtain a copy of the License at   
+#   
+#     http://www.apache.org/licenses/LICENSE-2.0    
+#   
+# Unless required by applicable law or agreed to in writing, software   
+# distributed under the License is distributed on an "AS IS" BASIS, 
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.  
+# See the License for the specific language governing permissions and   
+# limitations under the License.
+
+import paddle
+from paddle.distributed import ParallelEnv
+import os
+import json
+from collections import defaultdict, OrderedDict
+import numpy as np
+from paddlers.models.ppdet.utils.logger import setup_logger
+logger = setup_logger(__name__)
+
+__all__ = ['Pose3DEval']
+
+
+class AverageMeter(object):
+    def __init__(self):
+        self.reset()
+
+    def reset(self):
+        self.val = 0
+        self.avg = 0
+        self.sum = 0
+        self.count = 0
+
+    def update(self, val, n=1):
+        self.val = val
+        self.sum += val * n
+        self.count += n
+        self.avg = self.sum / self.count
+
+
+def mean_per_joint_position_error(pred, gt, has_3d_joints):
+    """ 
+    Compute mPJPE
+    """
+    gt = gt[has_3d_joints == 1]
+    gt = gt[:, :, :3]
+    pred = pred[has_3d_joints == 1]
+
+    with paddle.no_grad():
+        gt_pelvis = (gt[:, 2, :] + gt[:, 3, :]) / 2
+        gt = gt - gt_pelvis[:, None, :]
+        pred_pelvis = (pred[:, 2, :] + pred[:, 3, :]) / 2
+        pred = pred - pred_pelvis[:, None, :]
+        error = paddle.sqrt(((pred - gt)**2).sum(axis=-1)).mean(axis=-1).numpy()
+        return error
+
+
+def compute_similarity_transform(S1, S2):
+    """Computes a similarity transform (sR, t) that takes
+    a set of 3D points S1 (3 x N) closest to a set of 3D points S2,
+    where R is an 3x3 rotation matrix, t 3x1 translation, s scale.
+    i.e. solves the orthogonal Procrustes problem.
+    """
+    transposed = False
+    if S1.shape[0] != 3 and S1.shape[0] != 2:
+        S1 = S1.T
+        S2 = S2.T
+        transposed = True
+    assert (S2.shape[1] == S1.shape[1])
+
+    # 1. Remove mean.
+    mu1 = S1.mean(axis=1, keepdims=True)
+    mu2 = S2.mean(axis=1, keepdims=True)
+    X1 = S1 - mu1
+    X2 = S2 - mu2
+
+    # 2. Compute variance of X1 used for scale.
+    var1 = np.sum(X1**2)
+
+    # 3. The outer product of X1 and X2.
+    K = X1.dot(X2.T)
+
+    # 4. Solution that Maximizes trace(R'K) is R=U*V', where U, V are
+    # singular vectors of K.
+    U, s, Vh = np.linalg.svd(K)
+    V = Vh.T
+    # Construct Z that fixes the orientation of R to get det(R)=1.
+    Z = np.eye(U.shape[0])
+    Z[-1, -1] *= np.sign(np.linalg.det(U.dot(V.T)))
+    # Construct R.
+    R = V.dot(Z.dot(U.T))
+
+    # 5. Recover scale.
+    scale = np.trace(R.dot(K)) / var1
+
+    # 6. Recover translation.
+    t = mu2 - scale * (R.dot(mu1))
+
+    # 7. Error:
+    S1_hat = scale * R.dot(S1) + t
+
+    if transposed:
+        S1_hat = S1_hat.T
+
+    return S1_hat
+
+
+def compute_similarity_transform_batch(S1, S2):
+    """Batched version of compute_similarity_transform."""
+    S1_hat = np.zeros_like(S1)
+    for i in range(S1.shape[0]):
+        S1_hat[i] = compute_similarity_transform(S1[i], S2[i])
+    return S1_hat
+
+
+def reconstruction_error(S1, S2, reduction='mean'):
+    """Do Procrustes alignment and compute reconstruction error."""
+    S1_hat = compute_similarity_transform_batch(S1, S2)
+    re = np.sqrt(((S1_hat - S2)**2).sum(axis=-1)).mean(axis=-1)
+    if reduction == 'mean':
+        re = re.mean()
+    elif reduction == 'sum':
+        re = re.sum()
+    return re
+
+
+def all_gather(data):
+    if paddle.distributed.get_world_size() == 1:
+        return data
+    vlist = []
+    paddle.distributed.all_gather(vlist, data)
+    data = paddle.concat(vlist, 0)
+    return data
+
+
+class Pose3DEval(object):
+    def __init__(self, output_eval, save_prediction_only=False):
+        super(Pose3DEval, self).__init__()
+        self.output_eval = output_eval
+        self.res_file = os.path.join(output_eval, "pose3d_results.json")
+        self.save_prediction_only = save_prediction_only
+        self.reset()
+
+    def reset(self):
+        self.PAmPJPE = AverageMeter()
+        self.mPJPE = AverageMeter()
+        self.eval_results = {}
+
+    def get_human36m_joints(self, input):
+        J24_TO_J14 = paddle.to_tensor(
+            [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 18])
+        J24_TO_J17 = paddle.to_tensor(
+            [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 14, 15, 18, 19])
+        return paddle.index_select(input, J24_TO_J14, axis=1)
+
+    def update(self, inputs, outputs):
+        gt_3d_joints = all_gather(inputs['joints_3d'].cuda(ParallelEnv()
+                                                           .local_rank))
+        has_3d_joints = all_gather(inputs['has_3d_joints'].cuda(ParallelEnv()
+                                                                .local_rank))
+        pred_3d_joints = all_gather(outputs['pose3d'])
+        if gt_3d_joints.shape[1] == 24:
+            gt_3d_joints = self.get_human36m_joints(gt_3d_joints)
+        if pred_3d_joints.shape[1] == 24:
+            pred_3d_joints = self.get_human36m_joints(pred_3d_joints)
+        mPJPE_val = mean_per_joint_position_error(pred_3d_joints, gt_3d_joints,
+                                                  has_3d_joints).mean()
+        PAmPJPE_val = reconstruction_error(
+            pred_3d_joints.numpy(),
+            gt_3d_joints[:, :, :3].numpy(),
+            reduction=None).mean()
+        count = int(np.sum(has_3d_joints.numpy()))
+        self.PAmPJPE.update(PAmPJPE_val * 1000., count)
+        self.mPJPE.update(mPJPE_val * 1000., count)
+
+    def accumulate(self):
+        if self.save_prediction_only:
+            logger.info(f'The pose3d result is saved to {self.res_file} '
+                        'and the model will not be evaluated.')
+            return
+        self.eval_results['pose3d'] = [-self.mPJPE.avg, -self.PAmPJPE.avg]
+
+    def log(self):
+        if self.save_prediction_only:
+            return
+        stats_names = ['mPJPE', 'PAmPJPE']
+        num_values = len(stats_names)
+        print(' '.join(['| {}'.format(name) for name in stats_names]) + ' |')
+        print('|---' * (num_values + 1) + '|')
+
+        print(' '.join([
+            '| {:.3f}'.format(abs(value))
+            for value in self.eval_results['pose3d']
+        ]) + ' |')
+
+    def get_results(self):
+        return self.eval_results
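
To illustrate what the Procrustes-alignment utilities above compute, here is a small NumPy check (illustrative only): S2 is an exact rotated, scaled and translated copy of S1, so the PA-aligned reconstruction error should come out near zero.

    import numpy as np
    from paddlers.models.ppdet.metrics.pose3d_metrics import reconstruction_error

    rng = np.random.RandomState(0)
    S1 = rng.rand(4, 14, 3)                      # 4 samples, 14 joints, xyz
    theta = np.pi / 6
    R = np.array([[np.cos(theta), -np.sin(theta), 0.0],
                  [np.sin(theta),  np.cos(theta), 0.0],
                  [0.0, 0.0, 1.0]])
    S2 = 1.5 * S1 @ R.T + np.array([0.1, -0.2, 0.3])
    print(reconstruction_error(S1, S2))          # ~0.0 after Procrustes alignment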

+ 2 - 0
paddlers/models/ppdet/modeling/__init__.py

@@ -30,6 +30,7 @@ from . import mot
 from . import transformers
 from . import assigners
 from . import rbox_utils
+from . import ssod
 
 from .ops import *
 from .backbones import *
@@ -45,3 +46,4 @@ from .mot import *
 from .transformers import *
 from .assigners import *
 from .rbox_utils import *
+from .ssod import *

+ 11 - 0
paddlers/models/ppdet/modeling/architectures/__init__.py

@@ -16,6 +16,7 @@ from . import meta_arch
 from . import faster_rcnn
 from . import mask_rcnn
 from . import yolo
+from . import ppyoloe
 from . import cascade_rcnn
 from . import ssd
 from . import fcos
@@ -36,11 +37,16 @@ from . import tood
 from . import retinanet
 from . import bytetrack
 from . import yolox
+from . import yolof
+from . import pose3d_metro
+from . import centertrack
+from . import queryinst
 
 from .meta_arch import *
 from .faster_rcnn import *
 from .mask_rcnn import *
 from .yolo import *
+from .ppyoloe import *
 from .cascade_rcnn import *
 from .ssd import *
 from .fcos import *
@@ -62,3 +68,8 @@ from .tood import *
 from .retinanet import *
 from .bytetrack import *
 from .yolox import *
+from .yolof import *
+from .pose3d_metro import *
+from .centertrack import *
+from .queryinst import *
+from .keypoint_petr import *

+ 35 - 9
paddlers/models/ppdet/modeling/architectures/blazeface.py

@@ -18,6 +18,8 @@ from __future__ import print_function
 
 from paddlers.models.ppdet.core.workspace import register, create
 from .meta_arch import BaseArch
+import paddle
+import paddle.nn.functional as F
 
 __all__ = ['BlazeFace']
 
@@ -74,18 +76,42 @@ class BlazeFace(BaseArch):
                                    self.inputs['gt_class'])
         else:
             preds, anchors = self.blaze_head(neck_feats, self.inputs['image'])
-            bbox, bbox_num = self.post_process(preds, anchors,
-                                               self.inputs['im_shape'],
-                                               self.inputs['scale_factor'])
-            return bbox, bbox_num
+            bbox, bbox_num, nms_keep_idx = self.post_process(
+                preds, anchors, self.inputs['im_shape'],
+                self.inputs['scale_factor'])
+            if self.use_extra_data:
+                extra_data = {}  # record the bbox output before nms, such as scores and nms_keep_idx
+                """extra_data:{
+                            'scores': predict scores,
+                            'nms_keep_idx': bbox index before nms,
+                           }
+                           """
+                preds_logits = preds[1]  # [[1xNumBBoxNumClass]]
+                extra_data['scores'] = F.softmax(paddle.concat(
+                    preds_logits, axis=1)).transpose([0, 2, 1])
+                extra_data['logits'] = paddle.concat(
+                    preds_logits, axis=1).transpose([0, 2, 1])
+                extra_data['nms_keep_idx'] = nms_keep_idx  # bbox index before nms
+                return bbox, bbox_num, extra_data
+            else:
+                return bbox, bbox_num
 
     def get_loss(self, ):
         return {"loss": self._forward()}
 
     def get_pred(self):
-        bbox_pred, bbox_num = self._forward()
-        output = {
-            "bbox": bbox_pred,
-            "bbox_num": bbox_num,
-        }
+        if self.use_extra_data:
+            bbox_pred, bbox_num, extra_data = self._forward()
+            output = {
+                "bbox": bbox_pred,
+                "bbox_num": bbox_num,
+                "extra_data": extra_data
+            }
+        else:
+            bbox_pred, bbox_num = self._forward()
+            output = {
+                "bbox": bbox_pred,
+                "bbox_num": bbox_num,
+            }
+
         return output
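
A toy shape walk-through (sizes invented) of how extra_data['scores'] is assembled from the per-level logits above:

    import paddle
    import paddle.nn.functional as F

    preds_logits = [paddle.randn([1, 100, 2]), paddle.randn([1, 50, 2])]  # per-level [N, num_boxes, num_classes]
    scores = F.softmax(paddle.concat(preds_logits, axis=1)).transpose([0, 2, 1])
    print(scores.shape)  # [1, 2, 150] -> [batch, num_classes, num_boxes]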

+ 1 - 1
paddlers/models/ppdet/modeling/architectures/cascade_rcnn.py

@@ -108,7 +108,7 @@ class CascadeRCNN(BaseArch):
             im_shape = self.inputs['im_shape']
             scale_factor = self.inputs['scale_factor']
 
-            bbox, bbox_num = self.bbox_post_process(
+            bbox, bbox_num, nms_keep_idx = self.bbox_post_process(
                 preds, (refined_rois, rois_num), im_shape, scale_factor)
             # rescale the prediction back to origin image
             bbox, bbox_pred, bbox_num = self.bbox_post_process.get_pred(

+ 11 - 16
paddlers/models/ppdet/modeling/architectures/centernet.py

@@ -78,30 +78,25 @@ class CenterNet(BaseArch):
 
     def get_pred(self):
         head_out = self._forward()
+        bbox, bbox_num, bbox_inds, topk_clses, topk_ys, topk_xs = self.post_process(
+            head_out['heatmap'],
+            head_out['size'],
+            head_out['offset'],
+            im_shape=self.inputs['im_shape'],
+            scale_factor=self.inputs['scale_factor'])
+
         if self.for_mot:
-            bbox, bbox_inds, topk_clses = self.post_process(
-                head_out['heatmap'],
-                head_out['size'],
-                head_out['offset'],
-                im_shape=self.inputs['im_shape'],
-                scale_factor=self.inputs['scale_factor'])
             output = {
                 "bbox": bbox,
+                "bbox_num": bbox_num,
                 "bbox_inds": bbox_inds,
                 "topk_clses": topk_clses,
+                "topk_ys": topk_ys,
+                "topk_xs": topk_xs,
                 "neck_feat": head_out['neck_feat']
             }
         else:
-            bbox, bbox_num, _ = self.post_process(
-                head_out['heatmap'],
-                head_out['size'],
-                head_out['offset'],
-                im_shape=self.inputs['im_shape'],
-                scale_factor=self.inputs['scale_factor'])
-            output = {
-                "bbox": bbox,
-                "bbox_num": bbox_num,
-            }
+            output = {"bbox": bbox, "bbox_num": bbox_num}
         return output
 
     def get_loss(self):

+ 176 - 0
paddlers/models/ppdet/modeling/architectures/centertrack.py

@@ -0,0 +1,176 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. 
+#   
+# Licensed under the Apache License, Version 2.0 (the "License");   
+# you may not use this file except in compliance with the License.  
+# You may obtain a copy of the License at   
+#   
+#     http://www.apache.org/licenses/LICENSE-2.0    
+#   
+# Unless required by applicable law or agreed to in writing, software   
+# distributed under the License is distributed on an "AS IS" BASIS, 
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.  
+# See the License for the specific language governing permissions and   
+# limitations under the License.
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import copy
+import math
+import numpy as np
+import paddle
+from paddlers.models.ppdet.core.workspace import register, create
+from .meta_arch import BaseArch
+
+from ..keypoint_utils import affine_transform
+from paddlers.models.ppdet.data.transform.op_helper import gaussian_radius, gaussian2D, draw_umich_gaussian
+
+__all__ = ['CenterTrack']
+
+
+@register
+class CenterTrack(BaseArch):
+    """
+    CenterTrack network, see http://arxiv.org/abs/2004.01177
+
+    Args:
+        detector (object): 'CenterNet' instance
+        plugin_head (object): 'CenterTrackHead' instance
+        tracker (object): 'CenterTracker' instance
+    """
+    __category__ = 'architecture'
+    __shared__ = ['mot_metric']
+
+    def __init__(self,
+                 detector='CenterNet',
+                 plugin_head='CenterTrackHead',
+                 tracker='CenterTracker',
+                 mot_metric=False):
+        super(CenterTrack, self).__init__()
+        self.detector = detector
+        self.plugin_head = plugin_head
+        self.tracker = tracker
+        self.mot_metric = mot_metric
+        self.pre_image = None
+        self.deploy = False
+
+    @classmethod
+    def from_config(cls, cfg, *args, **kwargs):
+        detector = create(cfg['detector'])
+        detector_out_shape = detector.neck and detector.neck.out_shape or detector.backbone.out_shape
+
+        kwargs = {'input_shape': detector_out_shape}
+        plugin_head = create(cfg['plugin_head'], **kwargs)
+        tracker = create(cfg['tracker'])
+
+        return {
+            'detector': detector,
+            'plugin_head': plugin_head,
+            'tracker': tracker,
+        }
+
+    def _forward(self):
+        if self.training:
+            det_outs = self.detector(self.inputs)
+            neck_feat = det_outs['neck_feat']
+
+            losses = {}
+            for k, v in det_outs.items():
+                if 'loss' not in k: continue
+                losses.update({k: v})
+
+            plugin_outs = self.plugin_head(neck_feat, self.inputs)
+            for k, v in plugin_outs.items():
+                if 'loss' not in k: continue
+                losses.update({k: v})
+
+            losses['loss'] = det_outs['det_loss'] + plugin_outs['plugin_loss']
+            return losses
+
+        else:
+            if not self.mot_metric:
+                # detection, support bs>=1
+                det_outs = self.detector(self.inputs)
+                return {
+                    'bbox': det_outs['bbox'],
+                    'bbox_num': det_outs['bbox_num']
+                }
+
+            else:
+                # MOT, only support bs=1
+                if not self.deploy:
+                    if self.pre_image is None:
+                        self.pre_image = self.inputs['image']
+                        # initializing tracker for the first frame
+                        self.tracker.init_track([])
+                    self.inputs['pre_image'] = self.pre_image
+                    self.pre_image = self.inputs[
+                        'image']  # Note: update for next image
+
+                    # render input heatmap from tracker status
+                    pre_hm = self.get_additional_inputs(
+                        self.tracker.tracks, self.inputs, with_hm=True)
+                    self.inputs['pre_hm'] = paddle.to_tensor(pre_hm)
+
+                # model inference
+                det_outs = self.detector(self.inputs)
+                neck_feat = det_outs['neck_feat']
+                result = self.plugin_head(
+                    neck_feat, self.inputs, det_outs['bbox'],
+                    det_outs['bbox_inds'], det_outs['topk_clses'],
+                    det_outs['topk_ys'], det_outs['topk_xs'])
+
+                if not self.deploy:
+                    # convert the cropped and 4x downsampled output coordinate system
+                    # back to the input image coordinate system
+                    result = self.plugin_head.centertrack_post_process(
+                        result, self.inputs, self.tracker.out_thresh)
+                return result
+
+    def get_pred(self):
+        return self._forward()
+
+    def get_loss(self):
+        return self._forward()
+
+    def reset_tracking(self):
+        self.tracker.reset()
+        self.pre_image = None
+
+    def get_additional_inputs(self, dets, meta, with_hm=True):
+        # Render input heatmap from previous trackings.
+        trans_input = meta['trans_input'][0].numpy()
+        inp_width, inp_height = int(meta['inp_width'][0]), int(meta[
+            'inp_height'][0])
+        input_hm = np.zeros((1, inp_height, inp_width), dtype=np.float32)
+
+        for det in dets:
+            if det['score'] < self.tracker.pre_thresh:
+                continue
+            bbox = affine_transform_bbox(det['bbox'], trans_input, inp_width,
+                                         inp_height)
+            h, w = bbox[3] - bbox[1], bbox[2] - bbox[0]
+            if (h > 0 and w > 0):
+                radius = gaussian_radius(
+                    (math.ceil(h), math.ceil(w)), min_overlap=0.7)
+                radius = max(0, int(radius))
+                ct = np.array(
+                    [(bbox[0] + bbox[2]) / 2, (bbox[1] + bbox[3]) / 2],
+                    dtype=np.float32)
+                ct_int = ct.astype(np.int32)
+                if with_hm:
+                    input_hm[0] = draw_umich_gaussian(input_hm[0], ct_int,
+                                                      radius)
+        if with_hm:
+            input_hm = input_hm[np.newaxis]
+        return input_hm
+
+
+def affine_transform_bbox(bbox, trans, width, height):
+    bbox = np.array(copy.deepcopy(bbox), dtype=np.float32)
+    bbox[:2] = affine_transform(bbox[:2], trans)
+    bbox[2:] = affine_transform(bbox[2:], trans)
+    bbox[[0, 2]] = np.clip(bbox[[0, 2]], 0, width - 1)
+    bbox[[1, 3]] = np.clip(bbox[[1, 3]], 0, height - 1)
+    return bbox
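
For intuition about get_additional_inputs(), here is a simplified NumPy stand-in for the Gaussian stamping it performs; the actual implementation uses gaussian_radius and draw_umich_gaussian from ppdet's op_helper, and the values below are illustrative.

    import numpy as np

    def draw_gaussian(heatmap, center, radius):
        # Stamp an isotropic Gaussian at an integer center, keeping the element-wise max.
        diameter = 2 * radius + 1
        sigma = diameter / 6.0
        y, x = np.ogrid[-radius:radius + 1, -radius:radius + 1]
        g = np.exp(-(x * x + y * y) / (2 * sigma * sigma))
        cx, cy = center
        h, w = heatmap.shape
        l, r = min(cx, radius), min(w - cx, radius + 1)
        t, b = min(cy, radius), min(h - cy, radius + 1)
        heatmap[cy - t:cy + b, cx - l:cx + r] = np.maximum(
            heatmap[cy - t:cy + b, cx - l:cx + r],
            g[radius - t:radius + b, radius - l:radius + r])
        return heatmap

    hm = draw_gaussian(np.zeros((152, 272), np.float32), center=(100, 60), radius=8)
    print(hm.max(), hm[60, 100])  # 1.0 at the stamped center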

+ 13 - 5
paddlers/models/ppdet/modeling/architectures/detr.py

@@ -27,17 +27,20 @@ __all__ = ['DETR']
 class DETR(BaseArch):
     __category__ = 'architecture'
     __inject__ = ['post_process']
+    __shared__ = ['exclude_post_process']
 
     def __init__(self,
                  backbone,
                  transformer,
                  detr_head,
-                 post_process='DETRBBoxPostProcess'):
+                 post_process='DETRBBoxPostProcess',
+                 exclude_post_process=False):
         super(DETR, self).__init__()
         self.backbone = backbone
         self.transformer = transformer
         self.detr_head = detr_head
         self.post_process = post_process
+        self.exclude_post_process = exclude_post_process
 
     @classmethod
     def from_config(cls, cfg, *args, **kwargs):
@@ -65,18 +68,23 @@ class DETR(BaseArch):
         body_feats = self.backbone(self.inputs)
 
         # Transformer
-        out_transformer = self.transformer(body_feats, self.inputs['pad_mask'])
+        pad_mask = self.inputs['pad_mask'] if self.training else None
+        out_transformer = self.transformer(body_feats, pad_mask, self.inputs)
 
         # DETR Head
         if self.training:
             return self.detr_head(out_transformer, body_feats, self.inputs)
         else:
             preds = self.detr_head(out_transformer, body_feats)
-            bbox, bbox_num = self.post_process(preds, self.inputs['im_shape'],
-                                               self.inputs['scale_factor'])
+            if self.exclude_post_process:
+                bboxes, logits, masks = preds
+                return bboxes, logits
+            else:
+                bbox, bbox_num = self.post_process(
+                    preds, self.inputs['im_shape'], self.inputs['scale_factor'])
             return bbox, bbox_num
 
-    def get_loss(self, ):
+    def get_loss(self):
         losses = self._forward()
         losses.update({
             'loss':

+ 61 - 5
paddlers/models/ppdet/modeling/architectures/faster_rcnn.py

@@ -19,6 +19,7 @@ from __future__ import print_function
 import paddle
 from paddlers.models.ppdet.core.workspace import register, create
 from .meta_arch import BaseArch
+import numpy as np
 
 __all__ = ['FasterRCNN']
 
@@ -51,6 +52,9 @@ class FasterRCNN(BaseArch):
         self.bbox_head = bbox_head
         self.bbox_post_process = bbox_post_process
 
+    def init_cot_head(self, relationship):
+        self.bbox_head.init_cot_head(relationship)
+
     @classmethod
     def from_config(cls, cfg, *args, **kwargs):
         backbone = create(cfg['backbone'])
@@ -80,16 +84,29 @@ class FasterRCNN(BaseArch):
         else:
             rois, rois_num, _ = self.rpn_head(body_feats, self.inputs)
             preds, _ = self.bbox_head(body_feats, rois, rois_num, None)
-
             im_shape = self.inputs['im_shape']
             scale_factor = self.inputs['scale_factor']
-            bbox, bbox_num = self.bbox_post_process(preds, (rois, rois_num),
+            bbox, bbox_num, nms_keep_idx = self.bbox_post_process(preds, (rois, rois_num),
                                                     im_shape, scale_factor)
 
             # rescale the prediction back to origin image
             bboxes, bbox_pred, bbox_num = self.bbox_post_process.get_pred(
                 bbox, bbox_num, im_shape, scale_factor)
-            return bbox_pred, bbox_num
+
+            if self.use_extra_data:
+                extra_data = {}  # record the bbox output before nms, such as scores and nms_keep_idx
+                """extra_data:{
+                            'scores': predict scores,
+                            'nms_keep_idx': bbox index before nms,
+                           }
+                """
+                extra_data['scores'] = preds[1]  # predict scores (probability)
+                # Todo: get logits output
+                extra_data['nms_keep_idx'] = nms_keep_idx  # bbox index before nms
+                return bbox_pred, bbox_num, extra_data
+            else:
+                return bbox_pred, bbox_num
+
 
     def get_loss(self, ):
         rpn_loss, bbox_loss = self._forward()
@@ -101,6 +118,45 @@ class FasterRCNN(BaseArch):
         return loss
 
     def get_pred(self):
-        bbox_pred, bbox_num = self._forward()
-        output = {'bbox': bbox_pred, 'bbox_num': bbox_num}
+        if self.use_extra_data:
+            bbox_pred, bbox_num, extra_data = self._forward()
+            output = {'bbox': bbox_pred, 'bbox_num': bbox_num, 'extra_data': extra_data}
+        else:
+            bbox_pred, bbox_num = self._forward()
+            output = {'bbox': bbox_pred, 'bbox_num': bbox_num}
         return output
+
+    def target_bbox_forward(self, data):
+        body_feats = self.backbone(data)
+        if self.neck is not None:
+            body_feats = self.neck(body_feats)
+        rois = [roi for roi in data['gt_bbox']]
+        rois_num = paddle.concat([paddle.shape(roi)[0] for roi in rois])
+
+        preds, _ = self.bbox_head(body_feats, rois, rois_num, None, cot=True)
+        return preds
+
+    def relationship_learning(self, loader, num_classes_novel):
+        print('computing relationship')
+        train_labels_list = []
+        label_list = []
+
+        for step_id, data in enumerate(loader):
+            _, bbox_prob = self.target_bbox_forward(data)      
+            batch_size = data['im_id'].shape[0]
+            for i in range(batch_size):
+                num_bbox = data['gt_class'][i].shape[0]           
+                train_labels = data['gt_class'][i]
+                train_labels_list.append(train_labels.numpy().squeeze(1))
+            base_labels = bbox_prob.detach().numpy()[:,:-1]
+            label_list.append(base_labels)
+
+        labels = np.concatenate(train_labels_list, 0)
+        probabilities = np.concatenate(label_list, 0)
+        N_t = np.max(labels) + 1
+        conditional = []
+        for i in range(N_t):
+            this_class = probabilities[labels == i]
+            average = np.mean(this_class, axis=0, keepdims=True)
+            conditional.append(average)
+        return np.concatenate(conditional) 
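
The relationship_learning() loop above boils down to averaging, per novel class, the base-class probabilities predicted on that class's ground-truth boxes. A toy NumPy sketch with invented numbers:

    import numpy as np

    labels = np.array([0, 0, 1, 2, 2, 2])                    # novel-class ids per gt box
    probabilities = np.random.RandomState(0).dirichlet(      # base-class probs per box
        np.ones(5), size=labels.shape[0])
    conditional = np.stack([
        probabilities[labels == i].mean(axis=0) for i in range(labels.max() + 1)
    ])
    print(conditional.shape)  # (3 novel classes, 5 base classes)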

+ 30 - 39
paddlers/models/ppdet/modeling/architectures/fcos.py

@@ -16,7 +16,6 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import paddle
 from paddlers.models.ppdet.core.workspace import register, create
 from .meta_arch import BaseArch
 
@@ -32,22 +31,25 @@ class FCOS(BaseArch):
         backbone (object): backbone instance
         neck (object): 'FPN' instance
         fcos_head (object): 'FCOSHead' instance
-        post_process (object): 'FCOSPostProcess' instance
+        ssod_loss (object): 'SSODFCOSLoss' instance, only used for semi-det(ssod)
     """
 
     __category__ = 'architecture'
-    __inject__ = ['fcos_post_process']
+    __inject__ = ['ssod_loss']
 
     def __init__(self,
-                 backbone,
-                 neck,
+                 backbone='ResNet',
+                 neck='FPN',
                  fcos_head='FCOSHead',
-                 fcos_post_process='FCOSPostProcess'):
+                 ssod_loss='SSODFCOSLoss'):
         super(FCOS, self).__init__()
         self.backbone = backbone
         self.neck = neck
         self.fcos_head = fcos_head
-        self.fcos_post_process = fcos_post_process
+
+        # for ssod, semi-det
+        self.is_teacher = False
+        self.ssod_loss = ssod_loss
 
     @classmethod
     def from_config(cls, cfg, *args, **kwargs):
@@ -68,38 +70,27 @@ class FCOS(BaseArch):
     def _forward(self):
         body_feats = self.backbone(self.inputs)
         fpn_feats = self.neck(body_feats)
-        fcos_head_outs = self.fcos_head(fpn_feats, self.training)
-        if not self.training:
-            scale_factor = self.inputs['scale_factor']
-            bboxes = self.fcos_post_process(fcos_head_outs, scale_factor)
-            return bboxes
+
+        self.is_teacher = self.inputs.get('is_teacher', False)
+        if self.training or self.is_teacher:
+            losses = self.fcos_head(fpn_feats, self.inputs)
+            return losses
         else:
-            return fcos_head_outs
-
-    def get_loss(self, ):
-        loss = {}
-        tag_labels, tag_bboxes, tag_centerness = [], [], []
-        for i in range(len(self.fcos_head.fpn_stride)):
-            # labels, reg_target, centerness
-            k_lbl = 'labels{}'.format(i)
-            if k_lbl in self.inputs:
-                tag_labels.append(self.inputs[k_lbl])
-            k_box = 'reg_target{}'.format(i)
-            if k_box in self.inputs:
-                tag_bboxes.append(self.inputs[k_box])
-            k_ctn = 'centerness{}'.format(i)
-            if k_ctn in self.inputs:
-                tag_centerness.append(self.inputs[k_ctn])
-
-        fcos_head_outs = self._forward()
-        loss_fcos = self.fcos_head.get_loss(fcos_head_outs, tag_labels,
-                                            tag_bboxes, tag_centerness)
-        loss.update(loss_fcos)
-        total_loss = paddle.add_n(list(loss.values()))
-        loss.update({'loss': total_loss})
-        return loss
+            fcos_head_outs = self.fcos_head(fpn_feats)
+            bbox_pred, bbox_num = self.fcos_head.post_process(
+                fcos_head_outs, self.inputs['scale_factor'])
+            return {'bbox': bbox_pred, 'bbox_num': bbox_num}
+
+    def get_loss(self):
+        return self._forward()
 
     def get_pred(self):
-        bbox_pred, bbox_num = self._forward()
-        output = {'bbox': bbox_pred, 'bbox_num': bbox_num}
-        return output
+        return self._forward()
+
+    def get_loss_keys(self):
+        return ['loss_cls', 'loss_box', 'loss_quality']
+
+    def get_ssod_loss(self, student_head_outs, teacher_head_outs, train_cfg):
+        ssod_losses = self.ssod_loss(student_head_outs, teacher_head_outs,
+                                     train_cfg)
+        return ssod_losses

+ 207 - 6
paddlers/models/ppdet/modeling/architectures/keypoint_hrnet.py

@@ -24,8 +24,9 @@ from paddlers.models.ppdet.core.workspace import register, create
 from .meta_arch import BaseArch
 from ..keypoint_utils import transform_preds
 from .. import layers as L
+from paddle.nn import functional as F
 
-__all__ = ['TopDownHRNet']
+__all__ = ['TopDownHRNet', 'TinyPose3DHRNet', 'TinyPose3DHRHeatmapNet']
 
 
 @register
@@ -45,7 +46,7 @@ class TopDownHRNet(BaseArch):
                  use_dark=True):
         """
         HRNet network, see https://arxiv.org/abs/1902.09212
-
+ 
         Args:
             backbone (nn.Layer): backbone instance
             post_process (object): `HRNetPostProcess` instance
@@ -131,10 +132,10 @@ class HRNetPostProcess(object):
 
     def get_max_preds(self, heatmaps):
         '''get predictions from score maps
-
+ 
         Args:
             heatmaps: numpy.ndarray([batch_size, num_joints, height, width])
-
+ 
         Returns:
             preds: numpy.ndarray([batch_size, num_joints, 2]), keypoints coords
             maxvals: numpy.ndarray([batch_size, num_joints, 2]), the maximum confidence of the keypoints
@@ -219,12 +220,12 @@ class HRNetPostProcess(object):
     def get_final_preds(self, heatmaps, center, scale, kernelsize=3):
         """the highest heatvalue location with a quarter offset in the
         direction from the highest response to the second highest response.
-
+ 
         Args:
             heatmaps (numpy.ndarray): The predicted heatmaps
             center (numpy.ndarray): The boxes center
             scale (numpy.ndarray): The scale factor
-
+ 
         Returns:
             preds: numpy.ndarray([batch_size, num_joints, 2]), keypoints coords
             maxvals: numpy.ndarray([batch_size, num_joints, 1]), the maximum confidence of the keypoints
@@ -265,3 +266,203 @@ class HRNetPostProcess(object):
                     maxvals, axis=1)
         ]]
         return outputs
+
+
+class TinyPose3DPostProcess(object):
+    def __init__(self):
+        pass
+
+    def __call__(self, output, center, scale):
+        """
+        Args:
+            output (numpy.ndarray): numpy.ndarray([batch_size, num_joints, 3]), keypoints coords
+            scale (numpy.ndarray): The scale factor
+        Returns:
+            preds: numpy.ndarray([batch_size, num_joints, 3]), keypoints coords
+        """
+
+        preds = output.numpy().copy()
+
+        # Transform back
+        for i in range(output.shape[0]):  # batch_size
+            preds[i][:, 0] = preds[i][:, 0] * scale[i][0]
+            preds[i][:, 1] = preds[i][:, 1] * scale[i][1]
+
+        return preds
+
+
+def soft_argmax(heatmaps, joint_num):
+    dims = heatmaps.shape
+    depth_dim = (int)(dims[1] / joint_num)
+    heatmaps = heatmaps.reshape((-1, joint_num, depth_dim * dims[2] * dims[3]))
+    heatmaps = F.softmax(heatmaps, 2)
+    heatmaps = heatmaps.reshape((-1, joint_num, depth_dim, dims[2], dims[3]))
+
+    accu_x = heatmaps.sum(axis=(2, 3))
+    accu_y = heatmaps.sum(axis=(2, 4))
+    accu_z = heatmaps.sum(axis=(3, 4))
+
+    accu_x = accu_x * paddle.arange(1, 33)
+    accu_y = accu_y * paddle.arange(1, 33)
+    accu_z = accu_z * paddle.arange(1, 33)
+
+    accu_x = accu_x.sum(axis=2, keepdim=True) - 1
+    accu_y = accu_y.sum(axis=2, keepdim=True) - 1
+    accu_z = accu_z.sum(axis=2, keepdim=True) - 1
+
+    coord_out = paddle.concat(
+        (accu_x, accu_y, accu_z), axis=2)  # [batch_size, joint_num, 3]
+
+    return coord_out
+
+
+@register
+class TinyPose3DHRHeatmapNet(BaseArch):
+    __category__ = 'architecture'
+    __inject__ = ['loss']
+
+    def __init__(
+            self,
+            width,  # 40, the number of channels output by the backbone
+            num_joints,
+            backbone='HRNet',
+            loss='KeyPointRegressionMSELoss',
+            post_process=TinyPose3DPostProcess):
+        """
+        Args:
+            backbone (nn.Layer): backbone instance
+            post_process (object): post process instance
+        """
+        super(TinyPose3DHRHeatmapNet, self).__init__()
+
+        self.backbone = backbone
+        self.post_process = TinyPose3DPostProcess()
+        self.loss = loss
+        self.deploy = False
+        self.num_joints = num_joints
+
+        self.final_conv = L.Conv2d(width, num_joints * 32, 1, 1, 0, bias=True)
+
+    @classmethod
+    def from_config(cls, cfg, *args, **kwargs):
+        # backbone
+        backbone = create(cfg['backbone'])
+
+        return {'backbone': backbone, }
+
+    def _forward(self):
+        feats = self.backbone(self.inputs)  # feats:[[batch_size, 40, 32, 24]]
+
+        hrnet_outputs = self.final_conv(feats[0])
+        res = soft_argmax(hrnet_outputs, self.num_joints)
+        return res
+
+    def get_loss(self):
+        pose3d = self._forward()
+        loss = self.loss(pose3d, None, self.inputs)
+        outputs = {'loss': loss}
+        return outputs
+
+    def get_pred(self):
+        res_lst = self._forward()
+        outputs = {'pose3d': res_lst}
+        return outputs
+
+    def flip_back(self, output_flipped, matched_parts):
+        assert output_flipped.ndim == 4,\
+                'output_flipped should be [batch_size, num_joints, height, width]'
+
+        output_flipped = output_flipped[:, :, :, ::-1]
+
+        for pair in matched_parts:
+            tmp = output_flipped[:, pair[0], :, :].copy()
+            output_flipped[:, pair[0], :, :] = output_flipped[:, pair[1], :, :]
+            output_flipped[:, pair[1], :, :] = tmp
+
+        return output_flipped
+
+
+@register
+class TinyPose3DHRNet(BaseArch):
+    __category__ = 'architecture'
+    __inject__ = ['loss']
+
+    def __init__(self,
+                 width,
+                 num_joints,
+                 fc_channel=768,
+                 backbone='HRNet',
+                 loss='KeyPointRegressionMSELoss',
+                 post_process=TinyPose3DPostProcess):
+        """
+        Args:
+            backbone (nn.Layer): backbone instance
+            post_process (object): post process instance
+        """
+        super(TinyPose3DHRNet, self).__init__()
+        self.backbone = backbone
+        self.post_process = TinyPose3DPostProcess()
+        self.loss = loss
+        self.deploy = False
+        self.num_joints = num_joints
+
+        self.final_conv = L.Conv2d(width, num_joints, 1, 1, 0, bias=True)
+
+        self.flatten = paddle.nn.Flatten(start_axis=2, stop_axis=3)
+        self.fc1 = paddle.nn.Linear(fc_channel, 256)
+        self.act1 = paddle.nn.ReLU()
+        self.fc2 = paddle.nn.Linear(256, 64)
+        self.act2 = paddle.nn.ReLU()
+        self.fc3 = paddle.nn.Linear(64, 3)
+
+    @classmethod
+    def from_config(cls, cfg, *args, **kwargs):
+        # backbone
+        backbone = create(cfg['backbone'])
+
+        return {'backbone': backbone, }
+
+    def _forward(self):
+        '''
+        self.inputs is a dict
+        '''
+        feats = self.backbone(
+            self.inputs)  # feats:[[batch_size, 40, width/4, height/4]]
+
+        hrnet_outputs = self.final_conv(
+            feats[0])  # hrnet_outputs: [batch_size, num_joints, width/4, height/4]
+
+        flatten_res = self.flatten(
+            hrnet_outputs)  # [batch_size,num_joints*32,32*32]
+
+        res = self.fc1(flatten_res)
+        res = self.act1(res)
+        res = self.fc2(res)
+        res = self.act2(res)
+        res = self.fc3(res)
+
+        if self.training:
+            return self.loss(res, self.inputs)
+        else:  # needed when exporting the model
+            return res
+
+    def get_loss(self):
+        return self._forward()
+
+    def get_pred(self):
+        res_lst = self._forward()
+        outputs = {'pose3d': res_lst}
+        return outputs
+
+    def flip_back(self, output_flipped, matched_parts):
+        assert output_flipped.ndim == 4,\
+                'output_flipped should be [batch_size, num_joints, height, width]'
+
+        output_flipped = output_flipped[:, :, :, ::-1]
+
+        for pair in matched_parts:
+            tmp = output_flipped[:, pair[0], :, :].copy()
+            output_flipped[:, pair[0], :, :] = output_flipped[:, pair[1], :, :]
+            output_flipped[:, pair[1], :, :] = tmp
+
+        return output_flipped
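
The soft_argmax() above is the 3-D heatmap version of the standard differentiable argmax: softmax the scores, then take the probability-weighted coordinate sum per axis (here with the depth dimension hard-coded to 32 bins via paddle.arange(1, 33)). A one-dimensional NumPy illustration:

    import numpy as np

    def soft_argmax_1d(logits):
        # Differentiable argmax: probability-weighted sum of coordinates.
        p = np.exp(logits - logits.max())
        p /= p.sum()
        return (p * np.arange(len(logits))).sum()

    scores = np.array([0.1, 0.2, 8.0, 0.3])   # sharp peak at index 2
    print(soft_argmax_1d(scores))             # ~2.0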

+ 217 - 0
paddlers/models/ppdet/modeling/architectures/keypoint_petr.py

@@ -0,0 +1,217 @@
+# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. 
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at 
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and 
+# limitations under the License.
+"""
+this code is base on https://github.com/hikvision-research/opera/blob/main/opera/models/detectors/petr.py
+"""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+import paddle
+from paddlers.models.ppdet.core.workspace import register
+from .meta_arch import BaseArch
+from .. import layers as L
+
+__all__ = ['PETR']
+
+
+@register
+class PETR(BaseArch):
+    __category__ = 'architecture'
+    __inject__ = ['backbone', 'neck', 'bbox_head']
+
+    def __init__(self,
+                 backbone='ResNet',
+                 neck='ChannelMapper',
+                 bbox_head='PETRHead'):
+        """
+        PETR, see https://openaccess.thecvf.com/content/CVPR2022/papers/Shi_End-to-End_Multi-Person_Pose_Estimation_With_Transformers_CVPR_2022_paper.pdf
+
+        Args:
+            backbone (nn.Layer): backbone instance
+            neck (nn.Layer): neck between backbone and head
+            bbox_head (nn.Layer): model output and loss
+        """
+        super(PETR, self).__init__()
+        self.backbone = backbone
+        if neck is not None:
+            self.with_neck = True
+        self.neck = neck
+        self.bbox_head = bbox_head
+        self.deploy = False
+
+    def extract_feat(self, img):
+        """Directly extract features from the backbone+neck."""
+        x = self.backbone(img)
+        if self.with_neck:
+            x = self.neck(x)
+        return x
+
+    def get_inputs(self):
+        img_metas = []
+        gt_bboxes = []
+        gt_labels = []
+        gt_keypoints = []
+        gt_areas = []
+        pad_gt_mask = self.inputs['pad_gt_mask'].astype("bool").squeeze(-1)
+        for idx, im_shape in enumerate(self.inputs['im_shape']):
+            img_meta = {
+                'img_shape': im_shape.astype("int32").tolist() + [1, ],
+                'batch_input_shape': self.inputs['image'].shape[-2:],
+                'image_name': self.inputs['image_file'][idx]
+            }
+            img_metas.append(img_meta)
+            if (not pad_gt_mask[idx].any()):
+                gt_keypoints.append(self.inputs['gt_joints'][idx][:1])
+                gt_labels.append(self.inputs['gt_class'][idx][:1])
+                gt_bboxes.append(self.inputs['gt_bbox'][idx][:1])
+                gt_areas.append(self.inputs['gt_areas'][idx][:1])
+                continue
+
+            gt_keypoints.append(self.inputs['gt_joints'][idx][pad_gt_mask[idx]])
+            gt_labels.append(self.inputs['gt_class'][idx][pad_gt_mask[idx]])
+            gt_bboxes.append(self.inputs['gt_bbox'][idx][pad_gt_mask[idx]])
+            gt_areas.append(self.inputs['gt_areas'][idx][pad_gt_mask[idx]])
+
+        return img_metas, gt_bboxes, gt_labels, gt_keypoints, gt_areas
+
+    def get_loss(self):
+        """
+        Args:
+            img (Tensor): Input images of shape (N, C, H, W).
+                Typically these should be mean centered and std scaled.
+            img_metas (list[dict]): A List of image info dict where each dict
+                has: 'img_shape', 'scale_factor', 'flip', and may also contain
+                'filename', 'ori_shape', 'pad_shape', and 'img_norm_cfg'.
+                For details on the values of these keys see
+                :class:`mmdet.datasets.pipelines.Collect`.
+            gt_bboxes (list[Tensor]): Each item are the truth boxes for each
+                image in [tl_x, tl_y, br_x, br_y] format.
+            gt_labels (list[Tensor]): Class indices corresponding to each box.
+            gt_keypoints (list[Tensor]): Each item are the truth keypoints for
+                each image in [p^{1}_x, p^{1}_y, p^{1}_v, ..., p^{K}_x,
+                p^{K}_y, p^{K}_v] format.
+            gt_areas (list[Tensor]): mask areas corresponding to each box.
+            gt_bboxes_ignore (None | list[Tensor]): Specify which bounding
+                boxes can be ignored when computing the loss.
+
+        Returns:
+            dict[str, Tensor]: A dictionary of loss components.
+        """
+
+        img_metas, gt_bboxes, gt_labels, gt_keypoints, gt_areas = self.get_inputs(
+        )
+        gt_bboxes_ignore = getattr(self.inputs, 'gt_bboxes_ignore', None)
+
+        x = self.extract_feat(self.inputs)
+        losses = self.bbox_head.forward_train(x, img_metas, gt_bboxes,
+                                              gt_labels, gt_keypoints, gt_areas,
+                                              gt_bboxes_ignore)
+        loss = 0
+        for k, v in losses.items():
+            loss += v
+        losses['loss'] = loss
+
+        return losses
+
+    def get_pred_numpy(self):
+        """Used for computing network flops.
+        """
+
+        img = self.inputs['image']
+        batch_size, _, height, width = img.shape
+        dummy_img_metas = [
+            dict(
+                batch_input_shape=(height, width),
+                img_shape=(height, width, 3),
+                scale_factor=(1., 1., 1., 1.)) for _ in range(batch_size)
+        ]
+        x = self.extract_feat(img)
+        outs = self.bbox_head(x, img_metas=dummy_img_metas)
+        bbox_list = self.bbox_head.get_bboxes(
+            *outs, dummy_img_metas, rescale=True)
+        return bbox_list
+
+    def get_pred(self):
+        """
+        """
+        img = self.inputs['image']
+        batch_size, _, height, width = img.shape
+        img_metas = [
+            dict(
+                batch_input_shape=(height, width),
+                img_shape=(height, width, 3),
+                scale_factor=self.inputs['scale_factor'][i])
+            for i in range(batch_size)
+        ]
+        kptpred = self.simple_test(
+            self.inputs, img_metas=img_metas, rescale=True)
+        keypoints = kptpred[0][1][0]
+        bboxs = kptpred[0][0][0]
+        keypoints[..., 2] = bboxs[:, None, 4]
+        res_lst = [[keypoints, bboxs[:, 4]]]
+        outputs = {'keypoint': res_lst}
+        return outputs
+
+    def simple_test(self, inputs, img_metas, rescale=False):
+        """Test function without test time augmentation.
+
+        Args:
+            inputs (list[paddle.Tensor]): List of multiple images.
+            img_metas (list[dict]): List of image information.
+            rescale (bool, optional): Whether to rescale the results.
+                Defaults to False.
+
+        Returns:
+            list[list[np.ndarray]]: BBox and keypoint results of each image
+                and classes. The outer list corresponds to each image.
+                The inner list corresponds to each class.
+        """
+        batch_size = len(img_metas)
+        assert batch_size == 1, 'Currently only batch_size 1 for inference ' \
+            f'mode is supported. Found batch_size {batch_size}.'
+        feat = self.extract_feat(inputs)
+        results_list = self.bbox_head.simple_test(
+            feat, img_metas, rescale=rescale)
+
+        bbox_kpt_results = [
+            self.bbox_kpt2result(det_bboxes, det_labels, det_kpts,
+                                 self.bbox_head.num_classes)
+            for det_bboxes, det_labels, det_kpts in results_list
+        ]
+        return bbox_kpt_results
+
+    def bbox_kpt2result(self, bboxes, labels, kpts, num_classes):
+        """Convert detection results to a list of numpy arrays.
+
+        Args:
+            bboxes (paddle.Tensor | np.ndarray): shape (n, 5).
+            labels (paddle.Tensor | np.ndarray): shape (n, ).
+            kpts (paddle.Tensor | np.ndarray): shape (n, K, 3).
+            num_classes (int): class number, including background class.
+
+        Returns:
+            list(ndarray): bbox and keypoint results of each class.
+        """
+        if bboxes.shape[0] == 0:
+            return [np.zeros((0, 5), dtype=np.float32) for i in range(num_classes)], \
+                [np.zeros((0, kpts.shape[1], 3), dtype=np.float32)
+                    for i in range(num_classes)]
+        else:
+            if isinstance(bboxes, paddle.Tensor):
+                bboxes = bboxes.numpy()
+                labels = labels.numpy()
+                kpts = kpts.numpy()
+            return [bboxes[labels == i, :] for i in range(num_classes)], \
+                [kpts[labels == i, :, :] for i in range(num_classes)]
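
bbox_kpt2result() simply buckets detections by predicted class; a toy NumPy illustration with invented values:

    import numpy as np

    bboxes = np.array([[0, 0, 10, 10, 0.9], [5, 5, 20, 20, 0.8]], dtype=np.float32)
    labels = np.array([0, 1])
    kpts = np.zeros((2, 17, 3), dtype=np.float32)
    num_classes = 2
    per_class_boxes = [bboxes[labels == i] for i in range(num_classes)]
    per_class_kpts = [kpts[labels == i] for i in range(num_classes)]
    print([b.shape for b in per_class_boxes])  # [(1, 5), (1, 5)]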

+ 22 - 5
paddlers/models/ppdet/modeling/architectures/mask_rcnn.py

@@ -106,8 +106,8 @@ class MaskRCNN(BaseArch):
             im_shape = self.inputs['im_shape']
             scale_factor = self.inputs['scale_factor']
 
-            bbox, bbox_num = self.bbox_post_process(preds, (rois, rois_num),
-                                                    im_shape, scale_factor)
+            bbox, bbox_num, nms_keep_idx = self.bbox_post_process(
+                preds, (rois, rois_num), im_shape, scale_factor)
             mask_out = self.mask_head(
                 body_feats, bbox, bbox_num, self.inputs, feat_func=feat_func)
 
@@ -117,7 +117,20 @@ class MaskRCNN(BaseArch):
             origin_shape = self.bbox_post_process.get_origin_shape()
             mask_pred = self.mask_post_process(mask_out, bbox_pred, bbox_num,
                                                origin_shape)
-            return bbox_pred, bbox_num, mask_pred
+
+            if self.use_extra_data:
+                extra_data = {}  # record the bbox output before nms, such as scores and nms_keep_idx
+                """extra_data:{
+                            'scores': predict scores,
+                            'nms_keep_idx': bbox index before nms,
+                           }
+                """
+                extra_data['scores'] = preds[1]  # predict scores (probability)
+                # Todo: get logits output
+                extra_data['nms_keep_idx'] = nms_keep_idx  # bbox index before nms
+                return bbox_pred, bbox_num, mask_pred, extra_data
+            else:
+                return bbox_pred, bbox_num, mask_pred
 
     def get_loss(self, ):
         bbox_loss, mask_loss, rpn_loss = self._forward()
@@ -130,6 +143,10 @@ class MaskRCNN(BaseArch):
         return loss
 
     def get_pred(self):
-        bbox_pred, bbox_num, mask_pred = self._forward()
-        output = {'bbox': bbox_pred, 'bbox_num': bbox_num, 'mask': mask_pred}
+        if self.use_extra_data:
+            bbox_pred, bbox_num, mask_pred, extra_data = self._forward()
+            output = {'bbox': bbox_pred, 'bbox_num': bbox_num, 'mask': mask_pred, 'extra_data': extra_data}
+        else:
+            bbox_pred, bbox_num, mask_pred = self._forward()
+            output = {'bbox': bbox_pred, 'bbox_num': bbox_num, 'mask': mask_pred}
         return output

+ 2 - 1
paddlers/models/ppdet/modeling/architectures/meta_arch.py

@@ -15,11 +15,12 @@ __all__ = ['BaseArch']
 
 @register
 class BaseArch(nn.Layer):
-    def __init__(self, data_format='NCHW'):
+    def __init__(self, data_format='NCHW', use_extra_data=False):
         super(BaseArch, self).__init__()
         self.data_format = data_format
         self.inputs = {}
         self.fuse_norm = False
+        self.use_extra_data = use_extra_data
 
     def load_meanstd(self, cfg_transform):
         scale = 1.

+ 114 - 0
paddlers/models/ppdet/modeling/architectures/pose3d_metro.py

@@ -0,0 +1,114 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. 
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at 
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and 
+# limitations under the License.
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import paddle
+import paddle.nn as nn
+import paddle.nn.functional as F
+from paddlers.models.ppdet.core.workspace import register, create
+from .meta_arch import BaseArch
+from .. import layers as L
+
+__all__ = ['METRO_Body']
+
+
+def orthographic_projection(X, camera):
+    """Perform orthographic projection of 3D points X using the camera parameters
+    Args:
+        X: size = [B, N, 3]
+        camera: size = [B, 3]
+    Returns:
+        Projected 2D points -- size = [B, N, 2]
+    """
+    camera = camera.reshape((-1, 1, 3))
+    X_trans = X[:, :, :2] + camera[:, :, 1:]
+    shape = paddle.shape(X_trans)
+    X_2d = (camera[:, :, 0] * X_trans.reshape((shape[0], -1))).reshape(shape)
+    return X_2d
+
+
+@register
+class METRO_Body(BaseArch):
+    __category__ = 'architecture'
+    __inject__ = ['loss']
+
+    def __init__(
+            self,
+            num_joints,
+            backbone='HRNet',
+            trans_encoder='',
+            loss='Pose3DLoss', ):
+        """
+        Modified from METRO network, see https://arxiv.org/abs/2012.09760
+
+        Args:
+            backbone (nn.Layer): backbone instance
+        """
+        super(METRO_Body, self).__init__()
+        self.num_joints = num_joints
+        self.backbone = backbone
+        self.loss = loss
+        self.deploy = False
+
+        self.trans_encoder = trans_encoder
+        self.conv_learn_tokens = paddle.nn.Conv1D(49, num_joints + 10, 1)
+        self.cam_param_fc = paddle.nn.Linear(3, 2)
+
+    @classmethod
+    def from_config(cls, cfg, *args, **kwargs):
+        # backbone
+        backbone = create(cfg['backbone'])
+        trans_encoder = create(cfg['trans_encoder'])
+
+        return {'backbone': backbone, 'trans_encoder': trans_encoder}
+
+    def _forward(self):
+        batch_size = self.inputs['image'].shape[0]
+
+        image_feat = self.backbone(self.inputs)
+        image_feat_flatten = image_feat.reshape((batch_size, 2048, 49))
+        image_feat_flatten = image_feat_flatten.transpose(perm=(0, 2, 1))
+        # and apply a conv layer to learn image token for each 3d joint/vertex position
+        features = self.conv_learn_tokens(image_feat_flatten)  # (B, J, C)
+
+        if self.training:
+            # apply mask vertex/joint modeling
+            # meta_masks is a tensor of all the masks, randomly generated in dataloader
+            # we pre-define a [MASK] token, which is a floating-value vector with 0.01s
+            meta_masks = self.inputs['mjm_mask'].expand((-1, -1, 2048))
+            constant_tensor = paddle.ones_like(features) * 0.01
+            features = features * meta_masks + constant_tensor * (1 - meta_masks
+                                                                  )
+        pred_out = self.trans_encoder(features)
+
+        pred_3d_joints = pred_out[:, :self.num_joints, :]
+        cam_features = pred_out[:, self.num_joints:, :]
+
+        # learn camera parameters
+        pred_2d_joints = self.cam_param_fc(cam_features)
+        return pred_3d_joints, pred_2d_joints
+
+    def get_loss(self):
+        preds_3d, preds_2d = self._forward()
+        loss = self.loss(preds_3d, preds_2d, self.inputs)
+        output = {'loss': loss}
+        return output
+
+    def get_pred(self):
+        preds_3d, preds_2d = self._forward()
+        outputs = {'pose3d': preds_3d, 'pose2d': preds_2d}
+        return outputs

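orthographic_projection above is a weak-perspective projection: the translated x/y coordinates are multiplied by a per-sample scale taken from the camera vector (s, tx, ty). A small self-contained check with toy tensors, not taken from this diff:

import paddle

# Toy check of the projection: X_2d = s * (X[..., :2] + [tx, ty]) for every joint.
X = paddle.to_tensor([[[1.0, 2.0, 5.0], [3.0, 4.0, 9.0]]])   # [B=1, N=2, 3]
camera = paddle.to_tensor([[2.0, 0.5, -0.5]])                # [B, 3] = (s, tx, ty)
camera = camera.reshape((-1, 1, 3))
X_trans = X[:, :, :2] + camera[:, :, 1:]
shape = paddle.shape(X_trans)
X_2d = (camera[:, :, 0] * X_trans.reshape((shape[0], -1))).reshape(shape)
print(X_2d.numpy())   # [[[3. 3.] [7. 7.]]]: 2*(1+0.5), 2*(2-0.5), 2*(3+0.5), 2*(4-0.5)
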
+ 260 - 0
paddlers/models/ppdet/modeling/architectures/ppyoloe.py

@@ -0,0 +1,260 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. 
+#   
+# Licensed under the Apache License, Version 2.0 (the "License");   
+# you may not use this file except in compliance with the License.  
+# You may obtain a copy of the License at   
+#   
+#     http://www.apache.org/licenses/LICENSE-2.0    
+#   
+# Unless required by applicable law or agreed to in writing, software   
+# distributed under the License is distributed on an "AS IS" BASIS, 
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.  
+# See the License for the specific language governing permissions and   
+# limitations under the License.
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import copy
+import paddle
+from paddlers.models.ppdet.core.workspace import register, create
+from .meta_arch import BaseArch
+
+__all__ = ['PPYOLOE', 'PPYOLOEWithAuxHead']
+# PP-YOLOE and PP-YOLOE+ are recommended to use this architecture, especially when using distillation or an aux head
+# PP-YOLOE and PP-YOLOE+ can also use the same YOLOv3 architecture in yolo.py when not using distillation or an aux head
+
+
+@register
+class PPYOLOE(BaseArch):
+    """
+    PPYOLOE network, see https://arxiv.org/abs/2203.16250
+
+    Args:
+        backbone (nn.Layer): backbone instance
+        neck (nn.Layer): neck instance
+        yolo_head (nn.Layer): anchor_head instance
+        post_process (object): `BBoxPostProcess` instance
+        ssod_loss (object): 'SSODPPYOLOELoss' instance, only used for semi-det (SSOD)
+        for_distill (bool): whether for distillation
+        feat_distill_place (str): distill which feature for distillation
+        for_mot (bool): whether return other features for multi-object tracking
+            models, default False in pure object detection models.
+    """
+
+    __category__ = 'architecture'
+    __shared__ = ['for_distill']
+    __inject__ = ['post_process', 'ssod_loss']
+
+    def __init__(self,
+                 backbone='CSPResNet',
+                 neck='CustomCSPPAN',
+                 yolo_head='PPYOLOEHead',
+                 post_process='BBoxPostProcess',
+                 ssod_loss='SSODPPYOLOELoss',
+                 for_distill=False,
+                 feat_distill_place='neck_feats',
+                 for_mot=False):
+        super(PPYOLOE, self).__init__()
+        self.backbone = backbone
+        self.neck = neck
+        self.yolo_head = yolo_head
+        self.post_process = post_process
+        self.for_mot = for_mot
+
+        # for ssod, semi-det
+        self.is_teacher = False
+        self.ssod_loss = ssod_loss
+
+        # distill
+        self.for_distill = for_distill
+        self.feat_distill_place = feat_distill_place
+        if for_distill:
+            assert feat_distill_place in ['backbone_feats', 'neck_feats']
+
+    @classmethod
+    def from_config(cls, cfg, *args, **kwargs):
+        backbone = create(cfg['backbone'])
+
+        kwargs = {'input_shape': backbone.out_shape}
+        neck = create(cfg['neck'], **kwargs)
+
+        kwargs = {'input_shape': neck.out_shape}
+        yolo_head = create(cfg['yolo_head'], **kwargs)
+
+        return {
+            'backbone': backbone,
+            'neck': neck,
+            "yolo_head": yolo_head,
+        }
+
+    def _forward(self):
+        body_feats = self.backbone(self.inputs)
+        neck_feats = self.neck(body_feats, self.for_mot)
+
+        self.is_teacher = self.inputs.get('is_teacher', False)  # for semi-det
+        if self.training or self.is_teacher:
+            yolo_losses = self.yolo_head(neck_feats, self.inputs)
+
+            if self.for_distill:
+                if self.feat_distill_place == 'backbone_feats':
+                    self.yolo_head.distill_pairs['backbone_feats'] = body_feats
+                elif self.feat_distill_place == 'neck_feats':
+                    self.yolo_head.distill_pairs['neck_feats'] = neck_feats
+                else:
+                    raise ValueError
+            return yolo_losses
+        else:
+
+            yolo_head_outs = self.yolo_head(neck_feats)
+
+            if self.post_process is not None:
+                bbox, bbox_num, nms_keep_idx = self.post_process(
+                    yolo_head_outs, self.yolo_head.mask_anchors,
+                    self.inputs['im_shape'], self.inputs['scale_factor'])
+
+            else:
+                bbox, bbox_num, nms_keep_idx = self.yolo_head.post_process(
+                    yolo_head_outs, self.inputs['scale_factor'])
+
+            if self.use_extra_data:
+                extra_data = {}  # record the bbox outputs before NMS, such as scores and nms_keep_idx
+                """extra_data:{
+                            'scores': predict scores,
+                            'nms_keep_idx': bbox index before nms,
+                           }
+                           """
+                extra_data['scores'] = yolo_head_outs[0]  # predicted scores (probabilities)
+                extra_data['nms_keep_idx'] = nms_keep_idx
+                output = {'bbox': bbox, 'bbox_num': bbox_num, 'extra_data': extra_data}
+            else:
+                output = {'bbox': bbox, 'bbox_num': bbox_num}
+
+            return output
+
+    def get_loss(self):
+        return self._forward()
+
+    def get_pred(self):
+        return self._forward()
+
+    def get_loss_keys(self):
+        return ['loss_cls', 'loss_iou', 'loss_dfl', 'loss_contrast']
+
+    def get_ssod_loss(self, student_head_outs, teacher_head_outs, train_cfg):
+        ssod_losses = self.ssod_loss(student_head_outs, teacher_head_outs,
+                                     train_cfg)
+        return ssod_losses
+
+
+@register
+class PPYOLOEWithAuxHead(BaseArch):
+    __category__ = 'architecture'
+    __inject__ = ['post_process']
+
+    def __init__(self,
+                 backbone='CSPResNet',
+                 neck='CustomCSPPAN',
+                 yolo_head='PPYOLOEHead',
+                 aux_head='SimpleConvHead',
+                 post_process='BBoxPostProcess',
+                 for_mot=False,
+                 detach_epoch=5):
+        """
+        PPYOLOE network, see https://arxiv.org/abs/2203.16250
+
+        Args:
+            backbone (nn.Layer): backbone instance
+            neck (nn.Layer): neck instance
+            yolo_head (nn.Layer): anchor_head instance
+            post_process (object): `BBoxPostProcess` instance
+            for_mot (bool): whether return other features for multi-object tracking
+                models, default False in pure object detection models.
+        """
+        super(PPYOLOEWithAuxHead, self).__init__()
+        self.backbone = backbone
+        self.neck = neck
+        self.aux_neck = copy.deepcopy(self.neck)
+
+        self.yolo_head = yolo_head
+        self.aux_head = aux_head
+        self.post_process = post_process
+        self.for_mot = for_mot
+        self.detach_epoch = detach_epoch
+
+    @classmethod
+    def from_config(cls, cfg, *args, **kwargs):
+        # backbone
+        backbone = create(cfg['backbone'])
+
+        # fpn
+        kwargs = {'input_shape': backbone.out_shape}
+        neck = create(cfg['neck'], **kwargs)
+        aux_neck = copy.deepcopy(neck)
+
+        # head
+        kwargs = {'input_shape': neck.out_shape}
+        yolo_head = create(cfg['yolo_head'], **kwargs)
+        aux_head = create(cfg['aux_head'], **kwargs)
+
+        return {
+            'backbone': backbone,
+            'neck': neck,
+            "yolo_head": yolo_head,
+            'aux_head': aux_head,
+        }
+
+    def _forward(self):
+        body_feats = self.backbone(self.inputs)
+        neck_feats = self.neck(body_feats, self.for_mot)
+
+        if self.training:
+            if self.inputs['epoch_id'] >= self.detach_epoch:
+                aux_neck_feats = self.aux_neck([f.detach() for f in body_feats])
+                dual_neck_feats = (paddle.concat(
+                    [f.detach(), aux_f], axis=1) for f, aux_f in
+                                   zip(neck_feats, aux_neck_feats))
+            else:
+                aux_neck_feats = self.aux_neck(body_feats)
+                dual_neck_feats = (paddle.concat(
+                    [f, aux_f], axis=1) for f, aux_f in
+                                   zip(neck_feats, aux_neck_feats))
+            aux_cls_scores, aux_bbox_preds = self.aux_head(dual_neck_feats)
+            loss = self.yolo_head(
+                neck_feats,
+                self.inputs,
+                aux_pred=[aux_cls_scores, aux_bbox_preds])
+            return loss
+        else:
+            yolo_head_outs = self.yolo_head(neck_feats)
+
+            if self.post_process is not None:
+                bbox, bbox_num, nms_keep_idx = self.post_process(
+                    yolo_head_outs, self.yolo_head.mask_anchors,
+                    self.inputs['im_shape'], self.inputs['scale_factor'])
+            else:
+                bbox, bbox_num, nms_keep_idx = self.yolo_head.post_process(
+                    yolo_head_outs, self.inputs['scale_factor'])
+
+            if self.use_extra_data:
+                extra_data = {}  # record the bbox outputs before NMS, such as scores and nms_keep_idx
+                """extra_data:{
+                            'scores': predict scores,
+                            'nms_keep_idx': bbox index before nms,
+                           }
+                           """
+                extra_data['scores'] = yolo_head_outs[0]  # predicted scores (probabilities)
+                # Todo: get logits output
+                extra_data['nms_keep_idx'] = nms_keep_idx
+                output = {'bbox': bbox, 'bbox_num': bbox_num, 'extra_data': extra_data}
+            else:
+                output = {'bbox': bbox, 'bbox_num': bbox_num}
+
+            return output
+
+    def get_loss(self):
+        return self._forward()
+
+    def get_pred(self):
+        return self._forward()

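The auxiliary head in PPYOLOEWithAuxHead sees features whose channel dimension is the concatenation of the main neck features and the aux neck features; once epoch_id reaches detach_epoch, the aux neck consumes detached backbone features and the main neck features are detached before concatenation, so aux-head gradients no longer reach the shared backbone or main neck. A toy sketch of that detach-and-concat pattern on plain tensors (shapes are illustrative):

import paddle

neck_feats = [paddle.rand([2, 8, 16, 16]), paddle.rand([2, 8, 8, 8])]
aux_neck_feats = [paddle.rand([2, 8, 16, 16]), paddle.rand([2, 8, 8, 8])]
epoch_id, detach_epoch = 6, 5

if epoch_id >= detach_epoch:
    # stop aux-head gradients from flowing back into the main branch
    dual = [paddle.concat([f.detach(), aux_f], axis=1)
            for f, aux_f in zip(neck_feats, aux_neck_feats)]
else:
    dual = [paddle.concat([f, aux_f], axis=1)
            for f, aux_f in zip(neck_feats, aux_neck_feats)]
print([d.shape for d in dual])   # channel dim doubled: [2, 16, 16, 16], [2, 16, 8, 8]
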
+ 104 - 0
paddlers/models/ppdet/modeling/architectures/queryinst.py

@@ -0,0 +1,104 @@
+# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import paddle
+
+from paddlers.models.ppdet.core.workspace import register, create
+from .meta_arch import BaseArch
+
+__all__ = ['QueryInst']
+
+
+@register
+class QueryInst(BaseArch):
+    __category__ = 'architecture'
+    __inject__ = ['post_process']
+
+    def __init__(self,
+                 backbone,
+                 neck,
+                 rpn_head,
+                 roi_head,
+                 post_process='SparsePostProcess'):
+        super(QueryInst, self).__init__()
+        self.backbone = backbone
+        self.neck = neck
+        self.rpn_head = rpn_head
+        self.roi_head = roi_head
+        self.post_process = post_process
+
+    @classmethod
+    def from_config(cls, cfg, *args, **kwargs):
+        backbone = create(cfg['backbone'])
+        kwargs = {'input_shape': backbone.out_shape}
+        neck = create(cfg['neck'], **kwargs)
+
+        kwargs = {'input_shape': neck.out_shape}
+        rpn_head = create(cfg['rpn_head'], **kwargs)
+        roi_head = create(cfg['roi_head'], **kwargs)
+
+        return {
+            'backbone': backbone,
+            'neck': neck,
+            'rpn_head': rpn_head,
+            "roi_head": roi_head
+        }
+
+    def _forward(self, targets=None):
+        features = self.backbone(self.inputs)
+        features = self.neck(features)
+
+        proposal_bboxes, proposal_features = self.rpn_head(self.inputs[
+            'img_whwh'])
+        outputs = self.roi_head(features, proposal_bboxes, proposal_features,
+                                targets)
+
+        if self.training:
+            return outputs
+        else:
+            bbox_pred, bbox_num, mask_pred = self.post_process(
+                outputs['class_logits'], outputs['bbox_pred'],
+                self.inputs['scale_factor_whwh'], self.inputs['ori_shape'],
+                outputs['mask_logits'])
+            return bbox_pred, bbox_num, mask_pred
+
+    def get_loss(self):
+        targets = []
+        for i in range(len(self.inputs['img_whwh'])):
+            boxes = self.inputs['gt_bbox'][i]
+            labels = self.inputs['gt_class'][i].squeeze(-1)
+            img_whwh = self.inputs['img_whwh'][i]
+            if boxes.shape[0] != 0:
+                img_whwh_tgt = img_whwh.unsqueeze(0).tile([boxes.shape[0], 1])
+            else:
+                img_whwh_tgt = paddle.zeros_like(boxes)
+            gt_segm = self.inputs['gt_segm'][i].astype('float32')
+            targets.append({
+                'boxes': boxes,
+                'labels': labels,
+                'img_whwh': img_whwh,
+                'img_whwh_tgt': img_whwh_tgt,
+                'gt_segm': gt_segm
+            })
+        losses = self._forward(targets)
+        losses.update({'loss': sum(losses.values())})
+        return losses
+
+    def get_pred(self):
+        bbox_pred, bbox_num, mask_pred = self._forward()
+        return {'bbox': bbox_pred, 'bbox_num': bbox_num, 'mask': mask_pred}

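get_loss() above builds one target dict per image before calling _forward(targets); a toy sketch of the layout of a single target, with gt_segm omitted and illustrative values (three gt boxes for a 640x480 image):

import paddle

boxes = paddle.rand([3, 4])                         # gt_bbox[i]
labels = paddle.to_tensor([1, 0, 2])                # gt_class[i].squeeze(-1)
img_whwh = paddle.to_tensor([640., 480., 640., 480.])
img_whwh_tgt = (img_whwh.unsqueeze(0).tile([boxes.shape[0], 1])
                if boxes.shape[0] != 0 else paddle.zeros_like(boxes))
target = {'boxes': boxes, 'labels': labels,
          'img_whwh': img_whwh, 'img_whwh_tgt': img_whwh_tgt}
print(target['img_whwh_tgt'].shape)                 # [3, 4], one whwh row per gt box
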
+ 18 - 2
paddlers/models/ppdet/modeling/architectures/retinanet.py

@@ -19,6 +19,7 @@ from __future__ import print_function
 from paddlers.models.ppdet.core.workspace import register, create
 from .meta_arch import BaseArch
 import paddle
+import paddle.nn.functional as F
 
 __all__ = ['RetinaNet']
 
@@ -57,9 +58,24 @@ class RetinaNet(BaseArch):
             return self.head(neck_feats, self.inputs)
         else:
             head_outs = self.head(neck_feats)
-            bbox, bbox_num = self.head.post_process(
+            bbox, bbox_num, nms_keep_idx = self.head.post_process(
                 head_outs, self.inputs['im_shape'], self.inputs['scale_factor'])
-            return {'bbox': bbox, 'bbox_num': bbox_num}
+
+            if self.use_extra_data:
+                extra_data = {}  # record the bbox outputs before NMS, such as scores and nms_keep_idx
+                """extra_data:{
+                            'scores': predict scores,
+                            'nms_keep_idx': bbox index before nms,
+                           }
+                           """
+                preds_logits = self.head.decode_cls_logits(head_outs[0])
+                preds_scores = F.sigmoid(preds_logits)
+                extra_data['logits'] = preds_logits
+                extra_data['scores'] = preds_scores
+                extra_data['nms_keep_idx'] = nms_keep_idx  # bbox index before nms
+                return {'bbox': bbox, 'bbox_num': bbox_num, "extra_data": extra_data}
+            else:
+                return {'bbox': bbox, 'bbox_num': bbox_num}
 
     def get_loss(self):
         return self._forward()

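Unlike softmax-based heads, a RetinaNet-style head scores each class independently, so the pre-NMS probabilities stored in extra_data come from a sigmoid over the decoded class logits. A toy sketch with made-up shapes (decode_cls_logits itself is not reproduced here):

import paddle
import paddle.nn.functional as F

logits = paddle.randn([1, 100, 80])    # [batch, num_priors, num_classes], illustrative
scores = F.sigmoid(logits)             # per-class probabilities in (0, 1), not summing to 1
print(scores.shape)
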
+ 3 - 3
paddlers/models/ppdet/modeling/architectures/sparse_rcnn.py

@@ -60,10 +60,10 @@ class SparseRCNN(BaseArch):
         head_outs = self.head(fpn_feats, self.inputs["img_whwh"])
 
         if not self.training:
-            bboxes = self.postprocess(
+            bbox_pred, bbox_num = self.postprocess(
                 head_outs["pred_logits"], head_outs["pred_boxes"],
-                self.inputs["scale_factor_wh"], self.inputs["img_whwh"])
-            return bboxes
+                self.inputs["scale_factor_whwh"], self.inputs["ori_shape"])
+            return bbox_pred, bbox_num
         else:
             return head_outs
 

+ 35 - 9
paddlers/models/ppdet/modeling/architectures/ssd.py

@@ -18,6 +18,8 @@ from __future__ import print_function
 
 from paddlers.models.ppdet.core.workspace import register, create
 from .meta_arch import BaseArch
+import paddle
+import paddle.nn.functional as F
 
 __all__ = ['SSD']
 
@@ -75,18 +77,42 @@ class SSD(BaseArch):
                                  self.inputs['gt_class'])
         else:
             preds, anchors = self.ssd_head(body_feats, self.inputs['image'])
-            bbox, bbox_num = self.post_process(preds, anchors,
-                                               self.inputs['im_shape'],
-                                               self.inputs['scale_factor'])
-            return bbox, bbox_num
+            bbox, bbox_num, nms_keep_idx = self.post_process(
+                preds, anchors, self.inputs['im_shape'],
+                self.inputs['scale_factor'])
+
+            if self.use_extra_data:
+                extra_data = {}  # record the bbox outputs before NMS, such as scores and nms_keep_idx
+                """extra_data:{
+                            'scores': predict scores,
+                            'nms_keep_idx': bbox index before nms,
+                           }
+                           """
+                preds_logits = preds[1]  # [[1xNumBBoxNumClass]]
+                extra_data['scores'] = F.softmax(paddle.concat(
+                    preds_logits, axis=1)).transpose([0, 2, 1])
+                extra_data['logits'] = paddle.concat(
+                    preds_logits, axis=1).transpose([0, 2, 1])
+                extra_data['nms_keep_idx'] = nms_keep_idx  # bbox index before nms
+                return bbox, bbox_num, extra_data
+            else:
+                return bbox, bbox_num
 
     def get_loss(self, ):
         return {"loss": self._forward()}
 
     def get_pred(self):
-        bbox_pred, bbox_num = self._forward()
-        output = {
-            "bbox": bbox_pred,
-            "bbox_num": bbox_num,
-        }
+        if self.use_extra_data:
+            bbox_pred, bbox_num, extra_data = self._forward()
+            output = {
+                "bbox": bbox_pred,
+                "bbox_num": bbox_num,
+                "extra_data": extra_data
+            }
+        else:
+            bbox_pred, bbox_num = self._forward()
+            output = {
+                "bbox": bbox_pred,
+                "bbox_num": bbox_num,
+            }
         return output

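For SSD the per-level class logits are concatenated over the prior dimension and exported twice: once soft-maxed over classes and once raw, both transposed to [B, num_classes, num_priors]. A toy sketch with two made-up feature levels:

import paddle
import paddle.nn.functional as F

preds_logits = [paddle.randn([1, 6, 21]), paddle.randn([1, 4, 21])]   # two levels, 21 classes
logits = paddle.concat(preds_logits, axis=1)                          # [1, 10, 21]
scores = F.softmax(logits, axis=-1).transpose([0, 2, 1])              # [1, 21, 10]
raw = logits.transpose([0, 2, 1])                                     # [1, 21, 10]
print(scores.shape, raw.shape)
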
+ 28 - 5
paddlers/models/ppdet/modeling/architectures/yolo.py

@@ -21,6 +21,8 @@ from .meta_arch import BaseArch
 from ..post_process import JDEBBoxPostProcess
 
 __all__ = ['YOLOv3']
+# YOLOv3, PP-YOLO, PP-YOLOv2, PP-YOLOE and PP-YOLOE+ all share this YOLOv3 architecture
+# PP-YOLOE and PP-YOLOE+ are recommended to use the PPYOLOE architecture in ppyoloe.py, especially when using distillation or an aux head
 
 
 @register
@@ -77,7 +79,10 @@ class YOLOv3(BaseArch):
 
     def _forward(self):
         body_feats = self.backbone(self.inputs)
-        neck_feats = self.neck(body_feats, self.for_mot)
+        if self.for_mot:
+            neck_feats = self.neck(body_feats, self.for_mot)
+        else:
+            neck_feats = self.neck(body_feats)
 
         if isinstance(neck_feats, dict):
             assert self.for_mot == True
@@ -96,6 +101,7 @@ class YOLOv3(BaseArch):
             yolo_head_outs = self.yolo_head(neck_feats)
 
             if self.for_mot:
+                # the detection part of JDE MOT model
                 boxes_idx, bbox, bbox_num, nms_keep_idx = self.post_process(
                     yolo_head_outs, self.yolo_head.mask_anchors)
                 output = {
@@ -107,16 +113,33 @@ class YOLOv3(BaseArch):
                 }
             else:
                 if self.return_idx:
-                    _, bbox, bbox_num, _ = self.post_process(
+                    # the detection part of JDE MOT model
+                    _, bbox, bbox_num, nms_keep_idx = self.post_process(
                         yolo_head_outs, self.yolo_head.mask_anchors)
                 elif self.post_process is not None:
-                    bbox, bbox_num = self.post_process(
+                    # anchor-based YOLOs (YOLOv3, PP-YOLO, PP-YOLOv2) use mask_anchors
+                    bbox, bbox_num, nms_keep_idx = self.post_process(
                         yolo_head_outs, self.yolo_head.mask_anchors,
                         self.inputs['im_shape'], self.inputs['scale_factor'])
                 else:
-                    bbox, bbox_num = self.yolo_head.post_process(
+                    # anchor-free YOLOs: PP-YOLOE, PP-YOLOE+
+                    bbox, bbox_num, nms_keep_idx = self.yolo_head.post_process(
                         yolo_head_outs, self.inputs['scale_factor'])
-                output = {'bbox': bbox, 'bbox_num': bbox_num}
+
+                if self.use_extra_data:
+                    extra_data = {}  # record the bbox outputs before NMS, such as scores and nms_keep_idx
+                    """extra_data:{
+                                'scores': predict scores,
+                                'nms_keep_idx': bbox index before nms,
+                               }
+                    """
+                    extra_data['scores'] = yolo_head_outs[0]  # predicted scores (probabilities)
+                    # Todo: get logits output
+                    extra_data['nms_keep_idx'] = nms_keep_idx
+                    # Todo: add support for mask_anchors-based YOLOs
+                    output = {'bbox': bbox, 'bbox_num': bbox_num, 'extra_data': extra_data}
+                else:
+                    output = {'bbox': bbox, 'bbox_num': bbox_num}
 
             return output
 

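Every post-processing path above now also returns nms_keep_idx, the indices of the candidates that survive NMS (indexed into the pre-NMS candidate set). The exact tensor layout depends on the post-processor, so the following is only an assumed-shape sketch of how those indices can be used to pull the matching pre-NMS scores:

import paddle

pre_nms_scores = paddle.to_tensor([0.9, 0.1, 0.7, 0.3, 0.8])   # one score per candidate
nms_keep_idx = paddle.to_tensor([0, 4, 2])                     # candidates kept by NMS
kept_scores = paddle.gather(pre_nms_scores, nms_keep_idx)
print(kept_scores.numpy())                                     # [0.9 0.8 0.7]
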
+ 88 - 0
paddlers/models/ppdet/modeling/architectures/yolof.py

@@ -0,0 +1,88 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. 
+#   
+# Licensed under the Apache License, Version 2.0 (the "License");   
+# you may not use this file except in compliance with the License.  
+# You may obtain a copy of the License at   
+#   
+#     http://www.apache.org/licenses/LICENSE-2.0    
+#   
+# Unless required by applicable law or agreed to in writing, software   
+# distributed under the License is distributed on an "AS IS" BASIS, 
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.  
+# See the License for the specific language governing permissions and   
+# limitations under the License.
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from paddlers.models.ppdet.core.workspace import register, create
+from .meta_arch import BaseArch
+
+__all__ = ['YOLOF']
+
+
+@register
+class YOLOF(BaseArch):
+    __category__ = 'architecture'
+
+    def __init__(self,
+                 backbone='ResNet',
+                 neck='DilatedEncoder',
+                 head='YOLOFHead',
+                 for_mot=False):
+        """
+        YOLOF network, see https://arxiv.org/abs/2103.09460
+
+        Args:
+            backbone (nn.Layer): backbone instance
+            neck (nn.Layer): DilatedEncoder instance
+            head (nn.Layer): YOLOFHead instance
+            for_mot (bool): whether return other features for multi-object tracking
+                models, default False in pure object detection models.
+        """
+        super(YOLOF, self).__init__()
+        self.backbone = backbone
+        self.neck = neck
+        self.head = head
+        self.for_mot = for_mot
+
+    @classmethod
+    def from_config(cls, cfg, *args, **kwargs):
+        # backbone
+        backbone = create(cfg['backbone'])
+
+        # fpn
+        kwargs = {'input_shape': backbone.out_shape}
+        neck = create(cfg['neck'], **kwargs)
+
+        # head
+        kwargs = {'input_shape': neck.out_shape}
+        head = create(cfg['head'], **kwargs)
+
+        return {
+            'backbone': backbone,
+            'neck': neck,
+            "head": head,
+        }
+
+    def _forward(self):
+        body_feats = self.backbone(self.inputs)
+        neck_feats = self.neck(body_feats, self.for_mot)
+
+        if self.training:
+            yolo_losses = self.head(neck_feats, self.inputs)
+            return yolo_losses
+        else:
+            yolo_head_outs = self.head(neck_feats)
+            bbox, bbox_num = self.head.post_process(yolo_head_outs,
+                                                    self.inputs['im_shape'],
+                                                    self.inputs['scale_factor'])
+            output = {'bbox': bbox, 'bbox_num': bbox_num}
+            return output
+
+    def get_loss(self):
+        return self._forward()
+
+    def get_pred(self):
+        return self._forward()

+ 10 - 0
paddlers/models/ppdet/modeling/assigners/__init__.py

@@ -17,9 +17,19 @@ from . import task_aligned_assigner
 from . import atss_assigner
 from . import simota_assigner
 from . import max_iou_assigner
+from . import fcosr_assigner
+from . import rotated_task_aligned_assigner
+from . import task_aligned_assigner_cr
+from . import uniform_assigner
 
 from .utils import *
 from .task_aligned_assigner import *
 from .atss_assigner import *
 from .simota_assigner import *
 from .max_iou_assigner import *
+from .fcosr_assigner import *
+from .rotated_task_aligned_assigner import *
+from .task_aligned_assigner_cr import *
+from .uniform_assigner import *
+from .hungarian_assigner import *
+from .pose_utils import *

+ 16 - 6
paddlers/models/ppdet/modeling/assigners/atss_assigner.py

@@ -1,4 +1,4 @@
-# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -41,12 +41,14 @@ class ATSSAssigner(nn.Layer):
                  topk=9,
                  num_classes=80,
                  force_gt_matching=False,
-                 eps=1e-9):
+                 eps=1e-9,
+                 sm_use=False):
         super(ATSSAssigner, self).__init__()
         self.topk = topk
         self.num_classes = num_classes
         self.force_gt_matching = force_gt_matching
         self.eps = eps
+        self.sm_use = sm_use
 
     def _gather_topk_pyramid(self, gt2anchor_distances, num_anchors_list,
                              pad_gt_mask):
@@ -124,7 +126,8 @@ class ATSSAssigner(nn.Layer):
             assigned_bboxes = paddle.zeros([batch_size, num_anchors, 4])
             assigned_scores = paddle.zeros(
                 [batch_size, num_anchors, self.num_classes])
-            return assigned_labels, assigned_bboxes, assigned_scores
+            mask_positive = paddle.zeros([batch_size, 1, num_anchors])
+            return assigned_labels, assigned_bboxes, assigned_scores, mask_positive
 
         # 1. compute iou between gt and anchor bbox, [B, n, L]
         ious = iou_similarity(gt_bboxes.reshape([-1, 4]), anchor_bboxes)
@@ -154,7 +157,11 @@ class ATSSAssigner(nn.Layer):
                                   paddle.zeros_like(is_in_topk))
 
         # 6. check the positive sample's center in gt, [B, n, L]
-        is_in_gts = check_points_inside_bboxes(anchor_centers, gt_bboxes)
+        if self.sm_use:
+            is_in_gts = check_points_inside_bboxes(
+                anchor_centers, gt_bboxes, sm_use=True)
+        else:
+            is_in_gts = check_points_inside_bboxes(anchor_centers, gt_bboxes)
 
         # select positive sample, [B, n, L]
         mask_positive = is_in_topk * is_in_gts * pad_gt_mask
@@ -165,7 +172,10 @@ class ATSSAssigner(nn.Layer):
         if mask_positive_sum.max() > 1:
             mask_multiple_gts = (mask_positive_sum.unsqueeze(1) > 1).tile(
                 [1, num_max_boxes, 1])
-            is_max_iou = compute_max_iou_anchor(ious)
+            if self.sm_use:
+                is_max_iou = compute_max_iou_anchor(ious * mask_positive)
+            else:
+                is_max_iou = compute_max_iou_anchor(ious)
             mask_positive = paddle.where(mask_multiple_gts, is_max_iou,
                                          mask_positive)
             mask_positive_sum = mask_positive.sum(axis=-2)
@@ -212,4 +222,4 @@ class ATSSAssigner(nn.Layer):
                                          paddle.zeros_like(gather_scores))
             assigned_scores *= gather_scores.unsqueeze(-1)
 
-        return assigned_labels, assigned_bboxes, assigned_scores
+        return assigned_labels, assigned_bboxes, assigned_scores, mask_positive

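Besides the new mask_positive return value, the sm_use branch also changes the tie-breaking step when one anchor is claimed by several ground truths: the IoUs are masked by mask_positive before the max-IoU ground truth is selected. A toy sketch of that resolution step (assumed shapes B=1, n=2 gts, L=3 anchors; compute_max_iou_anchor is re-expressed here with one_hot):

import paddle
import paddle.nn.functional as F

ious = paddle.to_tensor([[[0.6, 0.1, 0.3],
                          [0.4, 0.7, 0.2]]])            # [B, n, L]
mask_positive = paddle.to_tensor([[[1., 0., 1.],
                                   [1., 1., 0.]]])      # anchor 0 claimed by both gts
num_max_boxes = mask_positive.shape[1]
mask_positive_sum = mask_positive.sum(axis=-2)                        # [B, L]
mask_multiple_gts = (mask_positive_sum.unsqueeze(1) > 1).tile([1, num_max_boxes, 1])
is_max_iou = F.one_hot(ious.argmax(axis=-2), num_max_boxes).transpose([0, 2, 1])
mask_positive = paddle.where(mask_multiple_gts, is_max_iou, mask_positive)
print(mask_positive.numpy())    # anchor 0 now belongs only to gt 0 (IoU 0.6 > 0.4)
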
+ 227 - 0
paddlers/models/ppdet/modeling/assigners/fcosr_assigner.py

@@ -0,0 +1,227 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+import paddle
+import paddle.nn as nn
+import paddle.nn.functional as F
+
+from paddlers.models.ppdet.core.workspace import register
+from paddlers.models.ppdet.modeling.rbox_utils import box2corners, check_points_in_polys, paddle_gather
+
+__all__ = ['FCOSRAssigner']
+
+EPS = 1e-9
+
+
+@register
+class FCOSRAssigner(nn.Layer):
+    """ FCOSR Assigner, refer to https://arxiv.org/abs/2111.10780 for details
+
+    1. compute the normalized Gaussian distribution score and the refined Gaussian distribution score
+    2. following ellipse center sampling, sample the points whose normalized Gaussian distribution score is greater than the threshold
+    3. following multi-level sampling, assign each ground truth to the feature map that satisfies two conditions:
+        i). the ratio between the short edge of the target and the stride of the feature map is less than 2.
+        ii). the long edge of the minimum bounding rectangle of the target is larger than the acceptance range of the feature map
+    4. following fuzzy sample label assignment, the points satisfying 2 and 3 are assigned to the ground truth according to the Gaussian distribution score
+    """
+    __shared__ = ['num_classes']
+
+    def __init__(self,
+                 num_classes=80,
+                 factor=12,
+                 threshold=0.23,
+                 boundary=[[-1, 128], [128, 320], [320, 10000]],
+                 score_type='iou'):
+        super(FCOSRAssigner, self).__init__()
+        self.num_classes = num_classes
+        self.factor = factor
+        self.threshold = threshold
+        self.boundary = [
+            paddle.to_tensor(
+                l, dtype=paddle.float32).reshape([1, 1, 2]) for l in boundary
+        ]
+        self.score_type = score_type
+
+    def get_gaussian_distribution_score(self, points, gt_rboxes, gt_polys):
+        # projecting points to coordinate system defined by each rbox
+        # [B, N, 4, 2] -> 4 * [B, N, 1, 2]
+        a, b, c, d = gt_polys.split(4, axis=2)
+        # [1, L, 2] -> [1, 1, L, 2]
+        points = points.unsqueeze(0)
+        ab = b - a
+        ad = d - a
+        # [B, N, 5] -> [B, N, 2], [B, N, 2], [B, N, 1]
+        xy, wh, angle = gt_rboxes.split([2, 2, 1], axis=-1)
+        # [B, N, 2] -> [B, N, 1, 2]
+        xy = xy.unsqueeze(2)
+        # vector of points to center [B, N, L, 2]
+        vec = points - xy
+        # <ab, vec> = |ab| * |vec| * cos(theta) [B, N, L]
+        vec_dot_ab = paddle.sum(vec * ab, axis=-1)
+        # <ad, vec> = |ad| * |vec| * cos(theta) [B, N, L]
+        vec_dot_ad = paddle.sum(vec * ad, axis=-1)
+        # norm_ab [B, N, L]
+        norm_ab = paddle.sum(ab * ab, axis=-1).sqrt()
+        # norm_ad [B, N, L]
+        norm_ad = paddle.sum(ad * ad, axis=-1).sqrt()
+        # min(h, w), [B, N, 1]
+        min_edge = paddle.min(wh, axis=-1, keepdim=True)
+        # delta_x, delta_y [B, N, L]
+        delta_x = vec_dot_ab.pow(2) / (norm_ab.pow(3) * min_edge + EPS)
+        delta_y = vec_dot_ad.pow(2) / (norm_ad.pow(3) * min_edge + EPS)
+        # score [B, N, L]
+        norm_score = paddle.exp(-0.5 * self.factor * (delta_x + delta_y))
+
+        # simplified calculation
+        sigma = min_edge / self.factor
+        refined_score = norm_score / (2 * np.pi * sigma + EPS)
+        return norm_score, refined_score
+
+    def get_rotated_inside_mask(self, points, gt_polys, scores):
+        inside_mask = check_points_in_polys(points, gt_polys)
+        center_mask = scores >= self.threshold
+        return (inside_mask & center_mask).cast(paddle.float32)
+
+    def get_inside_range_mask(self, points, gt_bboxes, gt_rboxes, stride_tensor,
+                              regress_range):
+        # [1, L, 2] -> [1, 1, L, 2]
+        points = points.unsqueeze(0)
+        # [B, n, 4] -> [B, n, 1, 4]
+        x1y1, x2y2 = gt_bboxes.unsqueeze(2).split(2, axis=-1)
+        # [B, n, L, 2]
+        lt = points - x1y1
+        rb = x2y2 - points
+        # [B, n, L, 4]
+        ltrb = paddle.concat([lt, rb], axis=-1)
+        # [B, n, L, 4] -> [B, n, L]
+        inside_mask = paddle.min(ltrb, axis=-1) > EPS
+        # regress_range [1, L, 2] -> [1, 1, L, 2]
+        regress_range = regress_range.unsqueeze(0)
+        # stride_tensor [1, L, 1] -> [1, 1, L]
+        stride_tensor = stride_tensor.transpose((0, 2, 1))
+        # fcos range
+        # [B, n, L, 4] -> [B, n, L]
+        ltrb_max = paddle.max(ltrb, axis=-1)
+        # [1, 1, L, 2] -> [1, 1, L]
+        low, high = regress_range[..., 0], regress_range[..., 1]
+        # [B, n, L]
+        regress_mask = (ltrb_max >= low) & (ltrb_max <= high)
+        # mask for rotated
+        # [B, n, 1]
+        min_edge = paddle.min(gt_rboxes[..., 2:4], axis=-1, keepdim=True)
+        # [B, n , L]
+        rotated_mask = ((min_edge / stride_tensor) < 2.0) & (ltrb_max > high)
+        mask = inside_mask & (regress_mask | rotated_mask)
+        return mask.cast(paddle.float32)
+
+    @paddle.no_grad()
+    def forward(self,
+                anchor_points,
+                stride_tensor,
+                num_anchors_list,
+                gt_labels,
+                gt_bboxes,
+                gt_rboxes,
+                pad_gt_mask,
+                bg_index,
+                pred_rboxes=None):
+        r"""
+
+        Args:
+            anchor_points (Tensor, float32): pre-defined anchor points, shape(1, L, 2),
+                    "x, y" format
+            stride_tensor (Tensor, float32): stride tensor, shape (1, L, 1)
+            num_anchors_list (List): num of anchors in each level
+            gt_labels (Tensor, int64|int32): Label of gt_bboxes, shape(B, n, 1)
+            gt_bboxes (Tensor, float32): Ground truth bboxes, shape(B, n, 4)
+            gt_rboxes (Tensor, float32): Ground truth rotated bboxes, shape(B, n, 5)
+            pad_gt_mask (Tensor, float32): 1 means bbox, 0 means no bbox, shape(B, n, 1)
+            bg_index (int): background index
+            pred_rboxes (Tensor, float32, optional): predicted bounding boxes, shape(B, L, 5)
+        Returns:
+            assigned_labels (Tensor): (B, L)
+            assigned_rboxes (Tensor): (B, L, 5)
+            assigned_scores (Tensor): (B, L, C), if pred_rboxes is not None, then output ious
+        """
+
+        _, num_anchors, _ = anchor_points.shape
+        batch_size, num_max_boxes, _ = gt_rboxes.shape
+        if num_max_boxes == 0:
+            assigned_labels = paddle.full(
+                [batch_size, num_anchors], bg_index, dtype=gt_labels.dtype)
+            assigned_rboxes = paddle.zeros([batch_size, num_anchors, 5])
+            assigned_scores = paddle.zeros(
+                [batch_size, num_anchors, self.num_classes])
+            return assigned_labels, assigned_rboxes, assigned_scores
+
+        # get normalized gaussian distribution score and refined distribution score
+        gt_polys = box2corners(gt_rboxes)
+        score, refined_score = self.get_gaussian_distribution_score(
+            anchor_points, gt_rboxes, gt_polys)
+        inside_mask = self.get_rotated_inside_mask(anchor_points, gt_polys,
+                                                   score)
+        regress_ranges = []
+        for num, bound in zip(num_anchors_list, self.boundary):
+            regress_ranges.append(bound.tile((1, num, 1)))
+        regress_ranges = paddle.concat(regress_ranges, axis=1)
+        regress_mask = self.get_inside_range_mask(
+            anchor_points, gt_bboxes, gt_rboxes, stride_tensor, regress_ranges)
+        # [B, n, L]
+        mask_positive = inside_mask * regress_mask * pad_gt_mask
+        refined_score = refined_score * mask_positive - (1. - mask_positive)
+
+        argmax_refined_score = refined_score.argmax(axis=-2)
+        max_refined_score = refined_score.max(axis=-2)
+        assigned_gt_index = argmax_refined_score
+
+        # assigned target
+        batch_ind = paddle.arange(
+            end=batch_size, dtype=gt_labels.dtype).unsqueeze(-1)
+        assigned_gt_index = assigned_gt_index + batch_ind * num_max_boxes
+        assigned_labels = paddle.gather(
+            gt_labels.flatten(), assigned_gt_index.flatten(), axis=0)
+        assigned_labels = assigned_labels.reshape([batch_size, num_anchors])
+        assigned_labels = paddle.where(
+            max_refined_score > 0, assigned_labels,
+            paddle.full_like(assigned_labels, bg_index))
+
+        assigned_rboxes = paddle.gather(
+            gt_rboxes.reshape([-1, 5]), assigned_gt_index.flatten(), axis=0)
+        assigned_rboxes = assigned_rboxes.reshape([batch_size, num_anchors, 5])
+
+        assigned_scores = F.one_hot(assigned_labels, self.num_classes + 1)
+        ind = list(range(self.num_classes + 1))
+        ind.remove(bg_index)
+        assigned_scores = paddle.index_select(
+            assigned_scores, paddle.to_tensor(ind), axis=-1)
+
+        if self.score_type == 'gaussian':
+            selected_scores = paddle_gather(
+                score, 1, argmax_refined_score.unsqueeze(-2)).squeeze(-2)
+            assigned_scores = assigned_scores * selected_scores.unsqueeze(-1)
+        elif self.score_type == 'iou':
+            assert pred_rboxes is not None, 'If score type is iou, pred_rboxes should not be None'
+            from ext_op import matched_rbox_iou
+            b, l = pred_rboxes.shape[:2]
+            iou_score = matched_rbox_iou(
+                pred_rboxes.reshape((-1, 5)), assigned_rboxes.reshape(
+                    (-1, 5))).reshape((b, l, 1))
+            assigned_scores = assigned_scores * iou_score
+
+        return assigned_labels, assigned_rboxes, assigned_scores 

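The normalized score used above reduces, for each sampled point, to exp(-0.5 * factor * (delta_x + delta_y)), where delta_x and delta_y are the squared projections of the point-to-center vector onto the two box axes, scaled by the edge lengths and the short edge. A NumPy sketch for a single axis-aligned box (center at the origin, w=4, h=2; factor=12 matches the default above, all other values are made up):

import numpy as np

factor, eps = 12.0, 1e-9
a, b, d = np.array([-2., -1.]), np.array([2., -1.]), np.array([-2., 1.])   # three box corners
ab, ad = b - a, d - a
min_edge = 2.0                                    # min(w, h)

def norm_score(point):
    vec = point                                    # the box center is the origin here
    dx = np.dot(vec, ab) ** 2 / (np.linalg.norm(ab) ** 3 * min_edge + eps)
    dy = np.dot(vec, ad) ** 2 / (np.linalg.norm(ad) ** 3 * min_edge + eps)
    return np.exp(-0.5 * factor * (dx + dy))

print(norm_score(np.array([0., 0.])))              # 1.0 at the center
print(norm_score(np.array([1., 0.5])))             # ~0.32, decays away from the center
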
+ 316 - 0
paddlers/models/ppdet/modeling/assigners/hungarian_assigner.py

@@ -0,0 +1,316 @@
+# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+try:
+    from scipy.optimize import linear_sum_assignment
+except ImportError:
+    linear_sum_assignment = None
+
+import paddle
+
+from paddlers.models.ppdet.core.workspace import register
+
+__all__ = ['PoseHungarianAssigner', 'PseudoSampler']
+
+
+class AssignResult:
+    """Stores assignments between predicted and truth boxes.
+
+    Attributes:
+        num_gts (int): the number of truth boxes considered when computing this
+            assignment
+
+        gt_inds (LongTensor): for each predicted box indicates the 1-based
+            index of the assigned truth box. 0 means unassigned and -1 means
+            ignore.
+
+        max_overlaps (FloatTensor): the iou between the predicted box and its
+            assigned truth box.
+
+        labels (None | LongTensor): If specified, for each predicted box
+            indicates the category label of the assigned truth box.
+    """
+
+    def __init__(self, num_gts, gt_inds, max_overlaps, labels=None):
+        self.num_gts = num_gts
+        self.gt_inds = gt_inds
+        self.max_overlaps = max_overlaps
+        self.labels = labels
+        # Interface for possible user-defined properties
+        self._extra_properties = {}
+
+    @property
+    def num_preds(self):
+        """int: the number of predictions in this assignment"""
+        return len(self.gt_inds)
+
+    def set_extra_property(self, key, value):
+        """Set user-defined new property."""
+        assert key not in self.info
+        self._extra_properties[key] = value
+
+    def get_extra_property(self, key):
+        """Get user-defined property."""
+        return self._extra_properties.get(key, None)
+
+    @property
+    def info(self):
+        """dict: a dictionary of info about the object"""
+        basic_info = {
+            'num_gts': self.num_gts,
+            'num_preds': self.num_preds,
+            'gt_inds': self.gt_inds,
+            'max_overlaps': self.max_overlaps,
+            'labels': self.labels,
+        }
+        basic_info.update(self._extra_properties)
+        return basic_info
+
+
+@register
+class PoseHungarianAssigner:
+    """Computes one-to-one matching between predictions and ground truth.
+
+    This class computes an assignment between the targets and the predictions
+    based on the costs. The costs are weighted sum of three components:
+    classification cost, regression L1 cost and regression oks cost. The
+    targets don't include the no_object, so generally there are more
+    predictions than targets. After the one-to-one matching, the un-matched
+    are treated as backgrounds. Thus each query prediction will be assigned
+    with `0` or a positive integer indicating the ground truth index:
+
+    - 0: negative sample, no assigned gt.
+    - positive integer: positive sample, index (1-based) of assigned gt.
+
+    Args:
+        cls_weight (int | float, optional): The scale factor for classification
+            cost. Default 1.0.
+        kpt_weight (int | float, optional): The scale factor for regression
+            L1 cost. Default 1.0.
+        oks_weight (int | float, optional): The scale factor for regression
+            oks cost. Default 1.0.
+    """
+    __inject__ = ['cls_cost', 'kpt_cost', 'oks_cost']
+
+    def __init__(self,
+                 cls_cost='ClassificationCost',
+                 kpt_cost='KptL1Cost',
+                 oks_cost='OksCost'):
+        self.cls_cost = cls_cost
+        self.kpt_cost = kpt_cost
+        self.oks_cost = oks_cost
+
+    def assign(self,
+               cls_pred,
+               kpt_pred,
+               gt_labels,
+               gt_keypoints,
+               gt_areas,
+               img_meta,
+               eps=1e-7):
+        """Computes one-to-one matching based on the weighted costs.
+
+        This method assign each query prediction to a ground truth or
+        background. The `assigned_gt_inds` with -1 means don't care,
+        0 means negative sample, and positive number is the index (1-based)
+        of assigned gt.
+        The assignment is done in the following steps, the order matters.
+
+        1. assign every prediction to -1
+        2. compute the weighted costs
+        3. do Hungarian matching on CPU based on the costs
+        4. assign all to 0 (background) first, then for each matched pair
+           between predictions and gts, treat this prediction as foreground
+           and assign the corresponding gt index (plus 1) to it.
+
+        Args:
+            cls_pred (Tensor): Predicted classification logits, shape
+                [num_query, num_class].
+            kpt_pred (Tensor): Predicted keypoints with normalized coordinates
+                (x_{i}, y_{i}), which are all in range [0, 1]. Shape
+                [num_query, K*2].
+            gt_labels (Tensor): Label of `gt_keypoints`, shape (num_gt,).
+            gt_keypoints (Tensor): Ground truth keypoints with unnormalized
+                coordinates [p^{1}_x, p^{1}_y, p^{1}_v, ..., \
+                    p^{K}_x, p^{K}_y, p^{K}_v]. Shape [num_gt, K*3].
+            gt_areas (Tensor): Ground truth mask areas, shape (num_gt,).
+            img_meta (dict): Meta information for current image.
+            eps (int | float, optional): A value added to the denominator for
+                numerical stability. Default 1e-7.
+
+        Returns:
+            :obj:`AssignResult`: The assigned result.
+        """
+        num_gts, num_kpts = gt_keypoints.shape[0], kpt_pred.shape[0]
+        if not gt_keypoints.astype('bool').any():
+            num_gts = 0
+
+        # 1. assign -1 by default
+        assigned_gt_inds = paddle.full((num_kpts, ), -1, dtype="int64")
+        assigned_labels = paddle.full((num_kpts, ), -1, dtype="int64")
+        if num_gts == 0 or num_kpts == 0:
+            # No ground truth or keypoints, return empty assignment
+            if num_gts == 0:
+                # No ground truth, assign all to background
+                assigned_gt_inds[:] = 0
+            return AssignResult(
+                num_gts, assigned_gt_inds, None, labels=assigned_labels)
+        img_h, img_w, _ = img_meta['img_shape']
+        factor = paddle.to_tensor(
+            [img_w, img_h, img_w, img_h], dtype=gt_keypoints.dtype).reshape(
+                (1, -1))
+
+        # 2. compute the weighted costs
+        # classification cost
+        cls_cost = self.cls_cost(cls_pred, gt_labels)
+
+        # keypoint regression L1 cost
+        gt_keypoints_reshape = gt_keypoints.reshape((gt_keypoints.shape[0], -1,
+                                                     3))
+        valid_kpt_flag = gt_keypoints_reshape[..., -1]
+        kpt_pred_tmp = kpt_pred.clone().detach().reshape((kpt_pred.shape[0], -1,
+                                                          2))
+        normalize_gt_keypoints = gt_keypoints_reshape[
+            ..., :2] / factor[:, :2].unsqueeze(0)
+        kpt_cost = self.kpt_cost(kpt_pred_tmp, normalize_gt_keypoints,
+                                 valid_kpt_flag)
+        # keypoint OKS cost
+        kpt_pred_tmp = kpt_pred.clone().detach().reshape((kpt_pred.shape[0], -1,
+                                                          2))
+        kpt_pred_tmp = kpt_pred_tmp * factor[:, :2].unsqueeze(0)
+        oks_cost = self.oks_cost(kpt_pred_tmp, gt_keypoints_reshape[..., :2],
+                                 valid_kpt_flag, gt_areas)
+        # weighted sum of above three costs
+        cost = cls_cost + kpt_cost + oks_cost
+
+        # 3. do Hungarian matching on CPU using linear_sum_assignment
+        cost = cost.detach().cpu()
+        if linear_sum_assignment is None:
+            raise ImportError('Please run "pip install scipy" '
+                              'to install scipy first.')
+        matched_row_inds, matched_col_inds = linear_sum_assignment(cost)
+        matched_row_inds = paddle.to_tensor(matched_row_inds)
+        matched_col_inds = paddle.to_tensor(matched_col_inds)
+
+        # 4. assign backgrounds and foregrounds
+        # assign all indices to backgrounds first
+        assigned_gt_inds[:] = 0
+        # assign foregrounds based on matching results
+        assigned_gt_inds[matched_row_inds] = matched_col_inds + 1
+        assigned_labels[matched_row_inds] = gt_labels[matched_col_inds][
+            ..., 0].astype("int64")
+        return AssignResult(
+            num_gts, assigned_gt_inds, None, labels=assigned_labels)
+
+
+class SamplingResult:
+    """Bbox sampling result.
+    """
+
+    def __init__(self, pos_inds, neg_inds, bboxes, gt_bboxes, assign_result,
+                 gt_flags):
+        self.pos_inds = pos_inds
+        self.neg_inds = neg_inds
+        if pos_inds.size > 0:
+            self.pos_bboxes = bboxes[pos_inds]
+            self.neg_bboxes = bboxes[neg_inds]
+            self.pos_is_gt = gt_flags[pos_inds]
+
+            self.num_gts = gt_bboxes.shape[0]
+            self.pos_assigned_gt_inds = assign_result.gt_inds[pos_inds] - 1
+
+            if gt_bboxes.numel() == 0:
+                # hack for index error case
+                assert self.pos_assigned_gt_inds.numel() == 0
+                self.pos_gt_bboxes = paddle.zeros(
+                    gt_bboxes.shape, dtype=gt_bboxes.dtype).reshape((-1, 4))
+            else:
+                if len(gt_bboxes.shape) < 2:
+                    gt_bboxes = gt_bboxes.reshape((-1, 4))
+
+                self.pos_gt_bboxes = paddle.index_select(
+                    gt_bboxes,
+                    self.pos_assigned_gt_inds.astype('int64'),
+                    axis=0)
+
+            if assign_result.labels is not None:
+                self.pos_gt_labels = assign_result.labels[pos_inds]
+            else:
+                self.pos_gt_labels = None
+
+    @property
+    def bboxes(self):
+        """paddle.Tensor: concatenated positive and negative boxes"""
+        return paddle.concat([self.pos_bboxes, self.neg_bboxes])
+
+    def __nice__(self):
+        data = self.info.copy()
+        data['pos_bboxes'] = data.pop('pos_bboxes').shape
+        data['neg_bboxes'] = data.pop('neg_bboxes').shape
+        parts = [f"'{k}': {v!r}" for k, v in sorted(data.items())]
+        body = '    ' + ',\n    '.join(parts)
+        return '{\n' + body + '\n}'
+
+    @property
+    def info(self):
+        """Returns a dictionary of info about the object."""
+        return {
+            'pos_inds': self.pos_inds,
+            'neg_inds': self.neg_inds,
+            'pos_bboxes': self.pos_bboxes,
+            'neg_bboxes': self.neg_bboxes,
+            'pos_is_gt': self.pos_is_gt,
+            'num_gts': self.num_gts,
+            'pos_assigned_gt_inds': self.pos_assigned_gt_inds,
+        }
+
+
+@register
+class PseudoSampler:
+    """A pseudo sampler that does not do sampling actually."""
+
+    def __init__(self, **kwargs):
+        pass
+
+    def _sample_pos(self, **kwargs):
+        """Sample positive samples."""
+        raise NotImplementedError
+
+    def _sample_neg(self, **kwargs):
+        """Sample negative samples."""
+        raise NotImplementedError
+
+    def sample(self, assign_result, bboxes, gt_bboxes, *args, **kwargs):
+        """Directly returns the positive and negative indices of samples.
+
+        Args:
+            assign_result (:obj:`AssignResult`): Assigned results
+            bboxes (paddle.Tensor): Bounding boxes
+            gt_bboxes (paddle.Tensor): Ground truth boxes
+
+        Returns:
+            :obj:`SamplingResult`: sampler results
+        """
+        pos_inds = paddle.nonzero(
+            assign_result.gt_inds > 0, as_tuple=False).squeeze(-1)
+        neg_inds = paddle.nonzero(
+            assign_result.gt_inds == 0, as_tuple=False).squeeze(-1)
+        gt_flags = paddle.zeros([bboxes.shape[0]], dtype='int32')
+        sampling_result = SamplingResult(pos_inds, neg_inds, bboxes, gt_bboxes,
+                                         assign_result, gt_flags)
+        return sampling_result

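PoseHungarianAssigner boils down to: build a (num_query x num_gt) cost matrix from the classification, keypoint-L1 and OKS costs, solve it with SciPy's linear_sum_assignment, then give matched queries the 1-based ground-truth index and leave everything else as background (0). A toy sketch of that final matching step with a made-up cost matrix:

import numpy as np
from scipy.optimize import linear_sum_assignment

cost = np.array([[0.9, 0.2],      # 4 query predictions vs 2 ground truths, lower is better
                 [0.1, 0.8],
                 [0.5, 0.5],
                 [0.7, 0.3]])
rows, cols = linear_sum_assignment(cost)                      # optimal one-to-one matching
assigned_gt_inds = np.zeros(cost.shape[0], dtype=np.int64)    # 0 = background
assigned_gt_inds[rows] = cols + 1                             # matched queries get 1-based gt index
print(assigned_gt_inds)                                       # [2 1 0 0]
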
+ 275 - 0
paddlers/models/ppdet/modeling/assigners/pose_utils.py

@@ -0,0 +1,275 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+import paddle
+import paddle.nn.functional as F
+
+from paddlers.models.ppdet.core.workspace import register
+
+__all__ = ['KptL1Cost', 'OksCost', 'ClassificationCost']
+
+
+def masked_fill(x, mask, value):
+    y = paddle.full(x.shape, value, x.dtype)
+    return paddle.where(mask, y, x)
+
+
+@register
+class KptL1Cost(object):
+    """KptL1Cost.
+
+    this function based on: https://github.com/hikvision-research/opera/blob/main/opera/core/bbox/match_costs/match_cost.py
+
+    Args:
+        weight (int | float, optional): loss_weight.
+    """
+
+    def __init__(self, weight=1.0):
+        self.weight = weight
+
+    def __call__(self, kpt_pred, gt_keypoints, valid_kpt_flag):
+        """
+        Args:
+            kpt_pred (Tensor): Predicted keypoints with normalized coordinates
+                (x_{i}, y_{i}), which are all in range [0, 1]. Shape
+                [num_query, K, 2].
+            gt_keypoints (Tensor): Ground truth keypoints with normalized
+                coordinates (x_{i}, y_{i}). Shape [num_gt, K, 2].
+            valid_kpt_flag (Tensor): valid flag of ground truth keypoints.
+                Shape [num_gt, K].
+
+        Returns:
+            paddle.Tensor: kpt_cost value with weight.
+        """
+        kpt_cost = []
+        for i in range(len(gt_keypoints)):
+            if gt_keypoints[i].size == 0:
+                kpt_cost.append(kpt_pred.sum() * 0)
+            kpt_pred_tmp = kpt_pred.clone()
+            valid_flag = valid_kpt_flag[i] > 0
+            valid_flag_expand = valid_flag.unsqueeze(0).unsqueeze(-1).expand_as(
+                kpt_pred_tmp)
+            if not valid_flag_expand.all():
+                kpt_pred_tmp = masked_fill(kpt_pred_tmp, ~valid_flag_expand, 0)
+            cost = F.pairwise_distance(
+                kpt_pred_tmp.reshape((kpt_pred_tmp.shape[0], -1)),
+                gt_keypoints[i].reshape((-1, )).unsqueeze(0),
+                p=1,
+                keepdim=True)
+            avg_factor = paddle.clip(
+                valid_flag.astype('float32').sum() * 2, 1.0)
+            cost = cost / avg_factor
+            kpt_cost.append(cost)
+        kpt_cost = paddle.concat(kpt_cost, axis=1)
+        return kpt_cost * self.weight
+
+
+@register
+class OksCost(object):
+    """OksCost.
+
+    This function is based on: https://github.com/hikvision-research/opera/blob/main/opera/core/bbox/match_costs/match_cost.py
+
+    Args:
+        num_keypoints (int): number of keypoints
+        weight (int | float, optional): loss_weight.
+    """
+
+    def __init__(self, num_keypoints=17, weight=1.0):
+        self.weight = weight
+        if num_keypoints == 17:
+            self.sigmas = np.array(
+                [
+                    .26, .25, .25, .35, .35, .79, .79, .72, .72, .62, .62, 1.07,
+                    1.07, .87, .87, .89, .89
+                ],
+                dtype=np.float32) / 10.0
+        elif num_keypoints == 14:
+            self.sigmas = np.array(
+                [
+                    .79, .79, .72, .72, .62, .62, 1.07, 1.07, .87, .87, .89,
+                    .89, .79, .79
+                ],
+                dtype=np.float32) / 10.0
+        else:
+            raise ValueError(f'Unsupported keypoints number {num_keypoints}')
+
+    def __call__(self, kpt_pred, gt_keypoints, valid_kpt_flag, gt_areas):
+        """
+        Args:
+            kpt_pred (Tensor): Predicted keypoints with unnormalized
+                coordinates (x_{i}, y_{i}). Shape [num_query, K, 2].
+            gt_keypoints (Tensor): Ground truth keypoints with unnormalized
+                coordinates (x_{i}, y_{i}). Shape [num_gt, K, 2].
+            valid_kpt_flag (Tensor): valid flag of ground truth keypoints.
+                Shape [num_gt, K].
+            gt_areas (Tensor): Ground truth mask areas. Shape [num_gt,].
+
+        Returns:
+            paddle.Tensor: oks_cost value with weight.
+        """
+        sigmas = paddle.to_tensor(self.sigmas)
+        variances = (sigmas * 2)**2
+
+        oks_cost = []
+        assert len(gt_keypoints) == len(gt_areas)
+        for i in range(len(gt_keypoints)):
+            if gt_keypoints[i].size == 0:
+                oks_cost.append(kpt_pred.sum() * 0)
+            squared_distance = \
+                (kpt_pred[:, :, 0] - gt_keypoints[i, :, 0].unsqueeze(0)) ** 2 + \
+                (kpt_pred[:, :, 1] - gt_keypoints[i, :, 1].unsqueeze(0)) ** 2
+            vis_flag = (valid_kpt_flag[i] > 0).astype('int')
+            vis_ind = vis_flag.nonzero(as_tuple=False)[:, 0]
+            num_vis_kpt = vis_ind.shape[0]
+            # assert num_vis_kpt > 0
+            if num_vis_kpt == 0:
+                oks_cost.append(paddle.zeros((squared_distance.shape[0], 1)))
+                continue
+            area = gt_areas[i]
+
+            squared_distance0 = squared_distance / (area * variances * 2)
+            squared_distance0 = paddle.index_select(
+                squared_distance0, vis_ind, axis=1)
+            squared_distance1 = paddle.exp(-squared_distance0).sum(axis=1,
+                                                                   keepdim=True)
+            oks = squared_distance1 / num_vis_kpt
+            # The 1 is a constant that doesn't change the matching, so omitted.
+            oks_cost.append(-oks)
+        oks_cost = paddle.concat(oks_cost, axis=1)
+        return oks_cost * self.weight
+
+
+@register
+class ClassificationCost:
+    """ClassificationCost (softmax classification cost).
+
+     Args:
+         weight (int | float, optional): loss_weight
+    """
+
+    def __init__(self, weight=1.):
+        self.weight = weight
+
+    def __call__(self, cls_pred, gt_labels):
+        """
+        Args:
+            cls_pred (Tensor): Predicted classification logits, shape
+                (num_query, num_class).
+            gt_labels (Tensor): Label of `gt_bboxes`, shape (num_gt,).
+
+        Returns:
+            paddle.Tensor: cls_cost value with weight
+        """
+        # Following the official DETR repo, and in contrast to the loss,
+        # where NLL is used, we approximate the cost by 1 - cls_score[gt_label].
+        # The 1 is a constant that doesn't change the matching,
+        # so it can be omitted.
+        cls_score = cls_pred.softmax(-1)
+        cls_cost = -cls_score[:, gt_labels]
+        return cls_cost * self.weight
+
+
+@register
+class FocalLossCost:
+    """FocalLossCost.
+
+     Args:
+         weight (int | float, optional): loss_weight
+         alpha (int | float, optional): focal_loss alpha
+         gamma (int | float, optional): focal_loss gamma
+         eps (float, optional): default 1e-12
+         binary_input (bool, optional): Whether the input is binary,
+            default False.
+    """
+
+    def __init__(self,
+                 weight=1.,
+                 alpha=0.25,
+                 gamma=2,
+                 eps=1e-12,
+                 binary_input=False):
+        self.weight = weight
+        self.alpha = alpha
+        self.gamma = gamma
+        self.eps = eps
+        self.binary_input = binary_input
+
+    def _focal_loss_cost(self, cls_pred, gt_labels):
+        """
+        Args:
+            cls_pred (Tensor): Predicted classification logits, shape
+                (num_query, num_class).
+            gt_labels (Tensor): Label of `gt_bboxes`, shape (num_gt,).
+
+        Returns:
+            paddle.Tensor: cls_cost value with weight
+        """
+        if gt_labels.size == 0:
+            return cls_pred.sum() * 0
+        cls_pred = F.sigmoid(cls_pred)
+        neg_cost = -(1 - cls_pred + self.eps).log() * (
+            1 - self.alpha) * cls_pred.pow(self.gamma)
+        pos_cost = -(cls_pred + self.eps).log() * self.alpha * (
+            1 - cls_pred).pow(self.gamma)
+
+        cls_cost = paddle.index_select(
+            pos_cost, gt_labels, axis=1) - paddle.index_select(
+                neg_cost, gt_labels, axis=1)
+        return cls_cost * self.weight
+
+    def _mask_focal_loss_cost(self, cls_pred, gt_labels):
+        """
+        Args:
+            cls_pred (Tensor): Predicted classification logits
+                in shape (num_query, d1, ..., dn), dtype=paddle.float32.
+            gt_labels (Tensor): Ground truth in shape (num_gt, d1, ..., dn),
+                dtype=paddle.int64. Labels should be binary.
+
+        Returns:
+            Tensor: Focal cost matrix with weight in shape\
+                (num_query, num_gt).
+        """
+        cls_pred = cls_pred.flatten(1)
+        gt_labels = gt_labels.flatten(1).float()
+        n = cls_pred.shape[1]
+        cls_pred = F.sigmoid(cls_pred)
+        neg_cost = -(1 - cls_pred + self.eps).log() * (
+            1 - self.alpha) * cls_pred.pow(self.gamma)
+        pos_cost = -(cls_pred + self.eps).log() * self.alpha * (
+            1 - cls_pred).pow(self.gamma)
+
+        cls_cost = paddle.einsum('nc,mc->nm', pos_cost, gt_labels) + \
+            paddle.einsum('nc,mc->nm', neg_cost, (1 - gt_labels))
+        return cls_cost / n * self.weight
+
+    def __call__(self, cls_pred, gt_labels):
+        """
+        Args:
+            cls_pred (Tensor): Predicted classification logits.
+            gt_labels (Tensor): Labels.
+
+        Returns:
+            Tensor: Focal cost matrix with weight in shape\
+                (num_query, num_gt).
+        """
+        if self.binary_input:
+            return self._mask_focal_loss_cost(cls_pred, gt_labels)
+        else:
+            return self._focal_loss_cost(cls_pred, gt_labels)

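A hedged usage sketch of the matching costs defined in pose_utils.py above, assuming the module path added in this PR; the shapes follow the docstrings, and the weights, sizes, and tensor values are illustrative only.

import paddle
from paddlers.models.ppdet.modeling.assigners.pose_utils import (
    KptL1Cost, OksCost, ClassificationCost)

num_query, num_gt, K = 4, 2, 17
kpt_pred = paddle.rand([num_query, K, 2])       # keypoint predictions
gt_keypoints = paddle.rand([num_gt, K, 2])      # ground-truth keypoints
valid_flag = paddle.ones([num_gt, K])           # all keypoints labeled visible
cls_pred = paddle.rand([num_query, 2])          # person / background logits
gt_labels = paddle.zeros([num_gt], dtype='int64')
gt_areas = paddle.full([num_gt], 0.5)

kpt_cost = KptL1Cost(weight=70.0)(kpt_pred, gt_keypoints, valid_flag)  # [num_query, num_gt]
oks_cost = OksCost(num_keypoints=K, weight=7.0)(kpt_pred, gt_keypoints, valid_flag, gt_areas)
cls_cost = ClassificationCost(weight=1.0)(cls_pred, gt_labels)
total_cost = kpt_cost + oks_cost + cls_cost     # fed to a Hungarian matcher downstream
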
+ 164 - 0
paddlers/models/ppdet/modeling/assigners/rotated_task_aligned_assigner.py

@@ -0,0 +1,164 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import paddle
+import paddle.nn as nn
+import paddle.nn.functional as F
+
+from paddlers.models.ppdet.core.workspace import register
+from ..rbox_utils import rotated_iou_similarity, check_points_in_rotated_boxes
+from .utils import gather_topk_anchors, compute_max_iou_anchor
+
+__all__ = ['RotatedTaskAlignedAssigner']
+
+
+@register
+class RotatedTaskAlignedAssigner(nn.Layer):
+    """TOOD: Task-aligned One-stage Object Detection
+    """Rotated-box version of the task-aligned assigner from TOOD: Task-aligned One-stage Object Detection
+
+    def __init__(self, topk=13, alpha=1.0, beta=6.0, eps=1e-9):
+        super(RotatedTaskAlignedAssigner, self).__init__()
+        self.topk = topk
+        self.alpha = alpha
+        self.beta = beta
+        self.eps = eps
+
+    @paddle.no_grad()
+    def forward(self,
+                pred_scores,
+                pred_bboxes,
+                anchor_points,
+                num_anchors_list,
+                gt_labels,
+                gt_bboxes,
+                pad_gt_mask,
+                bg_index,
+                gt_scores=None):
+        r"""This code is based on
+            https://github.com/fcjian/TOOD/blob/master/mmdet/core/bbox/assigners/task_aligned_assigner.py
+
+        The assignment is done in the following steps:
+        1. compute the alignment metric between all pred bboxes (across all pyramid levels) and gts
+        2. select the top-k bboxes as candidates for each gt
+        3. restrict the positive samples' centers to lie inside the gt (an anchor-free
+           detector can only predict positive distances)
+        4. if an anchor box is assigned to multiple gts, keep the one with the
+           highest IoU
+        Args:
+            pred_scores (Tensor, float32): predicted class probability, shape(B, L, C)
+            pred_bboxes (Tensor, float32): predicted bounding boxes, shape(B, L, 5)
+            anchor_points (Tensor, float32): pre-defined anchors, shape(1, L, 2), "cxcy" format
+            num_anchors_list (List): num of anchors in each level, shape(L)
+            gt_labels (Tensor, int64|int32): Label of gt_bboxes, shape(B, n, 1)
+            gt_bboxes (Tensor, float32): Ground truth bboxes, shape(B, n, 5)
+            pad_gt_mask (Tensor, float32): 1 means bbox, 0 means no bbox, shape(B, n, 1)
+            bg_index (int): background index
+            gt_scores (Tensor|None, float32): Score of gt_bboxes, shape(B, n, 1)
+        Returns:
+            assigned_labels (Tensor): (B, L)
+            assigned_bboxes (Tensor): (B, L, 5)
+            assigned_scores (Tensor): (B, L, C)
+        """
+        assert pred_scores.ndim == pred_bboxes.ndim
+        assert gt_labels.ndim == gt_bboxes.ndim and \
+               gt_bboxes.ndim == 3
+
+        batch_size, num_anchors, num_classes = pred_scores.shape
+        _, num_max_boxes, _ = gt_bboxes.shape
+
+        # negative batch
+        if num_max_boxes == 0:
+            assigned_labels = paddle.full(
+                [batch_size, num_anchors], bg_index, dtype=gt_labels.dtype)
+            assigned_bboxes = paddle.zeros([batch_size, num_anchors, 5])
+            assigned_scores = paddle.zeros(
+                [batch_size, num_anchors, num_classes])
+            return assigned_labels, assigned_bboxes, assigned_scores
+
+        # compute iou between gt and pred bbox, [B, n, L]
+        ious = rotated_iou_similarity(gt_bboxes, pred_bboxes)
+        ious = paddle.where(ious > 1 + self.eps, paddle.zeros_like(ious), ious)
+        ious.stop_gradient = True
+        # gather pred bboxes class score
+        pred_scores = pred_scores.transpose([0, 2, 1])
+        batch_ind = paddle.arange(
+            end=batch_size, dtype=gt_labels.dtype).unsqueeze(-1)
+        gt_labels_ind = paddle.stack(
+            [batch_ind.tile([1, num_max_boxes]), gt_labels.squeeze(-1)],
+            axis=-1)
+        bbox_cls_scores = paddle.gather_nd(pred_scores, gt_labels_ind)
+        # compute alignment metrics, [B, n, L]
+        alignment_metrics = bbox_cls_scores.pow(self.alpha) * ious.pow(
+            self.beta)
+
+        # check the positive sample's center in gt, [B, n, L]
+        is_in_gts = check_points_in_rotated_boxes(anchor_points, gt_bboxes)
+
+        # select topk largest alignment metrics pred bbox as candidates
+        # for each gt, [B, n, L]
+        is_in_topk = gather_topk_anchors(
+            alignment_metrics * is_in_gts, self.topk, topk_mask=pad_gt_mask)
+
+        # select positive sample, [B, n, L]
+        mask_positive = is_in_topk * is_in_gts * pad_gt_mask
+
+        # if an anchor box is assigned to multiple gts,
+        # the one with the highest iou will be selected, [B, n, L]
+        mask_positive_sum = mask_positive.sum(axis=-2)
+        if mask_positive_sum.max() > 1:
+            mask_multiple_gts = (mask_positive_sum.unsqueeze(1) > 1).tile(
+                [1, num_max_boxes, 1])
+            is_max_iou = compute_max_iou_anchor(ious)
+            mask_positive = paddle.where(mask_multiple_gts, is_max_iou,
+                                         mask_positive)
+            mask_positive_sum = mask_positive.sum(axis=-2)
+        assigned_gt_index = mask_positive.argmax(axis=-2)
+
+        # assigned target
+        assigned_gt_index = assigned_gt_index + batch_ind * num_max_boxes
+        assigned_labels = paddle.gather(
+            gt_labels.flatten(), assigned_gt_index.flatten(), axis=0)
+        assigned_labels = assigned_labels.reshape([batch_size, num_anchors])
+        assigned_labels = paddle.where(
+            mask_positive_sum > 0, assigned_labels,
+            paddle.full_like(assigned_labels, bg_index))
+
+        assigned_bboxes = paddle.gather(
+            gt_bboxes.reshape([-1, 5]), assigned_gt_index.flatten(), axis=0)
+        assigned_bboxes = assigned_bboxes.reshape([batch_size, num_anchors, 5])
+
+        assigned_scores = F.one_hot(assigned_labels, num_classes + 1)
+        ind = list(range(num_classes + 1))
+        ind.remove(bg_index)
+        assigned_scores = paddle.index_select(
+            assigned_scores, paddle.to_tensor(ind), axis=-1)
+        # rescale alignment metrics
+        alignment_metrics *= mask_positive
+        max_metrics_per_instance = alignment_metrics.max(axis=-1, keepdim=True)
+        max_ious_per_instance = (ious * mask_positive).max(axis=-1,
+                                                           keepdim=True)
+        alignment_metrics = alignment_metrics / (
+            max_metrics_per_instance + self.eps) * max_ious_per_instance
+        alignment_metrics = alignment_metrics.max(-2).unsqueeze(-1)
+        assigned_scores = assigned_scores * alignment_metrics
+
+        assigned_bboxes.stop_gradient = True
+        assigned_scores.stop_gradient = True
+        assigned_labels.stop_gradient = True
+        return assigned_labels, assigned_bboxes, assigned_scores

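The core of the assigner above is the task-alignment metric t = s**alpha * u**beta, where s is the predicted class score and u the rotated IoU (see the alignment_metrics computation in forward()). A toy sketch with illustrative numbers:

import paddle

alpha, beta = 1.0, 6.0
s = paddle.to_tensor([[0.9, 0.2, 0.6]])  # class scores of three candidates for one gt
u = paddle.to_tensor([[0.8, 0.7, 0.3]])  # rotated IoUs of the same candidates with that gt
alignment_metric = s.pow(alpha) * u.pow(beta)
# candidates with both a high score and a high IoU dominate the top-k selection
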
+ 1 - 1
paddlers/models/ppdet/modeling/assigners/simota_assigner.py

@@ -236,7 +236,7 @@ class SimOTAAssigner(object):
         )] = match_fg_mask_inmatrix
 
         assigned_gt_inds[match_fg_mask_inall.astype(
-            np.bool)] = match_gt_inds_to_fg + 1
+            np.bool_)] = match_gt_inds_to_fg + 1
 
         pos_inds, neg_inds, pos_gt_bboxes, pos_assigned_gt_inds \
             = self.get_sample(assigned_gt_inds, gt_bboxes.numpy())

+ 38 - 4
paddlers/models/ppdet/modeling/assigners/task_aligned_assigner.py

@@ -28,17 +28,47 @@ from .utils import (gather_topk_anchors, check_points_inside_bboxes,
 __all__ = ['TaskAlignedAssigner']
 
 
+def is_close_gt(anchor, gt, stride_lst, max_dist=2.0, alpha=2.):
+    """Calculate the anchor-to-gt center distance normalized by the anchor
+        stride, so that large-stride anchors can still match distant gts.
+    Args:
+        anchor (Tensor): anchor points with the shape [L, 2]
+        gt (Tensor): gt boxes with the shape [N, M2, 4]
+    Return:
+        dist (Tensor): 0/1 mask of shape [N, M2, L]; 1 where the normalized distance is below max_dist
+    """
+    center1 = anchor.unsqueeze(0)
+    center2 = (gt[..., :2] + gt[..., -2:]) / 2.
+    center1 = center1.unsqueeze(1)  # [1, L, 2] -> [1, 1, L, 2]
+    center2 = center2.unsqueeze(2)  # [N, M2, 2] -> [N, M2, 1, 2]
+
+    stride = paddle.concat([
+        paddle.full([x], 32 / pow(2, idx)) for idx, x in enumerate(stride_lst)
+    ]).unsqueeze(0).unsqueeze(0)
+    dist = paddle.linalg.norm(center1 - center2, p=2, axis=-1) / stride
+    dist_ratio = dist
+    dist_ratio[dist < max_dist] = 1.
+    dist_ratio[dist >= max_dist] = 0.
+    return dist_ratio
+
+
 @register
 class TaskAlignedAssigner(nn.Layer):
     """TOOD: Task-aligned One-stage Object Detection
     """
 
-    def __init__(self, topk=13, alpha=1.0, beta=6.0, eps=1e-9):
+    def __init__(self,
+                 topk=13,
+                 alpha=1.0,
+                 beta=6.0,
+                 eps=1e-9,
+                 is_close_gt=False):
         super(TaskAlignedAssigner, self).__init__()
         self.topk = topk
         self.alpha = alpha
         self.beta = beta
         self.eps = eps
+        self.is_close_gt = is_close_gt
 
     @paddle.no_grad()
     def forward(self,
@@ -90,7 +120,8 @@ class TaskAlignedAssigner(nn.Layer):
             assigned_bboxes = paddle.zeros([batch_size, num_anchors, 4])
             assigned_scores = paddle.zeros(
                 [batch_size, num_anchors, num_classes])
-            return assigned_labels, assigned_bboxes, assigned_scores
+            mask_positive = paddle.zeros([batch_size, 1, num_anchors])
+            return assigned_labels, assigned_bboxes, assigned_scores, mask_positive
 
         # compute iou between gt and pred bbox, [B, n, L]
         ious = batch_iou_similarity(gt_bboxes, pred_bboxes)
@@ -107,7 +138,10 @@ class TaskAlignedAssigner(nn.Layer):
             self.beta)
 
         # check the positive sample's center in gt, [B, n, L]
-        is_in_gts = check_points_inside_bboxes(anchor_points, gt_bboxes)
+        if self.is_close_gt:
+            is_in_gts = is_close_gt(anchor_points, gt_bboxes, num_anchors_list)
+        else:
+            is_in_gts = check_points_inside_bboxes(anchor_points, gt_bboxes)
 
         # select topk largest alignment metrics pred bbox as candidates
         # for each gt, [B, n, L]
@@ -157,4 +191,4 @@ class TaskAlignedAssigner(nn.Layer):
         alignment_metrics = alignment_metrics.max(-2).unsqueeze(-1)
         assigned_scores = assigned_scores * alignment_metrics
 
-        return assigned_labels, assigned_bboxes, assigned_scores
+        return assigned_labels, assigned_bboxes, assigned_scores, mask_positive

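A small sketch (with assumed toy level sizes) of the per-anchor stride vector built inside is_close_gt above: a pyramid level with num_anchors_list[idx] anchors is assigned the stride 32 / 2**idx.

import paddle

num_anchors_list = [4, 2, 1]  # hypothetical anchor counts per pyramid level
stride = paddle.concat([
    paddle.full([n], 32 / pow(2, idx)) for idx, n in enumerate(num_anchors_list)
])
print(stride.numpy())  # [32. 32. 32. 32. 16. 16. 8.]
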
+ 182 - 0
paddlers/models/ppdet/modeling/assigners/task_aligned_assigner_cr.py

@@ -0,0 +1,182 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import paddle
+import paddle.nn as nn
+import paddle.nn.functional as F
+
+from paddlers.models.ppdet.core.workspace import register
+from ..bbox_utils import batch_iou_similarity
+from .utils import (gather_topk_anchors, check_points_inside_bboxes,
+                    compute_max_iou_anchor)
+
+__all__ = ['TaskAlignedAssigner_CR']
+
+
+@register
+class TaskAlignedAssigner_CR(nn.Layer):
+    """TOOD: Task-aligned One-stage Object Detection, with center-radius (CR) sampling
+    """
+
+    def __init__(self,
+                 topk=13,
+                 alpha=1.0,
+                 beta=6.0,
+                 center_radius=None,
+                 eps=1e-9):
+        super(TaskAlignedAssigner_CR, self).__init__()
+        self.topk = topk
+        self.alpha = alpha
+        self.beta = beta
+        self.center_radius = center_radius
+        self.eps = eps
+
+    @paddle.no_grad()
+    def forward(self,
+                pred_scores,
+                pred_bboxes,
+                anchor_points,
+                stride_tensor,
+                gt_labels,
+                gt_bboxes,
+                pad_gt_mask,
+                bg_index,
+                gt_scores=None):
+        r"""This code is based on
+            https://github.com/fcjian/TOOD/blob/master/mmdet/core/bbox/assigners/task_aligned_assigner.py
+
+        The assignment is done in the following steps:
+        1. compute the alignment metric between all pred bboxes (across all pyramid levels) and gts
+        2. select the top-k bboxes as candidates for each gt
+        3. restrict the positive samples' centers to lie inside the gt (an anchor-free
+           detector can only predict positive distances)
+        4. if an anchor box is assigned to multiple gts, keep the one with the
+           highest IoU
+        Args:
+            pred_scores (Tensor, float32): predicted class probability, shape(B, L, C)
+            pred_bboxes (Tensor, float32): predicted bounding boxes, shape(B, L, 4)
+            anchor_points (Tensor, float32): pre-defined anchors, shape(L, 2), "cxcy" format
+            stride_tensor (Tensor, float32): stride of feature map, shape(L, 1)
+            gt_labels (Tensor, int64|int32): Label of gt_bboxes, shape(B, n, 1)
+            gt_bboxes (Tensor, float32): Ground truth bboxes, shape(B, n, 4)
+            pad_gt_mask (Tensor, float32): 1 means bbox, 0 means no bbox, shape(B, n, 1)
+            bg_index (int): background index
+            gt_scores (Tensor|None, float32): Score of gt_bboxes, shape(B, n, 1)
+        Returns:
+            assigned_labels (Tensor): (B, L)
+            assigned_bboxes (Tensor): (B, L, 4)
+            assigned_scores (Tensor): (B, L, C)
+            mask_positive (Tensor): (B, n, L)
+        """
+        assert pred_scores.ndim == pred_bboxes.ndim
+        assert gt_labels.ndim == gt_bboxes.ndim and \
+               gt_bboxes.ndim == 3
+
+        batch_size, num_anchors, num_classes = pred_scores.shape
+        _, num_max_boxes, _ = gt_bboxes.shape
+
+        # negative batch
+        if num_max_boxes == 0:
+            assigned_labels = paddle.full(
+                [batch_size, num_anchors], bg_index, dtype='int32')
+            assigned_bboxes = paddle.zeros([batch_size, num_anchors, 4])
+            assigned_scores = paddle.zeros(
+                [batch_size, num_anchors, num_classes])
+            mask_positive = paddle.zeros([batch_size, 1, num_anchors])
+            return assigned_labels, assigned_bboxes, assigned_scores, mask_positive
+
+        # compute iou between gt and pred bbox, [B, n, L]
+        ious = batch_iou_similarity(gt_bboxes, pred_bboxes)
+        # gather pred bboxes class score
+        pred_scores = pred_scores.transpose([0, 2, 1])
+        batch_ind = paddle.arange(
+            end=batch_size, dtype=gt_labels.dtype).unsqueeze(-1)
+        gt_labels_ind = paddle.stack(
+            [batch_ind.tile([1, num_max_boxes]), gt_labels.squeeze(-1)],
+            axis=-1)
+        bbox_cls_scores = paddle.gather_nd(pred_scores, gt_labels_ind)
+        # compute alignment metrics, [B, n, L]
+        alignment_metrics = bbox_cls_scores.pow(self.alpha) * ious.pow(
+            self.beta) * pad_gt_mask
+
+        # select positive sample, [B, n, L]
+        if self.center_radius is None:
+            # check the positive sample's center in gt, [B, n, L]
+            is_in_gts = check_points_inside_bboxes(
+                anchor_points, gt_bboxes, sm_use=True)
+            # select topk largest alignment metrics pred bbox as candidates
+            # for each gt, [B, n, L]
+            mask_positive = gather_topk_anchors(
+                alignment_metrics, self.topk, topk_mask=pad_gt_mask) * is_in_gts
+        else:
+            is_in_gts, is_in_center = check_points_inside_bboxes(
+                anchor_points,
+                gt_bboxes,
+                stride_tensor * self.center_radius,
+                sm_use=True)
+            is_in_gts *= pad_gt_mask
+            is_in_center *= pad_gt_mask
+            candidate_metrics = paddle.where(
+                is_in_gts.sum(-1, keepdim=True) == 0,
+                alignment_metrics + is_in_center,
+                alignment_metrics)
+            mask_positive = gather_topk_anchors(
+                candidate_metrics, self.topk,
+                topk_mask=pad_gt_mask) * paddle.cast((is_in_center > 0) |
+                                                     (is_in_gts > 0), 'float32')
+
+        # if an anchor box is assigned to multiple gts,
+        # the one with the highest iou will be selected, [B, n, L]
+        mask_positive_sum = mask_positive.sum(axis=-2)
+        if mask_positive_sum.max() > 1:
+            mask_multiple_gts = (mask_positive_sum.unsqueeze(1) > 1).tile(
+                [1, num_max_boxes, 1])
+            is_max_iou = compute_max_iou_anchor(ious * mask_positive)
+            mask_positive = paddle.where(mask_multiple_gts, is_max_iou,
+                                         mask_positive)
+            mask_positive_sum = mask_positive.sum(axis=-2)
+        assigned_gt_index = mask_positive.argmax(axis=-2)
+
+        # assigned target
+        assigned_gt_index = assigned_gt_index + batch_ind * num_max_boxes
+        assigned_labels = paddle.gather(
+            gt_labels.flatten(), assigned_gt_index.flatten(), axis=0)
+        assigned_labels = assigned_labels.reshape([batch_size, num_anchors])
+        assigned_labels = paddle.where(
+            mask_positive_sum > 0, assigned_labels,
+            paddle.full_like(assigned_labels, bg_index))
+
+        assigned_bboxes = paddle.gather(
+            gt_bboxes.reshape([-1, 4]), assigned_gt_index.flatten(), axis=0)
+        assigned_bboxes = assigned_bboxes.reshape([batch_size, num_anchors, 4])
+
+        assigned_scores = F.one_hot(assigned_labels, num_classes + 1)
+        ind = list(range(num_classes + 1))
+        ind.remove(bg_index)
+        assigned_scores = paddle.index_select(
+            assigned_scores, paddle.to_tensor(ind), axis=-1)
+        # rescale alignment metrics
+        alignment_metrics *= mask_positive
+        max_metrics_per_instance = alignment_metrics.max(axis=-1, keepdim=True)
+        max_ious_per_instance = (ious * mask_positive).max(axis=-1,
+                                                           keepdim=True)
+        alignment_metrics = alignment_metrics / (
+            max_metrics_per_instance + self.eps) * max_ious_per_instance
+        alignment_metrics = alignment_metrics.max(-2).unsqueeze(-1)
+        assigned_scores = assigned_scores * alignment_metrics
+
+        return assigned_labels, assigned_bboxes, assigned_scores, mask_positive

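A toy illustration of the center-radius fallback used above: when a gt contains no anchor centers (is_in_gts sums to zero), anchors within the center radius get their alignment metric boosted by 1 so they can still enter the top-k selection. All numbers are illustrative.

import paddle

alignment_metrics = paddle.to_tensor([[0.20, 0.10, 0.05]])  # one gt, three anchors
is_in_gts = paddle.zeros([1, 3])                             # no anchor center inside this gt
is_in_center = paddle.to_tensor([[1., 0., 1.]])              # anchors within the center radius
candidate_metrics = paddle.where(
    is_in_gts.sum(-1, keepdim=True) == 0,
    alignment_metrics + is_in_center,
    alignment_metrics)
print(candidate_metrics.numpy())  # approx. [[1.2  0.1  1.05]]
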
+ 93 - 0
paddlers/models/ppdet/modeling/assigners/uniform_assigner.py

@@ -0,0 +1,93 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle
+import paddle.nn as nn
+import paddle.nn.functional as F
+from paddlers.models.ppdet.core.workspace import register
+
+from paddlers.models.ppdet.modeling.bbox_utils import batch_bbox_overlaps
+from paddlers.models.ppdet.modeling.transformers import bbox_xyxy_to_cxcywh
+
+__all__ = ['UniformAssigner']
+
+
+def batch_p_dist(x, y, p=2):
+    """
+    Calculate the pairwise p-norm distance between x and y; the first axis of
+    each input indexes the items. Returns a tensor of shape [x.shape[0], y.shape[0]].
+    """
+    x = x.unsqueeze(1)
+    diff = x - y
+    return paddle.norm(diff, p=p, axis=list(range(2, diff.dim())))
+
+
+@register
+class UniformAssigner(nn.Layer):
+    def __init__(self, pos_ignore_thr, neg_ignore_thr, match_times=4):
+        super(UniformAssigner, self).__init__()
+        self.pos_ignore_thr = pos_ignore_thr
+        self.neg_ignore_thr = neg_ignore_thr
+        self.match_times = match_times
+
+    def forward(self, bbox_pred, anchor, gt_bboxes, gt_labels=None):
+        num_bboxes = bbox_pred.shape[0]
+        num_gts = gt_bboxes.shape[0]
+        match_labels = paddle.full([num_bboxes], -1, dtype=paddle.int32)
+
+        pred_ious = batch_bbox_overlaps(bbox_pred, gt_bboxes)
+        pred_max_iou = pred_ious.max(axis=1)
+        neg_ignore = pred_max_iou > self.neg_ignore_thr
+        # exclude potential ignored neg samples first, deal with pos samples later
+        # match_labels: -2 (ignore), -1 (neg) or >=0 (pos_inds)
+        match_labels = paddle.where(neg_ignore,
+                                    paddle.full_like(match_labels, -2),
+                                    match_labels)
+
+        bbox_pred_c = bbox_xyxy_to_cxcywh(bbox_pred)
+        anchor_c = bbox_xyxy_to_cxcywh(anchor)
+        gt_bboxes_c = bbox_xyxy_to_cxcywh(gt_bboxes)
+        bbox_pred_dist = batch_p_dist(bbox_pred_c, gt_bboxes_c, p=1)
+        anchor_dist = batch_p_dist(anchor_c, gt_bboxes_c, p=1)
+
+        top_pred = bbox_pred_dist.topk(
+            k=self.match_times, axis=0, largest=False)[1]
+        top_anchor = anchor_dist.topk(
+            k=self.match_times, axis=0, largest=False)[1]
+
+        tar_pred = paddle.arange(num_gts).expand([self.match_times, num_gts])
+        tar_anchor = paddle.arange(num_gts).expand([self.match_times, num_gts])
+        pos_places = paddle.concat([top_pred, top_anchor]).reshape([-1])
+        pos_inds = paddle.concat([tar_pred, tar_anchor]).reshape([-1])
+
+        pos_anchor = anchor[pos_places]
+        pos_tar_bbox = gt_bboxes[pos_inds]
+        pos_ious = batch_bbox_overlaps(
+            pos_anchor, pos_tar_bbox, is_aligned=True)
+        pos_ignore = pos_ious < self.pos_ignore_thr
+        pos_inds = paddle.where(pos_ignore,
+                                paddle.full_like(pos_inds, -2), pos_inds)
+        match_labels[pos_places] = pos_inds
+        match_labels.stop_gradient = True
+        pos_keep = ~pos_ignore
+
+        if pos_keep.sum() > 0:
+            pos_places_keep = pos_places[pos_keep]
+            pos_bbox_pred = bbox_pred[pos_places_keep].reshape([-1, 4])
+            pos_bbox_tar = pos_tar_bbox[pos_keep].reshape([-1, 4]).detach()
+        else:
+            pos_bbox_pred = None
+            pos_bbox_tar = None
+
+        return match_labels, pos_bbox_pred, pos_bbox_tar

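A minimal sketch of the distance-based candidate selection above: batch_p_dist computes the pairwise L1 distance between center-format boxes, and the match_times closest predictions (and, separately, anchors) per gt become positive candidates. Sizes are illustrative.

import paddle

match_times = 4
bbox_pred_c = paddle.rand([10, 4])  # 10 predictions, cxcywh
gt_bboxes_c = paddle.rand([2, 4])   # 2 ground truths, cxcywh
dist = paddle.norm(bbox_pred_c.unsqueeze(1) - gt_bboxes_c, p=1, axis=-1)  # [10, 2]
top_pred = dist.topk(k=match_times, axis=0, largest=False)[1]             # [4, 2] indices
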
+ 8 - 3
paddlers/models/ppdet/modeling/assigners/utils.py

@@ -108,7 +108,8 @@ def gather_topk_anchors(metrics, topk, largest=True, topk_mask=None, eps=1e-9):
 def check_points_inside_bboxes(points,
                                bboxes,
                                center_radius_tensor=None,
-                               eps=1e-9):
+                               eps=1e-9,
+                               sm_use=False):
     r"""
     Args:
         points (Tensor, float32): shape[L, 2], "xy" format, L: num_anchors
@@ -139,8 +140,12 @@ def check_points_inside_bboxes(points,
         b = (cy + center_radius_tensor) - y
         delta_ltrb_c = paddle.concat([l, t, r, b], axis=-1)
         is_in_center = (delta_ltrb_c.min(axis=-1) > eps)
-        return (paddle.logical_and(is_in_bboxes, is_in_center),
-                paddle.logical_or(is_in_bboxes, is_in_center))
+        if sm_use:
+            return is_in_bboxes.astype(bboxes.dtype), is_in_center.astype(
+                bboxes.dtype)
+        else:
+            return (paddle.logical_and(is_in_bboxes, is_in_center),
+                    paddle.logical_or(is_in_bboxes, is_in_center))
 
     return is_in_bboxes.astype(bboxes.dtype)
 

+ 2 - 0
paddlers/models/ppdet/modeling/backbones/__init__.py

@@ -34,6 +34,7 @@ from . import csp_darknet
 from . import convnext
 from . import vision_transformer
 from . import mobileone
+from . import trans_encoder
 
 from .vgg import *
 from .resnet import *
@@ -58,3 +59,4 @@ from .convnext import *
 from .vision_transformer import *
 from .vision_transformer import *
 from .mobileone import *
+from .trans_encoder import *

+ 49 - 9
paddlers/models/ppdet/modeling/backbones/dla.py

@@ -19,7 +19,7 @@ from paddlers.models.ppdet.core.workspace import register, serializable
 from paddlers.models.ppdet.modeling.layers import ConvNormLayer
 from ..shape_spec import ShapeSpec
 
-DLA_cfg = {34: ([1, 1, 1, 2, 2, 1], [16, 32, 64, 128, 256, 512])}
+DLA_cfg = {34: ([1, 1, 1, 2, 2, 1], [16, 32, 64, 128, 256, 512]), }
 
 
 class BasicBlock(nn.Layer):
@@ -157,17 +157,25 @@ class DLA(nn.Layer):
     DLA, see https://arxiv.org/pdf/1707.06484.pdf
 
     Args:
-        depth (int): DLA depth, should be 34.
+        depth (int): DLA depth, only 34 is supported for now.
         residual_root (bool): whether to use a residual layer in the root block
-
+        pre_img (bool): whether to add an input layer for the previous image (used only in CenterTrack)
+        pre_hm (bool): whether to add an input layer for the previous heatmap (used only in CenterTrack)
     """
 
-    def __init__(self, depth=34, residual_root=False):
+    def __init__(self,
+                 depth=34,
+                 residual_root=False,
+                 pre_img=False,
+                 pre_hm=False):
         super(DLA, self).__init__()
-        levels, channels = DLA_cfg[depth]
+        assert depth == 34, 'Only support DLA with depth of 34 now.'
         if depth == 34:
             block = BasicBlock
+        levels, channels = DLA_cfg[depth]
         self.channels = channels
+        self.num_levels = len(levels)
+
         self.base_layer = nn.Sequential(
             ConvNormLayer(
                 3,
@@ -213,6 +221,29 @@ class DLA(nn.Layer):
             level_root=True,
             root_residual=residual_root)
 
+        if pre_img:
+            self.pre_img_layer = nn.Sequential(
+                ConvNormLayer(
+                    3,
+                    channels[0],
+                    filter_size=7,
+                    stride=1,
+                    bias_on=False,
+                    norm_decay=None),
+                nn.ReLU())
+        if pre_hm:
+            self.pre_hm_layer = nn.Sequential(
+                ConvNormLayer(
+                    1,
+                    channels[0],
+                    filter_size=7,
+                    stride=1,
+                    bias_on=False,
+                    norm_decay=None),
+                nn.ReLU())
+        self.pre_img = pre_img
+        self.pre_hm = pre_hm
+
     def _make_conv_level(self, ch_in, ch_out, conv_num, stride=1):
         modules = []
         for i in range(conv_num):
@@ -230,13 +261,22 @@ class DLA(nn.Layer):
 
     @property
     def out_shape(self):
-        return [ShapeSpec(channels=self.channels[i]) for i in range(6)]
+        return [
+            ShapeSpec(channels=self.channels[i]) for i in range(self.num_levels)
+        ]
 
     def forward(self, inputs):
         outs = []
-        im = inputs['image']
-        feats = self.base_layer(im)
-        for i in range(6):
+        feats = self.base_layer(inputs['image'])
+
+        if self.pre_img and 'pre_image' in inputs and inputs[
+                'pre_image'] is not None:
+            feats = feats + self.pre_img_layer(inputs['pre_image'])
+
+        if self.pre_hm and 'pre_hm' in inputs and inputs['pre_hm'] is not None:
+            feats = feats + self.pre_hm_layer(inputs['pre_hm'])
+
+        for i in range(self.num_levels):
             feats = getattr(self, 'level{}'.format(i))(feats)
             outs.append(feats)
 

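A hedged usage sketch of the CenterTrack inputs added to the DLA backbone above, assuming the module path of this repository; tensor sizes are illustrative, and the extra branches are simply skipped when 'pre_image'/'pre_hm' are missing from the input dict.

import paddle
from paddlers.models.ppdet.modeling.backbones.dla import DLA

backbone = DLA(depth=34, pre_img=True, pre_hm=True)
inputs = {
    'image': paddle.rand([1, 3, 512, 512]),
    'pre_image': paddle.rand([1, 3, 512, 512]),  # previous frame
    'pre_hm': paddle.rand([1, 1, 512, 512]),     # heatmap rendered from previous boxes
}
feats = backbone(inputs)  # one feature map per DLA level
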
+ 144 - 2
paddlers/models/ppdet/modeling/backbones/hrnet.py

@@ -37,6 +37,7 @@ class ConvNormLayer(nn.Layer):
                  norm_type='bn',
                  norm_groups=32,
                  use_dcn=False,
+                 norm_momentum=0.9,
                  norm_decay=0.,
                  freeze_norm=False,
                  act=None,
@@ -66,6 +67,7 @@ class ConvNormLayer(nn.Layer):
         if norm_type in ['bn', 'sync_bn']:
             self.norm = nn.BatchNorm2D(
                 ch_out,
+                momentum=norm_momentum,
                 weight_attr=param_attr,
                 bias_attr=bias_attr,
                 use_global_stats=global_stats)
@@ -93,6 +95,7 @@ class Layer1(nn.Layer):
     def __init__(self,
                  num_channels,
                  has_se=False,
+                 norm_momentum=0.9,
                  norm_decay=0.,
                  freeze_norm=True,
                  name=None):
@@ -109,6 +112,7 @@ class Layer1(nn.Layer):
                     has_se=has_se,
                     stride=1,
                     downsample=True if i == 0 else False,
+                    norm_momentum=norm_momentum,
                     norm_decay=norm_decay,
                     freeze_norm=freeze_norm,
                     name=name + '_' + str(i + 1)))
@@ -125,6 +129,7 @@ class TransitionLayer(nn.Layer):
     def __init__(self,
                  in_channels,
                  out_channels,
+                 norm_momentum=0.9,
                  norm_decay=0.,
                  freeze_norm=True,
                  name=None):
@@ -144,6 +149,7 @@ class TransitionLayer(nn.Layer):
                             ch_in=in_channels[i],
                             ch_out=out_channels[i],
                             filter_size=3,
+                            norm_momentum=norm_momentum,
                             norm_decay=norm_decay,
                             freeze_norm=freeze_norm,
                             act='relu',
@@ -156,6 +162,7 @@ class TransitionLayer(nn.Layer):
                         ch_out=out_channels[i],
                         filter_size=3,
                         stride=2,
+                        norm_momentum=norm_momentum,
                         norm_decay=norm_decay,
                         freeze_norm=freeze_norm,
                         act='relu',
@@ -181,6 +188,7 @@ class Branches(nn.Layer):
                  in_channels,
                  out_channels,
                  has_se=False,
+                 norm_momentum=0.9,
                  norm_decay=0.,
                  freeze_norm=True,
                  name=None):
@@ -197,6 +205,7 @@ class Branches(nn.Layer):
                         num_channels=in_ch,
                         num_filters=out_channels[i],
                         has_se=has_se,
+                        norm_momentum=norm_momentum,
                         norm_decay=norm_decay,
                         freeze_norm=freeze_norm,
                         name=name + '_branch_layer_' + str(i + 1) + '_' +
@@ -221,6 +230,7 @@ class BottleneckBlock(nn.Layer):
                  has_se,
                  stride=1,
                  downsample=False,
+                 norm_momentum=0.9,
                  norm_decay=0.,
                  freeze_norm=True,
                  name=None):
@@ -233,6 +243,7 @@ class BottleneckBlock(nn.Layer):
             ch_in=num_channels,
             ch_out=num_filters,
             filter_size=1,
+            norm_momentum=norm_momentum,
             norm_decay=norm_decay,
             freeze_norm=freeze_norm,
             act="relu",
@@ -242,6 +253,7 @@ class BottleneckBlock(nn.Layer):
             ch_out=num_filters,
             filter_size=3,
             stride=stride,
+            norm_momentum=norm_momentum,
             norm_decay=norm_decay,
             freeze_norm=freeze_norm,
             act="relu",
@@ -250,6 +262,7 @@ class BottleneckBlock(nn.Layer):
             ch_in=num_filters,
             ch_out=num_filters * 4,
             filter_size=1,
+            norm_momentum=norm_momentum,
             norm_decay=norm_decay,
             freeze_norm=freeze_norm,
             act=None,
@@ -260,6 +273,7 @@ class BottleneckBlock(nn.Layer):
                 ch_in=num_channels,
                 ch_out=num_filters * 4,
                 filter_size=1,
+                norm_momentum=norm_momentum,
                 norm_decay=norm_decay,
                 freeze_norm=freeze_norm,
                 act=None,
@@ -296,6 +310,7 @@ class BasicBlock(nn.Layer):
                  stride=1,
                  has_se=False,
                  downsample=False,
+                 norm_momentum=0.9,
                  norm_decay=0.,
                  freeze_norm=True,
                  name=None):
@@ -307,6 +322,7 @@ class BasicBlock(nn.Layer):
             ch_in=num_channels,
             ch_out=num_filters,
             filter_size=3,
+            norm_momentum=norm_momentum,
             norm_decay=norm_decay,
             freeze_norm=freeze_norm,
             stride=stride,
@@ -316,6 +332,7 @@ class BasicBlock(nn.Layer):
             ch_in=num_filters,
             ch_out=num_filters,
             filter_size=3,
+            norm_momentum=norm_momentum,
             norm_decay=norm_decay,
             freeze_norm=freeze_norm,
             stride=1,
@@ -327,6 +344,7 @@ class BasicBlock(nn.Layer):
                 ch_in=num_channels,
                 ch_out=num_filters * 4,
                 filter_size=1,
+                norm_momentum=norm_momentum,
                 norm_decay=norm_decay,
                 freeze_norm=freeze_norm,
                 act=None,
@@ -394,6 +412,7 @@ class Stage(nn.Layer):
                  num_modules,
                  num_filters,
                  has_se=False,
+                 norm_momentum=0.9,
                  norm_decay=0.,
                  freeze_norm=True,
                  multi_scale_output=True,
@@ -410,6 +429,7 @@ class Stage(nn.Layer):
                         num_channels=num_channels,
                         num_filters=num_filters,
                         has_se=has_se,
+                        norm_momentum=norm_momentum,
                         norm_decay=norm_decay,
                         freeze_norm=freeze_norm,
                         multi_scale_output=False,
@@ -421,6 +441,7 @@ class Stage(nn.Layer):
                         num_channels=num_channels,
                         num_filters=num_filters,
                         has_se=has_se,
+                        norm_momentum=norm_momentum,
                         norm_decay=norm_decay,
                         freeze_norm=freeze_norm,
                         name=name + '_' + str(i + 1)))
@@ -440,6 +461,7 @@ class HighResolutionModule(nn.Layer):
                  num_filters,
                  has_se=False,
                  multi_scale_output=True,
+                 norm_momentum=0.9,
                  norm_decay=0.,
                  freeze_norm=True,
                  name=None):
@@ -449,6 +471,7 @@ class HighResolutionModule(nn.Layer):
             in_channels=num_channels,
             out_channels=num_filters,
             has_se=has_se,
+            norm_momentum=norm_momentum,
             norm_decay=norm_decay,
             freeze_norm=freeze_norm,
             name=name)
@@ -457,6 +480,7 @@ class HighResolutionModule(nn.Layer):
             in_channels=num_filters,
             out_channels=num_filters,
             multi_scale_output=multi_scale_output,
+            norm_momentum=norm_momentum,
             norm_decay=norm_decay,
             freeze_norm=freeze_norm,
             name=name)
@@ -472,6 +496,7 @@ class FuseLayers(nn.Layer):
                  in_channels,
                  out_channels,
                  multi_scale_output=True,
+                 norm_momentum=0.9,
                  norm_decay=0.,
                  freeze_norm=True,
                  name=None):
@@ -493,6 +518,7 @@ class FuseLayers(nn.Layer):
                             filter_size=1,
                             stride=1,
                             act=None,
+                            norm_momentum=norm_momentum,
                             norm_decay=norm_decay,
                             freeze_norm=freeze_norm,
                             name=name + '_layer_' + str(i + 1) + '_' +
@@ -510,6 +536,7 @@ class FuseLayers(nn.Layer):
                                     ch_out=out_channels[i],
                                     filter_size=3,
                                     stride=2,
+                                    norm_momentum=norm_momentum,
                                     norm_decay=norm_decay,
                                     freeze_norm=freeze_norm,
                                     act=None,
@@ -525,6 +552,7 @@ class FuseLayers(nn.Layer):
                                     ch_out=out_channels[j],
                                     filter_size=3,
                                     stride=2,
+                                    norm_momentum=norm_momentum,
                                     norm_decay=norm_decay,
                                     freeze_norm=freeze_norm,
                                     act="relu",
@@ -549,7 +577,6 @@ class FuseLayers(nn.Layer):
                     for k in range(i - j):
                         y = self.residual_func_list[residual_func_idx](y)
                         residual_func_idx += 1
-
                     residual = paddle.add(x=residual, y=y)
             residual = F.relu(residual)
             outs.append(residual)
@@ -567,6 +594,7 @@ class HRNet(nn.Layer):
         has_se (bool): whether to add SE block for each stage
         freeze_at (int): the stage to freeze
         freeze_norm (bool): whether to freeze norm in HRNet
+        norm_momentum (float): momentum of BatchNorm
         norm_decay (float): weight decay for normalization layer weights
         return_idx (List): the stage to return
         upsample (bool): whether to upsample and concat the backbone feats
@@ -577,9 +605,11 @@ class HRNet(nn.Layer):
                  has_se=False,
                  freeze_at=0,
                  freeze_norm=True,
+                 norm_momentum=0.9,
                  norm_decay=0.,
                  return_idx=[0, 1, 2, 3],
-                 upsample=False):
+                 upsample=False,
+                 downsample=False):
         super(HRNet, self).__init__()
 
         self.width = width
@@ -591,6 +621,7 @@ class HRNet(nn.Layer):
         self.freeze_at = freeze_at
         self.return_idx = return_idx
         self.upsample = upsample
+        self.downsample = downsample
 
         self.channels = {
             18: [[18, 36], [18, 36, 72], [18, 36, 72, 144]],
@@ -613,6 +644,7 @@ class HRNet(nn.Layer):
             ch_out=64,
             filter_size=3,
             stride=2,
+            norm_momentum=norm_momentum,
             norm_decay=norm_decay,
             freeze_norm=freeze_norm,
             act='relu',
@@ -623,6 +655,7 @@ class HRNet(nn.Layer):
             ch_out=64,
             filter_size=3,
             stride=2,
+            norm_momentum=norm_momentum,
             norm_decay=norm_decay,
             freeze_norm=freeze_norm,
             act='relu',
@@ -631,6 +664,7 @@ class HRNet(nn.Layer):
         self.la1 = Layer1(
             num_channels=64,
             has_se=has_se,
+            norm_momentum=norm_momentum,
             norm_decay=norm_decay,
             freeze_norm=freeze_norm,
             name="layer2")
@@ -638,6 +672,7 @@ class HRNet(nn.Layer):
         self.tr1 = TransitionLayer(
             in_channels=[256],
             out_channels=channels_2,
+            norm_momentum=norm_momentum,
             norm_decay=norm_decay,
             freeze_norm=freeze_norm,
             name="tr1")
@@ -647,6 +682,7 @@ class HRNet(nn.Layer):
             num_modules=num_modules_2,
             num_filters=channels_2,
             has_se=self.has_se,
+            norm_momentum=norm_momentum,
             norm_decay=norm_decay,
             freeze_norm=freeze_norm,
             name="st2")
@@ -654,6 +690,7 @@ class HRNet(nn.Layer):
         self.tr2 = TransitionLayer(
             in_channels=channels_2,
             out_channels=channels_3,
+            norm_momentum=norm_momentum,
             norm_decay=norm_decay,
             freeze_norm=freeze_norm,
             name="tr2")
@@ -663,6 +700,7 @@ class HRNet(nn.Layer):
             num_modules=num_modules_3,
             num_filters=channels_3,
             has_se=self.has_se,
+            norm_momentum=norm_momentum,
             norm_decay=norm_decay,
             freeze_norm=freeze_norm,
             name="st3")
@@ -670,6 +708,7 @@ class HRNet(nn.Layer):
         self.tr3 = TransitionLayer(
             in_channels=channels_3,
             out_channels=channels_4,
+            norm_momentum=norm_momentum,
             norm_decay=norm_decay,
             freeze_norm=freeze_norm,
             name="tr3")
@@ -678,11 +717,106 @@ class HRNet(nn.Layer):
             num_modules=num_modules_4,
             num_filters=channels_4,
             has_se=self.has_se,
+            norm_momentum=norm_momentum,
             norm_decay=norm_decay,
             freeze_norm=freeze_norm,
             multi_scale_output=len(return_idx) > 1,
             name="st4")
 
+        if self.downsample:
+            self.incre_modules, self.downsamp_modules, \
+                self.final_layer = self._make_head(channels_4, norm_momentum=norm_momentum, has_se=self.has_se)
+
+    def _make_layer(self,
+                    block,
+                    inplanes,
+                    planes,
+                    blocks,
+                    stride=1,
+                    norm_momentum=0.9,
+                    has_se=False,
+                    name=None):
+        downsample = None
+        if stride != 1 or inplanes != planes * 4:
+            downsample = True
+
+        layers = []
+        layers.append(
+            block(
+                inplanes,
+                planes,
+                has_se,
+                stride,
+                downsample,
+                norm_momentum=norm_momentum,
+                freeze_norm=False,
+                name=name + "_s0"))
+        inplanes = planes * 4
+        for i in range(1, blocks):
+            layers.append(
+                block(
+                    inplanes,
+                    planes,
+                    has_se,
+                    norm_momentum=norm_momentum,
+                    freeze_norm=False,
+                    name=name + "_s" + str(i)))
+
+        return nn.Sequential(*layers)
+
+    def _make_head(self, pre_stage_channels, norm_momentum=0.9, has_se=False):
+        head_block = BottleneckBlock
+        head_channels = [32, 64, 128, 256]
+
+        # Increase the number of channels at each resolution
+        # from C, 2C, 4C, 8C to 128, 256, 512, 1024
+        incre_modules = []
+        for i, channels in enumerate(pre_stage_channels):
+            incre_module = self._make_layer(
+                head_block,
+                channels,
+                head_channels[i],
+                1,
+                stride=1,
+                norm_momentum=norm_momentum,
+                has_se=has_se,
+                name='incre' + str(i))
+            incre_modules.append(incre_module)
+        incre_modules = nn.LayerList(incre_modules)
+
+        # downsampling modules
+        downsamp_modules = []
+        for i in range(len(pre_stage_channels) - 1):
+            in_channels = head_channels[i] * 4
+            out_channels = head_channels[i + 1] * 4
+
+            downsamp_module = nn.Sequential(
+                nn.Conv2D(
+                    in_channels=in_channels,
+                    out_channels=out_channels,
+                    kernel_size=3,
+                    stride=2,
+                    padding=1),
+                nn.BatchNorm2D(
+                    out_channels, momentum=norm_momentum),
+                nn.ReLU())
+
+            downsamp_modules.append(downsamp_module)
+        downsamp_modules = nn.LayerList(downsamp_modules)
+
+        final_layer = nn.Sequential(
+            nn.Conv2D(
+                in_channels=head_channels[3] * 4,
+                out_channels=2048,
+                kernel_size=1,
+                stride=1,
+                padding=0),
+            nn.BatchNorm2D(
+                2048, momentum=norm_momentum),
+            nn.ReLU())
+
+        return incre_modules, downsamp_modules, final_layer
+
     def forward(self, inputs):
         x = inputs['image']
         conv1 = self.conv_layer1_1(x)
@@ -707,6 +841,14 @@ class HRNet(nn.Layer):
             x = paddle.concat([st4[0], x1, x2, x3], 1)
             return x
 
+        if self.downsample:
+            y = self.incre_modules[0](st4[0])
+            for i in range(len(self.downsamp_modules)):
+                y = self.incre_modules[i+1](st4[i+1]) + \
+                            self.downsamp_modules[i](y)
+            y = self.final_layer(y)
+            return y
+
         res = []
         for i, layer in enumerate(st4):
             if i == self.freeze_at:

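A hedged sketch of the classification-style head added above: with downsample=True, the four HRNet branches are fused through the incre/downsamp modules into a single 2048-channel map (assumed output stride 32). The width and input size below are illustrative.

import paddle
from paddlers.models.ppdet.modeling.backbones.hrnet import HRNet

backbone = HRNet(width=18, freeze_norm=False, downsample=True)
feat = backbone({'image': paddle.rand([1, 3, 256, 192])})
print(feat.shape)  # assumed [1, 2048, 8, 6] for a 256x192 input
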
+ 5 - 0
paddlers/models/ppdet/modeling/backbones/lite_hrnet.py

@@ -854,6 +854,11 @@ class LiteHRNet(nn.Layer):
 
     def forward(self, inputs):
         x = inputs['image']
+        dims = x.shape
+        if len(dims) == 5:
+            x = paddle.reshape(x, (dims[0] * dims[1], dims[2], dims[3],
+                                   dims[4]))  # [6, 3, 128, 96]
+
         x = self.stem(x)
         y_list = [x]
         for stage_idx in range(3):

+ 30 - 30
paddlers/models/ppdet/modeling/backbones/resnet.py

@@ -285,36 +285,6 @@ class BottleNeck(nn.Layer):
         # ResNeXt
         width = int(ch_out * (base_width / 64.)) * groups
 
-        self.shortcut = shortcut
-        if not shortcut:
-            if variant == 'd' and stride == 2:
-                self.short = nn.Sequential()
-                self.short.add_sublayer(
-                    'pool',
-                    nn.AvgPool2D(
-                        kernel_size=2, stride=2, padding=0, ceil_mode=True))
-                self.short.add_sublayer(
-                    'conv',
-                    ConvNormLayer(
-                        ch_in=ch_in,
-                        ch_out=ch_out * self.expansion,
-                        filter_size=1,
-                        stride=1,
-                        norm_type=norm_type,
-                        norm_decay=norm_decay,
-                        freeze_norm=freeze_norm,
-                        lr=lr))
-            else:
-                self.short = ConvNormLayer(
-                    ch_in=ch_in,
-                    ch_out=ch_out * self.expansion,
-                    filter_size=1,
-                    stride=stride,
-                    norm_type=norm_type,
-                    norm_decay=norm_decay,
-                    freeze_norm=freeze_norm,
-                    lr=lr)
-
         self.branch2a = ConvNormLayer(
             ch_in=ch_in,
             ch_out=width,
@@ -351,6 +321,36 @@ class BottleNeck(nn.Layer):
             freeze_norm=freeze_norm,
             lr=lr)
 
+        self.shortcut = shortcut
+        if not shortcut:
+            if variant == 'd' and stride == 2:
+                self.short = nn.Sequential()
+                self.short.add_sublayer(
+                    'pool',
+                    nn.AvgPool2D(
+                        kernel_size=2, stride=2, padding=0, ceil_mode=True))
+                self.short.add_sublayer(
+                    'conv',
+                    ConvNormLayer(
+                        ch_in=ch_in,
+                        ch_out=ch_out * self.expansion,
+                        filter_size=1,
+                        stride=1,
+                        norm_type=norm_type,
+                        norm_decay=norm_decay,
+                        freeze_norm=freeze_norm,
+                        lr=lr))
+            else:
+                self.short = ConvNormLayer(
+                    ch_in=ch_in,
+                    ch_out=ch_out * self.expansion,
+                    filter_size=1,
+                    stride=stride,
+                    norm_type=norm_type,
+                    norm_decay=norm_decay,
+                    freeze_norm=freeze_norm,
+                    lr=lr)
+
         self.std_senet = std_senet
         if self.std_senet:
             self.se = SELayer(ch_out * self.expansion)

+ 381 - 0
paddlers/models/ppdet/modeling/backbones/trans_encoder.py

@@ -0,0 +1,381 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle
+import paddle.nn as nn
+import paddle.nn.functional as F
+from paddle.nn import ReLU, Swish, GELU
+import math
+
+from paddlers.models.ppdet.core.workspace import register
+from ..shape_spec import ShapeSpec
+
+__all__ = ['TransEncoder']
+
+
+class BertEmbeddings(nn.Layer):
+    def __init__(self, word_size, position_embeddings_size, word_type_size,
+                 hidden_size, dropout_prob):
+        super(BertEmbeddings, self).__init__()
+        self.word_embeddings = nn.Embedding(
+            word_size, hidden_size, padding_idx=0)
+        self.position_embeddings = nn.Embedding(position_embeddings_size,
+                                                hidden_size)
+        self.token_type_embeddings = nn.Embedding(word_type_size, hidden_size)
+        self.layernorm = nn.LayerNorm(hidden_size, epsilon=1e-8)
+        self.dropout = nn.Dropout(dropout_prob)
+
+    def forward(self, x, token_type_ids=None, position_ids=None):
+        seq_len = paddle.shape(x)[1]
+        if position_ids is None:
+            position_ids = paddle.arange(seq_len).unsqueeze(0).expand_as(x)
+        if token_type_ids is None:
+            token_type_ids = paddle.zeros(paddle.shape(x))
+
+        word_embs = self.word_embeddings(x)
+        position_embs = self.position_embeddings(position_ids)
+        token_type_embs = self.token_type_embeddings(token_type_ids)
+
+        embs_cmb = word_embs + position_embs + token_type_embs
+        embs_out = self.layernorm(embs_cmb)
+        embs_out = self.dropout(embs_out)
+        return embs_out
+
+
+class BertSelfAttention(nn.Layer):
+    def __init__(self,
+                 hidden_size,
+                 num_attention_heads,
+                 attention_probs_dropout_prob,
+                 output_attentions=False):
+        super(BertSelfAttention, self).__init__()
+        if hidden_size % num_attention_heads != 0:
+            raise ValueError(
+                "The hidden_size must be a multiple of the number of attention "
+                "heads, but got {} % {} != 0".format(hidden_size,
+                                                     num_attention_heads))
+
+        self.num_attention_heads = num_attention_heads
+        self.attention_head_size = int(hidden_size / num_attention_heads)
+        self.all_head_size = self.num_attention_heads * self.attention_head_size
+
+        self.query = nn.Linear(hidden_size, self.all_head_size)
+        self.key = nn.Linear(hidden_size, self.all_head_size)
+        self.value = nn.Linear(hidden_size, self.all_head_size)
+
+        self.dropout = nn.Dropout(attention_probs_dropout_prob)
+        self.output_attentions = output_attentions
+
+    def forward(self, x, attention_mask, head_mask=None):
+        query = self.query(x)
+        key = self.key(x)
+        value = self.value(x)
+
+        query_dim1, query_dim2 = paddle.shape(query)[:-1]
+        new_shape = [
+            query_dim1, query_dim2, self.num_attention_heads,
+            self.attention_head_size
+        ]
+        query = query.reshape(new_shape).transpose(perm=(0, 2, 1, 3))
+        key = key.reshape(new_shape).transpose(perm=(0, 2, 3, 1))
+        value = value.reshape(new_shape).transpose(perm=(0, 2, 1, 3))
+
+        attention = paddle.matmul(query,
+                                  key) / math.sqrt(self.attention_head_size)
+        attention = attention + attention_mask
+        attention_value = F.softmax(attention, axis=-1)
+        attention_value = self.dropout(attention_value)
+
+        if head_mask is not None:
+            attention_value = attention_value * head_mask
+
+        context = paddle.matmul(attention_value, value).transpose(perm=(0, 2, 1,
+                                                                        3))
+        ctx_dim1, ctx_dim2 = paddle.shape(context)[:-2]
+        new_context_shape = [
+            ctx_dim1,
+            ctx_dim2,
+            self.all_head_size,
+        ]
+        context = context.reshape(new_context_shape)
+
+        if self.output_attentions:
+            return (context, attention_value)
+        else:
+            return (context, )
+
+
+class BertAttention(nn.Layer):
+    def __init__(self,
+                 hidden_size,
+                 num_attention_heads,
+                 attention_probs_dropout_prob,
+                 fc_dropout_prob,
+                 output_attentions=False):
+        super(BertAttention, self).__init__()
+        self.bert_selfattention = BertSelfAttention(
+            hidden_size, num_attention_heads, attention_probs_dropout_prob,
+            output_attentions)
+        self.fc = nn.Linear(hidden_size, hidden_size)
+        self.layernorm = nn.LayerNorm(hidden_size, epsilon=1e-8)
+        self.dropout = nn.Dropout(fc_dropout_prob)
+
+    def forward(self, x, attention_mask, head_mask=None):
+        attention_feats = self.bert_selfattention(x, attention_mask, head_mask)
+        features = self.fc(attention_feats[0])
+        features = self.dropout(features)
+        features = self.layernorm(features + x)
+        if len(attention_feats) == 2:
+            return (features, attention_feats[1])
+        else:
+            return (features, )
+
+
+class BertFeedForward(nn.Layer):
+    def __init__(self,
+                 hidden_size,
+                 intermediate_size,
+                 num_attention_heads,
+                 attention_probs_dropout_prob,
+                 fc_dropout_prob,
+                 act_fn='ReLU',
+                 output_attentions=False):
+        super(BertFeedForward, self).__init__()
+        self.fc1 = nn.Linear(hidden_size, intermediate_size)
+        self.act_fn = eval(act_fn)
+        self.fc2 = nn.Linear(intermediate_size, hidden_size)
+        self.layernorm = nn.LayerNorm(hidden_size, epsilon=1e-8)
+        self.dropout = nn.Dropout(fc_dropout_prob)
+
+    def forward(self, x):
+        features = self.fc1(x)
+        features = self.act_fn(features)
+        features = self.fc2(features)
+        features = self.dropout(features)
+        features = self.layernorm(features + x)
+        return features
+
+
+class BertLayer(nn.Layer):
+    def __init__(self,
+                 hidden_size,
+                 intermediate_size,
+                 num_attention_heads,
+                 attention_probs_dropout_prob,
+                 fc_dropout_prob,
+                 act_fn='ReLU',
+                 output_attentions=False):
+        super(BertLayer, self).__init__()
+        self.attention = BertAttention(hidden_size, num_attention_heads,
+                                       attention_probs_dropout_prob,
+                                       fc_dropout_prob, output_attentions)
+        self.feed_forward = BertFeedForward(
+            hidden_size, intermediate_size, num_attention_heads,
+            attention_probs_dropout_prob, fc_dropout_prob, act_fn,
+            output_attentions)
+
+    def forward(self, x, attention_mask, head_mask=None):
+        attention_feats = self.attention(x, attention_mask, head_mask)
+        features = self.feed_forward(attention_feats[0])
+        if len(attention_feats) == 2:
+            return (features, attention_feats[1])
+        else:
+            return (features, )
+
+
+class BertEncoder(nn.Layer):
+    def __init__(self,
+                 num_hidden_layers,
+                 hidden_size,
+                 intermediate_size,
+                 num_attention_heads,
+                 attention_probs_dropout_prob,
+                 fc_dropout_prob,
+                 act_fn='ReLU',
+                 output_attentions=False,
+                 output_hidden_feats=False):
+        super(BertEncoder, self).__init__()
+        self.output_attentions = output_attentions
+        self.output_hidden_feats = output_hidden_feats
+        self.layers = nn.LayerList([
+            BertLayer(hidden_size, intermediate_size, num_attention_heads,
+                      attention_probs_dropout_prob, fc_dropout_prob, act_fn,
+                      output_attentions) for _ in range(num_hidden_layers)
+        ])
+
+    def forward(self, x, attention_mask, head_mask=None):
+        all_features = (x, )
+        all_attentions = ()
+
+        for i, layer in enumerate(self.layers):
+            mask = head_mask[i] if head_mask is not None else None
+            layer_out = layer(x, attention_mask, mask)
+
+            if self.output_hidden_feats:
+                all_features = all_features + (x, )
+            x = layer_out[0]
+            if self.output_attentions:
+                all_attentions = all_attentions + (layer_out[1], )
+
+        outputs = (x, )
+        if self.output_hidden_feats:
+            outputs += (all_features, )
+        if self.output_attentions:
+            outputs += (all_attentions, )
+        return outputs
+
+
+class BertPooler(nn.Layer):
+    def __init__(self, hidden_size):
+        super(BertPooler, self).__init__()
+        self.fc = nn.Linear(hidden_size, hidden_size)
+        self.act = nn.Tanh()
+
+    def forward(self, x):
+        first_token = x[:, 0]
+        pooled_output = self.fc(first_token)
+        pooled_output = self.act(pooled_output)
+        return pooled_output
+
+
+class METROEncoder(nn.Layer):
+    def __init__(self,
+                 vocab_size,
+                 num_hidden_layers,
+                 features_dims,
+                 position_embeddings_size,
+                 hidden_size,
+                 intermediate_size,
+                 output_feature_dim,
+                 num_attention_heads,
+                 attention_probs_dropout_prob,
+                 fc_dropout_prob,
+                 act_fn='ReLU',
+                 output_attentions=False,
+                 output_hidden_feats=False,
+                 use_img_layernorm=False):
+        super(METROEncoder, self).__init__()
+        self.img_dims = features_dims
+        self.num_hidden_layers = num_hidden_layers
+        self.use_img_layernorm = use_img_layernorm
+        self.output_attentions = output_attentions
+        self.embedding = BertEmbeddings(vocab_size, position_embeddings_size, 2,
+                                        hidden_size, fc_dropout_prob)
+        self.encoder = BertEncoder(
+            num_hidden_layers, hidden_size, intermediate_size,
+            num_attention_heads, attention_probs_dropout_prob, fc_dropout_prob,
+            act_fn, output_attentions, output_hidden_feats)
+        self.pooler = BertPooler(hidden_size)
+        self.position_embeddings = nn.Embedding(position_embeddings_size,
+                                                hidden_size)
+        self.img_embedding = nn.Linear(
+            features_dims, hidden_size, bias_attr=True)
+        self.dropout = nn.Dropout(fc_dropout_prob)
+        self.cls_head = nn.Linear(hidden_size, output_feature_dim)
+        self.residual = nn.Linear(features_dims, output_feature_dim)
+
+        self.apply(self.init_weights)
+
+    def init_weights(self, module):
+        """ Initialize the weights.
+        """
+        if isinstance(module, (nn.Linear, nn.Embedding)):
+            module.weight.set_value(
+                paddle.normal(
+                    mean=0.0, std=0.02, shape=module.weight.shape))
+        elif isinstance(module, nn.LayerNorm):
+            module.bias.set_value(paddle.zeros(shape=module.bias.shape))
+            module.weight.set_value(
+                paddle.full(
+                    shape=module.weight.shape, fill_value=1.0))
+        if isinstance(module, nn.Linear) and module.bias is not None:
+            module.bias.set_value(paddle.zeros(shape=module.bias.shape))
+
+    def forward(self, x):
+        batchsize, seq_len = paddle.shape(x)[:2]
+        input_ids = paddle.zeros((batchsize, seq_len), dtype="int64")
+        position_ids = paddle.arange(
+            seq_len, dtype="int64").unsqueeze(0).expand_as(input_ids)
+
+        attention_mask = paddle.ones_like(input_ids).unsqueeze(1).unsqueeze(2)
+        head_mask = [None] * self.num_hidden_layers
+
+        position_embs = self.position_embeddings(position_ids)
+        attention_mask = (1.0 - attention_mask) * -10000.0
+
+        img_features = self.img_embedding(x)
+
+        # We empirically observe that adding an additional learnable position embedding leads to more stable training
+        embeddings = position_embs + img_features
+        if self.use_img_layernorm:
+            embeddings = self.layernorm(embeddings)
+        embeddings = self.dropout(embeddings)
+
+        encoder_outputs = self.encoder(
+            embeddings, attention_mask, head_mask=head_mask)
+
+        pred_score = self.cls_head(encoder_outputs[0])
+        res_img_feats = self.residual(x)
+        pred_score = pred_score + res_img_feats
+
+        if self.output_attentions and self.output_hidden_feats:
+            return pred_score, encoder_outputs[1], encoder_outputs[-1]
+        else:
+            return pred_score
+
+
+def gelu(x):
+    """Implementation of the gelu activation function.
+        https://arxiv.org/abs/1606.08415
+    """
+    return x * 0.5 * (1.0 + paddle.erf(x / math.sqrt(2.0)))
+
+
+@register
+class TransEncoder(nn.Layer):
+    def __init__(self,
+                 vocab_size=30522,
+                 num_hidden_layers=4,
+                 num_attention_heads=4,
+                 position_embeddings_size=512,
+                 intermediate_size=3072,
+                 input_feat_dim=[2048, 512, 128],
+                 hidden_feat_dim=[1024, 256, 128],
+                 attention_probs_dropout_prob=0.1,
+                 fc_dropout_prob=0.1,
+                 act_fn='gelu',
+                 output_attentions=False,
+                 output_hidden_feats=False):
+        super(TransEncoder, self).__init__()
+        output_feat_dim = input_feat_dim[1:] + [3]
+        trans_encoder = []
+        for i in range(len(output_feat_dim)):
+            features_dims = input_feat_dim[i]
+            output_feature_dim = output_feat_dim[i]
+            hidden_size = hidden_feat_dim[i]
+
+            # init a transformer encoder and append it to a list
+            assert hidden_size % num_attention_heads == 0
+            model = METROEncoder(vocab_size, num_hidden_layers, features_dims,
+                                 position_embeddings_size, hidden_size,
+                                 intermediate_size, output_feature_dim,
+                                 num_attention_heads,
+                                 attention_probs_dropout_prob, fc_dropout_prob,
+                                 act_fn, output_attentions, output_hidden_feats)
+            trans_encoder.append(model)
+        self.trans_encoder = paddle.nn.Sequential(*trans_encoder)
+
+    def forward(self, x):
+        out = self.trans_encoder(x)
+        return out
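A shape walkthrough (toy sizes) of the multi-head reshapes used by BertSelfAttention in the new trans_encoder.py: the hidden axis is split into heads, scaled dot-product attention runs per head, and the heads are merged back:

    import math
    import paddle
    import paddle.nn.functional as F

    batch, seq_len, hidden, heads = 2, 17, 64, 4   # toy sizes
    head_size = hidden // heads
    q = k = v = paddle.randn([batch, seq_len, hidden])

    new_shape = [batch, seq_len, heads, head_size]
    q = q.reshape(new_shape).transpose((0, 2, 1, 3))   # [2, 4, 17, 16]
    k = k.reshape(new_shape).transpose((0, 2, 3, 1))   # [2, 4, 16, 17]
    v = v.reshape(new_shape).transpose((0, 2, 1, 3))
    attn = F.softmax(paddle.matmul(q, k) / math.sqrt(head_size), axis=-1)
    ctx = paddle.matmul(attn, v).transpose((0, 2, 1, 3)).reshape(
        [batch, seq_len, hidden])
    print(ctx.shape)  # [2, 17, 64]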

+ 29 - 11
paddlers/models/ppdet/modeling/backbones/vision_transformer.py

@@ -284,9 +284,9 @@ class RelativePositionBias(nn.Layer):
 
     def forward(self):
         relative_position_bias = \
-            self.relative_position_bias_table[self.relative_position_index.view(-1)].view(
+            self.relative_position_bias_table[self.relative_position_index.reshape([-1])].reshape([
                  self.window_size[0] * self.window_size[1] + 1,
-                 self.window_size[0] * self.window_size[1] + 1, -1)  # Wh*Ww,Wh*Ww,nH 
+                 self.window_size[0] * self.window_size[1] + 1, -1])  # Wh*Ww,Wh*Ww,nH 
         return relative_position_bias.transpose((2, 0, 1))  # nH, Wh*Ww, Wh*Ww
 
 
@@ -340,6 +340,7 @@ class VisionTransformer(nn.Layer):
                  use_abs_pos_emb=False,
                  use_sincos_pos_emb=True,
                  with_fpn=True,
+                 num_fpn_levels=4,
                  use_checkpoint=False,
                  **args):
         super().__init__()
@@ -350,6 +351,8 @@ class VisionTransformer(nn.Layer):
         self.use_sincos_pos_emb = use_sincos_pos_emb
         self.use_rel_pos_bias = use_rel_pos_bias
         self.final_norm = final_norm
+        self.out_indices = out_indices
+        self.num_fpn_levels = num_fpn_levels
 
         if use_checkpoint:
             paddle.seed(0)
@@ -415,14 +418,15 @@ class VisionTransformer(nn.Layer):
 
         assert len(out_indices) <= 4, ''
         self.out_indices = out_indices
-        self.out_channels = [embed_dim for _ in range(len(out_indices))]
-        self.out_strides = [4, 8, 16, 32][-len(out_indices):] if with_fpn else [
-            8 for _ in range(len(out_indices))
+        self.out_channels = [embed_dim for _ in range(num_fpn_levels)]
+        self.out_strides = [4, 8, 16, 32][-num_fpn_levels:] if with_fpn else [
+            patch_size for _ in range(len(out_indices))
         ]
 
         self.norm = Identity()
 
         if self.with_fpn:
+            assert num_fpn_levels <= 4, ''
             self.init_fpn(
                 embed_dim=embed_dim,
                 patch_size=patch_size, )
@@ -505,16 +509,24 @@ class VisionTransformer(nn.Layer):
         dim = x.shape[-1]
         # we add a small number to avoid floating point error in the interpolation
         # see discussion at https://github.com/facebookresearch/dino/issues/8
-        w0, h0 = w0 + 0.1, h0 + 0.1
+        # w0, h0 = w0 + 0.1, h0 + 0.1
+        # patch_pos_embed = nn.functional.interpolate(
+        #     patch_pos_embed.reshape([
+        #         1, self.patch_embed.num_patches_w,
+        #         self.patch_embed.num_patches_h, dim
+        #     ]).transpose((0, 3, 1, 2)),
+        #     scale_factor=(w0 / self.patch_embed.num_patches_w,
+        #                   h0 / self.patch_embed.num_patches_h),
+        #     mode='bicubic', )
 
         patch_pos_embed = nn.functional.interpolate(
             patch_pos_embed.reshape([
                 1, self.patch_embed.num_patches_w,
                 self.patch_embed.num_patches_h, dim
             ]).transpose((0, 3, 1, 2)),
-            scale_factor=(w0 / self.patch_embed.num_patches_w,
-                          h0 / self.patch_embed.num_patches_h),
+            (w0, h0),
             mode='bicubic', )
+
         assert int(w0) == patch_pos_embed.shape[-2] and int(
             h0) == patch_pos_embed.shape[-1]
         patch_pos_embed = patch_pos_embed.transpose(
@@ -611,9 +623,15 @@ class VisionTransformer(nn.Layer):
                 feats.append(xp)
 
         if self.with_fpn:
-            fpns = [self.fpn1, self.fpn2, self.fpn3, self.fpn4]
-            for i in range(len(feats)):
-                feats[i] = fpns[i](feats[i])
+            fpns = [self.fpn1, self.fpn2, self.fpn3, self.fpn4][
+                -self.num_fpn_levels:]
+            assert len(fpns) == len(feats) or len(feats) == 1, ''
+            outputs = []
+            for i, m in enumerate(fpns):
+                outputs.append(
+                    m(feats[i] if len(feats) == len(fpns) else feats[-1]))
+
+            return outputs
 
         return feats
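A small sketch (hypothetical patch-grid sizes) of the interpolation change above: passing the target grid size to F.interpolate instead of a fractional scale_factor guarantees the output matches the asserted shape exactly:

    import paddle
    import paddle.nn.functional as F

    dim, old_w, old_h, w0, h0 = 32, 14, 14, 20, 16   # hypothetical grids
    patch_pos_embed = paddle.randn([1, old_w * old_h, dim])
    patch_pos_embed = F.interpolate(
        patch_pos_embed.reshape([1, old_w, old_h, dim]).transpose((0, 3, 1, 2)),
        size=(w0, h0),
        mode='bicubic')
    print(patch_pos_embed.shape)  # [1, 32, 20, 16]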
 

+ 99 - 94
paddlers/models/ppdet/modeling/bbox_utils.py

@@ -17,7 +17,9 @@ import paddle
 import numpy as np
 
 
-def bbox2delta(src_boxes, tgt_boxes, weights):
+def bbox2delta(src_boxes, tgt_boxes, weights=[1.0, 1.0, 1.0, 1.0]):
+    """Encode bboxes to deltas.
+    """
     src_w = src_boxes[:, 2] - src_boxes[:, 0]
     src_h = src_boxes[:, 3] - src_boxes[:, 1]
     src_ctr_x = src_boxes[:, 0] + 0.5 * src_w
@@ -38,7 +40,11 @@ def bbox2delta(src_boxes, tgt_boxes, weights):
     return deltas
 
 
-def delta2bbox(deltas, boxes, weights):
+def delta2bbox(deltas, boxes, weights=[1.0, 1.0, 1.0, 1.0], max_shape=None):
+    """Decode deltas to boxes. Used in RCNNBox,CascadeHead,RCNNHead,RetinaHead.
+    Note: return tensor shape [n,1,4]
+        If you want to add a reshape, please add after the calling code instead of here.
+    """
     clip_scale = math.log(1000.0 / 16)
 
     widths = boxes[:, 2] - boxes[:, 0]
@@ -67,6 +73,96 @@ def delta2bbox(deltas, boxes, weights):
     pred_boxes.append(pred_ctr_y + 0.5 * pred_h)
     pred_boxes = paddle.stack(pred_boxes, axis=-1)
 
+    if max_shape is not None:
+        pred_boxes[..., 0::2] = pred_boxes[..., 0::2].clip(
+            min=0, max=max_shape[1])
+        pred_boxes[..., 1::2] = pred_boxes[..., 1::2].clip(
+            min=0, max=max_shape[0])
+    return pred_boxes
+
+
+def bbox2delta_v2(src_boxes,
+                  tgt_boxes,
+                  delta_mean=[0.0, 0.0, 0.0, 0.0],
+                  delta_std=[1.0, 1.0, 1.0, 1.0]):
+    """Encode bboxes to deltas.
+    Modified from bbox2delta(), which only multiplies the deltas by weight parameters.
+    """
+    src_w = src_boxes[:, 2] - src_boxes[:, 0]
+    src_h = src_boxes[:, 3] - src_boxes[:, 1]
+    src_ctr_x = src_boxes[:, 0] + 0.5 * src_w
+    src_ctr_y = src_boxes[:, 1] + 0.5 * src_h
+
+    tgt_w = tgt_boxes[:, 2] - tgt_boxes[:, 0]
+    tgt_h = tgt_boxes[:, 3] - tgt_boxes[:, 1]
+    tgt_ctr_x = tgt_boxes[:, 0] + 0.5 * tgt_w
+    tgt_ctr_y = tgt_boxes[:, 1] + 0.5 * tgt_h
+
+    dx = (tgt_ctr_x - src_ctr_x) / src_w
+    dy = (tgt_ctr_y - src_ctr_y) / src_h
+    dw = paddle.log(tgt_w / src_w)
+    dh = paddle.log(tgt_h / src_h)
+
+    deltas = paddle.stack((dx, dy, dw, dh), axis=1)
+    deltas = (
+        deltas - paddle.to_tensor(delta_mean)) / paddle.to_tensor(delta_std)
+    return deltas
+
+
+def delta2bbox_v2(deltas,
+                  boxes,
+                  delta_mean=[0.0, 0.0, 0.0, 0.0],
+                  delta_std=[1.0, 1.0, 1.0, 1.0],
+                  max_shape=None,
+                  ctr_clip=32.0):
+    """Decode deltas to bboxes.
+    Modified from delta2bbox(), which only divides the deltas by weight parameters.
+    Used in YOLOFHead.
+    Note: the returned tensor has shape [n, 1, 4].
+        If a reshape is needed, add it in the calling code rather than here.
+    """
+    clip_scale = math.log(1000.0 / 16)
+
+    widths = boxes[:, 2] - boxes[:, 0]
+    heights = boxes[:, 3] - boxes[:, 1]
+    ctr_x = boxes[:, 0] + 0.5 * widths
+    ctr_y = boxes[:, 1] + 0.5 * heights
+
+    deltas = deltas * paddle.to_tensor(delta_std) + paddle.to_tensor(delta_mean)
+    dx = deltas[:, 0::4]
+    dy = deltas[:, 1::4]
+    dw = deltas[:, 2::4]
+    dh = deltas[:, 3::4]
+
+    # Prevent sending too large values into paddle.exp()
+    dx = dx * widths.unsqueeze(1)
+    dy = dy * heights.unsqueeze(1)
+    if ctr_clip is not None:
+        dx = paddle.clip(dx, max=ctr_clip, min=-ctr_clip)
+        dy = paddle.clip(dy, max=ctr_clip, min=-ctr_clip)
+        dw = paddle.clip(dw, max=clip_scale)
+        dh = paddle.clip(dh, max=clip_scale)
+    else:
+        dw = dw.clip(min=-clip_scale, max=clip_scale)
+        dh = dh.clip(min=-clip_scale, max=clip_scale)
+
+    pred_ctr_x = dx + ctr_x.unsqueeze(1)
+    pred_ctr_y = dy + ctr_y.unsqueeze(1)
+    pred_w = paddle.exp(dw) * widths.unsqueeze(1)
+    pred_h = paddle.exp(dh) * heights.unsqueeze(1)
+
+    pred_boxes = []
+    pred_boxes.append(pred_ctr_x - 0.5 * pred_w)
+    pred_boxes.append(pred_ctr_y - 0.5 * pred_h)
+    pred_boxes.append(pred_ctr_x + 0.5 * pred_w)
+    pred_boxes.append(pred_ctr_y + 0.5 * pred_h)
+    pred_boxes = paddle.stack(pred_boxes, axis=-1)
+
+    if max_shape is not None:
+        pred_boxes[..., 0::2] = pred_boxes[..., 0::2].clip(
+            min=0, max=max_shape[1])
+        pred_boxes[..., 1::2] = pred_boxes[..., 1::2].clip(
+            min=0, max=max_shape[0])
     return pred_boxes
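A quick round-trip check (made-up boxes, unit weights/means/stds) of the encode/decode pairs above; the formulas are inlined here rather than imported, so treat it as a sketch of the math, not of the module API:

    import paddle

    src = paddle.to_tensor([[10., 10., 50., 50.]])   # proposal
    tgt = paddle.to_tensor([[12., 8., 60., 56.]])    # ground truth

    # encode (bbox2delta with all weights = 1.0)
    sw, sh = src[:, 2] - src[:, 0], src[:, 3] - src[:, 1]
    scx, scy = src[:, 0] + 0.5 * sw, src[:, 1] + 0.5 * sh
    tw, th = tgt[:, 2] - tgt[:, 0], tgt[:, 3] - tgt[:, 1]
    tcx, tcy = tgt[:, 0] + 0.5 * tw, tgt[:, 1] + 0.5 * th
    deltas = paddle.stack([(tcx - scx) / sw, (tcy - scy) / sh,
                           paddle.log(tw / sw), paddle.log(th / sh)], axis=1)

    # decode (delta2bbox with all weights = 1.0)
    pcx, pcy = deltas[:, 0] * sw + scx, deltas[:, 1] * sh + scy
    pw, ph = paddle.exp(deltas[:, 2]) * sw, paddle.exp(deltas[:, 3]) * sh
    decoded = paddle.stack([pcx - 0.5 * pw, pcy - 0.5 * ph,
                            pcx + 0.5 * pw, pcy + 0.5 * ph], axis=1)
    print(decoded.numpy())  # ~[[12., 8., 60., 56.]]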
 
 
@@ -269,8 +365,7 @@ def decode_yolo(box, anchor, downsample_ratio):
     x1 = (x + grid[:, :, :, :, 0:1]) / grid_w
     y1 = (y + grid[:, :, :, :, 1:2]) / grid_h
 
-    anchor = paddle.to_tensor(anchor)
-    anchor = paddle.cast(anchor, x.dtype)
+    anchor = paddle.to_tensor(anchor, dtype=x.dtype)
     anchor = anchor.reshape((1, na, 1, 1, 2))
     w1 = paddle.exp(w) * anchor[:, :, :, :, 0:1] / (downsample_ratio * grid_w)
     h1 = paddle.exp(h) * anchor[:, :, :, :, 1:2] / (downsample_ratio * grid_h)
@@ -489,96 +584,6 @@ def batch_distance2bbox(points, distance, max_shapes=None):
     return out_bbox
 
 
-def delta2bbox_v2(rois,
-                  deltas,
-                  means=(0.0, 0.0, 0.0, 0.0),
-                  stds=(1.0, 1.0, 1.0, 1.0),
-                  max_shape=None,
-                  wh_ratio_clip=16.0 / 1000.0,
-                  ctr_clip=None):
-    """Transform network output(delta) to bboxes.
-    Based on https://github.com/open-mmlab/mmdetection/blob/master/mmdet/core/
-             bbox/coder/delta_xywh_bbox_coder.py
-    Args:
-        rois (Tensor): shape [..., 4], base bboxes, typical examples include
-            anchor and rois
-        deltas (Tensor): shape [..., 4], offset relative to base bboxes
-        means (list[float]): the mean that was used to normalize deltas,
-            must be of size 4
-        stds (list[float]): the std that was used to normalize deltas,
-            must be of size 4
-        max_shape (list[float] or None): height and width of image, will be
-            used to clip bboxes if not None
-        wh_ratio_clip (float): to clip delta wh of decoded bboxes
-        ctr_clip (float or None): whether to clip delta xy of decoded bboxes
-    """
-    if rois.size == 0:
-        return paddle.empty_like(rois)
-    means = paddle.to_tensor(means)
-    stds = paddle.to_tensor(stds)
-    deltas = deltas * stds + means
-
-    dxy = deltas[..., :2]
-    dwh = deltas[..., 2:]
-
-    pxy = (rois[..., :2] + rois[..., 2:]) * 0.5
-    pwh = rois[..., 2:] - rois[..., :2]
-    dxy_wh = pwh * dxy
-
-    max_ratio = np.abs(np.log(wh_ratio_clip))
-    if ctr_clip is not None:
-        dxy_wh = paddle.clip(dxy_wh, max=ctr_clip, min=-ctr_clip)
-        dwh = paddle.clip(dwh, max=max_ratio)
-    else:
-        dwh = dwh.clip(min=-max_ratio, max=max_ratio)
-
-    gxy = pxy + dxy_wh
-    gwh = pwh * dwh.exp()
-    x1y1 = gxy - (gwh * 0.5)
-    x2y2 = gxy + (gwh * 0.5)
-    bboxes = paddle.concat([x1y1, x2y2], axis=-1)
-    if max_shape is not None:
-        bboxes[..., 0::2] = bboxes[..., 0::2].clip(min=0, max=max_shape[1])
-        bboxes[..., 1::2] = bboxes[..., 1::2].clip(min=0, max=max_shape[0])
-    return bboxes
-
-
-def bbox2delta_v2(src_boxes,
-                  tgt_boxes,
-                  means=(0.0, 0.0, 0.0, 0.0),
-                  stds=(1.0, 1.0, 1.0, 1.0)):
-    """Encode bboxes to deltas.
-    Modified from ppdet.modeling.bbox_utils.bbox2delta.
-    Args:
-        src_boxes (Tensor[..., 4]): base bboxes
-        tgt_boxes (Tensor[..., 4]): target bboxes
-        means (list[float]): the mean that will be used to normalize delta
-        stds (list[float]): the std that will be used to normalize delta
-    """
-    if src_boxes.size == 0:
-        return paddle.empty_like(src_boxes)
-    src_w = src_boxes[..., 2] - src_boxes[..., 0]
-    src_h = src_boxes[..., 3] - src_boxes[..., 1]
-    src_ctr_x = src_boxes[..., 0] + 0.5 * src_w
-    src_ctr_y = src_boxes[..., 1] + 0.5 * src_h
-
-    tgt_w = tgt_boxes[..., 2] - tgt_boxes[..., 0]
-    tgt_h = tgt_boxes[..., 3] - tgt_boxes[..., 1]
-    tgt_ctr_x = tgt_boxes[..., 0] + 0.5 * tgt_w
-    tgt_ctr_y = tgt_boxes[..., 1] + 0.5 * tgt_h
-
-    dx = (tgt_ctr_x - src_ctr_x) / src_w
-    dy = (tgt_ctr_y - src_ctr_y) / src_h
-    dw = paddle.log(tgt_w / src_w)
-    dh = paddle.log(tgt_h / src_h)
-
-    deltas = paddle.stack((dx, dy, dw, dh), axis=1)  # [n, 4]
-    means = paddle.to_tensor(means, place=src_boxes.place)
-    stds = paddle.to_tensor(stds, place=src_boxes.place)
-    deltas = (deltas - means) / stds
-    return deltas
-
-
 def iou_similarity(box1, box2, eps=1e-10):
     """Calculate iou of box1 and box2
 

+ 13 - 0
paddlers/models/ppdet/modeling/heads/__init__.py

@@ -33,6 +33,12 @@ from . import sparsercnn_head
 from . import tood_head
 from . import retina_head
 from . import ppyoloe_head
+from . import fcosr_head
+from . import ppyoloe_r_head
+from . import yolof_head
+from . import ppyoloe_contrast_head
+from . import centertrack_head
+from . import sparse_roi_head
 
 from .bbox_head import *
 from .mask_head import *
@@ -55,3 +61,10 @@ from .sparsercnn_head import *
 from .tood_head import *
 from .retina_head import *
 from .ppyoloe_head import *
+from .fcosr_head import *
+from .ppyoloe_r_head import *
+from .yolof_head import *
+from .ppyoloe_contrast_head import *
+from .centertrack_head import *
+from .sparse_roi_head import *
+from .petr_head import *

+ 51 - 12
paddlers/models/ppdet/modeling/heads/bbox_head.py

@@ -160,8 +160,8 @@ class XConvNormHead(nn.Layer):
 
 @register
 class BBoxHead(nn.Layer):
-    __shared__ = ['num_classes']
-    __inject__ = ['bbox_assigner', 'bbox_loss']
+    __shared__ = ['num_classes', 'use_cot']
+    __inject__ = ['bbox_assigner', 'bbox_loss', 'loss_cot']
     """
     RCNN bbox head
 
@@ -173,7 +173,10 @@ class BBoxHead(nn.Layer):
             box.
         with_pool (bool): Whether to use pooling for the RoI feature.
         num_classes (int): The number of classes
-        bbox_weight (List[float]): The weight to get the decode box 
+        bbox_weight (List[float]): The weight to get the decode box
+        cot_classes (int): The number of base classes used for label co-tuning
+        loss_cot (object): The label co-tuning loss module
+        use_cot (bool): Whether to use label co-tuning
     """
 
     def __init__(self,
@@ -185,7 +188,10 @@ class BBoxHead(nn.Layer):
                  num_classes=80,
                  bbox_weight=[10., 10., 5., 5.],
                  bbox_loss=None,
-                 loss_normalize_pos=False):
+                 loss_normalize_pos=False,
+                 cot_classes=None,
+                 loss_cot='COTLoss',
+                 use_cot=False):
         super(BBoxHead, self).__init__()
         self.head = head
         self.roi_extractor = roi_extractor
@@ -199,11 +205,29 @@ class BBoxHead(nn.Layer):
         self.bbox_loss = bbox_loss
         self.loss_normalize_pos = loss_normalize_pos
 
-        self.bbox_score = nn.Linear(
-            in_channel,
-            self.num_classes + 1,
-            weight_attr=paddle.ParamAttr(initializer=Normal(
-                mean=0.0, std=0.01)))
+        self.loss_cot = loss_cot
+        self.cot_relation = None
+        self.cot_classes = cot_classes
+        self.use_cot = use_cot
+        if use_cot:
+            self.cot_bbox_score = nn.Linear(
+                in_channel,
+                self.num_classes + 1,
+                weight_attr=paddle.ParamAttr(initializer=Normal(
+                    mean=0.0, std=0.01)))
+            
+            self.bbox_score = nn.Linear(
+                in_channel,
+                self.cot_classes + 1,
+                weight_attr=paddle.ParamAttr(initializer=Normal(
+                    mean=0.0, std=0.01)))
+            self.cot_bbox_score.skip_quant = True
+        else:
+            self.bbox_score = nn.Linear(
+                in_channel,
+                self.num_classes + 1,
+                weight_attr=paddle.ParamAttr(initializer=Normal(
+                    mean=0.0, std=0.01)))
         self.bbox_score.skip_quant = True
 
         self.bbox_delta = nn.Linear(
@@ -215,6 +239,9 @@ class BBoxHead(nn.Layer):
         self.assigned_label = None
         self.assigned_rois = None
 
+    def init_cot_head(self, relationship):
+        self.cot_relation = relationship
+        
     @classmethod
     def from_config(cls, cfg, input_shape):
         roi_pooler = cfg['roi_extractor']
@@ -229,7 +256,7 @@ class BBoxHead(nn.Layer):
             'in_channel': head.out_shape[0].channels
         }
 
-    def forward(self, body_feats=None, rois=None, rois_num=None, inputs=None):
+    def forward(self, body_feats=None, rois=None, rois_num=None, inputs=None, cot=False):
         """
         body_feats (list[Tensor]): Feature maps from backbone
         rois (list[Tensor]): RoIs generated from RPN module
@@ -248,7 +275,11 @@ class BBoxHead(nn.Layer):
             feat = paddle.squeeze(feat, axis=[2, 3])
         else:
             feat = bbox_feat
-        scores = self.bbox_score(feat)
+        if self.use_cot:
+            scores = self.cot_bbox_score(feat)
+            cot_scores = self.bbox_score(feat)
+        else:
+            scores = self.bbox_score(feat)
         deltas = self.bbox_delta(feat)
 
         if self.training:
@@ -259,11 +290,19 @@ class BBoxHead(nn.Layer):
                 rois,
                 self.bbox_weight,
                 loss_normalize_pos=self.loss_normalize_pos)
+            
+            if self.cot_relation is not None:
+                loss_cot = self.loss_cot(cot_scores, targets, self.cot_relation)
+                loss.update(loss_cot)
             return loss, bbox_feat
         else:
-            pred = self.get_prediction(scores, deltas)
+            if cot:
+                pred = self.get_prediction(cot_scores, deltas)
+            else:
+                pred = self.get_prediction(scores, deltas)
             return pred, self.head
 
+
     def get_loss(self,
                  scores,
                  deltas,
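A minimal sketch (hypothetical channel and class counts) of the two classification branches BBoxHead gains for label co-tuning: with use_cot enabled, cot_bbox_score scores the target classes while bbox_score scores the larger base-class set that feeds loss_cot during training:

    import paddle
    import paddle.nn as nn

    in_channel, num_classes, cot_classes = 1024, 20, 80   # assumed sizes
    cot_bbox_score = nn.Linear(in_channel, num_classes + 1)   # target-class scores
    bbox_score = nn.Linear(in_channel, cot_classes + 1)       # base-class scores

    feat = paddle.randn([8, in_channel])
    scores = cot_bbox_score(feat)   # regular detection scores
    cot_scores = bbox_score(feat)   # consumed by the co-tuning loss
    print(scores.shape, cot_scores.shape)  # [8, 21] [8, 81]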

+ 42 - 40
paddlers/models/ppdet/modeling/heads/centernet_head.py

@@ -61,13 +61,12 @@ class CenterNetHead(nn.Layer):
         in_channels (int): the channel number of input to CenterNetHead.
         num_classes (int): the number of classes, 80 (COCO dataset) by default.
         head_planes (int): the channel number in all head, 256 by default.
-        heatmap_weight (float): the weight of heatmap loss, 1 by default.
+        prior_bias (float): the prior bias of the heatmap head, -2.19 by default (-4.6 in CenterTrack).
         regress_ltrb (bool): whether to regress left/top/right/bottom or
-            width/height for a box, true by default
-        size_weight (float): the weight of box size loss, 0.1 by default.
-        size_loss (): the type of size regression loss, 'L1 loss' by default.
-        offset_weight (float): the weight of center offset loss, 1 by default.
-        iou_weight (float): the weight of iou head loss, 0 by default.
+            width/height for a box, True by default.
+        size_loss (str): the type of size regression loss, 'L1' by default, can be 'giou'.
+        loss_weight (dict): the weight of each loss.
+        add_iou (bool): whether to add iou branch, False by default.
     """
 
     __shared__ = ['num_classes']
@@ -76,20 +75,20 @@ class CenterNetHead(nn.Layer):
                  in_channels,
                  num_classes=80,
                  head_planes=256,
-                 heatmap_weight=1,
+                 prior_bias=-2.19,
                  regress_ltrb=True,
-                 size_weight=0.1,
                  size_loss='L1',
-                 offset_weight=1,
-                 iou_weight=0):
+                 loss_weight={
+                     'heatmap': 1.0,
+                     'size': 0.1,
+                     'offset': 1.0,
+                     'iou': 0.0,
+                 },
+                 add_iou=False):
         super(CenterNetHead, self).__init__()
         self.regress_ltrb = regress_ltrb
-        self.weights = {
-            'heatmap': heatmap_weight,
-            'size': size_weight,
-            'offset': offset_weight,
-            'iou': iou_weight
-        }
+        self.loss_weight = loss_weight
+        self.add_iou = add_iou
 
         # heatmap head
         self.heatmap = nn.Sequential(
@@ -104,7 +103,7 @@ class CenterNetHead(nn.Layer):
                 padding=0,
                 bias=True))
         with paddle.no_grad():
-            self.heatmap[2].conv.bias[:] = -2.19
+            self.heatmap[2].conv.bias[:] = prior_bias
 
         # size(ltrb or wh) head
         self.size = nn.Sequential(
@@ -129,7 +128,7 @@ class CenterNetHead(nn.Layer):
                 head_planes, 2, kernel_size=1, stride=1, padding=0, bias=True))
 
         # iou head (optional)
-        if iou_weight > 0:
+        if self.add_iou and 'iou' in self.loss_weight:
             self.iou = nn.Sequential(
                 ConvLayer(
                     in_channels,
@@ -153,34 +152,34 @@ class CenterNetHead(nn.Layer):
         return {'in_channels': input_shape.channels}
 
     def forward(self, feat, inputs):
-        heatmap = self.heatmap(feat)
+        heatmap = F.sigmoid(self.heatmap(feat))
         size = self.size(feat)
         offset = self.offset(feat)
-        iou = self.iou(feat) if hasattr(self, 'iou_weight') else None
+        head_outs = {'heatmap': heatmap, 'size': size, 'offset': offset}
+        if self.add_iou and 'iou' in self.loss_weight:
+            iou = self.iou(feat)
+            head_outs.update({'iou': iou})
 
         if self.training:
-            loss = self.get_loss(
-                inputs, self.weights, heatmap, size, offset, iou=iou)
-            return loss
+            losses = self.get_loss(inputs, self.loss_weight, head_outs)
+            return losses
         else:
-            heatmap = F.sigmoid(heatmap)
-            head_outs = {'heatmap': heatmap, 'size': size, 'offset': offset}
-            if iou is not None:
-                head_outs.update({'iou': iou})
             return head_outs
 
-    def get_loss(self, inputs, weights, heatmap, size, offset, iou=None):
-        # heatmap head loss: CTFocalLoss
+    def get_loss(self, inputs, weights, head_outs):
+        # 1.heatmap(hm) head loss: CTFocalLoss
+        heatmap = head_outs['heatmap']
         heatmap_target = inputs['heatmap']
-        heatmap = paddle.clip(F.sigmoid(heatmap), 1e-4, 1 - 1e-4)
+        heatmap = paddle.clip(heatmap, 1e-4, 1 - 1e-4)
         ctfocal_loss = CTFocalLoss()
         heatmap_loss = ctfocal_loss(heatmap, heatmap_target)
 
-        # size head loss: L1 loss or GIoU loss
+        # 2.size(wh) head loss: L1 loss or GIoU loss
+        size = head_outs['size']
         index = inputs['index']
         mask = inputs['index_mask']
         size = paddle.transpose(size, perm=[0, 2, 3, 1])
-        size_n, size_h, size_w, size_c = size.shape
+        size_n, _, _, size_c = size.shape
         size = paddle.reshape(size, shape=[size_n, -1, size_c])
         index = paddle.unsqueeze(index, 2)
         batch_inds = list()
@@ -208,7 +207,8 @@ class CenterNetHead(nn.Layer):
                 else:
                     # inputs['size'] is ltrb, but regress as wh
                     # shape: [bs, max_per_img, 4]
-                    size_target = inputs['size'][:, :, 0:2] + inputs['size'][:, :, 2:]
+                    size_target = inputs['size'][:, :, 0:2] + inputs[
+                        'size'][:, :, 2:]
 
             size_target.stop_gradient = True
             size_loss = F.l1_loss(
@@ -232,10 +232,11 @@ class CenterNetHead(nn.Layer):
                 loc_reweight=None)
             size_loss = size_loss / (pos_num + 1e-4)
 
-        # offset head loss: L1 loss
+        # 3.offset(reg) head loss: L1 loss
+        offset = head_outs['offset']
         offset_target = inputs['offset']
         offset = paddle.transpose(offset, perm=[0, 2, 3, 1])
-        offset_n, offset_h, offset_w, offset_c = offset.shape
+        offset_n, _, _, offset_c = offset.shape
         offset = paddle.reshape(offset, shape=[offset_n, -1, offset_c])
         pos_offset = paddle.gather_nd(offset, index=index)
         offset_mask = paddle.expand_as(mask, pos_offset)
@@ -249,10 +250,11 @@ class CenterNetHead(nn.Layer):
             reduction='sum')
         offset_loss = offset_loss / (pos_num + 1e-4)
 
-        # iou head loss: GIoU loss
-        if iou is not None:
+        # 4.iou head loss: GIoU loss (optional)
+        if self.add_iou and 'iou' in self.loss_weight:
+            iou = head_outs['iou']
             iou = paddle.transpose(iou, perm=[0, 2, 3, 1])
-            iou_n, iou_h, iou_w, iou_c = iou.shape
+            iou_n, _, _, iou_c = iou.shape
             iou = paddle.reshape(iou, shape=[iou_n, -1, iou_c])
             pos_iou = paddle.gather_nd(iou, index=index)
             iou_mask = paddle.expand_as(mask, pos_iou)
@@ -284,8 +286,8 @@ class CenterNetHead(nn.Layer):
         det_loss = weights['heatmap'] * heatmap_loss + weights[
             'size'] * size_loss + weights['offset'] * offset_loss
 
-        if iou is not None:
+        if self.add_iou and 'iou' in self.loss_weight:
             losses.update({'iou_loss': iou_loss})
-            det_loss = det_loss + weights['iou'] * iou_loss
+            det_loss += weights['iou'] * iou_loss
         losses.update({'det_loss': det_loss})
         return losses
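A toy-sized sketch of the gather pattern used throughout the refactored get_loss above: each head output is flattened to [N, H*W, C] and paddle.gather_nd pulls the prediction at every annotated center index (all sizes below are illustrative):

    import paddle

    n, c, h, w, max_objs = 2, 2, 4, 4, 3                 # toy sizes
    size_out = paddle.randn([n, c, h, w])                # e.g. the size head output
    index = paddle.randint(0, h * w, [n, max_objs])      # flat gt-center indices

    size_out = paddle.transpose(size_out, perm=[0, 2, 3, 1]).reshape([n, -1, c])
    index = paddle.unsqueeze(index, 2)                                    # [2, 3, 1]
    batch_inds = paddle.arange(n).reshape([n, 1, 1]).expand([n, max_objs, 1])
    index = paddle.concat([batch_inds, index], axis=2)                    # [2, 3, 2]
    pos_size = paddle.gather_nd(size_out, index=index)
    print(pos_size.shape)  # [2, 3, 2]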

Some files were not shown because too many files changed in this diff