
【Hackathon + No.150】Add Rotated Box Detection Functionality (#100)

Asthestarsfalll 2 years ago
parent
commit
c6f15f726f
100 changed files with 8559 additions and 909 deletions
  1. 3 1
      docs/apis/data_cn.md
  2. 3 1
      docs/apis/data_en.md
  3. 3 1
      docs/apis/train_cn.md
  4. 3 1
      docs/apis/train_en.md
  5. 3 2
      docs/intro/data_prep_cn.md
  6. 3 2
      docs/intro/data_prep_en.md
  7. 1 0
      docs/intro/model_cons_params_cn.md
  8. 1 0
      docs/intro/model_cons_params_en.md
  9. 1 0
      docs/intro/model_zoo_cn.md
  10. 1 0
      docs/intro/model_zoo_en.md
  11. 9 0
      docs/quick_start_cn.md
  12. 9 0
      docs/quick_start_en.md
  13. 31 5
      paddlers/datasets/base.py
  14. 3 2
      paddlers/datasets/cd_dataset.py
  15. 4 2
      paddlers/datasets/clas_dataset.py
  16. 49 29
      paddlers/datasets/coco.py
  17. 3 2
      paddlers/datasets/res_dataset.py
  18. 3 2
      paddlers/datasets/seg_dataset.py
  19. 5 2
      paddlers/datasets/voc.py
  20. 9 0
      paddlers/models/ppdet/core/workspace.py
  21. 1 1
      paddlers/models/ppdet/data/crop_utils/__init__.py
  22. 91 53
      paddlers/models/ppdet/data/crop_utils/annotation_cropper.py
  23. 10 6
      paddlers/models/ppdet/data/crop_utils/chip_box_utils.py
  24. 309 0
      paddlers/models/ppdet/data/reader.py
  25. 1 0
      paddlers/models/ppdet/data/source/__init__.py
  26. 3 0
      paddlers/models/ppdet/data/source/category.py
  27. 237 3
      paddlers/models/ppdet/data/source/coco.py
  28. 9 1
      paddlers/models/ppdet/data/source/dataset.py
  29. 84 29
      paddlers/models/ppdet/data/source/keypoint_coco.py
  30. 380 0
      paddlers/models/ppdet/data/source/pose3d_cmb.py
  31. 2 0
      paddlers/models/ppdet/data/transform/__init__.py
  32. 159 7
      paddlers/models/ppdet/data/transform/atss_assigner.py
  33. 359 42
      paddlers/models/ppdet/data/transform/batch_operators.py
  34. 832 85
      paddlers/models/ppdet/data/transform/keypoint_operators.py
  35. 296 0
      paddlers/models/ppdet/data/transform/keypoints_3d_operators.py
  36. 500 71
      paddlers/models/ppdet/data/transform/operators.py
  37. 7 0
      paddlers/models/ppdet/engine/__init__.py
  38. 111 47
      paddlers/models/ppdet/engine/callbacks.py
  39. 54 6
      paddlers/models/ppdet/engine/export_utils.py
  40. 107 10
      paddlers/models/ppdet/engine/tracker.py
  41. 147 30
      paddlers/models/ppdet/engine/trainer.py
  42. 42 0
      paddlers/models/ppdet/engine/trainer_cot.py
  43. 475 0
      paddlers/models/ppdet/engine/trainer_ssod.py
  44. 18 17
      paddlers/models/ppdet/ext_op/csrc/matched_rbox_iou/matched_rbox_iou.cc
  45. 9 14
      paddlers/models/ppdet/ext_op/csrc/matched_rbox_iou/matched_rbox_iou.cu
  46. 121 0
      paddlers/models/ppdet/ext_op/csrc/nms_rotated/nms_rotated.cc
  47. 96 0
      paddlers/models/ppdet/ext_op/csrc/nms_rotated/nms_rotated.cu
  48. 95 0
      paddlers/models/ppdet/ext_op/csrc/rbox_iou/rbox_iou.cc
  49. 6 11
      paddlers/models/ppdet/ext_op/csrc/rbox_iou/rbox_iou.cu
  50. 0 97
      paddlers/models/ppdet/ext_op/csrc/rbox_iou/rbox_iou_op.cc
  51. 12 4
      paddlers/models/ppdet/ext_op/csrc/rbox_iou/rbox_iou_utils.h
  52. 1 1
      paddlers/models/ppdet/hash.txt
  53. 2 1
      paddlers/models/ppdet/metrics/__init__.py
  54. 6 2
      paddlers/models/ppdet/metrics/coco_utils.py
  55. 16 0
      paddlers/models/ppdet/metrics/json_results.py
  56. 1 1
      paddlers/models/ppdet/metrics/metrics.py
  57. 200 0
      paddlers/models/ppdet/metrics/pose3d_metrics.py
  58. 2 0
      paddlers/models/ppdet/modeling/__init__.py
  59. 11 0
      paddlers/models/ppdet/modeling/architectures/__init__.py
  60. 35 9
      paddlers/models/ppdet/modeling/architectures/blazeface.py
  61. 1 1
      paddlers/models/ppdet/modeling/architectures/cascade_rcnn.py
  62. 11 16
      paddlers/models/ppdet/modeling/architectures/centernet.py
  63. 176 0
      paddlers/models/ppdet/modeling/architectures/centertrack.py
  64. 13 5
      paddlers/models/ppdet/modeling/architectures/detr.py
  65. 61 5
      paddlers/models/ppdet/modeling/architectures/faster_rcnn.py
  66. 30 39
      paddlers/models/ppdet/modeling/architectures/fcos.py
  67. 207 6
      paddlers/models/ppdet/modeling/architectures/keypoint_hrnet.py
  68. 217 0
      paddlers/models/ppdet/modeling/architectures/keypoint_petr.py
  69. 22 5
      paddlers/models/ppdet/modeling/architectures/mask_rcnn.py
  70. 2 1
      paddlers/models/ppdet/modeling/architectures/meta_arch.py
  71. 114 0
      paddlers/models/ppdet/modeling/architectures/pose3d_metro.py
  72. 260 0
      paddlers/models/ppdet/modeling/architectures/ppyoloe.py
  73. 104 0
      paddlers/models/ppdet/modeling/architectures/queryinst.py
  74. 18 2
      paddlers/models/ppdet/modeling/architectures/retinanet.py
  75. 3 3
      paddlers/models/ppdet/modeling/architectures/sparse_rcnn.py
  76. 35 9
      paddlers/models/ppdet/modeling/architectures/ssd.py
  77. 28 5
      paddlers/models/ppdet/modeling/architectures/yolo.py
  78. 88 0
      paddlers/models/ppdet/modeling/architectures/yolof.py
  79. 10 0
      paddlers/models/ppdet/modeling/assigners/__init__.py
  80. 16 6
      paddlers/models/ppdet/modeling/assigners/atss_assigner.py
  81. 227 0
      paddlers/models/ppdet/modeling/assigners/fcosr_assigner.py
  82. 316 0
      paddlers/models/ppdet/modeling/assigners/hungarian_assigner.py
  83. 275 0
      paddlers/models/ppdet/modeling/assigners/pose_utils.py
  84. 164 0
      paddlers/models/ppdet/modeling/assigners/rotated_task_aligned_assigner.py
  85. 1 1
      paddlers/models/ppdet/modeling/assigners/simota_assigner.py
  86. 38 4
      paddlers/models/ppdet/modeling/assigners/task_aligned_assigner.py
  87. 182 0
      paddlers/models/ppdet/modeling/assigners/task_aligned_assigner_cr.py
  88. 93 0
      paddlers/models/ppdet/modeling/assigners/uniform_assigner.py
  89. 8 3
      paddlers/models/ppdet/modeling/assigners/utils.py
  90. 2 0
      paddlers/models/ppdet/modeling/backbones/__init__.py
  91. 49 9
      paddlers/models/ppdet/modeling/backbones/dla.py
  92. 144 2
      paddlers/models/ppdet/modeling/backbones/hrnet.py
  93. 5 0
      paddlers/models/ppdet/modeling/backbones/lite_hrnet.py
  94. 30 30
      paddlers/models/ppdet/modeling/backbones/resnet.py
  95. 381 0
      paddlers/models/ppdet/modeling/backbones/trans_encoder.py
  96. 29 11
      paddlers/models/ppdet/modeling/backbones/vision_transformer.py
  97. 99 94
      paddlers/models/ppdet/modeling/bbox_utils.py
  98. 13 0
      paddlers/models/ppdet/modeling/heads/__init__.py
  99. 51 12
      paddlers/models/ppdet/modeling/heads/bbox_head.py
  100. 42 40
      paddlers/models/ppdet/modeling/heads/centernet_head.py

+ 3 - 1
docs/apis/data_cn.md

@@ -57,13 +57,14 @@
 |-------|----|--------|-----|
 |`data_dir`|`str`|数据集存放目录。||
 |`image_dir`|`str`|输入图像存放目录。||
-|`ann_path`|`str`|[COCO格式](https://cocodataset.org/#home)标注文件路径。||
+|`anno_path`|`str`|[COCO格式](https://cocodataset.org/#home)标注文件路径。||
 |`transforms`|`paddlers.transforms.Compose`|对输入数据应用的数据变换算子。||
 |`label_list`|`str` \| `None`|label list文件。label list是一个文本文件,其中每一行包含一个类别的名称。|`None`|
 |`num_workers`|`int` \| `str`|加载数据时使用的辅助进程数。若设置为`'auto'`,则按照如下规则确定使用进程数:当CPU核心数大于16时,使用8个数据读取辅助进程;否则,使用CPU核心数一半数量的辅助进程。|`'auto'`|
 |`shuffle`|`bool`|是否随机打乱数据集中的样本。|`False`|
 |`allow_empty`|`bool`|是否向数据集中添加负样本。|`False`|
 |`empty_ratio`|`float`|负样本占比,仅当`allow_empty`为`True`时生效。若`empty_ratio`为负值或大于等于1,则保留所有生成的负样本。|`1.0`|
+|`batch_transforms`|`paddlers.transforms.BatchCompose`|对输入数据应用的批数据变换算子。||

 ### VOC格式目标检测数据集`VOCDetDataset`

@@ -81,6 +82,7 @@
 |`shuffle`|`bool`|是否随机打乱数据集中的样本。|`False`|
 |`allow_empty`|`bool`|是否向数据集中添加负样本。|`False`|
 |`empty_ratio`|`float`|负样本占比,仅当`allow_empty`为`True`时生效。若`empty_ratio`为负值或大于等于1,则保留所有生成的负样本。|`1.0`|
+|`batch_transforms`|`paddlers.transforms.BatchCompose`|对输入数据应用的批数据变换算子。||

 `VOCDetDataset`对file list的要求如下:
 
 

+ 3 - 1
docs/apis/data_en.md

@@ -57,13 +57,14 @@ The initialization parameter list is as follows:
 |-------|----|--------|-----|
 |`data_dir`|`str`|Directory that stores the dataset.||
 |`image_dir`|`str`|Directory of input images.||
-|`ann_path`|`str`|[COCO Format](https://cocodataset.org/#home)label file path.||
+|`anno_path`|`str`|[COCO Format](https://cocodataset.org/#home)label file path.||
 |`transforms`|`paddlers.transforms.Compose`|Data transformation operators applied to input data.||
 |`label_list`|`str` \| `None`|Label list path. Label list is a text file, in which each line contains the name of class.|`None`|
 |`num_workers`|`int` \| `str`|Number of auxiliary processes used when loading data. If it is set to `'auto'`, use the following rules to determine the number of processes to use: When the number of CPU cores is greater than 16, 8 data read auxiliary processes are used; otherwise, the number of auxiliary processes is set to half the counts of CPU cores.|`'auto'`|
 |`shuffle`|`bool`|Whether to randomly shuffle the samples in the dataset.|`False`|
 |`allow_empty`|`bool`|Whether to add negative samples to the dataset.|`False`|
 |`empty_ratio`|`float`|Negative sample ratio. Take effect only if `allow_empty` is `True`. If `empty_ratio` is negative or greater than or equal to 1, all negative samples generated are retained.|`1.0`|
+|`batch_transforms`|`paddlers.transforms.BatchCompose`|Data batch transformation operators applied to input data.||

 ### VOC Format Object Detection Dataset `VOCDetDataset`

@@ -81,6 +82,7 @@ The initialization parameter list is as follows:
 |`shuffle`|`bool`|Whether to randomly shuffle the samples in the dataset.|`False`|
 |`allow_empty`|`bool`|Whether to add negative samples to the dataset.|`False`|
 |`empty_ratio`|`float`|Negative sample ratio. Takes effect only if `allow_empty` is `True`. If `empty_ratio` is negative or greater than or equal to `1`, all negative samples generated will be retained.|`1.0`|
+|`batch_transforms`|`paddlers.transforms.BatchCompose`|Data batch transformation operators applied to input data.||

 The requirements of `VOCDetDataset` for the file list are as follows:
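To make the renamed `anno_path` argument and the new `batch_transforms` argument concrete, here is a minimal construction sketch; the paths are placeholders and nothing below comes verbatim from this commit:

```python
# Minimal sketch of building a COCO-format detection dataset with the updated API.
import paddlers.transforms as T
from paddlers.datasets import COCODetDataset

train_transforms = T.Compose([
    T.DecodeImg(),             # read the image file into an array
    T.RandomHorizontalFlip(),
    T.Normalize(
        mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])

train_dataset = COCODetDataset(
    data_dir='dataset_root',             # placeholder dataset root
    image_dir='images',
    anno_path='annotations/train.json',  # note: the parameter is `anno_path`, not `ann_path`
    transforms=train_transforms,
    label_list=None,                     # optional since this change
    shuffle=True,
    # Per the updated docstring this accepts a paddlers.transforms.BatchCompose
    # instance or a plain list of batch operators; None keeps the old behavior.
    batch_transforms=None)
```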
 
 

+ 3 - 1
docs/apis/train_cn.md

@@ -166,6 +166,7 @@ def train(self,
           warmup_start_lr=0.0,
           lr_decay_epochs=(216, 243),
           lr_decay_gamma=0.1,
+          cosine_decay_num_epochs=1000,
           metric=None,
           use_ema=False,
           early_stop=False,
@@ -196,7 +197,8 @@ def train(self,
 |`warmup_start_lr`|`int`|默认优化器warm-up阶段使用的初始学习率。|`0`|
 |`lr_decay_epochs`|`list` \| `tuple`|默认优化器学习率衰减的milestones,以epoch计。即,在第几个epoch执行学习率的衰减。|`(216, 243)`|
 |`lr_decay_gamma`|`float`|学习率衰减系数,适用于默认优化器。|`0.1`|
-|`metric`|`str` \| `None`|评价指标,可以为`'VOC'`、`COCO`或`None`。若为`None`,则根据数据集格式自动确定使用的评价指标。|`None`|
+|`cosine_decay_num_epochs`|`int`|使用余弦退火学习率调度器时计算退火周期的参数。|`1000`|
+|`metric`|`str` \| `None`|评价指标,可以为`'VOC'`、`'COCO'`、`'RBOX'`或`None`。若为`None`,则根据数据集格式自动确定使用的评价指标。|`None`|
 |`use_ema`|`bool`|是否启用[指数滑动平均策略](https://github.com/PaddlePaddle/PaddleRS/blob/develop/paddlers/models/ppdet/optimizer.py)更新模型权重参数。|`False`|
 |`early_stop`|`bool`|训练过程是否启用早停策略。|`False`|
 |`early_stop_patience`|`int`|启用早停策略时的`patience`参数(参见[`EarlyStop`](https://github.com/PaddlePaddle/PaddleRS/blob/develop/paddlers/utils/utils.py))。|`5`|

+ 3 - 1
docs/apis/train_en.md

@@ -166,6 +166,7 @@ def train(self,
           warmup_start_lr=0.0,
           lr_decay_epochs=(216, 243),
           lr_decay_gamma=0.1,
+          cosine_decay_num_epochs=1000,
           metric=None,
           use_ema=False,
           early_stop=False,
@@ -196,7 +197,8 @@ The meaning of each parameter is as follows:
 |`warmup_start_lr`|`int`|Default initial learning rate used in the warm-up phase of the optimizer.|`0`|
 |`lr_decay_epochs`|`list` \| `tuple`|Milestones of learning rate decline of the default optimizer, in terms of epochs. That is, which epoch the decay of the learning rate occurs.|`(216, 243)`|
 |`lr_decay_gamma`|`float`|Learning rate attenuation coefficient, for default optimizer.|`0.1`|
-|`metric`|`str` \| `None`|Evaluation metrics, which can be `'VOC'`, `COCO`, or `None`. If `None`, the evaluation metrics will be automatically determined according to the format of the dataset.|`None`|
+|`cosine_decay_num_epochs`|`int`|Parameter to determine the annealing cycle when a cosine annealing learning rate scheduler is used.|`1000`|
+|`metric`|`str` \| `None`|Evaluation metrics, which can be `'VOC'`, `'COCO'`, `'RBOX'`, or `None`. If `None`, the evaluation metrics will be automatically determined according to the format of the dataset.|`None`|
 |`use_ema`|`bool`|Whether to enable [exponential moving average strategy](https://github.com/PaddlePaddle/PaddleRS/blob/develop/paddlers/models/ppdet/optimizer.py) to update model weights.|`False`|
 |`early_stop`|`bool`|Whether to enable the early stopping policy during training.|`False`|
 |`early_stop_patience`|`int`|`patience` parameter when the early stopping policy is enabled. Please refer to [`EarlyStop`](https://github.com/PaddlePaddle/PaddleRS/blob/develop/paddlers/utils/utils.py) for more details.|`5`|
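A hedged usage sketch of the two additions (`cosine_decay_num_epochs` and the `'RBOX'` metric). The trainer class, its import path, and the dataset objects are assumptions, not part of this diff:

```python
# Sketch only: FCOSR is the rotated detector added to the model zoo in this PR;
# the import path and constructor arguments are assumed, not taken from the diff.
import paddlers as pdrs

model = pdrs.tasks.det.FCOSR(num_classes=15)

model.train(
    num_epochs=36,
    train_dataset=train_dataset,   # e.g. the COCODetDataset built in the earlier sketch
    train_batch_size=4,
    eval_dataset=eval_dataset,
    learning_rate=0.01,
    cosine_decay_num_epochs=36,    # annealing period of the cosine scheduler
    metric='RBOX',                 # rotated-box evaluation added by this change
    use_ema=True,
    save_dir='output/fcosr')
```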

+ 3 - 2
docs/intro/data_prep_cn.md

@@ -9,5 +9,6 @@
 | 变化检测 | LEVIR-CD | https://justchenhao.github.io/LEVIR/ | [prepare_levircd.py](https://github.com/PaddlePaddle/PaddleRS/blob/develop/tools/prepare_dataset/prepare_levircd.py) |
 | 变化检测 | Season-varying | https://paperswithcode.com/dataset/cdd-dataset-season-varying | [prepare_svcd.py](https://github.com/PaddlePaddle/PaddleRS/blob/develop/tools/prepare_dataset/prepare_svcd.py) |
 | 场景分类 | UC Merced | http://weegee.vision.ucmerced.edu/datasets/landuse.html | [prepare_ucmerced.py](https://github.com/PaddlePaddle/PaddleRS/blob/develop/tools/prepare_dataset/prepare_ucmerced.py) |
-| 目标检测 | RSOD | https://github.com/RSIA-LIESMARS-WHU/RSOD-Dataset- | [prepare_rsod](https://github.com/PaddlePaddle/PaddleRS/blob/develop/tools/prepare_dataset/prepare_rsod.py) |
-| 图像分割 | iSAID | https://captain-whu.github.io/iSAID/ | [prepare_isaid](https://github.com/PaddlePaddle/PaddleRS/blob/develop/tools/prepare_dataset/prepare_isaid.py) |
+| 目标检测 | DOTA | https://captain-whu.github.io/DOTA/ | [prepare_dota.py](https://github.com/PaddlePaddle/PaddleRS/blob/develop/tools/prepare_dataset/prepare_dota.py) |
+| 目标检测 | RSOD | https://github.com/RSIA-LIESMARS-WHU/RSOD-Dataset- | [prepare_rsod.py](https://github.com/PaddlePaddle/PaddleRS/blob/develop/tools/prepare_dataset/prepare_rsod.py) |
+| 图像分割 | iSAID | https://captain-whu.github.io/iSAID/ | [prepare_isaid.py](https://github.com/PaddlePaddle/PaddleRS/blob/develop/tools/prepare_dataset/prepare_isaid.py) |

+ 3 - 2
docs/intro/data_prep_en.md

@@ -9,5 +9,6 @@
 | Change Detection | LEVIR-CD | https://justchenhao.github.io/LEVIR/ | [prepare_levircd.py](https://github.com/PaddlePaddle/PaddleRS/blob/develop/tools/prepare_dataset/prepare_levircd.py) |
 | Change Detection | Season-varying | https://paperswithcode.com/dataset/cdd-dataset-season-varying | [prepare_svcd.py](https://github.com/PaddlePaddle/PaddleRS/blob/develop/tools/prepare_dataset/prepare_svcd.py) |
 | Scene Classification | UC Merced | http://weegee.vision.ucmerced.edu/datasets/landuse.html | [prepare_ucmerced.py](https://github.com/PaddlePaddle/PaddleRS/blob/develop/tools/prepare_dataset/prepare_ucmerced.py) |
-| Object Detection | RSOD | https://github.com/RSIA-LIESMARS-WHU/RSOD-Dataset- | [prepare_rsod](https://github.com/PaddlePaddle/PaddleRS/blob/develop/tools/prepare_dataset/prepare_rsod.py) |
-| Image Segmentation | iSAID | https://captain-whu.github.io/iSAID/ | [prepare_isaid](https://github.com/PaddlePaddle/PaddleRS/blob/develop/tools/prepare_dataset/prepare_isaid.py) |
+| Object Detection | DOTA | https://captain-whu.github.io/DOTA/ | [prepare_dota.py](https://github.com/PaddlePaddle/PaddleRS/blob/develop/tools/prepare_dataset/prepare_dota.py) |
+| Object Detection | RSOD | https://github.com/RSIA-LIESMARS-WHU/RSOD-Dataset- | [prepare_rsod.py](https://github.com/PaddlePaddle/PaddleRS/blob/develop/tools/prepare_dataset/prepare_rsod.py) |
+| Image Segmentation | iSAID | https://captain-whu.github.io/iSAID/ | [prepare_isaid.py](https://github.com/PaddlePaddle/PaddleRS/blob/develop/tools/prepare_dataset/prepare_isaid.py) |

+ 1 - 0
docs/intro/model_cons_params_cn.md

@@ -449,6 +449,7 @@
 
 
 | 参数名 | 描述                            | 默认值 |
 | --- |-------------------------------| --- |
+| `rotate (bool)` | 表示是否执行旋转目标检测 | `False` |
 | `num_classes (int)` | 目标类别数量                        | `80` |
 | `backbone (str)` | 骨干网络名称                | `'MobileNetV1'` |
 | `anchors (list[list[int]])` | 预定义锚框的大小                       | `[[10, 13], [16, 30], [33, 23], [30, 61], [62, 45], [59, 119], [116, 90], [156, 198], [373, 326]]` |

+ 1 - 0
docs/intro/model_cons_params_en.md

@@ -443,6 +443,7 @@ The YOLOv3 implementation based on PaddlePaddle.
 
 
 | Parameter Name | Description                                                                                                                 | Default Value |
 | --- |-----------------------------------------------------------------------------------------------------------------------------| --- |
+| `rotate (bool)` | If True, the model performs rotated object detection | `False` |
 | `num_classes (int)` | Number of target classes                                                                                                    | `80` |
 | `backbone (str)` | Backbone network to use                                                                                      | `'MobileNetV1'` |
 | `anchors (list[list[int]])` | Sizes of predefined anchor boxes                                                                                                   | `[[10, 13], [16, 30], [33, 23], [30, 61], [62, 45 ], [59, 119], [116, 90], [156, 198], [373, 326]]` |
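If the table above belongs to the PaddleRS YOLOv3 wrapper, as the hunk context suggests, the new flag would be passed at construction time; treat the import alias and arguments below as assumptions rather than part of this commit:

```python
# Hypothetical usage of the documented `rotate` flag.
import paddlers as pdrs

model = pdrs.tasks.det.YOLOv3(num_classes=15, rotate=True)  # switch to rotated-box detection
```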

+ 1 - 0
docs/intro/model_zoo_cn.md

@@ -33,6 +33,7 @@ PaddleRS目前已支持的全部模型如下(标注\*的为遥感专用模型
 | 图像复原 | NAFNet | 是 |
 | 图像复原 | SwinIR | 是 |
 | 目标检测 | Faster R-CNN | 否 |
+| 目标检测 | FCOSR | 否 |
 | 目标检测 | PP-YOLO | 否 |
 | 目标检测 | PP-YOLO Tiny | 否 |
 | 目标检测 | PP-YOLOv2 | 否 |

+ 1 - 0
docs/intro/model_zoo_en.md

@@ -33,6 +33,7 @@ All models currently supported by PaddleRS are listed below (those marked \* are
 | Image Restoration | SwinIR | Yes |
 | Image Restoration | NAFNet | Yes |
 | Object Detection | Faster R-CNN | No |
+| Object Detection | FCOSR | No |
 | Object Detection | PP-YOLO | No |
 | Object Detection | PP-YOLO Tiny | No |
 | Object Detection | PP-YOLOv2 | No |

+ 9 - 0
docs/quick_start_cn.md

@@ -53,6 +53,15 @@ Windows用户可以在[此站点](https://www.lfd.uci.edu/~gohlke/pythonlibs/#gd
 pip install GDAL‑3.3.3‑cp39‑cp39‑win_amd64.whl
 ```
 
 
+4. (可选)安装ext_op
+
+PaddleRS支持旋转目标检测,在使用之前需要安装`ext_op`外部自定义库,安装方式如下:
+```shell
+cd paddlers/models/ppdet/ext_op
+python setup.py install
+```
+
+
 除了采用上述安装步骤以外,PaddleRS也提供Docker安装方式。具体步骤如下:

 1. 从dockerhub拉取镜像:

+ 9 - 0
docs/quick_start_en.md

@@ -46,6 +46,15 @@ Windows users can download GDAL wheels from [this site](https://www.lfd.uci.edu/
 pip install GDAL‑3.3.3‑cp39‑cp39‑win_amd64.whl
 ```
 
 
+4. (Optional) Install ext_op
+
+PaddleRS supports rotated object detection, which requires the `ext_op` external custom operator library. You can install `ext_op` as follows:
+
+```shell
+cd paddlers/models/ppdet/ext_op
+python setup.py install
+```
+
 We also provide a docker image for installation:

 1. Pull from dockerhub:

+ 31 - 5
paddlers/datasets/base.py

@@ -18,7 +18,8 @@ from paddle.io import Dataset
 from paddle.fluid.dataloader.collate import default_collate_fn
 
 from paddlers.utils import get_num_workers
-from paddlers.transforms import construct_sample_from_dict, Compose
+import paddlers.utils.logging as logging
+from paddlers.transforms import construct_sample_from_dict, Compose, BatchCompose
 
 
 class BaseDataset(Dataset):
@@ -26,7 +27,13 @@ class BaseDataset(Dataset):
     _KEYS_TO_DISCARD = None
     _collate_trans_info = False
 
-    def __init__(self, data_dir, label_list, transforms, num_workers, shuffle):
+    def __init__(self,
+                 data_dir,
+                 label_list,
+                 transforms,
+                 num_workers,
+                 shuffle,
+                 batch_transforms=None):
         super(BaseDataset, self).__init__()
 
         self.data_dir = data_dir
@@ -37,6 +44,8 @@ class BaseDataset(Dataset):
 
         self.num_workers = get_num_workers(num_workers)
         self.shuffle = shuffle
+        self.batch_transforms = None
+        self.build_collate_fn(batch_transforms)
 
     def __getitem__(self, idx):
         sample = construct_sample_from_dict(self.file_list[idx])
@@ -59,8 +68,25 @@ class BaseDataset(Dataset):
             for key in self._KEYS_TO_DISCARD:
                 for s, _ in batch:
                     s.pop(key, None)
+
+        samples = [s[0] for s in batch]
+
+        if self.batch_transforms:
+            samples = self.batch_transforms(samples)
+
         if self._collate_trans_info:
-            return default_collate_fn(
-                [s[0] for s in batch]), [s[1] for s in batch]
+            return default_collate_fn(samples), [s[1] for s in batch]
         else:
-            return default_collate_fn([s[0] for s in batch])
+            return default_collate_fn(samples)
+
+    def build_collate_fn(self, batch_transforms, collate_fn_constructor=None):
+        if self.batch_transforms is not None and batch_transforms:
+            logging.warning(
+                "The initial `batch_transforms` will be overwritten.")
+        if batch_transforms is not None:
+            batch_transforms = copy.deepcopy(batch_transforms)
+            if isinstance(batch_transforms, list):
+                batch_transforms = BatchCompose(batch_transforms)
+            self.batch_transforms = batch_transforms
+        if collate_fn_constructor:
+            self.collate_fn = collate_fn_constructor(self)
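A self-contained sketch of what the reworked collate path does with batch-level transforms; the no-op transform below is hypothetical and only stands in for operators that would normally be wrapped by `BatchCompose`:

```python
# Mirrors the three steps of the new collate_fn above, outside of any Dataset class.
import numpy as np
from paddle.fluid.dataloader.collate import default_collate_fn

def identity_batch_transform(samples):
    # placeholder for batch-level operators (e.g. padding every image to one size)
    return samples

# Each element is (sample_dict, trans_info), as produced by __getitem__ above.
batch = [({'image': np.zeros((2, 2, 3), np.float32)}, []),
         ({'image': np.ones((2, 2, 3), np.float32)}, [])]

samples = [s[0] for s in batch]              # 1. keep only the sample dicts
samples = identity_batch_transform(samples)  # 2. apply batch transforms, if any
collated = default_collate_fn(samples)       # 3. stack into a batch
print(collated['image'].shape)               # (2, 2, 2, 3)
```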

+ 3 - 2
paddlers/datasets/cd_dataset.py

@@ -55,9 +55,10 @@ class CDDataset(BaseDataset):
                  num_workers='auto',
                  shuffle=False,
                  with_seg_labels=False,
-                 binarize_labels=False):
+                 binarize_labels=False,
+                 batch_transforms=None):
         super(CDDataset, self).__init__(data_dir, label_list, transforms,
-                                        num_workers, shuffle)
+                                        num_workers, shuffle, batch_transforms)
 
         DELIMETER = ' '
 
 

+ 4 - 2
paddlers/datasets/clas_dataset.py

@@ -42,9 +42,11 @@ class ClasDataset(BaseDataset):
                  transforms,
                  label_list=None,
                  num_workers='auto',
-                 shuffle=False):
+                 shuffle=False,
+                 batch_transforms=None):
         super(ClasDataset, self).__init__(data_dir, label_list, transforms,
-                                          num_workers, shuffle)
+                                          num_workers, shuffle,
+                                          batch_transforms)
         self.file_list = list()
         self.labels = list()
 
 

+ 49 - 29
paddlers/datasets/coco.py

@@ -17,7 +17,7 @@ import copy
 import os
 import os.path as osp
 import random
-from collections import OrderedDict
+from collections import OrderedDict, defaultdict
 
 import numpy as np
 
 
@@ -34,7 +34,7 @@ class COCODetDataset(BaseDataset):
     Args:
         data_dir (str): Root directory of the dataset.
         image_dir (str): Directory that contains the images.
-        ann_path (str): Path to COCO annotations.
+        anno_path (str): Path to COCO annotations.
         transforms (paddlers.transforms.Compose|list): Data preprocessing and data augmentation operators to apply.
         label_list (str|None, optional): Path of the file that contains the category names. Defaults to None.
         num_workers (int|str, optional): Number of processes used for data loading. If `num_workers` is 'auto',
@@ -45,6 +45,7 @@ class COCODetDataset(BaseDataset):
         allow_empty (bool, optional): Whether to add negative samples. Defaults to False.
         empty_ratio (float, optional): Ratio of negative samples. If `empty_ratio` is smaller than 0 or not less 
             than 1, keep all generated negative samples. Defaults to 1.0.
+        batch_transforms (paddlers.transforms.BatchCompose|list): Batch transformation operators to apply.
     """
 
     def __init__(self,
@@ -52,11 +53,12 @@ class COCODetDataset(BaseDataset):
                  image_dir,
                  anno_path,
                  transforms,
-                 label_list,
+                 label_list=None,
                  num_workers='auto',
                  shuffle=False,
                  allow_empty=False,
-                 empty_ratio=1.):
+                 empty_ratio=1.,
+                 batch_transforms=None):
         # matplotlib.use() must be called *before* pylab, matplotlib.pyplot,
         # or matplotlib.backends is imported for the first time.
         import matplotlib
@@ -64,7 +66,8 @@ class COCODetDataset(BaseDataset):
         from pycocotools.coco import COCO
 
         super(COCODetDataset, self).__init__(data_dir, label_list, transforms,
-                                             num_workers, shuffle)
+                                             num_workers, shuffle,
+                                             batch_transforms)
 
         self.data_fields = None
         self.num_max_boxes = 50
@@ -83,33 +86,31 @@ class COCODetDataset(BaseDataset):
         self.file_list = list()
         neg_file_list = list()
         self.labels = list()
+        self.anno_path = anno_path
 
-        annotations = dict()
-        annotations['images'] = list()
-        annotations['categories'] = list()
-        annotations['annotations'] = list()
+        annotations = defaultdict(list)
 
         cname2cid = OrderedDict()
         label_id = 0
-        with open(label_list, 'r', encoding=get_encoding(label_list)) as f:
-            for line in f.readlines():
-                cname2cid[line.strip()] = label_id
-                label_id += 1
-                self.labels.append(line.strip())
-
-        for k, v in cname2cid.items():
-            annotations['categories'].append({
-                'supercategory': 'component',
-                'id': v + 1,
-                'name': k
-            })
+        if label_list:
+            with open(label_list, 'r', encoding=get_encoding(label_list)) as f:
+                for line in f.readlines():
+                    cname2cid[line.strip()] = label_id
+                    label_id += 1
+                    self.labels.append(line.strip())
+
+            for k, v in cname2cid.items():
+                annotations['categories'].append({
+                    'supercategory': 'component',
+                    'id': v + 1,
+                    'name': k
+                })
 
         anno_path = norm_path(os.path.join(self.data_dir, anno_path))
         image_dir = norm_path(os.path.join(self.data_dir, image_dir))
 
         assert anno_path.endswith('.json'), \
             'invalid coco annotation file: ' + anno_path
-        from pycocotools.coco import COCO
         coco = COCO(anno_path)
         img_ids = coco.getImgIds()
         img_ids.sort()
@@ -155,7 +156,8 @@ class COCODetDataset(BaseDataset):
             gt_classes = []
             gt_bboxs = []
             gt_scores = []
-            difficults = []
+            gt_poly = []
+            difficulties = []
 
             for inst in instances:
                 # Check gt bbox
@@ -182,12 +184,21 @@ class COCODetDataset(BaseDataset):
                         'area: {} x1: {}, y1: {}, x2: {}, y2: {}.'.format(
                             img_id, float(inst['area']), x1, y1, x2, y2))
 
+                if 'segmentation' in inst and inst['iscrowd']:
+                    gt_poly.append([0.0 for _ in range(8)])
+                elif 'segmentation' in inst and inst['segmentation']:
+                    if not np.array(
+                            inst['segmentation'],
+                            dtype=object).size > 0 and not self.allow_empty:
+                        continue
+                    else:
+                        gt_poly.append(inst['segmentation'])
+
                 is_crowds.append([inst['iscrowd']])
-                gt_classes.append([inst['category_id']])
+                gt_classes.append([catid2clsid[inst['category_id']]])
                 gt_bboxs.append(inst['clean_bbox'])
                 gt_scores.append([1.])
-                difficults.append([0])
-
+                difficulties.append(inst.get('difficult', 0.))
                 annotations['annotations'].append({
                     'iscrowd': inst['iscrowd'],
                     'image_id': int(inst['image_id']),
@@ -195,18 +206,21 @@ class COCODetDataset(BaseDataset):
                     'area': inst['area'],
                     'category_id': inst['category_id'],
                     'id': inst['id'],
-                    'difficult': 0
+                    'difficult': inst.get('difficult', 0.)
                 })
+                if gt_poly:
+                    annotations['annotations'][-1]['gt_poly'] = gt_poly[-1]
 
             label_info = {
                 'is_crowd': np.array(is_crowds),
                 'gt_class': np.array(gt_classes),
                 'gt_bbox': np.array(gt_bboxs).astype(np.float32),
                 'gt_score': np.array(gt_scores).astype(np.float32),
-                'difficult': np.array(difficults),
+                'difficult': np.array(difficulties),
+                'gt_poly': np.array(gt_poly),
             }
 
-            if label_info['gt_bbox'].size > 0:
+            if label_info['gt_bbox'].size > 0 or label_info['gt_poly'].size > 0:
                 self.file_list.append({ ** im_info, ** label_info})
                 annotations['images'].append({
                     'height': im_h,
@@ -259,6 +273,7 @@ class COCODetDataset(BaseDataset):
                 DecodeImg(to_rgb=False)(sample),
                 DecodeImg(to_rgb=False)(sample_mix)
             ])
+
         sample['trans_info'] = []
         sample, trans_info = self.transforms(sample)
         return sample, trans_info
@@ -266,6 +281,11 @@ class COCODetDataset(BaseDataset):
     def __len__(self):
         return self.num_samples
 
+    def get_anno_path(self):
+        if self.anno_path:
+            return norm_path(os.path.join(self.data_dir, self.anno_path))
+        return None
+
     def set_epoch(self, epoch_id):
         self._epoch = epoch_id
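For reference, a sketch of how a rotated-box record flows through the loader above; the numbers are made up, only the field names follow the code:

```python
# An input COCO-style annotation whose 8-value segmentation encodes the 4 corners
# of a rotated box (values are illustrative).
coco_annotation = {
    'image_id': 1,
    'category_id': 1,      # remapped to a 0-based class id via catid2clsid
    'bbox': [10.0, 20.0, 50.0, 30.0],  # x, y, w, h of the horizontal box
    'segmentation': [[10.0, 20.0, 60.0, 25.0, 55.0, 50.0, 5.0, 45.0]],
    'iscrowd': 0,
    'area': 1400.0,
    'id': 7,
}

# After parsing, the per-image label_info built above holds, for this instance:
#   gt_class:  [[0]]                                   # catid2clsid[1]
#   gt_bbox:   [[10., 20., 60., 50.]]                  # clean_bbox as x1, y1, x2, y2
#   gt_poly:   [[[10., 20., 60., 25., 55., 50., 5., 45.]]]
#   difficult: [0.0]                                   # inst.get('difficult', 0.)
# Crowd instances instead contribute a zero polygon of length 8 to gt_poly.
```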
 
 

+ 3 - 2
paddlers/datasets/res_dataset.py

@@ -45,9 +45,10 @@ class ResDataset(BaseDataset):
                  transforms,
                  num_workers='auto',
                  shuffle=False,
-                 sr_factor=None):
+                 sr_factor=None,
+                 batch_transforms=None):
         super(ResDataset, self).__init__(data_dir, None, transforms,
-                                         num_workers, shuffle)
+                                         num_workers, shuffle, batch_transforms)
         self.file_list = list()
 
         with open(file_list, encoding=get_encoding(file_list)) as f:

+ 3 - 2
paddlers/datasets/seg_dataset.py

@@ -43,9 +43,10 @@ class SegDataset(BaseDataset):
                  transforms,
                  label_list=None,
                  num_workers='auto',
-                 shuffle=False):
+                 shuffle=False,
+                 batch_transforms=None):
         super(SegDataset, self).__init__(data_dir, label_list, transforms,
-                                         num_workers, shuffle)
+                                         num_workers, shuffle, batch_transforms)
         self.file_list = list()
         self.labels = list()
 
 

+ 5 - 2
paddlers/datasets/voc.py

@@ -46,6 +46,7 @@ class VOCDetDataset(BaseDataset):
         allow_empty (bool, optional): Whether to add negative samples. Defaults to False.
         empty_ratio (float, optional): Ratio of negative samples. If `empty_ratio` is smaller than 0 or not less 
             than 1, keep all generated negative samples. Defaults to 1.0.
+        batch_transforms (paddlers.transforms.BatchCompose|list): Batch transformation operators to apply.
     """
 
     def __init__(self,
@@ -56,14 +57,16 @@ class VOCDetDataset(BaseDataset):
                  num_workers='auto',
                  shuffle=False,
                  allow_empty=False,
-                 empty_ratio=1.):
+                 empty_ratio=1.,
+                 batch_transforms=None):
         # matplotlib.use() must be called *before* pylab, matplotlib.pyplot,
         # or matplotlib.backends is imported for the first time.
         import matplotlib
         matplotlib.use('Agg')
         from pycocotools.coco import COCO
         super(VOCDetDataset, self).__init__(data_dir, label_list, transforms,
-                                            num_workers, shuffle)
+                                            num_workers, shuffle,
+                                            batch_transforms)
 
         self.data_fields = None
         self.num_max_boxes = 50

+ 9 - 0
paddlers/models/ppdet/core/workspace.py

@@ -67,6 +67,15 @@ class AttrDict(dict):
             return self[key]
         raise AttributeError("object has no attribute '{}'".format(key))
 
+    def __setattr__(self, key, value):
+        self[key] = value
+
+    def copy(self):
+        new_dict = AttrDict()
+        for k, v in self.items():
+            new_dict.update({k: v})
+        return new_dict
+
 
 global_config = AttrDict()
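A quick illustration of the two `AttrDict` additions, attribute-style assignment and a shallow `copy()`:

```python
from paddlers.models.ppdet.core.workspace import AttrDict

cfg = AttrDict()
cfg.norm_type = 'bn'           # __setattr__ now writes straight into the dict
assert cfg['norm_type'] == 'bn'

cfg2 = cfg.copy()              # returns a new AttrDict with the same (shallow) items
cfg2.norm_type = 'sync_bn'
assert cfg.norm_type == 'bn'   # top-level keys stay independent after copy()
```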
 
 

+ 1 - 1
paddlers/models/ppdet/data/crop_utils/__init__.py

@@ -10,4 +10,4 @@
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
-# limitations under the License.
+# limitations under the License. 

+ 91 - 53
paddlers/models/ppdet/data/crop_utils/annotation_cropper.py

@@ -27,14 +27,15 @@ from .chip_box_utils import intersection_over_box
 
 
 
 
 class AnnoCropper(object):
-    def __init__(self, image_target_sizes: List[int],
+    def __init__(self,
+                 image_target_sizes: List[int],
                  valid_box_ratio_ranges: List[List[float]],
-                 chip_target_size: int, chip_target_stride: int,
-                 use_neg_chip: bool = False,
-                 max_neg_num_per_im: int = 8,
-                 max_per_img: int = -1,
-                 nms_thresh: int = 0.5
-                 ):
+                 chip_target_size: int,
+                 chip_target_stride: int,
+                 use_neg_chip: bool=False,
+                 max_neg_num_per_im: int=8,
+                 max_per_img: int=-1,
+                 nms_thresh: int=0.5):
         """
         Generate chips by chip_target_size and chip_target_stride.
         These two parameters just like kernel_size and stride in cnn.
@@ -117,7 +118,8 @@ class AnnoCropper(object):
         self.chip_records = []
         self._global_chip_id = 1
         for r in records:
-            self._cur_im_pos_chips = []  # element: (chip, boxes_idx), chip is [x1, y1, x2, y2], boxes_ids is List[int]
+            self._cur_im_pos_chips = [
+            ]  # element: (chip, boxes_idx), chip is [x1, y1, x2, y2], boxes_ids is List[int]
             self._cur_im_neg_chips = []  # element: (chip, neg_box_num)
             for scale_i in range(self.scale_num):
                 self._get_current_scale_parameters(scale_i, r)
@@ -126,12 +128,16 @@ class AnnoCropper(object):
                 chips = self._create_chips(r['h'], r['w'], self._cur_scale)
 
                 # # dict: chipid->[box_id, ...]
-                pos_chip2boxes_idx = self._get_valid_boxes_and_pos_chips(r['gt_bbox'], chips)
+                pos_chip2boxes_idx = self._get_valid_boxes_and_pos_chips(
+                    r['gt_bbox'], chips)
 
                 # dict: chipid->neg_box_num
-                neg_chip2box_num = self._get_neg_boxes_and_chips(chips, list(pos_chip2boxes_idx.keys()), r.get('proposals', None))
+                neg_chip2box_num = self._get_neg_boxes_and_chips(
+                    chips,
+                    list(pos_chip2boxes_idx.keys()), r.get('proposals', None))
 
-                self._add_to_cur_im_chips(chips, pos_chip2boxes_idx, neg_chip2box_num)
+                self._add_to_cur_im_chips(chips, pos_chip2boxes_idx,
+                                          neg_chip2box_num)
 
             cur_image_records = self._trans_all_chips2annotations(r)
             self.chip_records.extend(cur_image_records)
@@ -147,7 +153,7 @@ class AnnoCropper(object):
 
 
         for neg_chipid, neg_box_num in neg_chip2box_num.items():
             chip = np.array(chips[neg_chipid])
-            self._cur_im_neg_chips.append((chip,  neg_box_num))
+            self._cur_im_neg_chips.append((chip, neg_box_num))
 
     def _trans_all_chips2annotations(self, r):
         gt_bbox = r['gt_bbox']
@@ -156,20 +162,24 @@ class AnnoCropper(object):
         gt_class = r['gt_class']
         # gt_poly = r['gt_poly']   # [None]xN
         # remaining keys: im_id, h, w
-        chip_records = self._trans_pos_chips2annotations(im_file, gt_bbox, is_crowd, gt_class)
+        chip_records = self._trans_pos_chips2annotations(im_file, gt_bbox,
+                                                         is_crowd, gt_class)
 
         if not self.use_neg_chip:
             return chip_records
 
         sampled_neg_chips = self._sample_neg_chips()
-        neg_chip_records = self._trans_neg_chips2annotations(im_file, sampled_neg_chips)
+        neg_chip_records = self._trans_neg_chips2annotations(im_file,
+                                                             sampled_neg_chips)
         chip_records.extend(neg_chip_records)
         return chip_records
 
-    def _trans_pos_chips2annotations(self, im_file, gt_bbox, is_crowd, gt_class):
+    def _trans_pos_chips2annotations(self, im_file, gt_bbox, is_crowd,
+                                     gt_class):
         chip_records = []
         for chip, boxes_idx in self._cur_im_pos_chips:
-            chip_bbox, final_boxes_idx = transform_chip_box(gt_bbox, boxes_idx, chip)
+            chip_bbox, final_boxes_idx = transform_chip_box(gt_bbox, boxes_idx,
+                                                            chip)
             x1, y1, x2, y2 = chip
             chip_h = y2 - y1
             chip_w = x2 - x1
@@ -197,12 +207,15 @@ class AnnoCropper(object):
             return self._cur_im_neg_chips
 
         candidate_num = int(sample_num * 1.5)
-        candidate_neg_chips = sorted(self._cur_im_neg_chips, key=lambda x: -x[1])[:candidate_num]
+        candidate_neg_chips = sorted(
+            self._cur_im_neg_chips, key=lambda x: -x[1])[:candidate_num]
         random.shuffle(candidate_neg_chips)
         sampled_neg_chips = candidate_neg_chips[:sample_num]
         return sampled_neg_chips
 
-    def _trans_neg_chips2annotations(self, im_file: str, sampled_neg_chips: List[Tuple]):
+    def _trans_neg_chips2annotations(self,
+                                     im_file: str,
+                                     sampled_neg_chips: List[Tuple]):
         chip_records = []
         for chip, neg_box_num in sampled_neg_chips:
             x1, y1, x2, y2 = chip
@@ -213,9 +226,12 @@ class AnnoCropper(object):
                 'im_id': np.array([self._global_chip_id]),
                 'h': chip_h,
                 'w': chip_w,
-                'gt_bbox': np.zeros((0, 4), dtype=np.float32),
-                'is_crowd': np.zeros((0, 1), dtype=np.int32),
-                'gt_class': np.zeros((0, 1), dtype=np.int32),
+                'gt_bbox': np.zeros(
+                    (0, 4), dtype=np.float32),
+                'is_crowd': np.zeros(
+                    (0, 1), dtype=np.int32),
+                'gt_class': np.zeros(
+                    (0, 1), dtype=np.int32),
                 # 'gt_poly': [],
                 'chip': chip
             }
@@ -247,7 +263,8 @@ class AnnoCropper(object):
 
 
         assert chip_size >= stride
         chip_overlap = chip_size - stride
-        if (width - chip_overlap) % stride > min_chip_location_diff:  # 不能被stride整除的部分比较大,则保留
+        if (width - chip_overlap
+            ) % stride > min_chip_location_diff:  # 不能被stride整除的部分比较大,则保留
             w_steps = max(1, int(math.ceil((width - chip_overlap) / stride)))
         else:  # 不能被stride整除的部分比较小,则丢弃
             w_steps = max(1, int(math.floor((width - chip_overlap) / stride)))
@@ -267,9 +284,10 @@ class AnnoCropper(object):
 
 
         # check  chip size
         for item in chips:
-            if item[2] - item[0] > chip_size * 1.1 or item[3] - item[1] > chip_size * 1.1:
+            if item[2] - item[0] > chip_size * 1.1 or item[3] - item[
+                    1] > chip_size * 1.1:
                 raise ValueError(item)
-        chips = np.array(chips, dtype=np.float)
+        chips = np.array(chips, dtype=np.float32)
 
         raw_size_chips = chips / scale
         return raw_size_chips
@@ -279,12 +297,15 @@ class AnnoCropper(object):
         im_size = self._cur_im_size
         scale = self._cur_scale
         #   Nx4            N
-        valid_boxes, valid_boxes_idx = self._validate_boxes(valid_ratio_range, im_size, gt_bbox, scale)
+        valid_boxes, valid_boxes_idx = self._validate_boxes(
+            valid_ratio_range, im_size, gt_bbox, scale)
         # dict: chipid->[box_id, ...]
-        pos_chip2boxes_idx = self._find_pos_chips(chips, valid_boxes, valid_boxes_idx)
+        pos_chip2boxes_idx = self._find_pos_chips(chips, valid_boxes,
+                                                  valid_boxes_idx)
         return pos_chip2boxes_idx
 
-    def _validate_boxes(self, valid_ratio_range: List[float],
+    def _validate_boxes(self,
+                        valid_ratio_range: List[float],
                         im_size: int,
                         gt_boxes: 'np.array of Nx4',
                         scale: float):
@@ -299,20 +320,26 @@ class AnnoCropper(object):
         target_mins = mins * scale
 
         low = valid_ratio_range[0] if valid_ratio_range[0] > 0 else 0
-        high = valid_ratio_range[1] if valid_ratio_range[1] > 0 else np.finfo(np.float).max
+        high = valid_ratio_range[1] if valid_ratio_range[1] > 0 else np.finfo(
+            np.float32).max
 
-        valid_boxes_idx = np.nonzero((low <= box_ratio) & (box_ratio < high) & (target_mins >= 2))[0]
+        valid_boxes_idx = np.nonzero((low <= box_ratio) & (box_ratio < high) & (
+            target_mins >= 2))[0]
         valid_boxes = gt_boxes[valid_boxes_idx]
         return valid_boxes, valid_boxes_idx
 
-    def _find_pos_chips(self, chips: 'Cx4', valid_boxes: 'Bx4', valid_boxes_idx: 'B'):
+    def _find_pos_chips(self,
+                        chips: 'Cx4',
+                        valid_boxes: 'Bx4',
+                        valid_boxes_idx: 'B'):
         """
         :return: pos_chip2boxes_idx, dict: chipid->[box_id, ...]
         """
         iob = intersection_over_box(chips, valid_boxes)  # overlap, CxB
 
         iob_threshold_to_find_chips = 1.
-        pos_chip_ids, _ = self._find_chips_to_cover_overlaped_boxes(iob, iob_threshold_to_find_chips)
+        pos_chip_ids, _ = self._find_chips_to_cover_overlaped_boxes(
+            iob, iob_threshold_to_find_chips)
         pos_chip_ids = set(pos_chip_ids)
 
         iob_threshold_to_assign_box = 0.5
@@ -323,7 +350,8 @@ class AnnoCropper(object):
     def _find_chips_to_cover_overlaped_boxes(self, iob, overlap_threshold):
         return find_chips_to_cover_overlaped_boxes(iob, overlap_threshold)
 
-    def _assign_boxes_to_pos_chips(self, iob, overlap_threshold, pos_chip_ids, valid_boxes_idx):
+    def _assign_boxes_to_pos_chips(self, iob, overlap_threshold, pos_chip_ids,
+                                   valid_boxes_idx):
         chip_ids, box_ids = np.nonzero(iob >= overlap_threshold)
         pos_chip2boxes_idx = defaultdict(list)
         for chip_id, box_id in zip(chip_ids, box_ids):
@@ -333,7 +361,10 @@ class AnnoCropper(object):
             pos_chip2boxes_idx[chip_id].append(raw_gt_box_idx)
         return pos_chip2boxes_idx
 
-    def _get_neg_boxes_and_chips(self, chips: 'Cx4', pos_chip_ids: 'D', proposals: 'Px4'):
+    def _get_neg_boxes_and_chips(self,
+                                 chips: 'Cx4',
+                                 pos_chip_ids: 'D',
+                                 proposals: 'Px4'):
         """
         :param chips:
         :param pos_chip_ids:
@@ -351,12 +382,16 @@ class AnnoCropper(object):
         im_size = self._cur_im_size
         scale = self._cur_scale
 
-        valid_props, _ = self._validate_boxes(valid_ratio_range, im_size, proposals, scale)
+        valid_props, _ = self._validate_boxes(valid_ratio_range, im_size,
+                                              proposals, scale)
         neg_boxes = self._find_neg_boxes(chips, pos_chip_ids, valid_props)
         neg_chip2box_num = self._find_neg_chips(chips, pos_chip_ids, neg_boxes)
         return neg_chip2box_num
 
-    def _find_neg_boxes(self, chips: 'Cx4', pos_chip_ids: 'D', valid_props: 'Px4'):
+    def _find_neg_boxes(self,
+                        chips: 'Cx4',
+                        pos_chip_ids: 'D',
+                        valid_props: 'Px4'):
         """
         :return: neg_boxes: Nx4
         """
@@ -370,7 +405,8 @@ class AnnoCropper(object):
         neg_boxes = valid_props[non_overlap_props_idx]
         return neg_boxes
 
-    def _find_neg_chips(self, chips: 'Cx4', pos_chip_ids: 'D', neg_boxes: 'Nx4'):
+    def _find_neg_chips(self, chips: 'Cx4', pos_chip_ids: 'D',
+                        neg_boxes: 'Nx4'):
         """
         :return: neg_chip2box_num, dict: chipid->neg_box_num
         """
@@ -469,31 +505,37 @@ class AnnoCropper(object):
         for result in results:
             bbox_locs = result['bbox']
             bbox_nums = result['bbox_num']
-            if len(bbox_locs) == 1 and bbox_locs[0][0] == -1:  # current batch has no detections
+            if len(bbox_locs) == 1 and bbox_locs[0][
+                    0] == -1:  # current batch has no detections
                 # bbox_locs = array([[-1.]], dtype=float32); bbox_nums = [[1]]
                 # MultiClassNMS output: if there are no detected boxes for any image, lod is set to {1} and Out contains only one value, which is -1.
                 continue
-            im_ids = result['im_id'] # replace with range(len(bbox_nums))
+            im_ids = result['im_id']  # replace with range(len(bbox_nums))
 
             last_bbox_num = 0
             for idx, im_id in enumerate(im_ids):
 
                 cur_bbox_len = bbox_nums[idx]
-                bboxes = bbox_locs[last_bbox_num: last_bbox_num + cur_bbox_len]
+                bboxes = bbox_locs[last_bbox_num:last_bbox_num + cur_bbox_len]
                 last_bbox_num += cur_bbox_len
                 # box: [num_id, score, xmin, ymin, xmax, ymax]
                 if len(bboxes) == 0:  # current image has no detections
                     continue
 
-                chip_rec = records[int(im_id) - 1]  # im_id starts from 1, type is np.int64
+                chip_rec = records[int(im_id) -
+                                   1]  # im_id starts from 1, type is np.int64
                 image_size = max(chip_rec["ori_im_h"], chip_rec["ori_im_w"])
 
-                bboxes = transform_chip_boxes2image_boxes(bboxes, chip_rec["chip"], chip_rec["ori_im_h"], chip_rec["ori_im_w"])
+                bboxes = transform_chip_boxes2image_boxes(
+                    bboxes, chip_rec["chip"], chip_rec["ori_im_h"],
+                    chip_rec["ori_im_w"])
 
                 scale_i = chip_rec["scale_i"]
-                cur_scale = self._get_current_scale(self.target_sizes[scale_i], image_size)
-                _, valid_boxes_idx = self._validate_boxes(self.valid_box_ratio_ranges[scale_i], image_size,
-                                                                    bboxes[:, 2:], cur_scale)
+                cur_scale = self._get_current_scale(self.target_sizes[scale_i],
+                                                    image_size)
+                _, valid_boxes_idx = self._validate_boxes(
+                    self.valid_box_ratio_ranges[scale_i], image_size,
+                    bboxes[:, 2:], cur_scale)
                 ori_img_id = self._global_chip_id2img_id[int(im_id)]
 
                 img_id2bbox[ori_img_id].append(bboxes[valid_boxes_idx])
@@ -507,7 +549,8 @@ class AnnoCropper(object):
         nms_thresh = self.nms_thresh
 
         for img_id in img_id2bbox:
-            box = img_id2bbox[img_id]  # list of np.array of shape [N, 6], 6 is [label, score, x1, y1, x2, y2]
+            box = img_id2bbox[
+                img_id]  # list of np.array of shape [N, 6], 6 is [label, score, x1, y1, x2, y2]
             box = np.concatenate(box, axis=0)
             nms_dets = nms(box, nms_thresh)
             if max_per_img > 0:
@@ -525,18 +568,13 @@ class AnnoCropper(object):
         results = []
         for img_id in im_ids:  # output by original im_id order
             if len(img_id2bbox[img_id]) == 0:
-                bbox = np.array([[-1.,  0.,  0.,  0.,  0.,  0.]])  # edge case: no detections
+                bbox = np.array(
+                    [[-1., 0., 0., 0., 0., 0.]])  # edge case: no detections
                 bbox_num = np.array([0])
             else:
                 # np.array of shape [N, 6], 6 is [label, score, x1, y1, x2, y2]
                 bbox = img_id2bbox[img_id]
                 bbox_num = np.array([len(bbox)])
-            res = dict(
-                im_id=np.array([[img_id]]),
-                bbox=bbox,
-                bbox_num=bbox_num
-            )
+            res = dict(im_id=np.array([[img_id]]), bbox=bbox, bbox_num=bbox_num)
             results.append(res)
         return results
-
-
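Editor's note: the AnnoCropper hunks above are line reflows plus the np.float to np.float32 migration; the logic is unchanged. As a reading aid for the chip-grid hunk, here is a minimal, self-contained sketch of the step computation. The function name and the min_chip_location_diff value are illustrative, not library code.

    import math

    def chip_steps(width, chip_size, stride, min_chip_location_diff=20):
        # Mirrors the branch shown above: keep the leftover that is not
        # divisible by the stride when it is large, otherwise drop it.
        assert chip_size >= stride
        chip_overlap = chip_size - stride
        remainder = (width - chip_overlap) % stride
        if remainder > min_chip_location_diff:
            return max(1, int(math.ceil((width - chip_overlap) / stride)))
        return max(1, int(math.floor((width - chip_overlap) / stride)))

    # 1000 = 64 (overlap) + 2 x 448 + 40 leftover, so the leftover is kept.
    print(chip_steps(width=1000, chip_size=512, stride=448))  # 3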

+ 10 - 6
paddlers/models/ppdet/data/crop_utils/chip_box_utils.py

@@ -33,8 +33,10 @@ def intersection_over_box(chips, boxes):
 
     box_area = bbox_area(boxes)  # B
 
-    inter_x2y2 = np.minimum(np.expand_dims(chips, 1)[:, :, 2:], boxes[:, 2:])  # CxBX2
-    inter_x1y1 = np.maximum(np.expand_dims(chips, 1)[:, :, :2], boxes[:, :2])  # CxBx2
+    inter_x2y2 = np.minimum(np.expand_dims(chips, 1)[:, :, 2:],
+                            boxes[:, 2:])  # CxBX2
+    inter_x1y1 = np.maximum(np.expand_dims(chips, 1)[:, :, :2],
+                            boxes[:, :2])  # CxBx2
     inter_wh = inter_x2y2 - inter_x1y1
     inter_wh = np.clip(inter_wh, a_min=0, a_max=None)
     inter_area = inter_wh[:, :, 0] * inter_wh[:, :, 1]  # CxB
@@ -81,8 +83,9 @@ def transform_chip_box(gt_bbox: 'Gx4', boxes_idx: 'B', chip: '4'):
 def find_chips_to_cover_overlaped_boxes(iob, overlap_threshold):
     chip_ids, box_ids = np.nonzero(iob >= overlap_threshold)
     chip_id2overlap_box_num = np.bincount(chip_ids)  # 1d array
-    chip_id2overlap_box_num = np.pad(chip_id2overlap_box_num, (0, len(iob) - len(chip_id2overlap_box_num)),
-                                     constant_values=0)
+    chip_id2overlap_box_num = np.pad(
+        chip_id2overlap_box_num, (0, len(iob) - len(chip_id2overlap_box_num)),
+        constant_values=0)
 
     chosen_chip_ids = []
     while len(box_ids) > 0:
@@ -92,7 +95,8 @@ def find_chips_to_cover_overlaped_boxes(iob, overlap_threshold):
         chosen_chip_ids.append(max_count_chip_id)
 
         box_ids_in_cur_chip = box_ids[chip_ids == max_count_chip_id]
-        ids_not_in_cur_boxes_mask = np.logical_not(np.isin(box_ids, box_ids_in_cur_chip))
+        ids_not_in_cur_boxes_mask = np.logical_not(
+            np.isin(box_ids, box_ids_in_cur_chip))
         chip_ids = chip_ids[ids_not_in_cur_boxes_mask]
         box_ids = box_ids[ids_not_in_cur_boxes_mask]
     return chosen_chip_ids, chip_id2overlap_box_num
@@ -124,7 +128,7 @@ def nms(dets, thresh):
     order = scores.argsort()[::-1]
 
     ndets = dets.shape[0]
-    suppressed = np.zeros((ndets), dtype=np.int)
+    suppressed = np.zeros((ndets), dtype=np.int32)
 
     # nominal indices
     # _i, _j
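Editor's note: a hedged usage sketch of the nms helper touched above. It assumes the [label, score, x1, y1, x2, y2] row layout described by the aggregation comments in annotation_cropper.py and the module path implied by this repository's layout; the numbers are made up.

    import numpy as np
    from paddlers.models.ppdet.data.crop_utils.chip_box_utils import nms

    # Three detections; the first two overlap heavily, the third stands alone.
    dets = np.array([
        [0., 0.90, 10., 10., 110., 110.],
        [0., 0.75, 12., 14., 108., 112.],
        [1., 0.60, 200., 200., 260., 260.],
    ], dtype=np.float32)

    kept = nms(dets, 0.5)
    print(kept.shape[0])  # expected: 2, the lower-scoring duplicate is dropped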

+ 309 - 0
paddlers/models/ppdet/data/reader.py

@@ -12,6 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+import copy
 import os
 import traceback
 import six
@@ -21,6 +22,10 @@ if sys.version_info >= (3, 0):
 else:
     pass
 import numpy as np
+import paddle
+import paddle.nn.functional as F
+
+from copy import deepcopy
 
 from paddle.io import DataLoader, DistributedBatchSampler
 from .utils import default_collate_fn
@@ -300,3 +305,307 @@ class TestMOTReader(BaseDataLoader):
         super(TestMOTReader, self).__init__(sample_transforms, batch_transforms,
                                             batch_size, shuffle, drop_last,
                                             num_classes, **kwargs)
+
+
+# For Semi-Supervised Object Detection (SSOD)
+class Compose_SSOD(object):
+    def __init__(self, base_transforms, weak_aug, strong_aug, num_classes=80):
+        self.base_transforms = base_transforms
+        self.base_transforms_cls = []
+        for t in self.base_transforms:
+            for k, v in t.items():
+                op_cls = getattr(transform, k)
+                f = op_cls(**v)
+                if hasattr(f, 'num_classes'):
+                    f.num_classes = num_classes
+                self.base_transforms_cls.append(f)
+
+        self.weak_augs = weak_aug
+        self.weak_augs_cls = []
+        for t in self.weak_augs:
+            for k, v in t.items():
+                op_cls = getattr(transform, k)
+                f = op_cls(**v)
+                if hasattr(f, 'num_classes'):
+                    f.num_classes = num_classes
+                self.weak_augs_cls.append(f)
+
+        self.strong_augs = strong_aug
+        self.strong_augs_cls = []
+        for t in self.strong_augs:
+            for k, v in t.items():
+                op_cls = getattr(transform, k)
+                f = op_cls(**v)
+                if hasattr(f, 'num_classes'):
+                    f.num_classes = num_classes
+                self.strong_augs_cls.append(f)
+
+    def __call__(self, data):
+        for f in self.base_transforms_cls:
+            try:
+                data = f(data)
+            except Exception as e:
+                stack_info = traceback.format_exc()
+                logger.warning("fail to map sample transform [{}] "
+                               "with error: {} and stack:\n{}".format(
+                                   f, e, str(stack_info)))
+                raise e
+
+        weak_data = deepcopy(data)
+        strong_data = deepcopy(data)
+        for f in self.weak_augs_cls:
+            try:
+                weak_data = f(weak_data)
+            except Exception as e:
+                stack_info = traceback.format_exc()
+                logger.warning("fail to map weak aug [{}] "
+                               "with error: {} and stack:\n{}".format(
+                                   f, e, str(stack_info)))
+                raise e
+
+        for f in self.strong_augs_cls:
+            try:
+                strong_data = f(strong_data)
+            except Exception as e:
+                stack_info = traceback.format_exc()
+                logger.warning("fail to map strong aug [{}] "
+                               "with error: {} and stack:\n{}".format(
+                                   f, e, str(stack_info)))
+                raise e
+
+        weak_data['strong_aug'] = strong_data
+        return weak_data
+
+
+class BatchCompose_SSOD(Compose):
+    def __init__(self, transforms, num_classes=80, collate_batch=True):
+        super(BatchCompose_SSOD, self).__init__(transforms, num_classes)
+        self.collate_batch = collate_batch
+
+    def __call__(self, data):
+        # split strong_data from data(weak_data)
+        strong_data = []
+        for sample in data:
+            strong_data.append(sample['strong_aug'])
+            sample.pop('strong_aug')
+
+        for f in self.transforms_cls:
+            try:
+                data = f(data)
+                strong_data = f(strong_data)
+            except Exception as e:
+                stack_info = traceback.format_exc()
+                logger.warning("fail to map batch transform [{}] "
+                               "with error: {} and stack:\n{}".format(
+                                   f, e, str(stack_info)))
+                raise e
+
+        # remove keys which are not needed by the model
+        extra_key = ['h', 'w', 'flipped']
+        for k in extra_key:
+            for sample in data:
+                if k in sample:
+                    sample.pop(k)
+            for sample in strong_data:
+                if k in sample:
+                    sample.pop(k)
+
+        # batch data, if user-define batch function needed
+        # use user-defined here
+        if self.collate_batch:
+            batch_data = default_collate_fn(data)
+            strong_batch_data = default_collate_fn(strong_data)
+            return batch_data, strong_batch_data
+        else:
+            batch_data = {}
+            for k in data[0].keys():
+                tmp_data = []
+                for i in range(len(data)):
+                    tmp_data.append(data[i][k])
+                if not 'gt_' in k and not 'is_crowd' in k and not 'difficult' in k:
+                    tmp_data = np.stack(tmp_data, axis=0)
+                batch_data[k] = tmp_data
+
+            strong_batch_data = {}
+            for k in strong_data[0].keys():
+                tmp_data = []
+                for i in range(len(strong_data)):
+                    tmp_data.append(strong_data[i][k])
+                if not 'gt_' in k and not 'is_crowd' in k and not 'difficult' in k:
+                    tmp_data = np.stack(tmp_data, axis=0)
+                strong_batch_data[k] = tmp_data
+
+        return batch_data, strong_batch_data
+
+
+class CombineSSODLoader(object):
+    def __init__(self, label_loader, unlabel_loader):
+        self.label_loader = label_loader
+        self.unlabel_loader = unlabel_loader
+
+    def __iter__(self):
+        while True:
+            try:
+                label_samples = next(self.label_loader_iter)
+            except:
+                self.label_loader_iter = iter(self.label_loader)
+                label_samples = next(self.label_loader_iter)
+
+            try:
+                unlabel_samples = next(self.unlabel_loader_iter)
+            except:
+                self.unlabel_loader_iter = iter(self.unlabel_loader)
+                unlabel_samples = next(self.unlabel_loader_iter)
+
+            yield (
+                label_samples[0],  # sup weak
+                label_samples[1],  # sup strong
+                unlabel_samples[0],  # unsup weak
+                unlabel_samples[1]  # unsup strong
+            )
+
+    def __call__(self):
+        return self.__iter__()
+
+
+class BaseSemiDataLoader(object):
+    def __init__(self,
+                 sample_transforms=[],
+                 weak_aug=[],
+                 strong_aug=[],
+                 sup_batch_transforms=[],
+                 unsup_batch_transforms=[],
+                 sup_batch_size=1,
+                 unsup_batch_size=1,
+                 shuffle=True,
+                 drop_last=True,
+                 num_classes=80,
+                 collate_batch=True,
+                 use_shared_memory=False,
+                 **kwargs):
+        # sup transforms
+        self._sample_transforms_label = Compose_SSOD(
+            sample_transforms, weak_aug, strong_aug, num_classes=num_classes)
+        self._batch_transforms_label = BatchCompose_SSOD(
+            sup_batch_transforms, num_classes, collate_batch)
+        self.batch_size_label = sup_batch_size
+
+        # unsup transforms
+        self._sample_transforms_unlabel = Compose_SSOD(
+            sample_transforms, weak_aug, strong_aug, num_classes=num_classes)
+        self._batch_transforms_unlabel = BatchCompose_SSOD(
+            unsup_batch_transforms, num_classes, collate_batch)
+        self.batch_size_unlabel = unsup_batch_size
+
+        # common
+        self.shuffle = shuffle
+        self.drop_last = drop_last
+        self.use_shared_memory = use_shared_memory
+        self.kwargs = kwargs
+
+    def __call__(self,
+                 dataset_label,
+                 dataset_unlabel,
+                 worker_num,
+                 batch_sampler_label=None,
+                 batch_sampler_unlabel=None,
+                 return_list=False):
+        # sup dataset 
+        self.dataset_label = dataset_label
+        self.dataset_label.check_or_download_dataset()
+        self.dataset_label.parse_dataset()
+        self.dataset_label.set_transform(self._sample_transforms_label)
+        self.dataset_label.set_kwargs(**self.kwargs)
+        if batch_sampler_label is None:
+            self._batch_sampler_label = DistributedBatchSampler(
+                self.dataset_label,
+                batch_size=self.batch_size_label,
+                shuffle=self.shuffle,
+                drop_last=self.drop_last)
+        else:
+            self._batch_sampler_label = batch_sampler_label
+
+        # unsup dataset
+        self.dataset_unlabel = dataset_unlabel
+        self.dataset_unlabel.length = self.dataset_label.__len__()
+        self.dataset_unlabel.check_or_download_dataset()
+        self.dataset_unlabel.parse_dataset()
+        self.dataset_unlabel.set_transform(self._sample_transforms_unlabel)
+        self.dataset_unlabel.set_kwargs(**self.kwargs)
+        if batch_sampler_unlabel is None:
+            self._batch_sampler_unlabel = DistributedBatchSampler(
+                self.dataset_unlabel,
+                batch_size=self.batch_size_unlabel,
+                shuffle=self.shuffle,
+                drop_last=self.drop_last)
+        else:
+            self._batch_sampler_unlabel = batch_sampler_unlabel
+
+        # DataLoader do not start sub-process in Windows and Mac
+        # system, do not need to use shared memory
+        use_shared_memory = self.use_shared_memory and \
+                            sys.platform not in ['win32', 'darwin']
+        # check whether shared memory size is bigger than 1G(1024M)
+        if use_shared_memory:
+            shm_size = _get_shared_memory_size_in_M()
+            if shm_size is not None and shm_size < 1024.:
+                logger.warning("Shared memory size is less than 1G, "
+                               "disable shared_memory in DataLoader")
+                use_shared_memory = False
+
+        self.dataloader_label = DataLoader(
+            dataset=self.dataset_label,
+            batch_sampler=self._batch_sampler_label,
+            collate_fn=self._batch_transforms_label,
+            num_workers=worker_num,
+            return_list=return_list,
+            use_shared_memory=use_shared_memory)
+
+        self.dataloader_unlabel = DataLoader(
+            dataset=self.dataset_unlabel,
+            batch_sampler=self._batch_sampler_unlabel,
+            collate_fn=self._batch_transforms_unlabel,
+            num_workers=worker_num,
+            return_list=return_list,
+            use_shared_memory=use_shared_memory)
+
+        self.dataloader = CombineSSODLoader(self.dataloader_label,
+                                            self.dataloader_unlabel)
+        self.loader = iter(self.dataloader)
+        return self
+
+    def __len__(self):
+        return len(self._batch_sampler_label)
+
+    def __iter__(self):
+        return self
+
+    def __next__(self):
+        return next(self.loader)
+
+    def next(self):
+        # python2 compatibility
+        return self.__next__()
+
+
+@register
+class SemiTrainReader(BaseSemiDataLoader):
+    __shared__ = ['num_classes']
+
+    def __init__(self,
+                 sample_transforms=[],
+                 weak_aug=[],
+                 strong_aug=[],
+                 sup_batch_transforms=[],
+                 unsup_batch_transforms=[],
+                 sup_batch_size=1,
+                 unsup_batch_size=1,
+                 shuffle=True,
+                 drop_last=True,
+                 num_classes=80,
+                 collate_batch=True,
+                 **kwargs):
+        super(SemiTrainReader, self).__init__(
+            sample_transforms, weak_aug, strong_aug, sup_batch_transforms,
+            unsup_batch_transforms, sup_batch_size, unsup_batch_size, shuffle,
+            drop_last, num_classes, collate_batch, **kwargs)

+ 1 - 0
paddlers/models/ppdet/data/source/__init__.py

@@ -28,3 +28,4 @@ from .keypoint_coco import *
 from .mot import *
 from .sniper_coco import SniperCOCODataSet
 from .dataset import ImageFolder
+from .pose3d_cmb import *

+ 3 - 0
paddlers/models/ppdet/data/source/category.py

@@ -118,6 +118,9 @@ def get_categories(metric_type, anno_file=None, arch=None):
     ) == 'keypointtopdownmpiieval':
         return (None, {'id': 'keypoint'})
 
+    elif metric_type.lower() == 'pose3deval':
+        return (None, {'id': 'pose3d'})
+
     elif metric_type.lower() in ['mot', 'motdet', 'reid']:
         if anno_file and os.path.isfile(anno_file):
             cats = []
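Editor's note: a small illustration of the new Pose3DEval branch, assuming the module path of this repository; the metric name is matched case-insensitively.

    from paddlers.models.ppdet.data.source.category import get_categories

    clsid2catid, catid2name = get_categories('Pose3DEval')
    print(clsid2catid)  # None, 3D pose evaluation needs no class-id mapping
    print(catid2name)   # {'id': 'pose3d'}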

+ 237 - 3
paddlers/models/ppdet/data/source/coco.py

@@ -13,6 +13,11 @@
 # limitations under the License.
 
 import os
+import copy
+try:
+    from collections.abc import Sequence
+except Exception:
+    from collections import Sequence
 import numpy as np
 from paddlers.models.ppdet.core.workspace import register, serializable
 from .dataset import DetDataset
@@ -20,6 +25,8 @@ from .dataset import DetDataset
 from paddlers.models.ppdet.utils.logger import setup_logger
 logger = setup_logger(__name__)
 
+__all__ = ['COCODataSet', 'SlicedCOCODataSet', 'SemiCOCODataSet']
+
 
 @register
 @serializable
@@ -170,8 +177,10 @@ class COCODataSet(DetDataset):
                 gt_class = np.zeros((num_bbox, 1), dtype=np.int32)
                 is_crowd = np.zeros((num_bbox, 1), dtype=np.int32)
                 gt_poly = [None] * num_bbox
+                gt_track_id = -np.ones((num_bbox, 1), dtype=np.int32)
 
                 has_segmentation = False
+                has_track_id = False
                 for i, box in enumerate(bboxes):
                     catid = box['category_id']
                     gt_class[i][0] = self.catid2clsid[catid]
@@ -181,8 +190,9 @@ class COCODataSet(DetDataset):
                     if 'segmentation' in box and box['iscrowd'] == 1:
                         gt_poly[i] = [[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]]
                     elif 'segmentation' in box and box['segmentation']:
-                        if not np.array(box['segmentation']
-                                        ).size > 0 and not self.allow_empty:
+                        if not np.array(
+                                box['segmentation'],
+                                dtype=object).size > 0 and not self.allow_empty:
                             bboxes.pop(i)
                             gt_poly.pop(i)
                             np.delete(is_crowd, i)
@@ -192,6 +202,10 @@ class COCODataSet(DetDataset):
                             gt_poly[i] = box['segmentation']
                         has_segmentation = True
 
+                    if 'track_id' in box:
+                        gt_track_id[i][0] = box['track_id']
+                        has_track_id = True
+
                 if has_segmentation and not any(
                         gt_poly) and not self.allow_empty:
                     continue
@@ -202,6 +216,8 @@ class COCODataSet(DetDataset):
                     'gt_bbox': gt_bbox,
                     'gt_poly': gt_poly,
                 }
+                if has_track_id:
+                    gt_rec.update({'gt_track_id': gt_track_id})
 
                 for k, v in gt_rec.items():
                     if k in self.data_fields:
@@ -223,7 +239,8 @@ class COCODataSet(DetDataset):
             if self.sample_num > 0 and ct >= self.sample_num:
                 break
         assert ct > 0, 'not found any coco record in %s' % (anno_path)
-        logger.debug('{} samples in file {}'.format(ct, anno_path))
+        logger.info('Load [{} samples valid, {} samples invalid] in file {}.'.
+                    format(ct, len(img_ids) - ct, anno_path))
         if self.allow_empty and len(empty_records) > 0:
             empty_records = self._sample_empty(empty_records, len(records))
             records += empty_records
@@ -351,3 +368,220 @@ class SlicedCOCODataSet(COCODataSet):
             empty_records = self._sample_empty(empty_records, len(records))
             records += empty_records
         self.roidbs = records
+
+
+@register
+@serializable
+class SemiCOCODataSet(COCODataSet):
+    """Semi-COCODataSet used for supervised and unsupervised dataSet"""
+
+    def __init__(self,
+                 dataset_dir=None,
+                 image_dir=None,
+                 anno_path=None,
+                 data_fields=['image'],
+                 sample_num=-1,
+                 load_crowd=False,
+                 allow_empty=False,
+                 empty_ratio=1.,
+                 repeat=1,
+                 supervised=True):
+        super(SemiCOCODataSet, self).__init__(
+            dataset_dir, image_dir, anno_path, data_fields, sample_num,
+            load_crowd, allow_empty, empty_ratio, repeat)
+        self.supervised = supervised
+        self.length = -1  # default -1 means all
+
+    def parse_dataset(self):
+        anno_path = os.path.join(self.dataset_dir, self.anno_path)
+        image_dir = os.path.join(self.dataset_dir, self.image_dir)
+
+        assert anno_path.endswith('.json'), \
+            'invalid coco annotation file: ' + anno_path
+        from pycocotools.coco import COCO
+        coco = COCO(anno_path)
+        img_ids = coco.getImgIds()
+        img_ids.sort()
+        cat_ids = coco.getCatIds()
+        records = []
+        empty_records = []
+        ct = 0
+
+        self.catid2clsid = dict({catid: i for i, catid in enumerate(cat_ids)})
+        self.cname2cid = dict({
+            coco.loadCats(catid)[0]['name']: clsid
+            for catid, clsid in self.catid2clsid.items()
+        })
+
+        if 'annotations' not in coco.dataset or self.supervised == False:
+            self.load_image_only = True
+            logger.warning('Annotation file: {} does not contain ground truth '
+                           'and load image information only.'.format(anno_path))
+
+        for img_id in img_ids:
+            img_anno = coco.loadImgs([img_id])[0]
+            im_fname = img_anno['file_name']
+            im_w = float(img_anno['width'])
+            im_h = float(img_anno['height'])
+
+            im_path = os.path.join(image_dir,
+                                   im_fname) if image_dir else im_fname
+            is_empty = False
+            if not os.path.exists(im_path):
+                logger.warning('Illegal image file: {}, and it will be '
+                               'ignored'.format(im_path))
+                continue
+
+            if im_w < 0 or im_h < 0:
+                logger.warning('Illegal width: {} or height: {} in annotation, '
+                               'and im_id: {} will be ignored'.format(
+                                   im_w, im_h, img_id))
+                continue
+
+            coco_rec = {
+                'im_file': im_path,
+                'im_id': np.array([img_id]),
+                'h': im_h,
+                'w': im_w,
+            } if 'image' in self.data_fields else {}
+
+            if not self.load_image_only:
+                ins_anno_ids = coco.getAnnIds(
+                    imgIds=[img_id], iscrowd=None if self.load_crowd else False)
+                instances = coco.loadAnns(ins_anno_ids)
+
+                bboxes = []
+                is_rbox_anno = False
+                for inst in instances:
+                    # check gt bbox
+                    if inst.get('ignore', False):
+                        continue
+                    if 'bbox' not in inst.keys():
+                        continue
+                    else:
+                        if not any(np.array(inst['bbox'])):
+                            continue
+
+                    x1, y1, box_w, box_h = inst['bbox']
+                    x2 = x1 + box_w
+                    y2 = y1 + box_h
+                    eps = 1e-5
+                    if inst['area'] > 0 and x2 - x1 > eps and y2 - y1 > eps:
+                        inst['clean_bbox'] = [
+                            round(float(x), 3) for x in [x1, y1, x2, y2]
+                        ]
+                        bboxes.append(inst)
+                    else:
+                        logger.warning(
+                            'Found an invalid bbox in annotations: im_id: {}, '
+                            'area: {} x1: {}, y1: {}, x2: {}, y2: {}.'.format(
+                                img_id, float(inst['area']), x1, y1, x2, y2))
+
+                num_bbox = len(bboxes)
+                if num_bbox <= 0 and not self.allow_empty:
+                    continue
+                elif num_bbox <= 0:
+                    is_empty = True
+
+                gt_bbox = np.zeros((num_bbox, 4), dtype=np.float32)
+                gt_class = np.zeros((num_bbox, 1), dtype=np.int32)
+                is_crowd = np.zeros((num_bbox, 1), dtype=np.int32)
+                gt_poly = [None] * num_bbox
+
+                has_segmentation = False
+                for i, box in enumerate(bboxes):
+                    catid = box['category_id']
+                    gt_class[i][0] = self.catid2clsid[catid]
+                    gt_bbox[i, :] = box['clean_bbox']
+                    is_crowd[i][0] = box['iscrowd']
+                    # check RLE format 
+                    if 'segmentation' in box and box['iscrowd'] == 1:
+                        gt_poly[i] = [[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]]
+                    elif 'segmentation' in box and box['segmentation']:
+                        if not np.array(box['segmentation']
+                                        ).size > 0 and not self.allow_empty:
+                            bboxes.pop(i)
+                            gt_poly.pop(i)
+                            np.delete(is_crowd, i)
+                            np.delete(gt_class, i)
+                            np.delete(gt_bbox, i)
+                        else:
+                            gt_poly[i] = box['segmentation']
+                        has_segmentation = True
+
+                if has_segmentation and not any(
+                        gt_poly) and not self.allow_empty:
+                    continue
+
+                gt_rec = {
+                    'is_crowd': is_crowd,
+                    'gt_class': gt_class,
+                    'gt_bbox': gt_bbox,
+                    'gt_poly': gt_poly,
+                }
+
+                for k, v in gt_rec.items():
+                    if k in self.data_fields:
+                        coco_rec[k] = v
+
+                # TODO: remove load_semantic
+                if self.load_semantic and 'semantic' in self.data_fields:
+                    seg_path = os.path.join(self.dataset_dir, 'stuffthingmaps',
+                                            'train2017', im_fname[:-3] + 'png')
+                    coco_rec.update({'semantic': seg_path})
+
+            logger.debug('Load file: {}, im_id: {}, h: {}, w: {}.'.format(
+                im_path, img_id, im_h, im_w))
+            if is_empty:
+                empty_records.append(coco_rec)
+            else:
+                records.append(coco_rec)
+            ct += 1
+            if self.sample_num > 0 and ct >= self.sample_num:
+                break
+        assert ct > 0, 'not found any coco record in %s' % (anno_path)
+        logger.info('Load [{} samples valid, {} samples invalid] in file {}.'.
+                    format(ct, len(img_ids) - ct, anno_path))
+        if self.allow_empty and len(empty_records) > 0:
+            empty_records = self._sample_empty(empty_records, len(records))
+            records += empty_records
+        self.roidbs = records
+
+        if self.supervised:
+            logger.info(f'Use {len(self.roidbs)} sup_samples data as LABELED')
+        else:
+            if self.length > 0:  # unsup length will be decided by sup length
+                all_roidbs = self.roidbs.copy()
+                selected_idxs = [
+                    np.random.choice(len(all_roidbs))
+                    for _ in range(self.length)
+                ]
+                self.roidbs = [all_roidbs[i] for i in selected_idxs]
+            logger.info(
+                f'Use {len(self.roidbs)} unsup_samples data as UNLABELED')
+
+    def __getitem__(self, idx):
+        n = len(self.roidbs)
+        if self.repeat > 1:
+            idx %= n
+        # data batch
+        roidb = copy.deepcopy(self.roidbs[idx])
+        if self.mixup_epoch == 0 or self._epoch < self.mixup_epoch:
+            idx = np.random.randint(n)
+            roidb = [roidb, copy.deepcopy(self.roidbs[idx])]
+        elif self.cutmix_epoch == 0 or self._epoch < self.cutmix_epoch:
+            idx = np.random.randint(n)
+            roidb = [roidb, copy.deepcopy(self.roidbs[idx])]
+        elif self.mosaic_epoch == 0 or self._epoch < self.mosaic_epoch:
+            roidb = [roidb, ] + [
+                copy.deepcopy(self.roidbs[np.random.randint(n)])
+                for _ in range(4)
+            ]
+        if isinstance(roidb, Sequence):
+            for r in roidb:
+                r['curr_iter'] = self._curr_iter
+        else:
+            roidb['curr_iter'] = self._curr_iter
+        self._curr_iter += 1
+
+        return self.transform(roidb)

+ 9 - 1
paddlers/models/ppdet/data/source/dataset.py

@@ -86,6 +86,12 @@ class DetDataset(Dataset):
                 copy.deepcopy(self.roidbs[np.random.randint(n)])
                 for _ in range(4)
             ]
+        elif self.pre_img_epoch == 0 or self._epoch < self.pre_img_epoch:
+            # Add previous image as input, only used in CenterTrack
+            idx_pre_img = idx - 1
+            if idx_pre_img < 0:
+                idx_pre_img = idx + 1
+            roidb = [roidb, ] + [copy.deepcopy(self.roidbs[idx_pre_img])]
         if isinstance(roidb, Sequence):
             for r in roidb:
                 r['curr_iter'] = self._curr_iter
@@ -103,6 +109,7 @@ class DetDataset(Dataset):
         self.mixup_epoch = kwargs.get('mixup_epoch', -1)
         self.cutmix_epoch = kwargs.get('cutmix_epoch', -1)
         self.mosaic_epoch = kwargs.get('mosaic_epoch', -1)
+        self.pre_img_epoch = kwargs.get('pre_img_epoch', -1)
 
     def set_transform(self, transform):
         self.transform = transform
@@ -254,7 +261,8 @@ class ImageFolder(DetDataset):
                 records.append(rec)
             ct_sub += sub_img_num
             ct += 1
-        print('{} samples and slice to {} sub_samples'.format(ct, ct_sub))
+        logger.info('{} samples and slice to {} sub_samples.'.format(ct,
+                                                                     ct_sub))
         self.roidbs = records
 
     def get_label_list(self):
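Editor's note: a tiny self-contained sketch of the previous-image rule added above (only taken when pre_img_epoch is active, i.e. for CenterTrack-style training); the helper name is illustrative.

    # Mirrors the index rule in the new elif branch: use idx - 1 when it
    # exists, otherwise fall back to idx + 1 for the first sample.
    def previous_image_index(idx):
        idx_pre_img = idx - 1
        if idx_pre_img < 0:
            idx_pre_img = idx + 1
        return idx_pre_img

    print([previous_image_index(i) for i in range(4)])  # [1, 0, 1, 2]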

+ 84 - 29
paddlers/models/ppdet/data/source/keypoint_coco.py

@@ -80,7 +80,8 @@ class KeypointBottomUpBaseDataset(DetDataset):
         records = copy.deepcopy(self._get_imganno(idx))
         records['image'] = cv2.imread(records['image_file'])
         records['image'] = cv2.cvtColor(records['image'], cv2.COLOR_BGR2RGB)
-        records['mask'] = (records['mask'] + 0).astype('uint8')
+        if 'mask' in records:
+            records['mask'] = (records['mask'] + 0).astype('uint8')
         records = self.transform(records)
         return records
 
@@ -135,24 +136,37 @@ class KeypointBottomUpCocoDataset(KeypointBottomUpBaseDataset):
                  num_joints,
                  transform=[],
                  shard=[0, 1],
-                 test_mode=False):
+                 test_mode=False,
+                 return_mask=True,
+                 return_bbox=True,
+                 return_area=True,
+                 return_class=True):
         super().__init__(dataset_dir, image_dir, anno_path, num_joints,
                          transform, shard, test_mode)
 
         self.ann_file = os.path.join(dataset_dir, anno_path)
         self.shard = shard
         self.test_mode = test_mode
+        self.return_mask = return_mask
+        self.return_bbox = return_bbox
+        self.return_area = return_area
+        self.return_class = return_class
 
     def parse_dataset(self):
         self.coco = COCO(self.ann_file)
 
         self.img_ids = self.coco.getImgIds()
         if not self.test_mode:
-            self.img_ids = [
-                img_id for img_id in self.img_ids
-                if len(self.coco.getAnnIds(
-                    imgIds=img_id, iscrowd=None)) > 0
-            ]
+            self.img_ids_tmp = []
+            for img_id in self.img_ids:
+                ann_ids = self.coco.getAnnIds(imgIds=img_id)
+                anno = self.coco.loadAnns(ann_ids)
+                anno = [obj for obj in anno if obj['iscrowd'] == 0]
+                if len(anno) == 0:
+                    continue
+                self.img_ids_tmp.append(img_id)
+            self.img_ids = self.img_ids_tmp
+
         blocknum = int(len(self.img_ids) / self.shard[1])
         self.img_ids = self.img_ids[(blocknum * self.shard[0]):(blocknum * (
             self.shard[0] + 1))]
@@ -199,21 +213,31 @@ class KeypointBottomUpCocoDataset(KeypointBottomUpBaseDataset):
         ann_ids = coco.getAnnIds(imgIds=img_id)
         anno = coco.loadAnns(ann_ids)
 
-        mask = self._get_mask(anno, idx)
         anno = [
             obj for obj in anno
-            if obj['iscrowd'] == 0 or obj['num_keypoints'] > 0
+            if obj['iscrowd'] == 0 and obj['num_keypoints'] > 0
         ]
 
+        db_rec = {}
         joints, orgsize = self._get_joints(anno, idx)
+        db_rec['gt_joints'] = joints
+        db_rec['im_shape'] = orgsize
+
+        if self.return_bbox:
+            db_rec['gt_bbox'] = self._get_bboxs(anno, idx)
+
+        if self.return_class:
+            db_rec['gt_class'] = self._get_labels(anno, idx)
+
+        if self.return_area:
+            db_rec['gt_areas'] = self._get_areas(anno, idx)
+
+        if self.return_mask:
+            db_rec['mask'] = self._get_mask(anno, idx)
 
-        db_rec = {}
         db_rec['im_id'] = img_id
         db_rec['image_file'] = os.path.join(self.img_prefix,
                                             self.id2name[img_id])
-        db_rec['mask'] = mask
-        db_rec['joints'] = joints
-        db_rec['im_shape'] = orgsize
 
         return db_rec
 
@@ -229,12 +253,41 @@ class KeypointBottomUpCocoDataset(KeypointBottomUpBaseDataset):
                 np.array(obj['keypoints']).reshape([-1, 3])
 
         img_info = self.coco.loadImgs(self.img_ids[idx])[0]
-        joints[..., 0] /= img_info['width']
-        joints[..., 1] /= img_info['height']
-        orgsize = np.array([img_info['height'], img_info['width']])
+        orgsize = np.array([img_info['height'], img_info['width'], 1])
 
         return joints, orgsize
 
+    def _get_bboxs(self, anno, idx):
+        num_people = len(anno)
+        gt_bboxes = np.zeros((num_people, 4), dtype=np.float32)
+
+        for idx, obj in enumerate(anno):
+            if 'bbox' in obj:
+                gt_bboxes[idx, :] = obj['bbox']
+
+        gt_bboxes[:, 2] += gt_bboxes[:, 0]
+        gt_bboxes[:, 3] += gt_bboxes[:, 1]
+        return gt_bboxes
+
+    def _get_labels(self, anno, idx):
+        num_people = len(anno)
+        gt_labels = np.zeros((num_people, 1), dtype=np.float32)
+
+        for idx, obj in enumerate(anno):
+            if 'category_id' in obj:
+                catid = obj['category_id']
+                gt_labels[idx, 0] = self.catid2clsid[catid]
+        return gt_labels
+
+    def _get_areas(self, anno, idx):
+        num_people = len(anno)
+        gt_areas = np.zeros((num_people, ), dtype=np.float32)
+
+        for idx, obj in enumerate(anno):
+            if 'area' in obj:
+                gt_areas[idx, ] = obj['area']
+        return gt_areas
+
     def _get_mask(self, anno, idx):
         """Get ignore masks to mask out losses."""
         coco = self.coco
@@ -487,9 +540,9 @@ class KeypointTopDownCocoDataset(KeypointTopDownBaseDataset):
                     continue
                     continue
 
 
                 joints = np.zeros(
                 joints = np.zeros(
-                    (self.ann_info['num_joints'], 3), dtype=np.float)
+                    (self.ann_info['num_joints'], 3), dtype=np.float32)
                 joints_vis = np.zeros(
                 joints_vis = np.zeros(
-                    (self.ann_info['num_joints'], 3), dtype=np.float)
+                    (self.ann_info['num_joints'], 3), dtype=np.float32)
                 for ipt in range(self.ann_info['num_joints']):
                 for ipt in range(self.ann_info['num_joints']):
                     joints[ipt, 0] = obj['keypoints'][ipt * 3 + 0]
                     joints[ipt, 0] = obj['keypoints'][ipt * 3 + 0]
                     joints[ipt, 1] = obj['keypoints'][ipt * 3 + 1]
                     joints[ipt, 1] = obj['keypoints'][ipt * 3 + 1]
@@ -506,7 +559,7 @@ class KeypointTopDownCocoDataset(KeypointTopDownBaseDataset):
                     'image_file': os.path.join(self.img_prefix, file_name),
                     'image_file': os.path.join(self.img_prefix, file_name),
                     'center': center,
                     'center': center,
                     'scale': scale,
                     'scale': scale,
-                    'joints': joints,
+                    'gt_joints': joints,
                     'joints_vis': joints_vis,
                     'joints_vis': joints_vis,
                     'im_id': im_id,
                     'im_id': im_id,
                 })
                 })
@@ -560,16 +613,17 @@ class KeypointTopDownCocoDataset(KeypointTopDownBaseDataset):
                 continue
                 continue
 
 
             center, scale = self._box2cs(box)
             center, scale = self._box2cs(box)
-            joints = np.zeros((self.ann_info['num_joints'], 3), dtype=np.float)
+            joints = np.zeros(
+                (self.ann_info['num_joints'], 3), dtype=np.float32)
             joints_vis = np.ones(
             joints_vis = np.ones(
-                (self.ann_info['num_joints'], 3), dtype=np.float)
+                (self.ann_info['num_joints'], 3), dtype=np.float32)
             kpt_db.append({
             kpt_db.append({
                 'image_file': img_name,
                 'image_file': img_name,
                 'im_id': im_id,
                 'im_id': im_id,
                 'center': center,
                 'center': center,
                 'scale': scale,
                 'scale': scale,
                 'score': score,
                 'score': score,
-                'joints': joints,
+                'gt_joints': joints,
                 'joints_vis': joints_vis,
                 'joints_vis': joints_vis,
             })
             })
 
 
@@ -633,8 +687,8 @@ class KeypointTopDownMPIIDataset(KeypointTopDownBaseDataset):
             im_id = a['image_id'] if 'image_id' in a else int(
             im_id = a['image_id'] if 'image_id' in a else int(
                 os.path.splitext(image_name)[0])
                 os.path.splitext(image_name)[0])
 
 
-            c = np.array(a['center'], dtype=np.float)
-            s = np.array([a['scale'], a['scale']], dtype=np.float)
+            c = np.array(a['center'], dtype=np.float32)
+            s = np.array([a['scale'], a['scale']], dtype=np.float32)
 
 
             # Adjust center/scale slightly to avoid cropping limbs
             # Adjust center/scale slightly to avoid cropping limbs
             if c[0] != -1:
             if c[0] != -1:
@@ -642,11 +696,12 @@ class KeypointTopDownMPIIDataset(KeypointTopDownBaseDataset):
                 s = s * 1.25
                 s = s * 1.25
             c = c - 1
             c = c - 1
 
 
-            joints = np.zeros((self.ann_info['num_joints'], 3), dtype=np.float)
+            joints = np.zeros(
+                (self.ann_info['num_joints'], 3), dtype=np.float32)
             joints_vis = np.zeros(
             joints_vis = np.zeros(
-                (self.ann_info['num_joints'], 3), dtype=np.float)
-            if 'joints' in a:
-                joints_ = np.array(a['joints'])
+                (self.ann_info['num_joints'], 3), dtype=np.float32)
+            if 'gt_joints' in a:
+                joints_ = np.array(a['gt_joints'])
                 joints_[:, 0:2] = joints_[:, 0:2] - 1
                 joints_[:, 0:2] = joints_[:, 0:2] - 1
                 joints_vis_ = np.array(a['joints_vis'])
                 joints_vis_ = np.array(a['joints_vis'])
                 assert len(joints_) == self.ann_info[
                 assert len(joints_) == self.ann_info[
@@ -662,7 +717,7 @@ class KeypointTopDownMPIIDataset(KeypointTopDownBaseDataset):
                 'im_id': im_id,
                 'im_id': im_id,
                 'center': c,
                 'center': c,
                 'scale': s,
                 'scale': s,
-                'joints': joints,
+                'gt_joints': joints,
                 'joints_vis': joints_vis
                 'joints_vis': joints_vis
             })
             })
         print("number length: {}".format(len(gt_db)))
         print("number length: {}".format(len(gt_db)))

+ 380 - 0
paddlers/models/ppdet/data/source/pose3d_cmb.py

@@ -0,0 +1,380 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. 
+#   
+# Licensed under the Apache License, Version 2.0 (the "License");   
+# you may not use this file except in compliance with the License.  
+# You may obtain a copy of the License at   
+#   
+#     http://www.apache.org/licenses/LICENSE-2.0    
+#   
+# Unless required by applicable law or agreed to in writing, software   
+# distributed under the License is distributed on an "AS IS" BASIS, 
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.  
+# See the License for the specific language governing permissions and   
+# limitations under the License.
+
+import os
+import cv2
+import numpy as np
+import json
+import copy
+import pycocotools
+from pycocotools.coco import COCO
+from .dataset import DetDataset
+from paddlers.models.ppdet.core.workspace import register, serializable
+from paddle.io import Dataset
+
+
+@serializable
+class Pose3DDataset(DetDataset):
+    """Pose3D Dataset class. 
+
+    Args:
+        dataset_dir (str): Root path to the dataset.
+        anno_list (list of str): each element is a relative path to an annotation file.
+        image_dirs (list of str): each element is a relative path to a directory where images are held.
+        transform (composed(operators)): A sequence of data transforms.
+        test_mode (bool): Store True when building test or
+            validation dataset. Default: False.
+        24 joints order:
+        0-2: 'R_Ankle', 'R_Knee', 'R_Hip', 
+        3-5:'L_Hip', 'L_Knee', 'L_Ankle', 
+        6-8:'R_Wrist', 'R_Elbow', 'R_Shoulder', 
+        9-11:'L_Shoulder','L_Elbow','L_Wrist',
+        12-14:'Neck','Top_of_Head','Pelvis',
+        15-18:'Thorax','Spine','Jaw','Head',
+        19-23:'Nose','L_Eye','R_Eye','L_Ear','R_Ear'
+    """
+
+    def __init__(self,
+                 dataset_dir,
+                 image_dirs,
+                 anno_list,
+                 transform=[],
+                 num_joints=24,
+                 test_mode=False):
+        super().__init__(dataset_dir, image_dirs, anno_list)
+        self.image_info = {}
+        self.ann_info = {}
+        self.num_joints = num_joints
+
+        self.transform = transform
+        self.test_mode = test_mode
+
+        self.img_ids = []
+        self.dataset_dir = dataset_dir
+        self.image_dirs = image_dirs
+        self.anno_list = anno_list
+
+    def get_mask(self, mvm_percent=0.3):
+        num_joints = self.num_joints
+        mjm_mask = np.ones((num_joints, 1)).astype(np.float32)
+        if self.test_mode == False:
+            pb = np.random.random_sample()
+            masked_num = int(
+                pb * mvm_percent *
+                num_joints)  # at most x% of the joints could be masked
+            indices = np.random.choice(
+                np.arange(num_joints), replace=False, size=masked_num)
+            mjm_mask[indices, :] = 0.0
+        # return mjm_mask
+
+        num_joints = 10
+        mvm_mask = np.ones((num_joints, 1)).astype(np.float32)
+        if self.test_mode == False:
+            num_vertices = num_joints
+            pb = np.random.random_sample()
+            masked_num = int(
+                pb * mvm_percent *
+                num_vertices)  # at most x% of the vertices could be masked
+            indices = np.random.choice(
+                np.arange(num_vertices), replace=False, size=masked_num)
+            mvm_mask[indices, :] = 0.0
+
+        mjm_mask = np.concatenate([mjm_mask, mvm_mask], axis=0)
+        return mjm_mask
+
+    def filterjoints(self, x):
+        if self.num_joints == 24:
+            return x
+        elif self.num_joints == 14:
+            return x[[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 18], :]
+        elif self.num_joints == 17:
+            return x[
+                [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 14, 15, 18, 19], :]
+        else:
+            raise ValueError(
+                "unsupported joint numbers, only [24 or 17 or 14] is supported!")
+
+    def parse_dataset(self):
+        print("Loading annotations..., please wait")
+        self.annos = []
+        im_id = 0
+        self.human36m_num = 0
+        for idx, annof in enumerate(self.anno_list):
+            img_prefix = os.path.join(self.dataset_dir, self.image_dirs[idx])
+            dataf = os.path.join(self.dataset_dir, annof)
+            with open(dataf, 'r') as rf:
+                anno_data = json.load(rf)
+                annos = anno_data['data']
+                new_annos = []
+                print("{} has annos numbers: {}".format(dataf, len(annos)))
+                for anno in annos:
+                    new_anno = {}
+                    new_anno['im_id'] = im_id
+                    im_id += 1
+                    imagename = anno['imageName']
+                    if imagename.startswith("COCO_train2014_"):
+                        imagename = imagename[len("COCO_train2014_"):]
+                    elif imagename.startswith("COCO_val2014_"):
+                        imagename = imagename[len("COCO_val2014_"):]
+                    imagename = os.path.join(img_prefix, imagename)
+                    if not os.path.exists(imagename):
+                        if "train2017" in imagename:
+                            imagename = imagename.replace("train2017",
+                                                          "val2017")
+                            if not os.path.exists(imagename):
+                                print("cannot find imagepath:{}".format(
+                                    imagename))
+                                continue
+                        else:
+                            print("cannot find imagepath:{}".format(imagename))
+                            continue
+                    new_anno['imageName'] = imagename
+                    if 'human3.6m' in imagename:
+                        self.human36m_num += 1
+                    new_anno['bbox_center'] = anno['bbox_center']
+                    new_anno['bbox_scale'] = anno['bbox_scale']
+                    new_anno['joints_2d'] = np.array(anno[
+                        'gt_keypoint_2d']).astype(np.float32)
+                    if new_anno['joints_2d'].shape[0] == 49:
+                        # if joints_2d is in SPIN format (generated by eft), keep the last 24 public joints
+                        # for details, refer to: https://github.com/nkolot/SPIN/blob/master/constants.py
+                        new_anno['joints_2d'] = new_anno['joints_2d'][25:]
+                    new_anno['joints_3d'] = np.array(anno[
+                        'pose3d'])[:, :3].astype(np.float32)
+                    new_anno['mjm_mask'] = self.get_mask()
+                    if not 'has_3d_joints' in anno:
+                        new_anno['has_3d_joints'] = int(1)
+                        new_anno['has_2d_joints'] = int(1)
+                    else:
+                        new_anno['has_3d_joints'] = int(anno['has_3d_joints'])
+                        new_anno['has_2d_joints'] = int(anno['has_2d_joints'])
+                    new_anno['joints_2d'] = self.filterjoints(new_anno[
+                        'joints_2d'])
+                    self.annos.append(new_anno)
+                del annos
+
+    def get_temp_num(self):
+        """get temporal data number, like human3.6m"""
+        return self.human36m_num
+
+    def __len__(self):
+        """Get dataset length."""
+        return len(self.annos)
+
+    def _get_imganno(self, idx):
+        """Get anno for a single image."""
+        return self.annos[idx]
+
+    def __getitem__(self, idx):
+        """Prepare image for training given the index."""
+        records = copy.deepcopy(self._get_imganno(idx))
+        imgpath = records['imageName']
+        assert os.path.exists(imgpath), "cannot find image {}".format(imgpath)
+        records['image'] = cv2.imread(imgpath)
+        records['image'] = cv2.cvtColor(records['image'], cv2.COLOR_BGR2RGB)
+        records = self.transform(records)
+        return records
+
+    def check_or_download_dataset(self):
+        alldatafind = True
+        for image_dir in self.image_dirs:
+            image_dir = os.path.join(self.dataset_dir, image_dir)
+            if not os.path.isdir(image_dir):
+                print("dataset [{}] is not found".format(image_dir))
+                alldatafind = False
+        if not alldatafind:
+            raise ValueError(
+                "Some dataset is not valid and cannot download automatically now, please prepare the dataset first"
+            )
+
+
+@register
+@serializable
+class Keypoint3DMultiFramesDataset(Dataset):
+    """24 keypoints 3D dataset for pose estimation. 
+
+    each item is a list of images
+
+    The dataset loads raw features and apply specified transforms
+    to return a dict containing the image tensors and other information.
+
+    Args:
+        dataset_dir (str): Root path to the dataset.
+        image_dir (str): Path to a directory where images are held.
+    """
+
+    def __init__(
+            self,
+            dataset_dir,  # dataset root directory
+            image_dir,  # image directory
+            p3d_dir,  # directory of 3D keypoint files
+            json_path,
+            img_size,  # image resize size
+            num_frames,  # length of the frame sequence
+            anno_path=None, ):
+
+        self.dataset_dir = dataset_dir
+        self.image_dir = image_dir
+        self.p3d_dir = p3d_dir
+        self.json_path = json_path
+        self.img_size = img_size
+        self.num_frames = num_frames
+        self.anno_path = anno_path
+
+        self.data_labels, self.mf_inds = self._generate_multi_frames_list()
+
+    def _generate_multi_frames_list(self):
+        act_list = os.listdir(self.dataset_dir)  # list of actions
+        count = 0
+        mf_list = []
+        annos_dict = {'images': [], 'annotations': [], 'act_inds': []}
+        for act in act_list:  # generate a frame sequence for each action
+            if '.' in act:
+                continue
+
+            json_path = os.path.join(self.dataset_dir, act, self.json_path)
+            with open(json_path, 'r') as j:
+                annos = json.load(j)
+            length = len(annos['images'])
+            for k, v in annos.items():
+                if k in annos_dict:
+                    annos_dict[k].extend(v)
+            annos_dict['act_inds'].extend([act] * length)
+
+            mf = [[i + j + count for j in range(self.num_frames)]
+                  for i in range(0, length - self.num_frames + 1)]
+            mf_list.extend(mf)
+            count += length
+
+        print("total data number:", len(mf_list))
+        return annos_dict, mf_list
+
+    def __call__(self, *args, **kwargs):
+        return self
+
+    def __getitem__(self, index):  # fetch one consecutive frame sequence
+        inds = self.mf_inds[
+            index]  # e.g. [568, 569, 570, 571, 572, 573], length is num_frames
+
+        images = self.data_labels['images']  # all images
+        annots = self.data_labels['annotations']  # all annots
+
+        act = self.data_labels['act_inds'][inds[0]]  # action name (folder name)
+
+        kps3d_list = []
+        kps3d_vis_list = []
+        names = []
+
+        h, w = 0, 0
+        for ind in inds:  # one image
+            height = float(images[ind]['height'])
+            width = float(images[ind]['width'])
+            name = images[ind]['file_name']  # image file name, including extension
+
+            kps3d_name = name.split('.')[0] + '.obj'
+            kps3d_path = os.path.join(self.dataset_dir, act, self.p3d_dir,
+                                      kps3d_name)
+
+            joints, joints_vis = self.kps3d_process(kps3d_path)
+            joints_vis = np.array(joints_vis, dtype=np.float32)
+
+            kps3d_list.append(joints)
+            kps3d_vis_list.append(joints_vis)
+            names.append(name)
+
+        kps3d = np.array(kps3d_list)  # (6, 24, 3),(num_frames, joints_num, 3)
+        kps3d_vis = np.array(kps3d_vis_list)
+
+        # read image
+        imgs = []
+        for name in names:
+            img_path = os.path.join(self.dataset_dir, act, self.image_dir, name)
+
+            image = cv2.imread(img_path, cv2.IMREAD_COLOR |
+                               cv2.IMREAD_IGNORE_ORIENTATION)
+            image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
+
+            imgs.append(np.expand_dims(image, axis=0))
+
+        imgs = np.concatenate(imgs, axis=0)
+        imgs = imgs.astype(
+            np.float32)  # (6, 1080, 1920, 3),(num_frames, h, w, c)
+
+        # note: at this point the images and annotations are mirrored
+        records = {
+            'kps3d': kps3d,
+            'kps3d_vis': kps3d_vis,
+            "image": imgs,
+            'act': act,
+            'names': names,
+            'im_id': index
+        }
+
+        return self.transform(records)
+
+    def kps3d_process(self, kps3d_path):
+        count = 0
+        kps = []
+        kps_vis = []
+
+        with open(kps3d_path, 'r') as f:
+            lines = f.readlines()
+            for line in lines:
+                if line[0] == 'v':
+                    kps.append([])
+                    line = line.strip('\n').split(' ')[1:]
+                    for kp in line:
+                        kps[-1].append(float(kp))
+                    count += 1
+
+                    kps_vis.append([1, 1, 1])
+
+        kps = np.array(kps)  # 52,3
+        kps_vis = np.array(kps_vis)
+
+        kps *= 10  # scale points
+        kps -= kps[[0], :]  # set root point to zero
+
+        kps = np.concatenate((kps[0:23], kps[[37]]), axis=0)  # 24,3
+
+        kps *= 10
+
+        kps_vis = np.concatenate((kps_vis[0:23], kps_vis[[37]]), axis=0)  # 24,3
+
+        return kps, kps_vis
+
+    def __len__(self):
+        return len(self.mf_inds)
+
+    def get_anno(self):
+        if self.anno_path is None:
+            return
+        return os.path.join(self.dataset_dir, self.anno_path)
+
+    def check_or_download_dataset(self):
+        return
+
+    def parse_dataset(self, ):
+        return
+
+    def set_transform(self, transform):
+        self.transform = transform
+
+    def set_epoch(self, epoch_id):
+        self._epoch = epoch_id
+
+    def set_kwargs(self, **kwargs):
+        self.mixup_epoch = kwargs.get('mixup_epoch', -1)
+        self.cutmix_epoch = kwargs.get('cutmix_epoch', -1)
+        self.mosaic_epoch = kwargs.get('mosaic_epoch', -1)
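A minimal, hypothetical usage sketch of the new Pose3DDataset follows; directory and file names are placeholders, and the transform is stubbed with an identity callable instead of a composed operator pipeline, so this only exercises annotation parsing, not training.

from paddlers.models.ppdet.data.source.pose3d_cmb import Pose3DDataset

dataset = Pose3DDataset(
    dataset_dir='data/pose3d',                                # hypothetical root
    image_dirs=['hr-lspet/images', 'human3.6m/images'],       # hypothetical sub-dirs
    anno_list=['hr-lspet/anno.json', 'human3.6m/anno.json'],  # hypothetical files
    transform=lambda records: records,                        # stand-in for real operators
    num_joints=24,
    test_mode=False)

dataset.check_or_download_dataset()  # verifies every image dir exists
dataset.parse_dataset()              # builds dataset.annos from the JSON files
print(len(dataset), dataset.get_temp_num())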

+ 2 - 0
paddlers/models/ppdet/data/transform/__init__.py

@@ -17,12 +17,14 @@ from . import batch_operators
 from . import keypoint_operators
 from . import keypoint_operators
 from . import mot_operators
 from . import mot_operators
 from . import rotated_operators
 from . import rotated_operators
+from . import keypoints_3d_operators
 
 
 from .operators import *
 from .operators import *
 from .batch_operators import *
 from .batch_operators import *
 from .keypoint_operators import *
 from .keypoint_operators import *
 from .mot_operators import *
 from .mot_operators import *
 from .rotated_operators import *
 from .rotated_operators import *
+from .keypoints_3d_operators import *
 
 
 __all__ = []
 __all__ = []
 __all__ += registered_ops
 __all__ += registered_ops

+ 159 - 7
paddlers/models/ppdet/data/transform/atss_assigner.py

@@ -43,7 +43,8 @@ def bbox_overlaps(bboxes1, bboxes2, mode='iou', is_aligned=False, eps=1e-6):
     Returns:
     Returns:
         Tensor: shape (m, n) if ``is_aligned `` is False else shape (m,)
         Tensor: shape (m, n) if ``is_aligned `` is False else shape (m,)
     """
     """
-    assert mode in ['iou', 'iof', 'giou'], 'Unsupported mode {}'.format(mode)
+    assert mode in ['iou', 'iof', 'giou', 'diou'], 'Unsupported mode {}'.format(
+        mode)
     # Either the boxes are empty or the length of boxes' last dimension is 4
     # Either the boxes are empty or the length of boxes' last dimension is 4
     assert (bboxes1.shape[-1] == 4 or bboxes1.shape[0] == 0)
     assert (bboxes1.shape[-1] == 4 or bboxes1.shape[0] == 0)
     assert (bboxes2.shape[-1] == 4 or bboxes2.shape[0] == 0)
     assert (bboxes2.shape[-1] == 4 or bboxes2.shape[0] == 0)
@@ -83,6 +84,13 @@ def bbox_overlaps(bboxes1, bboxes2, mode='iou', is_aligned=False, eps=1e-6):
         if mode == 'giou':
         if mode == 'giou':
             enclosed_lt = np.minimum(bboxes1[..., :2], bboxes2[..., :2])
             enclosed_lt = np.minimum(bboxes1[..., :2], bboxes2[..., :2])
             enclosed_rb = np.maximum(bboxes1[..., 2:], bboxes2[..., 2:])
             enclosed_rb = np.maximum(bboxes1[..., 2:], bboxes2[..., 2:])
+        if mode == 'diou':
+            enclosed_lt = np.minimum(bboxes1[..., :2], bboxes2[..., :2])
+            enclosed_rb = np.maximum(bboxes1[..., 2:], bboxes2[..., 2:])
+            b1_x1, b1_y1 = bboxes1[..., 0], bboxes1[..., 1]
+            b1_x2, b1_y2 = bboxes1[..., 2], bboxes1[..., 3]
+            b2_x1, b2_y1 = bboxes2[..., 0], bboxes2[..., 1]
+            b2_x2, b2_y2 = bboxes2[..., 2], bboxes2[..., 3]
     else:
     else:
         lt = np.maximum(bboxes1[..., :, None, :2],
         lt = np.maximum(bboxes1[..., :, None, :2],
                         bboxes2[..., None, :, :2])  # [B, rows, cols, 2]
                         bboxes2[..., None, :, :2])  # [B, rows, cols, 2]
@@ -101,6 +109,15 @@ def bbox_overlaps(bboxes1, bboxes2, mode='iou', is_aligned=False, eps=1e-6):
                                      bboxes2[..., None, :, :2])
                                      bboxes2[..., None, :, :2])
             enclosed_rb = np.maximum(bboxes1[..., :, None, 2:],
             enclosed_rb = np.maximum(bboxes1[..., :, None, 2:],
                                      bboxes2[..., None, :, 2:])
                                      bboxes2[..., None, :, 2:])
+        if mode == 'diou':
+            enclosed_lt = np.minimum(bboxes1[..., :, None, :2],
+                                     bboxes2[..., None, :, :2])
+            enclosed_rb = np.maximum(bboxes1[..., :, None, 2:],
+                                     bboxes2[..., None, :, 2:])
+            b1_x1, b1_y1 = bboxes1[..., :, None, 0], bboxes1[..., :, None, 1]
+            b1_x2, b1_y2 = bboxes1[..., :, None, 2], bboxes1[..., :, None, 3]
+            b2_x1, b2_y1 = bboxes2[..., None, :, 0], bboxes2[..., None, :, 1]
+            b2_x2, b2_y2 = bboxes2[..., None, :, 2], bboxes2[..., None, :, 3]
 
 
     eps = np.array([eps])
     eps = np.array([eps])
     union = np.maximum(union, eps)
     union = np.maximum(union, eps)
@@ -108,18 +125,32 @@ def bbox_overlaps(bboxes1, bboxes2, mode='iou', is_aligned=False, eps=1e-6):
     if mode in ['iou', 'iof']:
     if mode in ['iou', 'iof']:
         return ious
         return ious
     # calculate gious
     # calculate gious
-    enclose_wh = (enclosed_rb - enclosed_lt).clip(min=0)
-    enclose_area = enclose_wh[..., 0] * enclose_wh[..., 1]
-    enclose_area = np.maximum(enclose_area, eps)
-    gious = ious - (enclose_area - union) / enclose_area
-    return gious
+    if mode in ['giou']:
+        enclose_wh = (enclosed_rb - enclosed_lt).clip(min=0)
+        enclose_area = enclose_wh[..., 0] * enclose_wh[..., 1]
+        enclose_area = np.maximum(enclose_area, eps)
+        gious = ious - (enclose_area - union) / enclose_area
+        return gious
+    if mode in ['diou']:
+        left = ((b2_x1 + b2_x2) - (b1_x1 + b1_x2))**2 / 4
+        right = ((b2_y1 + b2_y2) - (b1_y1 + b1_y2))**2 / 4
+        rho2 = left + right
+        enclose_wh = (enclosed_rb - enclosed_lt).clip(min=0)
+        enclose_c = enclose_wh[..., 0]**2 + enclose_wh[..., 1]**2
+        enclose_c = np.maximum(enclose_c, eps)
+        dious = ious - rho2 / enclose_c
+        return dious
 
 
 
 
 def topk_(input, k, axis=1, largest=True):
 def topk_(input, k, axis=1, largest=True):
     x = -input if largest else input
     x = -input if largest else input
     if axis == 0:
     if axis == 0:
         row_index = np.arange(input.shape[1 - axis])
         row_index = np.arange(input.shape[1 - axis])
-        topk_index = np.argpartition(x, k, axis=axis)[0:k, :]
+        if k == x.shape[0]:  # argpartition requires index < len(input)
+            topk_index = np.argpartition(x, k - 1, axis=axis)[0:k, :]
+        else:
+            topk_index = np.argpartition(x, k, axis=axis)[0:k, :]
+
         topk_data = x[topk_index, row_index]
         topk_data = x[topk_index, row_index]
 
 
         topk_index_sort = np.argsort(topk_data, axis=axis)
         topk_index_sort = np.argsort(topk_data, axis=axis)
@@ -267,3 +298,124 @@ class ATSSAssigner(object):
                          -np.inf] = argmax_overlaps[max_overlaps != -np.inf] + 1
                          -np.inf] = argmax_overlaps[max_overlaps != -np.inf] + 1
 
 
         return assigned_gt_inds, max_overlaps
         return assigned_gt_inds, max_overlaps
+
+    def get_vlr_region(self,
+                       bboxes,
+                       num_level_bboxes,
+                       gt_bboxes,
+                       gt_bboxes_ignore=None,
+                       gt_labels=None):
+        """get vlr region for ld distillation.
+        Args:
+            bboxes (np.array): Bounding boxes to be assigned, shape(n, 4).
+            num_level_bboxes (List): num of bboxes in each level
+            gt_bboxes (np.array): Groundtruth boxes, shape (k, 4).
+            gt_bboxes_ignore (np.array, optional): Ground truth bboxes that are
+                labelled as `ignored`, e.g., crowd boxes in COCO.
+            gt_labels (np.array, optional): Label of gt_bboxes, shape (k, ).
+        """
+        bboxes = bboxes[:, :4]
+
+        num_gt, num_bboxes = gt_bboxes.shape[0], bboxes.shape[0]
+
+        # compute iou between all bbox and gt
+        overlaps = bbox_overlaps(bboxes, gt_bboxes)
+
+        # compute diou between all bbox and gt
+        diou = bbox_overlaps(bboxes, gt_bboxes, mode='diou')
+
+        # assign 0 by default
+        assigned_gt_inds = np.zeros((num_bboxes, ), dtype=np.int64)
+
+        vlr_region_iou = (assigned_gt_inds + 0).astype(np.float32)
+
+        if num_gt == 0 or num_bboxes == 0:
+            # No ground truth or boxes, return empty assignment
+            max_overlaps = np.zeros((num_bboxes, ))
+            if num_gt == 0:
+                # No truth, assign everything to background
+                assigned_gt_inds[:] = 0
+            if not np.any(gt_labels):
+                assigned_labels = None
+            else:
+                assigned_labels = -np.ones((num_bboxes, ), dtype=np.int64)
+            return assigned_gt_inds, max_overlaps
+
+        # compute center distance between all bbox and gt
+        gt_cx = (gt_bboxes[:, 0] + gt_bboxes[:, 2]) / 2.0
+        gt_cy = (gt_bboxes[:, 1] + gt_bboxes[:, 3]) / 2.0
+        gt_points = np.stack((gt_cx, gt_cy), axis=1)
+
+        bboxes_cx = (bboxes[:, 0] + bboxes[:, 2]) / 2.0
+        bboxes_cy = (bboxes[:, 1] + bboxes[:, 3]) / 2.0
+        bboxes_points = np.stack((bboxes_cx, bboxes_cy), axis=1)
+
+        distances = np.sqrt(
+            np.power((bboxes_points[:, None, :] - gt_points[None, :, :]), 2)
+            .sum(-1))
+
+        # Selecting candidates based on the center distance
+        candidate_idxs = []
+        candidate_idxs_t = []
+        start_idx = 0
+        for bboxes_per_level in num_level_bboxes:
+            # on each pyramid level, for each gt,
+            # select k bbox whose center are closest to the gt center
+            end_idx = start_idx + bboxes_per_level
+            distances_per_level = distances[start_idx:end_idx, :]
+            selectable_t = min(self.topk, bboxes_per_level)
+            selectable_k = bboxes_per_level  #k for all
+            _, topt_idxs_per_level = topk_(
+                distances_per_level, selectable_t, axis=0, largest=False)
+            _, topk_idxs_per_level = topk_(
+                distances_per_level, selectable_k, axis=0, largest=False)
+            candidate_idxs_t.append(topt_idxs_per_level + start_idx)
+            candidate_idxs.append(topk_idxs_per_level + start_idx)
+            start_idx = end_idx
+
+        candidate_idxs_t = np.concatenate(candidate_idxs_t, axis=0)
+        candidate_idxs = np.concatenate(candidate_idxs, axis=0)
+
+        # get corresponding iou for these candidates, and compute the
+        # mean and std, set mean + std as the iou threshold
+        candidate_overlaps_t = overlaps[candidate_idxs_t, np.arange(num_gt)]
+
+        # compute tdiou
+        t_diou = diou[candidate_idxs, np.arange(num_gt)]
+
+        overlaps_mean_per_gt = candidate_overlaps_t.mean(0)
+        overlaps_std_per_gt = candidate_overlaps_t.std(
+            0, ddof=1)  # NOTE: use Bessel correction
+        overlaps_thr_per_gt = overlaps_mean_per_gt + overlaps_std_per_gt
+
+        # compute region        
+        is_pos = (t_diou < overlaps_thr_per_gt[None, :]) & (
+            t_diou >= 0.25 * overlaps_thr_per_gt[None, :])
+
+        # limit the positive sample's center in gt
+        for gt_idx in range(num_gt):
+            candidate_idxs[:, gt_idx] += gt_idx * num_bboxes
+
+        candidate_idxs = candidate_idxs.reshape(-1)
+
+        # if an anchor box is assigned to multiple gts,
+        # the one with the highest IoU will be selected.
+        overlaps_inf = -np.inf * np.ones_like(overlaps).T.reshape(-1)
+        index = candidate_idxs.reshape(-1)[is_pos.reshape(-1)]
+
+        overlaps_inf[index] = overlaps.T.reshape(-1)[index]
+        overlaps_inf = overlaps_inf.reshape(num_gt, -1).T
+
+        max_overlaps = overlaps_inf.max(axis=1)
+        argmax_overlaps = overlaps_inf.argmax(axis=1)
+
+        overlaps_inf = -np.inf * np.ones_like(overlaps).T.reshape(-1)
+        overlaps_inf = overlaps_inf.reshape(num_gt, -1).T
+
+        assigned_gt_inds[max_overlaps !=
+                         -np.inf] = argmax_overlaps[max_overlaps != -np.inf] + 1
+
+        vlr_region_iou[max_overlaps !=
+                       -np.inf] = max_overlaps[max_overlaps != -np.inf] + 0
+
+        return vlr_region_iou
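As a quick sanity check of the new 'diou' mode, the standalone NumPy sketch below recomputes DIoU for a single pair of axis-aligned boxes using the same formula as above (IoU minus squared center distance over the squared diagonal of the enclosing box); it is illustration only, not part of the patch.

import numpy as np

def diou_single(b1, b2, eps=1e-6):
    # b1, b2 are [x1, y1, x2, y2]
    iw = max(min(b1[2], b2[2]) - max(b1[0], b2[0]), 0.)
    ih = max(min(b1[3], b2[3]) - max(b1[1], b2[1]), 0.)
    inter = iw * ih
    area1 = (b1[2] - b1[0]) * (b1[3] - b1[1])
    area2 = (b2[2] - b2[0]) * (b2[3] - b2[1])
    iou = inter / max(area1 + area2 - inter, eps)
    # squared distance between the two box centers
    rho2 = ((b2[0] + b2[2]) - (b1[0] + b1[2]))**2 / 4 \
         + ((b2[1] + b2[3]) - (b1[1] + b1[3]))**2 / 4
    # squared diagonal of the smallest enclosing box
    cw = max(b1[2], b2[2]) - min(b1[0], b2[0])
    ch = max(b1[3], b2[3]) - min(b1[1], b2[1])
    c2 = max(cw**2 + ch**2, eps)
    return iou - rho2 / c2

print(diou_single(np.array([0., 0., 10., 10.]), np.array([5., 5., 15., 15.])))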

+ 359 - 42
paddlers/models/ppdet/data/transform/batch_operators.py

@@ -24,6 +24,7 @@ except Exception:
     from collections import Sequence
     from collections import Sequence
 
 
 import cv2
 import cv2
+import copy
 import math
 import math
 import numpy as np
 import numpy as np
 from .operators import register_op, BaseOperator, Resize
 from .operators import register_op, BaseOperator, Resize
@@ -43,10 +44,11 @@ __all__ = [
     'Gt2FCOSTarget',
     'Gt2FCOSTarget',
     'Gt2TTFTarget',
     'Gt2TTFTarget',
     'Gt2Solov2Target',
     'Gt2Solov2Target',
-    'Gt2SparseRCNNTarget',
+    'Gt2SparseTarget',
     'PadMaskBatch',
     'PadMaskBatch',
     'Gt2GFLTarget',
     'Gt2GFLTarget',
     'Gt2CenterNetTarget',
     'Gt2CenterNetTarget',
+    'Gt2CenterTrackTarget',
     'PadGT',
     'PadGT',
     'PadRGT',
     'PadRGT',
 ]
 ]
@@ -169,6 +171,7 @@ class BatchRandomResize(BaseOperator):
 
 
 @register_op
 @register_op
 class Gt2YoloTarget(BaseOperator):
 class Gt2YoloTarget(BaseOperator):
+    __shared__ = ['num_classes']
     """
     """
     Generate YOLOv3 targets by groud truth data, this operator is only used in
     Generate YOLOv3 targets by ground truth data, this operator is only used in
     Generate YOLOv3 targets by ground truth data, this operator is only used in
     fine grained YOLOv3 loss mode
@@ -292,7 +295,9 @@ class Gt2FCOSTarget(BaseOperator):
                  object_sizes_boundary,
                  object_sizes_boundary,
                  center_sampling_radius,
                  center_sampling_radius,
                  downsample_ratios,
                  downsample_ratios,
-                 norm_reg_targets=False):
+                 num_shift=0.5,
+                 multiply_strides_reg_targets=False,
+                 norm_reg_targets=True):
         super(Gt2FCOSTarget, self).__init__()
         super(Gt2FCOSTarget, self).__init__()
         self.center_sampling_radius = center_sampling_radius
         self.center_sampling_radius = center_sampling_radius
         self.downsample_ratios = downsample_ratios
         self.downsample_ratios = downsample_ratios
@@ -304,6 +309,8 @@ class Gt2FCOSTarget(BaseOperator):
                 self.object_sizes_boundary[i], self.object_sizes_boundary[i + 1]
                 self.object_sizes_boundary[i], self.object_sizes_boundary[i + 1]
             ])
             ])
         self.object_sizes_of_interest = object_sizes_of_interest
         self.object_sizes_of_interest = object_sizes_of_interest
+        self.num_shift = num_shift
+        self.multiply_strides_reg_targets = multiply_strides_reg_targets
         self.norm_reg_targets = norm_reg_targets
         self.norm_reg_targets = norm_reg_targets
 
 
     def _compute_points(self, w, h):
     def _compute_points(self, w, h):
@@ -320,7 +327,8 @@ class Gt2FCOSTarget(BaseOperator):
             shift_x, shift_y = np.meshgrid(shift_x, shift_y)
             shift_x, shift_y = np.meshgrid(shift_x, shift_y)
             shift_x = shift_x.flatten()
             shift_x = shift_x.flatten()
             shift_y = shift_y.flatten()
             shift_y = shift_y.flatten()
-            location = np.stack([shift_x, shift_y], axis=1) + stride // 2
+            location = np.stack(
+                [shift_x, shift_y], axis=1) + stride * self.num_shift
             locations.append(location)
             locations.append(location)
         num_points_each_level = [len(location) for location in locations]
         num_points_each_level = [len(location) for location in locations]
         locations = np.concatenate(locations, axis=0)
         locations = np.concatenate(locations, axis=0)
@@ -459,11 +467,16 @@ class Gt2FCOSTarget(BaseOperator):
                 grid_w = int(np.ceil(w / self.downsample_ratios[lvl]))
                 grid_w = int(np.ceil(w / self.downsample_ratios[lvl]))
                 grid_h = int(np.ceil(h / self.downsample_ratios[lvl]))
                 grid_h = int(np.ceil(h / self.downsample_ratios[lvl]))
                 if self.norm_reg_targets:
                 if self.norm_reg_targets:
-                    sample['reg_target{}'.format(lvl)] = \
-                        np.reshape(
-                            reg_targets_by_level[lvl] / \
-                            self.downsample_ratios[lvl],
+                    if self.multiply_strides_reg_targets:
+                        sample['reg_target{}'.format(lvl)] = np.reshape(
+                            reg_targets_by_level[lvl],
                             newshape=[grid_h, grid_w, 4])
                             newshape=[grid_h, grid_w, 4])
+                    else:
+                        sample['reg_target{}'.format(lvl)] = \
+                            np.reshape(
+                                reg_targets_by_level[lvl] / \
+                                self.downsample_ratios[lvl],
+                                newshape=[grid_h, grid_w, 4])
                 else:
                 else:
                     sample['reg_target{}'.format(lvl)] = np.reshape(
                     sample['reg_target{}'.format(lvl)] = np.reshape(
                         reg_targets_by_level[lvl],
                         reg_targets_by_level[lvl],
@@ -482,6 +495,7 @@ class Gt2FCOSTarget(BaseOperator):
 
 
 @register_op
 @register_op
 class Gt2GFLTarget(BaseOperator):
 class Gt2GFLTarget(BaseOperator):
+    __shared__ = ['num_classes']
     """
     """
     Generate GFocal loss targets by ground truth data
     Generate GFocal loss targets by ground truth data
     """
     """
@@ -490,12 +504,14 @@ class Gt2GFLTarget(BaseOperator):
                  num_classes=80,
                  num_classes=80,
                  downsample_ratios=[8, 16, 32, 64, 128],
                  downsample_ratios=[8, 16, 32, 64, 128],
                  grid_cell_scale=4,
                  grid_cell_scale=4,
-                 cell_offset=0):
+                 cell_offset=0,
+                 compute_vlr_region=False):
         super(Gt2GFLTarget, self).__init__()
         super(Gt2GFLTarget, self).__init__()
         self.num_classes = num_classes
         self.num_classes = num_classes
         self.downsample_ratios = downsample_ratios
         self.downsample_ratios = downsample_ratios
         self.grid_cell_scale = grid_cell_scale
         self.grid_cell_scale = grid_cell_scale
         self.cell_offset = cell_offset
         self.cell_offset = cell_offset
+        self.compute_vlr_region = compute_vlr_region
 
 
         self.assigner = ATSSAssigner()
         self.assigner = ATSSAssigner()
 
 
@@ -574,6 +590,13 @@ class Gt2GFLTarget(BaseOperator):
             assign_gt_inds, _ = self.assigner(grid_cells, num_level_cells,
             assign_gt_inds, _ = self.assigner(grid_cells, num_level_cells,
                                               gt_bboxes, gt_bboxes_ignore,
                                               gt_bboxes, gt_bboxes_ignore,
                                               gt_labels)
                                               gt_labels)
+
+            if self.compute_vlr_region:
+                vlr_region = self.assigner.get_vlr_region(
+                    grid_cells, num_level_cells, gt_bboxes, gt_bboxes_ignore,
+                    gt_labels)
+                sample['vlr_regions'] = vlr_region
+
             pos_inds, neg_inds, pos_gt_bboxes, pos_assigned_gt_inds = self.get_sample(
             pos_inds, neg_inds, pos_gt_bboxes, pos_assigned_gt_inds = self.get_sample(
                 assign_gt_inds, gt_bboxes)
                 assign_gt_inds, gt_bboxes)
 
 
@@ -766,7 +789,7 @@ class Gt2Solov2Target(BaseOperator):
                 ins_label = []
                 ins_label = []
                 grid_order = []
                 grid_order = []
                 cate_label = np.zeros([num_grid, num_grid], dtype=np.int64)
                 cate_label = np.zeros([num_grid, num_grid], dtype=np.int64)
-                ins_ind_label = np.zeros([num_grid**2], dtype=np.bool)
+                ins_ind_label = np.zeros([num_grid**2], dtype=np.bool_)
 
 
                 if num_ins == 0:
                 if num_ins == 0:
                     ins_label = np.zeros(
                     ins_label = np.zeros(
@@ -893,27 +916,33 @@ class Gt2Solov2Target(BaseOperator):
 
 
 
 
 @register_op
 @register_op
-class Gt2SparseRCNNTarget(BaseOperator):
-    '''
-    Generate SparseRCNN targets by groud truth data
-    '''
-
-    def __init__(self):
-        super(Gt2SparseRCNNTarget, self).__init__()
+class Gt2SparseTarget(BaseOperator):
+    def __init__(self, use_padding_shape=False):
+        super(Gt2SparseTarget, self).__init__()
+        self.use_padding_shape = use_padding_shape
 
 
     def __call__(self, samples, context=None):
     def __call__(self, samples, context=None):
         for sample in samples:
         for sample in samples:
-            im = sample["image"]
-            h, w = im.shape[1:3]
-            img_whwh = np.array([w, h, w, h], dtype=np.int32)
-            sample["img_whwh"] = img_whwh
-            if "scale_factor" in sample:
-                sample["scale_factor_wh"] = np.array(
-                    [sample["scale_factor"][1], sample["scale_factor"][0]],
-                    dtype=np.float32)
+            ori_h, ori_w = sample['h'], sample['w']
+            if self.use_padding_shape:
+                h, w = sample["image"].shape[1:3]
+                if "scale_factor" in sample:
+                    sf_w, sf_h = sample["scale_factor"][1], sample[
+                        "scale_factor"][0]
+                    sample["scale_factor_whwh"] = np.array(
+                        [sf_w, sf_h, sf_w, sf_h], dtype=np.float32)
+                else:
+                    sample["scale_factor_whwh"] = np.array(
+                        [1.0, 1.0, 1.0, 1.0], dtype=np.float32)
             else:
             else:
-                sample["scale_factor_wh"] = np.array(
-                    [1.0, 1.0], dtype=np.float32)
+                h, w = round(sample['im_shape'][0]), round(sample['im_shape'][
+                    1])
+                sample["scale_factor_whwh"] = np.array(
+                    [w / ori_w, h / ori_h, w / ori_w, h / ori_h],
+                    dtype=np.float32)
+
+            sample["img_whwh"] = np.array([w, h, w, h], dtype=np.float32)
+            sample["ori_shape"] = np.array([ori_h, ori_w], dtype=np.int32)
 
 
         return samples
         return samples
 
 
@@ -981,6 +1010,7 @@ class PadMaskBatch(BaseOperator):
 
 
 @register_op
 @register_op
 class Gt2CenterNetTarget(BaseOperator):
 class Gt2CenterNetTarget(BaseOperator):
+    __shared__ = ['num_classes']
     """Gt2CenterNetTarget
     """Gt2CenterNetTarget
     Generate CenterNet targets by ground-truth
     Generate CenterNet targets by ground-truth
     Args:
     Args:
@@ -990,40 +1020,39 @@ class Gt2CenterNetTarget(BaseOperator):
         max_objs (int): The maximum objects detected, 128 by default.
         max_objs (int): The maximum objects detected, 128 by default.
     """
     """
 
 
-    def __init__(self, down_ratio, num_classes=80, max_objs=128):
+    def __init__(self, num_classes=80, down_ratio=4, max_objs=128):
         super(Gt2CenterNetTarget, self).__init__()
         super(Gt2CenterNetTarget, self).__init__()
+        self.nc = num_classes
         self.down_ratio = down_ratio
         self.down_ratio = down_ratio
-        self.num_classes = num_classes
         self.max_objs = max_objs
         self.max_objs = max_objs
 
 
     def __call__(self, sample, context=None):
     def __call__(self, sample, context=None):
         input_h, input_w = sample['image'].shape[1:]
         input_h, input_w = sample['image'].shape[1:]
         output_h = input_h // self.down_ratio
         output_h = input_h // self.down_ratio
         output_w = input_w // self.down_ratio
         output_w = input_w // self.down_ratio
-        num_classes = self.num_classes
-        c = sample['center']
-        s = sample['scale']
         gt_bbox = sample['gt_bbox']
         gt_bbox = sample['gt_bbox']
         gt_class = sample['gt_class']
         gt_class = sample['gt_class']
 
 
-        hm = np.zeros((num_classes, output_h, output_w), dtype=np.float32)
+        hm = np.zeros((self.nc, output_h, output_w), dtype=np.float32)
         wh = np.zeros((self.max_objs, 2), dtype=np.float32)
         wh = np.zeros((self.max_objs, 2), dtype=np.float32)
-        dense_wh = np.zeros((2, output_h, output_w), dtype=np.float32)
         reg = np.zeros((self.max_objs, 2), dtype=np.float32)
         reg = np.zeros((self.max_objs, 2), dtype=np.float32)
         ind = np.zeros((self.max_objs), dtype=np.int64)
         ind = np.zeros((self.max_objs), dtype=np.int64)
         reg_mask = np.zeros((self.max_objs), dtype=np.int32)
         reg_mask = np.zeros((self.max_objs), dtype=np.int32)
-        cat_spec_wh = np.zeros(
-            (self.max_objs, num_classes * 2), dtype=np.float32)
-        cat_spec_mask = np.zeros(
-            (self.max_objs, num_classes * 2), dtype=np.int32)
+        cat_spec_wh = np.zeros((self.max_objs, self.nc * 2), dtype=np.float32)
+        cat_spec_mask = np.zeros((self.max_objs, self.nc * 2), dtype=np.int32)
 
 
-        trans_output = get_affine_transform(c, [s, s], 0, [output_w, output_h])
+        trans_output = get_affine_transform(
+            center=sample['center'],
+            input_size=[sample['scale'], sample['scale']],
+            rot=0,
+            output_size=[output_w, output_h])
 
 
         gt_det = []
         gt_det = []
         for i, (bbox, cls) in enumerate(zip(gt_bbox, gt_class)):
         for i, (bbox, cls) in enumerate(zip(gt_bbox, gt_class)):
             cls = int(cls)
             cls = int(cls)
             bbox[:2] = affine_transform(bbox[:2], trans_output)
             bbox[:2] = affine_transform(bbox[:2], trans_output)
             bbox[2:] = affine_transform(bbox[2:], trans_output)
             bbox[2:] = affine_transform(bbox[2:], trans_output)
+            bbox_amodal = copy.deepcopy(bbox)
             bbox[[0, 2]] = np.clip(bbox[[0, 2]], 0, output_w - 1)
             bbox[[0, 2]] = np.clip(bbox[[0, 2]], 0, output_w - 1)
             bbox[[1, 3]] = np.clip(bbox[[1, 3]], 0, output_h - 1)
             bbox[[1, 3]] = np.clip(bbox[[1, 3]], 0, output_h - 1)
             h, w = bbox[3] - bbox[1], bbox[2] - bbox[0]
             h, w = bbox[3] - bbox[1], bbox[2] - bbox[0]
@@ -1034,10 +1063,12 @@ class Gt2CenterNetTarget(BaseOperator):
                     [(bbox[0] + bbox[2]) / 2, (bbox[1] + bbox[3]) / 2],
                     [(bbox[0] + bbox[2]) / 2, (bbox[1] + bbox[3]) / 2],
                     dtype=np.float32)
                     dtype=np.float32)
                 ct_int = ct.astype(np.int32)
                 ct_int = ct.astype(np.int32)
+
+                # get hm,wh,reg,ind,ind_mask
                 draw_umich_gaussian(hm[cls], ct_int, radius)
                 draw_umich_gaussian(hm[cls], ct_int, radius)
                 wh[i] = 1. * w, 1. * h
                 wh[i] = 1. * w, 1. * h
-                ind[i] = ct_int[1] * output_w + ct_int[0]
                 reg[i] = ct - ct_int
                 reg[i] = ct - ct_int
+                ind[i] = ct_int[1] * output_w + ct_int[0]
                 reg_mask[i] = 1
                 reg_mask[i] = 1
                 cat_spec_wh[i, cls * 2:cls * 2 + 2] = wh[i]
                 cat_spec_wh[i, cls * 2:cls * 2 + 2] = wh[i]
                 cat_spec_mask[i, cls * 2:cls * 2 + 2] = 1
                 cat_spec_mask[i, cls * 2:cls * 2 + 2] = 1
@@ -1052,9 +1083,10 @@ class Gt2CenterNetTarget(BaseOperator):
         sample.pop('scale', None)
         sample.pop('scale', None)
         sample.pop('is_crowd', None)
         sample.pop('is_crowd', None)
         sample.pop('difficult', None)
         sample.pop('difficult', None)
-        sample['heatmap'] = hm
-        sample['index_mask'] = reg_mask
+
         sample['index'] = ind
         sample['index'] = ind
+        sample['index_mask'] = reg_mask
+        sample['heatmap'] = hm
         sample['size'] = wh
         sample['size'] = wh
         sample['offset'] = reg
         sample['offset'] = reg
         return sample
         return sample
@@ -1070,13 +1102,115 @@ class PadGT(BaseOperator):
                                 1 means bbox, 0 means no bbox.
                                 1 means bbox, 0 means no bbox.
     """
     """
 
 
-    def __init__(self, return_gt_mask=True):
+    def __init__(self, return_gt_mask=True, pad_img=False, minimum_gtnum=0):
         super(PadGT, self).__init__()
         super(PadGT, self).__init__()
         self.return_gt_mask = return_gt_mask
         self.return_gt_mask = return_gt_mask
+        self.pad_img = pad_img
+        self.minimum_gtnum = minimum_gtnum
+
+    def _impad(self, img: np.ndarray,
+            *,
+            shape = None,
+            padding = None,
+            pad_val = 0,
+            padding_mode = 'constant') -> np.ndarray:
+        """Pad the given image to a certain shape or pad on all sides with
+        specified padding mode and padding value.
+
+        Args:
+            img (ndarray): Image to be padded.
+            shape (tuple[int]): Expected padding shape (h, w). Default: None.
+            padding (int or tuple[int]): Padding on each border. If a single int is
+                provided this is used to pad all borders. If tuple of length 2 is
+                provided this is the padding on left/right and top/bottom
+                respectively. If a tuple of length 4 is provided this is the
+                padding for the left, top, right and bottom borders respectively.
+                Default: None. Note that `shape` and `padding` can not be both
+                set.
+            pad_val (Number | Sequence[Number]): Values to be filled in padding
+                areas when padding_mode is 'constant'. Default: 0.
+            padding_mode (str): Type of padding. Should be: constant, edge,
+                reflect or symmetric. Default: constant.
+                - constant: pads with a constant value, this value is specified
+                with pad_val.
+                - edge: pads with the last value at the edge of the image.
+                - reflect: pads with reflection of image without repeating the last
+                value on the edge. For example, padding [1, 2, 3, 4] with 2
+                elements on both sides in reflect mode will result in
+                [3, 2, 1, 2, 3, 4, 3, 2].
+                - symmetric: pads with reflection of image repeating the last value
+                on the edge. For example, padding [1, 2, 3, 4] with 2 elements on
+                both sides in symmetric mode will result in
+                [2, 1, 1, 2, 3, 4, 4, 3]
+
+        Returns:
+            ndarray: The padded image.
+        """
+
+        assert (shape is not None) ^ (padding is not None)
+        if shape is not None:
+            width = max(shape[1] - img.shape[1], 0)
+            height = max(shape[0] - img.shape[0], 0)
+            padding = (0, 0, int(width), int(height))
+
+        # check pad_val
+        import numbers
+        if isinstance(pad_val, tuple):
+            assert len(pad_val) == img.shape[-1]
+        elif not isinstance(pad_val, numbers.Number):
+            raise TypeError('pad_val must be a int or a tuple. '
+                            f'But received {type(pad_val)}')
+
+        # check padding
+        if isinstance(padding, tuple) and len(padding) in [2, 4]:
+            if len(padding) == 2:
+                padding = (padding[0], padding[1], padding[0], padding[1])
+        elif isinstance(padding, numbers.Number):
+            padding = (padding, padding, padding, padding)
+        else:
+            raise ValueError('Padding must be a int or a 2, or 4 element tuple.'
+                            f'But received {padding}')
+
+        # check padding mode
+        assert padding_mode in ['constant', 'edge', 'reflect', 'symmetric']
+
+        border_type = {
+            'constant': cv2.BORDER_CONSTANT,
+            'edge': cv2.BORDER_REPLICATE,
+            'reflect': cv2.BORDER_REFLECT_101,
+            'symmetric': cv2.BORDER_REFLECT
+        }
+        img = cv2.copyMakeBorder(
+            img,
+            padding[1],
+            padding[3],
+            padding[0],
+            padding[2],
+            border_type[padding_mode],
+            value=pad_val)
+
+        return img
+
+    def checkmaxshape(self, samples):
+        maxh, maxw = 0, 0
+        for sample in samples:
+            h,w = sample['im_shape']
+            if h>maxh:
+                maxh = h
+            if w>maxw:
+                maxw = w
+        return (maxh, maxw)
 
 
     def __call__(self, samples, context=None):
     def __call__(self, samples, context=None):
         num_max_boxes = max([len(s['gt_bbox']) for s in samples])
         num_max_boxes = max([len(s['gt_bbox']) for s in samples])
+        num_max_boxes = max(self.minimum_gtnum, num_max_boxes)
+        if self.pad_img:
+            maxshape = self.checkmaxshape(samples)
         for sample in samples:
         for sample in samples:
+            if self.pad_img:
+                img = sample['image']
+                padimg = self._impad(img, shape=maxshape)
+                sample['image'] = padimg
             if self.return_gt_mask:
             if self.return_gt_mask:
                 sample['pad_gt_mask'] = np.zeros(
                 sample['pad_gt_mask'] = np.zeros(
                     (num_max_boxes, 1), dtype=np.float32)
                     (num_max_boxes, 1), dtype=np.float32)
@@ -1110,6 +1244,17 @@ class PadGT(BaseOperator):
                 if num_gt > 0:
                 if num_gt > 0:
                     pad_diff[:num_gt] = sample['difficult']
                     pad_diff[:num_gt] = sample['difficult']
                 sample['difficult'] = pad_diff
                 sample['difficult'] = pad_diff
+            if 'gt_joints' in sample:
+                num_joints = sample['gt_joints'].shape[1]
+                pad_gt_joints = np.zeros((num_max_boxes, num_joints, 3), dtype=np.float32)
+                if num_gt > 0:
+                    pad_gt_joints[:num_gt] = sample['gt_joints']
+                sample['gt_joints'] = pad_gt_joints
+            if 'gt_areas' in sample:
+                pad_gt_areas = np.zeros((num_max_boxes, 1), dtype=np.float32)
+                if num_gt > 0:
+                    pad_gt_areas[:num_gt, 0] = sample['gt_areas']
+                sample['gt_areas'] = pad_gt_areas
         return samples
         return samples
 
 
 
 
@@ -1165,3 +1310,175 @@ class PadRGT(BaseOperator):
                                num_gt)
                                num_gt)
 
 
         return samples
         return samples
+
+
+@register_op
+class Gt2CenterTrackTarget(BaseOperator):
+    __shared__ = ['num_classes']
+    """Gt2CenterTrackTarget
+    Generate CenterTrack targets by ground-truth
+    Args:
+        num_classes (int): The number of classes, 1 by default.
+        down_ratio (int): The down sample ratio between output feature and 
+                          input image.
+        max_objs (int): The maximum objects detected, 256 by default.
+    """
+
+    def __init__(self,
+                 num_classes=1,
+                 down_ratio=4,
+                 max_objs=256,
+                 hm_disturb=0.05,
+                 lost_disturb=0.4,
+                 fp_disturb=0.1,
+                 pre_hm=True,
+                 add_tracking=True,
+                 add_ltrb_amodal=True):
+        super(Gt2CenterTrackTarget, self).__init__()
+        self.nc = num_classes
+        self.down_ratio = down_ratio
+        self.max_objs = max_objs
+
+        self.hm_disturb = hm_disturb
+        self.lost_disturb = lost_disturb
+        self.fp_disturb = fp_disturb
+        self.pre_hm = pre_hm
+        self.add_tracking = add_tracking
+        self.add_ltrb_amodal = add_ltrb_amodal
+
+    def _get_pre_dets(self, input_h, input_w, trans_input_pre, gt_bbox_pre,
+                      gt_class_pre, gt_track_id_pre):
+        hm_h, hm_w = input_h, input_w
+        reutrn_hm = self.pre_hm
+        pre_hm = np.zeros(
+            (1, hm_h, hm_w), dtype=np.float32) if reutrn_hm else None
+        pre_cts, track_ids = [], []
+
+        for i, (
+                bbox, cls, track_id
+        ) in enumerate(zip(gt_bbox_pre, gt_class_pre, gt_track_id_pre)):
+            cls = int(cls)
+            bbox[:2] = affine_transform(bbox[:2], trans_input_pre)
+            bbox[2:] = affine_transform(bbox[2:], trans_input_pre)
+            bbox[[0, 2]] = np.clip(bbox[[0, 2]], 0, hm_w - 1)
+            bbox[[1, 3]] = np.clip(bbox[[1, 3]], 0, hm_h - 1)
+            h, w = bbox[3] - bbox[1], bbox[2] - bbox[0]
+            max_rad = 1
+            if (h > 0 and w > 0):
+                radius = gaussian_radius((math.ceil(h), math.ceil(w)), 0.7)
+                radius = max(0, int(radius))
+                max_rad = max(max_rad, radius)
+                ct = np.array(
+                    [(bbox[0] + bbox[2]) / 2, (bbox[1] + bbox[3]) / 2],
+                    dtype=np.float32)
+                ct0 = ct.copy()
+                conf = 1
+
+                ct[0] = ct[0] + np.random.randn() * self.hm_disturb * w
+                ct[1] = ct[1] + np.random.randn() * self.hm_disturb * h
+                conf = 1 if np.random.rand() > self.lost_disturb else 0
+
+                ct_int = ct.astype(np.int32)
+                if conf == 0:
+                    pre_cts.append(ct / self.down_ratio)
+                else:
+                    pre_cts.append(ct0 / self.down_ratio)
+
+                track_ids.append(track_id)
+                if reutrn_hm:
+                    draw_umich_gaussian(pre_hm[0], ct_int, radius, k=conf)
+
+                if np.random.rand() < self.fp_disturb and reutrn_hm:
+                    ct2 = ct0.copy()
+                    # Hard code heatmap disturb ratio, haven't tried other numbers.
+                    ct2[0] = ct2[0] + np.random.randn() * 0.05 * w
+                    ct2[1] = ct2[1] + np.random.randn() * 0.05 * h
+                    ct2_int = ct2.astype(np.int32)
+                    draw_umich_gaussian(pre_hm[0], ct2_int, radius, k=conf)
+        return pre_hm, pre_cts, track_ids
+
+    def __call__(self, sample, context=None):
+        input_h, input_w = sample['image'].shape[1:]
+        output_h = input_h // self.down_ratio
+        output_w = input_w // self.down_ratio
+        gt_bbox = sample['gt_bbox']
+        gt_class = sample['gt_class']
+
+        # init
+        hm = np.zeros((self.nc, output_h, output_w), dtype=np.float32)
+        wh = np.zeros((self.max_objs, 2), dtype=np.float32)
+        reg = np.zeros((self.max_objs, 2), dtype=np.float32)
+        ind = np.zeros((self.max_objs), dtype=np.int64)
+        reg_mask = np.zeros((self.max_objs), dtype=np.int32)
+        if self.add_tracking:
+            tr = np.zeros((self.max_objs, 2), dtype=np.float32)
+        if self.add_ltrb_amodal:
+            ltrb_amodal = np.zeros((self.max_objs, 4), dtype=np.float32)
+
+        trans_output = get_affine_transform(
+            center=sample['center'],
+            input_size=[sample['scale'], sample['scale']],
+            rot=0,
+            output_size=[output_w, output_h])
+
+        pre_hm, pre_cts, track_ids = self._get_pre_dets(
+            input_h, input_w, sample['trans_input'], sample['pre_gt_bbox'],
+            sample['pre_gt_class'], sample['pre_gt_track_id'])
+
+        for i, (bbox, cls) in enumerate(zip(gt_bbox, gt_class)):
+            cls = int(cls)
+            rect = np.array(
+                [[bbox[0], bbox[1]], [bbox[0], bbox[3]], [bbox[2], bbox[3]],
+                 [bbox[2], bbox[1]]],
+                dtype=np.float32)
+            for t in range(4):
+                rect[t] = affine_transform(rect[t], trans_output)
+                bbox[:2] = rect[:, 0].min(), rect[:, 1].min()
+                bbox[2:] = rect[:, 0].max(), rect[:, 1].max()
+
+            bbox_amodal = copy.deepcopy(bbox)
+            bbox[[0, 2]] = np.clip(bbox[[0, 2]], 0, output_w - 1)
+            bbox[[1, 3]] = np.clip(bbox[[1, 3]], 0, output_h - 1)
+
+            h, w = bbox[3] - bbox[1], bbox[2] - bbox[0]
+            if h > 0 and w > 0:
+                radius = gaussian_radius((math.ceil(h), math.ceil(w)), 0.7)
+                radius = max(0, int(radius))
+                ct = np.array(
+                    [(bbox[0] + bbox[2]) / 2, (bbox[1] + bbox[3]) / 2],
+                    dtype=np.float32)
+                ct_int = ct.astype(np.int32)
+
+                # get hm,wh,reg,ind,ind_mask
+                draw_umich_gaussian(hm[cls], ct_int, radius)
+                wh[i] = 1. * w, 1. * h
+                reg[i] = ct - ct_int
+                ind[i] = ct_int[1] * output_w + ct_int[0]
+                reg_mask[i] = 1
+                if self.add_tracking:
+                    if sample['gt_track_id'][i] in track_ids:
+                        pre_ct = pre_cts[track_ids.index(sample['gt_track_id'][
+                            i])]
+                        tr[i] = pre_ct - ct_int
+
+                if self.add_ltrb_amodal:
+                    ltrb_amodal[i] = \
+                        bbox_amodal[0] - ct_int[0], bbox_amodal[1] - ct_int[1], \
+                        bbox_amodal[2] - ct_int[0], bbox_amodal[3] - ct_int[1]
+
+        new_sample = {'image': sample['image']}
+        new_sample['index'] = ind
+        new_sample['index_mask'] = reg_mask
+        new_sample['heatmap'] = hm
+        new_sample['size'] = wh
+        new_sample['offset'] = reg
+        if self.add_tracking:
+            new_sample['tracking'] = tr
+        if self.add_ltrb_amodal:
+            new_sample['ltrb_amodal'] = ltrb_amodal
+
+        new_sample['pre_image'] = sample['pre_image']
+        new_sample['pre_hm'] = pre_hm
+
+        del sample
+        return new_sample
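The op above builds CenterNet/CenterTrack-style training targets: each ground-truth box contributes a Gaussian peak on its class heatmap plus size and offset regression values at the peak index. As a rough illustration (not part of the diff), the snippet below sketches the Gaussian splatting step with a simplified stand-in for ppdet's draw_umich_gaussian:

import numpy as np

def draw_gaussian(heatmap, center, radius):
    # Simplified stand-in for draw_umich_gaussian: splat an unnormalized
    # Gaussian of the given radius onto `heatmap` at the integer center,
    # keeping the element-wise maximum where peaks overlap.
    diameter = 2 * radius + 1
    sigma = diameter / 6.0
    y, x = np.ogrid[-radius:radius + 1, -radius:radius + 1]
    gaussian = np.exp(-(x * x + y * y) / (2 * sigma * sigma))

    cx, cy = int(center[0]), int(center[1])
    h, w = heatmap.shape
    left, right = min(cx, radius), min(w - cx, radius + 1)
    top, bottom = min(cy, radius), min(h - cy, radius + 1)
    region = heatmap[cy - top:cy + bottom, cx - left:cx + right]
    patch = gaussian[radius - top:radius + bottom, radius - left:radius + right]
    np.maximum(region, patch, out=region)  # writes through the view into heatmap
    return heatmap

hm = np.zeros((128, 128), dtype=np.float32)   # one class channel, output_h x output_w
draw_gaussian(hm, center=(32, 48), radius=4)  # peak of 1.0 at (x=32, y=48)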

+ 832 - 85
paddlers/models/ppdet/data/transform/keypoint_operators.py

@@ -36,19 +36,12 @@ logger = setup_logger(__name__)
 registered_ops = []
 
 __all__ = [
-    'RandomAffine',
-    'KeyPointFlip',
-    'TagGenerate',
-    'ToHeatmaps',
-    'NormalizePermute',
-    'EvalAffine',
-    'RandomFlipHalfBodyTransform',
-    'TopDownAffine',
-    'ToHeatmapsTopDown',
-    'ToHeatmapsTopDown_DARK',
-    'ToHeatmapsTopDown_UDP',
-    'TopDownEvalAffine',
-    'AugmentationbyInformantionDropping',
+    'RandomAffine', 'KeyPointFlip', 'TagGenerate', 'ToHeatmaps',
+    'NormalizePermute', 'EvalAffine', 'RandomFlipHalfBodyTransform',
+    'TopDownAffine', 'ToHeatmapsTopDown', 'ToHeatmapsTopDown_DARK',
+    'ToHeatmapsTopDown_UDP', 'TopDownEvalAffine',
+    'AugmentationbyInformantionDropping', 'SinglePoseAffine', 'NoiseJitter',
+    'FlipPose', 'PETR_Resize'
 ]
 
 
@@ -72,38 +65,77 @@ class KeyPointFlip(object):
 
     """
 
-    def __init__(self, flip_permutation, hmsize, flip_prob=0.5):
+    def __init__(self, flip_permutation, hmsize=None, flip_prob=0.5):
         super(KeyPointFlip, self).__init__()
         assert isinstance(flip_permutation, Sequence)
         self.flip_permutation = flip_permutation
         self.flip_prob = flip_prob
         self.hmsize = hmsize
 
-    def __call__(self, records):
-        image = records['image']
-        kpts_lst = records['joints']
-        mask_lst = records['mask']
-        flip = np.random.random() < self.flip_prob
-        if flip:
-            image = image[:, ::-1]
-            for idx, hmsize in enumerate(self.hmsize):
-                if len(mask_lst) > idx:
-                    mask_lst[idx] = mask_lst[idx][:, ::-1]
+    def _flipjoints(self, records, sizelst):
+        '''
+        records['gt_joints'] is Sequence in higherhrnet
+        '''
+        if not ('gt_joints' in records and len(records['gt_joints']) > 0):
+            return records
+
+        kpts_lst = records['gt_joints']
+        if isinstance(kpts_lst, Sequence):
+            for idx, hmsize in enumerate(sizelst):
                 if kpts_lst[idx].ndim == 3:
                     kpts_lst[idx] = kpts_lst[idx][:, self.flip_permutation]
                 else:
                     kpts_lst[idx] = kpts_lst[idx][self.flip_permutation]
                 kpts_lst[idx][..., 0] = hmsize - kpts_lst[idx][..., 0]
-                kpts_lst[idx] = kpts_lst[idx].astype(np.int64)
-                kpts_lst[idx][kpts_lst[idx][..., 0] >= hmsize, 2] = 0
-                kpts_lst[idx][kpts_lst[idx][..., 1] >= hmsize, 2] = 0
-                kpts_lst[idx][kpts_lst[idx][..., 0] < 0, 2] = 0
-                kpts_lst[idx][kpts_lst[idx][..., 1] < 0, 2] = 0
-        records['image'] = image
-        records['joints'] = kpts_lst
+        else:
+            hmsize = sizelst[0]
+            if kpts_lst.ndim == 3:
+                kpts_lst = kpts_lst[:, self.flip_permutation]
+            else:
+                kpts_lst = kpts_lst[self.flip_permutation]
+            kpts_lst[..., 0] = hmsize - kpts_lst[..., 0]
+
+        records['gt_joints'] = kpts_lst
+        return records
+
+    def _flipmask(self, records, sizelst):
+        if not 'mask' in records:
+            return records
+
+        mask_lst = records['mask']
+        for idx, hmsize in enumerate(sizelst):
+            if len(mask_lst) > idx:
+                mask_lst[idx] = mask_lst[idx][:, ::-1]
         records['mask'] = mask_lst
         return records
 
+    def _flipbbox(self, records, sizelst):
+        if not 'gt_bbox' in records:
+            return records
+
+        bboxes = records['gt_bbox']
+        hmsize = sizelst[0]
+        bboxes[:, 0::2] = hmsize - bboxes[:, 0::2][:, ::-1]
+        bboxes[:, 0::2] = np.clip(bboxes[:, 0::2], 0, hmsize)
+        records['gt_bbox'] = bboxes
+        return records
+
+    def __call__(self, records):
+        flip = np.random.random() < self.flip_prob
+        if flip:
+            image = records['image']
+            image = image[:, ::-1]
+            records['image'] = image
+            if self.hmsize is None:
+                sizelst = [image.shape[1]]
+            else:
+                sizelst = self.hmsize
+            self._flipjoints(records, sizelst)
+            self._flipmask(records, sizelst)
+            self._flipbbox(records, sizelst)
+
+        return records
+
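As a quick check of the flip logic added above (illustrative only, with a made-up 3-point skeleton): flipping re-orders the joints with `flip_permutation` so left/right points swap, then mirrors x against the map width.

import numpy as np

flip_permutation = [0, 2, 1]   # hypothetical skeleton: [nose, left_eye, right_eye]
hmsize = 128

gt_joints = np.array([[64., 10., 1.], [80., 8., 1.], [50., 9., 1.]])
flipped = gt_joints[flip_permutation].copy()
flipped[..., 0] = hmsize - flipped[..., 0]
# -> [[64., 10., 1.], [78., 9., 1.], [48., 8., 1.]]
# the new "left eye" (index 1) is the mirrored old right eye, and vice versa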
 
 @register_keypointop
 class RandomAffine(object):
@@ -115,7 +147,7 @@ class RandomAffine(object):
         max_scale (list[2]): the scale range to apply, transform range is [min, max]
         max_shift (float): the max abslute shift ratio to apply, transform range is [-max_shift*imagesize, max_shift*imagesize]
         hmsize (list[2]): output heatmap's shape list of different scale outputs of higherhrnet
-        trainsize (int): the standard length used to train, the 'scale_type' of [h,w] will be resize to trainsize for standard
+        trainsize (list[2]): the standard length used to train, the 'scale_type' of [h,w] will be resize to trainsize for standard
         scale_type (str): the length of [h,w] to used for trainsize, chosed between 'short' and 'long'
         records(dict): the dict contained the image, mask and coords
 
@@ -128,9 +160,10 @@ class RandomAffine(object):
                  max_degree=30,
                  scale=[0.75, 1.5],
                  max_shift=0.2,
-                 hmsize=[128, 256],
-                 trainsize=512,
-                 scale_type='short'):
+                 hmsize=None,
+                 trainsize=[512, 512],
+                 scale_type='short',
+                 boldervalue=[114, 114, 114]):
         super(RandomAffine, self).__init__()
         self.max_degree = max_degree
         self.min_scale = scale[0]
@@ -139,8 +172,9 @@ class RandomAffine(object):
         self.hmsize = hmsize
         self.trainsize = trainsize
         self.scale_type = scale_type
+        self.boldervalue = boldervalue
 
-    def _get_affine_matrix(self, center, scale, res, rot=0):
+    def _get_affine_matrix_old(self, center, scale, res, rot=0):
         """Generate transformation matrix."""
         """Generate transformation matrix."""
         h = scale
         h = scale
         t = np.zeros((3, 3), dtype=np.float32)
         t = np.zeros((3, 3), dtype=np.float32)
@@ -166,21 +200,94 @@ class RandomAffine(object):
             t = np.dot(t_inv, np.dot(rot_mat, np.dot(t_mat, t)))
         return t
 
+    def _get_affine_matrix(self, center, scale, res, rot=0):
+        """Generate transformation matrix."""
+        w, h = scale
+        t = np.zeros((3, 3), dtype=np.float32)
+        t[0, 0] = float(res[0]) / w
+        t[1, 1] = float(res[1]) / h
+        t[0, 2] = res[0] * (-float(center[0]) / w + .5)
+        t[1, 2] = res[1] * (-float(center[1]) / h + .5)
+        t[2, 2] = 1
+        if rot != 0:
+            rot = -rot  # To match direction of rotation from cropping
+            rot_mat = np.zeros((3, 3), dtype=np.float32)
+            rot_rad = rot * np.pi / 180
+            sn, cs = np.sin(rot_rad), np.cos(rot_rad)
+            rot_mat[0, :2] = [cs, -sn]
+            rot_mat[1, :2] = [sn, cs]
+            rot_mat[2, 2] = 1
+            # Need to rotate around center
+            t_mat = np.eye(3)
+            t_mat[0, 2] = -res[0] / 2
+            t_mat[1, 2] = -res[1] / 2
+            t_inv = t_mat.copy()
+            t_inv[:2, 2] *= -1
+            t = np.dot(t_inv, np.dot(rot_mat, np.dot(t_mat, t)))
+        return t
+
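A quick numeric check of the matrix built above (illustrative values, not part of the diff): with rot=0 the transform simply maps the crop centre to the centre of the output resolution.

import numpy as np

# center=(100, 100), crop size (w, h)=(200, 160), output res=(128, 128), rot=0
w, h, res, center = 200.0, 160.0, (128, 128), (100.0, 100.0)
t = np.array([[res[0] / w, 0.0,        res[0] * (-center[0] / w + .5)],
              [0.0,        res[1] / h, res[1] * (-center[1] / h + .5)],
              [0.0,        0.0,        1.0]])
print(t @ np.array([center[0], center[1], 1.0]))   # -> [64. 64.  1.], i.e. res / 2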
+    def _affine_joints_mask(self,
+                            degree,
+                            center,
+                            roi_size,
+                            dsize,
+                            keypoints=None,
+                            heatmap_mask=None,
+                            gt_bbox=None):
+        kpts = None
+        mask = None
+        bbox = None
+        mask_affine_mat = self._get_affine_matrix(center, roi_size, dsize,
+                                                  degree)[:2]
+        if heatmap_mask is not None:
+            mask = cv2.warpAffine(heatmap_mask, mask_affine_mat, dsize)
+            mask = ((mask / 255) > 0.5).astype(np.float32)
+        if keypoints is not None:
+            kpts = copy.deepcopy(keypoints)
+            kpts[..., 0:2] = warp_affine_joints(kpts[..., 0:2].copy(),
+                                                mask_affine_mat)
+            kpts[(kpts[..., 0]) > dsize[0], :] = 0
+            kpts[(kpts[..., 1]) > dsize[1], :] = 0
+            kpts[(kpts[..., 0]) < 0, :] = 0
+            kpts[(kpts[..., 1]) < 0, :] = 0
+        if gt_bbox is not None:
+            temp_bbox = gt_bbox[:, [0, 3, 2, 1]]
+            cat_bbox = np.concatenate((gt_bbox, temp_bbox), axis=-1)
+            gt_bbox_warped = warp_affine_joints(cat_bbox, mask_affine_mat)
+            bbox = np.zeros_like(gt_bbox)
+            bbox[:, 0] = gt_bbox_warped[:, 0::2].min(1).clip(0, dsize[0])
+            bbox[:, 2] = gt_bbox_warped[:, 0::2].max(1).clip(0, dsize[0])
+            bbox[:, 1] = gt_bbox_warped[:, 1::2].min(1).clip(0, dsize[1])
+            bbox[:, 3] = gt_bbox_warped[:, 1::2].max(1).clip(0, dsize[1])
+        return kpts, mask, bbox
+
     def __call__(self, records):
         image = records['image']
-        keypoints = records['joints']
-        heatmap_mask = records['mask']
+        shape = np.array(image.shape[:2][::-1])
+        keypoints = None
+        heatmap_mask = None
+        gt_bbox = None
+        if 'gt_joints' in records:
+            keypoints = records['gt_joints']
+
+        if 'mask' in records:
+            heatmap_mask = records['mask']
+            heatmap_mask *= 255
+
+        if 'gt_bbox' in records:
+            gt_bbox = records['gt_bbox']
 
         degree = (np.random.random() * 2 - 1) * self.max_degree
-        shape = np.array(image.shape[:2][::-1])
         center = center = np.array((np.array(shape) / 2))
 
         aug_scale = np.random.random() * (self.max_scale - self.min_scale
                                           ) + self.min_scale
         if self.scale_type == 'long':
-            scale = max(shape[0], shape[1]) / 1.0
+            scale = np.array([max(shape[0], shape[1]) / 1.0] * 2)
         elif self.scale_type == 'short':
-            scale = min(shape[0], shape[1]) / 1.0
+            scale = np.array([min(shape[0], shape[1]) / 1.0] * 2)
+        elif self.scale_type == 'wh':
+            scale = shape
         else:
             raise ValueError('Unknown scale type: {}'.format(self.scale_type))
         roi_size = aug_scale * scale
@@ -188,44 +295,55 @@ class RandomAffine(object):
         dy = int(0)
         if self.max_shift > 0:
 
-            dx = np.random.randint(-self.max_shift * roi_size,
-                                   self.max_shift * roi_size)
-            dy = np.random.randint(-self.max_shift * roi_size,
-                                   self.max_shift * roi_size)
+            dx = np.random.randint(-self.max_shift * roi_size[0],
+                                   self.max_shift * roi_size[0])
+            dy = np.random.randint(-self.max_shift * roi_size[0],
+                                   self.max_shift * roi_size[1])
 
         center += np.array([dx, dy])
         input_size = 2 * center
+        if self.trainsize != -1:
+            dsize = self.trainsize
+            imgshape = (dsize)
+        else:
+            dsize = scale
+            imgshape = (shape.tolist())
 
-        keypoints[..., :2] *= shape
-        heatmap_mask *= 255
-        kpts_lst = []
-        mask_lst = []
-
-        image_affine_mat = self._get_affine_matrix(
-            center, roi_size, (self.trainsize, self.trainsize), degree)[:2]
+        image_affine_mat = self._get_affine_matrix(center, roi_size, dsize,
+                                                   degree)[:2]
         image = cv2.warpAffine(
             image,
-            image_affine_mat, (self.trainsize, self.trainsize),
-            flags=cv2.INTER_LINEAR)
+            image_affine_mat,
+            imgshape,
+            flags=cv2.INTER_LINEAR,
+            borderValue=self.boldervalue)
+
+        if self.hmsize is None:
+            kpts, mask, gt_bbox = self._affine_joints_mask(
+                degree, center, roi_size, dsize, keypoints, heatmap_mask,
+                gt_bbox)
+            records['image'] = image
+            if kpts is not None: records['gt_joints'] = kpts
+            if mask is not None: records['mask'] = mask
+            if gt_bbox is not None: records['gt_bbox'] = gt_bbox
+            return records
+
+        kpts_lst = []
+        mask_lst = []
         for hmsize in self.hmsize:
-            kpts = copy.deepcopy(keypoints)
-            mask_affine_mat = self._get_affine_matrix(
-                center, roi_size, (hmsize, hmsize), degree)[:2]
-            if heatmap_mask is not None:
-                mask = cv2.warpAffine(heatmap_mask, mask_affine_mat,
-                                      (hmsize, hmsize))
-                mask = ((mask / 255) > 0.5).astype(np.float32)
-            kpts[..., 0:2] = warp_affine_joints(kpts[..., 0:2].copy(),
-                                                mask_affine_mat)
-            kpts[np.trunc(kpts[..., 0]) >= hmsize, 2] = 0
-            kpts[np.trunc(kpts[..., 1]) >= hmsize, 2] = 0
-            kpts[np.trunc(kpts[..., 0]) < 0, 2] = 0
-            kpts[np.trunc(kpts[..., 1]) < 0, 2] = 0
+            kpts, mask, gt_bbox = self._affine_joints_mask(
+                degree, center, roi_size, [hmsize, hmsize], keypoints,
+                heatmap_mask, gt_bbox)
             kpts_lst.append(kpts)
             mask_lst.append(mask)
         records['image'] = image
-        records['joints'] = kpts_lst
-        records['mask'] = mask_lst
+
+        if 'gt_joints' in records:
+            records['gt_joints'] = kpts_lst
+        if 'mask' in records:
+            records['mask'] = mask_lst
+        if 'gt_bbox' in records:
+            records['gt_bbox'] = gt_bbox
         return records
 
 
@@ -258,9 +376,10 @@ class EvalAffine(object):
         if mask is not None:
             mask = cv2.warpAffine(mask, trans, size_resized)
             records['mask'] = mask
-        if 'joints' in records:
-            del records['joints']
+        if 'gt_joints' in records:
+            del records['gt_joints']
         records['image'] = image_resized
+        records['scale_factor'] = self.size / min(h, w)
         return records
 
 
@@ -310,7 +429,7 @@ class TagGenerate(object):
         self.num_joints = num_joints
 
     def __call__(self, records):
-        kpts_lst = records['joints']
+        kpts_lst = records['gt_joints']
         kpts = kpts_lst[0]
         tagmap = np.zeros((self.max_people, self.num_joints, 4), dtype=np.int64)
         inds = np.where(kpts[..., 2] > 0)
@@ -322,7 +441,7 @@ class TagGenerate(object):
         tagmap[p, j, 2] = visible[..., 0]  # x
         tagmap[p, j, 3] = 1
         records['tagmap'] = tagmap
-        del records['joints']
+        del records['gt_joints']
         return records
 
 
@@ -356,7 +475,7 @@ class ToHeatmaps(object):
         self.gaussian = np.exp(-((x - x0)**2 + (y - y0)**2) / (2 * sigma**2))
 
     def __call__(self, records):
-        kpts_lst = records['joints']
+        kpts_lst = records['gt_joints']
         mask_lst = records['mask']
         for idx, hmsize in enumerate(self.hmsize):
             mask = mask_lst[idx]
@@ -477,7 +596,7 @@ class RandomFlipHalfBodyTransform(object):
 
     def __call__(self, records):
         image = records['image']
-        joints = records['joints']
+        joints = records['gt_joints']
         joints_vis = records['joints_vis']
         c = records['center']
         s = records['scale']
@@ -500,7 +619,7 @@ class RandomFlipHalfBodyTransform(object):
                 joints, joints_vis, image.shape[1], self.flip_pairs)
             c[0] = image.shape[1] - c[0] - 1
         records['image'] = image
-        records['joints'] = joints
+        records['gt_joints'] = joints
         records['joints_vis'] = joints_vis
         records['center'] = c
         records['scale'] = s
@@ -560,7 +679,7 @@ class AugmentationbyInformantionDropping(object):
 
     def __call__(self, records):
         img = records['image']
-        joints = records['joints']
+        joints = records['gt_joints']
         joints_vis = records['joints_vis']
         if np.random.rand() < self.prob_cutout:
             img = self._cutout(img, joints, joints_vis)
@@ -588,7 +707,7 @@ class TopDownAffine(object):
 
     def __call__(self, records):
         image = records['image']
-        joints = records['joints']
+        joints = records['gt_joints']
         joints_vis = records['joints_vis']
         rot = records['rotate'] if "rotate" in records else 0
         if self.use_udp:
@@ -613,8 +732,171 @@ class TopDownAffine(object):
                     joints[i, 0:2] = affine_transform(joints[i, 0:2], trans)
 
         records['image'] = image
-        records['joints'] = joints
+        records['gt_joints'] = joints
+
+        return records
+
+
+@register_keypointop
+class SinglePoseAffine(object):
+    """apply affine transform to image and coords
+
+    Args:
+        trainsize (list): [w, h], the standard size used to train
+        use_udp (bool): whether to use Unbiased Data Processing.
+        records(dict): the dict contained the image and coords
+
+    Returns:
+        records (dict): contain the image and coords after tranformed
+
+    """
+
+    def __init__(self,
+                 trainsize,
+                 rotate=[1.0, 30],
+                 scale=[1.0, 0.25],
+                 use_udp=False):
+        self.trainsize = trainsize
+        self.use_udp = use_udp
+        self.rot_prob = rotate[0]
+        self.rot_range = rotate[1]
+        self.scale_prob = scale[0]
+        self.scale_ratio = scale[1]
+
+    def __call__(self, records):
+        image = records['image']
+        if 'joints_2d' in records:
+            joints = records['joints_2d'] if 'joints_2d' in records else None
+            joints_vis = records[
+                'joints_vis'] if 'joints_vis' in records else np.ones(
+                    (len(joints), 1))
+        rot = 0
+        s = 1.
+        if np.random.random() < self.rot_prob:
+            rot = np.clip(np.random.randn() * self.rot_range,
+                          -self.rot_range * 2, self.rot_range * 2)
+        if np.random.random() < self.scale_prob:
+            s = np.clip(np.random.randn() * self.scale_ratio + 1,
+                        1 - self.scale_ratio, 1 + self.scale_ratio)
+
+        if self.use_udp:
+            trans = get_warp_matrix(
+                rot,
+                np.array(records['bbox_center']) * 2.0,
+                [self.trainsize[0] - 1.0, self.trainsize[1] - 1.0],
+                records['bbox_scale'] * 200.0 * s)
+            image = cv2.warpAffine(
+                image,
+                trans, (int(self.trainsize[0]), int(self.trainsize[1])),
+                flags=cv2.INTER_LINEAR)
+            if 'joints_2d' in records:
+                joints[:, 0:2] = warp_affine_joints(joints[:, 0:2].copy(),
+                                                    trans)
+        else:
+            trans = get_affine_transform(
+                np.array(records['bbox_center']),
+                records['bbox_scale'] * s * 200, rot, self.trainsize)
+            image = cv2.warpAffine(
+                image,
+                trans, (int(self.trainsize[0]), int(self.trainsize[1])),
+                flags=cv2.INTER_LINEAR)
+            if 'joints_2d' in records:
+                for i in range(len(joints)):
+                    if joints_vis[i, 0] > 0.0:
+                        joints[i, 0:2] = affine_transform(joints[i, 0:2], trans)
+
+        if 'joints_3d' in records:
+            pose3d = records['joints_3d']
+            if not rot == 0:
+                trans_3djoints = np.eye(3)
+                rot_rad = -rot * np.pi / 180
+                sn, cs = np.sin(rot_rad), np.cos(rot_rad)
+                trans_3djoints[0, :2] = [cs, -sn]
+                trans_3djoints[1, :2] = [sn, cs]
+                pose3d[:, :3] = np.einsum('ij,kj->ki', trans_3djoints,
+                                          pose3d[:, :3])
+                records['joints_3d'] = pose3d
+
+        records['image'] = image
+        if 'joints_2d' in records:
+            records['joints_2d'] = joints
+
+        return records
+
+
+@register_keypointop
+class NoiseJitter(object):
+    """apply NoiseJitter to image
+
+    Args:
+        noise_factor (float): the noise factor ratio used to generate the jitter
+
+    Returns:
+        records (dict): contain the image and coords after tranformed
+
+    """
 
 
+    def __init__(self, noise_factor=0.4):
+        self.noise_factor = noise_factor
+
+    def __call__(self, records):
+        self.pn = np.random.uniform(1 - self.noise_factor,
+                                    1 + self.noise_factor, 3)
+        rgb_img = records['image']
+        rgb_img[:, :, 0] = np.minimum(
+            255.0, np.maximum(0.0, rgb_img[:, :, 0] * self.pn[0]))
+        rgb_img[:, :, 1] = np.minimum(
+            255.0, np.maximum(0.0, rgb_img[:, :, 1] * self.pn[1]))
+        rgb_img[:, :, 2] = np.minimum(
+            255.0, np.maximum(0.0, rgb_img[:, :, 2] * self.pn[2]))
+        records['image'] = rgb_img
+        return records
+
+
+@register_keypointop
+class FlipPose(object):
+    """random apply flip to image
+
+    Args:
+        noise_factor (float): the noise factor ratio used to generate the jitter
+
+    Returns:
+        records (dict): contain the image and coords after tranformed
+
+    """
+
+    def __init__(self, flip_prob=0.5, img_res=224, num_joints=14):
+        self.flip_pob = flip_prob
+        self.img_res = img_res
+        if num_joints == 24:
+            self.perm = [
+                5, 4, 3, 2, 1, 0, 11, 10, 9, 8, 7, 6, 12, 13, 14, 15, 16, 17,
+                18, 19, 21, 20, 23, 22
+            ]
+        elif num_joints == 14:
+            self.perm = [5, 4, 3, 2, 1, 0, 11, 10, 9, 8, 7, 6, 12, 13]
+        else:
+            print("error num_joints in flip :{}".format(num_joints))
+
+    def __call__(self, records):
+
+        if np.random.random() < self.flip_pob:
+            img = records['image']
+            img = np.fliplr(img)
+
+            if 'joints_2d' in records:
+                joints_2d = records['joints_2d']
+                joints_2d = joints_2d[self.perm]
+                joints_2d[:, 0] = self.img_res - joints_2d[:, 0]
+                records['joints_2d'] = joints_2d
+
+            if 'joints_3d' in records:
+                joints_3d = records['joints_3d']
+                joints_3d = joints_3d[self.perm]
+                joints_3d[:, 0] = -joints_3d[:, 0]
+                records['joints_3d'] = joints_3d
+
+            records['image'] = img
         return records
 
 
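For the FlipPose op added above, a small sketch of what the 14-joint permutation does (illustrative only; the dummy joints are made-up values): the first and second groups of six joints are each reversed, which swaps the left/right limb keypoints, x is mirrored around img_res for 2D joints, and x is negated for 3D joints.

import numpy as np

perm = [5, 4, 3, 2, 1, 0, 11, 10, 9, 8, 7, 6, 12, 13]
img_res = 224

joints_2d = np.arange(14 * 2, dtype=np.float32).reshape(14, 2)   # dummy joints
flipped_2d = joints_2d[perm].copy()
flipped_2d[:, 0] = img_res - flipped_2d[:, 0]   # mirror x, as in FlipPose.__call__
# joint 0 now holds what used to be joint 5, mirrored horizontally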
@@ -686,7 +968,7 @@ class ToHeatmapsTopDown(object):
             https://github.com/leoxiaobin/deep-high-resolution-net.pytorch
             Copyright (c) Microsoft, under the MIT License.
         """
-        joints = records['joints']
+        joints = records['gt_joints']
         joints_vis = records['joints_vis']
         num_joints = joints.shape[0]
         image_size = np.array(
@@ -729,7 +1011,7 @@ class ToHeatmapsTopDown(object):
                     0]:g_y[1], g_x[0]:g_x[1]]
         records['target'] = target
         records['target_weight'] = target_weight
-        del records['joints'], records['joints_vis']
+        del records['gt_joints'], records['joints_vis']
 
         return records
 
@@ -754,7 +1036,7 @@ class ToHeatmapsTopDown_DARK(object):
         self.sigma = sigma
 
     def __call__(self, records):
-        joints = records['joints']
+        joints = records['gt_joints']
         joints_vis = records['joints_vis']
         num_joints = joints.shape[0]
         image_size = np.array(
@@ -787,7 +1069,7 @@ class ToHeatmapsTopDown_DARK(object):
                     (x - mu_x)**2 + (y - mu_y)**2) / (2 * self.sigma**2))
         records['target'] = target
         records['target_weight'] = target_weight
-        del records['joints'], records['joints_vis']
+        del records['gt_joints'], records['joints_vis']
 
         return records
 
@@ -816,7 +1098,7 @@ class ToHeatmapsTopDown_UDP(object):
         self.sigma = sigma
 
     def __call__(self, records):
-        joints = records['joints']
+        joints = records['gt_joints']
         joints_vis = records['joints_vis']
         num_joints = joints.shape[0]
         image_size = np.array(
@@ -861,6 +1143,471 @@ class ToHeatmapsTopDown_UDP(object):
                     0]:g_y[1], g_x[0]:g_x[1]]
         records['target'] = target
         records['target_weight'] = target_weight
-        del records['joints'], records['joints_vis']
+        del records['gt_joints'], records['joints_vis']
 
         return records
+
+
+from typing import Optional, Tuple, Union, List
+import numbers
+
+
+def _scale_size(
+        size: Tuple[int, int],
+        scale: Union[float, int, tuple], ) -> Tuple[int, int]:
+    """Rescale a size by a ratio.
+
+    Args:
+        size (tuple[int]): (w, h).
+        scale (float | tuple(float)): Scaling factor.
+
+    Returns:
+        tuple[int]: scaled size.
+    """
+    if isinstance(scale, (float, int)):
+        scale = (scale, scale)
+    w, h = size
+    return int(w * float(scale[0]) + 0.5), int(h * float(scale[1]) + 0.5)
+
+
+def rescale_size(old_size: tuple,
+                 scale: Union[float, int, tuple],
+                 return_scale: bool=False) -> tuple:
+    """Calculate the new size to be rescaled to.
+
+    Args:
+        old_size (tuple[int]): The old size (w, h) of image.
+        scale (float | tuple[int]): The scaling factor or maximum size.
+            If it is a float number, then the image will be rescaled by this
+            factor, else if it is a tuple of 2 integers, then the image will
+            be rescaled as large as possible within the scale.
+        return_scale (bool): Whether to return the scaling factor besides the
+            rescaled image size.
+
+    Returns:
+        tuple[int]: The new rescaled image size.
+    """
+    w, h = old_size
+    if isinstance(scale, (float, int)):
+        if scale <= 0:
+            raise ValueError(f'Invalid scale {scale}, must be positive.')
+        scale_factor = scale
+    elif isinstance(scale, list):
+        max_long_edge = max(scale)
+        max_short_edge = min(scale)
+        scale_factor = min(max_long_edge / max(h, w),
+                           max_short_edge / min(h, w))
+    else:
+        raise TypeError(
+            f'Scale must be a number or tuple of int, but got {type(scale)}')
+
+    new_size = _scale_size((w, h), scale_factor)
+
+    if return_scale:
+        return new_size, scale_factor
+    else:
+        return new_size
+
+
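A worked example of rescale_size above (the numbers are illustrative): a 1920x1080 frame constrained to the scale [1333, 800] keeps its aspect ratio and is bounded by whichever edge hits its limit first.

# scale_factor = min(1333 / 1920, 800 / 1080) = min(0.694..., 0.740...) ~= 0.694
# new_size     = (int(1920 * 0.694 + 0.5), int(1080 * 0.694 + 0.5)) = (1333, 750)
new_size, factor = rescale_size((1920, 1080), [1333, 800], return_scale=True)
# -> new_size == (1333, 750), factor ~= 0.6943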
+def imrescale(img: np.ndarray,
+              scale: Union[float, Tuple[int, int]],
+              return_scale: bool=False,
+              interpolation: str='bilinear',
+              backend: Optional[str]=None) -> Union[np.ndarray, Tuple[
+                  np.ndarray, float]]:
+    """Resize image while keeping the aspect ratio.
+
+    Args:
+        img (ndarray): The input image.
+        scale (float | tuple[int]): The scaling factor or maximum size.
+            If it is a float number, then the image will be rescaled by this
+            factor, else if it is a tuple of 2 integers, then the image will
+            be rescaled as large as possible within the scale.
+        return_scale (bool): Whether to return the scaling factor besides the
+            rescaled image.
+        interpolation (str): Same as :func:`resize`.
+        backend (str | None): Same as :func:`resize`.
+
+    Returns:
+        ndarray: The rescaled image.
+    """
+    h, w = img.shape[:2]
+    new_size, scale_factor = rescale_size((w, h), scale, return_scale=True)
+    rescaled_img = imresize(
+        img, new_size, interpolation=interpolation, backend=backend)
+    if return_scale:
+        return rescaled_img, scale_factor
+    else:
+        return rescaled_img
+
+
+def imresize(
+        img: np.ndarray,
+        size: Tuple[int, int],
+        return_scale: bool=False,
+        interpolation: str='bilinear',
+        out: Optional[np.ndarray]=None,
+        backend: Optional[str]=None,
+        interp=cv2.INTER_LINEAR, ) -> Union[Tuple[np.ndarray, float, float],
+                                            np.ndarray]:
+    """Resize image to a given size.
+
+    Args:
+        img (ndarray): The input image.
+        size (tuple[int]): Target size (w, h).
+        return_scale (bool): Whether to return `w_scale` and `h_scale`.
+        interpolation (str): Interpolation method, accepted values are
+            "nearest", "bilinear", "bicubic", "area", "lanczos" for 'cv2'
+            backend, "nearest", "bilinear" for 'pillow' backend.
+        out (ndarray): The output destination.
+        backend (str | None): The image resize backend type. Options are `cv2`,
+            `pillow`, `None`. If backend is None, the global imread_backend
+            specified by ``mmcv.use_backend()`` will be used. Default: None.
+
+    Returns:
+        tuple | ndarray: (`resized_img`, `w_scale`, `h_scale`) or
+        `resized_img`.
+    """
+    h, w = img.shape[:2]
+    if backend is None:
+        backend = imread_backend
+    if backend not in ['cv2', 'pillow']:
+        raise ValueError(f'backend: {backend} is not supported for resize.'
+                         f"Supported backends are 'cv2', 'pillow'")
+
+    if backend == 'pillow':
+        assert img.dtype == np.uint8, 'Pillow backend only support uint8 type'
+        pil_image = Image.fromarray(img)
+        pil_image = pil_image.resize(size, pillow_interp_codes[interpolation])
+        resized_img = np.array(pil_image)
+    else:
+        resized_img = cv2.resize(img, size, dst=out, interpolation=interp)
+    if not return_scale:
+        return resized_img
+    else:
+        w_scale = size[0] / w
+        h_scale = size[1] / h
+        return resized_img, w_scale, h_scale
+
+
+class PETR_Resize:
+    """Resize images & bbox & mask.
+
+    This transform resizes the input image to some scale. Bboxes and masks are
+    then resized with the same scale factor. If the input dict contains the key
+    "scale", then the scale in the input dict is used, otherwise the specified
+    scale in the init method is used. If the input dict contains the key
+    "scale_factor" (if MultiScaleFlipAug does not give img_scale but
+    scale_factor), the actual scale will be computed by image shape and
+    scale_factor.
+
+    `img_scale` can either be a tuple (single-scale) or a list of tuple
+    (multi-scale). There are 3 multiscale modes:
+
+    - ``ratio_range is not None``: randomly sample a ratio from the ratio \
+      range and multiply it with the image scale.
+    - ``ratio_range is None`` and ``multiscale_mode == "range"``: randomly \
+      sample a scale from the multiscale range.
+    - ``ratio_range is None`` and ``multiscale_mode == "value"``: randomly \
+      sample a scale from multiple scales.
+
+    Args:
+        img_scale (tuple or list[tuple]): Images scales for resizing.
+        multiscale_mode (str): Either "range" or "value".
+        ratio_range (tuple[float]): (min_ratio, max_ratio)
+        keep_ratio (bool): Whether to keep the aspect ratio when resizing the
+            image.
+        bbox_clip_border (bool, optional): Whether to clip the objects outside
+            the border of the image. In some dataset like MOT17, the gt bboxes
+            are allowed to cross the border of images. Therefore, we don't
+            need to clip the gt bboxes in these cases. Defaults to True.
+        backend (str): Image resize backend, choices are 'cv2' and 'pillow'.
+            These two backends generates slightly different results. Defaults
+            to 'cv2'.
+        interpolation (str): Interpolation method, accepted values are
+            "nearest", "bilinear", "bicubic", "area", "lanczos" for 'cv2'
+            backend, "nearest", "bilinear" for 'pillow' backend.
+        override (bool, optional): Whether to override `scale` and
+            `scale_factor` so as to call resize twice. Default False. If True,
+            after the first resizing, the existed `scale` and `scale_factor`
+            will be ignored so the second resizing can be allowed.
+            This option is a work-around for multiple times of resize in DETR.
+            Defaults to False.
+    """
+
+    def __init__(self,
+                 img_scale=None,
+                 multiscale_mode='range',
+                 ratio_range=None,
+                 keep_ratio=True,
+                 bbox_clip_border=True,
+                 backend='cv2',
+                 interpolation='bilinear',
+                 override=False,
+                 keypoint_clip_border=True):
+        if img_scale is None:
+            self.img_scale = None
+        else:
+            if isinstance(img_scale, list):
+                self.img_scale = img_scale
+            else:
+                self.img_scale = [img_scale]
+            assert isinstance(self.img_scale, list)
+
+        if ratio_range is not None:
+            # mode 1: given a scale and a range of image ratio
+            assert len(self.img_scale) == 1
+        else:
+            # mode 2: given multiple scales or a range of scales
+            assert multiscale_mode in ['value', 'range']
+
+        self.backend = backend
+        self.multiscale_mode = multiscale_mode
+        self.ratio_range = ratio_range
+        self.keep_ratio = keep_ratio
+        # TODO: refactor the override option in Resize
+        self.interpolation = interpolation
+        self.override = override
+        self.bbox_clip_border = bbox_clip_border
+        self.keypoint_clip_border = keypoint_clip_border
+
+    @staticmethod
+    def random_select(img_scales):
+        """Randomly select an img_scale from given candidates.
+
+        Args:
+            img_scales (list[tuple]): Images scales for selection.
+
+        Returns:
+            (tuple, int): Returns a tuple ``(img_scale, scale_dix)``, \
+                where ``img_scale`` is the selected image scale and \
+                ``scale_idx`` is the selected index in the given candidates.
+        """
+
+        assert isinstance(img_scales, list)
+        scale_idx = np.random.randint(len(img_scales))
+        img_scale = img_scales[scale_idx]
+        return img_scale, scale_idx
+
+    @staticmethod
+    def random_sample(img_scales):
+        """Randomly sample an img_scale when ``multiscale_mode=='range'``.
+
+        Args:
+            img_scales (list[tuple]): Images scale range for sampling.
+                There must be two tuples in img_scales, which specify the lower
+                and upper bound of image scales.
+
+        Returns:
+            (tuple, None): Returns a tuple ``(img_scale, None)``, where \
+                ``img_scale`` is sampled scale and None is just a placeholder \
+                to be consistent with :func:`random_select`.
+        """
+
+        assert isinstance(img_scales, list) and len(img_scales) == 2
+        img_scale_long = [max(s) for s in img_scales]
+        img_scale_short = [min(s) for s in img_scales]
+        long_edge = np.random.randint(
+            min(img_scale_long), max(img_scale_long) + 1)
+        short_edge = np.random.randint(
+            min(img_scale_short), max(img_scale_short) + 1)
+        img_scale = (long_edge, short_edge)
+        return img_scale, None
+
+    @staticmethod
+    def random_sample_ratio(img_scale, ratio_range):
+        """Randomly sample an img_scale when ``ratio_range`` is specified.
+
+        A ratio will be randomly sampled from the range specified by
+        ``ratio_range``. Then it would be multiplied with ``img_scale`` to
+        generate sampled scale.
+
+        Args:
+            img_scale (list): Images scale base to multiply with ratio.
+            ratio_range (tuple[float]): The minimum and maximum ratio to scale
+                the ``img_scale``.
+
+        Returns:
+            (tuple, None): Returns a tuple ``(scale, None)``, where \
+                ``scale`` is sampled ratio multiplied with ``img_scale`` and \
+                None is just a placeholder to be consistent with \
+                :func:`random_select`.
+        """
+
+        assert isinstance(img_scale, list) and len(img_scale) == 2
+        min_ratio, max_ratio = ratio_range
+        assert min_ratio <= max_ratio
+        ratio = np.random.random_sample() * (max_ratio - min_ratio) + min_ratio
+        scale = int(img_scale[0] * ratio), int(img_scale[1] * ratio)
+        return scale, None
+
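To make the three multiscale modes from the class docstring concrete (illustrative configurations only, picked up by `_random_scale` below):

# 1) ratio_range: PETR_Resize(img_scale=[[1333, 800]], ratio_range=(0.8, 1.0))
#    -> scale = (int(1333 * r), int(800 * r)) for one random r in [0.8, 1.0)
# 2) multiscale_mode='range' with img_scale=[[1333, 640], [1333, 800]]
#    -> long edge drawn from [1333, 1333], short edge drawn from [640, 800]
# 3) multiscale_mode='value' with img_scale=[[1333, 640], [1333, 800]]
#    -> one of the two listed scales is picked uniformly at random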
+    def _random_scale(self, results):
+        """Randomly sample an img_scale according to ``ratio_range`` and
+        ``multiscale_mode``.
+
+        If ``ratio_range`` is specified, a ratio will be sampled and be
+        multiplied with ``img_scale``.
+        If multiple scales are specified by ``img_scale``, a scale will be
+        sampled according to ``multiscale_mode``.
+        Otherwise, single scale will be used.
+
+        Args:
+            results (dict): Result dict from :obj:`dataset`.
+
+        Returns:
+            dict: Two new keys 'scale` and 'scale_idx` are added into \
+                ``results``, which would be used by subsequent pipelines.
+        """
+
+        if self.ratio_range is not None:
+            scale, scale_idx = self.random_sample_ratio(self.img_scale[0],
+                                                        self.ratio_range)
+        elif len(self.img_scale) == 1:
+            scale, scale_idx = self.img_scale[0], 0
+        elif self.multiscale_mode == 'range':
+            scale, scale_idx = self.random_sample(self.img_scale)
+        elif self.multiscale_mode == 'value':
+            scale, scale_idx = self.random_select(self.img_scale)
+        else:
+            raise NotImplementedError
+        results['scale'] = scale
+        results['scale_idx'] = scale_idx
+
+    def _resize_img(self, results):
+        """Resize images with ``results['scale']``."""
+        for key in ['image'] if 'image' in results else []:
+            if self.keep_ratio:
+                img, scale_factor = imrescale(
+                    results[key],
+                    results['scale'],
+                    return_scale=True,
+                    interpolation=self.interpolation,
+                    backend=self.backend)
+                # the w_scale and h_scale has minor difference
+                # a real fix should be done in the imrescale in the future
+                new_h, new_w = img.shape[:2]
+                h, w = results[key].shape[:2]
+                w_scale = new_w / w
+                h_scale = new_h / h
+            else:
+                img, w_scale, h_scale = imresize(
+                    results[key],
+                    results['scale'],
+                    return_scale=True,
+                    interpolation=self.interpolation,
+                    backend=self.backend)
+
+            scale_factor = np.array(
+                [w_scale, h_scale, w_scale, h_scale], dtype=np.float32)
+            results['im_shape'] = np.array(img.shape)
+            # in case that there is no padding
+            results['pad_shape'] = img.shape
+            results['scale_factor'] = scale_factor
+            results['keep_ratio'] = self.keep_ratio
+            # img_pad = self.impad(img, shape=results['scale'])
+            results[key] = img
+
+    def _resize_bboxes(self, results):
+        """Resize bounding boxes with ``results['scale_factor']``."""
+        for key in ['gt_bbox'] if 'gt_bbox' in results else []:
+            bboxes = results[key] * results['scale_factor']
+            if self.bbox_clip_border:
+                img_shape = results['im_shape']
+                bboxes[:, 0::2] = np.clip(bboxes[:, 0::2], 0, img_shape[1])
+                bboxes[:, 1::2] = np.clip(bboxes[:, 1::2], 0, img_shape[0])
+            results[key] = bboxes
+
+    def _resize_masks(self, results):
+        """Resize masks with ``results['scale']``"""
+        for key in ['mask'] if 'mask' in results else []:
+            if results[key] is None:
+                continue
+            if self.keep_ratio:
+                results[key] = results[key].rescale(results['scale'])
+            else:
+                results[key] = results[key].resize(results['im_shape'][:2])
+
+    def _resize_seg(self, results):
+        """Resize semantic segmentation map with ``results['scale']``."""
+        for key in ['seg'] if 'seg' in results else []:
+            if self.keep_ratio:
+                gt_seg = imrescale(
+                    results[key],
+                    results['scale'],
+                    interpolation='nearest',
+                    backend=self.backend)
+            else:
+                gt_seg = imresize(
+                    results[key],
+                    results['scale'],
+                    interpolation='nearest',
+                    backend=self.backend)
+            results[key] = gt_seg
+
+    def _resize_keypoints(self, results):
+        """Resize keypoints with ``results['scale_factor']``."""
+        for key in ['gt_joints'] if 'gt_joints' in results else []:
+            keypoints = results[key].copy()
+            keypoints[..., 0] = keypoints[..., 0] * results['scale_factor'][0]
+            keypoints[..., 1] = keypoints[..., 1] * results['scale_factor'][1]
+            if self.keypoint_clip_border:
+                img_shape = results['im_shape']
+                keypoints[..., 0] = np.clip(keypoints[..., 0], 0, img_shape[1])
+                keypoints[..., 1] = np.clip(keypoints[..., 1], 0, img_shape[0])
+            results[key] = keypoints
+
+    def _resize_areas(self, results):
+        """Resize mask areas with ``results['scale_factor']``."""
+        for key in ['gt_areas'] if 'gt_areas' in results else []:
+            areas = results[key].copy()
+            areas = areas * results['scale_factor'][0] * results[
+                'scale_factor'][1]
+            results[key] = areas
+
+    def __call__(self, results):
+        """Call function to resize images, bounding boxes, masks, semantic
+        segmentation map.
+
+        Args:
+            results (dict): Result dict from loading pipeline.
+
+        Returns:
+            dict: Resized results, 'im_shape', 'pad_shape', 'scale_factor', \
+                'keep_ratio' keys are added into result dict.
+        """
+        if 'scale' not in results:
+            if 'scale_factor' in results:
+                img_shape = results['image'].shape[:2]
+                scale_factor = results['scale_factor'][0]
+                # assert isinstance(scale_factor, float)
+                results['scale'] = [int(x * scale_factor)
+                                    for x in img_shape][::-1]
+            else:
+                self._random_scale(results)
+        else:
+            if not self.override:
+                assert 'scale_factor' not in results, (
+                    'scale and scale_factor cannot be both set.')
+            else:
+                results.pop('scale')
+                if 'scale_factor' in results:
+                    results.pop('scale_factor')
+                self._random_scale(results)
+
+        self._resize_img(results)
+        self._resize_bboxes(results)
+        self._resize_masks(results)
+        self._resize_seg(results)
+        self._resize_keypoints(results)
+        self._resize_areas(results)
+        return results
+
+    def __repr__(self):
+        repr_str = self.__class__.__name__
+        repr_str += f'(img_scale={self.img_scale}, '
+        repr_str += f'multiscale_mode={self.multiscale_mode}, '
+        repr_str += f'ratio_range={self.ratio_range}, '
+        repr_str += f'keep_ratio={self.keep_ratio}, '
+        repr_str += f'bbox_clip_border={self.bbox_clip_border})'
+        repr_str += f'keypoint_clip_border={self.keypoint_clip_border})'
+        return repr_str
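A usage sketch for the PETR_Resize operator above (illustrative; the sample keys follow the conventions used elsewhere in this file and the shapes are made-up):

import numpy as np

resize = PETR_Resize(img_scale=[[800, 1333]], keep_ratio=True)
sample = {
    'image': np.zeros((1080, 1920, 3), dtype=np.uint8),
    'gt_bbox': np.array([[100., 200., 400., 600.]], dtype=np.float32),
}
sample = resize(sample)
# keep_ratio resizing by ~0.694: image -> (750, 1333, 3), gt_bbox scaled by the
# same factor, and 'im_shape', 'pad_shape', 'scale_factor', 'keep_ratio' are added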

+ 296 - 0
paddlers/models/ppdet/data/transform/keypoints_3d_operators.py

@@ -0,0 +1,296 @@
+# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import absolute_import
+
+try:
+    from collections.abc import Sequence
+except Exception:
+    from collections import Sequence
+import cv2
+import numpy as np
+import math
+import copy
+import random
+import uuid
+from numbers import Number, Integral
+
+from ...modeling.keypoint_utils import get_affine_mat_kernel, warp_affine_joints, get_affine_transform, affine_transform, get_warp_matrix
+from paddlers.models.ppdet.core.workspace import serializable
+from paddlers.models.ppdet.utils.logger import setup_logger
+logger = setup_logger(__name__)
+
+registered_ops = []
+
+__all__ = [
+    'CropAndFlipImages', 'PermuteImages', 'RandomFlipHalfBody3DTransformImages'
+]
+
+import matplotlib.pyplot as plt
+from PIL import Image, ImageDraw
+from mpl_toolkits.mplot3d import Axes3D
+
+
+def register_keypointop(cls):
+    return serializable(cls)
+
+
+def register_op(cls):
+    registered_ops.append(cls.__name__)
+    if not hasattr(BaseOperator, cls.__name__):
+        setattr(BaseOperator, cls.__name__, cls)
+    else:
+        raise KeyError("The {} class has been registered.".format(cls.__name__))
+    return serializable(cls)
+
+
+class BaseOperator(object):
+    def __init__(self, name=None):
+        if name is None:
+            name = self.__class__.__name__
+        self._id = name + '_' + str(uuid.uuid4())[-6:]
+
+    def apply(self, sample, context=None):
+        """ Process a sample.
+        Args:
+            sample (dict): a dict of sample, eg: {'image':xx, 'label': xxx}
+            context (dict): info about this sample processing
+        Returns:
+            result (dict): a processed sample
+        """
+        return sample
+
+    def __call__(self, sample, context=None):
+        """ Process a sample.
+        Args:
+            sample (dict): a dict of sample, eg: {'image':xx, 'label': xxx}
+            context (dict): info about this sample processing
+        Returns:
+            result (dict): a processed sample
+        """
+        if isinstance(sample, Sequence):  # for batch_size
+            for i in range(len(sample)):
+                sample[i] = self.apply(sample[i], context)
+        else:
+            # image.shape changed
+            sample = self.apply(sample, context)
+        return sample
+
+    def __str__(self):
+        return str(self._id)
+
+
+@register_keypointop
+class CropAndFlipImages(object):
+    """Crop all images"""
+
+    def __init__(self, crop_range, flip_pairs=None):
+        super(CropAndFlipImages, self).__init__()
+        self.crop_range = crop_range
+        self.flip_pairs = flip_pairs
+
+    def __call__(self, records):  # tuple
+        images = records["image"]
+        images = images[:, :, ::-1, :]
+        images = images[:, :, self.crop_range[0]:self.crop_range[1]]
+        records["image"] = images
+
+        if "kps2d" in records.keys():
+            kps2d = records["kps2d"]
+
+            width, height = images.shape[2], images.shape[1]
+            kps2d = np.array(kps2d)
+            kps2d[:, :, 0] = kps2d[:, :, 0] - self.crop_range[0]
+
+            for pair in self.flip_pairs:
+                kps2d[:, pair[0], :], kps2d[:,pair[1], :] = \
+                    kps2d[:,pair[1], :], kps2d[:,pair[0], :].copy()
+
+            records["kps2d"] = kps2d
+
+        return records
+
+
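A shape-level sketch of CropAndFlipImages above (the frame count, crop range, and flip pairs are illustrative):

import numpy as np

op = CropAndFlipImages(crop_range=[420, 1500], flip_pairs=[[1, 2]])
records = {
    'image': np.zeros((6, 1080, 1920, 3), dtype=np.uint8),   # (frames, H, W, C)
    'kps2d': np.zeros((6, 24, 2), dtype=np.float32),
}
records = op(records)
# the width axis is mirrored and then cropped: (6, 1080, 1920, 3) -> (6, 1080, 1080, 3)
# kps2d x coordinates are shifted by -420 and each flip pair of joints is swapped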
+@register_op
+class PermuteImages(BaseOperator):
+    def __init__(self):
+        """
+        Change the channel to be (batch_size, C, H, W) #(6, 3, 1080, 1920)
+        """
+        super(PermuteImages, self).__init__()
+
+    def apply(self, sample, context=None):
+        images = sample["image"]
+        images = images.transpose((0, 3, 1, 2))
+
+        sample["image"] = images
+
+        return sample
+
+
+@register_keypointop
+class RandomFlipHalfBody3DTransformImages(object):
+    """apply data augment to images and coords
+    to achieve the flip, scale, rotate and half body transform effect for training image
+    Args:
+        trainsize (list):[w, h], Image target size
+        upper_body_ids (list): The upper body joint ids
+        flip_pairs (list): The left-right joints exchange order list
+        pixel_std (int): The pixel std of the scale
+        scale (float): The scale factor to transform the image
+        rot (int): The rotate factor to transform the image
+        num_joints_half_body (int): The joints threshold of the half body transform
+        prob_half_body (float): The threshold of the half body transform
+        flip (bool): Whether to flip the image
+    Returns:
+        records(dict): contain the image and coords after tranformed
+    """
+
+    def __init__(self,
+                 trainsize,
+                 upper_body_ids,
+                 flip_pairs,
+                 pixel_std,
+                 scale=0.35,
+                 rot=40,
+                 num_joints_half_body=8,
+                 prob_half_body=0.3,
+                 flip=True,
+                 rot_prob=0.6,
+                 do_occlusion=False):
+        super(RandomFlipHalfBody3DTransformImages, self).__init__()
+        self.trainsize = trainsize
+        self.upper_body_ids = upper_body_ids
+        self.flip_pairs = flip_pairs
+        self.pixel_std = pixel_std
+        self.scale = scale
+        self.rot = rot
+        self.num_joints_half_body = num_joints_half_body
+        self.prob_half_body = prob_half_body
+        self.flip = flip
+        self.aspect_ratio = trainsize[0] * 1.0 / trainsize[1]
+        self.rot_prob = rot_prob
+        self.do_occlusion = do_occlusion
+
+    def halfbody_transform(self, joints, joints_vis):
+        upper_joints = []
+        lower_joints = []
+        for joint_id in range(joints.shape[0]):
+            if joints_vis[joint_id][0] > 0:
+                if joint_id in self.upper_body_ids:
+                    upper_joints.append(joints[joint_id])
+                else:
+                    lower_joints.append(joints[joint_id])
+        if np.random.randn() < 0.5 and len(upper_joints) > 2:
+            selected_joints = upper_joints
+        else:
+            selected_joints = lower_joints if len(
+                lower_joints) > 2 else upper_joints
+        if len(selected_joints) < 2:
+            return None, None
+        selected_joints = np.array(selected_joints, dtype=np.float32)
+        center = selected_joints.mean(axis=0)[:2]
+        left_top = np.amin(selected_joints, axis=0)
+        right_bottom = np.amax(selected_joints, axis=0)
+        w = right_bottom[0] - left_top[0]
+        h = right_bottom[1] - left_top[1]
+        if w > self.aspect_ratio * h:
+            h = w * 1.0 / self.aspect_ratio
+        elif w < self.aspect_ratio * h:
+            w = h * self.aspect_ratio
+        scale = np.array(
+            [w * 1.0 / self.pixel_std, h * 1.0 / self.pixel_std],
+            dtype=np.float32)
+        scale = scale * 1.5
+
+        return center, scale
+
+    def flip_joints(self, joints, joints_vis, width, matched_parts, kps2d=None):
+        # joints: (6, 24, 3),(num_frames, num_joints, 3)
+
+        joints[:, :, 0] = width - joints[:, :, 0] - 1  # x
+        if kps2d is not None:
+            kps2d[:, :, 0] = width - kps2d[:, :, 0] - 1
+
+        for pair in matched_parts:
+            joints[:, pair[0], :], joints[:,pair[1], :] = \
+                joints[:,pair[1], :], joints[:,pair[0], :].copy()
+
+            joints_vis[:,pair[0], :], joints_vis[:,pair[1], :] = \
+                joints_vis[:,pair[1], :], joints_vis[:,pair[0], :].copy()
+
+            if kps2d is not None:
+                kps2d[:, pair[0], :], kps2d[:,pair[1], :] = \
+                    kps2d[:,pair[1], :], kps2d[:,pair[0], :].copy()
+
+        # move to zero
+        joints -= joints[:, [0], :]  # (batch_size, 24, 3),numpy.ndarray
+
+        return joints, joints_vis, kps2d
+
+    def __call__(self, records):
+        images = records[
+            'image']  # records holds kps3d, kps3d_vis and image; image shape is (num_frames, height, width, 3)
+
+        joints = records['kps3d']
+        joints_vis = records['kps3d_vis']
+
+        kps2d = None
+        if 'kps2d' in records.keys():
+            kps2d = records['kps2d']
+
+        if self.flip and np.random.random() <= 0.5:
+            images = images[:, :, ::-1, :]  # horizontally flip the images, e.g. (6, 1080, 810, 3)
+            joints, joints_vis, kps2d = self.flip_joints(
+                joints, joints_vis, images.shape[2], self.flip_pairs,
+                kps2d)  # mirror the keypoints left/right
+        occlusion = False
+        if self.do_occlusion and random.random() <= 0.5:  # random occlusion
+            height = images[0].shape[0]
+            width = images[0].shape[1]
+            occlusion = True
+            while True:
+                area_min = 0.0
+                area_max = 0.2
+                synth_area = (random.random() *
+                              (area_max - area_min) + area_min) * width * height
+
+                ratio_min = 0.3
+                ratio_max = 1 / 0.3
+                synth_ratio = (random.random() *
+                               (ratio_max - ratio_min) + ratio_min)
+
+                synth_h = math.sqrt(synth_area * synth_ratio)
+                synth_w = math.sqrt(synth_area / synth_ratio)
+                synth_xmin = random.random() * (width - synth_w - 1)
+                synth_ymin = random.random() * (height - synth_h - 1)
+
+                if synth_xmin >= 0 and synth_ymin >= 0 and synth_xmin + synth_w < width and synth_ymin + synth_h < height:
+                    xmin = int(synth_xmin)
+                    ymin = int(synth_ymin)
+                    w = int(synth_w)
+                    h = int(synth_h)
+
+                    mask = np.random.rand(h, w, 3) * 255
+                    images[:, ymin:ymin + h, xmin:xmin + w, :] = mask[
+                        None, :, :, :]
+                    break
+
+        records['image'] = images
+        records['kps3d'] = joints
+        records['kps3d_vis'] = joints_vis
+        if kps2d is not None:
+            records['kps2d'] = kps2d
+
+        return records
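
For context (this sketch is not part of the patch): a minimal example of how the keypoint operators above compose on a dummy records dict. The clip shape, crop range, and joint count are invented, and PermuteImages.apply is called directly rather than assuming anything about BaseOperator.__call__.

import numpy as np

# Illustrative sample following the key names used by the operators above.
records = {
    "image": np.zeros((6, 1080, 1920, 3), dtype=np.float32),  # (frames, H, W, C)
    "kps3d": np.zeros((6, 24, 3), dtype=np.float32),
    "kps3d_vis": np.ones((6, 24, 3), dtype=np.float32),
}

# Horizontally flip, then keep the width range [420, 1500).
records = CropAndFlipImages(crop_range=[420, 1500])(records)  # image -> (6, 1080, 1080, 3)

# Channels-first layout for the network.
records = PermuteImages().apply(records)                      # image -> (6, 3, 1080, 1080)
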

File diff is too large to display
+ 500 - 71
paddlers/models/ppdet/data/transform/operators.py


+ 7 - 0
paddlers/models/ppdet/engine/__init__.py

@@ -15,6 +15,9 @@
 from . import trainer
 from . import trainer
 from .trainer import *
 from .trainer import *
 
 
+from . import trainer_cot
+from .trainer_cot import *
+
 from . import callbacks
 from . import callbacks
 from .callbacks import *
 from .callbacks import *
 
 
@@ -28,3 +31,7 @@ __all__ = trainer.__all__ \
 from . import tracker
 from . import tracker
 from .tracker import *
 from .tracker import *
 __all__ = __all__ + tracker.__all__
 __all__ = __all__ + tracker.__all__
+
+from . import trainer_ssod
+from .trainer_ssod import *
+__all__ = __all__ + trainer_ssod.__all__

+ 111 - 47
paddlers/models/ppdet/engine/callbacks.py

@@ -152,15 +152,14 @@ class LogPrinter(Callback):
             if mode == 'eval':
             if mode == 'eval':
                 sample_num = status['sample_num']
                 sample_num = status['sample_num']
                 cost_time = status['cost_time']
                 cost_time = status['cost_time']
-                logger.info('Total sample number: {}, averge FPS: {}'.format(
+                logger.info('Total sample number: {}, average FPS: {}'.format(
                     sample_num, sample_num / cost_time))
                     sample_num, sample_num / cost_time))
 
 
 
 
 class Checkpointer(Callback):
 class Checkpointer(Callback):
     def __init__(self, model):
     def __init__(self, model):
         super(Checkpointer, self).__init__(model)
         super(Checkpointer, self).__init__(model)
-        cfg = self.model.cfg
-        self.best_ap = 0.
+        self.best_ap = -1000.
         self.save_dir = os.path.join(self.model.cfg.save_dir,
         self.save_dir = os.path.join(self.model.cfg.save_dir,
                                      self.model.cfg.filename)
                                      self.model.cfg.filename)
         if hasattr(self.model.model, 'student_model'):
         if hasattr(self.model.model, 'student_model'):
@@ -187,7 +186,11 @@ class Checkpointer(Callback):
                 if 'save_best_model' in status and status['save_best_model']:
                 if 'save_best_model' in status and status['save_best_model']:
                     for metric in self.model._metrics:
                     for metric in self.model._metrics:
                         map_res = metric.get_results()
                         map_res = metric.get_results()
-                        if 'bbox' in map_res:
+                        eval_func = "ap"
+                        if 'pose3d' in map_res:
+                            key = 'pose3d'
+                            eval_func = "mpjpe"
+                        elif 'bbox' in map_res:
                             key = 'bbox'
                             key = 'bbox'
                         elif 'keypoint' in map_res:
                         elif 'keypoint' in map_res:
                             key = 'keypoint'
                             key = 'keypoint'
@@ -202,18 +205,36 @@ class Checkpointer(Callback):
                             self.best_ap = map_res[key][0]
                             self.best_ap = map_res[key][0]
                             save_name = 'best_model'
                             save_name = 'best_model'
                             weight = self.weight.state_dict()
                             weight = self.weight.state_dict()
-                        logger.info("Best test {} ap is {:0.3f}.".format(
-                            key, self.best_ap))
+                        logger.info("Best test {} {} is {:0.3f}.".format(
+                            key, eval_func, abs(self.best_ap)))
             if weight:
             if weight:
                 if self.model.use_ema:
                 if self.model.use_ema:
-                    # save model and ema_model
-                    save_model(
-                        status['weight'],
-                        self.model.optimizer,
-                        self.save_dir,
-                        save_name,
-                        epoch_id + 1,
-                        ema_model=weight)
+                    exchange_save_model = status.get('exchange_save_model',
+                                                     False)
+                    if not exchange_save_model:
+                        # save model and ema_model
+                        save_model(
+                            status['weight'],
+                            self.model.optimizer,
+                            self.save_dir,
+                            save_name,
+                            epoch_id + 1,
+                            ema_model=weight)
+                    else:
+                        # save model(student model) and ema_model(teacher model)
+                        # in DenseTeacher SSOD, the teacher model will be higher,
+                        # so exchange when saving pdparams
+                        student_model = status['weight']  # model
+                        teacher_model = weight  # ema_model
+                        save_model(
+                            teacher_model,
+                            self.model.optimizer,
+                            self.save_dir,
+                            save_name,
+                            epoch_id + 1,
+                            ema_model=student_model)
+                        del teacher_model
+                        del student_model
                 else:
                 else:
                     save_model(weight, self.model.optimizer, self.save_dir,
                     save_model(weight, self.model.optimizer, self.save_dir,
                                save_name, epoch_id + 1)
                                save_name, epoch_id + 1)
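
A hedged reading of the Checkpointer change above: bbox/keypoint AP is larger-is-better, while the new pose3d entry reports MPJPE, which is smaller-is-better. Initializing best_ap to -1000. and printing abs() only fits if the value stored for pose3d is sign-flipped, so the usual greater-than comparison (which lies outside the hunk shown) keeps working. A toy illustration under that assumption:

# Toy illustration only; not the Checkpointer code. Assumes map_res['pose3d'][0]
# already carries a negated MPJPE so "bigger is better" holds for every metric.
best = -1000.
for map_res in [{'pose3d': [-62.3]}, {'pose3d': [-58.1]}, {'pose3d': [-60.0]}]:
    key, eval_func = ('pose3d', 'mpjpe') if 'pose3d' in map_res else ('bbox', 'ap')
    if map_res[key][0] > best:            # -58.1 > -62.3: a lower MPJPE wins
        best = map_res[key][0]
print("Best test {} {} is {:0.3f}.".format(key, eval_func, abs(best)))  # 58.100
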
@@ -288,6 +309,7 @@ class VisualDLWriter(Callback):
                                                    self.vdl_mAP_step)
                                                    self.vdl_mAP_step)
                 self.vdl_mAP_step += 1
                 self.vdl_mAP_step += 1
 
 
+
 class WandbCallback(Callback):
 class WandbCallback(Callback):
     def __init__(self, model):
     def __init__(self, model):
         super(WandbCallback, self).__init__(model)
         super(WandbCallback, self).__init__(model)
@@ -307,10 +329,8 @@ class WandbCallback(Callback):
             self.wandb_params = {}
             self.wandb_params = {}
         for k, v in model.cfg.items():
         for k, v in model.cfg.items():
             if k.startswith("wandb_"):
             if k.startswith("wandb_"):
-                self.wandb_params.update({
-                    k.lstrip("wandb_"): v
-                })
-        
+                self.wandb_params.update({k.lstrip("wandb_"): v})
+
         self._run = None
         self._run = None
         if dist.get_world_size() < 2 or dist.get_rank() == 0:
         if dist.get_world_size() < 2 or dist.get_rank() == 0:
             _ = self.run
             _ = self.run
@@ -318,37 +338,50 @@ class WandbCallback(Callback):
             self.run.define_metric("epoch")
             self.run.define_metric("epoch")
             self.run.define_metric("eval/*", step_metric="epoch")
             self.run.define_metric("eval/*", step_metric="epoch")
 
 
-        self.best_ap = 0
-    
+        self.best_ap = -1000.
+        self.fps = []
+
     @property
     @property
     def run(self):
     def run(self):
         if self._run is None:
         if self._run is None:
             if self.wandb.run is not None:
             if self.wandb.run is not None:
-                logger.info("There is an ongoing wandb run which will be used"
-                        "for logging. Please use `wandb.finish()` to end that"
-                        "if the behaviour is not intended")
+                logger.info(
+                    "There is an ongoing wandb run which will be used"
+                    "for logging. Please use `wandb.finish()` to end that"
+                    "if the behaviour is not intended")
                 self._run = self.wandb.run
                 self._run = self.wandb.run
             else:
             else:
                 self._run = self.wandb.init(**self.wandb_params)
                 self._run = self.wandb.init(**self.wandb_params)
         return self._run
         return self._run
-    
+
     def save_model(self,
     def save_model(self,
-                optimizer,
-                save_dir,
-                save_name,
-                last_epoch,
-                ema_model=None,
-                ap=None, 
-                tags=None):
+                   optimizer,
+                   save_dir,
+                   save_name,
+                   last_epoch,
+                   ema_model=None,
+                   ap=None,
+                   fps=None,
+                   tags=None):
         if dist.get_world_size() < 2 or dist.get_rank() == 0:
         if dist.get_world_size() < 2 or dist.get_rank() == 0:
             model_path = os.path.join(save_dir, save_name)
             model_path = os.path.join(save_dir, save_name)
             metadata = {}
             metadata = {}
             metadata["last_epoch"] = last_epoch
             metadata["last_epoch"] = last_epoch
             if ap:
             if ap:
                 metadata["ap"] = ap
                 metadata["ap"] = ap
+
+            if fps:
+                metadata["fps"] = fps
+
             if ema_model is None:
             if ema_model is None:
-                ema_artifact = self.wandb.Artifact(name="ema_model-{}".format(self.run.id), type="model", metadata=metadata)
-                model_artifact = self.wandb.Artifact(name="model-{}".format(self.run.id), type="model", metadata=metadata)
+                ema_artifact = self.wandb.Artifact(
+                    name="ema_model-{}".format(self.run.id),
+                    type="model",
+                    metadata=metadata)
+                model_artifact = self.wandb.Artifact(
+                    name="model-{}".format(self.run.id),
+                    type="model",
+                    metadata=metadata)
 
 
                 ema_artifact.add_file(model_path + ".pdema", name="model_ema")
                 ema_artifact.add_file(model_path + ".pdema", name="model_ema")
                 model_artifact.add_file(model_path + ".pdparams", name="model")
                 model_artifact.add_file(model_path + ".pdparams", name="model")
@@ -356,10 +389,13 @@ class WandbCallback(Callback):
                 self.run.log_artifact(ema_artifact, aliases=tags)
                 self.run.log_artifact(ema_artifact, aliases=tags)
                 self.run.log_artfact(model_artifact, aliases=tags)
                 self.run.log_artfact(model_artifact, aliases=tags)
             else:
             else:
-                model_artifact = self.wandb.Artifact(name="model-{}".format(self.run.id), type="model", metadata=metadata)
+                model_artifact = self.wandb.Artifact(
+                    name="model-{}".format(self.run.id),
+                    type="model",
+                    metadata=metadata)
                 model_artifact.add_file(model_path + ".pdparams", name="model")
                 model_artifact.add_file(model_path + ".pdparams", name="model")
                 self.run.log_artifact(model_artifact, aliases=tags)
                 self.run.log_artifact(model_artifact, aliases=tags)
-    
+
     def on_step_end(self, status):
     def on_step_end(self, status):
 
 
         mode = status['mode']
         mode = status['mode']
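
For reference, the reformatted save_model above still logs the same wandb artifacts; a hedged sketch of the non-EMA branch with made-up paths and metric values (the checkpoint file must already exist for add_file to succeed):

import wandb

run = wandb.init(project="ppdet-example", mode="offline")  # illustrative run
metadata = {"last_epoch": 12, "ap": 0.513, "fps": 38.7}     # invented values
model_artifact = wandb.Artifact(
    name="model-{}".format(run.id), type="model", metadata=metadata)
model_artifact.add_file("output/best_model.pdparams", name="model")  # illustrative path
run.log_artifact(model_artifact, aliases=["latest", "best_model"])
run.finish()
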
@@ -368,22 +404,41 @@ class WandbCallback(Callback):
                 training_status = status['training_staus'].get()
                 training_status = status['training_staus'].get()
                 for k, v in training_status.items():
                 for k, v in training_status.items():
                     training_status[k] = float(v)
                     training_status[k] = float(v)
-                metrics = {
-                    "train/" + k: v for k,v in training_status.items()
-                }
+
+                # calculate ips, data_cost, batch_cost
+                batch_time = status['batch_time']
+                data_time = status['data_time']
+                batch_size = self.model.cfg['{}Reader'.format(mode.capitalize(
+                ))]['batch_size']
+
+                ips = float(batch_size) / float(batch_time.avg)
+                data_cost = float(data_time.avg)
+                batch_cost = float(batch_time.avg)
+
+                metrics = {"train/" + k: v for k, v in training_status.items()}
+
+                metrics["train/ips"] = ips
+                metrics["train/data_cost"] = data_cost
+                metrics["train/batch_cost"] = batch_cost
+
+                self.fps.append(ips)
                 self.run.log(metrics)
                 self.run.log(metrics)
-    
+
     def on_epoch_end(self, status):
     def on_epoch_end(self, status):
         mode = status['mode']
         mode = status['mode']
         epoch_id = status['epoch_id']
         epoch_id = status['epoch_id']
         save_name = None
         save_name = None
         if dist.get_world_size() < 2 or dist.get_rank() == 0:
         if dist.get_world_size() < 2 or dist.get_rank() == 0:
             if mode == 'train':
             if mode == 'train':
+                fps = sum(self.fps) / len(self.fps)
+                self.fps = []
+
                 end_epoch = self.model.cfg.epoch
                 end_epoch = self.model.cfg.epoch
                 if (
                 if (
                         epoch_id + 1
                         epoch_id + 1
                 ) % self.model.cfg.snapshot_epoch == 0 or epoch_id == end_epoch - 1:
                 ) % self.model.cfg.snapshot_epoch == 0 or epoch_id == end_epoch - 1:
-                    save_name = str(epoch_id) if epoch_id != end_epoch - 1 else "model_final"
+                    save_name = str(
+                        epoch_id) if epoch_id != end_epoch - 1 else "model_final"
                     tags = ["latest", "epoch_{}".format(epoch_id)]
                     tags = ["latest", "epoch_{}".format(epoch_id)]
                     self.save_model(
                     self.save_model(
                         self.model.optimizer,
                         self.model.optimizer,
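
To make the new throughput fields concrete, a small worked example of the formulas above (all numbers invented):

# train/ips: images per second from the reader batch size and average step time.
batch_size, avg_batch_time, avg_data_time = 8, 0.25, 0.05
ips = batch_size / avg_batch_time   # 32.0 -> logged as train/ips
data_cost = avg_data_time           # 0.05 -> train/data_cost
batch_cost = avg_batch_time         # 0.25 -> train/batch_cost

# eval/fps: evaluated samples over total evaluation wall time.
sample_num, cost_time = 5000, 125.0
fps = sample_num / cost_time        # 40.0 -> eval/fps
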
@@ -391,20 +446,29 @@ class WandbCallback(Callback):
                         save_name,
                         save_name,
                         epoch_id + 1,
                         epoch_id + 1,
                         self.model.use_ema,
                         self.model.use_ema,
-                        tags=tags
-                    )
+                        fps=fps,
+                        tags=tags)
             if mode == 'eval':
             if mode == 'eval':
+                sample_num = status['sample_num']
+                cost_time = status['cost_time']
+
+                fps = sample_num / cost_time
+
                 merged_dict = {}
                 merged_dict = {}
                 for metric in self.model._metrics:
                 for metric in self.model._metrics:
                     for key, map_value in metric.get_results().items():
                     for key, map_value in metric.get_results().items():
                         merged_dict["eval/{}-mAP".format(key)] = map_value[0]
                         merged_dict["eval/{}-mAP".format(key)] = map_value[0]
                 merged_dict["epoch"] = status["epoch_id"]
                 merged_dict["epoch"] = status["epoch_id"]
+                merged_dict["eval/fps"] = sample_num / cost_time
+
                 self.run.log(merged_dict)
                 self.run.log(merged_dict)
 
 
                 if 'save_best_model' in status and status['save_best_model']:
                 if 'save_best_model' in status and status['save_best_model']:
                     for metric in self.model._metrics:
                     for metric in self.model._metrics:
                         map_res = metric.get_results()
                         map_res = metric.get_results()
-                        if 'bbox' in map_res:
+                        if 'pose3d' in map_res:
+                            key = 'pose3d'
+                        elif 'bbox' in map_res:
                             key = 'bbox'
                             key = 'bbox'
                         elif 'keypoint' in map_res:
                         elif 'keypoint' in map_res:
                             key = 'keypoint'
                             key = 'keypoint'
@@ -426,10 +490,10 @@ class WandbCallback(Callback):
                                 save_name,
                                 save_name,
                                 last_epoch=epoch_id + 1,
                                 last_epoch=epoch_id + 1,
                                 ema_model=self.model.use_ema,
                                 ema_model=self.model.use_ema,
-                                ap=self.best_ap,
-                                tags=tags
-                            )
-    
+                                ap=abs(self.best_ap),
+                                fps=fps,
+                                tags=tags)
+
     def on_train_end(self, status):
     def on_train_end(self, status):
         self.run.finish()
         self.run.finish()
 
 

+ 54 - 6
paddlers/models/ppdet/engine/export_utils.py

@@ -29,6 +29,7 @@ logger = setup_logger('ppdet.engine')
 # Global dictionary
 # Global dictionary
 TRT_MIN_SUBGRAPH = {
 TRT_MIN_SUBGRAPH = {
     'YOLO': 3,
     'YOLO': 3,
+    'PPYOLOE': 3,
     'SSD': 60,
     'SSD': 60,
     'RCNN': 40,
     'RCNN': 40,
     'RetinaNet': 40,
     'RetinaNet': 40,
@@ -42,6 +43,7 @@ TRT_MIN_SUBGRAPH = {
     'HRNet': 3,
     'HRNet': 3,
     'DeepSORT': 3,
     'DeepSORT': 3,
     'ByteTrack': 10,
     'ByteTrack': 10,
+    'CenterTrack': 5,
     'JDE': 10,
     'JDE': 10,
     'FairMOT': 5,
     'FairMOT': 5,
     'GFL': 16,
     'GFL': 16,
@@ -49,10 +51,46 @@ TRT_MIN_SUBGRAPH = {
     'CenterNet': 5,
     'CenterNet': 5,
     'TOOD': 5,
     'TOOD': 5,
     'YOLOX': 8,
     'YOLOX': 8,
+    'YOLOF': 40,
+    'METRO_Body': 3,
+    'DETR': 3,
 }
 }
 
 
 KEYPOINT_ARCH = ['HigherHRNet', 'TopDownHRNet']
 KEYPOINT_ARCH = ['HigherHRNet', 'TopDownHRNet']
-MOT_ARCH = ['DeepSORT', 'JDE', 'FairMOT', 'ByteTrack']
+MOT_ARCH = ['JDE', 'FairMOT', 'DeepSORT', 'ByteTrack', 'CenterTrack']
+
+TO_STATIC_SPEC = {
+    'yolov3_darknet53_270e_coco': [{
+        'im_id': paddle.static.InputSpec(
+            name='im_id', shape=[-1, 1], dtype='float32'),
+        'is_crowd': paddle.static.InputSpec(
+            name='is_crowd', shape=[-1, 50], dtype='float32'),
+        'gt_bbox': paddle.static.InputSpec(
+            name='gt_bbox', shape=[-1, 50, 4], dtype='float32'),
+        'curr_iter': paddle.static.InputSpec(
+            name='curr_iter', shape=[-1], dtype='float32'),
+        'image': paddle.static.InputSpec(
+            name='image', shape=[-1, 3, -1, -1], dtype='float32'),
+        'im_shape': paddle.static.InputSpec(
+            name='im_shape', shape=[-1, 2], dtype='float32'),
+        'scale_factor': paddle.static.InputSpec(
+            name='scale_factor', shape=[-1, 2], dtype='float32'),
+        'target0': paddle.static.InputSpec(
+            name='target0', shape=[-1, 3, 86, -1, -1], dtype='float32'),
+        'target1': paddle.static.InputSpec(
+            name='target1', shape=[-1, 3, 86, -1, -1], dtype='float32'),
+        'target2': paddle.static.InputSpec(
+            name='target2', shape=[-1, 3, 86, -1, -1], dtype='float32'),
+    }],
+}
+
+
+def apply_to_static(config, model):
+    filename = config.get('filename', None)
+    spec = TO_STATIC_SPEC.get(filename, None)
+    model = paddle.jit.to_static(model, input_spec=spec)
+    logger.info("Successfully to apply @to_static with specs: {}".format(spec))
+    return model
 
 
 
 
 def _prune_input_spec(input_spec, program, targets):
 def _prune_input_spec(input_spec, program, targets):
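
To show how the new hook is meant to be wired in (mirroring the Trainer change later in this commit), here is a hedged sketch; the config values are assumptions, and a filename without an entry in TO_STATIC_SPEC simply falls back to input_spec=None:

import paddle
from paddlers.models.ppdet.engine.export_utils import apply_to_static

# Toy stand-ins; real code passes the loaded ppdet config and detector model.
model = paddle.nn.Linear(4, 2)
config = {'filename': 'yolov3_darknet53_270e_coco', 'to_static': True}

if config.get('to_static', False):
    # Wraps the model with paddle.jit.to_static using the InputSpec list
    # registered for this filename; conversion itself happens lazily at the
    # first forward call.
    model = apply_to_static(config, model)
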
@@ -140,10 +178,11 @@ def _dump_infer_config(config, path, image_shape, model):
         infer_cfg['export_onnx'] = True
         infer_cfg['export_onnx'] = True
         infer_cfg['export_eb'] = export_eb
         infer_cfg['export_eb'] = export_eb
 
 
-
     if infer_arch in MOT_ARCH:
     if infer_arch in MOT_ARCH:
         if infer_arch == 'DeepSORT':
         if infer_arch == 'DeepSORT':
             tracker_cfg = config['DeepSORTTracker']
             tracker_cfg = config['DeepSORTTracker']
+        elif infer_arch == 'CenterTrack':
+            tracker_cfg = config['CenterTracker']
         else:
         else:
             tracker_cfg = config['JDETracker']
             tracker_cfg = config['JDETracker']
         infer_cfg['tracker'] = _parse_tracker(tracker_cfg)
         infer_cfg['tracker'] = _parse_tracker(tracker_cfg)
@@ -155,7 +194,10 @@ def _dump_infer_config(config, path, image_shape, model):
             arch_state = True
             arch_state = True
             break
             break
 
 
-    if infer_arch == 'YOLOX':
+    if infer_arch == 'PPYOLOEWithAuxHead':
+        infer_arch = 'PPYOLOE'
+
+    if infer_arch in ['PPYOLOE', 'YOLOX', 'YOLOF']:
         infer_cfg['arch'] = infer_arch
         infer_cfg['arch'] = infer_arch
         infer_cfg['min_subgraph_size'] = TRT_MIN_SUBGRAPH[infer_arch]
         infer_cfg['min_subgraph_size'] = TRT_MIN_SUBGRAPH[infer_arch]
         arch_state = True
         arch_state = True
@@ -174,9 +216,15 @@ def _dump_infer_config(config, path, image_shape, model):
         label_arch = 'keypoint_arch'
         label_arch = 'keypoint_arch'
 
 
     if infer_arch in MOT_ARCH:
     if infer_arch in MOT_ARCH:
-        label_arch = 'mot_arch'
-        reader_cfg = config['TestMOTReader']
-        dataset_cfg = config['TestMOTDataset']
+        if config['metric'] in ['COCO', 'VOC']:
+            # MOT model run as Detector
+            reader_cfg = config['TestReader']
+            dataset_cfg = config['TestDataset']
+        else:
+            # 'metric' in ['MOT', 'MCMOT', 'KITTI']
+            label_arch = 'mot_arch'
+            reader_cfg = config['TestMOTReader']
+            dataset_cfg = config['TestMOTDataset']
     else:
     else:
         reader_cfg = config['TestReader']
         reader_cfg = config['TestReader']
         dataset_cfg = config['TestDataset']
         dataset_cfg = config['TestDataset']

+ 107 - 10
paddlers/models/ppdet/engine/tracker.py

@@ -29,9 +29,11 @@ from paddlers.models.ppdet.core.workspace import create
 from paddlers.models.ppdet.utils.checkpoint import load_weight, load_pretrain_weight
 from paddlers.models.ppdet.utils.checkpoint import load_weight, load_pretrain_weight
 from paddlers.models.ppdet.modeling.mot.utils import Detection, get_crops, scale_coords, clip_box
 from paddlers.models.ppdet.modeling.mot.utils import Detection, get_crops, scale_coords, clip_box
 from paddlers.models.ppdet.modeling.mot.utils import MOTTimer, load_det_results, write_mot_results, save_vis_results
 from paddlers.models.ppdet.modeling.mot.utils import MOTTimer, load_det_results, write_mot_results, save_vis_results
-from paddlers.models.ppdet.modeling.mot.tracker import JDETracker, DeepSORTTracker, OCSORTTracker
+from paddlers.models.ppdet.modeling.mot.tracker import JDETracker, CenterTracker
+from paddlers.models.ppdet.modeling.mot.tracker import DeepSORTTracker, OCSORTTracker, BOTSORTTracker
 from paddlers.models.ppdet.modeling.architectures import YOLOX
 from paddlers.models.ppdet.modeling.architectures import YOLOX
 from paddlers.models.ppdet.metrics import Metric, MOTMetric, KITTIMOTMetric, MCMOTMetric
 from paddlers.models.ppdet.metrics import Metric, MOTMetric, KITTIMOTMetric, MCMOTMetric
+from paddlers.models.ppdet.data.source.category import get_categories
 import paddlers.models.ppdet.utils.stats as stats
 import paddlers.models.ppdet.utils.stats as stats
 
 
 from .callbacks import Callback, ComposeCallback
 from .callbacks import Callback, ComposeCallback
@@ -39,9 +41,9 @@ from .callbacks import Callback, ComposeCallback
 from paddlers.models.ppdet.utils.logger import setup_logger
 from paddlers.models.ppdet.utils.logger import setup_logger
 logger = setup_logger(__name__)
 logger = setup_logger(__name__)
 
 
-MOT_ARCH = ['DeepSORT', 'JDE', 'FairMOT', 'ByteTrack']
-MOT_ARCH_JDE = ['JDE', 'FairMOT']
-MOT_ARCH_SDE = ['DeepSORT', 'ByteTrack']
+MOT_ARCH = ['JDE', 'FairMOT', 'DeepSORT', 'ByteTrack', 'CenterTrack']
+MOT_ARCH_JDE = MOT_ARCH[:2]
+MOT_ARCH_SDE = MOT_ARCH[2:4]
 MOT_DATA_TYPE = ['mot', 'mcmot', 'kitti']
 MOT_DATA_TYPE = ['mot', 'mcmot', 'kitti']
 
 
 __all__ = ['Tracker']
 __all__ = ['Tracker']
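
Spelled out, the new slicing keeps the JDE-style and SDE-style groups unchanged while leaving CenterTrack in neither group; it is handled by the dedicated _eval_seq_centertrack path added below:

MOT_ARCH = ['JDE', 'FairMOT', 'DeepSORT', 'ByteTrack', 'CenterTrack']
MOT_ARCH_JDE = MOT_ARCH[:2]   # ['JDE', 'FairMOT']
MOT_ARCH_SDE = MOT_ARCH[2:4]  # ['DeepSORT', 'ByteTrack'] (CenterTrack excluded)
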
@@ -67,6 +69,13 @@ class Tracker(object):
                     m._epsilon = 1e-3  # for amp(fp16)
                     m._epsilon = 1e-3  # for amp(fp16)
                     m._momentum = 0.97  # 0.03 in pytorch
                     m._momentum = 0.97  # 0.03 in pytorch
 
 
+        anno_file = self.dataset.get_anno()
+        clsid2catid, catid2name = get_categories(
+            self.cfg.metric, anno_file=anno_file)
+        self.ids2names = []
+        for k, v in catid2name.items():
+            self.ids2names.append(v)
+
         self.status = {}
         self.status = {}
         self.start_epoch = 0
         self.start_epoch = 0
 
 
@@ -130,6 +139,53 @@ class Tracker(object):
         else:
         else:
             load_weight(self.model.reid, reid_weights)
             load_weight(self.model.reid, reid_weights)
 
 
+    def _eval_seq_centertrack(self,
+                              dataloader,
+                              save_dir=None,
+                              show_image=False,
+                              frame_rate=30,
+                              draw_threshold=0):
+        assert isinstance(self.model.tracker, CenterTracker)
+        if save_dir:
+            if not os.path.exists(save_dir): os.makedirs(save_dir)
+        tracker = self.model.tracker
+
+        timer = MOTTimer()
+        frame_id = 0
+        self.status['mode'] = 'track'
+        self.model.eval()
+        results = defaultdict(list)  # only support single class now
+
+        for step_id, data in enumerate(tqdm(dataloader)):
+            self.status['step_id'] = step_id
+            if step_id == 0:
+                self.model.reset_tracking()
+
+            # forward
+            timer.tic()
+            pred_ret = self.model(data)
+
+            online_targets = tracker.update(pred_ret)
+            online_tlwhs, online_scores, online_ids = [], [], []
+            for t in online_targets:
+                bbox = t['bbox']
+                tlwh = [bbox[0], bbox[1], bbox[2] - bbox[0], bbox[3] - bbox[1]]
+                tscore = float(t['score'])
+                tid = int(t['tracking_id'])
+                if tlwh[2] * tlwh[3] > 0:
+                    online_tlwhs.append(tlwh)
+                    online_ids.append(tid)
+                    online_scores.append(tscore)
+            timer.toc()
+            # save results
+            results[0].append(
+                (frame_id + 1, online_tlwhs, online_scores, online_ids))
+            save_vis_results(data, frame_id, online_ids, online_tlwhs,
+                             online_scores, timer.average_time, show_image,
+                             save_dir, self.cfg.num_classes, self.ids2names)
+            frame_id += 1
+        return results, frame_id, timer.average_time, timer.calls
+
     def _eval_seq_jde(self,
     def _eval_seq_jde(self,
                       dataloader,
                       dataloader,
                       save_dir=None,
                       save_dir=None,
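
For readers unfamiliar with CenterTrack output, each target consumed by the loop above is a dict carrying an xyxy box, a score, and a tracking id; a tiny illustration of the tlwh conversion (values invented):

t = {'bbox': [100.0, 50.0, 180.0, 210.0], 'score': 0.87, 'tracking_id': 3}
x1, y1, x2, y2 = t['bbox']
tlwh = [x1, y1, x2 - x1, y2 - y1]  # [100.0, 50.0, 80.0, 160.0]
keep = tlwh[2] * tlwh[3] > 0       # zero-area boxes are dropped
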
@@ -180,7 +236,7 @@ class Tracker(object):
             timer.toc()
             timer.toc()
             save_vis_results(data, frame_id, online_ids, online_tlwhs,
             save_vis_results(data, frame_id, online_ids, online_tlwhs,
                              online_scores, timer.average_time, show_image,
                              online_scores, timer.average_time, show_image,
-                             save_dir, self.cfg.num_classes)
+                             save_dir, self.cfg.num_classes, self.ids2names)
             frame_id += 1
             frame_id += 1
 
 
         return results, frame_id, timer.average_time, timer.calls
         return results, frame_id, timer.average_time, timer.calls
@@ -197,7 +253,11 @@ class Tracker(object):
         if save_dir:
         if save_dir:
             if not os.path.exists(save_dir): os.makedirs(save_dir)
             if not os.path.exists(save_dir): os.makedirs(save_dir)
         use_detector = False if not self.model.detector else True
         use_detector = False if not self.model.detector else True
-        use_reid = False if not self.model.reid else True
+        use_reid = hasattr(self.model, 'reid')
+        if use_reid and self.model.reid is not None:
+            use_reid = True
+        else:
+            use_reid = False
 
 
         timer = MOTTimer()
         timer = MOTTimer()
         results = defaultdict(list)
         results = defaultdict(list)
@@ -290,7 +350,7 @@ class Tracker(object):
                 online_ids, online_tlwhs, online_scores = None, None, None
                 online_ids, online_tlwhs, online_scores = None, None, None
                 save_vis_results(data, frame_id, online_ids, online_tlwhs,
                 save_vis_results(data, frame_id, online_ids, online_tlwhs,
                                  online_scores, timer.average_time, show_image,
                                  online_scores, timer.average_time, show_image,
-                                 save_dir, self.cfg.num_classes)
+                                 save_dir, self.cfg.num_classes, self.ids2names)
                 frame_id += 1
                 frame_id += 1
                 # thus will not inference reid model
                 # thus will not inference reid model
                 continue
                 continue
@@ -338,7 +398,7 @@ class Tracker(object):
                     (frame_id + 1, online_tlwhs, online_scores, online_ids))
                     (frame_id + 1, online_tlwhs, online_scores, online_ids))
                 save_vis_results(data, frame_id, online_ids, online_tlwhs,
                 save_vis_results(data, frame_id, online_ids, online_tlwhs,
                                  online_scores, timer.average_time, show_image,
                                  online_scores, timer.average_time, show_image,
-                                 save_dir, self.cfg.num_classes)
+                                 save_dir, self.cfg.num_classes, self.ids2names)
 
 
             elif isinstance(tracker, JDETracker):
             elif isinstance(tracker, JDETracker):
                 # trick hyperparams only used for MOTChallenge (MOT17, MOT20) Test-set
                 # trick hyperparams only used for MOTChallenge (MOT17, MOT20) Test-set
@@ -369,7 +429,8 @@ class Tracker(object):
                 timer.toc()
                 timer.toc()
                 save_vis_results(data, frame_id, online_ids, online_tlwhs,
                 save_vis_results(data, frame_id, online_ids, online_tlwhs,
                                  online_scores, timer.average_time, show_image,
                                  online_scores, timer.average_time, show_image,
-                                 save_dir, self.cfg.num_classes)
+                                 save_dir, self.cfg.num_classes, self.ids2names)
+
             elif isinstance(tracker, OCSORTTracker):
             elif isinstance(tracker, OCSORTTracker):
                 # OC_SORT Tracker
                 # OC_SORT Tracker
                 online_targets = tracker.update(pred_dets_old, pred_embs)
                 online_targets = tracker.update(pred_dets_old, pred_embs)
@@ -390,7 +451,31 @@ class Tracker(object):
                     (frame_id + 1, online_tlwhs, online_scores, online_ids))
                     (frame_id + 1, online_tlwhs, online_scores, online_ids))
                 save_vis_results(data, frame_id, online_ids, online_tlwhs,
                 save_vis_results(data, frame_id, online_ids, online_tlwhs,
                                  online_scores, timer.average_time, show_image,
                                  online_scores, timer.average_time, show_image,
-                                 save_dir, self.cfg.num_classes)
+                                 save_dir, self.cfg.num_classes, self.ids2names)
+
+            elif isinstance(tracker, BOTSORTTracker):
+                # BOTSORT Tracker
+                online_targets = tracker.update(
+                    pred_dets_old, img=ori_image.numpy())
+                online_tlwhs = []
+                online_ids = []
+                online_scores = []
+                for t in online_targets:
+                    tlwh = t.tlwh
+                    tid = t.track_id
+                    tscore = t.score
+                    if tlwh[2] * tlwh[3] > 0:
+                        online_tlwhs.append(tlwh)
+                        online_ids.append(tid)
+                        online_scores.append(tscore)
+                timer.toc()
+                # save results
+                results[0].append(
+                    (frame_id + 1, online_tlwhs, online_scores, online_ids))
+                save_vis_results(data, frame_id, online_ids, online_tlwhs,
+                                 online_scores, timer.average_time, show_image,
+                                 save_dir, self.cfg.num_classes, self.ids2names)
+
             else:
             else:
                 raise ValueError(tracker)
                 raise ValueError(tracker)
             frame_id += 1
             frame_id += 1
@@ -461,6 +546,12 @@ class Tracker(object):
                         scaled=scaled,
                         scaled=scaled,
                         det_file=os.path.join(det_results_dir,
                         det_file=os.path.join(det_results_dir,
                                               '{}.txt'.format(seq)))
                                               '{}.txt'.format(seq)))
+                elif model_type == 'CenterTrack':
+                    results, nf, ta, tc = self._eval_seq_centertrack(
+                        dataloader,
+                        save_dir=save_dir,
+                        show_image=show_image,
+                        frame_rate=frame_rate)
                 else:
                 else:
                     raise ValueError(model_type)
                     raise ValueError(model_type)
 
 
@@ -587,6 +678,12 @@ class Tracker(object):
                     det_file=os.path.join(det_results_dir,
                     det_file=os.path.join(det_results_dir,
                                           '{}.txt'.format(seq)),
                                           '{}.txt'.format(seq)),
                     draw_threshold=draw_threshold)
                     draw_threshold=draw_threshold)
+            elif model_type == 'CenterTrack':
+                results, nf, ta, tc = self._eval_seq_centertrack(
+                    dataloader,
+                    save_dir=save_dir,
+                    show_image=show_image,
+                    frame_rate=frame_rate)
             else:
             else:
                 raise ValueError(model_type)
                 raise ValueError(model_type)
 
 

+ 147 - 30
paddlers/models/ppdet/engine/trainer.py

@@ -38,7 +38,7 @@ from paddlers.models.ppdet.optimizer import ModelEMA
 from paddlers.models.ppdet.core.workspace import create
 from paddlers.models.ppdet.core.workspace import create
 from paddlers.models.ppdet.utils.checkpoint import load_weight, load_pretrain_weight
 from paddlers.models.ppdet.utils.checkpoint import load_weight, load_pretrain_weight
 from paddlers.models.ppdet.utils.visualizer import visualize_results, save_result
 from paddlers.models.ppdet.utils.visualizer import visualize_results, save_result
-from paddlers.models.ppdet.metrics import Metric, COCOMetric, VOCMetric, WiderFaceMetric, get_infer_results, KeyPointTopDownCOCOEval, KeyPointTopDownMPIIEval
+from paddlers.models.ppdet.metrics import Metric, COCOMetric, VOCMetric, WiderFaceMetric, get_infer_results, KeyPointTopDownCOCOEval, KeyPointTopDownMPIIEval, Pose3DEval
 from paddlers.models.ppdet.metrics import RBoxMetric, JDEDetMetric, SNIPERCOCOMetric
 from paddlers.models.ppdet.metrics import RBoxMetric, JDEDetMetric, SNIPERCOCOMetric
 from paddlers.models.ppdet.data.source.sniper_coco import SniperCOCODataSet
 from paddlers.models.ppdet.data.source.sniper_coco import SniperCOCODataSet
 from paddlers.models.ppdet.data.source.category import get_categories
 from paddlers.models.ppdet.data.source.category import get_categories
@@ -48,7 +48,7 @@ from paddlers.models.ppdet.utils import profiler
 from paddlers.models.ppdet.modeling.post_process import multiclass_nms
 from paddlers.models.ppdet.modeling.post_process import multiclass_nms
 
 
 from .callbacks import Callback, ComposeCallback, LogPrinter, Checkpointer, WiferFaceEval, VisualDLWriter, SniperProposalsGenerator, WandbCallback
 from .callbacks import Callback, ComposeCallback, LogPrinter, Checkpointer, WiferFaceEval, VisualDLWriter, SniperProposalsGenerator, WandbCallback
-from .export_utils import _dump_infer_config, _prune_input_spec
+from .export_utils import _dump_infer_config, _prune_input_spec, apply_to_static
 
 
 from paddle.distributed.fleet.utils.hybrid_parallel_util import fused_allreduce_gradients
 from paddle.distributed.fleet.utils.hybrid_parallel_util import fused_allreduce_gradients
 
 
@@ -57,12 +57,12 @@ logger = setup_logger('ppdet.engine')
 
 
 __all__ = ['Trainer']
 __all__ = ['Trainer']
 
 
-MOT_ARCH = ['DeepSORT', 'JDE', 'FairMOT', 'ByteTrack']
+MOT_ARCH = ['JDE', 'FairMOT', 'DeepSORT', 'ByteTrack', 'CenterTrack']
 
 
 
 
 class Trainer(object):
 class Trainer(object):
     def __init__(self, cfg, mode='train'):
     def __init__(self, cfg, mode='train'):
-        self.cfg = cfg
+        self.cfg = cfg.copy()
         assert mode.lower() in ['train', 'eval', 'test'], \
         assert mode.lower() in ['train', 'eval', 'test'], \
                 "mode should be 'train', 'eval' or 'test'"
                 "mode should be 'train', 'eval' or 'test'"
         self.mode = mode.lower()
         self.mode = mode.lower()
@@ -72,10 +72,14 @@ class Trainer(object):
         self.amp_level = self.cfg.get('amp_level', 'O1')
         self.amp_level = self.cfg.get('amp_level', 'O1')
         self.custom_white_list = self.cfg.get('custom_white_list', None)
         self.custom_white_list = self.cfg.get('custom_white_list', None)
         self.custom_black_list = self.cfg.get('custom_black_list', None)
         self.custom_black_list = self.cfg.get('custom_black_list', None)
+        if 'slim' in cfg and cfg['slim_type'] == 'PTQ':
+            self.cfg['TestDataset'] = create('TestDataset')()
 
 
         # build data loader
         # build data loader
         capital_mode = self.mode.capitalize()
         capital_mode = self.mode.capitalize()
-        if cfg.architecture in MOT_ARCH and self.mode in ['eval', 'test']:
+        if cfg.architecture in MOT_ARCH and self.mode in [
+                'eval', 'test'
+        ] and cfg.metric not in ['COCO', 'VOC']:
             self.dataset = self.cfg['{}MOTDataset'.format(
             self.dataset = self.cfg['{}MOTDataset'.format(
                 capital_mode)] = create('{}MOTDataset'.format(capital_mode))()
                 capital_mode)] = create('{}MOTDataset'.format(capital_mode))()
         else:
         else:
@@ -95,12 +99,12 @@ class Trainer(object):
                 self.dataset, cfg.worker_num)
                 self.dataset, cfg.worker_num)
 
 
         if cfg.architecture == 'JDE' and self.mode == 'train':
         if cfg.architecture == 'JDE' and self.mode == 'train':
-            cfg['JDEEmbeddingHead'][
+            self.cfg['JDEEmbeddingHead'][
                 'num_identities'] = self.dataset.num_identities_dict[0]
                 'num_identities'] = self.dataset.num_identities_dict[0]
             # JDE only support single class MOT now.
             # JDE only support single class MOT now.
 
 
         if cfg.architecture == 'FairMOT' and self.mode == 'train':
         if cfg.architecture == 'FairMOT' and self.mode == 'train':
-            cfg['FairMOTEmbeddingHead'][
+            self.cfg['FairMOTEmbeddingHead'][
                 'num_identities_dict'] = self.dataset.num_identities_dict
                 'num_identities_dict'] = self.dataset.num_identities_dict
             # FairMOT support single class and multi-class MOT now.
             # FairMOT support single class and multi-class MOT now.
 
 
@@ -136,17 +140,30 @@ class Trainer(object):
         if self.mode == 'eval':
         if self.mode == 'eval':
             if cfg.architecture == 'FairMOT':
             if cfg.architecture == 'FairMOT':
                 self.loader = create('EvalMOTReader')(self.dataset, 0)
                 self.loader = create('EvalMOTReader')(self.dataset, 0)
+            elif cfg.architecture == "METRO_Body":
+                reader_name = '{}Reader'.format(self.mode.capitalize())
+                self.loader = create(reader_name)(self.dataset, cfg.worker_num)
             else:
             else:
                 self._eval_batch_sampler = paddle.io.BatchSampler(
                 self._eval_batch_sampler = paddle.io.BatchSampler(
                     self.dataset, batch_size=self.cfg.EvalReader['batch_size'])
                     self.dataset, batch_size=self.cfg.EvalReader['batch_size'])
                 reader_name = '{}Reader'.format(self.mode.capitalize())
                 reader_name = '{}Reader'.format(self.mode.capitalize())
                 # If metric is VOC, need to be set collate_batch=False.
                 # If metric is VOC, need to be set collate_batch=False.
                 if cfg.metric == 'VOC':
                 if cfg.metric == 'VOC':
-                    cfg[reader_name]['collate_batch'] = False
+                    self.cfg[reader_name]['collate_batch'] = False
                 self.loader = create(reader_name)(self.dataset, cfg.worker_num,
                 self.loader = create(reader_name)(self.dataset, cfg.worker_num,
                                                   self._eval_batch_sampler)
                                                   self._eval_batch_sampler)
         # TestDataset build after user set images, skip loader creation here
         # TestDataset build after user set images, skip loader creation here
 
 
+        # get Params
+        print_params = self.cfg.get('print_params', False)
+        if print_params:
+            params = sum([
+                p.numel() for n, p in self.model.named_parameters()
+                if all([x not in n for x in ['_mean', '_variance', 'aux_']])
+            ])  # exclude BatchNorm running status
+            logger.info('Model Params : {} M.'.format((params / 1e6).numpy()[
+                0]))
+
         # build optimizer in train mode
         # build optimizer in train mode
         if self.mode == 'train':
         if self.mode == 'train':
             steps_per_epoch = len(self.loader)
             steps_per_epoch = len(self.loader)
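
A hedged sketch of the optional parameter count enabled by print_params: True above. BatchNorm running statistics also show up in named_parameters() in Paddle, which is why names containing _mean/_variance (and auxiliary-head aux_ names) are filtered out before summing:

import paddle

model = paddle.nn.Sequential(paddle.nn.Conv2D(3, 8, 3), paddle.nn.BatchNorm2D(8))
params = sum(
    p.numel() for n, p in model.named_parameters()
    if all(x not in n for x in ['_mean', '_variance', 'aux_']))
print('Model Params : {} M.'.format(float(params) / 1e6))
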
@@ -172,12 +189,14 @@ class Trainer(object):
             ema_decay_type = self.cfg.get('ema_decay_type', 'threshold')
             ema_decay_type = self.cfg.get('ema_decay_type', 'threshold')
             cycle_epoch = self.cfg.get('cycle_epoch', -1)
             cycle_epoch = self.cfg.get('cycle_epoch', -1)
             ema_black_list = self.cfg.get('ema_black_list', None)
             ema_black_list = self.cfg.get('ema_black_list', None)
+            ema_filter_no_grad = self.cfg.get('ema_filter_no_grad', False)
             self.ema = ModelEMA(
             self.ema = ModelEMA(
                 self.model,
                 self.model,
                 decay=ema_decay,
                 decay=ema_decay,
                 ema_decay_type=ema_decay_type,
                 ema_decay_type=ema_decay_type,
                 cycle_epoch=cycle_epoch,
                 cycle_epoch=cycle_epoch,
-                ema_black_list=ema_black_list)
+                ema_black_list=ema_black_list,
+                ema_filter_no_grad=ema_filter_no_grad)
 
 
         self._nranks = dist.get_world_size()
         self._nranks = dist.get_world_size()
         self._local_rank = dist.get_rank()
         self._local_rank = dist.get_rank()
@@ -342,6 +361,13 @@ class Trainer(object):
                     self.cfg.save_dir,
                     self.cfg.save_dir,
                     save_prediction_only=save_prediction_only)
                     save_prediction_only=save_prediction_only)
             ]
             ]
+        elif self.cfg.metric == 'Pose3DEval':
+            save_prediction_only = self.cfg.get('save_prediction_only', False)
+            self._metrics = [
+                Pose3DEval(
+                    self.cfg.save_dir,
+                    save_prediction_only=save_prediction_only)
+            ]
         elif self.cfg.metric == 'MOTDet':
         elif self.cfg.metric == 'MOTDet':
             self._metrics = [JDEDetMetric(), ]
             self._metrics = [JDEDetMetric(), ]
         else:
         else:
@@ -378,7 +404,8 @@ class Trainer(object):
     def load_weights_sde(self, det_weights, reid_weights):
     def load_weights_sde(self, det_weights, reid_weights):
         if self.model.detector:
         if self.model.detector:
             load_weight(self.model.detector, det_weights)
             load_weight(self.model.detector, det_weights)
-            load_weight(self.model.reid, reid_weights)
+            if self.model.reid:
+                load_weight(self.model.reid, reid_weights)
         else:
         else:
             load_weight(self.model.reid, reid_weights)
             load_weight(self.model.reid, reid_weights)
 
 
@@ -400,15 +427,19 @@ class Trainer(object):
                 "EvalDataset")()
                 "EvalDataset")()
 
 
         model = self.model
         model = self.model
-        sync_bn = (getattr(self.cfg, 'norm_type', None) == 'sync_bn' and
-                   self.cfg.use_gpu and self._nranks > 1)
+        if self.cfg.get('to_static', False):
+            model = apply_to_static(self.cfg, model)
+        sync_bn = (
+            getattr(self.cfg, 'norm_type', None) == 'sync_bn' and
+            (self.cfg.use_gpu or self.cfg.use_npu or self.cfg.use_mlu) and
+            self._nranks > 1)
         if sync_bn:
         if sync_bn:
             model = paddle.nn.SyncBatchNorm.convert_sync_batchnorm(model)
             model = paddle.nn.SyncBatchNorm.convert_sync_batchnorm(model)
 
 
         # enabel auto mixed precision mode
         # enabel auto mixed precision mode
         if self.use_amp:
         if self.use_amp:
             scaler = paddle.amp.GradScaler(
             scaler = paddle.amp.GradScaler(
-                enable=self.cfg.use_gpu or self.cfg.use_npu,
+                enable=self.cfg.use_gpu or self.cfg.use_npu or self.cfg.use_mlu,
                 init_loss_scaling=self.cfg.get('init_loss_scaling', 1024))
                 init_loss_scaling=self.cfg.get('init_loss_scaling', 1024))
         # get distributed model
         # get distributed model
         if self.cfg.get('fleet', False):
         if self.cfg.get('fleet', False):
@@ -463,7 +494,8 @@ class Trainer(object):
                             DataParallel) and use_fused_allreduce_gradients:
                             DataParallel) and use_fused_allreduce_gradients:
                         with model.no_sync():
                         with model.no_sync():
                             with paddle.amp.auto_cast(
                             with paddle.amp.auto_cast(
-                                    enable=self.cfg.use_gpu,
+                                    enable=self.cfg.use_gpu or
+                                    self.cfg.use_npu or self.cfg.use_mlu,
                                     custom_white_list=self.custom_white_list,
                                     custom_white_list=self.custom_white_list,
                                     custom_black_list=self.custom_black_list,
                                     custom_black_list=self.custom_black_list,
                                     level=self.amp_level):
                                     level=self.amp_level):
@@ -477,7 +509,8 @@ class Trainer(object):
                             list(model.parameters()), None)
                             list(model.parameters()), None)
                     else:
                     else:
                         with paddle.amp.auto_cast(
                         with paddle.amp.auto_cast(
-                                enable=self.cfg.use_gpu,
+                                enable=self.cfg.use_gpu or self.cfg.use_npu or
+                                self.cfg.use_mlu,
                                 custom_white_list=self.custom_white_list,
                                 custom_white_list=self.custom_white_list,
                                 custom_black_list=self.custom_black_list,
                                 custom_black_list=self.custom_black_list,
                                 level=self.amp_level):
                                 level=self.amp_level):
@@ -527,7 +560,7 @@ class Trainer(object):
             if self.cfg.get('unstructured_prune'):
             if self.cfg.get('unstructured_prune'):
                 self.pruner.update_params()
                 self.pruner.update_params()
 
 
-            is_snapshot = (self._nranks < 2 or self._local_rank == 0) \
+            is_snapshot = (self._nranks < 2 or (self._local_rank == 0 or self.cfg.metric == "Pose3DEval")) \
                        and ((epoch_id + 1) % self.cfg.snapshot_epoch == 0 or epoch_id == self.end_epoch - 1)
                        and ((epoch_id + 1) % self.cfg.snapshot_epoch == 0 or epoch_id == self.end_epoch - 1)
             if is_snapshot and self.use_ema:
             if is_snapshot and self.use_ema:
                 # apply ema weight on model
                 # apply ema weight on model
@@ -548,10 +581,14 @@ class Trainer(object):
                     # If metric is VOC, need to be set collate_batch=False.
                     # If metric is VOC, need to be set collate_batch=False.
                     if self.cfg.metric == 'VOC':
                     if self.cfg.metric == 'VOC':
                         self.cfg['EvalReader']['collate_batch'] = False
                         self.cfg['EvalReader']['collate_batch'] = False
-                    self._eval_loader = create('EvalReader')(
-                        self._eval_dataset,
-                        self.cfg.worker_num,
-                        batch_sampler=self._eval_batch_sampler)
+                    if self.cfg.metric == "Pose3DEval":
+                        self._eval_loader = create('EvalReader')(
+                            self._eval_dataset, self.cfg.worker_num)
+                    else:
+                        self._eval_loader = create('EvalReader')(
+                            self._eval_dataset,
+                            self.cfg.worker_num,
+                            batch_sampler=self._eval_batch_sampler)
                 # if validation in training is enabled, metrics should be re-init
                 # if validation in training is enabled, metrics should be re-init
                 # Init_mark makes sure this code will only execute once
                 # Init_mark makes sure this code will only execute once
                 if validate and Init_mark == False:
                 if validate and Init_mark == False:
@@ -575,6 +612,7 @@ class Trainer(object):
         tic = time.time()
         tic = time.time()
         self._compose_callback.on_epoch_begin(self.status)
         self._compose_callback.on_epoch_begin(self.status)
         self.status['mode'] = 'eval'
         self.status['mode'] = 'eval'
+
         self.model.eval()
         self.model.eval()
         if self.cfg.get('print_flops', False):
         if self.cfg.get('print_flops', False):
             flops_loader = create('{}Reader'.format(self.mode.capitalize()))(
             flops_loader = create('{}Reader'.format(self.mode.capitalize()))(
@@ -586,7 +624,8 @@ class Trainer(object):
             # forward
             # forward
             if self.use_amp:
             if self.use_amp:
                 with paddle.amp.auto_cast(
                 with paddle.amp.auto_cast(
-                        enable=self.cfg.use_gpu,
+                        enable=self.cfg.use_gpu or self.cfg.use_npu or
+                        self.cfg.use_mlu,
                         custom_white_list=self.custom_white_list,
                         custom_black_list=self.custom_black_list,
                         level=self.amp_level):
@@ -617,6 +656,15 @@ class Trainer(object):
         self._reset_metrics()
 
     def evaluate(self):
+        # get distributed model
+        if self.cfg.get('fleet', False):
+            self.model = fleet.distributed_model(self.model)
+            self.optimizer = fleet.distributed_optimizer(self.optimizer)
+        elif self._nranks > 1:
+            find_unused_parameters = self.cfg[
+                'find_unused_parameters'] if 'find_unused_parameters' in self.cfg else False
+            self.model = paddle.DataParallel(
+                self.model, find_unused_parameters=find_unused_parameters)
         with paddle.no_grad():
             self._eval_with_loader(self.loader)
 
@@ -644,7 +692,8 @@ class Trainer(object):
             # forward
             if self.use_amp:
                 with paddle.amp.auto_cast(
-                        enable=self.cfg.use_gpu,
+                        enable=self.cfg.use_gpu or self.cfg.use_npu or
+                        self.cfg.use_mlu,
                         custom_white_list=self.custom_white_list,
                         custom_black_list=self.custom_black_list,
                         level=self.amp_level):
@@ -722,11 +771,51 @@ class Trainer(object):
                       output_dir='output',
                       save_results=False,
                       visualize=True):
+        if not os.path.exists(output_dir):
+            os.makedirs(output_dir)
+
         self.dataset.set_slice_images(images, slice_size, overlap_ratio)
         loader = create('TestReader')(self.dataset, 0)
-
         imid2path = self.dataset.get_imid2path()
 
+        def setup_metrics_for_loader():
+            # mem
+            metrics = copy.deepcopy(self._metrics)
+            mode = self.mode
+            save_prediction_only = self.cfg[
+                'save_prediction_only'] if 'save_prediction_only' in self.cfg else None
+            output_eval = self.cfg[
+                'output_eval'] if 'output_eval' in self.cfg else None
+
+            # modify
+            self.mode = '_test'
+            self.cfg['save_prediction_only'] = True
+            self.cfg['output_eval'] = output_dir
+            self.cfg['imid2path'] = imid2path
+            self._init_metrics()
+
+            # restore
+            self.mode = mode
+            self.cfg.pop('save_prediction_only')
+            if save_prediction_only is not None:
+                self.cfg['save_prediction_only'] = save_prediction_only
+
+            self.cfg.pop('output_eval')
+            if output_eval is not None:
+                self.cfg['output_eval'] = output_eval
+
+            self.cfg.pop('imid2path')
+
+            _metrics = copy.deepcopy(self._metrics)
+            self._metrics = metrics
+
+            return _metrics
+
+        if save_results:
+            metrics = setup_metrics_for_loader()
+        else:
+            metrics = []
+
         anno_file = self.dataset.get_anno()
         clsid2catid, catid2name = get_categories(
             self.cfg.metric, anno_file=anno_file)
@@ -772,6 +861,9 @@ class Trainer(object):
                 merged_bboxs = []
                 data['im_id'] = data['ori_im_id']
 
+                for _m in metrics:
+                    _m.update(data, merged_results)
+
                 for key in ['im_shape', 'scale_factor', 'im_id']:
                     if isinstance(data, typing.Sequence):
                         merged_results[key] = data[0][key]
@@ -782,23 +874,36 @@ class Trainer(object):
                         merged_results[key] = value.numpy()
                 results.append(merged_results)
 
+        for _m in metrics:
+            _m.accumulate()
+            _m.reset()
+
         if visualize:
             for outs in results:
                 batch_res = get_infer_results(outs, clsid2catid)
                 bbox_num = outs['bbox_num']
+
                 start = 0
                 for i, im_id in enumerate(outs['im_id']):
                     image_path = imid2path[int(im_id)]
                     image = Image.open(image_path).convert('RGB')
                     image = ImageOps.exif_transpose(image)
                     self.status['original_image'] = np.array(image.copy())
+
                     end = start + bbox_num[i]
                     bbox_res = batch_res['bbox'][start:end] \
                             if 'bbox' in batch_res else None
-                    mask_res, segm_res, keypoint_res = None, None, None
+                    mask_res = batch_res['mask'][start:end] \
+                            if 'mask' in batch_res else None
+                    segm_res = batch_res['segm'][start:end] \
+                            if 'segm' in batch_res else None
+                    keypoint_res = batch_res['keypoint'][start:end] \
+                            if 'keypoint' in batch_res else None
+                    pose3d_res = batch_res['pose3d'][start:end] \
+                            if 'pose3d' in batch_res else None
                     image = visualize_results(
                         image, bbox_res, mask_res, segm_res, keypoint_res,
-                        int(im_id), catid2name, draw_threshold)
+                        pose3d_res, int(im_id), catid2name, draw_threshold)
                     self.status['result_image'] = np.array(image.copy())
                     if self._compose_callback:
                         self._compose_callback.on_step_end(self.status)
@@ -808,6 +913,7 @@ class Trainer(object):
                     logger.info("Detection bbox results save in {}".format(
                         save_name))
                     image.save(save_name, quality=95)
+
                     start = end
 
     def predict(self,
@@ -921,9 +1027,11 @@ class Trainer(object):
                             if 'segm' in batch_res else None
                     keypoint_res = batch_res['keypoint'][start:end] \
                             if 'keypoint' in batch_res else None
+                    pose3d_res = batch_res['pose3d'][start:end] \
+                            if 'pose3d' in batch_res else None
                     image = visualize_results(
                         image, bbox_res, mask_res, segm_res, keypoint_res,
-                        int(im_id), catid2name, draw_threshold)
+                        pose3d_res, int(im_id), catid2name, draw_threshold)
                     self.status['result_image'] = np.array(image.copy())
                     if self._compose_callback:
                         self._compose_callback.on_step_end(self.status)
@@ -935,6 +1043,7 @@ class Trainer(object):
                     image.save(save_name, quality=95)
 
                     start = end
+        return results
 
     def _get_save_image_name(self, output_dir, image_path):
         """
@@ -976,6 +1085,10 @@ class Trainer(object):
                 if hasattr(layer, 'convert_to_deploy'):
                     layer.convert_to_deploy()
 
+        if hasattr(self.cfg, 'export') and 'fuse_conv_bn' in self.cfg[
+                'export'] and self.cfg['export']['fuse_conv_bn']:
+            self.model = fuse_conv_bn(self.model)
+
         export_post_process = self.cfg['export'].get(
             'post_process', False) if hasattr(self.cfg, 'export') else True
         export_nms = self.cfg['export'].get('nms', False) if hasattr(
@@ -1045,12 +1158,12 @@ class Trainer(object):
         return static_model, pruned_input_spec
 
     def export(self, output_dir='output_inference'):
+        if hasattr(self.model, 'aux_neck'):
+            self.model.__delattr__('aux_neck')
+        if hasattr(self.model, 'aux_head'):
+            self.model.__delattr__('aux_head')
         self.model.eval()
 
-        if hasattr(self.cfg, 'export') and 'fuse_conv_bn' in self.cfg[
-                'export'] and self.cfg['export']['fuse_conv_bn']:
-            self.model = fuse_conv_bn(self.model)
-
         model_name = os.path.splitext(os.path.split(self.cfg.filename)[-1])[0]
         save_dir = os.path.join(output_dir, model_name)
         if not os.path.exists(save_dir):
@@ -1095,6 +1208,10 @@ class Trainer(object):
         logger.info("Export Post-Quant model and saved in {}".format(save_dir))
 
     def _flops(self, loader):
+        if hasattr(self.model, 'aux_neck'):
+            self.model.__delattr__('aux_neck')
+        if hasattr(self.model, 'aux_head'):
+            self.model.__delattr__('aux_head')
         self.model.eval()
         try:
             import paddleslim
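The setup_metrics_for_loader helper added to slice_predict above relies on a save-override-restore pattern: it stashes save_prediction_only and output_eval, overrides them (plus imid2path) so that _init_metrics builds prediction-saving metrics, and then restores the previous values. A minimal sketch of the same pattern as a reusable context manager, assuming only a dict-like cfg; the helper name and commented usage are illustrative, not part of this commit:

from contextlib import contextmanager

@contextmanager
def temporary_cfg(cfg, **overrides):
    """Temporarily set cfg keys, then restore (or remove) them afterwards."""
    _missing = object()
    saved = {k: cfg.get(k, _missing) for k in overrides}
    cfg.update(overrides)
    try:
        yield cfg
    finally:
        for k, old in saved.items():
            if old is _missing:
                cfg.pop(k, None)
            else:
                cfg[k] = old

# Illustrative use, mirroring setup_metrics_for_loader:
# with temporary_cfg(self.cfg, save_prediction_only=True,
#                    output_eval=output_dir, imid2path=imid2path):
#     self._init_metrics()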

+ 42 - 0
paddlers/models/ppdet/engine/trainer_cot.py

@@ -0,0 +1,42 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from paddlers.models.ppdet.core.workspace import create
+from paddlers.models.ppdet.utils.logger import setup_logger
+logger = setup_logger('ppdet.engine')
+
+from . import Trainer
+__all__ = ['TrainerCot']
+
+class TrainerCot(Trainer):
+    """
+    Trainer for label-cotuning
+    calculate the relationship between base_classes and novel_classes
+    """
+    def __init__(self, cfg, mode='train'):
+        super(TrainerCot, self).__init__(cfg, mode)
+        self.cotuning_init()
+
+    def cotuning_init(self):    
+        num_classes_novel = self.cfg['num_classes']
+
+        self.load_weights(self.cfg.pretrain_weights)
+
+        self.model.eval()
+        relationship = self.model.relationship_learning(self.loader, num_classes_novel)
+    
+        self.model.init_cot_head(relationship)
+        self.optimizer = create('OptimizerBuilder')(self.lr, self.model)
+
+

+ 475 - 0
paddlers/models/ppdet/engine/trainer_ssod.py

@@ -0,0 +1,475 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import copy
+import time
+import typing
+import numpy as np
+
+import paddle
+import paddle.nn as nn
+import paddle.distributed as dist
+from paddle.distributed import fleet
+from paddlers.models.ppdet.optimizer import ModelEMA, SimpleModelEMA
+
+from paddlers.models.ppdet.core.workspace import create
+from paddlers.models.ppdet.utils.checkpoint import load_weight, load_pretrain_weight
+import paddlers.models.ppdet.utils.stats as stats
+from paddlers.models.ppdet.utils import profiler
+from paddlers.models.ppdet.modeling.ssod.utils import align_weak_strong_shape
+from .trainer import Trainer
+
+from paddlers.models.ppdet.utils.logger import setup_logger
+logger = setup_logger('ppdet.engine')
+
+__all__ = ['Trainer_DenseTeacher']
+
+
+class Trainer_DenseTeacher(Trainer):
+    def __init__(self, cfg, mode='train'):
+        self.cfg = cfg
+        assert mode.lower() in ['train', 'eval', 'test'], \
+                "mode should be 'train', 'eval' or 'test'"
+        self.mode = mode.lower()
+        self.optimizer = None
+        self.is_loaded_weights = False
+        self.use_amp = self.cfg.get('amp', False)
+        self.amp_level = self.cfg.get('amp_level', 'O1')
+        self.custom_white_list = self.cfg.get('custom_white_list', None)
+        self.custom_black_list = self.cfg.get('custom_black_list', None)
+
+        # build data loader
+        capital_mode = self.mode.capitalize()
+        self.dataset = self.cfg['{}Dataset'.format(capital_mode)] = create(
+            '{}Dataset'.format(capital_mode))()
+
+        if self.mode == 'train':
+            self.dataset_unlabel = self.cfg['UnsupTrainDataset'] = create(
+                'UnsupTrainDataset')
+            self.loader = create('SemiTrainReader')(
+                self.dataset, self.dataset_unlabel, cfg.worker_num)
+
+        # build model
+        if 'model' not in self.cfg:
+            self.model = create(cfg.architecture)
+        else:
+            self.model = self.cfg.model
+            self.is_loaded_weights = True
+
+        # EvalDataset build with BatchSampler to evaluate in single device
+        # TODO: multi-device evaluate
+        if self.mode == 'eval':
+            self._eval_batch_sampler = paddle.io.BatchSampler(
+                self.dataset, batch_size=self.cfg.EvalReader['batch_size'])
+            # If metric is VOC, need to be set collate_batch=False.
+            if cfg.metric == 'VOC':
+                cfg['EvalReader']['collate_batch'] = False
+            self.loader = create('EvalReader')(self.dataset, cfg.worker_num,
+                                               self._eval_batch_sampler)
+        # TestDataset build after user set images, skip loader creation here
+
+        # build optimizer in train mode
+        if self.mode == 'train':
+            steps_per_epoch = len(self.loader)
+            if steps_per_epoch < 1:
+                logger.warning(
+                    "Samples in dataset are less than batch_size, please set smaller batch_size in TrainReader."
+                )
+            self.lr = create('LearningRate')(steps_per_epoch)
+            self.optimizer = create('OptimizerBuilder')(self.lr, self.model)
+
+            # Unstructured pruner is only enabled in the train mode.
+            if self.cfg.get('unstructured_prune'):
+                self.pruner = create('UnstructuredPruner')(self.model,
+                                                           steps_per_epoch)
+        if self.use_amp and self.amp_level == 'O2':
+            self.model, self.optimizer = paddle.amp.decorate(
+                models=self.model,
+                optimizers=self.optimizer,
+                level=self.amp_level)
+
+        self.use_ema = ('use_ema' in cfg and cfg['use_ema'])
+        if self.use_ema:
+            ema_decay = self.cfg.get('ema_decay', 0.9998)
+            ema_decay_type = self.cfg.get('ema_decay_type', 'threshold')
+            cycle_epoch = self.cfg.get('cycle_epoch', -1)
+            ema_black_list = self.cfg.get('ema_black_list', None)
+            self.ema = ModelEMA(
+                self.model,
+                decay=ema_decay,
+                ema_decay_type=ema_decay_type,
+                cycle_epoch=cycle_epoch,
+                ema_black_list=ema_black_list)
+            self.ema_start_iters = self.cfg.get('ema_start_iters', 0)
+
+        # simple_ema for SSOD
+        self.use_simple_ema = ('use_simple_ema' in cfg and
+                               cfg['use_simple_ema'])
+        if self.use_simple_ema:
+            self.use_ema = True
+            ema_decay = self.cfg.get('ema_decay', 0.9996)
+            self.ema = SimpleModelEMA(self.model, decay=ema_decay)
+            self.ema_start_iters = self.cfg.get('ema_start_iters', 0)
+
+        self._nranks = dist.get_world_size()
+        self._local_rank = dist.get_rank()
+
+        self.status = {}
+
+        self.start_epoch = 0
+        self.end_epoch = 0 if 'epoch' not in cfg else cfg.epoch
+
+        # initial default callbacks
+        self._init_callbacks()
+
+        # initial default metrics
+        self._init_metrics()
+        self._reset_metrics()
+
+    def load_weights(self, weights):
+        if self.is_loaded_weights:
+            return
+        self.start_epoch = 0
+        load_pretrain_weight(self.model, weights)
+        load_pretrain_weight(self.ema.model, weights)
+        logger.info("Load weights {} to start training for teacher and student".
+                    format(weights))
+
+    def resume_weights(self, weights, exchange=True):
+        # support Distill resume weights
+        if hasattr(self.model, 'student_model'):
+            self.start_epoch = load_weight(self.model.student_model, weights,
+                                           self.optimizer, exchange)
+        else:
+            self.start_epoch = load_weight(self.model, weights, self.optimizer,
+                                           self.ema
+                                           if self.use_ema else None, exchange)
+        logger.debug("Resume weights of epoch {}".format(self.start_epoch))
+
+    def train(self, validate=False):
+        self.semi_start_iters = self.cfg.get('semi_start_iters', 5000)
+        Init_mark = False
+        if validate:
+            self.cfg['EvalDataset'] = self.cfg.EvalDataset = create(
+                "EvalDataset")()
+
+        sync_bn = (getattr(self.cfg, 'norm_type', None) == 'sync_bn' and
+                   self.cfg.use_gpu and self._nranks > 1)
+        if sync_bn:
+            self.model = paddle.nn.SyncBatchNorm.convert_sync_batchnorm(
+                self.model)
+
+        if self.cfg.get('fleet', False):
+            self.model = fleet.distributed_model(self.model)
+            self.optimizer = fleet.distributed_optimizer(self.optimizer)
+        elif self._nranks > 1:
+            find_unused_parameters = self.cfg[
+                'find_unused_parameters'] if 'find_unused_parameters' in self.cfg else False
+            self.model = paddle.DataParallel(
+                self.model, find_unused_parameters=find_unused_parameters)
+            self.ema.model = paddle.DataParallel(
+                self.ema.model, find_unused_parameters=find_unused_parameters)
+
+        self.status.update({
+            'epoch_id': self.start_epoch,
+            'step_id': 0,
+            'steps_per_epoch': len(self.loader),
+            'exchange_save_model': True,
+        })
+        # Note: exchange_save_model
+        # in DenseTeacher SSOD, the teacher model will be higher, so exchange when saving pdparams
+
+        self.status['batch_time'] = stats.SmoothedValue(
+            self.cfg.log_iter, fmt='{avg:.4f}')
+        self.status['data_time'] = stats.SmoothedValue(
+            self.cfg.log_iter, fmt='{avg:.4f}')
+        self.status['training_staus'] = stats.TrainingStats(self.cfg.log_iter)
+
+        if self.cfg.get('print_flops', False):
+            flops_loader = create('{}Reader'.format(self.mode.capitalize()))(
+                self.dataset, self.cfg.worker_num)
+            self._flops(flops_loader)
+        profiler_options = self.cfg.get('profiler_options', None)
+        self._compose_callback.on_train_begin(self.status)
+
+        train_cfg = self.cfg.DenseTeacher['train_cfg']
+        concat_sup_data = train_cfg.get('concat_sup_data', True)
+
+        for param in self.ema.model.parameters():
+            param.stop_gradient = True
+
+        for epoch_id in range(self.start_epoch, self.cfg.epoch):
+            self.status['mode'] = 'train'
+            self.status['epoch_id'] = epoch_id
+            self._compose_callback.on_epoch_begin(self.status)
+            self.loader.dataset_label.set_epoch(epoch_id)
+            self.loader.dataset_unlabel.set_epoch(epoch_id)
+            iter_tic = time.time()
+            loss_dict = {
+                'loss': paddle.to_tensor([0]),
+                'loss_sup_sum': paddle.to_tensor([0]),
+                'loss_unsup_sum': paddle.to_tensor([0]),
+                'fg_sum': paddle.to_tensor([0]),
+            }
+            if self._nranks > 1:
+                for k in self.model._layers.get_loss_keys():
+                    loss_dict.update({k: paddle.to_tensor([0.])})
+                for k in self.model._layers.get_loss_keys():
+                    loss_dict.update({'distill_' + k: paddle.to_tensor([0.])})
+            else:
+                for k in self.model.get_loss_keys():
+                    loss_dict.update({k: paddle.to_tensor([0.])})
+                for k in self.model.get_loss_keys():
+                    loss_dict.update({'distill_' + k: paddle.to_tensor([0.])})
+
+            # Note: for step_id, data in enumerate(self.loader): # enumerate bug
+            for step_id in range(len(self.loader)):
+                data = next(self.loader)
+
+                self.model.train()
+                self.ema.model.eval()
+                data_sup_w, data_sup_s, data_unsup_w, data_unsup_s = data
+
+                self.status['data_time'].update(time.time() - iter_tic)
+                self.status['step_id'] = step_id
+                profiler.add_profiler_step(profiler_options)
+                self._compose_callback.on_step_begin(self.status)
+
+                if data_sup_w['image'].shape != data_sup_s['image'].shape:
+                    data_sup_w, data_sup_s = align_weak_strong_shape(data_sup_w,
+                                                                     data_sup_s)
+
+                data_sup_w['epoch_id'] = epoch_id
+                data_sup_s['epoch_id'] = epoch_id
+                if concat_sup_data:
+                    for k, v in data_sup_s.items():
+                        if k in ['epoch_id']:
+                            continue
+                        data_sup_s[k] = paddle.concat([v, data_sup_w[k]])
+                    loss_dict_sup = self.model(data_sup_s)
+                else:
+                    loss_dict_sup_w = self.model(data_sup_w)
+                    loss_dict_sup = self.model(data_sup_s)
+                    for k, v in loss_dict_sup_w.items():
+                        loss_dict_sup[k] = (loss_dict_sup[k] + v) * 0.5
+
+                losses_sup = loss_dict_sup['loss'] * train_cfg['sup_weight']
+                losses_sup.backward()
+
+                losses = losses_sup.detach()
+                loss_dict.update(loss_dict_sup)
+                loss_dict.update({'loss_sup_sum': loss_dict['loss']})
+
+                curr_iter = len(self.loader) * epoch_id + step_id
+                st_iter = self.semi_start_iters
+                if curr_iter == st_iter:
+                    logger.info("***" * 30)
+                    logger.info('Semi starting ...')
+                    logger.info("***" * 30)
+                if curr_iter > st_iter:
+                    unsup_weight = train_cfg['unsup_weight']
+                    if train_cfg['suppress'] == 'linear':
+                        tar_iter = st_iter * 2
+                        if curr_iter <= tar_iter:
+                            unsup_weight *= (curr_iter - st_iter) / st_iter
+                    elif train_cfg['suppress'] == 'exp':
+                        tar_iter = st_iter + 2000
+                        if curr_iter <= tar_iter:
+                            scale = np.exp((curr_iter - tar_iter) / 1000)
+                            unsup_weight *= scale
+                    elif train_cfg['suppress'] == 'step':
+                        tar_iter = st_iter * 2
+                        if curr_iter <= tar_iter:
+                            unsup_weight *= 0.25
+                    else:
+                        raise ValueError
+
+                    if data_unsup_w['image'].shape != data_unsup_s[
+                            'image'].shape:
+                        data_unsup_w, data_unsup_s = align_weak_strong_shape(
+                            data_unsup_w, data_unsup_s)
+
+                    data_unsup_w['epoch_id'] = epoch_id
+                    data_unsup_s['epoch_id'] = epoch_id
+
+                    data_unsup_s['get_data'] = True
+                    student_preds = self.model(data_unsup_s)
+
+                    with paddle.no_grad():
+                        data_unsup_w['is_teacher'] = True
+                        teacher_preds = self.ema.model(data_unsup_w)
+
+                    train_cfg['curr_iter'] = curr_iter
+                    train_cfg['st_iter'] = st_iter
+                    if self._nranks > 1:
+                        loss_dict_unsup = self.model._layers.get_ssod_loss(
+                            student_preds, teacher_preds, train_cfg)
+                    else:
+                        loss_dict_unsup = self.model.get_ssod_loss(
+                            student_preds, teacher_preds, train_cfg)
+
+                    fg_num = loss_dict_unsup["fg_sum"]
+                    del loss_dict_unsup["fg_sum"]
+                    distill_weights = train_cfg['loss_weight']
+                    loss_dict_unsup = {
+                        k: v * distill_weights[k]
+                        for k, v in loss_dict_unsup.items()
+                    }
+
+                    losses_unsup = sum([
+                        metrics_value
+                        for metrics_value in loss_dict_unsup.values()
+                    ]) * unsup_weight
+                    losses_unsup.backward()
+
+                    loss_dict.update(loss_dict_unsup)
+                    loss_dict.update({'loss_unsup_sum': losses_unsup})
+                    losses += losses_unsup.detach()
+                    loss_dict.update({"fg_sum": fg_num})
+                    loss_dict['loss'] = losses
+
+                self.optimizer.step()
+                curr_lr = self.optimizer.get_lr()
+                self.lr.step()
+                self.optimizer.clear_grad()
+                self.status['learning_rate'] = curr_lr
+                if self._nranks < 2 or self._local_rank == 0:
+                    self.status['training_staus'].update(loss_dict)
+
+                self.status['batch_time'].update(time.time() - iter_tic)
+                self._compose_callback.on_step_end(self.status)
+                # Note: ema_start_iters
+                if self.use_ema and curr_iter == self.ema_start_iters:
+                    logger.info("***" * 30)
+                    logger.info('EMA starting ...')
+                    logger.info("***" * 30)
+                    self.ema.update(self.model, decay=0)
+                elif self.use_ema and curr_iter > self.ema_start_iters:
+                    self.ema.update(self.model)
+                iter_tic = time.time()
+
+            is_snapshot = (self._nranks < 2 or self._local_rank == 0) \
+                       and ((epoch_id + 1) % self.cfg.snapshot_epoch == 0 or epoch_id == self.end_epoch - 1)
+            if is_snapshot and self.use_ema:
+                # apply ema weight on model
+                weight = copy.deepcopy(self.ema.model.state_dict())
+                for k, v in weight.items():
+                    if paddle.is_floating_point(v):
+                        weight[k].stop_gradient = True
+                self.status['weight'] = weight
+
+            self._compose_callback.on_epoch_end(self.status)
+
+            if validate and is_snapshot:
+                if not hasattr(self, '_eval_loader'):
+                    # build evaluation dataset and loader
+                    self._eval_dataset = self.cfg.EvalDataset
+                    self._eval_batch_sampler = \
+                        paddle.io.BatchSampler(
+                            self._eval_dataset,
+                            batch_size=self.cfg.EvalReader['batch_size'])
+                    # If metric is VOC, need to be set collate_batch=False.
+                    if self.cfg.metric == 'VOC':
+                        self.cfg['EvalReader']['collate_batch'] = False
+                    self._eval_loader = create('EvalReader')(
+                        self._eval_dataset,
+                        self.cfg.worker_num,
+                        batch_sampler=self._eval_batch_sampler)
+                # if validation in training is enabled, metrics should be re-init
+                # Init_mark makes sure this code will only execute once
+                if validate and Init_mark == False:
+                    Init_mark = True
+                    self._init_metrics(validate=validate)
+                    self._reset_metrics()
+
+                with paddle.no_grad():
+                    self.status['save_best_model'] = True
+                    self._eval_with_loader(self._eval_loader)
+
+            if is_snapshot and self.use_ema:
+                self.status.pop('weight')
+
+        self._compose_callback.on_train_end(self.status)
+
+    def evaluate(self):
+        # get distributed model
+        if self.cfg.get('fleet', False):
+            self.model = fleet.distributed_model(self.model)
+            self.optimizer = fleet.distributed_optimizer(self.optimizer)
+        elif self._nranks > 1:
+            find_unused_parameters = self.cfg[
+                'find_unused_parameters'] if 'find_unused_parameters' in self.cfg else False
+            self.model = paddle.DataParallel(
+                self.model, find_unused_parameters=find_unused_parameters)
+        with paddle.no_grad():
+            self._eval_with_loader(self.loader)
+
+    def _eval_with_loader(self, loader):
+        sample_num = 0
+        tic = time.time()
+        self._compose_callback.on_epoch_begin(self.status)
+        self.status['mode'] = 'eval'
+
+        test_cfg = self.cfg.DenseTeacher['test_cfg']
+        if test_cfg['inference_on'] == 'teacher':
+            logger.info("***** teacher model evaluating *****")
+            eval_model = self.ema.model
+        else:
+            logger.info("***** student model evaluating *****")
+            eval_model = self.model
+
+        eval_model.eval()
+        if self.cfg.get('print_flops', False):
+            flops_loader = create('{}Reader'.format(self.mode.capitalize()))(
+                self.dataset, self.cfg.worker_num, self._eval_batch_sampler)
+            self._flops(flops_loader)
+        for step_id, data in enumerate(loader):
+            self.status['step_id'] = step_id
+            self._compose_callback.on_step_begin(self.status)
+            # forward
+            if self.use_amp:
+                with paddle.amp.auto_cast(
+                        enable=self.cfg.use_gpu or self.cfg.use_mlu,
+                        custom_white_list=self.custom_white_list,
+                        custom_black_list=self.custom_black_list,
+                        level=self.amp_level):
+                    outs = eval_model(data)
+            else:
+                outs = eval_model(data)
+
+            # update metrics
+            for metric in self._metrics:
+                metric.update(data, outs)
+
+            # multi-scale inputs: all inputs have same im_id
+            if isinstance(data, typing.Sequence):
+                sample_num += data[0]['im_id'].numpy().shape[0]
+            else:
+                sample_num += data['im_id'].numpy().shape[0]
+            self._compose_callback.on_step_end(self.status)
+
+        self.status['sample_num'] = sample_num
+        self.status['cost_time'] = time.time() - tic
+
+        # accumulate metric to log out
+        for metric in self._metrics:
+            metric.accumulate()
+            metric.log()
+        self._compose_callback.on_epoch_end(self.status)
+        # reset metric states for metric may performed multiple times
+        self._reset_metrics()

+ 18 - 17
paddlers/models/ppdet/ext_op/csrc/rbox_iou/matched_rbox_iou_op.cc → paddlers/models/ppdet/ext_op/csrc/matched_rbox_iou/matched_rbox_iou.cc

@@ -13,14 +13,14 @@
 // limitations under the License.
 //
 // The code is based on
-// https://github.com/csuhan/s2anet/blob/master/mmdet/ops/box_iou_rotated
+// https://github.com/facebookresearch/detectron2/blob/main/detectron2/layers/csrc/box_iou_rotated/
 
+#include "../rbox_iou/rbox_iou_utils.h"
 #include "paddle/extension.h"
-#include "rbox_iou_op.h"
 
 template <typename T>
 void matched_rbox_iou_cpu_kernel(const int rbox_num, const T *rbox1_data_ptr,
-                            const T *rbox2_data_ptr, T *output_data_ptr) {
+                                 const T *rbox2_data_ptr, T *output_data_ptr) {
 
   int i;
   for (i = 0; i < rbox_num; i++) {
@@ -30,42 +30,43 @@ void matched_rbox_iou_cpu_kernel(const int rbox_num, const T *rbox1_data_ptr,
 }
 
 #define CHECK_INPUT_CPU(x)                                                     \
-  PD_CHECK(x.place() == paddle::PlaceType::kCPU, #x " must be a CPU Tensor.")
+  PD_CHECK(x.is_cpu(), #x " must be a CPU Tensor.")
 
-std::vector<paddle::Tensor> MatchedRboxIouCPUForward(const paddle::Tensor &rbox1,
-                                                 const paddle::Tensor &rbox2) {
+std::vector<paddle::Tensor>
+MatchedRboxIouCPUForward(const paddle::Tensor &rbox1,
+                         const paddle::Tensor &rbox2) {
   CHECK_INPUT_CPU(rbox1);
   CHECK_INPUT_CPU(rbox2);
   PD_CHECK(rbox1.shape()[0] == rbox2.shape()[0], "inputs must be same dim");
 
   auto rbox_num = rbox1.shape()[0];
-  auto output = paddle::Tensor(paddle::PlaceType::kCPU, {rbox_num});
+  auto output = paddle::empty({rbox_num}, rbox1.dtype(), paddle::CPUPlace());
 
-  PD_DISPATCH_FLOATING_TYPES(rbox1.type(), "rotated_iou_cpu_kernel", ([&] {
+  PD_DISPATCH_FLOATING_TYPES(rbox1.type(), "matched_rbox_iou_cpu_kernel", ([&] {
                                matched_rbox_iou_cpu_kernel<data_t>(
                                    rbox_num, rbox1.data<data_t>(),
-                                   rbox2.data<data_t>(),
-                                   output.mutable_data<data_t>());
+                                   rbox2.data<data_t>(), output.data<data_t>());
                              }));
 
   return {output};
 }
 
 #ifdef PADDLE_WITH_CUDA
-std::vector<paddle::Tensor> MatchedRboxIouCUDAForward(const paddle::Tensor &rbox1,
-                                                  const paddle::Tensor &rbox2);
+std::vector<paddle::Tensor>
+MatchedRboxIouCUDAForward(const paddle::Tensor &rbox1,
+                          const paddle::Tensor &rbox2);
 #endif
 
 #define CHECK_INPUT_SAME(x1, x2)                                               \
   PD_CHECK(x1.place() == x2.place(), "input must be smae pacle.")
 
 std::vector<paddle::Tensor> MatchedRboxIouForward(const paddle::Tensor &rbox1,
-                                              const paddle::Tensor &rbox2) {
+                                                  const paddle::Tensor &rbox2) {
   CHECK_INPUT_SAME(rbox1, rbox2);
-  if (rbox1.place() == paddle::PlaceType::kCPU) {
+  if (rbox1.is_cpu()) {
     return MatchedRboxIouCPUForward(rbox1, rbox2);
 #ifdef PADDLE_WITH_CUDA
-  } else if (rbox1.place() == paddle::PlaceType::kGPU) {
+  } else if (rbox1.is_gpu()) {
     return MatchedRboxIouCUDAForward(rbox1, rbox2);
 #endif
   }
@@ -73,12 +74,12 @@ std::vector<paddle::Tensor> MatchedRboxIouForward(const paddle::Tensor &rbox1,
 
 std::vector<std::vector<int64_t>>
 MatchedRboxIouInferShape(std::vector<int64_t> rbox1_shape,
-                     std::vector<int64_t> rbox2_shape) {
+                         std::vector<int64_t> rbox2_shape) {
   return {{rbox1_shape[0]}};
 }
 
 std::vector<paddle::DataType> MatchedRboxIouInferDtype(paddle::DataType t1,
-                                                   paddle::DataType t2) {
+                                                       paddle::DataType t2) {
   return {t1};
 }
 

+ 9 - 14
paddlers/models/ppdet/ext_op/csrc/rbox_iou/matched_rbox_iou_op.cu → paddlers/models/ppdet/ext_op/csrc/matched_rbox_iou/matched_rbox_iou.cu

@@ -13,21 +13,15 @@
 // limitations under the License.
 //
 // The code is based on
-// https://github.com/csuhan/s2anet/blob/master/mmdet/ops/box_iou_rotated
+// https://github.com/facebookresearch/detectron2/blob/main/detectron2/layers/csrc/box_iou_rotated/
 
+#include "../rbox_iou/rbox_iou_utils.h"
 #include "paddle/extension.h"
-#include "rbox_iou_op.h"
-
-/**
-   Computes ceil(a / b)
-*/
-
-static inline int CeilDiv(const int a, const int b) { return (a + b - 1) / b; }
 
 template <typename T>
 __global__ void
 matched_rbox_iou_cuda_kernel(const int rbox_num, const T *rbox1_data_ptr,
-                        const T *rbox2_data_ptr, T *output_data_ptr) {
+                             const T *rbox2_data_ptr, T *output_data_ptr) {
   for (int tid = blockIdx.x * blockDim.x + threadIdx.x; tid < rbox_num;
        tid += blockDim.x * gridDim.x) {
     output_data_ptr[tid] =
@@ -36,17 +30,18 @@ matched_rbox_iou_cuda_kernel(const int rbox_num, const T *rbox1_data_ptr,
 }
 
 #define CHECK_INPUT_GPU(x)                                                     \
-  PD_CHECK(x.place() == paddle::PlaceType::kGPU, #x " must be a GPU Tensor.")
+  PD_CHECK(x.is_gpu(), #x " must be a GPU Tensor.")
 
-std::vector<paddle::Tensor> MatchedRboxIouCUDAForward(const paddle::Tensor &rbox1,
-                                                  const paddle::Tensor &rbox2) {
+std::vector<paddle::Tensor>
+MatchedRboxIouCUDAForward(const paddle::Tensor &rbox1,
+                          const paddle::Tensor &rbox2) {
   CHECK_INPUT_GPU(rbox1);
   CHECK_INPUT_GPU(rbox2);
   PD_CHECK(rbox1.shape()[0] == rbox2.shape()[0], "inputs must be same dim");
 
   auto rbox_num = rbox1.shape()[0];
 
-  auto output = paddle::Tensor(paddle::PlaceType::kGPU, {rbox_num});
+  auto output = paddle::empty({rbox_num}, rbox1.dtype(), paddle::GPUPlace());
 
   const int thread_per_block = 512;
   const int block_per_grid = CeilDiv(rbox_num, thread_per_block);
@@ -56,7 +51,7 @@ std::vector<paddle::Tensor> MatchedRboxIouCUDAForward(const paddle::Tensor &rbox
         matched_rbox_iou_cuda_kernel<
             data_t><<<block_per_grid, thread_per_block, 0, rbox1.stream()>>>(
             rbox_num, rbox1.data<data_t>(), rbox2.data<data_t>(),
-            output.mutable_data<data_t>());
+            output.data<data_t>());
       }));
 
   return {output};

+ 121 - 0
paddlers/models/ppdet/ext_op/csrc/nms_rotated/nms_rotated.cc

@@ -0,0 +1,121 @@
+//   Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "../rbox_iou/rbox_iou_utils.h"
+#include "paddle/extension.h"
+
+template <typename T>
+void nms_rotated_cpu_kernel(const T *boxes_data, const float threshold,
+                            const int64_t num_boxes, int64_t *num_keep_boxes,
+                            int64_t *output_data) {
+
+  int num_masks = CeilDiv(num_boxes, 64);
+  std::vector<int64_t> masks(num_masks, 0);
+  for (int64_t i = 0; i < num_boxes; ++i) {
+    if (masks[i / 64] & 1ULL << (i % 64))
+      continue;
+    T box_1[5];
+    for (int k = 0; k < 5; ++k) {
+      box_1[k] = boxes_data[i * 5 + k];
+    }
+    for (int64_t j = i + 1; j < num_boxes; ++j) {
+      if (masks[j / 64] & 1ULL << (j % 64))
+        continue;
+      T box_2[5];
+      for (int k = 0; k < 5; ++k) {
+        box_2[k] = boxes_data[j * 5 + k];
+      }
+      if (rbox_iou_single<T>(box_1, box_2) > threshold) {
+        masks[j / 64] |= 1ULL << (j % 64);
+      }
+    }
+  }
+  int64_t output_data_idx = 0;
+  for (int64_t i = 0; i < num_boxes; ++i) {
+    if (masks[i / 64] & 1ULL << (i % 64))
+      continue;
+    output_data[output_data_idx++] = i;
+  }
+  *num_keep_boxes = output_data_idx;
+  for (; output_data_idx < num_boxes; ++output_data_idx) {
+    output_data[output_data_idx] = 0;
+  }
+}
+
+#define CHECK_INPUT_CPU(x)                                                     \
+  PD_CHECK(x.is_cpu(), #x " must be a CPU Tensor.")
+
+std::vector<paddle::Tensor> NMSRotatedCPUForward(const paddle::Tensor &boxes,
+                                                 const paddle::Tensor &scores,
+                                                 float threshold) {
+  CHECK_INPUT_CPU(boxes);
+  CHECK_INPUT_CPU(scores);
+
+  auto num_boxes = boxes.shape()[0];
+
+  auto order_t =
+      std::get<1>(paddle::argsort(scores, /* axis=*/0, /* descending=*/true));
+  auto boxes_sorted = paddle::gather(boxes, order_t, /* axis=*/0);
+
+  auto keep =
+      paddle::empty({num_boxes}, paddle::DataType::INT64, paddle::CPUPlace());
+  int64_t num_keep_boxes = 0;
+
+  PD_DISPATCH_FLOATING_TYPES(boxes.type(), "nms_rotated_cpu_kernel", ([&] {
+                               nms_rotated_cpu_kernel<data_t>(
+                                   boxes_sorted.data<data_t>(), threshold,
+                                   num_boxes, &num_keep_boxes,
+                                   keep.data<int64_t>());
+                             }));
+
+  keep = keep.slice(0, num_keep_boxes);
+  return {paddle::gather(order_t, keep, /* axis=*/0)};
+}
+
+#ifdef PADDLE_WITH_CUDA
+std::vector<paddle::Tensor> NMSRotatedCUDAForward(const paddle::Tensor &boxes,
+                                                  const paddle::Tensor &scores,
+                                                  float threshold);
+#endif
+
+std::vector<paddle::Tensor> NMSRotatedForward(const paddle::Tensor &boxes,
+                                              const paddle::Tensor &scores,
+                                              float threshold) {
+  if (boxes.is_cpu()) {
+    return NMSRotatedCPUForward(boxes, scores, threshold);
+#ifdef PADDLE_WITH_CUDA
+  } else if (boxes.is_gpu()) {
+    return NMSRotatedCUDAForward(boxes, scores, threshold);
+#endif
+  }
+}
+
+std::vector<std::vector<int64_t>>
+NMSRotatedInferShape(std::vector<int64_t> boxes_shape,
+                     std::vector<int64_t> scores_shape) {
+  return {{-1}};
+}
+
+std::vector<paddle::DataType> NMSRotatedInferDtype(paddle::DataType t1,
+                                                   paddle::DataType t2) {
+  return {paddle::DataType::INT64};
+}
+
+PD_BUILD_OP(nms_rotated)
+    .Inputs({"Boxes", "Scores"})
+    .Outputs({"Output"})
+    .Attrs({"threshold: float"})
+    .SetKernelFn(PD_KERNEL(NMSRotatedForward))
+    .SetInferShapeFn(PD_INFER_SHAPE(NMSRotatedInferShape))
+    .SetInferDtypeFn(PD_INFER_DTYPE(NMSRotatedInferDtype));

+ 96 - 0
paddlers/models/ppdet/ext_op/csrc/nms_rotated/nms_rotated.cu

@@ -0,0 +1,96 @@
+//   Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "../rbox_iou/rbox_iou_utils.h"
+#include "paddle/extension.h"
+
+static const int64_t threadsPerBlock = sizeof(int64_t) * 8;
+
+template <typename T>
+__global__ void
+nms_rotated_cuda_kernel(const T *boxes_data, const float threshold,
+                        const int64_t num_boxes, int64_t *masks) {
+  auto raw_start = blockIdx.y;
+  auto col_start = blockIdx.x;
+  if (raw_start > col_start)
+    return;
+  const int raw_last_storage =
+      min(num_boxes - raw_start * threadsPerBlock, threadsPerBlock);
+  const int col_last_storage =
+      min(num_boxes - col_start * threadsPerBlock, threadsPerBlock);
+  if (threadIdx.x < raw_last_storage) {
+    int64_t mask = 0;
+    auto current_box_idx = raw_start * threadsPerBlock + threadIdx.x;
+    const T *current_box = boxes_data + current_box_idx * 5;
+    for (int i = 0; i < col_last_storage; ++i) {
+      const T *target_box = boxes_data + (col_start * threadsPerBlock + i) * 5;
+      if (rbox_iou_single<T>(current_box, target_box) > threshold) {
+        mask |= 1ULL << i;
+      }
+    }
+    const int blocks_per_line = CeilDiv(num_boxes, threadsPerBlock);
+    masks[current_box_idx * blocks_per_line + col_start] = mask;
+  }
+}
+
+#define CHECK_INPUT_GPU(x)                                                     \
+  PD_CHECK(x.is_gpu(), #x " must be a GPU Tensor.")
+
+std::vector<paddle::Tensor> NMSRotatedCUDAForward(const paddle::Tensor &boxes,
+                                                  const paddle::Tensor &scores,
+                                                  float threshold) {
+  CHECK_INPUT_GPU(boxes);
+  CHECK_INPUT_GPU(scores);
+
+  auto num_boxes = boxes.shape()[0];
+  auto order_t =
+      std::get<1>(paddle::argsort(scores, /* axis=*/0, /* descending=*/true));
+  auto boxes_sorted = paddle::gather(boxes, order_t, /* axis=*/0);
+
+  const auto blocks_per_line = CeilDiv(num_boxes, threadsPerBlock);
+  dim3 block(threadsPerBlock);
+  dim3 grid(blocks_per_line, blocks_per_line);
+  auto mask_dev = paddle::empty({num_boxes * blocks_per_line},
+                                paddle::DataType::INT64, paddle::GPUPlace());
+
+  PD_DISPATCH_FLOATING_TYPES(
+      boxes.type(), "nms_rotated_cuda_kernel", ([&] {
+        nms_rotated_cuda_kernel<data_t><<<grid, block, 0, boxes.stream()>>>(
+            boxes_sorted.data<data_t>(), threshold, num_boxes,
+            mask_dev.data<int64_t>());
+      }));
+
+  auto mask_host = mask_dev.copy_to(paddle::CPUPlace(), true);
+  auto keep_host =
+      paddle::empty({num_boxes}, paddle::DataType::INT64, paddle::CPUPlace());
+  int64_t *keep_host_ptr = keep_host.data<int64_t>();
+  int64_t *mask_host_ptr = mask_host.data<int64_t>();
+  std::vector<int64_t> remv(blocks_per_line);
+  int64_t last_box_num = 0;
+  for (int64_t i = 0; i < num_boxes; ++i) {
+    auto remv_element_id = i / threadsPerBlock;
+    auto remv_bit_id = i % threadsPerBlock;
+    if (!(remv[remv_element_id] & 1ULL << remv_bit_id)) {
+      keep_host_ptr[last_box_num++] = i;
+      int64_t *current_mask = mask_host_ptr + i * blocks_per_line;
+      for (auto j = remv_element_id; j < blocks_per_line; ++j) {
+        remv[j] |= current_mask[j];
+      }
+    }
+  }
+
+  keep_host = keep_host.slice(0, last_box_num);
+  auto keep_dev = keep_host.copy_to(paddle::GPUPlace(), true);
+  return {paddle::gather(order_t, keep_dev, /* axis=*/0)};
+}

+ 95 - 0
paddlers/models/ppdet/ext_op/csrc/rbox_iou/rbox_iou.cc

@@ -0,0 +1,95 @@
+//   Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+// The code is based on
+// https://github.com/facebookresearch/detectron2/blob/main/detectron2/layers/csrc/box_iou_rotated/
+
+#include "paddle/extension.h"
+#include "rbox_iou_utils.h"
+
+template <typename T>
+void rbox_iou_cpu_kernel(const int rbox1_num, const int rbox2_num,
+                         const T *rbox1_data_ptr, const T *rbox2_data_ptr,
+                         T *output_data_ptr) {
+
+  int i, j;
+  for (i = 0; i < rbox1_num; i++) {
+    for (j = 0; j < rbox2_num; j++) {
+      int offset = i * rbox2_num + j;
+      output_data_ptr[offset] =
+          rbox_iou_single<T>(rbox1_data_ptr + i * 5, rbox2_data_ptr + j * 5);
+    }
+  }
+}
+
+#define CHECK_INPUT_CPU(x)                                                     \
+  PD_CHECK(x.is_cpu(), #x " must be a CPU Tensor.")
+
+std::vector<paddle::Tensor> RboxIouCPUForward(const paddle::Tensor &rbox1,
+                                              const paddle::Tensor &rbox2) {
+  CHECK_INPUT_CPU(rbox1);
+  CHECK_INPUT_CPU(rbox2);
+
+  auto rbox1_num = rbox1.shape()[0];
+  auto rbox2_num = rbox2.shape()[0];
+
+  auto output =
+      paddle::empty({rbox1_num, rbox2_num}, rbox1.dtype(), paddle::CPUPlace());
+
+  PD_DISPATCH_FLOATING_TYPES(rbox1.type(), "rbox_iou_cpu_kernel", ([&] {
+                               rbox_iou_cpu_kernel<data_t>(
+                                   rbox1_num, rbox2_num, rbox1.data<data_t>(),
+                                   rbox2.data<data_t>(), output.data<data_t>());
+                             }));
+
+  return {output};
+}
+
+#ifdef PADDLE_WITH_CUDA
+std::vector<paddle::Tensor> RboxIouCUDAForward(const paddle::Tensor &rbox1,
+                                               const paddle::Tensor &rbox2);
+#endif
+
+#define CHECK_INPUT_SAME(x1, x2)                                               \
+  PD_CHECK(x1.place() == x2.place(), "input must be smae pacle.")
+
+std::vector<paddle::Tensor> RboxIouForward(const paddle::Tensor &rbox1,
+                                           const paddle::Tensor &rbox2) {
+  CHECK_INPUT_SAME(rbox1, rbox2);
+  if (rbox1.is_cpu()) {
+    return RboxIouCPUForward(rbox1, rbox2);
+#ifdef PADDLE_WITH_CUDA
+  } else if (rbox1.is_gpu()) {
+    return RboxIouCUDAForward(rbox1, rbox2);
+#endif
+  }
+}
+
+std::vector<std::vector<int64_t>>
+RboxIouInferShape(std::vector<int64_t> rbox1_shape,
+                  std::vector<int64_t> rbox2_shape) {
+  return {{rbox1_shape[0], rbox2_shape[0]}};
+}
+
+std::vector<paddle::DataType> RboxIouInferDtype(paddle::DataType t1,
+                                                paddle::DataType t2) {
+  return {t1};
+}
+
+PD_BUILD_OP(rbox_iou)
+    .Inputs({"RBox1", "RBox2"})
+    .Outputs({"Output"})
+    .SetKernelFn(PD_KERNEL(RboxIouForward))
+    .SetInferShapeFn(PD_INFER_SHAPE(RboxIouInferShape))
+    .SetInferDtypeFn(PD_INFER_DTYPE(RboxIouInferDtype));
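Note: once compiled, the op registered by PD_BUILD_OP(rbox_iou) is exposed to Python under the same name. A minimal usage sketch (not part of this patch; the JIT-compile source paths and the CUDA toolchain are assumptions about the local environment):

import paddle
from paddle.utils.cpp_extension import load

# JIT-compile the custom op; building the .cu file assumes a CUDA toolchain is available.
custom_ops = load(
    name="rbox_iou_ext",
    sources=[
        "paddlers/models/ppdet/ext_op/csrc/rbox_iou/rbox_iou.cc",
        "paddlers/models/ppdet/ext_op/csrc/rbox_iou/rbox_iou.cu",
    ])

# Each rotated box is [cx, cy, w, h, angle]; inputs are (N, 5) and (M, 5).
rbox1 = paddle.rand([4, 5], dtype="float32")
rbox2 = paddle.rand([6, 5], dtype="float32")
iou = custom_ops.rbox_iou(rbox1, rbox2)  # pairwise IoU matrix of shape (N, M)
print(iou.shape)  # [4, 6]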

+ 6 - 11
paddlers/models/ppdet/ext_op/csrc/rbox_iou/rbox_iou_op.cu → paddlers/models/ppdet/ext_op/csrc/rbox_iou/rbox_iou.cu

@@ -13,21 +13,15 @@
 // limitations under the License.
 // limitations under the License.
 //
 //
 // The code is based on
 // The code is based on
-// https://github.com/csuhan/s2anet/blob/master/mmdet/ops/box_iou_rotated
+// https://github.com/facebookresearch/detectron2/blob/main/detectron2/layers/csrc/box_iou_rotated/
 
 
 #include "paddle/extension.h"
 #include "paddle/extension.h"
-#include "rbox_iou_op.h"
+#include "rbox_iou_utils.h"
 
 
 // 2D block with 32 * 16 = 512 threads per block
 // 2D block with 32 * 16 = 512 threads per block
 const int BLOCK_DIM_X = 32;
 const int BLOCK_DIM_X = 32;
 const int BLOCK_DIM_Y = 16;
 const int BLOCK_DIM_Y = 16;
 
 
-/**
-   Computes ceil(a / b)
-*/
-
-static inline int CeilDiv(const int a, const int b) { return (a + b - 1) / b; }
-
 template <typename T>
 template <typename T>
 __global__ void rbox_iou_cuda_kernel(const int rbox1_num, const int rbox2_num,
 __global__ void rbox_iou_cuda_kernel(const int rbox1_num, const int rbox2_num,
                                      const T *rbox1_data_ptr,
                                      const T *rbox1_data_ptr,
@@ -85,7 +79,7 @@ __global__ void rbox_iou_cuda_kernel(const int rbox1_num, const int rbox2_num,
 }
 }
 
 
 #define CHECK_INPUT_GPU(x)                                                     \
 #define CHECK_INPUT_GPU(x)                                                     \
-  PD_CHECK(x.place() == paddle::PlaceType::kGPU, #x " must be a GPU Tensor.")
+  PD_CHECK(x.is_gpu(), #x " must be a GPU Tensor.")
 
 
 std::vector<paddle::Tensor> RboxIouCUDAForward(const paddle::Tensor &rbox1,
 std::vector<paddle::Tensor> RboxIouCUDAForward(const paddle::Tensor &rbox1,
                                                const paddle::Tensor &rbox2) {
                                                const paddle::Tensor &rbox2) {
@@ -95,7 +89,8 @@ std::vector<paddle::Tensor> RboxIouCUDAForward(const paddle::Tensor &rbox1,
   auto rbox1_num = rbox1.shape()[0];
   auto rbox1_num = rbox1.shape()[0];
   auto rbox2_num = rbox2.shape()[0];
   auto rbox2_num = rbox2.shape()[0];
 
 
-  auto output = paddle::Tensor(paddle::PlaceType::kGPU, {rbox1_num, rbox2_num});
+  auto output =
+      paddle::empty({rbox1_num, rbox2_num}, rbox1.dtype(), paddle::GPUPlace());
 
 
   const int blocks_x = CeilDiv(rbox1_num, BLOCK_DIM_X);
   const int blocks_x = CeilDiv(rbox1_num, BLOCK_DIM_X);
   const int blocks_y = CeilDiv(rbox2_num, BLOCK_DIM_Y);
   const int blocks_y = CeilDiv(rbox2_num, BLOCK_DIM_Y);
@@ -107,7 +102,7 @@ std::vector<paddle::Tensor> RboxIouCUDAForward(const paddle::Tensor &rbox1,
       rbox1.type(), "rbox_iou_cuda_kernel", ([&] {
       rbox1.type(), "rbox_iou_cuda_kernel", ([&] {
         rbox_iou_cuda_kernel<data_t><<<blocks, threads, 0, rbox1.stream()>>>(
         rbox_iou_cuda_kernel<data_t><<<blocks, threads, 0, rbox1.stream()>>>(
             rbox1_num, rbox2_num, rbox1.data<data_t>(), rbox2.data<data_t>(),
             rbox1_num, rbox2_num, rbox1.data<data_t>(), rbox2.data<data_t>(),
-            output.mutable_data<data_t>());
+            output.data<data_t>());
       }));
       }));
 
 
   return {output};
   return {output};

+ 0 - 97
paddlers/models/ppdet/ext_op/csrc/rbox_iou/rbox_iou_op.cc

@@ -1,97 +0,0 @@
-//   Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-// The code is based on https://github.com/csuhan/s2anet/blob/master/mmdet/ops/box_iou_rotated
-
-#include "rbox_iou_op.h"
-#include "paddle/extension.h"
-
-
-template <typename T>
-void rbox_iou_cpu_kernel(
-    const int rbox1_num,
-    const int rbox2_num,
-    const T* rbox1_data_ptr,
-    const T* rbox2_data_ptr,
-    T* output_data_ptr) {
-
-    int i, j;
-    for (i = 0; i < rbox1_num; i++) {
-        for (j = 0; j < rbox2_num; j++) {
-		int offset = i * rbox2_num + j;
-		output_data_ptr[offset] = rbox_iou_single<T>(rbox1_data_ptr + i * 5, rbox2_data_ptr + j * 5);
-        }
-    }
-}
-
-
-#define CHECK_INPUT_CPU(x) PD_CHECK(x.place() == paddle::PlaceType::kCPU, #x " must be a CPU Tensor.")
-
-std::vector<paddle::Tensor> RboxIouCPUForward(const paddle::Tensor& rbox1, const paddle::Tensor& rbox2) {
-    CHECK_INPUT_CPU(rbox1);
-    CHECK_INPUT_CPU(rbox2);
-
-    auto rbox1_num = rbox1.shape()[0];
-    auto rbox2_num = rbox2.shape()[0];
-
-    auto output = paddle::Tensor(paddle::PlaceType::kCPU, {rbox1_num, rbox2_num});
-
-    PD_DISPATCH_FLOATING_TYPES(
-        rbox1.type(),
-        "rbox_iou_cpu_kernel",
-        ([&] {
-            rbox_iou_cpu_kernel<data_t>(
-                rbox1_num,
-                rbox2_num,
-                rbox1.data<data_t>(),
-                rbox2.data<data_t>(),
-                output.mutable_data<data_t>());
-        }));
-    
-    return {output};
-}
-
-
-#ifdef PADDLE_WITH_CUDA
-std::vector<paddle::Tensor> RboxIouCUDAForward(const paddle::Tensor& rbox1, const paddle::Tensor& rbox2);
-#endif
-
-
-#define CHECK_INPUT_SAME(x1, x2) PD_CHECK(x1.place() == x2.place(), "input must be smae pacle.")
-
-std::vector<paddle::Tensor> RboxIouForward(const paddle::Tensor& rbox1, const paddle::Tensor& rbox2) {
-    CHECK_INPUT_SAME(rbox1, rbox2);
-    if (rbox1.place() == paddle::PlaceType::kCPU) {
-        return RboxIouCPUForward(rbox1, rbox2);
-#ifdef PADDLE_WITH_CUDA
-    } else if (rbox1.place() == paddle::PlaceType::kGPU) {
-        return RboxIouCUDAForward(rbox1, rbox2);
-#endif
-    }
-}
-
-std::vector<std::vector<int64_t>> InferShape(std::vector<int64_t> rbox1_shape, std::vector<int64_t> rbox2_shape) {
-    return {{rbox1_shape[0], rbox2_shape[0]}};
-}
-
-std::vector<paddle::DataType> InferDtype(paddle::DataType t1, paddle::DataType t2) {
-    return {t1};
-}
-
-PD_BUILD_OP(rbox_iou)
-    .Inputs({"RBOX1", "RBOX2"})
-    .Outputs({"Output"})
-    .SetKernelFn(PD_KERNEL(RboxIouForward))
-    .SetInferShapeFn(PD_INFER_SHAPE(InferShape))
-    .SetInferDtypeFn(PD_INFER_DTYPE(InferDtype));

+ 12 - 4
paddlers/models/ppdet/ext_op/csrc/rbox_iou/rbox_iou_op.h → paddlers/models/ppdet/ext_op/csrc/rbox_iou/rbox_iou_utils.h

@@ -13,7 +13,7 @@
 // limitations under the License.
 // limitations under the License.
 //
 //
 // The code is based on
 // The code is based on
-// https://github.com/csuhan/s2anet/blob/master/mmdet/ops/box_iou_rotated
+// https://github.com/facebookresearch/detectron2/blob/main/detectron2/layers/csrc/box_iou_rotated/
 
 
 #pragma once
 #pragma once
 
 
@@ -336,13 +336,21 @@ HOST_DEVICE_INLINE T rbox_iou_single(T const *const box1_raw,
   box2.h = box2_raw[3];
   box2.h = box2_raw[3];
   box2.a = box2_raw[4];
   box2.a = box2_raw[4];
 
 
-  const T area1 = box1.w * box1.h;
-  const T area2 = box2.w * box2.h;
-  if (area1 < 1e-14 || area2 < 1e-14) {
+  if (box1.w < 1e-2 || box1.h < 1e-2 || box2.w < 1e-2 || box2.h < 1e-2) {
     return 0.f;
     return 0.f;
   }
   }
+  const T area1 = box1.w * box1.h;
+  const T area2 = box2.w * box2.h;
 
 
   const T intersection = rboxes_intersection<T>(box1, box2);
   const T intersection = rboxes_intersection<T>(box1, box2);
   const T iou = intersection / (area1 + area2 - intersection);
   const T iou = intersection / (area1 + area2 - intersection);
   return iou;
   return iou;
 }
 }
+
+/**
+   Computes ceil(a / b)
+*/
+
+HOST_DEVICE inline int CeilDiv(const int a, const int b) {
+  return (a + b - 1) / b;
+}
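For reference, a NumPy/Shapely sketch of the same rotated-IoU definition, including the new degenerate-box guard (any width or height below 1e-2 yields IoU 0). Shapely here only stands in for the analytic polygon clipping done in rboxes_intersection, and the radian angle convention is an assumption for the illustration:

import numpy as np
from shapely.geometry import Polygon

def rbox_to_corners(box):
    # box layout assumed as [cx, cy, w, h, angle (radians)]
    cx, cy, w, h, a = box
    c, s = np.cos(a), np.sin(a)
    dx = np.array([-w, w, w, -w]) / 2.0   # corner offsets before rotation
    dy = np.array([-h, -h, h, h]) / 2.0
    xs = cx + c * dx - s * dy
    ys = cy + s * dx + c * dy
    return np.stack([xs, ys], axis=1)

def rbox_iou_single_ref(box1, box2):
    # mirrors the degenerate-box guard above: tiny boxes give IoU 0
    if min(box1[2], box1[3], box2[2], box2[3]) < 1e-2:
        return 0.0
    p1, p2 = Polygon(rbox_to_corners(box1)), Polygon(rbox_to_corners(box2))
    inter = p1.intersection(p2).area
    return inter / (p1.area + p2.area - inter)

print(rbox_iou_single_ref([0, 0, 2, 2, 0], [0, 0, 2, 2, np.pi / 4]))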

+ 1 - 1
paddlers/models/ppdet/hash.txt

@@ -1 +1 @@
-e3f8dd16bffca04060ec1edc388c5a618e15bbf8
+00fe2a1c35603b6fb37b73265aecf6282e5e2ad4

+ 2 - 1
paddlers/models/ppdet/metrics/__init__.py

@@ -17,6 +17,7 @@ from . import keypoint_metrics
 
 
 from .metrics import *
 from .metrics import *
 from .keypoint_metrics import *
 from .keypoint_metrics import *
+from .pose3d_metrics import *
 
 
 __all__ = metrics.__all__ + keypoint_metrics.__all__
 __all__ = metrics.__all__ + keypoint_metrics.__all__
 
 
@@ -26,4 +27,4 @@ __all__ = metrics.__all__ + mot_metrics.__all__
 
 
 from . import mcmot_metrics
 from . import mcmot_metrics
 from .mcmot_metrics import *
 from .mcmot_metrics import *
-__all__ = metrics.__all__ + mcmot_metrics.__all__
+__all__ = metrics.__all__ + mcmot_metrics.__all__

+ 6 - 2
paddlers/models/ppdet/metrics/coco_utils.py

@@ -21,7 +21,7 @@ import sys
 import numpy as np
 import numpy as np
 import itertools
 import itertools
 
 
-from paddlers.models.ppdet.metrics.json_results import get_det_res, get_det_poly_res, get_seg_res, get_solov2_segm_res, get_keypoint_res
+from paddlers.models.ppdet.metrics.json_results import get_det_res, get_det_poly_res, get_seg_res, get_solov2_segm_res, get_keypoint_res, get_pose3d_res
 from paddlers.models.ppdet.metrics.map_utils import draw_pr_curve
 from paddlers.models.ppdet.metrics.map_utils import draw_pr_curve
 
 
 from paddlers.models.ppdet.utils.logger import setup_logger
 from paddlers.models.ppdet.utils.logger import setup_logger
@@ -64,6 +64,10 @@ def get_infer_results(outs, catid, bias=0):
         infer_res['keypoint'] = get_keypoint_res(outs, im_id)
         infer_res['keypoint'] = get_keypoint_res(outs, im_id)
         outs['bbox_num'] = [len(infer_res['keypoint'])]
         outs['bbox_num'] = [len(infer_res['keypoint'])]
 
 
+    if 'pose3d' in outs:
+        infer_res['pose3d'] = get_pose3d_res(outs, im_id)
+        outs['bbox_num'] = [len(infer_res['pose3d'])]
+
     return infer_res
     return infer_res
 
 
 
 
@@ -150,7 +154,7 @@ def cocoapi_eval(jsonfile,
         results_flatten = list(itertools.chain(*results_per_category))
         results_flatten = list(itertools.chain(*results_per_category))
         headers = ['category', 'AP'] * (num_columns // 2)
         headers = ['category', 'AP'] * (num_columns // 2)
         results_2d = itertools.zip_longest(
         results_2d = itertools.zip_longest(
-            *[results_flatten[i::num_columns] for i in range(num_columns)])
+            * [results_flatten[i::num_columns] for i in range(num_columns)])
         table_data = [headers]
         table_data = [headers]
         table_data += [result for result in results_2d]
         table_data += [result for result in results_2d]
         table = AsciiTable(table_data)
         table = AsciiTable(table_data)

+ 16 - 0
paddlers/models/ppdet/metrics/json_results.py

@@ -157,3 +157,19 @@ def get_keypoint_res(results, im_id):
             ann['bbox'] = [x0, y0, x1 - x0, y1 - y0]
             ann['bbox'] = [x0, y0, x1 - x0, y1 - y0]
             anns.append(ann)
             anns.append(ann)
     return anns
     return anns
+
+
+def get_pose3d_res(results, im_id):
+    anns = []
+    preds = results['pose3d']
+    for idx in range(im_id.shape[0]):
+        image_id = im_id[idx].item()
+        pose3d = preds[idx]
+        ann = {
+            'image_id': image_id,
+            'category_id': 1,  # XXX hard-coded category id
+            'pose3d': pose3d.tolist(),
+            'score': float(1.)
+        }
+        anns.append(ann)
+    return anns
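An illustration of the record format produced by get_pose3d_res; the joint count and image ids below are made up for the example:

import numpy as np
from paddlers.models.ppdet.metrics.json_results import get_pose3d_res

# Fake model outputs: two images, 14 joints with (x, y, z) each.
results = {'pose3d': np.zeros((2, 14, 3), dtype=np.float32)}
im_id = np.array([[101], [102]])

anns = get_pose3d_res(results, im_id)
# Each entry looks like:
# {'image_id': 101, 'category_id': 1, 'pose3d': [[0.0, 0.0, 0.0], ...], 'score': 1.0}
print(len(anns), anns[0]['image_id'])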

+ 1 - 1
paddlers/models/ppdet/metrics/metrics.py

@@ -350,7 +350,7 @@ class WiderFaceMetric(Metric):
 class RBoxMetric(Metric):
 class RBoxMetric(Metric):
     def __init__(self, anno_file, **kwargs):
     def __init__(self, anno_file, **kwargs):
         self.anno_file = anno_file
         self.anno_file = anno_file
-        self.clsid2catid, self.catid2name = get_categories('COCO', anno_file)
+        self.clsid2catid, self.catid2name = get_categories('RBOX', anno_file)
         self.catid2clsid = {v: k for k, v in self.clsid2catid.items()}
         self.catid2clsid = {v: k for k, v in self.clsid2catid.items()}
         self.classwise = kwargs.get('classwise', False)
         self.classwise = kwargs.get('classwise', False)
         self.output_eval = kwargs.get('output_eval', None)
         self.output_eval = kwargs.get('output_eval', None)

+ 200 - 0
paddlers/models/ppdet/metrics/pose3d_metrics.py

@@ -0,0 +1,200 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. 
+#   
+# Licensed under the Apache License, Version 2.0 (the "License");   
+# you may not use this file except in compliance with the License.  
+# You may obtain a copy of the License at   
+#   
+#     http://www.apache.org/licenses/LICENSE-2.0    
+#   
+# Unless required by applicable law or agreed to in writing, software   
+# distributed under the License is distributed on an "AS IS" BASIS, 
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.  
+# See the License for the specific language governing permissions and   
+# limitations under the License.
+
+import paddle
+from paddle.distributed import ParallelEnv
+import os
+import json
+from collections import defaultdict, OrderedDict
+import numpy as np
+from paddlers.models.ppdet.utils.logger import setup_logger
+logger = setup_logger(__name__)
+
+__all__ = ['Pose3DEval']
+
+
+class AverageMeter(object):
+    def __init__(self):
+        self.reset()
+
+    def reset(self):
+        self.val = 0
+        self.avg = 0
+        self.sum = 0
+        self.count = 0
+
+    def update(self, val, n=1):
+        self.val = val
+        self.sum += val * n
+        self.count += n
+        self.avg = self.sum / self.count
+
+
+def mean_per_joint_position_error(pred, gt, has_3d_joints):
+    """ 
+    Compute mPJPE
+    """
+    gt = gt[has_3d_joints == 1]
+    gt = gt[:, :, :3]
+    pred = pred[has_3d_joints == 1]
+
+    with paddle.no_grad():
+        gt_pelvis = (gt[:, 2, :] + gt[:, 3, :]) / 2
+        gt = gt - gt_pelvis[:, None, :]
+        pred_pelvis = (pred[:, 2, :] + pred[:, 3, :]) / 2
+        pred = pred - pred_pelvis[:, None, :]
+        error = paddle.sqrt(((pred - gt)**2).sum(axis=-1)).mean(axis=-1).numpy()
+        return error
+
+
+def compute_similarity_transform(S1, S2):
+    """Computes a similarity transform (sR, t) that takes
+    a set of 3D points S1 (3 x N) closest to a set of 3D points S2,
+    where R is an 3x3 rotation matrix, t 3x1 translation, s scale.
+    i.e. solves the orthogonal Procrustes problem.
+    """
+    transposed = False
+    if S1.shape[0] != 3 and S1.shape[0] != 2:
+        S1 = S1.T
+        S2 = S2.T
+        transposed = True
+    assert (S2.shape[1] == S1.shape[1])
+
+    # 1. Remove mean.
+    mu1 = S1.mean(axis=1, keepdims=True)
+    mu2 = S2.mean(axis=1, keepdims=True)
+    X1 = S1 - mu1
+    X2 = S2 - mu2
+
+    # 2. Compute variance of X1 used for scale.
+    var1 = np.sum(X1**2)
+
+    # 3. The outer product of X1 and X2.
+    K = X1.dot(X2.T)
+
+    # 4. Solution that Maximizes trace(R'K) is R=U*V', where U, V are
+    # singular vectors of K.
+    U, s, Vh = np.linalg.svd(K)
+    V = Vh.T
+    # Construct Z that fixes the orientation of R to get det(R)=1.
+    Z = np.eye(U.shape[0])
+    Z[-1, -1] *= np.sign(np.linalg.det(U.dot(V.T)))
+    # Construct R.
+    R = V.dot(Z.dot(U.T))
+
+    # 5. Recover scale.
+    scale = np.trace(R.dot(K)) / var1
+
+    # 6. Recover translation.
+    t = mu2 - scale * (R.dot(mu1))
+
+    # 7. Error:
+    S1_hat = scale * R.dot(S1) + t
+
+    if transposed:
+        S1_hat = S1_hat.T
+
+    return S1_hat
+
+
+def compute_similarity_transform_batch(S1, S2):
+    """Batched version of compute_similarity_transform."""
+    S1_hat = np.zeros_like(S1)
+    for i in range(S1.shape[0]):
+        S1_hat[i] = compute_similarity_transform(S1[i], S2[i])
+    return S1_hat
+
+
+def reconstruction_error(S1, S2, reduction='mean'):
+    """Do Procrustes alignment and compute reconstruction error."""
+    S1_hat = compute_similarity_transform_batch(S1, S2)
+    re = np.sqrt(((S1_hat - S2)**2).sum(axis=-1)).mean(axis=-1)
+    if reduction == 'mean':
+        re = re.mean()
+    elif reduction == 'sum':
+        re = re.sum()
+    return re
+
+
+def all_gather(data):
+    if paddle.distributed.get_world_size() == 1:
+        return data
+    vlist = []
+    paddle.distributed.all_gather(vlist, data)
+    data = paddle.concat(vlist, 0)
+    return data
+
+
+class Pose3DEval(object):
+    def __init__(self, output_eval, save_prediction_only=False):
+        super(Pose3DEval, self).__init__()
+        self.output_eval = output_eval
+        self.res_file = os.path.join(output_eval, "pose3d_results.json")
+        self.save_prediction_only = save_prediction_only
+        self.reset()
+
+    def reset(self):
+        self.PAmPJPE = AverageMeter()
+        self.mPJPE = AverageMeter()
+        self.eval_results = {}
+
+    def get_human36m_joints(self, input):
+        J24_TO_J14 = paddle.to_tensor(
+            [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 18])
+        J24_TO_J17 = paddle.to_tensor(
+            [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 14, 15, 18, 19])
+        return paddle.index_select(input, J24_TO_J14, axis=1)
+
+    def update(self, inputs, outputs):
+        gt_3d_joints = all_gather(inputs['joints_3d'].cuda(ParallelEnv()
+                                                           .local_rank))
+        has_3d_joints = all_gather(inputs['has_3d_joints'].cuda(ParallelEnv()
+                                                                .local_rank))
+        pred_3d_joints = all_gather(outputs['pose3d'])
+        if gt_3d_joints.shape[1] == 24:
+            gt_3d_joints = self.get_human36m_joints(gt_3d_joints)
+        if pred_3d_joints.shape[1] == 24:
+            pred_3d_joints = self.get_human36m_joints(pred_3d_joints)
+        mPJPE_val = mean_per_joint_position_error(pred_3d_joints, gt_3d_joints,
+                                                  has_3d_joints).mean()
+        PAmPJPE_val = reconstruction_error(
+            pred_3d_joints.numpy(),
+            gt_3d_joints[:, :, :3].numpy(),
+            reduction=None).mean()
+        count = int(np.sum(has_3d_joints.numpy()))
+        self.PAmPJPE.update(PAmPJPE_val * 1000., count)
+        self.mPJPE.update(mPJPE_val * 1000., count)
+
+    def accumulate(self):
+        if self.save_prediction_only:
+            logger.info(f'The pose3d result is saved to {self.res_file} '
+                        'and the model is not evaluated.')
+            return
+        self.eval_results['pose3d'] = [-self.mPJPE.avg, -self.PAmPJPE.avg]
+
+    def log(self):
+        if self.save_prediction_only:
+            return
+        stats_names = ['mPJPE', 'PAmPJPE']
+        num_values = len(stats_names)
+        print(' '.join(['| {}'.format(name) for name in stats_names]) + ' |')
+        print('|---' * (num_values + 1) + '|')
+
+        print(' '.join([
+            '| {:.3f}'.format(abs(value))
+            for value in self.eval_results['pose3d']
+        ]) + ' |')
+
+    def get_results(self):
+        return self.eval_results
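A quick sanity check (illustration only, not part of the patch) for the Procrustes-based PA-MPJPE above: aligning a scaled, rotated, and translated copy of a pose back onto the original should give near-zero reconstruction error. The joint count is arbitrary:

import numpy as np
from paddlers.models.ppdet.metrics.pose3d_metrics import reconstruction_error

rng = np.random.RandomState(0)
gt = rng.randn(1, 14, 3)                      # (batch, joints, xyz)
theta = np.pi / 6
R = np.array([[np.cos(theta), -np.sin(theta), 0],
              [np.sin(theta),  np.cos(theta), 0],
              [0, 0, 1]])
pred = 0.5 * gt @ R.T + np.array([1.0, -2.0, 0.3])  # similarity transform of gt

print(reconstruction_error(pred, gt))  # ~0, since PA-MPJPE removes s, R, t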

+ 2 - 0
paddlers/models/ppdet/modeling/__init__.py

@@ -30,6 +30,7 @@ from . import mot
 from . import transformers
 from . import transformers
 from . import assigners
 from . import assigners
 from . import rbox_utils
 from . import rbox_utils
+from . import ssod
 
 
 from .ops import *
 from .ops import *
 from .backbones import *
 from .backbones import *
@@ -45,3 +46,4 @@ from .mot import *
 from .transformers import *
 from .transformers import *
 from .assigners import *
 from .assigners import *
 from .rbox_utils import *
 from .rbox_utils import *
+from .ssod import *

+ 11 - 0
paddlers/models/ppdet/modeling/architectures/__init__.py

@@ -16,6 +16,7 @@ from . import meta_arch
 from . import faster_rcnn
 from . import faster_rcnn
 from . import mask_rcnn
 from . import mask_rcnn
 from . import yolo
 from . import yolo
+from . import ppyoloe
 from . import cascade_rcnn
 from . import cascade_rcnn
 from . import ssd
 from . import ssd
 from . import fcos
 from . import fcos
@@ -36,11 +37,16 @@ from . import tood
 from . import retinanet
 from . import retinanet
 from . import bytetrack
 from . import bytetrack
 from . import yolox
 from . import yolox
+from . import yolof
+from . import pose3d_metro
+from . import centertrack
+from . import queryinst
 
 
 from .meta_arch import *
 from .meta_arch import *
 from .faster_rcnn import *
 from .faster_rcnn import *
 from .mask_rcnn import *
 from .mask_rcnn import *
 from .yolo import *
 from .yolo import *
+from .ppyoloe import *
 from .cascade_rcnn import *
 from .cascade_rcnn import *
 from .ssd import *
 from .ssd import *
 from .fcos import *
 from .fcos import *
@@ -62,3 +68,8 @@ from .tood import *
 from .retinanet import *
 from .retinanet import *
 from .bytetrack import *
 from .bytetrack import *
 from .yolox import *
 from .yolox import *
+from .yolof import *
+from .pose3d_metro import *
+from .centertrack import *
+from .queryinst import *
+from .keypoint_petr import *

+ 35 - 9
paddlers/models/ppdet/modeling/architectures/blazeface.py

@@ -18,6 +18,8 @@ from __future__ import print_function
 
 
 from paddlers.models.ppdet.core.workspace import register, create
 from paddlers.models.ppdet.core.workspace import register, create
 from .meta_arch import BaseArch
 from .meta_arch import BaseArch
+import paddle
+import paddle.nn.functional as F
 
 
 __all__ = ['BlazeFace']
 __all__ = ['BlazeFace']
 
 
@@ -74,18 +76,42 @@ class BlazeFace(BaseArch):
                                    self.inputs['gt_class'])
                                    self.inputs['gt_class'])
         else:
         else:
             preds, anchors = self.blaze_head(neck_feats, self.inputs['image'])
             preds, anchors = self.blaze_head(neck_feats, self.inputs['image'])
-            bbox, bbox_num = self.post_process(preds, anchors,
-                                               self.inputs['im_shape'],
-                                               self.inputs['scale_factor'])
-            return bbox, bbox_num
+            bbox, bbox_num, nms_keep_idx = self.post_process(
+                preds, anchors, self.inputs['im_shape'],
+                self.inputs['scale_factor'])
+            if self.use_extra_data:
+                extra_data = {}  # record the bbox output before NMS, such as scores and nms_keep_idx
+                """extra_data:{
+                            'scores': predict scores,
+                            'nms_keep_idx': bbox index before nms,
+                           }
+                           """
+                preds_logits = preds[1]  # [[1xNumBBoxNumClass]]
+                extra_data['scores'] = F.softmax(paddle.concat(
+                    preds_logits, axis=1)).transpose([0, 2, 1])
+                extra_data['logits'] = paddle.concat(
+                    preds_logits, axis=1).transpose([0, 2, 1])
+                extra_data['nms_keep_idx'] = nms_keep_idx  # bbox index before nms
+                return bbox, bbox_num, extra_data
+            else:
+                return bbox, bbox_num
 
 
     def get_loss(self, ):
     def get_loss(self, ):
         return {"loss": self._forward()}
         return {"loss": self._forward()}
 
 
     def get_pred(self):
     def get_pred(self):
-        bbox_pred, bbox_num = self._forward()
-        output = {
-            "bbox": bbox_pred,
-            "bbox_num": bbox_num,
-        }
+        if self.use_extra_data:
+            bbox_pred, bbox_num, extra_data = self._forward()
+            output = {
+                "bbox": bbox_pred,
+                "bbox_num": bbox_num,
+                "extra_data": extra_data
+            }
+        else:
+            bbox_pred, bbox_num = self._forward()
+            output = {
+                "bbox": bbox_pred,
+                "bbox_num": bbox_num,
+            }
+
         return output
         return output

+ 1 - 1
paddlers/models/ppdet/modeling/architectures/cascade_rcnn.py

@@ -108,7 +108,7 @@ class CascadeRCNN(BaseArch):
             im_shape = self.inputs['im_shape']
             im_shape = self.inputs['im_shape']
             scale_factor = self.inputs['scale_factor']
             scale_factor = self.inputs['scale_factor']
 
 
-            bbox, bbox_num = self.bbox_post_process(
+            bbox, bbox_num, nms_keep_idx = self.bbox_post_process(
                 preds, (refined_rois, rois_num), im_shape, scale_factor)
                 preds, (refined_rois, rois_num), im_shape, scale_factor)
             # rescale the prediction back to origin image
             # rescale the prediction back to origin image
             bbox, bbox_pred, bbox_num = self.bbox_post_process.get_pred(
             bbox, bbox_pred, bbox_num = self.bbox_post_process.get_pred(

+ 11 - 16
paddlers/models/ppdet/modeling/architectures/centernet.py

@@ -78,30 +78,25 @@ class CenterNet(BaseArch):
 
 
     def get_pred(self):
     def get_pred(self):
         head_out = self._forward()
         head_out = self._forward()
+        bbox, bbox_num, bbox_inds, topk_clses, topk_ys, topk_xs = self.post_process(
+            head_out['heatmap'],
+            head_out['size'],
+            head_out['offset'],
+            im_shape=self.inputs['im_shape'],
+            scale_factor=self.inputs['scale_factor'])
+
         if self.for_mot:
         if self.for_mot:
-            bbox, bbox_inds, topk_clses = self.post_process(
-                head_out['heatmap'],
-                head_out['size'],
-                head_out['offset'],
-                im_shape=self.inputs['im_shape'],
-                scale_factor=self.inputs['scale_factor'])
             output = {
             output = {
                 "bbox": bbox,
                 "bbox": bbox,
+                "bbox_num": bbox_num,
                 "bbox_inds": bbox_inds,
                 "bbox_inds": bbox_inds,
                 "topk_clses": topk_clses,
                 "topk_clses": topk_clses,
+                "topk_ys": topk_ys,
+                "topk_xs": topk_xs,
                 "neck_feat": head_out['neck_feat']
                 "neck_feat": head_out['neck_feat']
             }
             }
         else:
         else:
-            bbox, bbox_num, _ = self.post_process(
-                head_out['heatmap'],
-                head_out['size'],
-                head_out['offset'],
-                im_shape=self.inputs['im_shape'],
-                scale_factor=self.inputs['scale_factor'])
-            output = {
-                "bbox": bbox,
-                "bbox_num": bbox_num,
-            }
+            output = {"bbox": bbox, "bbox_num": bbox_num}
         return output
         return output
 
 
     def get_loss(self):
     def get_loss(self):

+ 176 - 0
paddlers/models/ppdet/modeling/architectures/centertrack.py

@@ -0,0 +1,176 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. 
+#   
+# Licensed under the Apache License, Version 2.0 (the "License");   
+# you may not use this file except in compliance with the License.  
+# You may obtain a copy of the License at   
+#   
+#     http://www.apache.org/licenses/LICENSE-2.0    
+#   
+# Unless required by applicable law or agreed to in writing, software   
+# distributed under the License is distributed on an "AS IS" BASIS, 
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.  
+# See the License for the specific language governing permissions and   
+# limitations under the License.
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import copy
+import math
+import numpy as np
+import paddle
+from paddlers.models.ppdet.core.workspace import register, create
+from .meta_arch import BaseArch
+
+from ..keypoint_utils import affine_transform
+from paddlers.models.ppdet.data.transform.op_helper import gaussian_radius, gaussian2D, draw_umich_gaussian
+
+__all__ = ['CenterTrack']
+
+
+@register
+class CenterTrack(BaseArch):
+    """
+    CenterTrack network, see http://arxiv.org/abs/2004.01177
+
+    Args:
+        detector (object): 'CenterNet' instance
+        plugin_head (object): 'CenterTrackHead' instance
+        tracker (object): 'CenterTracker' instance
+    """
+    __category__ = 'architecture'
+    __shared__ = ['mot_metric']
+
+    def __init__(self,
+                 detector='CenterNet',
+                 plugin_head='CenterTrackHead',
+                 tracker='CenterTracker',
+                 mot_metric=False):
+        super(CenterTrack, self).__init__()
+        self.detector = detector
+        self.plugin_head = plugin_head
+        self.tracker = tracker
+        self.mot_metric = mot_metric
+        self.pre_image = None
+        self.deploy = False
+
+    @classmethod
+    def from_config(cls, cfg, *args, **kwargs):
+        detector = create(cfg['detector'])
+        detector_out_shape = detector.neck and detector.neck.out_shape or detector.backbone.out_shape
+
+        kwargs = {'input_shape': detector_out_shape}
+        plugin_head = create(cfg['plugin_head'], **kwargs)
+        tracker = create(cfg['tracker'])
+
+        return {
+            'detector': detector,
+            'plugin_head': plugin_head,
+            'tracker': tracker,
+        }
+
+    def _forward(self):
+        if self.training:
+            det_outs = self.detector(self.inputs)
+            neck_feat = det_outs['neck_feat']
+
+            losses = {}
+            for k, v in det_outs.items():
+                if 'loss' not in k: continue
+                losses.update({k: v})
+
+            plugin_outs = self.plugin_head(neck_feat, self.inputs)
+            for k, v in plugin_outs.items():
+                if 'loss' not in k: continue
+                losses.update({k: v})
+
+            losses['loss'] = det_outs['det_loss'] + plugin_outs['plugin_loss']
+            return losses
+
+        else:
+            if not self.mot_metric:
+                # detection, support bs>=1
+                det_outs = self.detector(self.inputs)
+                return {
+                    'bbox': det_outs['bbox'],
+                    'bbox_num': det_outs['bbox_num']
+                }
+
+            else:
+                # MOT, only support bs=1
+                if not self.deploy:
+                    if self.pre_image is None:
+                        self.pre_image = self.inputs['image']
+                        # initializing tracker for the first frame
+                        self.tracker.init_track([])
+                    self.inputs['pre_image'] = self.pre_image
+                    self.pre_image = self.inputs[
+                        'image']  # Note: update for next image
+
+                    # render input heatmap from tracker status
+                    pre_hm = self.get_additional_inputs(
+                        self.tracker.tracks, self.inputs, with_hm=True)
+                    self.inputs['pre_hm'] = paddle.to_tensor(pre_hm)
+
+                # model inference
+                det_outs = self.detector(self.inputs)
+                neck_feat = det_outs['neck_feat']
+                result = self.plugin_head(
+                    neck_feat, self.inputs, det_outs['bbox'],
+                    det_outs['bbox_inds'], det_outs['topk_clses'],
+                    det_outs['topk_ys'], det_outs['topk_xs'])
+
+                if not self.deploy:
+                    # convert the cropped and 4x downsampled output coordinate system
+                    # back to the input image coordinate system
+                    result = self.plugin_head.centertrack_post_process(
+                        result, self.inputs, self.tracker.out_thresh)
+                return result
+
+    def get_pred(self):
+        return self._forward()
+
+    def get_loss(self):
+        return self._forward()
+
+    def reset_tracking(self):
+        self.tracker.reset()
+        self.pre_image = None
+
+    def get_additional_inputs(self, dets, meta, with_hm=True):
+        # Render input heatmap from previous trackings.
+        trans_input = meta['trans_input'][0].numpy()
+        inp_width, inp_height = int(meta['inp_width'][0]), int(meta[
+            'inp_height'][0])
+        input_hm = np.zeros((1, inp_height, inp_width), dtype=np.float32)
+
+        for det in dets:
+            if det['score'] < self.tracker.pre_thresh:
+                continue
+            bbox = affine_transform_bbox(det['bbox'], trans_input, inp_width,
+                                         inp_height)
+            h, w = bbox[3] - bbox[1], bbox[2] - bbox[0]
+            if (h > 0 and w > 0):
+                radius = gaussian_radius(
+                    (math.ceil(h), math.ceil(w)), min_overlap=0.7)
+                radius = max(0, int(radius))
+                ct = np.array(
+                    [(bbox[0] + bbox[2]) / 2, (bbox[1] + bbox[3]) / 2],
+                    dtype=np.float32)
+                ct_int = ct.astype(np.int32)
+                if with_hm:
+                    input_hm[0] = draw_umich_gaussian(input_hm[0], ct_int,
+                                                      radius)
+        if with_hm:
+            input_hm = input_hm[np.newaxis]
+        return input_hm
+
+
+def affine_transform_bbox(bbox, trans, width, height):
+    bbox = np.array(copy.deepcopy(bbox), dtype=np.float32)
+    bbox[:2] = affine_transform(bbox[:2], trans)
+    bbox[2:] = affine_transform(bbox[2:], trans)
+    bbox[[0, 2]] = np.clip(bbox[[0, 2]], 0, width - 1)
+    bbox[[1, 3]] = np.clip(bbox[[1, 3]], 0, height - 1)
+    return bbox

+ 13 - 5
paddlers/models/ppdet/modeling/architectures/detr.py

@@ -27,17 +27,20 @@ __all__ = ['DETR']
 class DETR(BaseArch):
 class DETR(BaseArch):
     __category__ = 'architecture'
     __category__ = 'architecture'
     __inject__ = ['post_process']
     __inject__ = ['post_process']
+    __shared__ = ['exclude_post_process']
 
 
     def __init__(self,
     def __init__(self,
                  backbone,
                  backbone,
                  transformer,
                  transformer,
                  detr_head,
                  detr_head,
-                 post_process='DETRBBoxPostProcess'):
+                 post_process='DETRBBoxPostProcess',
+                 exclude_post_process=False):
         super(DETR, self).__init__()
         super(DETR, self).__init__()
         self.backbone = backbone
         self.backbone = backbone
         self.transformer = transformer
         self.transformer = transformer
         self.detr_head = detr_head
         self.detr_head = detr_head
         self.post_process = post_process
         self.post_process = post_process
+        self.exclude_post_process = exclude_post_process
 
 
     @classmethod
     @classmethod
     def from_config(cls, cfg, *args, **kwargs):
     def from_config(cls, cfg, *args, **kwargs):
@@ -65,18 +68,23 @@ class DETR(BaseArch):
         body_feats = self.backbone(self.inputs)
         body_feats = self.backbone(self.inputs)
 
 
         # Transformer
         # Transformer
-        out_transformer = self.transformer(body_feats, self.inputs['pad_mask'])
+        pad_mask = self.inputs['pad_mask'] if self.training else None
+        out_transformer = self.transformer(body_feats, pad_mask, self.inputs)
 
 
         # DETR Head
         # DETR Head
         if self.training:
         if self.training:
             return self.detr_head(out_transformer, body_feats, self.inputs)
             return self.detr_head(out_transformer, body_feats, self.inputs)
         else:
         else:
             preds = self.detr_head(out_transformer, body_feats)
             preds = self.detr_head(out_transformer, body_feats)
-            bbox, bbox_num = self.post_process(preds, self.inputs['im_shape'],
-                                               self.inputs['scale_factor'])
+            if self.exclude_post_process:
+                bboxes, logits, masks = preds
+                return bboxes, logits
+            else:
+                bbox, bbox_num = self.post_process(
+                    preds, self.inputs['im_shape'], self.inputs['scale_factor'])
             return bbox, bbox_num
             return bbox, bbox_num
 
 
-    def get_loss(self, ):
+    def get_loss(self):
         losses = self._forward()
         losses = self._forward()
         losses.update({
         losses.update({
             'loss':
             'loss':

+ 61 - 5
paddlers/models/ppdet/modeling/architectures/faster_rcnn.py

@@ -19,6 +19,7 @@ from __future__ import print_function
 import paddle
 import paddle
 from paddlers.models.ppdet.core.workspace import register, create
 from paddlers.models.ppdet.core.workspace import register, create
 from .meta_arch import BaseArch
 from .meta_arch import BaseArch
+import numpy as np
 
 
 __all__ = ['FasterRCNN']
 __all__ = ['FasterRCNN']
 
 
@@ -51,6 +52,9 @@ class FasterRCNN(BaseArch):
         self.bbox_head = bbox_head
         self.bbox_head = bbox_head
         self.bbox_post_process = bbox_post_process
         self.bbox_post_process = bbox_post_process
 
 
+    def init_cot_head(self, relationship):
+        self.bbox_head.init_cot_head(relationship)
+
     @classmethod
     @classmethod
     def from_config(cls, cfg, *args, **kwargs):
     def from_config(cls, cfg, *args, **kwargs):
         backbone = create(cfg['backbone'])
         backbone = create(cfg['backbone'])
@@ -80,16 +84,29 @@ class FasterRCNN(BaseArch):
         else:
         else:
             rois, rois_num, _ = self.rpn_head(body_feats, self.inputs)
             rois, rois_num, _ = self.rpn_head(body_feats, self.inputs)
             preds, _ = self.bbox_head(body_feats, rois, rois_num, None)
             preds, _ = self.bbox_head(body_feats, rois, rois_num, None)
-
             im_shape = self.inputs['im_shape']
             im_shape = self.inputs['im_shape']
             scale_factor = self.inputs['scale_factor']
             scale_factor = self.inputs['scale_factor']
-            bbox, bbox_num = self.bbox_post_process(preds, (rois, rois_num),
+            bbox, bbox_num, nms_keep_idx = self.bbox_post_process(preds, (rois, rois_num),
                                                     im_shape, scale_factor)
                                                     im_shape, scale_factor)
 
 
             # rescale the prediction back to origin image
             # rescale the prediction back to origin image
             bboxes, bbox_pred, bbox_num = self.bbox_post_process.get_pred(
             bboxes, bbox_pred, bbox_num = self.bbox_post_process.get_pred(
                 bbox, bbox_num, im_shape, scale_factor)
                 bbox, bbox_num, im_shape, scale_factor)
-            return bbox_pred, bbox_num
+
+            if self.use_extra_data:
+                extra_data = {}  # record the bbox output before NMS, such as scores and nms_keep_idx
+                """extra_data:{
+                            'scores': predict scores,
+                            'nms_keep_idx': bbox index before nms,
+                           }
+                """
+                extra_data['scores'] = preds[1]  # predict scores (probability)
+                # Todo: get logits output
+                extra_data['nms_keep_idx'] = nms_keep_idx  # bbox index before nms
+                return bbox_pred, bbox_num, extra_data
+            else:
+                return bbox_pred, bbox_num
+
 
 
     def get_loss(self, ):
     def get_loss(self, ):
         rpn_loss, bbox_loss = self._forward()
         rpn_loss, bbox_loss = self._forward()
@@ -101,6 +118,45 @@ class FasterRCNN(BaseArch):
         return loss
         return loss
 
 
     def get_pred(self):
     def get_pred(self):
-        bbox_pred, bbox_num = self._forward()
-        output = {'bbox': bbox_pred, 'bbox_num': bbox_num}
+        if self.use_extra_data:
+            bbox_pred, bbox_num, extra_data = self._forward()
+            output = {'bbox': bbox_pred, 'bbox_num': bbox_num, 'extra_data': extra_data}
+        else:
+            bbox_pred, bbox_num = self._forward()
+            output = {'bbox': bbox_pred, 'bbox_num': bbox_num}
         return output
         return output
+
+    def target_bbox_forward(self, data):
+        body_feats = self.backbone(data)
+        if self.neck is not None:
+            body_feats = self.neck(body_feats)
+        rois = [roi for roi in data['gt_bbox']]
+        rois_num = paddle.concat([paddle.shape(roi)[0] for roi in rois])
+
+        preds, _ = self.bbox_head(body_feats, rois, rois_num, None, cot=True)
+        return preds
+
+    def relationship_learning(self, loader, num_classes_novel):
+        print('computing relationship')
+        train_labels_list = []
+        label_list = []
+
+        for step_id, data in enumerate(loader):
+            _, bbox_prob = self.target_bbox_forward(data)      
+            batch_size = data['im_id'].shape[0]
+            for i in range(batch_size):
+                num_bbox = data['gt_class'][i].shape[0]           
+                train_labels = data['gt_class'][i]
+                train_labels_list.append(train_labels.numpy().squeeze(1))
+            base_labels = bbox_prob.detach().numpy()[:,:-1]
+            label_list.append(base_labels)
+
+        labels = np.concatenate(train_labels_list, 0)
+        probabilities = np.concatenate(label_list, 0)
+        N_t = np.max(labels) + 1
+        conditional = []
+        for i in range(N_t):
+            this_class = probabilities[labels == i]
+            average = np.mean(this_class, axis=0, keepdims=True)
+            conditional.append(average)
+        return np.concatenate(conditional) 

+ 30 - 39
paddlers/models/ppdet/modeling/architectures/fcos.py

@@ -16,7 +16,6 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import division
 from __future__ import print_function
 from __future__ import print_function
 
 
-import paddle
 from paddlers.models.ppdet.core.workspace import register, create
 from paddlers.models.ppdet.core.workspace import register, create
 from .meta_arch import BaseArch
 from .meta_arch import BaseArch
 
 
@@ -32,22 +31,25 @@ class FCOS(BaseArch):
         backbone (object): backbone instance
         backbone (object): backbone instance
         neck (object): 'FPN' instance
         neck (object): 'FPN' instance
         fcos_head (object): 'FCOSHead' instance
         fcos_head (object): 'FCOSHead' instance
-        post_process (object): 'FCOSPostProcess' instance
+        ssod_loss (object): 'SSODFCOSLoss' instance, only used for semi-det(ssod)
     """
     """
 
 
     __category__ = 'architecture'
     __category__ = 'architecture'
-    __inject__ = ['fcos_post_process']
+    __inject__ = ['ssod_loss']
 
 
     def __init__(self,
     def __init__(self,
-                 backbone,
-                 neck,
+                 backbone='ResNet',
+                 neck='FPN',
                  fcos_head='FCOSHead',
                  fcos_head='FCOSHead',
-                 fcos_post_process='FCOSPostProcess'):
+                 ssod_loss='SSODFCOSLoss'):
         super(FCOS, self).__init__()
         super(FCOS, self).__init__()
         self.backbone = backbone
         self.backbone = backbone
         self.neck = neck
         self.neck = neck
         self.fcos_head = fcos_head
         self.fcos_head = fcos_head
-        self.fcos_post_process = fcos_post_process
+
+        # for ssod, semi-det
+        self.is_teacher = False
+        self.ssod_loss = ssod_loss
 
 
     @classmethod
     @classmethod
     def from_config(cls, cfg, *args, **kwargs):
     def from_config(cls, cfg, *args, **kwargs):
@@ -68,38 +70,27 @@ class FCOS(BaseArch):
     def _forward(self):
     def _forward(self):
         body_feats = self.backbone(self.inputs)
         body_feats = self.backbone(self.inputs)
         fpn_feats = self.neck(body_feats)
         fpn_feats = self.neck(body_feats)
-        fcos_head_outs = self.fcos_head(fpn_feats, self.training)
-        if not self.training:
-            scale_factor = self.inputs['scale_factor']
-            bboxes = self.fcos_post_process(fcos_head_outs, scale_factor)
-            return bboxes
+
+        self.is_teacher = self.inputs.get('is_teacher', False)
+        if self.training or self.is_teacher:
+            losses = self.fcos_head(fpn_feats, self.inputs)
+            return losses
         else:
         else:
-            return fcos_head_outs
-
-    def get_loss(self, ):
-        loss = {}
-        tag_labels, tag_bboxes, tag_centerness = [], [], []
-        for i in range(len(self.fcos_head.fpn_stride)):
-            # labels, reg_target, centerness
-            k_lbl = 'labels{}'.format(i)
-            if k_lbl in self.inputs:
-                tag_labels.append(self.inputs[k_lbl])
-            k_box = 'reg_target{}'.format(i)
-            if k_box in self.inputs:
-                tag_bboxes.append(self.inputs[k_box])
-            k_ctn = 'centerness{}'.format(i)
-            if k_ctn in self.inputs:
-                tag_centerness.append(self.inputs[k_ctn])
-
-        fcos_head_outs = self._forward()
-        loss_fcos = self.fcos_head.get_loss(fcos_head_outs, tag_labels,
-                                            tag_bboxes, tag_centerness)
-        loss.update(loss_fcos)
-        total_loss = paddle.add_n(list(loss.values()))
-        loss.update({'loss': total_loss})
-        return loss
+            fcos_head_outs = self.fcos_head(fpn_feats)
+            bbox_pred, bbox_num = self.fcos_head.post_process(
+                fcos_head_outs, self.inputs['scale_factor'])
+            return {'bbox': bbox_pred, 'bbox_num': bbox_num}
+
+    def get_loss(self):
+        return self._forward()
 
 
     def get_pred(self):
     def get_pred(self):
-        bbox_pred, bbox_num = self._forward()
-        output = {'bbox': bbox_pred, 'bbox_num': bbox_num}
-        return output
+        return self._forward()
+
+    def get_loss_keys(self):
+        return ['loss_cls', 'loss_box', 'loss_quality']
+
+    def get_ssod_loss(self, student_head_outs, teacher_head_outs, train_cfg):
+        ssod_losses = self.ssod_loss(student_head_outs, teacher_head_outs,
+                                     train_cfg)
+        return ssod_losses

+ 207 - 6
paddlers/models/ppdet/modeling/architectures/keypoint_hrnet.py

@@ -24,8 +24,9 @@ from paddlers.models.ppdet.core.workspace import register, create
 from .meta_arch import BaseArch
 from .meta_arch import BaseArch
 from ..keypoint_utils import transform_preds
 from ..keypoint_utils import transform_preds
 from .. import layers as L
 from .. import layers as L
+from paddle.nn import functional as F
 
 
-__all__ = ['TopDownHRNet']
+__all__ = ['TopDownHRNet', 'TinyPose3DHRNet', 'TinyPose3DHRHeatmapNet']
 
 
 
 
 @register
 @register
@@ -45,7 +46,7 @@ class TopDownHRNet(BaseArch):
                  use_dark=True):
                  use_dark=True):
         """
         """
         HRNet network, see https://arxiv.org/abs/1902.09212
         HRNet network, see https://arxiv.org/abs/1902.09212
-
+ 
         Args:
         Args:
             backbone (nn.Layer): backbone instance
             backbone (nn.Layer): backbone instance
             post_process (object): `HRNetPostProcess` instance
             post_process (object): `HRNetPostProcess` instance
@@ -131,10 +132,10 @@ class HRNetPostProcess(object):
 
 
     def get_max_preds(self, heatmaps):
     def get_max_preds(self, heatmaps):
         '''get predictions from score maps
         '''get predictions from score maps
-
+ 
         Args:
         Args:
             heatmaps: numpy.ndarray([batch_size, num_joints, height, width])
             heatmaps: numpy.ndarray([batch_size, num_joints, height, width])
-
+ 
         Returns:
         Returns:
             preds: numpy.ndarray([batch_size, num_joints, 2]), keypoints coords
             preds: numpy.ndarray([batch_size, num_joints, 2]), keypoints coords
             maxvals: numpy.ndarray([batch_size, num_joints, 2]), the maximum confidence of the keypoints
             maxvals: numpy.ndarray([batch_size, num_joints, 2]), the maximum confidence of the keypoints
@@ -219,12 +220,12 @@ class HRNetPostProcess(object):
     def get_final_preds(self, heatmaps, center, scale, kernelsize=3):
     def get_final_preds(self, heatmaps, center, scale, kernelsize=3):
         """the highest heatvalue location with a quarter offset in the
         """the highest heatvalue location with a quarter offset in the
         direction from the highest response to the second highest response.
         direction from the highest response to the second highest response.
-
+ 
         Args:
         Args:
             heatmaps (numpy.ndarray): The predicted heatmaps
             heatmaps (numpy.ndarray): The predicted heatmaps
             center (numpy.ndarray): The boxes center
             center (numpy.ndarray): The boxes center
             scale (numpy.ndarray): The scale factor
             scale (numpy.ndarray): The scale factor
-
+ 
         Returns:
         Returns:
             preds: numpy.ndarray([batch_size, num_joints, 2]), keypoints coords
             preds: numpy.ndarray([batch_size, num_joints, 2]), keypoints coords
             maxvals: numpy.ndarray([batch_size, num_joints, 1]), the maximum confidence of the keypoints
             maxvals: numpy.ndarray([batch_size, num_joints, 1]), the maximum confidence of the keypoints
@@ -265,3 +266,203 @@ class HRNetPostProcess(object):
                     maxvals, axis=1)
                     maxvals, axis=1)
         ]]
         ]]
         return outputs
         return outputs
+
+
+class TinyPose3DPostProcess(object):
+    def __init__(self):
+        pass
+
+    def __call__(self, output, center, scale):
+        """
+        Args:
+            output (numpy.ndarray): numpy.ndarray([batch_size, num_joints, 3]), keypoints coords
+            scale (numpy.ndarray): The scale factor
+        Returns:
+            preds: numpy.ndarray([batch_size, num_joints, 3]), keypoints coords
+        """
+
+        preds = output.numpy().copy()
+
+        # Transform back
+        for i in range(output.shape[0]):  # batch_size
+            preds[i][:, 0] = preds[i][:, 0] * scale[i][0]
+            preds[i][:, 1] = preds[i][:, 1] * scale[i][1]
+
+        return preds
+
+
+def soft_argmax(heatmaps, joint_num):
+    dims = heatmaps.shape
+    depth_dim = (int)(dims[1] / joint_num)
+    heatmaps = heatmaps.reshape((-1, joint_num, depth_dim * dims[2] * dims[3]))
+    heatmaps = F.softmax(heatmaps, 2)
+    heatmaps = heatmaps.reshape((-1, joint_num, depth_dim, dims[2], dims[3]))
+
+    accu_x = heatmaps.sum(axis=(2, 3))
+    accu_y = heatmaps.sum(axis=(2, 4))
+    accu_z = heatmaps.sum(axis=(3, 4))
+
+    accu_x = accu_x * paddle.arange(1, 33)
+    accu_y = accu_y * paddle.arange(1, 33)
+    accu_z = accu_z * paddle.arange(1, 33)
+
+    accu_x = accu_x.sum(axis=2, keepdim=True) - 1
+    accu_y = accu_y.sum(axis=2, keepdim=True) - 1
+    accu_z = accu_z.sum(axis=2, keepdim=True) - 1
+
+    coord_out = paddle.concat(
+        (accu_x, accu_y, accu_z), axis=2)  # [batch_size, joint_num, 3]
+
+    return coord_out
+
+
+@register
+class TinyPose3DHRHeatmapNet(BaseArch):
+    __category__ = 'architecture'
+    __inject__ = ['loss']
+
+    def __init__(
+            self,
+            width,  # 40, number of channels output by the backbone
+            num_joints,
+            backbone='HRNet',
+            loss='KeyPointRegressionMSELoss',
+            post_process=TinyPose3DPostProcess):
+        """
+        Args:
+            backbone (nn.Layer): backbone instance
+            post_process (object): post process instance
+        """
+        super(TinyPose3DHRHeatmapNet, self).__init__()
+
+        self.backbone = backbone
+        self.post_process = TinyPose3DPostProcess()
+        self.loss = loss
+        self.deploy = False
+        self.num_joints = num_joints
+
+        self.final_conv = L.Conv2d(width, num_joints * 32, 1, 1, 0, bias=True)
+
+    @classmethod
+    def from_config(cls, cfg, *args, **kwargs):
+        # backbone
+        backbone = create(cfg['backbone'])
+
+        return {'backbone': backbone, }
+
+    def _forward(self):
+        feats = self.backbone(self.inputs)  # feats:[[batch_size, 40, 32, 24]]
+
+        hrnet_outputs = self.final_conv(feats[0])
+        res = soft_argmax(hrnet_outputs, self.num_joints)
+        return res
+
+    def get_loss(self):
+        pose3d = self._forward()
+        loss = self.loss(pose3d, None, self.inputs)
+        outputs = {'loss': loss}
+        return outputs
+
+    def get_pred(self):
+        res_lst = self._forward()
+        outputs = {'pose3d': res_lst}
+        return outputs
+
+    def flip_back(self, output_flipped, matched_parts):
+        assert output_flipped.ndim == 4,\
+                'output_flipped should be [batch_size, num_joints, height, width]'
+
+        output_flipped = output_flipped[:, :, :, ::-1]
+
+        for pair in matched_parts:
+            tmp = output_flipped[:, pair[0], :, :].copy()
+            output_flipped[:, pair[0], :, :] = output_flipped[:, pair[1], :, :]
+            output_flipped[:, pair[1], :, :] = tmp
+
+        return output_flipped
+
+
+@register
+class TinyPose3DHRNet(BaseArch):
+    __category__ = 'architecture'
+    __inject__ = ['loss']
+
+    def __init__(self,
+                 width,
+                 num_joints,
+                 fc_channel=768,
+                 backbone='HRNet',
+                 loss='KeyPointRegressionMSELoss',
+                 post_process=TinyPose3DPostProcess):
+        """
+        Args:
+            backbone (nn.Layer): backbone instance
+            post_process (object): post process instance
+        """
+        super(TinyPose3DHRNet, self).__init__()
+        self.backbone = backbone
+        self.post_process = TinyPose3DPostProcess()
+        self.loss = loss
+        self.deploy = False
+        self.num_joints = num_joints
+
+        self.final_conv = L.Conv2d(width, num_joints, 1, 1, 0, bias=True)
+
+        self.flatten = paddle.nn.Flatten(start_axis=2, stop_axis=3)
+        self.fc1 = paddle.nn.Linear(fc_channel, 256)
+        self.act1 = paddle.nn.ReLU()
+        self.fc2 = paddle.nn.Linear(256, 64)
+        self.act2 = paddle.nn.ReLU()
+        self.fc3 = paddle.nn.Linear(64, 3)
+
+    @classmethod
+    def from_config(cls, cfg, *args, **kwargs):
+        # backbone
+        backbone = create(cfg['backbone'])
+
+        return {'backbone': backbone, }
+
+    def _forward(self):
+        '''
+        self.inputs is a dict
+        '''
+        feats = self.backbone(
+            self.inputs)  # feats:[[batch_size, 40, width/4, height/4]]
+
+        hrnet_outputs = self.final_conv(
+            feats[0])  # hrnet_outputs: [batch_size, num_joints*32,32,32]
+
+        flatten_res = self.flatten(
+            hrnet_outputs)  # [batch_size,num_joints*32,32*32]
+
+        res = self.fc1(flatten_res)
+        res = self.act1(res)
+        res = self.fc2(res)
+        res = self.act2(res)
+        res = self.fc3(res)
+
+        if self.training:
+            return self.loss(res, self.inputs)
+        else:  # needed for model export
+            return res
+
+    def get_loss(self):
+        return self._forward()
+
+    def get_pred(self):
+        res_lst = self._forward()
+        outputs = {'pose3d': res_lst}
+        return outputs
+
+    def flip_back(self, output_flipped, matched_parts):
+        assert output_flipped.ndim == 4,\
+                'output_flipped should be [batch_size, num_joints, height, width]'
+
+        output_flipped = output_flipped[:, :, :, ::-1]
+
+        for pair in matched_parts:
+            tmp = output_flipped[:, pair[0], :, :].copy()
+            output_flipped[:, pair[0], :, :] = output_flipped[:, pair[1], :, :]
+            output_flipped[:, pair[1], :, :] = tmp
+
+        return output_flipped
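The soft_argmax used by TinyPose3DHRHeatmapNet reads a coordinate as the expectation of the index under a softmax-normalized heatmap, which keeps the operation differentiable and sub-pixel accurate. A one-dimensional toy version of the idea (illustration only):

import numpy as np

heat = np.array([0.1, 0.2, 3.0, 0.4], dtype=np.float32)  # 1-D "heatmap"
prob = np.exp(heat) / np.exp(heat).sum()                  # softmax over positions
coord = (prob * np.arange(len(heat))).sum()               # expected index
print(coord)  # close to 2.0, the peak location, but returned as a float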

+ 217 - 0
paddlers/models/ppdet/modeling/architectures/keypoint_petr.py

@@ -0,0 +1,217 @@
+# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. 
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at 
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and 
+# limitations under the License.
+"""
+This code is based on https://github.com/hikvision-research/opera/blob/main/opera/models/detectors/petr.py
+"""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+import paddle
+from paddlers.models.ppdet.core.workspace import register
+from .meta_arch import BaseArch
+from .. import layers as L
+
+__all__ = ['PETR']
+
+
+@register
+class PETR(BaseArch):
+    __category__ = 'architecture'
+    __inject__ = ['backbone', 'neck', 'bbox_head']
+
+    def __init__(self,
+                 backbone='ResNet',
+                 neck='ChannelMapper',
+                 bbox_head='PETRHead'):
+        """
+        PETR, see https://openaccess.thecvf.com/content/CVPR2022/papers/Shi_End-to-End_Multi-Person_Pose_Estimation_With_Transformers_CVPR_2022_paper.pdf
+
+        Args:
+            backbone (nn.Layer): backbone instance
+            neck (nn.Layer): neck between backbone and head
+            bbox_head (nn.Layer): model output and loss
+        """
+        super(PETR, self).__init__()
+        self.backbone = backbone
+        if neck is not None:
+            self.with_neck = True
+        self.neck = neck
+        self.bbox_head = bbox_head
+        self.deploy = False
+
+    def extract_feat(self, img):
+        """Directly extract features from the backbone+neck."""
+        x = self.backbone(img)
+        if self.with_neck:
+            x = self.neck(x)
+        return x
+
+    def get_inputs(self):
+        img_metas = []
+        gt_bboxes = []
+        gt_labels = []
+        gt_keypoints = []
+        gt_areas = []
+        pad_gt_mask = self.inputs['pad_gt_mask'].astype("bool").squeeze(-1)
+        for idx, im_shape in enumerate(self.inputs['im_shape']):
+            img_meta = {
+                'img_shape': im_shape.astype("int32").tolist() + [1, ],
+                'batch_input_shape': self.inputs['image'].shape[-2:],
+                'image_name': self.inputs['image_file'][idx]
+            }
+            img_metas.append(img_meta)
+            if (not pad_gt_mask[idx].any()):
+                gt_keypoints.append(self.inputs['gt_joints'][idx][:1])
+                gt_labels.append(self.inputs['gt_class'][idx][:1])
+                gt_bboxes.append(self.inputs['gt_bbox'][idx][:1])
+                gt_areas.append(self.inputs['gt_areas'][idx][:1])
+                continue
+
+            gt_keypoints.append(self.inputs['gt_joints'][idx][pad_gt_mask[idx]])
+            gt_labels.append(self.inputs['gt_class'][idx][pad_gt_mask[idx]])
+            gt_bboxes.append(self.inputs['gt_bbox'][idx][pad_gt_mask[idx]])
+            gt_areas.append(self.inputs['gt_areas'][idx][pad_gt_mask[idx]])
+
+        return img_metas, gt_bboxes, gt_labels, gt_keypoints, gt_areas
+
+    def get_loss(self):
+        """
+        Args:
+            img (Tensor): Input images of shape (N, C, H, W).
+                Typically these should be mean centered and std scaled.
+            img_metas (list[dict]): A List of image info dict where each dict
+                has: 'img_shape', 'scale_factor', 'flip', and may also contain
+                'filename', 'ori_shape', 'pad_shape', and 'img_norm_cfg'.
+                For details on the values of these keys see
+                :class:`mmdet.datasets.pipelines.Collect`.
+            gt_bboxes (list[Tensor]): Each item is the ground-truth boxes of one
+                image, in [tl_x, tl_y, br_x, br_y] format.
+            gt_labels (list[Tensor]): Class indices corresponding to each box.
+            gt_keypoints (list[Tensor]): Each item is the ground-truth keypoints of
+                one image, in [p^{1}_x, p^{1}_y, p^{1}_v, ..., p^{K}_x,
+                p^{K}_y, p^{K}_v] format.
+            gt_areas (list[Tensor]): mask areas corresponding to each box.
+            gt_bboxes_ignore (None | list[Tensor]): Specify which bounding
+                boxes can be ignored when computing the loss.
+
+        Returns:
+            dict[str, Tensor]: A dictionary of loss components.
+        """
+
+        img_metas, gt_bboxes, gt_labels, gt_keypoints, gt_areas = self.get_inputs(
+        )
+        gt_bboxes_ignore = getattr(self.inputs, 'gt_bboxes_ignore', None)
+
+        x = self.extract_feat(self.inputs)
+        losses = self.bbox_head.forward_train(x, img_metas, gt_bboxes,
+                                              gt_labels, gt_keypoints, gt_areas,
+                                              gt_bboxes_ignore)
+        loss = 0
+        for k, v in losses.items():
+            loss += v
+        losses['loss'] = loss
+
+        return losses
+
+    def get_pred_numpy(self):
+        """Used for computing network flops.
+        """
+
+        img = self.inputs['image']
+        batch_size, _, height, width = img.shape
+        dummy_img_metas = [
+            dict(
+                batch_input_shape=(height, width),
+                img_shape=(height, width, 3),
+                scale_factor=(1., 1., 1., 1.)) for _ in range(batch_size)
+        ]
+        x = self.extract_feat(img)
+        outs = self.bbox_head(x, img_metas=dummy_img_metas)
+        bbox_list = self.bbox_head.get_bboxes(
+            *outs, dummy_img_metas, rescale=True)
+        return bbox_list
+
+    def get_pred(self):
+        """
+        """
+        img = self.inputs['image']
+        batch_size, _, height, width = img.shape
+        img_metas = [
+            dict(
+                batch_input_shape=(height, width),
+                img_shape=(height, width, 3),
+                scale_factor=self.inputs['scale_factor'][i])
+            for i in range(batch_size)
+        ]
+        kptpred = self.simple_test(
+            self.inputs, img_metas=img_metas, rescale=True)
+        keypoints = kptpred[0][1][0]
+        bboxs = kptpred[0][0][0]
+        keypoints[..., 2] = bboxs[:, None, 4]
+        res_lst = [[keypoints, bboxs[:, 4]]]
+        outputs = {'keypoint': res_lst}
+        return outputs
+
+    def simple_test(self, inputs, img_metas, rescale=False):
+        """Test function without test time augmentation.
+
+        Args:
+            inputs (list[paddle.Tensor]): List of multiple images.
+            img_metas (list[dict]): List of image information.
+            rescale (bool, optional): Whether to rescale the results.
+                Defaults to False.
+
+        Returns:
+            list[list[np.ndarray]]: BBox and keypoint results of each image
+                and classes. The outer list corresponds to each image.
+                The inner list corresponds to each class.
+        """
+        batch_size = len(img_metas)
+        assert batch_size == 1, 'Currently only batch_size 1 for inference ' \
+            f'mode is supported. Found batch_size {batch_size}.'
+        feat = self.extract_feat(inputs)
+        results_list = self.bbox_head.simple_test(
+            feat, img_metas, rescale=rescale)
+
+        bbox_kpt_results = [
+            self.bbox_kpt2result(det_bboxes, det_labels, det_kpts,
+                                 self.bbox_head.num_classes)
+            for det_bboxes, det_labels, det_kpts in results_list
+        ]
+        return bbox_kpt_results
+
+    def bbox_kpt2result(self, bboxes, labels, kpts, num_classes):
+        """Convert detection results to a list of numpy arrays.
+
+        Args:
+            bboxes (paddle.Tensor | np.ndarray): shape (n, 5).
+            labels (paddle.Tensor | np.ndarray): shape (n, ).
+            kpts (paddle.Tensor | np.ndarray): shape (n, K, 3).
+            num_classes (int): class number, including background class.
+
+        Returns:
+            list(ndarray): bbox and keypoint results of each class.
+        """
+        if bboxes.shape[0] == 0:
+            return [np.zeros((0, 5), dtype=np.float32) for i in range(num_classes)], \
+                [np.zeros((0, kpts.shape[1], 3), dtype=np.float32)
+                    for i in range(num_classes)]
+        else:
+            if isinstance(bboxes, paddle.Tensor):
+                bboxes = bboxes.numpy()
+                labels = labels.numpy()
+                kpts = kpts.numpy()
+            return [bboxes[labels == i, :] for i in range(num_classes)], \
+                [kpts[labels == i, :, :] for i in range(num_classes)]
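For reference, a toy sketch of the per-class grouping that bbox_kpt2result performs (dummy arrays; num_classes=2 is chosen only for illustration):

import numpy as np

bboxes = np.array([[0, 0, 10, 10, 0.9],
                   [5, 5, 20, 20, 0.8]], dtype=np.float32)  # (n, 5): x1, y1, x2, y2, score
labels = np.array([0, 1])                                   # (n,)
kpts = np.zeros((2, 17, 3), dtype=np.float32)               # (n, K, 3)

num_classes = 2
per_class_boxes = [bboxes[labels == i] for i in range(num_classes)]
per_class_kpts = [kpts[labels == i] for i in range(num_classes)]
# per_class_boxes[c] and per_class_kpts[c] now hold only the class-c detections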

+ 22 - 5
paddlers/models/ppdet/modeling/architectures/mask_rcnn.py

@@ -106,8 +106,8 @@ class MaskRCNN(BaseArch):
             im_shape = self.inputs['im_shape']
             scale_factor = self.inputs['scale_factor']

-            bbox, bbox_num = self.bbox_post_process(preds, (rois, rois_num),
-                                                    im_shape, scale_factor)
+            bbox, bbox_num, nms_keep_idx = self.bbox_post_process(
+                preds, (rois, rois_num), im_shape, scale_factor)
             mask_out = self.mask_head(
                 body_feats, bbox, bbox_num, self.inputs, feat_func=feat_func)

@@ -117,7 +117,20 @@ class MaskRCNN(BaseArch):
             origin_shape = self.bbox_post_process.get_origin_shape()
             mask_pred = self.mask_post_process(mask_out, bbox_pred, bbox_num,
                                                origin_shape)
-            return bbox_pred, bbox_num, mask_pred
+
+            if self.use_extra_data:
+                extra_data = {}  # record the bbox output before nms, such as scores and nms_keep_idx
+                """extra_data:{
+                            'scores': predict scores,
+                            'nms_keep_idx': bbox index before nms,
+                           }
+                """
+                extra_data['scores'] = preds[1]  # predict scores (probability)
+                # Todo: get logits output
+                extra_data['nms_keep_idx'] = nms_keep_idx  # bbox index before nms
+                return bbox_pred, bbox_num, mask_pred, extra_data
+            else:
+                return bbox_pred, bbox_num, mask_pred
 
 
     def get_loss(self, ):
         bbox_loss, mask_loss, rpn_loss = self._forward()
@@ -130,6 +143,10 @@ class MaskRCNN(BaseArch):
         return loss

     def get_pred(self):
-        bbox_pred, bbox_num, mask_pred = self._forward()
-        output = {'bbox': bbox_pred, 'bbox_num': bbox_num, 'mask': mask_pred}
+        if self.use_extra_data:
+            bbox_pred, bbox_num, mask_pred, extra_data = self._forward()
+            output = {'bbox': bbox_pred, 'bbox_num': bbox_num, 'mask': mask_pred, 'extra_data': extra_data}
+        else:
+            bbox_pred, bbox_num, mask_pred = self._forward()
+            output = {'bbox': bbox_pred, 'bbox_num': bbox_num, 'mask': mask_pred}
         return output

+ 2 - 1
paddlers/models/ppdet/modeling/architectures/meta_arch.py

@@ -15,11 +15,12 @@ __all__ = ['BaseArch']
 
 
 @register
 class BaseArch(nn.Layer):
-    def __init__(self, data_format='NCHW'):
+    def __init__(self, data_format='NCHW', use_extra_data=False):
         super(BaseArch, self).__init__()
         self.data_format = data_format
         self.inputs = {}
         self.fuse_norm = False
+        self.use_extra_data = use_extra_data

     def load_meanstd(self, cfg_transform):
         scale = 1.

+ 114 - 0
paddlers/models/ppdet/modeling/architectures/pose3d_metro.py

@@ -0,0 +1,114 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. 
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at 
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and 
+# limitations under the License.
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import paddle
+import paddle.nn as nn
+import paddle.nn.functional as F
+from paddlers.models.ppdet.core.workspace import register, create
+from .meta_arch import BaseArch
+from .. import layers as L
+
+__all__ = ['METRO_Body']
+
+
+def orthographic_projection(X, camera):
+    """Perform orthographic projection of 3D points X using the camera parameters
+    Args:
+        X: size = [B, N, 3]
+        camera: size = [B, 3]
+    Returns:
+        Projected 2D points -- size = [B, N, 2]
+    """
+    camera = camera.reshape((-1, 1, 3))
+    X_trans = X[:, :, :2] + camera[:, :, 1:]
+    shape = paddle.shape(X_trans)
+    X_2d = (camera[:, :, 0] * X_trans.reshape((shape[0], -1))).reshape(shape)
+    return X_2d
+
+
+@register
+class METRO_Body(BaseArch):
+    __category__ = 'architecture'
+    __inject__ = ['loss']
+
+    def __init__(
+            self,
+            num_joints,
+            backbone='HRNet',
+            trans_encoder='',
+            loss='Pose3DLoss', ):
+        """
+        Modified from METRO network, see https://arxiv.org/abs/2012.09760
+
+        Args:
+            backbone (nn.Layer): backbone instance
+        """
+        super(METRO_Body, self).__init__()
+        self.num_joints = num_joints
+        self.backbone = backbone
+        self.loss = loss
+        self.deploy = False
+
+        self.trans_encoder = trans_encoder
+        self.conv_learn_tokens = paddle.nn.Conv1D(49, num_joints + 10, 1)
+        self.cam_param_fc = paddle.nn.Linear(3, 2)
+
+    @classmethod
+    def from_config(cls, cfg, *args, **kwargs):
+        # backbone
+        backbone = create(cfg['backbone'])
+        trans_encoder = create(cfg['trans_encoder'])
+
+        return {'backbone': backbone, 'trans_encoder': trans_encoder}
+
+    def _forward(self):
+        batch_size = self.inputs['image'].shape[0]
+
+        image_feat = self.backbone(self.inputs)
+        image_feat_flatten = image_feat.reshape((batch_size, 2048, 49))
+        image_feat_flatten = image_feat_flatten.transpose(perm=(0, 2, 1))
+        # and apply a conv layer to learn image token for each 3d joint/vertex position
+        features = self.conv_learn_tokens(image_feat_flatten)  # (B, J, C)
+
+        if self.training:
+            # apply mask vertex/joint modeling
+            # meta_masks is a tensor of all the masks, randomly generated in dataloader
+            # we pre-define a [MASK] token, which is a floating-value vector with 0.01s
+            meta_masks = self.inputs['mjm_mask'].expand((-1, -1, 2048))
+            constant_tensor = paddle.ones_like(features) * 0.01
+            features = features * meta_masks + constant_tensor * (1 - meta_masks
+                                                                  )
+        pred_out = self.trans_encoder(features)
+
+        pred_3d_joints = pred_out[:, :self.num_joints, :]
+        cam_features = pred_out[:, self.num_joints:, :]
+
+        # learn camera parameters
+        pred_2d_joints = self.cam_param_fc(cam_features)
+        return pred_3d_joints, pred_2d_joints
+
+    def get_loss(self):
+        preds_3d, preds_2d = self._forward()
+        loss = self.loss(preds_3d, preds_2d, self.inputs)
+        output = {'loss': loss}
+        return output
+
+    def get_pred(self):
+        preds_3d, preds_2d = self._forward()
+        outputs = {'pose3d': preds_3d, 'pose2d': preds_2d}
+        return outputs
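A quick standalone check of the orthographic_projection helper defined at the top of this file (toy values; camera is [scale, tx, ty] per sample). The reshape round-trip in the original is just a broadcast-friendly way of doing the same multiplication.

import paddle

X = paddle.to_tensor([[[1.0, 2.0, 5.0], [0.0, -1.0, 3.0]]])  # [B=1, N=2, 3]
camera = paddle.to_tensor([[2.0, 0.5, -0.5]])                # [B=1, 3]: scale, tx, ty

cam = camera.reshape((-1, 1, 3))
X_trans = X[:, :, :2] + cam[:, :, 1:]  # translate the x, y coordinates
X_2d = cam[:, :, 0] * X_trans          # scale -> [[[3.0, 3.0], [1.0, -3.0]]]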

+ 260 - 0
paddlers/models/ppdet/modeling/architectures/ppyoloe.py

@@ -0,0 +1,260 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. 
+#   
+# Licensed under the Apache License, Version 2.0 (the "License");   
+# you may not use this file except in compliance with the License.  
+# You may obtain a copy of the License at   
+#   
+#     http://www.apache.org/licenses/LICENSE-2.0    
+#   
+# Unless required by applicable law or agreed to in writing, software   
+# distributed under the License is distributed on an "AS IS" BASIS, 
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.  
+# See the License for the specific language governing permissions and   
+# limitations under the License.
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import copy
+import paddle
+from paddlers.models.ppdet.core.workspace import register, create
+from .meta_arch import BaseArch
+
+__all__ = ['PPYOLOE', 'PPYOLOEWithAuxHead']
+# PP-YOLOE and PP-YOLOE+ are recommended to use this architecture, especially when using distillation or an aux head
+# PP-YOLOE and PP-YOLOE+ can also use the same architecture as YOLOv3 in yolo.py when not using distillation or an aux head
+
+
+@register
+class PPYOLOE(BaseArch):
+    """
+    PPYOLOE network, see https://arxiv.org/abs/2203.16250
+
+    Args:
+        backbone (nn.Layer): backbone instance
+        neck (nn.Layer): neck instance
+        yolo_head (nn.Layer): anchor_head instance
+        post_process (object): `BBoxPostProcess` instance
+        ssod_loss (object): 'SSODPPYOLOELoss' instance, only used for semi-supervised detection (SSOD)
+        for_distill (bool): whether the model is used for distillation
+        feat_distill_place (str): which feature to distill, 'backbone_feats' or 'neck_feats'
+        for_mot (bool): whether to return other features for multi-object tracking
+            models, default False in pure object detection models.
+    """
+
+    __category__ = 'architecture'
+    __shared__ = ['for_distill']
+    __inject__ = ['post_process', 'ssod_loss']
+
+    def __init__(self,
+                 backbone='CSPResNet',
+                 neck='CustomCSPPAN',
+                 yolo_head='PPYOLOEHead',
+                 post_process='BBoxPostProcess',
+                 ssod_loss='SSODPPYOLOELoss',
+                 for_distill=False,
+                 feat_distill_place='neck_feats',
+                 for_mot=False):
+        super(PPYOLOE, self).__init__()
+        self.backbone = backbone
+        self.neck = neck
+        self.yolo_head = yolo_head
+        self.post_process = post_process
+        self.for_mot = for_mot
+
+        # for ssod, semi-det
+        self.is_teacher = False
+        self.ssod_loss = ssod_loss
+
+        # distill
+        self.for_distill = for_distill
+        self.feat_distill_place = feat_distill_place
+        if for_distill:
+            assert feat_distill_place in ['backbone_feats', 'neck_feats']
+
+    @classmethod
+    def from_config(cls, cfg, *args, **kwargs):
+        backbone = create(cfg['backbone'])
+
+        kwargs = {'input_shape': backbone.out_shape}
+        neck = create(cfg['neck'], **kwargs)
+
+        kwargs = {'input_shape': neck.out_shape}
+        yolo_head = create(cfg['yolo_head'], **kwargs)
+
+        return {
+            'backbone': backbone,
+            'neck': neck,
+            "yolo_head": yolo_head,
+        }
+
+    def _forward(self):
+        body_feats = self.backbone(self.inputs)
+        neck_feats = self.neck(body_feats, self.for_mot)
+
+        self.is_teacher = self.inputs.get('is_teacher', False)  # for semi-det
+        if self.training or self.is_teacher:
+            yolo_losses = self.yolo_head(neck_feats, self.inputs)
+
+            if self.for_distill:
+                if self.feat_distill_place == 'backbone_feats':
+                    self.yolo_head.distill_pairs['backbone_feats'] = body_feats
+                elif self.feat_distill_place == 'neck_feats':
+                    self.yolo_head.distill_pairs['neck_feats'] = neck_feats
+                else:
+                    raise ValueError
+            return yolo_losses
+        else:
+
+            yolo_head_outs = self.yolo_head(neck_feats)
+
+            if self.post_process is not None:
+                bbox, bbox_num, nms_keep_idx = self.post_process(
+                    yolo_head_outs, self.yolo_head.mask_anchors,
+                    self.inputs['im_shape'], self.inputs['scale_factor'])
+
+            else:
+                bbox, bbox_num, nms_keep_idx = self.yolo_head.post_process(
+                    yolo_head_outs, self.inputs['scale_factor'])
+
+            if self.use_extra_data:
+                extra_data = {}  # record the bbox output before nms, such as scores and nms_keep_idx
+                """extra_data:{
+                            'scores': predict scores,
+                            'nms_keep_idx': bbox index before nms,
+                           }
+                           """
+                extra_data['scores'] = yolo_head_outs[0]  # predict scores (probability)
+                extra_data['nms_keep_idx'] = nms_keep_idx
+                output = {'bbox': bbox, 'bbox_num': bbox_num, 'extra_data': extra_data}
+            else:
+                output = {'bbox': bbox, 'bbox_num': bbox_num}
+
+            return output
+
+    def get_loss(self):
+        return self._forward()
+
+    def get_pred(self):
+        return self._forward()
+
+    def get_loss_keys(self):
+        return ['loss_cls', 'loss_iou', 'loss_dfl', 'loss_contrast']
+
+    def get_ssod_loss(self, student_head_outs, teacher_head_outs, train_cfg):
+        ssod_losses = self.ssod_loss(student_head_outs, teacher_head_outs,
+                                     train_cfg)
+        return ssod_losses
+
+
+@register
+class PPYOLOEWithAuxHead(BaseArch):
+    __category__ = 'architecture'
+    __inject__ = ['post_process']
+
+    def __init__(self,
+                 backbone='CSPResNet',
+                 neck='CustomCSPPAN',
+                 yolo_head='PPYOLOEHead',
+                 aux_head='SimpleConvHead',
+                 post_process='BBoxPostProcess',
+                 for_mot=False,
+                 detach_epoch=5):
+        """
+        PPYOLOE network, see https://arxiv.org/abs/2203.16250
+
+        Args:
+            backbone (nn.Layer): backbone instance
+            neck (nn.Layer): neck instance
+            yolo_head (nn.Layer): anchor_head instance
+            post_process (object): `BBoxPostProcess` instance
+            for_mot (bool): whether to return other features for multi-object tracking
+                models, default False in pure object detection models.
+        """
+        super(PPYOLOEWithAuxHead, self).__init__()
+        self.backbone = backbone
+        self.neck = neck
+        self.aux_neck = copy.deepcopy(self.neck)
+
+        self.yolo_head = yolo_head
+        self.aux_head = aux_head
+        self.post_process = post_process
+        self.for_mot = for_mot
+        self.detach_epoch = detach_epoch
+
+    @classmethod
+    def from_config(cls, cfg, *args, **kwargs):
+        # backbone
+        backbone = create(cfg['backbone'])
+
+        # fpn
+        kwargs = {'input_shape': backbone.out_shape}
+        neck = create(cfg['neck'], **kwargs)
+        aux_neck = copy.deepcopy(neck)
+
+        # head
+        kwargs = {'input_shape': neck.out_shape}
+        yolo_head = create(cfg['yolo_head'], **kwargs)
+        aux_head = create(cfg['aux_head'], **kwargs)
+
+        return {
+            'backbone': backbone,
+            'neck': neck,
+            "yolo_head": yolo_head,
+            'aux_head': aux_head,
+        }
+
+    def _forward(self):
+        body_feats = self.backbone(self.inputs)
+        neck_feats = self.neck(body_feats, self.for_mot)
+
+        if self.training:
+            if self.inputs['epoch_id'] >= self.detach_epoch:
+                aux_neck_feats = self.aux_neck([f.detach() for f in body_feats])
+                dual_neck_feats = (paddle.concat(
+                    [f.detach(), aux_f], axis=1) for f, aux_f in
+                                   zip(neck_feats, aux_neck_feats))
+            else:
+                aux_neck_feats = self.aux_neck(body_feats)
+                dual_neck_feats = (paddle.concat(
+                    [f, aux_f], axis=1) for f, aux_f in
+                                   zip(neck_feats, aux_neck_feats))
+            aux_cls_scores, aux_bbox_preds = self.aux_head(dual_neck_feats)
+            loss = self.yolo_head(
+                neck_feats,
+                self.inputs,
+                aux_pred=[aux_cls_scores, aux_bbox_preds])
+            return loss
+        else:
+            yolo_head_outs = self.yolo_head(neck_feats)
+
+            if self.post_process is not None:
+                bbox, bbox_num, nms_keep_idx = self.post_process(
+                    yolo_head_outs, self.yolo_head.mask_anchors,
+                    self.inputs['im_shape'], self.inputs['scale_factor'])
+            else:
+                bbox, bbox_num, nms_keep_idx = self.yolo_head.post_process(
+                    yolo_head_outs, self.inputs['scale_factor'])
+
+            if self.use_extra_data:
+                extra_data = {}  # record the bbox output before nms, such as scores and nms_keep_idx
+                """extra_data:{
+                            'scores': predict scores,
+                            'nms_keep_idx': bbox index before nms,
+                           }
+                           """
+                extra_data['scores'] = yolo_head_outs[0]  # predict scores (probability)
+                # Todo: get logits output
+                extra_data['nms_keep_idx'] = nms_keep_idx
+                output = {'bbox': bbox, 'bbox_num': bbox_num, 'extra_data': extra_data}
+            else:
+                output = {'bbox': bbox, 'bbox_num': bbox_num}
+
+            return output
+
+    def get_loss(self):
+        return self._forward()
+
+    def get_pred(self):
+        return self._forward()
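The use_extra_data switch introduced on BaseArch is consumed the same way across the detectors touched in this commit; below is a toy sketch (the score layout and index values are assumptions, not taken from a specific head) of relating the recorded pre-NMS scores to nms_keep_idx.

import paddle

# pretend extra_data from get_pred(): per-class scores before NMS plus the kept indices
scores = paddle.rand([1, 80, 1000])           # [batch, num_classes, num_anchors] (assumed layout)
nms_keep_idx = paddle.to_tensor([3, 17, 42])  # toy indices of boxes kept by NMS

kept_scores = paddle.index_select(scores, nms_keep_idx, axis=2)  # [1, 80, 3]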

+ 104 - 0
paddlers/models/ppdet/modeling/architectures/queryinst.py

@@ -0,0 +1,104 @@
+# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import paddle
+
+from paddlers.models.ppdet.core.workspace import register, create
+from .meta_arch import BaseArch
+
+__all__ = ['QueryInst']
+
+
+@register
+class QueryInst(BaseArch):
+    __category__ = 'architecture'
+    __inject__ = ['post_process']
+
+    def __init__(self,
+                 backbone,
+                 neck,
+                 rpn_head,
+                 roi_head,
+                 post_process='SparsePostProcess'):
+        super(QueryInst, self).__init__()
+        self.backbone = backbone
+        self.neck = neck
+        self.rpn_head = rpn_head
+        self.roi_head = roi_head
+        self.post_process = post_process
+
+    @classmethod
+    def from_config(cls, cfg, *args, **kwargs):
+        backbone = create(cfg['backbone'])
+        kwargs = {'input_shape': backbone.out_shape}
+        neck = create(cfg['neck'], **kwargs)
+
+        kwargs = {'input_shape': neck.out_shape}
+        rpn_head = create(cfg['rpn_head'], **kwargs)
+        roi_head = create(cfg['roi_head'], **kwargs)
+
+        return {
+            'backbone': backbone,
+            'neck': neck,
+            'rpn_head': rpn_head,
+            "roi_head": roi_head
+        }
+
+    def _forward(self, targets=None):
+        features = self.backbone(self.inputs)
+        features = self.neck(features)
+
+        proposal_bboxes, proposal_features = self.rpn_head(self.inputs[
+            'img_whwh'])
+        outputs = self.roi_head(features, proposal_bboxes, proposal_features,
+                                targets)
+
+        if self.training:
+            return outputs
+        else:
+            bbox_pred, bbox_num, mask_pred = self.post_process(
+                outputs['class_logits'], outputs['bbox_pred'],
+                self.inputs['scale_factor_whwh'], self.inputs['ori_shape'],
+                outputs['mask_logits'])
+            return bbox_pred, bbox_num, mask_pred
+
+    def get_loss(self):
+        targets = []
+        for i in range(len(self.inputs['img_whwh'])):
+            boxes = self.inputs['gt_bbox'][i]
+            labels = self.inputs['gt_class'][i].squeeze(-1)
+            img_whwh = self.inputs['img_whwh'][i]
+            if boxes.shape[0] != 0:
+                img_whwh_tgt = img_whwh.unsqueeze(0).tile([boxes.shape[0], 1])
+            else:
+                img_whwh_tgt = paddle.zeros_like(boxes)
+            gt_segm = self.inputs['gt_segm'][i].astype('float32')
+            targets.append({
+                'boxes': boxes,
+                'labels': labels,
+                'img_whwh': img_whwh,
+                'img_whwh_tgt': img_whwh_tgt,
+                'gt_segm': gt_segm
+            })
+        losses = self._forward(targets)
+        losses.update({'loss': sum(losses.values())})
+        return losses
+
+    def get_pred(self):
+        bbox_pred, bbox_num, mask_pred = self._forward()
+        return {'bbox': bbox_pred, 'bbox_num': bbox_num, 'mask': mask_pred}

+ 18 - 2
paddlers/models/ppdet/modeling/architectures/retinanet.py

@@ -19,6 +19,7 @@ from __future__ import print_function
 from paddlers.models.ppdet.core.workspace import register, create
 from .meta_arch import BaseArch
 import paddle
+import paddle.nn.functional as F

 __all__ = ['RetinaNet']

@@ -57,9 +58,24 @@ class RetinaNet(BaseArch):
             return self.head(neck_feats, self.inputs)
         else:
             head_outs = self.head(neck_feats)
-            bbox, bbox_num = self.head.post_process(
+            bbox, bbox_num, nms_keep_idx = self.head.post_process(
                 head_outs, self.inputs['im_shape'], self.inputs['scale_factor'])
-            return {'bbox': bbox, 'bbox_num': bbox_num}
+
+            if self.use_extra_data:
+                extra_data = {}  # record the bbox output before nms, such as scores and nms_keep_idx
+                """extra_data:{
+                            'scores': predict scores,
+                            'nms_keep_idx': bbox index before nms,
+                           }
+                           """
+                preds_logits = self.head.decode_cls_logits(head_outs[0])
+                preds_scores = F.sigmoid(preds_logits)
+                extra_data['logits'] = preds_logits
+                extra_data['scores'] = preds_scores
+                extra_data['nms_keep_idx'] = nms_keep_idx  # bbox index before nms
+                return {'bbox': bbox, 'bbox_num': bbox_num, "extra_data": extra_data}
+            else:
+                return {'bbox': bbox, 'bbox_num': bbox_num}
 
 
     def get_loss(self):
     def get_loss(self):
         return self._forward()
+ 3 - 3
paddlers/models/ppdet/modeling/architectures/sparse_rcnn.py

@@ -60,10 +60,10 @@ class SparseRCNN(BaseArch):
         head_outs = self.head(fpn_feats, self.inputs["img_whwh"])
         head_outs = self.head(fpn_feats, self.inputs["img_whwh"])
 
 
         if not self.training:
         if not self.training:
-            bboxes = self.postprocess(
+            bbox_pred, bbox_num = self.postprocess(
                 head_outs["pred_logits"], head_outs["pred_boxes"],
                 head_outs["pred_logits"], head_outs["pred_boxes"],
-                self.inputs["scale_factor_wh"], self.inputs["img_whwh"])
-            return bboxes
+                self.inputs["scale_factor_whwh"], self.inputs["ori_shape"])
+            return bbox_pred, bbox_num
         else:
         else:
             return head_outs
             return head_outs
 
 

+ 35 - 9
paddlers/models/ppdet/modeling/architectures/ssd.py

@@ -18,6 +18,8 @@ from __future__ import print_function
 
 
 from paddlers.models.ppdet.core.workspace import register, create
 from .meta_arch import BaseArch
+import paddle
+import paddle.nn.functional as F

 __all__ = ['SSD']

@@ -75,18 +77,42 @@ class SSD(BaseArch):
                                  self.inputs['gt_class'])
         else:
             preds, anchors = self.ssd_head(body_feats, self.inputs['image'])
-            bbox, bbox_num = self.post_process(preds, anchors,
-                                               self.inputs['im_shape'],
-                                               self.inputs['scale_factor'])
-            return bbox, bbox_num
+            bbox, bbox_num, nms_keep_idx = self.post_process(
+                preds, anchors, self.inputs['im_shape'],
+                self.inputs['scale_factor'])
+
+            if self.use_extra_data:
+                extra_data = {}  # record the bbox output before nms, such as scores and nms_keep_idx
+                """extra_data:{
+                            'scores': predict scores,
+                            'nms_keep_idx': bbox index before nms,
+                           }
+                           """
+                preds_logits = preds[1]  # [[1xNumBBoxNumClass]]
+                extra_data['scores'] = F.softmax(paddle.concat(
+                    preds_logits, axis=1)).transpose([0, 2, 1])
+                extra_data['logits'] = paddle.concat(
+                    preds_logits, axis=1).transpose([0, 2, 1])
+                extra_data['nms_keep_idx'] = nms_keep_idx  # bbox index before nms
+                return bbox, bbox_num, extra_data
+            else:
+                return bbox, bbox_num
 
 
     def get_loss(self, ):
         return {"loss": self._forward()}

     def get_pred(self):
-        bbox_pred, bbox_num = self._forward()
-        output = {
-            "bbox": bbox_pred,
-            "bbox_num": bbox_num,
-        }
+        if self.use_extra_data:
+            bbox_pred, bbox_num, extra_data = self._forward()
+            output = {
+                "bbox": bbox_pred,
+                "bbox_num": bbox_num,
+                "extra_data": extra_data
+            }
+        else:
+            bbox_pred, bbox_num = self._forward()
+            output = {
+                "bbox": bbox_pred,
+                "bbox_num": bbox_num,
+            }
         return output

+ 28 - 5
paddlers/models/ppdet/modeling/architectures/yolo.py

@@ -21,6 +21,8 @@ from .meta_arch import BaseArch
 from ..post_process import JDEBBoxPostProcess

 __all__ = ['YOLOv3']
+# YOLOv3, PP-YOLO, PP-YOLOv2, PP-YOLOE and PP-YOLOE+ use the same architecture as YOLOv3
+# PP-YOLOE and PP-YOLOE+ are recommended to use the PPYOLOE architecture in ppyoloe.py, especially when using distillation or an aux head


 @register
@@ -77,7 +79,10 @@ class YOLOv3(BaseArch):

     def _forward(self):
         body_feats = self.backbone(self.inputs)
-        neck_feats = self.neck(body_feats, self.for_mot)
+        if self.for_mot:
+            neck_feats = self.neck(body_feats, self.for_mot)
+        else:
+            neck_feats = self.neck(body_feats)

         if isinstance(neck_feats, dict):
             assert self.for_mot == True
@@ -96,6 +101,7 @@ class YOLOv3(BaseArch):
             yolo_head_outs = self.yolo_head(neck_feats)

             if self.for_mot:
+                # the detection part of JDE MOT model
                 boxes_idx, bbox, bbox_num, nms_keep_idx = self.post_process(
                     yolo_head_outs, self.yolo_head.mask_anchors)
                 output = {
@@ -107,16 +113,33 @@ class YOLOv3(BaseArch):
                 }
             else:
                 if self.return_idx:
-                    _, bbox, bbox_num, _ = self.post_process(
+                    # the detection part of JDE MOT model
+                    _, bbox, bbox_num, nms_keep_idx = self.post_process(
                         yolo_head_outs, self.yolo_head.mask_anchors)
                 elif self.post_process is not None:
-                    bbox, bbox_num = self.post_process(
+                    # anchor based YOLOs: YOLOv3,PP-YOLO,PP-YOLOv2 use mask_anchors
+                    bbox, bbox_num, nms_keep_idx = self.post_process(
                         yolo_head_outs, self.yolo_head.mask_anchors,
                         self.inputs['im_shape'], self.inputs['scale_factor'])
                 else:
-                    bbox, bbox_num = self.yolo_head.post_process(
+                    # anchor free YOLOs: PP-YOLOE, PP-YOLOE+
+                    bbox, bbox_num, nms_keep_idx = self.yolo_head.post_process(
                         yolo_head_outs, self.inputs['scale_factor'])
-                output = {'bbox': bbox, 'bbox_num': bbox_num}
+
+                if self.use_extra_data:
+                    extra_data = {}  # record the bbox output before nms, such as scores and nms_keep_idx
+                    """extra_data:{
+                                'scores': predict scores,
+                                'nms_keep_idx': bbox index before nms,
+                               }
+                    """
+                    extra_data['scores'] = yolo_head_outs[0]  # predict scores (probability)
+                    # Todo: get logits output
+                    extra_data['nms_keep_idx'] = nms_keep_idx
+                    # Todo support for mask_anchors yolo
+                    output = {'bbox': bbox, 'bbox_num': bbox_num, 'extra_data': extra_data}
+                else:
+                    output = {'bbox': bbox, 'bbox_num': bbox_num}
 
 
             return output
 
 

+ 88 - 0
paddlers/models/ppdet/modeling/architectures/yolof.py

@@ -0,0 +1,88 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. 
+#   
+# Licensed under the Apache License, Version 2.0 (the "License");   
+# you may not use this file except in compliance with the License.  
+# You may obtain a copy of the License at   
+#   
+#     http://www.apache.org/licenses/LICENSE-2.0    
+#   
+# Unless required by applicable law or agreed to in writing, software   
+# distributed under the License is distributed on an "AS IS" BASIS, 
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.  
+# See the License for the specific language governing permissions and   
+# limitations under the License.
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from paddlers.models.ppdet.core.workspace import register, create
+from .meta_arch import BaseArch
+
+__all__ = ['YOLOF']
+
+
+@register
+class YOLOF(BaseArch):
+    __category__ = 'architecture'
+
+    def __init__(self,
+                 backbone='ResNet',
+                 neck='DilatedEncoder',
+                 head='YOLOFHead',
+                 for_mot=False):
+        """
+        YOLOF network, see https://arxiv.org/abs/2103.09460
+
+        Args:
+            backbone (nn.Layer): backbone instance
+            neck (nn.Layer): DilatedEncoder instance
+            head (nn.Layer): YOLOFHead instance
+            for_mot (bool): whether to return other features for multi-object tracking
+                models, default False in pure object detection models.
+        """
+        super(YOLOF, self).__init__()
+        self.backbone = backbone
+        self.neck = neck
+        self.head = head
+        self.for_mot = for_mot
+
+    @classmethod
+    def from_config(cls, cfg, *args, **kwargs):
+        # backbone
+        backbone = create(cfg['backbone'])
+
+        # fpn
+        kwargs = {'input_shape': backbone.out_shape}
+        neck = create(cfg['neck'], **kwargs)
+
+        # head
+        kwargs = {'input_shape': neck.out_shape}
+        head = create(cfg['head'], **kwargs)
+
+        return {
+            'backbone': backbone,
+            'neck': neck,
+            "head": head,
+        }
+
+    def _forward(self):
+        body_feats = self.backbone(self.inputs)
+        neck_feats = self.neck(body_feats, self.for_mot)
+
+        if self.training:
+            yolo_losses = self.head(neck_feats, self.inputs)
+            return yolo_losses
+        else:
+            yolo_head_outs = self.head(neck_feats)
+            bbox, bbox_num = self.head.post_process(yolo_head_outs,
+                                                    self.inputs['im_shape'],
+                                                    self.inputs['scale_factor'])
+            output = {'bbox': bbox, 'bbox_num': bbox_num}
+            return output
+
+    def get_loss(self):
+        return self._forward()
+
+    def get_pred(self):
+        return self._forward()

+ 10 - 0
paddlers/models/ppdet/modeling/assigners/__init__.py

@@ -17,9 +17,19 @@ from . import task_aligned_assigner
 from . import atss_assigner
 from . import simota_assigner
 from . import max_iou_assigner
+from . import fcosr_assigner
+from . import rotated_task_aligned_assigner
+from . import task_aligned_assigner_cr
+from . import uniform_assigner

 from .utils import *
 from .task_aligned_assigner import *
 from .atss_assigner import *
 from .simota_assigner import *
 from .max_iou_assigner import *
+from .fcosr_assigner import *
+from .rotated_task_aligned_assigner import *
+from .task_aligned_assigner_cr import *
+from .uniform_assigner import *
+from .hungarian_assigner import *
+from .pose_utils import *

+ 16 - 6
paddlers/models/ppdet/modeling/assigners/atss_assigner.py

@@ -1,4 +1,4 @@
-# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -41,12 +41,14 @@ class ATSSAssigner(nn.Layer):
                  topk=9,
                  num_classes=80,
                  force_gt_matching=False,
-                 eps=1e-9):
+                 eps=1e-9,
+                 sm_use=False):
         super(ATSSAssigner, self).__init__()
         self.topk = topk
         self.num_classes = num_classes
         self.force_gt_matching = force_gt_matching
         self.eps = eps
+        self.sm_use = sm_use
 
 
     def _gather_topk_pyramid(self, gt2anchor_distances, num_anchors_list,
                              pad_gt_mask):
@@ -124,7 +126,8 @@ class ATSSAssigner(nn.Layer):
             assigned_bboxes = paddle.zeros([batch_size, num_anchors, 4])
             assigned_scores = paddle.zeros(
                 [batch_size, num_anchors, self.num_classes])
-            return assigned_labels, assigned_bboxes, assigned_scores
+            mask_positive = paddle.zeros([batch_size, 1, num_anchors])
+            return assigned_labels, assigned_bboxes, assigned_scores, mask_positive

         # 1. compute iou between gt and anchor bbox, [B, n, L]
         ious = iou_similarity(gt_bboxes.reshape([-1, 4]), anchor_bboxes)
@@ -154,7 +157,11 @@ class ATSSAssigner(nn.Layer):
                                   paddle.zeros_like(is_in_topk))

         # 6. check the positive sample's center in gt, [B, n, L]
-        is_in_gts = check_points_inside_bboxes(anchor_centers, gt_bboxes)
+        if self.sm_use:
+            is_in_gts = check_points_inside_bboxes(
+                anchor_centers, gt_bboxes, sm_use=True)
+        else:
+            is_in_gts = check_points_inside_bboxes(anchor_centers, gt_bboxes)

         # select positive sample, [B, n, L]
         mask_positive = is_in_topk * is_in_gts * pad_gt_mask
@@ -165,7 +172,10 @@ class ATSSAssigner(nn.Layer):
         if mask_positive_sum.max() > 1:
             mask_multiple_gts = (mask_positive_sum.unsqueeze(1) > 1).tile(
                 [1, num_max_boxes, 1])
-            is_max_iou = compute_max_iou_anchor(ious)
+            if self.sm_use:
+                is_max_iou = compute_max_iou_anchor(ious * mask_positive)
+            else:
+                is_max_iou = compute_max_iou_anchor(ious)
             mask_positive = paddle.where(mask_multiple_gts, is_max_iou,
                                          mask_positive)
             mask_positive_sum = mask_positive.sum(axis=-2)
@@ -212,4 +222,4 @@ class ATSSAssigner(nn.Layer):
                                          paddle.zeros_like(gather_scores))
             assigned_scores *= gather_scores.unsqueeze(-1)
 
 
-        return assigned_labels, assigned_bboxes, assigned_scores
+        return assigned_labels, assigned_bboxes, assigned_scores, mask_positive
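A toy paddle sketch of the sm_use tie-break added above: when one anchor is a positive candidate for several gts, the argmax-IoU gt is picked only among the masked candidates (compute_max_iou_anchor is approximated here by a plain max comparison, values are made up):

import paddle

ious = paddle.to_tensor([[[0.6, 0.2], [0.5, 0.4]]])       # [B=1, n=2 gts, L=2 anchors]
mask_positive = paddle.to_tensor([[[0., 1.], [1., 1.]]])  # candidate mask per gt/anchor

masked = ious * mask_positive                              # sm_use=True: restrict to candidates
is_max_iou = (masked == masked.max(axis=1, keepdim=True)).astype('float32')
# anchor 0 is resolved to gt 1 (0.5) even though its raw best IoU was gt 0 (0.6)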

+ 227 - 0
paddlers/models/ppdet/modeling/assigners/fcosr_assigner.py

@@ -0,0 +1,227 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+import paddle
+import paddle.nn as nn
+import paddle.nn.functional as F
+
+from paddlers.models.ppdet.core.workspace import register
+from paddlers.models.ppdet.modeling.rbox_utils import box2corners, check_points_in_polys, paddle_gather
+
+__all__ = ['FCOSRAssigner']
+
+EPS = 1e-9
+
+
+@register
+class FCOSRAssigner(nn.Layer):
+    """ FCOSR Assigner, refer to https://arxiv.org/abs/2111.10780 for details
+
+    1. compute the normalized Gaussian distribution score and the refined Gaussian distribution score
+    2. following ellipse center sampling, sample the points whose normalized Gaussian distribution score is greater than the threshold
+    3. following multi-level sampling, assign the ground truth to the feature map that satisfies two conditions:
+        i). first, the ratio between the short edge of the target and the stride of the feature map is less than 2.
+        ii). second, the long edge of the minimum bounding rectangle of the target is larger than the acceptance range of the feature map
+    4. following fuzzy sample label assignment, the points satisfying 2 and 3 are assigned to the ground truth according to the Gaussian distribution score
+    """
+    __shared__ = ['num_classes']
+
+    def __init__(self,
+                 num_classes=80,
+                 factor=12,
+                 threshold=0.23,
+                 boundary=[[-1, 128], [128, 320], [320, 10000]],
+                 score_type='iou'):
+        super(FCOSRAssigner, self).__init__()
+        self.num_classes = num_classes
+        self.factor = factor
+        self.threshold = threshold
+        self.boundary = [
+            paddle.to_tensor(
+                l, dtype=paddle.float32).reshape([1, 1, 2]) for l in boundary
+        ]
+        self.score_type = score_type
+
+    def get_gaussian_distribution_score(self, points, gt_rboxes, gt_polys):
+        # projecting points to coordinate system defined by each rbox
+        # [B, N, 4, 2] -> 4 * [B, N, 1, 2]
+        a, b, c, d = gt_polys.split(4, axis=2)
+        # [1, L, 2] -> [1, 1, L, 2]
+        points = points.unsqueeze(0)
+        ab = b - a
+        ad = d - a
+        # [B, N, 5] -> [B, N, 2], [B, N, 2], [B, N, 1]
+        xy, wh, angle = gt_rboxes.split([2, 2, 1], axis=-1)
+        # [B, N, 2] -> [B, N, 1, 2]
+        xy = xy.unsqueeze(2)
+        # vector of points to center [B, N, L, 2]
+        vec = points - xy
+        # <ab, vec> = |ab| * |vec| * cos(theta) [B, N, L]
+        vec_dot_ab = paddle.sum(vec * ab, axis=-1)
+        # <ad, vec> = |ad| * |vec| * cos(theta) [B, N, L]
+        vec_dot_ad = paddle.sum(vec * ad, axis=-1)
+        # norm_ab [B, N, L]
+        norm_ab = paddle.sum(ab * ab, axis=-1).sqrt()
+        # norm_ad [B, N, L]
+        norm_ad = paddle.sum(ad * ad, axis=-1).sqrt()
+        # min(h, w), [B, N, 1]
+        min_edge = paddle.min(wh, axis=-1, keepdim=True)
+        # delta_x, delta_y [B, N, L]
+        delta_x = vec_dot_ab.pow(2) / (norm_ab.pow(3) * min_edge + EPS)
+        delta_y = vec_dot_ad.pow(2) / (norm_ad.pow(3) * min_edge + EPS)
+        # score [B, N, L]
+        norm_score = paddle.exp(-0.5 * self.factor * (delta_x + delta_y))
+
+        # simplified calculation
+        sigma = min_edge / self.factor
+        refined_score = norm_score / (2 * np.pi * sigma + EPS)
+        return norm_score, refined_score
+
+    def get_rotated_inside_mask(self, points, gt_polys, scores):
+        inside_mask = check_points_in_polys(points, gt_polys)
+        center_mask = scores >= self.threshold
+        return (inside_mask & center_mask).cast(paddle.float32)
+
+    def get_inside_range_mask(self, points, gt_bboxes, gt_rboxes, stride_tensor,
+                              regress_range):
+        # [1, L, 2] -> [1, 1, L, 2]
+        points = points.unsqueeze(0)
+        # [B, n, 4] -> [B, n, 1, 4]
+        x1y1, x2y2 = gt_bboxes.unsqueeze(2).split(2, axis=-1)
+        # [B, n, L, 2]
+        lt = points - x1y1
+        rb = x2y2 - points
+        # [B, n, L, 4]
+        ltrb = paddle.concat([lt, rb], axis=-1)
+        # [B, n, L, 4] -> [B, n, L]
+        inside_mask = paddle.min(ltrb, axis=-1) > EPS
+        # regress_range [1, L, 2] -> [1, 1, L, 2]
+        regress_range = regress_range.unsqueeze(0)
+        # stride_tensor [1, L, 1] -> [1, 1, L]
+        stride_tensor = stride_tensor.transpose((0, 2, 1))
+        # fcos range
+        # [B, n, L, 4] -> [B, n, L]
+        ltrb_max = paddle.max(ltrb, axis=-1)
+        # [1, 1, L, 2] -> [1, 1, L]
+        low, high = regress_range[..., 0], regress_range[..., 1]
+        # [B, n, L]
+        regress_mask = (ltrb_max >= low) & (ltrb_max <= high)
+        # mask for rotated
+        # [B, n, 1]
+        min_edge = paddle.min(gt_rboxes[..., 2:4], axis=-1, keepdim=True)
+        # [B, n , L]
+        rotated_mask = ((min_edge / stride_tensor) < 2.0) & (ltrb_max > high)
+        mask = inside_mask & (regress_mask | rotated_mask)
+        return mask.cast(paddle.float32)
+
+    @paddle.no_grad()
+    def forward(self,
+                anchor_points,
+                stride_tensor,
+                num_anchors_list,
+                gt_labels,
+                gt_bboxes,
+                gt_rboxes,
+                pad_gt_mask,
+                bg_index,
+                pred_rboxes=None):
+        r"""
+
+        Args:
+            anchor_points (Tensor, float32): pre-defined anchor points, shape(1, L, 2),
+                    "x, y" format
+            stride_tensor (Tensor, float32): stride tensor, shape (1, L, 1)
+            num_anchors_list (List): num of anchors in each level
+            gt_labels (Tensor, int64|int32): Label of gt_bboxes, shape(B, n, 1)
+            gt_bboxes (Tensor, float32): Ground truth bboxes, shape(B, n, 4)
+            gt_rboxes (Tensor, float32): Ground truth bboxes, shape(B, n, 5)
+            pad_gt_mask (Tensor, float32): 1 means bbox, 0 means no bbox, shape(B, n, 1)
+            bg_index (int): background index
+            pred_rboxes (Tensor, float32, optional): predicted bounding boxes, shape(B, L, 5)
+        Returns:
+            assigned_labels (Tensor): (B, L)
+            assigned_rboxes (Tensor): (B, L, 5)
+            assigned_scores (Tensor): (B, L, C), if pred_rboxes is not None, then output ious
+        """
+
+        _, num_anchors, _ = anchor_points.shape
+        batch_size, num_max_boxes, _ = gt_rboxes.shape
+        if num_max_boxes == 0:
+            assigned_labels = paddle.full(
+                [batch_size, num_anchors], bg_index, dtype=gt_labels.dtype)
+            assigned_rboxes = paddle.zeros([batch_size, num_anchors, 5])
+            assigned_scores = paddle.zeros(
+                [batch_size, num_anchors, self.num_classes])
+            return assigned_labels, assigned_rboxes, assigned_scores
+
+        # get normalized gaussian distribution score and refined distribution score
+        gt_polys = box2corners(gt_rboxes)
+        score, refined_score = self.get_gaussian_distribution_score(
+            anchor_points, gt_rboxes, gt_polys)
+        inside_mask = self.get_rotated_inside_mask(anchor_points, gt_polys,
+                                                   score)
+        regress_ranges = []
+        for num, bound in zip(num_anchors_list, self.boundary):
+            regress_ranges.append(bound.tile((1, num, 1)))
+        regress_ranges = paddle.concat(regress_ranges, axis=1)
+        regress_mask = self.get_inside_range_mask(
+            anchor_points, gt_bboxes, gt_rboxes, stride_tensor, regress_ranges)
+        # [B, n, L]
+        mask_positive = inside_mask * regress_mask * pad_gt_mask
+        refined_score = refined_score * mask_positive - (1. - mask_positive)
+
+        argmax_refined_score = refined_score.argmax(axis=-2)
+        max_refined_score = refined_score.max(axis=-2)
+        assigned_gt_index = argmax_refined_score
+
+        # assigned target
+        batch_ind = paddle.arange(
+            end=batch_size, dtype=gt_labels.dtype).unsqueeze(-1)
+        assigned_gt_index = assigned_gt_index + batch_ind * num_max_boxes
+        assigned_labels = paddle.gather(
+            gt_labels.flatten(), assigned_gt_index.flatten(), axis=0)
+        assigned_labels = assigned_labels.reshape([batch_size, num_anchors])
+        assigned_labels = paddle.where(
+            max_refined_score > 0, assigned_labels,
+            paddle.full_like(assigned_labels, bg_index))
+
+        assigned_rboxes = paddle.gather(
+            gt_rboxes.reshape([-1, 5]), assigned_gt_index.flatten(), axis=0)
+        assigned_rboxes = assigned_rboxes.reshape([batch_size, num_anchors, 5])
+
+        assigned_scores = F.one_hot(assigned_labels, self.num_classes + 1)
+        ind = list(range(self.num_classes + 1))
+        ind.remove(bg_index)
+        assigned_scores = paddle.index_select(
+            assigned_scores, paddle.to_tensor(ind), axis=-1)
+
+        if self.score_type == 'gaussian':
+            selected_scores = paddle_gather(
+                score, 1, argmax_refined_score.unsqueeze(-2)).squeeze(-2)
+            assigned_scores = assigned_scores * selected_scores.unsqueeze(-1)
+        elif self.score_type == 'iou':
+            assert pred_rboxes is not None, 'If score type is iou, pred_rboxes should not be None'
+            from ext_op import matched_rbox_iou
+            b, l = pred_rboxes.shape[:2]
+            iou_score = matched_rbox_iou(
+                pred_rboxes.reshape((-1, 5)), assigned_rboxes.reshape(
+                    (-1, 5))).reshape((b, l, 1))
+            assigned_scores = assigned_scores * iou_score
+
+        return assigned_labels, assigned_rboxes, assigned_scores 
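Like the task-aligned assigners later in this change, this assigner turns `assigned_labels` into per-class targets by one-hot encoding over num_classes + 1 channels and then dropping the background channel, so background anchors end up with an all-zero score row. A minimal sketch of that step (class count and labels are illustrative):

import paddle
import paddle.nn.functional as F

num_classes, bg_index = 3, 3                        # background uses the extra channel
assigned_labels = paddle.to_tensor([[0, 2, 3]])     # last anchor is background
onehot = F.one_hot(assigned_labels, num_classes + 1)             # [1, 3, 4]
keep = paddle.to_tensor([i for i in range(num_classes + 1) if i != bg_index])
assigned_scores = paddle.index_select(onehot, keep, axis=-1)     # [1, 3, 3]
# rows: [1, 0, 0], [0, 0, 1], [0, 0, 0] -> the background anchor scores nothing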

+ 316 - 0
paddlers/models/ppdet/modeling/assigners/hungarian_assigner.py

@@ -0,0 +1,316 @@
+# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+try:
+    from scipy.optimize import linear_sum_assignment
+except ImportError:
+    linear_sum_assignment = None
+
+import paddle
+
+from paddlers.models.ppdet.core.workspace import register
+
+__all__ = ['PoseHungarianAssigner', 'PseudoSampler']
+
+
+class AssignResult:
+    """Stores assignments between predicted and truth boxes.
+
+    Attributes:
+        num_gts (int): the number of truth boxes considered when computing this
+            assignment
+
+        gt_inds (LongTensor): for each predicted box indicates the 1-based
+            index of the assigned truth box. 0 means unassigned and -1 means
+            ignore.
+
+        max_overlaps (FloatTensor): the iou between the predicted box and its
+            assigned truth box.
+
+        labels (None | LongTensor): If specified, for each predicted box
+            indicates the category label of the assigned truth box.
+    """
+
+    def __init__(self, num_gts, gt_inds, max_overlaps, labels=None):
+        self.num_gts = num_gts
+        self.gt_inds = gt_inds
+        self.max_overlaps = max_overlaps
+        self.labels = labels
+        # Interface for possible user-defined properties
+        self._extra_properties = {}
+
+    @property
+    def num_preds(self):
+        """int: the number of predictions in this assignment"""
+        return len(self.gt_inds)
+
+    def set_extra_property(self, key, value):
+        """Set user-defined new property."""
+        assert key not in self.info
+        self._extra_properties[key] = value
+
+    def get_extra_property(self, key):
+        """Get user-defined property."""
+        return self._extra_properties.get(key, None)
+
+    @property
+    def info(self):
+        """dict: a dictionary of info about the object"""
+        basic_info = {
+            'num_gts': self.num_gts,
+            'num_preds': self.num_preds,
+            'gt_inds': self.gt_inds,
+            'max_overlaps': self.max_overlaps,
+            'labels': self.labels,
+        }
+        basic_info.update(self._extra_properties)
+        return basic_info
+
+
+@register
+class PoseHungarianAssigner:
+    """Computes one-to-one matching between predictions and ground truth.
+
+    This class computes an assignment between the targets and the predictions
+    based on the costs. The costs are weighted sum of three components:
+    classification cost, regression L1 cost and regression oks cost. The
+    targets don't include the no_object, so generally there are more
+    predictions than targets. After the one-to-one matching, the un-matched
+    are treated as backgrounds. Thus each query prediction will be assigned
+    with `0` or a positive integer indicating the ground truth index:
+
+    - 0: negative sample, no assigned gt.
+    - positive integer: positive sample, index (1-based) of assigned gt.
+
+    Args:
+        cls_weight (int | float, optional): The scale factor for classification
+            cost. Default 1.0.
+        kpt_weight (int | float, optional): The scale factor for regression
+            L1 cost. Default 1.0.
+        oks_weight (int | float, optional): The scale factor for regression
+            oks cost. Default 1.0.
+    """
+    __inject__ = ['cls_cost', 'kpt_cost', 'oks_cost']
+
+    def __init__(self,
+                 cls_cost='ClassificationCost',
+                 kpt_cost='KptL1Cost',
+                 oks_cost='OksCost'):
+        self.cls_cost = cls_cost
+        self.kpt_cost = kpt_cost
+        self.oks_cost = oks_cost
+
+    def assign(self,
+               cls_pred,
+               kpt_pred,
+               gt_labels,
+               gt_keypoints,
+               gt_areas,
+               img_meta,
+               eps=1e-7):
+        """Computes one-to-one matching based on the weighted costs.
+
+        This method assigns each query prediction to a ground truth or
+        background. The `assigned_gt_inds` with -1 means don't care,
+        0 means negative sample, and positive number is the index (1-based)
+        of assigned gt.
+        The assignment is done in the following steps, the order matters.
+
+        1. assign every prediction to -1
+        2. compute the weighted costs
+        3. do Hungarian matching on CPU based on the costs
+        4. assign all to 0 (background) first, then for each matched pair
+           between predictions and gts, treat this prediction as foreground
+           and assign the corresponding gt index (plus 1) to it.
+
+        Args:
+            cls_pred (Tensor): Predicted classification logits, shape
+                [num_query, num_class].
+            kpt_pred (Tensor): Predicted keypoints with normalized coordinates
+                (x_{i}, y_{i}), which are all in range [0, 1]. Shape
+                [num_query, K*2].
+            gt_labels (Tensor): Label of `gt_keypoints`, shape (num_gt,).
+            gt_keypoints (Tensor): Ground truth keypoints with unnormalized
+                coordinates [p^{1}_x, p^{1}_y, p^{1}_v, ..., \
+                    p^{K}_x, p^{K}_y, p^{K}_v]. Shape [num_gt, K*3].
+            gt_areas (Tensor): Ground truth mask areas, shape (num_gt,).
+            img_meta (dict): Meta information for current image.
+            eps (int | float, optional): A value added to the denominator for
+                numerical stability. Default 1e-7.
+
+        Returns:
+            :obj:`AssignResult`: The assigned result.
+        """
+        num_gts, num_kpts = gt_keypoints.shape[0], kpt_pred.shape[0]
+        if not gt_keypoints.astype('bool').any():
+            num_gts = 0
+
+        # 1. assign -1 by default
+        assigned_gt_inds = paddle.full((num_kpts, ), -1, dtype="int64")
+        assigned_labels = paddle.full((num_kpts, ), -1, dtype="int64")
+        if num_gts == 0 or num_kpts == 0:
+            # No ground truth or keypoints, return empty assignment
+            if num_gts == 0:
+                # No ground truth, assign all to background
+                assigned_gt_inds[:] = 0
+            return AssignResult(
+                num_gts, assigned_gt_inds, None, labels=assigned_labels)
+        img_h, img_w, _ = img_meta['img_shape']
+        factor = paddle.to_tensor(
+            [img_w, img_h, img_w, img_h], dtype=gt_keypoints.dtype).reshape(
+                (1, -1))
+
+        # 2. compute the weighted costs
+        # classification cost
+        cls_cost = self.cls_cost(cls_pred, gt_labels)
+
+        # keypoint regression L1 cost
+        gt_keypoints_reshape = gt_keypoints.reshape((gt_keypoints.shape[0], -1,
+                                                     3))
+        valid_kpt_flag = gt_keypoints_reshape[..., -1]
+        kpt_pred_tmp = kpt_pred.clone().detach().reshape((kpt_pred.shape[0], -1,
+                                                          2))
+        normalize_gt_keypoints = gt_keypoints_reshape[
+            ..., :2] / factor[:, :2].unsqueeze(0)
+        kpt_cost = self.kpt_cost(kpt_pred_tmp, normalize_gt_keypoints,
+                                 valid_kpt_flag)
+        # keypoint OKS cost
+        kpt_pred_tmp = kpt_pred.clone().detach().reshape((kpt_pred.shape[0], -1,
+                                                          2))
+        kpt_pred_tmp = kpt_pred_tmp * factor[:, :2].unsqueeze(0)
+        oks_cost = self.oks_cost(kpt_pred_tmp, gt_keypoints_reshape[..., :2],
+                                 valid_kpt_flag, gt_areas)
+        # weighted sum of above three costs
+        cost = cls_cost + kpt_cost + oks_cost
+
+        # 3. do Hungarian matching on CPU using linear_sum_assignment
+        cost = cost.detach().cpu()
+        if linear_sum_assignment is None:
+            raise ImportError('Please run "pip install scipy" '
+                              'to install scipy first.')
+        matched_row_inds, matched_col_inds = linear_sum_assignment(cost)
+        matched_row_inds = paddle.to_tensor(matched_row_inds)
+        matched_col_inds = paddle.to_tensor(matched_col_inds)
+
+        # 4. assign backgrounds and foregrounds
+        # assign all indices to backgrounds first
+        assigned_gt_inds[:] = 0
+        # assign foregrounds based on matching results
+        assigned_gt_inds[matched_row_inds] = matched_col_inds + 1
+        assigned_labels[matched_row_inds] = gt_labels[matched_col_inds][
+            ..., 0].astype("int64")
+        return AssignResult(
+            num_gts, assigned_gt_inds, None, labels=assigned_labels)
+
+
+class SamplingResult:
+    """Bbox sampling result.
+    """
+
+    def __init__(self, pos_inds, neg_inds, bboxes, gt_bboxes, assign_result,
+                 gt_flags):
+        self.pos_inds = pos_inds
+        self.neg_inds = neg_inds
+        if pos_inds.size > 0:
+            self.pos_bboxes = bboxes[pos_inds]
+            self.neg_bboxes = bboxes[neg_inds]
+            self.pos_is_gt = gt_flags[pos_inds]
+
+            self.num_gts = gt_bboxes.shape[0]
+            self.pos_assigned_gt_inds = assign_result.gt_inds[pos_inds] - 1
+
+            if gt_bboxes.numel() == 0:
+                # hack for index error case
+                assert self.pos_assigned_gt_inds.numel() == 0
+                self.pos_gt_bboxes = paddle.zeros(
+                    gt_bboxes.shape, dtype=gt_bboxes.dtype).reshape((-1, 4))
+            else:
+                if len(gt_bboxes.shape) < 2:
+                    gt_bboxes = gt_bboxes.reshape((-1, 4))
+
+                self.pos_gt_bboxes = paddle.index_select(
+                    gt_bboxes,
+                    self.pos_assigned_gt_inds.astype('int64'),
+                    axis=0)
+
+            if assign_result.labels is not None:
+                self.pos_gt_labels = assign_result.labels[pos_inds]
+            else:
+                self.pos_gt_labels = None
+
+    @property
+    def bboxes(self):
+        """paddle.Tensor: concatenated positive and negative boxes"""
+        return paddle.concat([self.pos_bboxes, self.neg_bboxes])
+
+    def __nice__(self):
+        data = self.info.copy()
+        data['pos_bboxes'] = data.pop('pos_bboxes').shape
+        data['neg_bboxes'] = data.pop('neg_bboxes').shape
+        parts = [f"'{k}': {v!r}" for k, v in sorted(data.items())]
+        body = '    ' + ',\n    '.join(parts)
+        return '{\n' + body + '\n}'
+
+    @property
+    def info(self):
+        """Returns a dictionary of info about the object."""
+        return {
+            'pos_inds': self.pos_inds,
+            'neg_inds': self.neg_inds,
+            'pos_bboxes': self.pos_bboxes,
+            'neg_bboxes': self.neg_bboxes,
+            'pos_is_gt': self.pos_is_gt,
+            'num_gts': self.num_gts,
+            'pos_assigned_gt_inds': self.pos_assigned_gt_inds,
+        }
+
+
+@register
+class PseudoSampler:
+    """A pseudo sampler that does not do sampling actually."""
+
+    def __init__(self, **kwargs):
+        pass
+
+    def _sample_pos(self, **kwargs):
+        """Sample positive samples."""
+        raise NotImplementedError
+
+    def _sample_neg(self, **kwargs):
+        """Sample negative samples."""
+        raise NotImplementedError
+
+    def sample(self, assign_result, bboxes, gt_bboxes, *args, **kwargs):
+        """Directly returns the positive and negative indices  of samples.
+
+        Args:
+            assign_result (:obj:`AssignResult`): Assigned results
+            bboxes (paddle.Tensor): Bounding boxes
+            gt_bboxes (paddle.Tensor): Ground truth boxes
+
+        Returns:
+            :obj:`SamplingResult`: sampler results
+        """
+        pos_inds = paddle.nonzero(
+            assign_result.gt_inds > 0, as_tuple=False).squeeze(-1)
+        neg_inds = paddle.nonzero(
+            assign_result.gt_inds == 0, as_tuple=False).squeeze(-1)
+        gt_flags = paddle.zeros([bboxes.shape[0]], dtype='int32')
+        sampling_result = SamplingResult(pos_inds, neg_inds, bboxes, gt_bboxes,
+                                         assign_result, gt_flags)
+        return sampling_result
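PoseHungarianAssigner leaves the actual one-to-one matching to SciPy's linear_sum_assignment; steps 3 and 4 of assign() reduce to the pattern below (the cost values are illustrative):

import numpy as np
from scipy.optimize import linear_sum_assignment

cost = np.array([[0.2, 0.9],          # cost[i, j]: match query i to gt j
                 [0.7, 0.1],
                 [0.5, 0.6]])
rows, cols = linear_sum_assignment(cost)                     # minimizes the total cost
assigned_gt_inds = np.zeros(cost.shape[0], dtype=np.int64)   # 0 = background
assigned_gt_inds[rows] = cols + 1                            # 1-based gt indices
print(assigned_gt_inds)                                      # [1 2 0]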

+ 275 - 0
paddlers/models/ppdet/modeling/assigners/pose_utils.py

@@ -0,0 +1,275 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+import paddle
+import paddle.nn.functional as F
+
+from paddlers.models.ppdet.core.workspace import register
+
+__all__ = ['KptL1Cost', 'OksCost', 'ClassificationCost']
+
+
+def masked_fill(x, mask, value):
+    y = paddle.full(x.shape, value, x.dtype)
+    return paddle.where(mask, y, x)
+
+
+@register
+class KptL1Cost(object):
+    """KptL1Cost.
+
+    this function is based on: https://github.com/hikvision-research/opera/blob/main/opera/core/bbox/match_costs/match_cost.py
+
+    Args:
+        weight (int | float, optional): loss_weight.
+    """
+
+    def __init__(self, weight=1.0):
+        self.weight = weight
+
+    def __call__(self, kpt_pred, gt_keypoints, valid_kpt_flag):
+        """
+        Args:
+            kpt_pred (Tensor): Predicted keypoints with normalized coordinates
+                (x_{i}, y_{i}), which are all in range [0, 1]. Shape
+                [num_query, K, 2].
+            gt_keypoints (Tensor): Ground truth keypoints with normalized
+                coordinates (x_{i}, y_{i}). Shape [num_gt, K, 2].
+            valid_kpt_flag (Tensor): valid flag of ground truth keypoints.
+                Shape [num_gt, K].
+
+        Returns:
+            paddle.Tensor: kpt_cost value with weight.
+        """
+        kpt_cost = []
+        for i in range(len(gt_keypoints)):
+            if gt_keypoints[i].size == 0:
+                kpt_cost.append(kpt_pred.sum() * 0)
+            kpt_pred_tmp = kpt_pred.clone()
+            valid_flag = valid_kpt_flag[i] > 0
+            valid_flag_expand = valid_flag.unsqueeze(0).unsqueeze(-1).expand_as(
+                kpt_pred_tmp)
+            if not valid_flag_expand.all():
+                kpt_pred_tmp = masked_fill(kpt_pred_tmp, ~valid_flag_expand, 0)
+            cost = F.pairwise_distance(
+                kpt_pred_tmp.reshape((kpt_pred_tmp.shape[0], -1)),
+                gt_keypoints[i].reshape((-1, )).unsqueeze(0),
+                p=1,
+                keepdim=True)
+            avg_factor = paddle.clip(
+                valid_flag.astype('float32').sum() * 2, 1.0)
+            cost = cost / avg_factor
+            kpt_cost.append(cost)
+        kpt_cost = paddle.concat(kpt_cost, axis=1)
+        return kpt_cost * self.weight
+
+
+@register
+class OksCost(object):
+    """OksCost.
+
+    this function is based on: https://github.com/hikvision-research/opera/blob/main/opera/core/bbox/match_costs/match_cost.py
+
+    Args:
+        num_keypoints (int): number of keypoints
+        weight (int | float, optional): loss_weight.
+    """
+
+    def __init__(self, num_keypoints=17, weight=1.0):
+        self.weight = weight
+        if num_keypoints == 17:
+            self.sigmas = np.array(
+                [
+                    .26, .25, .25, .35, .35, .79, .79, .72, .72, .62, .62, 1.07,
+                    1.07, .87, .87, .89, .89
+                ],
+                dtype=np.float32) / 10.0
+        elif num_keypoints == 14:
+            self.sigmas = np.array(
+                [
+                    .79, .79, .72, .72, .62, .62, 1.07, 1.07, .87, .87, .89,
+                    .89, .79, .79
+                ],
+                dtype=np.float32) / 10.0
+        else:
+            raise ValueError(f'Unsupported keypoints number {num_keypoints}')
+
+    def __call__(self, kpt_pred, gt_keypoints, valid_kpt_flag, gt_areas):
+        """
+        Args:
+            kpt_pred (Tensor): Predicted keypoints with unnormalized
+                coordinates (x_{i}, y_{i}). Shape [num_query, K, 2].
+            gt_keypoints (Tensor): Ground truth keypoints with unnormalized
+                coordinates (x_{i}, y_{i}). Shape [num_gt, K, 2].
+            valid_kpt_flag (Tensor): valid flag of ground truth keypoints.
+                Shape [num_gt, K].
+            gt_areas (Tensor): Ground truth mask areas. Shape [num_gt,].
+
+        Returns:
+            paddle.Tensor: oks_cost value with weight.
+        """
+        sigmas = paddle.to_tensor(self.sigmas)
+        variances = (sigmas * 2)**2
+
+        oks_cost = []
+        assert len(gt_keypoints) == len(gt_areas)
+        for i in range(len(gt_keypoints)):
+            if gt_keypoints[i].size == 0:
+                oks_cost.append(kpt_pred.sum() * 0)
+            squared_distance = \
+                (kpt_pred[:, :, 0] - gt_keypoints[i, :, 0].unsqueeze(0)) ** 2 + \
+                (kpt_pred[:, :, 1] - gt_keypoints[i, :, 1].unsqueeze(0)) ** 2
+            vis_flag = (valid_kpt_flag[i] > 0).astype('int')
+            vis_ind = vis_flag.nonzero(as_tuple=False)[:, 0]
+            num_vis_kpt = vis_ind.shape[0]
+            # assert num_vis_kpt > 0
+            if num_vis_kpt == 0:
+                oks_cost.append(paddle.zeros((squared_distance.shape[0], 1)))
+                continue
+            area = gt_areas[i]
+
+            squared_distance0 = squared_distance / (area * variances * 2)
+            squared_distance0 = paddle.index_select(
+                squared_distance0, vis_ind, axis=1)
+            squared_distance1 = paddle.exp(-squared_distance0).sum(axis=1,
+                                                                   keepdim=True)
+            oks = squared_distance1 / num_vis_kpt
+            # The 1 is a constant that doesn't change the matching, so omitted.
+            oks_cost.append(-oks)
+        oks_cost = paddle.concat(oks_cost, axis=1)
+        return oks_cost * self.weight
+
+
+@register
+class ClassificationCost:
+    """ClsSoftmaxCost.
+
+     Args:
+         weight (int | float, optional): loss_weight
+    """
+
+    def __init__(self, weight=1.):
+        self.weight = weight
+
+    def __call__(self, cls_pred, gt_labels):
+        """
+        Args:
+            cls_pred (Tensor): Predicted classification logits, shape
+                (num_query, num_class).
+            gt_labels (Tensor): Label of `gt_bboxes`, shape (num_gt,).
+
+        Returns:
+            paddle.Tensor: cls_cost value with weight
+        """
+        # Following the official DETR repo, instead of the NLL used in the
+        # loss, we approximate the cost by 1 - cls_score[gt_label].
+        # The 1 is a constant that doesn't change the matching,
+        # so it can be omitted.
+        cls_score = cls_pred.softmax(-1)
+        cls_cost = -cls_score[:, gt_labels]
+        return cls_cost * self.weight
+
+
+@register
+class FocalLossCost:
+    """FocalLossCost.
+
+     Args:
+         weight (int | float, optional): loss_weight
+         alpha (int | float, optional): focal_loss alpha
+         gamma (int | float, optional): focal_loss gamma
+         eps (float, optional): default 1e-12
+         binary_input (bool, optional): Whether the input is binary,
+            default False.
+    """
+
+    def __init__(self,
+                 weight=1.,
+                 alpha=0.25,
+                 gamma=2,
+                 eps=1e-12,
+                 binary_input=False):
+        self.weight = weight
+        self.alpha = alpha
+        self.gamma = gamma
+        self.eps = eps
+        self.binary_input = binary_input
+
+    def _focal_loss_cost(self, cls_pred, gt_labels):
+        """
+        Args:
+            cls_pred (Tensor): Predicted classification logits, shape
+                (num_query, num_class).
+            gt_labels (Tensor): Label of `gt_bboxes`, shape (num_gt,).
+
+        Returns:
+            paddle.Tensor: cls_cost value with weight
+        """
+        if gt_labels.size == 0:
+            return cls_pred.sum() * 0
+        cls_pred = F.sigmoid(cls_pred)
+        neg_cost = -(1 - cls_pred + self.eps).log() * (
+            1 - self.alpha) * cls_pred.pow(self.gamma)
+        pos_cost = -(cls_pred + self.eps).log() * self.alpha * (
+            1 - cls_pred).pow(self.gamma)
+
+        cls_cost = paddle.index_select(
+            pos_cost, gt_labels, axis=1) - paddle.index_select(
+                neg_cost, gt_labels, axis=1)
+        return cls_cost * self.weight
+
+    def _mask_focal_loss_cost(self, cls_pred, gt_labels):
+        """
+        Args:
+            cls_pred (Tensor): Predicted classification logits
+                in shape (num_query, d1, ..., dn), dtype=paddle.float32.
+            gt_labels (Tensor): Ground truth in shape (num_gt, d1, ..., dn),
+                dtype=paddle.long. Labels should be binary.
+
+        Returns:
+            Tensor: Focal cost matrix with weight in shape\
+                (num_query, num_gt).
+        """
+        cls_pred = cls_pred.flatten(1)
+        gt_labels = gt_labels.flatten(1).float()
+        n = cls_pred.shape[1]
+        cls_pred = F.sigmoid(cls_pred)
+        neg_cost = -(1 - cls_pred + self.eps).log() * (
+            1 - self.alpha) * cls_pred.pow(self.gamma)
+        pos_cost = -(cls_pred + self.eps).log() * self.alpha * (
+            1 - cls_pred).pow(self.gamma)
+
+        cls_cost = paddle.einsum('nc,mc->nm', pos_cost, gt_labels) + \
+            paddle.einsum('nc,mc->nm', neg_cost, (1 - gt_labels))
+        return cls_cost / n * self.weight
+
+    def __call__(self, cls_pred, gt_labels):
+        """
+        Args:
+            cls_pred (Tensor): Predicted classification logits.
+            gt_labels (Tensor): Labels.
+
+        Returns:
+            Tensor: Focal cost matrix with weight in shape\
+                (num_query, num_gt).
+        """
+        if self.binary_input:
+            return self._mask_focal_loss_cost(cls_pred, gt_labels)
+        else:
+            return self._focal_loss_cost(cls_pred, gt_labels)
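OksCost negates the usual OKS similarity so that a better pose match costs less. A minimal NumPy sketch of the term it accumulates for a single gt, assuming two visible keypoints (all numbers are illustrative):

import numpy as np

sigmas = np.array([.26, .25], dtype=np.float32) / 10.0
variances = (sigmas * 2) ** 2                      # per-keypoint falloff, as in OksCost
squared_distance = np.array([[0.4, 0.9]])          # pred-vs-gt squared distances, [num_query, K]
area = 50.0                                        # gt mask area
oks = np.exp(-squared_distance / (area * variances * 2)).sum(axis=1) / 2
oks_cost = -oks                                    # negate: higher OKS -> lower matching cost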

+ 164 - 0
paddlers/models/ppdet/modeling/assigners/rotated_task_aligned_assigner.py

@@ -0,0 +1,164 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import paddle
+import paddle.nn as nn
+import paddle.nn.functional as F
+
+from paddlers.models.ppdet.core.workspace import register
+from ..rbox_utils import rotated_iou_similarity, check_points_in_rotated_boxes
+from .utils import gather_topk_anchors, compute_max_iou_anchor
+
+__all__ = ['RotatedTaskAlignedAssigner']
+
+
+@register
+class RotatedTaskAlignedAssigner(nn.Layer):
+    """TOOD: Task-aligned One-stage Object Detection
+    """
+
+    def __init__(self, topk=13, alpha=1.0, beta=6.0, eps=1e-9):
+        super(RotatedTaskAlignedAssigner, self).__init__()
+        self.topk = topk
+        self.alpha = alpha
+        self.beta = beta
+        self.eps = eps
+
+    @paddle.no_grad()
+    def forward(self,
+                pred_scores,
+                pred_bboxes,
+                anchor_points,
+                num_anchors_list,
+                gt_labels,
+                gt_bboxes,
+                pad_gt_mask,
+                bg_index,
+                gt_scores=None):
+        r"""This code is based on
+            https://github.com/fcjian/TOOD/blob/master/mmdet/core/bbox/assigners/task_aligned_assigner.py
+
+        The assignment is done in following steps
+        1. compute alignment metric between all bbox (bbox of all pyramid levels) and gt
+        2. select top-k bbox as candidates for each gt
+        3. limit the positive sample's center in gt (because the anchor-free detector
+           only can predict positive distance)
+        4. if an anchor box is assigned to multiple gts, the one with the
+           highest iou will be selected.
+        Args:
+            pred_scores (Tensor, float32): predicted class probability, shape(B, L, C)
+            pred_bboxes (Tensor, float32): predicted bounding boxes, shape(B, L, 5)
+            anchor_points (Tensor, float32): pre-defined anchors, shape(1, L, 2), "cxcy" format
+            num_anchors_list (List): num of anchors in each level, shape(L)
+            gt_labels (Tensor, int64|int32): Label of gt_bboxes, shape(B, n, 1)
+            gt_bboxes (Tensor, float32): Ground truth bboxes, shape(B, n, 5)
+            pad_gt_mask (Tensor, float32): 1 means bbox, 0 means no bbox, shape(B, n, 1)
+            bg_index (int): background index
+            gt_scores (Tensor|None, float32) Score of gt_bboxes, shape(B, n, 1)
+        Returns:
+            assigned_labels (Tensor): (B, L)
+            assigned_bboxes (Tensor): (B, L, 5)
+            assigned_scores (Tensor): (B, L, C)
+        """
+        assert pred_scores.ndim == pred_bboxes.ndim
+        assert gt_labels.ndim == gt_bboxes.ndim and \
+               gt_bboxes.ndim == 3
+
+        batch_size, num_anchors, num_classes = pred_scores.shape
+        _, num_max_boxes, _ = gt_bboxes.shape
+
+        # negative batch
+        if num_max_boxes == 0:
+            assigned_labels = paddle.full(
+                [batch_size, num_anchors], bg_index, dtype=gt_labels.dtype)
+            assigned_bboxes = paddle.zeros([batch_size, num_anchors, 5])
+            assigned_scores = paddle.zeros(
+                [batch_size, num_anchors, num_classes])
+            return assigned_labels, assigned_bboxes, assigned_scores
+
+        # compute iou between gt and pred bbox, [B, n, L]
+        ious = rotated_iou_similarity(gt_bboxes, pred_bboxes)
+        ious = paddle.where(ious > 1 + self.eps, paddle.zeros_like(ious), ious)
+        ious.stop_gradient = True
+        # gather pred bboxes class score
+        pred_scores = pred_scores.transpose([0, 2, 1])
+        batch_ind = paddle.arange(
+            end=batch_size, dtype=gt_labels.dtype).unsqueeze(-1)
+        gt_labels_ind = paddle.stack(
+            [batch_ind.tile([1, num_max_boxes]), gt_labels.squeeze(-1)],
+            axis=-1)
+        bbox_cls_scores = paddle.gather_nd(pred_scores, gt_labels_ind)
+        # compute alignment metrics, [B, n, L]
+        alignment_metrics = bbox_cls_scores.pow(self.alpha) * ious.pow(
+            self.beta)
+
+        # check the positive sample's center in gt, [B, n, L]
+        is_in_gts = check_points_in_rotated_boxes(anchor_points, gt_bboxes)
+
+        # select topk largest alignment metrics pred bbox as candidates
+        # for each gt, [B, n, L]
+        is_in_topk = gather_topk_anchors(
+            alignment_metrics * is_in_gts, self.topk, topk_mask=pad_gt_mask)
+
+        # select positive sample, [B, n, L]
+        mask_positive = is_in_topk * is_in_gts * pad_gt_mask
+
+        # if an anchor box is assigned to multiple gts,
+        # the one with the highest iou will be selected, [B, n, L]
+        mask_positive_sum = mask_positive.sum(axis=-2)
+        if mask_positive_sum.max() > 1:
+            mask_multiple_gts = (mask_positive_sum.unsqueeze(1) > 1).tile(
+                [1, num_max_boxes, 1])
+            is_max_iou = compute_max_iou_anchor(ious)
+            mask_positive = paddle.where(mask_multiple_gts, is_max_iou,
+                                         mask_positive)
+            mask_positive_sum = mask_positive.sum(axis=-2)
+        assigned_gt_index = mask_positive.argmax(axis=-2)
+
+        # assigned target
+        assigned_gt_index = assigned_gt_index + batch_ind * num_max_boxes
+        assigned_labels = paddle.gather(
+            gt_labels.flatten(), assigned_gt_index.flatten(), axis=0)
+        assigned_labels = assigned_labels.reshape([batch_size, num_anchors])
+        assigned_labels = paddle.where(
+            mask_positive_sum > 0, assigned_labels,
+            paddle.full_like(assigned_labels, bg_index))
+
+        assigned_bboxes = paddle.gather(
+            gt_bboxes.reshape([-1, 5]), assigned_gt_index.flatten(), axis=0)
+        assigned_bboxes = assigned_bboxes.reshape([batch_size, num_anchors, 5])
+
+        assigned_scores = F.one_hot(assigned_labels, num_classes + 1)
+        ind = list(range(num_classes + 1))
+        ind.remove(bg_index)
+        assigned_scores = paddle.index_select(
+            assigned_scores, paddle.to_tensor(ind), axis=-1)
+        # rescale alignment metrics
+        alignment_metrics *= mask_positive
+        max_metrics_per_instance = alignment_metrics.max(axis=-1, keepdim=True)
+        max_ious_per_instance = (ious * mask_positive).max(axis=-1,
+                                                           keepdim=True)
+        alignment_metrics = alignment_metrics / (
+            max_metrics_per_instance + self.eps) * max_ious_per_instance
+        alignment_metrics = alignment_metrics.max(-2).unsqueeze(-1)
+        assigned_scores = assigned_scores * alignment_metrics
+
+        assigned_bboxes.stop_gradient = True
+        assigned_scores.stop_gradient = True
+        assigned_labels.stop_gradient = True
+        return assigned_labels, assigned_bboxes, assigned_scores
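RotatedTaskAlignedAssigner ranks candidates with the same task-alignment metric as TOOD, t = s**alpha * u**beta, where s is the predicted score of the gt class and u the rotated IoU. A small sketch with the default alpha=1.0, beta=6.0 (values are illustrative):

import paddle

scores = paddle.to_tensor([0.80, 0.30])   # gt-class scores at two candidate anchors
ious = paddle.to_tensor([0.70, 0.90])     # rotated IoU of their predictions with the gt
alpha, beta = 1.0, 6.0
alignment_metrics = scores.pow(alpha) * ious.pow(beta)
# -> approx. [0.094, 0.159]: the higher-IoU anchor wins despite its lower score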

+ 1 - 1
paddlers/models/ppdet/modeling/assigners/simota_assigner.py

@@ -236,7 +236,7 @@ class SimOTAAssigner(object):
         )] = match_fg_mask_inmatrix

         assigned_gt_inds[match_fg_mask_inall.astype(
-            np.bool)] = match_gt_inds_to_fg + 1
+            np.bool_)] = match_gt_inds_to_fg + 1
 
 
         pos_inds, neg_inds, pos_gt_bboxes, pos_assigned_gt_inds \
             = self.get_sample(assigned_gt_inds, gt_bboxes.numpy())
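The np.bool alias was deprecated in NumPy 1.20 and removed in 1.24; np.bool_ is the NumPy scalar type, so the boolean-mask indexing keeps working on newer NumPy releases:

import numpy as np

assigned_gt_inds = np.zeros(4, dtype=np.int64)
match_fg_mask_inall = np.array([0, 1, 0, 1])
assigned_gt_inds[match_fg_mask_inall.astype(np.bool_)] = 7    # boolean-mask assignment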

+ 38 - 4
paddlers/models/ppdet/modeling/assigners/task_aligned_assigner.py

@@ -28,17 +28,47 @@ from .utils import (gather_topk_anchors, check_points_inside_bboxes,
 __all__ = ['TaskAlignedAssigner']


+def is_close_gt(anchor, gt, stride_lst, max_dist=2.0, alpha=2.):
+    """Calculate distance ratio of box1 and box2 in batch for larger stride
+        anchors dist/stride to promote the survive of large distance match
+    Args:
+        anchor (Tensor): box with the shape [L, 2]
+        gt (Tensor): box with the shape [N, M2, 4]
+    Return:
+        dist (Tensor): dist ratio between box1 and box2 with the shape [N, M1, M2]
+    """
+    center1 = anchor.unsqueeze(0)
+    center2 = (gt[..., :2] + gt[..., -2:]) / 2.
+    center1 = center1.unsqueeze(1)  # [N, M1, 2] -> [N, 1, M1, 2]
+    center2 = center2.unsqueeze(2)  # [N, M2, 2] -> [N, M2, 1, 2]
+
+    stride = paddle.concat([
+        paddle.full([x], 32 / pow(2, idx)) for idx, x in enumerate(stride_lst)
+    ]).unsqueeze(0).unsqueeze(0)
+    dist = paddle.linalg.norm(center1 - center2, p=2, axis=-1) / stride
+    dist_ratio = dist
+    dist_ratio[dist < max_dist] = 1.
+    dist_ratio[dist >= max_dist] = 0.
+    return dist_ratio
+
+
 @register
 class TaskAlignedAssigner(nn.Layer):
     """TOOD: Task-aligned One-stage Object Detection
     """

-    def __init__(self, topk=13, alpha=1.0, beta=6.0, eps=1e-9):
+    def __init__(self,
+                 topk=13,
+                 alpha=1.0,
+                 beta=6.0,
+                 eps=1e-9,
+                 is_close_gt=False):
         super(TaskAlignedAssigner, self).__init__()
         self.topk = topk
         self.alpha = alpha
         self.beta = beta
         self.eps = eps
+        self.is_close_gt = is_close_gt
 
 
     @paddle.no_grad()
     def forward(self,
@@ -90,7 +120,8 @@ class TaskAlignedAssigner(nn.Layer):
             assigned_bboxes = paddle.zeros([batch_size, num_anchors, 4])
             assigned_scores = paddle.zeros(
                 [batch_size, num_anchors, num_classes])
-            return assigned_labels, assigned_bboxes, assigned_scores
+            mask_positive = paddle.zeros([batch_size, 1, num_anchors])
+            return assigned_labels, assigned_bboxes, assigned_scores, mask_positive
 
 
         # compute iou between gt and pred bbox, [B, n, L]
         ious = batch_iou_similarity(gt_bboxes, pred_bboxes)
@@ -107,7 +138,10 @@ class TaskAlignedAssigner(nn.Layer):
             self.beta)

         # check the positive sample's center in gt, [B, n, L]
-        is_in_gts = check_points_inside_bboxes(anchor_points, gt_bboxes)
+        if self.is_close_gt:
+            is_in_gts = is_close_gt(anchor_points, gt_bboxes, num_anchors_list)
+        else:
+            is_in_gts = check_points_inside_bboxes(anchor_points, gt_bboxes)
 
 
         # select topk largest alignment metrics pred bbox as candidates
         # for each gt, [B, n, L]
@@ -157,4 +191,4 @@ class TaskAlignedAssigner(nn.Layer):
         alignment_metrics = alignment_metrics.max(-2).unsqueeze(-1)
         assigned_scores = assigned_scores * alignment_metrics

-        return assigned_labels, assigned_bboxes, assigned_scores
+        return assigned_labels, assigned_bboxes, assigned_scores, mask_positive
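The new is_close_gt path replaces the strict inside-the-box test with a distance test measured in units of each level's stride; the helper rebuilds the per-anchor stride vector as 32 / 2**idx from num_anchors_list. A minimal sketch of that vector (an assumed three-level pyramid):

import paddle

num_anchors_list = [4, 2, 1]           # anchors per level; level idx maps to stride 32 / 2**idx
stride = paddle.concat([
    paddle.full([n], 32 / pow(2, idx)) for idx, n in enumerate(num_anchors_list)
])
print(stride.numpy())                   # [32. 32. 32. 32. 16. 16.  8.]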

+ 182 - 0
paddlers/models/ppdet/modeling/assigners/task_aligned_assigner_cr.py

@@ -0,0 +1,182 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import paddle
+import paddle.nn as nn
+import paddle.nn.functional as F
+
+from paddlers.models.ppdet.core.workspace import register
+from ..bbox_utils import batch_iou_similarity
+from .utils import (gather_topk_anchors, check_points_inside_bboxes,
+                    compute_max_iou_anchor)
+
+__all__ = ['TaskAlignedAssigner_CR']
+
+
+@register
+class TaskAlignedAssigner_CR(nn.Layer):
+    """TOOD: Task-aligned One-stage Object Detection with Center R
+    """
+
+    def __init__(self,
+                 topk=13,
+                 alpha=1.0,
+                 beta=6.0,
+                 center_radius=None,
+                 eps=1e-9):
+        super(TaskAlignedAssigner_CR, self).__init__()
+        self.topk = topk
+        self.alpha = alpha
+        self.beta = beta
+        self.center_radius = center_radius
+        self.eps = eps
+
+    @paddle.no_grad()
+    def forward(self,
+                pred_scores,
+                pred_bboxes,
+                anchor_points,
+                stride_tensor,
+                gt_labels,
+                gt_bboxes,
+                pad_gt_mask,
+                bg_index,
+                gt_scores=None):
+        r"""This code is based on
+            https://github.com/fcjian/TOOD/blob/master/mmdet/core/bbox/assigners/task_aligned_assigner.py
+
+        The assignment is done in following steps
+        1. compute alignment metric between all bbox (bbox of all pyramid levels) and gt
+        2. select top-k bbox as candidates for each gt
+        3. limit the positive sample's center in gt (because the anchor-free detector
+           only can predict positive distance)
+        4. if an anchor box is assigned to multiple gts, the one with the
+           highest iou will be selected.
+        Args:
+            pred_scores (Tensor, float32): predicted class probability, shape(B, L, C)
+            pred_bboxes (Tensor, float32): predicted bounding boxes, shape(B, L, 4)
+            anchor_points (Tensor, float32): pre-defined anchors, shape(L, 2), "cxcy" format
+            stride_tensor (Tensor, float32): stride of feature map, shape(L, 1)
+            gt_labels (Tensor, int64|int32): Label of gt_bboxes, shape(B, n, 1)
+            gt_bboxes (Tensor, float32): Ground truth bboxes, shape(B, n, 4)
+            pad_gt_mask (Tensor, float32): 1 means bbox, 0 means no bbox, shape(B, n, 1)
+            bg_index (int): background index
+            gt_scores (Tensor|None, float32) Score of gt_bboxes, shape(B, n, 1)
+        Returns:
+            assigned_labels (Tensor): (B, L)
+            assigned_bboxes (Tensor): (B, L, 4)
+            assigned_scores (Tensor): (B, L, C)
+        """
+        assert pred_scores.ndim == pred_bboxes.ndim
+        assert gt_labels.ndim == gt_bboxes.ndim and \
+               gt_bboxes.ndim == 3
+
+        batch_size, num_anchors, num_classes = pred_scores.shape
+        _, num_max_boxes, _ = gt_bboxes.shape
+
+        # negative batch
+        if num_max_boxes == 0:
+            assigned_labels = paddle.full(
+                [batch_size, num_anchors], bg_index, dtype='int32')
+            assigned_bboxes = paddle.zeros([batch_size, num_anchors, 4])
+            assigned_scores = paddle.zeros(
+                [batch_size, num_anchors, num_classes])
+            mask_positive = paddle.zeros([batch_size, 1, num_anchors])
+            return assigned_labels, assigned_bboxes, assigned_scores, mask_positive
+
+        # compute iou between gt and pred bbox, [B, n, L]
+        ious = batch_iou_similarity(gt_bboxes, pred_bboxes)
+        # gather pred bboxes class score
+        pred_scores = pred_scores.transpose([0, 2, 1])
+        batch_ind = paddle.arange(
+            end=batch_size, dtype=gt_labels.dtype).unsqueeze(-1)
+        gt_labels_ind = paddle.stack(
+            [batch_ind.tile([1, num_max_boxes]), gt_labels.squeeze(-1)],
+            axis=-1)
+        bbox_cls_scores = paddle.gather_nd(pred_scores, gt_labels_ind)
+        # compute alignment metrics, [B, n, L]
+        alignment_metrics = bbox_cls_scores.pow(self.alpha) * ious.pow(
+            self.beta) * pad_gt_mask
+
+        # select positive sample, [B, n, L]
+        if self.center_radius is None:
+            # check the positive sample's center in gt, [B, n, L]
+            is_in_gts = check_points_inside_bboxes(
+                anchor_points, gt_bboxes, sm_use=True)
+            # select topk largest alignment metrics pred bbox as candidates
+            # for each gt, [B, n, L]
+            mask_positive = gather_topk_anchors(
+                alignment_metrics, self.topk, topk_mask=pad_gt_mask) * is_in_gts
+        else:
+            is_in_gts, is_in_center = check_points_inside_bboxes(
+                anchor_points,
+                gt_bboxes,
+                stride_tensor * self.center_radius,
+                sm_use=True)
+            is_in_gts *= pad_gt_mask
+            is_in_center *= pad_gt_mask
+            candidate_metrics = paddle.where(
+                is_in_gts.sum(-1, keepdim=True) == 0,
+                alignment_metrics + is_in_center,
+                alignment_metrics)
+            mask_positive = gather_topk_anchors(
+                candidate_metrics, self.topk,
+                topk_mask=pad_gt_mask) * paddle.cast((is_in_center > 0) |
+                                                     (is_in_gts > 0), 'float32')
+
+        # if an anchor box is assigned to multiple gts,
+        # the one with the highest iou will be selected, [B, n, L]
+        mask_positive_sum = mask_positive.sum(axis=-2)
+        if mask_positive_sum.max() > 1:
+            mask_multiple_gts = (mask_positive_sum.unsqueeze(1) > 1).tile(
+                [1, num_max_boxes, 1])
+            is_max_iou = compute_max_iou_anchor(ious * mask_positive)
+            mask_positive = paddle.where(mask_multiple_gts, is_max_iou,
+                                         mask_positive)
+            mask_positive_sum = mask_positive.sum(axis=-2)
+        assigned_gt_index = mask_positive.argmax(axis=-2)
+
+        # assigned target
+        assigned_gt_index = assigned_gt_index + batch_ind * num_max_boxes
+        assigned_labels = paddle.gather(
+            gt_labels.flatten(), assigned_gt_index.flatten(), axis=0)
+        assigned_labels = assigned_labels.reshape([batch_size, num_anchors])
+        assigned_labels = paddle.where(
+            mask_positive_sum > 0, assigned_labels,
+            paddle.full_like(assigned_labels, bg_index))
+
+        assigned_bboxes = paddle.gather(
+            gt_bboxes.reshape([-1, 4]), assigned_gt_index.flatten(), axis=0)
+        assigned_bboxes = assigned_bboxes.reshape([batch_size, num_anchors, 4])
+
+        assigned_scores = F.one_hot(assigned_labels, num_classes + 1)
+        ind = list(range(num_classes + 1))
+        ind.remove(bg_index)
+        assigned_scores = paddle.index_select(
+            assigned_scores, paddle.to_tensor(ind), axis=-1)
+        # rescale alignment metrics
+        alignment_metrics *= mask_positive
+        max_metrics_per_instance = alignment_metrics.max(axis=-1, keepdim=True)
+        max_ious_per_instance = (ious * mask_positive).max(axis=-1,
+                                                           keepdim=True)
+        alignment_metrics = alignment_metrics / (
+            max_metrics_per_instance + self.eps) * max_ious_per_instance
+        alignment_metrics = alignment_metrics.max(-2).unsqueeze(-1)
+        assigned_scores = assigned_scores * alignment_metrics
+
+        return assigned_labels, assigned_bboxes, assigned_scores, mask_positive
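When center_radius is set, TaskAlignedAssigner_CR keeps gts that contain no anchor center from going unmatched: for those gts only, the in-radius mask is added to the alignment metric before the top-k selection. A minimal sketch of that paddle.where step (all masks are illustrative):

import paddle

alignment_metrics = paddle.to_tensor([[[0.5, 0.2, 0.1]]])   # [B=1, n=1, L=3]
is_in_gts = paddle.to_tensor([[[0., 0., 0.]]])              # no anchor center inside this gt
is_in_center = paddle.to_tensor([[[1., 1., 0.]]])           # two anchors within the radius
candidate_metrics = paddle.where(
    is_in_gts.sum(-1, keepdim=True) == 0,                   # only gts with no inside anchor
    alignment_metrics + is_in_center,                        # boost the in-radius anchors
    alignment_metrics)
# -> [[[1.5, 1.2, 0.1]]]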

+ 93 - 0
paddlers/models/ppdet/modeling/assigners/uniform_assigner.py

@@ -0,0 +1,93 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle
+import paddle.nn as nn
+import paddle.nn.functional as F
+from paddlers.models.ppdet.core.workspace import register
+
+from paddlers.models.ppdet.modeling.bbox_utils import batch_bbox_overlaps
+from paddlers.models.ppdet.modeling.transformers import bbox_xyxy_to_cxcywh
+
+__all__ = ['UniformAssigner']
+
+
+def batch_p_dist(x, y, p=2):
+    """
+    calculate pairwise p_dist, the first index of x and y are batch
+    return [x.shape[0], y.shape[0]]
+    """
+    x = x.unsqueeze(1)
+    diff = x - y
+    return paddle.norm(diff, p=p, axis=list(range(2, diff.dim())))
+
+
+@register
+class UniformAssigner(nn.Layer):
+    def __init__(self, pos_ignore_thr, neg_ignore_thr, match_times=4):
+        super(UniformAssigner, self).__init__()
+        self.pos_ignore_thr = pos_ignore_thr
+        self.neg_ignore_thr = neg_ignore_thr
+        self.match_times = match_times
+
+    def forward(self, bbox_pred, anchor, gt_bboxes, gt_labels=None):
+        num_bboxes = bbox_pred.shape[0]
+        num_gts = gt_bboxes.shape[0]
+        match_labels = paddle.full([num_bboxes], -1, dtype=paddle.int32)
+
+        pred_ious = batch_bbox_overlaps(bbox_pred, gt_bboxes)
+        pred_max_iou = pred_ious.max(axis=1)
+        neg_ignore = pred_max_iou > self.neg_ignore_thr
+        # exclude potential ignored neg samples first, deal with pos samples later
+        #match_labels: -2(ignore), -1(neg) or >=0(pos_inds)
+        match_labels = paddle.where(neg_ignore,
+                                    paddle.full_like(match_labels, -2),
+                                    match_labels)
+
+        bbox_pred_c = bbox_xyxy_to_cxcywh(bbox_pred)
+        anchor_c = bbox_xyxy_to_cxcywh(anchor)
+        gt_bboxes_c = bbox_xyxy_to_cxcywh(gt_bboxes)
+        bbox_pred_dist = batch_p_dist(bbox_pred_c, gt_bboxes_c, p=1)
+        anchor_dist = batch_p_dist(anchor_c, gt_bboxes_c, p=1)
+
+        top_pred = bbox_pred_dist.topk(
+            k=self.match_times, axis=0, largest=False)[1]
+        top_anchor = anchor_dist.topk(
+            k=self.match_times, axis=0, largest=False)[1]
+
+        tar_pred = paddle.arange(num_gts).expand([self.match_times, num_gts])
+        tar_anchor = paddle.arange(num_gts).expand([self.match_times, num_gts])
+        pos_places = paddle.concat([top_pred, top_anchor]).reshape([-1])
+        pos_inds = paddle.concat([tar_pred, tar_anchor]).reshape([-1])
+
+        pos_anchor = anchor[pos_places]
+        pos_tar_bbox = gt_bboxes[pos_inds]
+        pos_ious = batch_bbox_overlaps(
+            pos_anchor, pos_tar_bbox, is_aligned=True)
+        pos_ignore = pos_ious < self.pos_ignore_thr
+        pos_inds = paddle.where(pos_ignore,
+                                paddle.full_like(pos_inds, -2), pos_inds)
+        match_labels[pos_places] = pos_inds
+        match_labels.stop_gradient = True
+        pos_keep = ~pos_ignore
+
+        if pos_keep.sum() > 0:
+            pos_places_keep = pos_places[pos_keep]
+            pos_bbox_pred = bbox_pred[pos_places_keep].reshape([-1, 4])
+            pos_bbox_tar = pos_tar_bbox[pos_keep].reshape([-1, 4]).detach()
+        else:
+            pos_bbox_pred = None
+            pos_bbox_tar = None
+
+        return match_labels, pos_bbox_pred, pos_bbox_tar
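UniformAssigner picks, for every gt, the match_times predictions and anchors with the smallest L1 distance in cxcywh space; batch_p_dist is what builds that distance matrix. A minimal sketch (boxes are illustrative):

import paddle

pred_c = paddle.to_tensor([[0., 0., 2., 2.],
                           [4., 4., 2., 2.]])       # (num_pred, 4), cxcywh
gt_c = paddle.to_tensor([[1., 1., 2., 2.]])         # (num_gt, 4), cxcywh
diff = pred_c.unsqueeze(1) - gt_c                   # (num_pred, num_gt, 4)
dist = paddle.norm(diff, p=1, axis=-1)              # (num_pred, num_gt) -> [[2.], [6.]]
top_idx = dist.topk(k=1, axis=0, largest=False)[1]  # closest prediction per gt -> [[0]]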

+ 8 - 3
paddlers/models/ppdet/modeling/assigners/utils.py

@@ -108,7 +108,8 @@ def gather_topk_anchors(metrics, topk, largest=True, topk_mask=None, eps=1e-9):
 def check_points_inside_bboxes(points,
                                bboxes,
                                center_radius_tensor=None,
-                               eps=1e-9):
+                               eps=1e-9,
+                               sm_use=False):
     r"""
     r"""
     Args:
     Args:
         points (Tensor, float32): shape[L, 2], "xy" format, L: num_anchors
         points (Tensor, float32): shape[L, 2], "xy" format, L: num_anchors
@@ -139,8 +140,12 @@ def check_points_inside_bboxes(points,
         b = (cy + center_radius_tensor) - y
         delta_ltrb_c = paddle.concat([l, t, r, b], axis=-1)
         is_in_center = (delta_ltrb_c.min(axis=-1) > eps)
-        return (paddle.logical_and(is_in_bboxes, is_in_center),
-                paddle.logical_or(is_in_bboxes, is_in_center))
+        if sm_use:
+            return is_in_bboxes.astype(bboxes.dtype), is_in_center.astype(
+                bboxes.dtype)
+        else:
+            return (paddle.logical_and(is_in_bboxes, is_in_center),
+                    paddle.logical_or(is_in_bboxes, is_in_center))
 
 
     return is_in_bboxes.astype(bboxes.dtype)
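check_points_inside_bboxes decides membership from the left/top/right/bottom deltas: a point is inside a box only if all four deltas are positive. A minimal sketch of that test (points and boxes are illustrative):

import paddle

points = paddle.to_tensor([[1., 1.], [5., 5.]])     # [L, 2]
bboxes = paddle.to_tensor([[0., 0., 2., 2.]])       # [n, 4], xmin, ymin, xmax, ymax
xy = points.unsqueeze(1)                            # [L, 1, 2]
lt = xy - bboxes[:, :2]                             # distances to the left/top edges
rb = bboxes[:, 2:] - xy                             # distances to the right/bottom edges
delta_ltrb = paddle.concat([lt, rb], axis=-1)       # [L, n, 4]
is_in_bboxes = delta_ltrb.min(axis=-1) > 1e-9       # [L, n] -> [[True], [False]]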
 
 

+ 2 - 0
paddlers/models/ppdet/modeling/backbones/__init__.py

@@ -34,6 +34,7 @@ from . import csp_darknet
 from . import convnext
 from . import vision_transformer
 from . import mobileone
+from . import trans_encoder
 
 
 from .vgg import *
 from .resnet import *
@@ -58,3 +59,4 @@ from .convnext import *
 from .vision_transformer import *
 from .vision_transformer import *
 from .mobileone import *
+from .trans_encoder import *

+ 49 - 9
paddlers/models/ppdet/modeling/backbones/dla.py

@@ -19,7 +19,7 @@ from paddlers.models.ppdet.core.workspace import register, serializable
 from paddlers.models.ppdet.modeling.layers import ConvNormLayer
 from ..shape_spec import ShapeSpec

-DLA_cfg = {34: ([1, 1, 1, 2, 2, 1], [16, 32, 64, 128, 256, 512])}
+DLA_cfg = {34: ([1, 1, 1, 2, 2, 1], [16, 32, 64, 128, 256, 512]), }


 class BasicBlock(nn.Layer):
@@ -157,17 +157,25 @@ class DLA(nn.Layer):
     DLA, see https://arxiv.org/pdf/1707.06484.pdf

     Args:
-        depth (int): DLA depth, should be 34.
+        depth (int): DLA depth, only 34 is supported for now.
         residual_root (bool): whether use a reidual layer in the root block
-
+        pre_img (bool): add pre_img, only used in CenterTrack
+        pre_hm (bool): add pre_hm, only used in CenterTrack
     """
     """
 
 
-    def __init__(self, depth=34, residual_root=False):
+    def __init__(self,
+                 depth=34,
+                 residual_root=False,
+                 pre_img=False,
+                 pre_hm=False):
         super(DLA, self).__init__()
-        levels, channels = DLA_cfg[depth]
+        assert depth == 34, 'Only support DLA with depth of 34 now.'
         if depth == 34:
             block = BasicBlock
+        levels, channels = DLA_cfg[depth]
         self.channels = channels
+        self.num_levels = len(levels)
+
         self.base_layer = nn.Sequential(
             ConvNormLayer(
                 3,
@@ -213,6 +221,29 @@ class DLA(nn.Layer):
             level_root=True,
             root_residual=residual_root)

+        if pre_img:
+            self.pre_img_layer = nn.Sequential(
+                ConvNormLayer(
+                    3,
+                    channels[0],
+                    filter_size=7,
+                    stride=1,
+                    bias_on=False,
+                    norm_decay=None),
+                nn.ReLU())
+        if pre_hm:
+            self.pre_hm_layer = nn.Sequential(
+                ConvNormLayer(
+                    1,
+                    channels[0],
+                    filter_size=7,
+                    stride=1,
+                    bias_on=False,
+                    norm_decay=None),
+                nn.ReLU())
+        self.pre_img = pre_img
+        self.pre_hm = pre_hm
+
     def _make_conv_level(self, ch_in, ch_out, conv_num, stride=1):
         modules = []
         for i in range(conv_num):
@@ -230,13 +261,22 @@ class DLA(nn.Layer):
 
 
     @property
     def out_shape(self):
-        return [ShapeSpec(channels=self.channels[i]) for i in range(6)]
+        return [
+            ShapeSpec(channels=self.channels[i]) for i in range(self.num_levels)
+        ]
 
 
     def forward(self, inputs):
         outs = []
-        im = inputs['image']
-        feats = self.base_layer(im)
-        for i in range(6):
+        feats = self.base_layer(inputs['image'])
+
+        if self.pre_img and 'pre_image' in inputs and inputs[
+                'pre_image'] is not None:
+            feats = feats + self.pre_img_layer(inputs['pre_image'])
+
+        if self.pre_hm and 'pre_hm' in inputs and inputs['pre_hm'] is not None:
+            feats = feats + self.pre_hm_layer(inputs['pre_hm'])
+
+        for i in range(self.num_levels):
             feats = getattr(self, 'level{}'.format(i))(feats)
             feats = getattr(self, 'level{}'.format(i))(feats)
             outs.append(feats)
             outs.append(feats)
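
The new pre_img/pre_hm branches embed the previous frame and its center heatmap to the same width as the base features and fuse them by element-wise addition, which is how CenterTrack conditions the current frame on the previous one. A minimal sketch of that fusion pattern, with plain Conv2D layers standing in for ConvNormLayer and made-up shapes:

    import paddle
    import paddle.nn as nn

    base_ch = 16                                     # channels[0] for DLA-34
    base_layer    = nn.Sequential(nn.Conv2D(3, base_ch, 7, padding=3), nn.ReLU())
    pre_img_layer = nn.Sequential(nn.Conv2D(3, base_ch, 7, padding=3), nn.ReLU())
    pre_hm_layer  = nn.Sequential(nn.Conv2D(1, base_ch, 7, padding=3), nn.ReLU())

    image   = paddle.rand([2, 3, 128, 128])          # current frame
    pre_img = paddle.rand([2, 3, 128, 128])          # previous frame
    pre_hm  = paddle.rand([2, 1, 128, 128])          # previous-frame center heatmap

    feats = base_layer(image)
    feats = feats + pre_img_layer(pre_img)           # only when pre_img=True
    feats = feats + pre_hm_layer(pre_hm)             # only when pre_hm=True
    print(feats.shape)                               # [2, 16, 128, 128]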
 
 

+ 144 - 2
paddlers/models/ppdet/modeling/backbones/hrnet.py

@@ -37,6 +37,7 @@ class ConvNormLayer(nn.Layer):
                  norm_type='bn',
                  norm_type='bn',
                  norm_groups=32,
                  norm_groups=32,
                  use_dcn=False,
                  use_dcn=False,
+                 norm_momentum=0.9,
                  norm_decay=0.,
                  norm_decay=0.,
                  freeze_norm=False,
                  freeze_norm=False,
                  act=None,
                  act=None,
@@ -66,6 +67,7 @@ class ConvNormLayer(nn.Layer):
         if norm_type in ['bn', 'sync_bn']:
         if norm_type in ['bn', 'sync_bn']:
             self.norm = nn.BatchNorm2D(
             self.norm = nn.BatchNorm2D(
                 ch_out,
                 ch_out,
+                momentum=norm_momentum,
                 weight_attr=param_attr,
                 weight_attr=param_attr,
                 bias_attr=bias_attr,
                 bias_attr=bias_attr,
                 use_global_stats=global_stats)
                 use_global_stats=global_stats)
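
The new norm_momentum argument is threaded through every ConvNormLayer down to nn.BatchNorm2D. In Paddle the running statistics are updated as running = momentum * running + (1 - momentum) * batch, so the default 0.9 tracks batch statistics faster than a value such as 0.997. A tiny sketch, with illustrative numbers only:

    import paddle
    import paddle.nn as nn

    # momentum here is exactly what hrnet.py forwards as norm_momentum
    bn = nn.BatchNorm2D(64, momentum=0.9)
    x = paddle.rand([2, 64, 32, 32])
    y = bn(x)          # in train mode this also updates the running statistics
    print(y.shape)     # [2, 64, 32, 32]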
@@ -93,6 +95,7 @@ class Layer1(nn.Layer):
     def __init__(self,
     def __init__(self,
                  num_channels,
                  num_channels,
                  has_se=False,
                  has_se=False,
+                 norm_momentum=0.9,
                  norm_decay=0.,
                  norm_decay=0.,
                  freeze_norm=True,
                  freeze_norm=True,
                  name=None):
                  name=None):
@@ -109,6 +112,7 @@ class Layer1(nn.Layer):
                     has_se=has_se,
                     has_se=has_se,
                     stride=1,
                     stride=1,
                     downsample=True if i == 0 else False,
                     downsample=True if i == 0 else False,
+                    norm_momentum=norm_momentum,
                     norm_decay=norm_decay,
                     norm_decay=norm_decay,
                     freeze_norm=freeze_norm,
                     freeze_norm=freeze_norm,
                     name=name + '_' + str(i + 1)))
                     name=name + '_' + str(i + 1)))
@@ -125,6 +129,7 @@ class TransitionLayer(nn.Layer):
     def __init__(self,
     def __init__(self,
                  in_channels,
                  in_channels,
                  out_channels,
                  out_channels,
+                 norm_momentum=0.9,
                  norm_decay=0.,
                  norm_decay=0.,
                  freeze_norm=True,
                  freeze_norm=True,
                  name=None):
                  name=None):
@@ -144,6 +149,7 @@ class TransitionLayer(nn.Layer):
                             ch_in=in_channels[i],
                             ch_in=in_channels[i],
                             ch_out=out_channels[i],
                             ch_out=out_channels[i],
                             filter_size=3,
                             filter_size=3,
+                            norm_momentum=norm_momentum,
                             norm_decay=norm_decay,
                             norm_decay=norm_decay,
                             freeze_norm=freeze_norm,
                             freeze_norm=freeze_norm,
                             act='relu',
                             act='relu',
@@ -156,6 +162,7 @@ class TransitionLayer(nn.Layer):
                         ch_out=out_channels[i],
                         ch_out=out_channels[i],
                         filter_size=3,
                         filter_size=3,
                         stride=2,
                         stride=2,
+                        norm_momentum=norm_momentum,
                         norm_decay=norm_decay,
                         norm_decay=norm_decay,
                         freeze_norm=freeze_norm,
                         freeze_norm=freeze_norm,
                         act='relu',
                         act='relu',
@@ -181,6 +188,7 @@ class Branches(nn.Layer):
                  in_channels,
                  in_channels,
                  out_channels,
                  out_channels,
                  has_se=False,
                  has_se=False,
+                 norm_momentum=0.9,
                  norm_decay=0.,
                  norm_decay=0.,
                  freeze_norm=True,
                  freeze_norm=True,
                  name=None):
                  name=None):
@@ -197,6 +205,7 @@ class Branches(nn.Layer):
                         num_channels=in_ch,
                         num_channels=in_ch,
                         num_filters=out_channels[i],
                         num_filters=out_channels[i],
                         has_se=has_se,
                         has_se=has_se,
+                        norm_momentum=norm_momentum,
                         norm_decay=norm_decay,
                         norm_decay=norm_decay,
                         freeze_norm=freeze_norm,
                         freeze_norm=freeze_norm,
                         name=name + '_branch_layer_' + str(i + 1) + '_' +
                         name=name + '_branch_layer_' + str(i + 1) + '_' +
@@ -221,6 +230,7 @@ class BottleneckBlock(nn.Layer):
                  has_se,
                  has_se,
                  stride=1,
                  stride=1,
                  downsample=False,
                  downsample=False,
+                 norm_momentum=0.9,
                  norm_decay=0.,
                  norm_decay=0.,
                  freeze_norm=True,
                  freeze_norm=True,
                  name=None):
                  name=None):
@@ -233,6 +243,7 @@ class BottleneckBlock(nn.Layer):
             ch_in=num_channels,
             ch_in=num_channels,
             ch_out=num_filters,
             ch_out=num_filters,
             filter_size=1,
             filter_size=1,
+            norm_momentum=norm_momentum,
             norm_decay=norm_decay,
             norm_decay=norm_decay,
             freeze_norm=freeze_norm,
             freeze_norm=freeze_norm,
             act="relu",
             act="relu",
@@ -242,6 +253,7 @@ class BottleneckBlock(nn.Layer):
             ch_out=num_filters,
             ch_out=num_filters,
             filter_size=3,
             filter_size=3,
             stride=stride,
             stride=stride,
+            norm_momentum=norm_momentum,
             norm_decay=norm_decay,
             norm_decay=norm_decay,
             freeze_norm=freeze_norm,
             freeze_norm=freeze_norm,
             act="relu",
             act="relu",
@@ -250,6 +262,7 @@ class BottleneckBlock(nn.Layer):
             ch_in=num_filters,
             ch_in=num_filters,
             ch_out=num_filters * 4,
             ch_out=num_filters * 4,
             filter_size=1,
             filter_size=1,
+            norm_momentum=norm_momentum,
             norm_decay=norm_decay,
             norm_decay=norm_decay,
             freeze_norm=freeze_norm,
             freeze_norm=freeze_norm,
             act=None,
             act=None,
@@ -260,6 +273,7 @@ class BottleneckBlock(nn.Layer):
                 ch_in=num_channels,
                 ch_in=num_channels,
                 ch_out=num_filters * 4,
                 ch_out=num_filters * 4,
                 filter_size=1,
                 filter_size=1,
+                norm_momentum=norm_momentum,
                 norm_decay=norm_decay,
                 norm_decay=norm_decay,
                 freeze_norm=freeze_norm,
                 freeze_norm=freeze_norm,
                 act=None,
                 act=None,
@@ -296,6 +310,7 @@ class BasicBlock(nn.Layer):
                  stride=1,
                  stride=1,
                  has_se=False,
                  has_se=False,
                  downsample=False,
                  downsample=False,
+                 norm_momentum=0.9,
                  norm_decay=0.,
                  norm_decay=0.,
                  freeze_norm=True,
                  freeze_norm=True,
                  name=None):
                  name=None):
@@ -307,6 +322,7 @@ class BasicBlock(nn.Layer):
             ch_in=num_channels,
             ch_in=num_channels,
             ch_out=num_filters,
             ch_out=num_filters,
             filter_size=3,
             filter_size=3,
+            norm_momentum=norm_momentum,
             norm_decay=norm_decay,
             norm_decay=norm_decay,
             freeze_norm=freeze_norm,
             freeze_norm=freeze_norm,
             stride=stride,
             stride=stride,
@@ -316,6 +332,7 @@ class BasicBlock(nn.Layer):
             ch_in=num_filters,
             ch_in=num_filters,
             ch_out=num_filters,
             ch_out=num_filters,
             filter_size=3,
             filter_size=3,
+            norm_momentum=norm_momentum,
             norm_decay=norm_decay,
             norm_decay=norm_decay,
             freeze_norm=freeze_norm,
             freeze_norm=freeze_norm,
             stride=1,
             stride=1,
@@ -327,6 +344,7 @@ class BasicBlock(nn.Layer):
                 ch_in=num_channels,
                 ch_in=num_channels,
                 ch_out=num_filters * 4,
                 ch_out=num_filters * 4,
                 filter_size=1,
                 filter_size=1,
+                norm_momentum=norm_momentum,
                 norm_decay=norm_decay,
                 norm_decay=norm_decay,
                 freeze_norm=freeze_norm,
                 freeze_norm=freeze_norm,
                 act=None,
                 act=None,
@@ -394,6 +412,7 @@ class Stage(nn.Layer):
                  num_modules,
                  num_modules,
                  num_filters,
                  num_filters,
                  has_se=False,
                  has_se=False,
+                 norm_momentum=0.9,
                  norm_decay=0.,
                  norm_decay=0.,
                  freeze_norm=True,
                  freeze_norm=True,
                  multi_scale_output=True,
                  multi_scale_output=True,
@@ -410,6 +429,7 @@ class Stage(nn.Layer):
                         num_channels=num_channels,
                         num_channels=num_channels,
                         num_filters=num_filters,
                         num_filters=num_filters,
                         has_se=has_se,
                         has_se=has_se,
+                        norm_momentum=norm_momentum,
                         norm_decay=norm_decay,
                         norm_decay=norm_decay,
                         freeze_norm=freeze_norm,
                         freeze_norm=freeze_norm,
                         multi_scale_output=False,
                         multi_scale_output=False,
@@ -421,6 +441,7 @@ class Stage(nn.Layer):
                         num_channels=num_channels,
                         num_channels=num_channels,
                         num_filters=num_filters,
                         num_filters=num_filters,
                         has_se=has_se,
                         has_se=has_se,
+                        norm_momentum=norm_momentum,
                         norm_decay=norm_decay,
                         norm_decay=norm_decay,
                         freeze_norm=freeze_norm,
                         freeze_norm=freeze_norm,
                         name=name + '_' + str(i + 1)))
                         name=name + '_' + str(i + 1)))
@@ -440,6 +461,7 @@ class HighResolutionModule(nn.Layer):
                  num_filters,
                  num_filters,
                  has_se=False,
                  has_se=False,
                  multi_scale_output=True,
                  multi_scale_output=True,
+                 norm_momentum=0.9,
                  norm_decay=0.,
                  norm_decay=0.,
                  freeze_norm=True,
                  freeze_norm=True,
                  name=None):
                  name=None):
@@ -449,6 +471,7 @@ class HighResolutionModule(nn.Layer):
             in_channels=num_channels,
             in_channels=num_channels,
             out_channels=num_filters,
             out_channels=num_filters,
             has_se=has_se,
             has_se=has_se,
+            norm_momentum=norm_momentum,
             norm_decay=norm_decay,
             norm_decay=norm_decay,
             freeze_norm=freeze_norm,
             freeze_norm=freeze_norm,
             name=name)
             name=name)
@@ -457,6 +480,7 @@ class HighResolutionModule(nn.Layer):
             in_channels=num_filters,
             in_channels=num_filters,
             out_channels=num_filters,
             out_channels=num_filters,
             multi_scale_output=multi_scale_output,
             multi_scale_output=multi_scale_output,
+            norm_momentum=norm_momentum,
             norm_decay=norm_decay,
             norm_decay=norm_decay,
             freeze_norm=freeze_norm,
             freeze_norm=freeze_norm,
             name=name)
             name=name)
@@ -472,6 +496,7 @@ class FuseLayers(nn.Layer):
                  in_channels,
                  in_channels,
                  out_channels,
                  out_channels,
                  multi_scale_output=True,
                  multi_scale_output=True,
+                 norm_momentum=0.9,
                  norm_decay=0.,
                  norm_decay=0.,
                  freeze_norm=True,
                  freeze_norm=True,
                  name=None):
                  name=None):
@@ -493,6 +518,7 @@ class FuseLayers(nn.Layer):
                             filter_size=1,
                             filter_size=1,
                             stride=1,
                             stride=1,
                             act=None,
                             act=None,
+                            norm_momentum=norm_momentum,
                             norm_decay=norm_decay,
                             norm_decay=norm_decay,
                             freeze_norm=freeze_norm,
                             freeze_norm=freeze_norm,
                             name=name + '_layer_' + str(i + 1) + '_' +
                             name=name + '_layer_' + str(i + 1) + '_' +
@@ -510,6 +536,7 @@ class FuseLayers(nn.Layer):
                                     ch_out=out_channels[i],
                                     ch_out=out_channels[i],
                                     filter_size=3,
                                     filter_size=3,
                                     stride=2,
                                     stride=2,
+                                    norm_momentum=norm_momentum,
                                     norm_decay=norm_decay,
                                     norm_decay=norm_decay,
                                     freeze_norm=freeze_norm,
                                     freeze_norm=freeze_norm,
                                     act=None,
                                     act=None,
@@ -525,6 +552,7 @@ class FuseLayers(nn.Layer):
                                     ch_out=out_channels[j],
                                     ch_out=out_channels[j],
                                     filter_size=3,
                                     filter_size=3,
                                     stride=2,
                                     stride=2,
+                                    norm_momentum=norm_momentum,
                                     norm_decay=norm_decay,
                                     norm_decay=norm_decay,
                                     freeze_norm=freeze_norm,
                                     freeze_norm=freeze_norm,
                                     act="relu",
                                     act="relu",
@@ -549,7 +577,6 @@ class FuseLayers(nn.Layer):
                     for k in range(i - j):
                     for k in range(i - j):
                         y = self.residual_func_list[residual_func_idx](y)
                         y = self.residual_func_list[residual_func_idx](y)
                         residual_func_idx += 1
                         residual_func_idx += 1
-
                     residual = paddle.add(x=residual, y=y)
                     residual = paddle.add(x=residual, y=y)
             residual = F.relu(residual)
             residual = F.relu(residual)
             outs.append(residual)
             outs.append(residual)
@@ -567,6 +594,7 @@ class HRNet(nn.Layer):
         has_se (bool): whether to add SE block for each stage
         has_se (bool): whether to add SE block for each stage
         freeze_at (int): the stage to freeze
         freeze_at (int): the stage to freeze
         freeze_norm (bool): whether to freeze norm in HRNet
         freeze_norm (bool): whether to freeze norm in HRNet
+        norm_momentum (float): momentum of BatchNorm
         norm_decay (float): weight decay for normalization layer weights
         norm_decay (float): weight decay for normalization layer weights
         return_idx (List): the stage to return
         return_idx (List): the stage to return
         upsample (bool): whether to upsample and concat the backbone feats
         upsample (bool): whether to upsample and concat the backbone feats
@@ -577,9 +605,11 @@ class HRNet(nn.Layer):
                  has_se=False,
                  has_se=False,
                  freeze_at=0,
                  freeze_at=0,
                  freeze_norm=True,
                  freeze_norm=True,
+                 norm_momentum=0.9,
                  norm_decay=0.,
                  norm_decay=0.,
                  return_idx=[0, 1, 2, 3],
                  return_idx=[0, 1, 2, 3],
-                 upsample=False):
+                 upsample=False,
+                 downsample=False):
         super(HRNet, self).__init__()
         super(HRNet, self).__init__()
 
 
         self.width = width
         self.width = width
@@ -591,6 +621,7 @@ class HRNet(nn.Layer):
         self.freeze_at = freeze_at
         self.freeze_at = freeze_at
         self.return_idx = return_idx
         self.return_idx = return_idx
         self.upsample = upsample
         self.upsample = upsample
+        self.downsample = downsample
 
 
         self.channels = {
         self.channels = {
             18: [[18, 36], [18, 36, 72], [18, 36, 72, 144]],
             18: [[18, 36], [18, 36, 72], [18, 36, 72, 144]],
@@ -613,6 +644,7 @@ class HRNet(nn.Layer):
             ch_out=64,
             ch_out=64,
             filter_size=3,
             filter_size=3,
             stride=2,
             stride=2,
+            norm_momentum=norm_momentum,
             norm_decay=norm_decay,
             norm_decay=norm_decay,
             freeze_norm=freeze_norm,
             freeze_norm=freeze_norm,
             act='relu',
             act='relu',
@@ -623,6 +655,7 @@ class HRNet(nn.Layer):
             ch_out=64,
             ch_out=64,
             filter_size=3,
             filter_size=3,
             stride=2,
             stride=2,
+            norm_momentum=norm_momentum,
             norm_decay=norm_decay,
             norm_decay=norm_decay,
             freeze_norm=freeze_norm,
             freeze_norm=freeze_norm,
             act='relu',
             act='relu',
@@ -631,6 +664,7 @@ class HRNet(nn.Layer):
         self.la1 = Layer1(
         self.la1 = Layer1(
             num_channels=64,
             num_channels=64,
             has_se=has_se,
             has_se=has_se,
+            norm_momentum=norm_momentum,
             norm_decay=norm_decay,
             norm_decay=norm_decay,
             freeze_norm=freeze_norm,
             freeze_norm=freeze_norm,
             name="layer2")
             name="layer2")
@@ -638,6 +672,7 @@ class HRNet(nn.Layer):
         self.tr1 = TransitionLayer(
         self.tr1 = TransitionLayer(
             in_channels=[256],
             in_channels=[256],
             out_channels=channels_2,
             out_channels=channels_2,
+            norm_momentum=norm_momentum,
             norm_decay=norm_decay,
             norm_decay=norm_decay,
             freeze_norm=freeze_norm,
             freeze_norm=freeze_norm,
             name="tr1")
             name="tr1")
@@ -647,6 +682,7 @@ class HRNet(nn.Layer):
             num_modules=num_modules_2,
             num_modules=num_modules_2,
             num_filters=channels_2,
             num_filters=channels_2,
             has_se=self.has_se,
             has_se=self.has_se,
+            norm_momentum=norm_momentum,
             norm_decay=norm_decay,
             norm_decay=norm_decay,
             freeze_norm=freeze_norm,
             freeze_norm=freeze_norm,
             name="st2")
             name="st2")
@@ -654,6 +690,7 @@ class HRNet(nn.Layer):
         self.tr2 = TransitionLayer(
         self.tr2 = TransitionLayer(
             in_channels=channels_2,
             in_channels=channels_2,
             out_channels=channels_3,
             out_channels=channels_3,
+            norm_momentum=norm_momentum,
             norm_decay=norm_decay,
             norm_decay=norm_decay,
             freeze_norm=freeze_norm,
             freeze_norm=freeze_norm,
             name="tr2")
             name="tr2")
@@ -663,6 +700,7 @@ class HRNet(nn.Layer):
             num_modules=num_modules_3,
             num_modules=num_modules_3,
             num_filters=channels_3,
             num_filters=channels_3,
             has_se=self.has_se,
             has_se=self.has_se,
+            norm_momentum=norm_momentum,
             norm_decay=norm_decay,
             norm_decay=norm_decay,
             freeze_norm=freeze_norm,
             freeze_norm=freeze_norm,
             name="st3")
             name="st3")
@@ -670,6 +708,7 @@ class HRNet(nn.Layer):
         self.tr3 = TransitionLayer(
         self.tr3 = TransitionLayer(
             in_channels=channels_3,
             in_channels=channels_3,
             out_channels=channels_4,
             out_channels=channels_4,
+            norm_momentum=norm_momentum,
             norm_decay=norm_decay,
             norm_decay=norm_decay,
             freeze_norm=freeze_norm,
             freeze_norm=freeze_norm,
             name="tr3")
             name="tr3")
@@ -678,11 +717,106 @@ class HRNet(nn.Layer):
             num_modules=num_modules_4,
             num_modules=num_modules_4,
             num_filters=channels_4,
             num_filters=channels_4,
             has_se=self.has_se,
             has_se=self.has_se,
+            norm_momentum=norm_momentum,
             norm_decay=norm_decay,
             norm_decay=norm_decay,
             freeze_norm=freeze_norm,
             freeze_norm=freeze_norm,
             multi_scale_output=len(return_idx) > 1,
             multi_scale_output=len(return_idx) > 1,
             name="st4")
             name="st4")
 
 
+        if self.downsample:
+            self.incre_modules, self.downsamp_modules, \
+                self.final_layer = self._make_head(channels_4, norm_momentum=norm_momentum, has_se=self.has_se)
+
+    def _make_layer(self,
+                    block,
+                    inplanes,
+                    planes,
+                    blocks,
+                    stride=1,
+                    norm_momentum=0.9,
+                    has_se=False,
+                    name=None):
+        downsample = None
+        if stride != 1 or inplanes != planes * 4:
+            downsample = True
+
+        layers = []
+        layers.append(
+            block(
+                inplanes,
+                planes,
+                has_se,
+                stride,
+                downsample,
+                norm_momentum=norm_momentum,
+                freeze_norm=False,
+                name=name + "_s0"))
+        inplanes = planes * 4
+        for i in range(1, blocks):
+            layers.append(
+                block(
+                    inplanes,
+                    planes,
+                    has_se,
+                    norm_momentum=norm_momentum,
+                    freeze_norm=False,
+                    name=name + "_s" + str(i)))
+
+        return nn.Sequential(*layers)
+
+    def _make_head(self, pre_stage_channels, norm_momentum=0.9, has_se=False):
+        head_block = BottleneckBlock
+        head_channels = [32, 64, 128, 256]
+
+        # Increase the number of channels at each resolution
+        # from C, 2C, 4C, 8C to 128, 256, 512, 1024
+        incre_modules = []
+        for i, channels in enumerate(pre_stage_channels):
+            incre_module = self._make_layer(
+                head_block,
+                channels,
+                head_channels[i],
+                1,
+                stride=1,
+                norm_momentum=norm_momentum,
+                has_se=has_se,
+                name='incre' + str(i))
+            incre_modules.append(incre_module)
+        incre_modules = nn.LayerList(incre_modules)
+
+        # downsampling modules
+        downsamp_modules = []
+        for i in range(len(pre_stage_channels) - 1):
+            in_channels = head_channels[i] * 4
+            out_channels = head_channels[i + 1] * 4
+
+            downsamp_module = nn.Sequential(
+                nn.Conv2D(
+                    in_channels=in_channels,
+                    out_channels=out_channels,
+                    kernel_size=3,
+                    stride=2,
+                    padding=1),
+                nn.BatchNorm2D(
+                    out_channels, momentum=norm_momentum),
+                nn.ReLU())
+
+            downsamp_modules.append(downsamp_module)
+        downsamp_modules = nn.LayerList(downsamp_modules)
+
+        final_layer = nn.Sequential(
+            nn.Conv2D(
+                in_channels=head_channels[3] * 4,
+                out_channels=2048,
+                kernel_size=1,
+                stride=1,
+                padding=0),
+            nn.BatchNorm2D(
+                2048, momentum=norm_momentum),
+            nn.ReLU())
+
+        return incre_modules, downsamp_modules, final_layer
+
     def forward(self, inputs):
     def forward(self, inputs):
         x = inputs['image']
         x = inputs['image']
         conv1 = self.conv_layer1_1(x)
         conv1 = self.conv_layer1_1(x)
@@ -707,6 +841,14 @@ class HRNet(nn.Layer):
             x = paddle.concat([st4[0], x1, x2, x3], 1)
             x = paddle.concat([st4[0], x1, x2, x3], 1)
             return x
             return x
 
 
+        if self.downsample:
+            y = self.incre_modules[0](st4[0])
+            for i in range(len(self.downsamp_modules)):
+                y = self.incre_modules[i+1](st4[i+1]) + \
+                            self.downsamp_modules[i](y)
+            y = self.final_layer(y)
+            return y
+
         res = []
         res = []
         for i, layer in enumerate(st4):
         for i, layer in enumerate(st4):
             if i == self.freeze_at:
             if i == self.freeze_at:
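
With downsample=True, HRNet gains the classification-style head from the original paper: each of the four st4 branches is first widened by a BottleneckBlock stage (to 128, 256, 512 and 1024 channels after the 4x expansion), the branches are then merged top-down with strided 3x3 convolutions, and a final 1x1 convolution lifts the result to 2048 channels. A shape-only sketch of that flow, using bare Conv2D layers and assumed HRNetV2-W18 branch shapes rather than the real ppdet blocks:

    import paddle
    import paddle.nn as nn

    branches = [paddle.rand([1, c, 64 // s, 64 // s])
                for c, s in zip([18, 36, 72, 144], [1, 2, 4, 8])]

    incre = nn.LayerList([nn.Conv2D(c, o, 1)
                          for c, o in zip([18, 36, 72, 144], [128, 256, 512, 1024])])
    downsamp = nn.LayerList([nn.Conv2D(i, o, 3, stride=2, padding=1)
                             for i, o in zip([128, 256, 512], [256, 512, 1024])])
    final = nn.Conv2D(1024, 2048, 1)

    y = incre[0](branches[0])
    for i in range(len(downsamp)):
        y = incre[i + 1](branches[i + 1]) + downsamp[i](y)
    y = final(y)
    print(y.shape)   # [1, 2048, 8, 8], ready for a global pool + classifier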

+ 5 - 0
paddlers/models/ppdet/modeling/backbones/lite_hrnet.py

@@ -854,6 +854,11 @@ class LiteHRNet(nn.Layer):
 
 
     def forward(self, inputs):
     def forward(self, inputs):
         x = inputs['image']
         x = inputs['image']
+        dims = x.shape
+        if len(dims) == 5:
+            x = paddle.reshape(x, (dims[0] * dims[1], dims[2], dims[3],
+                                   dims[4]))  # [6, 3, 128, 96]
+
         x = self.stem(x)
         x = self.stem(x)
         y_list = [x]
         y_list = [x]
         for stage_idx in range(3):
         for stage_idx in range(3):
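
The new lines let LiteHRNet accept a 5-D batch (for example several crops or frames per sample) by folding the extra leading dimension into the batch axis before the stem; the [6, 3, 128, 96] comment matches a 2 x 3 batch of 128 x 96 crops. A minimal sketch of just that reshape:

    import paddle

    x = paddle.rand([2, 3, 3, 128, 96])      # [B, T, C, H, W]
    dims = x.shape
    if len(dims) == 5:
        x = paddle.reshape(x, (dims[0] * dims[1], dims[2], dims[3], dims[4]))
    print(x.shape)                            # [6, 3, 128, 96]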

+ 30 - 30
paddlers/models/ppdet/modeling/backbones/resnet.py

@@ -285,36 +285,6 @@ class BottleNeck(nn.Layer):
         # ResNeXt
         # ResNeXt
         width = int(ch_out * (base_width / 64.)) * groups
         width = int(ch_out * (base_width / 64.)) * groups
 
 
-        self.shortcut = shortcut
-        if not shortcut:
-            if variant == 'd' and stride == 2:
-                self.short = nn.Sequential()
-                self.short.add_sublayer(
-                    'pool',
-                    nn.AvgPool2D(
-                        kernel_size=2, stride=2, padding=0, ceil_mode=True))
-                self.short.add_sublayer(
-                    'conv',
-                    ConvNormLayer(
-                        ch_in=ch_in,
-                        ch_out=ch_out * self.expansion,
-                        filter_size=1,
-                        stride=1,
-                        norm_type=norm_type,
-                        norm_decay=norm_decay,
-                        freeze_norm=freeze_norm,
-                        lr=lr))
-            else:
-                self.short = ConvNormLayer(
-                    ch_in=ch_in,
-                    ch_out=ch_out * self.expansion,
-                    filter_size=1,
-                    stride=stride,
-                    norm_type=norm_type,
-                    norm_decay=norm_decay,
-                    freeze_norm=freeze_norm,
-                    lr=lr)
-
         self.branch2a = ConvNormLayer(
         self.branch2a = ConvNormLayer(
             ch_in=ch_in,
             ch_in=ch_in,
             ch_out=width,
             ch_out=width,
@@ -351,6 +321,36 @@ class BottleNeck(nn.Layer):
             freeze_norm=freeze_norm,
             freeze_norm=freeze_norm,
             lr=lr)
             lr=lr)
 
 
+        self.shortcut = shortcut
+        if not shortcut:
+            if variant == 'd' and stride == 2:
+                self.short = nn.Sequential()
+                self.short.add_sublayer(
+                    'pool',
+                    nn.AvgPool2D(
+                        kernel_size=2, stride=2, padding=0, ceil_mode=True))
+                self.short.add_sublayer(
+                    'conv',
+                    ConvNormLayer(
+                        ch_in=ch_in,
+                        ch_out=ch_out * self.expansion,
+                        filter_size=1,
+                        stride=1,
+                        norm_type=norm_type,
+                        norm_decay=norm_decay,
+                        freeze_norm=freeze_norm,
+                        lr=lr))
+            else:
+                self.short = ConvNormLayer(
+                    ch_in=ch_in,
+                    ch_out=ch_out * self.expansion,
+                    filter_size=1,
+                    stride=stride,
+                    norm_type=norm_type,
+                    norm_decay=norm_decay,
+                    freeze_norm=freeze_norm,
+                    lr=lr)
+
         self.std_senet = std_senet
         self.std_senet = std_senet
         if self.std_senet:
         if self.std_senet:
             self.se = SELayer(ch_out * self.expansion)
             self.se = SELayer(ch_out * self.expansion)

+ 381 - 0
paddlers/models/ppdet/modeling/backbones/trans_encoder.py

@@ -0,0 +1,381 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle
+import paddle.nn as nn
+import paddle.nn.functional as F
+from paddle.nn import ReLU, Swish, GELU
+import math
+
+from paddlers.models.ppdet.core.workspace import register
+from ..shape_spec import ShapeSpec
+
+__all__ = ['TransEncoder']
+
+
+class BertEmbeddings(nn.Layer):
+    def __init__(self, word_size, position_embeddings_size, word_type_size,
+                 hidden_size, dropout_prob):
+        super(BertEmbeddings, self).__init__()
+        self.word_embeddings = nn.Embedding(
+            word_size, hidden_size, padding_idx=0)
+        self.position_embeddings = nn.Embedding(position_embeddings_size,
+                                                hidden_size)
+        self.token_type_embeddings = nn.Embedding(word_type_size, hidden_size)
+        self.layernorm = nn.LayerNorm(hidden_size, epsilon=1e-8)
+        self.dropout = nn.Dropout(dropout_prob)
+
+    def forward(self, x, token_type_ids=None, position_ids=None):
+        seq_len = paddle.shape(x)[1]
+        if position_ids is None:
+            position_ids = paddle.arange(seq_len).unsqueeze(0).expand_as(x)
+        if token_type_ids is None:
+            token_type_ids = paddle.zeros(paddle.shape(x))
+
+        word_embs = self.word_embeddings(x)
+        position_embs = self.position_embeddings(position_ids)
+        token_type_embs = self.token_type_embeddings(token_type_ids)
+
+        embs_cmb = word_embs + position_embs + token_type_embs
+        embs_out = self.layernorm(embs_cmb)
+        embs_out = self.dropout(embs_out)
+        return embs_out
+
+
+class BertSelfAttention(nn.Layer):
+    def __init__(self,
+                 hidden_size,
+                 num_attention_heads,
+                 attention_probs_dropout_prob,
+                 output_attentions=False):
+        super(BertSelfAttention, self).__init__()
+        if hidden_size % num_attention_heads != 0:
+            raise ValueError(
+                "The hidden_size must be a multiple of the number of attention "
+                "heads, but got {} % {} != 0".format(
+                    hidden_size, num_attention_heads))
+
+        self.num_attention_heads = num_attention_heads
+        self.attention_head_size = int(hidden_size / num_attention_heads)
+        self.all_head_size = self.num_attention_heads * self.attention_head_size
+
+        self.query = nn.Linear(hidden_size, self.all_head_size)
+        self.key = nn.Linear(hidden_size, self.all_head_size)
+        self.value = nn.Linear(hidden_size, self.all_head_size)
+
+        self.dropout = nn.Dropout(attention_probs_dropout_prob)
+        self.output_attentions = output_attentions
+
+    def forward(self, x, attention_mask, head_mask=None):
+        query = self.query(x)
+        key = self.key(x)
+        value = self.value(x)
+
+        query_dim1, query_dim2 = paddle.shape(query)[:-1]
+        new_shape = [
+            query_dim1, query_dim2, self.num_attention_heads,
+            self.attention_head_size
+        ]
+        query = query.reshape(new_shape).transpose(perm=(0, 2, 1, 3))
+        key = key.reshape(new_shape).transpose(perm=(0, 2, 3, 1))
+        value = value.reshape(new_shape).transpose(perm=(0, 2, 1, 3))
+
+        attention = paddle.matmul(query,
+                                  key) / math.sqrt(self.attention_head_size)
+        attention = attention + attention_mask
+        attention_value = F.softmax(attention, axis=-1)
+        attention_value = self.dropout(attention_value)
+
+        if head_mask is not None:
+            attention_value = attention_value * head_mask
+
+        context = paddle.matmul(attention_value, value).transpose(perm=(0, 2, 1,
+                                                                        3))
+        ctx_dim1, ctx_dim2 = paddle.shape(context)[:-2]
+        new_context_shape = [
+            ctx_dim1,
+            ctx_dim2,
+            self.all_head_size,
+        ]
+        context = context.reshape(new_context_shape)
+
+        if self.output_attentions:
+            return (context, attention_value)
+        else:
+            return (context, )
+
+
+class BertAttention(nn.Layer):
+    def __init__(self,
+                 hidden_size,
+                 num_attention_heads,
+                 attention_probs_dropout_prob,
+                 fc_dropout_prob,
+                 output_attentions=False):
+        super(BertAttention, self).__init__()
+        self.bert_selfattention = BertSelfAttention(
+            hidden_size, num_attention_heads, attention_probs_dropout_prob,
+            output_attentions)
+        self.fc = nn.Linear(hidden_size, hidden_size)
+        self.layernorm = nn.LayerNorm(hidden_size, epsilon=1e-8)
+        self.dropout = nn.Dropout(fc_dropout_prob)
+
+    def forward(self, x, attention_mask, head_mask=None):
+        attention_feats = self.bert_selfattention(x, attention_mask, head_mask)
+        features = self.fc(attention_feats[0])
+        features = self.dropout(features)
+        features = self.layernorm(features + x)
+        if len(attention_feats) == 2:
+            return (features, attention_feats[1])
+        else:
+            return (features, )
+
+
+class BertFeedForward(nn.Layer):
+    def __init__(self,
+                 hidden_size,
+                 intermediate_size,
+                 num_attention_heads,
+                 attention_probs_dropout_prob,
+                 fc_dropout_prob,
+                 act_fn='ReLU',
+                 output_attentions=False):
+        super(BertFeedForward, self).__init__()
+        self.fc1 = nn.Linear(hidden_size, intermediate_size)
+        self.act_fn = eval(act_fn)
+        self.fc2 = nn.Linear(intermediate_size, hidden_size)
+        self.layernorm = nn.LayerNorm(hidden_size, epsilon=1e-8)
+        self.dropout = nn.Dropout(fc_dropout_prob)
+
+    def forward(self, x):
+        features = self.fc1(x)
+        features = self.act_fn(features)
+        features = self.fc2(features)
+        features = self.dropout(features)
+        features = self.layernorm(features + x)
+        return features
+
+
+class BertLayer(nn.Layer):
+    def __init__(self,
+                 hidden_size,
+                 intermediate_size,
+                 num_attention_heads,
+                 attention_probs_dropout_prob,
+                 fc_dropout_prob,
+                 act_fn='ReLU',
+                 output_attentions=False):
+        super(BertLayer, self).__init__()
+        self.attention = BertAttention(hidden_size, num_attention_heads,
+                                       attention_probs_dropout_prob,
+                                       output_attentions)
+        self.feed_forward = BertFeedForward(
+            hidden_size, intermediate_size, num_attention_heads,
+            attention_probs_dropout_prob, fc_dropout_prob, act_fn,
+            output_attentions)
+
+    def forward(self, x, attention_mask, head_mask=None):
+        attention_feats = self.attention(x, attention_mask, head_mask)
+        features = self.feed_forward(attention_feats[0])
+        if len(attention_feats) == 2:
+            return (features, attention_feats[1])
+        else:
+            return (features, )
+
+
+class BertEncoder(nn.Layer):
+    def __init__(self,
+                 num_hidden_layers,
+                 hidden_size,
+                 intermediate_size,
+                 num_attention_heads,
+                 attention_probs_dropout_prob,
+                 fc_dropout_prob,
+                 act_fn='ReLU',
+                 output_attentions=False,
+                 output_hidden_feats=False):
+        super(BertEncoder, self).__init__()
+        self.output_attentions = output_attentions
+        self.output_hidden_feats = output_hidden_feats
+        self.layers = nn.LayerList([
+            BertLayer(hidden_size, intermediate_size, num_attention_heads,
+                      attention_probs_dropout_prob, fc_dropout_prob, act_fn,
+                      output_attentions) for _ in range(num_hidden_layers)
+        ])
+
+    def forward(self, x, attention_mask, head_mask=None):
+        all_features = (x, )
+        all_attentions = ()
+
+        for i, layer in enumerate(self.layers):
+            mask = head_mask[i] if head_mask is not None else None
+            layer_out = layer(x, attention_mask, mask)
+
+            if self.output_hidden_feats:
+                all_features = all_features + (x, )
+            x = layer_out[0]
+            if self.output_attentions:
+                all_attentions = all_attentions + (layer_out[1], )
+
+        outputs = (x, )
+        if self.output_hidden_feats:
+            outputs += (all_features, )
+        if self.output_attentions:
+            outputs += (all_attentions, )
+        return outputs
+
+
+class BertPooler(nn.Layer):
+    def __init__(self, hidden_size):
+        super(BertPooler, self).__init__()
+        self.fc = nn.Linear(hidden_size, hidden_size)
+        self.act = nn.Tanh()
+
+    def forward(self, x):
+        first_token = x[:, 0]
+        pooled_output = self.fc(first_token)
+        pooled_output = self.act(pooled_output)
+        return pooled_output
+
+
+class METROEncoder(nn.Layer):
+    def __init__(self,
+                 vocab_size,
+                 num_hidden_layers,
+                 features_dims,
+                 position_embeddings_size,
+                 hidden_size,
+                 intermediate_size,
+                 output_feature_dim,
+                 num_attention_heads,
+                 attention_probs_dropout_prob,
+                 fc_dropout_prob,
+                 act_fn='ReLU',
+                 output_attentions=False,
+                 output_hidden_feats=False,
+                 use_img_layernorm=False):
+        super(METROEncoder, self).__init__()
+        self.img_dims = features_dims
+        self.num_hidden_layers = num_hidden_layers
+        self.use_img_layernorm = use_img_layernorm
+        self.output_attentions = output_attentions
+        self.embedding = BertEmbeddings(vocab_size, position_embeddings_size, 2,
+                                        hidden_size, fc_dropout_prob)
+        self.encoder = BertEncoder(
+            num_hidden_layers, hidden_size, intermediate_size,
+            num_attention_heads, attention_probs_dropout_prob, fc_dropout_prob,
+            act_fn, output_attentions, output_hidden_feats)
+        self.pooler = BertPooler(hidden_size)
+        self.position_embeddings = nn.Embedding(position_embeddings_size,
+                                                hidden_size)
+        self.img_embedding = nn.Linear(
+            features_dims, hidden_size, bias_attr=True)
+        self.dropout = nn.Dropout(fc_dropout_prob)
+        self.cls_head = nn.Linear(hidden_size, output_feature_dim)
+        self.residual = nn.Linear(features_dims, output_feature_dim)
+
+        self.apply(self.init_weights)
+
+    def init_weights(self, module):
+        """ Initialize the weights.
+        """
+        if isinstance(module, (nn.Linear, nn.Embedding)):
+            module.weight.set_value(
+                paddle.normal(
+                    mean=0.0, std=0.02, shape=module.weight.shape))
+        elif isinstance(module, nn.LayerNorm):
+            module.bias.set_value(paddle.zeros(shape=module.bias.shape))
+            module.weight.set_value(
+                paddle.full(
+                    shape=module.weight.shape, fill_value=1.0))
+        if isinstance(module, nn.Linear) and module.bias is not None:
+            module.bias.set_value(paddle.zeros(shape=module.bias.shape))
+
+    def forward(self, x):
+        batchsize, seq_len = paddle.shape(x)[:2]
+        input_ids = paddle.zeros((batchsize, seq_len), dtype="int64")
+        position_ids = paddle.arange(
+            seq_len, dtype="int64").unsqueeze(0).expand_as(input_ids)
+
+        attention_mask = paddle.ones_like(input_ids).unsqueeze(1).unsqueeze(2)
+        head_mask = [None] * self.num_hidden_layers
+
+        position_embs = self.position_embeddings(position_ids)
+        attention_mask = (1.0 - attention_mask) * -10000.0
+
+        img_features = self.img_embedding(x)
+
+        # We empirically observe that adding an additional learnable position embedding leads to more stable training
+        embeddings = position_embs + img_features
+        if self.use_img_layernorm:
+            embeddings = self.layernorm(embeddings)
+        embeddings = self.dropout(embeddings)
+
+        encoder_outputs = self.encoder(
+            embeddings, attention_mask, head_mask=head_mask)
+
+        pred_score = self.cls_head(encoder_outputs[0])
+        res_img_feats = self.residual(x)
+        pred_score = pred_score + res_img_feats
+
+        if self.output_attentions and self.output_hidden_feats:
+            return pred_score, encoder_outputs[1], encoder_outputs[-1]
+        else:
+            return pred_score
+
+
+def gelu(x):
+    """Implementation of the gelu activation function.
+        https://arxiv.org/abs/1606.08415
+    """
+    return x * 0.5 * (1.0 + paddle.erf(x / math.sqrt(2.0)))
+
+
+@register
+class TransEncoder(nn.Layer):
+    def __init__(self,
+                 vocab_size=30522,
+                 num_hidden_layers=4,
+                 num_attention_heads=4,
+                 position_embeddings_size=512,
+                 intermediate_size=3072,
+                 input_feat_dim=[2048, 512, 128],
+                 hidden_feat_dim=[1024, 256, 128],
+                 attention_probs_dropout_prob=0.1,
+                 fc_dropout_prob=0.1,
+                 act_fn='gelu',
+                 output_attentions=False,
+                 output_hidden_feats=False):
+        super(TransEncoder, self).__init__()
+        output_feat_dim = input_feat_dim[1:] + [3]
+        trans_encoder = []
+        for i in range(len(output_feat_dim)):
+            features_dims = input_feat_dim[i]
+            output_feature_dim = output_feat_dim[i]
+            hidden_size = hidden_feat_dim[i]
+
+            # init a transformer encoder and append it to a list
+            assert hidden_size % num_attention_heads == 0
+            model = METROEncoder(vocab_size, num_hidden_layers, features_dims,
+                                 position_embeddings_size, hidden_size,
+                                 intermediate_size, output_feature_dim,
+                                 num_attention_heads,
+                                 attention_probs_dropout_prob, fc_dropout_prob,
+                                 act_fn, output_attentions, output_hidden_feats)
+            trans_encoder.append(model)
+        self.trans_encoder = paddle.nn.Sequential(*trans_encoder)
+
+    def forward(self, x):
+        out = self.trans_encoder(x)
+        return out
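
METROEncoder builds an all-ones attention mask and turns it into an additive bias with (1.0 - mask) * -10000.0, the usual BERT-style trick: positions to be ignored receive a large negative logit so their softmax weight collapses to ~0. A toy illustration of the trick, with made-up shapes rather than the ones the backbone actually sees:

    import paddle
    import paddle.nn.functional as F

    scores = paddle.rand([1, 4, 4])               # [batch, query, key] attention logits
    mask = paddle.to_tensor([[1., 1., 1., 0.]])   # last key position is padding
    bias = (1.0 - mask) * -10000.0                # 0 for real tokens, -10000 for padding
    weights = F.softmax(scores + bias.unsqueeze(1), axis=-1)
    print(weights[0, 0])                          # last entry is numerically zero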

+ 29 - 11
paddlers/models/ppdet/modeling/backbones/vision_transformer.py

@@ -284,9 +284,9 @@ class RelativePositionBias(nn.Layer):
 
 
     def forward(self):
     def forward(self):
         relative_position_bias = \
         relative_position_bias = \
-            self.relative_position_bias_table[self.relative_position_index.view(-1)].view(
+            self.relative_position_bias_table[self.relative_position_index.reshape([-1])].reshape([
                  self.window_size[0] * self.window_size[1] + 1,
                  self.window_size[0] * self.window_size[1] + 1,
-                 self.window_size[0] * self.window_size[1] + 1, -1)  # Wh*Ww,Wh*Ww,nH 
+                 self.window_size[0] * self.window_size[1] + 1, -1])  # Wh*Ww,Wh*Ww,nH 
         return relative_position_bias.transpose((2, 0, 1))  # nH, Wh*Ww, Wh*Ww
         return relative_position_bias.transpose((2, 0, 1))  # nH, Wh*Ww, Wh*Ww
 
 
 
 
@@ -340,6 +340,7 @@ class VisionTransformer(nn.Layer):
                  use_abs_pos_emb=False,
                  use_abs_pos_emb=False,
                  use_sincos_pos_emb=True,
                  use_sincos_pos_emb=True,
                  with_fpn=True,
                  with_fpn=True,
+                 num_fpn_levels=4,
                  use_checkpoint=False,
                  use_checkpoint=False,
                  **args):
                  **args):
         super().__init__()
         super().__init__()
@@ -350,6 +351,8 @@ class VisionTransformer(nn.Layer):
         self.use_sincos_pos_emb = use_sincos_pos_emb
         self.use_sincos_pos_emb = use_sincos_pos_emb
         self.use_rel_pos_bias = use_rel_pos_bias
         self.use_rel_pos_bias = use_rel_pos_bias
         self.final_norm = final_norm
         self.final_norm = final_norm
+        self.out_indices = out_indices
+        self.num_fpn_levels = num_fpn_levels
 
 
         if use_checkpoint:
         if use_checkpoint:
             paddle.seed(0)
             paddle.seed(0)
@@ -415,14 +418,15 @@ class VisionTransformer(nn.Layer):
 
 
         assert len(out_indices) <= 4, ''
         assert len(out_indices) <= 4, ''
         self.out_indices = out_indices
         self.out_indices = out_indices
-        self.out_channels = [embed_dim for _ in range(len(out_indices))]
-        self.out_strides = [4, 8, 16, 32][-len(out_indices):] if with_fpn else [
-            8 for _ in range(len(out_indices))
+        self.out_channels = [embed_dim for _ in range(num_fpn_levels)]
+        self.out_strides = [4, 8, 16, 32][-num_fpn_levels:] if with_fpn else [
+            patch_size for _ in range(len(out_indices))
         ]
         ]
 
 
         self.norm = Identity()
         self.norm = Identity()
 
 
         if self.with_fpn:
         if self.with_fpn:
+            assert num_fpn_levels <= 4, ''
             self.init_fpn(
             self.init_fpn(
                 embed_dim=embed_dim,
                 embed_dim=embed_dim,
                 patch_size=patch_size, )
                 patch_size=patch_size, )
@@ -505,16 +509,24 @@ class VisionTransformer(nn.Layer):
         dim = x.shape[-1]
         dim = x.shape[-1]
         # we add a small number to avoid floating point error in the interpolation
         # we add a small number to avoid floating point error in the interpolation
         # see discussion at https://github.com/facebookresearch/dino/issues/8
         # see discussion at https://github.com/facebookresearch/dino/issues/8
-        w0, h0 = w0 + 0.1, h0 + 0.1
+        # w0, h0 = w0 + 0.1, h0 + 0.1
+        # patch_pos_embed = nn.functional.interpolate(
+        #     patch_pos_embed.reshape([
+        #         1, self.patch_embed.num_patches_w,
+        #         self.patch_embed.num_patches_h, dim
+        #     ]).transpose((0, 3, 1, 2)),
+        #     scale_factor=(w0 / self.patch_embed.num_patches_w,
+        #                   h0 / self.patch_embed.num_patches_h),
+        #     mode='bicubic', )
 
 
         patch_pos_embed = nn.functional.interpolate(
         patch_pos_embed = nn.functional.interpolate(
             patch_pos_embed.reshape([
             patch_pos_embed.reshape([
                 1, self.patch_embed.num_patches_w,
                 1, self.patch_embed.num_patches_w,
                 self.patch_embed.num_patches_h, dim
                 self.patch_embed.num_patches_h, dim
             ]).transpose((0, 3, 1, 2)),
             ]).transpose((0, 3, 1, 2)),
-            scale_factor=(w0 / self.patch_embed.num_patches_w,
-                          h0 / self.patch_embed.num_patches_h),
+            (w0, h0),
             mode='bicubic', )
             mode='bicubic', )
+
         assert int(w0) == patch_pos_embed.shape[-2] and int(
         assert int(w0) == patch_pos_embed.shape[-2] and int(
             h0) == patch_pos_embed.shape[-1]
             h0) == patch_pos_embed.shape[-1]
         patch_pos_embed = patch_pos_embed.transpose(
         patch_pos_embed = patch_pos_embed.transpose(
@@ -611,9 +623,15 @@ class VisionTransformer(nn.Layer):
                 feats.append(xp)
                 feats.append(xp)
 
 
         if self.with_fpn:
         if self.with_fpn:
-            fpns = [self.fpn1, self.fpn2, self.fpn3, self.fpn4]
-            for i in range(len(feats)):
-                feats[i] = fpns[i](feats[i])
+            fpns = [self.fpn1, self.fpn2, self.fpn3, self.fpn4][
+                -self.num_fpn_levels:]
+            assert len(fpns) == len(feats) or len(feats) == 1, ''
+            outputs = []
+            for i, m in enumerate(fpns):
+                outputs.append(
+                    m(feats[i] if len(feats) == len(fpns) else feats[-1]))
+
+            return outputs
 
 
         return feats
         return feats
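
With the new num_fpn_levels argument, only the last N of the four FPN necks are applied, either one per multi-scale feature or all to the single final feature map when out_indices yields just one. A minimal sketch of that selection logic, with plain functions standing in for the real fpn1..fpn4 necks:

    fpns = [lambda x, k=k: "fpn{}({})".format(k, x) for k in range(1, 5)]

    def select_and_apply(feats, num_fpn_levels):
        active = fpns[-num_fpn_levels:]                    # keep the last N necks
        assert len(active) == len(feats) or len(feats) == 1
        return [m(feats[i] if len(feats) == len(active) else feats[-1])
                for i, m in enumerate(active)]

    print(select_and_apply(["x"], num_fpn_levels=2))       # one feat fed to both necks
    print(select_and_apply(["a", "b"], num_fpn_levels=2))  # one feat per neck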
 
 

+ 99 - 94
paddlers/models/ppdet/modeling/bbox_utils.py

@@ -17,7 +17,9 @@ import paddle
 import numpy as np
 import numpy as np
 
 
 
 
-def bbox2delta(src_boxes, tgt_boxes, weights):
+def bbox2delta(src_boxes, tgt_boxes, weights=[1.0, 1.0, 1.0, 1.0]):
+    """Encode bboxes to deltas.
+    """
     src_w = src_boxes[:, 2] - src_boxes[:, 0]
     src_w = src_boxes[:, 2] - src_boxes[:, 0]
     src_h = src_boxes[:, 3] - src_boxes[:, 1]
     src_h = src_boxes[:, 3] - src_boxes[:, 1]
     src_ctr_x = src_boxes[:, 0] + 0.5 * src_w
     src_ctr_x = src_boxes[:, 0] + 0.5 * src_w
@@ -38,7 +40,11 @@ def bbox2delta(src_boxes, tgt_boxes, weights):
     return deltas
     return deltas
 
 
 
 
-def delta2bbox(deltas, boxes, weights):
+def delta2bbox(deltas, boxes, weights=[1.0, 1.0, 1.0, 1.0], max_shape=None):
+    """Decode deltas to boxes. Used in RCNNBox, CascadeHead, RCNNHead and RetinaHead.
+    Note: the returned tensor has shape [n, 1, 4].
+        If a reshape is needed, add it after the calling code instead of here.
+    """
     clip_scale = math.log(1000.0 / 16)
     clip_scale = math.log(1000.0 / 16)
 
 
     widths = boxes[:, 2] - boxes[:, 0]
     widths = boxes[:, 2] - boxes[:, 0]
@@ -67,6 +73,96 @@ def delta2bbox(deltas, boxes, weights):
     pred_boxes.append(pred_ctr_y + 0.5 * pred_h)
     pred_boxes.append(pred_ctr_y + 0.5 * pred_h)
     pred_boxes = paddle.stack(pred_boxes, axis=-1)
     pred_boxes = paddle.stack(pred_boxes, axis=-1)
 
 
+    if max_shape is not None:
+        pred_boxes[..., 0::2] = pred_boxes[..., 0::2].clip(
+            min=0, max=max_shape[1])
+        pred_boxes[..., 1::2] = pred_boxes[..., 1::2].clip(
+            min=0, max=max_shape[0])
+    return pred_boxes
+
+
+def bbox2delta_v2(src_boxes,
+                  tgt_boxes,
+                  delta_mean=[0.0, 0.0, 0.0, 0.0],
+                  delta_std=[1.0, 1.0, 1.0, 1.0]):
+    """Encode bboxes to deltas.
+    Modified from bbox2delta(), which only scales deltas with weight parameters;
+    this version normalizes them with delta_mean and delta_std.
+    """
+    src_w = src_boxes[:, 2] - src_boxes[:, 0]
+    src_h = src_boxes[:, 3] - src_boxes[:, 1]
+    src_ctr_x = src_boxes[:, 0] + 0.5 * src_w
+    src_ctr_y = src_boxes[:, 1] + 0.5 * src_h
+
+    tgt_w = tgt_boxes[:, 2] - tgt_boxes[:, 0]
+    tgt_h = tgt_boxes[:, 3] - tgt_boxes[:, 1]
+    tgt_ctr_x = tgt_boxes[:, 0] + 0.5 * tgt_w
+    tgt_ctr_y = tgt_boxes[:, 1] + 0.5 * tgt_h
+
+    dx = (tgt_ctr_x - src_ctr_x) / src_w
+    dy = (tgt_ctr_y - src_ctr_y) / src_h
+    dw = paddle.log(tgt_w / src_w)
+    dh = paddle.log(tgt_h / src_h)
+
+    deltas = paddle.stack((dx, dy, dw, dh), axis=1)
+    deltas = (
+        deltas - paddle.to_tensor(delta_mean)) / paddle.to_tensor(delta_std)
+    return deltas
+
+
+def delta2bbox_v2(deltas,
+                  boxes,
+                  delta_mean=[0.0, 0.0, 0.0, 0.0],
+                  delta_std=[1.0, 1.0, 1.0, 1.0],
+                  max_shape=None,
+                  ctr_clip=32.0):
+    """Decode deltas to bboxes.
+    Modified from delta2bbox(), which only rescales deltas with weight parameters;
+    this version normalizes them with delta_mean and delta_std. Used in YOLOFHead.
+    Note: the returned tensor has shape [n, 1, 4].
+        If a reshape is needed, add it after the calling code instead of here.
+    """
+    clip_scale = math.log(1000.0 / 16)
+
+    widths = boxes[:, 2] - boxes[:, 0]
+    heights = boxes[:, 3] - boxes[:, 1]
+    ctr_x = boxes[:, 0] + 0.5 * widths
+    ctr_y = boxes[:, 1] + 0.5 * heights
+
+    deltas = deltas * paddle.to_tensor(delta_std) + paddle.to_tensor(delta_mean)
+    dx = deltas[:, 0::4]
+    dy = deltas[:, 1::4]
+    dw = deltas[:, 2::4]
+    dh = deltas[:, 3::4]
+
+    # Prevent sending too large values into paddle.exp()
+    dx = dx * widths.unsqueeze(1)
+    dy = dy * heights.unsqueeze(1)
+    if ctr_clip is not None:
+        dx = paddle.clip(dx, max=ctr_clip, min=-ctr_clip)
+        dy = paddle.clip(dy, max=ctr_clip, min=-ctr_clip)
+        dw = paddle.clip(dw, max=clip_scale)
+        dh = paddle.clip(dh, max=clip_scale)
+    else:
+        dw = dw.clip(min=-clip_scale, max=clip_scale)
+        dh = dh.clip(min=-clip_scale, max=clip_scale)
+
+    pred_ctr_x = dx + ctr_x.unsqueeze(1)
+    pred_ctr_y = dy + ctr_y.unsqueeze(1)
+    pred_w = paddle.exp(dw) * widths.unsqueeze(1)
+    pred_h = paddle.exp(dh) * heights.unsqueeze(1)
+
+    pred_boxes = []
+    pred_boxes.append(pred_ctr_x - 0.5 * pred_w)
+    pred_boxes.append(pred_ctr_y - 0.5 * pred_h)
+    pred_boxes.append(pred_ctr_x + 0.5 * pred_w)
+    pred_boxes.append(pred_ctr_y + 0.5 * pred_h)
+    pred_boxes = paddle.stack(pred_boxes, axis=-1)
+
+    if max_shape is not None:
+        pred_boxes[..., 0::2] = pred_boxes[..., 0::2].clip(
+            min=0, max=max_shape[1])
+        pred_boxes[..., 1::2] = pred_boxes[..., 1::2].clip(
+            min=0, max=max_shape[0])
     return pred_boxes
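The new bbox2delta_v2 / delta2bbox_v2 pair normalizes the deltas with delta_mean / delta_std instead of the old per-coordinate weights, and the decoder clips both the center shift (ctr_clip) and the log-scale terms before exponentiation. A minimal round-trip sketch, assuming paddle is installed and the two functions are imported from this bbox_utils module (the box coordinates are made up for illustration):

import paddle
from paddlers.models.ppdet.modeling.bbox_utils import bbox2delta_v2, delta2bbox_v2

# anchors and matched targets in x1, y1, x2, y2 form (illustrative values)
anchors = paddle.to_tensor([[10., 10., 50., 60.], [20., 30., 80., 90.]])
targets = paddle.to_tensor([[12., 14., 48., 66.], [25., 28., 70., 95.]])

deltas = bbox2delta_v2(anchors, targets)               # shape [2, 4]
decoded = delta2bbox_v2(deltas, anchors,
                        max_shape=[128, 128],          # (h, w), used only for clipping
                        ctr_clip=32.0)                 # shape [2, 1, 4] per the docstring
print(paddle.allclose(decoded.squeeze(1), targets))    # ~True when no clipping triggers

The extra [n, 1, 4] dimension comes from the 0::4 slicing in the decoder, so callers reshape after the call, as the docstring suggests.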


@@ -269,8 +365,7 @@ def decode_yolo(box, anchor, downsample_ratio):
     x1 = (x + grid[:, :, :, :, 0:1]) / grid_w
     y1 = (y + grid[:, :, :, :, 1:2]) / grid_h

-    anchor = paddle.to_tensor(anchor)
-    anchor = paddle.cast(anchor, x.dtype)
+    anchor = paddle.to_tensor(anchor, dtype=x.dtype)
     anchor = anchor.reshape((1, na, 1, 1, 2))
     w1 = paddle.exp(w) * anchor[:, :, :, :, 0:1] / (downsample_ratio * grid_w)
     h1 = paddle.exp(h) * anchor[:, :, :, :, 1:2] / (downsample_ratio * grid_h)
@@ -489,96 +584,6 @@ def batch_distance2bbox(points, distance, max_shapes=None):
     return out_bbox


-def delta2bbox_v2(rois,
-                  deltas,
-                  means=(0.0, 0.0, 0.0, 0.0),
-                  stds=(1.0, 1.0, 1.0, 1.0),
-                  max_shape=None,
-                  wh_ratio_clip=16.0 / 1000.0,
-                  ctr_clip=None):
-    """Transform network output(delta) to bboxes.
-    Based on https://github.com/open-mmlab/mmdetection/blob/master/mmdet/core/
-             bbox/coder/delta_xywh_bbox_coder.py
-    Args:
-        rois (Tensor): shape [..., 4], base bboxes, typical examples include
-            anchor and rois
-        deltas (Tensor): shape [..., 4], offset relative to base bboxes
-        means (list[float]): the mean that was used to normalize deltas,
-            must be of size 4
-        stds (list[float]): the std that was used to normalize deltas,
-            must be of size 4
-        max_shape (list[float] or None): height and width of image, will be
-            used to clip bboxes if not None
-        wh_ratio_clip (float): to clip delta wh of decoded bboxes
-        ctr_clip (float or None): whether to clip delta xy of decoded bboxes
-    """
-    if rois.size == 0:
-        return paddle.empty_like(rois)
-    means = paddle.to_tensor(means)
-    stds = paddle.to_tensor(stds)
-    deltas = deltas * stds + means
-
-    dxy = deltas[..., :2]
-    dwh = deltas[..., 2:]
-
-    pxy = (rois[..., :2] + rois[..., 2:]) * 0.5
-    pwh = rois[..., 2:] - rois[..., :2]
-    dxy_wh = pwh * dxy
-
-    max_ratio = np.abs(np.log(wh_ratio_clip))
-    if ctr_clip is not None:
-        dxy_wh = paddle.clip(dxy_wh, max=ctr_clip, min=-ctr_clip)
-        dwh = paddle.clip(dwh, max=max_ratio)
-    else:
-        dwh = dwh.clip(min=-max_ratio, max=max_ratio)
-
-    gxy = pxy + dxy_wh
-    gwh = pwh * dwh.exp()
-    x1y1 = gxy - (gwh * 0.5)
-    x2y2 = gxy + (gwh * 0.5)
-    bboxes = paddle.concat([x1y1, x2y2], axis=-1)
-    if max_shape is not None:
-        bboxes[..., 0::2] = bboxes[..., 0::2].clip(min=0, max=max_shape[1])
-        bboxes[..., 1::2] = bboxes[..., 1::2].clip(min=0, max=max_shape[0])
-    return bboxes
-
-
-def bbox2delta_v2(src_boxes,
-                  tgt_boxes,
-                  means=(0.0, 0.0, 0.0, 0.0),
-                  stds=(1.0, 1.0, 1.0, 1.0)):
-    """Encode bboxes to deltas.
-    Modified from ppdet.modeling.bbox_utils.bbox2delta.
-    Args:
-        src_boxes (Tensor[..., 4]): base bboxes
-        tgt_boxes (Tensor[..., 4]): target bboxes
-        means (list[float]): the mean that will be used to normalize delta
-        stds (list[float]): the std that will be used to normalize delta
-    """
-    if src_boxes.size == 0:
-        return paddle.empty_like(src_boxes)
-    src_w = src_boxes[..., 2] - src_boxes[..., 0]
-    src_h = src_boxes[..., 3] - src_boxes[..., 1]
-    src_ctr_x = src_boxes[..., 0] + 0.5 * src_w
-    src_ctr_y = src_boxes[..., 1] + 0.5 * src_h
-
-    tgt_w = tgt_boxes[..., 2] - tgt_boxes[..., 0]
-    tgt_h = tgt_boxes[..., 3] - tgt_boxes[..., 1]
-    tgt_ctr_x = tgt_boxes[..., 0] + 0.5 * tgt_w
-    tgt_ctr_y = tgt_boxes[..., 1] + 0.5 * tgt_h
-
-    dx = (tgt_ctr_x - src_ctr_x) / src_w
-    dy = (tgt_ctr_y - src_ctr_y) / src_h
-    dw = paddle.log(tgt_w / src_w)
-    dh = paddle.log(tgt_h / src_h)
-
-    deltas = paddle.stack((dx, dy, dw, dh), axis=1)  # [n, 4]
-    means = paddle.to_tensor(means, place=src_boxes.place)
-    stds = paddle.to_tensor(stds, place=src_boxes.place)
-    deltas = (deltas - means) / stds
-    return deltas
-
-
 def iou_similarity(box1, box2, eps=1e-10):
 def iou_similarity(box1, box2, eps=1e-10):
     """Calculate iou of box1 and box2


+ 13 - 0
paddlers/models/ppdet/modeling/heads/__init__.py

@@ -33,6 +33,12 @@ from . import sparsercnn_head
 from . import tood_head
 from . import retina_head
 from . import ppyoloe_head
+from . import fcosr_head
+from . import ppyoloe_r_head
+from . import yolof_head
+from . import ppyoloe_contrast_head
+from . import centertrack_head
+from . import sparse_roi_head
 
 
 from .bbox_head import *
 from .mask_head import *
@@ -55,3 +61,10 @@ from .sparsercnn_head import *
 from .tood_head import *
 from .retina_head import *
 from .ppyoloe_head import *
+from .fcosr_head import *
+from .ppyoloe_r_head import *
+from .yolof_head import *
+from .ppyoloe_contrast_head import *
+from .centertrack_head import *
+from .sparse_roi_head import *
+from .petr_head import *

+ 51 - 12
paddlers/models/ppdet/modeling/heads/bbox_head.py

@@ -160,8 +160,8 @@ class XConvNormHead(nn.Layer):
 
 
 @register
 class BBoxHead(nn.Layer):
-    __shared__ = ['num_classes']
-    __inject__ = ['bbox_assigner', 'bbox_loss']
+    __shared__ = ['num_classes', 'use_cot']
+    __inject__ = ['bbox_assigner', 'bbox_loss', 'loss_cot']
     """
     """
     RCNN bbox head
     RCNN bbox head
 
 
@@ -173,7 +173,10 @@ class BBoxHead(nn.Layer):
             box.
         with_pool (bool): Whether to use pooling for the RoI feature.
         num_classes (int): The number of classes
-        bbox_weight (List[float]): The weight to get the decode box 
+        bbox_weight (List[float]): The weight to get the decode box
+        cot_classes (int): The number of base classes
+        loss_cot (object): The module of Label-cotuning
+        use_cot (bool): whether to use Label-cotuning
     """
     """
 
 
     def __init__(self,
     def __init__(self,
@@ -185,7 +188,10 @@ class BBoxHead(nn.Layer):
                  num_classes=80,
                  bbox_weight=[10., 10., 5., 5.],
                  bbox_loss=None,
-                 loss_normalize_pos=False):
+                 loss_normalize_pos=False,
+                 cot_classes=None,
+                 loss_cot='COTLoss',
+                 use_cot=False):
         super(BBoxHead, self).__init__()
         self.head = head
         self.roi_extractor = roi_extractor
@@ -199,11 +205,29 @@ class BBoxHead(nn.Layer):
         self.bbox_loss = bbox_loss
         self.loss_normalize_pos = loss_normalize_pos

-        self.bbox_score = nn.Linear(
-            in_channel,
-            self.num_classes + 1,
-            weight_attr=paddle.ParamAttr(initializer=Normal(
-                mean=0.0, std=0.01)))
+        self.loss_cot = loss_cot
+        self.cot_relation = None
+        self.cot_classes = cot_classes
+        self.use_cot = use_cot
+        if use_cot:
+            self.cot_bbox_score = nn.Linear(
+                in_channel,
+                self.num_classes + 1,
+                weight_attr=paddle.ParamAttr(initializer=Normal(
+                    mean=0.0, std=0.01)))
+            
+            self.bbox_score = nn.Linear(
+                in_channel,
+                self.cot_classes + 1,
+                weight_attr=paddle.ParamAttr(initializer=Normal(
+                    mean=0.0, std=0.01)))
+            self.cot_bbox_score.skip_quant = True
+        else:
+            self.bbox_score = nn.Linear(
+                in_channel,
+                self.num_classes + 1,
+                weight_attr=paddle.ParamAttr(initializer=Normal(
+                    mean=0.0, std=0.01)))
         self.bbox_score.skip_quant = True

         self.bbox_delta = nn.Linear(
@@ -215,6 +239,9 @@ class BBoxHead(nn.Layer):
         self.assigned_label = None
         self.assigned_rois = None

+    def init_cot_head(self, relationship):
+        self.cot_relation = relationship
+        
     @classmethod
     def from_config(cls, cfg, input_shape):
         roi_pooler = cfg['roi_extractor']
@@ -229,7 +256,7 @@ class BBoxHead(nn.Layer):
             'in_channel': head.out_shape[0].channels
         }

-    def forward(self, body_feats=None, rois=None, rois_num=None, inputs=None):
+    def forward(self, body_feats=None, rois=None, rois_num=None, inputs=None, cot=False):
         """
         """
         body_feats (list[Tensor]): Feature maps from backbone
         body_feats (list[Tensor]): Feature maps from backbone
         rois (list[Tensor]): RoIs generated from RPN module
         rois (list[Tensor]): RoIs generated from RPN module
@@ -248,7 +275,11 @@ class BBoxHead(nn.Layer):
             feat = paddle.squeeze(feat, axis=[2, 3])
         else:
             feat = bbox_feat
-        scores = self.bbox_score(feat)
+        if self.use_cot:
+            scores = self.cot_bbox_score(feat)
+            cot_scores = self.bbox_score(feat)
+        else:
+            scores = self.bbox_score(feat)
         deltas = self.bbox_delta(feat)

         if self.training:
@@ -259,11 +290,19 @@ class BBoxHead(nn.Layer):
                 rois,
                 self.bbox_weight,
                 loss_normalize_pos=self.loss_normalize_pos)
+            
+            if self.cot_relation is not None:
+                loss_cot = self.loss_cot(cot_scores, targets, self.cot_relation)
+                loss.update(loss_cot)
             return loss, bbox_feat
         else:
-            pred = self.get_prediction(scores, deltas)
+            if cot:
+                pred = self.get_prediction(cot_scores, deltas)
+            else:
+                pred = self.get_prediction(scores, deltas)
             return pred, self.head

+
     def get_loss(self,
                  scores,
                  deltas,
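The Label-cotuning changes give BBoxHead a second classification branch: cot_bbox_score predicts the regular num_classes + 1 scores, while bbox_score now covers the cot_classes + 1 base classes and is supervised by loss_cot once a class relationship is registered through init_cot_head(). A stripped-down sketch of that two-branch pattern, assuming only that paddle is installed (channel and class counts are placeholders; the real head also produces deltas, the RCNN losses, and predictions as shown above):

import paddle
import paddle.nn as nn

class TwoBranchScore(nn.Layer):
    """Illustration of the use_cot scoring layout, not the actual BBoxHead."""

    def __init__(self, in_channel=1024, num_classes=80, cot_classes=9):
        super().__init__()
        # regular scores over the target (novel) classes
        self.cot_bbox_score = nn.Linear(in_channel, num_classes + 1)
        # scores over the base classes, consumed by the co-tuning loss
        self.bbox_score = nn.Linear(in_channel, cot_classes + 1)

    def forward(self, feat, cot=False):
        scores = self.cot_bbox_score(feat)      # [N, num_classes + 1]
        cot_scores = self.bbox_score(feat)      # [N, cot_classes + 1]
        return cot_scores if cot else scores

feat = paddle.randn([4, 1024])
print(TwoBranchScore()(feat).shape)             # [4, 81]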

+ 42 - 40
paddlers/models/ppdet/modeling/heads/centernet_head.py

@@ -61,13 +61,12 @@ class CenterNetHead(nn.Layer):
         in_channels (int): the channel number of input to CenterNetHead.
         num_classes (int): the number of classes, 80 (COCO dataset) by default.
         head_planes (int): the channel number in all head, 256 by default.
-        heatmap_weight (float): the weight of heatmap loss, 1 by default.
+        prior_bias (float): prior bias in heatmap head, -2.19 by default, -4.6 in CenterTrack
         regress_ltrb (bool): whether to regress left/top/right/bottom or
         regress_ltrb (bool): whether to regress left/top/right/bottom or
-            width/height for a box, true by default
-        size_weight (float): the weight of box size loss, 0.1 by default.
-        size_loss (): the type of size regression loss, 'L1 loss' by default.
-        offset_weight (float): the weight of center offset loss, 1 by default.
-        iou_weight (float): the weight of iou head loss, 0 by default.
+            width/height for a box, True by default.
+        size_loss (str): the type of size regression loss, 'L1' by default, can be 'giou'.
+        loss_weight (dict): the weight of each loss.
+        add_iou (bool): whether to add iou branch, False by default.
     """
     """
 
 
     __shared__ = ['num_classes']
     __shared__ = ['num_classes']
@@ -76,20 +75,20 @@ class CenterNetHead(nn.Layer):
                  in_channels,
                  num_classes=80,
                  head_planes=256,
-                 heatmap_weight=1,
+                 prior_bias=-2.19,
                  regress_ltrb=True,
-                 size_weight=0.1,
                  size_loss='L1',
-                 offset_weight=1,
-                 iou_weight=0):
+                 loss_weight={
+                     'heatmap': 1.0,
+                     'size': 0.1,
+                     'offset': 1.0,
+                     'iou': 0.0,
+                 },
+                 add_iou=False):
         super(CenterNetHead, self).__init__()
         self.regress_ltrb = regress_ltrb
-        self.weights = {
-            'heatmap': heatmap_weight,
-            'size': size_weight,
-            'offset': offset_weight,
-            'iou': iou_weight
-        }
+        self.loss_weight = loss_weight
+        self.add_iou = add_iou
 
 
         # heatmap head
         self.heatmap = nn.Sequential(
@@ -104,7 +103,7 @@ class CenterNetHead(nn.Layer):
                 padding=0,
                 bias=True))
         with paddle.no_grad():
-            self.heatmap[2].conv.bias[:] = -2.19
+            self.heatmap[2].conv.bias[:] = prior_bias
 
 
         # size(ltrb or wh) head
         self.size = nn.Sequential(
@@ -129,7 +128,7 @@ class CenterNetHead(nn.Layer):
                 head_planes, 2, kernel_size=1, stride=1, padding=0, bias=True))

         # iou head (optinal)
-        if iou_weight > 0:
+        if self.add_iou and 'iou' in self.loss_weight:
             self.iou = nn.Sequential(
                 ConvLayer(
                     in_channels,
@@ -153,34 +152,34 @@ class CenterNetHead(nn.Layer):
         return {'in_channels': input_shape.channels}

     def forward(self, feat, inputs):
-        heatmap = self.heatmap(feat)
+        heatmap = F.sigmoid(self.heatmap(feat))
         size = self.size(feat)
         offset = self.offset(feat)
-        iou = self.iou(feat) if hasattr(self, 'iou_weight') else None
+        head_outs = {'heatmap': heatmap, 'size': size, 'offset': offset}
+        if self.add_iou and 'iou' in self.loss_weight:
+            iou = self.iou(feat)
+            head_outs.update({'iou': iou})
 
 
         if self.training:
-            loss = self.get_loss(
-                inputs, self.weights, heatmap, size, offset, iou=iou)
-            return loss
+            losses = self.get_loss(inputs, self.loss_weight, head_outs)
+            return losses
         else:
-            heatmap = F.sigmoid(heatmap)
-            head_outs = {'heatmap': heatmap, 'size': size, 'offset': offset}
-            if iou is not None:
-                head_outs.update({'iou': iou})
             return head_outs

-    def get_loss(self, inputs, weights, heatmap, size, offset, iou=None):
-        # heatmap head loss: CTFocalLoss
+    def get_loss(self, inputs, weights, head_outs):
+        # 1.heatmap(hm) head loss: CTFocalLoss
+        heatmap = head_outs['heatmap']
         heatmap_target = inputs['heatmap']
-        heatmap = paddle.clip(F.sigmoid(heatmap), 1e-4, 1 - 1e-4)
+        heatmap = paddle.clip(heatmap, 1e-4, 1 - 1e-4)
         ctfocal_loss = CTFocalLoss()
         heatmap_loss = ctfocal_loss(heatmap, heatmap_target)

-        # size head loss: L1 loss or GIoU loss
+        # 2.size(wh) head loss: L1 loss or GIoU loss
+        size = head_outs['size']
         index = inputs['index']
         mask = inputs['index_mask']
         size = paddle.transpose(size, perm=[0, 2, 3, 1])
-        size_n, size_h, size_w, size_c = size.shape
+        size_n, _, _, size_c = size.shape
         size = paddle.reshape(size, shape=[size_n, -1, size_c])
         index = paddle.unsqueeze(index, 2)
         batch_inds = list()
@@ -208,7 +207,8 @@ class CenterNetHead(nn.Layer):
                 else:
                     # inputs['size'] is ltrb, but regress as wh
                     # shape: [bs, max_per_img, 4]
-                    size_target = inputs['size'][:, :, 0:2] + inputs['size'][:, :, 2:]
+                    size_target = inputs['size'][:, :, 0:2] + inputs[
+                        'size'][:, :, 2:]
 
 
             size_target.stop_gradient = True
             size_loss = F.l1_loss(
@@ -232,10 +232,11 @@ class CenterNetHead(nn.Layer):
                 loc_reweight=None)
             size_loss = size_loss / (pos_num + 1e-4)

-        # offset head loss: L1 loss
+        # 3.offset(reg) head loss: L1 loss
+        offset = head_outs['offset']
         offset_target = inputs['offset']
         offset = paddle.transpose(offset, perm=[0, 2, 3, 1])
-        offset_n, offset_h, offset_w, offset_c = offset.shape
+        offset_n, _, _, offset_c = offset.shape
         offset = paddle.reshape(offset, shape=[offset_n, -1, offset_c])
         pos_offset = paddle.gather_nd(offset, index=index)
         offset_mask = paddle.expand_as(mask, pos_offset)
@@ -249,10 +250,11 @@ class CenterNetHead(nn.Layer):
             reduction='sum')
         offset_loss = offset_loss / (pos_num + 1e-4)

-        # iou head loss: GIoU loss
-        if iou is not None:
+        # 4.iou head loss: GIoU loss (optional)
+        if self.add_iou and 'iou' in self.loss_weight:
+            iou = head_outs['iou']
             iou = paddle.transpose(iou, perm=[0, 2, 3, 1])
-            iou_n, iou_h, iou_w, iou_c = iou.shape
+            iou_n, _, _, iou_c = iou.shape
             iou = paddle.reshape(iou, shape=[iou_n, -1, iou_c])
             pos_iou = paddle.gather_nd(iou, index=index)
             iou_mask = paddle.expand_as(mask, pos_iou)
@@ -284,8 +286,8 @@ class CenterNetHead(nn.Layer):
         det_loss = weights['heatmap'] * heatmap_loss + weights[
             'size'] * size_loss + weights['offset'] * offset_loss

-        if iou is not None:
+        if self.add_iou and 'iou' in self.loss_weight:
             losses.update({'iou_loss': iou_loss})
-            det_loss = det_loss + weights['iou'] * iou_loss
+            det_loss += weights['iou'] * iou_loss
         losses.update({'det_loss': det_loss})
         return losses
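With this refactor, the per-loss weights collapse into a single loss_weight dict and the IoU branch must be enabled explicitly via add_iou. A hypothetical construction under the new interface, assuming paddlers (and paddle) are installed; the in_channels value is illustrative and would normally come from the neck's output shape:

from paddlers.models.ppdet.modeling.heads.centernet_head import CenterNetHead

head = CenterNetHead(
    in_channels=64,                # illustrative; taken from the neck output in practice
    num_classes=80,
    head_planes=256,
    prior_bias=-4.6,               # CenterTrack-style heatmap prior (default -2.19)
    regress_ltrb=True,
    size_loss='L1',                # or 'giou'
    loss_weight={'heatmap': 1.0, 'size': 0.1, 'offset': 1.0, 'iou': 1.0},
    add_iou=True)                  # builds the optional IoU branch

Note that the IoU loss is only computed when add_iou is True and an 'iou' key is present in loss_weight, mirroring the checks in forward() and get_loss() above.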

Some files were not shown because the number of changed files is too large