!10818 Modify CenterNet scripts for CPU adaptation and user friendliness
From: @shibeiji Reviewed-by: @linqingke,@c_34,@linqingke Signed-off-by: @wuxuejian
@@ -119,19 +119,27 @@ After installing MindSpore via the official website, you can start training and
 Note: 1. The first run of training will generate the mindrecord files, which will take a long time.
+2. MINDRECORD_DATASET_PATH is the mindrecord dataset directory.
+3. LOAD_CHECKPOINT_PATH is the pretrained checkpoint file path; if there is none, just set "".
+4. RUN_MODE supports validation and testing; set it to "val" or "test".

 ```shell
 # create dataset in mindrecord format
 bash scripts/convert_dataset_to_mindrecord.sh

-# standalone training
-bash scripts/run_standalone_train_ascend.sh [DEVICE_ID] [EPOCH_SIZE]
+# standalone training on Ascend
+bash scripts/run_standalone_train_ascend.sh [DEVICE_ID] [MINDRECORD_DATASET_PATH] [LOAD_CHECKPOINT_PATH]
+
+# standalone training on CPU
+bash scripts/run_standalone_train_cpu.sh [MINDRECORD_DATASET_PATH] [LOAD_CHECKPOINT_PATH]

-# distributed training
-bash scripts/run_distributed_train_ascend.sh [MINDRECORD_DATASET_PATH] [RANK_TABLE_FILE]
+# distributed training on Ascend
+bash scripts/run_distributed_train_ascend.sh [MINDRECORD_DATASET_PATH] [LOAD_CHECKPOINT_PATH] [RANK_TABLE_FILE]

-# eval
-bash scripts/run_standalone_eval_ascend.sh [DEVICE_ID]
+# eval on Ascend
+bash scripts/run_standalone_eval_ascend.sh [DEVICE_ID] [RUN_MODE] [DATA_DIR] [LOAD_CHECKPOINT_PATH]
+
+# eval on CPU
+bash scripts/run_standalone_eval_cpu.sh [RUN_MODE] [DATA_DIR] [LOAD_CHECKPOINT_PATH]
 ```

 # [Script Description](#contents)
@@ -153,9 +161,11 @@ bash scripts/run_standalone_eval_ascend.sh [DEVICE_ID]
 │   │   ├──get_distribute_pretrain_cmd.py  // script for distributed pretraining
 │   │   ├──README.md
 │   ├──convert_dataset_to_mindrecord.sh    // shell script for converting coco type dataset to mindrecord
-│   ├──run_standalone_train_ascend.sh      // shell script for standalone pretrain on ascend
-│   ├──run_distributed_train_ascend.sh     // shell script for distributed pretrain on ascend
+│   ├──run_standalone_train_ascend.sh      // shell script for standalone training on ascend
+│   ├──run_distributed_train_ascend.sh     // shell script for distributed training on ascend
 │   ├──run_standalone_eval_ascend.sh       // shell script for standalone evaluation on ascend
+│   ├──run_standalone_train_cpu.sh         // shell script for standalone training on cpu
+│   ├──run_standalone_eval_cpu.sh          // shell script for standalone evaluation on cpu
 └── src
     ├──__init__.py
     ├──centernet_pose.py                   // centernet networks, training entry
@@ -259,7 +269,6 @@ config for training.

 ```text
 config for evaluation.
 flip_test           whether to use flip test: True | False, default is False
 soft_nms            nms after decode: True | False, default is True
 keep_res            keep original or fix resolution: True | False, default is False
 multi_scales        use multi-scales of image: List, default is [1.0]
@@ -350,12 +359,12 @@ bash scripts/convert_dataset_to_mindrecord.sh

 The command above will run in the background; after conversion, the mindrecord files will be located in the path you specified.

-### Training
+### Standalone Training

 #### Running on Ascend

 ```bash
-bash scripts/run_standalone_pretrain_ascend.sh 0 1
+bash scripts/run_standalone_train_ascend.sh device_id /path/mindrecord_dataset /path/load_ckpt
 ```

 The command above will run in the background; you can view the training logs in training_log.txt. After training finishes, you will get some checkpoint files under the script folder by default. The loss values will be displayed as follows:
@@ -368,12 +377,31 @@ epoch: 349.0, current epoch percent: 1.00, step: 87500, outputs are (Tensor(shap
 ...
 ```

+#### Running on CPU
+
+```bash
+bash scripts/run_standalone_train_cpu.sh /path/mindrecord_dataset /path/load_ckpt
+```
+
+The command above will run in the background; you can view the training logs in training_log.txt. After training finishes, you will get some checkpoint files under the script folder by default. The loss values will be displayed as follows (resumed from a pretrained checkpoint, with batch_size set to 8):
+
+```text
+# grep "epoch" training_log.txt
+...
+epoch: 0.0, current epoch percent: 0.00, step: 1, time per step: 66.693 s, outputs are 3.645
+epoch: 0.0, current epoch percent: 0.00, step: 2, time per step: 46.594 s, outputs are 4.862
+epoch: 0.0, current epoch percent: 0.00, step: 3, time per step: 44.718 s, outputs are 3.927
+epoch: 0.0, current epoch percent: 0.00, step: 4, time per step: 45.113 s, outputs are 3.910
+epoch: 0.0, current epoch percent: 0.00, step: 5, time per step: 45.213 s, outputs are 3.749
+...
+```
+
 ### Distributed Training

 #### Running on Ascend

 ```bash
-bash scripts/run_distributed_pretrain_ascend.sh /path/coco2017 /path/mindrecord /path/hccl.json
+bash scripts/run_distributed_train_ascend.sh /path/mindrecord_dataset /path/load_ckpt /path/hccl.json
 ```

 The command above will run in the background; you can view the training logs in LOG*/training_log.txt and LOG*/ms_log/. After training finishes, you will get some checkpoint files under the LOG*/ckpt_0 folder by default. The loss value will be displayed as follows:
@@ -394,7 +422,11 @@ epoch: 0.0, current epoch percent: 0.002, step: 200, outputs are (Tensor(shape=[

 ```bash
 # Evaluation based on the validation dataset will be done automatically, while for the test or test-dev dataset, the accuracy should be uploaded to the CodaLab official website (https://competitions.codalab.org).
-bash scripts/run_standalone_eval_ascend.sh [DEVICE_ID]
+# On Ascend
+bash scripts/run_standalone_eval_ascend.sh device_id val(or test) /path/coco_dataset /path/load_ckpt
+
+# On CPU
+bash scripts/run_standalone_eval_cpu.sh val(or test) /path/coco_dataset /path/load_ckpt
 ```

 You can see the MAP results as below:
@@ -439,7 +471,7 @@ python export.py [DEVICE_ID]

 ## [Performance](#contents)

-### Training Performance
+### Training Performance On Ascend

 CenterNet on 11.8K images (the annotation and data format must be the same as COCO)
@@ -460,7 +492,7 @@ CenterNet on 11.8K images(The annotation and data format must be the same as coc
 | Checkpoint | 242M (.ckpt file) |
 | Scripts | <https://gitee.com/mindspore/mindspore/tree/master/model_zoo/research/cv/centernet> |

-### Inference Performance
+### Inference Performance On Ascend

 CenterNet on validation (5K images) and test-dev (40K images)
@@ -36,6 +36,8 @@ from src.config import dataset_config, net_config, eval_config
 _current_dir = os.path.dirname(os.path.realpath(__file__))

 parser = argparse.ArgumentParser(description='CenterNet evaluation')
+parser.add_argument('--device_target', type=str, default='Ascend', choices=['Ascend', 'CPU'],
+                    help='device where the code will be implemented. (Default: Ascend)')
 parser.add_argument("--device_id", type=int, default=0, help="Device id, default is 0.")
 parser.add_argument("--load_checkpoint_path", type=str, default="", help="Load checkpoint file path")
 parser.add_argument("--data_dir", type=str, default="", help="Dataset directory, "
@@ -52,15 +54,20 @@ def predict():
     '''
     Predict function
     '''
-    context.set_context(mode=context.GRAPH_MODE, device_target="Ascend", device_id=args_opt.device_id)
+    context.set_context(mode=context.GRAPH_MODE, device_target=args_opt.device_target)
+    if args_opt.device_target == "Ascend":
+        context.set_context(device_id=args_opt.device_id)
+        enable_nms_fp16 = True
+    else:
+        enable_nms_fp16 = False

     logger.info("Begin creating {} dataset".format(args_opt.run_mode))
     coco = COCOHP(dataset_config, run_mode=args_opt.run_mode, net_opt=net_config,
                   enable_visual_image=(args_opt.visual_image == "true"), save_path=args_opt.save_result_dir,)
-    coco.init(args_opt.data_dir, keep_res=eval_config.keep_res, flip_test=eval_config.flip_test)
+    coco.init(args_opt.data_dir, keep_res=eval_config.keep_res)
     dataset = coco.create_eval_dataset()

-    net_for_eval = CenterNetMultiPoseEval(net_config, eval_config.flip_test, eval_config.K)
+    net_for_eval = CenterNetMultiPoseEval(net_config, eval_config.K, enable_nms_fp16)
     net_for_eval.set_train(False)

     param_dict = load_checkpoint(args_opt.load_checkpoint_path)
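A condensed sketch of the device dispatch introduced in predict() above: the context is configured from --device_target, and float16 NMS is enabled only on Ascend, since CPU lacks float16 MaxPool support. The helper function below is illustrative, not part of eval.py:

```python
# Hedged sketch of the device-aware setup used in predict(); the function
# boundary is illustrative, the argument names follow eval.py's parser.
from mindspore import context

def setup_device(device_target, device_id):
    """Configure the context and return the enable_nms_fp16 flag."""
    context.set_context(mode=context.GRAPH_MODE, device_target=device_target)
    if device_target == "Ascend":
        context.set_context(device_id=device_id)
        return True   # float16 max_pool is a safe speed-up on Ascend
    return False      # stay in float32 on CPU
```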
@@ -103,9 +110,7 @@ def predict():
         print("Image {}/{} id: {} cost time {} ms".format(index, total_nums, image_id, (end - start) * 1000.))

         # post-process
-        soft_nms = eval_config.soft_nms or len(eval_config.multi_scales) > 0
-        detections = merge_outputs(detections, soft_nms)
-
+        detections = merge_outputs(detections, eval_config.soft_nms)
         # get prediction result
         pred_json = convert_eval_format(detections, image_id)
         gt_image_info = coco.coco.loadImgs([image_id])
@@ -31,7 +31,7 @@ args = parser.parse_args()
 context.set_context(mode=context.GRAPH_MODE, device_target="Ascend", device_id=args.device_id)

 if __name__ == '__main__':
-    net = CenterNetMultiPoseEval(net_config, eval_config.flip_test, eval_config.K)
+    net = CenterNetMultiPoseEval(net_config, eval_config.K)
     net.set_train(False)

     param_dict = load_checkpoint(export_config.ckpt_file)
@@ -39,8 +39,7 @@ def parse_args():
     parser.add_argument("--hyper_parameter_config_dir", type=str, default="",
                         help="Hyper Parameter config path, it is better to use absolute path")
     parser.add_argument("--mindrecord_dir", type=str, default="", help="Mindrecord dataset directory")
     parser.add_argument("--mindrecord_prefix", type=str, default="coco_hp.train.mind",
                         help="Prefix of MindRecord dataset filename.")
+    parser.add_argument("--load_checkpoint_path", type=str, default="", help="Load checkpoint file path")
     parser.add_argument("--hccl_config_dir", type=str, default="",
                         help="Hccl config path, it is better to use absolute path")
     parser.add_argument("--cmd_file", type=str, default="distributed_cmd.sh",
@@ -72,7 +71,7 @@ def distribute_train():

    run_script = args.run_script_dir
    mindrecord_dir = args.mindrecord_dir
    mindrecord_prefix = args.mindrecord_prefix
+   load_checkpoint_path = args.load_checkpoint_path
    cf = configparser.ConfigParser()
    cf.read(args.hyper_parameter_config_dir)
    cfg = dict(cf.items("config"))
@@ -151,7 +150,7 @@ def distribute_train():
                             " 'device_num' or 'mindrecord_dir'! ")
        run_cmd += opt
        run_cmd += " --mindrecord_dir=" + mindrecord_dir
        run_cmd += " --mindrecord_prefix=" + mindrecord_prefix
+       run_cmd += " --load_checkpoint_path=" + load_checkpoint_path
        run_cmd += ' --device_id=' + str(device_id) + ' --device_num=' \
                   + str(rank_size) + ' >./training_log.txt 2>&1 &'
@@ -5,7 +5,6 @@ enable_save_ckpt=true
 do_shuffle=true
 enable_data_sink=true
 data_sink_steps=50
-load_checkpoint_path=""
 save_checkpoint_path=./
 save_checkpoint_steps=3000
 save_checkpoint_num=1
@@ -14,21 +14,26 @@
 # limitations under the License.
 # ============================================================================

-echo "=============================================================================================================="
-echo "Please run the scipt as: "
-echo "bash run_distributed_train_ascend.sh DATA_DIR MINDRECORD_DIR RANK_TABLE_FILE"
-echo "for example: bash run_distributed_train_ascend.sh /path/dataset /path/mindrecord /path/hccl.json"
-echo "It is better to use absolute path."
+echo "================================================================================================================"
+echo "Please run the script as: "
+echo "bash run_distributed_train_ascend.sh MINDRECORD_DIR LOAD_CHECKPOINT_PATH RANK_TABLE_FILE"
+echo "for example: bash run_distributed_train_ascend.sh /path/mindrecord_dataset /path/load_ckpt /path/hccl.json"
+echo "if no ckpt, just run: bash run_distributed_train_ascend.sh /path/mindrecord_dataset \"\" /path/hccl.json"
+echo "It is better to use the absolute path."
 echo "For hyper parameter, please note that you should customize the scripts:
       '{CUR_DIR}/scripts/ascend_distributed_launcher/hyper_parameter_config.ini' "
-echo "=============================================================================================================="
+echo "================================================================================================================"
 CUR_DIR=`pwd`
+MINDRECORD_DIR=$1
+LOAD_CHECKPOINT_PATH=$2
+HCCL_RANK_FILE=$3

 python ${CUR_DIR}/scripts/ascend_distributed_launcher/get_distribute_train_cmd.py \
     --run_script_dir=${CUR_DIR}/train.py \
     --hyper_parameter_config_dir=${CUR_DIR}/scripts/ascend_distributed_launcher/hyper_parameter_config.ini \
-    --mindrecord_dir=$1 \
-    --hccl_config_dir=$2 \
+    --mindrecord_dir=$MINDRECORD_DIR \
+    --load_checkpoint_path=$LOAD_CHECKPOINT_PATH \
+    --hccl_config_dir=$HCCL_RANK_FILE \
     --hccl_time_out=1200 \
     --cmd_file=distributed_cmd.sh
@@ -16,11 +16,14 @@

 echo "=============================================================================================================="
 echo "Please run the script as: "
-echo "bash run_standalone_eval_ascend.sh DEVICE_ID"
-echo "for example: bash run_standalone_eval_ascend.sh 0"
+echo "bash run_standalone_eval_ascend.sh DEVICE_ID RUN_MODE DATA_DIR LOAD_CHECKPOINT_PATH"
+echo "for example of validation: bash run_standalone_eval_ascend.sh 0 val /path/coco_dataset /path/load_ckpt"
+echo "for example of test: bash run_standalone_eval_ascend.sh 0 test /path/coco_dataset /path/load_ckpt"
 echo "=============================================================================================================="

 DEVICE_ID=$1
+RUN_MODE=$2
+DATA_DIR=$3
+LOAD_CHECKPOINT_PATH=$4
 mkdir -p ms_log
 PROJECT_DIR=$(cd "$(dirname "$0")" || exit; pwd)
 CUR_DIR=`pwd`

@@ -42,10 +45,11 @@ else
 fi

 python ${PROJECT_DIR}/../eval.py  \
+    --device_target=Ascend \
     --device_id=$DEVICE_ID \
-    --load_checkpoint_path="" \
-    --data_dir="" \
+    --load_checkpoint_path=$LOAD_CHECKPOINT_PATH \
+    --data_dir=$DATA_DIR \
+    --run_mode=$RUN_MODE \
     --visual_image=true \
     --enable_eval=true \
-    --save_result_dir="" \
-    --run_mode=val > eval_log.txt 2>&1 &
+    --save_result_dir=./ > eval_log.txt 2>&1 &
@@ -0,0 +1,53 @@
#!/bin/bash
# Copyright 2020 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================

echo "=============================================================================================================="
echo "Please run the script as: "
echo "bash run_standalone_eval_cpu.sh RUN_MODE DATA_DIR LOAD_CHECKPOINT_PATH"
echo "for example of validation: bash run_standalone_eval_cpu.sh val /path/coco_dataset /path/load_ckpt"
echo "for example of test: bash run_standalone_eval_cpu.sh test /path/coco_dataset /path/load_ckpt"
echo "=============================================================================================================="
RUN_MODE=$1
DATA_DIR=$2
LOAD_CHECKPOINT_PATH=$3
mkdir -p ms_log
PROJECT_DIR=$(cd "$(dirname "$0")" || exit; pwd)
CUR_DIR=`pwd`
export GLOG_log_dir=${CUR_DIR}/ms_log
export GLOG_logtostderr=0

# install nms module from third party
if python -c "import nms" > /dev/null 2>&1
then
    echo "NMS module already exists, no need to reinstall."
else
    echo "NMS module was not found, installing it now..."
    git clone https://github.com/xingyizhou/CenterNet.git
    cd CenterNet/src/lib/external/
    make
    python setup.py install
    cd -
    rm -rf CenterNet
fi

python ${PROJECT_DIR}/../eval.py  \
    --device_target=CPU \
    --load_checkpoint_path=$LOAD_CHECKPOINT_PATH \
    --data_dir=$DATA_DIR \
    --run_mode=$RUN_MODE \
    --visual_image=true \
    --enable_eval=true \
    --save_result_dir=./ > eval_log.txt 2>&1 &
@@ -16,12 +16,14 @@

 echo "=============================================================================================================="
 echo "Please run the script as: "
-echo "bash run_standalone_pretrain_ascend.sh DEVICE_ID EPOCH_SIZE"
-echo "for example: bash run_standalone_pretrain_ascend.sh 0 350"
+echo "bash run_standalone_train_ascend.sh DEVICE_ID MINDRECORD_DIR LOAD_CHECKPOINT_PATH"
+echo "for example: bash run_standalone_train_ascend.sh 0 /path/mindrecord_dataset /path/load_ckpt"
+echo "if no ckpt, just run: bash run_standalone_train_ascend.sh 0 /path/mindrecord_dataset \"\" "
 echo "=============================================================================================================="

 DEVICE_ID=$1
-EPOCH_SIZE=$2
+MINDRECORD_DIR=$2
+LOAD_CHECKPOINT_PATH=$3

 mkdir -p ms_log
 PROJECT_DIR=$(cd "$(dirname "$0")" || exit; pwd)

@@ -33,16 +35,16 @@ python ${PROJECT_DIR}/../train.py \
     --distribute=false \
     --need_profiler=false \
     --profiler_path=./profiler \
-    --epoch_size=$EPOCH_SIZE \
     --device_id=$DEVICE_ID \
     --enable_save_ckpt=true \
     --do_shuffle=true \
     --enable_data_sink=true \
     --data_sink_steps=50 \
-    --load_checkpoint_path="" \
+    --epoch_size=350 \
+    --load_checkpoint_path=$LOAD_CHECKPOINT_PATH \
     --save_checkpoint_steps=10000 \
     --save_checkpoint_num=1 \
-    --mindrecord_dir="" \
+    --mindrecord_dir=$MINDRECORD_DIR \
     --mindrecord_prefix="coco_hp.train.mind" \
     --visual_image=false \
     --save_result_dir="" > training_log.txt 2>&1 &
@@ -0,0 +1,44 @@
#!/bin/bash
# Copyright 2020 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================

echo "=============================================================================================================="
echo "Please run the script as: "
echo "bash run_standalone_train_cpu.sh MINDRECORD_DIR LOAD_CHECKPOINT_PATH"
echo "for example: bash run_standalone_train_cpu.sh /path/mindrecord_dataset /path/load_ckpt"
echo "if no ckpt, just run: bash run_standalone_train_cpu.sh /path/mindrecord_dataset \"\" "
echo "=============================================================================================================="

MINDRECORD_DIR=$1
LOAD_CHECKPOINT_PATH=$2

mkdir -p ms_log
PROJECT_DIR=$(cd "$(dirname "$0")" || exit; pwd)
CUR_DIR=`pwd`
export GLOG_log_dir=${CUR_DIR}/ms_log
export GLOG_logtostderr=0

python ${PROJECT_DIR}/../train.py  \
    --device_target=CPU \
    --enable_save_ckpt=true \
    --do_shuffle=true \
    --epoch_size=1 \
    --load_checkpoint_path=$LOAD_CHECKPOINT_PATH \
    --save_checkpoint_steps=1000 \
    --save_checkpoint_num=1 \
    --mindrecord_dir=$MINDRECORD_DIR \
    --mindrecord_prefix="coco_hp.train.mind" \
    --visual_image=false \
    --save_result_dir="" > training_log.txt 2>&1 &
@@ -15,7 +15,7 @@
 """CenterNet Init."""

 from .centernet_pose import GatherMultiPoseFeatureCell, CenterNetMultiPoseLossCell, \
-    CenterNetWithLossScaleCell, CenterNetMultiPoseEval
+    CenterNetWithLossScaleCell, CenterNetMultiPoseEval, CenterNetWithoutLossScaleCell
 from .dataset import COCOHP
 from .visual import visual_allimages, visual_image
 from .decode import MultiPoseDecode
@@ -23,6 +23,7 @@ from .post_process import convert_eval_format, to_float, resize_detection, post_

 __all__ = [
     "GatherMultiPoseFeatureCell", "CenterNetMultiPoseLossCell", "CenterNetWithLossScaleCell", \
-    "CenterNetMultiPoseEval", "COCOHP", "visual_allimages", "visual_image", "MultiPoseDecode", \
-    "convert_eval_format", "to_float", "resize_detection", "post_process", "merge_outputs"
+    "CenterNetMultiPoseEval", "CenterNetWithoutLossScaleCell", "COCOHP", "visual_allimages", \
+    "visual_image", "MultiPoseDecode", "convert_eval_format", "to_float", "resize_detection", \
+    "post_process", "merge_outputs"
 ]
@@ -197,6 +197,46 @@ class CenterNetMultiPoseLossCell(nn.Cell):
         return total_loss


+class CenterNetWithoutLossScaleCell(nn.Cell):
+    """
+    Encapsulation class of centernet training.
+
+    Appends an optimizer to the training network; afterwards the construct
+    function can be called to create the backward graph.
+
+    Args:
+        network (Cell): The training network. Note that the loss function should have been added.
+        optimizer (Optimizer): Optimizer for updating the weights.
+
+    Returns:
+        Tensor, the loss of the network.
+    """
+    def __init__(self, network, optimizer):
+        super(CenterNetWithoutLossScaleCell, self).__init__(auto_prefix=False)
+        self.image = ImagePreProcess()
+        self.network = network
+        self.network.set_grad()
+        self.weights = optimizer.parameters
+        self.optimizer = optimizer
+        self.grad = ops.GradOperation(get_by_list=True, sens_param=False)
+
+    @ops.add_flags(has_effect=True)
+    def construct(self, image, hm, reg_mask, ind, wh, kps, kps_mask, reg,
+                  hm_hp, hp_offset, hp_ind, hp_mask):
+        """Defines the computation performed."""
+        image = self.image(image)
+        weights = self.weights
+        loss = self.network(image, hm, reg_mask, ind, wh, kps, kps_mask, reg,
+                            hm_hp, hp_offset, hp_ind, hp_mask)
+
+        grads = self.grad(self.network, weights)(image, hm, reg_mask, ind, wh, kps,
+                                                 kps_mask, reg, hm_hp, hp_offset,
+                                                 hp_ind, hp_mask)
+        succ = self.optimizer(grads)
+        ret = loss
+        return ops.depend(ret, succ)
+
+
 class CenterNetWithLossScaleCell(nn.Cell):
     """
     Encapsulation class of centernet training.
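CenterNetWithoutLossScaleCell exists because CPU training runs in plain float32, where the overflow detection and dynamic loss scaling of CenterNetWithLossScaleCell are unnecessary. A minimal wiring sketch, assuming the classes from this file; the optimizer choice and learning rate are illustrative, not the repository's tuned settings:

```python
# Minimal sketch, assuming CenterNetMultiPoseLossCell and
# CenterNetWithoutLossScaleCell from this file; Adam and its learning rate
# are illustrative stand-ins for the repository's _get_optimizer().
import mindspore.nn as nn
from mindspore import Model

net_with_loss = CenterNetMultiPoseLossCell(net_config)              # network + loss
optimizer = nn.Adam(net_with_loss.trainable_params(), learning_rate=1e-4)
net_with_grads = CenterNetWithoutLossScaleCell(net_with_loss, optimizer=optimizer)
model = Model(net_with_grads)
# model.train(...) then proceeds without overflow checks or a scaling sens value.
```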
@@ -279,17 +319,16 @@ class CenterNetMultiPoseEval(nn.Cell):

     Args:
         net_config: The config info of CenterNet network.
-        flip_test(bool): Flip data augmentation or not. Default: False.
         K(number): Max number of output objects. Default: 100.
+        enable_nms_fp16(bool): Use float16 data for max_pool, adaptation for CPU. Default: True.

     Returns:
         Tensor, detection of images(bboxes, score, keypoints and category id of each objects)
     """
-    def __init__(self, net_config, flip_test=False, K=100):
+    def __init__(self, net_config, K=100, enable_nms_fp16=True):
         super(CenterNetMultiPoseEval, self).__init__()
         self.network = GatherMultiPoseFeatureCell(net_config)
-        self.decode = MultiPoseDecode(net_config, flip_test, K)
-        self.flip_test = flip_test
+        self.decode = MultiPoseDecode(net_config, K, enable_nms_fp16)
         self.shape = ops.Shape()
         self.reshape = ops.Reshape()
@@ -104,8 +104,7 @@ train_config = edict({


 eval_config = edict({
     'flip_test': False,
-    'soft_nms': False,
+    'soft_nms': True,
     'keep_res': True,
     'multi_scales': [1.0],
     'pad': 31,
@@ -17,7 +17,6 @@ Data operations, will be used in train.py
 """

 import os
-import copy
 import math
 import argparse
 import cv2
@@ -66,7 +65,7 @@ class COCOHP(ds.Dataset):
         if not os.path.exists(self.save_path):
             os.makedirs(self.save_path)

-    def init(self, data_dir, keep_res=False, flip_test=False):
+    def init(self, data_dir, keep_res=False):
         """initialize additional info"""
         logger.info('Initializing coco 2017 {} data.'.format(self.run_mode))
         if not os.path.isdir(data_dir):
@@ -94,7 +93,6 @@ class COCOHP(ds.Dataset):
         self.images = image_ids
         self.num_samples = len(self.images)
         self.keep_res = keep_res
-        self.flip_test = flip_test
         if self.run_mode != "train":
             self.pad = 31
         logger.info('Loaded {} {} samples'.format(self.run_mode, self.num_samples))
@@ -167,7 +165,7 @@ class COCOHP(ds.Dataset):
             ret = (img, image_id)
         return ret

-    def pre_process_for_test(self, image, img_id, scale, meta=None):
+    def pre_process_for_test(self, image, img_id, scale):
         """image pre-process for evaluation"""
         b, h, w, ch = image.shape
         assert b == 1, "only single image was supported here"
@@ -191,17 +189,8 @@ class COCOHP(ds.Dataset):
                                     flags=cv2.INTER_LINEAR)
         inp_img = (inp_image.astype(np.float32) / 255. - self.data_opt.mean) / self.data_opt.std

-        h, w, ch = inp_img.shape
-        images = copy.deepcopy(inp_img)
-        if self.flip_test:
-            flip_image = inp_img[:, ::-1, :]
-            inp_img = inp_img.reshape((1, h, w, ch))
-            flip_image = flip_image.reshape((1, h, w, ch))
-            # (2, h, w, c)
-            images = np.concatenate((inp_img, flip_image), axis=0)
-        else:
-            images = images.reshape((1, h, w, ch))
-        images = images.transpose(0, 3, 1, 2)
+        eval_image = inp_img.reshape((1,) + inp_img.shape)
+        eval_image = eval_image.transpose(0, 3, 1, 2)

         meta = {'c': c, 's': s,
                 'out_height': inp_height // self.net_opt.down_ratio,
@@ -244,7 +233,7 @@ class COCOHP(ds.Dataset):
             image_name = "gt_" + self.run_mode + "_image_" + str(img_id) + "_scale_" + str(scale) + ".png"
             cv2.imwrite("{}/{}".format(self.save_path, image_name), inp_image)

-        return images, meta
+        return eval_image, meta

     def preprocess_fn(self, img, num_objects, keypoints, bboxes, category_id):
         """image pre-process and augmentation"""
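With the flip_test branch removed, pre_process_for_test now just normalizes the image, adds a batch axis, and reorders to NCHW. A NumPy sketch of that tail end, with an illustrative 512x512 input:

```python
# NumPy sketch of the simplified tail of pre_process_for_test: no flipped copy
# is stacked any more; the normalized image only gains a batch axis and is
# transposed to NCHW. The 512x512 shape is illustrative.
import numpy as np

inp_img = np.random.rand(512, 512, 3).astype(np.float32)  # (H, W, C), normalized
eval_image = inp_img.reshape((1,) + inp_img.shape)         # (1, H, W, C)
eval_image = eval_image.transpose(0, 3, 1, 2)              # (1, C, H, W)
assert eval_image.shape == (1, 3, 512, 512)
```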
@@ -30,25 +30,32 @@ class NMS(nn.Cell):

     Args:
         kernel(int): Maxpooling kernel size. Default: 3.
+        enable_nms_fp16(bool): Use float16 data for max_pool, adaptation for CPU. Default: True.

     Returns:
         Tensor, heatmap after non-maximum suppression.
     """
-    def __init__(self, kernel=3):
+    def __init__(self, kernel=3, enable_nms_fp16=True):
         super(NMS, self).__init__()
         self.pad = (kernel - 1) // 2
         self.cast = ops.Cast()
         self.dtype = ops.DType()
         self.equal = ops.Equal()
         self.max_pool = nn.MaxPool2d(kernel, stride=1, pad_mode="same")
+        self.enable_fp16 = enable_nms_fp16

     def construct(self, heat):
         """Non-maximum suppression"""
         dtype = self.dtype(heat)
-        heat = self.cast(heat, mstype.float16)
-        heat_max = self.max_pool(heat)
-        keep = self.equal(heat, heat_max)
-        keep = self.cast(keep, dtype)
-        heat = self.cast(heat, dtype)
+        if self.enable_fp16:
+            heat = self.cast(heat, mstype.float16)
+            heat_max = self.max_pool(heat)
+            keep = self.equal(heat, heat_max)
+            keep = self.cast(keep, dtype)
+            heat = self.cast(heat, dtype)
+        else:
+            heat_max = self.max_pool(heat)
+            keep = self.equal(heat, heat_max)
         heat = heat * keep
         return heat
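The NMS here is the standard max-pool trick: a heatmap cell survives only if it equals the maximum of its 3x3 neighbourhood, i.e. it is a local peak. The float16 cast is an Ascend speed adaptation that the new flag disables on CPU, where float16 MaxPool is unsupported. A NumPy sketch of the underlying idea, using scipy's maximum_filter as a stand-in for MaxPool2d:

```python
# NumPy illustration of max-pool NMS: only local 3x3 maxima of the heatmap
# survive. scipy.ndimage.maximum_filter stands in for nn.MaxPool2d(kernel=3,
# stride=1, pad_mode="same"); edge handling differs only at the border.
import numpy as np
from scipy.ndimage import maximum_filter

heat = np.random.rand(1, 1, 8, 8).astype(np.float32)        # (b, c, h, w) heatmap
heat_max = maximum_filter(heat, size=(1, 1, 3, 3), mode="nearest")
keep = (heat == heat_max).astype(heat.dtype)                # 1.0 at local peaks
peaks = heat * keep                                         # everything else zeroed
```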
@@ -127,18 +134,24 @@ class GatherFeatureByInd(nn.Cell):
     """
     Gather features by index

-    Args: None
+    Args:
+        enable_cpu_gatherd (bool): Use the CPU operator GatherD to gather features or not, adaptation for CPU. Default: True.

     Returns:
         Tensor
     """
-    def __init__(self):
+    def __init__(self, enable_cpu_gatherd=True):
         super(GatherFeatureByInd, self).__init__()
         self.tile = ops.Tile()
         self.shape = ops.Shape()
         self.concat = ops.Concat(axis=1)
         self.reshape = ops.Reshape()
-        self.gather_nd = ops.GatherNd()
+        self.enable_cpu_gatherd = enable_cpu_gatherd
+        if self.enable_cpu_gatherd:
+            self.gather_nd = ops.GatherD()
+            self.expand_dims = ops.ExpandDims()
+        else:
+            self.gather_nd = ops.GatherNd()

     def construct(self, feat, ind):
         """gather by index"""
@@ -147,18 +160,24 @@ class GatherFeatureByInd(nn.Cell):
         b, J, K = self.shape(ind)
         feat = self.reshape(feat, (b, J, K, -1))
         _, _, _, N = self.shape(feat)
-        ind = self.reshape(ind, (-1, 1))
-        ind_b = nn.Range(0, b * J, 1)()
-        ind_b = self.reshape(ind_b, (-1, 1))
-        ind_b = self.tile(ind_b, (1, K))
-        ind_b = self.reshape(ind_b, (-1, 1))
-        index = self.concat((ind_b, ind))
-        # (b, N, 2)
-        index = self.reshape(index, (-1, K, 2))
-        # (b, N, c)
-        feat = self.reshape(feat, (-1, K, N))
-        feat = self.gather_nd(feat, index)
-        feat = self.reshape(feat, (b, J, K, -1))
+        if self.enable_cpu_gatherd:
+            # (b, J, K, N)
+            index = self.expand_dims(ind, -1)
+            index = self.tile(index, (1, 1, 1, N))
+            feat = self.gather_nd(feat, 2, index)
+        else:
+            ind = self.reshape(ind, (-1, 1))
+            ind_b = nn.Range(0, b * J, 1)()
+            ind_b = self.reshape(ind_b, (-1, 1))
+            ind_b = self.tile(ind_b, (1, K))
+            ind_b = self.reshape(ind_b, (-1, 1))
+            index = self.concat((ind_b, ind))
+            # (b*J, K, 2)
+            index = self.reshape(index, (-1, K, 2))
+            # (b*J, K)
+            feat = self.reshape(feat, (-1, K, N))
+            feat = self.gather_nd(feat, index)
+            feat = self.reshape(feat, (b, J, K, -1))
         return feat
@@ -285,17 +304,16 @@ class MultiPoseDecode(nn.Cell):

     Args:
         net_config(edict): config info for CenterNet network.
-        flip_test(bool): flip test of not. Default: False.
         K(int): maximum objects number. Default: 100.
+        enable_nms_fp16(bool): Use float16 data for max_pool, adaptation for CPU. Default: True.

     Returns:
         Tensor, multi-objects detections.
     """
-    def __init__(self, net_config, flip_test=False, K=100):
+    def __init__(self, net_config, K=100, enable_nms_fp16=True):
         super(MultiPoseDecode, self).__init__()
         self.K = K
-        self.flip_test = flip_test
-        self.nms = NMS()
+        self.nms = NMS(enable_nms_fp16=enable_nms_fp16)
         self.shape = ops.Shape()
         self.gather_topk = GatherTopK()
         self.gather_topk_channel = GatherTopKChannel()
@@ -336,8 +354,6 @@ class MultiPoseDecode(nn.Cell):
     def construct(self, feature):
         """gather detections"""
         heat = feature[0]
-        if self.flip_test:
-            heat = self.flip_tensor(heat)
         K = self.K
         b, _, _, _ = self.shape(heat)
         heat = self.nms(heat)
@@ -346,8 +362,6 @@ class MultiPoseDecode(nn.Cell):
         xs = self.reshape(xs, (b, K, 1))

         kps = feature[1]
-        if self.flip_test:
-            kps = self.flip_lr_off(kps)
         num_joints = self.shape(kps)[1] / 2
         # (b, K, num_joints*2)
         kps = self.trans_gather_feature(kps, inds)
@@ -365,15 +379,11 @@ class MultiPoseDecode(nn.Cell):
         kps = self.reshape(kps, (b, K, num_joints * 2))

         wh = feature[2]
-        if self.flip_test:
-            wh = self.flip_tensor(wh)
         wh = self.trans_gather_feature(wh, inds)
         ws, hs = self.half(wh)

         if self.reg_offset:
             reg = feature[self.reg_ind]
-            if self.flip_test:
-                reg, _ = self.half_first(reg)
             reg = self.trans_gather_feature(reg, inds)
             reg = self.reshape(reg, (b, K, 2))
             reg_w, reg_h = self.half(reg)
@@ -387,16 +397,12 @@ class MultiPoseDecode(nn.Cell):

         if self.hm_hp:
             hm_hp = feature[self.hm_hp_ind]
-            if self.flip_test:
-                hm_hp = self.flip_lr(hm_hp)
             hm_hp = self.nms(hm_hp)
             # (b, num_joints, K)
             hm_score, hm_inds, hm_ys, hm_xs = self.gather_topk_channel(hm_hp, K=K)

             if self.reg_hp_offset:
                 hp_offset = feature[self.reg_hp_ind]
-                if self.flip_test:
-                    hp_offset, _ = self.half_first(hp_offset)
                 hp_offset = self.trans_gather_feature(hp_offset, self.reshape(hm_inds, (b, -1)))
                 hp_offset = self.reshape(hp_offset, (b, num_joints, K, 2))
                 hp_ws, hp_hs = self.half(hp_offset)
@@ -17,6 +17,7 @@ Functional Cells to be used.
 """

 import math
+import time
 import numpy as np
 import mindspore.nn as nn
 import mindspore.ops as ops
@@ -119,33 +120,46 @@ class GatherFeature(nn.Cell):
     """
     Gather feature at specified position

-    Args: None
+    Args:
+        enable_cpu_gather (bool): Use the CPU operator GatherD to gather features or not, adaptation for CPU. Default: True.

     Returns:
         Tensor, feature at specified position
     """
-    def __init__(self):
+    def __init__(self, enable_cpu_gather=True):
         super(GatherFeature, self).__init__()
         self.tile = ops.Tile()
         self.shape = ops.Shape()
         self.concat = ops.Concat(axis=1)
         self.reshape = ops.Reshape()
-        self.gather_nd = ops.GatherNd()
+        self.enable_cpu_gather = enable_cpu_gather
+        if self.enable_cpu_gather:
+            self.gather_nd = ops.GatherD()
+            self.expand_dims = ops.ExpandDims()
+        else:
+            self.gather_nd = ops.GatherNd()

     def construct(self, feat, ind):
         """gather by specified index"""
-        # (b, N)->(b*N, 1)
-        b, N = self.shape(ind)
-        ind = self.reshape(ind, (-1, 1))
-        ind_b = nn.Range(0, b, 1)()
-        ind_b = self.reshape(ind_b, (-1, 1))
-        ind_b = self.tile(ind_b, (1, N))
-        ind_b = self.reshape(ind_b, (-1, 1))
-        index = self.concat((ind_b, ind))
-        # (b, N, 2)
-        index = self.reshape(index, (b, N, -1))
-        # (b, N, c)
-        feat = self.gather_nd(feat, index)
+        if self.enable_cpu_gather:
+            _, _, c = self.shape(feat)
+            # (b, N, c)
+            index = self.expand_dims(ind, -1)
+            index = self.tile(index, (1, 1, c))
+            feat = self.gather_nd(feat, 1, index)
+        else:
+            # (b, N)->(b*N, 1)
+            b, N = self.shape(ind)
+            ind = self.reshape(ind, (-1, 1))
+            ind_b = nn.Range(0, b, 1)()
+            ind_b = self.reshape(ind_b, (-1, 1))
+            ind_b = self.tile(ind_b, (1, N))
+            ind_b = self.reshape(ind_b, (-1, 1))
+            index = self.concat((ind_b, ind))
+            # (b, N, 2)
+            index = self.reshape(index, (b, N, -1))
+            # (b, N, c)
+            feat = self.gather_nd(feat, index)
         return feat
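The two branches above compute the same gather with different operators: the GatherNd path builds explicit (batch, position) index pairs, while the GatherD path broadcasts the index across the channel axis. A NumPy sketch of the equivalence, with illustrative shapes:

```python
# NumPy sketch: the GatherNd-style and GatherD-style branches above select the
# same rows of a (b, N_src, c) feature map at indices ind of shape (b, N).
import numpy as np

b, n_src, c, n = 2, 6, 4, 3
feat = np.random.rand(b, n_src, c).astype(np.float32)
ind = np.random.randint(0, n_src, size=(b, n))

# GatherNd-style: explicit (batch, position) index pairs
batch_idx = np.repeat(np.arange(b), n).reshape(b, n)
out_nd = feat[batch_idx, ind]                               # (b, n, c)

# GatherD-style: broadcast the index over the channel axis
idx = np.broadcast_to(ind[..., None], (b, n, c))
out_d = np.take_along_axis(feat, idx, axis=1)               # (b, n, c)

assert np.allclose(out_nd, out_d)
```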
@@ -477,11 +491,19 @@ class LossCallBack(Callback):

     Args:
         dataset_size (int): Dataset size. Default: -1.
+        enable_static_time (bool): enable static time cost, adaptation for CPU. Default: False.
     """

-    def __init__(self, dataset_size=-1):
+    def __init__(self, dataset_size=-1, enable_static_time=False):
         super(LossCallBack, self).__init__()
         self._dataset_size = dataset_size
+        self._enable_static_time = enable_static_time
+
+    def step_begin(self, run_context):
+        """
+        Get the beginning time of each step
+        """
+        self._begin_time = time.time()

     def step_end(self, run_context):
         """
@@ -493,11 +515,19 @@ class LossCallBack(Callback):
             if percent == 0:
                 percent = 1
                 epoch_num -= 1
-            print("epoch: {}, current epoch percent: {}, step: {}, outputs are {}"
-                  .format(int(epoch_num), "%.3f" % percent, cb_params.cur_step_num, str(cb_params.net_outputs)))
+            if self._enable_static_time:
+                cur_time = time.time()
+                time_per_step = cur_time - self._begin_time
+                print("epoch: {}, current epoch percent: {}, step: {}, time per step: {} s, outputs are {}"
+                      .format(int(epoch_num), "%.3f" % percent, cb_params.cur_step_num, "%.3f" % time_per_step,
+                              str(cb_params.net_outputs)), flush=True)
+            else:
+                print("epoch: {}, current epoch percent: {}, step: {}, outputs are {}"
+                      .format(int(epoch_num), "%.3f" % percent, cb_params.cur_step_num,
+                              str(cb_params.net_outputs)), flush=True)
         else:
-            print("epoch: {}, step: {}, outputs are {}".format(cb_params.cur_epoch_num, cb_params.cur_step_num,
-                                                               str(cb_params.net_outputs)))
+            print("epoch: {}, step: {}, outputs are {}".format(cb_params.cur_epoch_num, cb_params.cur_step_num,
+                                                               str(cb_params.net_outputs)), flush=True)


 class CenterNetPolynomialDecayLR(LearningRateSchedule):
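The enable_static_time path brackets each step with wall-clock timestamps; this is what produces the "time per step" figures in the README's CPU training log (data sinking is disabled on CPU, so every step returns to Python). A stripped-down, illustrative version of the same pattern as a standalone callback:

```python
# Illustrative standalone version of the per-step timing added to LossCallBack:
# stamp the clock in step_begin, report the delta in step_end.
import time
from mindspore.train.callback import Callback

class StepTimeMonitor(Callback):
    def step_begin(self, run_context):
        self._begin_time = time.time()

    def step_end(self, run_context):
        cb_params = run_context.original_args()
        print("step: {}, time per step: {:.3f} s".format(
            cb_params.cur_step_num, time.time() - self._begin_time), flush=True)
```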
@@ -31,12 +31,15 @@ from mindspore.common import set_seed
 from mindspore.profiler import Profiler
 from src.dataset import COCOHP
 from src import CenterNetMultiPoseLossCell, CenterNetWithLossScaleCell
+from src import CenterNetWithoutLossScaleCell
 from src.utils import LossCallBack, CenterNetPolynomialDecayLR, CenterNetMultiEpochsDecayLR
 from src.config import dataset_config, net_config, train_config

 _current_dir = os.path.dirname(os.path.realpath(__file__))

 parser = argparse.ArgumentParser(description='CenterNet training')
+parser.add_argument('--device_target', type=str, default='Ascend', choices=['Ascend', 'CPU'],
+                    help='device where the code will be implemented. (Default: Ascend)')
 parser.add_argument("--distribute", type=str, default="false", choices=["true", "false"],
                     help="Run distribute, default is false.")
 parser.add_argument("--need_profiler", type=str, default="false", choices=["true", "false"],
@@ -125,26 +128,32 @@ def _get_optimizer(network, dataset_size):

 def train():
     """training CenterNet"""
-    context.set_context(mode=context.GRAPH_MODE, device_target="Ascend", device_id=args_opt.device_id)
-    context.set_context(enable_auto_mixed_precision=False)
+    context.set_context(mode=context.GRAPH_MODE, device_target=args_opt.device_target)
     context.set_context(reserve_class_name_in_scope=False)
     context.set_context(save_graphs=False)

     ckpt_save_dir = args_opt.save_checkpoint_path
-    if args_opt.distribute == "true":
-        D.init()
-        device_num = args_opt.device_num
-        rank = args_opt.device_id % device_num
-        ckpt_save_dir = args_opt.save_checkpoint_path + 'ckpt_' + str(get_rank()) + '/'
-
-        context.reset_auto_parallel_context()
-        context.set_auto_parallel_context(parallel_mode=ParallelMode.DATA_PARALLEL, gradients_mean=True,
-                                          device_num=device_num)
-        _set_parallel_all_reduce_split()
-    else:
-        rank = 0
-        device_num = 1
+    rank = 0
+    device_num = 1
+    num_workers = 8
+    if args_opt.device_target == "Ascend":
+        context.set_context(enable_auto_mixed_precision=False)
+        context.set_context(device_id=args_opt.device_id)
+        if args_opt.distribute == "true":
+            D.init()
+            device_num = args_opt.device_num
+            rank = args_opt.device_id % device_num
+            ckpt_save_dir = args_opt.save_checkpoint_path + 'ckpt_' + str(get_rank()) + '/'
+
+            context.reset_auto_parallel_context()
+            context.set_auto_parallel_context(parallel_mode=ParallelMode.DATA_PARALLEL, gradients_mean=True,
+                                              device_num=device_num)
+            _set_parallel_all_reduce_split()
+    else:
+        args_opt.distribute = "false"
+        args_opt.need_profiler = "false"
+        args_opt.enable_data_sink = "false"

     # Start create dataset!
     # mindrecord files will be generated at args_opt.mindrecord_dir such as centernet.mindrecord0, 1, ... file_num.
     logger.info("Begin creating dataset for CenterNet")
@@ -167,7 +176,8 @@ def train():

     optimizer = _get_optimizer(net_with_loss, dataset_size)

-    callback = [TimeMonitor(args_opt.data_sink_steps), LossCallBack(dataset_size)]
+    enable_static_time = args_opt.device_target == "CPU"
+    callback = [TimeMonitor(args_opt.data_sink_steps), LossCallBack(dataset_size, enable_static_time)]
     if args_opt.enable_save_ckpt == "true" and args_opt.device_id % min(8, device_num) == 0:
         config_ck = CheckpointConfig(save_checkpoint_steps=args_opt.save_checkpoint_steps,
                                      keep_checkpoint_max=args_opt.save_checkpoint_num)
@@ -178,12 +188,13 @@ def train():
     if args_opt.load_checkpoint_path:
         param_dict = load_checkpoint(args_opt.load_checkpoint_path)
         load_param_into_net(net_with_loss, param_dict)

-    net_with_grads = CenterNetWithLossScaleCell(net_with_loss, optimizer=optimizer,
-                                                sens=train_config.loss_scale_value)
+    if args_opt.device_target == "Ascend":
+        net_with_grads = CenterNetWithLossScaleCell(net_with_loss, optimizer=optimizer,
+                                                    sens=train_config.loss_scale_value)
+    else:
+        net_with_grads = CenterNetWithoutLossScaleCell(net_with_loss, optimizer=optimizer)

     model = Model(net_with_grads)

     model.train(new_repeat_count, dataset, callbacks=callback, dataset_sink_mode=(args_opt.enable_data_sink == "true"),
                 sink_size=args_opt.data_sink_steps)