From 17111f72f2802c4945f0645967c45925c5f21d16 Mon Sep 17 00:00:00 2001 From: markuskunej Date: Tue, 6 Jul 2021 16:40:41 +0000 Subject: [PATCH] added gpu support for centerface --- model_zoo/official/cv/centerface/README.md | 473 ++++++++++++------ .../cv/centerface/dependency/evaluate/eval.py | 2 +- .../official/cv/centerface/scripts/test.sh | 61 ++- .../cv/centerface/scripts/test_and_eval.sh | 62 ++- .../cv/centerface/scripts/test_distribute.sh | 70 +-- .../scripts/train_distribute_gpu.sh | 132 +++++ .../scripts/train_standalone_gpu.sh | 146 ++++++ .../official/cv/centerface/src/centerface.py | 57 ++- model_zoo/official/cv/centerface/test.py | 9 +- model_zoo/official/cv/centerface/train.py | 37 +- 10 files changed, 775 insertions(+), 274 deletions(-) create mode 100644 model_zoo/official/cv/centerface/scripts/train_distribute_gpu.sh create mode 100644 model_zoo/official/cv/centerface/scripts/train_standalone_gpu.sh diff --git a/model_zoo/official/cv/centerface/README.md b/model_zoo/official/cv/centerface/README.md index bffb5673b1f..45983af34ef 100644 --- a/model_zoo/official/cv/centerface/README.md +++ b/model_zoo/official/cv/centerface/README.md @@ -84,8 +84,8 @@ other datasets need to use the same format as WiderFace. # [Environment Requirements](#contents) -- Hardware(Ascend) - - Prepare hardware environment with Ascend processor. +- Hardware(Ascend/GPU) + - Prepare hardware environment with Ascend or GPU processor. 
- Framework - [MindSpore](https://www.mindspore.cn/install/en) - For more information, please check the resources below: @@ -105,7 +105,7 @@ step1: prepare pretrained model: train a mobilenet_v2 model by mindspore or use # The key/cell/module name must as follow, otherwise you need to modify "name_map" function: # --mindspore: as the same as mobilenet_v2_key.ckpt # --pytorch: same as official pytorch model(e.g., official mobilenet_v2-b0353104.pth) -python convert_weight_centerface.py --ckpt_fn=./mobilenet_v2_key.ckpt --pt_fn=./mobilenet_v2-b0353104.pth --out_ckpt_fn=./mobilenet_v2.ckpt +python convert_weight_mobilenetv2.py --ckpt_fn=./mobilenet_v2_key.ckpt --pt_fn=./mobilenet_v2-b0353104.pth --out_ckpt_fn=./mobilenet_v2.ckpt ``` step2: prepare dataset @@ -116,7 +116,7 @@ step2: prepare dataset  3)download training annotations from [annotations](https://pan.baidu.com/s/1j_2wggZ3bvCuOAfZvjWqTg). password: **f9hh** -step3: prepare user rank_table +step3 (ASCEND ONLY): prepare user rank_table ```python # user can use your own rank table file @@ -137,13 +137,25 @@ ls ./dataset/centerface/annotations/train.json # annot_path ls ./dataset/centerface/images/train/images # img_dir ``` -```python -# enter script dir, train CenterFace -sh train_distribute.sh -# after training -mkdir ./model -cp device0/outputs/*/*.ckpt ./model # cp model to [MODEL_PATH] -``` +- Train on Ascend + + ```python + # enter script dir, train CenterFace + sh train_distribute.sh + # after training + mkdir ./model + cp device0/output/*/*.ckpt ./model # cp model to [MODEL_PATH] + ``` + +- Train on GPU + + ```python + # enter script dir, train CenterFace + sh train_distribute_gpu.sh + # after training + mkdir ./model + cp train_distribute_gpu/output/*/*.ckpt ./model # cp model to [MODEL_PATH] + ``` step5: test @@ -163,10 +175,19 @@ ls ./dataset/images/val/images/ # data path ls ./dataset/centerface/ground_truth/val.mat # annot_path ``` -```python -# test CenterFace -sh test_distribute.sh -``` +- Test on 
Ascend + + ```python + # test CenterFace + sh test_distribute.sh + ``` + +- Test on GPU + + ```bash + # test CenterFace + bash test_distribute.sh GPU + ``` step6: eval @@ -304,7 +325,9 @@ sh eval_all.sh [ground_truth_path] │ ├──test_distribute.sh // testing a range of models │ ├──test_and_eval.sh // test then evaluate a single model │ ├──train_standalone.sh // train in ascend with single npu + │ ├──train_standalone_gpu.sh // train on GPU with single gpu │ ├──train_distribute.sh // train in ascend with multi npu + │ ├──train_distribute_gpu.sh // train on GPU with multi gpu ├── src │ ├──__init__.py │ ├──centerface.py // centerface networks, training entry @@ -320,7 +343,7 @@ sh eval_all.sh [ground_truth_path] | ├──config.py // Processing configuration parameters | ├──device_adapter.py // Get cloud ID | ├──local_adapter.py // Get local ID - | └ ──moxing_adapter.py // Parameter processing + | ├──moxing_adapter.py // Parameter processing └── dependency // third party codes: MIT License ├──extd // training dependency: data augmentation │ ├──utils @@ -371,6 +394,7 @@ sh eval_all.sh [ground_truth_path] --data_dir: data dir --annot_path: annotations path --img_dir: img dir in data_dir + --device_target: device where the code will be implemented. Options are "Ascend" or "GPU". (default: Ascend) ``` 2. centerface unique configs: in config.py; not recommend user to change @@ -395,6 +419,7 @@ sh eval_all.sh [ground_truth_path] # detail can be found in "test.py" # if ckpt is specified not need below 4 parameter --device_num: training device number + --device_target: device where the code will be implemented. Options are "Ascend" or "GPU". 
(default: Ascend) --steps_per_epoch: steps for each epoch --start: start loop number, used to calculate first epoch number --end: end loop number, used to calculate last epoch number @@ -414,82 +439,152 @@ Major parameters eval.py as follows: ### Training -'task_set' is important for multi-npu train to get higher speed ---task_set: 0, not task_set; 1 task_set; ---task_set_core: task_set core number, most time = cpu number/nproc_per_node +- Running on Ascend -step1: user need train a mobilenet_v2 model by mindspore or use the script below: + 'task_set' is important for multi-npu train to get higher speed + --task_set: 0, not task_set; 1 task_set; + --task_set_core: task_set core number, most time = cpu number/nproc_per_node -```python -python torch_to_ms_centerface.py --ckpt_fn=./mobilenet_v2_key.ckpt --pt_fn=./mobilenet_v2-b0353104.pth --out_ckpt_fn=./mobilenet_v2.ckpt -``` + step1: user need train a mobilenet_v2 model by mindspore or use the script below: -step2: prepare user rank_table + ```python + python torch_to_ms_mobilenetv2.py --ckpt_fn=./mobilenet_v2_key.ckpt --pt_fn=./mobilenet_v2-b0353104.pth --out_ckpt_fn=./mobilenet_v2.ckpt + ``` -```python -# user can use your own rank table file -# or use the [hccl_tools](https://gitee.com/mindspore/mindspore/tree/master/model_zoo/utils/hccl_tools) to generate rank table file -# e.g., python hccl_tools.py --device_num "[0,8)" -python hccl_tools.py --device_num "[0,8)" -``` + step2: prepare user rank_table -step3: train + ```python + # user can use your own rank table file + # or use the [hccl_tools](https://gitee.com/mindspore/mindspore/tree/master/model_zoo/utils/hccl_tools) to generate rank table file + # e.g., python hccl_tools.py --device_num "[0,8)" + python hccl_tools.py --device_num "[0,8)" + ``` -- Single device + step3: train -```python -# enter script dir, train CenterFace -cd scripts -# you need to change the parameter in train_standalone.sh -# or use symbolic link as quick start -# or use the command as 
follow: -# USE_DEVICE_ID: your device -# PRETRAINED_BACKBONE: your pretrained model path -# DATASET: dataset path -# ANNOTATIONS: annotation path -# images: img_dir in dataset path -sh train_standalone.sh [USE_DEVICE_ID] [PRETRAINED_BACKBONE] [DATASET] [ANNOTATIONS] [IMAGES] -# after training -cp device0/outputs/*/*.ckpt [MODEL_PATH] -``` + - Single device -- multi-device (recommended) + ```python + # enter script dir, train CenterFace + cd scripts + # you need to change the parameter in train_standalone.sh + # or use symbolic link as quick start + # or use the command as follow: + # USE_DEVICE_ID: your device + # PRETRAINED_BACKBONE: your pretrained model path + # DATASET: dataset path + # ANNOTATIONS: annotation path + # images: img_dir in dataset path + sh train_standalone.sh [USE_DEVICE_ID] [PRETRAINED_BACKBONE] [DATASET] [ANNOTATIONS] [IMAGES] + # after training + cp device0/output/*/*.ckpt [MODEL_PATH] + ``` -```python -# enter script dir, train CenterFace -cd scripts; -# you need to change the parameter in train_distribute.sh -# or use symbolic link as quick start -# or use the command as follow, most are the same as train_standalone.sh, the different is RANK_TABLE -# RANK_TABLE: for multi-device only, from generate_rank_table.py or user writing -sh train_distribute.sh [RANK_TABLE] [PRETRAINED_BACKBONE] [DATASET] [ANNOTATIONS] [IMAGES] -# after training -cp device0/outputs/*/*.ckpt [MODEL_PATH] -``` + - Multi-device (recommended) -After training with 8 device, the loss value will be achieved as follows: + ```python + # enter script dir, train CenterFace + cd scripts; + # you need to change the parameter in train_distribute.sh + # or use symbolic link as quick start + # or use the command as follow, most are the same as train_standalone.sh, the different is RANK_TABLE + # RANK_TABLE: for multi-device only, from generate_rank_table.py or user writing + sh train_distribute.sh [RANK_TABLE] [PRETRAINED_BACKBONE] [DATASET] [ANNOTATIONS] [IMAGES] + # after training 
+ cp device0/output/*/*.ckpt [MODEL_PATH] + ``` -```python -# grep "loss is " device0/xxx.log -# epoch: 1 step: 1, loss is greater than 500 and less than 5000 -2020-09-24 19:00:53,550:INFO:epoch:1, iter:0, average_loss:loss:1148.415649, loss:1148.4156494140625, overflow:False, loss_scale:1024.0 -[WARNING] DEBUG(51499,python):2020-09-24-19:00:53.590.008 [mindspore/ccsrc/debug/dump_proto.cc:218] SetValueToProto] Unsupported type UInt -2020-09-24 19:00:53,784:INFO:epoch:1, iter:1, average_loss:loss:798.286713, loss:448.15777587890625, overflow:False, loss_scale:1024.0 -... -2020-09-24 19:01:58,095:INFO:epoch:2, iter:197, average_loss:loss:1.942609, loss:1.5492267608642578, overflow:False, loss_scale:1024.0 -2020-09-24 19:01:58,501:INFO:epoch[2], loss:1.942609, 477.97 imgs/sec, lr:0.004000000189989805 -2020-09-24 19:01:58,502:INFO:==========end epoch=============== -2020-09-24 19:02:00,780:INFO:epoch:3, iter:0, average_loss:loss:2.107658, loss:2.1076583862304688, overflow:False, loss_scale:1024.0 -... 
-# epoch: 140 average loss is greater than 0.3 and less than 1.5: -2020-09-24 20:19:16,255:INFO:epoch:140, iter:196, average_loss:loss:0.906300, loss:1.1071504354476929, overflow:False, loss_scale:1024.0 -2020-09-24 20:19:16,347:INFO:epoch:140, iter:197, average_loss:loss:0.904684, loss:0.586264967918396, overflow:False, loss_scale:1024.0 -2020-09-24 20:19:16,747:INFO:epoch[140], loss:0.904684, 480.10 imgs/sec, lr:3.9999998989515007e-05 -2020-09-24 20:19:16,748:INFO:==========end epoch=============== -2020-09-24 20:19:16,748:INFO:==========end training=============== -``` + After training with 8 device, the loss value will be achieved as follows: -The model checkpoint will be saved in the scripts/device0/output/xxx/xxx.ckpt + ```python + # grep "loss:" device0/xxx.log + # + # epoch: 1 step: 1, loss is greater than 500 and less than 5000 + 2020-09-24 19:00:53,550:INFO:epoch:1, iter:0, average_loss:loss:1148.415649, loss:1148.4156494140625, overflow:False, loss_scale:1024.0 + [WARNING] DEBUG(51499,python):2020-09-24-19:00:53.590.008 [mindspore/ccsrc/debug/dump_proto.cc:218] SetValueToProto] Unsupported type UInt + 2020-09-24 19:00:53,784:INFO:epoch:1, iter:1, average_loss:loss:798.286713, loss:448.15777587890625, overflow:False, loss_scale:1024.0 + ... + 2020-09-24 19:01:58,095:INFO:epoch:2, iter:197, average_loss:loss:1.942609, loss:1.5492267608642578, overflow:False, loss_scale:1024.0 + 2020-09-24 19:01:58,501:INFO:epoch[2], loss:1.942609, 477.97 imgs/sec, lr:0.004000000189989805 + 2020-09-24 19:01:58,502:INFO:==========end epoch=============== + 2020-09-24 19:02:00,780:INFO:epoch:3, iter:0, average_loss:loss:2.107658, loss:2.1076583862304688, overflow:False, loss_scale:1024.0 + ... 
+ # epoch: 140 average loss is greater than 0.3 and less than 1.5: + 2020-09-24 20:19:16,255:INFO:epoch:140, iter:196, average_loss:loss:0.906300, loss:1.1071504354476929, overflow:False, loss_scale:1024.0 + 2020-09-24 20:19:16,347:INFO:epoch:140, iter:197, average_loss:loss:0.904684, loss:0.586264967918396, overflow:False, loss_scale:1024.0 + 2020-09-24 20:19:16,747:INFO:epoch[140], loss:0.904684, 480.10 imgs/sec, lr:3.9999998989515007e-05 + 2020-09-24 20:19:16,748:INFO:==========end epoch=============== + 2020-09-24 20:19:16,748:INFO:==========end training=============== + ``` + + The model checkpoint will be saved in scripts/device0/output/xxx/xxx.ckpt + +- Running on GPU + + 'task_set' is important for multi-npu train to get higher speed + --task_set: 0, not task_set; 1 task_set; + --task_set_core: task_set core number, most time = cpu number/nproc_per_node + + step1: user need train a mobilenet_v2 model by mindspore or use the script below: + + ```python + python torch_to_ms_mobilenetv2.py --ckpt_fn=./mobilenet_v2_key.ckpt --pt_fn=./mobilenet_v2-b0353104.pth --out_ckpt_fn=./mobilenet_v2.ckpt + ``` + + step2: train + + - Single device + + ```python + # enter script dir, train CenterFace + cd scripts + # you need to change the parameter in train_standalone_gpu.sh + # or use symbolic link as quick start + # or use the command as follow: + # USE_DEVICE_ID: your device + # PRETRAINED_BACKBONE: your pretrained model path + # DATASET: dataset path + # ANNOTATIONS: annotation path + # images: img_dir in dataset path + sh train_standalone_gpu.sh [USE_DEVICE_ID] [PRETRAINED_BACKBONE] [DATASET] [ANNOTATIONS] [IMAGES] + # after training + cp train_standalone_gpu/output/*/*.ckpt [MODEL_PATH] + ``` + + - Multi-device (recommended) + + ```python + # enter script dir, train CenterFace + cd scripts; + # you need to change the parameter in train_distribute_gpu.sh + # or use symbolic link as quick start + # or use the command as follow, most are the same as 
train_standalone_gpu.sh, the different is DEVICE_NUM + # DEVICE_NUM: for multi-device only, number of devices + sh train_distribute_gpu.sh [DEVICE_NUM] [PRETRAINED_BACKBONE] [DATASET] [ANNOTATIONS] [IMAGES] + # after training + cp train_distribute_gpu/output/*/*.ckpt [MODEL_PATH] + ``` + + After training with 8 device, the loss value will be achieved as follows: + + ```python + # grep "loss:" train_distribute_gpu/xxx.log + # + # epoch: 1 step: 1, loss is greater than 500 and less than 5000 + 2021-07-06 16:00:45,375:INFO:epoch:1, iter:0, avg_loss:loss:1271.834595, loss:1271.8345947265625, overflow:False, loss_scale:1024.0 + [WARNING] ME(50115:139631687231296,_GeneratorWorkerMp-42):2021-07-06-16:00:45.499.845 [mindspore/dataset/engine/queue.py:99] Using shared memory queue, but rowsize is larger than allocated memory max_rowsize 6291456 current rowwize 9550848 + 2021-07-06 16:00:45,600:INFO:epoch:1, iter:1, avg_loss:loss:1017.134613, loss:762.4346313476562, overflow:False, loss_scale:1024.0 + ... + 2021-07-06 16:01:42,710:INFO:epoch:2, iter:197, avg_loss:loss:1.906899, loss:1.6912976503372192, overflow:False, loss_scale:1024.0 + 2021-07-06 16:01:42,869:INFO:epoch[2], loss:1.906899, 442.33 imgs/sec, lr:0.004000000189989805 + 2021-07-06 16:01:42,985:INFO:epoch:3, iter:0, avg_loss:loss:1.804715, loss:1.804714560508728, overflow:False, loss_scale:1024.0 + ... 
+ # epoch: 140 average loss is greater than 0.3 and less than 1.5: + 2021-07-06 17:02:39,750:INFO:epoch:140, iter:196, avg_loss:loss:0.870886, loss:0.7947260141372681, overflow:False, loss_scale:1024.0 + 2021-07-06 17:02:39,869:INFO:epoch:140, iter:197, avg_loss:loss:0.872917, loss:1.2730457782745361, overflow:False, loss_scale:1024.0 + 2021-07-06 17:02:40,005:INFO:epoch[140], loss:0.872917, 529.03 imgs/sec, lr:3.9999998989515007e-05 + 2021-07-06 17:02:41,273:INFO:==========end training=============== + ``` ## [Testing Process](#contents) @@ -511,27 +606,29 @@ mkdir [SAVE_PATH] # you need to change the parameter in test.sh # or use symbolic link as quick start # or use the command as follow: + # DEVICE_TARGET: device where the code will be implemented. Either Ascend or GPU (default: Ascend) # MODEL_PATH: ckpt path saved during training # DATASET: img dir # GROUND_TRUTH_MAT: ground_truth file, mat type # SAVE_PATH: save_path for evaluate # DEVICE_ID: use device id # CKPT: test model name - sh test.sh [MODEL_PATH] [DATASET] [GROUND_TRUTH_MAT] [SAVE_PATH] [DEVICE_ID] [CKPT] + sh test.sh [DEVICE_TARGET] [MODEL_PATH] [DATASET] [GROUND_TRUTH_MAT] [SAVE_PATH] [DEVICE_ID] [CKPT] ``` 2. 
test many out ckpt for user to choose the best one -```python -# you need to change the parameter in test.sh -# or use symbolic link as quick start -# or use the command as follow, most are the same as test.sh, the different are: -# DEVICE_NUM: training device number -# STEPS_PER_EPOCH: steps for each epoch -# START: start loop number, used to calculate first epoch number -# END: end loop number, used to calculate last epoch number -sh test_distribute.sh [MODEL_PATH] [DATASET] [GROUND_TRUTH_MAT] [SAVE_PATH] [DEVICE_NUM] [STEPS_PER_EPOCH] [START] [END] -``` + ```python + # you need to change the parameter in test.sh + # or use symbolic link as quick start + # or use the command as follow, most are the same as test.sh, the different are: + # DEVICE_TARGET: device where the code will be implemented. Either Ascend or GPU (default: Ascend) + # DEVICE_NUM: training device number + # STEPS_PER_EPOCH: steps for each epoch + # START: start loop number, used to calculate first epoch number + # END: end loop number, used to calculate last epoch number + sh test_distribute.sh [DEVICE_TARGET] [MODEL_PATH] [DATASET] [GROUND_TRUTH_MAT] [SAVE_PATH] [DEVICE_NUM] [STEPS_PER_EPOCH] [START] [END] + ``` After testing, you can find many txt file save the box information and scores, open it you can see: @@ -572,57 +669,107 @@ cd ../../../scripts; 3. 
test+eval -```python -# you need to change the parameter in test_and_eval.sh -# or use symbolic link as quick start, default eval the ckpt saved in ./scripts/output/centerface/999 -# or use the command as follow, most are the same as test.sh, the different are: -# GROUND_TRUTH_PATH: ground truth path -sh test_and_eval.sh [MODEL_PATH] [DATASET] [GROUND_TRUTH_MAT] [SAVE_PATH] [CKPT] [GROUND_TRUTH_PATH] -``` + ```python + # you need to change the parameter in test_and_eval.sh + # or use symbolic link as quick start, default eval the ckpt saved in ./scripts/output/centerface/999 + # or use the command as follow, most are the same as test.sh, the different are: + # GROUND_TRUTH_PATH: ground truth path + sh test_and_eval.sh [DEVICE_TARGET] [MODEL_PATH] [DATASET] [GROUND_TRUTH_MAT] [SAVE_PATH] [CKPT] [GROUND_TRUTH_PATH] + ``` -you can see the MAP below by eval.sh +- Running on Ascend -```log -(ci3.7) [root@bms-aiserver scripts]# ./eval.sh ./ground_truth_path -start eval -==================== Results = ==================== ./scripts/output/centerface/999 -Easy Val AP: 0.923914407045363 -Medium Val AP: 0.9166100571371586 -Hard Val AP: 0.7810750535799462 -================================================= -end eval -``` + you can see the MAP below by eval.sh -you can see the MAP below by eval_all.sh + ```log + (ci3.7) [root@bms-aiserver scripts]# ./eval.sh + start eval + ==================== Results = ==================== ./scripts/output/centerface/999 + Easy Val AP: 0.923914407045363 + Medium Val AP: 0.9166100571371586 + Hard Val AP: 0.7810750535799462 + ================================================= + end eval + ``` -```log -(ci3.7) [root@bms-aiserver scripts]# ./eval_all.sh ./ground_truth_path -==================== Results = ==================== ./scripts/output/centerface/89 -Easy Val AP: 0.8884892849068273 -Medium Val AP: 0.8928813452811216 -Hard Val AP: 0.7721131614294564 -================================================= -==================== Results = 
==================== ./scripts/output/centerface/90 -Easy Val AP: 0.8836073914165545 -Medium Val AP: 0.8875938506473486 -Hard Val AP: 0.775956751740446 -... -==================== Results = ==================== ./scripts/output/centerface/125 -Easy Val AP: 0.923914407045363 -Medium Val AP: 0.9166100571371586 -Hard Val AP: 0.7810750535799462 -================================================= -==================== Results = ==================== ./scripts/output/centerface/126 -Easy Val AP: 0.9218741197149122 -Medium Val AP: 0.9151860193570651 -Hard Val AP: 0.7825645670331809 -... -==================== Results = ==================== ./scripts/output/centerface/140 -Easy Val AP: 0.9250715236965638 -Medium Val AP: 0.9170429723233877 -Hard Val AP: 0.7822182013830674 -================================================= -``` + you can see the MAP below by eval_all.sh + + ```log + (ci3.7) [root@bms-aiserver scripts]# ./eval_all.sh + ==================== Results = ==================== ./scripts/output/centerface/89 + Easy Val AP: 0.8884892849068273 + Medium Val AP: 0.8928813452811216 + Hard Val AP: 0.7721131614294564 + ================================================= + ==================== Results = ==================== ./scripts/output/centerface/90 + Easy Val AP: 0.8836073914165545 + Medium Val AP: 0.8875938506473486 + Hard Val AP: 0.775956751740446 + ... + ==================== Results = ==================== ./scripts/output/centerface/125 + Easy Val AP: 0.923914407045363 + Medium Val AP: 0.9166100571371586 + Hard Val AP: 0.7810750535799462 + ================================================= + ==================== Results = ==================== ./scripts/output/centerface/126 + Easy Val AP: 0.9218741197149122 + Medium Val AP: 0.9151860193570651 + Hard Val AP: 0.7825645670331809 + ... 
+ ==================== Results = ==================== ./scripts/output/centerface/140 + Easy Val AP: 0.9250715236965638 + Medium Val AP: 0.9170429723233877 + Hard Val AP: 0.7822182013830674 + ================================================= + ``` + +- Running on GPU + + you can see the MAP below from eval.sh + + ```log + (markus) rescue@distrubuteddata13: ./scripts$ bash eval.sh + start eval + ==================== Results = ==================== ./scripts/output/centerface/140 + Easy Val AP: 0.9240708943779239 + Medium Val AP: 0.9193106635436091 + Hard Val AP: 0.7777030480280428 + ================================================= + end eval + ``` + + you can see the MAP below from eval_all.sh + + ```log + (markus) rescue@distrubuteddata13: ./scripts$ bash eval_all.sh + ==================== Results = ==================== ./scripts/output/centerface/89 + Easy Val AP: 0.9138417914429035 + Medium Val AP: 0.9052437122819539 + Hard Val AP: 0.7705692348147004 + ================================================= + ==================== Results = ==================== ./scripts/output/centerface/90 + Easy Val AP: 0.8820974959531916 + Medium Val AP: 0.8902186098138436 + Hard Val AP: 0.7655257898032033 + ================================================= + ... + ==================== Results = ==================== /home/rescue/markus/markus_repo/mindspore/model_zoo/official/cv/centerface/scripts/output/centerface/125 + Easy Val AP: 0.9240525949727452 + Medium Val AP: 0.9180645371016661 + Hard Val AP: 0.782047346778918 + ================================================= + ==================== Results = ==================== /home/rescue/markus/markus_repo/mindspore/model_zoo/official/cv/centerface/scripts/output/centerface/126 + Easy Val AP: 0.9199560196120761 + Medium Val AP: 0.9157462777329638 + Hard Val AP: 0.7814679399942209 + ================================================= + ... 
+ ==================== Results = ==================== /home/rescue/markus/markus_repo/mindspore/model_zoo/official/cv/centerface/scripts/output/centerface/140 + Easy Val AP: 0.9240708943779239 + Medium Val AP: 0.9193106635436091 + Hard Val AP: 0.7777030480280428 + ================================================= + ``` ## [Inference process](#contents) @@ -678,36 +825,36 @@ Hard Val AP: 0.776737419299741 CenterFace on 13K images(The annotation and data format must be the same as widerFace) -| Parameters | CenterFace | -| -------------------------- | ----------------------------------------------------------- | -| Resource | Ascend 910; CPU 2.60GHz, 192cores; Memory 755G; OS Euler2.8 | -| uploaded Date | 10/29/2020 (month/day/year) | -| MindSpore Version | 1.0.0 | -| Dataset | 13K images | -| Training Parameters | epoch=140, steps=198 * epoch, batch_size = 8, lr=0.004 | -| Optimizer | Adam | -| Loss Function | Focal Loss, L1 Loss, Smooth L1 Loss | -| outputs | heatmaps | -| Loss | 0.3-1.5, average loss for last epoch is in 0.8-1.0 | -| Speed | 1p 65 img/s, 8p 475 img/s | -| Total time | train(8p) 1.1h, test 50min, eval 5-10min | -| Checkpoint for Fine tuning | 22M (.ckpt file) | -| Scripts | | +| Parameters | Ascend | GPU | +| -------------------------- | ----------------------------------------------------------- | -----------------------------------------| +| Resource | Ascend 910; CPU 2.60GHz, 192cores; Memory 755G; OS Euler2.8 | Tesla V100 PCIe 32GB; CPU 2.70GHz; 52cores; Memory 1510G; OS Ubuntu 18.04.5 | +| uploaded Date | 10/29/2020 (month/day/year) | 7/9/2021 (month/day/year) | +| MindSpore Version | 1.0.0 | 1.3.0 | +| Dataset | 13K images | 13K images | +| Training Parameters | epoch=140, steps=198 * epoch, batch_size = 8, lr=0.004 | epoch=140, steps=198 * epoch, batch_size = 8, lr=0.004 | +| Optimizer | Adam | Adam | +| Loss Function | Focal Loss, L1 Loss, Smooth L1 Loss | Focal Loss, L1 Loss, Smooth L1 Loss | +| outputs | heatmaps | heatmaps | +| Loss | 
0.3-1.5, average loss for last epoch is in 0.8-1.0 | iter loss for last epoch 0.3-3.3, average loss for last epoch is in 0.75-1.05 | +| Speed | 1p 65 img/s, 8p 475 img/s | 1gpu 80 img/s, 8gpu 480 img/s | +| Total time | train(8p) 1.1h, test 50min, eval 5-10min | train(8gpu) 1.0h, test 35 min, eval 5-10min | +| Checkpoint for Fine tuning | 22M (.ckpt file) | 23M (.ckpt file) | +| Scripts | | | ### Inference Performance CenterFace on 3.2K images(The annotation and data format must be the same as widerFace) -| Parameters | CenterFace | -| -------------------------- | ----------------------------------------------------------- | -| Resource | Ascend 910; CPU 2.60GHz, 192cores; Memory 755G; OS Euler2.8 | -| uploaded Date | 10/29/2020 (month/day/year) | -| MindSpore Version | 1.0.0 | -| Dataset | 3.2K images | -| batch_size | 1 | -| outputs | box position and sorces, and probability | -| Accuracy | Easy 92.2% Medium 91.5% Hard 78.2% (+-0.5%) | -| Model for inference | 22M (.ckpt file) | +| Parameters | Ascend | GPU | +| -------------------------- | ----------------------------------------------------------- | ------------------------------------------ | +| Resource | Ascend 910; CPU 2.60GHz, 192cores; Memory 755G; OS Euler2.8 | Tesla V100 PCIe 32GB; CPU 2.70GHz; 52cores; Memory 1510G; OS Ubuntu 18.04.5 | +| uploaded Date | 10/29/2020 (month/day/year) | 7/9/2021 (month/day/year) | +| MindSpore Version | 1.0.0 | 1.3.0 | +| Dataset | 3.2K images | 3.2K images | +| batch_size | 1 | 1 | +| outputs | box position and scores, and probability | box position and scores, and probability | +| Accuracy | Easy 92.2% Medium 91.5% Hard 78.2% (+-0.5%) | Easy 92.4% Medium 91.9% Hard 77.8% (+-0.5%) | +| Model for inference | 22M (.ckpt file) | 23M (.ckpt file) | ### 310Inference Performance diff --git a/model_zoo/official/cv/centerface/dependency/evaluate/eval.py b/model_zoo/official/cv/centerface/dependency/evaluate/eval.py index b565cce4028..031aa1497b5 100644 --- 
a/model_zoo/official/cv/centerface/dependency/evaluate/eval.py +++ b/model_zoo/official/cv/centerface/dependency/evaluate/eval.py @@ -39,7 +39,7 @@ from bbox import bbox_overlaps def get_gt_boxes(gt_dir): """ gt dir: (wider_face_val.mat, wider_easy_val.mat, wider_medium_val.mat, wider_hard_val.mat)""" - gt_mat = loadmat(os.path.join(gt_dir, 'wider_face_val.mat')) # you own ground_truth name + gt_mat = loadmat(os.path.join(gt_dir, 'val.mat')) # you own ground_truth name hard_mat = loadmat(os.path.join(gt_dir, 'wider_hard_val.mat')) medium_mat = loadmat(os.path.join(gt_dir, 'wider_medium_val.mat')) easy_mat = loadmat(os.path.join(gt_dir, 'wider_easy_val.mat')) diff --git a/model_zoo/official/cv/centerface/scripts/test.sh b/model_zoo/official/cv/centerface/scripts/test.sh index 4d623bd608f..ee719554631 100644 --- a/model_zoo/official/cv/centerface/scripts/test.sh +++ b/model_zoo/official/cv/centerface/scripts/test.sh @@ -1,5 +1,5 @@ #!/bin/bash -# Copyright 2020 Huawei Technologies Co., Ltd +# Copyright 2020-21 Huawei Technologies Co., Ltd # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -14,14 +14,15 @@ # limitations under the License. 
# ============================================================================ -if [ $# -gt 6 ] +if [ $# -gt 7 ] then - echo "Usage: sh test.sh [MODEL_PATH] [DATASET] [GROUND_TRUTH_MAT] [SAVE_PATH] [DEVICE_ID] [CKPT]" - echo " or: sh test.sh [MODEL_PATH] [DATASET] [GROUND_TRUTH_MAT] [SAVE_PATH] [DEVICE_ID]" - echo " or: sh test.sh [MODEL_PATH] [DATASET] [GROUND_TRUTH_MAT] [SAVE_PATH]" - echo " or: sh test.sh [MODEL_PATH] [DATASET] [GROUND_TRUTH_MAT]" - echo " or: sh test.sh [MODEL_PATH] [DATASET]" - echo " or: sh test.sh [MODEL_PATH]" + echo "Usage: sh test.sh [DEVICE_TARGET] [MODEL_PATH] [DATASET] [GROUND_TRUTH_MAT] [SAVE_PATH] [DEVICE_ID] [CKPT]" + echo " or: sh test.sh [DEVICE_TARGET] [MODEL_PATH] [DATASET] [GROUND_TRUTH_MAT] [SAVE_PATH] [DEVICE_ID]" + echo " or: sh test.sh [DEVICE_TARGET] [MODEL_PATH] [DATASET] [GROUND_TRUTH_MAT] [SAVE_PATH]" + echo " or: sh test.sh [DEVICE_TARGET] [MODEL_PATH] [DATASET] [GROUND_TRUTH_MAT]" + echo " or: sh test.sh [DEVICE_TARGET] [MODEL_PATH] [DATASET]" + echo " or: sh test.sh [DEVICE_TARGET] [MODEL_PATH]" + echo " or: sh test.sh [DEVICE_TARGET]" echo " or: sh test.sh " exit 1 fi @@ -50,32 +51,43 @@ dataset_root=$root/dataset dataset_path=$dataset_root/centerface/images/val/images/ ground_truth_mat=$dataset_root/centerface/ground_truth/val.mat save_path=$root/output/centerface/ +device_target="Ascend" device_id=0 -ckpt="0-125_24750.ckpt" # the model saved for epoch=125 +ckpt="0-140_221620.ckpt" # the model saved for epoch=140 -if [ $# == 1 ] +if [ $# -ge 1 ] then - model_path=$(get_real_path $1) - if [ ! -f $model_path ] + device_target="$1" + if [ "$device_target" != "Ascend" ] && [ "$device_target" != "GPU" ] + then + echo "error: device_target=$device_target is not a valid option (Ascend or GPU)" + exit 1 + fi +fi + +if [ $# -ge 2 ] +then + model_path=$(get_real_path $2) + if [ ! 
-d $model_path ] then echo "error: model_path=$model_path is not a file" exit 1 fi fi -if [ $# == 2 ] +if [ $# -ge 3 ] then - dataset_path=$(get_real_path $2) - if [ ! -f $dataset_path ] + dataset_path=$(get_real_path $3) + if [ ! -d $dataset_path ] then echo "error: dataset_path=$dataset_path is not a file" exit 1 fi fi -if [ $# == 3 ] +if [ $# -ge 4 ] then - ground_truth_mat=$(get_real_path $3) + ground_truth_mat=$(get_real_path $4) if [ ! -f $ground_truth_mat ] then echo "error: ground_truth_mat=$ground_truth_mat is not a file" @@ -83,24 +95,24 @@ then fi fi -if [ $# == 4 ] +if [ $# -ge 5 ] then - save_path=$(get_real_path $4) - if [ ! -f $save_path ] + save_path=$(get_real_path $5) + if [ ! -d $save_path ] then echo "error: save_path=$save_path is not a file" exit 1 fi fi -if [ $# == 5 ] +if [ $# -ge 6 ] then - device_id=$5 + device_id=$6 fi -if [ $# == 6 ] +if [ $# == 7 ] then - ckpt=$6 + ckpt=$7 fi echo $model_path @@ -126,6 +138,7 @@ python ${dirname_path}/${SCRIPT_NAME} \ --ground_truth_mat=$ground_truth_mat \ --save_dir=$save_path \ --rank=$device_id \ + --device_target=$device_target \ --ckpt_name=$ckpt > test.log 2>&1 & echo 'running' diff --git a/model_zoo/official/cv/centerface/scripts/test_and_eval.sh b/model_zoo/official/cv/centerface/scripts/test_and_eval.sh index 6a6e1ea4f34..e52e0a59fae 100644 --- a/model_zoo/official/cv/centerface/scripts/test_and_eval.sh +++ b/model_zoo/official/cv/centerface/scripts/test_and_eval.sh @@ -1,5 +1,5 @@ #!/bin/bash -# Copyright 2020 Huawei Technologies Co., Ltd +# Copyright 2020-21 Huawei Technologies Co., Ltd # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -14,15 +14,16 @@ # limitations under the License. 
# ============================================================================ -if [ $# -gt 6 ] +if [ $# -gt 8 ] then - echo "Usage: sh test_and_eval.sh [MODEL_PATH] [DATASET] [GROUND_TRUTH_MAT] [SAVE_PATH] [DEVICE_ID] [CKPT] [GROUND_TRUTH_PATH]" - echo " or: sh test_and_eval.sh [MODEL_PATH] [DATASET] [GROUND_TRUTH_MAT] [SAVE_PATH] [DEVICE_ID] [CKPT]" - echo " or: sh test_and_eval.sh [MODEL_PATH] [DATASET] [GROUND_TRUTH_MAT] [SAVE_PATH] [DEVICE_ID]" - echo " or: sh test_and_eval.sh [MODEL_PATH] [DATASET] [GROUND_TRUTH_MAT] [SAVE_PATH]" - echo " or: sh test_and_eval.sh [MODEL_PATH] [DATASET] [GROUND_TRUTH_MAT]" - echo " or: sh test_and_eval.sh [MODEL_PATH] [DATASET]" - echo " or: sh test_and_eval.sh [MODEL_PATH]" + echo "Usage: sh test_and_eval.sh [DEVICE_TARGET] [MODEL_PATH] [DATASET] [GROUND_TRUTH_MAT] [SAVE_PATH] [DEVICE_ID] [CKPT] [GROUND_TRUTH_PATH]" + echo " or: sh test_and_eval.sh [DEVICE_TARGET] [MODEL_PATH] [DATASET] [GROUND_TRUTH_MAT] [SAVE_PATH] [DEVICE_ID] [CKPT]" + echo " or: sh test_and_eval.sh [DEVICE_TARGET] [MODEL_PATH] [DATASET] [GROUND_TRUTH_MAT] [SAVE_PATH] [DEVICE_ID]" + echo " or: sh test_and_eval.sh [DEVICE_TARGET] [MODEL_PATH] [DATASET] [GROUND_TRUTH_MAT] [SAVE_PATH]" + echo " or: sh test_and_eval.sh [DEVICE_TARGET] [MODEL_PATH] [DATASET] [GROUND_TRUTH_MAT]" + echo " or: sh test_and_eval.sh [DEVICE_TARGET] [MODEL_PATH] [DATASET]" + echo " or: sh test_and_eval.sh [DEVICE_TARGET] [MODEL_PATH]" + echo " or: sh test_and_eval.sh [DEVICE_TARGET]" echo " or: sh test_and_eval.sh " exit 1 fi @@ -51,14 +52,24 @@ dataset_root=$root/dataset dataset_path=$dataset_root/centerface/images/val/images/ ground_truth_mat=$dataset_root/centerface/ground_truth/val.mat save_path=$root/output/centerface/999 +device_target="Ascend" device_id=0 -ckpt="0-125_24750.ckpt" # the model saved for epoch=125 +ckpt="0-140_221620.ckpt" # the model saved for epoch=125 ground_truth_path=$root/dataset/centerface/ground_truth if [ $# -ge 1 ] then - model_path=$(get_real_path $1) -# 
if [ ! -f $model_path ] + device_target="$1" + if [ "$device_target" != "Ascend" ] && [ "$device_target" != "GPU" ] + then + echo "error: device_target=$device_target is not a valid option (Ascend or GPU)" + exit 1 + fi +fi + +if [ $# -ge 2 ] +then + model_path=$(get_real_path $2) if [ ! -d $model_path ] then echo "error: model_path=$model_path is not a dir" @@ -66,9 +77,9 @@ then fi fi -if [ $# -ge 2 ] +if [ $# -ge 3 ] then - dataset_path=$(get_real_path $2) + dataset_path=$(get_real_path $3) if [ ! -d $dataset_path ] then echo "error: dataset_path=$dataset_path is not a dir" @@ -76,9 +87,9 @@ then fi fi -if [ $# -ge 3 ] +if [ $# -ge 4 ] then - ground_truth_mat=$(get_real_path $3) + ground_truth_mat=$(get_real_path $4) if [ ! -f $ground_truth_mat ] then echo "error: ground_truth_mat=$ground_truth_mat is not a file" @@ -86,9 +97,9 @@ then fi fi -if [ $# -ge 4 ] +if [ $# -ge 5 ] then - save_path=$(get_real_path $4) + save_path=$(get_real_path $5) if [ ! -d $save_path ] then echo "error: save_path=$save_path is not a dir" @@ -96,19 +107,19 @@ then fi fi -if [ $# -ge 5 ] -then - device_id=$5 -fi - if [ $# -ge 6 ] then - ckpt=$6 + device_id=$6 fi if [ $# -ge 7 ] then - ground_truth_path=$(get_real_path $7) + ckpt=$7 +fi + +if [ $# == 8 ] +then + ground_truth_path=$(get_real_path $8) if [ ! 
-f $ground_truth_path ] then echo "error: ground_truth_path=$ground_truth_path is not a file" @@ -142,6 +153,7 @@ python ${dirname_path}/${SCRIPT_NAME} \ --rank=$device_id \ --ckpt_name=$ckpt \ --eval=1 \ + --device_target=$device_target \ --ground_truth_path=$ground_truth_path > test.log 2>&1 & echo 'running' diff --git a/model_zoo/official/cv/centerface/scripts/test_distribute.sh b/model_zoo/official/cv/centerface/scripts/test_distribute.sh index 3cfc82934e4..d14c84df6c8 100644 --- a/model_zoo/official/cv/centerface/scripts/test_distribute.sh +++ b/model_zoo/official/cv/centerface/scripts/test_distribute.sh @@ -1,5 +1,5 @@ #!/bin/bash -# Copyright 2020 Huawei Technologies Co., Ltd +# Copyright 2020-21 Huawei Technologies Co., Ltd # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -14,18 +14,19 @@ # limitations under the License. # ============================================================================ -if [ $# -gt 8 ] +if [ $# -gt 9 ] then - echo "Usage: sh test_distribute.sh [MODEL_PATH] [DATASET] [GROUND_TRUTH_MAT] [SAVE_PATH] [DEVICE_NUM] [STEPS_PER_EPOCH] [START] [END]" - echo " or: sh test_distribute.sh [MODEL_PATH] [DATASET] [GROUND_TRUTH_MAT] [SAVE_PATH] [DEVICE_NUM] [STEPS_PER_EPOCH] [START]" - echo " or: sh test_distribute.sh [MODEL_PATH] [DATASET] [GROUND_TRUTH_MAT] [SAVE_PATH] [DEVICE_NUM] [STEPS_PER_EPOCH]" - echo " or: sh test_distribute.sh [MODEL_PATH] [DATASET] [GROUND_TRUTH_MAT] [SAVE_PATH] [DEVICE_NUM]" - echo " or: sh test_distribute.sh [MODEL_PATH] [DATASET] [GROUND_TRUTH_MAT] [SAVE_PATH] [DEVICE_NUM]" - echo " or: sh test_distribute.sh [MODEL_PATH] [DATASET] [GROUND_TRUTH_MAT] [SAVE_PATH]" - echo " or: sh test_distribute.sh [MODEL_PATH] [DATASET] [GROUND_TRUTH_MAT]" - echo " or: sh test_distribute.sh [MODEL_PATH] [DATASET]" - echo " or: sh test_distribute.sh [MODEL_PATH] [DATASET]" - echo " or: sh test_distribute.sh [MODEL_PATH]" + echo "Usage: sh 
test_distribute.sh [DEVICE_TARGET] [MODEL_PATH] [DATASET] [GROUND_TRUTH_MAT] [SAVE_PATH] [DEVICE_NUM] [STEPS_PER_EPOCH] [START] [END]" + echo " or: sh test_distribute.sh [DEVICE_TARGET] [MODEL_PATH] [DATASET] [GROUND_TRUTH_MAT] [SAVE_PATH] [DEVICE_NUM] [STEPS_PER_EPOCH] [START]" + echo " or: sh test_distribute.sh [DEVICE_TARGET] [MODEL_PATH] [DATASET] [GROUND_TRUTH_MAT] [SAVE_PATH] [DEVICE_NUM] [STEPS_PER_EPOCH]" + echo " or: sh test_distribute.sh [DEVICE_TARGET] [MODEL_PATH] [DATASET] [GROUND_TRUTH_MAT] [SAVE_PATH] [DEVICE_NUM]" + echo " or: sh test_distribute.sh [DEVICE_TARGET] [MODEL_PATH] [DATASET] [GROUND_TRUTH_MAT] [SAVE_PATH] [DEVICE_NUM]" + echo " or: sh test_distribute.sh [DEVICE_TARGET] [MODEL_PATH] [DATASET] [GROUND_TRUTH_MAT] [SAVE_PATH]" + echo " or: sh test_distribute.sh [DEVICE_TARGET] [MODEL_PATH] [DATASET] [GROUND_TRUTH_MAT]" + echo " or: sh test_distribute.sh [DEVICE_TARGET] [MODEL_PATH] [DATASET]" + echo " or: sh test_distribute.sh [DEVICE_TARGET] [MODEL_PATH] [DATASET]" + echo " or: sh test_distribute.sh [DEVICE_TARGET] [MODEL_PATH]" + echo " or: sh test_distribute.sh [DEVICE_TARGET]" echo " or: sh test_distribute.sh " exit 1 fi @@ -58,6 +59,7 @@ save_path=$root/output/centerface/ # model/ckpt name is "0-" + str(ckpt_num) + "_" + str(198*ckpt_num) + ".ckpt"; # ckpt_num is epoch number, can be calculated by device_num # detail can be found in "test.py" +device_target="Ascend" device_num=8 steps_per_epoch=198 #198 for 8P; 1583 for 1p start=11 # start epoch number = start * device_num + min(device_phy_id) + 1 @@ -65,8 +67,17 @@ end=18 # end epoch number = end * device_num + max(device_phy_id) + 1 if [ $# -ge 1 ] then - model_path=$(get_real_path $1) -# if [ ! -f $model_path ] + device_target="$1" + if [ "$device_target" != "Ascend" ] && [ "$device_target" != "GPU" ] + then + echo "error: device_target=$device_target is not a valid option (Ascend or GPU)" + exit 1 + fi +fi + +if [ $# -ge 2 ] +then + model_path=$(get_real_path $2) if [ ! 
-d $model_path ] then echo "error: model_path=$model_path is not a dir" @@ -74,9 +85,9 @@ then fi fi -if [ $# -ge 2 ] +if [ $# -ge 3 ] then - dataset_path=$(get_real_path $2) + dataset_path=$(get_real_path $3) if [ ! -d $dataset_path ] then echo "error: dataset_path=$dataset_path is not a dir" @@ -84,9 +95,9 @@ then fi fi -if [ $# -ge 3 ] +if [ $# -ge 4 ] then - ground_truth_mat=$(get_real_path $3) + ground_truth_mat=$(get_real_path $4) if [ ! -f $ground_truth_mat ] then echo "error: ground_truth_mat=$ground_truth_mat is not a file" @@ -94,9 +105,9 @@ then fi fi -if [ $# -ge 4 ] +if [ $# -ge 5 ] then - save_path=$(get_real_path $4) + save_path=$(get_real_path $5) if [ ! -d $save_path ] then echo "error: save_path=$save_path is not a dir" @@ -104,24 +115,24 @@ then fi fi -if [ $# -ge 5 ] -then - device_num=$5 -fi - if [ $# -ge 6 ] then - steps_per_epoch=$6 + device_num=$6 fi if [ $# -ge 7 ] then - start=$7 + steps_per_epoch=$7 fi -if [ $# == 8 ] +if [ $# -ge 8 ] then - end=$8 + start=$8 +fi + +if [ $# == 9 ] +then + end=$9 fi echo $model_path @@ -150,6 +161,7 @@ do --save_dir=$save_path \ --rank=$i \ --device_num=$device_num \ + --device_target=$device_target \ --steps_per_epoch=$steps_per_epoch \ --start=$start \ --end=$end > test.log 2>&1 & diff --git a/model_zoo/official/cv/centerface/scripts/train_distribute_gpu.sh b/model_zoo/official/cv/centerface/scripts/train_distribute_gpu.sh new file mode 100644 index 00000000000..c8b626de9ee --- /dev/null +++ b/model_zoo/official/cv/centerface/scripts/train_distribute_gpu.sh @@ -0,0 +1,132 @@ +#!/bin/bash +# Copyright 2021 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ + +if [ $# != 0 ] && [ $# != 1 ] && [ $# != 2 ] && [ $# != 3 ] && [ $# != 4 ] && [ $# != 5 ] +then + echo "Usage: sh train_distribute_gpu.sh [DEVICE_NUM] [PRETRAINED_BACKBONE] [DATASET] [ANNOTATIONS] [IMAGES]" + echo " or: sh train_distribute_gpu.sh [DEVICE_NUM] [PRETRAINED_BACKBONE] [DATASET] [ANNOTATIONS]" + echo " or: sh train_distribute_gpu.sh [DEVICE_NUM] [PRETRAINED_BACKBONE] [DATASET]" + echo " or: sh train_distribute_gpu.sh [DEVICE_NUM] [PRETRAINED_BACKBONE]" + echo " or: sh train_distribute_gpu.sh [DEVICE_NUM]" + echo " or: sh train_distribute_gpu.sh " +exit 1 +fi + +get_real_path(){ + if [ "${1:0:1}" == "/" ]; then + echo "$1" + else + echo "$(realpath -m $PWD/$1)" + fi +} + +current_exec_path=$(pwd) +echo ${current_exec_path} + +dirname_path=$(dirname "$(pwd)") +echo ${dirname_path} + +rm -rf ${current_exec_path}/train_distribute_gpu +SCRIPT_NAME='train.py' + +ulimit -c unlimited + +root=${current_exec_path} # your script path +pretrained_backbone=${dirname_path}/mobilenet_v2.ckpt # or mobilenet_v2-b0353104.ckpt +dataset_path=$root/dataset/centerface +annot_path=$dataset_path/annotations/train.json +img_dir=$dataset_path/images/train/images +num_devices=8 + +if [ $# == 1 ] +then + num_devices=$1 +fi + +if [ $# == 2 ] +then + pretrained_backbone=$(get_real_path $2) + if [ ! -f $pretrained_backbone ] + then + echo "error: pretrained_backbone=$pretrained_backbone is not a file" + exit 1 + fi +fi + +if [ $# == 3 ] +then + dataset_path=$(get_real_path $3) + if [ ! 
-f $dataset_path ] + then + echo "error: dataset_path=$dataset_path is not a file" + exit 1 + fi +fi + +if [ $# == 4 ] +then + annot_path=$(get_real_path $4) + if [ ! -f $annot_path ] + then + echo "error: annot_path=$annot_path is not a file" + exit 1 + fi +fi + +if [ $# == 5 ] +then + img_dir=$(get_real_path $5) + if [ ! -f $img_dir ] + then + echo "error: img_dir=$img_dir is not a file" + exit 1 + fi +fi + +echo $pretrained_backbone +echo $dataset_path +echo $annot_path +echo $img_dir + +export PYTHONPATH=${dirname_path}:$PYTHONPATH +export RANK_SIZE=$num_devices +export DEVICE_ID=0 + +echo "start training on $RANK_SIZE devices" + +mkdir ${current_exec_path}/train_distribute_gpu +cd ${current_exec_path}/train_distribute_gpu || exit + +mpirun -n $RANK_SIZE \ + python ${dirname_path}/${SCRIPT_NAME} \ + --lr=4e-3 \ + --per_batch_size=8 \ + --is_distributed=1 \ + --t_max=140 \ + --max_epoch=140 \ + --warmup_epochs=0 \ + --lr_scheduler=multistep \ + --lr_epochs=90,120 \ + --weight_decay=0.0000 \ + --loss_scale=1024 \ + --pretrained_backbone=$pretrained_backbone \ + --data_dir=$dataset_path \ + --annot_path=$annot_path \ + --img_dir=$img_dir \ + --device_target="GPU" > train.log 2>&1 & + + +echo 'running' diff --git a/model_zoo/official/cv/centerface/scripts/train_standalone_gpu.sh b/model_zoo/official/cv/centerface/scripts/train_standalone_gpu.sh new file mode 100644 index 00000000000..6a187d66936 --- /dev/null +++ b/model_zoo/official/cv/centerface/scripts/train_standalone_gpu.sh @@ -0,0 +1,146 @@ +#!/bin/bash +# Copyright 2021 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ + +if [ $# != 0 ] && [ $# != 1 ] && [ $# != 2 ] && [ $# != 3 ] && [ $# != 4 ] && [ $# != 5 ] +then + echo "Usage: sh train_standalone_gpu.sh [USE_DEVICE_ID] [PRETRAINED_BACKBONE] [DATASET] [ANNOTATIONS] [IMAGES]" + echo " or: sh train_standalone_gpu.sh [USE_DEVICE_ID] [PRETRAINED_BACKBONE] [DATASET] [ANNOTATIONS]" + echo " or: sh train_standalone_gpu.sh [USE_DEVICE_ID] [PRETRAINED_BACKBONE] [DATASET]" + echo " or: sh train_standalone_gpu.sh [USE_DEVICE_ID] [PRETRAINED_BACKBONE]" + echo " or: sh train_standalone_gpu.sh [USE_DEVICE_ID]" + echo " or: sh train_standalone_gpu.sh " +exit 1 +fi + +get_real_path(){ + if [ "${1:0:1}" == "/" ]; then + echo "$1" + else + echo "$(realpath -m $PWD/$1)" + fi +} + +current_exec_path=$(pwd) +echo "current_exec_path: " ${current_exec_path} + +dirname_path=$(dirname "$(pwd)") +echo "dirname_path: " ${dirname_path} + +SCRIPT_NAME='train.py' + +ulimit -c unlimited + +root=${current_exec_path} # your script path +pretrained_backbone=${dirname_path}/mobilenet_v2.ckpt # or mobilenet_v2-b0353104.ckpt +dataset_path=$root/dataset/centerface +annot_path=$dataset_path/annotations/train.json +img_dir=$dataset_path/images/train/images +use_device_id=0 + +if [ $# == 1 ] +then + use_device_id=$1 +fi + +if [ $# == 2 ] +then + use_device_id=$1 + pretrained_backbone=$(get_real_path $2) +fi + +if [ $# == 3 ] +then + use_device_id=$1 + pretrained_backbone=$(get_real_path $2) + dataset_path=$(get_real_path $3) +fi + +if [ $# == 4 ] +then + use_device_id=$1 + 
pretrained_backbone=$(get_real_path $2) + dataset_path=$(get_real_path $3) + annot_path=$(get_real_path $4) +fi + +if [ $# == 5 ] +then + use_device_id=$1 + pretrained_backbone=$(get_real_path $2) + dataset_path=$(get_real_path $3) + annot_path=$(get_real_path $4) + img_dir=$(get_real_path $5) +fi + +echo "use_device_id: " $use_device_id +echo "pretrained_backbone: " $pretrained_backbone +echo "dataset_path: " $dataset_path +echo "annot_path: " $annot_path +echo "img_dir: " $img_dir + +if [ ! -f $pretrained_backbone ] +then + echo "error: pretrained_backbone=$pretrained_backbone is not a file" +exit 1 +fi + +if [ ! -d $dataset_path ] +then + echo "error: dataset_path=$dataset_path is not a directory" +exit 1 +fi + +if [ ! -f $annot_path ] +then + echo "error: annot_path=$annot_path is not a file" +exit 1 +fi + +if [ ! -d $img_dir ] +then + echo "error: img_dir=$img_dir is not a directory" +exit 1 +fi + +export PYTHONPATH=${dirname_path}:$PYTHONPATH +export RANK_SIZE=1 + +echo 'start training' +echo 'start rank '$use_device_id +rm -rf ${current_exec_path}/train_standalone_gpu +mkdir ${current_exec_path}/train_standalone_gpu +cd ${current_exec_path}/train_standalone_gpu || exit +export RANK_ID=0 +dev=`expr $use_device_id + 0` +export DEVICE_ID=$dev +python ${dirname_path}/${SCRIPT_NAME} \ + --lr=5e-4 \ + --per_batch_size=8 \ + --is_distributed=0 \ + --t_max=140 \ + --max_epoch=140 \ + --warmup_epochs=0 \ + --lr_scheduler=multistep \ + --lr_epochs=90,120 \ + --weight_decay=0.0000 \ + --loss_scale=1024 \ + --pretrained_backbone=$pretrained_backbone \ + --data_dir=$dataset_path \ + --annot_path=$annot_path \ + --img_dir=$img_dir \ + --device_target="GPU" > train.log 2>&1 & + +echo 'running' diff --git a/model_zoo/official/cv/centerface/src/centerface.py b/model_zoo/official/cv/centerface/src/centerface.py index a5b482f0edb..aae19169a39 100644 --- a/model_zoo/official/cv/centerface/src/centerface.py +++ b/model_zoo/official/cv/centerface/src/centerface.py @@ -1,4 +1,4 @@ 
-# Copyright 2020 Huawei Technologies Co., Ltd +# Copyright 2020-21 Huawei Technologies Co., Ltd # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -39,6 +39,13 @@ reciprocal = P.Reciprocal() def tensor_grad_scale(scale, grad): return grad * reciprocal(scale) +_grad_overflow = C.MultitypeFuncGraph("_grad_overflow") +grad_overflow = P.FloatStatus() + +@_grad_overflow.register("Tensor") +def _tensor_grad_overflow(grad): + return grad_overflow(grad) + def conv1x1(in_channels, out_channels, stride=1, padding=0, has_bias=False): return nn.Conv2d(in_channels, out_channels, kernel_size=1, stride=stride, has_bias=has_bias, padding=padding, pad_mode="pad") @@ -240,9 +247,16 @@ class TrainingWrapper(nn.Cell): self.grad_reducer = nn.DistributedGradReducer(optimizer.parameters, mean, degree) self.hyper_map = C.HyperMap() - self.alloc_status = NPUAllocFloatStatus() - self.get_status = NPUGetFloatStatus() - self.clear_status = NPUClearFloatStatus() + if context.get_context("device_target") == "GPU": + self.gpu_target = True + self.float_status = P.FloatStatus() + self.addn = P.AddN() + self.reshape = P.Reshape() + else: + self.gpu_target = False + self.alloc_status = NPUAllocFloatStatus() + self.get_status = NPUGetFloatStatus() + self.clear_status = NPUClearFloatStatus() self.reduce_sum = ReduceSum(keep_dims=False) self.base = Tensor(1, mstype.float32) self.less_equal = LessEqual() @@ -257,12 +271,15 @@ class TrainingWrapper(nn.Cell): weights = self.weights loss = self.network(x, hm, reg_mask, ind, wh, wight_mask, hm_offset, hps_mask, landmarks) - # init overflow buffer - init = self.alloc_status() - init = F.depend(init, loss) - # clear overflow buffer - clear_status = self.clear_status(init) - loss = F.depend(loss, clear_status) + init = False + + if not self.gpu_target: + # init overflow buffer + init = self.alloc_status() + init = F.depend(init, loss) + # clear overflow buffer + 
clear_status = self.clear_status(init) + loss = F.depend(loss, clear_status) #sens = sens_input #P.Fill()(P.DType()(loss), P.Shape()(loss), sens_input) # user can contral loss scale by add a sens_input sens = P.Fill()(P.DType()(loss), P.Shape()(loss), self.sens) @@ -272,12 +289,20 @@ class TrainingWrapper(nn.Cell): if self.reducer_flag: grads = self.grad_reducer(grads) - # get the overflow buffer - init = F.depend(init, grads) - get_status = self.get_status(init) - init = F.depend(init, get_status) - # sum overflow buffer elements, 0:not overflow , >0:overflow - flag_sum = self.reduce_sum(init, (0,)) + if not self.gpu_target: + # get the overflow buffer + init = F.depend(init, grads) + + get_status = self.get_status(init) + init = F.depend(init, get_status) + # sum overflow buffer elements, 0:not overflow , >0:overflow + flag_sum = self.reduce_sum(init, (0,)) + else: + flag_sum = self.hyper_map(F.partial(_grad_overflow), grads) + flag_sum = self.addn(flag_sum) + # convert flag_sum to scalar + flag_sum = self.reshape(flag_sum, (())) + if self.is_distributed: # sum overflow flag over devices flag_reduce = self.allreduce(flag_sum) diff --git a/model_zoo/official/cv/centerface/test.py b/model_zoo/official/cv/centerface/test.py index 47654739e2d..b5635e79cf8 100644 --- a/model_zoo/official/cv/centerface/test.py +++ b/model_zoo/official/cv/centerface/test.py @@ -1,4 +1,4 @@ -# Copyright 2020 Huawei Technologies Co., Ltd +# Copyright 2020-21 Huawei Technologies Co., Ltd # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
@@ -34,8 +34,11 @@ from dependency.centernet.src.lib.detectors.base_detector import CenterFaceDetec from dependency.evaluate.eval import evaluation dev_id = get_device_id() -context.set_context(mode=context.GRAPH_MODE, enable_auto_mixed_precision=False, - device_target="Ascend", save_graphs=False, device_id=dev_id) +context.set_context(mode=context.GRAPH_MODE, + device_target=config.device_target, save_graphs=False, device_id=dev_id) + +if config.device_target == "Ascend": + context.set_context(enable_auto_mixed_precision=False) def modelarts_process(): config.data_dir = config.data_path diff --git a/model_zoo/official/cv/centerface/train.py b/model_zoo/official/cv/centerface/train.py index 74650ce842f..57a7d05603f 100644 --- a/model_zoo/official/cv/centerface/train.py +++ b/model_zoo/official/cv/centerface/train.py @@ -1,4 +1,4 @@ -# Copyright 2020 Huawei Technologies Co., Ltd +# Copyright 2020-21 Huawei Technologies Co., Ltd # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
@@ -52,14 +52,18 @@ from src.model_utils.device_adapter import get_device_id set_seed(1) dev_id = get_device_id() -context.set_context(mode=context.GRAPH_MODE, enable_auto_mixed_precision=False, - device_target="Ascend", save_graphs=False, device_id=dev_id, reserve_class_name_in_scope=False) +context.set_context(mode=context.GRAPH_MODE, device_target=config.device_target, + save_graphs=False, device_id=dev_id, reserve_class_name_in_scope=False) + +if config.device_target == "Ascend": + context.set_context(enable_auto_mixed_precision=False) if config.lr_scheduler == 'cosine_annealing' and config.max_epoch > config.t_max: config.t_max = config.max_epoch config.lr_epochs = list(map(int, config.lr_epochs.split(','))) + def convert_training_shape(args_): """ Convert training shape @@ -81,10 +85,12 @@ class InternalCallbackParam(dict): def modelarts_pre_process(): config.ckpt_path = os.path.join(config.output_path, config.ckpt_path) + @moxing_wrapper(pre_process=modelarts_pre_process) def train_centerface(): pass + if __name__ == "__main__": train_centerface() print('\ntrain.py config:\n', config) @@ -103,7 +109,8 @@ if __name__ == "__main__": config.rank_save_ckpt_flag = 1 # logger - config.outputs_dir = os.path.join(config.ckpt_path, datetime.datetime.now().strftime('%Y-%m-%d_time_%H_%M_%S')) + config.outputs_dir = os.path.join( + config.ckpt_path, datetime.datetime.now().strftime('%Y-%m-%d_time_%H_%M_%S')) if config.need_profiler: profiler = Profiler(output_path=config.outputs_dir) @@ -120,14 +127,16 @@ if __name__ == "__main__": # Notice: parameter_broadcast should be supported, but current version has bugs, thus been disabled. 
# To make sure the init weight on all npu is the same, we need to set a static seed in default_recurisive_init when weight initialization - context.set_auto_parallel_context(parallel_mode=parallel_mode, gradients_mean=True, device_num=degree) + context.set_auto_parallel_context( + parallel_mode=parallel_mode, gradients_mean=True, device_num=degree) network = CenterfaceMobilev2() # init, to avoid overflow, some std of weight should be enough small default_recurisive_init(network) if config.pretrained_backbone: network = load_backbone(network, config.pretrained_backbone, config) - print('load pre-trained backbone {} into network'.format(config.pretrained_backbone)) + print( + 'load pre-trained backbone {} into network'.format(config.pretrained_backbone)) else: print('Not load pre-trained backbone, please be careful') @@ -155,9 +164,10 @@ if __name__ == "__main__": config.multi_scale = [convert_training_shape(config)] # data loader - data_loader, config.steps_per_epoch = GetDataLoader(per_batch_size=config.per_batch_size, \ - max_epoch=config.max_epoch, rank=config.rank, group_size=config.group_size, \ - config=config, split='train') + data_loader, config.steps_per_epoch = GetDataLoader(per_batch_size=config.per_batch_size, + max_epoch=config.max_epoch, rank=config.rank, + group_size=config.group_size, + config=config, split='train') config.steps_per_epoch = config.steps_per_epoch // config.max_epoch print('Finish loading dataset') @@ -238,7 +248,7 @@ if __name__ == "__main__": run_context = RunContext(cb_params) ckpt_cb.begin(run_context) - print('config.steps_per_epoch = {} config.ckpt_interval ={}'.format(config.steps_per_epoch, \ + print('config.steps_per_epoch = {} config.ckpt_interval ={}'.format(config.steps_per_epoch, config.ckpt_interval)) t_end = time.time() @@ -258,12 +268,13 @@ if __name__ == "__main__": hps_mask = Tensor(hps_mask) landmarks = Tensor(landmarks) - loss, overflow, scaling = network(images, hm, reg_mask, ind, wh, wight_mask, hm_offset, 
hps_mask, landmarks) + loss, overflow, scaling = network( + images, hm, reg_mask, ind, wh, wight_mask, hm_offset, hps_mask, landmarks) # Tensor to numpy overflow = np.all(overflow.asnumpy()) loss = loss.asnumpy() loss_meter.update(loss) - print('epoch:{}, iter:{}, avg_loss:{}, loss:{}, overflow:{}, loss_scale:{}'.format( \ + print('epoch:{}, iter:{}, avg_loss:{}, loss:{}, overflow:{}, loss_scale:{}'.format( epoch, i, loss_meter, loss, overflow, scaling.asnumpy())) if config.rank_save_ckpt_flag: @@ -280,7 +291,7 @@ if __name__ == "__main__": print( 'epoch[{}], {}, {:.2f} imgs/sec, lr:{}' .format(epoch, loss_meter, fps, lr[i + (epoch-1)*config.steps_per_epoch]) - ) + ) t_end = time.time() loss_meter.reset()