!20118 Add CenterFace training and testing on GPU
Merge pull request !20118 from markuskunej/centreface_gpu
This commit is contained in:
commit
051ed05046
|
@ -84,8 +84,8 @@ other datasets need to use the same format as WiderFace.
|
|||
|
||||
# [Environment Requirements](#contents)
|
||||
|
||||
- Hardware(Ascend)
|
||||
- Prepare hardware environment with Ascend processor.
|
||||
- Hardware(Ascend/GPU)
|
||||
- Prepare hardware environment with Ascend or GPU processor.
|
||||
- Framework
|
||||
- [MindSpore](https://www.mindspore.cn/install/en)
|
||||
- For more information, please check the resources below:
|
||||
|
@ -105,7 +105,7 @@ step1: prepare pretrained model: train a mobilenet_v2 model by mindspore or use
|
|||
# The key/cell/module name must be as follows, otherwise you need to modify the "name_map" function:
|
||||
# --mindspore: as the same as mobilenet_v2_key.ckpt
|
||||
# --pytorch: same as official pytorch model(e.g., official mobilenet_v2-b0353104.pth)
|
||||
python convert_weight_centerface.py --ckpt_fn=./mobilenet_v2_key.ckpt --pt_fn=./mobilenet_v2-b0353104.pth --out_ckpt_fn=./mobilenet_v2.ckpt
|
||||
python convert_weight_mobilenetv2.py --ckpt_fn=./mobilenet_v2_key.ckpt --pt_fn=./mobilenet_v2-b0353104.pth --out_ckpt_fn=./mobilenet_v2.ckpt
|
||||
```
|
||||
|
||||
step2: prepare dataset
|
||||
|
@ -116,7 +116,7 @@ step2: prepare dataset
|
|||
|
||||
 3)download training annotations from [annotations](https://pan.baidu.com/s/1j_2wggZ3bvCuOAfZvjWqTg). password: **f9hh**
|
||||
|
||||
step3: prepare user rank_table
|
||||
step3 (ASCEND ONLY): prepare user rank_table
|
||||
|
||||
```python
|
||||
# user can use your own rank table file
|
||||
|
@ -137,13 +137,25 @@ ls ./dataset/centerface/annotations/train.json # annot_path
|
|||
ls ./dataset/centerface/images/train/images # img_dir
|
||||
```
|
||||
|
||||
```python
|
||||
# enter script dir, train CenterFace
|
||||
sh train_distribute.sh
|
||||
# after training
|
||||
mkdir ./model
|
||||
cp device0/outputs/*/*.ckpt ./model # cp model to [MODEL_PATH]
|
||||
```
|
||||
- Train on Ascend
|
||||
|
||||
```python
|
||||
# enter script dir, train CenterFace
|
||||
sh train_distribute.sh
|
||||
# after training
|
||||
mkdir ./model
|
||||
cp device0/output/*/*.ckpt ./model # cp model to [MODEL_PATH]
|
||||
```
|
||||
|
||||
- Train on GPU
|
||||
|
||||
```python
|
||||
# enter script dir, train CenterFace
|
||||
sh train_distribute_gpu.sh
|
||||
# after training
|
||||
mkdir ./model
|
||||
cp train_distribute_gpu/output/*/*.ckpt ./model # cp model to [MODEL_PATH]
|
||||
```
|
||||
|
||||
step5: test
|
||||
|
||||
|
@ -163,10 +175,19 @@ ls ./dataset/images/val/images/ # data path
|
|||
ls ./dataset/centerface/ground_truth/val.mat # annot_path
|
||||
```
|
||||
|
||||
```python
|
||||
# test CenterFace
|
||||
sh test_distribute.sh
|
||||
```
|
||||
- Test on Ascend
|
||||
|
||||
```python
|
||||
# test CenterFace
|
||||
sh test_distribute.sh
|
||||
```
|
||||
|
||||
- Test on GPU
|
||||
|
||||
```bash
|
||||
# test CenterFace
|
||||
bash test_distribute GPU
|
||||
```
|
||||
|
||||
step6: eval
|
||||
|
||||
|
@ -304,7 +325,9 @@ sh eval_all.sh [ground_truth_path]
|
|||
│ ├──test_distribute.sh // testing a range of models
|
||||
│ ├──test_and_eval.sh // test then evaluate a single model
|
||||
│ ├──train_standalone.sh // train in ascend with single npu
|
||||
│ ├──train_standalone_gpu.sh // train on GPU with single npu
|
||||
│ ├──train_distribute.sh // train in ascend with multi npu
|
||||
│ ├──train_distribute_gpu.sh // train on GPU with multi npu
|
||||
├── src
|
||||
│ ├──__init__.py
|
||||
│ ├──centerface.py // centerface networks, training entry
|
||||
|
@ -320,7 +343,7 @@ sh eval_all.sh [ground_truth_path]
|
|||
| ├──config.py // Processing configuration parameters
|
||||
| ├──device_adapter.py // Get cloud ID
|
||||
| ├──local_adapter.py // Get local ID
|
||||
| └ ──moxing_adapter.py // Parameter processing
|
||||
| ├──moxing_adapter.py // Parameter processing
|
||||
└── dependency // third party codes: MIT License
|
||||
├──extd // training dependency: data augmentation
|
||||
│ ├──utils
|
||||
|
@ -371,6 +394,7 @@ sh eval_all.sh [ground_truth_path]
|
|||
--data_dir: data dir
|
||||
--annot_path: annotations path
|
||||
--img_dir: img dir in data_dir
|
||||
--device_target: device where the code will be implemented. Options are "Ascend" or "GPU". (default: Ascend)
|
||||
```
|
||||
|
||||
2. centerface unique configs: in config.py; users are not recommended to change these
|
||||
|
@ -395,6 +419,7 @@ sh eval_all.sh [ground_truth_path]
|
|||
# detail can be found in "test.py"
|
||||
# if ckpt is specified, the 4 parameters below are not needed
|
||||
--device_num: training device number
|
||||
--device_target: device where the code will be implemented. Options are "Ascend" or "GPU". (default: Ascend)
|
||||
--steps_per_epoch: steps for each epoch
|
||||
--start: start loop number, used to calculate first epoch number
|
||||
--end: end loop number, used to calculate last epoch number
|
||||
|
@ -414,82 +439,152 @@ Major parameters eval.py as follows:
|
|||
|
||||
### Training
|
||||
|
||||
'task_set' is important for multi-npu train to get higher speed
|
||||
--task_set: 0, not task_set; 1 task_set;
|
||||
--task_set_core: task_set core number, most time = cpu number/nproc_per_node
|
||||
- Running on Ascend
|
||||
|
||||
step1: user need train a mobilenet_v2 model by mindspore or use the script below:
|
||||
'task_set' is important for multi-npu train to get higher speed
|
||||
--task_set: 0, not task_set; 1 task_set;
|
||||
--task_set_core: task_set core number, most time = cpu number/nproc_per_node
|
||||
|
||||
```python
|
||||
python torch_to_ms_centerface.py --ckpt_fn=./mobilenet_v2_key.ckpt --pt_fn=./mobilenet_v2-b0353104.pth --out_ckpt_fn=./mobilenet_v2.ckpt
|
||||
```
|
||||
step1: user need train a mobilenet_v2 model by mindspore or use the script below:
|
||||
|
||||
step2: prepare user rank_table
|
||||
```python
|
||||
python torch_to_ms_mobilenetv2.py --ckpt_fn=./mobilenet_v2_key.ckpt --pt_fn=./mobilenet_v2-b0353104.pth --out_ckpt_fn=./mobilenet_v2.ckpt
|
||||
```
|
||||
|
||||
```python
|
||||
# user can use your own rank table file
|
||||
# or use the [hccl_tools](https://gitee.com/mindspore/mindspore/tree/master/model_zoo/utils/hccl_tools) to generate rank table file
|
||||
# e.g., python hccl_tools.py --device_num "[0,8)"
|
||||
python hccl_tools.py --device_num "[0,8)"
|
||||
```
|
||||
step2: prepare user rank_table
|
||||
|
||||
step3: train
|
||||
```python
|
||||
# user can use your own rank table file
|
||||
# or use the [hccl_tools](https://gitee.com/mindspore/mindspore/tree/master/model_zoo/utils/hccl_tools) to generate rank table file
|
||||
# e.g., python hccl_tools.py --device_num "[0,8)"
|
||||
python hccl_tools.py --device_num "[0,8)"
|
||||
```
|
||||
|
||||
- Single device
|
||||
step3: train
|
||||
|
||||
```python
|
||||
# enter script dir, train CenterFace
|
||||
cd scripts
|
||||
# you need to change the parameter in train_standalone.sh
|
||||
# or use symbolic link as quick start
|
||||
# or use the command as follow:
|
||||
# USE_DEVICE_ID: your device
|
||||
# PRETRAINED_BACKBONE: your pretrained model path
|
||||
# DATASET: dataset path
|
||||
# ANNOTATIONS: annotation path
|
||||
# images: img_dir in dataset path
|
||||
sh train_standalone.sh [USE_DEVICE_ID] [PRETRAINED_BACKBONE] [DATASET] [ANNOTATIONS] [IMAGES]
|
||||
# after training
|
||||
cp device0/outputs/*/*.ckpt [MODEL_PATH]
|
||||
```
|
||||
- Single device
|
||||
|
||||
- multi-device (recommended)
|
||||
```python
|
||||
# enter script dir, train CenterFace
|
||||
cd scripts
|
||||
# you need to change the parameter in train_standalone.sh
|
||||
# or use symbolic link as quick start
|
||||
# or use the command as follow:
|
||||
# USE_DEVICE_ID: your device
|
||||
# PRETRAINED_BACKBONE: your pretrained model path
|
||||
# DATASET: dataset path
|
||||
# ANNOTATIONS: annotation path
|
||||
# images: img_dir in dataset path
|
||||
sh train_standalone.sh [USE_DEVICE_ID] [PRETRAINED_BACKBONE] [DATASET] [ANNOTATIONS] [IMAGES]
|
||||
# after training
|
||||
cp device0/output/*/*.ckpt [MODEL_PATH]
|
||||
```
|
||||
|
||||
```python
|
||||
# enter script dir, train CenterFace
|
||||
cd scripts;
|
||||
# you need to change the parameter in train_distribute.sh
|
||||
# or use symbolic link as quick start
|
||||
# or use the command as follow, most are the same as train_standalone.sh, the different is RANK_TABLE
|
||||
# RANK_TABLE: for multi-device only, from generate_rank_table.py or user writing
|
||||
sh train_distribute.sh [RANK_TABLE] [PRETRAINED_BACKBONE] [DATASET] [ANNOTATIONS] [IMAGES]
|
||||
# after training
|
||||
cp device0/outputs/*/*.ckpt [MODEL_PATH]
|
||||
```
|
||||
- Multi-device (recommended)
|
||||
|
||||
After training with 8 device, the loss value will be achieved as follows:
|
||||
```python
|
||||
# enter script dir, train CenterFace
|
||||
cd scripts;
|
||||
# you need to change the parameter in train_distribute.sh
|
||||
# or use symbolic link as quick start
|
||||
# or use the command as follow, most are the same as train_standalone.sh, the different is RANK_TABLE
|
||||
# RANK_TABLE: for multi-device only, from generate_rank_table.py or user writing
|
||||
sh train_distribute.sh [RANK_TABLE] [PRETRAINED_BACKBONE] [DATASET] [ANNOTATIONS] [IMAGES]
|
||||
# after training
|
||||
cp device0/output/*/*.ckpt [MODEL_PATH]
|
||||
```
|
||||
|
||||
```python
|
||||
# grep "loss is " device0/xxx.log
|
||||
# epoch: 1 step: 1, loss is greater than 500 and less than 5000
|
||||
2020-09-24 19:00:53,550:INFO:epoch:1, iter:0, average_loss:loss:1148.415649, loss:1148.4156494140625, overflow:False, loss_scale:1024.0
|
||||
[WARNING] DEBUG(51499,python):2020-09-24-19:00:53.590.008 [mindspore/ccsrc/debug/dump_proto.cc:218] SetValueToProto] Unsupported type UInt
|
||||
2020-09-24 19:00:53,784:INFO:epoch:1, iter:1, average_loss:loss:798.286713, loss:448.15777587890625, overflow:False, loss_scale:1024.0
|
||||
...
|
||||
2020-09-24 19:01:58,095:INFO:epoch:2, iter:197, average_loss:loss:1.942609, loss:1.5492267608642578, overflow:False, loss_scale:1024.0
|
||||
2020-09-24 19:01:58,501:INFO:epoch[2], loss:1.942609, 477.97 imgs/sec, lr:0.004000000189989805
|
||||
2020-09-24 19:01:58,502:INFO:==========end epoch===============
|
||||
2020-09-24 19:02:00,780:INFO:epoch:3, iter:0, average_loss:loss:2.107658, loss:2.1076583862304688, overflow:False, loss_scale:1024.0
|
||||
...
|
||||
# epoch: 140 average loss is greater than 0.3 and less than 1.5:
|
||||
2020-09-24 20:19:16,255:INFO:epoch:140, iter:196, average_loss:loss:0.906300, loss:1.1071504354476929, overflow:False, loss_scale:1024.0
|
||||
2020-09-24 20:19:16,347:INFO:epoch:140, iter:197, average_loss:loss:0.904684, loss:0.586264967918396, overflow:False, loss_scale:1024.0
|
||||
2020-09-24 20:19:16,747:INFO:epoch[140], loss:0.904684, 480.10 imgs/sec, lr:3.9999998989515007e-05
|
||||
2020-09-24 20:19:16,748:INFO:==========end epoch===============
|
||||
2020-09-24 20:19:16,748:INFO:==========end training===============
|
||||
```
|
||||
After training with 8 device, the loss value will be achieved as follows:
|
||||
|
||||
The model checkpoint will be saved in the scripts/device0/output/xxx/xxx.ckpt
|
||||
```python
|
||||
# grep "loss:" device0/xxx.log
|
||||
#
|
||||
# epoch: 1 step: 1, loss is greater than 500 and less than 5000
|
||||
2020-09-24 19:00:53,550:INFO:epoch:1, iter:0, average_loss:loss:1148.415649, loss:1148.4156494140625, overflow:False, loss_scale:1024.0
|
||||
[WARNING] DEBUG(51499,python):2020-09-24-19:00:53.590.008 [mindspore/ccsrc/debug/dump_proto.cc:218] SetValueToProto] Unsupported type UInt
|
||||
2020-09-24 19:00:53,784:INFO:epoch:1, iter:1, average_loss:loss:798.286713, loss:448.15777587890625, overflow:False, loss_scale:1024.0
|
||||
...
|
||||
2020-09-24 19:01:58,095:INFO:epoch:2, iter:197, average_loss:loss:1.942609, loss:1.5492267608642578, overflow:False, loss_scale:1024.0
|
||||
2020-09-24 19:01:58,501:INFO:epoch[2], loss:1.942609, 477.97 imgs/sec, lr:0.004000000189989805
|
||||
2020-09-24 19:01:58,502:INFO:==========end epoch===============
|
||||
2020-09-24 19:02:00,780:INFO:epoch:3, iter:0, average_loss:loss:2.107658, loss:2.1076583862304688, overflow:False, loss_scale:1024.0
|
||||
...
|
||||
# epoch: 140 average loss is greater than 0.3 and less than 1.5:
|
||||
2020-09-24 20:19:16,255:INFO:epoch:140, iter:196, average_loss:loss:0.906300, loss:1.1071504354476929, overflow:False, loss_scale:1024.0
|
||||
2020-09-24 20:19:16,347:INFO:epoch:140, iter:197, average_loss:loss:0.904684, loss:0.586264967918396, overflow:False, loss_scale:1024.0
|
||||
2020-09-24 20:19:16,747:INFO:epoch[140], loss:0.904684, 480.10 imgs/sec, lr:3.9999998989515007e-05
|
||||
2020-09-24 20:19:16,748:INFO:==========end epoch===============
|
||||
2020-09-24 20:19:16,748:INFO:==========end training===============
|
||||
```
|
||||
|
||||
The model checkpoint will be saved in scripts/device0/output/xxx/xxx.ckpt
|
||||
|
||||
- Running on GPU
|
||||
|
||||
'task_set' is important for multi-npu train to get higher speed
|
||||
--task_set: 0, not task_set; 1 task_set;
|
||||
--task_set_core: task_set core number, most time = cpu number/nproc_per_node
|
||||
|
||||
step1: users need to train a mobilenet_v2 model with mindspore or use the script below:
|
||||
|
||||
```python
|
||||
python torch_to_ms_mobilenetv2.py --ckpt_fn=./mobilenet_v2_key.ckpt --pt_fn=./mobilenet_v2-b0353104.pth --out_ckpt_fn=./mobilenet_v2.ckpt
|
||||
```
|
||||
|
||||
step2: train
|
||||
|
||||
- Single device
|
||||
|
||||
```python
|
||||
# enter script dir, train CenterFace
|
||||
cd scripts
|
||||
# you need to change the parameter in train_standalone_gpu.sh
|
||||
# or use symbolic link as quick start
|
||||
# or use the command as follow:
|
||||
# USE_DEVICE_ID: your device
|
||||
# PRETRAINED_BACKBONE: your pretrained model path
|
||||
# DATASET: dataset path
|
||||
# ANNOTATIONS: annotation path
|
||||
# images: img_dir in dataset path
|
||||
sh train_standalone_gpu.sh [USE_DEVICE_ID] [PRETRAINED_BACKBONE] [DATASET] [ANNOTATIONS] [IMAGES]
|
||||
# after training
|
||||
cp train_standalone_gpu/output/*/*.ckpt [MODEL_PATH]
|
||||
```
|
||||
|
||||
- Multi-device (recommended)
|
||||
|
||||
```python
|
||||
# enter script dir, train CenterFace
|
||||
cd scripts;
|
||||
# you need to change the parameter in train_distribute_gpu.sh
|
||||
# or use symbolic link as quick start
|
||||
# or use the command as follow, most are the same as train_standalone_gpu.sh, the different is DEVICE_NUM
|
||||
# DEVICE_NUM: for multi-device only, number of devices
|
||||
sh train_distribute_gpu.sh [DEVICE_NUM] [PRETRAINED_BACKBONE] [DATASET] [ANNOTATIONS] [IMAGES]
|
||||
# after training
|
||||
cp train_distribute_gpu/output/*/*.ckpt [MODEL_PATH]
|
||||
```
|
||||
|
||||
After training with 8 devices, the loss values will be as follows:
|
||||
|
||||
```python
|
||||
# grep "loss:" train_distribute_gpu/xxx.log
|
||||
#
|
||||
# epoch: 1 step: 1, loss is greater than 500 and less than 5000
|
||||
2021-07-06 16:00:45,375:INFO:epoch:1, iter:0, avg_loss:loss:1271.834595, loss:1271.8345947265625, overflow:False, loss_scale:1024.0
|
||||
[WARNING] ME(50115:139631687231296,_GeneratorWorkerMp-42):2021-07-06-16:00:45.499.845 [mindspore/dataset/engine/queue.py:99] Using shared memory queue, but rowsize is larger than allocated memory max_rowsize 6291456 current rowwize 9550848
|
||||
2021-07-06 16:00:45,600:INFO:epoch:1, iter:1, avg_loss:loss:1017.134613, loss:762.4346313476562, overflow:False, loss_scale:1024.0
|
||||
...
|
||||
2021-07-06 16:01:42,710:INFO:epoch:2, iter:197, avg_loss:loss:1.906899, loss:1.6912976503372192, overflow:False, loss_scale:1024.0
|
||||
2021-07-06 16:01:42,869:INFO:epoch[2], loss:1.906899, 442.33 imgs/sec, lr:0.004000000189989805
|
||||
2021-07-06 16:01:42,985:INFO:epoch:3, iter:0, avg_loss:loss:1.804715, loss:1.804714560508728, overflow:False, loss_scale:1024.0
|
||||
...
|
||||
# epoch: 140 average loss is greater than 0.3 and less than 1.5:
|
||||
2021-07-06 17:02:39,750:INFO:epoch:140, iter:196, avg_loss:loss:0.870886, loss:0.7947260141372681, overflow:False, loss_scale:1024.0
|
||||
2021-07-06 17:02:39,869:INFO:epoch:140, iter:197, avg_loss:loss:0.872917, loss:1.2730457782745361, overflow:False, loss_scale:1024.0
|
||||
2021-07-06 17:02:40,005:INFO:epoch[140], loss:0.872917, 529.03 imgs/sec, lr:3.9999998989515007e-05
|
||||
2021-07-06 17:02:41,273:INFO:==========end training===============
|
||||
```
|
||||
|
||||
## [Testing Process](#contents)
|
||||
|
||||
|
@ -511,27 +606,29 @@ mkdir [SAVE_PATH]
|
|||
# you need to change the parameter in test.sh
|
||||
# or use symbolic link as quick start
|
||||
# or use the command as follow:
|
||||
# DEVICE_TARGET: device where the code will be implemented. Either Ascend or GPU (default: Ascend)
|
||||
# MODEL_PATH: ckpt path saved during training
|
||||
# DATASET: img dir
|
||||
# GROUND_TRUTH_MAT: ground_truth file, mat type
|
||||
# SAVE_PATH: save_path for evaluate
|
||||
# DEVICE_ID: use device id
|
||||
# CKPT: test model name
|
||||
sh test.sh [MODEL_PATH] [DATASET] [GROUND_TRUTH_MAT] [SAVE_PATH] [DEVICE_ID] [CKPT]
|
||||
sh test.sh [DEVICE_TARGET] [MODEL_PATH] [DATASET] [GROUND_TRUTH_MAT] [SAVE_PATH] [DEVICE_ID] [CKPT]
|
||||
```
|
||||
|
||||
2. test many out ckpt for user to choose the best one
|
||||
|
||||
```python
|
||||
# you need to change the parameter in test.sh
|
||||
# or use symbolic link as quick start
|
||||
# or use the command as follow, most are the same as test.sh, the different are:
|
||||
# DEVICE_NUM: training device number
|
||||
# STEPS_PER_EPOCH: steps for each epoch
|
||||
# START: start loop number, used to calculate first epoch number
|
||||
# END: end loop number, used to calculate last epoch number
|
||||
sh test_distribute.sh [MODEL_PATH] [DATASET] [GROUND_TRUTH_MAT] [SAVE_PATH] [DEVICE_NUM] [STEPS_PER_EPOCH] [START] [END]
|
||||
```
|
||||
```python
|
||||
# you need to change the parameter in test.sh
|
||||
# or use symbolic link as quick start
|
||||
# or use the command as follow, most are the same as test.sh, the different are:
|
||||
# DEVICE_TARGET: device where the code will be implemented. Either Ascend or GPU (default: Ascend)
|
||||
# DEVICE_NUM: training device number
|
||||
# STEPS_PER_EPOCH: steps for each epoch
|
||||
# START: start loop number, used to calculate first epoch number
|
||||
# END: end loop number, used to calculate last epoch number
|
||||
sh test_distribute.sh [DEVICE_TARGET] [MODEL_PATH] [DATASET] [GROUND_TRUTH_MAT] [SAVE_PATH] [DEVICE_NUM] [STEPS_PER_EPOCH] [START] [END]
|
||||
```
|
||||
|
||||
After testing, you can find many txt files that save the box information and scores;
|
||||
open it you can see:
|
||||
|
@ -572,57 +669,107 @@ cd ../../../scripts;
|
|||
|
||||
3. test+eval
|
||||
|
||||
```python
|
||||
# you need to change the parameter in test_and_eval.sh
|
||||
# or use symbolic link as quick start, default eval the ckpt saved in ./scripts/output/centerface/999
|
||||
# or use the command as follow, most are the same as test.sh, the different are:
|
||||
# GROUND_TRUTH_PATH: ground truth path
|
||||
sh test_and_eval.sh [MODEL_PATH] [DATASET] [GROUND_TRUTH_MAT] [SAVE_PATH] [CKPT] [GROUND_TRUTH_PATH]
|
||||
```
|
||||
```python
|
||||
# you need to change the parameter in test_and_eval.sh
|
||||
# or use symbolic link as quick start, default eval the ckpt saved in ./scripts/output/centerface/999
|
||||
# or use the command as follow, most are the same as test.sh, the different are:
|
||||
# GROUND_TRUTH_PATH: ground truth path
|
||||
sh test_and_eval.sh [DEVICE_TARGET] [MODEL_PATH] [DATASET] [GROUND_TRUTH_MAT] [SAVE_PATH] [CKPT] [GROUND_TRUTH_PATH]
|
||||
```
|
||||
|
||||
you can see the MAP below by eval.sh
|
||||
- Running on Ascend
|
||||
|
||||
```log
|
||||
(ci3.7) [root@bms-aiserver scripts]# ./eval.sh ./ground_truth_path
|
||||
start eval
|
||||
==================== Results = ==================== ./scripts/output/centerface/999
|
||||
Easy Val AP: 0.923914407045363
|
||||
Medium Val AP: 0.9166100571371586
|
||||
Hard Val AP: 0.7810750535799462
|
||||
=================================================
|
||||
end eval
|
||||
```
|
||||
you can see the MAP below by eval.sh
|
||||
|
||||
you can see the MAP below by eval_all.sh
|
||||
```log
|
||||
(ci3.7) [root@bms-aiserver scripts]# ./eval.sh
|
||||
start eval
|
||||
==================== Results = ==================== ./scripts/output/centerface/999
|
||||
Easy Val AP: 0.923914407045363
|
||||
Medium Val AP: 0.9166100571371586
|
||||
Hard Val AP: 0.7810750535799462
|
||||
=================================================
|
||||
end eval
|
||||
```
|
||||
|
||||
```log
|
||||
(ci3.7) [root@bms-aiserver scripts]# ./eval_all.sh ./ground_truth_path
|
||||
==================== Results = ==================== ./scripts/output/centerface/89
|
||||
Easy Val AP: 0.8884892849068273
|
||||
Medium Val AP: 0.8928813452811216
|
||||
Hard Val AP: 0.7721131614294564
|
||||
=================================================
|
||||
==================== Results = ==================== ./scripts/output/centerface/90
|
||||
Easy Val AP: 0.8836073914165545
|
||||
Medium Val AP: 0.8875938506473486
|
||||
Hard Val AP: 0.775956751740446
|
||||
...
|
||||
==================== Results = ==================== ./scripts/output/centerface/125
|
||||
Easy Val AP: 0.923914407045363
|
||||
Medium Val AP: 0.9166100571371586
|
||||
Hard Val AP: 0.7810750535799462
|
||||
=================================================
|
||||
==================== Results = ==================== ./scripts/output/centerface/126
|
||||
Easy Val AP: 0.9218741197149122
|
||||
Medium Val AP: 0.9151860193570651
|
||||
Hard Val AP: 0.7825645670331809
|
||||
...
|
||||
==================== Results = ==================== ./scripts/output/centerface/140
|
||||
Easy Val AP: 0.9250715236965638
|
||||
Medium Val AP: 0.9170429723233877
|
||||
Hard Val AP: 0.7822182013830674
|
||||
=================================================
|
||||
```
|
||||
you can see the MAP below by eval_all.sh
|
||||
|
||||
```log
|
||||
(ci3.7) [root@bms-aiserver scripts]# ./eval_all.sh
|
||||
==================== Results = ==================== ./scripts/output/centerface/89
|
||||
Easy Val AP: 0.8884892849068273
|
||||
Medium Val AP: 0.8928813452811216
|
||||
Hard Val AP: 0.7721131614294564
|
||||
=================================================
|
||||
==================== Results = ==================== ./scripts/output/centerface/90
|
||||
Easy Val AP: 0.8836073914165545
|
||||
Medium Val AP: 0.8875938506473486
|
||||
Hard Val AP: 0.775956751740446
|
||||
...
|
||||
==================== Results = ==================== ./scripts/output/centerface/125
|
||||
Easy Val AP: 0.923914407045363
|
||||
Medium Val AP: 0.9166100571371586
|
||||
Hard Val AP: 0.7810750535799462
|
||||
=================================================
|
||||
==================== Results = ==================== ./scripts/output/centerface/126
|
||||
Easy Val AP: 0.9218741197149122
|
||||
Medium Val AP: 0.9151860193570651
|
||||
Hard Val AP: 0.7825645670331809
|
||||
...
|
||||
==================== Results = ==================== ./scripts/output/centerface/140
|
||||
Easy Val AP: 0.9250715236965638
|
||||
Medium Val AP: 0.9170429723233877
|
||||
Hard Val AP: 0.7822182013830674
|
||||
=================================================
|
||||
```
|
||||
|
||||
- Running on GPU
|
||||
|
||||
you can see the MAP below from eval.sh
|
||||
|
||||
```log
|
||||
(markus) rescue@distrubuteddata13: ./scripts$ bash eval.sh
|
||||
start eval
|
||||
==================== Results = ==================== ./scripts/output/centerface/140
|
||||
Easy Val AP: 0.9240708943779239
|
||||
Medium Val AP: 0.9193106635436091
|
||||
Hard Val AP: 0.7777030480280428
|
||||
=================================================
|
||||
end eval
|
||||
```
|
||||
|
||||
you can see the MAP below from eval_all.sh
|
||||
|
||||
```log
|
||||
(markus) rescue@distrubuteddata13: ./scripts$ bash eval_all.sh
|
||||
==================== Results = ==================== ./scripts/output/centerface/89
|
||||
Easy Val AP: 0.9138417914429035
|
||||
Medium Val AP: 0.9052437122819539
|
||||
Hard Val AP: 0.7705692348147004
|
||||
=================================================
|
||||
==================== Results = ==================== ./scripts/output/centerface/90
|
||||
Easy Val AP: 0.8820974959531916
|
||||
Medium Val AP: 0.8902186098138436
|
||||
Hard Val AP: 0.7655257898032033
|
||||
=================================================
|
||||
...
|
||||
==================== Results = ==================== /home/rescue/markus/markus_repo/mindspore/model_zoo/official/cv/centerface/scripts/output/centerface/125
|
||||
Easy Val AP: 0.9240525949727452
|
||||
Medium Val AP: 0.9180645371016661
|
||||
Hard Val AP: 0.782047346778918
|
||||
=================================================
|
||||
==================== Results = ==================== /home/rescue/markus/markus_repo/mindspore/model_zoo/official/cv/centerface/scripts/output/centerface/126
|
||||
Easy Val AP: 0.9199560196120761
|
||||
Medium Val AP: 0.9157462777329638
|
||||
Hard Val AP: 0.7814679399942209
|
||||
=================================================
|
||||
...
|
||||
==================== Results = ==================== /home/rescue/markus/markus_repo/mindspore/model_zoo/official/cv/centerface/scripts/output/centerface/140
|
||||
Easy Val AP: 0.9240708943779239
|
||||
Medium Val AP: 0.9193106635436091
|
||||
Hard Val AP: 0.7777030480280428
|
||||
=================================================
|
||||
```
|
||||
|
||||
## [Inference process](#contents)
|
||||
|
||||
|
@ -678,36 +825,36 @@ Hard Val AP: 0.776737419299741
|
|||
|
||||
CenterFace on 13K images(The annotation and data format must be the same as widerFace)
|
||||
|
||||
| Parameters | CenterFace |
|
||||
| -------------------------- | ----------------------------------------------------------- |
|
||||
| Resource | Ascend 910; CPU 2.60GHz, 192cores; Memory 755G; OS Euler2.8 |
|
||||
| uploaded Date | 10/29/2020 (month/day/year) |
|
||||
| MindSpore Version | 1.0.0 |
|
||||
| Dataset | 13K images |
|
||||
| Training Parameters | epoch=140, steps=198 * epoch, batch_size = 8, lr=0.004 |
|
||||
| Optimizer | Adam |
|
||||
| Loss Function | Focal Loss, L1 Loss, Smooth L1 Loss |
|
||||
| outputs | heatmaps |
|
||||
| Loss | 0.3-1.5, average loss for last epoch is in 0.8-1.0 |
|
||||
| Speed | 1p 65 img/s, 8p 475 img/s |
|
||||
| Total time | train(8p) 1.1h, test 50min, eval 5-10min |
|
||||
| Checkpoint for Fine tuning | 22M (.ckpt file) |
|
||||
| Scripts | <https://gitee.com/mindspore/mindspore/tree/master/model_zoo/official/cv/centerface> |
|
||||
| Parameters | Ascend | GPU |
|
||||
| -------------------------- | ----------------------------------------------------------- | -----------------------------------------|
|
||||
| Resource | Ascend 910; CPU 2.60GHz, 192cores; Memory 755G; OS Euler2.8 | Tesla V100 PCIe 32GB; CPU 2.70GHz; 52cores; Memory 1510G; OS Ubuntu 18.04.5 |
|
||||
| uploaded Date | 10/29/2020 (month/day/year) | 7/9/2021 (month/day/year) |
|
||||
| MindSpore Version | 1.0.0 | 1.3.0 |
|
||||
| Dataset | 13K images | 13K images |
|
||||
| Training Parameters | epoch=140, steps=198 * epoch, batch_size = 8, lr=0.004 | epoch=140, steps=198 * epoch, batch_size = 8, lr=0.004 |
|
||||
| Optimizer | Adam | Adam |
|
||||
| Loss Function | Focal Loss, L1 Loss, Smooth L1 Loss | Focal Loss, L1 Loss, Smooth L1 Loss |
|
||||
| outputs | heatmaps | heatmaps |
|
||||
| Loss | 0.3-1.5, average loss for last epoch is in 0.8-1.0 | iter loss for last epoch 0.3-3.3, average loss for last epoch is in 0.75-1.05 |
|
||||
| Speed | 1p 65 img/s, 8p 475 img/s | 1gpu 80 img/s, 8gpu 480 img/s |
|
||||
| Total time | train(8p) 1.1h, test 50min, eval 5-10min | train(8gpu) 1.0h, test 35 min, eval 5-10min |
|
||||
| Checkpoint for Fine tuning | 22M (.ckpt file) | 23M (.ckpt file) |
|
||||
| Scripts | <https://gitee.com/mindspore/mindspore/tree/master/model_zoo/official/cv/centerface> | <https://gitee.com/mindspore/mindspore/tree/master/model_zoo/official/cv/centerface> |
|
||||
|
||||
### Inference Performance
|
||||
|
||||
CenterFace on 3.2K images(The annotation and data format must be the same as widerFace)
|
||||
|
||||
| Parameters | CenterFace |
|
||||
| -------------------------- | ----------------------------------------------------------- |
|
||||
| Resource | Ascend 910; CPU 2.60GHz, 192cores; Memory 755G; OS Euler2.8 |
|
||||
| uploaded Date | 10/29/2020 (month/day/year) |
|
||||
| MindSpore Version | 1.0.0 |
|
||||
| Dataset | 3.2K images |
|
||||
| batch_size | 1 |
|
||||
| outputs | box position and sorces, and probability |
|
||||
| Accuracy | Easy 92.2% Medium 91.5% Hard 78.2% (+-0.5%) |
|
||||
| Model for inference | 22M (.ckpt file) |
|
||||
| Parameters | Ascend | GPU |
|
||||
| -------------------------- | ----------------------------------------------------------- | ------------------------------------------ |
|
||||
| Resource | Ascend 910; CPU 2.60GHz, 192cores; Memory 755G; OS Euler2.8 | Tesla V100 PCIe 32GB; CPU 2.70GHz; 52cores; Memory 1510G; OS Ubuntu 18.04.5 |
|
||||
| uploaded Date | 10/29/2020 (month/day/year) | 7/9/2021 (month/day/year) |
|
||||
| MindSpore Version | 1.0.0 | 1.3.0
|
||||
| Dataset | 3.2K images | 3.2K images |
|
||||
| batch_size | 1 | 1 |
|
||||
| outputs | box position and scores, and probability | box position and scores, and probability |
|
||||
| Accuracy | Easy 92.2% Medium 91.5% Hard 78.2% (+-0.5%) | Easy 92.4% Medium 91.9% Hard 77.8% (+-0.5%) |
|
||||
| Model for inference | 22M (.ckpt file) | 23M (.ckpt file) |
|
||||
|
||||
### 310Inference Performance
|
||||
|
||||
|
|
|
@ -39,7 +39,7 @@ from bbox import bbox_overlaps
|
|||
def get_gt_boxes(gt_dir):
|
||||
""" gt dir: (wider_face_val.mat, wider_easy_val.mat, wider_medium_val.mat, wider_hard_val.mat)"""
|
||||
|
||||
gt_mat = loadmat(os.path.join(gt_dir, 'wider_face_val.mat')) # you own ground_truth name
|
||||
gt_mat = loadmat(os.path.join(gt_dir, 'val.mat')) # your own ground_truth name
|
||||
hard_mat = loadmat(os.path.join(gt_dir, 'wider_hard_val.mat'))
|
||||
medium_mat = loadmat(os.path.join(gt_dir, 'wider_medium_val.mat'))
|
||||
easy_mat = loadmat(os.path.join(gt_dir, 'wider_easy_val.mat'))
|
||||
|
|
|
@ -1,5 +1,5 @@
|
|||
#!/bin/bash
|
||||
# Copyright 2020 Huawei Technologies Co., Ltd
|
||||
# Copyright 2020-21 Huawei Technologies Co., Ltd
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
|
@ -14,14 +14,15 @@
|
|||
# limitations under the License.
|
||||
# ============================================================================
|
||||
|
||||
if [ $# -gt 6 ]
|
||||
if [ $# -gt 7 ]
|
||||
then
|
||||
echo "Usage: sh test.sh [MODEL_PATH] [DATASET] [GROUND_TRUTH_MAT] [SAVE_PATH] [DEVICE_ID] [CKPT]"
|
||||
echo " or: sh test.sh [MODEL_PATH] [DATASET] [GROUND_TRUTH_MAT] [SAVE_PATH] [DEVICE_ID]"
|
||||
echo " or: sh test.sh [MODEL_PATH] [DATASET] [GROUND_TRUTH_MAT] [SAVE_PATH]"
|
||||
echo " or: sh test.sh [MODEL_PATH] [DATASET] [GROUND_TRUTH_MAT]"
|
||||
echo " or: sh test.sh [MODEL_PATH] [DATASET]"
|
||||
echo " or: sh test.sh [MODEL_PATH]"
|
||||
echo "Usage: sh test.sh [DEVICE_TARGET] [MODEL_PATH] [DATASET] [GROUND_TRUTH_MAT] [SAVE_PATH] [DEVICE_ID] [CKPT]"
|
||||
echo " or: sh test.sh [DEVICE_TARGET] [MODEL_PATH] [DATASET] [GROUND_TRUTH_MAT] [SAVE_PATH] [DEVICE_ID]"
|
||||
echo " or: sh test.sh [DEVICE_TARGET] [MODEL_PATH] [DATASET] [GROUND_TRUTH_MAT] [SAVE_PATH]"
|
||||
echo " or: sh test.sh [DEVICE_TARGET] [MODEL_PATH] [DATASET] [GROUND_TRUTH_MAT]"
|
||||
echo " or: sh test.sh [DEVICE_TARGET] [MODEL_PATH] [DATASET]"
|
||||
echo " or: sh test.sh [DEVICE_TARGET] [MODEL_PATH]"
|
||||
echo " or: sh test.sh [DEVICE_TARGET]"
|
||||
echo " or: sh test.sh "
|
||||
exit 1
|
||||
fi
|
||||
|
@ -50,32 +51,43 @@ dataset_root=$root/dataset
|
|||
dataset_path=$dataset_root/centerface/images/val/images/
|
||||
ground_truth_mat=$dataset_root/centerface/ground_truth/val.mat
|
||||
save_path=$root/output/centerface/
|
||||
device_target="Ascend"
|
||||
device_id=0
|
||||
ckpt="0-125_24750.ckpt" # the model saved for epoch=125
|
||||
ckpt="0-140_221620.ckpt" # the model saved for epoch=140
|
||||
|
||||
if [ $# == 1 ]
|
||||
if [ $# -ge 1 ]
|
||||
then
|
||||
model_path=$(get_real_path $1)
|
||||
if [ ! -f $model_path ]
|
||||
device_target="$1"
|
||||
if [ "$device_target" != "Ascend" ] && [ "$device_target" != "GPU" ]
|
||||
then
|
||||
echo "error: device_target=$device_target is not a valid option (Ascend or GPU)"
|
||||
exit 1
|
||||
fi
|
||||
fi
|
||||
|
||||
if [ $# -ge 2 ]
|
||||
then
|
||||
model_path=$(get_real_path $2)
|
||||
if [ ! -d $model_path ]
|
||||
then
|
||||
echo "error: model_path=$model_path is not a file"
|
||||
exit 1
|
||||
fi
|
||||
fi
|
||||
|
||||
if [ $# == 2 ]
|
||||
if [ $# -ge 3 ]
|
||||
then
|
||||
dataset_path=$(get_real_path $2)
|
||||
if [ ! -f $dataset_path ]
|
||||
dataset_path=$(get_real_path $3)
|
||||
if [ ! -d $dataset_path ]
|
||||
then
|
||||
echo "error: dataset_path=$dataset_path is not a file"
|
||||
exit 1
|
||||
fi
|
||||
fi
|
||||
|
||||
if [ $# == 3 ]
|
||||
if [ $# -ge 4 ]
|
||||
then
|
||||
ground_truth_mat=$(get_real_path $3)
|
||||
ground_truth_mat=$(get_real_path $4)
|
||||
if [ ! -f $ground_truth_mat ]
|
||||
then
|
||||
echo "error: ground_truth_mat=$ground_truth_mat is not a file"
|
||||
|
@ -83,24 +95,24 @@ then
|
|||
fi
|
||||
fi
|
||||
|
||||
if [ $# == 4 ]
|
||||
if [ $# -ge 5 ]
|
||||
then
|
||||
save_path=$(get_real_path $4)
|
||||
if [ ! -f $save_path ]
|
||||
save_path=$(get_real_path $5)
|
||||
if [ ! -d $save_path ]
|
||||
then
|
||||
echo "error: save_path=$save_path is not a file"
|
||||
exit 1
|
||||
fi
|
||||
fi
|
||||
|
||||
if [ $# == 5 ]
|
||||
if [ $# -ge 6 ]
|
||||
then
|
||||
device_id=$5
|
||||
device_id=$6
|
||||
fi
|
||||
|
||||
if [ $# == 6 ]
|
||||
if [ $# == 7 ]
|
||||
then
|
||||
ckpt=$6
|
||||
ckpt=$7
|
||||
fi
|
||||
|
||||
echo $model_path
|
||||
|
@ -126,6 +138,7 @@ python ${dirname_path}/${SCRIPT_NAME} \
|
|||
--ground_truth_mat=$ground_truth_mat \
|
||||
--save_dir=$save_path \
|
||||
--rank=$device_id \
|
||||
--device_target=$device_target \
|
||||
--ckpt_name=$ckpt > test.log 2>&1 &
|
||||
|
||||
echo 'running'
|
||||
|
|
|
@ -1,5 +1,5 @@
|
|||
#!/bin/bash
|
||||
# Copyright 2020 Huawei Technologies Co., Ltd
|
||||
# Copyright 2020-21 Huawei Technologies Co., Ltd
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
|
@ -14,15 +14,16 @@
|
|||
# limitations under the License.
|
||||
# ============================================================================
|
||||
|
||||
if [ $# -gt 6 ]
|
||||
if [ $# -gt 8 ]
|
||||
then
|
||||
echo "Usage: sh test_and_eval.sh [MODEL_PATH] [DATASET] [GROUND_TRUTH_MAT] [SAVE_PATH] [DEVICE_ID] [CKPT] [GROUND_TRUTH_PATH]"
|
||||
echo " or: sh test_and_eval.sh [MODEL_PATH] [DATASET] [GROUND_TRUTH_MAT] [SAVE_PATH] [DEVICE_ID] [CKPT]"
|
||||
echo " or: sh test_and_eval.sh [MODEL_PATH] [DATASET] [GROUND_TRUTH_MAT] [SAVE_PATH] [DEVICE_ID]"
|
||||
echo " or: sh test_and_eval.sh [MODEL_PATH] [DATASET] [GROUND_TRUTH_MAT] [SAVE_PATH]"
|
||||
echo " or: sh test_and_eval.sh [MODEL_PATH] [DATASET] [GROUND_TRUTH_MAT]"
|
||||
echo " or: sh test_and_eval.sh [MODEL_PATH] [DATASET]"
|
||||
echo " or: sh test_and_eval.sh [MODEL_PATH]"
|
||||
echo "Usage: sh test_and_eval.sh [DEVICE_TARGET] [MODEL_PATH] [DATASET] [GROUND_TRUTH_MAT] [SAVE_PATH] [DEVICE_ID] [CKPT] [GROUND_TRUTH_PATH]"
|
||||
echo " or: sh test_and_eval.sh [DEVICE_TARGET] [MODEL_PATH] [DATASET] [GROUND_TRUTH_MAT] [SAVE_PATH] [DEVICE_ID] [CKPT]"
|
||||
echo " or: sh test_and_eval.sh [DEVICE_TARGET] [MODEL_PATH] [DATASET] [GROUND_TRUTH_MAT] [SAVE_PATH] [DEVICE_ID]"
|
||||
echo " or: sh test_and_eval.sh [DEVICE_TARGET] [MODEL_PATH] [DATASET] [GROUND_TRUTH_MAT] [SAVE_PATH]"
|
||||
echo " or: sh test_and_eval.sh [DEVICE_TARGET] [MODEL_PATH] [DATASET] [GROUND_TRUTH_MAT]"
|
||||
echo " or: sh test_and_eval.sh [DEVICE_TARGET] [MODEL_PATH] [DATASET]"
|
||||
echo " or: sh test_and_eval.sh [DEVICE_TARGET] [MODEL_PATH]"
|
||||
echo " or: sh test_and_eval.sh [DEVICE_TARGET]"
|
||||
echo " or: sh test_and_eval.sh "
|
||||
exit 1
|
||||
fi
|
||||
|
@ -51,14 +52,24 @@ dataset_root=$root/dataset
|
|||
dataset_path=$dataset_root/centerface/images/val/images/
|
||||
ground_truth_mat=$dataset_root/centerface/ground_truth/val.mat
|
||||
save_path=$root/output/centerface/999
|
||||
device_target="Ascend"
|
||||
device_id=0
|
||||
ckpt="0-125_24750.ckpt" # the model saved for epoch=125
|
||||
ckpt="0-140_221620.ckpt" # the model saved for epoch=125
|
||||
ground_truth_path=$root/dataset/centerface/ground_truth
|
||||
|
||||
if [ $# -ge 1 ]
|
||||
then
|
||||
model_path=$(get_real_path $1)
|
||||
# if [ ! -f $model_path ]
|
||||
device_target="$1"
|
||||
if [ "$device_target" != "Ascend" ] && [ "$device_target" != "GPU" ]
|
||||
then
|
||||
echo "error: device_target=$device_target is not a valid option (Ascend or GPU)"
|
||||
exit 1
|
||||
fi
|
||||
fi
|
||||
|
||||
if [ $# -ge 2 ]
|
||||
then
|
||||
model_path=$(get_real_path $2)
|
||||
if [ ! -d $model_path ]
|
||||
then
|
||||
echo "error: model_path=$model_path is not a dir"
|
||||
|
@ -66,9 +77,9 @@ then
|
|||
fi
|
||||
fi
|
||||
|
||||
if [ $# -ge 2 ]
|
||||
if [ $# -ge 3 ]
|
||||
then
|
||||
dataset_path=$(get_real_path $2)
|
||||
dataset_path=$(get_real_path $3)
|
||||
if [ ! -d $dataset_path ]
|
||||
then
|
||||
echo "error: dataset_path=$dataset_path is not a dir"
|
||||
|
@ -76,9 +87,9 @@ then
|
|||
fi
|
||||
fi
|
||||
|
||||
if [ $# -ge 3 ]
|
||||
if [ $# -ge 4 ]
|
||||
then
|
||||
ground_truth_mat=$(get_real_path $3)
|
||||
ground_truth_mat=$(get_real_path $4)
|
||||
if [ ! -f $ground_truth_mat ]
|
||||
then
|
||||
echo "error: ground_truth_mat=$ground_truth_mat is not a file"
|
||||
|
@ -86,9 +97,9 @@ then
|
|||
fi
|
||||
fi
|
||||
|
||||
if [ $# -ge 4 ]
|
||||
if [ $# -ge 5 ]
|
||||
then
|
||||
save_path=$(get_real_path $4)
|
||||
save_path=$(get_real_path $5)
|
||||
if [ ! -d $save_path ]
|
||||
then
|
||||
echo "error: save_path=$save_path is not a dir"
|
||||
|
@ -96,19 +107,19 @@ then
|
|||
fi
|
||||
fi
|
||||
|
||||
if [ $# -ge 5 ]
|
||||
then
|
||||
device_id=$5
|
||||
fi
|
||||
|
||||
if [ $# -ge 6 ]
|
||||
then
|
||||
ckpt=$6
|
||||
device_id=$6
|
||||
fi
|
||||
|
||||
if [ $# -ge 7 ]
|
||||
then
|
||||
ground_truth_path=$(get_real_path $7)
|
||||
ckpt=$7
|
||||
fi
|
||||
|
||||
if [ $# == 8 ]
|
||||
then
|
||||
ground_truth_path=$(get_real_path $8)
|
||||
if [ ! -f $ground_truth_path ]
|
||||
then
|
||||
echo "error: ground_truth_path=$ground_truth_path is not a file"
|
||||
|
@ -142,6 +153,7 @@ python ${dirname_path}/${SCRIPT_NAME} \
|
|||
--rank=$device_id \
|
||||
--ckpt_name=$ckpt \
|
||||
--eval=1 \
|
||||
--device_target=$device_target \
|
||||
--ground_truth_path=$ground_truth_path > test.log 2>&1 &
|
||||
|
||||
echo 'running'
|
||||
|
|
|
@ -1,5 +1,5 @@
|
|||
#!/bin/bash
|
||||
# Copyright 2020 Huawei Technologies Co., Ltd
|
||||
# Copyright 2020-21 Huawei Technologies Co., Ltd
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
|
@ -14,18 +14,19 @@
|
|||
# limitations under the License.
|
||||
# ============================================================================
|
||||
|
||||
if [ $# -gt 8 ]
|
||||
if [ $# -gt 9 ]
|
||||
then
|
||||
echo "Usage: sh test_distribute.sh [MODEL_PATH] [DATASET] [GROUND_TRUTH_MAT] [SAVE_PATH] [DEVICE_NUM] [STEPS_PER_EPOCH] [START] [END]"
|
||||
echo " or: sh test_distribute.sh [MODEL_PATH] [DATASET] [GROUND_TRUTH_MAT] [SAVE_PATH] [DEVICE_NUM] [STEPS_PER_EPOCH] [START]"
|
||||
echo " or: sh test_distribute.sh [MODEL_PATH] [DATASET] [GROUND_TRUTH_MAT] [SAVE_PATH] [DEVICE_NUM] [STEPS_PER_EPOCH]"
|
||||
echo " or: sh test_distribute.sh [MODEL_PATH] [DATASET] [GROUND_TRUTH_MAT] [SAVE_PATH] [DEVICE_NUM]"
|
||||
echo " or: sh test_distribute.sh [MODEL_PATH] [DATASET] [GROUND_TRUTH_MAT] [SAVE_PATH] [DEVICE_NUM]"
|
||||
echo " or: sh test_distribute.sh [MODEL_PATH] [DATASET] [GROUND_TRUTH_MAT] [SAVE_PATH]"
|
||||
echo " or: sh test_distribute.sh [MODEL_PATH] [DATASET] [GROUND_TRUTH_MAT]"
|
||||
echo " or: sh test_distribute.sh [MODEL_PATH] [DATASET]"
|
||||
echo " or: sh test_distribute.sh [MODEL_PATH] [DATASET]"
|
||||
echo " or: sh test_distribute.sh [MODEL_PATH]"
|
||||
echo "Usage: sh test_distribute.sh [DEVICE_TARGET] [MODEL_PATH] [DATASET] [GROUND_TRUTH_MAT] [SAVE_PATH] [DEVICE_NUM] [STEPS_PER_EPOCH] [START] [END]"
|
||||
echo " or: sh test_distribute.sh [DEVICE_TARGET] [MODEL_PATH] [DATASET] [GROUND_TRUTH_MAT] [SAVE_PATH] [DEVICE_NUM] [STEPS_PER_EPOCH] [START]"
|
||||
echo " or: sh test_distribute.sh [DEVICE_TARGET] [MODEL_PATH] [DATASET] [GROUND_TRUTH_MAT] [SAVE_PATH] [DEVICE_NUM] [STEPS_PER_EPOCH]"
|
||||
echo " or: sh test_distribute.sh [DEVICE_TARGET] [MODEL_PATH] [DATASET] [GROUND_TRUTH_MAT] [SAVE_PATH] [DEVICE_NUM]"
|
||||
echo " or: sh test_distribute.sh [DEVICE_TARGET] [MODEL_PATH] [DATASET] [GROUND_TRUTH_MAT] [SAVE_PATH] [DEVICE_NUM]"
|
||||
echo " or: sh test_distribute.sh [DEVICE_TARGET] [MODEL_PATH] [DATASET] [GROUND_TRUTH_MAT] [SAVE_PATH]"
|
||||
echo " or: sh test_distribute.sh [DEVICE_TARGET] [MODEL_PATH] [DATASET] [GROUND_TRUTH_MAT]"
|
||||
echo " or: sh test_distribute.sh [DEVICE_TARGET] [MODEL_PATH] [DATASET]"
|
||||
echo " or: sh test_distribute.sh [DEVICE_TARGET] [MODEL_PATH] [DATASET]"
|
||||
echo " or: sh test_distribute.sh [DEVICE_TARGET] [MODEL_PATH]"
|
||||
echo " or: sh test_distribute.sh [DEVICE_TARGET]"
|
||||
echo " or: sh test_distribute.sh "
|
||||
exit 1
|
||||
fi
|
||||
|
@ -58,6 +59,7 @@ save_path=$root/output/centerface/
|
|||
# model/ckpt name is "0-" + str(ckpt_num) + "_" + str(198*ckpt_num) + ".ckpt";
|
||||
# ckpt_num is epoch number, can be calculated by device_num
|
||||
# detail can be found in "test.py"
|
||||
device_target="Ascend"
|
||||
device_num=8
|
||||
steps_per_epoch=198 #198 for 8P; 1583 for 1p
|
||||
start=11 # start epoch number = start * device_num + min(device_phy_id) + 1
|
||||
|
@ -65,8 +67,17 @@ end=18 # end epoch number = end * device_num + max(device_phy_id) + 1
|
|||
|
||||
if [ $# -ge 1 ]
|
||||
then
|
||||
model_path=$(get_real_path $1)
|
||||
# if [ ! -f $model_path ]
|
||||
device_target="$1"
|
||||
if [ "$device_target" != "Ascend" ] && [ "$device_target" != "GPU" ]
|
||||
then
|
||||
echo "error: device_target=$device_target is not a valid option (Ascend or GPU)"
|
||||
exit 1
|
||||
fi
|
||||
fi
|
||||
|
||||
if [ $# -ge 2 ]
|
||||
then
|
||||
model_path=$(get_real_path $2)
|
||||
if [ ! -d $model_path ]
|
||||
then
|
||||
echo "error: model_path=$model_path is not a dir"
|
||||
|
@ -74,9 +85,9 @@ then
|
|||
fi
|
||||
fi
|
||||
|
||||
if [ $# -ge 2 ]
|
||||
if [ $# -ge 3 ]
|
||||
then
|
||||
dataset_path=$(get_real_path $2)
|
||||
dataset_path=$(get_real_path $3)
|
||||
if [ ! -d $dataset_path ]
|
||||
then
|
||||
echo "error: dataset_path=$dataset_path is not a dir"
|
||||
|
@ -84,9 +95,9 @@ then
|
|||
fi
|
||||
fi
|
||||
|
||||
if [ $# -ge 3 ]
|
||||
if [ $# -ge 4 ]
|
||||
then
|
||||
ground_truth_mat=$(get_real_path $3)
|
||||
ground_truth_mat=$(get_real_path $4)
|
||||
if [ ! -f $ground_truth_mat ]
|
||||
then
|
||||
echo "error: ground_truth_mat=$ground_truth_mat is not a file"
|
||||
|
@ -94,9 +105,9 @@ then
|
|||
fi
|
||||
fi
|
||||
|
||||
if [ $# -ge 4 ]
|
||||
if [ $# -ge 5 ]
|
||||
then
|
||||
save_path=$(get_real_path $4)
|
||||
save_path=$(get_real_path $5)
|
||||
if [ ! -d $save_path ]
|
||||
then
|
||||
echo "error: save_path=$save_path is not a dir"
|
||||
|
@ -104,24 +115,24 @@ then
|
|||
fi
|
||||
fi
|
||||
|
||||
if [ $# -ge 5 ]
|
||||
then
|
||||
device_num=$5
|
||||
fi
|
||||
|
||||
if [ $# -ge 6 ]
|
||||
then
|
||||
steps_per_epoch=$6
|
||||
device_num=$6
|
||||
fi
|
||||
|
||||
if [ $# -ge 7 ]
|
||||
then
|
||||
start=$7
|
||||
steps_per_epoch=$7
|
||||
fi
|
||||
|
||||
if [ $# == 8 ]
|
||||
if [ $# -ge 8 ]
|
||||
then
|
||||
end=$8
|
||||
start=$8
|
||||
fi
|
||||
|
||||
if [ $# == 9 ]
|
||||
then
|
||||
end=$9
|
||||
fi
|
||||
|
||||
echo $model_path
|
||||
|
@ -150,6 +161,7 @@ do
|
|||
--save_dir=$save_path \
|
||||
--rank=$i \
|
||||
--device_num=$device_num \
|
||||
--device_target=$device_target \
|
||||
--steps_per_epoch=$steps_per_epoch \
|
||||
--start=$start \
|
||||
--end=$end > test.log 2>&1 &
|
||||
|
|
|
@ -0,0 +1,132 @@
|
|||
#!/bin/bash
|
||||
# Copyright 2021 Huawei Technologies Co., Ltd
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
# ============================================================================
|
||||
|
||||
if [ $# != 0 ] && [ $# != 1 ] && [ $# != 2 ] && [ $# != 3 ] && [ $# != 4 ] && [ $# != 5 ]
|
||||
then
|
||||
echo "Usage: sh train_distribute_gpu.sh [DEVICE_NUM] [PRETRAINED_BACKBONE] [DATASET] [ANNOTATIONS] [IMAGES]"
|
||||
echo " or: sh train_distribute_gpu.sh [DEVICE_NUM] [PRETRAINED_BACKBONE] [DATASET] [ANNOTATIONS]"
|
||||
echo " or: sh train_distribute_gpu.sh [DEVICE_NUM] [PRETRAINED_BACKBONE] [DATASET]"
|
||||
echo " or: sh train_distribute_gpu.sh [DEVICE_NUM] [PRETRAINED_BACKBONE]"
|
||||
echo " or: sh train_distribute_gpu.sh [DEVICE_NUM]"
|
||||
echo " or: sh train_distribute_gpu.sh "
|
||||
exit 1
|
||||
fi
|
||||
|
||||
get_real_path(){
|
||||
if [ "${1:0:1}" == "/" ]; then
|
||||
echo "$1"
|
||||
else
|
||||
echo "$(realpath -m $PWD/$1)"
|
||||
fi
|
||||
}
|
||||
|
||||
current_exec_path=$(pwd)
|
||||
echo ${current_exec_path}
|
||||
|
||||
dirname_path=$(dirname "$(pwd)")
|
||||
echo ${dirname_path}
|
||||
|
||||
rm -rf ${current_exec_path}/train_distribute_gpu
|
||||
SCRIPT_NAME='train.py'
|
||||
|
||||
ulimit -c unlimited
|
||||
|
||||
root=${current_exec_path} # your script path
|
||||
pretrained_backbone=${dirname_path}/mobilenet_v2.ckpt # or mobilenet_v2-b0353104.ckpt
|
||||
dataset_path=$root/dataset/centerface
|
||||
annot_path=$dataset_path/annotations/train.json
|
||||
img_dir=$dataset_path/images/train/images
|
||||
num_devices=8
|
||||
|
||||
if [ $# == 1 ]
|
||||
then
|
||||
num_devices=$1
|
||||
fi
|
||||
|
||||
if [ $# == 2 ]
|
||||
then
|
||||
pretrained_backbone=$(get_real_path $2)
|
||||
if [ ! -f $pretrained_backbone ]
|
||||
then
|
||||
echo "error: pretrained_backbone=$pretrained_backbone is not a file"
|
||||
exit 1
|
||||
fi
|
||||
fi
|
||||
|
||||
if [ $# == 3 ]
|
||||
then
|
||||
dataset_path=$(get_real_path $3)
|
||||
if [ ! -f $dataset_path ]
|
||||
then
|
||||
echo "error: dataset_path=$dataset_path is not a file"
|
||||
exit 1
|
||||
fi
|
||||
fi
|
||||
|
||||
if [ $# == 4 ]
|
||||
then
|
||||
annot_path=$(get_real_path $4)
|
||||
if [ ! -f $annot_path ]
|
||||
then
|
||||
echo "error: annot_path=$annot_path is not a file"
|
||||
exit 1
|
||||
fi
|
||||
fi
|
||||
|
||||
if [ $# == 5 ]
|
||||
then
|
||||
img_dir=$(get_real_path $5)
|
||||
if [ ! -f $img_dir ]
|
||||
then
|
||||
echo "error: img_dir=$img_dir is not a file"
|
||||
exit 1
|
||||
fi
|
||||
fi
|
||||
|
||||
echo $pretrained_backbone
|
||||
echo $dataset_path
|
||||
echo $annot_path
|
||||
echo $img_dir
|
||||
|
||||
export PYTHONPATH=${dirname_path}:$PYTHONPATH
|
||||
export RANK_SIZE=$num_devices
|
||||
export DEVICE_ID=0
|
||||
|
||||
echo "start training on $RANK_SIZE devices"
|
||||
|
||||
mkdir ${current_exec_path}/train_distribute_gpu
|
||||
cd ${current_exec_path}/train_distribute_gpu || exit
|
||||
|
||||
mpirun -n $RANK_SIZE \
|
||||
python ${dirname_path}/${SCRIPT_NAME} \
|
||||
--lr=4e-3 \
|
||||
--per_batch_size=8 \
|
||||
--is_distributed=1 \
|
||||
--t_max=140 \
|
||||
--max_epoch=140 \
|
||||
--warmup_epochs=0 \
|
||||
--lr_scheduler=multistep \
|
||||
--lr_epochs=90,120 \
|
||||
--weight_decay=0.0000 \
|
||||
--loss_scale=1024 \
|
||||
--pretrained_backbone=$pretrained_backbone \
|
||||
--data_dir=$dataset_path \
|
||||
--annot_path=$annot_path \
|
||||
--img_dir=$img_dir \
|
||||
--device_target="GPU" > train.log 2>&1 &
|
||||
|
||||
|
||||
echo 'running'
|
|
@ -0,0 +1,146 @@
|
|||
#!/bin/bash
|
||||
# Copyright 2021 Huawei Technologies Co., Ltd
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
# ============================================================================
|
||||
|
||||
if [ $# != 0 ] && [ $# != 1 ] && [ $# != 2 ] && [ $# != 3 ] && [ $# != 4 ] && [ $# != 5 ]
|
||||
then
|
||||
echo "Usage: sh train_standalone_gpu.sh [USE_DEVICE_ID] [PRETRAINED_BACKBONE] [DATASET] [ANNOTATIONS] [IMAGES]"
|
||||
echo " or: sh train_standalone_gpu.sh [USE_DEVICE_ID] [PRETRAINED_BACKBONE] [DATASET] [ANNOTATIONS]"
|
||||
echo " or: sh train_standalone_gpu.sh [USE_DEVICE_ID] [PRETRAINED_BACKBONE] [DATASET]"
|
||||
echo " or: sh train_standalone_gpu.sh [USE_DEVICE_ID] [PRETRAINED_BACKBONE]"
|
||||
echo " or: sh train_standalone_gpu.sh [USE_DEVICE_ID]"
|
||||
echo " or: sh train_standalone_gpu.sh "
|
||||
exit 1
|
||||
fi
|
||||
|
||||
get_real_path(){
|
||||
if [ "${1:0:1}" == "/" ]; then
|
||||
echo "$1"
|
||||
else
|
||||
echo "$(realpath -m $PWD/$1)"
|
||||
fi
|
||||
}
|
||||
|
||||
current_exec_path=$(pwd)
|
||||
echo "current_exec_path: " ${current_exec_path}
|
||||
|
||||
dirname_path=$(dirname "$(pwd)")
|
||||
echo "dirname_path: " ${dirname_path}
|
||||
|
||||
SCRIPT_NAME='train.py'
|
||||
|
||||
ulimit -c unlimited
|
||||
|
||||
root=${current_exec_path} # your script path
|
||||
pretrained_backbone=${dirname_path}/mobilenet_v2.ckpt # or mobilenet_v2-b0353104.ckpt
|
||||
dataset_path=$root/dataset/centerface
|
||||
annot_path=$dataset_path/annotations/train.json
|
||||
img_dir=$dataset_path/images/train/images
|
||||
use_device_id=0
|
||||
|
||||
if [ $# == 1 ]
|
||||
then
|
||||
use_device_id=$1
|
||||
fi
|
||||
|
||||
if [ $# == 2 ]
|
||||
then
|
||||
use_device_id=$1
|
||||
pretrained_backbone=$(get_real_path $2)
|
||||
fi
|
||||
|
||||
if [ $# == 3 ]
|
||||
then
|
||||
use_device_id=$1
|
||||
pretrained_backbone=$(get_real_path $2)
|
||||
dataset_path=$(get_real_path $3)
|
||||
fi
|
||||
|
||||
if [ $# == 4 ]
|
||||
then
|
||||
use_device_id=$1
|
||||
pretrained_backbone=$(get_real_path $2)
|
||||
dataset_path=$(get_real_path $3)
|
||||
annot_path=$(get_real_path $4)
|
||||
fi
|
||||
|
||||
if [ $# == 5 ]
|
||||
then
|
||||
use_device_id=$1
|
||||
pretrained_backbone=$(get_real_path $2)
|
||||
dataset_path=$(get_real_path $3)
|
||||
annot_path=$(get_real_path $4)
|
||||
img_dir=$(get_real_path $5)
|
||||
fi
|
||||
|
||||
echo "use_device_id: " $use_device_id
|
||||
echo "pretrained_backbone: " $pretrained_backbone
|
||||
echo "dataset_path: " $dataset_path
|
||||
echo "annot_path: " $annot_path
|
||||
echo "img_dir: " $img_dir
|
||||
|
||||
if [ ! -f $pretrained_backbone ]
|
||||
then
|
||||
echo "error: pretrained_backbone=$pretrained_backbone is not a file"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
if [ ! -d $dataset_path ]
|
||||
then
|
||||
echo "error: dataset_path=$dataset_path is not a directory"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
if [ ! -f $annot_path ]
|
||||
then
|
||||
echo "error: annot_path=$annot_path is not a file"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
if [ ! -d $img_dir ]
|
||||
then
|
||||
echo "error: img_dir=$img_dir is not a directory"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
export PYTHONPATH=${dirname_path}:$PYTHONPATH
|
||||
export RANK_SIZE=1
|
||||
|
||||
echo 'start training'
|
||||
echo 'start rank '$use_device_id
|
||||
rm -rf ${current_exec_path}/train_standalone_gpu
|
||||
mkdir ${current_exec_path}/train_standalone_gpu
|
||||
cd ${current_exec_path}/train_standalone_gpu || exit
|
||||
export RANK_ID=0
|
||||
dev=`expr $use_device_id + 0`
|
||||
export DEVICE_ID=$dev
|
||||
python ${dirname_path}/${SCRIPT_NAME} \
|
||||
--lr=5e-4 \
|
||||
--per_batch_size=8 \
|
||||
--is_distributed=0 \
|
||||
--t_max=140 \
|
||||
--max_epoch=140 \
|
||||
--warmup_epochs=0 \
|
||||
--lr_scheduler=multistep \
|
||||
--lr_epochs=90,120 \
|
||||
--weight_decay=0.0000 \
|
||||
--loss_scale=1024 \
|
||||
--pretrained_backbone=$pretrained_backbone \
|
||||
--data_dir=$dataset_path \
|
||||
--annot_path=$annot_path \
|
||||
--img_dir=$img_dir \
|
||||
--device_target="GPU" > train.log 2>&1 &
|
||||
|
||||
echo 'running'
|
|
@ -1,4 +1,4 @@
|
|||
# Copyright 2020 Huawei Technologies Co., Ltd
|
||||
# Copyright 2020-21 Huawei Technologies Co., Ltd
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
|
@ -39,6 +39,13 @@ reciprocal = P.Reciprocal()
|
|||
def tensor_grad_scale(scale, grad):
|
||||
return grad * reciprocal(scale)
|
||||
|
||||
_grad_overflow = C.MultitypeFuncGraph("_grad_overflow")
|
||||
grad_overflow = P.FloatStatus()
|
||||
|
||||
@_grad_overflow.register("Tensor")
|
||||
def _tensor_grad_overflow(grad):
|
||||
return grad_overflow(grad)
|
||||
|
||||
def conv1x1(in_channels, out_channels, stride=1, padding=0, has_bias=False):
|
||||
return nn.Conv2d(in_channels, out_channels, kernel_size=1, stride=stride, has_bias=has_bias,
|
||||
padding=padding, pad_mode="pad")
|
||||
|
@ -240,9 +247,16 @@ class TrainingWrapper(nn.Cell):
|
|||
self.grad_reducer = nn.DistributedGradReducer(optimizer.parameters, mean, degree)
|
||||
|
||||
self.hyper_map = C.HyperMap()
|
||||
self.alloc_status = NPUAllocFloatStatus()
|
||||
self.get_status = NPUGetFloatStatus()
|
||||
self.clear_status = NPUClearFloatStatus()
|
||||
if context.get_context("device_target") == "GPU":
|
||||
self.gpu_target = True
|
||||
self.float_status = P.FloatStatus()
|
||||
self.addn = P.AddN()
|
||||
self.reshape = P.Reshape()
|
||||
else:
|
||||
self.gpu_target = False
|
||||
self.alloc_status = NPUAllocFloatStatus()
|
||||
self.get_status = NPUGetFloatStatus()
|
||||
self.clear_status = NPUClearFloatStatus()
|
||||
self.reduce_sum = ReduceSum(keep_dims=False)
|
||||
self.base = Tensor(1, mstype.float32)
|
||||
self.less_equal = LessEqual()
|
||||
|
@ -257,12 +271,15 @@ class TrainingWrapper(nn.Cell):
|
|||
weights = self.weights
|
||||
loss = self.network(x, hm, reg_mask, ind, wh, wight_mask, hm_offset, hps_mask, landmarks)
|
||||
|
||||
# init overflow buffer
|
||||
init = self.alloc_status()
|
||||
init = F.depend(init, loss)
|
||||
# clear overflow buffer
|
||||
clear_status = self.clear_status(init)
|
||||
loss = F.depend(loss, clear_status)
|
||||
init = False
|
||||
|
||||
if not self.gpu_target:
|
||||
# init overflow buffer
|
||||
init = self.alloc_status()
|
||||
init = F.depend(init, loss)
|
||||
# clear overflow buffer
|
||||
clear_status = self.clear_status(init)
|
||||
loss = F.depend(loss, clear_status)
|
||||
|
||||
#sens = sens_input #P.Fill()(P.DType()(loss), P.Shape()(loss), sens_input) # user can contral loss scale by add a sens_input
|
||||
sens = P.Fill()(P.DType()(loss), P.Shape()(loss), self.sens)
|
||||
|
@ -272,12 +289,20 @@ class TrainingWrapper(nn.Cell):
|
|||
if self.reducer_flag:
|
||||
grads = self.grad_reducer(grads)
|
||||
|
||||
# get the overflow buffer
|
||||
init = F.depend(init, grads)
|
||||
get_status = self.get_status(init)
|
||||
init = F.depend(init, get_status)
|
||||
# sum overflow buffer elements, 0:not overflow , >0:overflow
|
||||
flag_sum = self.reduce_sum(init, (0,))
|
||||
if not self.gpu_target:
|
||||
# get the overflow buffer
|
||||
init = F.depend(init, grads)
|
||||
|
||||
get_status = self.get_status(init)
|
||||
init = F.depend(init, get_status)
|
||||
# sum overflow buffer elements, 0:not overflow , >0:overflow
|
||||
flag_sum = self.reduce_sum(init, (0,))
|
||||
else:
|
||||
flag_sum = self.hyper_map(F.partial(_grad_overflow), grads)
|
||||
flag_sum = self.addn(flag_sum)
|
||||
# convert flag_sum to scalar
|
||||
flag_sum = self.reshape(flag_sum, (()))
|
||||
|
||||
if self.is_distributed:
|
||||
# sum overflow flag over devices
|
||||
flag_reduce = self.allreduce(flag_sum)
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
# Copyright 2020 Huawei Technologies Co., Ltd
|
||||
# Copyright 2020-21 Huawei Technologies Co., Ltd
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
|
@ -34,8 +34,11 @@ from dependency.centernet.src.lib.detectors.base_detector import CenterFaceDetec
|
|||
from dependency.evaluate.eval import evaluation
|
||||
|
||||
dev_id = get_device_id()
|
||||
context.set_context(mode=context.GRAPH_MODE, enable_auto_mixed_precision=False,
|
||||
device_target="Ascend", save_graphs=False, device_id=dev_id)
|
||||
context.set_context(mode=context.GRAPH_MODE,
|
||||
device_target=config.device_target, save_graphs=False, device_id=dev_id)
|
||||
|
||||
if config.device_target == "Ascend":
|
||||
context.set_context(enable_auto_mixed_precision=False)
|
||||
|
||||
def modelarts_process():
|
||||
config.data_dir = config.data_path
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
# Copyright 2020 Huawei Technologies Co., Ltd
|
||||
# Copyright 2020-21 Huawei Technologies Co., Ltd
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
|
@ -52,14 +52,18 @@ from src.model_utils.device_adapter import get_device_id
|
|||
|
||||
set_seed(1)
|
||||
dev_id = get_device_id()
|
||||
context.set_context(mode=context.GRAPH_MODE, enable_auto_mixed_precision=False,
|
||||
device_target="Ascend", save_graphs=False, device_id=dev_id, reserve_class_name_in_scope=False)
|
||||
context.set_context(mode=context.GRAPH_MODE, device_target=config.device_target,
|
||||
save_graphs=False, device_id=dev_id, reserve_class_name_in_scope=False)
|
||||
|
||||
if config.device_target == "Ascend":
|
||||
context.set_context(enable_auto_mixed_precision=False)
|
||||
|
||||
if config.lr_scheduler == 'cosine_annealing' and config.max_epoch > config.t_max:
|
||||
config.t_max = config.max_epoch
|
||||
|
||||
config.lr_epochs = list(map(int, config.lr_epochs.split(',')))
|
||||
|
||||
|
||||
def convert_training_shape(args_):
|
||||
"""
|
||||
Convert training shape
|
||||
|
@ -81,10 +85,12 @@ class InternalCallbackParam(dict):
|
|||
def modelarts_pre_process():
|
||||
config.ckpt_path = os.path.join(config.output_path, config.ckpt_path)
|
||||
|
||||
|
||||
@moxing_wrapper(pre_process=modelarts_pre_process)
|
||||
def train_centerface():
|
||||
pass
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
train_centerface()
|
||||
print('\ntrain.py config:\n', config)
|
||||
|
@ -103,7 +109,8 @@ if __name__ == "__main__":
|
|||
config.rank_save_ckpt_flag = 1
|
||||
|
||||
# logger
|
||||
config.outputs_dir = os.path.join(config.ckpt_path, datetime.datetime.now().strftime('%Y-%m-%d_time_%H_%M_%S'))
|
||||
config.outputs_dir = os.path.join(
|
||||
config.ckpt_path, datetime.datetime.now().strftime('%Y-%m-%d_time_%H_%M_%S'))
|
||||
|
||||
if config.need_profiler:
|
||||
profiler = Profiler(output_path=config.outputs_dir)
|
||||
|
@ -120,14 +127,16 @@ if __name__ == "__main__":
|
|||
|
||||
# Notice: parameter_broadcast should be supported, but current version has bugs, thus been disabled.
|
||||
# To make sure the init weight on all npu is the same, we need to set a static seed in default_recurisive_init when weight initialization
|
||||
context.set_auto_parallel_context(parallel_mode=parallel_mode, gradients_mean=True, device_num=degree)
|
||||
context.set_auto_parallel_context(
|
||||
parallel_mode=parallel_mode, gradients_mean=True, device_num=degree)
|
||||
network = CenterfaceMobilev2()
|
||||
# init, to avoid overflow, some std of weight should be enough small
|
||||
default_recurisive_init(network)
|
||||
|
||||
if config.pretrained_backbone:
|
||||
network = load_backbone(network, config.pretrained_backbone, config)
|
||||
print('load pre-trained backbone {} into network'.format(config.pretrained_backbone))
|
||||
print(
|
||||
'load pre-trained backbone {} into network'.format(config.pretrained_backbone))
|
||||
else:
|
||||
print('Not load pre-trained backbone, please be careful')
|
||||
|
||||
|
@ -155,9 +164,10 @@ if __name__ == "__main__":
|
|||
config.multi_scale = [convert_training_shape(config)]
|
||||
|
||||
# data loader
|
||||
data_loader, config.steps_per_epoch = GetDataLoader(per_batch_size=config.per_batch_size, \
|
||||
max_epoch=config.max_epoch, rank=config.rank, group_size=config.group_size, \
|
||||
config=config, split='train')
|
||||
data_loader, config.steps_per_epoch = GetDataLoader(per_batch_size=config.per_batch_size,
|
||||
max_epoch=config.max_epoch, rank=config.rank,
|
||||
group_size=config.group_size,
|
||||
config=config, split='train')
|
||||
config.steps_per_epoch = config.steps_per_epoch // config.max_epoch
|
||||
print('Finish loading dataset')
|
||||
|
||||
|
@ -238,7 +248,7 @@ if __name__ == "__main__":
|
|||
run_context = RunContext(cb_params)
|
||||
ckpt_cb.begin(run_context)
|
||||
|
||||
print('config.steps_per_epoch = {} config.ckpt_interval ={}'.format(config.steps_per_epoch, \
|
||||
print('config.steps_per_epoch = {} config.ckpt_interval ={}'.format(config.steps_per_epoch,
|
||||
config.ckpt_interval))
|
||||
|
||||
t_end = time.time()
|
||||
|
@ -258,12 +268,13 @@ if __name__ == "__main__":
|
|||
hps_mask = Tensor(hps_mask)
|
||||
landmarks = Tensor(landmarks)
|
||||
|
||||
loss, overflow, scaling = network(images, hm, reg_mask, ind, wh, wight_mask, hm_offset, hps_mask, landmarks)
|
||||
loss, overflow, scaling = network(
|
||||
images, hm, reg_mask, ind, wh, wight_mask, hm_offset, hps_mask, landmarks)
|
||||
# Tensor to numpy
|
||||
overflow = np.all(overflow.asnumpy())
|
||||
loss = loss.asnumpy()
|
||||
loss_meter.update(loss)
|
||||
print('epoch:{}, iter:{}, avg_loss:{}, loss:{}, overflow:{}, loss_scale:{}'.format( \
|
||||
print('epoch:{}, iter:{}, avg_loss:{}, loss:{}, overflow:{}, loss_scale:{}'.format(
|
||||
epoch, i, loss_meter, loss, overflow, scaling.asnumpy()))
|
||||
|
||||
if config.rank_save_ckpt_flag:
|
||||
|
@ -280,7 +291,7 @@ if __name__ == "__main__":
|
|||
print(
|
||||
'epoch[{}], {}, {:.2f} imgs/sec, lr:{}'
|
||||
.format(epoch, loss_meter, fps, lr[i + (epoch-1)*config.steps_per_epoch])
|
||||
)
|
||||
)
|
||||
t_end = time.time()
|
||||
loss_meter.reset()
|
||||
|
||||
|
|
Loading…
Reference in New Issue