forked from mindspore-Ecosystem/mindspore
GPU script adaptation
This commit is contained in:
parent
c9af7643d7
commit
f841c25f0a
|
@ -112,8 +112,12 @@ The entire code structure is as following:
|
|||
├─ scripts
|
||||
├─ run_standalone_train.sh # launch standalone training(1p) in ascend
|
||||
├─ run_distribute_train.sh # launch distributed training(8p) in ascend
|
||||
├─ run_standalone_train_gpu.sh # launch standalone training(1p) in GPU
|
||||
├─ run_distribute_train_gpu.sh # launch distributed training(8p) in GPU
|
||||
├─ run_eval.sh # launch evaluating in ascend
|
||||
├─ run_eval_gpu.sh # launch evaluating in gpu
|
||||
└─ run_export.sh # launch exporting air model
|
||||
├─ run_infer_310.sh # shell script for 310 inference
|
||||
├─ src
|
||||
├─ FaceAttribute
|
||||
├─ cross_entropy.py # cross entropy loss
|
||||
|
@ -144,14 +148,14 @@ The entire code structure is as following:
|
|||
|
||||
- Stand alone mode
|
||||
|
||||
```bash
|
||||
```bash Ascend
|
||||
cd ./scripts
|
||||
sh run_standalone_train.sh [MINDRECORD_FILE] [USE_DEVICE_ID]
|
||||
```
|
||||
|
||||
or (fine-tune)
|
||||
|
||||
```bash
|
||||
```bash Ascend
|
||||
cd ./scripts
|
||||
sh run_standalone_train.sh [MINDRECORD_FILE] [USE_DEVICE_ID] [PRETRAINED_BACKBONE]
|
||||
```
|
||||
|
@ -163,27 +167,65 @@ The entire code structure is as following:
|
|||
sh run_standalone_train.sh /home/train.mindrecord 0 /home/a.ckpt
|
||||
```
|
||||
|
||||
- Distribute mode (recommended)
|
||||
|
||||
```bash
|
||||
```bash GPU
|
||||
cd ./scripts
|
||||
sh run_distribute_train.sh [MINDRECORD_FILE] [RANK_TABLE]
|
||||
sh run_standalone_train_gpu.sh [MINDRECORD_FILE] [CUDA_VISIBLE_DEVICES]
|
||||
```
|
||||
|
||||
or (fine-tune)
|
||||
|
||||
```bash
|
||||
```bash GPU
|
||||
cd ./scripts
|
||||
sh run_distribute_train.sh [MINDRECORD_FILE] [RANK_TABLE] [PRETRAINED_BACKBONE]
|
||||
sh run_standalone_train_gpu.sh [MINDRECORD_FILE] [CUDA_VISIBLE_DEVICES] [PRETRAINED_BACKBONE]
|
||||
```
|
||||
|
||||
for example:
|
||||
|
||||
```bash
|
||||
cd ./scripts
|
||||
sh run_standalone_train_gpu.sh /home/train.mindrecord 0 /home/a.ckpt
|
||||
```
|
||||
|
||||
- Distribute mode (recommended)
|
||||
|
||||
```bash Ascend
|
||||
cd ./scripts
|
||||
sh run_distribute_train.sh [MINDRECORD_FILE] [RANK_TABLE]
|
||||
```
|
||||
|
||||
or (fine-tune)
|
||||
|
||||
```bash Ascend
|
||||
cd ./scripts
|
||||
sh run_distribute_train.sh [MINDRECORD_FILE] [RANK_TABLE] [PRETRAINED_BACKBONE]
|
||||
```
|
||||
|
||||
for example:
|
||||
|
||||
```bash Ascend
|
||||
cd ./scripts
|
||||
sh run_distribute_train.sh /home/train.mindrecord ./rank_table_8p.json /home/a.ckpt
|
||||
```
|
||||
|
||||
```bash GPU
|
||||
cd ./scripts
|
||||
sh run_distribute_train_gpu.sh [DEVICE_NUM] [CUDA_VISIBLE_DEVICES(0,1,2,3,4,5,6,7)] [MINDRECORD_FILE]
|
||||
```
|
||||
|
||||
or (fine-tune)
|
||||
|
||||
```bash GPU
|
||||
cd ./scripts
|
||||
sh run_distribute_train_gpu.sh [DEVICE_NUM] [CUDA_VISIBLE_DEVICES(0,1,2,3,4,5,6,7)] [MINDRECORD_FILE] [PRETRAINED_BACKBONE]
|
||||
```
|
||||
|
||||
for example:
|
||||
|
||||
```bash GPU
|
||||
cd ./scripts
|
||||
sh run_distribute_train_gpu.sh 8 0,1,2,3,4,5,6,7 /home/train.mindrecord /home/a.ckpt
|
||||
```
|
||||
|
||||
You will get the loss value of each step as follows in "./output/[TIME]/[TIME].log" or "./scripts/device0/train.log":
|
||||
|
||||
```python
|
||||
|
@ -285,14 +327,26 @@ epoch[69], iter[6150], loss:1.167064, 9300.77 imgs/sec
|
|||
|
||||
### Evaluation
|
||||
|
||||
```bash
|
||||
```bash Ascend
|
||||
cd ./scripts
|
||||
sh run_eval.sh [MINDRECORD_FILE] [USE_DEVICE_ID] [PRETRAINED_BACKBONE]
|
||||
```
|
||||
|
||||
for example:
|
||||
|
||||
```bash
|
||||
```bash Ascend
|
||||
cd ./scripts
|
||||
sh run_eval.sh /home/eval.mindrecord 0 /home/a.ckpt
|
||||
```
|
||||
|
||||
```bash GPU
|
||||
cd ./scripts
|
||||
sh run_eval_gpu.sh [MINDRECORD_FILE] [CUDA_VISIBLE_DEVICES] [PRETRAINED_BACKBONE]
|
||||
```
|
||||
|
||||
for example:
|
||||
|
||||
```bash GPU
|
||||
cd ./scripts
|
||||
sh run_eval_gpu.sh /home/eval.mindrecord 0 /home/a.ckpt
|
||||
```
|
||||
|
@ -315,7 +369,7 @@ mask f1: 0.9992691394116572
|
|||
|
||||
If you want to infer the network on Ascend 310, you should convert the model to AIR:
|
||||
|
||||
```bash
|
||||
```bash Ascend
|
||||
cd ./scripts
|
||||
sh run_export.sh [BATCH_SIZE] [USE_DEVICE_ID] [PRETRAINED_BACKBONE]
|
||||
```
|
||||
|
@ -325,7 +379,7 @@ sh run_export.sh [BATCH_SIZE] [USE_DEVICE_ID] [PRETRAINED_BACKBONE]
|
|||
#### Export MindIR
|
||||
|
||||
```shell
|
||||
python export.py --ckpt_file [CKPT_PATH] --file_name [FILE_NAME] --file_format [FILE_FORMAT]
|
||||
python export.py --ckpt_file [CKPT_PATH] --file_name [FILE_NAME] --file_format [FILE_FORMAT] --device_target [device_target]
|
||||
```
|
||||
|
||||
The ckpt_file parameter is required,
|
||||
|
@ -362,36 +416,36 @@ Inference result is saved in current path, you can find result like this in acc.
|
|||
|
||||
### Training Performance
|
||||
|
||||
| Parameters | Face Attribute |
|
||||
| -------------------------- | ----------------------------------------------------------- |
|
||||
| Model Version | V1 |
|
||||
| Resource | Ascend 910; CPU 2.60GHz, 192cores; Memory 755G; OS Euler2.8 |
|
||||
| uploaded Date | 09/30/2020 (month/day/year) |
|
||||
| MindSpore Version | 1.0.0 |
|
||||
| Dataset | 91K images |
|
||||
| Training Parameters | epoch=70, batch_size=128, momentum=0.9, lr=0.001 |
|
||||
| Optimizer | Momentum |
|
||||
| Loss Function | Softmax Cross Entropy |
|
||||
| outputs | probability |
|
||||
| Speed | 1pc: 200~250 ms/step; 8pcs: 100~150 ms/step |
|
||||
| Total time | 1pc: 2.5 hours; 8pcs: 0.3 hours |
|
||||
| Checkpoint for Fine tuning | 88M (.ckpt file) |
|
||||
| Parameters | Face Attribute | Face Attribute |
|
||||
| -------------------------- | ----------------------------------------------------------- | ----------------------------------------------------------- |
|
||||
| Model Version | V1 | V1 |
|
||||
| Resource | Ascend 910; CPU 2.60GHz, 192cores; Memory 755G; OS Euler2.8 | Tesla V100-PCIE-32G |
|
||||
| uploaded Date | 09/30/2020 (month/day/year) | 07/19/2021 (month/day/year) |
|
||||
| MindSpore Version | 1.0.0 | 1.3.0 |
|
||||
| Dataset | 91K images | 91K images |
|
||||
| Training Parameters | epoch=70, batch_size=128, momentum=0.9, lr=0.001 | epoch=70, batch_size=128, momentum=0.9, lr=0.001 |
|
||||
| Optimizer | Momentum | Momentum |
|
||||
| Loss Function | Softmax Cross Entropy | Softmax Cross Entropy |
|
||||
| outputs | probability | probability |
|
||||
| Speed | 1pc: 200~250 ms/step; 8pcs: 100~150 ms/step | 1pc: 115~125 ms/step; 8pcs: 150~200 ms/step |
|
||||
| Total time | 1pc: 2.5 hours; 8pcs: 0.3 hours | 1pc: 1.5 hours; 8pcs: 0.4 hours |
|
||||
| Checkpoint for Fine tuning | 88M (.ckpt file) | 88M (.ckpt file) |
|
||||
|
||||
### Evaluation Performance
|
||||
|
||||
| Parameters | Face Attribute |
|
||||
| ------------------- | --------------------------- |
|
||||
| Model Version | V1 |
|
||||
| Resource | Ascend 910; OS Euler2.8 |
|
||||
| Uploaded Date | 09/30/2020 (month/day/year) |
|
||||
| MindSpore Version | 1.0.0 |
|
||||
| Dataset | 11K images |
|
||||
| batch_size | 1 |
|
||||
| outputs | accuracy |
|
||||
| Accuracy(8pcs) | age:45.7% |
|
||||
| | gender:89.5% |
|
||||
| | mask:99.2% |
|
||||
| Model for inference | 88M (.ckpt file) |
|
||||
| Parameters | Face Attribute | Face Attribute |
|
||||
| ------------------- | --------------------------- | --------------------------- |
|
||||
| Model Version | V1 | V1 |
|
||||
| Resource | Ascend 910; OS Euler2.8 | Tesla V100-PCIE-32G |
|
||||
| Uploaded Date | 09/30/2020 (month/day/year) | 07/19/2021 (month/day/year) |
|
||||
| MindSpore Version | 1.0.0 | 1.3.0 |
|
||||
| Dataset | 11K images | 11K images |
|
||||
| batch_size | 1 | 1 |
|
||||
| outputs | accuracy | accuracy |
|
||||
| Accuracy(8pcs) | age:45.7% | age:49.0% |
|
||||
| | gender:89.5% | gender:90.8% |
|
||||
| | mask:99.2% | mask:99.3% |
|
||||
| Model for inference | 88M (.ckpt file) | 88M (.ckpt file) |
|
||||
|
||||
# [ModelZoo Homepage](#contents)
|
||||
|
||||
|
|
|
@ -33,6 +33,7 @@ from model_utils.device_adapter import get_device_id, get_device_num
|
|||
def softmax(x, axis=0):
|
||||
return np.exp(x) / np.sum(np.exp(x), axis=axis)
|
||||
|
||||
|
||||
def load_pretrain(checkpoint, network):
|
||||
'''load pretrain model.'''
|
||||
if os.path.isfile(checkpoint):
|
||||
|
@ -51,8 +52,10 @@ def load_pretrain(checkpoint, network):
|
|||
print('-----------------------load model failed-----------------------')
|
||||
return network
|
||||
|
||||
|
||||
def modelarts_pre_process():
|
||||
'''modelarts pre process function.'''
|
||||
|
||||
def unzip(zip_file, save_dir):
|
||||
import zipfile
|
||||
s_time = time.time()
|
||||
|
@ -106,7 +109,8 @@ def modelarts_pre_process():
|
|||
@moxing_wrapper(pre_process=modelarts_pre_process)
|
||||
def run_eval():
|
||||
'''run eval.'''
|
||||
context.set_context(mode=context.GRAPH_MODE, device_target="Ascend", save_graphs=False, device_id=get_device_id())
|
||||
context.set_context(mode=context.GRAPH_MODE, device_target=config.device_target, save_graphs=False,
|
||||
device_id=get_device_id())
|
||||
|
||||
network = get_resnet18(config)
|
||||
ckpt_path = config.model_path
|
||||
|
|
|
@ -33,7 +33,7 @@ def modelarts_pre_process():
|
|||
def run_export():
|
||||
'''run export.'''
|
||||
devid = 0
|
||||
context.set_context(mode=context.GRAPH_MODE, device_target="Ascend", save_graphs=False, device_id=devid)
|
||||
context.set_context(mode=context.GRAPH_MODE, device_target=config.device_target, save_graphs=False, device_id=devid)
|
||||
|
||||
network = get_resnet18(config)
|
||||
ckpt_path = config.ckpt_file
|
||||
|
|
|
@ -0,0 +1,56 @@
|
|||
#!/bin/bash
|
||||
# Copyright 2021 Huawei Technologies Co., Ltd
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
# ============================================================================
|
||||
|
||||
if [ $# != 3 ] && [ $# != 4 ]
|
||||
then
|
||||
echo "Usage: sh run_distribute_train_gpu.sh [DEVICE_NUM] [CUDA_VISIBLE_DEVICES(0,1,2,3,4,5,6,7)] [MINDRECORD_FILE] [PRETRAINED_BACKBONE]"
|
||||
echo " or: sh run_distribute_train_gpu.sh [DEVICE_NUM] [CUDA_VISIBLE_DEVICES(0,1,2,3,4,5,6,7)] [MINDRECORD_FILE]"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
current_exec_path=$(pwd)
|
||||
echo ${current_exec_path}
|
||||
|
||||
dirname_path=$(dirname "$(pwd)")
|
||||
echo ${dirname_path}
|
||||
|
||||
export PYTHONPATH=${dirname_path}:$PYTHONPATH
|
||||
export CUDA_VISIBLE_DEVICES=$2
|
||||
export RANK_SIZE=$1
|
||||
|
||||
SCRIPT_NAME='train.py'
|
||||
ulimit -c unlimited
|
||||
|
||||
echo 'start training'
|
||||
export RANK_ID=0
|
||||
rm -rf train_distribute_gpu
|
||||
mkdir train_distribute_gpu
|
||||
cd train_distribute_gpu
|
||||
|
||||
if [ $# == 3 ]
|
||||
then
|
||||
mpirun -n $1 --allow-run-as-root python ${dirname_path}/${SCRIPT_NAME} \
|
||||
--world_size=$1 \
|
||||
--device_target='GPU' \
|
||||
--mindrecord_path=$3 > train.log 2>&1 &
|
||||
else
|
||||
mpirun -n $1 --allow-run-as-root python ${dirname_path}/${SCRIPT_NAME} \
|
||||
--world_size=$1 \
|
||||
--device_target='GPU' \
|
||||
--mindrecord_path=$3 \
|
||||
--pretrained=$4 > train.log 2>&1 &
|
||||
fi
|
||||
echo 'running'
|
|
@ -0,0 +1,46 @@
|
|||
#!/bin/bash
|
||||
# Copyright 2021 Huawei Technologies Co., Ltd
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
# ============================================================================
|
||||
|
||||
if [ $# != 3 ]
|
||||
then
|
||||
echo "Usage: sh run_eval_gpu.sh [MINDRECORD_FILE] [CUDA_VISIBLE_DEVICES] [PRETRAINED_BACKBONE]"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
current_exec_path=$(pwd)
|
||||
echo ${current_exec_path}
|
||||
|
||||
dirname_path=$(dirname "$(pwd)")
|
||||
echo ${dirname_path}
|
||||
|
||||
export PYTHONPATH=${dirname_path}:$PYTHONPATH
|
||||
export CUDA_VISIBLE_DEVICES=$2
|
||||
export RANK_SIZE=1
|
||||
|
||||
SCRIPT_NAME='eval.py'
|
||||
|
||||
ulimit -c unlimited
|
||||
echo 'start evaluating'
|
||||
export RANK_ID=0
|
||||
rm -rf eval_gpu
|
||||
mkdir eval_gpu
|
||||
cd eval_gpu
|
||||
|
||||
python ${dirname_path}/${SCRIPT_NAME} \
|
||||
--mindrecord_path=$1 \
|
||||
--device_target="GPU" \
|
||||
--model_path=$3 > eval.log 2>&1 &
|
||||
echo 'running'
|
|
@ -0,0 +1,57 @@
|
|||
#!/bin/bash
|
||||
# Copyright 2021 Huawei Technologies Co., Ltd
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
# ============================================================================
|
||||
|
||||
if [ $# != 2 ] && [ $# != 3 ]
|
||||
then
|
||||
echo "Usage: sh run_standalone_train_gpu.sh [MINDRECORD_FILE] [CUDA_VISIBLE_DEVICES] [PRETRAINED_BACKBONE]"
|
||||
echo " or: sh run_standalone_train_gpu.sh [MINDRECORD_FILE] [CUDA_VISIBLE_DEVICES]"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
current_exec_path=$(pwd)
|
||||
echo ${current_exec_path}
|
||||
|
||||
dirname_path=$(dirname "$(pwd)")
|
||||
echo ${dirname_path}
|
||||
|
||||
export PYTHONPATH=${dirname_path}:$PYTHONPATH
|
||||
export RANK_SIZE=1
|
||||
export CUDA_VISIBLE_DEVICES=$2
|
||||
|
||||
SCRIPT_NAME='train.py'
|
||||
|
||||
ulimit -c unlimited
|
||||
|
||||
echo 'start training'
|
||||
export RANK_ID=0
|
||||
rm -rf train_alone_gpu
|
||||
mkdir train_alone_gpu
|
||||
cd train_alone_gpu
|
||||
|
||||
if [ $# == 2 ]
|
||||
then
|
||||
python ${dirname_path}/${SCRIPT_NAME} \
|
||||
--world_size=1 \
|
||||
--device_target='GPU' \
|
||||
--mindrecord_path=$1 > train.log 2>&1 &
|
||||
else
|
||||
python ${dirname_path}/${SCRIPT_NAME} \
|
||||
--world_size=1 \
|
||||
--device_target='GPU' \
|
||||
--mindrecord_path=$1 \
|
||||
--pretrained=$3 > train.log 2>&1 &
|
||||
fi
|
||||
echo 'running'
|
|
@ -16,7 +16,6 @@
|
|||
import os
|
||||
import time
|
||||
import datetime
|
||||
|
||||
import mindspore
|
||||
import mindspore.nn as nn
|
||||
from mindspore import context
|
||||
|
@ -29,13 +28,11 @@ from mindspore.train.callback import ModelCheckpoint, RunContext, CheckpointConf
|
|||
from mindspore.train.serialization import load_checkpoint, load_param_into_net
|
||||
from mindspore.ops import operations as P
|
||||
from mindspore.common import dtype as mstype
|
||||
|
||||
from src.FaceAttribute.resnet18 import get_resnet18
|
||||
from src.FaceAttribute.loss_factory import get_loss
|
||||
from src.dataset_train import data_generator
|
||||
from src.lrsche_factory import warmup_step
|
||||
from src.logging import get_logger, AverageMeter
|
||||
|
||||
from model_utils.config import config
|
||||
from model_utils.moxing_adapter import moxing_wrapper
|
||||
from model_utils.device_adapter import get_device_id, get_device_num
|
||||
|
@ -50,8 +47,10 @@ class InternalCallbackParam(dict):
|
|||
def __setattr__(self, _key, _value):
|
||||
self[_key] = _value
|
||||
|
||||
|
||||
class BuildTrainNetwork(nn.Cell):
|
||||
'''Build train network.'''
|
||||
|
||||
def __init__(self, my_network, my_criterion):
|
||||
super(BuildTrainNetwork, self).__init__()
|
||||
self.network = my_network
|
||||
|
@ -66,6 +65,7 @@ class BuildTrainNetwork(nn.Cell):
|
|||
|
||||
def modelarts_pre_process():
|
||||
'''modelarts pre process function.'''
|
||||
|
||||
def unzip(zip_file, save_dir):
|
||||
import zipfile
|
||||
s_time = time.time()
|
||||
|
@ -121,7 +121,8 @@ def modelarts_pre_process():
|
|||
@moxing_wrapper(pre_process=modelarts_pre_process)
|
||||
def run_train():
|
||||
'''run train.'''
|
||||
context.set_context(mode=context.GRAPH_MODE, device_target="Ascend", save_graphs=False, device_id=get_device_id())
|
||||
context.set_context(mode=context.GRAPH_MODE, device_target=config.device_target, save_graphs=False,
|
||||
device_id=get_device_id())
|
||||
mindspore.set_seed(1)
|
||||
|
||||
# init distributed
|
||||
|
@ -241,5 +242,6 @@ def run_train():
|
|||
|
||||
config.logger.info('--------- trains out ---------')
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
run_train()
|
||||
|
|
Loading…
Reference in New Issue