From aaa0436882627366b006ac642a1963c0218a615a Mon Sep 17 00:00:00 2001 From: hwjiaorui Date: Thu, 3 Sep 2020 21:37:15 +0800 Subject: [PATCH] add training scripts and modify readme of mobilenetv2_quant and resnet50_quant modify readme --- .../official/cv/mobilenetv2_quant/Readme.md | 17 +- .../cv/mobilenetv2_quant/scripts/run_train.sh | 300 ++++++++++++++++++ .../scripts/run_train_quant.sh | 96 ------ .../official/cv/resnet50_quant/Readme.md | 51 ++- .../cv/resnet50_quant/scripts/run_train.sh | 290 ++++++++++++++--- 5 files changed, 603 insertions(+), 151 deletions(-) create mode 100644 model_zoo/official/cv/mobilenetv2_quant/scripts/run_train.sh delete mode 100644 model_zoo/official/cv/mobilenetv2_quant/scripts/run_train_quant.sh diff --git a/model_zoo/official/cv/mobilenetv2_quant/Readme.md b/model_zoo/official/cv/mobilenetv2_quant/Readme.md index 354fd5703ff..0ec5b361f73 100644 --- a/model_zoo/official/cv/mobilenetv2_quant/Readme.md +++ b/model_zoo/official/cv/mobilenetv2_quant/Readme.md @@ -70,7 +70,7 @@ For FP16 operators, if the input data type is FP32, the backend of MindSpore wil ├── mobileNetv2_quant ├── Readme.md # descriptions about MobileNetV2-Quant ├── scripts - │ ├──run_train_quant.sh # shell script for train on Ascend + │ ├──run_train.sh # shell script for train on Ascend and GPU │ ├──run_infer_quant.sh # shell script for evaluation on Ascend ├── src │ ├──config.py # parameter configuration @@ -91,19 +91,22 @@ For FP16 operators, if the input data type is FP32, the backend of MindSpore wil You can start training using python or shell scripts. 
The usage of shell scripts as follows: -- Ascend: sh run_train_quant.sh Ascend [DEVICE_NUM] [SERVER_IP(x.x.x.x)] [VISIABLE_DEVICES(0,1,2,3,4,5,6,7)] [DATASET_PATH] [CKPT_PATH] +- bash run_train.sh [Ascend] [RANK_TABLE_FILE] [DATASET_PATH] [PRETRAINED_CKPT_PATH]\(optional) +- bash run_train.sh [GPU] [DEVICE_ID_LIST] [DATASET_PATH] [PRETRAINED_CKPT_PATH]\(optional) + ### Launch -``` -# training example - shell: - Ascend: sh run_train_quant.sh Ascend 8 10.222.223.224 0,1,2,3,4,5,6,7 ~/imagenet/train/ mobilenet_199.ckpt +``` bash + # training example + >>> bash run_train.sh Ascend ~/hccl_4p_0123_x.x.x.x.json ~/imagenet/train/ ~/mobilenet.ckpt + >>> bash run_train.sh GPU 1,2 ~/imagenet/train/ ~/mobilenet.ckpt ``` ### Result -Training result will be stored in the example path. Checkpoints will be stored at `. /checkpoint` by default, and training log will be redirected to `./train/train.log` like followings. +Training result will be stored in the example path. Checkpoints trained by `Ascend` will be stored at `./train/device$i/checkpoint` by default, and training log will be redirected to `./train/device$i/train.log`. Checkpoints trained by `GPU` will be stored in `./train/checkpointckpt_$i` by default, and training log will be redirected to `./train/train.log`. +`train.log` is as follows: ``` epoch: [ 0/200], step:[ 624/ 625], loss:[5.258/5.258], time:[140412.236], lr:[0.100] diff --git a/model_zoo/official/cv/mobilenetv2_quant/scripts/run_train.sh b/model_zoo/official/cv/mobilenetv2_quant/scripts/run_train.sh new file mode 100644 index 00000000000..36594b23dd7 --- /dev/null +++ b/model_zoo/official/cv/mobilenetv2_quant/scripts/run_train.sh @@ -0,0 +1,300 @@ +#!/bin/bash +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================
+
+# Print the absolute form of a path on stdout.
+#   $1 - path; printed unchanged when it already starts with "/",
+#        otherwise resolved against $PWD with `realpath -m`.
+get_real_path(){
+    if [ "${1:0:1}" == "/" ]; then
+        echo "$1"
+    else
+        # Quoted so that paths containing spaces or glob characters are
+        # passed to realpath as one argument instead of being word-split.
+        realpath -m -- "$PWD/$1"
+    fi
+}
+
+
+# check_and_get_Ascend_device(){
+
+#     #device_list=(${1//,/ })
+#     IFS=',' read -ra device_list <<<"$1"
+#     last_device_id=0
+#     first_device_id=8
+#     device_used=(0 0 0 0 0 0 0 0)
+
+#     for var in "${device_list[@]}"
+#     do
+
+#         if [ $((var)) -lt 0 ] || [ $((var)) -ge 8 ]
+#         then
+#             echo "error: device id=${var} is incorrect, device id must be in range [0,8), please check your device id list!"
+#             exit 1
+#         fi
+
+#         if [ ${device_used[$((var))]} -eq 0 ]
+#         then
+#             device_used[ $((var)) ]=1
+#         else
+#             echo "error: device id is duplicate, please check your device id list!"
+#             exit 1
+#         fi
+
+#         if [ ${last_device_id} \< $((var)) ]
+#         then
+#             last_device_id=$((var))
+#         fi
+#         if [ ${first_device_id} \> $((var)) ]
+#         then
+#             first_device_id=$((var))
+#         fi
+#     done
+
+#     device_num=`expr ${last_device_id} - ${first_device_id} + 1`
+#     if [ ${device_num} != ${#device_list[*]} ]
+#     then
+#         echo "error: the Ascend chips used must be continuous, please check your device id list!"
+#         exit 1
+#     fi
+
+#     if [ ${first_device_id} -lt 4 ] && [ ${last_device_id} -ge 4 ]
+#     then
+#         if [ ${first_device_id} != 0 ] || [ ${last_device_id} != 7 ]
+#         then
+#             echo "error: device id list must be in the same group of [0,4) or [4,8) when using Ascend chips."
+#             exit 1
+#         fi
+#     fi
+
+#     echo "${first_device_id},`expr ${last_device_id} + 1`"
+# }
+
+# get_hccl_name(){
+
+#     server_ip=$(ifconfig -a | grep inet | grep -v 127.0.0.1 | grep -v inet6 | awk '{print $2}' | tr -d "addr:")
+#     device_num=`expr $2 - $1`
+#     device_id_list=""
+
+#     for(( i=$1 ; i < $2 ; i++ ))
+#     do
+#         device_id_list=${device_id_list}$i
+#     done
+#     hccl_name="hccl_${device_num}p_${device_id_list}_${server_ip}.json"
+
+#     echo ${hccl_name}
+# }
+
+
+# Count the distinct GPU ids in a comma separated list.
+#   $1 - device id list such as "0,1,2"; every id must be an integer in [0,8)
+# Prints the number of distinct ids on stdout and returns 0.
+# On an invalid or empty list the message goes to stderr and the function
+# returns 1: callers run this inside $(...), where stdout is captured and
+# `exit` would only leave the subshell, so the status is the only way to
+# report the failure.
+get_gpu_device_num(){
+
+    local device_list device_used var device_num
+    #device_list=(${1//,/ })
+    IFS=',' read -ra device_list <<<"$1"
+    device_used=(0 0 0 0 0 0 0 0)
+    device_num=0
+    for var in "${device_list[@]}"
+    do
+        # Empty or non-numeric ids would silently evaluate to 0 inside $(( )),
+        # so reject them before doing arithmetic.
+        case "${var}" in
+            ''|*[!0-9]*)
+                echo "error: device id=${var} is incorrect, device id must be in range [0,8), please check your device id list!" >&2
+                return 1
+                ;;
+        esac
+
+        # 10# forces base 10 so that ids written as "08" are not read as octal.
+        if [ "$((10#${var}))" -ge 8 ]
+        then
+            echo "error: device id=${var} is incorrect, device id must be in range [0,8), please check your device id list!" >&2
+            return 1
+        fi
+        var=$((10#${var}))
+
+        # Duplicated ids are counted once.
+        if [ "${device_used[var]}" -eq 0 ]
+        then
+            device_used[var]=1
+            device_num=$((device_num+1))
+        fi
+    done
+
+    if [ "${device_num}" -eq 0 ]
+    then
+        echo "error: device id list is empty, please check your device id list!" >&2
+        return 1
+    fi
+
+    echo "${device_num}"
+}
+
+
+# Start one background train.py process per Ascend device.
+#   $1 - device target, forwarded to train.py as --device_target
+#   $2 - RANK_TABLE_FILE; the device ids are taken from the third
+#        "_"-separated field of its file name (hccl_4p_0123_x.x.x.x.json -> 0123)
+#   $3 - DATASET_PATH (must be a directory)
+#   $4 - PRETRAINED_CKPT_PATH (must be a file, optional)
+# Recreates ./train/device<i> below the current directory and copies ../*.py,
+# *.sh and ../src into it, so all relative paths are resolved against the
+# directory the script is started from.
+run_ascend(){
+
+    if [ $# -gt 4 ] || [ $# -lt 3 ]
+    then
+        echo "Usage: bash run_train.sh [Ascend] [RANK_TABLE_FILE] [DATASET_PATH] [PRETRAINED_CKPT_PATH](optional)\n "
+        exit 1
+    fi
+
+    #first_last_device=$(check_and_get_Ascend_device $2)
+    #devices=(${first_last_device//,/ })
+    #IFS=',' read -ra devices <<<"${first_last_device}"
+    # first_device=${first_last_device:0:1}
+    # last_device=${first_last_device:2:1}
+    # device_num=`expr $((last_device)) - $((first_device))`
+
+    #single ascend or multiple ascend
+    # if [ ${device_num} -gt 1 ]
+    # then
+    #     ori_path=$(dirname "$(readlink -f "$0" )")
+    #     #generate hccl config file
+    #     cd ../../../../utils/hccl_tools/ || exit
+    #     device_num_arg="[${first_device},${last_device})"
+
+    #     python hccl_tools.py --device_num=${device_num_arg}
+
+    #     hccl_name=$(get_hccl_name ${first_device} ${last_device})
+
+    #     if [ ! -e ${hccl_name} ]
+    #     then
+    #         echo "error: failed to generate the hccl config file!"
+    #         exit 1
+    #     fi
+
+    #     mv ${hccl_name} ${ori_path}
+    #     cd ${ori_path} || exit
+
+    #     PATH1=$(get_real_path ${hccl_name})
+
+    #     if [ ! -f $PATH1 ]
+    #     then
+    #         echo "error: RANK_TABLE_FILE=$PATH1 is not a file"
+    #         exit 1
+    #     fi
+
+    #     export RANK_TABLE_FILE=$PATH1
+    # fi
+
+    PATH1=$(get_real_path "$2")
+    PATH2=$(get_real_path "$3")
+
+    if [ $# -eq 4 ]
+    then
+        PATH3=$(get_real_path "$4")
+    fi
+
+    if [ ! -f "$PATH1" ]
+    then
+        echo "error: RANK_TABLE_FILE=$PATH1 is not a file"
+        exit 1
+    fi
+
+    if [ ! -d "$PATH2" ]
+    then
+        echo "error: DATASET_PATH=$PATH2 is not a directory"
+        exit 1
+    fi
+
+    if [ $# -eq 4 ] && [ ! -f "$PATH3" ]
+    then
+        echo "error: PRETRAINED_CKPT_PATH=$PATH3 is not a file"
+        exit 1
+    fi
+
+
+    rank_file_name=${2##*/}
+    IFS='_' read -ra array <<<"${rank_file_name}"
+    device_id_list=${array[2]}
+
+    # Without this check a file name that does not follow the pattern yields
+    # an empty or non-numeric id list, and the loop below would start zero
+    # processes or a meaningless number of them.
+    case "${device_id_list}" in
+        ''|*[!0-9]*)
+            echo "error: cannot get the device id list from the RANK_TABLE_FILE name ${rank_file_name}, expected a name like hccl_4p_0123_x.x.x.x.json"
+            exit 1
+            ;;
+    esac
+
+    # The ids are read as single digits and DEVICE_ID is first_device+i, i.e.
+    # the list is treated as a contiguous run starting at its first digit.
+    first_device=${device_id_list:0:1}
+    #last_device=${device_list:${#device_list}-1:1}
+    device_num=${#device_id_list}
+
+    ulimit -u unlimited
+    export DEVICE_NUM=${device_num}
+    export RANK_SIZE=${device_num}
+    export RANK_TABLE_FILE=$PATH1
+
+    export SERVER_ID=0
+    rank_start=$((DEVICE_NUM * SERVER_ID))
+
+    # Build the train.py arguments once; --pre_trained is only added when a
+    # checkpoint was given.
+    train_args=(--device_target="$1" --dataset_path="$PATH2")
+    if [ $# -eq 4 ]
+    then
+        train_args+=(--pre_trained="$PATH3")
+    fi
+
+    rm -rf ./train
+    mkdir ./train
+    for((i=0; i<device_num; i++))
+    do
+        export DEVICE_ID=$((first_device+i))
+        export RANK_ID=$((rank_start + i))
+        mkdir "./train/device$i"
+        cp ../*.py "./train/device$i"
+        cp ./*.sh "./train/device$i"
+        cp -r ../src "./train/device$i"
+        cd "./train/device$i" || exit
+        echo "start training for rank $RANK_ID, device $DEVICE_ID"
+        env > env.log
+        python train.py "${train_args[@]}" &> train.log &
+
+        cd ../.. || exit
+    done
+}
+
+# Start distributed training on GPU through mpirun.
+#   $1 - device target, forwarded to train.py as --device_target
+#   $2 - DEVICE_ID_LIST, comma separated, exported as CUDA_VISIBLE_DEVICES
+#   $3 - DATASET_PATH (must be a directory)
+#   $4 - PRETRAINED_CKPT_PATH (must be a file, optional)
+# Recreates ./train below the current directory and runs train.py from there
+# in the background with one mpirun rank per distinct device id.
+run_gpu(){
+    # "$@" always contains the device target, so a valid call has 3 or 4
+    # arguments.
+    if [ $# -gt 4 ] || [ $# -lt 3 ]
+    then
+        echo "Usage: bash run_train.sh [GPU] [DEVICE_ID_LIST] [DATASET_PATH] [PRETRAINED_CKPT_PATH](optional)\n "
+        exit 1
+    fi
+
+    PATH1=$(get_real_path "$3")
+
+    if [ $# -eq 4 ]
+    then
+        PATH2=$(get_real_path "$4")
+    fi
+
+    if [ ! -d "$PATH1" ]
+    then
+        echo "error: DATASET_PATH=$PATH1 is not a directory"
+        exit 1
+    fi
+
+    if [ $# -eq 4 ] && [ ! -f "$PATH2" ]
+    then
+        echo "error: PRETRAINED_CKPT_PATH=$PATH2 is not a file"
+        exit 1
+    fi
+
+    # get_gpu_device_num reports a bad list through its exit status.
+    device_num=$(get_gpu_device_num "$2") || exit 1
+
+    ulimit -u unlimited
+    export DEVICE_NUM=${device_num}
+    export RANK_SIZE=${device_num}
+    export CUDA_VISIBLE_DEVICES=$2
+
+    train_args=(--device_target="$1" --dataset_path="$PATH1")
+    if [ $# -eq 4 ]
+    then
+        train_args+=(--pre_trained="$PATH2")
+    fi
+
+    rm -rf ./train
+    mkdir ./train
+    cp ../*.py ./train
+    cp ./*.sh ./train
+    cp -r ../src ./train
+    cd ./train || exit
+    echo "start training"
+    env > env.log
+    mpirun --allow-run-as-root -n "${RANK_SIZE}" \
+        python train.py "${train_args[@]}" &> train.log &
+
+    cd .. || exit
+}
+
+
+# Dispatch on the device target given as first argument.
+if [ "$1" = "Ascend" ] ; then
+    run_ascend "$@"
+elif [ "$1" = "GPU" ] ; then
+    run_gpu "$@"
+else
+    echo "Unsupported device target: $1"
+    exit 1
+fi;
diff --git a/model_zoo/official/cv/mobilenetv2_quant/scripts/run_train_quant.sh b/model_zoo/official/cv/mobilenetv2_quant/scripts/run_train_quant.sh
deleted file mode 100644
index 1b2d42d265c..00000000000
--- a/model_zoo/official/cv/mobilenetv2_quant/scripts/run_train_quant.sh
+++ /dev/null
@@ -1,96 +0,0 @@
-#!/usr/bin/env bash
-# Copyright 2020 Huawei Technologies Co., Ltd
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================ - -run_ascend() -{ - if [ $2 -lt 1 ] && [ $2 -gt 8 ] - then - echo "error: DEVICE_NUM=$2 is not in (1-9)" - exit 1 - fi - - if [ ! -d $5 ] && [ ! -f $5 ] - then - echo "error: DATASET_PATH=$5 is not a directory or file" - exit 1 - fi - - BASEPATH=$(cd "`dirname $0`" || exit; pwd) - export PYTHONPATH=${BASEPATH}:$PYTHONPATH - if [ -d "../train" ]; - then - rm -rf ../train - fi - mkdir ../train - cd ../train || exit - python ${BASEPATH}/../src/launch.py \ - --nproc_per_node=$2 \ - --visible_devices=$4 \ - --server_id=$3 \ - --training_script=${BASEPATH}/../train.py \ - --dataset_path=$5 \ - --pre_trained=$6 \ - --device_target=$1 &> train.log & # dataset train folder -} - -run_gpu() -{ - if [ $2 -lt 1 ] && [ $2 -gt 8 ] - then - echo "error: DEVICE_NUM=$2 is not in (1-8)" - exit 1 - fi - - if [ ! 
-d $4 ] - then - echo "error: DATASET_PATH=$4 is not a directory" - exit 1 - fi - - BASEPATH=$(cd "`dirname $0`" || exit; pwd) - export PYTHONPATH=${BASEPATH}:$PYTHONPATH - if [ -d "../train" ]; - then - rm -rf ../train - fi - mkdir ../train - cd ../train || exit - - export CUDA_VISIBLE_DEVICES="$3" - mpirun -n $2 --allow-run-as-root \ - python ${BASEPATH}/../train.py \ - --dataset_path=$4 \ - --device_target=$1 \ - --pre_trained=$5 &> ../train.log & # dataset train folder -} - -if [ $# -gt 6 ] || [ $# -lt 5 ] -then - echo "Usage:\n \ - Ascend: sh run_train_quant.sh Ascend [DEVICE_NUM] [SERVER_IP(x.x.x.x)] [VISIABLE_DEVICES(0,1,2,3,4,5,6,7)] [DATASET_PATH] [CKPT_PATH]\n \ - GPU: sh run_train_quant.sh GPU [DEVICE_NUM] [VISIABLE_DEVICES(0,1,2,3,4,5,6,7)] [DATASET_PATH] [CKPT_PATH]\n \ - " -exit 1 -fi - -if [ $1 = "Ascend" ] ; then - run_ascend "$@" -elif [ $1 = "GPU" ] ; then - run_gpu "$@" -else - echo "Unsupported device target." -fi; - diff --git a/model_zoo/official/cv/resnet50_quant/Readme.md b/model_zoo/official/cv/resnet50_quant/Readme.md index 82c94f7de7d..9f242d8e700 100644 --- a/model_zoo/official/cv/resnet50_quant/Readme.md +++ b/model_zoo/official/cv/resnet50_quant/Readme.md @@ -1,4 +1,43 @@ # Contents +# ResNet-50_quant Example + +## Description + +This is an example of training ResNet-50_quant with ImageNet2012 dataset in MindSpore. + +## Requirements + +- Install [MindSpore](https://www.mindspore.cn/install/en). + +- Download the dataset ImageNet2012 + +> Unzip the ImageNet2012 dataset to any path you want and the folder structure should include train and eval dataset as follows: +> ``` +> . 
+> ├── ilsvrc # train dataset +> └── ilsvrc_eval # infer dataset: images must first be sorted into 1000 class directories, just like the training images +> ``` + + +## Example structure + +```shell +resnet50_quant/ + ├── eval.py + ├── models + │   └── resnet_quant.py + ├── Readme.md + ├── scripts + │   ├── run_infer.sh + │   └── run_train.sh + ├── src + │   ├── config.py + │   ├── crossentropy.py + │   ├── dataset.py + │   ├── launch.py + │   └── lr_generator.py + └── train.py +``` - [resnet50 Description](#resnet50-description) - [Model Architecture](#model-architecture) @@ -88,21 +127,17 @@ For FP16 operators, if the input data type is FP32, the backend of MindSpore wil ### Usage - -You can start training using python or shell scripts. The usage of shell scripts as follows: - -- Ascend: sh run_train.sh Ascend [DEVICE_NUM] [SERVER_IP(x.x.x.x)] [VISIABLE_DEVICES(0,1,2,3,4,5,6,7)] [DATASET_PATH][CKPT_PATH] +- Ascend: bash run_train.sh Ascend [RANK_TABLE_FILE] [DATASET_PATH] [PRETRAINED_CKPT_PATH]\(optional) ### Launch ``` -# training example - shell: - Ascend: sh run_train.sh Ascend 8 10.222.223.224 0,1,2,3,4,5,6,7 ~/resnet/train/ Resnet50-90_5004.ckpt + # training example + Ascend: bash run_train.sh Ascend ~/hccl_4p_0123_x.x.x.x.json ~/imagenet/train/ ``` ### Result -Training result will be stored in the example path. Checkpoints will be stored at `. /checkpoint` by default, and training log will be redirected to `./train/train.log` like followings. +Training results will be stored in the example path. Checkpoints will be stored at `./train/device$i/` by default, and the training log will be redirected to `./train/device$i/train.log`, as follows.
``` epoch: 1 step: 5004, loss is 4.8995576 diff --git a/model_zoo/official/cv/resnet50_quant/scripts/run_train.sh b/model_zoo/official/cv/resnet50_quant/scripts/run_train.sh index a4272015879..5717cc3bc49 100644 --- a/model_zoo/official/cv/resnet50_quant/scripts/run_train.sh +++ b/model_zoo/official/cv/resnet50_quant/scripts/run_train.sh @@ -1,4 +1,4 @@ -#!/usr/bin/env bash +#!/bin/bash # Copyright 2020 Huawei Technologies Co., Ltd # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -14,49 +14,259 @@ # limitations under the License. # ============================================================================ -run_ascend() -{ - if [ $2 -lt 1 ] && [ $2 -gt 8 ] - then - echo "error: DEVICE_NUM=$2 is not in (1-8)" - exit 1 +get_real_path(){ + if [ "${1:0:1}" == "/" ]; then + echo "$1" + else + echo "$(realpath -m $PWD/$1)" fi - - if [ ! -d $5 ] && [ ! -f $5 ] - then - echo "error: DATASET_PATH=$5 is not a directory or file" - exit 1 - fi - - BASEPATH=$(cd "`dirname $0`" || exit; pwd) - export PYTHONPATH=${BASEPATH}:$PYTHONPATH - if [ -d "../train" ]; - then - rm -rf ../train - fi - mkdir ../train - cd ../train || exit - python ${BASEPATH}/../src/launch.py \ - --nproc_per_node=$2 \ - --visible_devices=$4 \ - --server_id=$3 \ - --training_script=${BASEPATH}/../train.py \ - --dataset_path=$5 \ - --pre_trained=$6 \ - --device_target=$1 &> train.log & # dataset train folder } -if [ $# -gt 6 ] || [ $# -lt 4 ] -then - echo "Usage:\n \ - Ascend: sh run_train.sh Ascend [DEVICE_NUM] [SERVER_IP(x.x.x.x)] [VISIABLE_DEVICES(0,1,2,3,4,5,6,7)] [DATASET_PATH] [CKPT_PATH]\n \ - " -exit 1 -fi +# check_and_get_Ascend_device(){ + +# #device_list=(${1//,/ }) +# IFS=',' read -ra device_list <<<"$1" +# last_device_id=0 +# first_device_id=8 +# device_used=(0 0 0 0 0 0 0 0) + +# for var in "${device_list[@]}" +# do + +# if [ $((var)) -lt 0 ] || [ $((var)) -ge 8 ] +# then +# echo "error: device id=${var} is incorrect, device id must be in range [0,8), please check your 
device id list!"
+#             exit 1
+#         fi
+
+#         if [ ${device_used[$((var))]} -eq 0 ]
+#         then
+#             device_used[ $((var)) ]=1
+#         else
+#             echo "error: device id is duplicate, please check your device id list!"
+#             exit 1
+#         fi
+
+#         if [ ${last_device_id} \< $((var)) ]
+#         then
+#             last_device_id=$((var))
+#         fi
+#         if [ ${first_device_id} \> $((var)) ]
+#         then
+#             first_device_id=$((var))
+#         fi
+#     done
+
+#     device_num=`expr ${last_device_id} - ${first_device_id} + 1`
+#     if [ ${device_num} != ${#device_list[*]} ]
+#     then
+#         echo "error: the Ascend chips used must be continuous, please check your device id list!"
+#         exit 1
+#     fi
+
+#     if [ ${first_device_id} -lt 4 ] && [ ${last_device_id} -ge 4 ]
+#     then
+#         if [ ${first_device_id} != 0 ] || [ ${last_device_id} != 7 ]
+#         then
+#             echo "error: device id list must be in the same group of [0,4) or [4,8) when using Ascend chips."
+#             exit 1
+#         fi
+#     fi
+
+#     echo "${first_device_id},`expr ${last_device_id} + 1`"
+# }
+
+# get_hccl_name(){
+
+#     server_ip=$(ifconfig -a | grep inet | grep -v 127.0.0.1 | grep -v inet6 | awk '{print $2}' | tr -d "addr:")
+#     device_num=`expr $2 - $1`
+#     device_id_list=""
+
+#     for(( i=$1 ; i < $2 ; i++ ))
+#     do
+#         device_id_list=${device_id_list}$i
+#     done
+#     hccl_name="hccl_${device_num}p_${device_id_list}_${server_ip}.json"
+
+#     echo ${hccl_name}
+# }
+
+
+# Start one background train.py process per Ascend device.
+#   $1 - device target, forwarded to train.py as --device_target
+#   $2 - RANK_TABLE_FILE; the device ids are taken from the third
+#        "_"-separated field of its file name (hccl_4p_0123_x.x.x.x.json -> 0123)
+#   $3 - DATASET_PATH (must be a directory)
+#   $4 - PRETRAINED_CKPT_PATH (must be a file, optional)
+# Recreates ./train/device<i> below the current directory and copies ../*.py,
+# *.sh, ../src and ../models into it, so all relative paths are resolved
+# against the directory the script is started from.
+run_ascend(){
+
+    if [ $# != 3 ] && [ $# != 4 ]
+    then
+        echo "Usage: bash run_train.sh [Ascend] [RANK_TABLE_FILE] [DATASET_PATH] [PRETRAINED_CKPT_PATH](optional)\n"
+        exit 1
+    fi
+
+    # first_last_device=$(check_and_get_Ascend_device $2)
+    # #devices=(${first_last_device//,/ })
+    # #IFS=',' read -ra devices <<<"${first_last_device}"
+    # first_device=${first_last_device:0:1}
+    # last_device=${first_last_device:2:1}
+    # device_num=`expr $((last_device)) - $((first_device))`
+
+
+    # #single ascend or multiple ascend
+    # if [ ${device_num} -gt 1 ]
+    # then
+    #     ori_path=$(dirname "$(readlink -f "$0")")
+    #     #generate hccl config file
+    #     cd ../../../../utils/hccl_tools/ || exit
+    #     device_num_arg="[${first_device},${last_device})"
+
+    #     python hccl_tools.py --device_num=${device_num_arg}
+
+    #     hccl_name=$(get_hccl_name ${first_device} ${last_device})
+
+    #     if [ ! -e ${hccl_name} ]
+    #     then
+    #         echo "error: failed to generate the hccl config file!"
+    #         exit 1
+    #     fi
+
+    #     mv ${hccl_name} ${ori_path}
+    #     cd ${ori_path} || exit
+
+    #     PATH1=$(get_real_path ${hccl_name})
+
+    #     if [ ! -f $PATH1 ]
+    #     then
+    #         echo "error: RANK_TABLE_FILE=$PATH1 is not a file"
+    #         exit 1
+    #     fi
+
+    #     export RANK_TABLE_FILE=$PATH1
+    # fi
+
+
+    PATH1=$(get_real_path "$2")
+    PATH2=$(get_real_path "$3")
+
+    if [ $# -eq 4 ]
+    then
+        PATH3=$(get_real_path "$4")
+    fi
+
+    if [ ! -f "$PATH1" ]
+    then
+        echo "error: RANK_TABLE_FILE=$PATH1 is not a file"
+        exit 1
+    fi
+
+    if [ ! -d "$PATH2" ]
+    then
+        echo "error: DATASET_PATH=$PATH2 is not a directory"
+        exit 1
+    fi
+
+    if [ $# -eq 4 ] && [ ! -f "$PATH3" ]
+    then
+        echo "error: PRETRAINED_CKPT_PATH=$PATH3 is not a file"
+        exit 1
+    fi
+
+    rank_file_name=${2##*/}
+    IFS='_' read -ra array <<<"${rank_file_name}"
+    device_id_list=${array[2]}
+
+    # Without this check a file name that does not follow the pattern yields
+    # an empty or non-numeric id list, and the loop below would start zero
+    # processes or a meaningless number of them.
+    case "${device_id_list}" in
+        ''|*[!0-9]*)
+            echo "error: cannot get the device id list from the RANK_TABLE_FILE name ${rank_file_name}, expected a name like hccl_4p_0123_x.x.x.x.json"
+            exit 1
+            ;;
+    esac
+
+    # The ids are read as single digits and DEVICE_ID is first_device+i, i.e.
+    # the list is treated as a contiguous run starting at its first digit.
+    first_device=${device_id_list:0:1}
+    device_num=${#device_id_list}
+
+
+    ulimit -u unlimited
+    export DEVICE_NUM=${device_num}
+    export RANK_SIZE=${device_num}
+    export RANK_TABLE_FILE=$PATH1
+
+    export SERVER_ID=0
+    rank_start=$((DEVICE_NUM * SERVER_ID))
+
+    # Build the train.py arguments once; --pre_trained is only added when a
+    # checkpoint was given.
+    train_args=(--device_target="$1" --run_distribute=True --device_num="$DEVICE_NUM" --dataset_path="$PATH2")
+    if [ $# -eq 4 ]
+    then
+        train_args+=(--pre_trained="$PATH3")
+    fi
+
+    rm -rf ./train
+    mkdir ./train
+    for((i=0; i<device_num; i++))
+    do
+        export DEVICE_ID=$((first_device+i))
+        export RANK_ID=$((rank_start + i))
+        mkdir "./train/device$i"
+        cp ../*.py "./train/device$i"
+        cp ./*.sh "./train/device$i"
+        cp -r ../src "./train/device$i"
+        cp -r ../models "./train/device$i"
+        cd "./train/device$i" || exit
+        echo "start training for rank $RANK_ID, device $DEVICE_ID"
+        env > env.log
+        python train.py "${train_args[@]}" &> train.log &
+
+        cd ../.. || exit
+    done
+}
+
+# run_gpu(){
+
+#     if [ $# -gt 3 ] || [ $# -lt 2 ]
+#     then
+#         echo "Usage: sh run_train_distribute_quant.sh [GPU] [DATASET_PATH] [PRETRAINED_CKPT_PATH](optional)\n "
+#         exit 1
+#     fi
+
+#     PATH1=$(get_real_path $2)
+
+#     if [ $# == 3 ]
+#     then
+#         PATH2=$(get_real_path $3)
+#     fi
+
+#     if [ ! -d $PATH1 ]
+#     then
+#         echo "error: DATASET_PATH=$PATH1 is not a directory"
+#         exit 1
+#     fi
+
+#     if [ $# == 3 ] && [ ! -f $PATH2 ]
+#     then
+#         echo "error: PRETRAINED_CKPT_PATH=$PATH2 is not a file"
+#         exit 1
+#     fi
+
+#     ulimit -u unlimited
+#     export RANK_SIZE=2
+#     #export CUDA_VISIBLE_DEVICES=1,2
+
+#     rm -rf ./train_parallel
+#     mkdir ./train_parallel
+#     cp ../*.py ./train_parallel
+#     cp *.sh ./train_parallel
+#     cp -r ../src ./train_parallel
+#     cp -r ../models ./train_parallel
+#     cd ./train_parallel || exit
+#     echo "start training"
+#     env > env.log
+#     if [ $# == 2 ]
+#     then
+#         mpirun --allow-run-as-root -n $RANK_SIZE
+#         python train.py --device_target=$1 --dataset_path=$PATH1 &> log &
+#     fi
+
+#     if [ $# == 3 ]
+#     then
+#         mpirun --allow-run-as-root -n $RANK_SIZE
+#         python train.py --device_target=$1 --dataset_path=$PATH1 --pre_trained=$PATH2 &> log &
+#     fi
+#     cd ..
+# }
+
 if [ $1 = "Ascend" ] ; then
     run_ascend "$@"
 else
-    echo "not support platform"
-fi;
-
+    echo "Unsupported device target: $1"
+    exit 1
+fi;
\ No newline at end of file