!5127 add training script

Merge pull request !5127 from hwjiaorui/master
2020-09-04 15:21:30 +08:00 · 2020-09-04 15:21:30 +08:00 · e5686df05a
parent 349356d8d7 aaa0436882
commit e5686df05a
5 changed files with 603 additions and 151 deletions
--- a/model_zoo/official/cv/mobilenetv2_quant/Readme.md
+++ b/model_zoo/official/cv/mobilenetv2_quant/Readme.md
@ -70,7 +70,7 @@ For FP16 operators, if the input data type is FP32, the backend of MindSpore wil
 ├── mobileNetv2_quant
  ├── Readme.md     # descriptions about MobileNetV2-Quant
  ├── scripts
-  │   ├──run_train_quant.sh   # shell script for train on Ascend
+  │   ├──run_train.sh   # shell script for train on Ascend and GPU
  │   ├──run_infer_quant.sh    # shell script for evaluation on Ascend
  ├── src
  │   ├──config.py      # parameter configuration
@ -91,19 +91,22 @@ For FP16 operators, if the input data type is FP32, the backend of MindSpore wil

 You can start training using python or shell scripts. The usage of shell scripts as follows:

- Ascend: sh run_train_quant.sh Ascend [DEVICE_NUM] [SERVER_IP(x.x.x.x)] [VISIABLE_DEVICES(0,1,2,3,4,5,6,7)] [DATASET_PATH] [CKPT_PATH]
+- bash run_train.sh [Ascend] [RANK_TABLE_FILE] [DATASET_PATH] [PRETRAINED_CKPT_PATH]\(optional)
+- bash run_train.sh [GPU] [DEVICE_ID_LIST] [DATASET_PATH] [PRETRAINED_CKPT_PATH]\(optional)
+

 ### Launch

-```
-# training example
-  shell:
-      Ascend: sh run_train_quant.sh Ascend 8 10.222.223.224 0,1,2,3,4,5,6,7 ~/imagenet/train/ mobilenet_199.ckpt
+``` bash
+  # training example
+  >>> bash run_train.sh Ascend ~/hccl_4p_0123_x.x.x.x.json ~/imagenet/train/ ~/mobilenet.ckpt
+  >>> bash run_train.sh GPU 1,2 ~/imagenet/train/ ~/mobilenet.ckpt
 ```

 ### Result

-Training result will be stored in the example path. Checkpoints will be stored at `. /checkpoint` by default, and training log  will be redirected to `./train/train.log` like followings.
+Training result will be stored in the example path. Checkpoints trained by `Ascend` will be stored at `./train/device$i/checkpoint` by default, and training log  will be redirected to `./train/device$i/train.log`. Checkpoints trained by `GPU` will be stored in `./train/checkpointckpt_$i` by default, and training log will be redirected to `./train/train.log`.  
+`train.log` is as follows:

 ```
 epoch: [  0/200], step:[  624/  625], loss:[5.258/5.258], time:[140412.236], lr:[0.100]
--- a/model_zoo/official/cv/mobilenetv2_quant/scripts/run_train.sh
+++ b/model_zoo/official/cv/mobilenetv2_quant/scripts/run_train.sh
@ -0,0 +1,300 @@
+#!/bin/bash
+# Copyright 2020 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================
+
+get_real_path(){
+    if [ "${1:0:1}" == "/" ]; then
+        echo "$1"
+    else
+        echo "$(realpath -m $PWD/$1)"
+    fi
+}
+
+
+# check_and_get_Ascend_device(){
+
+#     #device_list=(${1//,/ })
+#     IFS=',' read -ra device_list <<<"$1"
+#     last_device_id=0
+#     first_device_id=8
+#     device_used=(0 0 0 0 0 0 0 0)
+
+#     for var in "${device_list[@]}"
+#     do  
+        
+#         if [ $((var)) -lt 0 ] || [ $((var)) -ge 8 ]
+#         then 
+#             echo "error: device id=${var} is incorrect, device id must be in range [0,8), please check your device id list!"
+#             exit 1
+#         fi
+
+#         if [  ${device_used[$((var))]} -eq 0 ]
+#         then 
+#             device_used[ $((var)) ]=1
+#         else
+#             echo "error: device id is duplicate, please check your device id list!"
+#             exit 1
+#         fi
+
+#         if [ ${last_device_id} \< $((var)) ]
+#         then 
+#             last_device_id=$((var))
+#         fi
+#         if [ ${first_device_id} \> $((var)) ]
+#         then
+#             first_device_id=$((var))
+#         fi
+#     done
+
+#     device_num=`expr ${last_device_id} - ${first_device_id} + 1`
+#     if [ ${device_num} != ${#device_list[*]} ]
+#     then 
+#         echo "error: the Ascend chips used must be continuous, please check your device id list!"
+#         exit 1
+#     fi
+
+#     if [ ${first_device_id} -lt 4 ] && [ ${last_device_id} -ge 4 ]
+#     then
+#         if [ ${first_device_id} != 0 ] || [ ${last_device_id} != 7 ]
+#         then
+#             echo "error: device id list must be in the same group of [0,4) or [4,8) when using Ascend chips."
+#             exit 1
+#         fi
+#     fi
+
+#     echo "${first_device_id},`expr ${last_device_id} + 1`"
+# }
+
+# get_hccl_name(){
+
+#     server_ip=$(ifconfig -a | grep inet | grep -v 127.0.0.1 | grep -v inet6 | awk '{print $2}' | tr -d "addr:")
+#     device_num=`expr $2 - $1`
+#     device_id_list=""
+
+#     for(( i=$1 ; i < $2 ; i++ ))
+#     do 
+#         device_id_list=${device_id_list}$i
+#     done
+#     hccl_name="hccl_${device_num}p_${device_id_list}_${server_ip}.json"
+
+#     echo ${hccl_name}
+# }
+
+
+get_gpu_device_num(){
+
+    #device_list=(${1//,/ })
+    IFS=',' read -ra device_list <<<"$1"
+    device_used=(0 0 0 0 0 0 0 0)
+    device_num=0
+    for var in "${device_list[@]}"
+    do  
+        if [ $((var)) -lt 0 ] || [ $((var)) -ge 8 ]
+        then 
+            echo "error: device id=${var} is incorrect, device id must be in range [0,8), please check your device id list!"
+            exit 1
+        fi
+
+        if [  ${device_used[$((var))]} -eq 0 ]
+        then 
+            device_used[ $((var)) ]=1
+            device_num=$((device_num+1))
+        fi
+    done
+
+    echo ${device_num}
+}
+
+
+run_ascend(){
+
+    if [ $# -gt 4 ] || [ $# -lt 3 ]
+    then
+        echo "Usage:  bash run_train.sh [Ascend] [RANK_TABLE_FILE] [DATASET_PATH] [PRETRAINED_CKPT_PATH](optional)\n "
+        exit 1
+    fi
+
+    #first_last_device=$(check_and_get_Ascend_device $2)
+    #devices=(${first_last_device//,/ })
+    #IFS=',' read -ra devices <<<"${first_last_device}"
+    # first_device=${first_last_device:0:1}
+    # last_device=${first_last_device:2:1}
+    # device_num=`expr $((last_device)) - $((first_device))`
+    
+    #single ascend or multiple ascend 
+    # if [ ${device_num} -gt 1 ]
+    # then
+    #     ori_path=$(dirname "$(readlink -f "$0" )")
+    #     #generate hccl config file
+    #     cd ../../../../utils/hccl_tools/ || exit
+    #     device_num_arg="[${first_device},${last_device})" 
+
+    #     python hccl_tools.py --device_num=${device_num_arg}
+
+    #     hccl_name=$(get_hccl_name ${first_device} ${last_device})
+
+    #     if [ ! -e ${hccl_name} ]
+    #     then
+    #         echo "error: failed to generate the hccl config file!"
+    #         exit 1
+    #     fi
+
+    #     mv ${hccl_name} ${ori_path}
+    #     cd ${ori_path} || exit
+
+    #     PATH1=$(get_real_path ${hccl_name})
+
+    #     if [ ! -f $PATH1 ]
+    #     then 
+    #         echo "error: RANK_TABLE_FILE=$PATH1 is not a file"
+    #         exit 1
+    #     fi 
+
+    #     export RANK_TABLE_FILE=$PATH1
+    # fi
+
+    PATH1=$(get_real_path $2)
+    PATH2=$(get_real_path $3)
+
+    if [ $# == 4 ]
+    then 
+        PATH3=$(get_real_path $4)
+    fi
+
+    if [ ! -f $PATH1 ]
+    then 
+        echo "error: RANK_TABLE_FILE=$PATH1 is not a file"
+        exit 1
+    fi 
+
+    if [ ! -d $PATH2 ]
+    then 
+        echo "error: DATASET_PATH=$PATH2 is not a directory"
+    exit 1
+    fi 
+
+    if [ $# == 4 ] && [ ! -f $PATH3 ]
+    then
+        echo "error: PRETRAINED_CKPT_PATH=$PATH3 is not a file"
+        exit 1
+    fi
+
+
+    rank_file_name=${2##*/}
+    IFS='_' read -ra array <<<"${rank_file_name}"
+    device_id_list=${array[2]}
+    first_device=${device_id_list:0:1}
+    #last_device=${device_list:${#device_list}-1:1}
+    device_num=${#device_id_list}
+    
+    ulimit -u unlimited
+    export DEVICE_NUM=${device_num}
+    export RANK_SIZE=${device_num}
+    export RANK_TABLE_FILE=$PATH1
+
+    export SERVER_ID=0
+    rank_start=$((DEVICE_NUM * SERVER_ID))
+
+    rm -rf ./train
+    mkdir ./train
+    for((i=0; i<${device_num}; i++))
+    do
+        export DEVICE_ID=$((first_device+i))
+        export RANK_ID=$((rank_start + i))
+        mkdir ./train/device$i
+        cp ../*.py ./train/device$i
+        cp *.sh ./train/device$i
+        cp -r ../src ./train/device$i
+        cd ./train/device$i || exit
+        echo "start training for rank $RANK_ID, device $DEVICE_ID"
+        env > env.log
+        if [ $# == 3 ]
+        then	    
+            python train.py --device_target=$1  --dataset_path=$PATH2 &> train.log &
+        fi
+        
+        if [ $# == 4 ]
+        then
+            python train.py --device_traget=$1  --dataset_path=$PATH2 --pre_trained=$PATH3 &> train.log &
+        fi
+
+        cd ../.. || exit
+    done
+}
+
+run_gpu(){
+    if [ $# -gt 3 ] || [ $# -lt 2 ]
+    then
+        echo "Usage:  bash run_train.sh  [GPU] [DEVICE_ID_LIST] [DATASET_PATH] [PRETRAINED_CKPT_PATH](optional)\n "
+        exit 1
+    fi
+
+    PATH1=$(get_real_path $3)
+    
+    if [ $# == 4 ]
+    then 
+        PATH2=$(get_real_path $4)
+    fi
+
+    if [ ! -d $PATH1 ]
+    then 
+        echo "error: DATASET_PATH=$PATH1 is not a directory"
+        exit 1
+    fi 
+
+    if [ $# == 4 ] && [ ! -f $PATH2 ]
+    then
+        echo "error: PRETRAINED_CKPT_PATH=$PATH2 is not a file"
+        exit 1
+    fi
+
+    device_num=$(get_gpu_device_num $2)
+
+    ulimit -u unlimited
+    export DEVICE_NUM=${device_num}
+    export RANK_SIZE=${device_num}
+    export CUDA_VISIBLE_DEVICES=$2
+    
+    rm -rf ./train
+    mkdir ./train
+    cp ../*.py ./train
+    cp *.sh ./train
+    cp -r ../src ./train
+    cd ./train || exit
+    echo "start training"
+    env > env.log
+    if [ $# == 3 ]
+    then
+        mpirun --allow-run-as-root -n ${RANK_SIZE} \
+        python train.py --device_target=$1  --dataset_path=$PATH1 &> train.log &
+    fi
+    
+    if [ $# == 4 ]
+    then
+        mpirun --allow-run-as-root -n ${RANK_SIZE} \
+        python train.py --device_traget=$1  --dataset_path=$PATH1 --pre_trained=$PATH2 &> train.log &
+    fi
+
+    cd ..
+}
+
+
+if [ $1 = "Ascend" ] ; then
+    run_ascend "$@"
+elif [ $1 = "GPU" ] ; then
+    run_gpu "$@"
+else
+    echo "Unsupported device target: $1"
+fi;
--- a/model_zoo/official/cv/mobilenetv2_quant/scripts/run_train_quant.sh
+++ b/model_zoo/official/cv/mobilenetv2_quant/scripts/run_train_quant.sh
@ -1,96 +0,0 @@
-#!/usr/bin/env bash
-# Copyright 2020 Huawei Technologies Co., Ltd
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ============================================================================
-
-run_ascend()
-{
-    if [ $2 -lt 1 ] && [ $2 -gt 8 ]
-    then
-        echo "error: DEVICE_NUM=$2 is not in (1-9)"
-    exit 1
-    fi
-
-    if [ ! -d $5 ] && [ ! -f $5 ]
-    then
-        echo "error: DATASET_PATH=$5 is not a directory or file"
-    exit 1
-    fi
-
-    BASEPATH=$(cd "`dirname $0`" || exit; pwd)
-    export PYTHONPATH=${BASEPATH}:$PYTHONPATH
-    if [ -d "../train" ];
-    then
-        rm -rf ../train
-    fi
-    mkdir ../train
-    cd ../train || exit
-    python ${BASEPATH}/../src/launch.py \
-            --nproc_per_node=$2 \
-            --visible_devices=$4 \
-            --server_id=$3 \
-            --training_script=${BASEPATH}/../train.py \
-            --dataset_path=$5 \
-            --pre_trained=$6 \
-            --device_target=$1 &> train.log &  # dataset train folder
-}
-
-run_gpu()
-{
-    if [ $2 -lt 1 ] && [ $2 -gt 8 ]
-    then
-        echo "error: DEVICE_NUM=$2 is not in (1-8)"
-    exit 1
-    fi
-
-    if [ ! -d $4 ]
-    then
-        echo "error: DATASET_PATH=$4 is not a directory"
-    exit 1
-    fi
-
-    BASEPATH=$(cd "`dirname $0`" || exit; pwd)
-    export PYTHONPATH=${BASEPATH}:$PYTHONPATH
-    if [ -d "../train" ];
-    then
-        rm -rf ../train
-    fi
-    mkdir ../train
-    cd ../train || exit
-
-    export CUDA_VISIBLE_DEVICES="$3"
-    mpirun -n $2 --allow-run-as-root \
-    python ${BASEPATH}/../train.py \
-        --dataset_path=$4 \
-        --device_target=$1 \
-        --pre_trained=$5  &> ../train.log &  # dataset train folder
-}
-
-if [ $# -gt 6 ] || [ $# -lt 5 ]
-then
-    echo "Usage:\n \
-          Ascend: sh run_train_quant.sh Ascend [DEVICE_NUM] [SERVER_IP(x.x.x.x)] [VISIABLE_DEVICES(0,1,2,3,4,5,6,7)] [DATASET_PATH] [CKPT_PATH]\n \
-          GPU: sh run_train_quant.sh GPU [DEVICE_NUM] [VISIABLE_DEVICES(0,1,2,3,4,5,6,7)] [DATASET_PATH] [CKPT_PATH]\n \
-          "
-exit 1
-fi
-
-if [ $1 = "Ascend" ] ; then
-    run_ascend "$@"
-elif [ $1 = "GPU" ] ; then
-    run_gpu "$@"
-else
-    echo "Unsupported device target."
-fi;
-
--- a/model_zoo/official/cv/resnet50_quant/Readme.md
+++ b/model_zoo/official/cv/resnet50_quant/Readme.md
@ -1,4 +1,43 @@
 # Contents
+# ResNet-50_quant Example
+
+## Description
+
+This is an example of training ResNet-50_quant with ImageNet2012 dataset in MindSpore.
+
+## Requirements
+
+- Install [MindSpore](https://www.mindspore.cn/install/en).
+
+- Download the dataset ImageNet2012 
+
+> Unzip the ImageNet2012 dataset to any path you want and the folder structure should include train and eval dataset as follows:
+> ```
+> .  
+> ├── ilsvrc                  # train dataset
+> └── ilsvrc_eval             # infer dataset: images should be classified into 1000 directories firstly, just like train images
+> ```
+
+
+## Example structure
+
+```shell
+resnet50_quant/
+  ├── eval.py
+  ├── models
+  │   └── resnet_quant.py
+  ├── Readme.md
+  ├── scripts
+  │   ├── run_infer.sh
+  │   └── run_train.sh
+  ├── src
+  │   ├── config.py
+  │   ├── crossentropy.py
+  │   ├── dataset.py
+  │   ├── launch.py
+  │   └── lr_generator.py
+  └── train.py
+```

 - [resnet50 Description](#resnet50-description)
 - [Model Architecture](#model-architecture)
@ -88,21 +127,17 @@ For FP16 operators, if the input data type is FP32, the backend of MindSpore wil

 ### Usage

-
-You can start training using python or shell scripts. The usage of shell scripts as follows:
-
- Ascend: sh run_train.sh Ascend [DEVICE_NUM] [SERVER_IP(x.x.x.x)] [VISIABLE_DEVICES(0,1,2,3,4,5,6,7)] [DATASET_PATH][CKPT_PATH]
+- Ascend: sh run_train.sh Ascend [RANK_TABLE_FILE] [DATASET_PATH] [PRETRAINED_CKPT_PATH]\(optional)
 ### Launch

 ```
-# training example
-  shell:
-      Ascend: sh run_train.sh Ascend 8 10.222.223.224 0,1,2,3,4,5,6,7 ~/resnet/train/ Resnet50-90_5004.ckpt
+  # training example
+  Ascend: bash run_train.sh Ascend ~/hccl_4p_0123_x.x.x.x.json ~/imagenet/train/ 
 ```

 ### Result

-Training result will be stored in the example path. Checkpoints will be stored at `. /checkpoint` by default, and training log  will be redirected to `./train/train.log` like followings.
+Training result will be stored in the example path. Checkpoints will be stored at `./train/device$i/` by default, and training log  will be redirected to `./train/device$i/train.log` like followings. 

 ```
 epoch: 1 step: 5004, loss is 4.8995576
--- a/model_zoo/official/cv/resnet50_quant/scripts/run_train.sh
+++ b/model_zoo/official/cv/resnet50_quant/scripts/run_train.sh
@ -1,4 +1,4 @@
-#!/usr/bin/env bash
+#!/bin/bash
 # Copyright 2020 Huawei Technologies Co., Ltd
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
@ -14,49 +14,259 @@
 # limitations under the License.
 # ============================================================================

-run_ascend()
-{
-    if [ $2 -lt 1 ] && [ $2 -gt 8 ]
-    then
-        echo "error: DEVICE_NUM=$2 is not in (1-8)"
-    exit 1
+get_real_path(){
+    if [ "${1:0:1}" == "/" ]; then
+        echo "$1"
+    else
+        echo "$(realpath -m $PWD/$1)"
    fi
-
-    if [ ! -d $5 ] && [ ! -f $5 ]
-    then
-        echo "error: DATASET_PATH=$5 is not a directory or file"
-    exit 1
-    fi
-
-    BASEPATH=$(cd "`dirname $0`" || exit; pwd)
-    export PYTHONPATH=${BASEPATH}:$PYTHONPATH
-    if [ -d "../train" ];
-    then
-        rm -rf ../train
-    fi
-    mkdir ../train
-    cd ../train || exit
-    python ${BASEPATH}/../src/launch.py \
-            --nproc_per_node=$2 \
-            --visible_devices=$4 \
-            --server_id=$3 \
-            --training_script=${BASEPATH}/../train.py \
-            --dataset_path=$5 \
-            --pre_trained=$6 \
-            --device_target=$1 &> train.log &  # dataset train folder
 }

-if [ $# -gt 6 ] || [ $# -lt 4 ]
-then
-    echo "Usage:\n \
-          Ascend: sh run_train.sh Ascend [DEVICE_NUM] [SERVER_IP(x.x.x.x)] [VISIABLE_DEVICES(0,1,2,3,4,5,6,7)] [DATASET_PATH] [CKPT_PATH]\n \
-          "
-exit 1
-fi
+# check_and_get_Ascend_device(){
+
+#     #device_list=(${1//,/ })
+#     IFS=',' read -ra device_list <<<"$1"
+#     last_device_id=0
+#     first_device_id=8
+#     device_used=(0 0 0 0 0 0 0 0)
+
+#     for var in "${device_list[@]}"
+#     do  
+        
+#         if [ $((var)) -lt 0 ] || [ $((var)) -ge 8 ]
+#         then 
+#             echo "error: device id=${var} is incorrect, device id must be in range [0,8), please check your device id list!"
+#             exit 1
+#         fi
+
+#         if [  ${device_used[$((var))]} -eq 0 ]
+#         then 
+#             device_used[ $((var)) ]=1
+#         else
+#             echo "error: device id is duplicate, please check your device id list!"
+#             exit 1
+#         fi
+
+#         if [ ${last_device_id} \< $((var)) ]
+#         then 
+#             last_device_id=$((var))
+#         fi
+#         if [ ${first_device_id} \> $((var)) ]
+#         then
+#             first_device_id=$((var))
+#         fi
+#     done
+
+#     device_num=`expr ${last_device_id} - ${first_device_id} + 1`
+#     if [ ${device_num} != ${#device_list[*]} ]
+#     then 
+#         echo "error: the Ascend chips used must be continuous, please check your device id list!"
+#         exit 1
+#     fi
+
+#     if [ ${first_device_id} -lt 4 ] && [ ${last_device_id} -ge 4 ]
+#     then
+#         if [ ${first_device_id} != 0 ] || [ ${last_device_id} != 7 ]
+#         then
+#             echo "error: device id list must be in the same group of [0,4) or [4,8) when using Ascend chips."
+#             exit 1
+#         fi
+#     fi
+
+#     echo "${first_device_id},`expr ${last_device_id} + 1`"
+# }
+
+# get_hccl_name(){
+
+#     server_ip=$(ifconfig -a | grep inet | grep -v 127.0.0.1 | grep -v inet6 | awk '{print $2}' | tr -d "addr:")
+#     device_num=`expr $2 - $1`
+#     device_id_list=""
+
+#     for(( i=$1 ; i < $2 ; i++ ))
+#     do 
+#         device_id_list=${device_id_list}$i
+#     done
+#     hccl_name="hccl_${device_num}p_${device_id_list}_${server_ip}.json"
+
+#     echo ${hccl_name}
+# }
+
+
+run_ascend(){
+
+    if [ $# != 3 ] && [ $# != 4 ]
+    then 
+        echo "Usage: bash run_train.sh [Ascend] [RANK_TABLE_FILE] [DATASET_PATH] [PRETRAINED_CKPT_PATH](optional)\n"
+        exit 1
+    fi
+
+    # first_last_device=$(check_and_get_Ascend_device $2)
+    # #devices=(${first_last_device//,/ })
+    # #IFS=',' read -ra devices <<<"${first_last_device}"
+    # first_device=${first_last_device:0:1}
+    # last_device=${first_last_device:2:1}
+    # device_num=`expr $((last_device)) - $((first_device))`
+    
+
+    # #single ascend or multiple ascend 
+    # if [ ${device_num} -gt 1 ]
+    # then
+    #     ori_path=$(dirname "$(readlink -f "$0")")
+    #     #generate hccl config file
+    #     cd ../../../../utils/hccl_tools/  || exit
+    #     device_num_arg="[${first_device},${last_device})" 
+
+    #     python hccl_tools.py --device_num=${device_num_arg}
+
+    #     hccl_name=$(get_hccl_name ${first_device} ${last_device})
+
+    #     if [ ! -e ${hccl_name} ]
+    #     then
+    #         echo "error: failed to generate the hccl config file!"
+    #         exit 1
+    #     fi
+
+    #     mv ${hccl_name} ${ori_path}
+    #     cd ${ori_path} || exit
+
+    #     PATH1=$(get_real_path ${hccl_name})
+
+    #     if [ ! -f $PATH1 ]
+    #     then 
+    #         echo "error: RANK_TABLE_FILE=$PATH1 is not a file"
+    #         exit 1
+    #     fi 
+
+    #     export RANK_TABLE_FILE=$PATH1
+    # fi
+
+
+    PATH1=$(get_real_path $2)
+    PATH2=$(get_real_path $3)
+
+    if [ $# == 4 ]
+    then 
+        PATH3=$(get_real_path $4)
+    fi
+
+    if [ ! -f $PATH1 ]
+    then 
+        echo "error: RANK_TABLE_FILE=$PATH1 is not a file"
+        exit 1
+    fi 
+
+    if [ ! -d $PATH2 ]
+    then 
+        echo "error: DATASET_PATH=$PATH2 is not a directory"
+        exit 1
+    fi 
+
+    if [ $# == 4 ] && [ ! -f $PATH3 ]
+    then
+        echo "error: PRETRAINED_CKPT_PATH=$PATH3 is not a file"
+        exit 1
+    fi
+
+    rank_file_name=${2##*/}
+    IFS='_' read -ra array <<<"${rank_file_name}"
+    device_id_list=${array[2]}
+    first_device=${device_id_list:0:1}
+    device_num=${#device_id_list}
+
+
+    ulimit -u unlimited
+    export DEVICE_NUM=${device_num}
+    export RANK_SIZE=${device_num}
+    export RANK_TABLE_FILE=$PATH1
+
+    export SERVER_ID=0
+    rank_start=$((DEVICE_NUM * SERVER_ID))
+
+    rm -rf ./train
+    mkdir ./train
+    for((i=0; i<${device_num}; i++))
+    do
+        export DEVICE_ID=$((first_device+i))
+        export RANK_ID=$((rank_start + i))
+        mkdir ./train/device$i
+        cp ../*.py ./train/device$i
+        cp *.sh ./train/device$i
+        cp -r ../src ./train/device$i
+        cp -r ../models ./train/device$i
+        cd ./train/device$i || exit
+        echo "start training for rank $RANK_ID, device $DEVICE_ID"
+        env > env.log
+        if [ $# == 3 ]
+        then	    
+            python train.py  --device_target=$1 --run_distribute=True --device_num=$DEVICE_NUM --dataset_path=$PATH2 &> train.log &
+        fi
+        
+        if [ $# == 4 ]
+        then
+            python train.py  --device_target=$1 --run_distribute=True --device_num=$DEVICE_NUM --dataset_path=$PATH2 --pre_trained=$PATH3 &> train.log &
+        fi
+
+        cd ../.. || exit
+    done    
+}
+
+# run_gpu(){
+
+#     if [ $# -gt 3 ] || [ $# -lt 2 ]
+#     then
+#         echo "Usage:  sh run_train_distribute_quant.sh  [GPU] [DATASET_PATH] [PRETRAINED_CKPT_PATH](optional)\n "
+#         exit 1
+#     fi
+
+#     PATH1=$(get_real_path $2)
+    
+#     if [ $# == 3 ]
+#     then 
+#         PATH2=$(get_real_path $3)
+#     fi
+
+#     if [ ! -d $PATH1 ]
+#     then 
+#         echo "error: DATASET_PATH=$PATH1 is not a directory"
+#         exit 1
+#     fi 
+
+#     if [ $# == 3 ] && [ ! -f $PATH2 ]
+#     then
+#         echo "error: PRETRAINED_CKPT_PATH=$PATH2 is not a file"
+#         exit 1
+#     fi
+
+#     ulimit -u unlimited
+#     export RANK_SIZE=2
+#     #export CUDA_VISIBLE_DEVICES=1,2
+
+#     rm -rf ./train_parallel
+#     mkdir ./train_parallel
+#     cp ../*.py ./train_parallel
+#     cp *.sh ./train_parallel
+#     cp -r ../src ./train_parallel
+#     cp -r ../models ./train_parallel
+#     cd ./train_parallel || exit
+#     echo "start training"
+#     env > env.log
+#     if [ $# == 2 ]
+#     then	    
+#         mpirun --allow-run-as-root -n $RANK_SIZE
+#         python train.py --device_target=$1  --dataset_path=$PATH1 &> log &
+#     fi
+    
+#     if [ $# == 3 ]
+#     then
+#         mpirun --allow-run-as-root -n $RANK_SIZE
+#         python train.py --device_traget=$1  --dataset_path=$PATH1 --pre_trained=$PATH2 &> log &
+#     fi
+#     cd ..
+# }
+

 if [ $1 = "Ascend" ] ; then
    run_ascend "$@"
 else
-    echo "not support platform"
-fi;
-
+    echo "Unsupported device target: $1"
+fi;