forked from mindspore-Ecosystem/mindspore
!6063 modify loading way of hccl file and display content during training
Merge pull request !6063 from hwjiaorui/master
This commit is contained in:
commit
15a7722d84
|
@ -82,12 +82,19 @@ run_ascend(){
|
|||
fi
|
||||
|
||||
|
||||
rank_file_name=${2##*/}
|
||||
IFS='_' read -ra array <<<"${rank_file_name}"
|
||||
device_id_list=${array[2]}
|
||||
first_device=${device_id_list:0:1}
|
||||
|
||||
#rank_file_name=${2##*/}
|
||||
#IFS='_' read -ra array <<<"${rank_file_name}"
|
||||
#device_id_list=${array[2]}
|
||||
#first_device=${device_id_list:0:1}
|
||||
#last_device=${device_list:${#device_list}-1:1}
|
||||
device_num=${#device_id_list}
|
||||
#device_num=${#device_id_list}
|
||||
cat $2 | awk -F "[device_id]" '/device_id/{print$0}' >temp.log
|
||||
array=$(cat temp.log | awk -F "[:]" '/device_id/{print$2}')
|
||||
rm temp.log
|
||||
IFS=" " read -ra device_list <<<$array
|
||||
first_device=${device_list[0]:1:1}
|
||||
device_num=${#device_list[*]}
|
||||
|
||||
ulimit -u unlimited
|
||||
export DEVICE_NUM=${device_num}
|
||||
|
@ -188,3 +195,4 @@ elif [ $1 = "GPU" ] ; then
|
|||
else
|
||||
echo "Unsupported device target: $1"
|
||||
fi;
|
||||
|
||||
|
|
|
@ -74,8 +74,7 @@ class Monitor(Callback):
|
|||
cur_step_in_epoch = (cb_params.cur_step_num - 1) % cb_params.batch_num
|
||||
|
||||
print("epoch: [{:3d}/{:3d}], step:[{:5d}/{:5d}], loss:[{:5.3f}/{:5.3f}], time:[{:5.3f}], lr:[{:5.5f}]".format(
|
||||
cb_params.cur_epoch_num -
|
||||
1, cb_params.epoch_num, cur_step_in_epoch, cb_params.batch_num, step_loss,
|
||||
cb_params.cur_epoch_num, cb_params.epoch_num, cur_step_in_epoch+1, cb_params.batch_num, step_loss,
|
||||
np.mean(self.losses), step_mseconds, self.lr_init[cb_params.cur_step_num - 1]))
|
||||
|
||||
|
||||
|
|
Loading…
Reference in New Issue