!23497 [MS][pix2pix][gpu 8p]train failed. Bug fix
Merge pull request !23497 from ZeyangGAO/code_docs_pix
This commit is contained in:
commit
ec2942a66a
|
@ -231,8 +231,8 @@ bash run_infer_310.sh [The path of the MINDIR for 310 infer] [The path of the da
|
|||
| Optimizer | Adam | Adam |
|
||||
| Loss Function | SigmoidCrossEntropyWithLogits Loss & L1 Loss | SigmoidCrossEntropyWithLogits Loss & L1 Loss |
|
||||
| outputs | probability | probability |
|
||||
| Speed | 1pc(Ascend): 10 ms/step | 1pc(GPU): 50 ms/step |
|
||||
| Total time | 1pc(Ascend): 0.3h | 1pc(GPU): 0.9 h |
|
||||
| Speed | 1pc(Ascend): 10 ms/step | 1pc(GPU): 40 ms/step |
|
||||
| Total time | 1pc(Ascend): 0.3h | 1pc(GPU): 0.8 h |
|
||||
| Checkpoint for Fine tuning | 207M (.ckpt file) | 207M (.ckpt file) |
|
||||
|
||||
| Parameters | single Ascend | single GPU |
|
||||
|
@ -245,8 +245,8 @@ bash run_infer_310.sh [The path of the MINDIR for 310 infer] [The path of the da
|
|||
| Optimizer | Adam | Adam |
|
||||
| Loss Function | SigmoidCrossEntropyWithLogits Loss & L1 Loss | SigmoidCrossEntropyWithLogits Loss & L1 Loss |
|
||||
| outputs | probability | probability |
|
||||
| Speed | 1pc(Ascend): 20 ms/step | 1pc(GPU): 60 ms/step |
|
||||
| Total time | 1pc(Ascend): 1.58h | 1pc(GPU): 2.2h |
|
||||
| Speed | 1pc(Ascend): 20 ms/step | 1pc(GPU): 90 ms/step |
|
||||
| Total time | 1pc(Ascend): 1.58h | 1pc(GPU): 3.3h |
|
||||
| Checkpoint for Fine tuning | 207M (.ckpt file) | 207M (.ckpt file) |
|
||||
|
||||
### Distributed Training Performance
|
||||
|
@ -275,8 +275,8 @@ bash run_infer_310.sh [The path of the MINDIR for 310 infer] [The path of the da
|
|||
| Optimizer | Adam | Adam |
|
||||
| Loss Function | SigmoidCrossEntropyWithLogits Loss & L1 Loss | SigmoidCrossEntropyWithLogits Loss & L1 Loss |
|
||||
| outputs | probability | probability |
|
||||
| Speed | 8pc(Ascend): 20 ms/step | 8pc(GPU): 30 ms/step |
|
||||
| Total time | 8pc(Ascend): 1.2h | 8pc(GPU): 2.1h |
|
||||
| Speed | 8pc(Ascend): 20 ms/step | 8pc(GPU): 40 ms/step |
|
||||
| Total time | 8pc(Ascend): 1.2h | 8pc(GPU): 2.8h |
|
||||
| Checkpoint for Fine tuning | 207M (.ckpt file) | 207M (.ckpt file) |
|
||||
|
||||
### Evaluation Performance
|
||||
|
|
|
@ -29,8 +29,8 @@ get_real_path(){
|
|||
fi
|
||||
}
|
||||
|
||||
export RANK_SIZE=$(get_real_path $1)
|
||||
export DISTRIBUTE=$(get_real_path $2)
|
||||
export RANK_SIZE=$1
|
||||
export DISTRIBUTE=$2
|
||||
export RANK_TABLE_FILE=$(get_real_path $3)
|
||||
export DATASET_PATH=$(get_real_path $4)
|
||||
|
||||
|
|
|
@ -58,7 +58,7 @@ then
|
|||
--run_distribute 1 --device_num $3 --dataset_size 400 --train_data_dir $PATH1 --pad_mode REFLECT &> log &
|
||||
elif [ $2 == 'maps' ];
|
||||
then
|
||||
mpirun --allow-run-as-root -n 1 --output-filename log_output --merge-stderr-to-stdout \
|
||||
mpirun --allow-run-as-root -n $3 --output-filename log_output --merge-stderr-to-stdout \
|
||||
python train.py --device_target GPU --device_num $3 --dataset_size 1096 \
|
||||
--run_distribute 1 --train_data_dir $PATH1 --pad_mode REFLECT &> log &
|
||||
fi
|
||||
|
|
|
@ -19,7 +19,6 @@
|
|||
|
||||
import argparse
|
||||
import ast
|
||||
from mindspore import context
|
||||
|
||||
|
||||
def get_args():
|
||||
|
@ -71,7 +70,4 @@ def get_args():
|
|||
parser.add_argument('--predict_dir', type=str, default='./results/predict/',
|
||||
help='during validating, the file path of Generated image.')
|
||||
args = parser.parse_args()
|
||||
|
||||
context.set_context(mode=context.GRAPH_MODE, device_target=args.device_target, device_id=args.device_id)
|
||||
|
||||
return args
|
||||
|
|
|
@ -43,7 +43,7 @@ if __name__ == '__main__':
|
|||
print("ds.shape:", ds.output_shapes())
|
||||
|
||||
steps_per_epoch = ds.get_dataset_size()
|
||||
|
||||
context.set_context(mode=context.GRAPH_MODE, device_target=args.device_target)
|
||||
if args.device_target == 'Ascend':
|
||||
if args.run_distribute:
|
||||
print("Ascend distribute")
|
||||
|
@ -55,6 +55,8 @@ if __name__ == '__main__':
|
|||
init()
|
||||
|
||||
rank = get_rank()
|
||||
else:
|
||||
context.set_context(device_id=args.device_id)
|
||||
elif args.device_target == 'GPU':
|
||||
if args.run_distribute:
|
||||
print("GPU distribute")
|
||||
|
@ -63,6 +65,8 @@ if __name__ == '__main__':
|
|||
context.set_auto_parallel_context(device_num=get_group_size(),
|
||||
parallel_mode=ParallelMode.DATA_PARALLEL,
|
||||
gradients_mean=True)
|
||||
else:
|
||||
context.set_context(device_id=args.device_id)
|
||||
netG = get_generator()
|
||||
netD = get_discriminator()
|
||||
|
||||
|
|
Loading…
Reference in New Issue