diff --git a/model_zoo/official/cv/googlenet/README.md b/model_zoo/official/cv/googlenet/README.md index 46e69d97fe9..8cfbc52c4d6 100644 --- a/model_zoo/official/cv/googlenet/README.md +++ b/model_zoo/official/cv/googlenet/README.md @@ -82,10 +82,10 @@ After installing MindSpore via the official website, you can start training and python train.py > train.log 2>&1 & # run distributed training example -sh scripts/run_train.sh rank_table.json +Ascend: sh scripts/run_train.sh rank_table.json OR GPU: sh scripts/run_train_gpu.sh 8 0,1,2,3,4,5,6,7 # run evaluation example -python eval.py > eval.log 2>&1 & OR sh run_eval.sh +python eval.py > eval.log 2>&1 & OR Ascend: sh run_eval.sh OR GPU: sh run_eval_gpu.sh ``` @@ -161,7 +161,7 @@ The model checkpoint will be saved in the current directory. ### Distributed Training ``` -sh scripts/run_train.sh rank_table.json +Ascend: sh scripts/run_train.sh rank_table.json OR GPU: sh scripts/run_train_gpu.sh 8 0,1,2,3,4,5,6,7 ``` The above shell script will run distribute training in the background. You can view the results through the file `train_parallel[X]/log`. The loss value will be achieved as follows: @@ -187,7 +187,9 @@ Before running the command below, please check the checkpoint path used for eval ``` python eval.py > eval.log 2>&1 & OR -sh scripts/run_eval.sh +Ascend: sh scripts/run_eval.sh +OR +GPU: sh scripts/run_eval_gpu.sh ``` The above python command will run in the background. You can view the results through the file "eval.log". 
The accuracy of the test dataset will be as follows: diff --git a/model_zoo/official/recommend/deepfm/train.py b/model_zoo/official/recommend/deepfm/train.py index 656a7bfa049..95810c3a7e0 100644 --- a/model_zoo/official/recommend/deepfm/train.py +++ b/model_zoo/official/recommend/deepfm/train.py @@ -83,6 +83,8 @@ if __name__ == '__main__': rank_size=rank_size, rank_id=rank_id) + steps_size = ds_train.get_dataset_size() + model_builder = ModelBuilder(ModelConfig, TrainConfig) train_net, eval_net = model_builder.get_train_eval_net() auc_metric = AUCMetric() @@ -95,8 +97,12 @@ if __name__ == '__main__': if train_config.save_checkpoint: if rank_size: train_config.ckpt_file_name_prefix = train_config.ckpt_file_name_prefix + str(get_rank()) - config_ck = CheckpointConfig(save_checkpoint_steps=train_config.save_checkpoint_steps, - keep_checkpoint_max=train_config.keep_checkpoint_max) + if args_opt.device_target == "GPU": + config_ck = CheckpointConfig(save_checkpoint_steps=steps_size, + keep_checkpoint_max=train_config.keep_checkpoint_max) + else: + config_ck = CheckpointConfig(save_checkpoint_steps=train_config.save_checkpoint_steps, + keep_checkpoint_max=train_config.keep_checkpoint_max) ckpt_cb = ModelCheckpoint(prefix=train_config.ckpt_file_name_prefix, directory=args_opt.ckpt_path, config=config_ck)