modify for Squeezenet_gpu

郑彬 2021-09-01 15:56:11 +08:00
parent 1518006924
commit a8c9859141
6 changed files with 14 additions and 17 deletions

View File

@@ -198,7 +198,7 @@ Parameters for both training and evaluation can be set in *.yaml
```py
"class_num": 10, # dataset class num
"global_batch_size": 32, # the total batch_size for training and evaluation
"batch_size": 32, # Batch_size for training, evaluation and export. If running distributed on gpu, divide this value by device_num.
"loss_scale": 1024, # loss scale
"momentum": 0.9, # momentum
"weight_decay": 1e-4, # weight decay
@@ -219,7 +219,7 @@ Parameters for both training and evaluation can be set in *.yaml
```py
"class_num": 1000, # dataset class num
"global_batch_size": 256, # the total batch_size for training and evaluation
"batch_size": 32, # Batch_size for training, evaluation and export
"loss_scale": 1024, # loss scale
"momentum": 0.9, # momentum
"weight_decay": 7e-5, # weight decay
@@ -242,7 +242,7 @@ Parameters for both training and evaluation can be set in *.yaml
```py
"class_num": 10, # dataset class num
"global_batch_size": 32, # the total batch_size for training and evaluation
"batch_size": 32, # Batch_size for training, evaluation and export. If running distributed on gpu, divide this value by device_num.
"loss_scale": 1024, # loss scale
"momentum": 0.9, # momentum
"weight_decay": 1e-4, # weight decay
@@ -263,7 +263,7 @@ Parameters for both training and evaluation can be set in *.yaml
```py
"class_num": 1000, # dataset class num
"global_batch_size": 256, # The total batch_size for training and evaluation
"batch_size": 32, # Batch_size for training, evaluation and export
"loss_scale": 1024, # loss scale
"momentum": 0.9, # momentum
"weight_decay": 7e-5, # weight decay

View File

@@ -21,7 +21,7 @@ checkpoint_file_path: "suqeezenet_cifar10-120_195.ckpt"
net_name: "suqeezenet"
dataset : "cifar10"
class_num: 10
-global_batch_size: 32
+batch_size: 32
loss_scale: 1024
momentum: 0.9
weight_decay: 0.0001
@@ -55,7 +55,7 @@ load_path: "The location of checkpoint for obs"
device_target: "Target device type, available: [Ascend, GPU, CPU]"
enable_profiling: "Whether enable profiling while training, default: False"
num_classes: "Class for dataset"
global_batch_size: "The total batch_size for training and evaluation"
batch_size: "Batch_size for training, evaluation and export. If running distributed on gpu, divide this value by device_num"
epoch_size: "Total training epochs."
keep_checkpoint_max: "keep the last keep_checkpoint_max checkpoint"
checkpoint_path: "The location of the checkpoint file."

View File

@@ -21,7 +21,7 @@ checkpoint_file_path: "suqeezenet_imagenet-200_5004.ckpt"
net_name: "suqeezenet"
dataset : "imagenet"
class_num: 1000
-global_batch_size: 256
+batch_size: 32
loss_scale: 1024
momentum: 0.9
weight_decay: 0.00007
@@ -57,7 +57,7 @@ load_path: 'The location of checkpoint for obs'
device_target: 'Target device type, available: [Ascend, GPU, CPU]'
enable_profiling: 'Whether enable profiling while training, default: False'
num_classes: 'Class for dataset'
-global_batch_size: "The total batch_size for training and evaluation"
+batch_size: "Batch_size for training, evaluation and export"
epoch_size: "Total training epochs."
keep_checkpoint_max: "keep the last keep_checkpoint_max checkpoint"
checkpoint_path: "The location of the checkpoint file."

View File

@@ -21,7 +21,7 @@ checkpoint_file_path: "suqeezenet_residual_cifar10-150_195.ckpt"
net_name: "suqeezenet_residual"
dataset : "cifar10"
class_num: 10
-global_batch_size: 32
+batch_size: 32
loss_scale: 1024
momentum: 0.9
weight_decay: 0.0001
@@ -55,7 +55,7 @@ load_path: "The location of checkpoint for obs"
device_target: "Target device type, available: [Ascend, GPU, CPU]"
enable_profiling: "Whether enable profiling while training, default: False"
num_classes: "Class for dataset"
-global_batch_size: "The total batch_size for training and evaluation."
+batch_size: "Batch_size for training, evaluation and export. If running distributed on gpu, divide this value by device_num."
epoch_size: "Total training epochs."
keep_checkpoint_max: "keep the last keep_checkpoint_max checkpoint"
checkpoint_path: "The location of the checkpoint file."

View File

@@ -21,7 +21,7 @@ checkpoint_file_path: "suqeezenet_residual_imagenet-300_5004.ckpt"
net_name: "suqeezenet_residual"
dataset : "imagenet"
class_num: 1000
-global_batch_size: 256
+batch_size: 32
loss_scale: 1024
momentum: 0.9
weight_decay: 0.00007
@@ -57,7 +57,7 @@ load_path: "The location of checkpoint for obs"
device_target: "Target device type, available: [Ascend, GPU, CPU]"
enable_profiling: "Whether enable profiling while training, default: False"
num_classes: "Class for dataset"
-global_batch_size: "The total batch_size for training and evaluation"
+batch_size: "Batch_size for training, evaluation and export"
epoch_size: "Total training epochs."
keep_checkpoint_max: "keep the last keep_checkpoint_max checkpoint"
checkpoint_path: "The location of the checkpoint file."
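All four configuration files receive the same two-line substitution: the `global_batch_size` key and its help text are dropped in favor of a per-device `batch_size`. After editing a config for a multi-GPU launch, a sanity check along these lines can catch a forgotten division; the file name, device count, and use of PyYAML below are assumptions, not part of the repo:

```py
import yaml  # PyYAML, assumed available

DEVICE_NUM = 8      # hypothetical GPU count for the distributed run
GLOBAL_BATCH = 256  # the old global_batch_size for the ImageNet configs

# File name is illustrative; substitute the config actually being edited.
with open("squeezenet_imagenet_config.yaml") as f:
    cfg = yaml.safe_load(f)

assert cfg["batch_size"] * DEVICE_NUM == GLOBAL_BATCH, (
    "batch_size should be the global batch divided by device_num")
```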

View File

@@ -76,15 +76,12 @@ def train_net():
gradients_mean=True)
ckpt_save_dir = ckpt_save_dir + "/ckpt_" + str(
get_rank()) + "/"
-# obtain the actual batch_size
-if not hasattr(config, "global_batch_size"):
-    raise AttributeError("'config' object has no attribute 'global_batch_size', please check the yaml file.")
-batch_size = max(config.global_batch_size // device_num, 1)
# create dataset
dataset = create_dataset(dataset_path=config.data_path,
do_train=True,
repeat_num=1,
-batch_size=batch_size,
+batch_size=config.batch_size,
target=target)
step_size = dataset.get_dataset_size()
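The train.py hunk removes the in-script derivation: previously `train_net()` required a `global_batch_size` attribute and computed `max(config.global_batch_size // device_num, 1)`; now it passes `config.batch_size` straight to `create_dataset`, so the division happens once in the yaml rather than at every launch. A side-by-side sketch of the two behaviors (function names are mine; the bodies mirror the hunk):

```py
# Before this commit: per-device batch derived from a required global value.
def old_batch(config, device_num: int) -> int:
    if not hasattr(config, "global_batch_size"):
        raise AttributeError(
            "'config' object has no attribute 'global_batch_size', "
            "please check the yaml file.")
    return max(config.global_batch_size // device_num, 1)

# After this commit: the configured value is trusted as-is; for distributed
# GPU runs the user pre-divides it, per the updated README.
def new_batch(config, device_num: int) -> int:
    return config.batch_size
```

The trade-off is simplicity over safety: the `hasattr` guard and the floor at 1 are gone, so a stale config that still defines only `global_batch_size` would presumably fail only when `config.batch_size` is first accessed.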