update resnet acc_mode scripts.

linqingke 2021-07-07 10:38:29 +08:00
parent 86eb2a9605
commit 1de2366cda
15 changed files with 119 additions and 16 deletions

View File

@@ -40,6 +40,7 @@ class AutoAcc:
    def __init__(self, level, kwargs):
        if level not in _acc_config_level.keys():
            level = 'O0'
+        self.level = level
        acc_config = _acc_config_level[level]
        self._acc_config = acc_config
        self._fn_flag = True
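
The new `self.level` attribute records which level was actually applied after the silent fallback to `'O0'`, so callers such as `Model._build_acc_network` can inspect it. A minimal standalone sketch of that behaviour (the `_acc_config_level` entries below are hypothetical placeholders, not the real table):

```python
# Standalone sketch of the level fallback; config contents are placeholders.
_acc_config_level = {
    'O0': {},                 # no acceleration
    'O1': {'less_bn': True},  # hypothetical example entry
}

class AutoAcc:
    def __init__(self, level, kwargs):
        # Unknown levels silently degrade to 'O0', and the resolved
        # level is stored so callers can check it afterwards.
        if level not in _acc_config_level.keys():
            level = 'O0'
        self.level = level
        self._acc_config = _acc_config_level[level]

processor = AutoAcc('O3', {})   # 'O3' is not a known level ...
assert processor.level == 'O0'  # ... so it falls back to 'O0'
```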

View File

@@ -73,16 +73,16 @@ class OptimizerProcess:
                    params_gc_value.append(param)
                else:
                    params_value.append(param)
-        if params_gc_value:
-            new_group_param = copy.deepcopy(group_param)
-            new_group_param['params'] = params_gc_value
-            new_group_param['grad_centralization'] = True
-            group_params.append(new_group_param)
-        if params_value:
-            new_group_param = copy.deepcopy(group_param)
-            new_group_param['params'] = params_value
-            group_params.append(new_group_param)
-        self.origin_params = group_params
+            if params_gc_value:
+                new_group_param = copy.deepcopy(group_param)
+                new_group_param['params'] = params_gc_value
+                new_group_param['grad_centralization'] = True
+                group_params.append(new_group_param)
+            if params_value:
+                new_group_param = copy.deepcopy(group_param)
+                new_group_param['params'] = params_value
+                group_params.append(new_group_param)
+        self.origin_params = group_params

    def generate_new_optimizer(self):
        """Generate new optimizer."""
@@ -135,6 +135,7 @@ class ParameterProcess:
        else:
            group_params = []
            params_name = [param.name for param in parameters]
+            new_params_count = copy.deepcopy(params_name)
            for group_param in origin_params_copy:
                if 'order_params' in group_param.keys():
                    new_group_param = copy.deepcopy(group_param)
@@ -146,9 +147,16 @@ class ParameterProcess:
                    if param.name in params_name:
                        index = params_name.index(param.name)
                        params_value.append(parameters[index])
+                        new_params_count.remove(param.name)
                new_group_param = copy.deepcopy(group_param)
                new_group_param['params'] = params_value
                group_params.append(new_group_param)
+            if new_params_count:
+                params_value = []
+                for param in new_params_count:
+                    index = params_name.index(param)
+                    params_value.append(parameters[index])
+                group_params.append({"params": params_value})
        return group_params


_gradient_accumulation_op = C.MultitypeFuncGraph("gradient_accumulation_op")
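
The added bookkeeping (`new_params_count`) catches parameters that exist in the network but are named by none of the original groups; instead of being dropped, they now land in a trailing group that keeps the optimizer's defaults. A simplified, self-contained sketch (plain dicts stand in for MindSpore Parameter objects and param groups, and the `order_params` handling is omitted):

```python
import copy

def regroup(origin_groups, parameters):
    """Rebuild param groups against a new parameter list; parameters
    claimed by no existing group fall into a trailing default group."""
    group_params = []
    params_name = [param['name'] for param in parameters]
    new_params_count = copy.deepcopy(params_name)  # names not yet assigned
    for group_param in origin_groups:
        params_value = []
        for param in group_param['params']:
            if param['name'] in params_name:
                index = params_name.index(param['name'])
                params_value.append(parameters[index])
                new_params_count.remove(param['name'])
        new_group_param = copy.deepcopy(group_param)
        new_group_param['params'] = params_value
        group_params.append(new_group_param)
    if new_params_count:  # the new lines: leftovers keep optimizer defaults
        params_value = [parameters[params_name.index(n)] for n in new_params_count]
        group_params.append({"params": params_value})
    return group_params

old = [{'params': [{'name': 'conv1.weight'}], 'lr': 0.1}]
new = [{'name': 'conv1.weight'}, {'name': 'fc.weight'}]  # fc.weight is new
assert regroup(old, new)[-1] == {'params': [{'name': 'fc.weight'}]}
```
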

View File

@@ -139,6 +139,7 @@ class Model:
        self._device_number = _get_device_num()
        self._global_rank = _get_global_rank()
        self._parameter_broadcast = _get_parameter_broadcast()
+        self._metrics = metrics
        self._check_amp_level_arg(optimizer, amp_level)
        self._check_for_graph_cell(kwargs)
@@ -175,7 +176,7 @@ class Model:
    def _check_kwargs(self, kwargs):
        for arg in kwargs:
-            if arg not in ['loss_scale_manager', 'keep_batchnorm_fp32', 'total_steps']:
+            if arg not in ['loss_scale_manager', 'keep_batchnorm_fp32']:
                raise ValueError(f"Unsupported arg '{arg}'")

    def _check_reuse_dataset(self, dataset):
@@ -187,15 +188,18 @@ class Model:
    def _build_acc_network(self, kwargs):
        """Build the acc network."""
        processor = acc.AutoAcc(self._acc_level, kwargs)
+        if processor.level not in ["O1", "O2"]:
+            return
        if self._optimizer is None:
            logger.warning("In acc mode, the optimizer must be defined.")
            return
-        if self._eval_network is None:
-            logger.warning("In acc mode, the eval_network must be defined.")
+        if self._eval_network is None and self._metrics is None:
+            logger.warning("In acc mode, the eval_network and metrics cannot be undefined at the same time.")
            return

        self._network, self._optimizer = processor.network_auto_process_train(self._network, self._optimizer)
-        self._eval_network = processor.network_auto_process_eval(self._eval_network)
+        if self._eval_network is not None:
+            self._eval_network = processor.network_auto_process_eval(self._eval_network)

    def _build_train_network(self):
        """Build train network"""

View File

@@ -48,6 +48,7 @@ eval_interval: 1
enable_cache: False
cache_session_id: ""
mode_name: "GRAPH"
+acc_mode: "O0"

# Export options
device_id: 0

View File

@@ -48,6 +48,7 @@ eval_interval: 1
enable_cache: False
cache_session_id: ""
mode_name: "GRAPH"
+acc_mode: "O0"

# Export options
device_id: 0

View File

@@ -50,6 +50,7 @@ eval_interval: 1
enable_cache: False
cache_session_id: ""
mode_name: "GRAPH"
+acc_mode: "O0"

# Export options
device_id: 0

View File

@@ -50,6 +50,7 @@ eval_interval: 1
enable_cache: False
cache_session_id: ""
mode_name: "GRAPH"
+acc_mode: "O0"

# Export options
device_id: 0

View File

@@ -48,6 +48,7 @@ eval_interval: 1
enable_cache: False
cache_session_id: ""
mode_name: "GRAPH"
+acc_mode: "O0"

# Export options
device_id: 0

View File

@@ -0,0 +1,78 @@
+# Builtin Configurations(DO NOT CHANGE THESE CONFIGURATIONS unless you know exactly what you are doing)
+enable_modelarts: False
+# Url for modelarts
+data_url: ""
+train_url: ""
+checkpoint_url: ""
+# Path for local
+run_distribute: False
+enable_profiling: False
+data_path: "/cache/data"
+output_path: "/cache/train"
+load_path: "/cache/checkpoint_path/"
+device_target: "Ascend"
+checkpoint_path: "./checkpoint/"
+checkpoint_file_path: ""
+
+# ==============================================================================
+# Training options
+optimizer: "Momentum"
+infer_label: ""
+class_num: 1001
+batch_size: 256
+loss_scale: 1024
+momentum: 0.9
+weight_decay: 0.0001
+epoch_size: 90
+pretrain_epoch_size: 0
+save_checkpoint: True
+save_checkpoint_epochs: 5
+keep_checkpoint_max: 10
+warmup_epochs: 0
+lr_decay_mode: "linear"
+use_label_smooth: True
+label_smooth_factor: 0.1
+lr_init: 0
+lr_max: 0.8
+lr_end: 0.0
+net_name: "resnet50"
+dataset: "imagenet2012"
+device_num: 1
+pre_trained: ""
+run_eval: False
+eval_dataset_path: ""
+parameter_server: False
+filter_weight: False
+save_best_ckpt: True
+eval_start_epoch: 40
+eval_interval: 1
+enable_cache: False
+cache_session_id: ""
+mode_name: "GRAPH"
+acc_mode: "O1"
+
+# Export options
+device_id: 0
+width: 224
+height: 224
+file_name: "resnet50"
+file_format: "AIR"
+ckpt_file: ""
+network_dataset: "resnet50_imagenet2012"
+
+---
+# Help description for each configuration
+enable_modelarts: "Whether training on modelarts, default: False"
+data_url: "Dataset url for obs"
+checkpoint_url: "The location of checkpoint for obs"
+data_path: "Dataset path for local"
+output_path: "Training output path for local"
+load_path: "The location of checkpoint for obs"
+device_target: "Target device type, available: [Ascend, GPU, CPU]"
+enable_profiling: "Whether enable profiling while training, default: False"
+num_classes: "Class for dataset"
+batch_size: "Batch size for training and evaluation"
+epoch_size: "Total training epochs."
+checkpoint_path: "The location of the checkpoint file."
+checkpoint_file_path: "The location of the checkpoint file."
+
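
This new recipe ships with `acc_mode: "O1"` baked in, while every existing config gains a default of `"O0"`. A minimal sketch of loading such a two-document YAML (settings first, then help text) into an attribute-style config object; the real scripts use their own parser in `src/model_utils/config.py`, so this PyYAML loader and the filename are only illustrative:

```python
import yaml  # assumes PyYAML is available
from types import SimpleNamespace

def load_config(path):
    # The file holds two YAML documents separated by '---';
    # only the first one carries the actual settings.
    with open(path) as stream:
        settings = next(yaml.safe_load_all(stream))
    return SimpleNamespace(**settings)

config = load_config("resnet50_imagenet2012_acc_config.yaml")  # hypothetical filename
assert config.acc_mode == "O1" and config.net_name == "resnet50"
```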

View File

@@ -51,6 +51,7 @@ eval_interval: 1
enable_cache: False
cache_session_id: ""
mode_name: "GRAPH"
+acc_mode: "O0"

# Export options
device_id: 0

View File

@@ -51,6 +51,7 @@ eval_interval: 1
enable_cache: False
cache_session_id: ""
mode_name: "GRAPH"
+acc_mode: "O0"

# Export options
device_id: 0

View File

@@ -50,6 +50,7 @@ eval_interval: 1
enable_cache: False
cache_session_id: ""
mode_name: "GRAPH"
+acc_mode: "O0"

# Export options
device_id: 0

View File

@@ -25,6 +25,7 @@ eval: False
save_ckpt: False
mode_name: "GRAPH"
dtype: "fp16"
+acc_mode: "O0"

# Export options
device_id: 0

View File

@@ -51,6 +51,7 @@ eval_interval: 1
enable_cache: False
cache_session_id: ""
mode_name: "GRAPH"
+acc_mode: "O0"

# Export options
device_id: 0

View File

@@ -105,7 +105,8 @@ def set_parameter():
                                              gradients_mean=True)
        set_algo_parameters(elementwise_op_strategy_follow=True)
        if config.net_name == "resnet50" or config.net_name == "se-resnet50":
-            context.set_auto_parallel_context(all_reduce_fusion_config=[85, 160])
+            if config.acc_mode not in ["O1", "O2"]:
+                context.set_auto_parallel_context(all_reduce_fusion_config=[85, 160])
        elif config.net_name == "resnet101":
            context.set_auto_parallel_context(all_reduce_fusion_config=[80, 210, 313])
        init()
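
Under O1/O2 the manually pinned all-reduce fusion split points are now skipped, presumably because the acc passes reorganize gradients themselves. The decision reduces to a small pure function, sketched standalone here with `None` meaning "leave fusion to the framework":

```python
def all_reduce_fusion_indices(net_name, acc_mode):
    # Mirrors the branch above: pin fusion split points manually only
    # when no acc pass (O1/O2) is active for resnet50-style networks.
    if net_name in ("resnet50", "se-resnet50"):
        return None if acc_mode in ("O1", "O2") else [85, 160]
    if net_name == "resnet101":
        return [80, 210, 313]
    return None

assert all_reduce_fusion_indices("resnet50", "O1") is None
assert all_reduce_fusion_indices("resnet50", "O0") == [85, 160]
```
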
@@ -228,7 +229,8 @@ def train_net():
        model = Model(net, loss_fn=loss, optimizer=opt, metrics=metrics, eval_network=dist_eval_network)
    else:
        model = Model(net, loss_fn=loss, optimizer=opt, loss_scale_manager=loss_scale, metrics=metrics,
-                      amp_level="O2", keep_batchnorm_fp32=False, eval_network=dist_eval_network)
+                      amp_level="O2", acc_level=config.acc_mode, keep_batchnorm_fp32=False,
+                      eval_network=dist_eval_network)

    if config.optimizer == "Thor" and config.dataset == "imagenet2012":
        from src.lr_generator import get_thor_damping