update resnet acc_mode scripts.
This commit is contained in:
parent
86eb2a9605
commit
1de2366cda
|
@ -40,6 +40,7 @@ class AutoAcc:
|
||||||
def __init__(self, level, kwargs):
|
def __init__(self, level, kwargs):
|
||||||
if level not in _acc_config_level.keys():
|
if level not in _acc_config_level.keys():
|
||||||
level = 'O0'
|
level = 'O0'
|
||||||
|
self.level = level
|
||||||
acc_config = _acc_config_level[level]
|
acc_config = _acc_config_level[level]
|
||||||
self._acc_config = acc_config
|
self._acc_config = acc_config
|
||||||
self._fn_flag = True
|
self._fn_flag = True
|
||||||
|
|
|
@ -73,16 +73,16 @@ class OptimizerProcess:
|
||||||
params_gc_value.append(param)
|
params_gc_value.append(param)
|
||||||
else:
|
else:
|
||||||
params_value.append(param)
|
params_value.append(param)
|
||||||
if params_gc_value:
|
if params_gc_value:
|
||||||
new_group_param = copy.deepcopy(group_param)
|
new_group_param = copy.deepcopy(group_param)
|
||||||
new_group_param['params'] = params_gc_value
|
new_group_param['params'] = params_gc_value
|
||||||
new_group_param['grad_centralization'] = True
|
new_group_param['grad_centralization'] = True
|
||||||
group_params.append(new_group_param)
|
group_params.append(new_group_param)
|
||||||
if params_value:
|
if params_value:
|
||||||
new_group_param = copy.deepcopy(group_param)
|
new_group_param = copy.deepcopy(group_param)
|
||||||
new_group_param['params'] = params_value
|
new_group_param['params'] = params_value
|
||||||
group_params.append(new_group_param)
|
group_params.append(new_group_param)
|
||||||
self.origin_params = group_params
|
self.origin_params = group_params
|
||||||
|
|
||||||
def generate_new_optimizer(self):
|
def generate_new_optimizer(self):
|
||||||
"""Generate new optimizer."""
|
"""Generate new optimizer."""
|
||||||
|
@ -135,6 +135,7 @@ class ParameterProcess:
|
||||||
else:
|
else:
|
||||||
group_params = []
|
group_params = []
|
||||||
params_name = [param.name for param in parameters]
|
params_name = [param.name for param in parameters]
|
||||||
|
new_params_count = copy.deepcopy(params_name)
|
||||||
for group_param in origin_params_copy:
|
for group_param in origin_params_copy:
|
||||||
if 'order_params' in group_param.keys():
|
if 'order_params' in group_param.keys():
|
||||||
new_group_param = copy.deepcopy(group_param)
|
new_group_param = copy.deepcopy(group_param)
|
||||||
|
@ -146,9 +147,16 @@ class ParameterProcess:
|
||||||
if param.name in params_name:
|
if param.name in params_name:
|
||||||
index = params_name.index(param.name)
|
index = params_name.index(param.name)
|
||||||
params_value.append(parameters[index])
|
params_value.append(parameters[index])
|
||||||
|
new_params_count.remove(param.name)
|
||||||
new_group_param = copy.deepcopy(group_param)
|
new_group_param = copy.deepcopy(group_param)
|
||||||
new_group_param['params'] = params_value
|
new_group_param['params'] = params_value
|
||||||
group_params.append(new_group_param)
|
group_params.append(new_group_param)
|
||||||
|
if new_params_count:
|
||||||
|
params_value = []
|
||||||
|
for param in new_params_count:
|
||||||
|
index = params_name.index(param)
|
||||||
|
params_value.append(parameters[index])
|
||||||
|
group_params.append({"params": params_value})
|
||||||
return group_params
|
return group_params
|
||||||
|
|
||||||
_gradient_accumulation_op = C.MultitypeFuncGraph("gradient_accumulation_op")
|
_gradient_accumulation_op = C.MultitypeFuncGraph("gradient_accumulation_op")
|
||||||
|
|
|
@ -139,6 +139,7 @@ class Model:
|
||||||
self._device_number = _get_device_num()
|
self._device_number = _get_device_num()
|
||||||
self._global_rank = _get_global_rank()
|
self._global_rank = _get_global_rank()
|
||||||
self._parameter_broadcast = _get_parameter_broadcast()
|
self._parameter_broadcast = _get_parameter_broadcast()
|
||||||
|
self._metrics = metrics
|
||||||
|
|
||||||
self._check_amp_level_arg(optimizer, amp_level)
|
self._check_amp_level_arg(optimizer, amp_level)
|
||||||
self._check_for_graph_cell(kwargs)
|
self._check_for_graph_cell(kwargs)
|
||||||
|
@ -175,7 +176,7 @@ class Model:
|
||||||
|
|
||||||
def _check_kwargs(self, kwargs):
|
def _check_kwargs(self, kwargs):
|
||||||
for arg in kwargs:
|
for arg in kwargs:
|
||||||
if arg not in ['loss_scale_manager', 'keep_batchnorm_fp32', 'total_steps']:
|
if arg not in ['loss_scale_manager', 'keep_batchnorm_fp32']:
|
||||||
raise ValueError(f"Unsupported arg '{arg}'")
|
raise ValueError(f"Unsupported arg '{arg}'")
|
||||||
|
|
||||||
def _check_reuse_dataset(self, dataset):
|
def _check_reuse_dataset(self, dataset):
|
||||||
|
@ -187,15 +188,18 @@ class Model:
|
||||||
def _build_acc_network(self, kwargs):
|
def _build_acc_network(self, kwargs):
|
||||||
"""Build the acc network."""
|
"""Build the acc network."""
|
||||||
processor = acc.AutoAcc(self._acc_level, kwargs)
|
processor = acc.AutoAcc(self._acc_level, kwargs)
|
||||||
|
if processor.level not in ["O1", "O2"]:
|
||||||
|
return
|
||||||
if self._optimizer is None:
|
if self._optimizer is None:
|
||||||
logger.warning("In acc mode, the optimizer must be defined.")
|
logger.warning("In acc mode, the optimizer must be defined.")
|
||||||
return
|
return
|
||||||
if self._eval_network is None:
|
if self._eval_network is None and self._metrics is None:
|
||||||
logger.warning("In acc mode, the eval_network must be defined.")
|
logger.warning("In acc mode, the eval_network and metrics cannot be undefined at the same time.")
|
||||||
return
|
return
|
||||||
|
|
||||||
self._network, self._optimizer = processor.network_auto_process_train(self._network, self._optimizer)
|
self._network, self._optimizer = processor.network_auto_process_train(self._network, self._optimizer)
|
||||||
self._eval_network = processor.network_auto_process_eval(self._eval_network)
|
if self._eval_network is not None:
|
||||||
|
self._eval_network = processor.network_auto_process_eval(self._eval_network)
|
||||||
|
|
||||||
def _build_train_network(self):
|
def _build_train_network(self):
|
||||||
"""Build train network"""
|
"""Build train network"""
|
||||||
|
|
|
@ -48,6 +48,7 @@ eval_interval: 1
|
||||||
enable_cache: False
|
enable_cache: False
|
||||||
cache_session_id: ""
|
cache_session_id: ""
|
||||||
mode_name: "GRAPH"
|
mode_name: "GRAPH"
|
||||||
|
acc_mode: "O0"
|
||||||
|
|
||||||
# Export options
|
# Export options
|
||||||
device_id: 0
|
device_id: 0
|
||||||
|
|
|
@ -48,6 +48,7 @@ eval_interval: 1
|
||||||
enable_cache: False
|
enable_cache: False
|
||||||
cache_session_id: ""
|
cache_session_id: ""
|
||||||
mode_name: "GRAPH"
|
mode_name: "GRAPH"
|
||||||
|
acc_mode: "O0"
|
||||||
|
|
||||||
# Export options
|
# Export options
|
||||||
device_id: 0
|
device_id: 0
|
||||||
|
|
|
@ -50,6 +50,7 @@ eval_interval: 1
|
||||||
enable_cache: False
|
enable_cache: False
|
||||||
cache_session_id: ""
|
cache_session_id: ""
|
||||||
mode_name: "GRAPH"
|
mode_name: "GRAPH"
|
||||||
|
acc_mode: "O0"
|
||||||
|
|
||||||
# Export options
|
# Export options
|
||||||
device_id: 0
|
device_id: 0
|
||||||
|
|
|
@ -50,6 +50,7 @@ eval_interval: 1
|
||||||
enable_cache: False
|
enable_cache: False
|
||||||
cache_session_id: ""
|
cache_session_id: ""
|
||||||
mode_name: "GRAPH"
|
mode_name: "GRAPH"
|
||||||
|
acc_mode: "O0"
|
||||||
|
|
||||||
# Export options
|
# Export options
|
||||||
device_id: 0
|
device_id: 0
|
||||||
|
|
|
@ -48,6 +48,7 @@ eval_interval: 1
|
||||||
enable_cache: False
|
enable_cache: False
|
||||||
cache_session_id: ""
|
cache_session_id: ""
|
||||||
mode_name: "GRAPH"
|
mode_name: "GRAPH"
|
||||||
|
acc_mode: "O0"
|
||||||
|
|
||||||
# Export options
|
# Export options
|
||||||
device_id: 0
|
device_id: 0
|
||||||
|
|
|
@ -0,0 +1,78 @@
|
||||||
|
# Builtin Configurations (DO NOT CHANGE THESE CONFIGURATIONS unless you know exactly what you are doing)
|
||||||
|
enable_modelarts: False
|
||||||
|
# Url for modelarts
|
||||||
|
data_url: ""
|
||||||
|
train_url: ""
|
||||||
|
checkpoint_url: ""
|
||||||
|
# Path for local
|
||||||
|
run_distribute: False
|
||||||
|
enable_profiling: False
|
||||||
|
data_path: "/cache/data"
|
||||||
|
output_path: "/cache/train"
|
||||||
|
load_path: "/cache/checkpoint_path/"
|
||||||
|
device_target: "Ascend"
|
||||||
|
checkpoint_path: "./checkpoint/"
|
||||||
|
checkpoint_file_path: ""
|
||||||
|
|
||||||
|
# ==============================================================================
|
||||||
|
# Training options
|
||||||
|
optimizer: "Momentum"
|
||||||
|
infer_label: ""
|
||||||
|
class_num: 1001
|
||||||
|
batch_size: 256
|
||||||
|
loss_scale: 1024
|
||||||
|
momentum: 0.9
|
||||||
|
weight_decay: 0.0001
|
||||||
|
epoch_size: 90
|
||||||
|
pretrain_epoch_size: 0
|
||||||
|
save_checkpoint: True
|
||||||
|
save_checkpoint_epochs: 5
|
||||||
|
keep_checkpoint_max: 10
|
||||||
|
warmup_epochs: 0
|
||||||
|
lr_decay_mode: "linear"
|
||||||
|
use_label_smooth: True
|
||||||
|
label_smooth_factor: 0.1
|
||||||
|
lr_init: 0
|
||||||
|
lr_max: 0.8
|
||||||
|
lr_end: 0.0
|
||||||
|
|
||||||
|
net_name: "resnet50"
|
||||||
|
dataset: "imagenet2012"
|
||||||
|
device_num: 1
|
||||||
|
pre_trained: ""
|
||||||
|
run_eval: False
|
||||||
|
eval_dataset_path: ""
|
||||||
|
parameter_server: False
|
||||||
|
filter_weight: False
|
||||||
|
save_best_ckpt: True
|
||||||
|
eval_start_epoch: 40
|
||||||
|
eval_interval: 1
|
||||||
|
enable_cache: False
|
||||||
|
cache_session_id: ""
|
||||||
|
mode_name: "GRAPH"
|
||||||
|
acc_mode: "O1"
|
||||||
|
|
||||||
|
# Export options
|
||||||
|
device_id: 0
|
||||||
|
width: 224
|
||||||
|
height: 224
|
||||||
|
file_name: "resnet50"
|
||||||
|
file_format: "AIR"
|
||||||
|
ckpt_file: ""
|
||||||
|
network_dataset: "resnet50_imagenet2012"
|
||||||
|
|
||||||
|
---
|
||||||
|
# Help description for each configuration
|
||||||
|
enable_modelarts: "Whether training on modelarts, default: False"
|
||||||
|
data_url: "Dataset url for obs"
|
||||||
|
checkpoint_url: "The location of checkpoint for obs"
|
||||||
|
data_path: "Dataset path for local"
|
||||||
|
output_path: "Training output path for local"
|
||||||
|
load_path: "The location of checkpoint for local"
|
||||||
|
device_target: "Target device type, available: [Ascend, GPU, CPU]"
|
||||||
|
enable_profiling: "Whether enable profiling while training, default: False"
|
||||||
|
num_classes: "Number of classes in the dataset"
|
||||||
|
batch_size: "Batch size for training and evaluation"
|
||||||
|
epoch_size: "Total training epochs."
|
||||||
|
checkpoint_path: "The location of the checkpoint file."
|
||||||
|
checkpoint_file_path: "The location of the checkpoint file."
|
|
@ -51,6 +51,7 @@ eval_interval: 1
|
||||||
enable_cache: False
|
enable_cache: False
|
||||||
cache_session_id: ""
|
cache_session_id: ""
|
||||||
mode_name: "GRAPH"
|
mode_name: "GRAPH"
|
||||||
|
acc_mode: "O0"
|
||||||
|
|
||||||
# Export options
|
# Export options
|
||||||
device_id: 0
|
device_id: 0
|
||||||
|
|
|
@ -51,6 +51,7 @@ eval_interval: 1
|
||||||
enable_cache: False
|
enable_cache: False
|
||||||
cache_session_id: ""
|
cache_session_id: ""
|
||||||
mode_name: "GRAPH"
|
mode_name: "GRAPH"
|
||||||
|
acc_mode: "O0"
|
||||||
|
|
||||||
# Export options
|
# Export options
|
||||||
device_id: 0
|
device_id: 0
|
|
@ -50,6 +50,7 @@ eval_interval: 1
|
||||||
enable_cache: False
|
enable_cache: False
|
||||||
cache_session_id: ""
|
cache_session_id: ""
|
||||||
mode_name: "GRAPH"
|
mode_name: "GRAPH"
|
||||||
|
acc_mode: "O0"
|
||||||
|
|
||||||
# Export options
|
# Export options
|
||||||
device_id: 0
|
device_id: 0
|
||||||
|
|
|
@ -25,6 +25,7 @@ eval: False
|
||||||
save_ckpt: False
|
save_ckpt: False
|
||||||
mode_name: "GRAPH"
|
mode_name: "GRAPH"
|
||||||
dtype: "fp16"
|
dtype: "fp16"
|
||||||
|
acc_mode: "O0"
|
||||||
|
|
||||||
# Export options
|
# Export options
|
||||||
device_id: 0
|
device_id: 0
|
||||||
|
|
|
@ -51,6 +51,7 @@ eval_interval: 1
|
||||||
enable_cache: False
|
enable_cache: False
|
||||||
cache_session_id: ""
|
cache_session_id: ""
|
||||||
mode_name: "GRAPH"
|
mode_name: "GRAPH"
|
||||||
|
acc_mode: "O0"
|
||||||
|
|
||||||
# Export options
|
# Export options
|
||||||
device_id: 0
|
device_id: 0
|
||||||
|
|
|
@ -105,7 +105,8 @@ def set_parameter():
|
||||||
gradients_mean=True)
|
gradients_mean=True)
|
||||||
set_algo_parameters(elementwise_op_strategy_follow=True)
|
set_algo_parameters(elementwise_op_strategy_follow=True)
|
||||||
if config.net_name == "resnet50" or config.net_name == "se-resnet50":
|
if config.net_name == "resnet50" or config.net_name == "se-resnet50":
|
||||||
context.set_auto_parallel_context(all_reduce_fusion_config=[85, 160])
|
if config.acc_mode not in ["O1", "O2"]:
|
||||||
|
context.set_auto_parallel_context(all_reduce_fusion_config=[85, 160])
|
||||||
elif config.net_name == "resnet101":
|
elif config.net_name == "resnet101":
|
||||||
context.set_auto_parallel_context(all_reduce_fusion_config=[80, 210, 313])
|
context.set_auto_parallel_context(all_reduce_fusion_config=[80, 210, 313])
|
||||||
init()
|
init()
|
||||||
|
@ -228,7 +229,8 @@ def train_net():
|
||||||
model = Model(net, loss_fn=loss, optimizer=opt, metrics=metrics, eval_network=dist_eval_network)
|
model = Model(net, loss_fn=loss, optimizer=opt, metrics=metrics, eval_network=dist_eval_network)
|
||||||
else:
|
else:
|
||||||
model = Model(net, loss_fn=loss, optimizer=opt, loss_scale_manager=loss_scale, metrics=metrics,
|
model = Model(net, loss_fn=loss, optimizer=opt, loss_scale_manager=loss_scale, metrics=metrics,
|
||||||
amp_level="O2", keep_batchnorm_fp32=False, eval_network=dist_eval_network)
|
amp_level="O2", acc_level=config.acc_mode, keep_batchnorm_fp32=False,
|
||||||
|
eval_network=dist_eval_network)
|
||||||
|
|
||||||
if config.optimizer == "Thor" and config.dataset == "imagenet2012":
|
if config.optimizer == "Thor" and config.dataset == "imagenet2012":
|
||||||
from src.lr_generator import get_thor_damping
|
from src.lr_generator import get_thor_damping
|
||||||
|
|
Loading…
Reference in New Issue