update resnet acc_mode scripts.
This commit is contained in:
parent
86eb2a9605
commit
1de2366cda
|
@ -40,6 +40,7 @@ class AutoAcc:
|
|||
def __init__(self, level, kwargs):
|
||||
if level not in _acc_config_level.keys():
|
||||
level = 'O0'
|
||||
self.level = level
|
||||
acc_config = _acc_config_level[level]
|
||||
self._acc_config = acc_config
|
||||
self._fn_flag = True
|
||||
|
|
|
@ -73,16 +73,16 @@ class OptimizerProcess:
|
|||
params_gc_value.append(param)
|
||||
else:
|
||||
params_value.append(param)
|
||||
if params_gc_value:
|
||||
new_group_param = copy.deepcopy(group_param)
|
||||
new_group_param['params'] = params_gc_value
|
||||
new_group_param['grad_centralization'] = True
|
||||
group_params.append(new_group_param)
|
||||
if params_value:
|
||||
new_group_param = copy.deepcopy(group_param)
|
||||
new_group_param['params'] = params_value
|
||||
group_params.append(new_group_param)
|
||||
self.origin_params = group_params
|
||||
if params_gc_value:
|
||||
new_group_param = copy.deepcopy(group_param)
|
||||
new_group_param['params'] = params_gc_value
|
||||
new_group_param['grad_centralization'] = True
|
||||
group_params.append(new_group_param)
|
||||
if params_value:
|
||||
new_group_param = copy.deepcopy(group_param)
|
||||
new_group_param['params'] = params_value
|
||||
group_params.append(new_group_param)
|
||||
self.origin_params = group_params
|
||||
|
||||
def generate_new_optimizer(self):
|
||||
"""Generate new optimizer."""
|
||||
|
@ -135,6 +135,7 @@ class ParameterProcess:
|
|||
else:
|
||||
group_params = []
|
||||
params_name = [param.name for param in parameters]
|
||||
new_params_count = copy.deepcopy(params_name)
|
||||
for group_param in origin_params_copy:
|
||||
if 'order_params' in group_param.keys():
|
||||
new_group_param = copy.deepcopy(group_param)
|
||||
|
@ -146,9 +147,16 @@ class ParameterProcess:
|
|||
if param.name in params_name:
|
||||
index = params_name.index(param.name)
|
||||
params_value.append(parameters[index])
|
||||
new_params_count.remove(param.name)
|
||||
new_group_param = copy.deepcopy(group_param)
|
||||
new_group_param['params'] = params_value
|
||||
group_params.append(new_group_param)
|
||||
if new_params_count:
|
||||
params_value = []
|
||||
for param in new_params_count:
|
||||
index = params_name.index(param)
|
||||
params_value.append(parameters[index])
|
||||
group_params.append({"params": params_value})
|
||||
return group_params
|
||||
|
||||
_gradient_accumulation_op = C.MultitypeFuncGraph("gradient_accumulation_op")
|
||||
|
|
|
@ -139,6 +139,7 @@ class Model:
|
|||
self._device_number = _get_device_num()
|
||||
self._global_rank = _get_global_rank()
|
||||
self._parameter_broadcast = _get_parameter_broadcast()
|
||||
self._metrics = metrics
|
||||
|
||||
self._check_amp_level_arg(optimizer, amp_level)
|
||||
self._check_for_graph_cell(kwargs)
|
||||
|
@ -175,7 +176,7 @@ class Model:
|
|||
|
||||
def _check_kwargs(self, kwargs):
|
||||
for arg in kwargs:
|
||||
if arg not in ['loss_scale_manager', 'keep_batchnorm_fp32', 'total_steps']:
|
||||
if arg not in ['loss_scale_manager', 'keep_batchnorm_fp32']:
|
||||
raise ValueError(f"Unsupported arg '{arg}'")
|
||||
|
||||
def _check_reuse_dataset(self, dataset):
|
||||
|
@ -187,15 +188,18 @@ class Model:
|
|||
def _build_acc_network(self, kwargs):
|
||||
"""Build the acc network."""
|
||||
processor = acc.AutoAcc(self._acc_level, kwargs)
|
||||
if processor.level not in ["O1", "O2"]:
|
||||
return
|
||||
if self._optimizer is None:
|
||||
logger.warning("In acc mode, the optimizer must be defined.")
|
||||
return
|
||||
if self._eval_network is None:
|
||||
logger.warning("In acc mode, the eval_network must be defined.")
|
||||
if self._eval_network is None and self._metrics is None:
|
||||
logger.warning("In acc mode, the eval_network and metrics cannot be undefined at the same time.")
|
||||
return
|
||||
|
||||
self._network, self._optimizer = processor.network_auto_process_train(self._network, self._optimizer)
|
||||
self._eval_network = processor.network_auto_process_eval(self._eval_network)
|
||||
if self._eval_network is not None:
|
||||
self._eval_network = processor.network_auto_process_eval(self._eval_network)
|
||||
|
||||
def _build_train_network(self):
|
||||
"""Build train network"""
|
||||
|
|
|
@ -48,6 +48,7 @@ eval_interval: 1
|
|||
enable_cache: False
|
||||
cache_session_id: ""
|
||||
mode_name: "GRAPH"
|
||||
acc_mode: "O0"
|
||||
|
||||
# Export options
|
||||
device_id: 0
|
||||
|
|
|
@ -48,6 +48,7 @@ eval_interval: 1
|
|||
enable_cache: False
|
||||
cache_session_id: ""
|
||||
mode_name: "GRAPH"
|
||||
acc_mode: "O0"
|
||||
|
||||
# Export options
|
||||
device_id: 0
|
||||
|
|
|
@ -50,6 +50,7 @@ eval_interval: 1
|
|||
enable_cache: False
|
||||
cache_session_id: ""
|
||||
mode_name: "GRAPH"
|
||||
acc_mode: "O0"
|
||||
|
||||
# Export options
|
||||
device_id: 0
|
||||
|
|
|
@ -50,6 +50,7 @@ eval_interval: 1
|
|||
enable_cache: False
|
||||
cache_session_id: ""
|
||||
mode_name: "GRAPH"
|
||||
acc_mode: "O0"
|
||||
|
||||
# Export options
|
||||
device_id: 0
|
||||
|
|
|
@ -48,6 +48,7 @@ eval_interval: 1
|
|||
enable_cache: False
|
||||
cache_session_id: ""
|
||||
mode_name: "GRAPH"
|
||||
acc_mode: "O0"
|
||||
|
||||
# Export options
|
||||
device_id: 0
|
||||
|
|
|
@ -0,0 +1,78 @@
|
|||
# Builtin Configurations (DO NOT CHANGE THESE CONFIGURATIONS unless you know exactly what you are doing)
|
||||
enable_modelarts: False
|
||||
# Url for modelarts
|
||||
data_url: ""
|
||||
train_url: ""
|
||||
checkpoint_url: ""
|
||||
# Path for local
|
||||
run_distribute: False
|
||||
enable_profiling: False
|
||||
data_path: "/cache/data"
|
||||
output_path: "/cache/train"
|
||||
load_path: "/cache/checkpoint_path/"
|
||||
device_target: "Ascend"
|
||||
checkpoint_path: "./checkpoint/"
|
||||
checkpoint_file_path: ""
|
||||
|
||||
# ==============================================================================
|
||||
# Training options
|
||||
optimizer: "Momentum"
|
||||
infer_label: ""
|
||||
class_num: 1001
|
||||
batch_size: 256
|
||||
loss_scale: 1024
|
||||
momentum: 0.9
|
||||
weight_decay: 0.0001
|
||||
epoch_size: 90
|
||||
pretrain_epoch_size: 0
|
||||
save_checkpoint: True
|
||||
save_checkpoint_epochs: 5
|
||||
keep_checkpoint_max: 10
|
||||
warmup_epochs: 0
|
||||
lr_decay_mode: "linear"
|
||||
use_label_smooth: True
|
||||
label_smooth_factor: 0.1
|
||||
lr_init: 0
|
||||
lr_max: 0.8
|
||||
lr_end: 0.0
|
||||
|
||||
net_name: "resnet50"
|
||||
dataset: "imagenet2012"
|
||||
device_num: 1
|
||||
pre_trained: ""
|
||||
run_eval: False
|
||||
eval_dataset_path: ""
|
||||
parameter_server: False
|
||||
filter_weight: False
|
||||
save_best_ckpt: True
|
||||
eval_start_epoch: 40
|
||||
eval_interval: 1
|
||||
enable_cache: False
|
||||
cache_session_id: ""
|
||||
mode_name: "GRAPH"
|
||||
acc_mode: "O1"
|
||||
|
||||
# Export options
|
||||
device_id: 0
|
||||
width: 224
|
||||
height: 224
|
||||
file_name: "resnet50"
|
||||
file_format: "AIR"
|
||||
ckpt_file: ""
|
||||
network_dataset: "resnet50_imagenet2012"
|
||||
|
||||
---
|
||||
# Help description for each configuration
|
||||
enable_modelarts: "Whether training on modelarts, default: False"
|
||||
data_url: "Dataset url for obs"
|
||||
checkpoint_url: "The location of checkpoint for obs"
|
||||
data_path: "Dataset path for local"
|
||||
output_path: "Training output path for local"
|
||||
load_path: "The location of checkpoint for local"
|
||||
device_target: "Target device type, available: [Ascend, GPU, CPU]"
|
||||
enable_profiling: "Whether enable profiling while training, default: False"
|
||||
num_classes: "Number of classes in the dataset"
|
||||
batch_size: "Batch size for training and evaluation"
|
||||
epoch_size: "Total training epochs."
|
||||
checkpoint_path: "The location of the checkpoint file."
|
||||
checkpoint_file_path: "The location of the checkpoint file."
|
|
@ -51,6 +51,7 @@ eval_interval: 1
|
|||
enable_cache: False
|
||||
cache_session_id: ""
|
||||
mode_name: "GRAPH"
|
||||
acc_mode: "O0"
|
||||
|
||||
# Export options
|
||||
device_id: 0
|
||||
|
|
|
@ -51,6 +51,7 @@ eval_interval: 1
|
|||
enable_cache: False
|
||||
cache_session_id: ""
|
||||
mode_name: "GRAPH"
|
||||
acc_mode: "O0"
|
||||
|
||||
# Export options
|
||||
device_id: 0
|
|
@ -50,6 +50,7 @@ eval_interval: 1
|
|||
enable_cache: False
|
||||
cache_session_id: ""
|
||||
mode_name: "GRAPH"
|
||||
acc_mode: "O0"
|
||||
|
||||
# Export options
|
||||
device_id: 0
|
||||
|
|
|
@ -25,6 +25,7 @@ eval: False
|
|||
save_ckpt: False
|
||||
mode_name: "GRAPH"
|
||||
dtype: "fp16"
|
||||
acc_mode: "O0"
|
||||
|
||||
# Export options
|
||||
device_id: 0
|
||||
|
|
|
@ -51,6 +51,7 @@ eval_interval: 1
|
|||
enable_cache: False
|
||||
cache_session_id: ""
|
||||
mode_name: "GRAPH"
|
||||
acc_mode: "O0"
|
||||
|
||||
# Export options
|
||||
device_id: 0
|
||||
|
|
|
@ -105,7 +105,8 @@ def set_parameter():
|
|||
gradients_mean=True)
|
||||
set_algo_parameters(elementwise_op_strategy_follow=True)
|
||||
if config.net_name == "resnet50" or config.net_name == "se-resnet50":
|
||||
context.set_auto_parallel_context(all_reduce_fusion_config=[85, 160])
|
||||
if config.acc_mode not in ["O1", "O2"]:
|
||||
context.set_auto_parallel_context(all_reduce_fusion_config=[85, 160])
|
||||
elif config.net_name == "resnet101":
|
||||
context.set_auto_parallel_context(all_reduce_fusion_config=[80, 210, 313])
|
||||
init()
|
||||
|
@ -228,7 +229,8 @@ def train_net():
|
|||
model = Model(net, loss_fn=loss, optimizer=opt, metrics=metrics, eval_network=dist_eval_network)
|
||||
else:
|
||||
model = Model(net, loss_fn=loss, optimizer=opt, loss_scale_manager=loss_scale, metrics=metrics,
|
||||
amp_level="O2", keep_batchnorm_fp32=False, eval_network=dist_eval_network)
|
||||
amp_level="O2", acc_level=config.acc_mode, keep_batchnorm_fp32=False,
|
||||
eval_network=dist_eval_network)
|
||||
|
||||
if config.optimizer == "Thor" and config.dataset == "imagenet2012":
|
||||
from src.lr_generator import get_thor_damping
|
||||
|
|
Loading…
Reference in New Issue