support dynamic frequency in thor

This commit is contained in:
mwang 2021-03-23 16:32:51 +08:00
parent a45f03eebf
commit 65796b4cc5
3 changed files with 41 additions and 5 deletions

View File

@ -37,6 +37,8 @@ config = ed({
"damping_init": 0.03,
"damping_decay": 0.87,
"frequency": 834,
"use_dynamic_frequency": False,
"first_stage_steps": 835,
})
# config for resnet50, imagenet2012, GPU
@ -59,4 +61,6 @@ config_gpu = ed({
"damping_init": 0.02345,
"damping_decay": 0.5467,
"frequency": 834,
"use_dynamic_frequency": False,
"first_stage_steps": 835,
})

View File

@ -105,10 +105,13 @@ class Model_Thor(Model):
"""
def __init__(self, network, loss_fn=None, optimizer=None, metrics=None, eval_network=None,
eval_indexes=None, amp_level="O0", frequency=834, **kwargs):
eval_indexes=None, amp_level="O0", frequency=834, use_dynamic_frequency=False,
first_stage_steps=5, **kwargs):
super(Model_Thor, self).__init__(network, loss_fn, optimizer, metrics, eval_network,
eval_indexes, amp_level, **kwargs)
self._frequency = frequency
self._use_dynamic_frequency = use_dynamic_frequency
self._first_stage_steps = first_stage_steps
self._train_network = self._build_train_network()
def _exec_preprocess(self, network, is_train, phase, dataset, dataset_sink_mode, sink_size=-1,
@ -128,6 +131,25 @@ class Model_Thor(Model):
return dataset_helper, network
def _get_iter_second_steps(self, cb_params, sink_size):
"""get first stage steps for second order."""
iter_second_steps = 1
if self._use_dynamic_frequency:
global_steps = (cb_params.cur_epoch_num - 1) * sink_size + cb_params.cur_step_num
if global_steps <= self._first_stage_steps:
iter_second_steps = self._first_stage_steps
return iter_second_steps
def _get_ascend_sink_count(self, cb_params, dataset_helper, sink_size, iter_first_order, ori_sink_count):
"""get ascend sink count for each epoch."""
if context.get_context("device_target") == "Ascend":
if self._use_dynamic_frequency and cb_params.cur_epoch_num == 1:
fix_fre_sink_size = sink_size - self._first_stage_steps - iter_first_order
first_epoch_sink_count = math.ceil(fix_fre_sink_size / self._frequency) * 2 + 2
dataset_helper.iter.sink_count = first_epoch_sink_count
else:
dataset_helper.iter.sink_count = ori_sink_count
def _train_dataset_sink_process(self, epoch, train_dataset, list_callback=None, cb_params=None, sink_size=-1):
"""
Training process. The data would be passed to network through dataset channel.
@ -174,9 +196,12 @@ class Model_Thor(Model):
train_network_init_flag = True
has_do_dataset_init = False
ori_sink_count = dataset_helper.iter.sink_count
for i in range(epoch):
cb_params.cur_epoch_num = i + 1
list_callback.epoch_begin(run_context)
self._get_ascend_sink_count(cb_params, dataset_helper, sink_size, iter_first_order, ori_sink_count)
# for data sink dataset_helper only iter once, other wise iter epoch_size times.
for inputs in dataset_helper:
if _need_to_full() and context.get_context("device_target") == "GPU":
@ -188,9 +213,14 @@ class Model_Thor(Model):
if train_network_init_flag:
self._train_network.add_flags_recursive(thor=True)
self._train_network.phase = 'train0'
switch_branch_one = not switch_branch_one
outputs = self._train_network(*inputs)
cb_params.net_outputs = outputs
is_first_stage = self._use_dynamic_frequency and cb_params.cur_epoch_num == 1 \
and cb_params.cur_step_num < self._first_stage_steps
if is_first_stage:
continue
else:
switch_branch_one = not switch_branch_one
list_callback.step_end(run_context)
else:
cb_params.cur_step_num += 1
@ -207,7 +237,7 @@ class Model_Thor(Model):
list_callback.step_end(run_context)
else:
if switch_branch_one:
cb_params.cur_step_num += 1
cb_params.cur_step_num += self._get_iter_second_steps(cb_params, sink_size)
if train_network_init_flag:
self._train_network.add_flags_recursive(thor=True)
self._train_network.phase = 'train0'

View File

@ -125,7 +125,9 @@ if __name__ == '__main__':
config.weight_decay, config.loss_scale)
loss_scale = FixedLossScaleManager(config.loss_scale, drop_overflow_update=False)
model = Model(net, loss_fn=loss, optimizer=opt, amp_level='O2', loss_scale_manager=loss_scale,
keep_batchnorm_fp32=False, metrics={'acc'}, frequency=config.frequency)
keep_batchnorm_fp32=False, metrics={'acc'}, frequency=config.frequency,
use_dynamic_frequency=config.use_dynamic_frequency,
first_stage_steps=config.first_stage_steps)
# define callbacks
time_cb = TimeMonitor(data_size=step_size)