diff --git a/tests/st/networks/models/resnet50/scripts/run_resnet50_imagenet_4p.sh b/tests/st/networks/models/resnet50/scripts/run_resnet50_imagenet_4p.sh
new file mode 100644
index 00000000000..7ac913a9459
--- /dev/null
+++ b/tests/st/networks/models/resnet50/scripts/run_resnet50_imagenet_4p.sh
@@ -0,0 +1,46 @@
+#!/bin/bash
+# Copyright 2022 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================
+
+BASE_PATH=$(cd "$(dirname $0)"; pwd)
+
+export RANK_SIZE=4
+export RANK_TABLE_FILE="/home/workspace/mindspore_config/hccl/rank_tabel_4p/rank_table_4p_1.json"
+
+cpus=`cat /proc/cpuinfo| grep "processor"| wc -l`
+avg=`expr $cpus \/ 8`
+gap=`expr $avg \- 1`
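+# The two concurrent 4p jobs (resnet on devices 0-3, thor on devices 4-7)
+# split the host cores eight ways: rank j is pinned to cores j*avg..j*avg+gap.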
+rank_start=0
+for((i=0; i<$RANK_SIZE; i++))
+do
+    j=$((rank_start + i))
+    start=`expr $j \* $avg`
+    end=`expr $start \+ $gap`
+    cmdopt=$start"-"$end
+    export DEVICE_ID=$((rank_start + i))
+    export RANK_ID=${i}
+    rm -rf $BASE_PATH/../train_parallel$j
+    mkdir $BASE_PATH/../train_parallel$j
+    cd $BASE_PATH/../train_parallel$j || exit
+    echo "start resnet training for rank $RANK_ID, device $DEVICE_ID"
+    (taskset -c $cmdopt python $BASE_PATH/../train_resnet50.py &> log; grep "===" log > resnet_$i.txt) &
+    cd ..
+done
+wait
+echo "result:"
+cat $BASE_PATH/../train_parallel0/log
+cat $BASE_PATH/../train_parallel*/resnet_*.txt
diff --git a/tests/st/networks/models/resnet50/scripts/run_resnet_thor_imagenet_4p.sh b/tests/st/networks/models/resnet50/scripts/run_resnet_thor_imagenet_4p.sh
new file mode 100644
index 00000000000..0f5b1496305
--- /dev/null
+++ b/tests/st/networks/models/resnet50/scripts/run_resnet_thor_imagenet_4p.sh
@@ -0,0 +1,44 @@
+#!/bin/bash
+# Copyright 2022 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================
+
+BASE_PATH=$(cd "$(dirname $0)"; pwd)
+
+export RANK_SIZE=4
+export RANK_TABLE_FILE="/home/workspace/mindspore_config/hccl/rank_tabel_4p/rank_table_4p_2.json"
+
+cpus=`cat /proc/cpuinfo| grep "processor"| wc -l`
+avg=`expr $cpus \/ 8`
+gap=`expr $avg \- 1`
+rank_start=4
+for((i=0; i<$RANK_SIZE; i++))
+do
+    j=$((rank_start + i))
+    start=`expr $j \* $avg`
+    end=`expr $start \+ $gap`
+    cmdopt=$start"-"$end
+    export DEVICE_ID=$((rank_start + i))
+    export RANK_ID=${i}
+    rm -rf $BASE_PATH/../train_parallel$j
+    mkdir $BASE_PATH/../train_parallel$j
+    cd $BASE_PATH/../train_parallel$j || exit
+    echo "start resnet thor training for rank $RANK_ID, device $DEVICE_ID"
+    (taskset -c $cmdopt python $BASE_PATH/../train_resnet50_thor.py &> log; grep "===" log > thor_$i.txt) &
+    cd ..
+done
+wait
+echo "result:"
+cat $BASE_PATH/../train_parallel5/log
+cat $BASE_PATH/../train_parallel*/thor_*.txt
diff --git a/tests/st/networks/models/resnet50/scripts/run_train.sh b/tests/st/networks/models/resnet50/scripts/run_train.sh
new file mode 100644
index 00000000000..979123de644
--- /dev/null
+++ b/tests/st/networks/models/resnet50/scripts/run_train.sh
@@ -0,0 +1,20 @@
+#!/bin/bash
+# Copyright 2022 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================
+
+BASE_PATH=$(cd "$(dirname $0)"; pwd)
+bash $BASE_PATH/run_resnet50_imagenet_4p.sh &
+bash $BASE_PATH/run_resnet_thor_imagenet_4p.sh &
+wait
diff --git a/tests/st/networks/models/resnet50/src/callback.py b/tests/st/networks/models/resnet50/src/callback.py
new file mode 100644
index 00000000000..d8d74067de9
--- /dev/null
+++ b/tests/st/networks/models/resnet50/src/callback.py
@@ -0,0 +1,72 @@
+# Copyright 2022 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================
+"""custom callback."""
+import time
+import numpy as np
+import mindspore as ms
+from mindspore.train.callback import Callback
+
+
+class LossGet(Callback):
+    def __init__(self, per_print_times, data_size):
+        super(LossGet, self).__init__()
+        if not isinstance(per_print_times, int) or per_print_times < 0:
+            raise ValueError("print_step must be int and >= 0.")
+        self._per_print_times = per_print_times
+        self._loss = 0.0
+        self.data_size = data_size
+        self._epoch = 0
+        self.epoch_time = time.time()
+        self._per_step_mseconds = 0
+
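+    # step_end unwraps cb_params.net_outputs, which may be a Tensor or a
+    # tuple/list whose first element is the loss (e.g. (loss, overflow, sens)
+    # under loss scaling), and raises on nan/inf so a diverging run fails fast.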
+    def step_end(self, run_context):
+        cb_params = run_context.original_args()
+        loss = cb_params.net_outputs
+        self._epoch = cb_params.cur_epoch_num
+        if isinstance(loss, (tuple, list)):
+            if isinstance(loss[0], ms.Tensor) and isinstance(loss[0].asnumpy(), np.ndarray):
+                loss = loss[0]
+
+        if isinstance(loss, ms.Tensor) and isinstance(loss.asnumpy(), np.ndarray):
+            loss = np.mean(loss.asnumpy())
+
+        cur_step_in_epoch = (cb_params.cur_step_num - 1) % cb_params.batch_num + 1
+
+        if isinstance(loss, float) and (np.isnan(loss) or np.isinf(loss)):
+            raise ValueError("epoch: {} step: {}. Invalid loss, terminating training."
+                             .format(cb_params.cur_epoch_num, cur_step_in_epoch))
+        if self._per_print_times != 0 and cb_params.cur_step_num % self._per_print_times == 0:
+            self._loss = loss
+            print("epoch: %s step: %s, loss is %s" % (cb_params.cur_epoch_num,
+                                                      cur_step_in_epoch, loss), flush=True)
+
+    def epoch_begin(self, run_context):
+        self.epoch_time = time.time()
+
+    def epoch_end(self, run_context):
+        epoch_mseconds = (time.time() - self.epoch_time) * 1000
+        self._per_step_mseconds = epoch_mseconds / self.data_size
+
+    def get_loss(self):
+        return self._loss
+
+    def get_per_step_time(self):
+        return self._per_step_mseconds
+
+    def get_epoch(self):
+        return self._epoch
diff --git a/tests/st/networks/models/resnet50/test_resnet50_imagenet_and_thor.py b/tests/st/networks/models/resnet50/test_resnet50_imagenet_and_thor.py
index 81e214480e2..9cf16bb9f02 100644
--- a/tests/st/networks/models/resnet50/test_resnet50_imagenet_and_thor.py
+++ b/tests/st/networks/models/resnet50/test_resnet50_imagenet_and_thor.py
@@ -1,4 +1,4 @@
-# Copyright 2020-2021 Huawei Technologies Co., Ltd
+# Copyright 2022 Huawei Technologies Co., Ltd
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -14,332 +14,63 @@
 # ============================================================================
 """train and evaluate resnet50 network on imagenet dataset"""
-
 import os
-import time
-from multiprocessing import Process, Queue
+import shutil
 import pytest
-import numpy as np
-
-from mindspore import context
-from mindspore.common.tensor import Tensor
-from mindspore.communication.management import init
-from mindspore.context import ParallelMode
-from mindspore.train.callback import Callback
-from mindspore.train.model import Model
-from mindspore.train.train_thor import ConvertModelUtils
-from mindspore.train.loss_scale_manager import FixedLossScaleManager
-from mindspore.nn.optim import thor
-import mindspore.dataset as ds
-import mindspore.nn as nn
-
-from tests.st.networks.models.resnet50.src.metric import DistAccuracy, ClassifyCorrectCell
-from tests.st.networks.models.resnet50.src.dataset import create_dataset
-from tests.st.networks.models.resnet50.src.lr_generator import get_learning_rate
-from tests.st.networks.models.resnet50.src.config import config
-from tests.st.networks.models.resnet50.src.CrossEntropySmooth import CrossEntropySmooth
-from tests.st.networks.models.resnet50.src_thor.config import config as thor_config
-from tests.st.networks.models.resnet50.src_thor.dataset import create_dataset2 as create_dataset_thor
-from tests.st.networks.models.resnet50.src.resnet import resnet50
-
-MINDSPORE_HCCL_CONFIG_PATH = "/home/workspace/mindspore_config/hccl/rank_tabel_4p/rank_table_4p_1.json"
-MINDSPORE_HCCL_CONFIG_PATH_2 = "/home/workspace/mindspore_config/hccl/rank_tabel_4p/rank_table_4p_2.json"
-dataset_path = "/home/workspace/mindspore_dataset/imagenet/imagenet_original/train"
-eval_path = "/home/workspace/mindspore_dataset/imagenet/imagenet_original/val"
-
-np.random.seed(1)
-ds.config.set_seed(1)
-os.environ['GLOG_v'] = str(2)
-def get_thor_lr(global_step, lr_init, decay, total_epochs, steps_per_epoch, decay_epochs=100):
-    """get_model_lr"""
-    lr_each_step = []
-    total_steps = steps_per_epoch * total_epochs
-    for i in range(total_steps):
-        epoch = (i + 1) / steps_per_epoch
-        base = (1.0 - float(epoch) / total_epochs) ** decay
-        lr_local = lr_init * base
-        if epoch >= decay_epochs:
-            lr_local = lr_local * 0.5
-        if epoch >= decay_epochs + 1:
-            lr_local = lr_local * 0.5
-        lr_each_step.append(lr_local)
-    current_step = global_step
-    lr_each_step = np.array(lr_each_step).astype(np.float32)
-    learning_rate = lr_each_step[current_step:]
-    return learning_rate
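+# get_env_info dumps host state (CPU load, IO, memory, python processes and
+# NPU status) so resource problems on the CI machine show up in the test log.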
+def get_env_info():
+    print("================== CPU ======================")
+    os.system("top -bi -n 2 -d 0.02")
+    print("================= IO ====================")
+    os.system("iostat")
+    print("================= Memory =====================")
+    os.system("free -h")
+    print("================= Process ====================")
+    os.system("ps -ef | grep python")
+    print("================= NPU ====================")
+    os.system("npu-smi info")
 
 
-def get_thor_damping(global_step, damping_init, decay_rate, total_epochs, steps_per_epoch):
-    """get_model_damping"""
-    damping_each_step = []
-    total_steps = steps_per_epoch * total_epochs
-    for step in range(total_steps):
-        epoch = (step + 1) / steps_per_epoch
-        damping_here = damping_init * (decay_rate ** (epoch / 10))
-        damping_each_step.append(damping_here)
-    current_step = global_step
-    damping_each_step = np.array(damping_each_step).astype(np.float32)
-    damping_now = damping_each_step[current_step:]
-    return damping_now
-
-
-class LossGet(Callback):
-    def __init__(self, per_print_times, data_size):
-        super(LossGet, self).__init__()
-        if not isinstance(per_print_times, int) or per_print_times < 0:
-            raise ValueError("print_step must be int and >= 0.")
-        self._per_print_times = per_print_times
-        self._loss = 0.0
-        self.data_size = data_size
-        self._epoch = 0
-
-    def step_end(self, run_context):
-        cb_params = run_context.original_args()
-        loss = cb_params.net_outputs
-        self._epoch = cb_params.cur_epoch_num
-        if isinstance(loss, (tuple, list)):
-            if isinstance(loss[0], Tensor) and isinstance(loss[0].asnumpy(), np.ndarray):
-                loss = loss[0]
-
-        if isinstance(loss, Tensor) and isinstance(loss.asnumpy(), np.ndarray):
-            loss = np.mean(loss.asnumpy())
-
-        cur_step_in_epoch = (cb_params.cur_step_num - 1) % cb_params.batch_num + 1
-
-        if isinstance(loss, float) and (np.isnan(loss) or np.isinf(loss)):
-            raise ValueError("epoch: {} step: {}. Invalid loss, terminating training."
-                             .format(cb_params.cur_epoch_num, cur_step_in_epoch))
-        cur_step_in_epoch = (cb_params.cur_step_num - 1) % cb_params.batch_num + 1
-        if self._per_print_times != 0 and cb_params.cur_step_num % self._per_print_times == 0:
-            self._loss = loss
-            print("epoch: %s step: %s, loss is %s" % (cb_params.cur_epoch_num,
-                                                      cur_step_in_epoch, loss), flush=True)
-
-    def epoch_begin(self, run_context):
-        self.epoch_time = time.time()
-
-    def epoch_end(self, run_context):
-        epoch_mseconds = (time.time() - self.epoch_time) * 1000
-        self._per_step_mseconds = epoch_mseconds / self.data_size
-
-    def get_loss(self):
-        return self._loss
-
-    def get_per_step_time(self):
-        return self._per_step_mseconds
-
-    def get_epoch(self):
-        return self._epoch
-
-
-def train_and_eval(device_id, epoch_size, model, dataset, loss_cb, eval_dataset, q):
-    print("run_start", device_id)
-    eval_interval = config.eval_interval
-    step_size = dataset.get_dataset_size()
-    acc = 0.0
-    time_cost = 0.0
-    for epoch_idx in range(0, int(epoch_size / eval_interval)):
-        model.train(1, dataset, callbacks=loss_cb)
-        eval_start = time.time()
-        output = model.eval(eval_dataset)
-        eval_cost = (time.time() - eval_start) * 1000
-        acc = float(output["acc"])
-        time_cost = loss_cb.get_per_step_time()
-        loss = loss_cb.get_loss()
-        print("the {} epoch's resnet result:\n "
-              "device{}, training loss {}, acc {}, "
-              "training per step cost {:.2f} ms, eval cost {:.2f} ms, "
-              "total_cost {:.2f} ms".format(epoch_idx, device_id,
-                                            loss, acc, time_cost,
-                                            eval_cost,
-                                            time_cost * step_size + eval_cost))
-    q.put({'acc': acc, 'cost': time_cost})
-
-
-def train_process(q, device_id, epoch_size, device_num, enable_hccl):
-    os.system("mkdir " + str(device_id))
-    os.chdir(str(device_id))
-    context.set_context(mode=context.GRAPH_MODE, device_target="Ascend", device_id=device_id)
-    os.environ['MINDSPORE_HCCL_CONFIG_PATH'] = MINDSPORE_HCCL_CONFIG_PATH
-    os.environ['RANK_ID'] = str(device_id)
-    os.environ['RANK_SIZE'] = str(device_num)
-    if enable_hccl:
-        context.set_auto_parallel_context(device_num=device_num, parallel_mode=ParallelMode.DATA_PARALLEL,
-                                          gradients_mean=True, all_reduce_fusion_config=[107, 160])
-        init()
-
-    # network
-
-    net = resnet50(class_num=config.class_num)
-
-    # evaluation network
-    dist_eval_network = ClassifyCorrectCell(net)
-
-    if not config.use_label_smooth:
-        config.label_smooth_factor = 0.0
-
-    # loss
-    loss = CrossEntropySmooth(sparse=True, reduction="mean",
-                              smooth_factor=config.label_smooth_factor, num_classes=config.class_num)
-
-    # train dataset
-    dataset = create_dataset(dataset_path=dataset_path, do_train=True, repeat_num=1,
-                             batch_size=config.batch_size)
-
-    step_size = dataset.get_dataset_size()
-    # evaluation dataset
-    eval_dataset = create_dataset(dataset_path=eval_path, do_train=False,
-                                  repeat_num=1, batch_size=config.eval_batch_size)
-
-    # loss scale
-    loss_scale = FixedLossScaleManager(config.loss_scale, drop_overflow_update=False)
-
-    # learning rate
-    lr = Tensor(get_learning_rate(lr_init=config.lr_init, lr_end=0.0, lr_max=config.lr_max,
-                                  warmup_epochs=config.warmup_epochs, total_epochs=config.epoch_size,
-                                  steps_per_epoch=step_size, lr_decay_mode=config.lr_decay_mode))
-
-    # optimizer
-    decayed_params = []
-    no_decayed_params = []
-    for param in net.trainable_params():
-        if 'beta' not in param.name and 'gamma' not in param.name and 'bias' not in param.name:
-            decayed_params.append(param)
-        else:
-            no_decayed_params.append(param)
-
-    group_params = [{'params': decayed_params, 'weight_decay': config.weight_decay},
-                    {'params': no_decayed_params, 'weight_decay': 0.0},
-                    {'order_params': net.trainable_params()}]
-
-    if config.use_lars:
-        momentum = nn.Momentum(group_params, lr, config.momentum,
-                               loss_scale=config.loss_scale, use_nesterov=config.use_nesterov)
-        opt = nn.LARS(momentum, epsilon=config.lars_epsilon, coefficient=config.lars_coefficient,
-                      lars_filter=lambda x: 'beta' not in x.name and 'gamma' not in x.name and 'bias' not in x.name)
-
-    else:
-        opt = nn.Momentum(group_params, lr, config.momentum,
-                          loss_scale=config.loss_scale, use_nesterov=config.use_nesterov)
-
-    # model
-    model = Model(net, loss_fn=loss, optimizer=opt,
-                  loss_scale_manager=loss_scale, amp_level="O2", keep_batchnorm_fp32=False,
-                  metrics={'acc': DistAccuracy(batch_size=config.eval_batch_size, device_num=device_num)},
-                  eval_network=dist_eval_network)
-
-    # callbacks
-    loss_cb = LossGet(1, step_size)
-    train_and_eval(device_id, epoch_size, model, dataset, loss_cb, eval_dataset, q)
-
-
-def train_process_thor(q, device_id, epoch_size, device_num, enable_hccl):
-    os.system("mkdir " + str(device_id))
-    os.chdir(str(device_id))
-    context.set_context(mode=context.GRAPH_MODE, device_target="Ascend")
-    context.set_context(device_id=device_id)
-    os.environ['MINDSPORE_HCCL_CONFIG_PATH'] = MINDSPORE_HCCL_CONFIG_PATH_2
-    os.environ['RANK_ID'] = str(device_id - 4)
-    os.environ['RANK_SIZE'] = str(device_num)
-    if enable_hccl:
-        context.set_auto_parallel_context(device_num=device_num, parallel_mode=ParallelMode.DATA_PARALLEL,
-                                          gradients_mean=True, all_reduce_fusion_config=[85, 160])
-        init()
-
-    # network
-    net = resnet50(thor_config.class_num)
-
-    if not thor_config.label_smooth:
-        thor_config.label_smooth_factor = 0.0
-
-    # loss
-    loss = CrossEntropySmooth(sparse=True, reduction="mean", smooth_factor=thor_config.label_smooth_factor,
-                              num_classes=thor_config.class_num)
-
-    # train dataset
-    dataset = create_dataset_thor(dataset_path=dataset_path, do_train=True,
-                                  batch_size=thor_config.batch_size, train_image_size=thor_config.train_image_size,
-                                  eval_image_size=thor_config.eval_image_size, target="Ascend",
-                                  distribute=True)
-    step_size = dataset.get_dataset_size()
-
-    # loss scale
-    loss_scale = FixedLossScaleManager(thor_config.loss_scale, drop_overflow_update=False)
-
-    # learning rate
-    lr = get_thor_lr(0, 0.05803, 4.04839, 53, 5004, decay_epochs=39)
-    damping = get_thor_damping(0, 0.02714, 0.50036, 70, 5004)
-    # optimizer
-    split_indices = [26, 53]
-    opt = thor(net, Tensor(lr), Tensor(damping), thor_config.momentum, thor_config.weight_decay, thor_config.loss_scale,
-               thor_config.batch_size, split_indices=split_indices, frequency=thor_config.frequency)
-
-    # evaluation network
-    dist_eval_network = ClassifyCorrectCell(net)
-    # model
-    model = Model(net, loss_fn=loss, optimizer=opt, loss_scale_manager=loss_scale,
-                  metrics={'acc': DistAccuracy(batch_size=thor_config.eval_batch_size, device_num=device_num)},
-                  amp_level="O2", keep_batchnorm_fp32=False,
-                  eval_network=dist_eval_network)
-
-    model = ConvertModelUtils().convert_to_thor_model(model=model, network=net, loss_fn=loss, optimizer=opt,
-                                                      loss_scale_manager=loss_scale, metrics={'acc'},
-                                                      amp_level="O2", keep_batchnorm_fp32=False)
-
-    # callbacks
-    loss_cb = LossGet(1, step_size)
-
-    # train and eval
-    print("run_start", device_id)
-    model.train(2, dataset, callbacks=loss_cb,
-                sink_size=dataset.get_dataset_size(), dataset_sink_mode=True)
-    time_cost = loss_cb.get_per_step_time()
-    loss = loss_cb.get_loss()
-    epoch_idx = loss_cb.get_epoch()
-    print("the {} epoch's resnet result:\n "
-          "device{}, training loss {}, "
-          "training per step cost {:.2f} ms, total_cost {:.2f} ms".format(epoch_idx, device_id,
-                                                                          loss, time_cost, time_cost * step_size))
-    q.put({'loss': loss, 'cost': time_cost})
-
-
-def resnet_end(device_num, q):
-    acc = 0.0
-    cost = 0.0
-    for i in range(device_num):
-        assert not q.empty()
-        output = q.get()
-        acc += output['acc']
-        cost += output['cost']
-    acc = acc / device_num
-    cost = cost / device_num
-
-    for i in range(device_num):
-        os.system("rm -rf " + str(i))
-    print("End training...")
+def resnet_end():
+    acc = 0
+    cost = 0
+    sh_path = os.path.split(os.path.realpath(__file__))[0]
+    for i in range(4):
+        with open(os.path.join(sh_path, f"train_parallel{i}", f"resnet_{i}.txt")) as f:
+            lines = f.readlines()
+            acc += float(lines[0].strip().split(": ")[1])
+            cost += float(lines[1].strip().split(": ")[1])
+    acc /= 4
+    cost /= 4
+    print(f"resnet acc: {acc}, cost: {cost}")
     assert acc > 0.1
     assert cost < 26
+    for i in range(4):
+        shutil.rmtree(os.path.join(sh_path, f"train_parallel{i}"))
 
 
-def thor_end(device_num, q):
-    thor_loss = 0.0
-    thor_cost = 0.0
-    for i in range(device_num):
-        output = q.get()
-        thor_loss += output['loss']
-        thor_cost += output['cost']
-    thor_loss = thor_loss / device_num
-    thor_cost = thor_cost / device_num
-
-    for i in range(4, device_num + 4):
-        os.system("rm -rf " + str(i))
-    print("End training...")
+def thor_end():
+    thor_cost = 0
+    thor_loss = 0
+    sh_path = os.path.split(os.path.realpath(__file__))[0]
+    for i in range(4):
+        with open(os.path.join(sh_path, f"train_parallel{i+4}", f"thor_{i}.txt")) as f:
+            lines = f.readlines()
+            thor_loss += float(lines[0].strip().split(": ")[1])
+            thor_cost += float(lines[1].strip().split(": ")[1])
+    thor_loss /= 4
+    thor_cost /= 4
+    print(f"resnet thor_loss: {thor_loss}, thor_cost: {thor_cost}")
     assert thor_loss < 7
     assert thor_cost < 30
+    for i in range(4):
+        shutil.rmtree(os.path.join(sh_path, f"train_parallel{i+4}"))
 
 
-@pytest.mark.level1
+@pytest.mark.level0
 @pytest.mark.platform_arm_ascend_training
 @pytest.mark.platform_x86_ascend_training
 @pytest.mark.env_single
@@ -349,46 +80,10 @@ def test_resnet_imagenet_and_thor_4p():
     Description: Train and evaluate resnet50 network on imagenet dataset.
     Expectation: accuracy > 0.1, time cost < 26.
     """
-    context.set_context(enable_graph_kernel=False, enable_sparse=False)
-    context.reset_auto_parallel_context()
-    context.reset_ps_context()
-
-    q = Queue()
-    q2 = Queue()
-    device_num = 4
-    epoch_size = 2
-    epoch_size_2 = 1
-    enable_hccl = True
-    process = []
-    process2 = []
-    for i in range(device_num):
-        device_id = i
-        process.append(Process(target=train_process,
-                               args=(q, device_id, epoch_size, device_num, enable_hccl)))
-        process2.append(Process(target=train_process_thor,
-                                args=(q2, device_id + 4, epoch_size_2, device_num, enable_hccl)))
-    cpu_count = os.cpu_count()
-    half_cpu_count = cpu_count // 2
-    each_cpu_count = half_cpu_count // device_num
-    for i in range(device_num):
-        process[i].start()
-        process2[i].start()
-        if each_cpu_count > 1:
-            cpu_start = each_cpu_count * i
-            cpu_end = each_cpu_count * (i + 1)
-            process_cpu = [x for x in range(cpu_start, cpu_end)]
-            process2_cpu = [x for x in range(cpu_start + half_cpu_count, cpu_end + half_cpu_count)]
-            pid1 = process[i].pid
-            pid2 = process2[i].pid
-            os.sched_setaffinity(pid1, set(process_cpu))
-            os.sched_setaffinity(pid2, set(process2_cpu))
-    print("Waiting for all subprocesses done...")
-
-    for i in range(device_num):
-        process[i].join()
-        process2[i].join()
-    # resnet
-    resnet_end(device_num, q)
-    # thor
-    thor_end(device_num, q2)
-    
\ No newline at end of file
+    get_env_info()
+    sh_path = os.path.split(os.path.realpath(__file__))[0]
+    ret = os.system(f"sh {sh_path}/scripts/run_train.sh")
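+    # os.system returns the child's wait status; 0 means run_train.sh succeeded.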
""" - context.set_context(enable_graph_kernel=False, enable_sparse=False) - context.reset_auto_parallel_context() - context.reset_ps_context() - - q = Queue() - q2 = Queue() - device_num = 4 - epoch_size = 2 - epoch_size_2 = 1 - enable_hccl = True - process = [] - process2 = [] - for i in range(device_num): - device_id = i - process.append(Process(target=train_process, - args=(q, device_id, epoch_size, device_num, enable_hccl))) - process2.append(Process(target=train_process_thor, - args=(q2, device_id + 4, epoch_size_2, device_num, enable_hccl))) - cpu_count = os.cpu_count() - half_cpu_count = cpu_count // 2 - each_cpu_count = half_cpu_count // device_num - for i in range(device_num): - process[i].start() - process2[i].start() - if each_cpu_count > 1: - cpu_start = each_cpu_count * i - cpu_end = each_cpu_count * (i + 1) - process_cpu = [x for x in range(cpu_start, cpu_end)] - process2_cpu = [x for x in range(cpu_start + half_cpu_count, cpu_end + half_cpu_count)] - pid1 = process[i].pid - pid2 = process2[i].pid - os.sched_setaffinity(pid1, set(process_cpu)) - os.sched_setaffinity(pid2, set(process2_cpu)) - print("Waiting for all subprocesses done...") - - for i in range(device_num): - process[i].join() - process2[i].join() - # resnet - resnet_end(device_num, q) - # thor - thor_end(device_num, q2) - \ No newline at end of file + get_env_info() + sh_path = os.path.split(os.path.realpath(__file__))[0] + ret = os.system(f"sh {sh_path}/scripts/run_train.sh") + assert ret == 0 + resnet_end() + thor_end() diff --git a/tests/st/networks/models/resnet50/train_resnet50.py b/tests/st/networks/models/resnet50/train_resnet50.py new file mode 100644 index 00000000000..150026e509b --- /dev/null +++ b/tests/st/networks/models/resnet50/train_resnet50.py @@ -0,0 +1,122 @@ +# Copyright 2022 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+def get_optimizer(net, step_size):
+    # optimizer
+    lr = ms.Tensor(get_learning_rate(lr_init=config.lr_init, lr_end=0.0, lr_max=config.lr_max,
+                                     warmup_epochs=config.warmup_epochs, total_epochs=config.epoch_size,
+                                     steps_per_epoch=step_size, lr_decay_mode=config.lr_decay_mode))
+    decayed_params = []
+    no_decayed_params = []
+    for param in net.trainable_params():
+        if 'beta' not in param.name and 'gamma' not in param.name and 'bias' not in param.name:
+            decayed_params.append(param)
+        else:
+            no_decayed_params.append(param)
+
+    group_params = [{'params': decayed_params, 'weight_decay': config.weight_decay},
+                    {'params': no_decayed_params, 'weight_decay': 0.0},
+                    {'order_params': net.trainable_params()}]
+
+    if config.use_lars:
+        momentum = nn.Momentum(group_params, lr, config.momentum,
+                               loss_scale=config.loss_scale, use_nesterov=config.use_nesterov)
+        opt = nn.LARS(momentum, epsilon=config.lars_epsilon, coefficient=config.lars_coefficient,
+                      lars_filter=lambda x: 'beta' not in x.name and 'gamma' not in x.name and 'bias' not in x.name)
+
+    else:
+        opt = nn.Momentum(group_params, lr, config.momentum,
+                          loss_scale=config.loss_scale, use_nesterov=config.use_nesterov)
+    return opt
+
+
+def train_and_eval(device_id, epoch_size, model, dataset, loss_cb, eval_dataset):
+    print("run_start", device_id)
+    eval_interval = config.eval_interval
+    step_size = dataset.get_dataset_size()
+    acc = 0.0
+    time_cost = 0.0
+    for epoch_idx in range(0, int(epoch_size / eval_interval)):
+        model.train(1, dataset, callbacks=loss_cb)
+        eval_start = time.time()
+        output = model.eval(eval_dataset)
+        eval_cost = (time.time() - eval_start) * 1000
+        acc = float(output["acc"])
+        time_cost = loss_cb.get_per_step_time()
+        loss = loss_cb.get_loss()
+        print("the {} epoch's resnet result:\n "
+              "device{}, training loss {}, acc {}, "
+              "training per step cost {:.2f} ms, eval cost {:.2f} ms, "
+              "total_cost {:.2f} ms".format(epoch_idx, device_id,
+                                            loss, acc, time_cost,
+                                            eval_cost,
+                                            time_cost * step_size + eval_cost))
+    print(f"===resnet_acc: {acc}")
+    print(f"===resnet_time_cost: {time_cost}")
+
+
+def run_train():
+    ms.context.set_context(mode=ms.GRAPH_MODE, device_target="Ascend")
+    rank_id = int(os.getenv('RANK_ID', '0'))
+    device_num = int(os.getenv('RANK_SIZE', '1'))
+    device_id = int(os.getenv('DEVICE_ID', '0'))
+    print(f"run resnet50 device_num:{device_num}, device_id:{device_id}, rank_id:{rank_id}")
+    if device_num > 1:
+        ms.communication.init()
+        ms.context.set_auto_parallel_context(parallel_mode=ms.ParallelMode.DATA_PARALLEL,
+                                             gradients_mean=True, all_reduce_fusion_config=[107, 160])
+    net = resnet50(class_num=config.class_num)
+    dist_eval_network = ClassifyCorrectCell(net)
+
+    if not config.use_label_smooth:
+        config.label_smooth_factor = 0.0
+    loss = CrossEntropySmooth(sparse=True, reduction="mean",
+                              smooth_factor=config.label_smooth_factor, num_classes=config.class_num)
+
+    # dataset
+    dataset = create_dataset(dataset_path=TRAIN_PATH, do_train=True, repeat_num=1, batch_size=config.batch_size)
+    step_size = dataset.get_dataset_size()
+    eval_dataset = create_dataset(dataset_path=EVAL_PATH, do_train=False,
+                                  repeat_num=1, batch_size=config.eval_batch_size)
+
+    loss_scale = ms.FixedLossScaleManager(config.loss_scale, drop_overflow_update=False)
+    opt = get_optimizer(net, step_size)
+
+    model = ms.Model(net, loss_fn=loss, optimizer=opt,
+                     loss_scale_manager=loss_scale, amp_level="O2", keep_batchnorm_fp32=False,
+                     metrics={'acc': DistAccuracy(batch_size=config.eval_batch_size, device_num=device_num)},
+                     eval_network=dist_eval_network)
+    loss_cb = LossGet(1, step_size)
+    train_and_eval(device_id, 2, model, dataset, loss_cb, eval_dataset)
+
+if __name__ == '__main__':
+    run_train()
diff --git a/tests/st/networks/models/resnet50/train_resnet50_thor.py b/tests/st/networks/models/resnet50/train_resnet50_thor.py
new file mode 100644
index 00000000000..1ea108778ee
--- /dev/null
+++ b/tests/st/networks/models/resnet50/train_resnet50_thor.py
@@ -0,0 +1,139 @@
+# Copyright 2022 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================
+"""resnet train & eval case."""
+import os
+import numpy as np
+import mindspore as ms
+from mindspore import nn
+from mindspore.train.train_thor import ConvertModelUtils
+from tests.st.networks.models.resnet50.src.callback import LossGet
+from tests.st.networks.models.resnet50.src_thor.config import config as thor_config
+from tests.st.networks.models.resnet50.src_thor.dataset import create_dataset2 as create_dataset_thor
+from tests.st.networks.models.resnet50.src.resnet import resnet50
+from tests.st.networks.models.resnet50.src.metric import DistAccuracy, ClassifyCorrectCell
+from tests.st.networks.models.resnet50.src.CrossEntropySmooth import CrossEntropySmooth
+
+TRAIN_PATH = "/home/workspace/mindspore_dataset/imagenet/imagenet_original/train"
+EVAL_PATH = "/home/workspace/mindspore_dataset/imagenet/imagenet_original/val"
+ms.set_seed(1)
+
+
+def get_thor_lr(global_step, lr_init, decay, total_epochs, steps_per_epoch, decay_epochs=100):
+    """get_model_lr"""
+    lr_each_step = []
+    total_steps = steps_per_epoch * total_epochs
+    for i in range(total_steps):
+        epoch = (i + 1) / steps_per_epoch
+        base = (1.0 - float(epoch) / total_epochs) ** decay
+        lr_local = lr_init * base
+        if epoch >= decay_epochs:
+            lr_local = lr_local * 0.5
+        if epoch >= decay_epochs + 1:
+            lr_local = lr_local * 0.5
+        lr_each_step.append(lr_local)
+    current_step = global_step
+    lr_each_step = np.array(lr_each_step).astype(np.float32)
+    learning_rate = lr_each_step[current_step:]
+    return learning_rate
+
+
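+# Example: with lr_init=0.05803, decay=4.04839, total_epochs=53 and
+# decay_epochs=39 (as used below), the LR follows
+# 0.05803 * (1 - epoch/53)**4.04839, halved at epoch >= 39 and again at >= 40.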
+def get_thor_damping(global_step, damping_init, decay_rate, total_epochs, steps_per_epoch):
+    """get_model_damping"""
+    damping_each_step = []
+    total_steps = steps_per_epoch * total_epochs
+    for step in range(total_steps):
+        epoch = (step + 1) / steps_per_epoch
+        damping_here = damping_init * (decay_rate ** (epoch / 10))
+        damping_each_step.append(damping_here)
+    current_step = global_step
+    damping_each_step = np.array(damping_each_step).astype(np.float32)
+    damping_now = damping_each_step[current_step:]
+    return damping_now
+
+
+def run_train():
+    ms.context.set_context(mode=ms.GRAPH_MODE, device_target="Ascend")
+    rank_id = int(os.getenv('RANK_ID', '0'))
+    device_num = int(os.getenv('RANK_SIZE', '1'))
+    device_id = int(os.getenv('DEVICE_ID', '0'))
+    print(f"run resnet50 thor device_num:{device_num}, device_id:{device_id}, rank_id:{rank_id}")
+    if device_num > 1:
+        ms.communication.init()
+        ms.context.set_auto_parallel_context(parallel_mode=ms.ParallelMode.DATA_PARALLEL,
+                                             gradients_mean=True, all_reduce_fusion_config=[85, 160])
+    net = resnet50(thor_config.class_num)
+
+    if not thor_config.label_smooth:
+        thor_config.label_smooth_factor = 0.0
+
+    # loss
+    loss = CrossEntropySmooth(sparse=True, reduction="mean", smooth_factor=thor_config.label_smooth_factor,
+                              num_classes=thor_config.class_num)
+
+    # train dataset
+    dataset = create_dataset_thor(dataset_path=TRAIN_PATH, do_train=True,
+                                  batch_size=thor_config.batch_size, train_image_size=thor_config.train_image_size,
+                                  eval_image_size=thor_config.eval_image_size, target="Ascend",
+                                  distribute=True)
+    step_size = dataset.get_dataset_size()
+
+    # loss scale
+    loss_scale = ms.FixedLossScaleManager(thor_config.loss_scale, drop_overflow_update=False)
+
+    # learning rate
+    lr = get_thor_lr(0, 0.05803, 4.04839, 53, 5004, decay_epochs=39)
+    damping = get_thor_damping(0, 0.02714, 0.50036, 70, 5004)
+    # optimizer
+    split_indices = [26, 53]
+    opt = nn.thor(net, ms.Tensor(lr), ms.Tensor(damping), thor_config.momentum, thor_config.weight_decay,
+                  thor_config.loss_scale, thor_config.batch_size, split_indices=split_indices,
+                  frequency=thor_config.frequency)
+
+    # evaluation network
+    dist_eval_network = ClassifyCorrectCell(net)
+    # model
+    model = ms.Model(net, loss_fn=loss, optimizer=opt, loss_scale_manager=loss_scale,
+                     metrics={'acc': DistAccuracy(batch_size=thor_config.eval_batch_size, device_num=device_num)},
+                     amp_level="O2", keep_batchnorm_fp32=False,
+                     eval_network=dist_eval_network)
+
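+    # convert_to_thor_model swaps in the THOR training network so the
+    # optimizer's second-order (Fisher) statistics are refreshed every
+    # thor_config.frequency steps.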
+    model = ConvertModelUtils().convert_to_thor_model(model=model, network=net, loss_fn=loss, optimizer=opt,
+                                                      loss_scale_manager=loss_scale, metrics={'acc'},
+                                                      amp_level="O2", keep_batchnorm_fp32=False)
+
+    # callbacks
+    loss_cb = LossGet(1, step_size)
+
+    # train and eval
+    print("run_start", device_id)
+    model.train(2, dataset, callbacks=loss_cb, dataset_sink_mode=True, sink_size=step_size)
+    time_cost = loss_cb.get_per_step_time()
+    loss = loss_cb.get_loss()
+    epoch_idx = loss_cb.get_epoch()
+    print("the {} epoch's resnet result:\n "
+          "device{}, training loss {}, "
+          "training per step cost {:.2f} ms, total_cost {:.2f} ms".format(epoch_idx, device_id,
+                                                                          loss, time_cost, time_cost * step_size))
+    print(f"===resnet_thor_loss: {loss}")
+    print(f"===resnet_thor_time_cost: {time_cost}")
+
+if __name__ == '__main__':
+    run_train()