forked from mindspore-Ecosystem/mindspore
fix resnet50 & thor tests
This commit is contained in:
parent c52ef8ed33
commit 96607251ee
scripts/run_resnet50_imagenet_4p.sh
@@ -0,0 +1,44 @@
#!/bin/bash
# Copyright 2022 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================

BASE_PATH=$(cd "$(dirname $0)"; pwd)

export RANK_SIZE=4
export RANK_TABLE_FILE="/home/workspace/mindspore_config/hccl/rank_tabel_4p/rank_table_4p_1.json"

cpus=`cat /proc/cpuinfo | grep "processor" | wc -l`
avg=`expr $cpus \/ 8`
gap=`expr $avg \- 1`
rank_start=0
for((i=0; i<$RANK_SIZE; i++))
do
    j=$((rank_start + i))
    start=`expr $j \* $avg`
    end=`expr $start \+ $gap`
    cmdopt=$start"-"$end
    export DEVICE_ID=$((rank_start + i))
    export RANK_ID=${i}
    rm -rf $BASE_PATH/../train_parallel$j
    mkdir $BASE_PATH/../train_parallel$j
    cd $BASE_PATH/../train_parallel$j || exit
    echo "start resnet training for rank $RANK_ID, device $DEVICE_ID"
    (taskset -c $cmdopt python $BASE_PATH/../train_resnet50.py &> log; grep "===" log > resnet_$i.txt) &
    cd ..
done
wait
echo "result:"
cat $BASE_PATH/../train_parallel0/log
cat $BASE_PATH/../train_parallel*/resnet_*.txt
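For intuition, the taskset range computed above carves the host's CPUs into 8 equal blocks of avg cores and pins rank i to block rank_start + i. A quick illustrative check of that arithmetic in Python (the 96-CPU host is an assumption, not part of the commit):

    # Sketch of the core-pinning arithmetic above, assuming 96 logical CPUs.
    cpus = 96
    avg = cpus // 8      # 12 cores per block, matching `expr $cpus \/ 8`
    gap = avg - 1        # 11, so each range spans exactly avg cores
    for i in range(4):   # this script uses rank_start=0
        start = (0 + i) * avg
        print(f"rank {i} -> taskset -c {start}-{start + gap}")
    # rank 0 -> 0-11, rank 1 -> 12-23, rank 2 -> 24-35, rank 3 -> 36-47;
    # the thor script below uses rank_start=4 and so gets 48-59 ... 84-95.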
scripts/run_resnet_thor_imagenet_4p.sh
@@ -0,0 +1,44 @@
#!/bin/bash
# Copyright 2022 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================

BASE_PATH=$(cd "$(dirname $0)"; pwd)

export RANK_SIZE=4
export RANK_TABLE_FILE="/home/workspace/mindspore_config/hccl/rank_tabel_4p/rank_table_4p_2.json"

cpus=`cat /proc/cpuinfo | grep "processor" | wc -l`
avg=`expr $cpus \/ 8`
gap=`expr $avg \- 1`
rank_start=4
for((i=0; i<$RANK_SIZE; i++))
do
    j=$((rank_start + i))
    start=`expr $j \* $avg`
    end=`expr $start \+ $gap`
    cmdopt=$start"-"$end
    export DEVICE_ID=$((rank_start + i))
    export RANK_ID=${i}
    rm -rf $BASE_PATH/../train_parallel$j
    mkdir $BASE_PATH/../train_parallel$j
    cd $BASE_PATH/../train_parallel$j || exit
    echo "start resnet thor training for rank $RANK_ID, device $DEVICE_ID"
    (taskset -c $cmdopt python $BASE_PATH/../train_resnet50_thor.py &> log; grep "===" log > thor_$i.txt) &
    cd ..
done
wait
echo "result:"
cat $BASE_PATH/../train_parallel5/log
cat $BASE_PATH/../train_parallel*/thor_*.txt
scripts/run_train.sh
@@ -0,0 +1,20 @@
#!/bin/bash
# Copyright 2022 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================

BASE_PATH=$(cd "$(dirname $0)"; pwd)
bash $BASE_PATH/run_resnet50_imagenet_4p.sh &
bash $BASE_PATH/run_resnet_thor_imagenet_4p.sh &
wait
src/callback.py
@@ -0,0 +1,70 @@
# Copyright 2022 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
"""custom callback."""
import time
import numpy as np
import mindspore as ms
from mindspore.train.callback import Callback


class LossGet(Callback):
    def __init__(self, per_print_times, data_size):
        super(LossGet, self).__init__()
        if not isinstance(per_print_times, int) or per_print_times < 0:
            raise ValueError("print_step must be int and >= 0.")
        self._per_print_times = per_print_times
        self._loss = 0.0
        self.data_size = data_size
        self._epoch = 0
        self.epoch_time = time.time()
        self._per_step_mseconds = 0

    def step_end(self, run_context):
        cb_params = run_context.original_args()
        loss = cb_params.net_outputs
        self._epoch = cb_params.cur_epoch_num
        if isinstance(loss, (tuple, list)):
            if isinstance(loss[0], ms.Tensor) and isinstance(loss[0].asnumpy(), np.ndarray):
                loss = loss[0]

        if isinstance(loss, ms.Tensor) and isinstance(loss.asnumpy(), np.ndarray):
            loss = np.mean(loss.asnumpy())

        cur_step_in_epoch = (cb_params.cur_step_num - 1) % cb_params.batch_num + 1

        if isinstance(loss, float) and (np.isnan(loss) or np.isinf(loss)):
            raise ValueError("epoch: {} step: {}. Invalid loss, terminating training."
                             .format(cb_params.cur_epoch_num, cur_step_in_epoch))
        cur_step_in_epoch = (cb_params.cur_step_num - 1) % cb_params.batch_num + 1
        if self._per_print_times != 0 and cb_params.cur_step_num % self._per_print_times == 0:
            self._loss = loss
            print("epoch: %s step: %s, loss is %s" % (cb_params.cur_epoch_num,
                                                      cur_step_in_epoch, loss), flush=True)

    def epoch_begin(self, run_context):
        self.epoch_time = time.time()

    def epoch_end(self, run_context):
        epoch_mseconds = (time.time() - self.epoch_time) * 1000
        self._per_step_mseconds = epoch_mseconds / self.data_size

    def get_loss(self):
        return self._loss

    def get_per_step_time(self):
        return self._per_step_mseconds

    def get_epoch(self):
        return self._epoch
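A minimal usage sketch of LossGet (assuming a built model and dataset, as in the train scripts added below; not part of the committed file):

    # Hypothetical wiring, mirroring how the new train scripts consume LossGet.
    step_size = dataset.get_dataset_size()
    loss_cb = LossGet(per_print_times=1, data_size=step_size)
    model.train(1, dataset, callbacks=loss_cb)
    print(loss_cb.get_loss())           # last loss recorded in step_end
    print(loss_cb.get_per_step_time())  # ms per step, computed in epoch_end
    print(loss_cb.get_epoch())          # last epoch number seen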
@@ -1,4 +1,4 @@
-# Copyright 2020-2021 Huawei Technologies Co., Ltd
+# Copyright 2022 Huawei Technologies Co., Ltd
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -14,332 +14,61 @@
 # ============================================================================

 """train and evaluate resnet50 network on imagenet dataset"""

 import os
-import time
-from multiprocessing import Process, Queue
+import shutil
 import pytest
-import numpy as np

-from mindspore import context
-from mindspore.common.tensor import Tensor
-from mindspore.communication.management import init
-from mindspore.context import ParallelMode
-from mindspore.train.callback import Callback
-from mindspore.train.model import Model
-from mindspore.train.train_thor import ConvertModelUtils
-from mindspore.train.loss_scale_manager import FixedLossScaleManager
-from mindspore.nn.optim import thor
-import mindspore.dataset as ds
-import mindspore.nn as nn
-
-from tests.st.networks.models.resnet50.src.metric import DistAccuracy, ClassifyCorrectCell
-from tests.st.networks.models.resnet50.src.dataset import create_dataset
-from tests.st.networks.models.resnet50.src.lr_generator import get_learning_rate
-from tests.st.networks.models.resnet50.src.config import config
-from tests.st.networks.models.resnet50.src.CrossEntropySmooth import CrossEntropySmooth
-from tests.st.networks.models.resnet50.src_thor.config import config as thor_config
-from tests.st.networks.models.resnet50.src_thor.dataset import create_dataset2 as create_dataset_thor
-from tests.st.networks.models.resnet50.src.resnet import resnet50
-
-MINDSPORE_HCCL_CONFIG_PATH = "/home/workspace/mindspore_config/hccl/rank_tabel_4p/rank_table_4p_1.json"
-MINDSPORE_HCCL_CONFIG_PATH_2 = "/home/workspace/mindspore_config/hccl/rank_tabel_4p/rank_table_4p_2.json"
-dataset_path = "/home/workspace/mindspore_dataset/imagenet/imagenet_original/train"
-eval_path = "/home/workspace/mindspore_dataset/imagenet/imagenet_original/val"
-
-np.random.seed(1)
-ds.config.set_seed(1)
-os.environ['GLOG_v'] = str(2)


-def get_thor_lr(global_step, lr_init, decay, total_epochs, steps_per_epoch, decay_epochs=100):
-    """get_model_lr"""
-    lr_each_step = []
-    total_steps = steps_per_epoch * total_epochs
-    for i in range(total_steps):
-        epoch = (i + 1) / steps_per_epoch
-        base = (1.0 - float(epoch) / total_epochs) ** decay
-        lr_local = lr_init * base
-        if epoch >= decay_epochs:
-            lr_local = lr_local * 0.5
-        if epoch >= decay_epochs + 1:
-            lr_local = lr_local * 0.5
-        lr_each_step.append(lr_local)
-    current_step = global_step
-    lr_each_step = np.array(lr_each_step).astype(np.float32)
-    learning_rate = lr_each_step[current_step:]
-    return learning_rate
+def get_env_info():
+    print("================== CPU ======================")
+    os.system("top -bi -n 2 -d 0.02")
+    print("================= IO ====================")
+    os.system("iostat")
+    print("================= Memory =====================")
+    os.system("free -h")
+    print("================= Process ====================")
+    os.system("ps -ef | grep python")
+    print("================= NPU ====================")
+    os.system("npu-smi info")


-def get_thor_damping(global_step, damping_init, decay_rate, total_epochs, steps_per_epoch):
-    """get_model_damping"""
-    damping_each_step = []
-    total_steps = steps_per_epoch * total_epochs
-    for step in range(total_steps):
-        epoch = (step + 1) / steps_per_epoch
-        damping_here = damping_init * (decay_rate ** (epoch / 10))
-        damping_each_step.append(damping_here)
-    current_step = global_step
-    damping_each_step = np.array(damping_each_step).astype(np.float32)
-    damping_now = damping_each_step[current_step:]
-    return damping_now
-
-
-class LossGet(Callback):
-    def __init__(self, per_print_times, data_size):
-        super(LossGet, self).__init__()
-        if not isinstance(per_print_times, int) or per_print_times < 0:
-            raise ValueError("print_step must be int and >= 0.")
-        self._per_print_times = per_print_times
-        self._loss = 0.0
-        self.data_size = data_size
-        self._epoch = 0
-
-    def step_end(self, run_context):
-        cb_params = run_context.original_args()
-        loss = cb_params.net_outputs
-        self._epoch = cb_params.cur_epoch_num
-        if isinstance(loss, (tuple, list)):
-            if isinstance(loss[0], Tensor) and isinstance(loss[0].asnumpy(), np.ndarray):
-                loss = loss[0]
-
-        if isinstance(loss, Tensor) and isinstance(loss.asnumpy(), np.ndarray):
-            loss = np.mean(loss.asnumpy())
-
-        cur_step_in_epoch = (cb_params.cur_step_num - 1) % cb_params.batch_num + 1
-
-        if isinstance(loss, float) and (np.isnan(loss) or np.isinf(loss)):
-            raise ValueError("epoch: {} step: {}. Invalid loss, terminating training."
-                             .format(cb_params.cur_epoch_num, cur_step_in_epoch))
-        cur_step_in_epoch = (cb_params.cur_step_num - 1) % cb_params.batch_num + 1
-        if self._per_print_times != 0 and cb_params.cur_step_num % self._per_print_times == 0:
-            self._loss = loss
-            print("epoch: %s step: %s, loss is %s" % (cb_params.cur_epoch_num,
-                                                      cur_step_in_epoch, loss), flush=True)
-
-    def epoch_begin(self, run_context):
-        self.epoch_time = time.time()
-
-    def epoch_end(self, run_context):
-        epoch_mseconds = (time.time() - self.epoch_time) * 1000
-        self._per_step_mseconds = epoch_mseconds / self.data_size
-
-    def get_loss(self):
-        return self._loss
-
-    def get_per_step_time(self):
-        return self._per_step_mseconds
-
-    def get_epoch(self):
-        return self._epoch
-
-
-def train_and_eval(device_id, epoch_size, model, dataset, loss_cb, eval_dataset, q):
-    print("run_start", device_id)
-    eval_interval = config.eval_interval
-    step_size = dataset.get_dataset_size()
-    acc = 0.0
-    time_cost = 0.0
-    for epoch_idx in range(0, int(epoch_size / eval_interval)):
-        model.train(1, dataset, callbacks=loss_cb)
-        eval_start = time.time()
-        output = model.eval(eval_dataset)
-        eval_cost = (time.time() - eval_start) * 1000
-        acc = float(output["acc"])
-        time_cost = loss_cb.get_per_step_time()
-        loss = loss_cb.get_loss()
-        print("the {} epoch's resnet result:\n "
-              "device{}, training loss {}, acc {}, "
-              "training per step cost {:.2f} ms, eval cost {:.2f} ms, "
-              "total_cost {:.2f} ms".format(epoch_idx, device_id,
-                                            loss, acc, time_cost,
-                                            eval_cost,
-                                            time_cost * step_size + eval_cost))
-    q.put({'acc': acc, 'cost': time_cost})
-
-
-def train_process(q, device_id, epoch_size, device_num, enable_hccl):
-    os.system("mkdir " + str(device_id))
-    os.chdir(str(device_id))
-    context.set_context(mode=context.GRAPH_MODE, device_target="Ascend", device_id=device_id)
-    os.environ['MINDSPORE_HCCL_CONFIG_PATH'] = MINDSPORE_HCCL_CONFIG_PATH
-    os.environ['RANK_ID'] = str(device_id)
-    os.environ['RANK_SIZE'] = str(device_num)
-    if enable_hccl:
-        context.set_auto_parallel_context(device_num=device_num, parallel_mode=ParallelMode.DATA_PARALLEL,
-                                          gradients_mean=True, all_reduce_fusion_config=[107, 160])
-        init()
-
-    # network
-    net = resnet50(class_num=config.class_num)
-
-    # evaluation network
-    dist_eval_network = ClassifyCorrectCell(net)
-
-    if not config.use_label_smooth:
-        config.label_smooth_factor = 0.0
-
-    # loss
-    loss = CrossEntropySmooth(sparse=True, reduction="mean",
-                              smooth_factor=config.label_smooth_factor, num_classes=config.class_num)
-
-    # train dataset
-    dataset = create_dataset(dataset_path=dataset_path, do_train=True, repeat_num=1, batch_size=config.batch_size)
-
-    step_size = dataset.get_dataset_size()
-    # evaluation dataset
-    eval_dataset = create_dataset(dataset_path=eval_path, do_train=False,
-                                  repeat_num=1, batch_size=config.eval_batch_size)
-
-    # loss scale
-    loss_scale = FixedLossScaleManager(config.loss_scale, drop_overflow_update=False)
-
-    # learning rate
-    lr = Tensor(get_learning_rate(lr_init=config.lr_init, lr_end=0.0, lr_max=config.lr_max,
-                                  warmup_epochs=config.warmup_epochs, total_epochs=config.epoch_size,
-                                  steps_per_epoch=step_size, lr_decay_mode=config.lr_decay_mode))
-
-    # optimizer
-    decayed_params = []
-    no_decayed_params = []
-    for param in net.trainable_params():
-        if 'beta' not in param.name and 'gamma' not in param.name and 'bias' not in param.name:
-            decayed_params.append(param)
-        else:
-            no_decayed_params.append(param)
-
-    group_params = [{'params': decayed_params, 'weight_decay': config.weight_decay},
-                    {'params': no_decayed_params, 'weight_decay': 0.0},
-                    {'order_params': net.trainable_params()}]
-
-    if config.use_lars:
-        momentum = nn.Momentum(group_params, lr, config.momentum,
-                               loss_scale=config.loss_scale, use_nesterov=config.use_nesterov)
-        opt = nn.LARS(momentum, epsilon=config.lars_epsilon, coefficient=config.lars_coefficient,
-                      lars_filter=lambda x: 'beta' not in x.name and 'gamma' not in x.name and 'bias' not in x.name)
-
-    else:
-        opt = nn.Momentum(group_params, lr, config.momentum,
-                          loss_scale=config.loss_scale, use_nesterov=config.use_nesterov)
-
-    # model
-    model = Model(net, loss_fn=loss, optimizer=opt,
-                  loss_scale_manager=loss_scale, amp_level="O2", keep_batchnorm_fp32=False,
-                  metrics={'acc': DistAccuracy(batch_size=config.eval_batch_size, device_num=device_num)},
-                  eval_network=dist_eval_network)
-
-    # callbacks
-    loss_cb = LossGet(1, step_size)
-    train_and_eval(device_id, epoch_size, model, dataset, loss_cb, eval_dataset, q)
-
-
-def train_process_thor(q, device_id, epoch_size, device_num, enable_hccl):
-    os.system("mkdir " + str(device_id))
-    os.chdir(str(device_id))
-    context.set_context(mode=context.GRAPH_MODE, device_target="Ascend")
-    context.set_context(device_id=device_id)
-    os.environ['MINDSPORE_HCCL_CONFIG_PATH'] = MINDSPORE_HCCL_CONFIG_PATH_2
-    os.environ['RANK_ID'] = str(device_id - 4)
-    os.environ['RANK_SIZE'] = str(device_num)
-    if enable_hccl:
-        context.set_auto_parallel_context(device_num=device_num, parallel_mode=ParallelMode.DATA_PARALLEL,
-                                          gradients_mean=True, all_reduce_fusion_config=[85, 160])
-        init()
-
-    # network
-    net = resnet50(thor_config.class_num)
-
-    if not thor_config.label_smooth:
-        thor_config.label_smooth_factor = 0.0
-
-    # loss
-    loss = CrossEntropySmooth(sparse=True, reduction="mean", smooth_factor=thor_config.label_smooth_factor,
-                              num_classes=thor_config.class_num)
-
-    # train dataset
-    dataset = create_dataset_thor(dataset_path=dataset_path, do_train=True,
-                                  batch_size=thor_config.batch_size, train_image_size=thor_config.train_image_size,
-                                  eval_image_size=thor_config.eval_image_size, target="Ascend",
-                                  distribute=True)
-    step_size = dataset.get_dataset_size()
-
-    # loss scale
-    loss_scale = FixedLossScaleManager(thor_config.loss_scale, drop_overflow_update=False)
-
-    # learning rate
-    lr = get_thor_lr(0, 0.05803, 4.04839, 53, 5004, decay_epochs=39)
-    damping = get_thor_damping(0, 0.02714, 0.50036, 70, 5004)
-    # optimizer
-    split_indices = [26, 53]
-    opt = thor(net, Tensor(lr), Tensor(damping), thor_config.momentum, thor_config.weight_decay, thor_config.loss_scale,
-               thor_config.batch_size, split_indices=split_indices, frequency=thor_config.frequency)
-
-    # evaluation network
-    dist_eval_network = ClassifyCorrectCell(net)
-    # model
-    model = Model(net, loss_fn=loss, optimizer=opt, loss_scale_manager=loss_scale,
-                  metrics={'acc': DistAccuracy(batch_size=thor_config.eval_batch_size, device_num=device_num)},
-                  amp_level="O2", keep_batchnorm_fp32=False,
-                  eval_network=dist_eval_network)
-
-    model = ConvertModelUtils().convert_to_thor_model(model=model, network=net, loss_fn=loss, optimizer=opt,
-                                                      loss_scale_manager=loss_scale, metrics={'acc'},
-                                                      amp_level="O2", keep_batchnorm_fp32=False)
-
-    # callbacks
-    loss_cb = LossGet(1, step_size)
-
-    # train and eval
-    print("run_start", device_id)
-    model.train(2, dataset, callbacks=loss_cb,
-                sink_size=dataset.get_dataset_size(), dataset_sink_mode=True)
-    time_cost = loss_cb.get_per_step_time()
-    loss = loss_cb.get_loss()
-    epoch_idx = loss_cb.get_epoch()
-    print("the {} epoch's resnet result:\n "
-          "device{}, training loss {}, "
-          "training per step cost {:.2f} ms, total_cost {:.2f} ms".format(epoch_idx, device_id,
-                                                                          loss, time_cost, time_cost * step_size))
-    q.put({'loss': loss, 'cost': time_cost})
-
-
-def resnet_end(device_num, q):
-    acc = 0.0
-    cost = 0.0
-    for i in range(device_num):
-        assert not q.empty()
-        output = q.get()
-        acc += output['acc']
-        cost += output['cost']
-    acc = acc / device_num
-    cost = cost / device_num
-
-    for i in range(device_num):
-        os.system("rm -rf " + str(i))
-    print("End training...")
+def resnet_end():
+    acc = 0
+    cost = 0
+    sh_path = os.path.split(os.path.realpath(__file__))[0]
+    for i in range(4):
+        with open(os.path.join(sh_path, f"train_parallel{i}", f"resnet_{i}.txt")) as f:
+            lines = f.readlines()
+        acc += float(lines[0].strip().split(": ")[1])
+        cost += float(lines[1].strip().split(": ")[1])
+    acc /= 4
+    cost /= 4
+    print(f"resnet acc: {acc}, cost: {cost}")
     assert acc > 0.1
     assert cost < 26
+    for i in range(4):
+        shutil.rmtree(os.path.join(sh_path, f"train_parallel{i}"))


-def thor_end(device_num, q):
-    thor_loss = 0.0
-    thor_cost = 0.0
-    for i in range(device_num):
-        output = q.get()
-        thor_loss += output['loss']
-        thor_cost += output['cost']
-    thor_loss = thor_loss / device_num
-    thor_cost = thor_cost / device_num
-
-    for i in range(4, device_num + 4):
-        os.system("rm -rf " + str(i))
-    print("End training...")
+def thor_end():
+    thor_cost = 0
+    thor_loss = 0
+    sh_path = os.path.split(os.path.realpath(__file__))[0]
+    for i in range(4):
+        with open(os.path.join(sh_path, f"train_parallel{i+4}", f"thor_{i}.txt")) as f:
+            lines = f.readlines()
+        thor_loss += float(lines[0].strip().split(": ")[1])
+        thor_cost += float(lines[1].strip().split(": ")[1])
+    thor_loss /= 4
+    thor_cost /= 4
+    print(f"resnet thor_loss: {thor_loss}, thor_cost: {thor_cost}")
     assert thor_loss < 7
     assert thor_cost < 30
+    for i in range(4):
+        shutil.rmtree(os.path.join(sh_path, f"train_parallel{i+4}"))


-@pytest.mark.level1
+@pytest.mark.level0
 @pytest.mark.platform_arm_ascend_training
 @pytest.mark.platform_x86_ascend_training
 @pytest.mark.env_single
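The resnet_{i}.txt and thor_{i}.txt files read above are produced by the launch scripts' grep "===" log, so each should hold the two marker lines printed by the new train scripts. A sketch of the parsing, with made-up numbers:

    # Hypothetical contents of train_parallel0/resnet_0.txt after a run:
    lines = ["===resnet_acc: 0.125\n", "===resnet_time_cost: 21.4\n"]
    acc = float(lines[0].strip().split(": ")[1])   # 0.125
    cost = float(lines[1].strip().split(": ")[1])  # 21.4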
@@ -349,46 +78,9 @@ def test_resnet_imagenet_and_thor_4p():
     Description: Train and evaluate resnet50 network on imagenet dataset.
     Expectation: accuracy > 0.1, time cost < 26.
     """
-    context.set_context(enable_graph_kernel=False, enable_sparse=False)
-    context.reset_auto_parallel_context()
-    context.reset_ps_context()
-
-    q = Queue()
-    q2 = Queue()
-    device_num = 4
-    epoch_size = 2
-    epoch_size_2 = 1
-    enable_hccl = True
-    process = []
-    process2 = []
-    for i in range(device_num):
-        device_id = i
-        process.append(Process(target=train_process,
-                               args=(q, device_id, epoch_size, device_num, enable_hccl)))
-        process2.append(Process(target=train_process_thor,
-                                args=(q2, device_id + 4, epoch_size_2, device_num, enable_hccl)))
-    cpu_count = os.cpu_count()
-    half_cpu_count = cpu_count // 2
-    each_cpu_count = half_cpu_count // device_num
-    for i in range(device_num):
-        process[i].start()
-        process2[i].start()
-        if each_cpu_count > 1:
-            cpu_start = each_cpu_count * i
-            cpu_end = each_cpu_count * (i + 1)
-            process_cpu = [x for x in range(cpu_start, cpu_end)]
-            process2_cpu = [x for x in range(cpu_start + half_cpu_count, cpu_end + half_cpu_count)]
-            pid1 = process[i].pid
-            pid2 = process2[i].pid
-            os.sched_setaffinity(pid1, set(process_cpu))
-            os.sched_setaffinity(pid2, set(process2_cpu))
-    print("Waiting for all subprocesses done...")
-
-    for i in range(device_num):
-        process[i].join()
-        process2[i].join()
-    # resnet
-    resnet_end(device_num, q)
-    # thor
-    thor_end(device_num, q2)
+    get_env_info()
+    sh_path = os.path.split(os.path.realpath(__file__))[0]
+    ret = os.system(f"sh {sh_path}/scripts/run_train.sh")
+    assert ret == 0
+    resnet_end()
+    thor_end()
train_resnet50.py
@@ -0,0 +1,122 @@
# Copyright 2022 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
"""resnet train & eval case."""
import os
import time
import mindspore as ms
from mindspore import nn
from tests.st.networks.models.resnet50.src.callback import LossGet
from tests.st.networks.models.resnet50.src.config import config
from tests.st.networks.models.resnet50.src.resnet import resnet50
from tests.st.networks.models.resnet50.src.metric import DistAccuracy, ClassifyCorrectCell
from tests.st.networks.models.resnet50.src.dataset import create_dataset
from tests.st.networks.models.resnet50.src.lr_generator import get_learning_rate
from tests.st.networks.models.resnet50.src.CrossEntropySmooth import CrossEntropySmooth

TRAIN_PATH = "/home/workspace/mindspore_dataset/imagenet/imagenet_original/train"
EVAL_PATH = "/home/workspace/mindspore_dataset/imagenet/imagenet_original/val"
ms.set_seed(1)


def get_optimizer(net, step_size):
    # optimizer
    lr = ms.Tensor(get_learning_rate(lr_init=config.lr_init, lr_end=0.0, lr_max=config.lr_max,
                                     warmup_epochs=config.warmup_epochs, total_epochs=config.epoch_size,
                                     steps_per_epoch=step_size, lr_decay_mode=config.lr_decay_mode))
    decayed_params = []
    no_decayed_params = []
    for param in net.trainable_params():
        if 'beta' not in param.name and 'gamma' not in param.name and 'bias' not in param.name:
            decayed_params.append(param)
        else:
            no_decayed_params.append(param)

    group_params = [{'params': decayed_params, 'weight_decay': config.weight_decay},
                    {'params': no_decayed_params, 'weight_decay': 0.0},
                    {'order_params': net.trainable_params()}]

    if config.use_lars:
        momentum = nn.Momentum(group_params, lr, config.momentum,
                               loss_scale=config.loss_scale, use_nesterov=config.use_nesterov)
        opt = nn.LARS(momentum, epsilon=config.lars_epsilon, coefficient=config.lars_coefficient,
                      lars_filter=lambda x: 'beta' not in x.name and 'gamma' not in x.name and 'bias' not in x.name)
    else:
        opt = nn.Momentum(group_params, lr, config.momentum,
                          loss_scale=config.loss_scale, use_nesterov=config.use_nesterov)
    return opt


def train_and_eval(device_id, epoch_size, model, dataset, loss_cb, eval_dataset):
    print("run_start", device_id)
    eval_interval = config.eval_interval
    step_size = dataset.get_dataset_size()
    acc = 0.0
    time_cost = 0.0
    for epoch_idx in range(0, int(epoch_size / eval_interval)):
        model.train(1, dataset, callbacks=loss_cb)
        eval_start = time.time()
        output = model.eval(eval_dataset)
        eval_cost = (time.time() - eval_start) * 1000
        acc = float(output["acc"])
        time_cost = loss_cb.get_per_step_time()
        loss = loss_cb.get_loss()
        print("the {} epoch's resnet result:\n "
              "device{}, training loss {}, acc {}, "
              "training per step cost {:.2f} ms, eval cost {:.2f} ms, "
              "total_cost {:.2f} ms".format(epoch_idx, device_id,
                                            loss, acc, time_cost,
                                            eval_cost,
                                            time_cost * step_size + eval_cost))
    print(f"===resnet_acc: {acc}")
    print(f"===resnet_time_cost: {time_cost}")


def run_train():
    ms.context.set_context(mode=ms.GRAPH_MODE, device_target="Ascend")
    rank_id = int(os.getenv('RANK_ID', '0'))
    device_num = int(os.getenv('RANK_SIZE', '1'))
    device_id = int(os.getenv('DEVICE_ID', '0'))
    print(f"run resnet50 device_num:{device_num}, device_id:{device_id}, rank_id:{rank_id}")
    if device_num > 1:
        ms.communication.init()
        ms.context.set_auto_parallel_context(parallel_mode=ms.ParallelMode.DATA_PARALLEL,
                                             gradients_mean=True, all_reduce_fusion_config=[107, 160])
    net = resnet50(class_num=config.class_num)
    dist_eval_network = ClassifyCorrectCell(net)

    if not config.use_label_smooth:
        config.label_smooth_factor = 0.0
    loss = CrossEntropySmooth(sparse=True, reduction="mean",
                              smooth_factor=config.label_smooth_factor, num_classes=config.class_num)

    # dataset
    dataset = create_dataset(dataset_path=TRAIN_PATH, do_train=True, repeat_num=1, batch_size=config.batch_size)
    step_size = dataset.get_dataset_size()
    eval_dataset = create_dataset(dataset_path=EVAL_PATH, do_train=False,
                                  repeat_num=1, batch_size=config.eval_batch_size)

    loss_scale = ms.FixedLossScaleManager(config.loss_scale, drop_overflow_update=False)
    opt = get_optimizer(net, step_size)

    model = ms.Model(net, loss_fn=loss, optimizer=opt,
                     loss_scale_manager=loss_scale, amp_level="O2", keep_batchnorm_fp32=False,
                     metrics={'acc': DistAccuracy(batch_size=config.eval_batch_size, device_num=device_num)},
                     eval_network=dist_eval_network)
    loss_cb = LossGet(1, step_size)
    train_and_eval(device_id, 2, model, dataset, loss_cb, eval_dataset)


if __name__ == '__main__':
    run_train()
train_resnet50_thor.py
@@ -0,0 +1,133 @@
# Copyright 2022 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
"""resnet train & eval case."""
import os
import numpy as np
import mindspore as ms
from mindspore import nn
from mindspore.train.train_thor import ConvertModelUtils
from tests.st.networks.models.resnet50.src.callback import LossGet
from tests.st.networks.models.resnet50.src_thor.config import config as thor_config
from tests.st.networks.models.resnet50.src_thor.dataset import create_dataset2 as create_dataset_thor
from tests.st.networks.models.resnet50.src.resnet import resnet50
from tests.st.networks.models.resnet50.src.metric import DistAccuracy, ClassifyCorrectCell
from tests.st.networks.models.resnet50.src.CrossEntropySmooth import CrossEntropySmooth

TRAIN_PATH = "/home/workspace/mindspore_dataset/imagenet/imagenet_original/train"
EVAL_PATH = "/home/workspace/mindspore_dataset/imagenet/imagenet_original/val"
ms.set_seed(1)


def get_thor_lr(global_step, lr_init, decay, total_epochs, steps_per_epoch, decay_epochs=100):
    """get_model_lr"""
    lr_each_step = []
    total_steps = steps_per_epoch * total_epochs
    for i in range(total_steps):
        epoch = (i + 1) / steps_per_epoch
        base = (1.0 - float(epoch) / total_epochs) ** decay
        lr_local = lr_init * base
        if epoch >= decay_epochs:
            lr_local = lr_local * 0.5
        if epoch >= decay_epochs + 1:
            lr_local = lr_local * 0.5
        lr_each_step.append(lr_local)
    current_step = global_step
    lr_each_step = np.array(lr_each_step).astype(np.float32)
    learning_rate = lr_each_step[current_step:]
    return learning_rate


def get_thor_damping(global_step, damping_init, decay_rate, total_epochs, steps_per_epoch):
    """get_model_damping"""
    damping_each_step = []
    total_steps = steps_per_epoch * total_epochs
    for step in range(total_steps):
        epoch = (step + 1) / steps_per_epoch
        damping_here = damping_init * (decay_rate ** (epoch / 10))
        damping_each_step.append(damping_here)
    current_step = global_step
    damping_each_step = np.array(damping_each_step).astype(np.float32)
    damping_now = damping_each_step[current_step:]
    return damping_now


def run_train():
    ms.context.set_context(mode=ms.GRAPH_MODE, device_target="Ascend")
    rank_id = int(os.getenv('RANK_ID', '0'))
    device_num = int(os.getenv('RANK_SIZE', '1'))
    device_id = int(os.getenv('DEVICE_ID', '0'))
    print(f"run resnet50 thor device_num:{device_num}, device_id:{device_id}, rank_id:{rank_id}")
    if device_num > 1:
        ms.communication.init()
        ms.context.set_auto_parallel_context(parallel_mode=ms.ParallelMode.DATA_PARALLEL,
                                             gradients_mean=True, all_reduce_fusion_config=[85, 160])
    net = resnet50(thor_config.class_num)

    if not thor_config.label_smooth:
        thor_config.label_smooth_factor = 0.0

    # loss
    loss = CrossEntropySmooth(sparse=True, reduction="mean", smooth_factor=thor_config.label_smooth_factor,
                              num_classes=thor_config.class_num)

    # train dataset
    dataset = create_dataset_thor(dataset_path=TRAIN_PATH, do_train=True,
                                  batch_size=thor_config.batch_size, train_image_size=thor_config.train_image_size,
                                  eval_image_size=thor_config.eval_image_size, target="Ascend",
                                  distribute=True)
    step_size = dataset.get_dataset_size()

    # loss scale
    loss_scale = ms.FixedLossScaleManager(thor_config.loss_scale, drop_overflow_update=False)

    # learning rate
    lr = get_thor_lr(0, 0.05803, 4.04839, 53, 5004, decay_epochs=39)
    damping = get_thor_damping(0, 0.02714, 0.50036, 70, 5004)
    # optimizer
    split_indices = [26, 53]
    opt = nn.thor(net, ms.Tensor(lr), ms.Tensor(damping), thor_config.momentum, thor_config.weight_decay,
                  thor_config.loss_scale, thor_config.batch_size, split_indices=split_indices,
                  frequency=thor_config.frequency)

    # evaluation network
    dist_eval_network = ClassifyCorrectCell(net)
    # model
    model = ms.Model(net, loss_fn=loss, optimizer=opt, loss_scale_manager=loss_scale,
                     metrics={'acc': DistAccuracy(batch_size=thor_config.eval_batch_size, device_num=device_num)},
                     amp_level="O2", keep_batchnorm_fp32=False,
                     eval_network=dist_eval_network)

    model = ConvertModelUtils().convert_to_thor_model(model=model, network=net, loss_fn=loss, optimizer=opt,
                                                      loss_scale_manager=loss_scale, metrics={'acc'},
                                                      amp_level="O2", keep_batchnorm_fp32=False)

    # callbacks
    loss_cb = LossGet(1, step_size)

    # train and eval
    print("run_start", device_id)
    model.train(2, dataset, callbacks=loss_cb, dataset_sink_mode=True, sink_size=step_size)
    time_cost = loss_cb.get_per_step_time()
    loss = loss_cb.get_loss()
    epoch_idx = loss_cb.get_epoch()
    print("the {} epoch's resnet result:\n "
          "device{}, training loss {}, "
          "training per step cost {:.2f} ms, total_cost {:.2f} ms".format(epoch_idx, device_id,
                                                                          loss, time_cost, time_cost * step_size))
    print(f"===resnet_thor_loss: {loss}")
    print(f"===resnet_thor_time_cost: {time_cost}")


if __name__ == '__main__':
    run_train()
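A rough sanity check of the THOR schedules above (illustrative only; assumes get_thor_lr and get_thor_damping from this file are in scope):

    lr = get_thor_lr(0, 0.05803, 4.04839, 53, 5004, decay_epochs=39)
    damping = get_thor_damping(0, 0.02714, 0.50036, 70, 5004)
    print(lr.shape)      # (265212,): 53 epochs * 5004 steps per epoch
    print(lr[0])         # ~0.058: polynomial decay from lr_init, with
                         # extra halvings once epoch >= 39 and >= 40
    print(damping[0])    # ~0.027: multiplied by 0.50036 every 10 epochs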