fix resnet50 & thor tests

2022-04-16 11:21:18 +08:00 · 2022-04-16 11:21:18 +08:00 · 96607251ee
parent c52ef8ed33
commit 96607251ee
7 changed files with 481 additions and 356 deletions
--- a/tests/st/networks/models/resnet50/scripts/run_resnet50_imagenet_4p.sh
+++ b/tests/st/networks/models/resnet50/scripts/run_resnet50_imagenet_4p.sh
@ -0,0 +1,44 @@
+#!/bin/bash
+# Copyright 2022 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================
+
+# Directory holding this script; all paths below are built from it and quoted so
+# that a checkout path containing whitespace does not split into several words.
+BASE_PATH=$(cd "$(dirname "$0")"; pwd)
+
+export RANK_SIZE=4
+export RANK_TABLE_FILE="/home/workspace/mindspore_config/hccl/rank_tabel_4p/rank_table_4p_1.json"
+
+# The host's logical CPUs are split into 8 equal slices; rank i is pinned to
+# slice (rank_start + i), i.e. CPUs [start, start + gap].
+cpus=$(grep -c "processor" /proc/cpuinfo)
+avg=$((cpus / 8))
+gap=$((avg - 1))
+rank_start=0
+for((i=0; i<$RANK_SIZE; i++))
+do
+    j=$((rank_start + i))
+    start=$((j * avg))
+    end=$((start + gap))
+    cmdopt="${start}-${end}"
+    export DEVICE_ID=$j
+    export RANK_ID=${i}
+    # Every rank runs in a fresh working directory named after its device id.
+    work_dir="$BASE_PATH/../train_parallel$j"
+    rm -rf "$work_dir"
+    mkdir "$work_dir"
+    cd "$work_dir" || exit
+    echo "start resnet training for rank $RANK_ID, device $DEVICE_ID"
+    # Train in the background, then keep only the "===" result lines for the test to parse.
+    (taskset -c "$cmdopt" python "$BASE_PATH/../train_resnet50.py" &> log; grep "===" log > "resnet_$i.txt") &
+    cd ..
+done
+wait
+echo "result:"
+# Full log of the first rank, followed by the extracted result lines of every rank.
+cat "$BASE_PATH/../train_parallel$rank_start/log"
+cat "$BASE_PATH"/../train_parallel*/resnet_*.txt
--- a/tests/st/networks/models/resnet50/scripts/run_resnet_thor_imagenet_4p.sh
+++ b/tests/st/networks/models/resnet50/scripts/run_resnet_thor_imagenet_4p.sh
@ -0,0 +1,44 @@
+#!/bin/bash
+# Copyright 2022 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================
+
+# Directory holding this script; all paths below are built from it and quoted so
+# that a checkout path containing whitespace does not split into several words.
+BASE_PATH=$(cd "$(dirname "$0")"; pwd)
+
+export RANK_SIZE=4
+export RANK_TABLE_FILE="/home/workspace/mindspore_config/hccl/rank_tabel_4p/rank_table_4p_2.json"
+
+# The host's logical CPUs are split into 8 equal slices; rank i is pinned to
+# slice (rank_start + i), i.e. CPUs [start, start + gap].
+cpus=$(grep -c "processor" /proc/cpuinfo)
+avg=$((cpus / 8))
+gap=$((avg - 1))
+rank_start=4
+for((i=0; i<$RANK_SIZE; i++))
+do
+    j=$((rank_start + i))
+    start=$((j * avg))
+    end=$((start + gap))
+    cmdopt="${start}-${end}"
+    export DEVICE_ID=$j
+    export RANK_ID=${i}
+    # Every rank runs in a fresh working directory named after its device id.
+    work_dir="$BASE_PATH/../train_parallel$j"
+    rm -rf "$work_dir"
+    mkdir "$work_dir"
+    cd "$work_dir" || exit
+    echo "start resnet thor training for rank $RANK_ID, device $DEVICE_ID"
+    # Train in the background, then keep only the "===" result lines for the test to parse.
+    (taskset -c "$cmdopt" python "$BASE_PATH/../train_resnet50_thor.py" &> log; grep "===" log > "thor_$i.txt") &
+    cd ..
+done
+wait
+echo "result:"
+# Full log of the first rank (device $rank_start), followed by the extracted
+# result lines of every rank.
+cat "$BASE_PATH/../train_parallel$rank_start/log"
+cat "$BASE_PATH"/../train_parallel*/thor_*.txt
--- a/tests/st/networks/models/resnet50/scripts/run_train.sh
+++ b/tests/st/networks/models/resnet50/scripts/run_train.sh
@ -0,0 +1,20 @@
+#!/bin/bash
+# Copyright 2022 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================
+
+# Launch the ResNet-50 and THOR 4-device jobs side by side.
+# A bare `wait` always returns 0, so each launcher is waited for by PID and a
+# non-zero exit status is propagated to the caller. Only POSIX constructs are
+# used here because the test invokes this script through `sh`.
+BASE_PATH=$(cd "$(dirname "$0")"; pwd)
+bash "$BASE_PATH/run_resnet50_imagenet_4p.sh" &
+resnet_pid=$!
+bash "$BASE_PATH/run_resnet_thor_imagenet_4p.sh" &
+thor_pid=$!
+
+status=0
+wait "$resnet_pid" || status=$?
+wait "$thor_pid" || status=$?
+exit "$status"
--- a/tests/st/networks/models/resnet50/src/callback.py
+++ b/tests/st/networks/models/resnet50/src/callback.py
@ -0,0 +1,70 @@
+# Copyright 2022 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================
+"""custom callback."""
+import time
+import numpy as np
+import mindspore as ms
+from mindspore.train.callback import Callback
+
+
+class LossGet(Callback):
+    """Callback that records the latest training loss and the per-step time of the last epoch.
+
+    Args:
+        per_print_times (int): Record and print the loss every this many global steps;
+            0 disables both.
+        data_size (int): Divisor applied to the epoch wall time to obtain the per-step time.
+
+    Raises:
+        ValueError: If `per_print_times` is not an int or is negative.
+    """
+
+    def __init__(self, per_print_times, data_size):
+        super(LossGet, self).__init__()
+        if not isinstance(per_print_times, int) or per_print_times < 0:
+            raise ValueError("print_step must be int and >= 0.")
+        self._per_print_times = per_print_times
+        self._loss = 0.0
+        self.data_size = data_size
+        self._epoch = 0
+        self.epoch_time = time.time()
+        self._per_step_mseconds = 0
+
+    def step_end(self, run_context):
+        """Validate the loss of the finished step and, on print steps, record and print it.
+
+        Raises:
+            ValueError: If the loss is NaN or infinite.
+        """
+        cb_params = run_context.original_args()
+        loss = cb_params.net_outputs
+        self._epoch = cb_params.cur_epoch_num
+        # When the network returns several outputs, the first one is taken as the loss.
+        if isinstance(loss, (tuple, list)):
+            if isinstance(loss[0], ms.Tensor) and isinstance(loss[0].asnumpy(), np.ndarray):
+                loss = loss[0]
+
+        if isinstance(loss, ms.Tensor) and isinstance(loss.asnumpy(), np.ndarray):
+            loss = np.mean(loss.asnumpy())
+
+        cur_step_in_epoch = (cb_params.cur_step_num - 1) % cb_params.batch_num + 1
+
+        # np.mean returns a NumPy scalar, and only np.float64 subclasses the builtin float.
+        # Accept np.floating too, otherwise a float32/float16 loss would bypass this guard.
+        if isinstance(loss, (float, np.floating)) and (np.isnan(loss) or np.isinf(loss)):
+            raise ValueError("epoch: {} step: {}. Invalid loss, terminating training."
+                             .format(cb_params.cur_epoch_num, cur_step_in_epoch))
+        if self._per_print_times != 0 and cb_params.cur_step_num % self._per_print_times == 0:
+            self._loss = loss
+            print("epoch: %s step: %s, loss is %s" % (cb_params.cur_epoch_num,
+                                                      cur_step_in_epoch, loss), flush=True)
+
+    def epoch_begin(self, run_context):
+        """Remember the wall-clock time at which the epoch started."""
+        self.epoch_time = time.time()
+
+    def epoch_end(self, run_context):
+        """Convert the elapsed epoch time into milliseconds per step."""
+        epoch_mseconds = (time.time() - self.epoch_time) * 1000
+        self._per_step_mseconds = epoch_mseconds / self.data_size
+
+    def get_loss(self):
+        """Return the loss recorded at the most recent print step (0.0 before the first one)."""
+        return self._loss
+
+    def get_per_step_time(self):
+        """Return the per-step time in milliseconds computed at the last epoch end."""
+        return self._per_step_mseconds
+
+    def get_epoch(self):
+        """Return the epoch number seen at the most recent step end."""
+        return self._epoch
--- a/tests/st/networks/models/resnet50/test_resnet50_imagenet_and_thor.py
+++ b/tests/st/networks/models/resnet50/test_resnet50_imagenet_and_thor.py
@ -1,4 +1,4 @@
-# Copyright 2020-2021 Huawei Technologies Co., Ltd
+# Copyright 2022 Huawei Technologies Co., Ltd
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@ -14,332 +14,61 @@
 # ============================================================================

 """train and evaluate resnet50 network on imagenet dataset"""
-
 import os
-import time
-from multiprocessing import Process, Queue
+import shutil
 import pytest
-import numpy as np
-
-from mindspore import context
-from mindspore.common.tensor import Tensor
-from mindspore.communication.management import init
-from mindspore.context import ParallelMode
-from mindspore.train.callback import Callback
-from mindspore.train.model import Model
-from mindspore.train.train_thor import ConvertModelUtils
-from mindspore.train.loss_scale_manager import FixedLossScaleManager
-from mindspore.nn.optim import thor
-import mindspore.dataset as ds
-import mindspore.nn as nn
-
-from tests.st.networks.models.resnet50.src.metric import DistAccuracy, ClassifyCorrectCell
-from tests.st.networks.models.resnet50.src.dataset import create_dataset
-from tests.st.networks.models.resnet50.src.lr_generator import get_learning_rate
-from tests.st.networks.models.resnet50.src.config import config
-from tests.st.networks.models.resnet50.src.CrossEntropySmooth import CrossEntropySmooth
-from tests.st.networks.models.resnet50.src_thor.config import config as thor_config
-from tests.st.networks.models.resnet50.src_thor.dataset import create_dataset2 as create_dataset_thor
-from tests.st.networks.models.resnet50.src.resnet import resnet50
-
-MINDSPORE_HCCL_CONFIG_PATH = "/home/workspace/mindspore_config/hccl/rank_tabel_4p/rank_table_4p_1.json"
-MINDSPORE_HCCL_CONFIG_PATH_2 = "/home/workspace/mindspore_config/hccl/rank_tabel_4p/rank_table_4p_2.json"
-dataset_path = "/home/workspace/mindspore_dataset/imagenet/imagenet_original/train"
-eval_path = "/home/workspace/mindspore_dataset/imagenet/imagenet_original/val"
-
-np.random.seed(1)
-ds.config.set_seed(1)
-os.environ['GLOG_v'] = str(2)


-def get_thor_lr(global_step, lr_init, decay, total_epochs, steps_per_epoch, decay_epochs=100):
-    """get_model_lr"""
-    lr_each_step = []
-    total_steps = steps_per_epoch * total_epochs
-    for i in range(total_steps):
-        epoch = (i + 1) / steps_per_epoch
-        base = (1.0 - float(epoch) / total_epochs) ** decay
-        lr_local = lr_init * base
-        if epoch >= decay_epochs:
-            lr_local = lr_local * 0.5
-        if epoch >= decay_epochs + 1:
-            lr_local = lr_local * 0.5
-        lr_each_step.append(lr_local)
-    current_step = global_step
-    lr_each_step = np.array(lr_each_step).astype(np.float32)
-    learning_rate = lr_each_step[current_step:]
-    return learning_rate
+def get_env_info():
+    """Print host diagnostics by running a fixed list of shell commands, each under a banner."""
+    probes = (
+        ("================== CPU ======================", "top -bi -n 2 -d 0.02"),
+        ("================= IO ====================", "iostat"),
+        ("================= Memory =====================", "free -h"),
+        ("================= Process ====================", "ps -ef | grep python"),
+        ("================= NPU ====================", "npu-smi info"),
+    )
+    for banner, command in probes:
+        print(banner)
+        os.system(command)


-def get_thor_damping(global_step, damping_init, decay_rate, total_epochs, steps_per_epoch):
-    """get_model_damping"""
-    damping_each_step = []
-    total_steps = steps_per_epoch * total_epochs
-    for step in range(total_steps):
-        epoch = (step + 1) / steps_per_epoch
-        damping_here = damping_init * (decay_rate ** (epoch / 10))
-        damping_each_step.append(damping_here)
-    current_step = global_step
-    damping_each_step = np.array(damping_each_step).astype(np.float32)
-    damping_now = damping_each_step[current_step:]
-    return damping_now
-
-
-class LossGet(Callback):
-    def __init__(self, per_print_times, data_size):
-        super(LossGet, self).__init__()
-        if not isinstance(per_print_times, int) or per_print_times < 0:
-            raise ValueError("print_step must be int and >= 0.")
-        self._per_print_times = per_print_times
-        self._loss = 0.0
-        self.data_size = data_size
-        self._epoch = 0
-
-    def step_end(self, run_context):
-        cb_params = run_context.original_args()
-        loss = cb_params.net_outputs
-        self._epoch = cb_params.cur_epoch_num
-        if isinstance(loss, (tuple, list)):
-            if isinstance(loss[0], Tensor) and isinstance(loss[0].asnumpy(), np.ndarray):
-                loss = loss[0]
-
-        if isinstance(loss, Tensor) and isinstance(loss.asnumpy(), np.ndarray):
-            loss = np.mean(loss.asnumpy())
-
-        cur_step_in_epoch = (cb_params.cur_step_num - 1) % cb_params.batch_num + 1
-
-        if isinstance(loss, float) and (np.isnan(loss) or np.isinf(loss)):
-            raise ValueError("epoch: {} step: {}. Invalid loss, terminating training."
-                             .format(cb_params.cur_epoch_num, cur_step_in_epoch))
-        cur_step_in_epoch = (cb_params.cur_step_num - 1) % cb_params.batch_num + 1
-        if self._per_print_times != 0 and cb_params.cur_step_num % self._per_print_times == 0:
-            self._loss = loss
-            print("epoch: %s step: %s, loss is %s" % (cb_params.cur_epoch_num,
-                                                      cur_step_in_epoch, loss), flush=True)
-
-    def epoch_begin(self, run_context):
-        self.epoch_time = time.time()
-
-    def epoch_end(self, run_context):
-        epoch_mseconds = (time.time() - self.epoch_time) * 1000
-        self._per_step_mseconds = epoch_mseconds / self.data_size
-
-    def get_loss(self):
-        return self._loss
-
-    def get_per_step_time(self):
-        return self._per_step_mseconds
-
-    def get_epoch(self):
-        return self._epoch
-
-
-def train_and_eval(device_id, epoch_size, model, dataset, loss_cb, eval_dataset, q):
-    print("run_start", device_id)
-    eval_interval = config.eval_interval
-    step_size = dataset.get_dataset_size()
-    acc = 0.0
-    time_cost = 0.0
-    for epoch_idx in range(0, int(epoch_size / eval_interval)):
-        model.train(1, dataset, callbacks=loss_cb)
-        eval_start = time.time()
-        output = model.eval(eval_dataset)
-        eval_cost = (time.time() - eval_start) * 1000
-        acc = float(output["acc"])
-        time_cost = loss_cb.get_per_step_time()
-        loss = loss_cb.get_loss()
-        print("the {} epoch's resnet result:\n "
-              "device{}, training loss {}, acc {}, "
-              "training per step cost {:.2f} ms, eval cost {:.2f} ms, "
-              "total_cost {:.2f} ms".format(epoch_idx, device_id,
-                                            loss, acc, time_cost,
-                                            eval_cost,
-                                            time_cost * step_size + eval_cost))
-    q.put({'acc': acc, 'cost': time_cost})
-
-
-def train_process(q, device_id, epoch_size, device_num, enable_hccl):
-    os.system("mkdir " + str(device_id))
-    os.chdir(str(device_id))
-    context.set_context(mode=context.GRAPH_MODE, device_target="Ascend", device_id=device_id)
-    os.environ['MINDSPORE_HCCL_CONFIG_PATH'] = MINDSPORE_HCCL_CONFIG_PATH
-    os.environ['RANK_ID'] = str(device_id)
-    os.environ['RANK_SIZE'] = str(device_num)
-    if enable_hccl:
-        context.set_auto_parallel_context(device_num=device_num, parallel_mode=ParallelMode.DATA_PARALLEL,
-                                          gradients_mean=True, all_reduce_fusion_config=[107, 160])
-        init()
-
-    # network
-
-    net = resnet50(class_num=config.class_num)
-
-    # evaluation network
-    dist_eval_network = ClassifyCorrectCell(net)
-
-    if not config.use_label_smooth:
-        config.label_smooth_factor = 0.0
-
-    # loss
-    loss = CrossEntropySmooth(sparse=True, reduction="mean",
-                              smooth_factor=config.label_smooth_factor, num_classes=config.class_num)
-
-    # train dataset
-    dataset = create_dataset(dataset_path=dataset_path, do_train=True, repeat_num=1, batch_size=config.batch_size)
-
-    step_size = dataset.get_dataset_size()
-    # evaluation dataset
-    eval_dataset = create_dataset(dataset_path=eval_path, do_train=False,
-                                  repeat_num=1, batch_size=config.eval_batch_size)
-
-    # loss scale
-    loss_scale = FixedLossScaleManager(config.loss_scale, drop_overflow_update=False)
-
-    # learning rate
-    lr = Tensor(get_learning_rate(lr_init=config.lr_init, lr_end=0.0, lr_max=config.lr_max,
-                                  warmup_epochs=config.warmup_epochs, total_epochs=config.epoch_size,
-                                  steps_per_epoch=step_size, lr_decay_mode=config.lr_decay_mode))
-
-    # optimizer
-    decayed_params = []
-    no_decayed_params = []
-    for param in net.trainable_params():
-        if 'beta' not in param.name and 'gamma' not in param.name and 'bias' not in param.name:
-            decayed_params.append(param)
-        else:
-            no_decayed_params.append(param)
-
-    group_params = [{'params': decayed_params, 'weight_decay': config.weight_decay},
-                    {'params': no_decayed_params, 'weight_decay': 0.0},
-                    {'order_params': net.trainable_params()}]
-
-    if config.use_lars:
-        momentum = nn.Momentum(group_params, lr, config.momentum,
-                               loss_scale=config.loss_scale, use_nesterov=config.use_nesterov)
-        opt = nn.LARS(momentum, epsilon=config.lars_epsilon, coefficient=config.lars_coefficient,
-                      lars_filter=lambda x: 'beta' not in x.name and 'gamma' not in x.name and 'bias' not in x.name)
-
-    else:
-        opt = nn.Momentum(group_params, lr, config.momentum,
-                          loss_scale=config.loss_scale, use_nesterov=config.use_nesterov)
-
-    # model
-    model = Model(net, loss_fn=loss, optimizer=opt,
-                  loss_scale_manager=loss_scale, amp_level="O2", keep_batchnorm_fp32=False,
-                  metrics={'acc': DistAccuracy(batch_size=config.eval_batch_size, device_num=device_num)},
-                  eval_network=dist_eval_network)
-
-    # callbacks
-    loss_cb = LossGet(1, step_size)
-    train_and_eval(device_id, epoch_size, model, dataset, loss_cb, eval_dataset, q)
-
-
-def train_process_thor(q, device_id, epoch_size, device_num, enable_hccl):
-    os.system("mkdir " + str(device_id))
-    os.chdir(str(device_id))
-    context.set_context(mode=context.GRAPH_MODE, device_target="Ascend")
-    context.set_context(device_id=device_id)
-    os.environ['MINDSPORE_HCCL_CONFIG_PATH'] = MINDSPORE_HCCL_CONFIG_PATH_2
-    os.environ['RANK_ID'] = str(device_id - 4)
-    os.environ['RANK_SIZE'] = str(device_num)
-    if enable_hccl:
-        context.set_auto_parallel_context(device_num=device_num, parallel_mode=ParallelMode.DATA_PARALLEL,
-                                          gradients_mean=True, all_reduce_fusion_config=[85, 160])
-        init()
-
-    # network
-    net = resnet50(thor_config.class_num)
-
-    if not thor_config.label_smooth:
-        thor_config.label_smooth_factor = 0.0
-
-    # loss
-    loss = CrossEntropySmooth(sparse=True, reduction="mean", smooth_factor=thor_config.label_smooth_factor,
-                              num_classes=thor_config.class_num)
-
-    # train dataset
-    dataset = create_dataset_thor(dataset_path=dataset_path, do_train=True,
-                                  batch_size=thor_config.batch_size, train_image_size=thor_config.train_image_size,
-                                  eval_image_size=thor_config.eval_image_size, target="Ascend",
-                                  distribute=True)
-    step_size = dataset.get_dataset_size()
-
-    # loss scale
-    loss_scale = FixedLossScaleManager(thor_config.loss_scale, drop_overflow_update=False)
-
-    # learning rate
-    lr = get_thor_lr(0, 0.05803, 4.04839, 53, 5004, decay_epochs=39)
-    damping = get_thor_damping(0, 0.02714, 0.50036, 70, 5004)
-    # optimizer
-    split_indices = [26, 53]
-    opt = thor(net, Tensor(lr), Tensor(damping), thor_config.momentum, thor_config.weight_decay, thor_config.loss_scale,
-               thor_config.batch_size, split_indices=split_indices, frequency=thor_config.frequency)
-
-    # evaluation network
-    dist_eval_network = ClassifyCorrectCell(net)
-    # model
-    model = Model(net, loss_fn=loss, optimizer=opt, loss_scale_manager=loss_scale,
-                  metrics={'acc': DistAccuracy(batch_size=thor_config.eval_batch_size, device_num=device_num)},
-                  amp_level="O2", keep_batchnorm_fp32=False,
-                  eval_network=dist_eval_network)
-
-    model = ConvertModelUtils().convert_to_thor_model(model=model, network=net, loss_fn=loss, optimizer=opt,
-                                                      loss_scale_manager=loss_scale, metrics={'acc'},
-                                                      amp_level="O2", keep_batchnorm_fp32=False)
-
-    # callbacks
-    loss_cb = LossGet(1, step_size)
-
-    # train and eval
-    print("run_start", device_id)
-    model.train(2, dataset, callbacks=loss_cb,
-                sink_size=dataset.get_dataset_size(), dataset_sink_mode=True)
-    time_cost = loss_cb.get_per_step_time()
-    loss = loss_cb.get_loss()
-    epoch_idx = loss_cb.get_epoch()
-    print("the {} epoch's resnet result:\n "
-          "device{}, training loss {}, "
-          "training per step cost {:.2f} ms, total_cost {:.2f} ms".format(epoch_idx, device_id,
-                                                                          loss, time_cost, time_cost * step_size))
-    q.put({'loss': loss, 'cost': time_cost})
-
-
-def resnet_end(device_num, q):
-    acc = 0.0
-    cost = 0.0
-    for i in range(device_num):
-        assert not q.empty()
-        output = q.get()
-        acc += output['acc']
-        cost += output['cost']
-    acc = acc / device_num
-    cost = cost / device_num
-
-    for i in range(device_num):
-        os.system("rm -rf " + str(i))
-    print("End training...")
+def resnet_end():
+    """Average the per-rank ResNet-50 results, check them against the thresholds and clean up.
+
+    Reads `train_parallel{i}/resnet_{i}.txt` for i in 0..3 from the directory of this file.
+    The first line is parsed as the accuracy and the second as the cost, each taken from the
+    text after ": ". The rank directories are removed only when both assertions pass.
+    """
+    acc = 0
+    cost = 0
+    sh_path = os.path.split(os.path.realpath(__file__))[0]
+    for i in range(4):
+        with open(os.path.join(sh_path, f"train_parallel{i}", f"resnet_{i}.txt")) as f:
+            lines = f.readlines()
+            acc += float(lines[0].strip().split(": ")[1])
+            cost += float(lines[1].strip().split(": ")[1])
+    # Mean over the 4 ranks.
+    acc /= 4
+    cost /= 4
+    print(f"resnet acc: {acc}, cost: {cost}")
    assert acc > 0.1
    assert cost < 26
+    # Reached only on success, so a failing run leaves its logs on disk.
+    for i in range(4):
+        shutil.rmtree(os.path.join(sh_path, f"train_parallel{i}"))


-def thor_end(device_num, q):
-    thor_loss = 0.0
-    thor_cost = 0.0
-    for i in range(device_num):
-        output = q.get()
-        thor_loss += output['loss']
-        thor_cost += output['cost']
-    thor_loss = thor_loss / device_num
-    thor_cost = thor_cost / device_num
-
-    for i in range(4, device_num + 4):
-        os.system("rm -rf " + str(i))
-    print("End training...")
+def thor_end():
+    """Average the per-rank THOR results, check them against the thresholds and clean up.
+
+    Reads `train_parallel{i+4}/thor_{i}.txt` for i in 0..3 from the directory of this file.
+    The first line is parsed as the loss and the second as the cost, each taken from the
+    text after ": ". The rank directories are removed only when both assertions pass.
+    """
+    thor_cost = 0
+    thor_loss = 0
+    sh_path = os.path.split(os.path.realpath(__file__))[0]
+    for i in range(4):
+        # Rank i writes into the directory numbered i + 4.
+        with open(os.path.join(sh_path, f"train_parallel{i+4}", f"thor_{i}.txt")) as f:
+            lines = f.readlines()
+            thor_loss += float(lines[0].strip().split(": ")[1])
+            thor_cost += float(lines[1].strip().split(": ")[1])
+    # Mean over the 4 ranks.
+    thor_loss /= 4
+    thor_cost /= 4
+    print(f"resnet thor_loss: {thor_loss}, thor_cost: {thor_cost}")
    assert thor_loss < 7
    assert thor_cost < 30
+    # Reached only on success, so a failing run leaves its logs on disk.
+    for i in range(4):
+        shutil.rmtree(os.path.join(sh_path, f"train_parallel{i+4}"))


-@pytest.mark.level1
+@pytest.mark.level0
@pytest.mark.platform_arm_ascend_training
@pytest.mark.platform_x86_ascend_training
@pytest.mark.env_single
@ -349,46 +78,9 @@ def test_resnet_imagenet_and_thor_4p():
    Description: Train and evaluate resnet50 network on imagenet dataset.
    Expectation: accuracy > 0.1, time cost < 26.
    """
-    context.set_context(enable_graph_kernel=False, enable_sparse=False)
-    context.reset_auto_parallel_context()
-    context.reset_ps_context()
-
-    q = Queue()
-    q2 = Queue()
-    device_num = 4
-    epoch_size = 2
-    epoch_size_2 = 1
-    enable_hccl = True
-    process = []
-    process2 = []
-    for i in range(device_num):
-        device_id = i
-        process.append(Process(target=train_process,
-                               args=(q, device_id, epoch_size, device_num, enable_hccl)))
-        process2.append(Process(target=train_process_thor,
-                                args=(q2, device_id + 4, epoch_size_2, device_num, enable_hccl)))
-    cpu_count = os.cpu_count()
-    half_cpu_count = cpu_count // 2
-    each_cpu_count = half_cpu_count // device_num
-    for i in range(device_num):
-        process[i].start()
-        process2[i].start()
-        if each_cpu_count > 1:
-            cpu_start = each_cpu_count * i
-            cpu_end = each_cpu_count * (i + 1)
-            process_cpu = [x for x in range(cpu_start, cpu_end)]
-            process2_cpu = [x for x in range(cpu_start + half_cpu_count, cpu_end + half_cpu_count)]
-            pid1 = process[i].pid
-            pid2 = process2[i].pid
-            os.sched_setaffinity(pid1, set(process_cpu))
-            os.sched_setaffinity(pid2, set(process2_cpu))
-    print("Waiting for all subprocesses done...")
-
-    for i in range(device_num):
-        process[i].join()
-        process2[i].join()
-    # resnet
-    resnet_end(device_num, q)
-    # thor
-    thor_end(device_num, q2)
-    
+    get_env_info()
+    sh_path = os.path.split(os.path.realpath(__file__))[0]
+    ret = os.system(f"sh {sh_path}/scripts/run_train.sh")
+    assert ret == 0
+    resnet_end()
+    thor_end()
--- a/tests/st/networks/models/resnet50/train_resnet50.py
+++ b/tests/st/networks/models/resnet50/train_resnet50.py
@ -0,0 +1,122 @@
+# Copyright 2022 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================
+"""resnet train & eval case."""
+import os
+import time
+import mindspore as ms
+from mindspore import nn
+from tests.st.networks.models.resnet50.src.callback import LossGet
+from tests.st.networks.models.resnet50.src.config import config
+from tests.st.networks.models.resnet50.src.resnet import resnet50
+from tests.st.networks.models.resnet50.src.metric import DistAccuracy, ClassifyCorrectCell
+from tests.st.networks.models.resnet50.src.dataset import create_dataset
+from tests.st.networks.models.resnet50.src.lr_generator import get_learning_rate
+from tests.st.networks.models.resnet50.src.CrossEntropySmooth import CrossEntropySmooth
+
+# Absolute dataset locations used for training and evaluation.
+TRAIN_PATH = "/home/workspace/mindspore_dataset/imagenet/imagenet_original/train"
+EVAL_PATH = "/home/workspace/mindspore_dataset/imagenet/imagenet_original/val"
+# Fixed global seed set at import time.
+ms.set_seed(1)
+
+
+def get_optimizer(net, step_size):
+    """Create the optimizer for `net`: Momentum, wrapped in LARS when `config.use_lars` is set.
+
+    Args:
+        net: Network whose trainable parameters are optimized.
+        step_size: Steps per epoch, forwarded to the learning-rate schedule.
+
+    Returns:
+        The configured `nn.Momentum` or `nn.LARS` optimizer.
+    """
+    def _is_decayed(param):
+        # Parameters whose name contains beta, gamma or bias get no weight decay
+        # and are excluded by the LARS filter.
+        return 'beta' not in param.name and 'gamma' not in param.name and 'bias' not in param.name
+
+    schedule = get_learning_rate(lr_init=config.lr_init, lr_end=0.0, lr_max=config.lr_max,
+                                 warmup_epochs=config.warmup_epochs, total_epochs=config.epoch_size,
+                                 steps_per_epoch=step_size, lr_decay_mode=config.lr_decay_mode)
+    lr = ms.Tensor(schedule)
+
+    params = net.trainable_params()
+    with_decay = [p for p in params if _is_decayed(p)]
+    without_decay = [p for p in params if not _is_decayed(p)]
+    group_params = [
+        {'params': with_decay, 'weight_decay': config.weight_decay},
+        {'params': without_decay, 'weight_decay': 0.0},
+        {'order_params': net.trainable_params()},
+    ]
+
+    # Both configurations start from the same Momentum optimizer.
+    momentum = nn.Momentum(group_params, lr, config.momentum,
+                           loss_scale=config.loss_scale, use_nesterov=config.use_nesterov)
+    if not config.use_lars:
+        return momentum
+    return nn.LARS(momentum, epsilon=config.lars_epsilon, coefficient=config.lars_coefficient,
+                   lars_filter=_is_decayed)
+
+
+def train_and_eval(device_id, epoch_size, model, dataset, loss_cb, eval_dataset):
+    """Alternate one training epoch with one evaluation pass and print the final metrics.
+
+    Runs `int(epoch_size / config.eval_interval)` rounds. After the last round, the accuracy
+    and the per-step training time are printed on lines prefixed with "===".
+
+    Args:
+        device_id: Device id, used only in the printed messages.
+        epoch_size: Total epoch budget, divided by `config.eval_interval` to get the round count.
+        model: Model providing `train` and `eval`.
+        dataset: Training dataset.
+        loss_cb: Callback providing `get_per_step_time` and `get_loss`.
+        eval_dataset: Evaluation dataset.
+    """
+    print("run_start", device_id)
+    rounds = int(epoch_size / config.eval_interval)
+    steps_per_epoch = dataset.get_dataset_size()
+    report = ("the {} epoch's resnet result:\n "
+              "device{}, training loss {}, acc {}, "
+              "training per step cost {:.2f} ms, eval cost {:.2f} ms, "
+              "total_cost {:.2f} ms")
+    acc = 0.0
+    time_cost = 0.0
+    for epoch_idx in range(rounds):
+        model.train(1, dataset, callbacks=loss_cb)
+        # Time the evaluation pass in milliseconds.
+        eval_begin = time.time()
+        metrics = model.eval(eval_dataset)
+        eval_cost = (time.time() - eval_begin) * 1000
+        acc = float(metrics["acc"])
+        time_cost = loss_cb.get_per_step_time()
+        last_loss = loss_cb.get_loss()
+        total_cost = time_cost * steps_per_epoch + eval_cost
+        print(report.format(epoch_idx, device_id, last_loss, acc, time_cost, eval_cost, total_cost))
+    print(f"===resnet_acc: {acc}")
+    print(f"===resnet_time_cost: {time_cost}")
+
+
+def run_train():
+    """Train and evaluate ResNet-50 in graph mode on Ascend.
+
+    The rank layout is read from the RANK_ID, RANK_SIZE and DEVICE_ID environment variables
+    (defaults 0, 1 and 0). Communication and data-parallel mode are set up only when
+    RANK_SIZE is greater than 1.
+    """
+    ms.context.set_context(mode=ms.GRAPH_MODE, device_target="Ascend")
+    # rank_id is only used in the log line below.
+    rank_id = int(os.getenv('RANK_ID', '0'))
+    device_num = int(os.getenv('RANK_SIZE', '1'))
+    device_id = int(os.getenv('DEVICE_ID', '0'))
+    print(f"run resnet50 device_num:{device_num}, device_id:{device_id}, rank_id:{rank_id}")
+    if device_num > 1:
+        ms.communication.init()
+        ms.context.set_auto_parallel_context(parallel_mode=ms.ParallelMode.DATA_PARALLEL,
+                                             gradients_mean=True, all_reduce_fusion_config=[107, 160])
+    net = resnet50(class_num=config.class_num)
+    # Evaluation wraps the same network object that is trained.
+    dist_eval_network = ClassifyCorrectCell(net)
+
+    # Force the smoothing factor to zero when label smoothing is disabled.
+    if not config.use_label_smooth:
+        config.label_smooth_factor = 0.0
+    loss = CrossEntropySmooth(sparse=True, reduction="mean",
+                              smooth_factor=config.label_smooth_factor, num_classes=config.class_num)
+
+    # dataset
+    dataset = create_dataset(dataset_path=TRAIN_PATH, do_train=True, repeat_num=1, batch_size=config.batch_size)
+    step_size = dataset.get_dataset_size()
+    eval_dataset = create_dataset(dataset_path=EVAL_PATH, do_train=False,
+                                  repeat_num=1, batch_size=config.eval_batch_size)
+
+    loss_scale = ms.FixedLossScaleManager(config.loss_scale, drop_overflow_update=False)
+    opt = get_optimizer(net, step_size)
+
+    model = ms.Model(net, loss_fn=loss, optimizer=opt,
+                     loss_scale_manager=loss_scale, amp_level="O2", keep_batchnorm_fp32=False,
+                     metrics={'acc': DistAccuracy(batch_size=config.eval_batch_size, device_num=device_num)},
+                     eval_network=dist_eval_network)
+    # Record the loss on every step; step_size turns the epoch time into a per-step time.
+    loss_cb = LossGet(1, step_size)
+    # epoch_size=2: train_and_eval runs int(2 / config.eval_interval) train+eval rounds.
+    train_and_eval(device_id, 2, model, dataset, loss_cb, eval_dataset)
+
+# Script entry point: run training only when executed directly, not on import.
+if __name__ == '__main__':
+    run_train()
--- a/tests/st/networks/models/resnet50/train_resnet50_thor.py
+++ b/tests/st/networks/models/resnet50/train_resnet50_thor.py
@ -0,0 +1,133 @@
+# Copyright 2022 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================
+"""ResNet-50 training case using the THOR optimizer."""
+import os
+import numpy as np
+import mindspore as ms
+from mindspore import nn
+from mindspore.train.train_thor import ConvertModelUtils
+from tests.st.networks.models.resnet50.src.callback import LossGet
+from tests.st.networks.models.resnet50.src_thor.config import config as thor_config
+from tests.st.networks.models.resnet50.src_thor.dataset import create_dataset2 as create_dataset_thor
+from tests.st.networks.models.resnet50.src.resnet import resnet50
+from tests.st.networks.models.resnet50.src.metric import DistAccuracy, ClassifyCorrectCell
+from tests.st.networks.models.resnet50.src.CrossEntropySmooth import CrossEntropySmooth
+
+# ImageNet directories used by this case (absolute paths on the test host).
+TRAIN_PATH = "/home/workspace/mindspore_dataset/imagenet/imagenet_original/train"
+# NOTE(review): EVAL_PATH is not referenced anywhere else in this file.
+EVAL_PATH = "/home/workspace/mindspore_dataset/imagenet/imagenet_original/val"
+# Set the global MindSpore seed once at import time, before any network is built.
+ms.set_seed(1)
+
+
+def get_thor_lr(global_step, lr_init, decay, total_epochs, steps_per_epoch, decay_epochs=100):
+    """Build the per-step learning-rate schedule used with THOR.
+
+    Each step's rate is ``lr_init * (1 - epoch / total_epochs) ** decay``, where
+    ``epoch`` is the fractional epoch reached at the end of that step. The rate
+    is halved once ``epoch`` reaches ``decay_epochs`` and halved a second time
+    once it reaches ``decay_epochs + 1``.
+
+    Args:
+        global_step: Index of the first step to keep; earlier entries are sliced off.
+        lr_init: Initial learning rate.
+        decay: Exponent of the polynomial decay.
+        total_epochs: Number of epochs the schedule spans.
+        steps_per_epoch: Number of steps in one epoch.
+        decay_epochs: Epoch at which the extra halving starts.
+
+    Returns:
+        A float32 numpy array with the rates from ``global_step`` onwards.
+    """
+
+    def _rate_at(step_idx):
+        # Fractional epoch reached once this step has completed.
+        epoch = (step_idx + 1) / steps_per_epoch
+        rate = lr_init * ((1.0 - float(epoch) / total_epochs) ** decay)
+        # One halving per threshold crossed, applied in sequence.
+        for threshold in (decay_epochs, decay_epochs + 1):
+            if epoch >= threshold:
+                rate = rate * 0.5
+        return rate
+
+    step_count = steps_per_epoch * total_epochs
+    schedule = np.array([_rate_at(idx) for idx in range(step_count)]).astype(np.float32)
+    return schedule[global_step:]
+
+
+def get_thor_damping(global_step, damping_init, decay_rate, total_epochs, steps_per_epoch):
+    """Build the per-step damping schedule used with THOR.
+
+    Each step's damping is ``damping_init * decay_rate ** (epoch / 10)``, where
+    ``epoch`` is the fractional epoch reached at the end of that step.
+
+    Args:
+        global_step: Index of the first step to keep; earlier entries are sliced off.
+        damping_init: Initial damping value.
+        decay_rate: Base of the exponential decay.
+        total_epochs: Number of epochs the schedule spans.
+        steps_per_epoch: Number of steps in one epoch.
+
+    Returns:
+        A float32 numpy array with the damping values from ``global_step`` onwards.
+    """
+    step_count = steps_per_epoch * total_epochs
+    schedule = [
+        damping_init * (decay_rate ** (((idx + 1) / steps_per_epoch) / 10))
+        for idx in range(step_count)
+    ]
+    return np.array(schedule).astype(np.float32)[global_step:]
+
+
+def run_train():
+    """Train ResNet-50 with the THOR optimizer on Ascend and print loss/timing results.
+
+    The distributed layout is taken from the environment variables ``RANK_ID``,
+    ``RANK_SIZE`` and ``DEVICE_ID`` (defaulting to 0, 1 and 0). The function
+    trains for 2 epochs in dataset-sink mode and prints the values reported by
+    the ``LossGet`` callback, including two ``===``-prefixed summary lines.
+
+    Side effects: sets the global MindSpore context and, when label smoothing
+    is disabled, overwrites ``thor_config.label_smooth_factor`` on the shared config.
+    """
+    ms.context.set_context(mode=ms.GRAPH_MODE, device_target="Ascend")
+    # rank_id is only used in the log line below.
+    rank_id = int(os.getenv('RANK_ID', '0'))
+    device_num = int(os.getenv('RANK_SIZE', '1'))
+    device_id = int(os.getenv('DEVICE_ID', '0'))
+    print(f"run resnet50 thor device_num:{device_num}, device_id:{device_id}, rank_id:{rank_id}")
+    if device_num > 1:
+        # Multi-device run: bring up communication first, then switch to data parallelism
+        # with averaged gradients.
+        ms.communication.init()
+        ms.context.set_auto_parallel_context(parallel_mode=ms.ParallelMode.DATA_PARALLEL,
+                                             gradients_mean=True, all_reduce_fusion_config=[85, 160])
+    net = resnet50(thor_config.class_num)
+
+    # Label smoothing disabled: zero the factor in place so the loss below uses 0.0.
+    if not thor_config.label_smooth:
+        thor_config.label_smooth_factor = 0.0
+
+    # Loss function.
+    loss = CrossEntropySmooth(sparse=True, reduction="mean", smooth_factor=thor_config.label_smooth_factor,
+                              num_classes=thor_config.class_num)
+
+    # Training dataset; distribute=True is passed regardless of device_num.
+    dataset = create_dataset_thor(dataset_path=TRAIN_PATH, do_train=True,
+                                  batch_size=thor_config.batch_size, train_image_size=thor_config.train_image_size,
+                                  eval_image_size=thor_config.eval_image_size, target="Ascend",
+                                  distribute=True)
+    step_size = dataset.get_dataset_size()
+
+    # Fixed loss scale; drop_overflow_update=False is passed explicitly.
+    loss_scale = ms.FixedLossScaleManager(thor_config.loss_scale, drop_overflow_update=False)
+
+    # Learning-rate and damping schedules.
+    # NOTE(review): both schedules use a literal 5004 steps per epoch rather than step_size,
+    # and span 53 and 70 epochs respectively -- presumably tuned baseline values; confirm.
+    lr = get_thor_lr(0, 0.05803, 4.04839, 53, 5004, decay_epochs=39)
+    damping = get_thor_damping(0, 0.02714, 0.50036, 70, 5004)
+    # THOR optimizer.
+    split_indices = [26, 53]
+    opt = nn.thor(net, ms.Tensor(lr), ms.Tensor(damping), thor_config.momentum, thor_config.weight_decay,
+                  thor_config.loss_scale, thor_config.batch_size, split_indices=split_indices,
+                  frequency=thor_config.frequency)
+
+    # Evaluation wrapper around the same network; passed to Model as eval_network below.
+    dist_eval_network = ClassifyCorrectCell(net)
+    # Mixed-precision model (amp_level "O2") with the 'acc' metric built from DistAccuracy.
+    model = ms.Model(net, loss_fn=loss, optimizer=opt, loss_scale_manager=loss_scale,
+                     metrics={'acc': DistAccuracy(batch_size=thor_config.eval_batch_size, device_num=device_num)},
+                     amp_level="O2", keep_batchnorm_fp32=False,
+                     eval_network=dist_eval_network)
+
+    # Rebind `model` to the result of the THOR conversion. The conversion is given the plain
+    # {'acc'} metric set and no eval_network; only training is run below.
+    model = ConvertModelUtils().convert_to_thor_model(model=model, network=net, loss_fn=loss, optimizer=opt,
+                                                      loss_scale_manager=loss_scale, metrics={'acc'},
+                                                      amp_level="O2", keep_batchnorm_fp32=False)
+
+    # Callback that supplies the loss, per-step time and epoch index read back after training.
+    loss_cb = LossGet(1, step_size)
+
+    # Train for 2 epochs in sink mode, sinking step_size steps at a time.
+    print("run_start", device_id)
+    model.train(2, dataset, callbacks=loss_cb, dataset_sink_mode=True, sink_size=step_size)
+    time_cost = loss_cb.get_per_step_time()
+    # From here on `loss` is the value reported by the callback, no longer the loss cell.
+    loss = loss_cb.get_loss()
+    epoch_idx = loss_cb.get_epoch()
+    # total_cost is reported as per-step time multiplied by step_size.
+    print("the {} epoch's resnet result:\n "
+          "device{}, training loss {}, "
+          "training per step cost {:.2f} ms, total_cost {:.2f} ms".format(epoch_idx, device_id,
+                                                                          loss, time_cost, time_cost * step_size))
+    # Summary lines marked with a "===" prefix.
+    print(f"===resnet_thor_loss: {loss}")
+    print(f"===resnet_thor_time_cost: {time_cost}")
+
+# Entry point: run the THOR training case only when this file is executed as a script.
+if __name__ == '__main__':
+    run_train()