!26469 Remove resnet_thor 4p st and add resnet 8p st.

Merge pull request !26469 from linqingke/bug_fix
This commit is contained in:
i-robot 2021-11-19 03:36:28 +00:00 committed by Gitee
commit 87e6cf438c
1 changed files with 11 additions and 155 deletions

View File

@ -1,4 +1,4 @@
# Copyright 2020 Huawei Technologies Co., Ltd
# Copyright 2020-2021 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
@ -28,7 +28,6 @@ from mindspore.train.model import Model
from mindspore.context import ParallelMode
from mindspore.train.callback import Callback
from mindspore.train.loss_scale_manager import FixedLossScaleManager
from mindspore.nn.optim import thor
import mindspore.nn as nn
import mindspore.dataset as ds
@ -38,14 +37,9 @@ from tests.st.networks.models.resnet50.src.lr_generator import get_learning_rate
from tests.st.networks.models.resnet50.src.config import config
from tests.st.networks.models.resnet50.src.metric import DistAccuracy, ClassifyCorrectCell
from tests.st.networks.models.resnet50.src.CrossEntropySmooth import CrossEntropySmooth
from tests.st.networks.models.resnet50.src_thor.config import config as thor_config
from tests.st.networks.models.resnet50.src_thor.dataset import create_dataset as create_dataset_thor
from tests.st.networks.models.resnet50.src_thor.model_thor import Model as THOR_Model
from tests.st.networks.models.resnet50.src_thor.resnet import resnet50 as resnet50_thor
MINDSPORE_HCCL_CONFIG_PATH = "/home/workspace/mindspore_config/hccl/rank_tabel_4p/rank_table_4p_1.json"
MINDSPORE_HCCL_CONFIG_PATH_2 = "/home/workspace/mindspore_config/hccl/rank_tabel_4p/rank_table_4p_2.json"
MINDSPORE_HCCL_CONFIG_PATH = "/home/workspace/mindspore_config/hccl/rank_table_8p.json"
dataset_path = "/home/workspace/mindspore_dataset/imagenet/imagenet_original/train"
eval_path = "/home/workspace/mindspore_dataset/imagenet/imagenet_original/val"
@ -54,39 +48,6 @@ ds.config.set_seed(1)
os.environ['GLOG_v'] = str(2)
def get_thor_lr(global_step, lr_init, decay, total_epochs, steps_per_epoch, decay_epochs=100):
"""get_model_lr"""
lr_each_step = []
total_steps = steps_per_epoch * total_epochs
for i in range(total_steps):
epoch = (i + 1) / steps_per_epoch
base = (1.0 - float(epoch) / total_epochs) ** decay
lr_local = lr_init * base
if epoch >= decay_epochs:
lr_local = lr_local * 0.5
if epoch >= decay_epochs + 1:
lr_local = lr_local * 0.5
lr_each_step.append(lr_local)
current_step = global_step
lr_each_step = np.array(lr_each_step).astype(np.float32)
learning_rate = lr_each_step[current_step:]
return learning_rate
def get_thor_damping(global_step, damping_init, decay_rate, total_epochs, steps_per_epoch):
"""get_model_damping"""
damping_each_step = []
total_steps = steps_per_epoch * total_epochs
for step in range(total_steps):
epoch = (step + 1) / steps_per_epoch
damping_here = damping_init * (decay_rate ** (epoch / 10))
damping_each_step.append(damping_here)
current_step = global_step
damping_each_step = np.array(damping_each_step).astype(np.float32)
damping_now = damping_each_step[current_step:]
return damping_now
class LossGet(Callback):
def __init__(self, per_print_times, data_size):
super(LossGet, self).__init__()
@ -227,134 +188,45 @@ def train_process(q, device_id, epoch_size, device_num, enable_hccl):
q.put({'acc': acc, 'cost': time_cost})
def train_process_thor(q, device_id, epoch_size, device_num, enable_hccl):
os.system("mkdir " + str(device_id))
os.chdir(str(device_id))
context.set_context(mode=context.GRAPH_MODE, device_target="Ascend")
context.set_context(device_id=device_id)
os.environ['MINDSPORE_HCCL_CONFIG_PATH'] = MINDSPORE_HCCL_CONFIG_PATH_2
os.environ['RANK_ID'] = str(device_id - 4)
os.environ['RANK_SIZE'] = str(device_num)
if enable_hccl:
context.set_auto_parallel_context(device_num=device_num, parallel_mode=ParallelMode.DATA_PARALLEL,
gradients_mean=True, all_reduce_fusion_config=[85, 160])
init()
# network
net = resnet50_thor(thor_config.class_num)
if not thor_config.label_smooth:
thor_config.label_smooth_factor = 0.0
# loss
loss = CrossEntropySmooth(sparse=True, reduction="mean", smooth_factor=thor_config.label_smooth_factor,
num_classes=thor_config.class_num)
# train dataset
dataset = create_dataset_thor(dataset_path=dataset_path, do_train=True,
repeat_num=1, batch_size=thor_config.batch_size)
step_size = dataset.get_dataset_size()
eval_interval = thor_config.eval_interval
# evaluation dataset
eval_dataset = create_dataset(dataset_path=eval_path, do_train=False,
repeat_num=1, batch_size=thor_config.eval_batch_size)
# loss scale
loss_scale = FixedLossScaleManager(thor_config.loss_scale, drop_overflow_update=False)
# learning rate
lr = get_thor_lr(0, 0.05803, 4.04839, 53, 5004, decay_epochs=39)
damping = get_thor_damping(0, 0.02714, 0.50036, 70, 5004)
# optimizer
split_indices = [26, 53]
opt = thor(net, Tensor(lr), Tensor(damping), thor_config.momentum, thor_config.weight_decay, thor_config.loss_scale,
thor_config.batch_size, split_indices=split_indices, frequency=thor_config.frequency)
# evaluation network
dist_eval_network = ClassifyCorrectCell(net)
# model
model = THOR_Model(net, loss_fn=loss, optimizer=opt, loss_scale_manager=loss_scale, amp_level="O2",
keep_batchnorm_fp32=False,
metrics={'acc': DistAccuracy(batch_size=thor_config.eval_batch_size, device_num=device_num)},
eval_network=dist_eval_network, frequency=thor_config.frequency)
# model init
print("init_start", device_id)
model.init(dataset, eval_dataset)
print("init_stop", device_id)
# callbacks
loss_cb = LossGet(1, step_size)
# train and eval
acc = 0.0
time_cost = 0.0
print("run_start", device_id)
for epoch_idx in range(0, int(epoch_size / eval_interval)):
model.train(eval_interval, dataset, callbacks=loss_cb)
eval_start = time.time()
output = model.eval(eval_dataset)
eval_cost = (time.time() - eval_start) * 1000
acc = float(output["acc"])
time_cost = loss_cb.get_per_step_time()
loss = loss_cb.get_loss()
print("the {} epoch's resnet result:\n "
"device{}, training loss {}, acc {}, "
"training per step cost {:.2f} ms, eval cost {:.2f} ms, total_cost {:.2f} ms".format(
epoch_idx, device_id, loss, acc, time_cost, eval_cost, time_cost * step_size + eval_cost))
q.put({'acc': acc, 'cost': time_cost})
@pytest.mark.level0
@pytest.mark.platform_arm_ascend_training
@pytest.mark.platform_x86_ascend_training
@pytest.mark.env_single
def test_resnet_and_resnet_thor_imagenet_4p():
# reset context
def test_resnet_imagenet_8p():
"""
Feature: Resnet50 network.
Description: Train and evaluate resnet50 network on imagenet dataset.
Expectation: accuracy > 0.1, time cost < 26.
"""
context.set_context(enable_graph_kernel=False, enable_sparse=False)
context.reset_auto_parallel_context()
context.reset_ps_context()
q = Queue()
q2 = Queue()
# resnet50
device_num = 4
device_num = 8
epoch_size = 2
epoch_size_2 = 1
enable_hccl = True
process = []
process2 = []
for i in range(device_num):
device_id = i
process.append(Process(target=train_process,
args=(q, device_id, epoch_size, device_num, enable_hccl)))
process2.append(Process(target=train_process_thor,
args=(q2, device_id + 4, epoch_size_2, device_num, enable_hccl)))
cpu_count = os.cpu_count()
half_cpu_count = cpu_count // 2
each_cpu_count = half_cpu_count // device_num
each_cpu_count = cpu_count // device_num
for i in range(device_num):
process[i].start()
process2[i].start()
if each_cpu_count > 1:
cpu_start = each_cpu_count * i
cpu_end = each_cpu_count * (i + 1)
process_cpu = [x for x in range(cpu_start, cpu_end)]
process2_cpu = [x for x in range(cpu_start + half_cpu_count, cpu_end + half_cpu_count)]
pid1 = process[i].pid
pid2 = process2[i].pid
os.sched_setaffinity(pid1, set(process_cpu))
os.sched_setaffinity(pid2, set(process2_cpu))
print("Waiting for all subprocesses done...")
for i in range(device_num):
process[i].join()
process2[i].join()
# resnet
acc = 0.0
@ -370,21 +242,5 @@ def test_resnet_and_resnet_thor_imagenet_4p():
for i in range(device_num):
os.system("rm -rf " + str(i))
print("End training...")
assert acc > 0.15
assert acc > 0.1
assert cost < 26
# THOR
thor_acc = 0.0
thor_cost = 0.0
for i in range(device_num):
output = q2.get()
thor_acc += output['acc']
thor_cost += output['cost']
thor_acc = thor_acc / device_num
thor_cost = thor_cost / device_num
for i in range(4, device_num + 4):
os.system("rm -rf " + str(i))
print("End training...")
assert thor_acc > 0.25
assert thor_cost < 22