forked from mindspore-Ecosystem/mindspore
fix resnet50 & thor tests
This commit is contained in:
parent c52ef8ed33
commit 96607251ee
scripts/run_resnet50_imagenet_4p.sh
@@ -0,0 +1,44 @@
#!/bin/bash
# Copyright 2022 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================

BASE_PATH=$(cd "$(dirname $0)"; pwd)

export RANK_SIZE=4
export RANK_TABLE_FILE="/home/workspace/mindspore_config/hccl/rank_tabel_4p/rank_table_4p_1.json"

cpus=`cat /proc/cpuinfo | grep "processor" | wc -l`
avg=`expr $cpus \/ 8`
gap=`expr $avg \- 1`
rank_start=0
for((i=0; i<$RANK_SIZE; i++))
do
    j=$((rank_start + i))
    start=`expr $j \* $avg`
    end=`expr $start \+ $gap`
    cmdopt=$start"-"$end
    export DEVICE_ID=$((rank_start + i))
    export RANK_ID=${i}
    rm -rf $BASE_PATH/../train_parallel$j
    mkdir $BASE_PATH/../train_parallel$j
    cd $BASE_PATH/../train_parallel$j || exit
    echo "start resnet training for rank $RANK_ID, device $DEVICE_ID"
    (taskset -c $cmdopt python $BASE_PATH/../train_resnet50.py &> log; grep "===" log > resnet_$i.txt) &
    cd ..
done
wait
echo "result:"
cat $BASE_PATH/../train_parallel0/log
cat $BASE_PATH/../train_parallel*/resnet_*.txt
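For intuition, the taskset range computed above carves the host's CPUs into 8 equal blocks of avg cores and pins rank i to block rank_start + i. A quick illustrative check of that arithmetic in Python (the 96-CPU host is an assumption, not part of the commit):

    # Sketch of the core-pinning arithmetic above, assuming 96 logical CPUs.
    cpus = 96
    avg = cpus // 8      # 12 cores per block, matching `expr $cpus \/ 8`
    gap = avg - 1        # 11, so each range spans exactly avg cores
    for i in range(4):   # this script uses rank_start=0
        start = (0 + i) * avg
        print(f"rank {i} -> taskset -c {start}-{start + gap}")
    # rank 0 -> 0-11, rank 1 -> 12-23, rank 2 -> 24-35, rank 3 -> 36-47;
    # the thor script below uses rank_start=4 and so gets 48-59 ... 84-95.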
scripts/run_resnet_thor_imagenet_4p.sh
@@ -0,0 +1,44 @@
#!/bin/bash
# Copyright 2022 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================

BASE_PATH=$(cd "$(dirname $0)"; pwd)

export RANK_SIZE=4
export RANK_TABLE_FILE="/home/workspace/mindspore_config/hccl/rank_tabel_4p/rank_table_4p_2.json"

cpus=`cat /proc/cpuinfo | grep "processor" | wc -l`
avg=`expr $cpus \/ 8`
gap=`expr $avg \- 1`
rank_start=4
for((i=0; i<$RANK_SIZE; i++))
do
    j=$((rank_start + i))
    start=`expr $j \* $avg`
    end=`expr $start \+ $gap`
    cmdopt=$start"-"$end
    export DEVICE_ID=$((rank_start + i))
    export RANK_ID=${i}
    rm -rf $BASE_PATH/../train_parallel$j
    mkdir $BASE_PATH/../train_parallel$j
    cd $BASE_PATH/../train_parallel$j || exit
    echo "start resnet thor training for rank $RANK_ID, device $DEVICE_ID"
    (taskset -c $cmdopt python $BASE_PATH/../train_resnet50_thor.py &> log; grep "===" log > thor_$i.txt) &
    cd ..
done
wait
echo "result:"
cat $BASE_PATH/../train_parallel5/log
cat $BASE_PATH/../train_parallel*/thor_*.txt
scripts/run_train.sh
@@ -0,0 +1,20 @@
#!/bin/bash
# Copyright 2022 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================

BASE_PATH=$(cd "$(dirname $0)"; pwd)
bash $BASE_PATH/run_resnet50_imagenet_4p.sh &
bash $BASE_PATH/run_resnet_thor_imagenet_4p.sh &
wait
src/callback.py
@@ -0,0 +1,70 @@
# Copyright 2022 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
"""custom callback."""
import time
import numpy as np
import mindspore as ms
from mindspore.train.callback import Callback


class LossGet(Callback):
    def __init__(self, per_print_times, data_size):
        super(LossGet, self).__init__()
        if not isinstance(per_print_times, int) or per_print_times < 0:
            raise ValueError("print_step must be int and >= 0.")
        self._per_print_times = per_print_times
        self._loss = 0.0
        self.data_size = data_size
        self._epoch = 0
        self.epoch_time = time.time()
        self._per_step_mseconds = 0

    def step_end(self, run_context):
        cb_params = run_context.original_args()
        loss = cb_params.net_outputs
        self._epoch = cb_params.cur_epoch_num
        if isinstance(loss, (tuple, list)):
            if isinstance(loss[0], ms.Tensor) and isinstance(loss[0].asnumpy(), np.ndarray):
                loss = loss[0]

        if isinstance(loss, ms.Tensor) and isinstance(loss.asnumpy(), np.ndarray):
            loss = np.mean(loss.asnumpy())

        cur_step_in_epoch = (cb_params.cur_step_num - 1) % cb_params.batch_num + 1

        if isinstance(loss, float) and (np.isnan(loss) or np.isinf(loss)):
            raise ValueError("epoch: {} step: {}. Invalid loss, terminating training."
                             .format(cb_params.cur_epoch_num, cur_step_in_epoch))
        cur_step_in_epoch = (cb_params.cur_step_num - 1) % cb_params.batch_num + 1
        if self._per_print_times != 0 and cb_params.cur_step_num % self._per_print_times == 0:
            self._loss = loss
            print("epoch: %s step: %s, loss is %s" % (cb_params.cur_epoch_num,
                                                      cur_step_in_epoch, loss), flush=True)

    def epoch_begin(self, run_context):
        self.epoch_time = time.time()

    def epoch_end(self, run_context):
        epoch_mseconds = (time.time() - self.epoch_time) * 1000
        self._per_step_mseconds = epoch_mseconds / self.data_size

    def get_loss(self):
        return self._loss

    def get_per_step_time(self):
        return self._per_step_mseconds

    def get_epoch(self):
        return self._epoch
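A minimal usage sketch of LossGet (assuming a built model and dataset, as in the train scripts added below; not part of the committed file):

    # Hypothetical wiring, mirroring how the new train scripts consume LossGet.
    step_size = dataset.get_dataset_size()
    loss_cb = LossGet(per_print_times=1, data_size=step_size)
    model.train(1, dataset, callbacks=loss_cb)
    print(loss_cb.get_loss())           # last loss recorded in step_end
    print(loss_cb.get_per_step_time())  # ms per step, computed in epoch_end
    print(loss_cb.get_epoch())          # last epoch number seen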
@@ -1,4 +1,4 @@
-# Copyright 2020-2021 Huawei Technologies Co., Ltd
+# Copyright 2022 Huawei Technologies Co., Ltd
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -14,332 +14,61 @@
 # ============================================================================

 """train and evaluate resnet50 network on imagenet dataset"""

 import os
-import time
-from multiprocessing import Process, Queue
+import shutil
 import pytest
-import numpy as np

-from mindspore import context
-from mindspore.common.tensor import Tensor
-from mindspore.communication.management import init
-from mindspore.context import ParallelMode
-from mindspore.train.callback import Callback
-from mindspore.train.model import Model
-from mindspore.train.train_thor import ConvertModelUtils
-from mindspore.train.loss_scale_manager import FixedLossScaleManager
-from mindspore.nn.optim import thor
-import mindspore.dataset as ds
-import mindspore.nn as nn
-
-from tests.st.networks.models.resnet50.src.metric import DistAccuracy, ClassifyCorrectCell
-from tests.st.networks.models.resnet50.src.dataset import create_dataset
-from tests.st.networks.models.resnet50.src.lr_generator import get_learning_rate
-from tests.st.networks.models.resnet50.src.config import config
-from tests.st.networks.models.resnet50.src.CrossEntropySmooth import CrossEntropySmooth
-from tests.st.networks.models.resnet50.src_thor.config import config as thor_config
-from tests.st.networks.models.resnet50.src_thor.dataset import create_dataset2 as create_dataset_thor
-from tests.st.networks.models.resnet50.src.resnet import resnet50
-
-MINDSPORE_HCCL_CONFIG_PATH = "/home/workspace/mindspore_config/hccl/rank_tabel_4p/rank_table_4p_1.json"
-MINDSPORE_HCCL_CONFIG_PATH_2 = "/home/workspace/mindspore_config/hccl/rank_tabel_4p/rank_table_4p_2.json"
-dataset_path = "/home/workspace/mindspore_dataset/imagenet/imagenet_original/train"
-eval_path = "/home/workspace/mindspore_dataset/imagenet/imagenet_original/val"
-
-np.random.seed(1)
-ds.config.set_seed(1)
-os.environ['GLOG_v'] = str(2)


-def get_thor_lr(global_step, lr_init, decay, total_epochs, steps_per_epoch, decay_epochs=100):
-    """get_model_lr"""
-    lr_each_step = []
-    total_steps = steps_per_epoch * total_epochs
-    for i in range(total_steps):
-        epoch = (i + 1) / steps_per_epoch
-        base = (1.0 - float(epoch) / total_epochs) ** decay
-        lr_local = lr_init * base
-        if epoch >= decay_epochs:
-            lr_local = lr_local * 0.5
-        if epoch >= decay_epochs + 1:
-            lr_local = lr_local * 0.5
-        lr_each_step.append(lr_local)
-    current_step = global_step
-    lr_each_step = np.array(lr_each_step).astype(np.float32)
-    learning_rate = lr_each_step[current_step:]
-    return learning_rate
+def get_env_info():
+    print("================== CPU ======================")
+    os.system("top -bi -n 2 -d 0.02")
+    print("================= IO ====================")
+    os.system("iostat")
+    print("================= Memory =====================")
+    os.system("free -h")
+    print("================= Process ====================")
+    os.system("ps -ef | grep python")
+    print("================= NPU ====================")
+    os.system("npu-smi info")


-def get_thor_damping(global_step, damping_init, decay_rate, total_epochs, steps_per_epoch):
-    """get_model_damping"""
-    damping_each_step = []
-    total_steps = steps_per_epoch * total_epochs
-    for step in range(total_steps):
-        epoch = (step + 1) / steps_per_epoch
-        damping_here = damping_init * (decay_rate ** (epoch / 10))
-        damping_each_step.append(damping_here)
-    current_step = global_step
-    damping_each_step = np.array(damping_each_step).astype(np.float32)
-    damping_now = damping_each_step[current_step:]
-    return damping_now
-
-
-class LossGet(Callback):
-    def __init__(self, per_print_times, data_size):
-        super(LossGet, self).__init__()
-        if not isinstance(per_print_times, int) or per_print_times < 0:
-            raise ValueError("print_step must be int and >= 0.")
-        self._per_print_times = per_print_times
-        self._loss = 0.0
-        self.data_size = data_size
-        self._epoch = 0
-
-    def step_end(self, run_context):
-        cb_params = run_context.original_args()
-        loss = cb_params.net_outputs
-        self._epoch = cb_params.cur_epoch_num
-        if isinstance(loss, (tuple, list)):
-            if isinstance(loss[0], Tensor) and isinstance(loss[0].asnumpy(), np.ndarray):
-                loss = loss[0]
-
-        if isinstance(loss, Tensor) and isinstance(loss.asnumpy(), np.ndarray):
-            loss = np.mean(loss.asnumpy())
-
-        cur_step_in_epoch = (cb_params.cur_step_num - 1) % cb_params.batch_num + 1
-
-        if isinstance(loss, float) and (np.isnan(loss) or np.isinf(loss)):
-            raise ValueError("epoch: {} step: {}. Invalid loss, terminating training."
-                             .format(cb_params.cur_epoch_num, cur_step_in_epoch))
-        cur_step_in_epoch = (cb_params.cur_step_num - 1) % cb_params.batch_num + 1
-        if self._per_print_times != 0 and cb_params.cur_step_num % self._per_print_times == 0:
-            self._loss = loss
-            print("epoch: %s step: %s, loss is %s" % (cb_params.cur_epoch_num,
-                                                      cur_step_in_epoch, loss), flush=True)
-
-    def epoch_begin(self, run_context):
-        self.epoch_time = time.time()
-
-    def epoch_end(self, run_context):
-        epoch_mseconds = (time.time() - self.epoch_time) * 1000
-        self._per_step_mseconds = epoch_mseconds / self.data_size
-
-    def get_loss(self):
-        return self._loss
-
-    def get_per_step_time(self):
-        return self._per_step_mseconds
-
-    def get_epoch(self):
-        return self._epoch
-
-
-def train_and_eval(device_id, epoch_size, model, dataset, loss_cb, eval_dataset, q):
-    print("run_start", device_id)
-    eval_interval = config.eval_interval
-    step_size = dataset.get_dataset_size()
-    acc = 0.0
-    time_cost = 0.0
-    for epoch_idx in range(0, int(epoch_size / eval_interval)):
-        model.train(1, dataset, callbacks=loss_cb)
-        eval_start = time.time()
-        output = model.eval(eval_dataset)
-        eval_cost = (time.time() - eval_start) * 1000
-        acc = float(output["acc"])
-        time_cost = loss_cb.get_per_step_time()
-        loss = loss_cb.get_loss()
-        print("the {} epoch's resnet result:\n "
-              "device{}, training loss {}, acc {}, "
-              "training per step cost {:.2f} ms, eval cost {:.2f} ms, "
-              "total_cost {:.2f} ms".format(epoch_idx, device_id,
-                                            loss, acc, time_cost,
-                                            eval_cost,
-                                            time_cost * step_size + eval_cost))
-    q.put({'acc': acc, 'cost': time_cost})
-
-
-def train_process(q, device_id, epoch_size, device_num, enable_hccl):
-    os.system("mkdir " + str(device_id))
-    os.chdir(str(device_id))
-    context.set_context(mode=context.GRAPH_MODE, device_target="Ascend", device_id=device_id)
-    os.environ['MINDSPORE_HCCL_CONFIG_PATH'] = MINDSPORE_HCCL_CONFIG_PATH
-    os.environ['RANK_ID'] = str(device_id)
-    os.environ['RANK_SIZE'] = str(device_num)
-    if enable_hccl:
-        context.set_auto_parallel_context(device_num=device_num, parallel_mode=ParallelMode.DATA_PARALLEL,
-                                          gradients_mean=True, all_reduce_fusion_config=[107, 160])
-        init()
-
-    # network
-    net = resnet50(class_num=config.class_num)
-
-    # evaluation network
-    dist_eval_network = ClassifyCorrectCell(net)
-
-    if not config.use_label_smooth:
-        config.label_smooth_factor = 0.0
-
-    # loss
-    loss = CrossEntropySmooth(sparse=True, reduction="mean",
-                              smooth_factor=config.label_smooth_factor, num_classes=config.class_num)
-
-    # train dataset
-    dataset = create_dataset(dataset_path=dataset_path, do_train=True, repeat_num=1, batch_size=config.batch_size)
-
-    step_size = dataset.get_dataset_size()
-    # evaluation dataset
-    eval_dataset = create_dataset(dataset_path=eval_path, do_train=False,
-                                  repeat_num=1, batch_size=config.eval_batch_size)
-
-    # loss scale
-    loss_scale = FixedLossScaleManager(config.loss_scale, drop_overflow_update=False)
-
-    # learning rate
-    lr = Tensor(get_learning_rate(lr_init=config.lr_init, lr_end=0.0, lr_max=config.lr_max,
-                                  warmup_epochs=config.warmup_epochs, total_epochs=config.epoch_size,
-                                  steps_per_epoch=step_size, lr_decay_mode=config.lr_decay_mode))
-
-    # optimizer
-    decayed_params = []
-    no_decayed_params = []
-    for param in net.trainable_params():
-        if 'beta' not in param.name and 'gamma' not in param.name and 'bias' not in param.name:
-            decayed_params.append(param)
-        else:
-            no_decayed_params.append(param)
-
-    group_params = [{'params': decayed_params, 'weight_decay': config.weight_decay},
-                    {'params': no_decayed_params, 'weight_decay': 0.0},
-                    {'order_params': net.trainable_params()}]
-
-    if config.use_lars:
-        momentum = nn.Momentum(group_params, lr, config.momentum,
-                               loss_scale=config.loss_scale, use_nesterov=config.use_nesterov)
-        opt = nn.LARS(momentum, epsilon=config.lars_epsilon, coefficient=config.lars_coefficient,
-                      lars_filter=lambda x: 'beta' not in x.name and 'gamma' not in x.name and 'bias' not in x.name)
-
-    else:
-        opt = nn.Momentum(group_params, lr, config.momentum,
-                          loss_scale=config.loss_scale, use_nesterov=config.use_nesterov)
-
-    # model
-    model = Model(net, loss_fn=loss, optimizer=opt,
-                  loss_scale_manager=loss_scale, amp_level="O2", keep_batchnorm_fp32=False,
-                  metrics={'acc': DistAccuracy(batch_size=config.eval_batch_size, device_num=device_num)},
-                  eval_network=dist_eval_network)
-
-    # callbacks
-    loss_cb = LossGet(1, step_size)
-    train_and_eval(device_id, epoch_size, model, dataset, loss_cb, eval_dataset, q)
-
-
-def train_process_thor(q, device_id, epoch_size, device_num, enable_hccl):
-    os.system("mkdir " + str(device_id))
-    os.chdir(str(device_id))
-    context.set_context(mode=context.GRAPH_MODE, device_target="Ascend")
-    context.set_context(device_id=device_id)
-    os.environ['MINDSPORE_HCCL_CONFIG_PATH'] = MINDSPORE_HCCL_CONFIG_PATH_2
-    os.environ['RANK_ID'] = str(device_id - 4)
-    os.environ['RANK_SIZE'] = str(device_num)
-    if enable_hccl:
-        context.set_auto_parallel_context(device_num=device_num, parallel_mode=ParallelMode.DATA_PARALLEL,
-                                          gradients_mean=True, all_reduce_fusion_config=[85, 160])
-        init()
-
-    # network
-    net = resnet50(thor_config.class_num)
-
-    if not thor_config.label_smooth:
-        thor_config.label_smooth_factor = 0.0
-
-    # loss
-    loss = CrossEntropySmooth(sparse=True, reduction="mean", smooth_factor=thor_config.label_smooth_factor,
-                              num_classes=thor_config.class_num)
-
-    # train dataset
-    dataset = create_dataset_thor(dataset_path=dataset_path, do_train=True,
-                                  batch_size=thor_config.batch_size, train_image_size=thor_config.train_image_size,
-                                  eval_image_size=thor_config.eval_image_size, target="Ascend",
-                                  distribute=True)
-    step_size = dataset.get_dataset_size()
-
-    # loss scale
-    loss_scale = FixedLossScaleManager(thor_config.loss_scale, drop_overflow_update=False)
-
-    # learning rate
-    lr = get_thor_lr(0, 0.05803, 4.04839, 53, 5004, decay_epochs=39)
-    damping = get_thor_damping(0, 0.02714, 0.50036, 70, 5004)
-    # optimizer
-    split_indices = [26, 53]
-    opt = thor(net, Tensor(lr), Tensor(damping), thor_config.momentum, thor_config.weight_decay, thor_config.loss_scale,
-               thor_config.batch_size, split_indices=split_indices, frequency=thor_config.frequency)
-
-    # evaluation network
-    dist_eval_network = ClassifyCorrectCell(net)
-    # model
-    model = Model(net, loss_fn=loss, optimizer=opt, loss_scale_manager=loss_scale,
-                  metrics={'acc': DistAccuracy(batch_size=thor_config.eval_batch_size, device_num=device_num)},
-                  amp_level="O2", keep_batchnorm_fp32=False,
-                  eval_network=dist_eval_network)
-
-    model = ConvertModelUtils().convert_to_thor_model(model=model, network=net, loss_fn=loss, optimizer=opt,
-                                                      loss_scale_manager=loss_scale, metrics={'acc'},
-                                                      amp_level="O2", keep_batchnorm_fp32=False)
-
-    # callbacks
-    loss_cb = LossGet(1, step_size)
-
-    # train and eval
-    print("run_start", device_id)
-    model.train(2, dataset, callbacks=loss_cb,
-                sink_size=dataset.get_dataset_size(), dataset_sink_mode=True)
-    time_cost = loss_cb.get_per_step_time()
-    loss = loss_cb.get_loss()
-    epoch_idx = loss_cb.get_epoch()
-    print("the {} epoch's resnet result:\n "
-          "device{}, training loss {}, "
-          "training per step cost {:.2f} ms, total_cost {:.2f} ms".format(epoch_idx, device_id,
-                                                                          loss, time_cost, time_cost * step_size))
-    q.put({'loss': loss, 'cost': time_cost})
-
-
-def resnet_end(device_num, q):
-    acc = 0.0
-    cost = 0.0
-    for i in range(device_num):
-        assert not q.empty()
-        output = q.get()
-        acc += output['acc']
-        cost += output['cost']
-    acc = acc / device_num
-    cost = cost / device_num
-
-    for i in range(device_num):
-        os.system("rm -rf " + str(i))
-    print("End training...")
+def resnet_end():
+    acc = 0
+    cost = 0
+    sh_path = os.path.split(os.path.realpath(__file__))[0]
+    for i in range(4):
+        with open(os.path.join(sh_path, f"train_parallel{i}", f"resnet_{i}.txt")) as f:
+            lines = f.readlines()
+        acc += float(lines[0].strip().split(": ")[1])
+        cost += float(lines[1].strip().split(": ")[1])
+    acc /= 4
+    cost /= 4
+    print(f"resnet acc: {acc}, cost: {cost}")
     assert acc > 0.1
     assert cost < 26
+    for i in range(4):
+        shutil.rmtree(os.path.join(sh_path, f"train_parallel{i}"))


-def thor_end(device_num, q):
-    thor_loss = 0.0
-    thor_cost = 0.0
-    for i in range(device_num):
-        output = q.get()
-        thor_loss += output['loss']
-        thor_cost += output['cost']
-    thor_loss = thor_loss / device_num
-    thor_cost = thor_cost / device_num
-
-    for i in range(4, device_num + 4):
-        os.system("rm -rf " + str(i))
-    print("End training...")
+def thor_end():
+    thor_cost = 0
+    thor_loss = 0
+    sh_path = os.path.split(os.path.realpath(__file__))[0]
+    for i in range(4):
+        with open(os.path.join(sh_path, f"train_parallel{i+4}", f"thor_{i}.txt")) as f:
+            lines = f.readlines()
+        thor_loss += float(lines[0].strip().split(": ")[1])
+        thor_cost += float(lines[1].strip().split(": ")[1])
+    thor_loss /= 4
+    thor_cost /= 4
+    print(f"resnet thor_loss: {thor_loss}, thor_cost: {thor_cost}")
     assert thor_loss < 7
     assert thor_cost < 30
+    for i in range(4):
+        shutil.rmtree(os.path.join(sh_path, f"train_parallel{i+4}"))


-@pytest.mark.level1
+@pytest.mark.level0
 @pytest.mark.platform_arm_ascend_training
 @pytest.mark.platform_x86_ascend_training
 @pytest.mark.env_single
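The resnet_{i}.txt and thor_{i}.txt files read above are produced by the launch scripts' grep "===" log, so each should hold the two marker lines printed by the new train scripts. A sketch of the parsing, with made-up numbers:

    # Hypothetical contents of train_parallel0/resnet_0.txt after a run:
    lines = ["===resnet_acc: 0.125\n", "===resnet_time_cost: 21.4\n"]
    acc = float(lines[0].strip().split(": ")[1])   # 0.125
    cost = float(lines[1].strip().split(": ")[1])  # 21.4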
@@ -349,46 +78,9 @@ def test_resnet_imagenet_and_thor_4p():
     Description: Train and evaluate resnet50 network on imagenet dataset.
     Expectation: accuracy > 0.1, time cost < 26.
     """
-    context.set_context(enable_graph_kernel=False, enable_sparse=False)
-    context.reset_auto_parallel_context()
-    context.reset_ps_context()
-
-    q = Queue()
-    q2 = Queue()
-    device_num = 4
-    epoch_size = 2
-    epoch_size_2 = 1
-    enable_hccl = True
-    process = []
-    process2 = []
-    for i in range(device_num):
-        device_id = i
-        process.append(Process(target=train_process,
-                               args=(q, device_id, epoch_size, device_num, enable_hccl)))
-        process2.append(Process(target=train_process_thor,
-                                args=(q2, device_id + 4, epoch_size_2, device_num, enable_hccl)))
-    cpu_count = os.cpu_count()
-    half_cpu_count = cpu_count // 2
-    each_cpu_count = half_cpu_count // device_num
-    for i in range(device_num):
-        process[i].start()
-        process2[i].start()
-        if each_cpu_count > 1:
-            cpu_start = each_cpu_count * i
-            cpu_end = each_cpu_count * (i + 1)
-            process_cpu = [x for x in range(cpu_start, cpu_end)]
-            process2_cpu = [x for x in range(cpu_start + half_cpu_count, cpu_end + half_cpu_count)]
-            pid1 = process[i].pid
-            pid2 = process2[i].pid
-            os.sched_setaffinity(pid1, set(process_cpu))
-            os.sched_setaffinity(pid2, set(process2_cpu))
-    print("Waiting for all subprocesses done...")
-
-    for i in range(device_num):
-        process[i].join()
-        process2[i].join()
-    # resnet
-    resnet_end(device_num, q)
-    # thor
-    thor_end(device_num, q2)
+    get_env_info()
+    sh_path = os.path.split(os.path.realpath(__file__))[0]
+    ret = os.system(f"sh {sh_path}/scripts/run_train.sh")
+    assert ret == 0
+    resnet_end()
+    thor_end()
train_resnet50.py
@@ -0,0 +1,122 @@
# Copyright 2022 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
"""resnet train & eval case."""
import os
import time
import mindspore as ms
from mindspore import nn
from tests.st.networks.models.resnet50.src.callback import LossGet
from tests.st.networks.models.resnet50.src.config import config
from tests.st.networks.models.resnet50.src.resnet import resnet50
from tests.st.networks.models.resnet50.src.metric import DistAccuracy, ClassifyCorrectCell
from tests.st.networks.models.resnet50.src.dataset import create_dataset
from tests.st.networks.models.resnet50.src.lr_generator import get_learning_rate
from tests.st.networks.models.resnet50.src.CrossEntropySmooth import CrossEntropySmooth

TRAIN_PATH = "/home/workspace/mindspore_dataset/imagenet/imagenet_original/train"
EVAL_PATH = "/home/workspace/mindspore_dataset/imagenet/imagenet_original/val"
ms.set_seed(1)


def get_optimizer(net, step_size):
    # optimizer
    lr = ms.Tensor(get_learning_rate(lr_init=config.lr_init, lr_end=0.0, lr_max=config.lr_max,
                                     warmup_epochs=config.warmup_epochs, total_epochs=config.epoch_size,
                                     steps_per_epoch=step_size, lr_decay_mode=config.lr_decay_mode))
    decayed_params = []
    no_decayed_params = []
    for param in net.trainable_params():
        if 'beta' not in param.name and 'gamma' not in param.name and 'bias' not in param.name:
            decayed_params.append(param)
        else:
            no_decayed_params.append(param)

    group_params = [{'params': decayed_params, 'weight_decay': config.weight_decay},
                    {'params': no_decayed_params, 'weight_decay': 0.0},
                    {'order_params': net.trainable_params()}]

    if config.use_lars:
        momentum = nn.Momentum(group_params, lr, config.momentum,
                               loss_scale=config.loss_scale, use_nesterov=config.use_nesterov)
        opt = nn.LARS(momentum, epsilon=config.lars_epsilon, coefficient=config.lars_coefficient,
                      lars_filter=lambda x: 'beta' not in x.name and 'gamma' not in x.name and 'bias' not in x.name)
    else:
        opt = nn.Momentum(group_params, lr, config.momentum,
                          loss_scale=config.loss_scale, use_nesterov=config.use_nesterov)
    return opt


def train_and_eval(device_id, epoch_size, model, dataset, loss_cb, eval_dataset):
    print("run_start", device_id)
    eval_interval = config.eval_interval
    step_size = dataset.get_dataset_size()
    acc = 0.0
    time_cost = 0.0
    for epoch_idx in range(0, int(epoch_size / eval_interval)):
        model.train(1, dataset, callbacks=loss_cb)
        eval_start = time.time()
        output = model.eval(eval_dataset)
        eval_cost = (time.time() - eval_start) * 1000
        acc = float(output["acc"])
        time_cost = loss_cb.get_per_step_time()
        loss = loss_cb.get_loss()
        print("the {} epoch's resnet result:\n "
              "device{}, training loss {}, acc {}, "
              "training per step cost {:.2f} ms, eval cost {:.2f} ms, "
              "total_cost {:.2f} ms".format(epoch_idx, device_id,
                                            loss, acc, time_cost,
                                            eval_cost,
                                            time_cost * step_size + eval_cost))
    print(f"===resnet_acc: {acc}")
    print(f"===resnet_time_cost: {time_cost}")


def run_train():
    ms.context.set_context(mode=ms.GRAPH_MODE, device_target="Ascend")
    rank_id = int(os.getenv('RANK_ID', '0'))
    device_num = int(os.getenv('RANK_SIZE', '1'))
    device_id = int(os.getenv('DEVICE_ID', '0'))
    print(f"run resnet50 device_num:{device_num}, device_id:{device_id}, rank_id:{rank_id}")
    if device_num > 1:
        ms.communication.init()
        ms.context.set_auto_parallel_context(parallel_mode=ms.ParallelMode.DATA_PARALLEL,
                                             gradients_mean=True, all_reduce_fusion_config=[107, 160])
    net = resnet50(class_num=config.class_num)
    dist_eval_network = ClassifyCorrectCell(net)

    if not config.use_label_smooth:
        config.label_smooth_factor = 0.0
    loss = CrossEntropySmooth(sparse=True, reduction="mean",
                              smooth_factor=config.label_smooth_factor, num_classes=config.class_num)

    # dataset
    dataset = create_dataset(dataset_path=TRAIN_PATH, do_train=True, repeat_num=1, batch_size=config.batch_size)
    step_size = dataset.get_dataset_size()
    eval_dataset = create_dataset(dataset_path=EVAL_PATH, do_train=False,
                                  repeat_num=1, batch_size=config.eval_batch_size)

    loss_scale = ms.FixedLossScaleManager(config.loss_scale, drop_overflow_update=False)
    opt = get_optimizer(net, step_size)

    model = ms.Model(net, loss_fn=loss, optimizer=opt,
                     loss_scale_manager=loss_scale, amp_level="O2", keep_batchnorm_fp32=False,
                     metrics={'acc': DistAccuracy(batch_size=config.eval_batch_size, device_num=device_num)},
                     eval_network=dist_eval_network)
    loss_cb = LossGet(1, step_size)
    train_and_eval(device_id, 2, model, dataset, loss_cb, eval_dataset)


if __name__ == '__main__':
    run_train()
train_resnet50_thor.py
@@ -0,0 +1,133 @@
# Copyright 2022 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
"""resnet train & eval case."""
import os
import numpy as np
import mindspore as ms
from mindspore import nn
from mindspore.train.train_thor import ConvertModelUtils
from tests.st.networks.models.resnet50.src.callback import LossGet
from tests.st.networks.models.resnet50.src_thor.config import config as thor_config
from tests.st.networks.models.resnet50.src_thor.dataset import create_dataset2 as create_dataset_thor
from tests.st.networks.models.resnet50.src.resnet import resnet50
from tests.st.networks.models.resnet50.src.metric import DistAccuracy, ClassifyCorrectCell
from tests.st.networks.models.resnet50.src.CrossEntropySmooth import CrossEntropySmooth

TRAIN_PATH = "/home/workspace/mindspore_dataset/imagenet/imagenet_original/train"
EVAL_PATH = "/home/workspace/mindspore_dataset/imagenet/imagenet_original/val"
ms.set_seed(1)


def get_thor_lr(global_step, lr_init, decay, total_epochs, steps_per_epoch, decay_epochs=100):
    """get_model_lr"""
    lr_each_step = []
    total_steps = steps_per_epoch * total_epochs
    for i in range(total_steps):
        epoch = (i + 1) / steps_per_epoch
        base = (1.0 - float(epoch) / total_epochs) ** decay
        lr_local = lr_init * base
        if epoch >= decay_epochs:
            lr_local = lr_local * 0.5
        if epoch >= decay_epochs + 1:
            lr_local = lr_local * 0.5
        lr_each_step.append(lr_local)
    current_step = global_step
    lr_each_step = np.array(lr_each_step).astype(np.float32)
    learning_rate = lr_each_step[current_step:]
    return learning_rate


def get_thor_damping(global_step, damping_init, decay_rate, total_epochs, steps_per_epoch):
    """get_model_damping"""
    damping_each_step = []
    total_steps = steps_per_epoch * total_epochs
    for step in range(total_steps):
        epoch = (step + 1) / steps_per_epoch
        damping_here = damping_init * (decay_rate ** (epoch / 10))
        damping_each_step.append(damping_here)
    current_step = global_step
    damping_each_step = np.array(damping_each_step).astype(np.float32)
    damping_now = damping_each_step[current_step:]
    return damping_now


def run_train():
    ms.context.set_context(mode=ms.GRAPH_MODE, device_target="Ascend")
    rank_id = int(os.getenv('RANK_ID', '0'))
    device_num = int(os.getenv('RANK_SIZE', '1'))
    device_id = int(os.getenv('DEVICE_ID', '0'))
    print(f"run resnet50 thor device_num:{device_num}, device_id:{device_id}, rank_id:{rank_id}")
    if device_num > 1:
        ms.communication.init()
        ms.context.set_auto_parallel_context(parallel_mode=ms.ParallelMode.DATA_PARALLEL,
                                             gradients_mean=True, all_reduce_fusion_config=[85, 160])
    net = resnet50(thor_config.class_num)

    if not thor_config.label_smooth:
        thor_config.label_smooth_factor = 0.0

    # loss
    loss = CrossEntropySmooth(sparse=True, reduction="mean", smooth_factor=thor_config.label_smooth_factor,
                              num_classes=thor_config.class_num)

    # train dataset
    dataset = create_dataset_thor(dataset_path=TRAIN_PATH, do_train=True,
                                  batch_size=thor_config.batch_size, train_image_size=thor_config.train_image_size,
                                  eval_image_size=thor_config.eval_image_size, target="Ascend",
                                  distribute=True)
    step_size = dataset.get_dataset_size()

    # loss scale
    loss_scale = ms.FixedLossScaleManager(thor_config.loss_scale, drop_overflow_update=False)

    # learning rate
    lr = get_thor_lr(0, 0.05803, 4.04839, 53, 5004, decay_epochs=39)
    damping = get_thor_damping(0, 0.02714, 0.50036, 70, 5004)
    # optimizer
    split_indices = [26, 53]
    opt = nn.thor(net, ms.Tensor(lr), ms.Tensor(damping), thor_config.momentum, thor_config.weight_decay,
                  thor_config.loss_scale, thor_config.batch_size, split_indices=split_indices,
                  frequency=thor_config.frequency)

    # evaluation network
    dist_eval_network = ClassifyCorrectCell(net)
    # model
    model = ms.Model(net, loss_fn=loss, optimizer=opt, loss_scale_manager=loss_scale,
                     metrics={'acc': DistAccuracy(batch_size=thor_config.eval_batch_size, device_num=device_num)},
                     amp_level="O2", keep_batchnorm_fp32=False,
                     eval_network=dist_eval_network)

    model = ConvertModelUtils().convert_to_thor_model(model=model, network=net, loss_fn=loss, optimizer=opt,
                                                      loss_scale_manager=loss_scale, metrics={'acc'},
                                                      amp_level="O2", keep_batchnorm_fp32=False)

    # callbacks
    loss_cb = LossGet(1, step_size)

    # train and eval
    print("run_start", device_id)
    model.train(2, dataset, callbacks=loss_cb, dataset_sink_mode=True, sink_size=step_size)
    time_cost = loss_cb.get_per_step_time()
    loss = loss_cb.get_loss()
    epoch_idx = loss_cb.get_epoch()
    print("the {} epoch's resnet result:\n "
          "device{}, training loss {}, "
          "training per step cost {:.2f} ms, total_cost {:.2f} ms".format(epoch_idx, device_id,
                                                                          loss, time_cost, time_cost * step_size))
    print(f"===resnet_thor_loss: {loss}")
    print(f"===resnet_thor_time_cost: {time_cost}")


if __name__ == '__main__':
    run_train()
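A rough sanity check of the THOR schedules above (illustrative only; assumes get_thor_lr and get_thor_damping from this file are in scope):

    lr = get_thor_lr(0, 0.05803, 4.04839, 53, 5004, decay_epochs=39)
    damping = get_thor_damping(0, 0.02714, 0.50036, 70, 5004)
    print(lr.shape)      # (265212,): 53 epochs * 5004 steps per epoch
    print(lr[0])         # ~0.058: polynomial decay from lr_init, with
                         # extra halvings once epoch >= 39 and >= 40
    print(damping[0])    # ~0.027: multiplied by 0.50036 every 10 epochs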