!10406 DeepLabV3: support CPU training

From: @caojian05
Reviewed-by: @wuxuejian, @oacjiewen
Signed-off-by: @wuxuejian
Committed-by: mindspore-ci-bot, 2020-12-26 16:24:17 +08:00 (via Gitee)
Commit: ba5f57babf
2 changed files with 55 additions and 3 deletions
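
Summary: the change adds a shell script for launching standalone DeepLabV3 training on CPU, and updates train.py so that the MindSpore context and the AMP level are selected from a new --device_target argument (Ascend remains the default).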

New file: shell script to launch standalone CPU training

@@ -0,0 +1,45 @@
#!/bin/bash
# Copyright 2020 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
export DEVICE_ID=0
export SLOG_PRINT_TO_STDOUT=0
train_path=/PATH/TO/EXPERIMENTS_DIR
train_code_path=/PATH/TO/MODEL_ZOO_CODE
if [ -d ${train_path} ]; then
  rm -rf ${train_path}
fi
mkdir -p ${train_path}
mkdir ${train_path}/device${DEVICE_ID}
mkdir ${train_path}/ckpt
cd ${train_path}/device${DEVICE_ID} || exit
python ${train_code_path}/train.py --data_file=/PATH/TO/MINDRECORD_NAME \
--device_target=CPU \
--train_dir=${train_path}/ckpt \
--train_epochs=200 \
--batch_size=32 \
--crop_size=513 \
--base_lr=0.015 \
--lr_type=cos \
--min_scale=0.5 \
--max_scale=2.0 \
--ignore_label=255 \
--num_classes=21 \
--model=deeplab_v3_s16 \
--ckpt_pre_trained=/PATH/TO/PRETRAIN_MODEL \
--save_steps=1500 \
--keep_checkpoint_max=200 >log 2>&1 &
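
After the /PATH/TO placeholders are filled in, the script can be run directly with bash. Training starts in the background; stdout and stderr are redirected to a file named log inside ${train_path}/device${DEVICE_ID}, and checkpoints are written to ${train_path}/ckpt.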

File changed: train.py

@@ -32,8 +32,6 @@ from src.nets import net_factory
 from src.utils import learning_rates

 set_seed(1)

-context.set_context(mode=context.GRAPH_MODE, enable_auto_mixed_precision=True, save_graphs=False,
-                    device_target="Ascend", device_id=int(os.getenv('DEVICE_ID')))

 class BuildTrainNetwork(nn.Cell):
@@ -77,6 +75,8 @@ def parse_args():
     parser.add_argument('--ckpt_pre_trained', type=str, default='', help='pretrained model')

     # train
+    parser.add_argument('--device_target', type=str, default='Ascend', choices=['Ascend', 'CPU'],
+                        help='device where the code will be implemented. (Default: Ascend)')
     parser.add_argument('--is_distributed', action='store_true', help='distributed training')
     parser.add_argument('--rank', type=int, default=0, help='local rank of distributed')
     parser.add_argument('--group_size', type=int, default=1, help='world size of distributed')
@@ -90,6 +90,12 @@ def parse_args():
 def train():
     args = parse_args()
+    if args.device_target == "CPU":
+        context.set_context(mode=context.GRAPH_MODE, save_graphs=False, device_target="CPU")
+    else:
+        context.set_context(mode=context.GRAPH_MODE, enable_auto_mixed_precision=True, save_graphs=False,
+                            device_target="Ascend", device_id=int(os.getenv('DEVICE_ID')))
+
     # init multicards training
     if args.is_distributed:
         init()
@@ -150,7 +156,8 @@ def train():
     # loss scale
     manager_loss_scale = FixedLossScaleManager(args.loss_scale, drop_overflow_update=False)
-    model = Model(train_net, optimizer=opt, amp_level="O3", loss_scale_manager=manager_loss_scale)
+    amp_level = "O0" if args.device_target == "CPU" else "O3"
+    model = Model(train_net, optimizer=opt, amp_level=amp_level, loss_scale_manager=manager_loss_scale)

     # callback for saving ckpts
     time_cb = TimeMonitor(data_size=iters_per_epoch)
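
Taken together, the two train.py edits reduce to a single dispatch on args.device_target: pick the context settings and the AMP level for the backend, then build the Model as before. A minimal sketch of that pattern follows, using the same mindspore calls as the diff; the helper name configure_for_device is illustrative and does not appear in the commit.

import os
from mindspore import context

def configure_for_device(device_target):
    """Set the MindSpore context for the chosen backend and return
    the AMP level to pass to Model. Illustrative helper only."""
    if device_target == "CPU":
        # CPU: no device_id binding and no auto mixed precision;
        # "O0" keeps the network in FP32.
        context.set_context(mode=context.GRAPH_MODE, save_graphs=False,
                            device_target="CPU")
        return "O0"
    # Ascend: bind the card given by the DEVICE_ID env var and use
    # "O3", which casts the network to FP16.
    context.set_context(mode=context.GRAPH_MODE, enable_auto_mixed_precision=True,
                        save_graphs=False, device_target="Ascend",
                        device_id=int(os.getenv('DEVICE_ID')))
    return "O3"

train() would call this once, up front, and pass the returned amp_level to Model together with the FixedLossScaleManager; the commit inlines the same logic rather than factoring it out.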