forked from mindspore-Ecosystem/mindspore

!10406 deeplabv3 support cpu training

From: @caojian05
Reviewed-by: @wuxuejian, @oacjiewen
Signed-off-by: @wuxuejian

Commit ba5f57babf
@@ -0,0 +1,45 @@
+#!/bin/bash
+# Copyright 2020 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================
+
+export DEVICE_ID=0
+export SLOG_PRINT_TO_STDOUT=0
+train_path=/PATH/TO/EXPERIMENTS_DIR
+train_code_path=/PATH/TO/MODEL_ZOO_CODE
+
+if [ -d ${train_path} ]; then
+  rm -rf ${train_path}
+fi
+mkdir -p ${train_path}
+mkdir ${train_path}/device${DEVICE_ID}
+mkdir ${train_path}/ckpt
+cd ${train_path}/device${DEVICE_ID} || exit
+
+python ${train_code_path}/train.py --data_file=/PATH/TO/MINDRECORD_NAME \
+                    --device_target=CPU \
+                    --train_dir=${train_path}/ckpt \
+                    --train_epochs=200 \
+                    --batch_size=32 \
+                    --crop_size=513 \
+                    --base_lr=0.015 \
+                    --lr_type=cos \
+                    --min_scale=0.5 \
+                    --max_scale=2.0 \
+                    --ignore_label=255 \
+                    --num_classes=21 \
+                    --model=deeplab_v3_s16 \
+                    --ckpt_pre_trained=/PATH/TO/PRETRAIN_MODEL \
+                    --save_steps=1500 \
+                    --keep_checkpoint_max=200 >log 2>&1 &
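For reference only, a minimal Python sketch of the same single-device CPU launch, mirroring the flags the shell script above passes to train.py; every /PATH/TO/... value is a placeholder as in the script, and launching this way is an assumption of this note, not part of the commit.

# Sketch: run the patched train.py on CPU with the same arguments as the shell script.
# All /PATH/TO/... strings are placeholders to be filled in before running.
import subprocess

cmd = [
    "python", "/PATH/TO/MODEL_ZOO_CODE/train.py",
    "--data_file=/PATH/TO/MINDRECORD_NAME",
    "--device_target=CPU",                    # the flag introduced by this commit
    "--train_dir=/PATH/TO/EXPERIMENTS_DIR/ckpt",
    "--train_epochs=200",
    "--batch_size=32",
    "--crop_size=513",
    "--base_lr=0.015",
    "--lr_type=cos",
    "--min_scale=0.5",
    "--max_scale=2.0",
    "--ignore_label=255",
    "--num_classes=21",
    "--model=deeplab_v3_s16",
    "--ckpt_pre_trained=/PATH/TO/PRETRAIN_MODEL",
    "--save_steps=1500",
    "--keep_checkpoint_max=200",
]
subprocess.run(cmd, check=True)

Only --device_target=CPU is new; the remaining flags match the existing Ascend launch arguments, as the train.py diff below shows.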
@@ -32,8 +32,6 @@ from src.nets import net_factory
 from src.utils import learning_rates
 
 set_seed(1)
-context.set_context(mode=context.GRAPH_MODE, enable_auto_mixed_precision=True, save_graphs=False,
-                    device_target="Ascend", device_id=int(os.getenv('DEVICE_ID')))
 
 
 class BuildTrainNetwork(nn.Cell):
@@ -77,6 +75,8 @@ def parse_args():
     parser.add_argument('--ckpt_pre_trained', type=str, default='', help='pretrained model')
 
     # train
+    parser.add_argument('--device_target', type=str, default='Ascend', choices=['Ascend', 'CPU'],
+                        help='device where the code will be implemented. (Default: Ascend)')
     parser.add_argument('--is_distributed', action='store_true', help='distributed training')
     parser.add_argument('--rank', type=int, default=0, help='local rank of distributed')
     parser.add_argument('--group_size', type=int, default=1, help='world size of distributed')
@@ -90,6 +90,12 @@ def parse_args():
 def train():
     args = parse_args()
 
+    if args.device_target == "CPU":
+        context.set_context(mode=context.GRAPH_MODE, save_graphs=False, device_target="CPU")
+    else:
+        context.set_context(mode=context.GRAPH_MODE, enable_auto_mixed_precision=True, save_graphs=False,
+                            device_target="Ascend", device_id=int(os.getenv('DEVICE_ID')))
+
     # init multicards training
     if args.is_distributed:
         init()
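A standalone sketch of the context selection added in train() above: plain graph mode on CPU, auto mixed precision plus an explicit device id on Ascend. The helper name configure_context and the DEVICE_ID fallback of 0 are illustrative, not part of the commit.

# Illustrative restatement of the device-dependent context setup in train().
import os
from mindspore import context


def configure_context(device_target="CPU"):
    if device_target == "CPU":
        # CPU path: no mixed precision and no device id needed.
        context.set_context(mode=context.GRAPH_MODE, save_graphs=False,
                            device_target="CPU")
    else:
        # Ascend path: auto mixed precision, bound to DEVICE_ID (fallback 0 is illustrative).
        context.set_context(mode=context.GRAPH_MODE, enable_auto_mixed_precision=True,
                            save_graphs=False, device_target="Ascend",
                            device_id=int(os.getenv('DEVICE_ID', '0')))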
@@ -150,7 +156,8 @@ def train():
 
     # loss scale
     manager_loss_scale = FixedLossScaleManager(args.loss_scale, drop_overflow_update=False)
-    model = Model(train_net, optimizer=opt, amp_level="O3", loss_scale_manager=manager_loss_scale)
+    amp_level = "O0" if args.device_target == "CPU" else "O3"
+    model = Model(train_net, optimizer=opt, amp_level=amp_level, loss_scale_manager=manager_loss_scale)
 
     # callback for saving ckpts
     time_cb = TimeMonitor(data_size=iters_per_epoch)
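A self-contained sketch of the new amp_level wiring above: "O0" (no mixed precision) on CPU, "O3" on Ascend. The Dense network, MSELoss, Momentum optimizer, and the loss-scale value 1024 are stand-ins for the real DeepLab train network, optimizer, and args.loss_scale.

# Sketch of how the chosen amp_level feeds into Model construction.
import mindspore.nn as nn
from mindspore.train.model import Model
from mindspore.train.loss_scale_manager import FixedLossScaleManager

device_target = "CPU"                          # or "Ascend"
net = nn.Dense(16, 4)                          # stand-in for the DeepLab train network
opt = nn.Momentum(net.trainable_params(), learning_rate=0.015, momentum=0.9)

manager_loss_scale = FixedLossScaleManager(1024, drop_overflow_update=False)  # stand-in for args.loss_scale
amp_level = "O0" if device_target == "CPU" else "O3"
model = Model(net, loss_fn=nn.MSELoss(), optimizer=opt,
              amp_level=amp_level, loss_scale_manager=manager_loss_scale)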