diff --git a/model_zoo/official/cv/deeplabv3/scripts/run_standalone_train_cpu.sh b/model_zoo/official/cv/deeplabv3/scripts/run_standalone_train_cpu.sh
new file mode 100644
index 00000000000..4f8ae47b1ed
--- /dev/null
+++ b/model_zoo/official/cv/deeplabv3/scripts/run_standalone_train_cpu.sh
@@ -0,0 +1,45 @@
+#!/bin/bash
+# Copyright 2020 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================
+
+export DEVICE_ID=0
+export SLOG_PRINT_TO_STDOUT=0
+train_path=/PATH/TO/EXPERIMENTS_DIR
+train_code_path=/PATH/TO/MODEL_ZOO_CODE
+
+# Quote every expansion: an unquoted ${train_path} under `rm -rf` would
+# word-split/glob if the user sets a path containing spaces (SC2086/SC2115).
+if [ -d "${train_path}" ]; then
+    rm -rf -- "${train_path}"
+fi
+mkdir -p "${train_path}"
+mkdir "${train_path}/device${DEVICE_ID}"
+mkdir "${train_path}/ckpt"
+cd "${train_path}/device${DEVICE_ID}" || exit
+
+python "${train_code_path}"/train.py --data_file=/PATH/TO/MINDRECORD_NAME \
+                    --device_target=CPU \
+                    --train_dir="${train_path}/ckpt" \
+                    --train_epochs=200 \
+                    --batch_size=32 \
+                    --crop_size=513 \
+                    --base_lr=0.015 \
+                    --lr_type=cos \
+                    --min_scale=0.5 \
+                    --max_scale=2.0 \
+                    --ignore_label=255 \
+                    --num_classes=21 \
+                    --model=deeplab_v3_s16 \
+                    --ckpt_pre_trained=/PATH/TO/PRETRAIN_MODEL \
+                    --save_steps=1500 \
+                    --keep_checkpoint_max=200 >log 2>&1 &
diff --git a/model_zoo/official/cv/deeplabv3/train.py b/model_zoo/official/cv/deeplabv3/train.py
index 3313fc190b8..aa2fe888e16
--- a/model_zoo/official/cv/deeplabv3/train.py
+++ b/model_zoo/official/cv/deeplabv3/train.py
@@ -32,8 +32,6 @@ from src.nets import net_factory
 from src.utils import learning_rates
 
 set_seed(1)
-context.set_context(mode=context.GRAPH_MODE, enable_auto_mixed_precision=True, save_graphs=False,
-                    device_target="Ascend", device_id=int(os.getenv('DEVICE_ID')))
 
 
 class BuildTrainNetwork(nn.Cell):
@@ -77,6 +75,8 @@
     parser.add_argument('--ckpt_pre_trained', type=str, default='', help='pretrained model')
 
     # train
+    parser.add_argument('--device_target', type=str, default='Ascend', choices=['Ascend', 'CPU'],
+                        help='device where the code will be implemented. (Default: Ascend)')
     parser.add_argument('--is_distributed', action='store_true', help='distributed training')
     parser.add_argument('--rank', type=int, default=0, help='local rank of distributed')
     parser.add_argument('--group_size', type=int, default=1, help='world size of distributed')
@@ -90,6 +90,12 @@ def parse_args():
 def train():
     args = parse_args()
+    if args.device_target == "CPU":
+        context.set_context(mode=context.GRAPH_MODE, save_graphs=False, device_target="CPU")
+    else:
+        context.set_context(mode=context.GRAPH_MODE, enable_auto_mixed_precision=True, save_graphs=False,
+                            device_target="Ascend", device_id=int(os.getenv('DEVICE_ID', '0')))
+
 
     # init multicards training
     if args.is_distributed:
         init()
@@ -150,7 +156,8 @@ def train():
 
     # loss scale
     manager_loss_scale = FixedLossScaleManager(args.loss_scale, drop_overflow_update=False)
-    model = Model(train_net, optimizer=opt, amp_level="O3", loss_scale_manager=manager_loss_scale)
+    amp_level = "O0" if args.device_target == "CPU" else "O3"
+    model = Model(train_net, optimizer=opt, amp_level=amp_level, loss_scale_manager=manager_loss_scale)
 
     # callback for saving ckpts
     time_cb = TimeMonitor(data_size=iters_per_epoch)