forked from mindspore-Ecosystem/mindspore

!10406 deeplabv3 support cpu training

From: @caojian05
Reviewed-by: @wuxuejian, @oacjiewen
Signed-off-by: @wuxuejian

Commit ba5f57babf
@@ -0,0 +1,45 @@
+#!/bin/bash
+# Copyright 2020 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================
+
+export DEVICE_ID=0
+export SLOG_PRINT_TO_STDOUT=0
+train_path=/PATH/TO/EXPERIMENTS_DIR
+train_code_path=/PATH/TO/MODEL_ZOO_CODE
+
+if [ -d ${train_path} ]; then
+  rm -rf ${train_path}
+fi
+mkdir -p ${train_path}
+mkdir ${train_path}/device${DEVICE_ID}
+mkdir ${train_path}/ckpt
+cd ${train_path}/device${DEVICE_ID} || exit
+
+python ${train_code_path}/train.py --data_file=/PATH/TO/MINDRECORD_NAME \
+                    --device_target=CPU \
+                    --train_dir=${train_path}/ckpt \
+                    --train_epochs=200 \
+                    --batch_size=32 \
+                    --crop_size=513 \
+                    --base_lr=0.015 \
+                    --lr_type=cos \
+                    --min_scale=0.5 \
+                    --max_scale=2.0 \
+                    --ignore_label=255 \
+                    --num_classes=21 \
+                    --model=deeplab_v3_s16 \
+                    --ckpt_pre_trained=/PATH/TO/PRETRAIN_MODEL \
+                    --save_steps=1500 \
+                    --keep_checkpoint_max=200 >log 2>&1 &
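For reference only, a minimal Python sketch of the same single-device CPU launch, mirroring the flags the shell script above passes to train.py; every /PATH/TO/... value is a placeholder as in the script, and launching this way is an assumption of this note, not part of the commit.

# Sketch: run the patched train.py on CPU with the same arguments as the shell script.
# All /PATH/TO/... strings are placeholders to be filled in before running.
import subprocess

cmd = [
    "python", "/PATH/TO/MODEL_ZOO_CODE/train.py",
    "--data_file=/PATH/TO/MINDRECORD_NAME",
    "--device_target=CPU",                    # the flag introduced by this commit
    "--train_dir=/PATH/TO/EXPERIMENTS_DIR/ckpt",
    "--train_epochs=200",
    "--batch_size=32",
    "--crop_size=513",
    "--base_lr=0.015",
    "--lr_type=cos",
    "--min_scale=0.5",
    "--max_scale=2.0",
    "--ignore_label=255",
    "--num_classes=21",
    "--model=deeplab_v3_s16",
    "--ckpt_pre_trained=/PATH/TO/PRETRAIN_MODEL",
    "--save_steps=1500",
    "--keep_checkpoint_max=200",
]
subprocess.run(cmd, check=True)

Only --device_target=CPU is new; the remaining flags match the existing Ascend launch arguments, as the train.py diff below shows.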
@@ -32,8 +32,6 @@ from src.nets import net_factory
 from src.utils import learning_rates
 
 set_seed(1)
-context.set_context(mode=context.GRAPH_MODE, enable_auto_mixed_precision=True, save_graphs=False,
-                    device_target="Ascend", device_id=int(os.getenv('DEVICE_ID')))
 
 
 class BuildTrainNetwork(nn.Cell):
@@ -77,6 +75,8 @@ def parse_args():
     parser.add_argument('--ckpt_pre_trained', type=str, default='', help='pretrained model')
 
     # train
+    parser.add_argument('--device_target', type=str, default='Ascend', choices=['Ascend', 'CPU'],
+                        help='device where the code will be implemented. (Default: Ascend)')
     parser.add_argument('--is_distributed', action='store_true', help='distributed training')
     parser.add_argument('--rank', type=int, default=0, help='local rank of distributed')
     parser.add_argument('--group_size', type=int, default=1, help='world size of distributed')
@@ -90,6 +90,12 @@ def parse_args():
 def train():
     args = parse_args()
 
+    if args.device_target == "CPU":
+        context.set_context(mode=context.GRAPH_MODE, save_graphs=False, device_target="CPU")
+    else:
+        context.set_context(mode=context.GRAPH_MODE, enable_auto_mixed_precision=True, save_graphs=False,
+                            device_target="Ascend", device_id=int(os.getenv('DEVICE_ID')))
+
     # init multicards training
     if args.is_distributed:
         init()
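A standalone sketch of the context selection added in train() above: plain graph mode on CPU, auto mixed precision plus an explicit device id on Ascend. The helper name configure_context and the DEVICE_ID fallback of 0 are illustrative, not part of the commit.

# Illustrative restatement of the device-dependent context setup in train().
import os
from mindspore import context


def configure_context(device_target="CPU"):
    if device_target == "CPU":
        # CPU path: no mixed precision and no device id needed.
        context.set_context(mode=context.GRAPH_MODE, save_graphs=False,
                            device_target="CPU")
    else:
        # Ascend path: auto mixed precision, bound to DEVICE_ID (fallback 0 is illustrative).
        context.set_context(mode=context.GRAPH_MODE, enable_auto_mixed_precision=True,
                            save_graphs=False, device_target="Ascend",
                            device_id=int(os.getenv('DEVICE_ID', '0')))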
@@ -150,7 +156,8 @@ def train():
 
     # loss scale
     manager_loss_scale = FixedLossScaleManager(args.loss_scale, drop_overflow_update=False)
-    model = Model(train_net, optimizer=opt, amp_level="O3", loss_scale_manager=manager_loss_scale)
+    amp_level = "O0" if args.device_target == "CPU" else "O3"
+    model = Model(train_net, optimizer=opt, amp_level=amp_level, loss_scale_manager=manager_loss_scale)
 
     # callback for saving ckpts
     time_cb = TimeMonitor(data_size=iters_per_epoch)
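A self-contained sketch of the new amp_level wiring above: "O0" (no mixed precision) on CPU, "O3" on Ascend. The Dense network, MSELoss, Momentum optimizer, and the loss-scale value 1024 are stand-ins for the real DeepLab train network, optimizer, and args.loss_scale.

# Sketch of how the chosen amp_level feeds into Model construction.
import mindspore.nn as nn
from mindspore.train.model import Model
from mindspore.train.loss_scale_manager import FixedLossScaleManager

device_target = "CPU"                          # or "Ascend"
net = nn.Dense(16, 4)                          # stand-in for the DeepLab train network
opt = nn.Momentum(net.trainable_params(), learning_rate=0.015, momentum=0.9)

manager_loss_scale = FixedLossScaleManager(1024, drop_overflow_update=False)  # stand-in for args.loss_scale
amp_level = "O0" if device_target == "CPU" else "O3"
model = Model(net, loss_fn=nn.MSELoss(), optimizer=opt,
              amp_level=amp_level, loss_scale_manager=manager_loss_scale)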