Modify FaceRecognition net for cloud.

2021-04-28 15:02:15 +08:00 · 2021-04-28 15:02:15 +08:00 · 25c2227faf
parent a0fe698e61
commit 25c2227faf
17 changed files with 831 additions and 288 deletions
--- a/model_zoo/research/cv/FaceRecognition/README.md
+++ b/model_zoo/research/cv/FaceRecognition/README.md
@ -84,7 +84,6 @@ The entire code structure is as following:
  │   │   ├── head.py                       // head unit
  │   │   ├── resnet.py                     // resnet architecture
  │   ├── callback_factory.py               // callback logging
-  │   ├── config.py                         // parameter configuration
  │   ├── custom_dataset.py                 // custom dataset and sampler
  │   ├── custom_net.py                     // custom cell define
  │   ├── dataset_factory.py                // creating dataset
@ -94,6 +93,15 @@ The entire code structure is as following:
  │   ├── lrsche_factory.py                 // learning rate schedule
  │   ├── me_init.py                        // network parameter init method
  │   ├── metric_factory.py                 // metric fc layer
+  ├─ utils
+  │   ├── __init__.py                       // init file
+  │   ├── config.py                         // parameter analysis
+  │   ├── device_adapter.py                 // device adapter
+  │   ├── local_adapter.py                  // local adapter
+  │   ├── moxing_adapter.py                 // moxing adapter
+  ├─ base_config.yaml                       // parameter configuration
+  ├─ beta_config.yaml                       // parameter configuration
+  ├─ inference_config.yaml                  // parameter configuration
  ├─ train.py                               // training scripts
  ├─ eval.py                                // evaluation scripts
  └─ export.py                              // export air model
@ -163,6 +171,47 @@ The entire code structure is as following:
      sh run_distribute_train_beta.sh ./rank_table_8p.json
      ```

+- ModelArts (If you want to run in modelarts, please check the official documentation of [modelarts](https://support.huaweicloud.com/modelarts/), and you can start training as follows)
+
+    - base model
+
+      ```python
+      # (1) Add "config_path='/path_to_code/base_config.yaml'" on the website UI interface.
+      # (2) Perform a or b.
+      #       a. Set "enable_modelarts=True" on base_config.yaml file.
+      #          Set "is_distributed=1" on base_config.yaml file.
+      #          Set other parameters on base_config.yaml file you need.
+      #       b. Add "enable_modelarts=True" on the website UI interface.
+      #          Add "is_distributed=1" on the website UI interface.
+      #          Add other parameters on the website UI interface.
+      # (3) Upload a zip dataset to S3 bucket. (You could also upload the original dataset, but it can be very slow.)
+      # (4) Set the code directory to "/path/FaceRecognition" on the website UI interface.
+      # (5) Set the startup file to "train.py" on the website UI interface.
+      # (6) Set the "Dataset path" and "Output file path" and "Job log path" to your path on the website UI interface.
+      # (7) Create your job.
+      ```
+
+    - beta model
+
+      ```python
+      # (1) Copy or upload your trained model to S3 bucket.
+      # (2) Add "config_path='/path_to_code/beta_config.yaml'" on the website UI interface.
+      # (3) Perform a or b.
+      #       a. Set "enable_modelarts=True" on beta_config.yaml file.
+      #          Set "is_distributed=1" on beta_config.yaml file.
+      #          Set "pretrained='/cache/checkpoint_path/model.ckpt'" on beta_config.yaml file.
+      #          Set "checkpoint_url=/The path of checkpoint in S3/" on beta_config.yaml file.
+      #       b. Add "enable_modelarts=True" on the website UI interface.
+      #          Add "is_distributed=1" on the website UI interface.
+      #          Add "pretrained='/cache/checkpoint_path/model.ckpt'" on the website UI interface.
+      #          Add "checkpoint_url=/The path of checkpoint in S3/" on the website UI interface.
+      # (4) Upload a zip dataset to S3 bucket. (You could also upload the original dataset, but it can be very slow.)
+      # (5) Set the code directory to "/path/FaceRecognition" on the website UI interface.
+      # (6) Set the startup file to "train.py" on the website UI interface.
+      # (7) Set the "Dataset path" and "Output file path" and "Job log path" to your path on the website UI interface.
+      # (8) Create your job.
+      ```
+
 You will get the loss value of each epoch as following in "./scripts/data_parallel_log_[DEVICE_ID]/outputs/logs/[TIME].log" or "./scripts/log_parallel_graph/face_recognition_[DEVICE_ID].log":

 ```python
@ -188,6 +237,24 @@ sh run_eval.sh [USE_DEVICE_ID]
 You will get the result as following in "./scripts/log_inference/outputs/models/logs/[TIME].log":
 [test_dataset]: zj2jk=0.9495, jk2zj=0.9480, avg=0.9487

+If you want to run in modelarts, please check the official documentation of [modelarts](https://support.huaweicloud.com/modelarts/), and you can start evaluation as follows:
+
+```python
+# run evaluation on modelarts example
+# (1) Copy or upload your trained model to S3 bucket.
+# (2) Add "config_path='/path_to_code/inference_config.yaml'" on the website UI interface.
+# (3) Perform a or b.
+#       a. Set "weight='/cache/checkpoint_path/model.ckpt'" on inference_config.yaml file.
+#          Set "checkpoint_url=/The path of checkpoint in S3/" on inference_config.yaml file.
+#       b. Add "weight='/cache/checkpoint_path/model.ckpt'" on the website UI interface.
+#          Add "checkpoint_url=/The path of checkpoint in S3/" on the website UI interface.
+# (4) Upload a zip dataset to S3 bucket. (You could also upload the original dataset, but it can be very slow.)
+# (5) Set the code directory to "/path/FaceRecognition" on the website UI interface.
+# (6) Set the startup file to "eval.py" on the website UI interface.
+# (7) Set the "Dataset path" and "Output file path" and "Job log path" to your path on the website UI interface.
+# (8) Create your job.
+```
+
 ### Convert model

 If you want to infer the network on Ascend 310, you should convert the model to AIR:
--- a/model_zoo/research/cv/FaceRecognition/base_config.yaml
+++ b/model_zoo/research/cv/FaceRecognition/base_config.yaml
@ -0,0 +1,76 @@
+# Builtin Configurations(DO NOT CHANGE THESE CONFIGURATIONS unless you know exactly what you are doing)
+enable_modelarts: False
+# Url for modelarts
+data_url: ""
+train_url: ""
+checkpoint_url: ""
+# Path for local
+data_path: "/cache/data"
+output_path: "/cache/train"
+load_path: "/cache/checkpoint_path"
+device_target: "Ascend"
+enable_profiling: False
+
+# ==============================================================================
+# Training options
+train_stage: "base"
+is_distributed: 1
+
+# dataset related
+data_dir: "/cache/data/face_recognition_dataset/train_dataset/"
+num_classes: 1
+per_batch_size: 192
+need_modelarts_dataset_unzip: True
+
+# network structure related
+backbone: "r100"
+use_se: 1
+emb_size: 512
+act_type: "relu"
+fp16: 1
+pre_bn: 1
+inference: 0
+use_drop: 1
+nc_16: 1
+
+# loss related
+margin_a: 1.0
+margin_b: 0.2
+margin_m: 0.3
+margin_s: 64
+
+# optimizer related
+lr: 0.4
+lr_scale: 1
+lr_epochs: "8,14,18"
+weight_decay: 0.0002
+momentum: 0.9
+max_epoch: 20
+pretrained: ""
+warmup_epochs: 2
+
+# distributed parameter
+local_rank: 0
+world_size: 1
+model_parallel: 0
+
+# logging related
+log_interval: 100
+ckpt_path: "outputs"
+max_ckpts: -1
+dynamic_init_loss_scale: 65536
+ckpt_steps: 1000
+
+---
+
+# Help description for each configuration
+enable_modelarts: "Whether training on modelarts, default: False"
+data_url: "Url for modelarts"
+train_url: "Url for modelarts"
+data_path: "The location of the input data."
+output_path: "The location of the output file."
+device_target: 'Target device type'
+enable_profiling: 'Whether enable profiling while training, default: False'
+
+train_stage: "Train stage, base or beta"
+is_distributed: "If multi device"
--- a/model_zoo/research/cv/FaceRecognition/beta_config.yaml
+++ b/model_zoo/research/cv/FaceRecognition/beta_config.yaml
@ -0,0 +1,76 @@
+# Builtin Configurations(DO NOT CHANGE THESE CONFIGURATIONS unless you know exactly what you are doing)
+enable_modelarts: False
+# Url for modelarts
+data_url: ""
+train_url: ""
+checkpoint_url: ""
+# Path for local
+data_path: "/cache/data"
+output_path: "/cache/train"
+load_path: "/cache/checkpoint_path"
+device_target: "Ascend"
+enable_profiling: False
+
+# ==============================================================================
+# Training options
+train_stage: "beta"
+is_distributed: 1
+
+# dataset related
+data_dir: "/cache/data/face_recognition_dataset/train_dataset/"
+num_classes: 1
+per_batch_size: 192
+need_modelarts_dataset_unzip: True
+
+# network structure related
+backbone: "r100"
+use_se: 0
+emb_size: 256
+act_type: "relu"
+fp16: 1
+pre_bn: 0
+inference: 0
+use_drop: 1
+nc_16: 1
+
+# loss related
+margin_a: 1.0
+margin_b: 0.2
+margin_m: 0.3
+margin_s: 64
+
+# optimizer related
+lr: 0.04
+lr_scale: 1
+lr_epochs: "8,14,18"
+weight_decay: 0.0002
+momentum: 0.9
+max_epoch: 20
+pretrained: "your_pretrained_model"
+warmup_epochs: 2
+
+# distributed parameter
+local_rank: 0
+world_size: 1
+model_parallel: 0
+
+# logging related
+log_interval: 100
+ckpt_path: "outputs"
+max_ckpts: -1
+dynamic_init_loss_scale: 65536
+ckpt_steps: 1000
+
+---
+
+# Help description for each configuration
+enable_modelarts: "Whether training on modelarts, default: False"
+data_url: "Url for modelarts"
+train_url: "Url for modelarts"
+data_path: "The location of the input data."
+output_path: "The location of the output file."
+device_target: 'Target device type'
+enable_profiling: 'Whether enable profiling while training, default: False'
+
+train_stage: "Train stage, base or beta"
+is_distributed: "If multi device"
--- a/model_zoo/research/cv/FaceRecognition/eval.py
+++ b/model_zoo/research/cv/FaceRecognition/eval.py
@ -26,12 +26,14 @@ import mindspore.dataset as de
 from mindspore import Tensor, context
 from mindspore.train.serialization import load_checkpoint, load_param_into_net

-from src.config import config_inference
 from src.backbone.resnet import get_backbone
 from src.my_logging import get_logger

-devid = int(os.getenv('DEVICE_ID'))
-context.set_context(mode=context.GRAPH_MODE, device_target="Ascend", device_id=devid)
+from utils.config import config
+from utils.moxing_adapter import moxing_wrapper
+from utils.device_adapter import get_device_id, get_device_num, get_rank_id
+
+context.set_context(mode=context.GRAPH_MODE, device_target="Ascend", device_id=get_device_id())


 class TxtDataset():
@ -198,7 +200,61 @@ def l2normalize(features):
    l2norm[np.logical_and(l2norm >= 0, l2norm < epsilon)] = epsilon
    return features/l2norm

-def main(args):
+def modelarts_pre_process():
+    '''modelarts pre process function.'''
+    def unzip(zip_file, save_dir):
+        import zipfile
+        s_time = time.time()
+        if not os.path.exists(os.path.join(save_dir, "face_recognition_dataset")):
+            zip_isexist = zipfile.is_zipfile(zip_file)
+            if zip_isexist:
+                fz = zipfile.ZipFile(zip_file, 'r')
+                data_num = len(fz.namelist())
+                print("Extract Start...")
+                print("unzip file num: {}".format(data_num))
+                i = 0
+                for file in fz.namelist():
+                    if i % int(data_num / 100) == 0:
+                        print("unzip percent: {}%".format(i / int(data_num / 100)), flush=True)
+                    i += 1
+                    fz.extract(file, save_dir)
+                print("cost time: {}min:{}s.".format(int((time.time() - s_time) / 60),
+                                                     int(int(time.time() - s_time) % 60)))
+                print("Extract Done.")
+            else:
+                print("This is not zip.")
+        else:
+            print("Zip has been extracted.")
+
+    if config.need_modelarts_dataset_unzip:
+        zip_file_1 = os.path.join(config.data_path, "face_recognition_dataset.zip")
+        save_dir_1 = os.path.join(config.data_path)
+
+        sync_lock = "/tmp/unzip_sync.lock"
+
+        # Each server contains 8 devices at most.
+        if get_device_id() % min(get_device_num(), 8) == 0 and not os.path.exists(sync_lock):
+            print("Zip file path: ", zip_file_1)
+            print("Unzip file save dir: ", save_dir_1)
+            unzip(zip_file_1, save_dir_1)
+            print("===Finish extract data synchronization===")
+            try:
+                os.mknod(sync_lock)
+            except IOError:
+                pass
+
+        while True:
+            if os.path.exists(sync_lock):
+                break
+            time.sleep(1)
+
+        print("Device: {}, Finish sync unzip data from {} to {}.".format(get_device_id(), zip_file_1, save_dir_1))
+
+    config.ckpt_path = os.path.join(config.output_path, str(get_rank_id()), config.ckpt_path)
+
+@moxing_wrapper(pre_process=modelarts_pre_process)
+def run_eval(args):
+    '''run eval function.'''
    if not os.path.exists(args.test_dir):
        args.logger.info('ERROR, test_dir is not exists, please set test_dir in config.py.')
        return 0
@ -317,17 +373,17 @@ def main(args):
    return 0

 if __name__ == '__main__':
-    arg = config_inference
-    arg.test_img_predix = [arg.test_dir, arg.test_dir]
+    config.test_img_predix = [os.path.join(config.test_dir, 'test_dataset/'),
+                              os.path.join(config.test_dir, 'test_dataset/')]

-    arg.test_img_list = [os.path.join(arg.test_dir, 'lists/jk_list.txt'),
-                         os.path.join(arg.test_dir, 'lists/zj_list.txt')]
-    arg.dis_img_predix = [arg.test_dir,]
-    arg.dis_img_list = [os.path.join(arg.test_dir, 'lists/dis_list.txt'),]
+    config.test_img_list = [os.path.join(config.test_dir, 'lists/jk_list.txt'),
+                            os.path.join(config.test_dir, 'lists/zj_list.txt')]
+    config.dis_img_predix = [os.path.join(config.test_dir, 'dis_dataset/'),]
+    config.dis_img_list = [os.path.join(config.test_dir, 'lists/dis_list.txt'),]

-    log_path = os.path.join(arg.ckpt_path, 'logs')
-    arg.logger = get_logger(log_path, arg.local_rank)
+    log_path = os.path.join(config.ckpt_path, 'logs')
+    config.logger = get_logger(log_path, config.local_rank)

-    arg.logger.info('Config: %s', pformat(arg))
+    config.logger.info('Config %s', pformat(config))

-    main(arg)
+    run_eval(config)
--- a/model_zoo/research/cv/FaceRecognition/inference_config.yaml
+++ b/model_zoo/research/cv/FaceRecognition/inference_config.yaml
@ -0,0 +1,60 @@
+# Builtin Configurations(DO NOT CHANGE THESE CONFIGURATIONS unless you know exactly what you are doing)
+enable_modelarts: False
+# Url for modelarts
+data_url: ""
+train_url: ""
+checkpoint_url: ""
+# Path for local
+data_path: "/cache/data"
+output_path: "/cache/train"
+load_path: "/cache/checkpoint_path"
+device_target: "Ascend"
+enable_profiling: False
+
+# ==============================================================================
+# Training options
+
+# distributed parameter
+is_distributed: 0
+local_rank: 0
+world_size: 1
+
+# test weight
+weight: 'your_test_model'
+test_dir: '/cache/data/face_recognition_dataset/'
+need_modelarts_dataset_unzip: True
+
+# model define
+backbone: "r100"
+use_se: 0
+emb_size: 256
+act_type: "relu"
+fp16: 1
+pre_bn: 0
+inference: 1
+use_drop: 0
+
+# test and dis batch size
+test_batch_size: 128
+dis_batch_size: 512
+
+# log
+log_interval: 100
+ckpt_path: "outputs/models"
+
+# test and dis image list
+test_img_predix: ""
+test_img_list: ""
+dis_img_predix: ""
+dis_img_list: ""
+
+---
+
+# Help description for each configuration
+enable_modelarts: "Whether training on modelarts, default: False"
+data_url: "Url for modelarts"
+train_url: "Url for modelarts"
+data_path: "The location of the input data."
+output_path: "The location of the output file."
+device_target: 'Target device type'
+enable_profiling: 'Whether enable profiling while training, default: False'
--- a/model_zoo/research/cv/FaceRecognition/scripts/run_distribute_train_base.sh
+++ b/model_zoo/research/cv/FaceRecognition/scripts/run_distribute_train_base.sh
@ -59,6 +59,7 @@ do
    echo "start training for rank $RANK_ID, device $DEVICE_ID"
    env > ${EXECUTE_PATH}/log_parallel_graph/face_recognition_$i.log
    python ${EXECUTE_PATH}/../train.py \
+    --config_path=${EXECUTE_PATH}/../base_config.yaml \
    --train_stage=base \
    --is_distributed=1 &> ${EXECUTE_PATH}/log_parallel_graph/face_recognition_$i.log &
 done
--- a/model_zoo/research/cv/FaceRecognition/scripts/run_distribute_train_beta.sh
+++ b/model_zoo/research/cv/FaceRecognition/scripts/run_distribute_train_beta.sh
@ -59,6 +59,7 @@ do
    echo "start training for rank $RANK_ID, device $DEVICE_ID"
    env > ${EXECUTE_PATH}/log_parallel_graph/face_recognition_$i.log
    python ${EXECUTE_PATH}/../train.py \
+    --config_path=${EXECUTE_PATH}/../beta_config.yaml \
    --train_stage=beta \
    --is_distributed=1 &> ${EXECUTE_PATH}/log_parallel_graph/face_recognition_$i.log &
 done
--- a/model_zoo/research/cv/FaceRecognition/scripts/run_eval.sh
+++ b/model_zoo/research/cv/FaceRecognition/scripts/run_eval.sh
@ -41,6 +41,6 @@ mkdir ${EXECUTE_PATH}/log_inference

 cd ${EXECUTE_PATH}/log_inference || exit
 env > ${EXECUTE_PATH}/log_inference/face_recognition.log
-python ${EXECUTE_PATH}/../eval.py &> ${EXECUTE_PATH}/log_inference/face_recognition.log &
+python ${EXECUTE_PATH}/../eval.py --config_path=${EXECUTE_PATH}/../inference_config.yaml &> ${EXECUTE_PATH}/log_inference/face_recognition.log &

 echo "[INFO] Start inference..."
--- a/model_zoo/research/cv/FaceRecognition/scripts/run_standalone_train_base.sh
+++ b/model_zoo/research/cv/FaceRecognition/scripts/run_standalone_train_base.sh
@ -46,6 +46,7 @@ cd ${EXECUTE_PATH}/data_standalone_log_$USE_DEVICE_ID || exit
 echo "start training for rank $RANK_ID, device $USE_DEVICE_ID"
 env > ${EXECUTE_PATH}/log_standalone_graph/face_recognition_$USE_DEVICE_ID.log
 python ${EXECUTE_PATH}/../train.py \
+    --config_path=${EXECUTE_PATH}/../base_config.yaml \
    --train_stage=base \
    --is_distributed=0 &> ${EXECUTE_PATH}/log_standalone_graph/face_recognition_$USE_DEVICE_ID.log &

--- a/model_zoo/research/cv/FaceRecognition/scripts/run_standalone_train_beta.sh
+++ b/model_zoo/research/cv/FaceRecognition/scripts/run_standalone_train_beta.sh
@ -46,6 +46,7 @@ cd ${EXECUTE_PATH}/data_standalone_log_$USE_DEVICE_ID || exit
 echo "start training for rank $RANK_ID, device $USE_DEVICE_ID"
 env > ${EXECUTE_PATH}/log_standalone_graph/face_recognition_$USE_DEVICE_ID.log
 python ${EXECUTE_PATH}/../train.py \
+    --config_path=${EXECUTE_PATH}/../base_config.yaml \
    --train_stage=beta \
    --is_distributed=0 &> ${EXECUTE_PATH}/log_standalone_graph/face_recognition_$USE_DEVICE_ID.log &

--- a/model_zoo/research/cv/FaceRecognition/src/config.py
+++ b/model_zoo/research/cv/FaceRecognition/src/config.py
@ -1,148 +0,0 @@
-# Copyright 2020 Huawei Technologies Co., Ltd
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#" :===========================================================================
-
-"""network config setting, will be used in train.py and eval.py."""
-
-from easydict import EasyDict as edict
-
-config_base = edict({
-    # dataset related
-    'data_dir': "your_dataset_path",
-    'num_classes': 1,
-    'per_batch_size': 192,
-
-    # network structure related
-    'backbone': 'r100',
-    'use_se': 1,
-    'emb_size': 512,
-    'act_type': 'relu',
-    'fp16': 1,
-    'pre_bn': 1,
-    'inference': 0,
-    'use_drop': 1,
-    'nc_16': 1,
-
-    # loss related
-    'margin_a': 1.0,
-    'margin_b': 0.2,
-    'margin_m': 0.3,
-    'margin_s': 64,
-
-    # optimizer related
-    'lr': 0.4,
-    'lr_scale': 1,
-    'lr_epochs': '8,14,18',
-    'weight_decay': 0.0002,
-    'momentum': 0.9,
-    'max_epoch': 20,
-    'pretrained': '',
-    'warmup_epochs': 2,
-
-    # distributed parameter
-    'is_distributed': 1,
-    'local_rank': 0,
-    'world_size': 1,
-    'model_parallel': 0,
-
-    # logging related
-    'log_interval': 100,
-    'ckpt_path': 'outputs',
-    'max_ckpts': -1,
-    'dynamic_init_loss_scale': 65536,
-    'ckpt_steps': 1000
-})
-
-config_beta = edict({
-    # dataset related
-    'data_dir': "your_dataset_path",
-    'num_classes': 1,
-    'per_batch_size': 192,
-
-    # network structure related
-    'backbone': 'r100',
-    'use_se': 0,
-    'emb_size': 256,
-    'act_type': 'relu',
-    'fp16': 1,
-    'pre_bn': 0,
-    'inference': 0,
-    'use_drop': 1,
-    'nc_16': 1,
-
-    # loss related
-    'margin_a': 1.0,
-    'margin_b': 0.2,
-    'margin_m': 0.3,
-    'margin_s': 64,
-
-    # optimizer related
-    'lr': 0.04,
-    'lr_scale': 1,
-    'lr_epochs': '8,14,18',
-    'weight_decay': 0.0002,
-    'momentum': 0.9,
-    'max_epoch': 20,
-    'pretrained': 'your_pretrained_model',
-    'warmup_epochs': 2,
-
-    # distributed parameter
-    'is_distributed': 1,
-    'local_rank': 0,
-    'world_size': 1,
-    'model_parallel': 0,
-
-    # logging related
-    'log_interval': 100,
-    'ckpt_path': 'outputs',
-    'max_ckpts': -1,
-    'dynamic_init_loss_scale': 65536,
-    'ckpt_steps': 1000
-})
-
-
-config_inference = edict({
-    # distributed parameter
-    'is_distributed': 0,
-    'local_rank': 0,
-    'world_size': 1,
-
-    # test weight
-    'weight': 'your_test_model',
-    'test_dir': 'your_dataset_path',
-
-    # model define
-    'backbone': 'r100',
-    'use_se': 0,
-    'emb_size': 256,
-    'act_type': 'relu',
-    'fp16': 1,
-    'pre_bn': 0,
-    'inference': 1,
-    'use_drop': 0,
-
-    # test and dis batch size
-    'test_batch_size': 128,
-    'dis_batch_size': 512,
-
-    # log
-    'log_interval': 100,
-    'ckpt_path': 'outputs/models',
-
-    # test and dis image list
-    'test_img_predix': '',
-    'test_img_list': '',
-    'dis_img_predix': '',
-    'dis_img_list': ''
-})
--- a/model_zoo/research/cv/FaceRecognition/train.py
+++ b/model_zoo/research/cv/FaceRecognition/train.py
@ -14,20 +14,19 @@
 # ============================================================================
 """Face Recognition train."""
 import os
-import argparse
+import time

 import mindspore
 from mindspore.nn import Cell
 from mindspore import context
 from mindspore.context import ParallelMode
-from mindspore.communication.management import get_group_size, init, get_rank
+from mindspore.communication.management import init
 from mindspore.nn.optim import Momentum
 from mindspore.train.model import Model
 from mindspore.train.callback import ModelCheckpoint, CheckpointConfig
 from mindspore.train.loss_scale_manager import DynamicLossScaleManager
 from mindspore.train.serialization import load_checkpoint, load_param_into_net

-from src.config import config_base, config_beta
 from src.my_logging import get_logger
 from src.init_network import init_net
 from src.dataset_factory import get_de_dataset
@ -37,10 +36,13 @@ from src.loss_factory import get_loss
 from src.lrsche_factory import warmup_step_list, list_to_gen
 from src.callback_factory import ProgressMonitor

+from utils.moxing_adapter import moxing_wrapper
+from utils.config import config
+from utils.device_adapter import get_device_id, get_device_num, get_rank_id
+
 mindspore.common.seed.set_seed(1)
-devid = int(os.getenv('DEVICE_ID'))
 context.set_context(mode=context.GRAPH_MODE, device_target="Ascend", save_graphs=False,
-                    device_id=devid, reserve_class_name_in_scope=False, enable_auto_mixed_precision=False)
+                    device_id=get_device_id(), reserve_class_name_in_scope=False, enable_auto_mixed_precision=False)

 class DistributedHelper(Cell):
    '''DistributedHelper'''
@ -84,103 +86,13 @@ class BuildTrainNetwork(Cell):

        return loss

-def parse_args():
-    parser = argparse.ArgumentParser('MindSpore Face Recognition')
-    parser.add_argument('--train_stage', type=str, default='base', help='train stage, base or beta')
-    parser.add_argument('--is_distributed', type=int, default=1, help='if multi device')

-    args_opt_1, _ = parser.parse_known_args()
-    return args_opt_1
-
-if __name__ == "__main__":
-    args_opt = parse_args()
-
-    support_train_stage = ['base', 'beta']
-    if args_opt.train_stage.lower() not in support_train_stage:
-        args.logger.info('support train stage is:{}, while yours is:{}'.
-                         format(support_train_stage, args_opt.train_stage))
-        raise ValueError('train stage not support.')
-    args = config_base if args_opt.train_stage.lower() == 'base' else config_beta
-    args.is_distributed = args_opt.is_distributed
-    if args_opt.is_distributed:
-        init()
-        args.local_rank = get_rank()
-        args.world_size = get_group_size()
-        parallel_mode = ParallelMode.HYBRID_PARALLEL
-    else:
-        parallel_mode = ParallelMode.STAND_ALONE
-
-    context.set_auto_parallel_context(parallel_mode=parallel_mode,
-                                      device_num=args.world_size, gradients_mean=True)
-
-    if not os.path.exists(args.data_dir):
-        args.logger.info('ERROR, data_dir is not exists, please set data_dir in config.py')
-        raise ValueError('ERROR, data_dir is not exists, please set data_dir in config.py')
-
-    args.lr_epochs = list(map(int, args.lr_epochs.split(',')))
-
-
-    log_path = os.path.join(args.ckpt_path, 'logs')
-    args.logger = get_logger(log_path, args.local_rank)
-
-    if args.local_rank % 8 == 0:
-        if not os.path.exists(args.ckpt_path):
-            os.makedirs(args.ckpt_path)
-
-    args.logger.info('args.world_size:{}'.format(args.world_size))
-    args.logger.info('args.local_rank:{}'.format(args.local_rank))
-    args.logger.info('args.lr:{}'.format(args.lr))
-
-    momentum = args.momentum
-    weight_decay = args.weight_decay
-
-    de_dataset, steps_per_epoch, num_classes = get_de_dataset(args)
-    args.logger.info('de_dataset:{}'.format(de_dataset.get_dataset_size()))
-    args.steps_per_epoch = steps_per_epoch
-    args.num_classes = num_classes
-
-    args.logger.info('loaded, nums: {}'.format(args.num_classes))
-    if args.nc_16 == 1:
-        if args.model_parallel == 0:
-            if args.num_classes % 16 == 0:
-                args.logger.info('data parallel aleardy 16, nums: {}'.format(args.num_classes))
-            else:
-                args.num_classes = (args.num_classes // 16 + 1) * 16
-        else:
-            if args.num_classes % (args.world_size * 16) == 0:
-                args.logger.info('model parallel aleardy 16, nums: {}'.format(args.num_classes))
-            else:
-                args.num_classes = (args.num_classes // (args.world_size * 16) + 1) * args.world_size * 16
-
-    args.logger.info('for D, loaded, class nums: {}'.format(args.num_classes))
-    args.logger.info('steps_per_epoch:{}'.format(args.steps_per_epoch))
-    args.logger.info('img_total_num:{}'.format(args.steps_per_epoch * args.per_batch_size))
-
-    args.logger.info('get_backbone----in----')
-    _backbone = get_backbone(args)
-    args.logger.info('get_backbone----out----')
-
-    args.logger.info('get_metric_fc----in----')
-    margin_fc_1 = get_metric_fc(args)
-    args.logger.info('get_metric_fc----out----')
-
-    args.logger.info('DistributedHelper----in----')
-    network_1 = DistributedHelper(_backbone, margin_fc_1)
-    args.logger.info('DistributedHelper----out----')
-
-    args.logger.info('network fp16----in----')
-    if args.fp16 == 1:
-        network_1.add_flags_recursive(fp16=True)
-    args.logger.info('network fp16----out----')
-
-    criterion_1 = get_loss(args)
-    if args.fp16 == 1 and args.model_parallel == 0:
-        criterion_1.add_flags_recursive(fp32=True)
-
-    if os.path.isfile(args.pretrained):
-        param_dict = load_checkpoint(args.pretrained)
+def load_pretrain(cfg, net):
+    '''Load pretrained weights into ``net``, or initialize it when none are available.
+
+    If ``cfg.pretrained`` is an existing file, the checkpoint is loaded, its
+    entries are filtered/renamed depending on ``cfg.train_stage`` (part of
+    that logic lies outside this hunk) and the result is loaded into ``net``.
+    Otherwise ``init_net`` is applied, except in the 'beta' stage, which
+    requires a pretrained checkpoint.
+
+    Returns:
+        The ``net`` object that was passed in.
+
+    Raises:
+        ValueError: In the 'beta' stage when ``cfg.pretrained`` is not a file.
+    '''
+    if os.path.isfile(cfg.pretrained):
+        param_dict = load_checkpoint(cfg.pretrained)
        param_dict_new = {}
-        if args_opt.train_stage.lower() == 'base':
+        if cfg.train_stage.lower() == 'base':
            for key, value in param_dict.items():
                if key.startswith('moments.'):
                    continue
@ -201,35 +113,169 @@ if __name__ == "__main__":
                        continue
                    else:
                        param_dict_new[key[8:]] = value
-        load_param_into_net(network_1, param_dict_new)
-        args.logger.info('load model {} success'.format(args.pretrained))
+        load_param_into_net(net, param_dict_new)
+        cfg.logger.info('load model {} success'.format(cfg.pretrained))
    else:
-        init_net(args, network_1)
+        if cfg.train_stage.lower() == 'beta':
+            raise ValueError("Train beta mode load pretrain model fail from: {}".format(cfg.pretrained))
+        init_net(cfg, net)
+        cfg.logger.info('init model success')
+    return net

-    train_net = BuildTrainNetwork(network_1, criterion_1, args)

-    args.logger.info('args:{}'.format(args))
-    # call warmup_step should behind the args steps_per_epoch
-    args.lrs = warmup_step_list(args, gamma=0.1)
-    lrs_gen = list_to_gen(args.lrs)
-    opt = Momentum(params=train_net.trainable_params(), learning_rate=lrs_gen, momentum=momentum,
-                   weight_decay=weight_decay)
-    scale_manager = DynamicLossScaleManager(init_loss_scale=args.dynamic_init_loss_scale, scale_factor=2,
+def modelarts_pre_process():
+    '''Prepare the ModelArts working directory before training starts.
+
+    When ``config.need_modelarts_dataset_unzip`` is set, the device for which
+    ``get_device_id() % min(get_device_num(), 8) == 0`` extracts
+    ``face_recognition_dataset.zip`` from ``config.data_path`` and then creates
+    a lock file; every device blocks until that lock file exists. Finally
+    ``config.ckpt_path`` is redirected to a per-rank directory below
+    ``config.output_path``.
+    '''
+    def unzip(zip_file, save_dir):
+        '''Extract ``zip_file`` into ``save_dir`` unless the dataset directory already exists.'''
+        import zipfile
+        s_time = time.time()
+        if os.path.exists(os.path.join(save_dir, "face_recognition_dataset")):
+            print("Zip has been extracted.")
+            return
+        if not zipfile.is_zipfile(zip_file):
+            print("This is not zip.")
+            return
+
+        # The context manager closes the archive even when extraction fails.
+        with zipfile.ZipFile(zip_file, 'r') as fz:
+            names = fz.namelist()
+            data_num = len(names)
+            print("Extract Start...")
+            print("unzip file num: {}".format(data_num))
+            # Report progress about once per percent. The step is clamped to 1
+            # because a step of 0 (fewer than 100 entries) would raise
+            # ZeroDivisionError in the modulo below.
+            step = max(data_num // 100, 1)
+            for i, file in enumerate(names):
+                if i % step == 0:
+                    print("unzip percent: {}%".format(i * 100 // data_num), flush=True)
+                fz.extract(file, save_dir)
+        print("cost time: {}min:{}s.".format(int((time.time() - s_time) / 60),
+                                             int(int(time.time() - s_time) % 60)))
+        print("Extract Done.")
+
+    if config.need_modelarts_dataset_unzip:
+        zip_file_1 = os.path.join(config.data_path, "face_recognition_dataset.zip")
+        save_dir_1 = os.path.join(config.data_path)
+
+        sync_lock = "/tmp/unzip_sync.lock"
+
+        # Each server contains 8 devices as most.
+        if get_device_id() % min(get_device_num(), 8) == 0 and not os.path.exists(sync_lock):
+            print("Zip file path: ", zip_file_1)
+            print("Unzip file save dir: ", save_dir_1)
+            unzip(zip_file_1, save_dir_1)
+            print("===Finish extract data synchronization===")
+            try:
+                os.mknod(sync_lock)
+            except IOError:
+                pass
+
+        # All devices wait here until the extracting device has published the lock file.
+        while True:
+            if os.path.exists(sync_lock):
+                break
+            time.sleep(1)
+
+        print("Device: {}, Finish sync unzip data from {} to {}.".format(get_device_id(), zip_file_1, save_dir_1))
+
+    config.ckpt_path = os.path.join(config.output_path, str(get_rank_id()), config.ckpt_path)
+
+
+@moxing_wrapper(pre_process=modelarts_pre_process)
+def run_train():
+    '''Build the FaceRecognition network from the global ``config`` and train it.
+
+    Steps, in order: set up the logger and the parallel context, create the
+    dataset, pad the class count when ``config.nc_16`` is 1, build backbone,
+    metric head and loss, load or initialize the weights, then call
+    ``Model.train`` with the progress and checkpoint callbacks.
+
+    Raises:
+        ValueError: If ``config.train_stage`` is neither 'base' nor 'beta', or
+            if ``config.data_dir`` does not exist.
+    '''
+    config.local_rank = get_rank_id()
+    config.world_size = get_device_num()
+    log_path = os.path.join(config.ckpt_path, 'logs')
+    config.logger = get_logger(log_path, config.local_rank)
+
+    support_train_stage = ['base', 'beta']
+    if config.train_stage.lower() not in support_train_stage:
+        config.logger.info('your train stage is not support.')
+        raise ValueError('train stage not support.')
+
+    if not os.path.exists(config.data_dir):
+        config.logger.info('ERROR, data_dir is not exists, please set data_dir in config.py')
+        raise ValueError('ERROR, data_dir is not exists, please set data_dir in config.py')
+
+    parallel_mode = ParallelMode.HYBRID_PARALLEL if config.is_distributed else ParallelMode.STAND_ALONE
+    context.set_auto_parallel_context(parallel_mode=parallel_mode,
+                                      device_num=config.world_size, gradients_mean=True)
+    if config.is_distributed:
+        init()
+
+    if config.local_rank % 8 == 0:
+        # exist_ok avoids the check-then-create race when another process
+        # creates the directory between the test and the call.
+        os.makedirs(config.ckpt_path, exist_ok=True)
+
+    de_dataset, steps_per_epoch, num_classes = get_de_dataset(config)
+    config.logger.info('de_dataset: %d', de_dataset.get_dataset_size())
+
+    config.steps_per_epoch = steps_per_epoch
+    config.num_classes = num_classes
+    # lr_epochs arrives as a comma separated string and is converted to a list of ints.
+    config.lr_epochs = list(map(int, config.lr_epochs.split(',')))
+    config.logger.info('config.num_classes: %d', config.num_classes)
+    config.logger.info('config.world_size: %d', config.world_size)
+    config.logger.info('config.local_rank: %d', config.local_rank)
+    config.logger.info('config.lr: %f', config.lr)
+
+    # Round num_classes up to the next multiple of 16 (data parallel) or of
+    # world_size * 16 (model parallel) unless it already is one.
+    if config.nc_16 == 1:
+        if config.model_parallel == 0:
+            if config.num_classes % 16 == 0:
+                config.logger.info('data parallel aleardy 16, nums: %d', config.num_classes)
+            else:
+                config.num_classes = (config.num_classes // 16 + 1) * 16
+        else:
+            if config.num_classes % (config.world_size * 16) == 0:
+                config.logger.info('model parallel aleardy 16, nums: %d', config.num_classes)
+            else:
+                config.num_classes = (config.num_classes // (config.world_size * 16) + 1) * config.world_size * 16
+
+    config.logger.info('for D, loaded, class nums: %d', config.num_classes)
+    config.logger.info('steps_per_epoch: %d', config.steps_per_epoch)
+    config.logger.info('img_total_num: %d', config.steps_per_epoch * config.per_batch_size)
+
+    config.logger.info('get_backbone----in----')
+    _backbone = get_backbone(config)
+    config.logger.info('get_backbone----out----')
+    config.logger.info('get_metric_fc----in----')
+    margin_fc_1 = get_metric_fc(config)
+    config.logger.info('get_metric_fc----out----')
+    config.logger.info('DistributedHelper----in----')
+    network_1 = DistributedHelper(_backbone, margin_fc_1)
+    config.logger.info('DistributedHelper----out----')
+    config.logger.info('network fp16----in----')
+    if config.fp16 == 1:
+        network_1.add_flags_recursive(fp16=True)
+    config.logger.info('network fp16----out----')
+
+    criterion_1 = get_loss(config)
+    if config.fp16 == 1 and config.model_parallel == 0:
+        criterion_1.add_flags_recursive(fp32=True)
+
+    network_1 = load_pretrain(config, network_1)
+    train_net = BuildTrainNetwork(network_1, criterion_1, config)
+
+    # call warmup_step should behind the config steps_per_epoch
+    config.lrs = warmup_step_list(config, gamma=0.1)
+    lrs_gen = list_to_gen(config.lrs)
+    opt = Momentum(params=train_net.trainable_params(), learning_rate=lrs_gen, momentum=config.momentum,
+                   weight_decay=config.weight_decay)
+    scale_manager = DynamicLossScaleManager(init_loss_scale=config.dynamic_init_loss_scale, scale_factor=2,
+                                            scale_window=2000)
+    model = Model(train_net, optimizer=opt, metrics=None, loss_scale_manager=scale_manager)
+
+    save_checkpoint_steps = config.ckpt_steps
+    config.logger.info('save_checkpoint_steps: %d', save_checkpoint_steps)
+    if config.max_ckpts == -1:
+        # -1 means "keep everything": the number of checkpoints the run will produce, plus a margin of 5.
+        keep_checkpoint_max = int(config.steps_per_epoch * config.max_epoch / save_checkpoint_steps) + 5
+    else:
+        keep_checkpoint_max = config.max_ckpts
+    config.logger.info('keep_checkpoint_max: %d', keep_checkpoint_max)
+
+    ckpt_config = CheckpointConfig(save_checkpoint_steps=save_checkpoint_steps, keep_checkpoint_max=keep_checkpoint_max)
+    config.logger.info('max_epoch_train: %d', config.max_epoch)
+    ckpt_cb = ModelCheckpoint(config=ckpt_config, directory=config.ckpt_path, prefix='{}'.format(config.local_rank))
+    config.epoch_cnt = 0
+    progress_cb = ProgressMonitor(config)
+    # train() is called with sink_size=log_interval, so the epoch argument is
+    # the total step count expressed in units of log_interval steps.
+    new_epoch_train = config.max_epoch * steps_per_epoch // config.log_interval
+    model.train(new_epoch_train, de_dataset, callbacks=[progress_cb, ckpt_cb], sink_size=config.log_interval)
+
+
+# Script entry point: training starts only when this file is executed directly.
+if __name__ == "__main__":
+    run_train()
--- a/model_zoo/research/cv/FaceRecognition/utils/__init__.py
+++ b/model_zoo/research/cv/FaceRecognition/utils/__init__.py
--- a/model_zoo/research/cv/FaceRecognition/utils/config.py
+++ b/model_zoo/research/cv/FaceRecognition/utils/config.py
@ -0,0 +1,127 @@
+# Copyright 2021 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================
+
+"""Parse arguments"""
+
+import os
+import ast
+import argparse
+from pprint import pprint, pformat
+import yaml
+
+class Config:
+    """
+    Attribute-style view of a configuration dictionary.
+
+    Each key becomes an attribute. Nested dicts are wrapped in ``Config``
+    recursively; lists and tuples are stored as lists whose dict elements are
+    wrapped as well.
+    """
+    def __init__(self, cfg_dict):
+        def _wrap(value):
+            # Only dicts are converted; every other value is stored unchanged.
+            return Config(value) if isinstance(value, dict) else value
+
+        for key, value in cfg_dict.items():
+            if isinstance(value, (list, tuple)):
+                converted = [_wrap(element) for element in value]
+            else:
+                converted = _wrap(value)
+            setattr(self, key, converted)
+
+    def __str__(self):
+        return pformat(vars(self))
+
+    def __repr__(self):
+        return self.__str__()
+
+
+def parse_cli_to_yaml(parser, cfg, helper=None, choices=None, cfg_path="default_config.yaml"):
+    """
+    Parse command line arguments to the configuration according to the default yaml.
+
+    Every entry of ``cfg`` that is neither a list nor a dict becomes a
+    ``--<name>`` option whose default is the yaml value; list and dict entries
+    are not exposed on the command line.
+
+    Args:
+        parser: Parent parser.
+        cfg: Base configuration.
+        helper: Helper description.
+        choices: Mapping from option name to its allowed values.
+        cfg_path: Path to the default yaml config.
+
+    Returns:
+        The namespace returned by ``parser.parse_args()``.
+    """
+    parser = argparse.ArgumentParser(description="[REPLACE THIS at config.py]",
+                                     parents=[parser])
+    helper = {} if helper is None else helper
+    choices = {} if choices is None else choices
+    for item, default in cfg.items():
+        if isinstance(default, (list, dict)):
+            continue
+        if isinstance(default, bool):
+            # bool("False") is True, so booleans are parsed as Python literals instead.
+            arg_type = ast.literal_eval
+        elif default is None:
+            # NoneType cannot convert a string; accept the raw text so that a
+            # null yaml value can still be overridden from the command line.
+            arg_type = str
+        else:
+            arg_type = type(default)
+        parser.add_argument("--" + item, type=arg_type, default=default, choices=choices.get(item),
+                            help=helper.get(item, "Please reference to {}".format(cfg_path)))
+    args = parser.parse_args()
+    return args
+
+
+def parse_yaml(yaml_path):
+    """
+    Parse the yaml config file.
+
+    The file may hold up to three yaml documents, in order: the configuration,
+    the help descriptions and the allowed choices. Missing trailing documents
+    default to empty dicts.
+
+    Args:
+        yaml_path: Path to the yaml config.
+
+    Returns:
+        Tuple ``(cfg, cfg_helper, cfg_choices)``.
+
+    Raises:
+        ValueError: If the content is not valid yaml, or the file does not
+            hold between one and three documents.
+    """
+    with open(yaml_path, 'r') as fin:
+        try:
+            cfgs = list(yaml.load_all(fin.read(), Loader=yaml.FullLoader))
+        except yaml.YAMLError as err:
+            # Keep ValueError as the public failure type, but chain the cause
+            # so the real parser message is not lost.
+            raise ValueError("Failed to parse yaml") from err
+    if not cfgs:
+        raise ValueError("Failed to parse yaml")
+    if len(cfgs) > 3:
+        raise ValueError("At most 3 docs (config, description for help, choices) are supported in config yaml")
+    # Pad the optional documents (help, choices) with fresh, independent dicts.
+    cfg, cfg_helper, cfg_choices = cfgs + [{} for _ in range(3 - len(cfgs))]
+    print(cfg_helper)
+    return cfg, cfg_helper, cfg_choices
+
+
+def merge(args, cfg):
+    """
+    Overlay the command line arguments on top of the yaml configuration.
+
+    ``cfg`` is updated in place: every attribute of ``args`` overwrites (or
+    adds) the entry of the same name.
+
+    Args:
+        args: Command line arguments.
+        cfg: Base configuration.
+
+    Returns:
+        The same ``cfg`` object, after the update.
+    """
+    for name, value in vars(args).items():
+        cfg[name] = value
+    return cfg
+
+
+def get_config():
+    """
+    Build the final ``Config`` from the yaml file and the command line.
+
+    ``--config_path`` is parsed first to locate the yaml file; the yaml content
+    then defines the remaining command line options, and the parsed command
+    line values are merged over the yaml values.
+    """
+    base_parser = argparse.ArgumentParser(description="default name", add_help=False)
+    module_dir = os.path.dirname(os.path.abspath(__file__))
+    default_yaml = os.path.join(module_dir, "../default_config.yaml")
+    base_parser.add_argument("--config_path", type=str, default=default_yaml,
+                             help="Config file path")
+    # parse_known_args ignores the options that are only declared after the yaml is read.
+    known_args = base_parser.parse_known_args()[0]
+    yaml_cfg, yaml_help, yaml_choices = parse_yaml(known_args.config_path)
+    pprint(yaml_cfg)
+    cli_args = parse_cli_to_yaml(parser=base_parser, cfg=yaml_cfg, helper=yaml_help,
+                                 choices=yaml_choices, cfg_path=known_args.config_path)
+    return Config(merge(cli_args, yaml_cfg))
+
+# Shared configuration object: the yaml file and the command line are parsed
+# once, when this module is first imported.
+config = get_config()
--- a/model_zoo/research/cv/FaceRecognition/utils/device_adapter.py
+++ b/model_zoo/research/cv/FaceRecognition/utils/device_adapter.py
@ -0,0 +1,27 @@
+# Copyright 2021 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================
+
+"""Device adapter for ModelArts"""
+
+from utils.config import config
+
+# Select the adapter implementation at import time, based on the parsed config;
+# both modules expose the same four getter functions.
+if config.enable_modelarts:
+    from utils.moxing_adapter import get_device_id, get_device_num, get_rank_id, get_job_id
+else:
+    from utils.local_adapter import get_device_id, get_device_num, get_rank_id, get_job_id
+
+# Names re-exported by this module.
+__all__ = [
+    "get_device_id", "get_device_num", "get_rank_id", "get_job_id"
+]
--- a/model_zoo/research/cv/FaceRecognition/utils/local_adapter.py
+++ b/model_zoo/research/cv/FaceRecognition/utils/local_adapter.py
@ -0,0 +1,36 @@
+# Copyright 2021 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================
+
+"""Local adapter"""
+
+import os
+
+def get_device_id():
+    """Return the ``DEVICE_ID`` environment variable as an int (0 when unset)."""
+    return int(os.environ.get('DEVICE_ID', '0'))
+
+
+def get_device_num():
+    """Return the ``RANK_SIZE`` environment variable as an int (1 when unset)."""
+    return int(os.environ.get('RANK_SIZE', '1'))
+
+
+def get_rank_id():
+    """Return the ``RANK_ID`` environment variable as an int (0 when unset)."""
+    return int(os.environ.get('RANK_ID', '0'))
+
+
+def get_job_id():
+    """Return the fixed job name used for runs outside ModelArts."""
+    local_job_name = "Local Job"
+    return local_job_name
--- a/model_zoo/research/cv/FaceRecognition/utils/moxing_adapter.py
+++ b/model_zoo/research/cv/FaceRecognition/utils/moxing_adapter.py
@ -0,0 +1,116 @@
+# Copyright 2021 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================
+
+"""Moxing adapter for ModelArts"""
+
+import os
+import functools
+from mindspore import context
+from utils.config import config
+
+# Number of sync_data() calls made so far in this process; it is appended to
+# the lock file name so that each call uses its own lock file.
+_global_sync_count = 0
+
+def get_device_id():
+    """Return the ``DEVICE_ID`` environment variable as an int (0 when unset)."""
+    return int(os.environ.get('DEVICE_ID', '0'))
+
+
+def get_device_num():
+    """Return the ``RANK_SIZE`` environment variable as an int (1 when unset)."""
+    return int(os.environ.get('RANK_SIZE', '1'))
+
+
+def get_rank_id():
+    """Return the ``RANK_ID`` environment variable as an int (0 when unset)."""
+    return int(os.environ.get('RANK_ID', '0'))
+
+
+def get_job_id():
+    """Return the ``JOB_ID`` environment variable, or "default" when it is unset or empty."""
+    # os.getenv returns None for an unset variable; `or` covers both None and "".
+    return os.getenv('JOB_ID') or "default"
+
+def sync_data(from_path, to_path):
+    """
+    Copy ``from_path`` to ``to_path`` with ``mox.file.copy_parallel``.
+
+    Only the device for which ``get_device_id() % min(get_device_num(), 8) == 0``
+    performs the copy; it then creates a lock file, and every caller blocks
+    until that lock file exists. Each call uses a lock file name numbered by
+    the module-level call counter.
+    """
+    import moxing as mox
+    import time
+    global _global_sync_count
+    lock_path = "/tmp/copy_sync.lock{}".format(_global_sync_count)
+    _global_sync_count += 1
+
+    # Each server contains 8 devices as most.
+    is_copier = get_device_id() % min(get_device_num(), 8) == 0
+    if is_copier and not os.path.exists(lock_path):
+        print("from path: ", from_path)
+        print("to path: ", to_path)
+        mox.file.copy_parallel(from_path, to_path)
+        print("===finish data synchronization===")
+        try:
+            os.mknod(lock_path)
+        except IOError:
+            pass
+        print("===save flag===")
+
+    # Poll once per second until the lock file has been published.
+    while not os.path.exists(lock_path):
+        time.sleep(1)
+
+    print("Finish sync data from {} to {}.".format(from_path, to_path))
+
+
+def moxing_wrapper(pre_process=None, post_process=None):
+    """
+    Moxing wrapper to download dataset and upload outputs.
+
+    When ``config.enable_modelarts`` is set, the decorated function is preceded
+    by the download of ``data_url``, ``checkpoint_url`` and ``train_url`` (each
+    only if set) and by ``pre_process``; it is followed by ``post_process`` and
+    by the upload of ``config.output_path`` to ``train_url``. Otherwise the
+    decorated function is called directly.
+
+    Args:
+        pre_process: Optional callable run after the downloads, before the function.
+        post_process: Optional callable run after the function, before the upload.
+
+    Returns:
+        A decorator; the decorated function returns whatever the original returns.
+    """
+    def wrapper(run_func):
+        @functools.wraps(run_func)
+        def wrapped_func(*args, **kwargs):
+            # Download data from data_url
+            if config.enable_modelarts:
+                if config.data_url:
+                    sync_data(config.data_url, config.data_path)
+                    print("Dataset downloaded: ", os.listdir(config.data_path))
+                if config.checkpoint_url:
+                    sync_data(config.checkpoint_url, config.load_path)
+                    print("Preload downloaded: ", os.listdir(config.load_path))
+                if config.train_url:
+                    sync_data(config.train_url, config.output_path)
+                    print("Workspace downloaded: ", os.listdir(config.output_path))
+
+                context.set_context(save_graphs_path=os.path.join(config.output_path, str(get_rank_id())))
+                config.device_num = get_device_num()
+                config.device_id = get_device_id()
+                if not os.path.exists(config.output_path):
+                    os.makedirs(config.output_path)
+
+                if pre_process:
+                    pre_process()
+
+            # Run the main function and keep its result for the caller.
+            result = run_func(*args, **kwargs)
+
+            # Upload data to train_url
+            if config.enable_modelarts:
+                if post_process:
+                    post_process()
+
+                if config.train_url:
+                    print("Start to copy output directory")
+                    sync_data(config.output_path, config.train_url)
+            return result
+        return wrapped_func
+    return wrapper