!20630 Update model scaffolding, add envs for gpu_mpi

Merge pull request !20630 from chenhaozhe/update-model-scaffolding
This commit is contained in:
i-robot 2021-08-30 06:10:50 +00:00 committed by Gitee
commit ed2814c3a6
11 changed files with 415 additions and 9 deletions

View File

@@ -0,0 +1,127 @@
# Copyright 2021 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
"""Parse arguments"""
import os
import ast
import argparse
from pprint import pprint, pformat
import yaml
class Config:
    """
    Attribute-style view over a configuration dictionary.

    Nested dicts become nested Config instances; lists and tuples are
    converted to lists, wrapping any dict elements as Config.
    """

    def __init__(self, cfg_dict):
        for key, value in cfg_dict.items():
            if isinstance(value, (list, tuple)):
                converted = [Config(item) if isinstance(item, dict) else item
                             for item in value]
            elif isinstance(value, dict):
                converted = Config(value)
            else:
                converted = value
            setattr(self, key, converted)

    def __str__(self):
        # Pretty-print the underlying attribute dict for debugging.
        return pformat(self.__dict__)

    # repr delegates to the same pretty-printed form.
    __repr__ = __str__
def parse_cli_to_yaml(parser, cfg, helper=None, choices=None, cfg_path="default_config.yaml"):
    """
    Build CLI options from the default yaml config and parse the command line.

    Every scalar entry of *cfg* becomes a ``--<key>`` option defaulting to the
    yaml value; bool entries are parsed with ast.literal_eval so that the
    strings "True"/"False" work on the command line.

    Args:
        parser: Parent parser whose options are inherited.
        cfg: Base configuration dict loaded from yaml.
        helper: Optional per-key help strings.
        choices: Optional per-key allowed values.
        cfg_path: Path to the default yaml config (used in fallback help text).

    Returns:
        argparse.Namespace with the parsed arguments.
    """
    full_parser = argparse.ArgumentParser(description="[REPLACE THIS at config.py]",
                                          parents=[parser])
    if helper is None:
        helper = {}
    if choices is None:
        choices = {}
    for key in cfg:
        default = cfg[key]
        # Only scalars become CLI flags; lists/dicts stay yaml-only.
        if isinstance(default, (list, dict)):
            continue
        msg = helper.get(key, "Please reference to {}".format(cfg_path))
        arg_type = ast.literal_eval if isinstance(default, bool) else type(default)
        full_parser.add_argument("--" + key, type=arg_type, default=default,
                                 choices=choices.get(key), help=msg)
    return full_parser.parse_args()
def parse_yaml(yaml_path):
    """
    Parse the yaml config file into (config, helper, choices).

    The file may contain 1-3 yaml documents: the base config, optional
    per-key help strings, and optional per-key choices; missing documents
    default to empty dicts.

    Args:
        yaml_path: Path to the yaml config.

    Returns:
        Tuple (cfg, cfg_helper, cfg_choices).

    Raises:
        ValueError: If the yaml is malformed or contains more than 3 docs.
    """
    with open(yaml_path, 'r') as fin:
        try:
            # Guard only the yaml parsing itself. The old bare `except:`
            # also caught the "At most 3 docs" ValueError raised below
            # (masking its message) as well as KeyboardInterrupt/SystemExit.
            cfgs = list(yaml.load_all(fin.read(), Loader=yaml.FullLoader))
        except yaml.YAMLError as err:
            raise ValueError("Failed to parse yaml") from err
    cfg_helper, cfg_choices = {}, {}
    if len(cfgs) == 1:
        cfg = cfgs[0]
    elif len(cfgs) == 2:
        cfg, cfg_helper = cfgs
    elif len(cfgs) == 3:
        cfg, cfg_helper, cfg_choices = cfgs
    else:
        raise ValueError("At most 3 docs (config, description for help, choices) are supported in config yaml")
    print(cfg_helper)
    return cfg, cfg_helper, cfg_choices
def merge(args, cfg):
    """
    Overwrite yaml config entries with the parsed command line arguments.

    Args:
        args: argparse.Namespace from the command line.
        cfg: Base configuration dict.

    Returns:
        The same dict, updated in place with every CLI value.
    """
    cfg.update(vars(args))
    return cfg
def get_config():
    """
    Build the final Config: load the default yaml, then let CLI flags override it.

    A minimal pre-parser first extracts --config_path (defaulting to
    ../default_config.yaml next to this file); the yaml at that path supplies
    the defaults, help text and choices for the full argument parser.
    """
    pre_parser = argparse.ArgumentParser(description="default name", add_help=False)
    here = os.path.dirname(os.path.abspath(__file__))
    pre_parser.add_argument("--config_path", type=str,
                            default=os.path.join(here, "../default_config.yaml"),
                            help="Config file path")
    known_args, _ = pre_parser.parse_known_args()
    default, helper, choices = parse_yaml(known_args.config_path)
    pprint(default)
    cli_args = parse_cli_to_yaml(parser=pre_parser, cfg=default, helper=helper,
                                 choices=choices, cfg_path=known_args.config_path)
    return Config(merge(cli_args, default))
# Global configuration object, built once at import time from yaml + CLI args.
config = get_config()

View File

@@ -0,0 +1,27 @@
# Copyright 2021 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
"""Device adapter for ModelArts"""
from .config import config
# Select the adapter implementation at import time: when running on ModelArts
# (cloud) use the moxing-based helpers, otherwise the local-environment ones.
if config.enable_modelarts:
    from .moxing_adapter import get_device_id, get_device_num, get_rank_id, get_job_id
else:
    from .local_adapter import get_device_id, get_device_num, get_rank_id, get_job_id

# Public API re-exported by this module.
__all__ = [
    "get_device_id", "get_device_num", "get_rank_id", "get_job_id"
]

View File

@@ -0,0 +1,61 @@
# Copyright 2021 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
"""Local adapter"""
import os
from .config import config
def get_device_id():
    """Return this process's local device id (0 when no env var applies)."""
    target = config.device_target
    if target == "Ascend":
        return int(os.getenv('DEVICE_ID', '0'))
    if target == "GPU":
        return int(os.getenv('OMPI_COMM_WORLD_LOCAL_RANK', '0'))
    return 0
def get_device_num():
    """Return the total number of devices in the job (1 if unknown)."""
    target = config.device_target
    if target == "Ascend":
        return int(os.getenv('RANK_SIZE', '1'))
    if target == "GPU":
        return int(os.getenv('OMPI_COMM_WORLD_SIZE', '1'))
    return 1
def get_local_device_num():
    """
    Return the number of devices on this server (not the whole job).

    On Ascend the job-wide count is capped at 8, since each server hosts at
    most 8 devices; on GPU the OpenMPI local size env var is used.
    """
    if config.device_target == "Ascend":
        # Bug fix: get_device_num must be *called* — `min(get_device_num, 8)`
        # compared a function object with an int and raised TypeError.
        local_device_num = min(get_device_num(), 8)
    elif config.device_target == "GPU":
        local_device_num = os.getenv('OMPI_COMM_WORLD_LOCAL_SIZE', '1')
    else:
        local_device_num = 1
    return int(local_device_num)
def get_rank_id():
    """Return the global rank of this process within the job (0 if unknown)."""
    target = config.device_target
    if target == "Ascend":
        return int(os.getenv('RANK_ID', '0'))
    if target == "GPU":
        return int(os.getenv('OMPI_COMM_WORLD_RANK', '0'))
    return 0
def get_job_id():
    """Job id placeholder for local (non-cloud) runs."""
    return "Local Job"

View File

@@ -0,0 +1,141 @@
# Copyright 2021 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
"""Moxing adapter for ModelArts"""
import os
import functools
from mindspore import context
from .config import config
# Monotonic counter giving each sync_data() call its own lock-file name.
_global_sync_count = 0
def get_device_id():
    """Return this process's local device id (0 when no env var applies)."""
    env_by_target = {"Ascend": ('DEVICE_ID', '0'),
                     "GPU": ('OMPI_COMM_WORLD_LOCAL_RANK', '0')}
    entry = env_by_target.get(config.device_target)
    if entry is None:
        return 0
    return int(os.getenv(*entry))
def get_device_num():
    """Return the total number of devices in the job (1 if unknown)."""
    env_by_target = {"Ascend": ('RANK_SIZE', '1'),
                     "GPU": ('OMPI_COMM_WORLD_SIZE', '1')}
    entry = env_by_target.get(config.device_target)
    if entry is None:
        return 1
    return int(os.getenv(*entry))
def get_local_device_num():
    """
    Return the number of devices on this server (not the whole job).

    On Ascend the job-wide count is capped at 8, since each server hosts at
    most 8 devices; on GPU the OpenMPI local size env var is used.
    """
    if config.device_target == "Ascend":
        # Bug fix: get_device_num must be *called* — `min(get_device_num, 8)`
        # compared a function object with an int and raised TypeError.
        local_device_num = min(get_device_num(), 8)
    elif config.device_target == "GPU":
        local_device_num = os.getenv('OMPI_COMM_WORLD_LOCAL_SIZE', '1')
    else:
        local_device_num = 1
    return int(local_device_num)
def get_rank_id():
    """Return the global rank of this process within the job (0 if unknown)."""
    env_by_target = {"Ascend": ('RANK_ID', '0'),
                     "GPU": ('OMPI_COMM_WORLD_RANK', '0')}
    entry = env_by_target.get(config.device_target)
    if entry is None:
        return 0
    return int(os.getenv(*entry))
def get_job_id():
    """
    Return the job id from the JOB_ID env var, or "default" when unset/empty.

    Bug fix: os.getenv('JOB_ID') yields None when the variable is absent,
    and `None != ""` is True — the old code therefore returned None instead
    of "default" in that case.
    """
    job_id = os.getenv('JOB_ID')
    return job_id if job_id else "default"
def sync_data(from_path, to_path):
    """
    Download data from remote obs to local directory if the first url is remote url and the second one is local path
    Upload data from local directory to remote obs in contrast.

    Only one process per server performs the copy; the others poll for the
    lock file it creates and return once the copy is flagged as finished.
    """
    import moxing as mox
    import time
    global _global_sync_count
    # Distinct lock-file name per call, so successive syncs don't share a flag.
    sync_lock = "/tmp/copy_sync.lock" + str(_global_sync_count)
    _global_sync_count += 1

    # Each server contains 8 devices as most.
    # Elect one copier per server: device id 0 modulo the per-server device
    # count, and only if no earlier run already created this lock file.
    if get_device_id() % min(get_device_num(), 8) == 0 and not os.path.exists(sync_lock):
        print("from path: ", from_path)
        print("to path: ", to_path)
        mox.file.copy_parallel(from_path, to_path)
        print("===finish data synchronization===")
        try:
            # Flag completion for the waiting processes; ignore failure if the
            # file already exists or cannot be created.
            os.mknod(sync_lock)
        except IOError:
            pass
        print("===save flag===")

    # Non-copier processes (and the copier itself) block here until the lock
    # file exists, i.e. until the data is in place.
    while True:
        if os.path.exists(sync_lock):
            break
        time.sleep(1)

    print("Finish sync data from {} to {}.".format(from_path, to_path))
def moxing_wrapper(pre_process=None, post_process=None):
    """
    Moxing wrapper to download dataset and upload outputs.

    Decorator factory: when config.enable_modelarts is set, the wrapped
    function gets its OBS inputs (data_url / checkpoint_url / train_url)
    synced to local paths before running, and the output directory uploaded
    back to train_url afterwards. Without ModelArts it just calls through.

    Args:
        pre_process: Optional callable run after downloads, before run_func.
        post_process: Optional callable run after run_func, before upload.
    """
    def wrapper(run_func):
        @functools.wraps(run_func)
        def wrapped_func(*args, **kwargs):
            # Download data from data_url
            if config.enable_modelarts:
                if config.data_url:
                    sync_data(config.data_url, config.data_path)
                    print("Dataset downloaded: ", os.listdir(config.data_path))
                if config.checkpoint_url:
                    sync_data(config.checkpoint_url, config.load_path)
                    print("Preload downloaded: ", os.listdir(config.load_path))
                if config.train_url:
                    sync_data(config.train_url, config.output_path)
                    print("Workspace downloaded: ", os.listdir(config.output_path))

                # Save graphs under a per-rank directory and record the actual
                # device topology in the shared config object.
                context.set_context(save_graphs_path=os.path.join(config.output_path, str(get_rank_id())))
                config.device_num = get_device_num()
                config.device_id = get_device_id()
                if not os.path.exists(config.output_path):
                    os.makedirs(config.output_path)

                if pre_process:
                    pre_process()

            # Run the main function
            run_func(*args, **kwargs)

            # Upload data to train_url
            if config.enable_modelarts:
                if post_process:
                    post_process()

                if config.train_url:
                    print("Start to copy output directory")
                    sync_data(config.output_path, config.train_url)
        return wrapped_func
    return wrapper

View File

@@ -16,21 +16,46 @@
"""Local adapter"""
import os
from .config import config
def get_device_id():
device_id = os.getenv('DEVICE_ID', '0')
if config.device_target == "Ascend":
device_id = os.getenv('DEVICE_ID', '0')
elif config.device_target == "GPU":
device_id = os.getenv('OMPI_COMM_WORLD_LOCAL_RANK', '0')
else:
device_id = 0
return int(device_id)
def get_device_num():
device_num = os.getenv('RANK_SIZE', '1')
return int(device_num)
if config.device_target == "Ascend":
local_device_num = os.getenv('RANK_SIZE', '1')
elif config.device_target == "GPU":
local_device_num = os.getenv('OMPI_COMM_WORLD_SIZE', '1')
else:
local_device_num = 1
return int(local_device_num)
def get_local_device_num():
if config.device_target == "Ascend":
local_device_num = min(get_device_num, 8)
elif config.device_target == "GPU":
local_device_num = os.getenv('OMPI_COMM_WORLD_LOCAL_SIZE', '1')
else:
local_device_num = 1
return int(local_device_num)
def get_rank_id():
global_rank_id = os.getenv('RANK_ID', '0')
if config.device_target == "Ascend":
global_rank_id = os.getenv('RANK_ID', '0')
elif config.device_target == "GPU":
global_rank_id = os.getenv('OMPI_COMM_WORLD_RANK', '0')
else:
global_rank_id = 0
return int(global_rank_id)
def get_job_id():
return "Local Job"

View File

@@ -23,17 +23,42 @@ from .config import config
_global_sync_count = 0
def get_device_id():
device_id = os.getenv('DEVICE_ID', '0')
if config.device_target == "Ascend":
device_id = os.getenv('DEVICE_ID', '0')
elif config.device_target == "GPU":
device_id = os.getenv('OMPI_COMM_WORLD_LOCAL_RANK', '0')
else:
device_id = 0
return int(device_id)
def get_device_num():
device_num = os.getenv('RANK_SIZE', '1')
return int(device_num)
if config.device_target == "Ascend":
local_device_num = os.getenv('RANK_SIZE', '1')
elif config.device_target == "GPU":
local_device_num = os.getenv('OMPI_COMM_WORLD_SIZE', '1')
else:
local_device_num = 1
return int(local_device_num)
def get_local_device_num():
if config.device_target == "Ascend":
local_device_num = min(get_device_num, 8)
elif config.device_target == "GPU":
local_device_num = os.getenv('OMPI_COMM_WORLD_LOCAL_SIZE', '1')
else:
local_device_num = 1
return int(local_device_num)
def get_rank_id():
global_rank_id = os.getenv('RANK_ID', '0')
if config.device_target == "Ascend":
global_rank_id = os.getenv('RANK_ID', '0')
elif config.device_target == "GPU":
global_rank_id = os.getenv('OMPI_COMM_WORLD_RANK', '0')
else:
global_rank_id = 0
return int(global_rank_id)