forked from mindspore-Ecosystem/mindspore
!20630 Update model scaffolding, add envs for gpu_mpi
Merge pull request !20630 from chenhaozhe/update-model-scaffolding
This commit is contained in:
commit
ed2814c3a6
|
@ -0,0 +1,127 @@
|
||||||
|
# Copyright 2021 Huawei Technologies Co., Ltd
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
# ============================================================================
|
||||||
|
|
||||||
|
"""Parse arguments"""
|
||||||
|
|
||||||
|
import os
|
||||||
|
import ast
|
||||||
|
import argparse
|
||||||
|
from pprint import pprint, pformat
|
||||||
|
import yaml
|
||||||
|
|
||||||
|
class Config:
    """
    Configuration namespace. Convert dictionary to members.
    """
    def __init__(self, cfg_dict):
        for key, value in cfg_dict.items():
            setattr(self, key, self._wrap(value))

    @staticmethod
    def _wrap(value):
        # Recursively turn nested dicts into Config objects; sequences are
        # rebuilt as lists with any dict elements converted in place.
        if isinstance(value, dict):
            return Config(value)
        if isinstance(value, (list, tuple)):
            return [Config(item) if isinstance(item, dict) else item for item in value]
        return value

    def __str__(self):
        return pformat(self.__dict__)

    def __repr__(self):
        return self.__str__()
|
||||||
|
|
||||||
|
|
||||||
|
def parse_cli_to_yaml(parser, cfg, helper=None, choices=None, cfg_path="default_config.yaml"):
    """
    Parse command line arguments to the configuration according to the default yaml.

    Args:
        parser: Parent parser.
        cfg: Base configuration (dict of option name -> default value).
        helper: Helper description per option (dict, optional).
        choices: Allowed values per option (dict, optional).
        cfg_path: Path to the default yaml config, used in fallback help text.

    Returns:
        argparse.Namespace with one attribute per scalar item in ``cfg``.
    """
    parser = argparse.ArgumentParser(description="[REPLACE THIS at config.py]",
                                     parents=[parser])
    helper = {} if helper is None else helper
    choices = {} if choices is None else choices
    for item in cfg:
        # Only scalar values become CLI flags; nested lists/dicts stay yaml-only.
        if not isinstance(cfg[item], list) and not isinstance(cfg[item], dict):
            help_description = helper.get(item, "Please reference to {}".format(cfg_path))
            choice = choices.get(item)
            if isinstance(cfg[item], bool):
                # bool("False") is truthy, so booleans are parsed with literal_eval.
                parser.add_argument("--" + item, type=ast.literal_eval, default=cfg[item], choices=choice,
                                    help=help_description)
            else:
                parser.add_argument("--" + item, type=type(cfg[item]), default=cfg[item], choices=choice,
                                    help=help_description)
    args = parser.parse_args()
    return args
|
||||||
|
|
||||||
|
|
||||||
|
def parse_yaml(yaml_path):
    """
    Parse the yaml config file.

    The file may hold 1-3 yaml documents: the config itself, an optional
    help-description document and an optional choices document.

    Args:
        yaml_path: Path to the yaml config.

    Returns:
        Tuple ``(cfg, cfg_helper, cfg_choices)``; missing documents default to {}.

    Raises:
        ValueError: If the file is not valid yaml, or contains more than 3 documents.
    """
    with open(yaml_path, 'r') as fin:
        try:
            cfgs = list(yaml.load_all(fin.read(), Loader=yaml.FullLoader))
        except yaml.YAMLError as err:
            # Chain the parser error instead of swallowing it with a bare except,
            # which also used to mask the >3-documents ValueError below.
            raise ValueError("Failed to parse yaml") from err
    if len(cfgs) == 1:
        cfg = cfgs[0]
        cfg_helper = {}
        cfg_choices = {}
    elif len(cfgs) == 2:
        cfg, cfg_helper = cfgs
        cfg_choices = {}
    elif len(cfgs) == 3:
        cfg, cfg_helper, cfg_choices = cfgs
    else:
        raise ValueError("At most 3 docs (config, description for help, choices) are supported in config yaml")
    print(cfg_helper)
    return cfg, cfg_helper, cfg_choices
|
||||||
|
|
||||||
|
|
||||||
|
def merge(args, cfg):
    """
    Merge the base config from yaml file and command line arguments.

    Args:
        args: Command line arguments (argparse.Namespace).
        cfg: Base configuration (dict), updated in place and returned.
    """
    for name, value in vars(args).items():
        # CLI values win over the yaml defaults.
        cfg[name] = value
    return cfg
|
||||||
|
|
||||||
|
|
||||||
|
def get_config():
    """
    Get Config according to the yaml file and cli arguments.
    """
    # A first, partial parse extracts only --config_path so the yaml can be
    # loaded before the full argument set is known.
    pre_parser = argparse.ArgumentParser(description="default name", add_help=False)
    current_dir = os.path.dirname(os.path.abspath(__file__))
    default_yaml = os.path.join(current_dir, "../default_config.yaml")
    pre_parser.add_argument("--config_path", type=str, default=default_yaml,
                            help="Config file path")
    path_args, _ = pre_parser.parse_known_args()
    default, helper, choices = parse_yaml(path_args.config_path)
    pprint(default)
    args = parse_cli_to_yaml(parser=pre_parser, cfg=default, helper=helper, choices=choices,
                             cfg_path=path_args.config_path)
    return Config(merge(args, default))
|
||||||
|
|
||||||
|
# Global configuration instance, parsed once when this module is imported.
config = get_config()
|
|
@ -0,0 +1,27 @@
|
||||||
|
# Copyright 2021 Huawei Technologies Co., Ltd
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
# ============================================================================
|
||||||
|
|
||||||
|
"""Device adapter for ModelArts"""
|
||||||
|
|
||||||
|
from .config import config
|
||||||
|
|
||||||
|
if config.enable_modelarts:
|
||||||
|
from .moxing_adapter import get_device_id, get_device_num, get_rank_id, get_job_id
|
||||||
|
else:
|
||||||
|
from .local_adapter import get_device_id, get_device_num, get_rank_id, get_job_id
|
||||||
|
|
||||||
|
__all__ = [
|
||||||
|
"get_device_id", "get_device_num", "get_rank_id", "get_job_id"
|
||||||
|
]
|
|
@ -0,0 +1,61 @@
|
||||||
|
# Copyright 2021 Huawei Technologies Co., Ltd
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
# ============================================================================
|
||||||
|
|
||||||
|
"""Local adapter"""
|
||||||
|
|
||||||
|
import os
|
||||||
|
from .config import config
|
||||||
|
|
||||||
|
def get_device_id():
    """Return this process's local device id as an int; 0 for other targets."""
    target = config.device_target
    if target == "Ascend":
        return int(os.getenv('DEVICE_ID', '0'))
    if target == "GPU":
        return int(os.getenv('OMPI_COMM_WORLD_LOCAL_RANK', '0'))
    return 0
|
||||||
|
|
||||||
|
|
||||||
|
def get_device_num():
    """Return the total number of devices taking part in the job (int, >= 1)."""
    env_name = {"Ascend": 'RANK_SIZE', "GPU": 'OMPI_COMM_WORLD_SIZE'}.get(config.device_target)
    if env_name is None:
        return 1
    return int(os.getenv(env_name, '1'))
|
||||||
|
|
||||||
|
|
||||||
|
def get_local_device_num():
    """
    Return the number of devices on this server (int, >= 1).

    Ascend servers host at most 8 devices, so the job-wide count is capped
    at 8; on GPU the per-node size comes from OpenMPI.
    """
    if config.device_target == "Ascend":
        # Bug fix: the original passed the function object `get_device_num`
        # to min(), which raises TypeError on Python 3; it must be called.
        local_device_num = min(get_device_num(), 8)
    elif config.device_target == "GPU":
        local_device_num = os.getenv('OMPI_COMM_WORLD_LOCAL_SIZE', '1')
    else:
        local_device_num = 1
    return int(local_device_num)
|
||||||
|
|
||||||
|
|
||||||
|
def get_rank_id():
    """Return this process's global rank as an int (0 in single-device runs)."""
    target = config.device_target
    if target == "Ascend":
        return int(os.getenv('RANK_ID', '0'))
    if target == "GPU":
        return int(os.getenv('OMPI_COMM_WORLD_RANK', '0'))
    return 0
|
||||||
|
|
||||||
|
def get_job_id():
    """Identify the job when running outside the cloud environment."""
    job_id = "Local Job"
    return job_id
|
|
@ -0,0 +1,141 @@
|
||||||
|
# Copyright 2021 Huawei Technologies Co., Ltd
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
# ============================================================================
|
||||||
|
|
||||||
|
"""Moxing adapter for ModelArts"""
|
||||||
|
|
||||||
|
import os
|
||||||
|
import functools
|
||||||
|
from mindspore import context
|
||||||
|
from .config import config
|
||||||
|
|
||||||
|
# Counter appended to the sync lock-file name so successive sync_data calls
# use distinct lock files.
_global_sync_count = 0
|
||||||
|
|
||||||
|
def get_device_id():
    """Return the id of the device this process drives (int; 0 otherwise)."""
    env_name = {"Ascend": 'DEVICE_ID', "GPU": 'OMPI_COMM_WORLD_LOCAL_RANK'}.get(config.device_target)
    device_id = os.getenv(env_name, '0') if env_name else 0
    return int(device_id)
|
||||||
|
|
||||||
|
|
||||||
|
def get_device_num():
    """Return the total number of devices participating in the job (int)."""
    target = config.device_target
    if target == "Ascend":
        return int(os.getenv('RANK_SIZE', '1'))
    if target == "GPU":
        return int(os.getenv('OMPI_COMM_WORLD_SIZE', '1'))
    return 1
|
||||||
|
|
||||||
|
|
||||||
|
def get_local_device_num():
    """
    Return the number of devices attached to this server (int, >= 1).

    Ascend caps the count at 8 devices per server; GPU reads the per-node
    size from OpenMPI.
    """
    if config.device_target == "Ascend":
        # Bug fix: `min(get_device_num, 8)` compared the function object to
        # an int (TypeError); the function has to be called.
        local_device_num = min(get_device_num(), 8)
    elif config.device_target == "GPU":
        local_device_num = os.getenv('OMPI_COMM_WORLD_LOCAL_SIZE', '1')
    else:
        local_device_num = 1
    return int(local_device_num)
|
||||||
|
|
||||||
|
|
||||||
|
def get_rank_id():
    """Return the global rank of this process (int; 0 when not distributed)."""
    env_name = {"Ascend": 'RANK_ID', "GPU": 'OMPI_COMM_WORLD_RANK'}.get(config.device_target)
    global_rank_id = os.getenv(env_name, '0') if env_name else 0
    return int(global_rank_id)
|
||||||
|
|
||||||
|
|
||||||
|
def get_job_id():
    """Return the ModelArts job id, or "default" when it is not set.

    Bug fix: os.getenv returns None when JOB_ID is unset, and the original
    `job_id != ""` check let that None leak through to the caller.
    """
    job_id = os.getenv('JOB_ID')
    return job_id if job_id else "default"
|
||||||
|
|
||||||
|
def sync_data(from_path, to_path):
    """
    Download data from remote obs to local directory if the first url is remote url and the second one is local path
    Upload data from local directory to remote obs in contrast.
    """
    # moxing is only available inside ModelArts, so it is imported lazily here
    # rather than at module level.
    import moxing as mox
    import time
    global _global_sync_count
    # One lock file per call; the counter keeps successive syncs independent.
    sync_lock = "/tmp/copy_sync.lock" + str(_global_sync_count)
    _global_sync_count += 1

    # Each server contains 8 devices as most.
    # Only the first device on each server performs the copy; all other
    # processes fall through to the wait loop below until the lock appears.
    if get_device_id() % min(get_device_num(), 8) == 0 and not os.path.exists(sync_lock):
        print("from path: ", from_path)
        print("to path: ", to_path)
        mox.file.copy_parallel(from_path, to_path)
        print("===finish data synchronization===")
        try:
            os.mknod(sync_lock)
        except IOError:
            # Another process may have created the lock first; ignore.
            pass
        print("===save flag===")

    # Block until the copying process signals completion via the lock file.
    while True:
        if os.path.exists(sync_lock):
            break
        time.sleep(1)

    print("Finish sync data from {} to {}.".format(from_path, to_path))
|
||||||
|
|
||||||
|
|
||||||
|
def moxing_wrapper(pre_process=None, post_process=None):
    """
    Moxing wrapper to download dataset and upload outputs.

    Args:
        pre_process: Optional callable invoked after inputs are downloaded,
            before the wrapped function runs.
        post_process: Optional callable invoked after the wrapped function,
            before outputs are uploaded.
    """
    def wrapper(run_func):
        @functools.wraps(run_func)
        def wrapped_func(*args, **kwargs):
            # Download data from data_url
            if config.enable_modelarts:
                if config.data_url:
                    sync_data(config.data_url, config.data_path)
                    print("Dataset downloaded: ", os.listdir(config.data_path))
                if config.checkpoint_url:
                    sync_data(config.checkpoint_url, config.load_path)
                    print("Preload downloaded: ", os.listdir(config.load_path))
                if config.train_url:
                    sync_data(config.train_url, config.output_path)
                    print("Workspace downloaded: ", os.listdir(config.output_path))

                # Keep saved graphs in a per-rank subdirectory so concurrent
                # workers do not overwrite each other's output.
                context.set_context(save_graphs_path=os.path.join(config.output_path, str(get_rank_id())))
                config.device_num = get_device_num()
                config.device_id = get_device_id()
                if not os.path.exists(config.output_path):
                    os.makedirs(config.output_path)

                if pre_process:
                    pre_process()

            # Run the main function
            run_func(*args, **kwargs)

            # Upload data to train_url
            if config.enable_modelarts:
                if post_process:
                    post_process()

                if config.train_url:
                    print("Start to copy output directory")
                    sync_data(config.output_path, config.train_url)
        return wrapped_func
    return wrapper
|
|
@ -16,21 +16,46 @@
|
||||||
"""Local adapter"""
|
"""Local adapter"""
|
||||||
|
|
||||||
import os
|
import os
|
||||||
|
from .config import config
|
||||||
|
|
||||||
def get_device_id():
|
def get_device_id():
|
||||||
|
if config.device_target == "Ascend":
|
||||||
device_id = os.getenv('DEVICE_ID', '0')
|
device_id = os.getenv('DEVICE_ID', '0')
|
||||||
|
elif config.device_target == "GPU":
|
||||||
|
device_id = os.getenv('OMPI_COMM_WORLD_LOCAL_RANK', '0')
|
||||||
|
else:
|
||||||
|
device_id = 0
|
||||||
return int(device_id)
|
return int(device_id)
|
||||||
|
|
||||||
|
|
||||||
def get_device_num():
|
def get_device_num():
|
||||||
device_num = os.getenv('RANK_SIZE', '1')
|
if config.device_target == "Ascend":
|
||||||
return int(device_num)
|
local_device_num = os.getenv('RANK_SIZE', '1')
|
||||||
|
elif config.device_target == "GPU":
|
||||||
|
local_device_num = os.getenv('OMPI_COMM_WORLD_SIZE', '1')
|
||||||
|
else:
|
||||||
|
local_device_num = 1
|
||||||
|
return int(local_device_num)
|
||||||
|
|
||||||
|
|
||||||
|
def get_local_device_num():
|
||||||
|
if config.device_target == "Ascend":
|
||||||
|
local_device_num = min(get_device_num, 8)
|
||||||
|
elif config.device_target == "GPU":
|
||||||
|
local_device_num = os.getenv('OMPI_COMM_WORLD_LOCAL_SIZE', '1')
|
||||||
|
else:
|
||||||
|
local_device_num = 1
|
||||||
|
return int(local_device_num)
|
||||||
|
|
||||||
|
|
||||||
def get_rank_id():
|
def get_rank_id():
|
||||||
|
if config.device_target == "Ascend":
|
||||||
global_rank_id = os.getenv('RANK_ID', '0')
|
global_rank_id = os.getenv('RANK_ID', '0')
|
||||||
|
elif config.device_target == "GPU":
|
||||||
|
global_rank_id = os.getenv('OMPI_COMM_WORLD_RANK', '0')
|
||||||
|
else:
|
||||||
|
global_rank_id = 0
|
||||||
return int(global_rank_id)
|
return int(global_rank_id)
|
||||||
|
|
||||||
|
|
||||||
def get_job_id():
|
def get_job_id():
|
||||||
return "Local Job"
|
return "Local Job"
|
||||||
|
|
|
@ -23,17 +23,42 @@ from .config import config
|
||||||
_global_sync_count = 0
|
_global_sync_count = 0
|
||||||
|
|
||||||
def get_device_id():
|
def get_device_id():
|
||||||
|
if config.device_target == "Ascend":
|
||||||
device_id = os.getenv('DEVICE_ID', '0')
|
device_id = os.getenv('DEVICE_ID', '0')
|
||||||
|
elif config.device_target == "GPU":
|
||||||
|
device_id = os.getenv('OMPI_COMM_WORLD_LOCAL_RANK', '0')
|
||||||
|
else:
|
||||||
|
device_id = 0
|
||||||
return int(device_id)
|
return int(device_id)
|
||||||
|
|
||||||
|
|
||||||
def get_device_num():
|
def get_device_num():
|
||||||
device_num = os.getenv('RANK_SIZE', '1')
|
if config.device_target == "Ascend":
|
||||||
return int(device_num)
|
local_device_num = os.getenv('RANK_SIZE', '1')
|
||||||
|
elif config.device_target == "GPU":
|
||||||
|
local_device_num = os.getenv('OMPI_COMM_WORLD_SIZE', '1')
|
||||||
|
else:
|
||||||
|
local_device_num = 1
|
||||||
|
return int(local_device_num)
|
||||||
|
|
||||||
|
|
||||||
|
def get_local_device_num():
|
||||||
|
if config.device_target == "Ascend":
|
||||||
|
local_device_num = min(get_device_num, 8)
|
||||||
|
elif config.device_target == "GPU":
|
||||||
|
local_device_num = os.getenv('OMPI_COMM_WORLD_LOCAL_SIZE', '1')
|
||||||
|
else:
|
||||||
|
local_device_num = 1
|
||||||
|
return int(local_device_num)
|
||||||
|
|
||||||
|
|
||||||
def get_rank_id():
|
def get_rank_id():
|
||||||
|
if config.device_target == "Ascend":
|
||||||
global_rank_id = os.getenv('RANK_ID', '0')
|
global_rank_id = os.getenv('RANK_ID', '0')
|
||||||
|
elif config.device_target == "GPU":
|
||||||
|
global_rank_id = os.getenv('OMPI_COMM_WORLD_RANK', '0')
|
||||||
|
else:
|
||||||
|
global_rank_id = 0
|
||||||
return int(global_rank_id)
|
return int(global_rank_id)
|
||||||
|
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue