!18887 merge dynamics

Merge pull request !18887 from Maige/dynamics
2021-06-26 08:04:58 +00:00 · 2021-06-26 08:04:58 +00:00 · bc1a1fc8be
parent 2384cadeb9 99a1ab0f10
commit bc1a1fc8be
8 changed files with 412 additions and 16 deletions
--- a/model_zoo/research/hpc/molecular_dynamics/README.md
+++ b/model_zoo/research/hpc/molecular_dynamics/README.md
@ -74,9 +74,15 @@ In `deepmodeling/deepmd-kit/source`:
    ├── script
    │   ├── eval.sh                 # evaluation script
    ├── src
+    │   ├── model_utils
+    │   │   ├── config.py           # Parameter config
+    │   │   ├── moxing_adapter.py   # modelarts device configuration
+    │   │   ├── device_adapter.py   # Device Config
+    │   │   └── local_adapter.py    # local device config
    │   ├── descriptor.py           # descriptor function
    │   └── network.py              # MD simulation architecture
    └── eval.py                     # evaluation interface
+    └── default_config.yaml         # config file
 ```

 ### Training Process
@ -88,7 +94,7 @@ To Be Done
 After installing MindSpore via the official website, you can start evaluation as follows:

 ```shell
-python eval.py --dataset_path [DATASET_PATH] --checkpoint_path [CHECKPOINT_PATH]
+python eval.py --dataset_path [DATASET_PATH] --checkpoint_path [CHECKPOINT_PATH] --baseline_path [BASELINE_PATH]
 ```

 > checkpoint can be trained by using DeePMD-kit, and convert into the ckpt of MindSpore.
@ -102,6 +108,39 @@ energy: -29944.03
 atom_energy: -94.38766   -94.294426  -94.39194   -94.70758   -94.51311   -94.457954 ...
 ```

+- running on ModelArts
+- If you want to run the model on ModelArts, you can refer to the [official guidance document](https://support.huaweicloud.com/modelarts/) of ModelArts.
+
+```python
+#  Example of running molecular dynamics inference on modelarts :
+#  Data set storage method
+
+#  ├── molecular_dynamics_dataset                               # dataset dir
+#    ├──baseline.npz                                            # baseline dataset
+#    ├──input_tensor.npz                                        # infer input dataset
+#    ├──water_md.ckpt                                           # checkpoint
+
+# Choose either a (modify yaml file parameters) or b (modelArts create training job to modify parameters) 。
+# Example of using model inference on modelarts
+# (1) Place the trained model to the corresponding position of the bucket。
+# (2) Choose a or b.
+#        a.set "enable_modelarts=True"
+#          set "checkpoint_path=/cache/data/water_md.ckpt"
+#          set "dataset_path=/cache/data/input_tensor.npz"
+#          set "baseline_path=/cache/data/baseline.npz"
+
+#       b. Add "enable_modelarts=True" parameter on the interface of modelarts.
+#          Set the parameters required by method a on the modelarts interface
+#          Note: The path parameter does not need to be quoted
+
+# (3) Set the path of the network configuration file "_config_path=/The path of config in default_config.yaml/"
+# (4) Set the code path on the modelarts interface "/path/molecular_dynamics"。
+# (5) Set the model's startup file on the modelarts interface "eval.py" 。
+# (6) Set the data path of the model on the modelarts interface ".../molecular_dynamics"(choices molecular_dynamics Folder path) ,
+# The output path of the model "Output file path" and the log path of the model "Job log path"  。
+# (7) Start model inference。
+```
+
 ## ModelZoo Homepage

 Please check the official [homepage](https://gitee.com/mindspore/mindspore/tree/master/model_zoo).
--- a/model_zoo/research/hpc/molecular_dynamics/default_config.yaml
+++ b/model_zoo/research/hpc/molecular_dynamics/default_config.yaml
@ -0,0 +1,32 @@
+# Builtin Configurations (DO NOT CHANGE THESE CONFIGURATIONS unless you know exactly what you are doing)
+enable_modelarts: False
+# url for modelarts
+data_url: ""
+train_url: ""
+checkpoint_url: ""
+# path for local
+data_path: "/cache/data"
+output_path: "/cache/train"
+load_path: "/cache/checkpoint_path"
+device_target: "Ascend"
+enable_profiling: False
+
+# ======================================================================================
+# Eval options
+checkpoint_path: ""
+dataset_path: ""
+
+---
+# Help description for each configuration
+enable_modelarts: "Whether training on modelarts default: False"
+data_url: "Url for modelarts"
+train_url: "Url for modelarts"
+data_path: "The location of input data"
+output_path: "The location of the output file"
+device_target: "Target device type, e.g. Ascend or GPU. (Default: Ascend)"
+enable_profiling: "Whether enable profiling while training default: False"
+file_name: "CNN&CTC output air name"
+file_format: "choices [AIR, MINDIR]"
+ckpt_file: "CNN&CTC ckpt file"
+checkpoint_path: "Checkpoint file path"
+dataset_path: "Dataset path"
--- a/model_zoo/research/hpc/molecular_dynamics/eval.py
+++ b/model_zoo/research/hpc/molecular_dynamics/eval.py
@ -13,25 +13,30 @@
 # limitations under the License.
 # ============================================================================
 """eval."""
-import argparse
 import numpy as np
-
 import mindspore.common.dtype as mstype
 from mindspore import Tensor
 from mindspore import context
 from mindspore.train.serialization import load_checkpoint, load_param_into_net
 from src.network import Network
+from src.model_utils.config import config
+from src.model_utils.moxing_adapter import moxing_wrapper

-parser = argparse.ArgumentParser(description='MD Simulation')
-parser.add_argument('--checkpoint_path', type=str, default=None, help='Checkpoint file path')
-parser.add_argument('--dataset_path', type=str, default=None, help='Dataset path')
-args_opt = parser.parse_args()

-context.set_context(mode=context.GRAPH_MODE, save_graphs=False, device_target="Ascend")
+context.set_context(mode=context.GRAPH_MODE, save_graphs=False, device_target=config.device_target)

-if __name__ == '__main__':
+
+def modelarts_pre_process():
+    """Pre-process hook handed to moxing_wrapper; intentionally does nothing."""
+
+
+@moxing_wrapper(pre_process=modelarts_pre_process)
+def model_eval():
+    """
+    infer network
+    """
    # get input data
-    r = np.load(args_opt.dataset_path)
+    r = np.load(config.dataset_path)
    d_coord, d_nlist, avg, std, atype, nlist = r['d_coord'], r['d_nlist'], r['avg'], r['std'], r['atype'], r['nlist']
    batch_size = 1
    atype_tensor = Tensor(atype)
@ -46,10 +51,14 @@ if __name__ == '__main__':
    frames = Tensor(frames)
    # evaluation
    net = Network()
-    param_dict = load_checkpoint(args_opt.checkpoint_path)
+    param_dict = load_checkpoint(config.checkpoint_path)
    load_param_into_net(net, param_dict)
    net.to_float(mstype.float32)
    energy, atom_ener, _ = \
        net(d_coord_tensor, d_nlist_tensor, frames, avg_tensor, std_tensor, atype_tensor, nlist_tensor)
    print('energy:', energy)
    print('atom_energy:', atom_ener)
+
+
+if __name__ == '__main__':
+    model_eval()
--- a/model_zoo/research/hpc/molecular_dynamics/src/model_utils/init.py
+++ b/model_zoo/research/hpc/molecular_dynamics/src/model_utils/init.py
--- a/model_zoo/research/hpc/molecular_dynamics/src/model_utils/config.py
+++ b/model_zoo/research/hpc/molecular_dynamics/src/model_utils/config.py
@ -0,0 +1,130 @@
+# Copyright 2021 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ====================================================================================
+
+"""Parse arguments"""
+import os
+import ast
+import argparse
+from pprint import pprint, pformat
+import yaml
+
+
+# Location of the default yaml config, relative to this module's directory
+# (get_config joins it onto the directory that contains this file).
+_config_path = '../../default_config.yaml'
+
+
+class Config:
+    """
+    Attribute-style view of a configuration dictionary.
+
+    Every key of the input dict becomes an instance attribute. Nested dicts are
+    wrapped in Config recursively; lists and tuples are stored as lists whose
+    dict elements are wrapped the same way.
+    """
+    def __init__(self, cfg_dict):
+        def _convert(value):
+            # Only dicts are wrapped; any other value is stored untouched.
+            return Config(value) if isinstance(value, dict) else value
+
+        for name, value in cfg_dict.items():
+            if isinstance(value, (list, tuple)):
+                converted = [_convert(element) for element in value]
+            else:
+                converted = _convert(value)
+            setattr(self, name, converted)
+
+    def __str__(self):
+        # Pretty-printed dump of all attributes.
+        return pformat(self.__dict__)
+
+    def __repr__(self):
+        return self.__str__()
+
+
+def parse_cli_to_yaml(parser, cfg, helper=None, choices=None, cfg_path='default_config.yaml'):
+    """
+    Parse command line arguments to the configuration according to the default yaml
+
+    Every scalar entry of ``cfg`` becomes a ``--<key>`` option whose default is
+    the yaml value. List and dict entries are not exposed on the command line.
+
+    Args:
+        parser: Parent parser
+        cfg: Base configuration
+        helper: Helper description
+        choices: Allowed values per option, keyed by option name
+        cfg_path: Path to the default yaml config
+
+    Returns:
+        The argparse namespace produced from the process command line
+    """
+    parser = argparse.ArgumentParser(description='[REPLACE THIS at config.py]',
+                                     parents=[parser])
+    helper = {} if helper is None else helper
+    choices = {} if choices is None else choices
+    for item, default in cfg.items():
+        if isinstance(default, (list, dict)):
+            continue
+        help_description = helper.get(item, 'Please reference to {}'.format(cfg_path))
+        if isinstance(default, bool):
+            # bool('False') is True, so booleans are parsed as Python literals.
+            arg_type = ast.literal_eval
+        elif default is None:
+            # A yaml null has type NoneType, which cannot convert a CLI string;
+            # accept the raw string instead so the option stays usable.
+            arg_type = str
+        else:
+            arg_type = type(default)
+        parser.add_argument('--' + item, type=arg_type, default=default, choices=choices.get(item),
+                            help=help_description)
+    return parser.parse_args()
+
+
+def parse_yaml(yaml_path):
+    """
+    Parse the yaml config file
+
+    The file may hold up to three yaml documents, in this order: the
+    configuration, the help text per key, and the allowed choices per key.
+    Missing trailing documents are returned as empty dicts.
+
+    Args:
+        yaml_path: Path to the yaml config
+
+    Returns:
+        Tuple ``(cfg, cfg_helper, cfg_choices)``
+
+    Raises:
+        ValueError: if the file is not valid yaml, or does not contain between
+            one and three documents
+    """
+    with open(yaml_path, 'r') as fin:
+        text = fin.read()
+    try:
+        # load_all is lazy, so the documents are materialised inside the try.
+        # NOTE(review): FullLoader is kept from the original; consider
+        # yaml.safe_load_all if config files can come from untrusted sources.
+        cfgs = list(yaml.load_all(text, Loader=yaml.FullLoader))
+    except yaml.YAMLError as err:
+        raise ValueError('Failed to parse yaml') from err
+    if not 1 <= len(cfgs) <= 3:
+        raise ValueError('Expected 1 to 3 docs (config, description for help, choices) in config yaml, '
+                         'got {}'.format(len(cfgs)))
+    # Pad the missing optional documents with empty dicts.
+    cfg, cfg_helper, cfg_choices = (cfgs + [{}, {}])[:3]
+    print(cfg_helper)
+    return cfg, cfg_helper, cfg_choices
+
+
+def merge(args, cfg):
+    """
+    Overlay the parsed command line values onto the yaml configuration
+
+    Args:
+        args: Namespace of parsed command line arguments
+        cfg: Base configuration; it is updated in place
+
+    Returns:
+        The same ``cfg`` object after the update
+    """
+    for name, value in vars(args).items():
+        cfg[name] = value
+    return cfg
+
+
+def get_config():
+    """
+    Build the runtime Config from the yaml file and the command line
+
+    The yaml file supplies the defaults; matching command line options
+    override them.
+    """
+    module_dir = os.path.dirname(os.path.abspath(__file__))
+    default_yaml = os.path.join(module_dir, _config_path)
+    base_parser = argparse.ArgumentParser(description='default name', add_help=False)
+    base_parser.add_argument('--config_path', type=str, default=default_yaml,
+                             help='Config file path')
+    # Only --config_path is declared at this stage, so unknown options are
+    # tolerated here and parsed strictly once the yaml keys are registered.
+    known_args, _ = base_parser.parse_known_args()
+    yaml_path = known_args.config_path
+    default, helper, choices = parse_yaml(yaml_path)
+    pprint(default)
+    cli_args = parse_cli_to_yaml(parser=base_parser, cfg=default, helper=helper, choices=choices,
+                                 cfg_path=yaml_path)
+    return Config(merge(cli_args, default))
+
+# Built once at import time: importing this module reads the yaml file and
+# parses the process command line.
+config = get_config()
--- a/model_zoo/research/hpc/molecular_dynamics/src/model_utils/device_adapter.py
+++ b/model_zoo/research/hpc/molecular_dynamics/src/model_utils/device_adapter.py
@ -0,0 +1,26 @@
+# Copyright 2021 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ====================================================================================
+
+"""Device adapter for ModelArts"""
+
+from .config import config
+
+# Re-export one set of device helpers: the local ones by default, the
+# ModelArts ones when enable_modelarts is set in the configuration.
+if not config.enable_modelarts:
+    from .local_adapter import get_device_id, get_device_num, get_rank_id, get_job_id
+else:
+    from .moxing_adapter import get_device_id, get_device_num, get_rank_id, get_job_id
+
+__all__ = ['get_device_id', 'get_device_num', 'get_job_id', 'get_rank_id']
--- a/model_zoo/research/hpc/molecular_dynamics/src/model_utils/local_adapter.py
+++ b/model_zoo/research/hpc/molecular_dynamics/src/model_utils/local_adapter.py
@ -0,0 +1,36 @@
+# Copyright 2021 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ====================================================================================
+
+"""Local adapter"""
+
+import os
+
+def get_device_id():
+    """Return the DEVICE_ID environment variable as an int (0 when unset)."""
+    return int(os.environ.get('DEVICE_ID', '0'))
+
+
+def get_device_num():
+    """Return the RANK_SIZE environment variable as an int (1 when unset)."""
+    return int(os.environ.get('RANK_SIZE', '1'))
+
+
+def get_rank_id():
+    """Return the RANK_ID environment variable as an int (0 when unset)."""
+    return int(os.environ.get('RANK_ID', '0'))
+
+
+def get_job_id():
+    """Return the fixed job name reported by the local adapter."""
+    local_job_name = 'Local Job'
+    return local_job_name
--- a/model_zoo/research/hpc/molecular_dynamics/src/model_utils/moxing_adapter.py
+++ b/model_zoo/research/hpc/molecular_dynamics/src/model_utils/moxing_adapter.py
@ -0,0 +1,124 @@
+# Copyright 2021 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ====================================================================================
+
+"""Moxing adapter for ModelArts"""
+
+import os
+import functools
+from mindspore import context
+from .config import config
+
+
+# Number of sync_data calls made so far in this process; sync_data appends it
+# to the lock-file name so that every call uses its own lock file.
+_global_syn_count = 0
+
+
+def get_device_id():
+    """Return the DEVICE_ID environment variable as an int (0 when unset)."""
+    return int(os.environ.get('DEVICE_ID', '0'))
+
+
+def get_device_num():
+    """Return the RANK_SIZE environment variable as an int (1 when unset)."""
+    return int(os.environ.get('RANK_SIZE', '1'))
+
+
+def get_rank_id():
+    """Return the RANK_ID environment variable as an int (0 when unset)."""
+    return int(os.environ.get('RANK_ID', '0'))
+
+
+def get_job_id():
+    """
+    Return the job identifier taken from the JOB_ID environment variable.
+
+    Falls back to "default" when JOB_ID is unset or empty, so the result is
+    always a non-empty string.
+    """
+    # os.getenv yields None for an unset variable; `or` covers None and "".
+    return os.getenv('JOB_ID') or 'default'
+
+
+def sync_data(from_path, to_path):
+    """
+    Copy data between remote obs and a local directory.
+
+    Download data from remote obs to local directory if the first url is remote url and the second one is local
+    Upload data from local directory to remote obs in contrast
+
+    Only a process whose device id is a multiple of min(device count, 8)
+    performs the copy; every process then waits for the lock file that marks
+    the copy as finished.
+    """
+    import moxing as mox
+    import time
+    global _global_syn_count
+    # Each call gets its own lock file, numbered by the per-process call counter.
+    lock_file = '/tmp/copy_sync.lock' + str(_global_syn_count)
+    _global_syn_count += 1
+
+    # Each server contains 8 devices at most
+    device_id = get_device_id()
+    is_copier = device_id % min(get_device_num(), 8) == 0
+    if is_copier and not os.path.exists(lock_file):
+        print('from path: ', from_path)
+        print('to path: ', to_path)
+        mox.file.copy_parallel(from_path, to_path)
+        print('===finished data synchronization===')
+        try:
+            os.mknod(lock_file)
+        except IOError:
+            pass
+        print('===save flag===')
+
+    # Poll once per second until the lock file shows up.
+    while not os.path.exists(lock_file):
+        time.sleep(1)
+    print('Finish sync data from {} to {}'.format(from_path, to_path))
+
+
+def moxing_wrapper(pre_process=None, post_process=None):
+    """
+    Moxing wrapper to download dataset and upload outputs
+
+    When config.enable_modelarts is false the decorated function is simply
+    called. Otherwise the configured urls are synchronised to their local
+    paths first, and the output directory is uploaded to train_url afterwards.
+
+    Args:
+        pre_process: Optional zero-argument callable run after the downloads
+            and before the wrapped function (ModelArts only)
+        post_process: Optional zero-argument callable run after the wrapped
+            function and before the upload (ModelArts only)
+
+    Returns:
+        A decorator; the decorated function returns whatever the wrapped
+        function returns.
+    """
+    def wrapper(run_func):
+        @functools.wraps(run_func)
+        def wrapped_func(*args, **kwargs):
+            # Download data from data_url
+            if config.enable_modelarts:
+                if config.data_url:
+                    sync_data(config.data_url, config.data_path)
+                    print('Dataset downloaded: ', os.listdir(config.data_path))
+                if config.checkpoint_url:
+                    if not os.path.exists(config.load_path):
+                        # NOTE(review): the directory creation below is commented out,
+                        # so this branch always reports 'makedirs fail'. Confirm whether
+                        # load_path should be created here or the messages removed.
+                        # os.makedirs(config.load_path)
+                        print('=' * 20 + 'makedirs')
+                        if os.path.isdir(config.load_path):
+                            print('=' * 20 + 'makedirs success')
+                        else:
+                            print('=' * 20 + 'makedirs fail')
+                    sync_data(config.checkpoint_url, config.load_path)
+                    print('Preload downloaded: ', os.listdir(config.load_path))
+                if config.train_url:
+                    sync_data(config.train_url, config.output_path)
+                    print('Workspace downloaded: ', os.listdir(config.output_path))
+
+                context.set_context(save_graphs_path=os.path.join(config.output_path, str(get_rank_id())))
+                config.device_num = get_device_num()
+                config.device_id = get_device_id()
+                if not os.path.exists(config.output_path):
+                    # exist_ok: several device processes can pass the check above
+                    # at the same time, and only one of them wins the creation.
+                    os.makedirs(config.output_path, exist_ok=True)
+
+                if pre_process:
+                    pre_process()
+
+            # Keep the result so the decorator does not swallow it.
+            result = run_func(*args, **kwargs)
+
+            # Upload data to train_url
+            if config.enable_modelarts:
+                if post_process:
+                    post_process()
+
+                if config.train_url:
+                    print('Start to copy output directory')
+                    sync_data(config.output_path, config.train_url)
+            return result
+        return wrapped_func
+    return wrapper