diff --git a/model_zoo/research/hpc/molecular_dynamics/README.md b/model_zoo/research/hpc/molecular_dynamics/README.md index 585c0d01c77..7ca514b59c6 100644 --- a/model_zoo/research/hpc/molecular_dynamics/README.md +++ b/model_zoo/research/hpc/molecular_dynamics/README.md @@ -70,13 +70,19 @@ In `deepmodeling/deepmd-kit/source`: ```shell ├── md - ├── README.md # descriptions about MD + ├── README.md # descriptions about MD ├── script - │ ├── eval.sh # evaluation script + │ ├── eval.sh # evaluation script ├── src - │ ├── descriptor.py # descriptor function - │ └── network.py # MD simulation architecture - └── eval.py # evaluation interface + │ ├── model_utils + │ ├── config.py # Parameter config + │ ├── moxing_adapter.py # modelarts device configuration + │ ├── device_adapter.py # Device Config + │ ├── local_adapter.py # local device config + │ ├── descriptor.py # descriptor function + │ └── network.py # MD simulation architecture + ├── eval.py # evaluation interface + └── default_config.yaml # config file ``` ### Training Process @@ -88,7 +94,7 @@ To Be Done After installing MindSpore via the official website, you can start evaluation as follows: ```shell -python eval.py --dataset_path [DATASET_PATH] --checkpoint_path [CHECKPOINT_PATH] +python eval.py --dataset_path [DATASET_PATH] --checkpoint_path [CHECKPOINT_PATH] --baseline_path [BASELINE_PATH] ``` > checkpoint can be trained by using DeePMD-kit, and convert into the ckpt of MindSpore. @@ -102,6 +108,39 @@ energy: -29944.03 atom_energy: -94.38766 -94.294426 -94.39194 -94.70758 -94.51311 -94.457954 ... 
``` +- running on ModelArts +- If you want to run the model on ModelArts, you can refer to the [official guidance document](https://support.huaweicloud.com/modelarts/) of ModelArts. + +```python +# Example of running inference on ModelArts: +# Data set storage method + +# ├── molecular_dynamics_dataset # dataset dir +# ├──baseline.npz # baseline dataset +# ├──input_tensor.npz # infer input dataset +# ├──water_md.ckpt # checkpoint + +# Choose either a (modify yaml file parameters) or b (create a ModelArts training job and modify its parameters). +# Example of using model inference on ModelArts +# (1) Place the trained model in the corresponding location of the bucket. +# (2) Choose a or b. +# a. Set "enable_modelarts=True" +# set "checkpoint_path=/cache/data/water_md.ckpt" +# set "dataset_path=/cache/data/input_tensor.npz" +# set "baseline_path=/cache/data/baseline.npz" + +# b. Add the "enable_modelarts=True" parameter on the ModelArts interface. +# Set the parameters required by method a on the ModelArts interface +# Note: The path parameter does not need to be quoted + +# (3) Set the path of the network configuration file "config_path=/The path of config in default_config.yaml/" +# (4) Set the code path on the ModelArts interface "/path/molecular_dynamics". +# (5) Set the model's startup file on the ModelArts interface "eval.py". +# (6) Set the data path of the model on the ModelArts interface ".../molecular_dynamics" (choose the molecular_dynamics folder path), +# the output path of the model "Output file path" and the log path of the model "Job log path". +# (7) Start model inference. +``` + ## ModelZoo Homepage Please check the official [homepage](https://gitee.com/mindspore/mindspore/tree/master/model_zoo). 
diff --git a/model_zoo/research/hpc/molecular_dynamics/default_config.yaml b/model_zoo/research/hpc/molecular_dynamics/default_config.yaml new file mode 100644 index 00000000000..1af4b6d7739 --- /dev/null +++ b/model_zoo/research/hpc/molecular_dynamics/default_config.yaml @@ -0,0 +1,32 @@ +# Builtin Configurations (DO NOT CHANGE THESE CONFIGURATIONS unless you know exactly what you are doing) +enable_modelarts: False +# url for modelarts +data_url: "" +train_url: "" +checkpoint_url: "" +# path for local +data_path: "/cache/data" +output_path: "/cache/train" +load_path: "/cache/checkpoint_path" +device_target: "Ascend" +enable_profiling: False + +# ====================================================================================== +# Eval options +checkpoint_path: "" +dataset_path: "" + +--- +# Help description for each configuration +enable_modelarts: "Whether training on modelarts default: False" +data_url: "Url for modelarts" +train_url: "Url for modelarts" +data_path: "The location of input data" +output_path: "The location of the output file" +device_target: "Target device type, GPU or Ascend. (Default: Ascend)" +enable_profiling: "Whether enable profiling while training default: False" +file_name: "CNN&CTC output air name" +file_format: "choices [AIR, MINDIR]" +ckpt_file: "CNN&CTC ckpt file" +checkpoint_path: "Checkpoint file path" +dataset_path: "Dataset path" diff --git a/model_zoo/research/hpc/molecular_dynamics/eval.py b/model_zoo/research/hpc/molecular_dynamics/eval.py index 2f9799e256a..703b73731f4 100644 --- a/model_zoo/research/hpc/molecular_dynamics/eval.py +++ b/model_zoo/research/hpc/molecular_dynamics/eval.py @@ -13,25 +13,30 @@ # limitations under the License. 
# ============================================================================ """eval.""" -import argparse import numpy as np - import mindspore.common.dtype as mstype from mindspore import Tensor from mindspore import context from mindspore.train.serialization import load_checkpoint, load_param_into_net from src.network import Network +from src.model_utils.config import config +from src.model_utils.moxing_adapter import moxing_wrapper -parser = argparse.ArgumentParser(description='MD Simulation') -parser.add_argument('--checkpoint_path', type=str, default=None, help='Checkpoint file path') -parser.add_argument('--dataset_path', type=str, default=None, help='Dataset path') -args_opt = parser.parse_args() -context.set_context(mode=context.GRAPH_MODE, save_graphs=False, device_target="Ascend") +context.set_context(mode=context.GRAPH_MODE, save_graphs=False, device_target=config.device_target) -if __name__ == '__main__': + +def modelarts_pre_process(): + pass + + +@moxing_wrapper(pre_process=modelarts_pre_process) +def model_eval(): + """ + infer network + """ # get input data - r = np.load(args_opt.dataset_path) + r = np.load(config.dataset_path) d_coord, d_nlist, avg, std, atype, nlist = r['d_coord'], r['d_nlist'], r['avg'], r['std'], r['atype'], r['nlist'] batch_size = 1 atype_tensor = Tensor(atype) @@ -46,10 +51,14 @@ if __name__ == '__main__': frames = Tensor(frames) # evaluation net = Network() - param_dict = load_checkpoint(args_opt.checkpoint_path) + param_dict = load_checkpoint(config.checkpoint_path) load_param_into_net(net, param_dict) net.to_float(mstype.float32) energy, atom_ener, _ = \ net(d_coord_tensor, d_nlist_tensor, frames, avg_tensor, std_tensor, atype_tensor, nlist_tensor) print('energy:', energy) print('atom_energy:', atom_ener) + + +if __name__ == '__main__': + model_eval() diff --git a/model_zoo/research/hpc/molecular_dynamics/src/model_utils/__init__.py b/model_zoo/research/hpc/molecular_dynamics/src/model_utils/__init__.py new file mode 
100644 index 00000000000..e69de29bb2d diff --git a/model_zoo/research/hpc/molecular_dynamics/src/model_utils/config.py b/model_zoo/research/hpc/molecular_dynamics/src/model_utils/config.py new file mode 100644 index 00000000000..efc856cf0cf --- /dev/null +++ b/model_zoo/research/hpc/molecular_dynamics/src/model_utils/config.py @@ -0,0 +1,130 @@ +# Copyright 2021 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ==================================================================================== + +"""Parse arguments""" +import os +import ast +import argparse +from pprint import pprint, pformat +import yaml + + +_config_path = '../../default_config.yaml' + + +class Config: + """ + Configuration namespace. 
Convert dictionary to members + """ + def __init__(self, cfg_dict): + for k, v in cfg_dict.items(): + if isinstance(v, (list, tuple)): + setattr(self, k, [Config(x) if isinstance(x, dict) else x for x in v]) + else: + setattr(self, k, Config(v) if isinstance(v, dict) else v) + + def __str__(self): + return pformat(self.__dict__) + + def __repr__(self): + return self.__str__() + + +def parse_cli_to_yaml(parser, cfg, helper=None, choices=None, cfg_path='default_config.yaml'): + """ + Parse command line arguments to the configuration according to the default yaml + + Args: + parser: Parent parser + cfg: Base configuration + helper: Helper description + cfg_path: Path to the default yaml config + """ + parser = argparse.ArgumentParser(description='[REPLACE THIS at config.py]', + parents=[parser]) + helper = {} if helper is None else helper + choices = {} if choices is None else choices + for item in cfg: + if not isinstance(cfg[item], list) and not isinstance(cfg[item], dict): + help_description = helper[item] if item in helper else 'Please reference to {}'.format(cfg_path) + choice = choices[item] if item in choices else None + if isinstance(cfg[item], bool): + parser.add_argument('--' + item, type=ast.literal_eval, default=cfg[item], choices=choice, + help=help_description) + else: + parser.add_argument('--' + item, type=type(cfg[item]), default=cfg[item], choices=choice, + help=help_description) + args = parser.parse_args() + return args + + +def parse_yaml(yaml_path): + """ + Parse the yaml config file + + Args: + yaml_path: Path to the yaml config + """ + with open(yaml_path, 'r') as fin: + try: + cfgs = yaml.load_all(fin.read(), Loader=yaml.FullLoader) + cfgs = [x for x in cfgs] + if len(cfgs) == 1: + cfg_helper = {} + cfg = cfgs[0] + cfg_choices = {} + elif len(cfgs) == 2: + cfg, cfg_helper = cfgs + cfg_choices = {} + elif len(cfgs) == 3: + cfg, cfg_helper, cfg_choices = cfgs + else: + raise ValueError('At most 3 docs (config description for help, choices) are 
supported in config yaml') + print(cfg_helper) + except: + raise ValueError('Failed to parse yaml') + return cfg, cfg_helper, cfg_choices + + +def merge(args, cfg): + """ + Merge the base config from yaml file and command line arguments + + Args: + args: command line arguments + cfg: Base configuration + """ + args_var = vars(args) + for item in args_var: + cfg[item] = args_var[item] + return cfg + + +def get_config(): + """ + Get Config according to the yaml file and cli arguments + """ + parser = argparse.ArgumentParser(description='default name', add_help=False) + current_dir = os.path.dirname(os.path.abspath(__file__)) + parser.add_argument('--config_path', type=str, default=os.path.join(current_dir, _config_path), + help='Config file path') + path_args, _ = parser.parse_known_args() + default, helper, choices = parse_yaml(path_args.config_path) + pprint(default) + args = parse_cli_to_yaml(parser=parser, cfg=default, helper=helper, choices=choices, cfg_path=path_args.config_path) + final_config = merge(args, default) + return Config(final_config) + +config = get_config() diff --git a/model_zoo/research/hpc/molecular_dynamics/src/model_utils/device_adapter.py b/model_zoo/research/hpc/molecular_dynamics/src/model_utils/device_adapter.py new file mode 100644 index 00000000000..ad8415af0f6 --- /dev/null +++ b/model_zoo/research/hpc/molecular_dynamics/src/model_utils/device_adapter.py @@ -0,0 +1,26 @@ +# Copyright 2021 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +# ==================================================================================== + +"""Device adapter for ModelArts""" + +from .config import config +if config.enable_modelarts: + from .moxing_adapter import get_device_id, get_device_num, get_rank_id, get_job_id +else: + from .local_adapter import get_device_id, get_device_num, get_rank_id, get_job_id + +__all__ = [ + 'get_device_id', 'get_device_num', 'get_job_id', 'get_rank_id' +] diff --git a/model_zoo/research/hpc/molecular_dynamics/src/model_utils/local_adapter.py b/model_zoo/research/hpc/molecular_dynamics/src/model_utils/local_adapter.py new file mode 100644 index 00000000000..4ff88c4fba5 --- /dev/null +++ b/model_zoo/research/hpc/molecular_dynamics/src/model_utils/local_adapter.py @@ -0,0 +1,36 @@ +# Copyright 2021 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ==================================================================================== + +"""Local adapter""" + +import os + +def get_device_id(): + device_id = os.getenv('DEVICE_ID', '0') + return int(device_id) + + +def get_device_num(): + device_num = os.getenv('RANK_SIZE', '1') + return int(device_num) + + +def get_rank_id(): + global_rank_id = os.getenv('RANK_ID', '0') + return int(global_rank_id) + + +def get_job_id(): + return 'Local Job' diff --git a/model_zoo/research/hpc/molecular_dynamics/src/model_utils/moxing_adapter.py b/model_zoo/research/hpc/molecular_dynamics/src/model_utils/moxing_adapter.py new file mode 100644 index 00000000000..c2d2282402b --- /dev/null +++ b/model_zoo/research/hpc/molecular_dynamics/src/model_utils/moxing_adapter.py @@ -0,0 +1,124 @@ +# Copyright 2021 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ==================================================================================== + +"""Moxing adapter for ModelArts""" + +import os +import functools +from mindspore import context +from .config import config + + +_global_syn_count = 0 + + +def get_device_id(): + device_id = os.getenv('DEVICE_ID', '0') + return int(device_id) + + +def get_device_num(): + device_num = os.getenv('RANK_SIZE', '1') + return int(device_num) + + +def get_rank_id(): + global_rank_id = os.getenv('RANK_ID', '0') + return int(global_rank_id) + + +def get_job_id(): + job_id = os.getenv('JOB_ID') + job_id = job_id if job_id != "" else "default" + return job_id + + +def sync_data(from_path, to_path): + """ + Download data from remote obs to local directory if the first url is remote url and the second one is local + Uploca data from local directory to remote obs in contrast + """ + import moxing as mox + import time + global _global_syn_count + sync_lock = '/tmp/copy_sync.lock' + str(_global_syn_count) + _global_syn_count += 1 + + # Each server contains 8 devices as most + if get_device_id() % min(get_device_num(), 8) == 0 and not os.path.exists(sync_lock): + print('from path: ', from_path) + print('to path: ', to_path) + mox.file.copy_parallel(from_path, to_path) + print('===finished data synchronization===') + try: + os.mknod(sync_lock) + except IOError: + pass + print('===save flag===') + + while True: + if os.path.exists(sync_lock): + break + time.sleep(1) + print('Finish sync data from {} to {}'.format(from_path, to_path)) + + +def moxing_wrapper(pre_process=None, post_process=None): + """ + Moxing wrapper to download dataset and upload outputs + """ + def wrapper(run_func): + @functools.wraps(run_func) + def wrapped_func(*args, **kwargs): + # Download data from data_url + if config.enable_modelarts: + if config.data_url: + sync_data(config.data_url, config.data_path) + print('Dataset downloaded: ', os.listdir(config.data_path)) + if config.checkpoint_url: + if not 
os.path.exists(config.load_path): + # os.makedirs(config.load_path) + print('=' * 20 + 'makedirs') + if os.path.isdir(config.load_path): + print('=' * 20 + 'makedirs success') + else: + print('=' * 20 + 'makedirs fail') + sync_data(config.checkpoint_url, config.load_path) + print('Preload downloaded: ', os.listdir(config.load_path)) + if config.train_url: + sync_data(config.train_url, config.output_path) + print('Workspace downloaded: ', os.listdir(config.output_path)) + + context.set_context(save_graphs_path=os.path.join(config.output_path, str(get_rank_id()))) + config.device_num = get_device_num() + config.device_id = get_device_id() + if not os.path.exists(config.output_path): + os.makedirs(config.output_path) + + if pre_process: + pre_process() + + run_func(*args, **kwargs) + + # Upload data to train_url + if config.enable_modelarts: + if post_process: + post_process() + + if config.train_url: + print('Start to copy output directory') + sync_data(config.output_path, config.train_url) + return wrapped_func + return wrapper