From a703321955981051e867327f0f93a0c49d3f3d6a Mon Sep 17 00:00:00 2001 From: wyabc123 Date: Fri, 11 Jun 2021 14:44:54 +0800 Subject: [PATCH] clould --- model_zoo/official/nlp/lstm/README.md | 14 +- model_zoo/official/nlp/lstm/README_CN.md | 14 +- .../official/nlp/lstm/config_ascend.yaml | 82 +++++++++++ .../official/nlp/lstm/config_ascend_8p.yaml | 84 ++++++++++++ .../official/nlp/lstm/default_config.yaml | 77 +++++++++++ model_zoo/official/nlp/lstm/eval.py | 70 +++++----- model_zoo/official/nlp/lstm/export.py | 62 ++++----- model_zoo/official/nlp/lstm/postprocess.py | 23 +--- model_zoo/official/nlp/lstm/preprocess.py | 27 ++-- .../run_distribute_train_ascend.sh | 7 +- .../{script => scripts}/run_eval_ascend.sh | 11 +- .../lstm/{script => scripts}/run_eval_cpu.sh | 11 +- .../lstm/{script => scripts}/run_eval_gpu.sh | 11 +- .../lstm/{script => scripts}/run_infer_310.sh | 7 +- .../{script => scripts}/run_train_ascend.sh | 9 +- .../lstm/{script => scripts}/run_train_cpu.sh | 11 +- .../lstm/{script => scripts}/run_train_gpu.sh | 11 +- model_zoo/official/nlp/lstm/src/__init__.py | 8 ++ model_zoo/official/nlp/lstm/src/config.py | 76 ----------- model_zoo/official/nlp/lstm/src/dataset.py | 1 - model_zoo/official/nlp/lstm/src/imdb.py | 1 - .../nlp/lstm/src/model_utils/__init__.py | 0 .../nlp/lstm/src/model_utils/config.py | 127 ++++++++++++++++++ .../lstm/src/model_utils/device_adapter.py | 27 ++++ .../nlp/lstm/src/model_utils/local_adapter.py | 36 +++++ .../lstm/src/model_utils/moxing_adapter.py | 122 +++++++++++++++++ model_zoo/official/nlp/lstm/train.py | 122 +++++++---------- 27 files changed, 766 insertions(+), 285 deletions(-) create mode 100644 model_zoo/official/nlp/lstm/config_ascend.yaml create mode 100644 model_zoo/official/nlp/lstm/config_ascend_8p.yaml create mode 100644 model_zoo/official/nlp/lstm/default_config.yaml rename model_zoo/official/nlp/lstm/{script => scripts}/run_distribute_train_ascend.sh (88%) rename model_zoo/official/nlp/lstm/{script => scripts}/run_eval_ascend.sh (83%) rename model_zoo/official/nlp/lstm/{script => scripts}/run_eval_cpu.sh (84%) rename model_zoo/official/nlp/lstm/{script => scripts}/run_eval_gpu.sh (85%) rename model_zoo/official/nlp/lstm/{script => scripts}/run_infer_310.sh (90%) rename model_zoo/official/nlp/lstm/{script => scripts}/run_train_ascend.sh (87%) rename model_zoo/official/nlp/lstm/{script => scripts}/run_train_cpu.sh (85%) rename model_zoo/official/nlp/lstm/{script => scripts}/run_train_gpu.sh (85%) delete mode 100644 model_zoo/official/nlp/lstm/src/config.py create mode 100644 model_zoo/official/nlp/lstm/src/model_utils/__init__.py create mode 100644 model_zoo/official/nlp/lstm/src/model_utils/config.py create mode 100644 model_zoo/official/nlp/lstm/src/model_utils/device_adapter.py create mode 100644 model_zoo/official/nlp/lstm/src/model_utils/local_adapter.py create mode 100644 model_zoo/official/nlp/lstm/src/model_utils/moxing_adapter.py diff --git a/model_zoo/official/nlp/lstm/README.md b/model_zoo/official/nlp/lstm/README.md index 32311fcf106..13607a34087 100644 --- a/model_zoo/official/nlp/lstm/README.md +++ b/model_zoo/official/nlp/lstm/README.md @@ -94,13 +94,21 @@ Note that you can run the scripts based on the dataset mentioned in original pap    │   ├── run_eval_cpu.sh # shell script for evaluation on CPU    │   ├── run_train_gpu.sh # shell script for training on GPU    │   ├── run_train_ascend.sh # shell script for training on Ascend -    │   └── run_train_cpu.sh # shell script for training on CPU +    │   ├── run_train_cpu.sh # shell script for training on CPU +   │   └── run_infer_310.sh # shell script for infer310    ├── src -    │   ├── config.py # parameter configuration +    │   ├── lstm.py # Sentiment model    │   ├── dataset.py # dataset preprocess    │   ├── imdb.py # imdb dataset read script    │   ├── lr_schedule.py # dynamic_lr script -    │   └── lstm.py # Sentiment model + │ └─model_utils + │ ├── config.py # Processing configuration parameters + │ ├── device_adapter.py # Get cloud ID + │ ├── local_adapter.py # Get local ID + │ └── moxing_adapter.py # Parameter processing + ├── default_config.yaml # Training parameter profile(cpu/gpu) + ├── config_ascend.yaml # Training parameter profile(ascend) + ├── config_ascend_8p.yaml # Training parameter profile(ascend_8p)    ├── eval.py # evaluation script on GPU, CPU and Ascend    └── train.py # training script on GPU, CPU and Ascend ``` diff --git a/model_zoo/official/nlp/lstm/README_CN.md b/model_zoo/official/nlp/lstm/README_CN.md index 00217180272..2ff0c98fdba 100644 --- a/model_zoo/official/nlp/lstm/README_CN.md +++ b/model_zoo/official/nlp/lstm/README_CN.md @@ -101,13 +101,21 @@ LSTM模型包含嵌入层、编码器和解码器这几个模块,编码器模    │   ├── run_eval_cpu.sh # CPU评估shell脚本    │   ├── run_train_ascend.sh # Ascend训练的shell脚本    │   ├── run_train_gpu.sh # GPU训练的shell脚本 -    │   └── run_train_cpu.sh # CPU训练的shell脚本 +   │   ├── run_train_cpu.sh # CPU训练的shell脚本 +    │   └── run_infer_310.sh # infer310的shell脚本    ├── src -    │   ├── config.py # 参数配置 +    │   ├── lstm.py # 情感模型    │   ├── dataset.py # 数据集预处理    │   ├── imdb.py # IMDB数据集读脚本    │   ├── lr_schedule.py # 动态学习率脚步 -    │   └── lstm.py # 情感模型 + │ └── model_utils + │ ├── config.py # 获取.yaml配置参数 + │ ├── device_adapter.py # 获取云上id + │ ├── local_adapter.py # 获取本地id + │ └── moxing_adapter.py # 云上数据准备 + ├── default_config.yaml # 训练配置参数(cpu/gpu) + ├── config_ascend.yaml # 训练配置参数(ascend) + ├── config_ascend_8p.yaml # 训练配置参数(ascend_8p)    ├── eval.py # GPU、CPU和Ascend的评估脚本    └── train.py # GPU、CPU和Ascend的训练脚本 ``` diff --git a/model_zoo/official/nlp/lstm/config_ascend.yaml b/model_zoo/official/nlp/lstm/config_ascend.yaml new file mode 100644 index 00000000000..35a7bf21ce8 --- /dev/null +++ b/model_zoo/official/nlp/lstm/config_ascend.yaml @@ -0,0 +1,82 @@ +# Builtin Configurations(DO NOT CHANGE THESE CONFIGURATIONS unless you know exactly what you are doing) +enable_modelarts: False +data_url: "" +train_url: "" +checkpoint_url: "" +data_path: "/cache/data" +output_path: "/cache/train" +load_path: "/cache/checkpoint_path" +checkpoint_path: './checkpoint/' +checkpoint_file: './checkpoint/lstm-20_390.ckpt' +device_target: Ascend +enable_profiling: False + +# ============================================================================== +# LSTM CONFIG IN ASCEND for 1p training +num_classes: 2 +momentum: 0.9 +num_epochs: 20 +batch_size: 64 +embed_size: 300 +num_hiddens: 128 +num_layers: 2 +bidirectional: True +save_checkpoint_steps: 7800 +keep_checkpoint_max: 10 +dynamic_lr: True +lr_init: 0.05 +lr_end: 0.01 +lr_max: 0.1 +lr_adjust_epoch: 6 +warmup_epochs: 1 +global_step: 0 + +# MindSpore LSTM Example - train.py +preprocess: 'false' +aclimdb_path: "/cache/data/aclImdb" +glove_path: "/cache/data" +preprocess_path: "./preprocess" +ckpt_path: './ckpt_lstm/' +pre_trained: '' # None +device_num: 1 +distribute: "false" +enable_graph_kernel: "true" + +# export.py +ckpt_file: './ckpt_lstm/lstm-20_390.ckpt' +device_id: 0 +file_name: "lstm" +file_format: "AIR" + +# LSTM Postprocess +label_dir: '' +result_dir: "./result_Files" + +# preprocess +result_path: './preprocess_Result/' + +--- +# Config description for each option +enable_modelarts: 'Whether training on modelarts, default: False' +data_url: 'Dataset url for obs' +train_url: 'Training output url for obs' +data_path: 'Dataset path for local' +output_path: 'Training output path for local' + +preprocess: 'whether to preprocess data.' +aclimdb_path: 'path where the dataset is stored.' +glove_path: 'path where the GloVe is stored.' +preprocess_path: 'path where the pre-process data is stored.' +ckpt_path: 'the path to save the checkpoint file.' +pre_trained: 'the pretrained checkpoint file path.' +device_target: 'the target device to run, support "GPU", "CPU". Default: "Ascend".' +device_num: 'Use device nums, default is 1.' +distribute: 'Run distribute, default is false.' +enable_graph_kernel: 'Accelerate by graph kernel, default is true.' + +--- +device_target: ['Ascend', 'GPU', 'CPU'] +distribute: ['true', 'false'] +distribute: ['true', 'false'] +enable_graph_kernel: ['true', 'false'] +file_format: ['AIR', 'ONNX', 'MINDIR'] \ No newline at end of file diff --git a/model_zoo/official/nlp/lstm/config_ascend_8p.yaml b/model_zoo/official/nlp/lstm/config_ascend_8p.yaml new file mode 100644 index 00000000000..f4a871cf3f5 --- /dev/null +++ b/model_zoo/official/nlp/lstm/config_ascend_8p.yaml @@ -0,0 +1,84 @@ +# Builtin Configurations(DO NOT CHANGE THESE CONFIGURATIONS unless you know exactly what you are doing) +enable_modelarts: False +data_url: "" +train_url: "" +checkpoint_url: "" +data_path: "/cache/data" +output_path: "/cache/train" +load_path: "/cache/checkpoint_path" +checkpoint_path: './checkpoint/' +checkpoint_file: './checkpoint/lstm-20_390.ckpt' +device_target: Ascend +enable_profiling: False + +# ============================================================================== +# LSTM CONFIG IN ASCEND for 8p training +num_classes: 2 +momentum: 0.9 +num_epochs: 20 +batch_size: 64 +embed_size: 300 +num_hiddens: 128 +num_layers: 2 +bidirectional: True +save_checkpoint_steps: 7800 +keep_checkpoint_max: 10 +dynamic_lr: True +lr_init: 0.05 +lr_end: 0.01 +lr_max: 0.3 +lr_adjust_epoch: 20 +warmup_epochs: 2 +global_step: 0 + + +# MindSpore LSTM Example - train.py +preprocess: 'false' +aclimdb_path: "/cache/data/aclImdb" +glove_path: "/cache/data" +preprocess_path: "./preprocess" +ckpt_path: './ckpt_lstm/' +pre_trained: '' # None +device_num: 8 +distribute: "true" +enable_graph_kernel: "true" + +# export.py +ckpt_file: './ckpt_lstm/lstm-20_390.ckpt' +device_id: 0 +rank_id: 0 +file_name: "lstm" +file_format: "AIR" + +# LSTM Postprocess +label_dir: '' +result_dir: "./result_Files" + +# preprocess +result_path: './preprocess_Result/' + +--- +# Config description for each option +enable_modelarts: 'Whether training on modelarts, default: False' +data_url: 'Dataset url for obs' +train_url: 'Training output url for obs' +data_path: 'Dataset path for local' +output_path: 'Training output path for local' + +preprocess: 'whether to preprocess data.' +aclimdb_path: 'path where the dataset is stored.' +glove_path: 'path where the GloVe is stored.' +preprocess_path: 'path where the pre-process data is stored.' +ckpt_path: 'the path to save the checkpoint file.' +pre_trained: 'the pretrained checkpoint file path.' +device_target: 'the target device to run, support "GPU", "CPU". Default: "Ascend".' +device_num: 'Use device nums, default is 1.' +distribute: 'Run distribute, default is false.' +enable_graph_kernel: 'Accelerate by graph kernel, default is true.' + +--- +device_target: ['Ascend', 'GPU', 'CPU'] +distribute: ['true', 'false'] +distribute: ['true', 'false'] +enable_graph_kernel: ['true', 'false'] +file_format: ['AIR', 'ONNX', 'MINDIR'] diff --git a/model_zoo/official/nlp/lstm/default_config.yaml b/model_zoo/official/nlp/lstm/default_config.yaml new file mode 100644 index 00000000000..5be3b87feec --- /dev/null +++ b/model_zoo/official/nlp/lstm/default_config.yaml @@ -0,0 +1,77 @@ +# Builtin Configurations(DO NOT CHANGE THESE CONFIGURATIONS unless you know exactly what you are doing) +enable_modelarts: False +data_url: "" +train_url: "" +checkpoint_url: "" +data_path: "/cache/data" +output_path: "/cache/train" +load_path: "/cache/checkpoint_path" +checkpoint_path: './checkpoint/' +checkpoint_file: './checkpoint/lstm-20_390.ckpt' +device_target: CPU +enable_profiling: False + +# ============================================================================== +# LSTM CONFIG +num_classes: 2 +dynamic_lr: False +learning_rate: 0.1 +momentum: 0.9 +num_epochs: 20 +batch_size: 64 +embed_size: 300 +num_hiddens: 100 +num_layers: 2 +bidirectional: True +save_checkpoint_steps: 390 +keep_checkpoint_max: 10 + +# MindSpore LSTM Example - train.py +preprocess: 'false' +aclimdb_path: "/cache/data/aclImdb" +glove_path: "/cache/data" +preprocess_path: "./preprocess" +ckpt_path: './ckpt_lstm/' +pre_trained: '' # None +device_num: 1 +distribute: "false" +enable_graph_kernel: "true" + +# export.py +ckpt_file: './ckpt_lstm/lstm-20_390.ckpt' +device_id: 0 +file_name: "lstm" +file_format: "AIR" + +# LSTM Postprocess +label_dir: '' +result_dir: "./result_Files" + +# preprocess +result_path: './preprocess_Result/' + +--- +# Config description for each option +enable_modelarts: 'Whether training on modelarts, default: False' +data_url: 'Dataset url for obs' +train_url: 'Training output url for obs' +data_path: 'Dataset path for local' +output_path: 'Training output path for local' + +preprocess: 'whether to preprocess data.' +aclimdb_path: 'path where the dataset is stored.' +glove_path: 'path where the GloVe is stored.' +preprocess_path: 'path where the pre-process data is stored.' +ckpt_path: 'the path to save the checkpoint file.' +pre_trained: 'the pretrained checkpoint file path.' +device_target: 'the target device to run, support "GPU", "CPU". Default: "Ascend".' +device_num: 'Use device nums, default is 1.' +distribute: 'Run distribute, default is false.' +enable_graph_kernel: 'Accelerate by graph kernel, default is true.' + +--- +device_target: ['Ascend', 'GPU', 'CPU'] +distribute: ['true', 'false'] +distribute: ['true', 'false'] +enable_graph_kernel: ['true', 'false'] +file_format: ['AIR', 'MINDIR'] diff --git a/model_zoo/official/nlp/lstm/eval.py b/model_zoo/official/nlp/lstm/eval.py index 511a88176b3..f0dd4a21430 100644 --- a/model_zoo/official/nlp/lstm/eval.py +++ b/model_zoo/official/nlp/lstm/eval.py @@ -1,4 +1,4 @@ -# Copyright 2020 Huawei Technologies Co., Ltd +# Copyright 2021 Huawei Technologies Co., Ltd # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -15,76 +15,66 @@ """ #################train lstm example on aclImdb######################## """ -import argparse import os - import numpy as np -from src.config import lstm_cfg, lstm_cfg_ascend +from src.model_utils.config import config +from src.model_utils.moxing_adapter import moxing_wrapper from src.dataset import lstm_create_dataset, convert_to_mindrecord from src.lstm import SentimentNet from mindspore import Tensor, nn, Model, context from mindspore.nn import Accuracy, Recall, F1 from mindspore.train.serialization import load_checkpoint, load_param_into_net -if __name__ == '__main__': - parser = argparse.ArgumentParser(description='MindSpore LSTM Example') - parser.add_argument('--preprocess', type=str, default='false', choices=['true', 'false'], - help='whether to preprocess data.') - parser.add_argument('--aclimdb_path', type=str, default="./aclImdb", - help='path where the dataset is stored.') - parser.add_argument('--glove_path', type=str, default="./glove", - help='path where the GloVe is stored.') - parser.add_argument('--preprocess_path', type=str, default="./preprocess", - help='path where the pre-process data is stored.') - parser.add_argument('--ckpt_path', type=str, default=None, - help='the checkpoint file path used to evaluate model.') - parser.add_argument('--device_target', type=str, default="Ascend", choices=['GPU', 'CPU', 'Ascend'], - help='the target device to run, support "GPU", "CPU". Default: "Ascend".') - args = parser.parse_args() +def modelarts_process(): + config.ckpt_file = os.path.join(config.output_path, config.ckpt_file) + +@moxing_wrapper(pre_process=modelarts_process) +def eval_lstm(): + """ eval lstm """ + print('\neval.py config: \n', config) + config.preprocess_path = os.path.join(config.glove_path, config.preprocess_path) context.set_context( mode=context.GRAPH_MODE, save_graphs=False, - device_target=args.device_target) + device_target=config.device_target) - if args.device_target == 'Ascend': - cfg = lstm_cfg_ascend - else: - cfg = lstm_cfg - - if args.preprocess == "true": + if config.preprocess == "true": print("============== Starting Data Pre-processing ==============") - convert_to_mindrecord(cfg.embed_size, args.aclimdb_path, args.preprocess_path, args.glove_path) + convert_to_mindrecord(config.embed_size, config.aclimdb_path, config.preprocess_path, config.glove_path) - embedding_table = np.loadtxt(os.path.join(args.preprocess_path, "weight.txt")).astype(np.float32) + embedding_table = np.loadtxt(os.path.join(config.preprocess_path, "weight.txt")).astype(np.float32) # DynamicRNN in this network on Ascend platform only support the condition that the shape of input_size # and hiddle_size is multiples of 16, this problem will be solved later. - if args.device_target == 'Ascend': - pad_num = int(np.ceil(cfg.embed_size / 16) * 16 - cfg.embed_size) + if config.device_target == 'Ascend': + pad_num = int(np.ceil(config.embed_size / 16) * 16 - config.embed_size) if pad_num > 0: embedding_table = np.pad(embedding_table, [(0, 0), (0, pad_num)], 'constant') - cfg.embed_size = int(np.ceil(cfg.embed_size / 16) * 16) + config.embed_size = int(np.ceil(config.embed_size / 16) * 16) network = SentimentNet(vocab_size=embedding_table.shape[0], - embed_size=cfg.embed_size, - num_hiddens=cfg.num_hiddens, - num_layers=cfg.num_layers, - bidirectional=cfg.bidirectional, - num_classes=cfg.num_classes, + embed_size=config.embed_size, + num_hiddens=config.num_hiddens, + num_layers=config.num_layers, + bidirectional=config.bidirectional, + num_classes=config.num_classes, weight=Tensor(embedding_table), - batch_size=cfg.batch_size) + batch_size=config.batch_size) loss = nn.SoftmaxCrossEntropyWithLogits(sparse=True, reduction='mean') - ds_eval = lstm_create_dataset(args.preprocess_path, cfg.batch_size, training=False) + ds_eval = lstm_create_dataset(config.preprocess_path, config.batch_size, training=False) model = Model(network, loss, metrics={'acc': Accuracy(), 'recall': Recall(), 'f1': F1()}) print("============== Starting Testing ==============") - param_dict = load_checkpoint(args.ckpt_path) + param_dict = load_checkpoint(config.ckpt_file) load_param_into_net(network, param_dict) - if args.device_target == "CPU": + if config.device_target == "CPU": acc = model.eval(ds_eval, dataset_sink_mode=False) else: acc = model.eval(ds_eval) print("============== {} ==============".format(acc)) + +if __name__ == '__main__': + eval_lstm() diff --git a/model_zoo/official/nlp/lstm/export.py b/model_zoo/official/nlp/lstm/export.py index 2768a386fd7..086e9b25f99 100644 --- a/model_zoo/official/nlp/lstm/export.py +++ b/model_zoo/official/nlp/lstm/export.py @@ -16,58 +16,52 @@ ##############export checkpoint file into mindir model################# python export.py """ -import argparse import os - import numpy as np from mindspore import Tensor, context from mindspore import export, load_checkpoint, load_param_into_net -from src.config import lstm_cfg, lstm_cfg_ascend + from src.lstm import SentimentNet +from src.model_utils.config import config +from src.model_utils.moxing_adapter import moxing_wrapper +from src.model_utils.device_adapter import get_device_id -if __name__ == '__main__': - parser = argparse.ArgumentParser(description='MindSpore LSTM Exporter') - parser.add_argument('--preprocess_path', type=str, default='./preprocess', - help='path where the pre-process data is stored.') - parser.add_argument('--ckpt_file', type=str, required=True, help='lstm ckpt file.') - parser.add_argument("--device_id", type=int, default=0, help="Device id") - parser.add_argument("--file_name", type=str, default="lstm", help="output file name.") - parser.add_argument('--file_format', type=str, choices=["AIR", "MINDIR"], default='AIR', help='file format') - parser.add_argument('--device_target', type=str, default="Ascend", choices=['GPU', 'CPU', 'Ascend'], - help='the target device to run, support "GPU", "CPU". Default: "Ascend".') - args = parser.parse_args() +def modelarts_process(): + config.ckpt_file = os.path.join(config.output_path, config.ckpt_file) +@moxing_wrapper(pre_process=modelarts_process) +def export_lstm(): + """ export lstm """ + config.preprocess_path = os.path.join(config.glove_path, config.preprocess_path) context.set_context( mode=context.GRAPH_MODE, save_graphs=False, - device_target=args.device_target, - device_id=args.device_id) + device_target=config.device_target, + device_id=get_device_id()) - if args.device_target == 'Ascend': - cfg = lstm_cfg_ascend - else: - cfg = lstm_cfg + embedding_table = np.loadtxt(os.path.join(config.preprocess_path, "weight.txt")).astype(np.float32) - embedding_table = np.loadtxt(os.path.join(args.preprocess_path, "weight.txt")).astype(np.float32) - - if args.device_target == 'Ascend': - pad_num = int(np.ceil(cfg.embed_size / 16) * 16 - cfg.embed_size) + if config.device_target == 'Ascend': + pad_num = int(np.ceil(config.embed_size / 16) * 16 - config.embed_size) if pad_num > 0: embedding_table = np.pad(embedding_table, [(0, 0), (0, pad_num)], 'constant') - cfg.embed_size = int(np.ceil(cfg.embed_size / 16) * 16) + config.embed_size = int(np.ceil(config.embed_size / 16) * 16) network = SentimentNet(vocab_size=embedding_table.shape[0], - embed_size=cfg.embed_size, - num_hiddens=cfg.num_hiddens, - num_layers=cfg.num_layers, - bidirectional=cfg.bidirectional, - num_classes=cfg.num_classes, + embed_size=config.embed_size, + num_hiddens=config.num_hiddens, + num_layers=config.num_layers, + bidirectional=config.bidirectional, + num_classes=config.num_classes, weight=Tensor(embedding_table), - batch_size=cfg.batch_size) + batch_size=config.batch_size) - param_dict = load_checkpoint(args.ckpt_file) + param_dict = load_checkpoint(config.ckpt_file) load_param_into_net(network, param_dict) - input_arr = Tensor(np.random.uniform(0.0, 1e5, size=[cfg.batch_size, 500]).astype(np.int32)) - export(network, input_arr, file_name=args.file_name, file_format=args.file_format) + input_arr = Tensor(np.random.uniform(0.0, 1e5, size=[config.batch_size, 500]).astype(np.int32)) + export(network, input_arr, file_name=config.file_name, file_format=config.file_format) + +if __name__ == '__main__': + export_lstm() diff --git a/model_zoo/official/nlp/lstm/postprocess.py b/model_zoo/official/nlp/lstm/postprocess.py index b00fe1d5fdb..af68df3695d 100644 --- a/model_zoo/official/nlp/lstm/postprocess.py +++ b/model_zoo/official/nlp/lstm/postprocess.py @@ -15,33 +15,20 @@ """ #################lstm postprocess######################## """ -import argparse import os import numpy as np from mindspore.nn import Accuracy -from src.config import lstm_cfg, lstm_cfg_ascend +from src.model_utils.config import config -parser = argparse.ArgumentParser(description='LSTM Postprocess') -parser.add_argument('--label_dir', type=str, default='', help='label data directory.') -parser.add_argument('--result_dir', type=str, default="./result_Files", - help='infer result dir.') -parser.add_argument('--device_target', type=str, default="Ascend", choices=['GPU', 'CPU', 'Ascend'], - help='the target device to run, support "GPU", "CPU". Default: "Ascend".') -args, _ = parser.parse_known_args() if __name__ == '__main__': metrics = Accuracy() - rst_path = args.result_dir - labels = np.load(args.label_dir) - - if args.device_target == 'Ascend': - cfg = lstm_cfg_ascend - else: - cfg = lstm_cfg + rst_path = config.result_dir + labels = np.load(config.label_dir) for i in range(len(os.listdir(rst_path))): - file_name = os.path.join(rst_path, "LSTM_data_bs" + str(cfg.batch_size) + '_' + str(i) + '_0.bin') - output = np.fromfile(file_name, np.float32).reshape(cfg.batch_size, cfg.num_classes) + file_name = os.path.join(rst_path, "LSTM_data_bs" + str(config.batch_size) + '_' + str(i) + '_0.bin') + output = np.fromfile(file_name, np.float32).reshape(config.batch_size, config.num_classes) metrics.update(output, labels[i]) print("result of Accuracy is: ", metrics.eval()) diff --git a/model_zoo/official/nlp/lstm/preprocess.py b/model_zoo/official/nlp/lstm/preprocess.py index 2de0b63178a..93828bce508 100644 --- a/model_zoo/official/nlp/lstm/preprocess.py +++ b/model_zoo/official/nlp/lstm/preprocess.py @@ -15,35 +15,24 @@ """ ##############preprocess################# """ -import argparse import os import numpy as np -from src.config import lstm_cfg, lstm_cfg_ascend -from src.dataset import lstm_create_dataset -parser = argparse.ArgumentParser(description='preprocess') -parser.add_argument('--preprocess_path', type=str, default="./preprocess", - help='path where the pre-process data is stored.') -parser.add_argument('--result_path', type=str, default='./preprocess_Result/', help='result path') -parser.add_argument('--device_target', type=str, default="Ascend", choices=['GPU', 'CPU', 'Ascend'], - help='the target device to run, support "GPU", "CPU". Default: "Ascend".') -args = parser.parse_args() +from src.dataset import lstm_create_dataset +from src.model_utils.config import config + if __name__ == '__main__': - if args.device_target == 'Ascend': - cfg = lstm_cfg_ascend - else: - cfg = lstm_cfg - - dataset = lstm_create_dataset(args.preprocess_path, cfg.batch_size, training=False) - img_path = os.path.join(args.result_path, "00_data") + config.preprocess_path = os.path.join(config.glove_path, config.preprocess_path) + dataset = lstm_create_dataset(config.preprocess_path, config.batch_size, training=False) + img_path = os.path.join(config.result_path, "00_data") os.makedirs(img_path) label_list = [] for i, data in enumerate(dataset.create_dict_iterator(output_numpy=True)): - file_name = "LSTM_data_bs" + str(cfg.batch_size) + "_" + str(i) + ".bin" + file_name = "LSTM_data_bs" + str(config.batch_size) + "_" + str(i) + ".bin" file_path = img_path + "/" + file_name data['feature'].tofile(file_path) label_list.append(data['label']) - np.save(args.result_path + "label_ids.npy", label_list) + np.save(config.result_path + "label_ids.npy", label_list) print("="*20, "export bin files finished", "="*20) diff --git a/model_zoo/official/nlp/lstm/script/run_distribute_train_ascend.sh b/model_zoo/official/nlp/lstm/scripts/run_distribute_train_ascend.sh similarity index 88% rename from model_zoo/official/nlp/lstm/script/run_distribute_train_ascend.sh rename to model_zoo/official/nlp/lstm/scripts/run_distribute_train_ascend.sh index 5fc2559b439..15c9d1b946f 100644 --- a/model_zoo/official/nlp/lstm/script/run_distribute_train_ascend.sh +++ b/model_zoo/official/nlp/lstm/scripts/run_distribute_train_ascend.sh @@ -27,6 +27,8 @@ RANK_SIZE=$2 ACLIMDB_DIR=$3 GLOVE_DIR=$4 +BASE_PATH=$(cd ./"`dirname $0`" || exit; pwd) +CONFIG_FILE="${BASE_PATH}/../config_ascend_8p.yaml" for((i=0;i<${RANK_SIZE};i++)); do @@ -34,15 +36,18 @@ do mkdir ${ROOT_PATH}/device$i cd ${ROOT_PATH}/device$i || exit cp ../../*.py ./ + cp ../../*.yaml ./ cp -r ../../src ./ export RANK_ID=$i export DEVICE_ID=$i python train.py \ + --config_path=$CONFIG_FILE \ --device_target="Ascend" \ --aclimdb_path=$ACLIMDB_DIR \ --glove_path=$GLOVE_DIR \ --distribute=true \ --device_num=$RANK_SIZE \ - --preprocess=true \ + --device_id=$i --rank_id=$i \ + --preprocess=false \ --preprocess_path=./preprocess > log.txt 2>&1 & done diff --git a/model_zoo/official/nlp/lstm/script/run_eval_ascend.sh b/model_zoo/official/nlp/lstm/scripts/run_eval_ascend.sh similarity index 83% rename from model_zoo/official/nlp/lstm/script/run_eval_ascend.sh rename to model_zoo/official/nlp/lstm/scripts/run_eval_ascend.sh index 5131f8a7553..8a26bcdc55a 100644 --- a/model_zoo/official/nlp/lstm/script/run_eval_ascend.sh +++ b/model_zoo/official/nlp/lstm/scripts/run_eval_ascend.sh @@ -1,5 +1,5 @@ #!/bin/bash -# Copyright 2020 Huawei Technologies Co., Ltd +# Copyright 2021 Huawei Technologies Co., Ltd # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -32,8 +32,13 @@ CUR_DIR=`pwd` export GLOG_log_dir=${CUR_DIR}/ms_log export GLOG_logtostderr=0 export DEVICE_ID=$DEVICE_ID + +BASE_PATH=$(cd ./"`dirname $0`" || exit; pwd) +CONFIG_FILE="${BASE_PATH}/../../config_ascend.yaml" + python ../../eval.py \ + --config_path=$CONFIG_FILE \ --device_target="Ascend" \ --preprocess=false \ - --preprocess_path=$PREPROCESS_DIR \ - --ckpt_path=$CKPT_FILE > log.txt 2>&1 & + --glove_path=$PREPROCESS_DIR \ + --ckpt_file=$CKPT_FILE > log.txt 2>&1 & diff --git a/model_zoo/official/nlp/lstm/script/run_eval_cpu.sh b/model_zoo/official/nlp/lstm/scripts/run_eval_cpu.sh similarity index 84% rename from model_zoo/official/nlp/lstm/script/run_eval_cpu.sh rename to model_zoo/official/nlp/lstm/scripts/run_eval_cpu.sh index 2ffe41c2e39..536a279649b 100644 --- a/model_zoo/official/nlp/lstm/script/run_eval_cpu.sh +++ b/model_zoo/official/nlp/lstm/scripts/run_eval_cpu.sh @@ -1,5 +1,5 @@ #!/bin/bash -# Copyright 2020 Huawei Technologies Co., Ltd +# Copyright 2021 Huawei Technologies Co., Ltd # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -28,10 +28,15 @@ mkdir -p ms_log CUR_DIR=`pwd` export GLOG_log_dir=${CUR_DIR}/ms_log export GLOG_logtostderr=0 -python eval.py \ + +BASE_PATH=$(cd ./"`dirname $0`" || exit; pwd) +CONFIG_FILE="${BASE_PATH}/../default_config.yaml" + +python ../eval.py \ + --config_path=$CONFIG_FILE \ --device_target="CPU" \ --aclimdb_path=$ACLIMDB_DIR \ --glove_path=$GLOVE_DIR \ --preprocess=false \ --preprocess_path=./preprocess \ - --ckpt_path=$CKPT_FILE > log.txt 2>&1 & + --ckpt_file=$CKPT_FILE > log.txt 2>&1 & diff --git a/model_zoo/official/nlp/lstm/script/run_eval_gpu.sh b/model_zoo/official/nlp/lstm/scripts/run_eval_gpu.sh similarity index 85% rename from model_zoo/official/nlp/lstm/script/run_eval_gpu.sh rename to model_zoo/official/nlp/lstm/scripts/run_eval_gpu.sh index e2fa176f0ff..2ec5cf1a227 100644 --- a/model_zoo/official/nlp/lstm/script/run_eval_gpu.sh +++ b/model_zoo/official/nlp/lstm/scripts/run_eval_gpu.sh @@ -1,5 +1,5 @@ #!/bin/bash -# Copyright 2020 Huawei Technologies Co., Ltd +# Copyright 2021 Huawei Technologies Co., Ltd # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -31,10 +31,15 @@ mkdir -p ms_log CUR_DIR=`pwd` export GLOG_log_dir=${CUR_DIR}/ms_log export GLOG_logtostderr=0 -python eval.py \ + +BASE_PATH=$(cd ./"`dirname $0`" || exit; pwd) +CONFIG_FILE="${BASE_PATH}/../default_config.yaml" + +python ../eval.py \ + --config_path=$CONFIG_FILE \ --device_target="GPU" \ --aclimdb_path=$ACLIMDB_DIR \ --glove_path=$GLOVE_DIR \ --preprocess=false \ --preprocess_path=./preprocess \ - --ckpt_path=$CKPT_FILE > log.txt 2>&1 & + --ckpt_file=$CKPT_FILE > log.txt 2>&1 & diff --git a/model_zoo/official/nlp/lstm/script/run_infer_310.sh b/model_zoo/official/nlp/lstm/scripts/run_infer_310.sh similarity index 90% rename from model_zoo/official/nlp/lstm/script/run_infer_310.sh rename to model_zoo/official/nlp/lstm/scripts/run_infer_310.sh index 8f903b1bf06..28eef942708 100644 --- a/model_zoo/official/nlp/lstm/script/run_infer_310.sh +++ b/model_zoo/official/nlp/lstm/scripts/run_infer_310.sh @@ -56,6 +56,9 @@ echo "need preprocess: "$need_preprocess echo "device_target: "$device_target echo "device id: "$device_id +BASE_PATH=$(cd ./"`dirname $0`" || exit; pwd) +CONFIG_FILE="${BASE_PATH}/../default_config.yaml" + export ASCEND_HOME=/usr/local/Ascend/ if [ -d ${ASCEND_HOME}/ascend-toolkit ]; then export PATH=$ASCEND_HOME/fwkacllib/bin:$ASCEND_HOME/fwkacllib/ccec_compiler/bin:$ASCEND_HOME/ascend-toolkit/latest/fwkacllib/ccec_compiler/bin:$ASCEND_HOME/ascend-toolkit/latest/atc/bin:$PATH @@ -76,7 +79,7 @@ function preprocess_data() rm -rf ./preprocess_Result fi mkdir preprocess_Result - python3.7 ../preprocess.py --preprocess_path=$dataset_path --result_path=./preprocess_Result/ --device_target=$device_target + python3.7 ../preprocess.py --config_path=$CONFIG_FILE --preprocess_path=$dataset_path --result_path=./preprocess_Result/ --device_target=$device_target } function compile_app() @@ -103,7 +106,7 @@ function infer() function cal_acc() { - python3.7 ../postprocess.py --result_dir=./result_Files --label_dir=./preprocess_Result/label_ids.npy --device_target=$device_target &> acc.log + python3.7 ../postprocess.py --config_path=$CONFIG_FILE --result_dir=./result_Files --label_dir=./preprocess_Result/label_ids.npy --device_target=$device_target &> acc.log } if [ $need_preprocess == "y" ]; then diff --git a/model_zoo/official/nlp/lstm/script/run_train_ascend.sh b/model_zoo/official/nlp/lstm/scripts/run_train_ascend.sh similarity index 87% rename from model_zoo/official/nlp/lstm/script/run_train_ascend.sh rename to model_zoo/official/nlp/lstm/scripts/run_train_ascend.sh index 43ef5738683..4d28f9eb55d 100644 --- a/model_zoo/official/nlp/lstm/script/run_train_ascend.sh +++ b/model_zoo/official/nlp/lstm/scripts/run_train_ascend.sh @@ -1,5 +1,5 @@ #!/bin/bash -# Copyright 2020 Huawei Technologies Co., Ltd +# Copyright 2021 Huawei Technologies Co., Ltd # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -31,9 +31,14 @@ CUR_DIR=`pwd` export GLOG_log_dir=${CUR_DIR}/ms_log export GLOG_logtostderr=0 export DEVICE_ID=$DEVICE_ID + +BASE_PATH=$(cd ./"`dirname $0`" || exit; pwd) +CONFIG_FILE="${BASE_PATH}/../../config_ascend.yaml" + python ../../train.py \ + --config_path=$CONFIG_FILE \ --device_target="Ascend" \ --aclimdb_path=$ACLIMDB_DIR \ --glove_path=$GLOVE_DIR \ - --preprocess=true \ + --preprocess=false \ --preprocess_path=./preprocess > log.txt 2>&1 & diff --git a/model_zoo/official/nlp/lstm/script/run_train_cpu.sh b/model_zoo/official/nlp/lstm/scripts/run_train_cpu.sh similarity index 85% rename from model_zoo/official/nlp/lstm/script/run_train_cpu.sh rename to model_zoo/official/nlp/lstm/scripts/run_train_cpu.sh index 6d871deb8ca..e7f89023d69 100644 --- a/model_zoo/official/nlp/lstm/script/run_train_cpu.sh +++ b/model_zoo/official/nlp/lstm/scripts/run_train_cpu.sh @@ -1,5 +1,5 @@ #!/bin/bash -# Copyright 2020 Huawei Technologies Co., Ltd +# Copyright 2021 Huawei Technologies Co., Ltd # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -27,9 +27,14 @@ mkdir -p ms_log CUR_DIR=`pwd` export GLOG_log_dir=${CUR_DIR}/ms_log export GLOG_logtostderr=0 -python train.py \ + +BASE_PATH=$(cd ./"`dirname $0`" || exit; pwd) +CONFIG_FILE="${BASE_PATH}/../default_config.yaml" + +python ../train.py \ + --config_path=$CONFIG_FILE \ --device_target="CPU" \ --aclimdb_path=$ACLIMDB_DIR \ --glove_path=$GLOVE_DIR \ - --preprocess=true \ + --preprocess=false \ --preprocess_path=./preprocess > log.txt 2>&1 & diff --git a/model_zoo/official/nlp/lstm/script/run_train_gpu.sh b/model_zoo/official/nlp/lstm/scripts/run_train_gpu.sh similarity index 85% rename from model_zoo/official/nlp/lstm/script/run_train_gpu.sh rename to model_zoo/official/nlp/lstm/scripts/run_train_gpu.sh index df292345668..5235db474f8 100644 --- a/model_zoo/official/nlp/lstm/script/run_train_gpu.sh +++ b/model_zoo/official/nlp/lstm/scripts/run_train_gpu.sh @@ -1,5 +1,5 @@ #!/bin/bash -# Copyright 2020 Huawei Technologies Co., Ltd +# Copyright 2021 Huawei Technologies Co., Ltd # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -30,9 +30,14 @@ mkdir -p ms_log CUR_DIR=`pwd` export GLOG_log_dir=${CUR_DIR}/ms_log export GLOG_logtostderr=0 -python train.py \ + +BASE_PATH=$(cd ./"`dirname $0`" || exit; pwd) +CONFIG_FILE="${BASE_PATH}/../default_config.yaml" + +python ../train.py \ + --config_path=$CONFIG_FILE \ --device_target="GPU" \ --aclimdb_path=$ACLIMDB_DIR \ --glove_path=$GLOVE_DIR \ - --preprocess=true \ + --preprocess=false \ --preprocess_path=./preprocess > log.txt 2>&1 & diff --git a/model_zoo/official/nlp/lstm/src/__init__.py b/model_zoo/official/nlp/lstm/src/__init__.py index 301ef9dcb71..fdc80c72940 100644 --- a/model_zoo/official/nlp/lstm/src/__init__.py +++ b/model_zoo/official/nlp/lstm/src/__init__.py @@ -12,3 +12,11 @@ # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================ +""" +__init__.py +""" + +from . import imdb +from . import dataset +from . import lr_schedule +from . import lstm diff --git a/model_zoo/official/nlp/lstm/src/config.py b/model_zoo/official/nlp/lstm/src/config.py deleted file mode 100644 index 13f7de30c57..00000000000 --- a/model_zoo/official/nlp/lstm/src/config.py +++ /dev/null @@ -1,76 +0,0 @@ -# Copyright 2020 Huawei Technologies Co., Ltd -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================ -""" -network config setting -""" -from easydict import EasyDict as edict - -# LSTM CONFIG -lstm_cfg = edict({ - 'num_classes': 2, - 'dynamic_lr': False, - 'learning_rate': 0.1, - 'momentum': 0.9, - 'num_epochs': 20, - 'batch_size': 64, - 'embed_size': 300, - 'num_hiddens': 100, - 'num_layers': 2, - 'bidirectional': True, - 'save_checkpoint_steps': 390, - 'keep_checkpoint_max': 10 -}) - -# LSTM CONFIG IN ASCEND for 1p training -lstm_cfg_ascend = edict({ - 'num_classes': 2, - 'momentum': 0.9, - 'num_epochs': 20, - 'batch_size': 64, - 'embed_size': 300, - 'num_hiddens': 128, - 'num_layers': 2, - 'bidirectional': True, - 'save_checkpoint_steps': 7800, - 'keep_checkpoint_max': 10, - 'dynamic_lr': True, - 'lr_init': 0.05, - 'lr_end': 0.01, - 'lr_max': 0.1, - 'lr_adjust_epoch': 6, - 'warmup_epochs': 1, - 'global_step': 0 -}) - -# LSTM CONFIG IN ASCEND for 8p training -lstm_cfg_ascend_8p = edict({ - 'num_classes': 2, - 'momentum': 0.9, - 'num_epochs': 20, - 'batch_size': 64, - 'embed_size': 300, - 'num_hiddens': 128, - 'num_layers': 2, - 'bidirectional': True, - 'save_checkpoint_steps': 7800, - 'keep_checkpoint_max': 10, - 'dynamic_lr': True, - 'lr_init': 0.05, - 'lr_end': 0.01, - 'lr_max': 0.3, - 'lr_adjust_epoch': 20, - 'warmup_epochs': 2, - 'global_step': 0 -}) diff --git a/model_zoo/official/nlp/lstm/src/dataset.py b/model_zoo/official/nlp/lstm/src/dataset.py index 9b030ff7f27..c6af9336f0e 100644 --- a/model_zoo/official/nlp/lstm/src/dataset.py +++ b/model_zoo/official/nlp/lstm/src/dataset.py @@ -16,7 +16,6 @@ Data operations, will be used in train.py and eval.py """ import os - import numpy as np import mindspore.dataset as ds diff --git a/model_zoo/official/nlp/lstm/src/imdb.py b/model_zoo/official/nlp/lstm/src/imdb.py index 918af2210a2..ed0bb180c15 100644 --- a/model_zoo/official/nlp/lstm/src/imdb.py +++ b/model_zoo/official/nlp/lstm/src/imdb.py @@ -17,7 +17,6 @@ imdb dataset parser. """ import os from itertools import chain - import numpy as np import gensim diff --git a/model_zoo/official/nlp/lstm/src/model_utils/__init__.py b/model_zoo/official/nlp/lstm/src/model_utils/__init__.py new file mode 100644 index 00000000000..e69de29bb2d diff --git a/model_zoo/official/nlp/lstm/src/model_utils/config.py b/model_zoo/official/nlp/lstm/src/model_utils/config.py new file mode 100644 index 00000000000..7f1ff6e2b8d --- /dev/null +++ b/model_zoo/official/nlp/lstm/src/model_utils/config.py @@ -0,0 +1,127 @@ +# Copyright 2021 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ + +"""Parse arguments""" + +import os +import ast +import argparse +from pprint import pprint, pformat +import yaml + +class Config: + """ + Configuration namespace. Convert dictionary to members. + """ + def __init__(self, cfg_dict): + for k, v in cfg_dict.items(): + if isinstance(v, (list, tuple)): + setattr(self, k, [Config(x) if isinstance(x, dict) else x for x in v]) + else: + setattr(self, k, Config(v) if isinstance(v, dict) else v) + + def __str__(self): + return pformat(self.__dict__) + + def __repr__(self): + return self.__str__() + + +def parse_cli_to_yaml(parser, cfg, helper=None, choices=None, cfg_path="default_config.yaml"): + """ + Parse command line arguments to the configuration according to the default yaml. + + Args: + parser: Parent parser. + cfg: Base configuration. + helper: Helper description. + cfg_path: Path to the default yaml config. + """ + parser = argparse.ArgumentParser(description="[REPLACE THIS at config.py]", + parents=[parser]) + helper = {} if helper is None else helper + choices = {} if choices is None else choices + for item in cfg: + if not isinstance(cfg[item], list) and not isinstance(cfg[item], dict): + help_description = helper[item] if item in helper else "Please reference to {}".format(cfg_path) + choice = choices[item] if item in choices else None + if isinstance(cfg[item], bool): + parser.add_argument("--" + item, type=ast.literal_eval, default=cfg[item], choices=choice, + help=help_description) + else: + parser.add_argument("--" + item, type=type(cfg[item]), default=cfg[item], choices=choice, + help=help_description) + args = parser.parse_args() + return args + + +def parse_yaml(yaml_path): + """ + Parse the yaml config file. + + Args: + yaml_path: Path to the yaml config. + """ + with open(yaml_path, 'r') as fin: + try: + cfgs = yaml.load_all(fin.read(), Loader=yaml.FullLoader) + cfgs = [x for x in cfgs] + if len(cfgs) == 1: + cfg_helper = {} + cfg = cfgs[0] + cfg_choices = {} + elif len(cfgs) == 2: + cfg, cfg_helper = cfgs + cfg_choices = {} + elif len(cfgs) == 3: + cfg, cfg_helper, cfg_choices = cfgs + else: + raise ValueError("At most 3 docs (config, description for help, choices) are supported in config yaml") + print(cfg_helper) + except: + raise ValueError("Failed to parse yaml") + return cfg, cfg_helper, cfg_choices + + +def merge(args, cfg): + """ + Merge the base config from yaml file and command line arguments. + + Args: + args: Command line arguments. + cfg: Base configuration. + """ + args_var = vars(args) + for item in args_var: + cfg[item] = args_var[item] + return cfg + + +def get_config(): + """ + Get Config according to the yaml file and cli arguments. + """ + parser = argparse.ArgumentParser(description="default name", add_help=False) + current_dir = os.path.dirname(os.path.abspath(__file__)) + parser.add_argument("--config_path", type=str, default=os.path.join(current_dir, "../../default_config.yaml"), + help="Config file path") + path_args, _ = parser.parse_known_args() + default, helper, choices = parse_yaml(path_args.config_path) + pprint(default) + args = parse_cli_to_yaml(parser=parser, cfg=default, helper=helper, choices=choices, cfg_path=path_args.config_path) + final_config = merge(args, default) + return Config(final_config) + +config = get_config() diff --git a/model_zoo/official/nlp/lstm/src/model_utils/device_adapter.py b/model_zoo/official/nlp/lstm/src/model_utils/device_adapter.py new file mode 100644 index 00000000000..7c5d7f837dd --- /dev/null +++ b/model_zoo/official/nlp/lstm/src/model_utils/device_adapter.py @@ -0,0 +1,27 @@ +# Copyright 2021 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ + +"""Device adapter for ModelArts""" + +from .config import config + +if config.enable_modelarts: + from .moxing_adapter import get_device_id, get_device_num, get_rank_id, get_job_id +else: + from .local_adapter import get_device_id, get_device_num, get_rank_id, get_job_id + +__all__ = [ + "get_device_id", "get_device_num", "get_rank_id", "get_job_id" +] diff --git a/model_zoo/official/nlp/lstm/src/model_utils/local_adapter.py b/model_zoo/official/nlp/lstm/src/model_utils/local_adapter.py new file mode 100644 index 00000000000..769fa6dc78e --- /dev/null +++ b/model_zoo/official/nlp/lstm/src/model_utils/local_adapter.py @@ -0,0 +1,36 @@ +# Copyright 2021 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ + +"""Local adapter""" + +import os + +def get_device_id(): + device_id = os.getenv('DEVICE_ID', '0') + return int(device_id) + + +def get_device_num(): + device_num = os.getenv('RANK_SIZE', '1') + return int(device_num) + + +def get_rank_id(): + global_rank_id = os.getenv('RANK_ID', '0') + return int(global_rank_id) + + +def get_job_id(): + return "Local Job" diff --git a/model_zoo/official/nlp/lstm/src/model_utils/moxing_adapter.py b/model_zoo/official/nlp/lstm/src/model_utils/moxing_adapter.py new file mode 100644 index 00000000000..830d19a6fc9 --- /dev/null +++ b/model_zoo/official/nlp/lstm/src/model_utils/moxing_adapter.py @@ -0,0 +1,122 @@ +# Copyright 2021 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ + +"""Moxing adapter for ModelArts""" + +import os +import functools +from mindspore import context +from mindspore.profiler import Profiler +from .config import config + +_global_sync_count = 0 + +def get_device_id(): + device_id = os.getenv('DEVICE_ID', '0') + return int(device_id) + + +def get_device_num(): + device_num = os.getenv('RANK_SIZE', '1') + return int(device_num) + + +def get_rank_id(): + global_rank_id = os.getenv('RANK_ID', '0') + return int(global_rank_id) + + +def get_job_id(): + job_id = os.getenv('JOB_ID') + job_id = job_id if job_id != "" else "default" + return job_id + +def sync_data(from_path, to_path): + """ + Download data from remote obs to local directory if the first url is remote url and the second one is local path + Upload data from local directory to remote obs in contrast. + """ + import moxing as mox + import time + global _global_sync_count + sync_lock = "/tmp/copy_sync.lock" + str(_global_sync_count) + _global_sync_count += 1 + + # Each server contains 8 devices as most. + if get_device_id() % min(get_device_num(), 8) == 0 and not os.path.exists(sync_lock): + print("from path: ", from_path) + print("to path: ", to_path) + mox.file.copy_parallel(from_path, to_path) + print("===finish data synchronization===") + try: + os.mknod(sync_lock) + except IOError: + pass + print("===save flag===") + + while True: + if os.path.exists(sync_lock): + break + time.sleep(1) + + print("Finish sync data from {} to {}.".format(from_path, to_path)) + + +def moxing_wrapper(pre_process=None, post_process=None): + """ + Moxing wrapper to download dataset and upload outputs. + """ + def wrapper(run_func): + @functools.wraps(run_func) + def wrapped_func(*args, **kwargs): + # Download data from data_url + if config.enable_modelarts: + if config.data_url: + sync_data(config.data_url, config.data_path) + print("Dataset downloaded: ", os.listdir(config.data_path)) + if config.checkpoint_url: + sync_data(config.checkpoint_url, config.load_path) + print("Preload downloaded: ", os.listdir(config.load_path)) + if config.train_url: + sync_data(config.train_url, config.output_path) + print("Workspace downloaded: ", os.listdir(config.output_path)) + + context.set_context(save_graphs_path=os.path.join(config.output_path, str(get_rank_id()))) + config.device_num = get_device_num() + config.device_id = get_device_id() + if not os.path.exists(config.output_path): + os.makedirs(config.output_path) + + if pre_process: + pre_process() + + if config.enable_profiling: + profiler = Profiler() + + run_func(*args, **kwargs) + + if config.enable_profiling: + profiler.analyse() + + # Upload data to train_url + if config.enable_modelarts: + if post_process: + post_process() + + if config.train_url: + print("Start to copy output directory") + sync_data(config.output_path, config.train_url) + return wrapped_func + return wrapper diff --git a/model_zoo/official/nlp/lstm/train.py b/model_zoo/official/nlp/lstm/train.py index fdf5002ee64..de78d86b6f6 100644 --- a/model_zoo/official/nlp/lstm/train.py +++ b/model_zoo/official/nlp/lstm/train.py @@ -1,4 +1,4 @@ -# Copyright 2020 Huawei Technologies Co., Ltd +# Copyright 2021 Huawei Technologies Co., Ltd # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -15,16 +15,16 @@ """ #################train lstm example on aclImdb######################## """ -import argparse import os - import numpy as np -from src.config import lstm_cfg, lstm_cfg_ascend, lstm_cfg_ascend_8p +from src.model_utils.config import config +from src.model_utils.moxing_adapter import moxing_wrapper from src.dataset import convert_to_mindrecord from src.dataset import lstm_create_dataset from src.lr_schedule import get_lr from src.lstm import SentimentNet + from mindspore import Tensor, nn, Model, context from mindspore.nn import Accuracy from mindspore.train.callback import LossMonitor, CheckpointConfig, ModelCheckpoint, TimeMonitor @@ -32,103 +32,85 @@ from mindspore.train.serialization import load_param_into_net, load_checkpoint from mindspore.communication.management import init, get_rank from mindspore.context import ParallelMode -if __name__ == '__main__': - parser = argparse.ArgumentParser(description='MindSpore LSTM Example') - parser.add_argument('--preprocess', type=str, default='false', choices=['true', 'false'], - help='whether to preprocess data.') - parser.add_argument('--aclimdb_path', type=str, default="./aclImdb", - help='path where the dataset is stored.') - parser.add_argument('--glove_path', type=str, default="./glove", - help='path where the GloVe is stored.') - parser.add_argument('--preprocess_path', type=str, default="./preprocess", - help='path where the pre-process data is stored.') - parser.add_argument('--ckpt_path', type=str, default="./", - help='the path to save the checkpoint file.') - parser.add_argument('--pre_trained', type=str, default=None, - help='the pretrained checkpoint file path.') - parser.add_argument('--device_target', type=str, default="Ascend", choices=['GPU', 'CPU', 'Ascend'], - help='the target device to run, support "GPU", "CPU". Default: "Ascend".') - parser.add_argument("--device_num", type=int, default=1, help="Use device nums, default is 1.") - parser.add_argument("--distribute", type=str, default="false", choices=["true", "false"], - help="Run distribute, default is false.") - parser.add_argument("--enable_graph_kernel", type=str, default="true", choices=["true", "false"], - help="Accelerate by graph kernel, default is true.") +def modelarts_pre_process(): + config.ckpt_path = os.path.join(config.output_path, config.ckpt_path) - args = parser.parse_args() +@moxing_wrapper(pre_process=modelarts_pre_process) +def train_lstm(): + """ train lstm """ + print('\ntrain.py config: \n', config) + config.preprocess_path = os.path.join(config.glove_path, config.preprocess_path) - _enable_graph_kernel = args.enable_graph_kernel == "true" and args.device_target == "GPU" + _enable_graph_kernel = config.enable_graph_kernel == "true" and config.device_target == "GPU" context.set_context( mode=context.GRAPH_MODE, save_graphs=False, enable_graph_kernel=_enable_graph_kernel, - device_target=args.device_target) + device_target=config.device_target) rank = 0 device_num = 1 - if args.device_target == 'Ascend': - cfg = lstm_cfg_ascend - if args.distribute == "true": - cfg = lstm_cfg_ascend_8p - init() - device_num = args.device_num - rank = get_rank() + if config.device_target == 'Ascend' and config.distribute == "true": + init() + device_num = config.device_num # get_device_num() + rank = get_rank() + context.reset_auto_parallel_context() + context.set_auto_parallel_context(parallel_mode=ParallelMode.DATA_PARALLEL, gradients_mean=True, \ + device_num=device_num) - context.reset_auto_parallel_context() - context.set_auto_parallel_context(parallel_mode=ParallelMode.DATA_PARALLEL, gradients_mean=True, - device_num=device_num) - else: - cfg = lstm_cfg - - if args.preprocess == "true": + if config.preprocess == "true": print("============== Starting Data Pre-processing ==============") - convert_to_mindrecord(cfg.embed_size, args.aclimdb_path, args.preprocess_path, args.glove_path) + convert_to_mindrecord(config.embed_size, config.aclimdb_path, config.preprocess_path, config.glove_path) - embedding_table = np.loadtxt(os.path.join(args.preprocess_path, "weight.txt")).astype(np.float32) + embedding_table = np.loadtxt(os.path.join(config.preprocess_path, "weight.txt")).astype(np.float32) # DynamicRNN in this network on Ascend platform only support the condition that the shape of input_size # and hiddle_size is multiples of 16, this problem will be solved later. - if args.device_target == 'Ascend': - pad_num = int(np.ceil(cfg.embed_size / 16) * 16 - cfg.embed_size) + if config.device_target == 'Ascend': + pad_num = int(np.ceil(config.embed_size / 16) * 16 - config.embed_size) if pad_num > 0: embedding_table = np.pad(embedding_table, [(0, 0), (0, pad_num)], 'constant') - cfg.embed_size = int(np.ceil(cfg.embed_size / 16) * 16) + config.embed_size = int(np.ceil(config.embed_size / 16) * 16) network = SentimentNet(vocab_size=embedding_table.shape[0], - embed_size=cfg.embed_size, - num_hiddens=cfg.num_hiddens, - num_layers=cfg.num_layers, - bidirectional=cfg.bidirectional, - num_classes=cfg.num_classes, + embed_size=config.embed_size, + num_hiddens=config.num_hiddens, + num_layers=config.num_layers, + bidirectional=config.bidirectional, + num_classes=config.num_classes, weight=Tensor(embedding_table), - batch_size=cfg.batch_size) + batch_size=config.batch_size) # pre_trained - if args.pre_trained: - load_param_into_net(network, load_checkpoint(args.pre_trained)) + if config.pre_trained: + load_param_into_net(network, load_checkpoint(config.pre_trained)) - ds_train = lstm_create_dataset(args.preprocess_path, cfg.batch_size, 1, device_num=device_num, rank=rank) + ds_train = lstm_create_dataset(config.preprocess_path, config.batch_size, 1, device_num=device_num, rank=rank) loss = nn.SoftmaxCrossEntropyWithLogits(sparse=True, reduction='mean') - if cfg.dynamic_lr: - lr = Tensor(get_lr(global_step=cfg.global_step, - lr_init=cfg.lr_init, lr_end=cfg.lr_end, lr_max=cfg.lr_max, - warmup_epochs=cfg.warmup_epochs, - total_epochs=cfg.num_epochs, + if config.dynamic_lr: + lr = Tensor(get_lr(global_step=config.global_step, + lr_init=config.lr_init, lr_end=config.lr_end, lr_max=config.lr_max, + warmup_epochs=config.warmup_epochs, + total_epochs=config.num_epochs, steps_per_epoch=ds_train.get_dataset_size(), - lr_adjust_epoch=cfg.lr_adjust_epoch)) + lr_adjust_epoch=config.lr_adjust_epoch)) else: - lr = cfg.learning_rate + lr = config.learning_rate - opt = nn.Momentum(network.trainable_params(), lr, cfg.momentum) + opt = nn.Momentum(network.trainable_params(), lr, config.momentum) loss_cb = LossMonitor() model = Model(network, loss, opt, {'acc': Accuracy()}) print("============== Starting Training ==============") - config_ck = CheckpointConfig(save_checkpoint_steps=cfg.save_checkpoint_steps, - keep_checkpoint_max=cfg.keep_checkpoint_max) - ckpoint_cb = ModelCheckpoint(prefix="lstm", directory=args.ckpt_path, config=config_ck) + config_ck = CheckpointConfig(save_checkpoint_steps=config.save_checkpoint_steps, + keep_checkpoint_max=config.keep_checkpoint_max) + ckpoint_cb = ModelCheckpoint(prefix="lstm", directory=config.ckpt_path, config=config_ck) time_cb = TimeMonitor(data_size=ds_train.get_dataset_size()) - if args.device_target == "CPU": - model.train(cfg.num_epochs, ds_train, callbacks=[time_cb, ckpoint_cb, loss_cb], dataset_sink_mode=False) + if config.device_target == "CPU": + model.train(config.num_epochs, ds_train, callbacks=[time_cb, ckpoint_cb, loss_cb], dataset_sink_mode=False) else: - model.train(cfg.num_epochs, ds_train, callbacks=[time_cb, ckpoint_cb, loss_cb]) + model.train(config.num_epochs, ds_train, callbacks=[time_cb, ckpoint_cb, loss_cb]) print("============== Training Success ==============") + +if __name__ == '__main__': + train_lstm()