commit
3e3ced7015
|
@ -91,6 +91,23 @@ After dataset preparation, you can start training and evaluation as follows:
|
|||
VOCAB_ADDR BPE_CODE_ADDR TEST_TARGET
|
||||
```
|
||||
|
||||
- running on GPU
|
||||
|
||||
```bash
|
||||
# run training example
|
||||
cd ./scripts
|
||||
bash run_standalone_train_gpu.sh PRE_TRAIN_DATASET DEVICE_ID
|
||||
|
||||
# run distributed training example
|
||||
cd ./scripts
|
||||
bash run_distributed_train_gpu.sh PRE_TRAIN_DATASET
|
||||
|
||||
# run evaluation example
|
||||
cd ./scripts
|
||||
bash run_standalone_eval_gpu.sh TEST_DATASET EXISTED_CKPT_PATH \
|
||||
VOCAB_ADDR BPE_CODE_ADDR TEST_TARGET DEVICE_ID
|
||||
```
|
||||
|
||||
- ModelArts (If you want to run in modelarts, please check the official documentation of [modelarts](https://support.huaweicloud.com/modelarts/), and you can start training as follows)
|
||||
|
||||
```bash
|
||||
|
@ -206,10 +223,15 @@ The GNMT network script and code result are as follows:
|
|||
│ ├──optimizer.py // Optimizer.
|
||||
├── scripts
|
||||
│ ├──run_distributed_train_ascend.sh // Shell script for distributed train on ascend.
|
||||
│ ├──run_distributed_train_gpu.sh // Shell script for distributed train on GPU.
|
||||
│ ├──run_standalone_eval_ascend.sh // Shell script for standalone eval on ascend.
|
||||
│ ├──run_standalone_eval_gpu.sh // Shell script for standalone eval on GPU.
|
||||
│ ├──run_standalone_train_ascend.sh // Shell script for standalone train on ascend.
|
||||
├── default_config.yaml // Configurations for train
|
||||
├── default_test_config.yaml // Configurations for eval
|
||||
│ ├──run_standalone_train_gpu.sh // Shell script for standalone train on GPU.
|
||||
├── default_config.yaml // Configurations for train on ascend.
|
||||
├── default_config_gpu.yaml // Configurations for train on GPU.
|
||||
├── default_test_config.yaml // Configurations for eval on ascend.
|
||||
├── default_test_config_gpu.yaml // Configurations for eval on GPU.
|
||||
├── create_dataset.py // Dataset preparation.
|
||||
├── eval.py // Infer API entry.
|
||||
├── export.py // Export checkpoint file into air models.
|
||||
|
@ -262,49 +284,96 @@ For more configuration details, please refer the script `./default_config.yaml`
|
|||
|
||||
## Training Process
|
||||
|
||||
For a pre-trained model, configure the following options in the `./default_config.yaml` file:
|
||||
- running on Ascend
|
||||
|
||||
- Select an optimizer ('momentum/adam/lamb' is available).
|
||||
- Specify `ckpt_prefix` and `ckpt_path` in `checkpoint_path` to save the model file.
|
||||
- Set other parameters, including dataset configuration and network configuration.
|
||||
- If a pre-trained model exists, assign `existed_ckpt` to the path of the existing model during fine-tuning.
|
||||
For a pre-trained model, configure the following options in the `./default_config.yaml` file:
|
||||
|
||||
Start task training on a single device and run the shell script `scripts/run_standalone_train_ascend.sh`:
|
||||
- Select an optimizer ('momentum/adam/lamb' is available).
|
||||
- Specify `ckpt_prefix` and `ckpt_path` in `checkpoint_path` to save the model file.
|
||||
- Set other parameters, including dataset configuration and network configuration.
|
||||
- If a pre-trained model exists, assign `existed_ckpt` to the path of the existing model during fine-tuning.
|
||||
|
||||
```bash
|
||||
cd ./scripts
|
||||
bash run_standalone_train_ascend.sh PRE_TRAIN_DATASET
|
||||
```
|
||||
Start task training on a single device and run the shell script `scripts/run_standalone_train_ascend.sh`:
|
||||
|
||||
In this script, the `PRE_TRAIN_DATASET` is the dataset address.
|
||||
```bash
|
||||
cd ./scripts
|
||||
bash run_standalone_train_ascend.sh PRE_TRAIN_DATASET
|
||||
```
|
||||
|
||||
Run `scripts/run_distributed_train_ascend.sh` for distributed training of GNMTv2 model.
|
||||
To train on multiple devices, run the following commands in bash; they are executed in `scripts/`:
|
||||
In this script, the `PRE_TRAIN_DATASET` is the dataset address.
|
||||
|
||||
```bash
|
||||
cd ./scripts
|
||||
bash run_distributed_train_ascend.sh RANK_TABLE_ADDR PRE_TRAIN_DATASET
|
||||
```
|
||||
Run `scripts/run_distributed_train_ascend.sh` for distributed training of GNMTv2 model.
|
||||
To train on multiple devices, run the following commands in bash; they are executed in `scripts/`:
|
||||
|
||||
Note: the `RANK_TABLE_ADDR` is the hccl_json file assigned when distributed training is running.
|
||||
Currently, inconsecutive device IDs are not supported in `scripts/run_distributed_train_ascend.sh`. The device ID must start from 0 in the `RANK_TABLE_ADDR` file.
|
||||
```bash
|
||||
cd ./scripts
|
||||
bash run_distributed_train_ascend.sh RANK_TABLE_ADDR PRE_TRAIN_DATASET
|
||||
```
|
||||
|
||||
Note: the `RANK_TABLE_ADDR` is the hccl_json file assigned when distributed training is running.
|
||||
Currently, inconsecutive device IDs are not supported in `scripts/run_distributed_train_ascend.sh`. The device ID must start from 0 in the `RANK_TABLE_ADDR` file.
|
||||
|
||||
- running on GPU
|
||||
|
||||
For a pre-trained model, configure the following options in the `./default_config_gpu.yaml` file:
|
||||
|
||||
- Select an optimizer ('momentum/adam/lamb' is available).
|
||||
- Specify `ckpt_prefix` and `ckpt_path` in `checkpoint_path` to save the model file.
|
||||
- Set other parameters, including dataset configuration and network configuration.
|
||||
- If a pre-trained model exists, assign `existed_ckpt` to the path of the existing model during fine-tuning.
|
||||
|
||||
Start task training on a single device and run the shell script `scripts/run_standalone_train_gpu.sh`:
|
||||
|
||||
```bash
|
||||
cd ./scripts
|
||||
bash run_standalone_train_gpu.sh PRE_TRAIN_DATASET DEVICE_ID
|
||||
```
|
||||
|
||||
In this script, the `PRE_TRAIN_DATASET` is the dataset address.
|
||||
|
||||
Run `scripts/run_distributed_train_gpu.sh` for distributed training of GNMTv2 model.
|
||||
To train on multiple devices, run the following commands in bash; they are executed in `scripts/`:
|
||||
|
||||
```bash
|
||||
cd ./scripts
|
||||
bash run_distributed_train_gpu.sh PRE_TRAIN_DATASET
|
||||
```
|
||||
|
||||
Currently, non-consecutive device IDs are not supported in `scripts/run_distributed_train_gpu.sh`. The device IDs must be consecutive, from 0 to 7.
|
||||
|
||||
## Inference Process
|
||||
|
||||
For inference using a trained model on multiple hardware platforms, such as Ascend 910.
|
||||
Set options in `./default_config.yaml`.
|
||||
- running on Ascend
|
||||
|
||||
Run the shell script `scripts/run_standalone_eval_ascend.sh` to process the output token ids to get the BLEU scores.
|
||||
For inference using a trained model on multiple hardware platforms, such as Ascend 910.
|
||||
Set options in `./default_test_config.yaml`.
|
||||
|
||||
```bash
|
||||
cd ./scripts
|
||||
bash run_standalone_eval_ascend.sh
|
||||
bash run_standalone_eval_ascend.sh TEST_DATASET EXISTED_CKPT_PATH \
|
||||
VOCAB_ADDR BPE_CODE_ADDR TEST_TARGET
|
||||
```
|
||||
Run the shell script `scripts/run_standalone_eval_ascend.sh` to process the output token ids to get the BLEU scores.
|
||||
|
||||
The `TEST_DATASET` is the address of inference dataset, and `EXISTED_CKPT_PATH` is the path of the model file generated during training process.
|
||||
The `VOCAB_ADDR` is the vocabulary address, `BPE_CODE_ADDR` is the bpe code address, and `TEST_TARGET` is the path of the reference answers.
|
||||
```bash
|
||||
cd ./scripts
|
||||
bash run_standalone_eval_ascend.sh TEST_DATASET EXISTED_CKPT_PATH \
|
||||
VOCAB_ADDR BPE_CODE_ADDR TEST_TARGET
|
||||
```
|
||||
|
||||
The `TEST_DATASET` is the address of inference dataset, and `EXISTED_CKPT_PATH` is the path of the model file generated during training process.
|
||||
The `VOCAB_ADDR` is the vocabulary address, `BPE_CODE_ADDR` is the bpe code address, and `TEST_TARGET` is the path of the reference answers.
|
||||
|
||||
- running on GPU
|
||||
|
||||
For inference using a trained model on GPU.
|
||||
Set options in `./default_test_config_gpu.yaml`.
|
||||
|
||||
Run the shell script `scripts/run_standalone_eval_gpu.sh` to process the output token ids to get the BLEU scores.
|
||||
|
||||
```bash
|
||||
cd ./scripts
|
||||
bash run_standalone_eval_gpu.sh TEST_DATASET EXISTED_CKPT_PATH \
|
||||
VOCAB_ADDR BPE_CODE_ADDR TEST_TARGET DEVICE_ID
|
||||
```
|
||||
|
||||
The `TEST_DATASET` is the address of inference dataset, and `EXISTED_CKPT_PATH` is the path of the model file generated during training process.
|
||||
The `VOCAB_ADDR` is the vocabulary address, `BPE_CODE_ADDR` is the bpe code address, and `TEST_TARGET` is the path of the reference answers.
|
||||
|
||||
# [Model Description](#contents)
|
||||
|
||||
|
@ -312,36 +381,36 @@ The `VOCAB_ADDR` is the vocabulary address, `BPE_CODE_ADDR` is the bpe code addr
|
|||
|
||||
### Training Performance
|
||||
|
||||
| Parameters | Ascend |
|
||||
| -------------------------- | -------------------------------------------------------------- |
|
||||
| Resource | Ascend 910; OS Euler2.8 |
|
||||
| uploaded Date | 11/06/2020 (month/day/year) |
|
||||
| MindSpore Version | 1.0.0 |
|
||||
| Dataset | WMT English-German for training |
|
||||
| Training Parameters | epoch=6, batch_size=128 |
|
||||
| Optimizer | Adam |
|
||||
| Loss Function | Softmax Cross Entropy |
|
||||
| outputs | probability |
|
||||
| Speed | 344ms/step (8pcs) |
|
||||
| Total Time | 7800s (8pcs) |
|
||||
| Loss | 63.35 |
|
||||
| Params (M) | 613 |
|
||||
| Checkpoint for inference | 1.8G (.ckpt file) |
|
||||
| Scripts | [gnmt_v2](https://gitee.com/mindspore/mindspore/tree/master/model_zoo/official/nlp/gnmt_v2) |
|
||||
| Parameters | Ascend |GPU |
|
||||
| -------------------------- | -------------------------------------------------------------- | -------------------------------------------------------------- |
|
||||
| Resource | Ascend 910; OS Euler2.8 | NV SXM2 V100-32G |
|
||||
| uploaded Date | 11/06/2020 (month/day/year) | 08/05/2021 (month/day/year) |
|
||||
| MindSpore Version | 1.0.0 | 1.3.0 |
|
||||
| Dataset | WMT English-German for training | WMT English-German for training |
|
||||
| Training Parameters | epoch=6, batch_size=128 | epoch=8, batch_size=128 |
|
||||
| Optimizer | Adam | Adam |
|
||||
| Loss Function | Softmax Cross Entropy | Softmax Cross Entropy |
|
||||
| outputs | probability | probability |
|
||||
| Speed | 344ms/step (8pcs) | 620 ms/step (1pcs) |
|
||||
| Total Time | 7800s (8pcs) | 17079s (1pcs) |
|
||||
| Loss | 63.35 | 55.42 |
|
||||
| Params (M) | 613 | 613 |
|
||||
| Checkpoint for inference | 1.8G (.ckpt file) | 1.8G (.ckpt file) |
|
||||
| Scripts | [gnmt_v2](https://gitee.com/mindspore/mindspore/tree/master/model_zoo/official/nlp/gnmt_v2) | [gnmt_v2](https://gitee.com/mindspore/mindspore/tree/master/model_zoo/official/nlp/gnmt_v2) |
|
||||
|
||||
### Inference Performance
|
||||
|
||||
| Parameters | Ascend |
|
||||
| ------------------- | --------------------------- |
|
||||
| Resource | Ascend 910; OS Euler2.8 |
|
||||
| Uploaded Date | 11/06/2020 (month/day/year) |
|
||||
| MindSpore Version | 1.0.0 |
|
||||
| Dataset | WMT newstest2014 |
|
||||
| batch_size | 128 |
|
||||
| Total Time | 1560s |
|
||||
| outputs | probability |
|
||||
| Accuracy | BLEU Score= 24.05 |
|
||||
| Model for inference | 1.8G (.ckpt file) |
|
||||
| Parameters | Ascend | GPU |
|
||||
| ------------------- | --------------------------- | --------------------------- |
|
||||
| Resource | Ascend 910; OS Euler2.8 | NV SXM2 V100-32G |
|
||||
| Uploaded Date | 11/06/2020 (month/day/year) | 08/05/2021 (month/day/year) |
|
||||
| MindSpore Version | 1.0.0 | 1.3.0 |
|
||||
| Dataset | WMT newstest2014 | WMT newstest2014 |
|
||||
| batch_size | 128 | 128 |
|
||||
| Total Time | 1560s | 180s |
|
||||
| outputs | probability | probability |
|
||||
| Accuracy | BLEU Score= 24.05 | BLEU Score= 24.4 |
|
||||
| Model for inference | 1.8G (.ckpt file) | 1.8G (.ckpt file) |
|
||||
|
||||
# [Random Situation Description](#contents)
|
||||
|
||||
|
|
|
@ -9,6 +9,7 @@ data_path: "/cache/data"
|
|||
output_path: "/cache/train"
|
||||
load_path: "/cache/checkpoint_path"
|
||||
device_target: "Ascend"
|
||||
device_id: 0
|
||||
need_modelarts_dataset_unzip: False
|
||||
modelarts_dataset_unzip_name: ""
|
||||
|
||||
|
|
|
@ -0,0 +1,86 @@
|
|||
# Builtin Configurations(DO NOT CHANGE THESE CONFIGURATIONS unless you know exactly what you are doing)
|
||||
enable_modelarts: False
|
||||
# Url for modelarts
|
||||
data_url: ""
|
||||
train_url: ""
|
||||
checkpoint_url: ""
|
||||
# Path for local
|
||||
data_path: "/cache/data"
|
||||
output_path: "/cache/train"
|
||||
load_path: "/cache/checkpoint_path"
|
||||
device_target: "GPU"
|
||||
device_id: 0
|
||||
need_modelarts_dataset_unzip: False
|
||||
modelarts_dataset_unzip_name: ""
|
||||
|
||||
# ==============================================================================
|
||||
# dataset_config
|
||||
random_seed: 50
|
||||
epochs: 8
|
||||
batch_size: 128
|
||||
pre_train_dataset: "/home/workspace/dataset_menu/train.tok.clean.bpe.32000.en.mindrecord"
|
||||
fine_tune_dataset: ""
|
||||
test_dataset: ""
|
||||
valid_dataset: ""
|
||||
dataset_sink_mode: true
|
||||
input_mask_from_dataset: False
|
||||
|
||||
# model_config
|
||||
seq_length: 51
|
||||
vocab_size: 32320
|
||||
hidden_size: 1024
|
||||
num_hidden_layers: 4
|
||||
intermediate_size: 4096
|
||||
hidden_dropout_prob: 0.2
|
||||
attention_dropout_prob: 0.2
|
||||
initializer_range: 0.1
|
||||
label_smoothing: 0.1
|
||||
beam_width: 2
|
||||
length_penalty_weight: 0.6
|
||||
max_decode_length: 50
|
||||
|
||||
# loss_scale_config
|
||||
init_loss_scale: 65536
|
||||
loss_scale_factor: 2
|
||||
scale_window: 1000
|
||||
|
||||
# learn_rate_config
|
||||
optimizer: "adam"
|
||||
lr: 0.002 # 2e-3
|
||||
lr_scheduler: "WarmupMultiStepLR"
|
||||
lr_scheduler_power: 0.5
|
||||
warmup_lr_remain_steps: 0.666
|
||||
warmup_lr_decay_interval: -1
|
||||
decay_steps: 4
|
||||
decay_start_step: -1
|
||||
warmup_steps: 200
|
||||
min_lr: 0.000001 #1e-6
|
||||
|
||||
# checkpoint_options
|
||||
existed_ckpt: ""
|
||||
save_ckpt_steps: 3452
|
||||
keep_ckpt_max: 8
|
||||
ckpt_prefix: "gnmt"
|
||||
ckpt_path: "text_translation"
|
||||
|
||||
# export option
|
||||
file_name: "gnmt_v2"
|
||||
file_format: "AIR"
|
||||
vocab_file: ""
|
||||
bpe_codes: ""
|
||||
|
||||
---
|
||||
|
||||
# Help description for each configuration
|
||||
enable_modelarts: "Whether training on modelarts, default: False"
|
||||
data_url: "Url for modelarts"
|
||||
train_url: "Url for modelarts"
|
||||
data_path: "The location of the input data."
|
||||
output_path: "The location of the output file."
|
||||
device_target: 'Target device type'
|
||||
|
||||
file_name: "output file name."
|
||||
file_format: "file format, choices in ['AIR', 'ONNX', 'MINDIR']"
|
||||
infer_config: "gnmt_v2 config file"
|
||||
vocab_file: "existed checkpoint address."
|
||||
bpe_codes: "bpe codes to use."
|
|
@ -9,6 +9,7 @@ data_path: "/cache/data"
|
|||
output_path: "/cache/train"
|
||||
load_path: "/cache/checkpoint_path"
|
||||
device_target: "Ascend"
|
||||
device_id: 0
|
||||
need_modelarts_dataset_unzip: False
|
||||
modelarts_dataset_unzip_name: ""
|
||||
|
||||
|
|
|
@ -0,0 +1,95 @@
|
|||
# Builtin Configurations(DO NOT CHANGE THESE CONFIGURATIONS unless you know exactly what you are doing)
|
||||
enable_modelarts: False
|
||||
# Url for modelarts
|
||||
data_url: ""
|
||||
train_url: ""
|
||||
checkpoint_url: ""
|
||||
# Path for local
|
||||
data_path: "/cache/data"
|
||||
output_path: "/cache/train"
|
||||
load_path: "/cache/checkpoint_path"
|
||||
device_target: "GPU"
|
||||
device_id: 0
|
||||
need_modelarts_dataset_unzip: False
|
||||
modelarts_dataset_unzip_name: ""
|
||||
|
||||
# ==============================================================================
|
||||
# dataset_config
|
||||
random_seed: 50
|
||||
epochs: 6
|
||||
batch_size: 128
|
||||
pre_train_dataset: ""
|
||||
fine_tune_dataset: ""
|
||||
test_dataset: "/home/workspace/dataset_menu/newstest2014.en.mindrecord"
|
||||
valid_dataset: ""
|
||||
dataset_sink_mode: true
|
||||
input_mask_from_dataset: False
|
||||
|
||||
# model_config
|
||||
seq_length: 107
|
||||
vocab_size: 32320
|
||||
hidden_size: 1024
|
||||
num_hidden_layers: 4
|
||||
intermediate_size: 4096
|
||||
hidden_dropout_prob: 0.2
|
||||
attention_dropout_prob: 0.2
|
||||
initializer_range: 0.1
|
||||
label_smoothing: 0.1
|
||||
beam_width: 2
|
||||
length_penalty_weight: 0.6
|
||||
max_decode_length: 80
|
||||
|
||||
# loss_scale_config
|
||||
init_loss_scale: 65536
|
||||
loss_scale_factor: 2
|
||||
scale_window: 1000
|
||||
|
||||
# learn_rate_config
|
||||
optimizer: "adam"
|
||||
lr: 0.002 # 2e-3
|
||||
lr_scheduler: "WarmupMultiStepLR"
|
||||
lr_scheduler_power: 0.5
|
||||
warmup_lr_remain_steps: 0.666
|
||||
warmup_lr_decay_interval: -1
|
||||
decay_steps: 4
|
||||
decay_start_step: -1
|
||||
warmup_steps: 200
|
||||
min_lr: 0.000001 # 1e-6
|
||||
|
||||
# checkpoint_options
|
||||
existed_ckpt: "/home/workspace/gnmt_v2/gnmt-6_3452.ckpt"
|
||||
save_ckpt_steps: 3452
|
||||
keep_ckpt_max: 6
|
||||
ckpt_prefix: "gnmt"
|
||||
ckpt_path: "text_translation"
|
||||
|
||||
# eval option
|
||||
bpe_codes: ""
|
||||
test_tgt: ""
|
||||
vocab: ""
|
||||
output: "./output.npz"
|
||||
|
||||
# export option
|
||||
file_name: "gnmt_v2"
|
||||
file_format: "AIR"
|
||||
vocab_file: ""
|
||||
|
||||
---
|
||||
|
||||
# Help description for each configuration
|
||||
enable_modelarts: "Whether training on modelarts, default: False"
|
||||
data_url: "Url for modelarts"
|
||||
train_url: "Url for modelarts"
|
||||
data_path: "The location of the input data."
|
||||
output_path: "The location of the output file."
|
||||
device_target: 'Target device type'
|
||||
|
||||
# eval option
|
||||
bpe_codes: "bpe codes to use."
|
||||
test_tgt: "data file of the test target"
|
||||
output: "result file path."
|
||||
|
||||
file_name: "output file name."
|
||||
file_format: "file format, choices in ['AIR', 'ONNX', 'MINDIR']"
|
||||
infer_config: "gnmt_v2 config file"
|
||||
vocab_file: "existed checkpoint address."
|
|
@ -1,4 +1,4 @@
|
|||
# Copyright 2020 Huawei Technologies Co., Ltd
|
||||
# Copyright 2020-2021 Huawei Technologies Co., Ltd
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
|
@ -16,6 +16,7 @@
|
|||
import pickle
|
||||
import os
|
||||
import time
|
||||
from mindspore import context
|
||||
|
||||
from src.gnmt_model import infer
|
||||
from src.gnmt_model.bleu_calculate import bleu_calculate
|
||||
|
@ -83,6 +84,12 @@ def run_eval():
|
|||
'''run eval.'''
|
||||
_config = get_config(default_config)
|
||||
result = infer(_config)
|
||||
context.set_context(
|
||||
mode=context.GRAPH_MODE,
|
||||
save_graphs=False,
|
||||
device_target=_config.device_target,
|
||||
device_id=_config.device_id,
|
||||
reserve_class_name_in_scope=False)
|
||||
|
||||
with open(_config.output, "wb") as f:
|
||||
pickle.dump(result, f, 1)
|
||||
|
|
|
@ -47,12 +47,12 @@ do
|
|||
cp -r ../../src .
|
||||
cp -r ../../model_utils .
|
||||
export RANK_ID=$i
|
||||
export DEVICE_ID=$i
|
||||
config_path="${current_exec_path}/device${i}/default_config.yaml"
|
||||
echo "config path is : ${config_path}"
|
||||
python ../../train.py \
|
||||
--config_path=$config_path \
|
||||
--pre_train_dataset=$PRE_TRAIN_DATASET > log_gnmt_network${i}.log 2>&1 &
|
||||
cd ${current_exec_path} || exit
|
||||
python ../../train.py \
|
||||
--config_path=$config_path \
|
||||
--pre_train_dataset=$PRE_TRAIN_DATASET \
|
||||
--device_id=$i > log_gnmt_network${i}.log 2>&1 &
|
||||
cd ${current_exec_path} || exit
|
||||
done
|
||||
cd ${current_exec_path} || exit
|
||||
|
|
|
@ -0,0 +1,53 @@
|
|||
#!/bin/bash
|
||||
# Copyright 2021 Huawei Technologies Co., Ltd
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
# ============================================================================
|
||||
|
||||
echo "=============================================================================================================="
|
||||
echo "Please run the script as: "
|
||||
echo "bash run_distributed_train_gpu.sh PRE_TRAIN_DATASET"
|
||||
echo "for example:"
|
||||
echo "bash run_distributed_train_gpu.sh \
|
||||
/home/workspace/dataset_menu/train.tok.clean.bpe.32000.en.mindrecord"
|
||||
echo "It is better to use absolute path."
|
||||
echo "=============================================================================================================="
|
||||
|
||||
PRE_TRAIN_DATASET=$1
|
||||
|
||||
current_exec_path=$(pwd)
|
||||
echo ${current_exec_path}
|
||||
|
||||
export RANK_SIZE=8
|
||||
export GLOG_v=2
|
||||
|
||||
rm -rf LOG
|
||||
mkdir ./LOG
|
||||
cp ../*.py ./LOG
|
||||
cp ../*.yaml ./LOG
|
||||
cp -r ../src ./LOG
|
||||
cd ./LOG || exit
|
||||
config_path="${current_exec_path}/LOG/default_config_gpu.yaml"
|
||||
echo "config path is : ${config_path}"
|
||||
|
||||
|
||||
if [ $# == 1 ]
|
||||
then
|
||||
mpirun -allow-run-as-root -n $RANK_SIZE --output-filename log_output --merge-stderr-to-stdout \
|
||||
python ../../train.py \
|
||||
--config_path=$config_path \
|
||||
--device_target="GPU" \
|
||||
--pre_train_dataset=$PRE_TRAIN_DATASET > log_gnmt_train.log 2>&1 &
|
||||
fi
|
||||
|
||||
cd ${current_exec_path} || exit
|
|
@ -0,0 +1,70 @@
|
|||
#!/bin/bash
|
||||
# Copyright 2021 Huawei Technologies Co., Ltd
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
# ============================================================================
|
||||
|
||||
echo "=============================================================================================================="
|
||||
echo "Please run the script as: "
|
||||
echo "bash run_standalone_eval_gpu.sh TEST_DATASET EXISTED_CKPT_PATH \
|
||||
VOCAB_ADDR BPE_CODE_ADDR TEST_TARGET DEVICE_ID"
|
||||
echo "for example:"
|
||||
echo "bash run_standalone_eval_gpu.sh \
|
||||
/home/workspace/dataset_menu/newstest2014.en.mindrecord \
|
||||
/home/workspace/gnmt_v2/gnmt-6_3452.ckpt \
|
||||
/home/workspace/wmt16_de_en/vocab.bpe.32000 \
|
||||
/home/workspace/wmt16_de_en/bpe.32000 \
|
||||
/home/workspace/wmt16_de_en/newstest2014.de \
|
||||
0"
|
||||
echo "It is better to use absolute path."
|
||||
echo "=============================================================================================================="
|
||||
|
||||
TEST_DATASET=$1
|
||||
EXISTED_CKPT_PATH=$2
|
||||
VOCAB_ADDR=$3
|
||||
BPE_CODE_ADDR=$4
|
||||
TEST_TARGET=$5
|
||||
export CUDA_VISIBLE_DEVICES=$6
|
||||
|
||||
current_exec_path=$(pwd)
|
||||
echo ${current_exec_path}
|
||||
|
||||
|
||||
export GLOG_v=2
|
||||
|
||||
if [ -d "eval" ];
|
||||
then
|
||||
rm -rf ./eval
|
||||
fi
|
||||
mkdir ./eval
|
||||
cp ../*.py ./eval
|
||||
cp ../*.yaml ./eval
|
||||
cp -r ../src ./eval
|
||||
cp -r ../model_utils ./eval
|
||||
cd ./eval || exit
|
||||
echo "start for evaluation"
|
||||
env > env.log
|
||||
|
||||
config_path="${current_exec_path}/eval/default_test_config_gpu.yaml"
|
||||
echo "config path is : ${config_path}"
|
||||
|
||||
python eval.py \
|
||||
--config_path=$config_path \
|
||||
--test_dataset=$TEST_DATASET \
|
||||
--existed_ckpt=$EXISTED_CKPT_PATH \
|
||||
--vocab=$VOCAB_ADDR \
|
||||
--bpe_codes=$BPE_CODE_ADDR \
|
||||
--test_tgt=$TEST_TARGET \
|
||||
--device_target="GPU" \
|
||||
--device_id=0 >log_infer.log 2>&1 &
|
||||
cd ..
|
|
@ -0,0 +1,54 @@
|
|||
#!/bin/bash
|
||||
# Copyright 2021 Huawei Technologies Co., Ltd
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
# ============================================================================
|
||||
|
||||
echo "=============================================================================================================="
|
||||
echo "Please run the script as: "
|
||||
echo "bash run_standalone_train_gpu.sh PRE_TRAIN_DATASET DEVICE_ID"
|
||||
echo "for example:"
|
||||
echo "bash run_standalone_train_gpu.sh \
|
||||
/home/workspace/dataset_menu/train.tok.clean.bpe.32000.en.mindrecord 0"
|
||||
echo "It is better to use absolute path."
|
||||
echo "=============================================================================================================="
|
||||
|
||||
PRE_TRAIN_DATASET=$1
|
||||
|
||||
export GLOG_v=2
|
||||
export CUDA_VISIBLE_DEVICES=$2
|
||||
|
||||
current_exec_path=$(pwd)
|
||||
echo ${current_exec_path}
|
||||
if [ -d "train" ];
|
||||
then
|
||||
rm -rf ./train
|
||||
fi
|
||||
mkdir ./train
|
||||
cp ../*.py ./train
|
||||
cp ../*.yaml ./train
|
||||
cp -r ../src ./train
|
||||
cp -r ../model_utils ./train
|
||||
cd ./train || exit
|
||||
echo "start for training"
|
||||
env > env.log
|
||||
|
||||
config_path="${current_exec_path}/train/default_config_gpu.yaml"
|
||||
echo "config path is : ${config_path}"
|
||||
|
||||
python train.py \
|
||||
--config_path=$config_path \
|
||||
--pre_train_dataset=$PRE_TRAIN_DATASET \
|
||||
--device_id=0 \
|
||||
--device_target="GPU" > log_gnmt_network.log 2>&1 &
|
||||
cd ..
|
|
@ -1,4 +1,4 @@
|
|||
# Copyright 2020 Huawei Technologies Co., Ltd
|
||||
# Copyright 2020-2021 Huawei Technologies Co., Ltd
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
|
@ -18,6 +18,7 @@ import numpy as np
|
|||
import mindspore.ops.operations as P
|
||||
import mindspore.common.dtype as mstype
|
||||
import mindspore.nn as nn
|
||||
from mindspore import context
|
||||
from mindspore.common.parameter import Parameter
|
||||
from mindspore.common.tensor import Tensor
|
||||
|
||||
|
@ -41,7 +42,6 @@ class DynamicRNNCell(nn.Cell):
|
|||
hidden_size=1024,
|
||||
initializer_range=0.1):
|
||||
super(DynamicRNNCell, self).__init__()
|
||||
self.rnn = P.DynamicRNN()
|
||||
self.num_step = num_setp
|
||||
self.batch_size = batch_size
|
||||
self.input_size = word_embed_dim
|
||||
|
@ -57,15 +57,32 @@ class DynamicRNNCell(nn.Cell):
|
|||
self.dynamicRNN_h = Tensor(np.zeros((1, self.batch_size, self.hidden_size)), mstype.float32)
|
||||
self.dynamicRNN_c = Tensor(np.zeros((1, self.batch_size, self.hidden_size)), mstype.float32)
|
||||
self.cast = P.Cast()
|
||||
self.is_ascend = context.get_context("device_target") == "Ascend"
|
||||
if self.is_ascend:
|
||||
self.compute_type = mstype.float16
|
||||
self.rnn = P.DynamicRNN()
|
||||
else:
|
||||
self.compute_type = mstype.float32
|
||||
self.lstm = nn.LSTM(self.input_size,
|
||||
self.hidden_size,
|
||||
num_layers=1,
|
||||
has_bias=True,
|
||||
batch_first=False,
|
||||
dropout=0,
|
||||
bidirectional=False)
|
||||
|
||||
def construct(self, x, init_h=None, init_c=None):
|
||||
w = self.cast(self.dynamicRNN_w, mstype.float16)
|
||||
b = self.cast(self.dynamicRNN_b, mstype.float16)
|
||||
"""DynamicRNNCell Network."""
|
||||
if init_h is None or init_c is None:
|
||||
init_h = self.cast(self.dynamicRNN_h, mstype.float16)
|
||||
init_c = self.cast(self.dynamicRNN_c, mstype.float16)
|
||||
out = self.rnn(x, w, b, None, init_h, init_c)
|
||||
return out[0], out[1], out[2]
|
||||
init_h = self.cast(self.dynamicRNN_h, self.compute_type)
|
||||
init_c = self.cast(self.dynamicRNN_c, self.compute_type)
|
||||
if self.is_ascend:
|
||||
w = self.cast(self.dynamicRNN_w, self.compute_type)
|
||||
b = self.cast(self.dynamicRNN_b, self.compute_type)
|
||||
output, hn, cn = self.rnn(x, w, b, None, init_h, init_c)
|
||||
else:
|
||||
output, (hn, cn) = self.lstm(x, (init_h, init_c))
|
||||
return output, hn, cn
|
||||
|
||||
|
||||
class DynamicRNNNet(nn.Cell):
|
||||
|
@ -94,13 +111,18 @@ class DynamicRNNNet(nn.Cell):
|
|||
batch_size=batchsize,
|
||||
word_embed_dim=word_embed_dim,
|
||||
hidden_size=hidden_size)
|
||||
self.is_ascend = context.get_context("device_target") == "Ascend"
|
||||
if self.is_ascend:
|
||||
self.compute_type = mstype.float16
|
||||
else:
|
||||
self.compute_type = mstype.float32
|
||||
|
||||
def construct(self, inputs, init_state=None):
|
||||
"""DynamicRNN Network."""
|
||||
inputs = self.cast(inputs, mstype.float16)
|
||||
inputs = self.cast(inputs, self.compute_type)
|
||||
if init_state is not None:
|
||||
init_h = self.cast(init_state[0:1, :, :], mstype.float16)
|
||||
init_c = self.cast(init_state[-1:, :, :], mstype.float16)
|
||||
init_h = self.cast(init_state[0:1, :, :], self.compute_type)
|
||||
init_c = self.cast(init_state[-1:, :, :], self.compute_type)
|
||||
out, state_h, state_c = self.net(inputs, init_h, init_c)
|
||||
else:
|
||||
out, state_h, state_c = self.net(inputs)
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
# Copyright 2020 Huawei Technologies Co., Ltd
|
||||
# Copyright 2020-2021 Huawei Technologies Co., Ltd
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
|
@ -14,14 +14,13 @@
|
|||
# ============================================================================
|
||||
"""Infer api."""
|
||||
import time
|
||||
|
||||
import numpy as np
|
||||
|
||||
import mindspore.nn as nn
|
||||
import mindspore.common.dtype as mstype
|
||||
from mindspore.common.tensor import Tensor
|
||||
from mindspore.ops import operations as P
|
||||
from mindspore import context, Parameter
|
||||
from mindspore import Parameter
|
||||
from mindspore.train.model import Model
|
||||
|
||||
from src.dataset import load_dataset
|
||||
|
@ -29,13 +28,6 @@ from .gnmt import GNMT
|
|||
from ..utils import zero_weight
|
||||
from ..utils.load_weights import load_infer_weights
|
||||
|
||||
context.set_context(
|
||||
mode=context.GRAPH_MODE,
|
||||
save_graphs=False,
|
||||
device_target="Ascend",
|
||||
reserve_class_name_in_scope=False)
|
||||
|
||||
|
||||
class GNMTInferCell(nn.Cell):
|
||||
"""
|
||||
Encapsulation class of GNMT network infer.
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
# Copyright 2020 Huawei Technologies Co., Ltd
|
||||
# Copyright 2020-2021 Huawei Technologies Co., Ltd
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
|
@ -26,7 +26,7 @@ from mindspore.train.loss_scale_manager import DynamicLossScaleManager
|
|||
from mindspore.train.callback import CheckpointConfig, ModelCheckpoint, SummaryCollector, TimeMonitor
|
||||
from mindspore import context, Parameter
|
||||
from mindspore.context import ParallelMode
|
||||
from mindspore.communication import management as MultiAscend
|
||||
from mindspore.communication import management as MultiDevice
|
||||
from mindspore.train.serialization import load_checkpoint
|
||||
from mindspore.common import set_seed
|
||||
|
||||
|
@ -63,7 +63,7 @@ def _train(model, config,
|
|||
epoch_size = pre_training_dataset.get_repeat_count()
|
||||
print("epoch size ", epoch_size)
|
||||
if os.getenv("RANK_SIZE") is not None and int(os.getenv("RANK_SIZE")) > 1:
|
||||
print(f" | Rank {MultiAscend.get_rank()} Call model train.")
|
||||
print(f" | Rank {MultiDevice.get_rank()} Call model train.")
|
||||
model.train(config.epochs, pre_training_dataset,
|
||||
callbacks=callbacks, dataset_sink_mode=config.dataset_sink_mode)
|
||||
|
||||
|
@ -203,10 +203,10 @@ def _build_training_pipeline(config,
|
|||
|
||||
rank_size = os.getenv('RANK_SIZE')
|
||||
callbacks = [time_cb, loss_monitor]
|
||||
if rank_size is not None and int(rank_size) > 1 and MultiAscend.get_rank() % 8 == 0:
|
||||
if rank_size is not None and int(rank_size) > 1 and MultiDevice.get_rank() % 8 == 0:
|
||||
ckpt_callback = ModelCheckpoint(
|
||||
prefix=config.ckpt_prefix,
|
||||
directory=os.path.join(config.ckpt_path, 'ckpt_{}'.format(os.getenv('DEVICE_ID'))),
|
||||
directory=os.path.join(config.ckpt_path, 'ckpt_{}'.format(MultiDevice.get_rank())),
|
||||
config=ckpt_config)
|
||||
callbacks.append(ckpt_callback)
|
||||
summary_callback = SummaryCollector(summary_dir="./summary", collect_freq=50)
|
||||
|
@ -215,7 +215,7 @@ def _build_training_pipeline(config,
|
|||
if rank_size is None or int(rank_size) == 1:
|
||||
ckpt_callback = ModelCheckpoint(
|
||||
prefix=config.ckpt_prefix,
|
||||
directory=os.path.join(config.ckpt_path, 'ckpt_{}'.format(os.getenv('DEVICE_ID'))),
|
||||
directory=os.path.join(config.ckpt_path, 'ckpt_{}'.format(config.device_id)),
|
||||
config=ckpt_config)
|
||||
callbacks.append(ckpt_callback)
|
||||
summary_callback = SummaryCollector(summary_dir="./summary", collect_freq=50)
|
||||
|
@ -231,10 +231,10 @@ def _build_training_pipeline(config,
|
|||
|
||||
def _setup_parallel_env():
|
||||
context.reset_auto_parallel_context()
|
||||
MultiAscend.init()
|
||||
MultiDevice.init()
|
||||
context.set_auto_parallel_context(
|
||||
parallel_mode=ParallelMode.DATA_PARALLEL,
|
||||
device_num=MultiAscend.get_group_size(),
|
||||
device_num=MultiDevice.get_group_size(),
|
||||
gradients_mean=True
|
||||
)
|
||||
|
||||
|
@ -253,22 +253,22 @@ def train_parallel(config):
|
|||
data_files=config.pre_train_dataset,
|
||||
batch_size=config.batch_size,
|
||||
sink_mode=config.dataset_sink_mode,
|
||||
rank_size=MultiAscend.get_group_size(),
|
||||
rank_id=MultiAscend.get_rank()
|
||||
rank_size=MultiDevice.get_group_size(),
|
||||
rank_id=MultiDevice.get_rank()
|
||||
) if config.pre_train_dataset else None
|
||||
fine_tune_dataset = load_dataset(
|
||||
data_files=config.fine_tune_dataset,
|
||||
batch_size=config.batch_size,
|
||||
sink_mode=config.dataset_sink_mode,
|
||||
rank_size=MultiAscend.get_group_size(),
|
||||
rank_id=MultiAscend.get_rank()
|
||||
rank_size=MultiDevice.get_group_size(),
|
||||
rank_id=MultiDevice.get_rank()
|
||||
) if config.fine_tune_dataset else None
|
||||
test_dataset = load_dataset(
|
||||
data_files=config.test_dataset,
|
||||
batch_size=config.batch_size,
|
||||
sink_mode=config.dataset_sink_mode,
|
||||
rank_size=MultiAscend.get_group_size(),
|
||||
rank_id=MultiAscend.get_rank()
|
||||
rank_size=MultiDevice.get_group_size(),
|
||||
rank_id=MultiDevice.get_rank()
|
||||
) if config.test_dataset else None
|
||||
|
||||
_build_training_pipeline(config=config,
|
||||
|
@ -359,17 +359,12 @@ def modelarts_pre_process():
|
|||
@moxing_wrapper(pre_process=modelarts_pre_process)
|
||||
def run_train():
|
||||
'''run train.'''
|
||||
device_id = os.getenv('DEVICE_ID', None)
|
||||
if device_id is None:
|
||||
raise RuntimeError("`DEVICE_ID` can not be None.")
|
||||
|
||||
device_id = int(device_id)
|
||||
context.set_context(mode=context.GRAPH_MODE, save_graphs=False, device_target="Ascend",
|
||||
reserve_class_name_in_scope=True, device_id=device_id)
|
||||
_rank_size = os.getenv('RANK_SIZE')
|
||||
|
||||
_config = get_config(default_config)
|
||||
_config.pre_train_dataset = default_config.pre_train_dataset
|
||||
|
||||
context.set_context(mode=context.GRAPH_MODE, save_graphs=False, device_target=_config.device_target,
|
||||
reserve_class_name_in_scope=True, device_id=_config.device_id)
|
||||
_rank_size = os.getenv('RANK_SIZE')
|
||||
set_seed(_config.random_seed)
|
||||
if _rank_size is not None and int(_rank_size) > 1:
|
||||
train_parallel(_config)
|
||||
|
|
Loading…
Reference in New Issue