cloud
This commit is contained in:
parent
fb6ec96862
commit
3713ef59bb
|
@ -19,8 +19,8 @@ ann_file: "./annotations/instances_val2017.json"
|
||||||
modelarts_dataset_unzip_name: 'cocodataset'
|
modelarts_dataset_unzip_name: 'cocodataset'
|
||||||
need_modelarts_dataset_unzip: True
|
need_modelarts_dataset_unzip: True
|
||||||
|
|
||||||
img_path: '' # "image file path."
|
img_path: ''
|
||||||
result_path: '' # "result file path."
|
result_path: ''
|
||||||
|
|
||||||
# Training options
|
# Training options
|
||||||
img_width: 1280
|
img_width: 1280
|
||||||
|
@ -30,10 +30,15 @@ flip_ratio: 0.5
|
||||||
expand_ratio: 1.0
|
expand_ratio: 1.0
|
||||||
|
|
||||||
max_instance_count: 128
|
max_instance_count: 128
|
||||||
mask_shape: (28, 28)
|
mask_shape: [28, 28]
|
||||||
|
|
||||||
# anchor
|
# anchor
|
||||||
feature_shapes: [(192, 320), (96, 160), (48, 80), (24, 40), (12, 20)]
|
feature_shapes:
|
||||||
|
- [192, 320]
|
||||||
|
- [96, 160]
|
||||||
|
- [48, 80]
|
||||||
|
- [24, 40]
|
||||||
|
- [12, 20]
|
||||||
anchor_scales: [8]
|
anchor_scales: [8]
|
||||||
anchor_ratios: [0.5, 1.0, 2.0]
|
anchor_ratios: [0.5, 1.0, 2.0]
|
||||||
anchor_strides: [4, 8, 16, 32, 64]
|
anchor_strides: [4, 8, 16, 32, 64]
|
||||||
|
@ -72,7 +77,7 @@ activate_num_classes: 2
|
||||||
use_sigmoid_cls: True
|
use_sigmoid_cls: True
|
||||||
|
|
||||||
# roi_align
|
# roi_align
|
||||||
roi_layer: dict(type='RoIAlign', out_size=7, mask_out_size=14, sample_num=2)
|
roi_layer: {type: 'RoIAlign', out_size: 7, mask_out_size: 14, sample_num: 2}
|
||||||
roi_align_out_channels: 256
|
roi_align_out_channels: 256
|
||||||
roi_align_featmap_strides: [4, 8, 16, 32]
|
roi_align_featmap_strides: [4, 8, 16, 32]
|
||||||
roi_align_finest_scale: 56
|
roi_align_finest_scale: 56
|
||||||
|
@ -127,7 +132,7 @@ base_lr: 0.02
|
||||||
base_step: 58633
|
base_step: 58633
|
||||||
total_epoch: 13
|
total_epoch: 13
|
||||||
warmup_step: 500
|
warmup_step: 500
|
||||||
warmup_ratio: 1/3.0
|
warmup_ratio: 0.333333
|
||||||
sgd_momentum: 0.9
|
sgd_momentum: 0.9
|
||||||
|
|
||||||
# train
|
# train
|
||||||
|
@ -142,11 +147,11 @@ save_checkpoint_epochs: 1
|
||||||
keep_checkpoint_max: 12
|
keep_checkpoint_max: 12
|
||||||
save_checkpoint_path: "./"
|
save_checkpoint_path: "./"
|
||||||
|
|
||||||
mindrecord_dir: "./MindRecord_COCO" # "/home/mask_rcnn/MindRecord_COCO2017_Train"
|
mindrecord_dir: "./MindRecord_COCO"
|
||||||
train_data_type: "train2017"
|
train_data_type: "train2017"
|
||||||
val_data_type: "val2017"
|
val_data_type: "val2017"
|
||||||
instance_set: "annotations/instances_{}.json"
|
instance_set: "annotations/instances_{}.json"
|
||||||
coco_classes: ('background', 'person', 'bicycle', 'car', 'motorcycle', 'airplane', 'bus',
|
coco_classes: ['background', 'person', 'bicycle', 'car', 'motorcycle', 'airplane', 'bus',
|
||||||
'train', 'truck', 'boat', 'traffic light', 'fire hydrant',
|
'train', 'truck', 'boat', 'traffic light', 'fire hydrant',
|
||||||
'stop sign', 'parking meter', 'bench', 'bird', 'cat', 'dog',
|
'stop sign', 'parking meter', 'bench', 'bird', 'cat', 'dog',
|
||||||
'horse', 'sheep', 'cow', 'elephant', 'bear', 'zebra',
|
'horse', 'sheep', 'cow', 'elephant', 'bear', 'zebra',
|
||||||
|
@ -160,7 +165,7 @@ coco_classes: ('background', 'person', 'bicycle', 'car', 'motorcycle', 'airplane
|
||||||
'dining table', 'toilet', 'tv', 'laptop', 'mouse', 'remote',
|
'dining table', 'toilet', 'tv', 'laptop', 'mouse', 'remote',
|
||||||
'keyboard', 'cell phone', 'microwave', 'oven', 'toaster', 'sink',
|
'keyboard', 'cell phone', 'microwave', 'oven', 'toaster', 'sink',
|
||||||
'refrigerator', 'book', 'clock', 'vase', 'scissors',
|
'refrigerator', 'book', 'clock', 'vase', 'scissors',
|
||||||
'teddy bear', 'hair drier', 'toothbrush')
|
'teddy bear', 'hair drier', 'toothbrush']
|
||||||
num_classes: 81
|
num_classes: 81
|
||||||
|
|
||||||
only_create_dataset: False
|
only_create_dataset: False
|
||||||
|
@ -173,7 +178,6 @@ device_num: 1
|
||||||
rank_id: 0
|
rank_id: 0
|
||||||
|
|
||||||
# maskrcnn export
|
# maskrcnn export
|
||||||
batch_size_export: 2
|
|
||||||
file_name: "maskrcnn"
|
file_name: "maskrcnn"
|
||||||
file_format: "AIR"
|
file_format: "AIR"
|
||||||
ckpt_file: '/cache/data/cocodataset/ckpt_maskrcnn/mask_rcnn-12_7393.ckpt'
|
ckpt_file: '/cache/data/cocodataset/ckpt_maskrcnn/mask_rcnn-12_7393.ckpt'
|
||||||
|
|
|
@ -16,7 +16,6 @@
|
||||||
"""Evaluation for MaskRcnn"""
|
"""Evaluation for MaskRcnn"""
|
||||||
import os
|
import os
|
||||||
import time
|
import time
|
||||||
import re
|
|
||||||
import numpy as np
|
import numpy as np
|
||||||
|
|
||||||
from src.model_utils.config import config
|
from src.model_utils.config import config
|
||||||
|
@ -34,13 +33,6 @@ from mindspore.common import set_seed
|
||||||
|
|
||||||
set_seed(1)
|
set_seed(1)
|
||||||
|
|
||||||
lss = [int(re.findall(r'[0-9]+', i)[0]) for i in config.feature_shapes]
|
|
||||||
config.feature_shapes = [(lss[2*i], lss[2*i+1]) for i in range(int(len(lss)/2))]
|
|
||||||
config.roi_layer = dict(type='RoIAlign', out_size=7, mask_out_size=14, sample_num=2)
|
|
||||||
config.warmup_ratio = 1/3.0
|
|
||||||
config.mask_shape = (28, 28)
|
|
||||||
|
|
||||||
|
|
||||||
def maskrcnn_eval(dataset_path, ckpt_path, ann_file):
|
def maskrcnn_eval(dataset_path, ckpt_path, ann_file):
|
||||||
"""MaskRcnn evaluation."""
|
"""MaskRcnn evaluation."""
|
||||||
ds = create_maskrcnn_dataset(dataset_path, batch_size=config.test_batch_size, is_training=False)
|
ds = create_maskrcnn_dataset(dataset_path, batch_size=config.test_batch_size, is_training=False)
|
||||||
|
|
|
@ -14,7 +14,6 @@
|
||||||
# ============================================================================
|
# ============================================================================
|
||||||
"""export checkpoint file into air, onnx, mindir models"""
|
"""export checkpoint file into air, onnx, mindir models"""
|
||||||
|
|
||||||
import re
|
|
||||||
import numpy as np
|
import numpy as np
|
||||||
from src.model_utils.config import config
|
from src.model_utils.config import config
|
||||||
from src.model_utils.device_adapter import get_device_id
|
from src.model_utils.device_adapter import get_device_id
|
||||||
|
@ -23,15 +22,6 @@ from src.maskrcnn.mask_rcnn_r50 import MaskRcnn_Infer
|
||||||
from mindspore import Tensor, context, load_checkpoint, load_param_into_net, export
|
from mindspore import Tensor, context, load_checkpoint, load_param_into_net, export
|
||||||
|
|
||||||
|
|
||||||
lss = [int(re.findall(r'[0-9]+', i)[0]) for i in config.feature_shapes]
|
|
||||||
config.feature_shapes = [(lss[2*i], lss[2*i+1]) for i in range(int(len(lss)/2))]
|
|
||||||
config.roi_layer = dict(type='RoIAlign', out_size=7, mask_out_size=14, sample_num=2)
|
|
||||||
config.warmup_ratio = 1/3.0
|
|
||||||
config.mask_shape = (28, 28)
|
|
||||||
train_cls = [i for i in re.findall(r'[a-zA-Z\s]+', config.coco_classes) if i != ' ']
|
|
||||||
config.coco_classes = np.array(train_cls)
|
|
||||||
config.batch_size = config.batch_size_export
|
|
||||||
|
|
||||||
if not config.enable_modelarts:
|
if not config.enable_modelarts:
|
||||||
config.ckpt_file = config.ckpt_file_local
|
config.ckpt_file = config.ckpt_file_local
|
||||||
|
|
||||||
|
@ -45,6 +35,7 @@ def modelarts_process():
|
||||||
@moxing_wrapper(pre_process=modelarts_process)
|
@moxing_wrapper(pre_process=modelarts_process)
|
||||||
def export_maskrcnn():
|
def export_maskrcnn():
|
||||||
""" export_maskrcnn """
|
""" export_maskrcnn """
|
||||||
|
config.test_batch_size = config.batch_size
|
||||||
net = MaskRcnn_Infer(config=config)
|
net = MaskRcnn_Infer(config=config)
|
||||||
param_dict = load_checkpoint(config.ckpt_file)
|
param_dict = load_checkpoint(config.ckpt_file)
|
||||||
|
|
||||||
|
|
|
@ -17,7 +17,6 @@
|
||||||
from __future__ import division
|
from __future__ import division
|
||||||
|
|
||||||
import os
|
import os
|
||||||
import re
|
|
||||||
import numpy as np
|
import numpy as np
|
||||||
from numpy import random
|
from numpy import random
|
||||||
import cv2
|
import cv2
|
||||||
|
@ -29,8 +28,6 @@ from mindspore.mindrecord import FileWriter
|
||||||
from .model_utils.config import config
|
from .model_utils.config import config
|
||||||
|
|
||||||
|
|
||||||
config.mask_shape = (28, 28)
|
|
||||||
|
|
||||||
def bbox_overlaps(bboxes1, bboxes2, mode='iou'):
|
def bbox_overlaps(bboxes1, bboxes2, mode='iou'):
|
||||||
"""Calculate the ious between each bbox of bboxes1 and bboxes2.
|
"""Calculate the ious between each bbox of bboxes1 and bboxes2.
|
||||||
|
|
||||||
|
@ -390,10 +387,7 @@ def create_coco_label(is_training):
|
||||||
data_type = config.train_data_type
|
data_type = config.train_data_type
|
||||||
|
|
||||||
# Classes need to train or test.
|
# Classes need to train or test.
|
||||||
# train_cls = config.coco_classes
|
train_cls = config.coco_classes
|
||||||
train_cls = [i for i in re.findall(r'[a-zA-Z\s]+', config.coco_classes) if i != ' ']
|
|
||||||
train_cls = np.array(train_cls)
|
|
||||||
print(train_cls)
|
|
||||||
|
|
||||||
train_cls_dict = {}
|
train_cls_dict = {}
|
||||||
for i, cls in enumerate(train_cls):
|
for i, cls in enumerate(train_cls):
|
||||||
|
|
|
@ -114,7 +114,7 @@ class RcnnCls(nn.Cell):
|
||||||
self.train_batch_size = batch_size
|
self.train_batch_size = batch_size
|
||||||
self.test_batch_size = cfg.test_batch_size
|
self.test_batch_size = cfg.test_batch_size
|
||||||
|
|
||||||
self.fpn_cls = FpnCls(self.in_channels, self.rcnn_fc_out_channels, self.num_classes, cfg.roi_layer["out_size"])
|
self.fpn_cls = FpnCls(self.in_channels, self.rcnn_fc_out_channels, self.num_classes, cfg.roi_layer.out_size)
|
||||||
self.relu = P.ReLU()
|
self.relu = P.ReLU()
|
||||||
self.logicaland = P.LogicalAnd()
|
self.logicaland = P.LogicalAnd()
|
||||||
self.loss_cls = P.SoftmaxCrossEntropyWithLogits()
|
self.loss_cls = P.SoftmaxCrossEntropyWithLogits()
|
||||||
|
|
|
@ -88,9 +88,9 @@ class SingleRoIExtractor(nn.Cell):
|
||||||
self.out_channels = out_channels
|
self.out_channels = out_channels
|
||||||
self.featmap_strides = featmap_strides
|
self.featmap_strides = featmap_strides
|
||||||
self.num_levels = len(self.featmap_strides)
|
self.num_levels = len(self.featmap_strides)
|
||||||
self.out_size = roi_layer['mask_out_size'] if mask else roi_layer['out_size']
|
self.out_size = config.roi_layer.mask_out_size if mask else config.roi_layer.out_size
|
||||||
self.mask = mask
|
self.mask = mask
|
||||||
self.sample_num = roi_layer['sample_num']
|
self.sample_num = config.roi_layer.sample_num
|
||||||
self.roi_layers = self.build_roi_layers(self.featmap_strides)
|
self.roi_layers = self.build_roi_layers(self.featmap_strides)
|
||||||
self.roi_layers = L.CellList(self.roi_layers)
|
self.roi_layers = L.CellList(self.roi_layers)
|
||||||
|
|
||||||
|
|
|
@ -17,7 +17,6 @@
|
||||||
|
|
||||||
import time
|
import time
|
||||||
import os
|
import os
|
||||||
import re
|
|
||||||
|
|
||||||
from src.model_utils.config import config
|
from src.model_utils.config import config
|
||||||
from src.model_utils.moxing_adapter import moxing_wrapper
|
from src.model_utils.moxing_adapter import moxing_wrapper
|
||||||
|
@ -40,12 +39,6 @@ from mindspore.communication.management import get_rank, get_group_size
|
||||||
|
|
||||||
set_seed(1)
|
set_seed(1)
|
||||||
|
|
||||||
lss = [int(re.findall(r'[0-9]+', i)[0]) for i in config.feature_shapes]
|
|
||||||
config.feature_shapes = [(lss[2*i], lss[2*i+1]) for i in range(int(len(lss)/2))]
|
|
||||||
config.roi_layer = dict(type='RoIAlign', out_size=7, mask_out_size=14, sample_num=2)
|
|
||||||
config.warmup_ratio = 1/3.0
|
|
||||||
config.mask_shape = (28, 28)
|
|
||||||
|
|
||||||
def modelarts_pre_process():
|
def modelarts_pre_process():
|
||||||
def unzip(zip_file, save_dir):
|
def unzip(zip_file, save_dir):
|
||||||
import zipfile
|
import zipfile
|
||||||
|
|
|
@ -189,7 +189,6 @@ python eval.py > eval.log 2>&1 &
|
||||||
├─__init__.py
|
├─__init__.py
|
||||||
├─beam_search.py
|
├─beam_search.py
|
||||||
├─dataset.py
|
├─dataset.py
|
||||||
├─eval_config.py
|
|
||||||
├─lr_schedule.py
|
├─lr_schedule.py
|
||||||
├─process_output.py
|
├─process_output.py
|
||||||
├─tokenization.py
|
├─tokenization.py
|
||||||
|
@ -244,15 +243,12 @@ options:
|
||||||
#### Running Options
|
#### Running Options
|
||||||
|
|
||||||
```text
|
```text
|
||||||
config.py:
|
default_config.yaml:
|
||||||
transformer_network version of Transformer model: base | large, default is large
|
transformer_network version of Transformer model: base | large, default is large
|
||||||
init_loss_scale_value initial value of loss scale: N, default is 2^10
|
init_loss_scale_value initial value of loss scale: N, default is 2^10
|
||||||
scale_factor factor used to update loss scale: N, default is 2
|
scale_factor factor used to update loss scale: N, default is 2
|
||||||
scale_window number of steps between updates of loss scale: N, default is 2000
|
scale_window number of steps between updates of loss scale: N, default is 2000
|
||||||
optimizer optimizer used in the network: Adam, default is "Adam"
|
optimizer optimizer used in the network: Adam, default is "Adam"
|
||||||
|
|
||||||
eval_config.py:
|
|
||||||
transformer_network version of Transformer model: base | large, default is large
|
|
||||||
data_file data file: PATH
|
data_file data file: PATH
|
||||||
model_file checkpoint file to be loaded: PATH
|
model_file checkpoint file to be loaded: PATH
|
||||||
output_file output file of evaluation: PATH
|
output_file output file of evaluation: PATH
|
||||||
|
@ -313,7 +309,7 @@ Parameters for learning rate:
|
||||||
|
|
||||||
## [Training Process](#contents)
|
## [Training Process](#contents)
|
||||||
|
|
||||||
- Set options in `config.py`, including loss_scale, learning rate and network hyperparameters. Click [here](https://www.mindspore.cn/tutorial/training/zh-CN/master/use/data_preparation.html) for more information about the dataset.
|
- Set options in `default_config.yaml`, including loss_scale, learning rate and network hyperparameters. Click [here](https://www.mindspore.cn/tutorial/training/zh-CN/master/use/data_preparation.html) for more information about the dataset.
|
||||||
|
|
||||||
- Run `run_standalone_train.sh` for non-distributed training of Transformer model.
|
- Run `run_standalone_train.sh` for non-distributed training of Transformer model.
|
||||||
|
|
||||||
|
@ -331,7 +327,7 @@ Parameters for learning rate:
|
||||||
|
|
||||||
## [Evaluation Process](#contents)
|
## [Evaluation Process](#contents)
|
||||||
|
|
||||||
- Set options in `eval_config.py`. Make sure 'data_file', 'model_file' and 'output_file' are set to your own paths.
|
- Set options in `default_config.yaml`. Make sure 'data_file', 'model_file' and 'output_file' are set to your own paths.
|
||||||
|
|
||||||
- Run `eval.py` for evaluation of Transformer model.
|
- Run `eval.py` for evaluation of Transformer model.
|
||||||
|
|
||||||
|
@ -422,7 +418,7 @@ There are three random situations:
|
||||||
- Initialization of some model weights.
|
- Initialization of some model weights.
|
||||||
- Dropout operations.
|
- Dropout operations.
|
||||||
|
|
||||||
Some seeds have already been set in train.py to avoid the randomness of dataset shuffle and weight initialization. If you want to disable dropout, please set the corresponding dropout_prob parameter to 0 in src/config.py.
|
Some seeds have already been set in train.py to avoid the randomness of dataset shuffle and weight initialization. If you want to disable dropout, please set the corresponding dropout_prob parameter to 0 in default_config.yaml.
|
||||||
|
|
||||||
## [ModelZoo Homepage](#contents)
|
## [ModelZoo Homepage](#contents)
|
||||||
|
|
||||||
|
|
|
@ -195,7 +195,6 @@ python eval.py > eval.log 2>&1 &
|
||||||
├─__init__.py
|
├─__init__.py
|
||||||
├─beam_search.py
|
├─beam_search.py
|
||||||
├─dataset.py
|
├─dataset.py
|
||||||
├─eval_config.py
|
|
||||||
├─lr_schedule.py
|
├─lr_schedule.py
|
||||||
├─process_output.py
|
├─process_output.py
|
||||||
├─tokenization.py
|
├─tokenization.py
|
||||||
|
@ -250,15 +249,12 @@ options:
|
||||||
#### 运行选项
|
#### 运行选项
|
||||||
|
|
||||||
```text
|
```text
|
||||||
config.py:
|
default_config.yaml:
|
||||||
transformer_network version of Transformer model: base | large, default is large
|
transformer_network version of Transformer model: base | large, default is large
|
||||||
init_loss_scale_value initial value of loss scale: N, default is 2^10
|
init_loss_scale_value initial value of loss scale: N, default is 2^10
|
||||||
scale_factor factor used to update loss scale: N, default is 2
|
scale_factor factor used to update loss scale: N, default is 2
|
||||||
scale_window number of steps between updates of loss scale: N, default is 2000
|
scale_window number of steps between updates of loss scale: N, default is 2000
|
||||||
optimizer optimizer used in the network: Adam, default is "Adam"
|
optimizer optimizer used in the network: Adam, default is "Adam"
|
||||||
|
|
||||||
eval_config.py:
|
|
||||||
transformer_network version of Transformer model: base | large, default is large
|
|
||||||
data_file data file: PATH
|
data_file data file: PATH
|
||||||
model_file checkpoint file to be loaded: PATH
|
model_file checkpoint file to be loaded: PATH
|
||||||
output_file output file of evaluation: PATH
|
output_file output file of evaluation: PATH
|
||||||
|
@ -320,7 +316,7 @@ Parameters for learning rate:
|
||||||
|
|
||||||
### 训练过程
|
### 训练过程
|
||||||
|
|
||||||
- 在`config.py`中设置选项,包括loss_scale、学习率和网络超参数。点击[这里](https://www.mindspore.cn/tutorial/training/zh-CN/master/use/data_preparation.html)查看更多数据集信息。
|
- 在`default_config.yaml`中设置选项,包括loss_scale、学习率和网络超参数。点击[这里](https://www.mindspore.cn/tutorial/training/zh-CN/master/use/data_preparation.html)查看更多数据集信息。
|
||||||
|
|
||||||
- 运行`run_standalone_train.sh`,进行Transformer模型的非分布式训练。
|
- 运行`run_standalone_train.sh`,进行Transformer模型的非分布式训练。
|
||||||
|
|
||||||
|
@ -338,7 +334,7 @@ Parameters for learning rate:
|
||||||
|
|
||||||
### 评估过程
|
### 评估过程
|
||||||
|
|
||||||
- 在`eval_config.py`中设置选项。确保已设置了'data_file'、'model_file'和'output_file'文件路径。
|
- 在`default_config.yaml`中设置选项。确保已设置了'data_file'、'model_file'和'output_file'文件路径。
|
||||||
|
|
||||||
- 运行`eval.py`,评估Transformer模型。
|
- 运行`eval.py`,评估Transformer模型。
|
||||||
|
|
||||||
|
@ -429,7 +425,7 @@ bash run_infer_310.sh [MINDIR_PATH] [NEED_PREPROCESS] [DEVICE_ID]
|
||||||
- 初始化部分模型权重
|
- 初始化部分模型权重
|
||||||
- 随机失活运行
|
- 随机失活运行
|
||||||
|
|
||||||
train.py已经设置了一些种子,避免数据集混洗和权重初始化的随机性。若需关闭随机失活,将src/config.py中相应的dropout_prob参数设置为0。
|
train.py已经设置了一些种子,避免数据集混洗和权重初始化的随机性。若需关闭随机失活,将default_config.yaml中相应的dropout_prob参数设置为0。
|
||||||
|
|
||||||
## ModelZoo主页
|
## ModelZoo主页
|
||||||
|
|
||||||
|
|
|
@ -16,8 +16,8 @@
|
||||||
if [ $# != 5 ] ; then
|
if [ $# != 5 ] ; then
|
||||||
echo "=============================================================================================================="
|
echo "=============================================================================================================="
|
||||||
echo "Please run the script as: "
|
echo "Please run the script as: "
|
||||||
echo "sh run_distribute_pretrain.sh DEVICE_NUM EPOCH_SIZE DATA_PATH RANK_TABLE_FILE CONFIG_PATH"
|
echo "sh run_distribute_train_ascend.sh DEVICE_NUM EPOCH_SIZE DATA_PATH RANK_TABLE_FILE CONFIG_PATH"
|
||||||
echo "for example: sh run_distribute_pretrain.sh 8 52 /path/ende-l128-mindrecord00 /path/hccl.json ./default_config_large.yaml"
|
echo "for example: sh run_distribute_train_ascend.sh 8 52 /path/ende-l128-mindrecord00 /path/hccl.json ./default_config_large.yaml"
|
||||||
echo "It is better to use absolute path."
|
echo "It is better to use absolute path."
|
||||||
echo "=============================================================================================================="
|
echo "=============================================================================================================="
|
||||||
exit 1;
|
exit 1;
|
||||||
|
|
|
@ -64,8 +64,7 @@ do
|
||||||
--checkpoint_path="" \
|
--checkpoint_path="" \
|
||||||
--save_checkpoint_steps=2500 \
|
--save_checkpoint_steps=2500 \
|
||||||
--save_checkpoint_num=30 \
|
--save_checkpoint_num=30 \
|
||||||
--data_path=$DATA_PATH \
|
--data_path=$DATA_PATH > log.txt 2>&1 &
|
||||||
--bucket_boundaries=[16,32,48,64,128] > log.txt 2>&1 &
|
|
||||||
cd ../
|
cd ../
|
||||||
done
|
done
|
||||||
cd ..
|
cd ..
|
|
@ -17,7 +17,7 @@ if [ $# != 4 ] ; then
|
||||||
echo "=============================================================================================================="
|
echo "=============================================================================================================="
|
||||||
echo "Please run the script as: "
|
echo "Please run the script as: "
|
||||||
echo "sh run_distribute_train_gpu.sh DEVICE_NUM EPOCH_SIZE DATA_PATH CONFIG_PATH"
|
echo "sh run_distribute_train_gpu.sh DEVICE_NUM EPOCH_SIZE DATA_PATH CONFIG_PATH"
|
||||||
echo "for example: sh run_distribute_pretrain.sh 8 55 /path/ende-l128-mindrecord00 ./default_config_large_gpu.yaml"
|
echo "for example: sh run_distribute_train_gpu.sh 8 55 /path/ende-l128-mindrecord00 ./default_config_large_gpu.yaml"
|
||||||
echo "It is better to use absolute path."
|
echo "It is better to use absolute path."
|
||||||
echo "=============================================================================================================="
|
echo "=============================================================================================================="
|
||||||
exit 1;
|
exit 1;
|
||||||
|
@ -47,5 +47,4 @@ mpirun -n $RANK_SIZE \
|
||||||
--checkpoint_path="" \
|
--checkpoint_path="" \
|
||||||
--save_checkpoint_steps=2500 \
|
--save_checkpoint_steps=2500 \
|
||||||
--save_checkpoint_num=30 \
|
--save_checkpoint_num=30 \
|
||||||
--data_path=$DATA_PATH \
|
--data_path=$DATA_PATH > log.txt 2>&1 &
|
||||||
--bucket_boundaries=[16,32,48,64,128] > log.txt 2>&1 &
|
|
||||||
|
|
|
@ -48,8 +48,7 @@ if [ $DEVICE_TARGET == 'Ascend' ];then
|
||||||
--checkpoint_path="" \
|
--checkpoint_path="" \
|
||||||
--save_checkpoint_steps=2500 \
|
--save_checkpoint_steps=2500 \
|
||||||
--save_checkpoint_num=30 \
|
--save_checkpoint_num=30 \
|
||||||
--data_path=$DATA_PATH \
|
--data_path=$DATA_PATH > log.txt 2>&1 &
|
||||||
--bucket_boundaries=[16,32,48,64,128] > log.txt 2>&1 &
|
|
||||||
elif [ $DEVICE_TARGET == 'GPU' ];then
|
elif [ $DEVICE_TARGET == 'GPU' ];then
|
||||||
export CUDA_VISIBLE_DEVICES="$2"
|
export CUDA_VISIBLE_DEVICES="$2"
|
||||||
|
|
||||||
|
@ -64,8 +63,7 @@ elif [ $DEVICE_TARGET == 'GPU' ];then
|
||||||
--checkpoint_path="" \
|
--checkpoint_path="" \
|
||||||
--save_checkpoint_steps=2500 \
|
--save_checkpoint_steps=2500 \
|
||||||
--save_checkpoint_num=30 \
|
--save_checkpoint_num=30 \
|
||||||
--data_path=$DATA_PATH \
|
--data_path=$DATA_PATH > log.txt 2>&1 &
|
||||||
--bucket_boundaries=[16,32,48,64,128] > log.txt 2>&1 &
|
|
||||||
else
|
else
|
||||||
echo "Not supported device target."
|
echo "Not supported device target."
|
||||||
fi
|
fi
|
||||||
|
|
Loading…
Reference in New Issue