fix posenet gradient overflow

zhaojichen 2021-08-27 09:56:34 +08:00
parent a64fee672f
commit bd2b149b80
12 changed files with 139 additions and 47 deletions

View File

@@ -65,8 +65,8 @@ PoseNet is a robust, real-time 6DOF monocular relocalization system proposed by Cambridge
- Framework
    - [MindSpore](https://www.mindspore.cn/install)
- For more details, see the following resources:
-    - [MindSpore Tutorials](https://www.mindspore.cn/tutorials/zh-CN/r1.3/index.html)
-    - [MindSpore Python API](https://www.mindspore.cn/docs/api/zh-CN/r1.3/index.html)
+    - [MindSpore Tutorials](https://www.mindspore.cn/tutorials/zh-CN/master/index.html)
+    - [MindSpore Python API](https://www.mindspore.cn/docs/api/zh-CN/master/index.html)
# Quick Start
@@ -76,13 +76,13 @@ PoseNet is a robust, real-time 6DOF monocular relocalization system proposed by Cambridge
```bash
# Run the standalone training example
-sh run_standalone_train.sh [DATASET_NAME] [DEVICE_ID]
+bash run_standalone_train.sh [DATASET_NAME] [DEVICE_ID]
# Run the distributed training example
-sh run_distribute_train.sh [DATASET_NAME] [RANK_SIZE]
+bash run_distribute_train.sh [DATASET_NAME] [RANK_TABLE]
# Run the evaluation example
-sh run_eval.sh [DEVICE_ID] [DATASET_NAME] [CKPT_PATH]
+bash run_eval.sh [DEVICE_ID] [DATASET_NAME] [CKPT_PATH]
```
For distributed training, an hccl configuration file in JSON format needs to be created in advance.
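An aside on that hccl file: MindSpore's model_zoo ships `utils/hccl_tools/hccl_tools.py` to generate it. For orientation only, a minimal sketch that writes a single-server, 8-device rank table by hand, assuming the v1.0 schema; every address below is a placeholder to replace with your real host and HCCN NIC IPs:

```python
# Sketch only: minimal single-server rank table (v1.0 schema assumed;
# all addresses are placeholders, not real values).
import json

devices = [{"device_id": str(i),
            "device_ip": "192.168.100.10%d" % i,  # placeholder NIC IPs
            "rank_id": str(i)} for i in range(8)]
rank_table = {
    "version": "1.0",
    "server_count": "1",
    "server_list": [{"server_id": "10.0.0.1",  # placeholder host IP
                     "device": devices,
                     "host_nic_ip": "reserve"}],
    "status": "completed",
}
with open("rank_table_8pcs.json", "w") as f:
    json.dump(rank_table, f, indent=4)
```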
@@ -97,13 +97,13 @@ PoseNet is a robust, real-time 6DOF monocular relocalization system proposed by Cambridge
```bash
# Run the standalone training example
-sh run_standalone_train_gpu.sh [DATASET_NAME] [DEVICE_ID]
+bash run_standalone_train_gpu.sh [DATASET_NAME] [DEVICE_ID]
# Run the distributed training example
-sh run_distribute_train_gpu.sh [DATASET_NAME] [RANK_SIZE]
+bash run_distribute_train_gpu.sh [DATASET_NAME] [RANK_TABLE]
# Run the evaluation example
-sh run_eval_gpu.sh [DEVICE_ID] [DATASET_NAME] [CKPT_PATH]
+bash run_eval_gpu.sh [DEVICE_ID] [DATASET_NAME] [CKPT_PATH]
```
The KingsCollege dataset is used by default. You can also pass `$dataset_name` to the script to select a different dataset. For more details, refer to the corresponding script.
@@ -173,7 +173,7 @@ PoseNet is a robust, real-time 6DOF monocular relocalization system proposed by Cambridge
- Running on Ascend
```bash
-sh run_standalone_train.sh [DATASET_NAME] [DEVICE_ID]
+bash run_standalone_train.sh [DATASET_NAME] [DEVICE_ID]
```
The above command will run in the background; you can view the results in the train.log file.
@@ -191,7 +191,7 @@ PoseNet is a robust, real-time 6DOF monocular relocalization system proposed by Cambridge
- Running on GPU
```bash
-sh run_standalone_train_gpu.sh [DATASET_NAME] [DEVICE_ID]
+bash run_standalone_train_gpu.sh [DATASET_NAME] [DEVICE_ID]
```
The above command will run in the background; you can view the results in the train.log file.
@@ -211,7 +211,7 @@ PoseNet is a robust, real-time 6DOF monocular relocalization system proposed by Cambridge
- Running on Ascend
```bash
-sh run_distribute_train.sh [DATASET_NAME] [RANK_SIZE]
+bash run_distribute_train.sh [DATASET_NAME] [RANK_TABLE]
```
The above shell script runs distributed training in the background. You can view the results in the device[X]/log files. The loss values are as follows:
@@ -228,7 +228,7 @@ PoseNet is a robust, real-time 6DOF monocular relocalization system proposed by Cambridge
- Running on GPU
```bash
-sh run_distribute_train_gpu.sh [DATASET_NAME] [RANK_SIZE]
+bash run_distribute_train_gpu.sh [DATASET_NAME] [RANK_SIZE]
```
The above shell script runs distributed training in the background. You can view the results in the device[X]/log files. The loss values are as follows:
@@ -252,7 +252,7 @@ PoseNet is a robust, real-time 6DOF monocular relocalization system proposed by Cambridge
Set the checkpoint path as a relative path, for example "../checkpoint/train_posenet_KingsCollege-790_38.ckpt".
```bash
-sh run_eval.sh [DEVICE_ID] [DATASET_NAME] [CKPT_PATH]
+bash run_eval.sh [DEVICE_ID] [DATASET_NAME] [CKPT_PATH]
```
The above command will run in the background; you can view the results in the eval/eval.log file. The accuracy on the test dataset is as follows:
@@ -267,7 +267,7 @@ PoseNet is a robust, real-time 6DOF monocular relocalization system proposed by Cambridge
Set the checkpoint path as a relative path, for example "../checkpoint/train_posenet_KingsCollege-1875_2.ckpt".
```bash
-sh run_eval_gpu.sh [DEVICE_ID] [DATASET_NAME] [CKPT_PATH]
+bash run_eval_gpu.sh [DEVICE_ID] [DATASET_NAME] [CKPT_PATH]
```
The above command will run in the background; you can view the results in the eval/eval.log file. The accuracy on the test dataset is as follows:
@@ -320,7 +320,7 @@ PoseNet is a robust, real-time 6DOF monocular relocalization system proposed by Cambridge
| Parameters (M) | 10.7 | 10.7 |
| Fine-tuned checkpoint | 82.91M (.ckpt file) | 82.91M (.ckpt file) |
| Inference model | 41.66M (.mindir file) | 41.66M (.mindir file) |
-| Script | [posenet script](https://gitee.com/mindspore/mindspore/tree/r1.3/model_zoo/official/cv/posenet) | [posenet script](https://gitee.com/mindspore/mindspore/tree/r1.3/model_zoo/official/cv/posenet) |
+| Script | [posenet script](https://gitee.com/mindspore/mindspore/tree/master/model_zoo/official/cv/posenet) | [posenet script](https://gitee.com/mindspore/mindspore/tree/master/model_zoo/official/cv/posenet) |

### Inference Performance

View File

@@ -41,6 +41,9 @@ if __name__ == '__main__':
    parser.add_argument('--is_modelarts', type=ast.literal_eval, default=False, help='Train in Modelarts.')
    parser.add_argument('--data_url', default=None, help='Location of data.')
    parser.add_argument('--train_url', default=None, help='Location of training outputs.')
+    parser.add_argument('--device_target', type=str, default='Ascend',
+                        choices=['Ascend', 'GPU'],
+                        help='Name of device target.')
    args_opt = parser.parse_args()

    cfg = common_config
@@ -49,8 +52,8 @@ if __name__ == '__main__':
    elif args_opt.dataset == "StMarysChurch":
        dataset_cfg = StMarysChurch

-    device_target = cfg.device_target
-    context.set_context(mode=context.GRAPH_MODE, device_target=cfg.device_target)
+    device_target = args_opt.device_target
+    context.set_context(mode=context.GRAPH_MODE, device_target=args_opt.device_target)
    if args_opt.device_id is not None:
        context.set_context(device_id=args_opt.device_id)

View File

@@ -17,37 +17,30 @@
echo "=============================================================================================================="
echo "Please run the script as: "
echo "bash run.sh DATASET_NAME RANK_SIZE"
-echo "For example: bash run_distribute.sh dataset_name 8"
+echo "For example: bash run_distribute.sh dataset_name rank_table"
echo "It is better to use the absolute path."
echo "=============================================================================================================="

+set -e
+get_real_path(){
+    if [ "${1:0:1}" == "/" ]; then
+        echo "$1"
+    else
+        echo "$(realpath -m $PWD/$1)"
+    fi
+}
DATASET_NAME=$1
-RANK_SIZE=$2
export DATASET_NAME
-export RANK_SIZE
+export RANK_TABLE_FILE=$(get_real_path $2)
+export RANK_SIZE=8

EXEC_PATH=$(pwd)
echo "$EXEC_PATH"

-test_dist_8pcs()
-{
-    export RANK_TABLE_FILE=${EXEC_PATH}/rank_table_8pcs.json
-    export RANK_SIZE=8
-}
-test_dist_2pcs()
-{
-    export RANK_TABLE_FILE=${EXEC_PATH}/rank_table_2pcs.json
-    export RANK_SIZE=2
-}
-test_dist_${RANK_SIZE}pcs

export PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION=python
cd ../
-for((i=0;i<${RANK_SIZE};i++))
+for((i=0;i<8;i++))
do
    rm -rf device$i
    mkdir device$i
@@ -61,7 +54,10 @@ do
    export RANK_ID=$i
    echo "start training for device $i"
    env > env$i.log
-    python train.py --run_distribute True --dataset $1 --device_num $2 --is_modelarts False > train$i.log 2>&1 &
+    python train.py --run_distribute True \
+                    --dataset $1 --device_num 8 \
+                    --is_modelarts False \
+                    --device_target "Ascend" > train$i.log 2>&1 &
    echo "$i finish"
    cd ../
done
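What the loop above does, restated as a hedged Python sketch (not part of the commit): eight local processes, one Ascend device each via DEVICE_ID/RANK_ID, all sharing one rank table. The dataset name is hard-coded here purely for illustration:

```python
# Illustrative launcher equivalent to the shell loop above (assumptions:
# rank_table_8pcs.json exists; "KingsCollege" stands in for $1).
import os
import subprocess

env = dict(os.environ,
           RANK_TABLE_FILE=os.path.abspath('rank_table_8pcs.json'),
           RANK_SIZE='8')
procs = []
for i in range(8):
    env_i = dict(env, DEVICE_ID=str(i), RANK_ID=str(i))  # one device per process
    log = open('train%d.log' % i, 'w')
    procs.append(subprocess.Popen(
        ['python', 'train.py', '--run_distribute', 'True',
         '--dataset', 'KingsCollege', '--device_num', '8',
         '--is_modelarts', 'False', '--device_target', 'Ascend'],
        env=env_i, stdout=log, stderr=subprocess.STDOUT))
for p in procs:
    p.wait()
```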

View File

@@ -39,5 +39,8 @@ cd ./train_parallel
env > env.log
echo "start training"
mpirun -n $1 --allow-run-as-root \
-    python train.py --device_num $1 --dataset $2 --is_modelarts False --run_distribute True > train.log 2>&1 &
+    python train.py --device_num $1 \
+                    --dataset $2 --is_modelarts False \
+                    --run_distribute True \
+                    --device_target "GPU" > train.log 2>&1 &

View File

@@ -46,7 +46,7 @@ cd ./eval
env > env0.log
echo "Eval begin."
-python eval.py --device_id $1 --dataset $2 --ckpt_url $3 --is_modelarts False > ./eval.log 2>&1 &
+python eval.py --device_id $1 --dataset $2 --ckpt_url $3 --is_modelarts False --device_target "GPU" > ./eval.log 2>&1 &
if [ $? -eq 0 ];then
    echo "evaling success"

View File

@@ -45,7 +45,10 @@ cd ./train
env > env0.log
echo "Standalone train begin."
-python train.py --run_distribute False --device_id $2 --dataset $1 --device_num 1 --is_modelarts False > ./train_alone.log 2>&1 &
+python train.py --run_distribute False \
+                --device_id $2 --dataset $1 \
+                --device_num 1 --is_modelarts False \
+                --device_target "Ascend" > ./train_alone.log 2>&1 &
if [ $? -eq 0 ];then
    echo "training success"

View File

@@ -45,7 +45,7 @@ cd ./train
env > env0.log
echo "Standalone train begin."
-python train.py --run_distribute False --device_id $2 --dataset $1 --device_num 1 --is_modelarts False > ./train_alone.log 2>&1 &
+python train.py --run_distribute False --device_id $2 --dataset $1 --device_num 1 --is_modelarts False --device_target "Ascend" > ./train_alone.log 2>&1 &
if [ $? -eq 0 ];then
    echo "training success"

View File

@@ -18,7 +18,6 @@ network config setting
from easydict import EasyDict as edict

common_config = edict({
-    'device_target': 'GPU',
    'device_id': 0,
    'pre_trained': True,
    'max_steps': 30000,

View File

@@ -14,7 +14,7 @@
# ============================================================================
"""define evaluation loss function for network."""
import mindspore.nn as nn
-from mindspore.nn.loss.loss import _Loss
+from mindspore.nn.loss.loss import LossBase
from mindspore.common import dtype as mstype
from mindspore.ops import operations as P
from mindspore.train.serialization import load_checkpoint, load_param_into_net
@@ -42,7 +42,7 @@ class EuclideanDistance(nn.Cell):
        return res


-class PoseLoss(_Loss):
+class PoseLoss(LossBase):
    """define loss function"""
    def __init__(self, w1_x, w2_x, w3_x, w1_q, w2_q, w3_q):
        super(PoseLoss, self).__init__()
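Background for this rename: `_Loss` was a private base class, and MindSpore r1.3 exposes it publicly as `mindspore.nn.LossBase` with the same `get_loss` reduction helper, so the switch is mechanical. A minimal sketch of a `LossBase` subclass, for illustration only (this class is not part of the repo):

```python
import mindspore.nn as nn
from mindspore.ops import operations as P

class WeightedL2Loss(nn.LossBase):
    """Illustrative only: weighted mean-squared error via LossBase."""
    def __init__(self, weight=1.0):
        super(WeightedL2Loss, self).__init__()
        self.weight = weight
        self.square = P.Square()

    def construct(self, logits, labels):
        # get_loss applies the reduction configured on LossBase ('mean' by default)
        return self.weight * self.get_loss(self.square(logits - labels))
```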

View File

@@ -16,6 +16,12 @@
import mindspore.nn as nn
from mindspore.common.initializer import TruncatedNormal
from mindspore.ops import operations as P
+from mindspore.ops import functional as F
+from mindspore.ops import composite as C
+from mindspore.context import ParallelMode
+from mindspore.parallel._utils import (_get_device_num, _get_gradients_mean,
+                                       _get_parallel_mode)
+from mindspore.nn.wrap.grad_reducer import DistributedGradReducer


def weight_variable():
    """Weight variable."""
@@ -172,3 +178,80 @@ class PoseNet(nn.Cell):
        return cls1_fc_pose_xyz, cls1_fc_pose_wpqr, \
               cls2_fc_pose_xyz, cls2_fc_pose_wpqr, \
               cls3_fc_pose_xyz, cls3_fc_pose_wpqr
+
+
+GRADIENT_CLIP_TYPE = 1
+GRADIENT_CLIP_VALUE = 50.0
+
+clip_grad = C.MultitypeFuncGraph("clip_grad")
+
+
+@clip_grad.register("Number", "Number", "Tensor")
+def _clip_grad(clip_type, clip_value, grad):
+    """
+    Clip gradients.
+
+    Inputs:
+        clip_type: The way to clip, 0 for 'value', 1 for 'norm'
+        clip_value: Specifies how much to clip
+        grad: Gradients
+
+    Outputs:
+        tuple[Tensor], clipped gradients
+    """
+    if clip_type not in (0, 1):
+        return grad
+    dt = F.dtype(grad)
+    if clip_type == 0:
+        new_grad = C.clip_by_value(grad, F.cast(F.tuple_to_array((-clip_value,)), dt),
+                                   F.cast(F.tuple_to_array((clip_value,)), dt))
+    else:
+        new_grad = nn.ClipByNorm()(grad, F.cast(F.tuple_to_array((clip_value,)), dt))
+    return new_grad
+
+
+class PoseTrainOneStepCell(nn.Cell):
+    r"""
+    Network training package class.
+
+    Wraps the network with an optimizer. The resulting Cell is trained with input '\*inputs'.
+    The backward graph will be created in the construct function to update the parameter. Different
+    parallel modes are available for training.
+
+    Args:
+        network (Cell): The training network. The network only supports single output.
+        optimizer (Union[Cell]): Optimizer for updating the weights.
+        sens (numbers.Number): The scaling number to be filled as the input of backpropagation. Default value is 1.0.
+
+    Inputs:
+        - **(\*inputs)** (Tuple(Tensor)) - Tuple of input tensors with shape :math:`(N, \ldots)`.
+
+    Outputs:
+        Tensor, a tensor means the loss value, the shape of which is usually :math:`()`.
+
+    Raises:
+        TypeError: If `sens` is not a number.
+    """
+
+    def __init__(self, network, optimizer, sens=1.0):
+        super(PoseTrainOneStepCell, self).__init__(auto_prefix=False)
+        self.network = network
+        self.network.set_grad()
+        self.optimizer = optimizer
+        self.weights = self.optimizer.parameters
+        self.grad = C.GradOperation(get_by_list=True, sens_param=True)
+        self.sens = sens
+        self.reducer_flag = False
+        self.grad_reducer = F.identity
+        self.hyper_map = C.HyperMap()
+        self.parallel_mode = _get_parallel_mode()
+        self.reducer_flag = self.parallel_mode in (ParallelMode.DATA_PARALLEL, ParallelMode.HYBRID_PARALLEL)
+        if self.reducer_flag:
+            self.mean = _get_gradients_mean()
+            self.degree = _get_device_num()
+            self.grad_reducer = DistributedGradReducer(self.weights, self.mean, self.degree)
+
+    def construct(self, *inputs):
+        loss = self.network(*inputs)
+        sens = F.fill(loss.dtype, loss.shape, self.sens)
+        grads = self.grad(self.network, self.weights)(*inputs, sens)
+        grads = self.hyper_map(F.partial(clip_grad, GRADIENT_CLIP_TYPE, GRADIENT_CLIP_VALUE), grads)
+        grads = self.grad_reducer(grads)
+        loss = F.depend(loss, self.optimizer(grads))
+        return loss
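With `GRADIENT_CLIP_TYPE = 1`, the registered function clips each gradient tensor by L2 norm: anything with norm above 50 is rescaled to norm 50, which is presumably the "gradient overflow" the commit title refers to. The same rule checked outside the graph, with made-up numbers:

```python
# NumPy check of the clip-by-norm rule used above (made-up gradient values).
import numpy as np

CLIP_VALUE = 50.0
grad = np.array([30.0, 40.0, 120.0])  # L2 norm = sqrt(900 + 1600 + 14400) = 130
norm = np.linalg.norm(grad)
clipped = grad if norm <= CLIP_VALUE else grad * (CLIP_VALUE / norm)
print(norm, np.linalg.norm(clipped))  # 130.0 50.0
print(clipped)                        # approx [11.538 15.385 46.154]
```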

View File

@@ -29,6 +29,7 @@ from mindspore.train.callback import ModelCheckpoint, CheckpointConfig, LossMoni
from src.config import common_config, KingsCollege, StMarysChurch
from src.dataset import data_to_mindrecord, create_posenet_dataset
from src.loss import PosenetWithLoss
+from src.posenet import PoseTrainOneStepCell

set_seed(1)

@@ -45,6 +46,9 @@ if __name__ == '__main__':
    parser.add_argument('--is_modelarts', type=ast.literal_eval, default=False, help='Train in Modelarts.')
    parser.add_argument('--data_url', default=None, help='Location of data.')
    parser.add_argument('--train_url', default=None, help='Location of training outputs.')
+    parser.add_argument('--device_target', type=str, default='Ascend',
+                        choices=['Ascend', 'GPU'],
+                        help='Name of device target.')
    args_opt = parser.parse_args()

    cfg = common_config
@@ -53,7 +57,7 @@ if __name__ == '__main__':
    elif args_opt.dataset == "StMarysChurch":
        dataset_cfg = StMarysChurch

-    device_target = cfg.device_target
+    device_target = args_opt.device_target
    context.set_context(mode=context.GRAPH_MODE, device_target=device_target)
    if args_opt.run_distribute:
        if device_target == "Ascend":
@@ -104,7 +108,8 @@ if __name__ == '__main__':
    opt = Adagrad(params=net_with_loss.trainable_params(),
                  learning_rate=dataset_cfg.lr_init,
                  weight_decay=dataset_cfg.weight_decay)
-    model = Model(net_with_loss, optimizer=opt)
+    net_with_grad = PoseTrainOneStepCell(net_with_loss, opt)
+    model = Model(net_with_grad)
    time_cb = TimeMonitor(data_size=step_per_epoch)
    loss_cb = LossMonitor()
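The effect of the last hunk, condensed into a sketch (dataset and callback setup elided; the `pre_trained` constructor argument is assumed, not confirmed by this diff): because `Model` now receives a cell that already computes, clips, reduces, and applies gradients, it does not wrap the network in its own `TrainOneStepCell`, so the gradient path defined in src/posenet.py is the one that actually runs.

```python
# Condensed training wiring after this commit (sketch; dataset setup omitted).
from mindspore import Model
from mindspore.nn import Adagrad
from src.loss import PosenetWithLoss
from src.posenet import PoseTrainOneStepCell

net_with_loss = PosenetWithLoss(pre_trained=True)   # assumed signature
opt = Adagrad(params=net_with_loss.trainable_params(), learning_rate=0.001)
net_with_grad = PoseTrainOneStepCell(net_with_loss, opt)
model = Model(net_with_grad)   # no optimizer/loss args: the cell handles both
# model.train(epoch, dataset, callbacks=[time_cb, loss_cb])
```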

View File

@@ -61,8 +61,8 @@ if __name__ == '__main__':
    if args.distribute:
        if target == "Ascend":
            init()
-            device_id = int(os.getenv('DEVICE_ID'))
-            context.set_auto_parallel_context(device_id=device_id,
+            device_num = int(os.getenv('RANK_SIZE'))
+            context.set_auto_parallel_context(device_num=device_num,
                                              parallel_mode=ParallelMode.DATA_PARALLEL,
                                              gradients_mean=True)
        if target == "GPU":