forked from mindspore-Ecosystem/mindspore
fix posenet gradient overflow
This commit is contained in:
parent
a64fee672f
commit
bd2b149b80
|
@ -65,8 +65,8 @@ PoseNet是剑桥大学提出的一种鲁棒、实时的6DOF(单目六自由度
|
|||
- 框架
|
||||
- [MindSpore](https://www.mindspore.cn/install)
|
||||
- 如需查看详情,请参见如下资源:
|
||||
- [MindSpore教程](https://www.mindspore.cn/tutorials/zh-CN/r1.3/index.html)
|
||||
- [MindSpore Python API](https://www.mindspore.cn/docs/api/zh-CN/r1.3/index.html)
|
||||
- [MindSpore教程](https://www.mindspore.cn/tutorials/zh-CN/master/index.html)
|
||||
- [MindSpore Python API](https://www.mindspore.cn/docs/api/zh-CN/master/index.html)
|
||||
|
||||
# 快速入门
|
||||
|
||||
|
@ -76,13 +76,13 @@ PoseNet是剑桥大学提出的一种鲁棒、实时的6DOF(单目六自由度
|
|||
|
||||
```bash
|
||||
# 运行单机训练示例
|
||||
sh run_standalone_train.sh [DATASET_NAME] [DEVICE_ID]
|
||||
bash run_standalone_train.sh [DATASET_NAME] [DEVICE_ID]
|
||||
|
||||
# 运行分布式训练示例
|
||||
sh run_distribute_train.sh [DATASET_NAME] [RANK_SIZE]
|
||||
bash run_distribute_train.sh [DATASET_NAME] [RANK_TABLE]
|
||||
|
||||
# 运行评估示例
|
||||
sh run_eval.sh [DEVICE_ID] [DATASET_NAME] [CKPT_PATH]
|
||||
bash run_eval.sh [DEVICE_ID] [DATASET_NAME] [CKPT_PATH]
|
||||
```
|
||||
|
||||
对于分布式训练,需要提前创建JSON格式的hccl配置文件。
|
||||
|
@ -97,13 +97,13 @@ PoseNet是剑桥大学提出的一种鲁棒、实时的6DOF(单目六自由度
|
|||
|
||||
```bash
|
||||
# 运行单机训练示例
|
||||
sh run_standalone_train_gpu.sh [DATASET_NAME] [DEVICE_ID]
|
||||
bash run_standalone_train_gpu.sh [DATASET_NAME] [DEVICE_ID]
|
||||
|
||||
# 运行分布式训练示例
|
||||
sh run_distribute_train_gpu.sh [DATASET_NAME] [RANK_SIZE]
|
||||
bash run_distribute_train_gpu.sh [DATASET_NAME] [RANK_SIZE]
|
||||
|
||||
# 运行评估示例
|
||||
sh run_eval_gpu.sh [DEVICE_ID] [DATASET_NAME] [CKPT_PATH]
|
||||
bash run_eval_gpu.sh [DEVICE_ID] [DATASET_NAME] [CKPT_PATH]
|
||||
```
|
||||
|
||||
默认使用KingsCollege数据集。您也可以将`$dataset_name`传入脚本,以便选择其他数据集。如需查看更多详情,请参考指定脚本。
|
||||
|
@ -173,7 +173,7 @@ PoseNet是剑桥大学提出的一种鲁棒、实时的6DOF(单目六自由度
|
|||
- Ascend处理器环境运行
|
||||
|
||||
```bash
|
||||
sh run_standalone_train.sh [DATASET_NAME] [DEVICE_ID]
|
||||
bash run_standalone_train.sh [DATASET_NAME] [DEVICE_ID]
|
||||
```
|
||||
|
||||
上述python命令将在后台运行,您可以通过train.log文件查看结果。
|
||||
|
@ -191,7 +191,7 @@ PoseNet是剑桥大学提出的一种鲁棒、实时的6DOF(单目六自由度
|
|||
- GPU处理器环境运行
|
||||
|
||||
```bash
|
||||
sh run_standalone_train_gpu.sh [DATASET_NAME] [DEVICE_ID]
|
||||
bash run_standalone_train_gpu.sh [DATASET_NAME] [DEVICE_ID]
|
||||
```
|
||||
|
||||
上述python命令将在后台运行,您可以通过train.log文件查看结果。
|
||||
|
@ -211,7 +211,7 @@ PoseNet是剑桥大学提出的一种鲁棒、实时的6DOF(单目六自由度
|
|||
- Ascend处理器环境运行
|
||||
|
||||
```bash
|
||||
sh run_distribute_train.sh [DATASET_NAME] [RANK_SIZE]
|
||||
bash run_distribute_train.sh [DATASET_NAME] [RANK_TABLE]
|
||||
```
|
||||
|
||||
上述shell脚本将在后台运行分布训练。您可以通过device[X]/log文件查看结果。采用以下方式达到损失值:
|
||||
|
@ -228,7 +228,7 @@ PoseNet是剑桥大学提出的一种鲁棒、实时的6DOF(单目六自由度
|
|||
- GPU处理器环境运行
|
||||
|
||||
```bash
|
||||
sh run_distribute_train_gpu.sh [DATASET_NAME] [RANK_SIZE]
|
||||
bash run_distribute_train_gpu.sh [DATASET_NAME] [RANK_SIZE]
|
||||
```
|
||||
|
||||
上述shell脚本将在后台运行分布训练。您可以通过device[X]/log文件查看结果。采用以下方式达到损失值:
|
||||
|
@ -252,7 +252,7 @@ PoseNet是剑桥大学提出的一种鲁棒、实时的6DOF(单目六自由度
|
|||
请将检查点路径设置为相对路径,例如“../checkpoint/train_posenet_KingsCollege-790_38.ckpt”。
|
||||
|
||||
```bash
|
||||
sh run_eval.sh [DEVICE_ID] [DATASET_NAME] [CKPT_PATH]
|
||||
bash run_eval.sh [DEVICE_ID] [DATASET_NAME] [CKPT_PATH]
|
||||
```
|
||||
|
||||
上述python命令将在后台运行,您可以通过eval/eval.log文件查看结果。测试数据集的准确性如下:
|
||||
|
@ -267,7 +267,7 @@ PoseNet是剑桥大学提出的一种鲁棒、实时的6DOF(单目六自由度
|
|||
请将检查点路径设置为相对路径,例如“../checkpoint/train_posenet_KingsCollege-1875_2.ckpt”。
|
||||
|
||||
```bash
|
||||
sh run_eval_gpu.sh [DEVICE_ID] [DATASET_NAME] [CKPT_PATH]
|
||||
bash run_eval_gpu.sh [DEVICE_ID] [DATASET_NAME] [CKPT_PATH]
|
||||
```
|
||||
|
||||
上述python命令将在后台运行,您可以通过eval/eval.log文件查看结果。测试数据集的准确性如下:
|
||||
|
@ -320,7 +320,7 @@ PoseNet是剑桥大学提出的一种鲁棒、实时的6DOF(单目六自由度
|
|||
| 参数(M) | 10.7 | 10.7 |
|
||||
| 微调检查点 | 82.91M (.ckpt文件) | 82.91M (.ckpt文件) |
|
||||
| 推理模型 | 41.66M (.mindir文件) | 41.66M (.mindir文件) |
|
||||
| 脚本 | [posenet脚本](https://gitee.com/mindspore/mindspore/tree/r1.3/model_zoo/official/cv/posenet) | [posenet脚本](https://gitee.com/mindspore/mindspore/tree/r1.3/model_zoo/official/cv/posenet) |
|
||||
| 脚本 | [posenet脚本](https://gitee.com/mindspore/mindspore/tree/master/model_zoo/official/cv/posenet) | [posenet脚本](https://gitee.com/mindspore/mindspore/tree/master/model_zoo/official/cv/posenet) |
|
||||
|
||||
### 推理性能
|
||||
|
||||
|
|
|
@ -41,6 +41,9 @@ if __name__ == '__main__':
|
|||
parser.add_argument('--is_modelarts', type=ast.literal_eval, default=False, help='Train in Modelarts.')
|
||||
parser.add_argument('--data_url', default=None, help='Location of data.')
|
||||
parser.add_argument('--train_url', default=None, help='Location of training outputs.')
|
||||
parser.add_argument('--device_target', type=str, default='Ascend',
|
||||
choices=['Ascend', 'GPU'],
|
||||
help='Name of device target.')
|
||||
args_opt = parser.parse_args()
|
||||
|
||||
cfg = common_config
|
||||
|
@ -49,8 +52,8 @@ if __name__ == '__main__':
|
|||
elif args_opt.dataset == "StMarysChurch":
|
||||
dataset_cfg = StMarysChurch
|
||||
|
||||
device_target = cfg.device_target
|
||||
context.set_context(mode=context.GRAPH_MODE, device_target=cfg.device_target)
|
||||
device_target = args_opt.device_target
|
||||
context.set_context(mode=context.GRAPH_MODE, device_target=args_opt.device_target)
|
||||
|
||||
if args_opt.device_id is not None:
|
||||
context.set_context(device_id=args_opt.device_id)
|
||||
|
|
|
@ -17,37 +17,30 @@
|
|||
echo "=============================================================================================================="
|
||||
echo "Please run the script as: "
|
||||
# The second positional argument is a rank table file, matching the example line printed next.
echo "bash run.sh DATASET_NAME RANK_TABLE"
|
||||
echo "For example: bash run_distribute.sh dataset_name 8"
|
||||
echo "For example: bash run_distribute.sh dataset_name rank_table"
|
||||
echo "It is better to use the absolute path."
|
||||
echo "=============================================================================================================="
|
||||
set -e
|
||||
# Print an absolute form of the path given as $1.
#   - A path that already starts with "/" is printed unchanged.
#   - Any other path is resolved against $PWD with `realpath -m`, so the
#     target does not have to exist yet.
# Arguments are quoted so that paths containing spaces stay a single word.
get_real_path(){
  if [ "${1:0:1}" == "/" ]; then
    echo "$1"
  else
    realpath -m "$PWD/$1"
  fi
}
|
||||
|
||||
DATASET_NAME=$1
|
||||
RANK_SIZE=$2
|
||||
export DATASET_NAME
|
||||
export RANK_SIZE
|
||||
export RANK_TABLE_FILE=$(get_real_path $2)
|
||||
export RANK_SIZE=8
|
||||
|
||||
EXEC_PATH=$(pwd)
|
||||
echo "$EXEC_PATH"
|
||||
|
||||
test_dist_8pcs()
|
||||
{
|
||||
export RANK_TABLE_FILE=${EXEC_PATH}/rank_table_8pcs.json
|
||||
export RANK_SIZE=8
|
||||
}
|
||||
|
||||
test_dist_2pcs()
|
||||
{
|
||||
export RANK_TABLE_FILE=${EXEC_PATH}/rank_table_2pcs.json
|
||||
export RANK_SIZE=2
|
||||
}
|
||||
|
||||
test_dist_${RANK_SIZE}pcs
|
||||
|
||||
export PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION=python
|
||||
|
||||
cd ../
|
||||
for((i=0;i<${RANK_SIZE};i++))
|
||||
for((i=0;i<8;i++))
|
||||
do
|
||||
rm -rf device$i
|
||||
mkdir device$i
|
||||
|
@ -61,7 +54,10 @@ do
|
|||
export RANK_ID=$i
|
||||
echo "start training for device $i"
|
||||
env > env$i.log
|
||||
python train.py --run_distribute True --dataset $1 --device_num $2 --is_modelarts False > train$i.log 2>&1 &
|
||||
# $2 is the rank table path (exported above as RANK_TABLE_FILE), not a device
# count, so the number of devices is taken from RANK_SIZE instead.
python train.py --run_distribute True \
                --dataset $1 --device_num ${RANK_SIZE} \
                --is_modelarts False \
                --device_target "Ascend" > train$i.log 2>&1 &
|
||||
echo "$i finish"
|
||||
cd ../
|
||||
done
|
||||
|
|
|
@ -39,5 +39,8 @@ cd ./train_parallel
|
|||
env > env.log
|
||||
echo "start training"
|
||||
mpirun -n $1 --allow-run-as-root \
|
||||
python train.py --device_num $1 --dataset $2 --is_modelarts False --run_distribute True > train.log 2>&1 &
|
||||
python train.py --device_num $1 \
|
||||
--dataset $2 --is_modelarts False \
|
||||
--run_distribute True \
|
||||
--device_target "GPU" > train.log 2>&1 &
|
||||
|
||||
|
|
|
@ -46,7 +46,7 @@ cd ./eval
|
|||
|
||||
env > env0.log
|
||||
echo "Eval begin."
|
||||
python eval.py --device_id $1 --dataset $2 --ckpt_url $3 --is_modelarts False > ./eval.log 2>&1 &
|
||||
python eval.py --device_id $1 --dataset $2 --ckpt_url $3 --is_modelarts False --device_target "GPU" > ./eval.log 2>&1 &
|
||||
|
||||
if [ $? -eq 0 ];then
|
||||
echo "evaling success"
|
||||
|
|
|
@ -45,7 +45,10 @@ cd ./train
|
|||
env > env0.log
|
||||
|
||||
echo "Standalone train begin."
|
||||
python train.py --run_distribute False --device_id $2 --dataset $1 --device_num 1 --is_modelarts False > ./train_alone.log 2>&1 &
|
||||
python train.py --run_distribute False \
|
||||
--device_id $2 --dataset $1 \
|
||||
--device_num 1 --is_modelarts False \
|
||||
--device_target "Ascend" > ./train_alone.log 2>&1 &
|
||||
|
||||
if [ $? -eq 0 ];then
|
||||
echo "training success"
|
||||
|
|
|
@ -45,7 +45,7 @@ cd ./train
|
|||
env > env0.log
|
||||
|
||||
echo "Standalone train begin."
|
||||
python train.py --run_distribute False --device_id $2 --dataset $1 --device_num 1 --is_modelarts False > ./train_alone.log 2>&1 &
|
||||
# NOTE(review): assumed to be the GPU standalone launcher (the Ascend launcher is
# patched separately in this change) — confirm the file before applying.
python train.py --run_distribute False --device_id $2 --dataset $1 --device_num 1 --is_modelarts False --device_target "GPU" > ./train_alone.log 2>&1 &
|
||||
|
||||
if [ $? -eq 0 ];then
|
||||
echo "training success"
|
||||
|
|
|
@ -18,7 +18,6 @@ network config setting
|
|||
from easydict import EasyDict as edict
|
||||
|
||||
common_config = edict({
|
||||
'device_target': 'GPU',
|
||||
'device_id': 0,
|
||||
'pre_trained': True,
|
||||
'max_steps': 30000,
|
||||
|
|
|
@ -14,7 +14,7 @@
|
|||
# ============================================================================
|
||||
"""define evaluation loss function for network."""
|
||||
import mindspore.nn as nn
|
||||
from mindspore.nn.loss.loss import _Loss
|
||||
from mindspore.nn.loss.loss import LossBase
|
||||
from mindspore.common import dtype as mstype
|
||||
from mindspore.ops import operations as P
|
||||
from mindspore.train.serialization import load_checkpoint, load_param_into_net
|
||||
|
@ -42,7 +42,7 @@ class EuclideanDistance(nn.Cell):
|
|||
|
||||
return res
|
||||
|
||||
class PoseLoss(_Loss):
|
||||
class PoseLoss(LossBase):
|
||||
"""define loss function"""
|
||||
def __init__(self, w1_x, w2_x, w3_x, w1_q, w2_q, w3_q):
|
||||
super(PoseLoss, self).__init__()
|
||||
|
|
|
@ -16,6 +16,12 @@
|
|||
import mindspore.nn as nn
|
||||
from mindspore.common.initializer import TruncatedNormal
|
||||
from mindspore.ops import operations as P
|
||||
from mindspore.ops import functional as F
|
||||
from mindspore.ops import composite as C
|
||||
from mindspore.context import ParallelMode
|
||||
from mindspore.parallel._utils import (_get_device_num, _get_gradients_mean,
|
||||
_get_parallel_mode)
|
||||
from mindspore.nn.wrap.grad_reducer import DistributedGradReducer
|
||||
|
||||
def weight_variable():
|
||||
"""Weight variable."""
|
||||
|
@ -172,3 +178,80 @@ class PoseNet(nn.Cell):
|
|||
return cls1_fc_pose_xyz, cls1_fc_pose_wpqr, \
|
||||
cls2_fc_pose_xyz, cls2_fc_pose_wpqr, \
|
||||
cls3_fc_pose_xyz, cls3_fc_pose_wpqr
|
||||
|
||||
|
||||
# Clipping mode handed to `_clip_grad` as `clip_type`: 0 clips by value, 1 clips by norm.
GRADIENT_CLIP_TYPE = 1
# Clipping threshold handed to `_clip_grad` as `clip_value`.
GRADIENT_CLIP_VALUE = 50.0

# Multitype function graph on which the `_clip_grad` overload is registered.
clip_grad = C.MultitypeFuncGraph("clip_grad")
|
||||
|
||||
@clip_grad.register("Number", "Number", "Tensor")
def _clip_grad(clip_type, clip_value, grad):
    """
    Clip a single gradient tensor, either by value or by norm.

    Inputs:
        clip_type: The way to clip, 0 for 'value', 1 for 'norm'. Any other
            value returns the gradient unchanged.
        clip_value: Specifies how much to clip.
        grad: Gradient tensor to clip.

    Outputs:
        Tensor, the clipped gradient (or `grad` itself when `clip_type` is
        neither 0 nor 1).
    """
    if clip_type not in (0, 1):
        return grad
    # Bounds are cast to the gradient's dtype before being used.
    dt = F.dtype(grad)
    if clip_type == 0:
        # Value clipping with bounds -clip_value and +clip_value.
        new_grad = C.clip_by_value(grad, F.cast(F.tuple_to_array((-clip_value,)), dt),
                                   F.cast(F.tuple_to_array((clip_value,)), dt))
    else:
        # Norm clipping through nn.ClipByNorm with clip_value as its clip argument.
        new_grad = nn.ClipByNorm()(grad, F.cast(F.tuple_to_array((clip_value,)), dt))
    return new_grad
|
||||
|
||||
class PoseTrainOneStepCell(nn.Cell):
    r"""
    Network training package class with gradient clipping.

    Wraps the network with an optimizer. The resulting Cell is trained with input '\*inputs'.
    The backward graph will be created in the construct function to update the parameter.
    Every gradient is passed through `clip_grad` (configured by `GRADIENT_CLIP_TYPE` and
    `GRADIENT_CLIP_VALUE`) before it reaches the gradient reducer and the optimizer.
    Different parallel modes are available for training.

    Args:
        network (Cell): The training network. The network only supports single output.
        optimizer (Union[Cell]): Optimizer for updating the weights.
        sens (numbers.Number): The scaling number to be filled as the input of backpropagation. Default value is 1.0.

    Inputs:
        - **(\*inputs)** (Tuple(Tensor)) - Tuple of input tensors with shape :math:`(N, \ldots)`.

    Outputs:
        Tensor, a tensor means the loss value, the shape of which is usually :math:`()`.

    Note:
        `sens` is not validated in this class; it is stored as-is and handed to
        `F.fill` inside `construct`.
    """

    def __init__(self, network, optimizer, sens=1.0):
        super(PoseTrainOneStepCell, self).__init__(auto_prefix=False)
        self.network = network
        self.network.set_grad()
        self.optimizer = optimizer
        self.weights = self.optimizer.parameters
        # Gradients are taken w.r.t. the optimizer's parameter list, with an explicit sens input.
        self.grad = C.GradOperation(get_by_list=True, sens_param=True)
        self.sens = sens
        # Identity unless a distributed reducer is installed below.
        self.grad_reducer = F.identity
        self.hyper_map = C.HyperMap()
        self.parallel_mode = _get_parallel_mode()
        self.reducer_flag = self.parallel_mode in (ParallelMode.DATA_PARALLEL, ParallelMode.HYBRID_PARALLEL)
        if self.reducer_flag:
            self.mean = _get_gradients_mean()
            self.degree = _get_device_num()
            self.grad_reducer = DistributedGradReducer(self.weights, self.mean, self.degree)

    def construct(self, *inputs):
        loss = self.network(*inputs)
        # Sensitivity tensor with the loss's dtype and shape, filled with self.sens.
        sens = F.fill(loss.dtype, loss.shape, self.sens)
        grads = self.grad(self.network, self.weights)(*inputs, sens)
        # Clip each gradient individually before reduction and the optimizer step.
        grads = self.hyper_map(F.partial(clip_grad, GRADIENT_CLIP_TYPE, GRADIENT_CLIP_VALUE), grads)
        grads = self.grad_reducer(grads)
        # Tie the optimizer update to the returned loss so it is executed.
        loss = F.depend(loss, self.optimizer(grads))
        return loss
|
||||
|
|
|
@ -29,6 +29,7 @@ from mindspore.train.callback import ModelCheckpoint, CheckpointConfig, LossMoni
|
|||
from src.config import common_config, KingsCollege, StMarysChurch
|
||||
from src.dataset import data_to_mindrecord, create_posenet_dataset
|
||||
from src.loss import PosenetWithLoss
|
||||
from src.posenet import PoseTrainOneStepCell
|
||||
|
||||
set_seed(1)
|
||||
|
||||
|
@ -45,6 +46,9 @@ if __name__ == '__main__':
|
|||
parser.add_argument('--is_modelarts', type=ast.literal_eval, default=False, help='Train in Modelarts.')
|
||||
parser.add_argument('--data_url', default=None, help='Location of data.')
|
||||
parser.add_argument('--train_url', default=None, help='Location of training outputs.')
|
||||
parser.add_argument('--device_target', type=str, default='Ascend',
|
||||
choices=['Ascend', 'GPU'],
|
||||
help='Name of device target.')
|
||||
args_opt = parser.parse_args()
|
||||
|
||||
cfg = common_config
|
||||
|
@ -53,7 +57,7 @@ if __name__ == '__main__':
|
|||
elif args_opt.dataset == "StMarysChurch":
|
||||
dataset_cfg = StMarysChurch
|
||||
|
||||
device_target = cfg.device_target
|
||||
device_target = args_opt.device_target
|
||||
context.set_context(mode=context.GRAPH_MODE, device_target=device_target)
|
||||
if args_opt.run_distribute:
|
||||
if device_target == "Ascend":
|
||||
|
@ -104,7 +108,8 @@ if __name__ == '__main__':
|
|||
opt = Adagrad(params=net_with_loss.trainable_params(),
|
||||
learning_rate=dataset_cfg.lr_init,
|
||||
weight_decay=dataset_cfg.weight_decay)
|
||||
model = Model(net_with_loss, optimizer=opt)
|
||||
net_with_grad = PoseTrainOneStepCell(net_with_loss, opt)
|
||||
model = Model(net_with_grad)
|
||||
|
||||
time_cb = TimeMonitor(data_size=step_per_epoch)
|
||||
loss_cb = LossMonitor()
|
||||
|
|
|
@ -61,8 +61,8 @@ if __name__ == '__main__':
|
|||
if args.distribute:
|
||||
if target == "Ascend":
|
||||
init()
|
||||
device_id = int(os.getenv('DEVICE_ID'))
|
||||
context.set_auto_parallel_context(device_id=device_id,
|
||||
device_num = int(os.getenv('RANK_SIZE'))
|
||||
context.set_auto_parallel_context(device_num=device_num,
|
||||
parallel_mode=ParallelMode.DATA_PARALLEL,
|
||||
gradients_mean=True)
|
||||
if target == "GPU":
|
||||
|
|
Loading…
Reference in New Issue