From 9e2b38c961e720194976cd31129809a717c1aa90 Mon Sep 17 00:00:00 2001
From: hanhuifeng2020
Date: Sat, 15 Aug 2020 16:58:13 +0800
Subject: [PATCH] The teacher uses fp16 computation to optimize the performance of TinyBERT on GPU

---
 .../official/nlp/tinybert/run_general_distill.py |  7 ++-----
 .../official/nlp/tinybert/run_task_distill.py    |  7 ++-----
 .../scripts/run_distribute_gd_for_gpu.sh         |  1 +
 .../official/nlp/tinybert/src/tinybert_model.py  | 16 ++++++++++++----
 4 files changed, 17 insertions(+), 14 deletions(-)

diff --git a/model_zoo/official/nlp/tinybert/run_general_distill.py b/model_zoo/official/nlp/tinybert/run_general_distill.py
index 62730b62c5a..8fdc86b8bcd 100644
--- a/model_zoo/official/nlp/tinybert/run_general_distill.py
+++ b/model_zoo/official/nlp/tinybert/run_general_distill.py
@@ -87,13 +87,10 @@ def run_general_distill():
     enable_loss_scale = True
     if args_opt.device_target == "GPU":
-        if bert_teacher_net_cfg.compute_type != mstype.float32:
-            logger.warning('GPU only support fp32 temporarily, run with fp32.')
-            bert_teacher_net_cfg.compute_type = mstype.float32
         if bert_student_net_cfg.compute_type != mstype.float32:
-            logger.warning('GPU only support fp32 temporarily, run with fp32.')
+            logger.warning('The student network only supports float32 temporarily, run with float32.')
             bert_student_net_cfg.compute_type = mstype.float32
-        # Both the forward and backward of the network are calculated using fp32,
+        # The backward pass of the network is calculated using fp32,
         # and the loss scale is not necessary
         enable_loss_scale = False
diff --git a/model_zoo/official/nlp/tinybert/run_task_distill.py b/model_zoo/official/nlp/tinybert/run_task_distill.py
index b7eceac8e26..517f652d8a1 100644
--- a/model_zoo/official/nlp/tinybert/run_task_distill.py
+++ b/model_zoo/official/nlp/tinybert/run_task_distill.py
@@ -285,13 +285,10 @@ if __name__ == '__main__':
     enable_loss_scale = True
     if args_opt.device_target == "GPU":
-        if td_teacher_net_cfg.compute_type != mstype.float32:
-            logger.warning('GPU only support fp32 temporarily, run with fp32.')
-            td_teacher_net_cfg.compute_type = mstype.float32
         if td_student_net_cfg.compute_type != mstype.float32:
-            logger.warning('GPU only support fp32 temporarily, run with fp32.')
+            logger.warning('The student network only supports float32 temporarily, run with float32.')
             td_student_net_cfg.compute_type = mstype.float32
-        # Both the forward and backward of the network are calculated using fp32,
+        # The backward pass of the network is calculated using fp32,
         # and the loss scale is not necessary
         enable_loss_scale = False
diff --git a/model_zoo/official/nlp/tinybert/scripts/run_distribute_gd_for_gpu.sh b/model_zoo/official/nlp/tinybert/scripts/run_distribute_gd_for_gpu.sh
index d345ddef5c9..a12b58f98f1 100644
--- a/model_zoo/official/nlp/tinybert/scripts/run_distribute_gd_for_gpu.sh
+++ b/model_zoo/official/nlp/tinybert/scripts/run_distribute_gd_for_gpu.sh
@@ -37,4 +37,5 @@ mpirun --allow-run-as-root -n $RANK_SIZE \
     --save_ckpt_path="" \
     --data_dir=$DATA_DIR \
     --schema_dir=$SCHEMA_DIR \
+    --enable_data_sink=False \
     --load_teacher_ckpt_path=$TEACHER_CKPT_PATH > log.txt 2>&1 &
diff --git a/model_zoo/official/nlp/tinybert/src/tinybert_model.py b/model_zoo/official/nlp/tinybert/src/tinybert_model.py
index cc5477bc4f2..b2499e8c53d 100644
--- a/model_zoo/official/nlp/tinybert/src/tinybert_model.py
+++ b/model_zoo/official/nlp/tinybert/src/tinybert_model.py
@@ -24,6 +24,7 @@ from mindspore.ops import operations as P
 from mindspore.ops import composite as C
 from mindspore.common.tensor import Tensor
 from mindspore.common.parameter import Parameter
+from mindspore import context
 from .fused_layer_norm import FusedLayerNorm
@@ -250,11 +251,16 @@ class BertOutput(nn.Cell):
                               weight_init=TruncatedNormal(initializer_range)).to_float(compute_type)
         self.dropout = nn.Dropout(1 - dropout_prob)
         self.add = P.TensorAdd()
-        if compute_type == mstype.float16:
-            self.layernorm = FusedLayerNorm((out_channels,),
-                                            use_batch_norm=enable_fused_layernorm).to_float(compute_type)
+        self.is_gpu = context.get_context('device_target') == "GPU"
+        if self.is_gpu:
+            self.layernorm = nn.LayerNorm((out_channels,)).to_float(mstype.float32)
+            self.compute_type = compute_type
         else:
-            self.layernorm = nn.LayerNorm((out_channels,)).to_float(compute_type)
+            if compute_type == mstype.float16:
+                self.layernorm = FusedLayerNorm((out_channels,),
+                                                use_batch_norm=enable_fused_layernorm).to_float(compute_type)
+            else:
+                self.layernorm = nn.LayerNorm((out_channels,)).to_float(compute_type)
         self.cast = P.Cast()
@@ -264,6 +270,8 @@ class BertOutput(nn.Cell):
         output = self.dropout(output)
         output = self.add(input_tensor, output)
         output = self.layernorm(output)
+        if self.is_gpu:
+            output = self.cast(output, self.compute_type)
         return output
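
Note (not part of the patch): as a rough, standalone illustration of the mixed-precision pattern this change applies in BertOutput, the sketch below keeps LayerNorm in float32 on GPU while the rest of the cell computes in the configured compute_type (float16 for the teacher), then casts the output back. The cell name OutputNormSketch and its parameters are hypothetical; the MindSpore calls mirror the ones used in tinybert_model.py above.

    # Minimal sketch of the GPU mixed-precision pattern, assuming the MindSpore API
    # used elsewhere in this patch (context, nn.LayerNorm, P.Cast, mstype).
    import mindspore.nn as nn
    import mindspore.common.dtype as mstype
    from mindspore import context
    from mindspore.ops import operations as P
    from mindspore.common.initializer import TruncatedNormal

    class OutputNormSketch(nn.Cell):
        """Hypothetical cell: dense/add in compute_type, LayerNorm in float32 on GPU."""
        def __init__(self, channels, compute_type=mstype.float16):
            super(OutputNormSketch, self).__init__()
            self.is_gpu = context.get_context('device_target') == "GPU"
            self.compute_type = compute_type
            self.dense = nn.Dense(channels, channels,
                                  weight_init=TruncatedNormal(0.02)).to_float(compute_type)
            self.add = P.TensorAdd()
            # Keep LayerNorm in float32 on GPU; elsewhere it follows compute_type.
            norm_type = mstype.float32 if self.is_gpu else compute_type
            self.layernorm = nn.LayerNorm((channels,)).to_float(norm_type)
            self.cast = P.Cast()

        def construct(self, input_tensor):
            output = self.dense(input_tensor)
            output = self.add(input_tensor, output)
            output = self.layernorm(output)
            if self.is_gpu:
                # Cast back so downstream cells keep running in the configured compute type.
                output = self.cast(output, self.compute_type)
            return output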