From 9e2b38c961e720194976cd31129809a717c1aa90 Mon Sep 17 00:00:00 2001
From: hanhuifeng2020
Date: Sat, 15 Aug 2020 16:58:13 +0800
Subject: [PATCH] The teacher uses fp16 computation to optimize the performance of TinyBERT on GPU

---
 .../official/nlp/tinybert/run_general_distill.py |  7 ++-----
 .../official/nlp/tinybert/run_task_distill.py    |  7 ++-----
 .../scripts/run_distribute_gd_for_gpu.sh         |  1 +
 .../official/nlp/tinybert/src/tinybert_model.py  | 16 ++++++++++++----
 4 files changed, 17 insertions(+), 14 deletions(-)

diff --git a/model_zoo/official/nlp/tinybert/run_general_distill.py b/model_zoo/official/nlp/tinybert/run_general_distill.py
index 62730b62c5a..8fdc86b8bcd 100644
--- a/model_zoo/official/nlp/tinybert/run_general_distill.py
+++ b/model_zoo/official/nlp/tinybert/run_general_distill.py
@@ -87,13 +87,10 @@ def run_general_distill():
     enable_loss_scale = True
     if args_opt.device_target == "GPU":
-        if bert_teacher_net_cfg.compute_type != mstype.float32:
-            logger.warning('GPU only support fp32 temporarily, run with fp32.')
-            bert_teacher_net_cfg.compute_type = mstype.float32
         if bert_student_net_cfg.compute_type != mstype.float32:
-            logger.warning('GPU only support fp32 temporarily, run with fp32.')
+            logger.warning('The student network only supports float32 temporarily, run with float32.')
             bert_student_net_cfg.compute_type = mstype.float32
-        # Both the forward and backward of the network are calculated using fp32,
+        # The backward pass of the network is calculated using fp32,
         # and the loss scale is not necessary
         enable_loss_scale = False
diff --git a/model_zoo/official/nlp/tinybert/run_task_distill.py b/model_zoo/official/nlp/tinybert/run_task_distill.py
index b7eceac8e26..517f652d8a1 100644
--- a/model_zoo/official/nlp/tinybert/run_task_distill.py
+++ b/model_zoo/official/nlp/tinybert/run_task_distill.py
@@ -285,13 +285,10 @@ if __name__ == '__main__':
     enable_loss_scale = True
     if args_opt.device_target == "GPU":
-        if td_teacher_net_cfg.compute_type != mstype.float32:
-            logger.warning('GPU only support fp32 temporarily, run with fp32.')
-            td_teacher_net_cfg.compute_type = mstype.float32
         if td_student_net_cfg.compute_type != mstype.float32:
-            logger.warning('GPU only support fp32 temporarily, run with fp32.')
+            logger.warning('The student network only supports float32 temporarily, run with float32.')
             td_student_net_cfg.compute_type = mstype.float32
-        # Both the forward and backward of the network are calculated using fp32,
+        # The backward pass of the network is calculated using fp32,
         # and the loss scale is not necessary
         enable_loss_scale = False
diff --git a/model_zoo/official/nlp/tinybert/scripts/run_distribute_gd_for_gpu.sh b/model_zoo/official/nlp/tinybert/scripts/run_distribute_gd_for_gpu.sh
index d345ddef5c9..a12b58f98f1 100644
--- a/model_zoo/official/nlp/tinybert/scripts/run_distribute_gd_for_gpu.sh
+++ b/model_zoo/official/nlp/tinybert/scripts/run_distribute_gd_for_gpu.sh
@@ -37,4 +37,5 @@ mpirun --allow-run-as-root -n $RANK_SIZE \
     --save_ckpt_path="" \
     --data_dir=$DATA_DIR \
     --schema_dir=$SCHEMA_DIR \
+    --enable_data_sink=False \
     --load_teacher_ckpt_path=$TEACHER_CKPT_PATH > log.txt 2>&1 &
diff --git a/model_zoo/official/nlp/tinybert/src/tinybert_model.py b/model_zoo/official/nlp/tinybert/src/tinybert_model.py
index cc5477bc4f2..b2499e8c53d 100644
--- a/model_zoo/official/nlp/tinybert/src/tinybert_model.py
+++ b/model_zoo/official/nlp/tinybert/src/tinybert_model.py
@@ -24,6 +24,7 @@ from mindspore.ops import operations as P
 from mindspore.ops import composite as C
 from mindspore.common.tensor import Tensor
 from mindspore.common.parameter import Parameter
+from mindspore import context
 from .fused_layer_norm import FusedLayerNorm
@@ -250,11 +251,16 @@ class BertOutput(nn.Cell):
                               weight_init=TruncatedNormal(initializer_range)).to_float(compute_type)
         self.dropout = nn.Dropout(1 - dropout_prob)
         self.add = P.TensorAdd()
-        if compute_type == mstype.float16:
-            self.layernorm = FusedLayerNorm((out_channels,),
-                                            use_batch_norm=enable_fused_layernorm).to_float(compute_type)
+        self.is_gpu = context.get_context('device_target') == "GPU"
+        if self.is_gpu:
+            self.layernorm = nn.LayerNorm((out_channels,)).to_float(mstype.float32)
+            self.compute_type = compute_type
         else:
-            self.layernorm = nn.LayerNorm((out_channels,)).to_float(compute_type)
+            if compute_type == mstype.float16:
+                self.layernorm = FusedLayerNorm((out_channels,),
+                                                use_batch_norm=enable_fused_layernorm).to_float(compute_type)
+            else:
+                self.layernorm = nn.LayerNorm((out_channels,)).to_float(compute_type)
         self.cast = P.Cast()
@@ -264,6 +270,8 @@ class BertOutput(nn.Cell):
         output = self.dropout(output)
         output = self.add(input_tensor, output)
         output = self.layernorm(output)
+        if self.is_gpu:
+            output = self.cast(output, self.compute_type)
         return output
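
Note (not part of the patch): as a rough, standalone illustration of the mixed-precision pattern this change applies in BertOutput, the sketch below keeps LayerNorm in float32 on GPU while the rest of the cell computes in the configured compute_type (float16 for the teacher), then casts the output back. The cell name OutputNormSketch and its parameters are hypothetical; the MindSpore calls mirror the ones used in tinybert_model.py above.

    # Minimal sketch of the GPU mixed-precision pattern, assuming the MindSpore API
    # used elsewhere in this patch (context, nn.LayerNorm, P.Cast, mstype).
    import mindspore.nn as nn
    import mindspore.common.dtype as mstype
    from mindspore import context
    from mindspore.ops import operations as P
    from mindspore.common.initializer import TruncatedNormal

    class OutputNormSketch(nn.Cell):
        """Hypothetical cell: dense/add in compute_type, LayerNorm in float32 on GPU."""
        def __init__(self, channels, compute_type=mstype.float16):
            super(OutputNormSketch, self).__init__()
            self.is_gpu = context.get_context('device_target') == "GPU"
            self.compute_type = compute_type
            self.dense = nn.Dense(channels, channels,
                                  weight_init=TruncatedNormal(0.02)).to_float(compute_type)
            self.add = P.TensorAdd()
            # Keep LayerNorm in float32 on GPU; elsewhere it follows compute_type.
            norm_type = mstype.float32 if self.is_gpu else compute_type
            self.layernorm = nn.LayerNorm((channels,)).to_float(norm_type)
            self.cast = P.Cast()

        def construct(self, input_tensor):
            output = self.dense(input_tensor)
            output = self.add(input_tensor, output)
            output = self.layernorm(output)
            if self.is_gpu:
                # Cast back so downstream cells keep running in the configured compute type.
                output = self.cast(output, self.compute_type)
            return output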