From 2353756515773edcdc6af0b0f2c70a7eea2332e0 Mon Sep 17 00:00:00 2001 From: alouhahaha Date: Tue, 17 Nov 2020 15:34:25 +0800 Subject: [PATCH] BugFix for GPT --- .../official/nlp/gpt/scripts/run_distribute_train.sh | 4 ++-- model_zoo/official/nlp/gpt/src/gpt.py | 11 ++++------- 2 files changed, 6 insertions(+), 9 deletions(-) diff --git a/model_zoo/official/nlp/gpt/scripts/run_distribute_train.sh b/model_zoo/official/nlp/gpt/scripts/run_distribute_train.sh index a97b8aeaaa9..49174e79a4d 100644 --- a/model_zoo/official/nlp/gpt/scripts/run_distribute_train.sh +++ b/model_zoo/official/nlp/gpt/scripts/run_distribute_train.sh @@ -21,13 +21,13 @@ echo "for example: bash run_distributed_pretrain_ascend.sh /path/dataset /path/h echo "It is better to use absolute path." echo "==============================================================================================================" -ROOT_PATH='pwd' +ROOT_PATH=`pwd` DATA_DIR=$1 export RANK_TABLE_FILE=$2 RANK_SIZE=$3 -for((i=0;i<=${RANK_SIZE};i++)); +for((i=0;i<${RANK_SIZE};i++)); do rm ${ROOT_PATH}/device$i/ -rf mkdir ${ROOT_PATH}/device$i diff --git a/model_zoo/official/nlp/gpt/src/gpt.py b/model_zoo/official/nlp/gpt/src/gpt.py index 016f055f428..fb00d35aa67 100644 --- a/model_zoo/official/nlp/gpt/src/gpt.py +++ b/model_zoo/official/nlp/gpt/src/gpt.py @@ -21,7 +21,7 @@ import mindspore.nn as nn from mindspore.common.tensor import Tensor from mindspore.common.parameter import Parameter import mindspore.common.dtype as mstype -from mindspore.common.initializer import TruncatedNormal, initializer +from mindspore.common.initializer import TruncatedNormal, initializer, Normal from mindspore.ops import operations as P from mindspore.ops import functional as F @@ -48,7 +48,7 @@ class LayerNorm(nn.Cell): def construct(self, x): mean = self.mean(x, -1) - variance = self.mean(F.square(x - mean)) + variance = self.mean(F.square(x - mean), -1) output = (x - mean) / F.sqrt(variance + self.eps) rescaled_output = output * self.gamma + self.beta return rescaled_output @@ -100,10 +100,8 @@ class Mapping(nn.Cell): super(Mapping, self).__init__() self.output_size = output_size self.input_size = input_size - weight = np.random.normal(loc=0.0, scale=0.02*scale, size=(input_size, output_size)) - bias = np.zeros(shape=(output_size,)) - self.weight = Parameter(Tensor(weight, mstype.float32), name="mapping_weight") - self.bias = Parameter(Tensor(bias, mstype.float32), name="mapping_bias") + self.weight = Parameter(initializer(Normal(sigma=0.02*scale), [input_size, output_size]), name="mapping_weight") + self.bias = Parameter(initializer("zeros", [output_size,]), name="mapping_bias") self.dtype = dtype self.cast = P.Cast() @@ -363,7 +361,6 @@ class Block(nn.Cell): """ def __init__(self, config, layer_idx): super(Block, self).__init__() - scale = 1 / math.sqrt(2.0*layer_idx) self.layernorm1 = LayerNorm((config.embedding_size,)).to_float(config.compute_dtype) self.attention = Attention(config, scale, layer_idx) self.layernorm2 = LayerNorm((config.embedding_size,)).to_float(config.compute_dtype)