!8704 BugFix for GPT

From: @alouhahahahaha
Reviewed-by: @stsuteng, @yingjy
Signed-off-by: @stsuteng
mindspore-ci-bot authored 2020-11-17 21:31:32 +08:00, committed by Gitee
commit 78fc3e722f
2 changed files with 6 additions and 9 deletions

@@ -21,13 +21,13 @@ echo "for example: bash run_distributed_pretrain_ascend.sh /path/dataset /path/h
 echo "It is better to use absolute path."
 echo "=============================================================================================================="
-ROOT_PATH='pwd'
+ROOT_PATH=`pwd`
 DATA_DIR=$1
 export RANK_TABLE_FILE=$2
 RANK_SIZE=$3
-for((i=0;i<=${RANK_SIZE};i++));
+for((i=0;i<${RANK_SIZE};i++));
 do
     rm ${ROOT_PATH}/device$i/ -rf
     mkdir ${ROOT_PATH}/device$i
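
Two fixes in this hunk: single quotes made ROOT_PATH the literal string "pwd" rather than the working directory (backticks perform command substitution), and the inclusive loop bound created RANK_SIZE+1 device directories instead of RANK_SIZE. A minimal Python sketch of the intended behavior, assuming directories device0 .. device{RANK_SIZE-1}; the function name and layout are illustrative only:

```python
import os
import shutil

# Illustrative equivalent of the fixed shell lines: resolve the real working
# directory (the old ROOT_PATH='pwd' stored the literal text "pwd") and create
# exactly rank_size directories, device0 .. device{rank_size-1} (the old
# i<=RANK_SIZE bound created one extra).
def prepare_device_dirs(rank_size: int) -> None:
    root_path = os.getcwd()                  # ROOT_PATH=`pwd`
    for i in range(rank_size):               # for((i=0;i<RANK_SIZE;i++))
        device_dir = os.path.join(root_path, f"device{i}")
        shutil.rmtree(device_dir, ignore_errors=True)   # rm ${ROOT_PATH}/device$i/ -rf
        os.makedirs(device_dir)                         # mkdir ${ROOT_PATH}/device$i
```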

@@ -21,7 +21,7 @@ import mindspore.nn as nn
 from mindspore.common.tensor import Tensor
 from mindspore.common.parameter import Parameter
 import mindspore.common.dtype as mstype
-from mindspore.common.initializer import TruncatedNormal, initializer
+from mindspore.common.initializer import TruncatedNormal, initializer, Normal
 from mindspore.ops import operations as P
 from mindspore.ops import functional as F
@@ -48,7 +48,7 @@ class LayerNorm(nn.Cell):
     def construct(self, x):
         mean = self.mean(x, -1)
-        variance = self.mean(F.square(x - mean))
+        variance = self.mean(F.square(x - mean), -1)
         output = (x - mean) / F.sqrt(variance + self.eps)
         rescaled_output = output * self.gamma + self.beta
         return rescaled_output
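
The fix passes the reduction axis explicitly, so the variance, like the mean, is computed over the last (feature) axis. A minimal NumPy sketch of the normalization the corrected construct performs; eps and the gamma/beta shapes here are illustrative:

```python
import numpy as np

# Layer normalization with both statistics reduced over the last axis,
# keeping dims so they broadcast back against x. gamma and beta are assumed
# to match the feature dimension.
def layer_norm(x, gamma, beta, eps=1e-5):
    mean = x.mean(axis=-1, keepdims=True)
    variance = np.square(x - mean).mean(axis=-1, keepdims=True)
    output = (x - mean) / np.sqrt(variance + eps)
    return output * gamma + beta
```
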
@@ -100,10 +100,8 @@ class Mapping(nn.Cell):
         super(Mapping, self).__init__()
         self.output_size = output_size
         self.input_size = input_size
-        weight = np.random.normal(loc=0.0, scale=0.02*scale, size=(input_size, output_size))
-        bias = np.zeros(shape=(output_size,))
-        self.weight = Parameter(Tensor(weight, mstype.float32), name="mapping_weight")
-        self.bias = Parameter(Tensor(bias, mstype.float32), name="mapping_bias")
+        self.weight = Parameter(initializer(Normal(sigma=0.02*scale), [input_size, output_size]), name="mapping_weight")
+        self.bias = Parameter(initializer("zeros", [output_size,]), name="mapping_bias")
         self.dtype = dtype
         self.cast = P.Cast()
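
The eagerly materialized NumPy arrays are replaced by MindSpore initializer descriptors; assuming Normal(sigma=s) draws from N(0, s^2), as the removed np.random.normal call did, the parameter distributions are unchanged. A NumPy-only sketch of those distributions; the helper name is illustrative:

```python
import numpy as np

# Illustrative view of what the initializer-based parameters sample:
# weight ~ N(0, (0.02*scale)^2) and bias = 0. This assumes MindSpore's
# Normal(sigma=s) uses s as the standard deviation, matching the removed
# np.random.normal(loc=0.0, scale=0.02*scale, ...) call.
def mapping_init(input_size: int, output_size: int, scale: float = 1.0):
    sigma = 0.02 * scale
    weight = np.random.normal(0.0, sigma, size=(input_size, output_size)).astype(np.float32)
    bias = np.zeros((output_size,), dtype=np.float32)
    return weight, bias
```
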
@@ -363,7 +361,6 @@ class Block(nn.Cell):
     """
     def __init__(self, config, layer_idx):
         super(Block, self).__init__()
-        scale = 1 / math.sqrt(2.0*layer_idx)
         self.layernorm1 = LayerNorm((config.embedding_size,)).to_float(config.compute_dtype)
         self.attention = Attention(config, scale, layer_idx)
         self.layernorm2 = LayerNorm((config.embedding_size,)).to_float(config.compute_dtype)
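
For reference, the deleted line computed a per-layer scaling factor in the GPT-2 style of shrinking residual-branch initialization with depth; the Attention(...) call above still takes scale, so where it is defined after this change is not visible in this hunk. A small illustrative helper, assuming layer_idx counts from 1:

```python
import math

# Illustrative only: the value the removed line assigned to `scale`.
# Deeper layers get a smaller factor, presumably folded into the
# 0.02*scale standard deviation used by Mapping above.
def residual_scale(layer_idx: int) -> float:
    return 1.0 / math.sqrt(2.0 * layer_idx)
```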