diff --git a/mindspore/python/mindspore/nn/transformer/moe.py b/mindspore/python/mindspore/nn/transformer/moe.py
index d78beab3c7c..580b88f8024 100644
--- a/mindspore/python/mindspore/nn/transformer/moe.py
+++ b/mindspore/python/mindspore/nn/transformer/moe.py
@@ -417,6 +417,7 @@ class TopkRouter(Cell):
             self.div1 = P.RealDiv()
             self.div2 = P.RealDiv()
             self.add = P.Add()
+            self.add1 = P.Add()
             self.add2 = P.Add()
             self.add3 = P.Add()
             self.add4 = P.Add()
@@ -453,8 +454,8 @@ class TopkRouter(Cell):
             self.reduce_mean2 = P.ReduceMean(keep_dims=False).shard(((dp, 1, 1),))
             self.reduce_mean3 = P.ReduceMean(keep_dims=False).shard(((dp, 1),))
             self.mul = P.Mul().shard(((dp, 1), (dp, 1)))
-            self.mul2 = P.Mul().shard(((1,), ()))
-            self.mul3 = P.Mul().shard(((1,), ()))
+            self.mul2 = P.Mul().shard(((), ()))
+            self.mul3 = P.Mul().shard(((), ()))
             self.mul4 = P.Mul().shard(((dp, 1, 1), (dp, 1, 1)))
             self.mul5 = P.Mul().shard(((dp, 1, 1), (dp, 1, 1)))
             self.mul6 = P.Mul().shard(((dp, 1), (dp, 1)))
@@ -465,6 +466,7 @@ class TopkRouter(Cell):
             self.div1 = P.RealDiv().shard(((dp, 1, 1), (dp, 1, 1)))
             self.div2 = P.RealDiv().shard(((dp, 1, 1, 1), (dp, 1, 1, 1)))
             self.add = P.Add().shard(((dp, 1, 1), (dp, 1, 1)))
+            self.add1 = P.Add().shard(((dp, 1, 1), ()))
             self.add2 = P.Add().shard(((dp, 1, 1, 1), (dp, 1, 1, 1)))
             self.add3 = P.Add().shard(((dp, 1), (dp, 1)))
             self.add4 = P.Add().shard(((dp, 1, 1, 1), ()))
@@ -537,7 +539,7 @@ class TopkRouter(Cell):
             # expert_mask's shape: (dp_group, tokens_per_group, self.expert_dim)
             expert_mask = self.onehot(expert_index, self.expert_dim, self.on_value, self.off_value)
             # renormalize the rest prob to be of sum 1
-            router_prob_normal = self.div1(router_prob, self.add(self.reduce_sum_keep(router_prob, -1), 1e-9))
+            router_prob_normal = self.div1(router_prob, self.add1(self.reduce_sum_keep(router_prob, -1), 1e-9))
 
             # the balance loss is computed at each routing step
             loss += self._auxiliary_loss(expert_mask, router_prob_normal)
diff --git a/tests/ut/python/parallel/test_parallel_moe.py b/tests/ut/python/parallel/test_parallel_moe.py
index 653284126f3..b127b6da5ec 100644
--- a/tests/ut/python/parallel/test_parallel_moe.py
+++ b/tests/ut/python/parallel/test_parallel_moe.py
@@ -66,6 +66,20 @@ class NetWithLossFiveInputs(nn.Cell):
         return self.loss(predict)
 
 
+class NetWithLossMoe(nn.Cell):
+    def __init__(self, network):
+        super(NetWithLossMoe, self).__init__()
+        self.network = network
+        self.add = P.Add().shard(((), ()))
+        self.reduce_mean = P.ReduceMean(keep_dims=False).shard(((1, 1),))
+
+    def construct(self, x1, x2, x3, x4, x5):
+        predict, _, _, moe_loss = self.network(x1, x2, x3, x4, x5)
+        predict = P.Reshape()(predict, (-1, 1))
+        predict = self.reduce_mean(predict)
+        return self.add(predict, moe_loss)
+
+
 def test_transformer_model():
     """
     Feature: Test Transformer+MoE, with All2All enabled.
@@ -91,7 +105,7 @@ def test_transformer_model():
     decoder_input_value = Tensor(np.ones((2, 10, 64)), mstype.float32)
     decoder_input_mask = Tensor(np.ones((2, 10, 10)), mstype.float16)
     memory_mask = Tensor(np.ones((2, 10, 20)), mstype.float16)
-    net = NetWithLossFiveInputs(net)
+    net = NetWithLossMoe(net)
     params = net.trainable_params()
     optimizer = AdamWeightDecay(params)
     dataset = Dataset(encoder_input_value, encoder_input_mask, decoder_input_value, decoder_input_mask,