Add checks for exceptional values in MoE

Xiaoda Zhang 2021-12-02 15:12:16 +08:00
parent 3be3c7cdfd
commit 9e1d095c99
2 changed files with 20 additions and 10 deletions

View File

@@ -41,8 +41,8 @@ class MoEConfig:
             which is >=1.0. Default: 1.1.
         aux_loss_factor (float): The factor is used to indicate how much the load balance loss (produced by the
             router) to be added to the entire model loss, which is < 1.0. Default: 0.05.
-        num_experts_chosen (int): The number of experts is chosen by each token. This value should be less
-            than or equal to 'expert_num'. Default: 1.
+        num_experts_chosen (int): The number of experts is chosen by each token. Since only 'Top1' routing policy
+            is supported currently, the value should be 1. Default: 1.
 
     Supported Platforms:
         ``Ascend`` ``GPU``
@ -62,10 +62,9 @@ class MoEConfig:
if aux_loss_factor >= 1.0: if aux_loss_factor >= 1.0:
raise ValueError(f"'aux_loss_factor' should be less than 1.0, " raise ValueError(f"'aux_loss_factor' should be less than 1.0, "
f"but got {aux_loss_factor}.") f"but got {aux_loss_factor}.")
if num_experts_chosen > expert_num: if num_experts_chosen != 1:
raise ValueError(f"'num_experts_chosen' should be less than or equal to 'expert_num', " raise ValueError(f"'num_experts_chosen' should be 1. Since only 'Top1' routing policy supported currently, "
f"but got {num_experts_chosen} for 'num_experts_chosen', " f"the value should be 1.")
f"and {expert_num} for 'expert_num'.")
self.expert_num = expert_num self.expert_num = expert_num
self.capacity_factor = capacity_factor self.capacity_factor = capacity_factor
self.aux_loss_factor = aux_loss_factor self.aux_loss_factor = aux_loss_factor
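
For context, a minimal usage sketch of the tightened check; the import path is an assumption (the changed file names are not shown in this view), while the keyword names come from the diff above:

from mindspore.nn.transformer import MoEConfig   # import path assumed, not shown in this diff

MoEConfig(expert_num=4, num_experts_chosen=1)   # accepted
MoEConfig(expert_num=4, num_experts_chosen=2)   # was only compared against expert_num before; now raises
                                                # ValueError, since only the 'Top1' routing policy is supported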
@@ -73,7 +72,15 @@ class MoEConfig:
 default_moe_config = MoEConfig()
 
+def _check_moe_config(moe_config=None, parallel_config=None):
+    if not isinstance(moe_config, MoEConfig):
+        raise TypeError(f"'moe_config' should be an instance of MoEConfig, but got {type(moe_config).__name__}.")
+    use_moe = (moe_config.expert_num > 1)
+    if use_moe and moe_config.expert_num % parallel_config.data_parallel != 0:
+        raise ValueError(f"When using MoE, the 'expert_num' in {type(moe_config).__name__} must be a multiple "
+                         f"of 'data_parallel' value in {type(parallel_config).__name__}, but got "
+                         f"{moe_config.expert_num} for 'expert_num' and {parallel_config.data_parallel} for "
+                         f"'data_parallel'.")
+
 @constexpr
 def calculate_expert_capacity(k, tokens_per_device, capacity_factor, expert_dim):
     return math.ceil(k * tokens_per_device * capacity_factor / expert_dim)
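
To illustrate the divisibility rule enforced by the new _check_moe_config helper, here is a hedged sketch; _StubParallelConfig is a stand-in defined purely for illustration, since the real call sites pass the layer's parallel_config object:

class _StubParallelConfig:
    """Illustration-only stand-in exposing the data_parallel attribute the helper reads."""
    def __init__(self, data_parallel):
        self.data_parallel = data_parallel

_check_moe_config(MoEConfig(expert_num=8), _StubParallelConfig(data_parallel=4))  # 8 % 4 == 0, passes
_check_moe_config(MoEConfig(expert_num=1), _StubParallelConfig(data_parallel=4))  # MoE disabled, passes
_check_moe_config(MoEConfig(expert_num=6), _StubParallelConfig(data_parallel=4))  # 6 % 4 != 0, raises ValueError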

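A worked example of the capacity formula in calculate_expert_capacity above, with arbitrary numbers:

import math

# capacity = ceil(k * tokens_per_device * capacity_factor / expert_dim)
# k=1, tokens_per_device=1024, capacity_factor=1.1, expert_dim=8 -> ceil(140.8) = 141 tokens per expert
assert math.ceil(1 * 1024 * 1.1 / 8) == 141
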
View File

@@ -35,7 +35,7 @@ from .layers import _LayerNorm, _Linear, _check_input_shape, \
     _args_type_validator_check, _valid_type_checks, _valid_value_checks, \
     _check_shape_equal, _check_past_none_input_none, _check_input_dtype, _check_input_shape_value
 from .op_parallel_config import default_dpmp_config, _PipeLineConfig, OpParallelConfig, _Config, _check_config
-from .moe import default_moe_config, MoE
+from .moe import default_moe_config, MoE, _check_moe_config
 
 __all__ = [
     "AttentionMask",
@@ -1304,6 +1304,7 @@ class TransformerEncoderLayer(Cell):
                                       param_init_type=param_init_type,
                                       use_past=use_past,
                                       parallel_config=parallel_config)
+        _check_moe_config(moe_config, parallel_config)
         self.use_moe = (moe_config.expert_num > 1)
         if self.use_moe is True:
             self.output = MoE(hidden_size=hidden_size,
@@ -1625,6 +1626,7 @@ class TransformerDecoderLayer(Cell):
         self.cross_attention_layernorm = _LayerNorm((hidden_size,)).to_float(
             layernorm_compute_type)
         self.cross_attention_layernorm.shard(((parallel_config.data_parallel, 1),))
+        _check_moe_config(moe_config, parallel_config)
         self.use_moe = (moe_config.expert_num > 1)
         if self.use_moe is True:
             self.output = MoE(hidden_size=hidden_size,
@@ -2004,7 +2006,7 @@ class TransformerEncoder(Cell):
                  parallel_config=default_transformer_config):
         super(TransformerEncoder, self).__init__()
         _check_config(parallel_config)
+        _check_moe_config(moe_config, parallel_config)
         self.use_moe = (moe_config.expert_num > 1)
         self.add = P.Add().shard(((), ()))
         self.aux_loss = Tensor(0.0, mstype.float32)
@@ -2205,6 +2207,7 @@ class TransformerDecoder(Cell):
             raise RuntimeError(f"The {self.cls_name} does not support auto parallel mode now.")
         self.num_layers = num_layers
         self.blocks = nn.CellList()
+        _check_moe_config(moe_config, parallel_config)
         self.use_moe = (moe_config.expert_num > 1)
         for i in range(num_layers):
             block = TransformerDecoderLayer(hidden_size=hidden_size,
@@ -2433,7 +2436,7 @@ class Transformer(Cell):
         # The shard setting of Transformer is set within the TransformerEncoderLayer
         if not lambda_func:
             lambda_func = _get_lambda_func(total_layer=encoder_layers + decoder_layers)
+        _check_moe_config(moe_config, parallel_config)
         self.use_moe = (moe_config.expert_num > 1)
         self.add = P.Add().shard(((), ()))
         self.aux_loss = Tensor(0.0, mstype.float32)
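
With these call sites in place, an inconsistent MoE setup fails when the layer is constructed rather than deep inside execution. A rough sketch of what that looks like, assuming the usual TransformerEncoder keyword arguments and that TransformerOpParallelConfig is the parallel_config type in use (both assumptions; neither is shown in this diff):

from mindspore.nn.transformer import TransformerEncoder, TransformerOpParallelConfig, MoEConfig  # paths assumed

# expert_num=6 is not a multiple of data_parallel=4, so construction is expected to raise
# the ValueError added by _check_moe_config.
encoder = TransformerEncoder(batch_size=8, num_layers=2, hidden_size=64, ffn_hidden_size=256,
                             seq_length=16, num_heads=2,
                             moe_config=MoEConfig(expert_num=6),
                             parallel_config=TransformerOpParallelConfig(data_parallel=4, model_parallel=1))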