adding check for exceptional values in moe
parent 3be3c7cdfd
commit 9e1d095c99
@@ -41,8 +41,8 @@ class MoEConfig:
             which is >=1.0. Default: 1.1.
         aux_loss_factor (float): The factor is used to indicate how much the load balance loss (produced by the
             router) to be added to the entire model loss, which is < 1.0. Default: 0.05.
-        num_experts_chosen (int): The number of experts is chosen by each token. This value should be less
-            than or equal to 'expert_num'. Default: 1.
+        num_experts_chosen (int): The number of experts chosen by each token. Since only the 'Top1' routing policy
+            is supported currently, the value should be 1. Default: 1.

     Supported Platforms:
         ``Ascend`` ``GPU``
@@ -62,10 +62,9 @@ class MoEConfig:
         if aux_loss_factor >= 1.0:
             raise ValueError(f"'aux_loss_factor' should be less than 1.0, "
                              f"but got {aux_loss_factor}.")
-        if num_experts_chosen > expert_num:
-            raise ValueError(f"'num_experts_chosen' should be less than or equal to 'expert_num', "
-                             f"but got {num_experts_chosen} for 'num_experts_chosen', "
-                             f"and {expert_num} for 'expert_num'.")
+        if num_experts_chosen != 1:
+            raise ValueError(f"Since only the 'Top1' routing policy is supported currently, "
+                             f"'num_experts_chosen' should be 1, but got {num_experts_chosen}.")
         self.expert_num = expert_num
         self.capacity_factor = capacity_factor
         self.aux_loss_factor = aux_loss_factor
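A minimal standalone sketch of the two value checks above, for illustration only; the class below is a hypothetical stand-in that mirrors the validation logic in this hunk, not the real MindSpore MoEConfig:

class _MoEConfigSketch:
    """Illustrative mirror of the MoEConfig value checks shown in the hunk above."""
    def __init__(self, expert_num=1, capacity_factor=1.1, aux_loss_factor=0.05, num_experts_chosen=1):
        if aux_loss_factor >= 1.0:
            raise ValueError(f"'aux_loss_factor' should be less than 1.0, but got {aux_loss_factor}.")
        if num_experts_chosen != 1:
            # New behaviour added by this commit: only Top1 routing is supported.
            raise ValueError(f"Since only the 'Top1' routing policy is supported currently, "
                             f"'num_experts_chosen' should be 1, but got {num_experts_chosen}.")
        self.expert_num = expert_num
        self.capacity_factor = capacity_factor
        self.aux_loss_factor = aux_loss_factor
        self.num_experts_chosen = num_experts_chosen

_MoEConfigSketch(expert_num=4)                            # accepted
try:
    _MoEConfigSketch(expert_num=4, num_experts_chosen=2)  # rejected by the new check
except ValueError as err:
    print(err)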
@@ -73,7 +72,15 @@ class MoEConfig:

 default_moe_config = MoEConfig()


+def _check_moe_config(moe_config=None, parallel_config=None):
+    if not isinstance(moe_config, MoEConfig):
+        raise TypeError(f"'moe_config' should be an instance of MoEConfig, but got {type(moe_config).__name__}.")
+    use_moe = (moe_config.expert_num > 1)
+    if use_moe and moe_config.expert_num % parallel_config.data_parallel != 0:
+        raise ValueError(f"When using MoE, the 'expert_num' in {type(moe_config).__name__} must be a multiple "
+                         f"of 'data_parallel' value in {type(parallel_config).__name__}, but got "
+                         f"{moe_config.expert_num} for 'expert_num' and {parallel_config.data_parallel} for "
+                         f"'data_parallel'.")
+
+
 @constexpr
 def calculate_expert_capacity(k, tokens_per_device, capacity_factor, expert_dim):
     return math.ceil(k * tokens_per_device * capacity_factor / expert_dim)
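To make the divisibility rule and the capacity formula above concrete, here is a self-contained sketch with example numbers; the names below are illustrative stand-ins, and only the arithmetic mirrors the helpers shown in this hunk:

import math

def check_moe_config_sketch(expert_num, data_parallel):
    # Mirrors the new rule in _check_moe_config: when MoE is enabled
    # (expert_num > 1), expert_num must divide evenly across the data-parallel devices.
    if expert_num > 1 and expert_num % data_parallel != 0:
        raise ValueError(f"'expert_num' ({expert_num}) must be a multiple of 'data_parallel' ({data_parallel}).")

def expert_capacity_sketch(k, tokens_per_device, capacity_factor, expert_dim):
    # Same arithmetic as calculate_expert_capacity above.
    return math.ceil(k * tokens_per_device * capacity_factor / expert_dim)

check_moe_config_sketch(expert_num=8, data_parallel=4)    # passes: 8 % 4 == 0
# check_moe_config_sketch(expert_num=6, data_parallel=4)  # would raise ValueError

# Top-1 routing (k=1), 1024 tokens per device, capacity factor 1.1, 8 experts:
print(expert_capacity_sketch(1, 1024, 1.1, 8))            # ceil(140.8) -> 141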
@@ -35,7 +35,7 @@ from .layers import _LayerNorm, _Linear, _check_input_shape, \
     _args_type_validator_check, _valid_type_checks, _valid_value_checks, \
     _check_shape_equal, _check_past_none_input_none, _check_input_dtype, _check_input_shape_value
 from .op_parallel_config import default_dpmp_config, _PipeLineConfig, OpParallelConfig, _Config, _check_config
-from .moe import default_moe_config, MoE
+from .moe import default_moe_config, MoE, _check_moe_config

 __all__ = [
     "AttentionMask",
@@ -1304,6 +1304,7 @@ class TransformerEncoderLayer(Cell):
                                   param_init_type=param_init_type,
                                   use_past=use_past,
                                   parallel_config=parallel_config)
+        _check_moe_config(moe_config, parallel_config)
         self.use_moe = (moe_config.expert_num > 1)
         if self.use_moe is True:
             self.output = MoE(hidden_size=hidden_size,
@@ -1625,6 +1626,7 @@ class TransformerDecoderLayer(Cell):
         self.cross_attention_layernorm = _LayerNorm((hidden_size,)).to_float(
             layernorm_compute_type)
         self.cross_attention_layernorm.shard(((parallel_config.data_parallel, 1),))
+        _check_moe_config(moe_config, parallel_config)
         self.use_moe = (moe_config.expert_num > 1)
         if self.use_moe is True:
             self.output = MoE(hidden_size=hidden_size,
@@ -2004,7 +2006,7 @@ class TransformerEncoder(Cell):
                  parallel_config=default_transformer_config):
         super(TransformerEncoder, self).__init__()
         _check_config(parallel_config)
-
+        _check_moe_config(moe_config, parallel_config)
         self.use_moe = (moe_config.expert_num > 1)
         self.add = P.Add().shard(((), ()))
         self.aux_loss = Tensor(0.0, mstype.float32)
@@ -2205,6 +2207,7 @@ class TransformerDecoder(Cell):
             raise RuntimeError(f"The {self.cls_name} does not support auto parallel mode now.")
         self.num_layers = num_layers
         self.blocks = nn.CellList()
+        _check_moe_config(moe_config, parallel_config)
         self.use_moe = (moe_config.expert_num > 1)
         for i in range(num_layers):
             block = TransformerDecoderLayer(hidden_size=hidden_size,
@@ -2433,7 +2436,7 @@ class Transformer(Cell):
         # The shard setting of Transformer is set within the TransformerEncoderLayer
         if not lambda_func:
             lambda_func = _get_lambda_func(total_layer=encoder_layers + decoder_layers)
-
+        _check_moe_config(moe_config, parallel_config)
         self.use_moe = (moe_config.expert_num > 1)
         self.add = P.Add().shard(((), ()))
         self.aux_loss = Tensor(0.0, mstype.float32)
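All five constructor hunks above apply the same pattern: _check_moe_config runs before any MoE cell is created, so an inconsistent configuration fails at construction time rather than mid-training. A self-contained sketch of that pattern follows; the function and the SimpleNamespace configs are illustrative stand-ins, not the real cells:

from types import SimpleNamespace

def build_feed_forward_sketch(moe_config, parallel_config):
    # Validate first, as the patched constructors now do via _check_moe_config.
    if moe_config.expert_num > 1 and moe_config.expert_num % parallel_config.data_parallel != 0:
        raise ValueError("When using MoE, 'expert_num' must be a multiple of 'data_parallel'.")
    use_moe = moe_config.expert_num > 1
    # The real layers build an MoE cell here when use_moe is True, otherwise a dense feed-forward.
    return "MoE feed-forward" if use_moe else "dense feed-forward"

print(build_feed_forward_sketch(SimpleNamespace(expert_num=8),
                                SimpleNamespace(data_parallel=4)))  # MoE feed-forward
print(build_feed_forward_sketch(SimpleNamespace(expert_num=1),
                                SimpleNamespace(data_parallel=4)))  # dense feed-forward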