adding check for exceptional values in moe
commit 9e1d095c99 (parent 3be3c7cdfd)
@@ -41,8 +41,8 @@ class MoEConfig:
             which is >=1.0. Default: 1.1.
         aux_loss_factor (float): The factor is used to indicate how much the load balance loss (produced by the
             router) to be added to the entire model loss, which is < 1.0. Default: 0.05.
-        num_experts_chosen (int): The number of experts is chosen by each token. This value should be less
-            than or equal to 'expert_num'. Default: 1.
+        num_experts_chosen (int): The number of experts is chosen by each token. Since only 'Top1' routing policy
+            is supported currently, the value should be 1. Default: 1.
     Supported Platforms:
         ``Ascend`` ``GPU``
 
@@ -62,10 +62,9 @@ class MoEConfig:
         if aux_loss_factor >= 1.0:
             raise ValueError(f"'aux_loss_factor' should be less than 1.0, "
                              f"but got {aux_loss_factor}.")
-        if num_experts_chosen > expert_num:
-            raise ValueError(f"'num_experts_chosen' should be less than or equal to 'expert_num', "
-                             f"but got {num_experts_chosen} for 'num_experts_chosen', "
-                             f"and {expert_num} for 'expert_num'.")
+        if num_experts_chosen != 1:
+            raise ValueError(f"'num_experts_chosen' should be 1. Since only 'Top1' routing policy supported currently, "
+                             f"the value should be 1.")
         self.expert_num = expert_num
         self.capacity_factor = capacity_factor
         self.aux_loss_factor = aux_loss_factor
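The constructor guard above is tightened from rejecting only num_experts_chosen > expert_num to rejecting anything other than 1, matching the 'Top1'-only routing note added to the docstring. A minimal standalone sketch of the new rule (plain Python for illustration; the helper name is made up and this is not the MindSpore MoEConfig class itself):

# Standalone sketch of the tightened guard; illustration only.
def check_num_experts_chosen(num_experts_chosen, expert_num):
    # The old rule only rejected num_experts_chosen > expert_num; the new rule
    # pins the value to 1 because only the 'Top1' routing policy is supported.
    if num_experts_chosen != 1:
        raise ValueError(f"'num_experts_chosen' should be 1, but got {num_experts_chosen}.")

check_num_experts_chosen(1, expert_num=4)      # accepted, as before
# check_num_experts_chosen(2, expert_num=4)    # accepted before this commit, ValueError now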
@@ -73,7 +72,15 @@ class MoEConfig:
 
 default_moe_config = MoEConfig()
 
+def _check_moe_config(moe_config=None, parallel_config=None):
+    if not isinstance(moe_config, MoEConfig):
+        raise TypeError(f"'moe_config' should be an instance of MoEConfig, but got {type(moe_config).__name__}.")
+    use_moe = (moe_config.expert_num > 1)
+    if use_moe and moe_config.expert_num % parallel_config.data_parallel != 0:
+        raise ValueError(f"When using MoE, the 'expert_num' in {type(moe_config).__name__} must be a multiple "
+                         f"of 'data_parallel' value in {type(parallel_config).__name__}, but got "
+                         f"{moe_config.expert_num} for 'expert_num' and {parallel_config.data_parallel} for "
+                         f"'data_parallel'.")
+
 @constexpr
 def calculate_expert_capacity(k, tokens_per_device, capacity_factor, expert_dim):
     return math.ceil(k * tokens_per_device * capacity_factor / expert_dim)
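The new _check_moe_config rejects a non-MoEConfig argument and, when MoE is actually in use (expert_num > 1), requires expert_num to be a multiple of data_parallel so experts split evenly across the data-parallel group; calculate_expert_capacity (unchanged context above) then derives the per-expert capacity. A small self-contained sketch of both rules, using plain functions and example numbers rather than the real config classes:

import math

# Self-contained illustration; the function names and literal numbers are
# examples, not part of the commit.
def expert_num_is_valid(expert_num, data_parallel):
    # Mirrors _check_moe_config: the divisibility rule only applies when MoE is used.
    use_moe = expert_num > 1
    return (not use_moe) or (expert_num % data_parallel == 0)

def expert_capacity(k, tokens_per_device, capacity_factor, expert_dim):
    # Same formula as calculate_expert_capacity.
    return math.ceil(k * tokens_per_device * capacity_factor / expert_dim)

print(expert_num_is_valid(4, 2))           # True: 4 experts over data_parallel=2
print(expert_num_is_valid(4, 3))           # False: _check_moe_config would raise ValueError
print(expert_capacity(1, 1024, 1.1, 8))    # ceil(1 * 1024 * 1.1 / 8) = 141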
@@ -35,7 +35,7 @@ from .layers import _LayerNorm, _Linear, _check_input_shape, \
     _args_type_validator_check, _valid_type_checks, _valid_value_checks, \
     _check_shape_equal, _check_past_none_input_none, _check_input_dtype, _check_input_shape_value
 from .op_parallel_config import default_dpmp_config, _PipeLineConfig, OpParallelConfig, _Config, _check_config
-from .moe import default_moe_config, MoE
+from .moe import default_moe_config, MoE, _check_moe_config
 
 __all__ = [
     "AttentionMask",
@@ -1304,6 +1304,7 @@ class TransformerEncoderLayer(Cell):
                                             param_init_type=param_init_type,
                                             use_past=use_past,
                                             parallel_config=parallel_config)
+        _check_moe_config(moe_config, parallel_config)
         self.use_moe = (moe_config.expert_num > 1)
         if self.use_moe is True:
             self.output = MoE(hidden_size=hidden_size,
@@ -1625,6 +1626,7 @@ class TransformerDecoderLayer(Cell):
         self.cross_attention_layernorm = _LayerNorm((hidden_size,)).to_float(
             layernorm_compute_type)
         self.cross_attention_layernorm.shard(((parallel_config.data_parallel, 1),))
+        _check_moe_config(moe_config, parallel_config)
         self.use_moe = (moe_config.expert_num > 1)
         if self.use_moe is True:
             self.output = MoE(hidden_size=hidden_size,
@@ -2004,7 +2006,7 @@ class TransformerEncoder(Cell):
                  parallel_config=default_transformer_config):
         super(TransformerEncoder, self).__init__()
         _check_config(parallel_config)
-
+        _check_moe_config(moe_config, parallel_config)
         self.use_moe = (moe_config.expert_num > 1)
         self.add = P.Add().shard(((), ()))
         self.aux_loss = Tensor(0.0, mstype.float32)
@@ -2205,6 +2207,7 @@ class TransformerDecoder(Cell):
             raise RuntimeError(f"The {self.cls_name} does not support auto parallel mode now.")
         self.num_layers = num_layers
         self.blocks = nn.CellList()
+        _check_moe_config(moe_config, parallel_config)
         self.use_moe = (moe_config.expert_num > 1)
         for i in range(num_layers):
             block = TransformerDecoderLayer(hidden_size=hidden_size,
@@ -2433,7 +2436,7 @@ class Transformer(Cell):
         # The shard setting of Transformer is set within the TransformerEncoderLayer
         if not lambda_func:
             lambda_func = _get_lambda_func(total_layer=encoder_layers + decoder_layers)
-
+        _check_moe_config(moe_config, parallel_config)
         self.use_moe = (moe_config.expert_num > 1)
         self.add = P.Add().shard(((), ()))
         self.aux_loss = Tensor(0.0, mstype.float32)
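Taken together, the same _check_moe_config(moe_config, parallel_config) call is inserted ahead of the self.use_moe computation in TransformerEncoderLayer, TransformerDecoderLayer, TransformerEncoder, TransformerDecoder, and Transformer, so an expert count that cannot be split across the data-parallel group now fails at construction time rather than surfacing later during sharding or execution. A hedged sketch of that failure mode, with SimpleNamespace objects standing in for the real MoEConfig and parallel config (the isinstance check is omitted here because the configs are stand-ins):

from types import SimpleNamespace

def check_moe_config(moe_config, parallel_config):
    # Mirrors only the committed divisibility rule.
    use_moe = moe_config.expert_num > 1
    if use_moe and moe_config.expert_num % parallel_config.data_parallel != 0:
        raise ValueError("'expert_num' must be a multiple of 'data_parallel'")

check_moe_config(SimpleNamespace(expert_num=8), SimpleNamespace(data_parallel=4))    # passes
# check_moe_config(SimpleNamespace(expert_num=8), SimpleNamespace(data_parallel=3))  # raises at construction time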