forked from mindspore-Ecosystem/mindspore
clean codecheck for thor
This commit is contained in:
parent f5fb195f04
commit a140e9ee36
@@ -32,7 +32,7 @@ from .quant import *
from .math import *
from .combined import *
from .timedistributed import *
from .thor_layer import *
from .thor_layer import DenseThor, Conv2dThor, EmbeddingThor

__all__ = []
__all__.extend(activation.__all__)
@@ -26,10 +26,10 @@ from mindspore.nn.cell import Cell
from mindspore.nn.layer.activation import get_activation


__all__ = ['Dense_Thor', 'Conv2d_Thor', 'Embedding_Thor']
__all__ = ['DenseThor', 'Conv2dThor', 'EmbeddingThor']


class Dense_Thor(Cell):
class DenseThor(Cell):
r"""
The dense connected layer.
@@ -77,7 +77,7 @@ class Dense_Thor(Cell):
bias_init='zeros',
has_bias=True,
activation=None):
super(Dense_Thor, self).__init__()
super(DenseThor, self).__init__()
self.thor = True
self.in_channels = Validator.check_positive_int(in_channels)
self.out_channels = Validator.check_positive_int(out_channels)
@@ -100,40 +100,45 @@ class Dense_Thor(Cell):
self.activation = get_activation(activation)
self.activation_flag = self.activation is not None

self.matrix_A = Parameter(Tensor(np.zeros([in_channels, in_channels]).astype(np.float32)),
name='matrix_A', requires_grad=False)
self.matrix_a = Parameter(Tensor(np.zeros([in_channels, in_channels]).astype(np.float32)),
name='matrix_a', requires_grad=False)
self.shape = P.Shape()
self.reshape = P.Reshape()
self.transpose = P.Transpose()
self.mul = P.Mul()
self.is_Ascend = True
if context.get_context("device_target") == "Ascend":
if out_channels == 1001:
self.matrix_G = Parameter(Tensor(np.zeros([1024, 1024]).astype(np.float32)),
name='matrix_G', requires_grad=False)
self.pad = P.Pad(((0, 23), (0, 23)))
self.pad1 = P.Pad(((0, 7), (0, 7)))
self.slice = P.Slice()
self.add = P.TensorAdd()
else:
self.matrix_G = Parameter(Tensor(np.eye(out_channels).astype(np.float32)),
name="matrix_G", requires_grad=False)
self.abs = P.Abs()
self.reduce_max = P.ReduceMax(keep_dims=False)
self.neg = P.Neg()
self.reduce_sum = P.ReduceSum()
self.matmul = P.MatMul(transpose_b=True)
self.cube_matmul = P.CusMatMulCube(transpose_a=True)
self.cast = P.Cast()
self.is_nsp_layer = (out_channels == 2)
self._process_ascend_dense_thor(out_channels)
else:
self.is_Ascend = False
self.matrix_G = Parameter(Tensor(np.eye(out_channels).astype(np.float32)),
name="matrix_G", requires_grad=False)
self.matrix_g = Parameter(Tensor(np.eye(out_channels).astype(np.float32)),
name="matrix_g", requires_grad=False)
self.cube_matmul = P.MatMul(transpose_a=True)
self.getG = P.InsertGradientOf(self.save_gradient)

def _process_ascend_dense_thor(self, out_channels):
"""process ascend dense thor"""
if out_channels == 1001:
self.matrix_g = Parameter(Tensor(np.zeros([1024, 1024]).astype(np.float32)),
name='matrix_g', requires_grad=False)
self.pad = P.Pad(((0, 23), (0, 23)))
self.pad1 = P.Pad(((0, 7), (0, 7)))
self.slice = P.Slice()
self.add = P.TensorAdd()
else:
self.matrix_g = Parameter(Tensor(np.eye(out_channels).astype(np.float32)),
name="matrix_g", requires_grad=False)
self.abs = P.Abs()
self.reduce_max = P.ReduceMax(keep_dims=False)
self.neg = P.Neg()
self.reduce_sum = P.ReduceSum()
self.matmul = P.MatMul(transpose_b=True)
self.cube_matmul = P.CusMatMulCube(transpose_a=True)
self.cast = P.Cast()
self.is_nsp_layer = (out_channels == 2)

def save_gradient(self, dout):
"""
this function only for thor optimizer
@@ -144,17 +149,17 @@ class Dense_Thor(Cell):
if not self.is_nsp_layer:
shape = self.shape(dout)
normalizer = self.cast(shape[0], mstype.float32)
matrix_G = self.cube_matmul(dout, dout)
matrix_G = self.mul(matrix_G, 1.0 / normalizer)
matrix_g = self.cube_matmul(dout, dout)
matrix_g = self.mul(matrix_g, 1.0 / normalizer)
if self.out_channels == 1001:
matrix_G = P.Pad(((0, 23), (0, 23)))(matrix_G)
self.matrix_G = matrix_G
matrix_g = P.Pad(((0, 23), (0, 23)))(matrix_g)
self.matrix_g = matrix_g
else:
dout_shape = self.shape(dout)
normalizer = dout_shape[0]
matrix_G = self.cube_matmul(dout, dout)
matrix_G = self.mul(matrix_G, 1.0 / normalizer)
self.matrix_G = matrix_G
matrix_g = self.cube_matmul(dout, dout)
matrix_g = self.mul(matrix_g, 1.0 / normalizer)
self.matrix_g = matrix_g
return out

def construct(self, x):
@@ -163,14 +168,14 @@ class Dense_Thor(Cell):
inputs = self.cube_matmul(x, x)
shape = self.shape(x)
normalizer = self.cast(shape[0], mstype.float32)
matrix_A = self.mul(inputs, 1.0 / normalizer)
self.matrix_A = matrix_A
matrix_a = self.mul(inputs, 1.0 / normalizer)
self.matrix_a = matrix_a
else:
inputs = self.cube_matmul(x, x)
inputs_shape = self.shape(inputs)
normalizer = inputs_shape[0]
matrix_A = self.mul(inputs, 1.0 / normalizer)
self.matrix_A = matrix_A
matrix_a = self.mul(inputs, 1.0 / normalizer)
self.matrix_a = matrix_a
x = self.matmul(x, self.weight)
x = self.getG(x)
else:
@@ -226,19 +231,9 @@ class _Conv(Cell):
self.dilation = dilation
self.group = Validator.check_positive_int(group)
self.has_bias = has_bias
if (not isinstance(kernel_size[0], int)) or (not isinstance(kernel_size[1], int)) or \
isinstance(kernel_size[0], bool) or isinstance(kernel_size[1], bool) or \
kernel_size[0] < 1 or kernel_size[1] < 1:
raise ValueError("Attr 'kernel_size' of 'Conv2D' Op passed "
+ str(self.kernel_size) + ", should be a int or tuple and equal to or greater than 1.")
if (not isinstance(stride[0], int)) or (not isinstance(stride[1], int)) or \
isinstance(stride[0], bool) or isinstance(stride[1], bool) or stride[0] < 1 or stride[1] < 1:
raise ValueError("Attr 'stride' of 'Conv2D' Op passed "
+ str(self.stride) + ", should be a int or tuple and equal to or greater than 1.")
if (not isinstance(dilation[0], int)) or (not isinstance(dilation[1], int)) or \
isinstance(dilation[0], bool) or isinstance(dilation[1], bool) or dilation[0] < 1 or dilation[1] < 1:
raise ValueError("Attr 'dilation' of 'Conv2D' Op passed "
+ str(self.dilation) + ", should be a int or tuple and equal to or greater than 1.")
self._validate_kernel_size(kernel_size)
self._validate_stride(stride)
self._validate_dilation(dilation)
if in_channels % group != 0:
raise ValueError("Attr 'in_channels' of 'Conv2D' Op must be divisible by "
"attr 'group' of 'Conv2D' Op.")
@@ -258,12 +253,34 @@ class _Conv(Cell):
logger.warning("Value of 'has_bias' is False, value of 'bias_init' will be ignored.")
self.bias = None

def _validate_kernel_size(self, kernel_size):
"""validate kernel size."""
if (not isinstance(kernel_size[0], int)) or (not isinstance(kernel_size[1], int)) or \
isinstance(kernel_size[0], bool) or isinstance(kernel_size[1], bool) or \
kernel_size[0] < 1 or kernel_size[1] < 1:
raise ValueError("Attr 'kernel_size' of 'Conv2D' Op passed "
+ str(self.kernel_size) + ", should be a int or tuple and equal to or greater than 1.")

def _validate_stride(self, stride):
"""validate stride."""
if (not isinstance(stride[0], int)) or (not isinstance(stride[1], int)) or \
isinstance(stride[0], bool) or isinstance(stride[1], bool) or stride[0] < 1 or stride[1] < 1:
raise ValueError("Attr 'stride' of 'Conv2D' Op passed "
+ str(self.stride) + ", should be a int or tuple and equal to or greater than 1.")

def _validate_dilation(self, dilation):
"""validate dilation."""
if (not isinstance(dilation[0], int)) or (not isinstance(dilation[1], int)) or \
isinstance(dilation[0], bool) or isinstance(dilation[1], bool) or dilation[0] < 1 or dilation[1] < 1:
raise ValueError("Attr 'dilation' of 'Conv2D' Op passed "
+ str(self.dilation) + ", should be a int or tuple and equal to or greater than 1.")

def construct(self, *inputs):
"""Must be overridden by all subclasses."""
raise NotImplementedError


class Conv2d_Thor(_Conv):
class Conv2dThor(_Conv):
r"""
2D convolution layer.
@@ -370,7 +387,7 @@ class Conv2d_Thor(_Conv):
stride = twice(stride)
self._dilation = dilation
dilation = twice(dilation)
super(Conv2d_Thor, self).__init__(
super(Conv2dThor, self).__init__(
in_channels,
out_channels,
kernel_size,
@@ -395,55 +412,58 @@ class Conv2d_Thor(_Conv):

self.thor = True
self.hw = kernel_size[0] * kernel_size[1]
self.matrix_A_dim = self.in_channels * self.kernel_size[0] * self.kernel_size[1]
self.matrix_G_dim = self.out_channels
self.matrix_a_dim = self.in_channels * self.kernel_size[0] * self.kernel_size[1]
self.matrix_g_dim = self.out_channels
self.shape = P.Shape()
self.reshape = P.Reshape()
self.mul = P.Mul()
self.cast = P.Cast()
self.A_normalizer = Parameter(initializer(0, [1], mstype.float32), name="A_normalizer", requires_grad=False)
self.G_normalizer = Parameter(initializer(0, [1], mstype.float32), name="G_normalizer", requires_grad=False)
self.a_normalizer = Parameter(initializer(0, [1], mstype.float32), name="a_normalizer", requires_grad=False)
self.g_normalizer = Parameter(initializer(0, [1], mstype.float32), name="g_normalizer", requires_grad=False)
self.is_Ascend = True
if context.get_context("device_target") == "Ascend":
ksizes = (1, kernel_size[0], kernel_size[1], 1)
strides = (1, stride[0], stride[1], 1)
self.img2col = P.CusImg2Col(ksizes=ksizes, strides=strides)
self.cube_matmul = P.CusMatMulCube(transpose_a=True)
self.transpose02314 = P.CusTranspose02314()
dampingA_dim = self.matrix_A_dim
self.diag_block_dim = 128
if (self.matrix_A_dim % self.diag_block_dim) != 0 and self.matrix_A_dim > self.diag_block_dim:
dampingA_dim = (self.matrix_A_dim // self.diag_block_dim + 1) * self.diag_block_dim
dampingG_dim = self.matrix_G_dim
if (self.matrix_G_dim % self.diag_block_dim) != 0 and self.matrix_G_dim > self.diag_block_dim:
dampingG_dim = (self.matrix_G_dim // self.diag_block_dim + 1) * self.diag_block_dim
self.matrix_A_cov = Parameter(Tensor(np.zeros([dampingA_dim, dampingA_dim]).astype(np.float32)),
name='matrix_A', requires_grad=False)
self.matrix_G_cov = Parameter(Tensor(np.zeros([dampingG_dim, dampingG_dim]).astype(np.float32)),
name='matrix_G', requires_grad=False)

self.channels_slice_flag = False
self.C0 = 16
if self.in_channels % self.C0 != 0:
self.channels_slice_flag = True
self.padA_flag = False
if (self.matrix_A_dim // self.diag_block_dim) * self.diag_block_dim != self.matrix_A_dim \
and self.matrix_A_dim > self.diag_block_dim:
self.padA_flag = True
pad_dim = self.diag_block_dim - self.matrix_A_dim % self.diag_block_dim
self.padA = P.Pad(((0, pad_dim), (0, pad_dim)))
self.slice = P.Slice()
self._process_ascend_conv2d_thor(kernel_size, stride)
else:
self.is_Ascend = False
self.img2col = P.Im2Col(kernel_size=kernel_size, stride=stride, pad_mode="same")
self.matmul = P.MatMul(transpose_b=True)
self.reduce_mean = P.ReduceMean(keep_dims=False)
self.matrix_A_cov = Parameter(Tensor(np.zeros([self.matrix_A_dim, self.matrix_A_dim]).astype(np.float32)),
name='matrix_A', requires_grad=False)
self.matrix_G_cov = Parameter(Tensor(np.zeros([self.matrix_G_dim, self.matrix_G_dim]).astype(np.float32)),
name='matrix_G', requires_grad=False)
self.matrix_a_cov = Parameter(Tensor(np.zeros([self.matrix_a_dim, self.matrix_a_dim]).astype(np.float32)),
name='matrix_a', requires_grad=False)
self.matrix_g_cov = Parameter(Tensor(np.zeros([self.matrix_g_dim, self.matrix_g_dim]).astype(np.float32)),
name='matrix_g', requires_grad=False)
self.getG = P.InsertGradientOf(self.save_gradient)

def _process_ascend_conv2d_thor(self, kernel_size, stride):
"""process ascend conv2d thor"""
ksizes = (1, kernel_size[0], kernel_size[1], 1)
strides = (1, stride[0], stride[1], 1)
self.img2col = P.CusImg2Col(ksizes=ksizes, strides=strides)
self.cube_matmul = P.CusMatMulCube(transpose_a=True)
self.transpose02314 = P.CusTranspose02314()
dampinga_dim = self.matrix_a_dim
self.diag_block_dim = 128
if (self.matrix_a_dim % self.diag_block_dim) != 0 and self.matrix_a_dim > self.diag_block_dim:
dampinga_dim = (self.matrix_a_dim // self.diag_block_dim + 1) * self.diag_block_dim
dampingg_dim = self.matrix_g_dim
if (self.matrix_g_dim % self.diag_block_dim) != 0 and self.matrix_g_dim > self.diag_block_dim:
dampingg_dim = (self.matrix_g_dim // self.diag_block_dim + 1) * self.diag_block_dim
self.matrix_a_cov = Parameter(Tensor(np.zeros([dampinga_dim, dampinga_dim]).astype(np.float32)),
name='matrix_a', requires_grad=False)
self.matrix_g_cov = Parameter(Tensor(np.zeros([dampingg_dim, dampingg_dim]).astype(np.float32)),
name='matrix_g', requires_grad=False)

self.channels_slice_flag = False
self.C0 = 16
if self.in_channels % self.C0 != 0:
self.channels_slice_flag = True
self.pada_flag = False
if (self.matrix_a_dim // self.diag_block_dim) * self.diag_block_dim != self.matrix_a_dim \
and self.matrix_a_dim > self.diag_block_dim:
self.pada_flag = True
pad_dim = self.diag_block_dim - self.matrix_a_dim % self.diag_block_dim
self.pada = P.Pad(((0, pad_dim), (0, pad_dim)))
self.slice = P.Slice()

def _init_depthwise_conv2d(self, weight_init):
"""Initialize depthwise conv2d op"""
@@ -473,11 +493,11 @@ class Conv2d_Thor(_Conv):
dout = self.transpose02314(dout)
dout_shape = self.shape(dout)
normalizer = dout_shape[0]
matrix_G = self.cube_matmul(dout, dout)
matrix_g = self.cube_matmul(dout, dout)
normalizer = self.cast(normalizer, mstype.float32)
matrix_G = self.mul(matrix_G, 1.0 / normalizer)
self.G_normalizer = normalizer
self.matrix_G_cov = matrix_G
matrix_g = self.mul(matrix_g, 1.0 / normalizer)
self.g_normalizer = normalizer
self.matrix_g_cov = matrix_g
else:
dout = self.reduce_mean(dout, 0)
dout_shape = self.shape(dout)
@@ -485,43 +505,42 @@ class Conv2d_Thor(_Conv):
dout_shape = self.shape(dout)
normalizer = dout_shape[1]
dout = self.cast(dout, mstype.float32)
matrix_G = self.matmul(dout, dout)
matrix_G = self.mul(matrix_G, 1.0 / normalizer)
self.G_normalizer = normalizer
self.matrix_G_cov = matrix_G
matrix_g = self.matmul(dout, dout)
matrix_g = self.mul(matrix_g, 1.0 / normalizer)
self.g_normalizer = normalizer
self.matrix_g_cov = matrix_g
return out

def construct(self, x):
if self.thor:
matrix_A = self.img2col(x)
matrix_A_shape = self.shape(matrix_A)
matrix_a = self.img2col(x)
matrix_a_shape = self.shape(matrix_a)
if self.is_Ascend:
normalizer = matrix_A_shape[0]
matrix_A = self.cube_matmul(matrix_A, matrix_A)
normalizer = matrix_a_shape[0]
matrix_a = self.cube_matmul(matrix_a, matrix_a)
if self.channels_slice_flag:
matrix_A = self.reshape(matrix_A, (self.hw, self.C0, self.hw, self.C0))
matrix_A = self.slice(matrix_A, (0, 0, 0, 0),
matrix_a = self.reshape(matrix_a, (self.hw, self.C0, self.hw, self.C0))
matrix_a = self.slice(matrix_a, (0, 0, 0, 0),
(self.hw, self.in_channels, self.hw, self.in_channels))
matrix_A = self.reshape(matrix_A, (self.matrix_A_dim, self.matrix_A_dim))
matrix_a = self.reshape(matrix_a, (self.matrix_a_dim, self.matrix_a_dim))
normalizer = self.cast(normalizer, mstype.float32)
matrix_A = self.mul(matrix_A, 1.0 / normalizer)
if self.padA_flag:
matrix_A = self.padA(matrix_A)
self.A_normalizer = normalizer
self.matrix_A_cov = matrix_A
matrix_a = self.mul(matrix_a, 1.0 / normalizer)
if self.pada_flag:
matrix_a = self.pada(matrix_a)
self.a_normalizer = normalizer
self.matrix_a_cov = matrix_a
else:
matrix_A = self.reshape(matrix_A, (matrix_A_shape[0] * matrix_A_shape[1] * matrix_A_shape[2],
matrix_A_shape[3], -1))
matrix_A = self.reduce_mean(matrix_A, 1)
matrix_A_shape = self.shape(matrix_A)
normalizer = matrix_A_shape[1]
matrix_A = self.cast(matrix_A, mstype.float32)
matrix_A = self.matmul(matrix_A, matrix_A)
matrix_A = self.mul(matrix_A, 1.0 / normalizer)
self.A_normalizer = normalizer
self.matrix_A_cov = matrix_A
matrix_a = self.reshape(matrix_a, (matrix_a_shape[0] * matrix_a_shape[1] * matrix_a_shape[2],
matrix_a_shape[3], -1))
matrix_a = self.reduce_mean(matrix_a, 1)
matrix_a_shape = self.shape(matrix_a)
normalizer = matrix_a_shape[1]
matrix_a = self.cast(matrix_a, mstype.float32)
matrix_a = self.matmul(matrix_a, matrix_a)
matrix_a = self.mul(matrix_a, 1.0 / normalizer)
self.a_normalizer = normalizer
self.matrix_a_cov = matrix_a
output = self.conv2d(x, self.weight)
output = self.getG(output)
else:
@@ -549,7 +568,7 @@ class Conv2d_Thor(_Conv):
return s


class Embedding_Thor(Cell):
class EmbeddingThor(Cell):
r"""
A simple lookup table that stores embeddings of a fixed dictionary and size.
@@ -590,7 +609,7 @@ class Embedding_Thor(Cell):

def __init__(self, vocab_size, embedding_size, use_one_hot=False, embedding_table='normal',
dtype=mstype.float32, padding_idx=None):
super(Embedding_Thor, self).__init__()
super(EmbeddingThor, self).__init__()
self.vocab_size = Validator.check_value_type('vocab_size', vocab_size, [int], self.cls_name)
self.embedding_size = Validator.check_value_type('embedding_size', embedding_size, [int], self.cls_name)
Validator.check_value_type('use_one_hot', use_one_hot, [bool], self.cls_name)
@@ -616,10 +635,10 @@ class Embedding_Thor(Cell):
self.reshape = P.Reshape()
self.get_shp = P.Shape()
self.thor = True
self.matrix_A = Parameter(Tensor(np.zeros([vocab_size]).astype(np.float32)),
name='matrix_A', requires_grad=False)
self.matrix_G = Parameter(Tensor(np.zeros([embedding_size, embedding_size]).astype(np.float32)),
name="matrix_G", requires_grad=False)
self.matrix_a = Parameter(Tensor(np.zeros([vocab_size]).astype(np.float32)),
name='matrix_a', requires_grad=False)
self.matrix_g = Parameter(Tensor(np.zeros([embedding_size, embedding_size]).astype(np.float32)),
name="matrix_g", requires_grad=False)
self.reduce_sum = P.ReduceSum(keep_dims=False)
self.getG = P.InsertGradientOf(self.save_gradient)
self.cast = P.Cast()
@@ -638,9 +657,9 @@ class Embedding_Thor(Cell):
out = dout
shape = self.get_shp(dout)
normalizer = self.cast(shape[0], mstype.float32)
matrix_G = self.cube_matmul(dout, dout)
matrix_G = self.mul(matrix_G, 1.0 / normalizer)
self.matrix_G = matrix_G
matrix_g = self.cube_matmul(dout, dout)
matrix_g = self.mul(matrix_g, 1.0 / normalizer)
self.matrix_g = matrix_g
return out

def construct(self, ids):
@@ -654,8 +673,8 @@ class Embedding_Thor(Cell):
else:
if self.thor:
one_hot_ids = self.one_hot(flat_ids, self.vocab_size, self.on_value, self.off_value)
matrix_A = self.reduce_sum(one_hot_ids, 0)
self.matrix_A = matrix_A
matrix_a = self.reduce_sum(one_hot_ids, 0)
self.matrix_a = matrix_a
output_for_reshape = self.gather(self.embedding_table, flat_ids, 0)
output_for_reshape = self.getG(output_for_reshape)
else:
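Note (illustrative sketch, not part of the diff above): the renamed layer classes are re-exported through mindspore.nn, so after this change they can be constructed directly; shapes and the activation below are made-up placeholders.

# Sketch only: build the renamed DenseThor layer and list its THOR statistics
# parameters, which this commit renames from 'matrix_A'/'matrix_G' to 'matrix_a'/'matrix_g'.
from mindspore import nn

layer = nn.DenseThor(in_channels=16, out_channels=8, activation='relu')
print([p.name for p in layer.get_parameters()])
# expected to include 'matrix_a' and 'matrix_g' alongside the usual 'weight' and 'bias'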
@@ -29,7 +29,7 @@ from .rmsprop import RMSProp
from .proximal_ada_grad import ProximalAdagrad
from .lazyadam import LazyAdam
from .ada_grad import Adagrad
from .thor import THOR
from .thor import thor

__all__ = ['Optimizer', 'Momentum', 'LARS', 'Adam', 'AdamWeightDecay', 'LazyAdam', 'AdamOffload',
'Lamb', 'SGD', 'FTRL', 'RMSProp', 'ProximalAdagrad', 'Adagrad', 'THOR']
'Lamb', 'SGD', 'FTRL', 'RMSProp', 'ProximalAdagrad', 'Adagrad', 'thor']
File diff suppressed because it is too large
@@ -43,22 +43,22 @@ class ConvertNetUntils():
if act_name == "fastgelu":
act_name = "fast_gelu"
if subcell.out_channels == 1001:
new_subcell = nn.Dense_Thor(in_channels=subcell.in_channels,
out_channels=subcell.out_channels,
weight_init=weight,
has_bias=subcell.has_bias,
bias_init='zeros',
activation=act_name)
new_subcell = nn.DenseThor(in_channels=subcell.in_channels,
out_channels=subcell.out_channels,
weight_init=weight,
has_bias=subcell.has_bias,
bias_init='zeros',
activation=act_name)
else:
compute_type = mstype.float16
if context.get_context("device_target") == "GPU":
compute_type = mstype.float32
new_subcell = nn.Dense_Thor(in_channels=subcell.in_channels,
out_channels=subcell.out_channels,
weight_init=weight,
has_bias=subcell.has_bias,
bias_init='zeros',
activation=act_name).to_float(compute_type)
new_subcell = nn.DenseThor(in_channels=subcell.in_channels,
out_channels=subcell.out_channels,
weight_init=weight,
has_bias=subcell.has_bias,
bias_init='zeros',
activation=act_name).to_float(compute_type)

if subcell.has_bias:
new_subcell.bias = subcell.bias
@@ -69,9 +69,9 @@ class ConvertNetUntils():
"""
convert embedding cell to second_order cell
"""
new_subcell = nn.Embedding_Thor(vocab_size=subcell.vocab_size,
embedding_size=subcell.embedding_size,
use_one_hot=False)
new_subcell = nn.EmbeddingThor(vocab_size=subcell.vocab_size,
embedding_size=subcell.embedding_size,
use_one_hot=False)
new_subcell.embedding_table = subcell.embedding_table
return new_subcell
@@ -88,9 +88,9 @@ class ConvertNetUntils():
pad_mode = subcell.pad_mode
has_bias = subcell.has_bias
weight = subcell.weight
new_subcell = nn.Conv2d_Thor(in_channel, out_channel,
kernel_size=kernel_size, stride=stride, padding=padding, pad_mode=pad_mode,
has_bias=has_bias, weight_init=weight)
new_subcell = nn.Conv2dThor(in_channel, out_channel,
kernel_size=kernel_size, stride=stride, padding=padding, pad_mode=pad_mode,
has_bias=has_bias, weight_init=weight)
return new_subcell
@@ -104,7 +104,7 @@ class ConvertNetUntils():
subcell = cells[name]
if subcell == net:
continue
elif isinstance(subcell, (nn.Dense_Thor, nn.Conv2d_Thor, nn.Embedding_Thor)):
elif isinstance(subcell, (nn.DenseThor, nn.Conv2dThor, nn.EmbeddingThor)):
continue
elif isinstance(subcell, (nn.Conv2dTranspose, nn.Conv1d, nn.Conv1dTranspose, nn.BatchNorm1d, nn.GroupNorm,
nn.GlobalBatchNorm, nn.LayerNorm, nn.BatchNorm2d, nn.MaxPool2d)):
@@ -113,7 +113,7 @@ class ConvertNetUntils():
prefix = subcell.param_prefix
new_subcell = self._convert_method_map[type(subcell)](subcell)
print("subcell name: ", name, "prefix is", prefix, flush=True)
if isinstance(new_subcell, (nn.Dense_Thor, nn.Embedding_Thor, nn.Conv2d_Thor)):
if isinstance(new_subcell, (nn.DenseThor, nn.EmbeddingThor, nn.Conv2dThor)):
print("convert to thor layer success.", flush=True)
new_subcell.update_parameters_name(prefix + '.')
net.insert_child_to_cell(name, new_subcell)
@@ -141,19 +141,19 @@ class ConvertModelUtils():
"""

def convert_to_thor_model(self, model, network, loss_fn=None, optimizer=None, metrics=None, amp_level="O0",
loss_scale_manager=None, keep_batchnorm_fp32=False, frequency=834):
loss_scale_manager=None, keep_batchnorm_fp32=False):

"""
api for convert model to thor model
"""
optim_name = type(optimizer).__name__
if optim_name in ("THOR_Ascend", "THOR_GPU"):
from .model_thor import Model_Thor
if optim_name in ("ThorAscend", "ThorGpu"):
from .model_thor import ModelThor
if isinstance(network, nn.TrainOneStepCell):
model = Model_Thor(network=network, frequency=frequency)
model = ModelThor(network=network)
else:
model = Model_Thor(network=network, loss_fn=loss_fn, optimizer=optimizer, amp_level=amp_level,
loss_scale_manager=loss_scale_manager,
keep_batchnorm_fp32=keep_batchnorm_fp32, metrics=metrics, frequency=frequency)
model = ModelThor(network=network, loss_fn=loss_fn, optimizer=optimizer, amp_level=amp_level,
loss_scale_manager=loss_scale_manager,
keep_batchnorm_fp32=keep_batchnorm_fp32, metrics=metrics)

return model
@@ -17,6 +17,7 @@
import math
from mindspore.train.callback import RunContext
from mindspore import context
from mindspore import nn
from mindspore.context import ParallelMode
from mindspore.train.model import Model
from mindspore.train.dataset_helper import connect_network_with_dataset
@@ -25,6 +26,7 @@ from mindspore.common.dtype import pytype_to_dtype
from mindspore._c_expression import init_exec_dataset
from .dataset_helper import DatasetHelper


def _convert_type(types):
"""
Convert from numpy type to tensor type.
@@ -66,7 +68,7 @@ def _exec_datagraph(exec_dataset, dataset_size, phase='dataset'):
need_run=False)


class Model_Thor(Model):
class ModelThor(Model):
"""
High-Level API for Training or Testing.
@@ -104,10 +106,19 @@ class Model_Thor(Model):
"""

def __init__(self, network, loss_fn=None, optimizer=None, metrics=None, eval_network=None,
eval_indexes=None, amp_level="O0", frequency=834, **kwargs):
super(Model_Thor, self).__init__(network, loss_fn, optimizer, metrics, eval_network,
eval_indexes, amp_level, **kwargs)
self._frequency = frequency
eval_indexes=None, amp_level="O0", **kwargs):
super(ModelThor, self).__init__(network, loss_fn, optimizer, metrics, eval_network,
eval_indexes, amp_level, **kwargs)
if isinstance(network, nn.TrainOneStepCell):
self._frequency = network.optimizer.get_frequency()
else:
self._frequency = optimizer.get_frequency()
# used to stop training for early stop, such as stopAtTIme or stopATStep
self.should_stop = False
self.switch_branch_one = True
self.index_first_order = 0
self.train_network_init_flag = True
self.has_do_dataset_init = False
self._train_network = self._build_train_network()

def _exec_preprocess(self, network, is_train, phase, dataset, dataset_sink_mode, sink_size=-1,
@@ -127,6 +138,52 @@ class Model_Thor(Model):

return dataset_helper, network

def _train_gpu_sink_step(self, cb_params, inputs, list_callback, iter_first_order, run_context):
"""train gpu sink step"""
if self.switch_branch_one:
cb_params.cur_step_num += 1
if self.train_network_init_flag:
self._train_network.add_flags_recursive(thor=True)
self._train_network.phase = 'train0'
self.switch_branch_one = not self.switch_branch_one
outputs = self._train_network(*inputs)
cb_params.net_outputs = outputs
list_callback.step_end(run_context)
else:
cb_params.cur_step_num += 1
if self.train_network_init_flag:
self._train_network.add_flags_recursive(thor=False)
self.train_network_init_flag = False
self._train_network.phase = 'train1'
outputs = self._train_network(*inputs)
cb_params.net_outputs = outputs
self.index_first_order += 1
if self.index_first_order == iter_first_order:
self.index_first_order = 0
self.switch_branch_one = not self.switch_branch_one
list_callback.step_end(run_context)

def _train_ascend_sink_step(self, cb_params, train_dataset, iter_first_order, inputs, list_callback, run_context):
"""train ascend sink step"""
if self.switch_branch_one:
cb_params.cur_step_num += 1
if self.train_network_init_flag:
self._train_network.add_flags_recursive(thor=True)
self._train_network.phase = 'train0'
else:
cb_params.cur_step_num += iter_first_order
if self.train_network_init_flag:
self._train_network.add_flags_recursive(thor=False)
self.train_network_init_flag = False
self._train_network.phase = 'train1'
if not self.has_do_dataset_init:
_exec_datagraph(train_dataset, iter_first_order, phase='train1_dataset')
self.has_do_dataset_init = True
self.switch_branch_one = not self.switch_branch_one
outputs = self._train_network(*inputs)
cb_params.net_outputs = outputs
list_callback.step_end(run_context)

def _train_dataset_sink_process(self, epoch, train_dataset, list_callback=None, cb_params=None, sink_size=-1):
"""
Training process. The data would be passed to network through dataset channel.
@@ -166,13 +223,6 @@ class Model_Thor(Model):
run_context = RunContext(cb_params)
list_callback.begin(run_context)

# used to stop training for early stop, such as stopAtTIme or stopATStep
should_stop = False
switch_branch_one = True
index_first_order = 0
train_network_init_flag = True
has_do_dataset_init = False

for i in range(epoch):
cb_params.cur_epoch_num = i + 1
list_callback.epoch_begin(run_context)
@@ -182,55 +232,17 @@ class Model_Thor(Model):
inputs = _to_full_tensor(inputs, self._device_number, self._global_rank)
list_callback.step_begin(run_context)
if context.get_context("device_target") == "GPU":
if switch_branch_one:
cb_params.cur_step_num += 1
if train_network_init_flag:
self._train_network.add_flags_recursive(thor=True)
self._train_network.phase = 'train0'
switch_branch_one = not switch_branch_one
outputs = self._train_network(*inputs)
cb_params.net_outputs = outputs
list_callback.step_end(run_context)
else:
cb_params.cur_step_num += 1
if train_network_init_flag:
self._train_network.add_flags_recursive(thor=False)
train_network_init_flag = False
self._train_network.phase = 'train1'
outputs = self._train_network(*inputs)
cb_params.net_outputs = outputs
index_first_order += 1
if index_first_order == iter_first_order:
index_first_order = 0
switch_branch_one = not switch_branch_one
list_callback.step_end(run_context)
self._train_gpu_sink_step(cb_params, inputs, list_callback, iter_first_order, run_context)
else:
if switch_branch_one:
cb_params.cur_step_num += 1
if train_network_init_flag:
self._train_network.add_flags_recursive(thor=True)
self._train_network.phase = 'train0'
else:
cb_params.cur_step_num += iter_first_order
if train_network_init_flag:
self._train_network.add_flags_recursive(thor=False)
train_network_init_flag = False
self._train_network.phase = 'train1'
if not has_do_dataset_init:
_exec_datagraph(train_dataset, iter_first_order, phase='train1_dataset')
has_do_dataset_init = True
switch_branch_one = not switch_branch_one
outputs = self._train_network(*inputs)
cb_params.net_outputs = outputs
list_callback.step_end(run_context)

self._train_ascend_sink_step(cb_params, train_dataset, iter_first_order, inputs, list_callback,
run_context)
list_callback.epoch_end(run_context)
should_stop = should_stop or run_context.get_stop_requested()
if should_stop:
self.should_stop = self.should_stop or run_context.get_stop_requested()
if self.should_stop:
break
dataset_helper.stop_send()

list_callback.end(run_context)


__all__ = ["Model_Thor"]
__all__ = ["ModelThor"]
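Aside (toy illustration, not MindSpore code): the two sink-step helpers above alternate between a second-order phase ('train0') and first-order phases ('train1'). A sketch of that schedule, assuming iter_first_order equals frequency - 1 so that one 'train0' step plus iter_first_order 'train1' steps advance cur_step_num by frequency:

# Sketch of the phase scheduling implemented by _train_gpu_sink_step / _train_ascend_sink_step.
def phase_schedule(total_steps, frequency):
    iter_first_order = frequency - 1  # assumption; not taken verbatim from the diff
    phases = []
    while len(phases) < total_steps:
        phases.append('train0')                       # step that refreshes THOR second-order statistics
        phases.extend(['train1'] * iter_first_order)  # plain first-order steps in between
    return phases[:total_steps]

print(phase_schedule(10, frequency=4))
# ['train0', 'train1', 'train1', 'train1', 'train0', 'train1', 'train1', 'train1', 'train0', 'train1']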
@@ -18,7 +18,7 @@ import argparse
import ast
from mindspore import context
from mindspore import Tensor
from mindspore.nn.optim import Momentum, THOR
from mindspore.nn.optim import Momentum, thor
from mindspore.train.model import Model
from mindspore.context import ParallelMode
from mindspore.train.train_thor import ConvertModelUtils
@@ -235,12 +235,11 @@ if __name__ == '__main__':
from src.lr_generator import get_thor_damping
damping = get_thor_damping(0, config.damping_init, config.damping_decay, 70, step_size)
split_indices = [26, 53]
opt = THOR(net, lr, Tensor(damping), config.momentum, config.weight_decay, config.loss_scale,
config.batch_size, split_indices=split_indices)
opt = thor(net, lr, Tensor(damping), config.momentum, config.weight_decay, config.loss_scale,
config.batch_size, split_indices=split_indices, frequency=config.frequency)
model = ConvertModelUtils().convert_to_thor_model(model=model, network=net, loss_fn=loss, optimizer=opt,
loss_scale_manager=loss_scale, metrics={'acc'},
amp_level="O2", keep_batchnorm_fp32=False,
frequency=config.frequency)
amp_level="O2", keep_batchnorm_fp32=False)
args_opt.run_eval = False
logger.warning("Thor optimizer not support evaluation while training.")
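For orientation, the updated call pattern from the hunk above, condensed into one sketch. This is illustrative only: net, lr, damping, loss, loss_scale, config, and the earlier Model(...) instance named model are assumed to be set up as in the surrounding script, and dataset/epoch_size/cb are placeholder names.

# After this change, frequency is passed to the lowercase thor optimizer itself,
# and convert_to_thor_model no longer takes it.
opt = thor(net, lr, Tensor(damping), config.momentum, config.weight_decay, config.loss_scale,
           config.batch_size, split_indices=[26, 53], frequency=config.frequency)
model = ConvertModelUtils().convert_to_thor_model(model=model, network=net, loss_fn=loss, optimizer=opt,
                                                  loss_scale_manager=loss_scale, metrics={'acc'},
                                                  amp_level="O2", keep_batchnorm_fp32=False)
model.train(epoch_size, dataset, callbacks=cb, dataset_sink_mode=True)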
@@ -29,7 +29,7 @@ from mindspore.nn.wrap.loss_scale import DynamicLossScaleUpdateCell
from mindspore.train.callback import ModelCheckpoint, CheckpointConfig, TimeMonitor
from mindspore.train.serialization import load_checkpoint, load_param_into_net
from mindspore.train.train_thor import ConvertModelUtils
from mindspore.nn.optim import Lamb, Momentum, AdamWeightDecay, THOR
from mindspore.nn.optim import Lamb, Momentum, AdamWeightDecay, thor
from mindspore import log as logger
from mindspore.common import set_seed
from src import BertNetworkWithLoss, BertTrainOneStepCell, BertTrainOneStepWithLossScaleCell, \
@@ -106,20 +106,14 @@ def _get_optimizer(args_opt, network):
damping = get_bert_thor_damping(cfg.Thor.damping_max, cfg.Thor.damping_min, cfg.Thor.damping_power,
cfg.Thor.damping_total_steps)
split_indices = None
if bert_net_cfg.num_hidden_layers == 12:
if bert_net_cfg.use_relative_positions:
split_indices = [29, 58, 87, 116, 145, 174, 203, 217]
else:
split_indices = [28, 55, 82, 109, 136, 163, 190, 205]
elif bert_net_cfg.num_hidden_layers == 24:
if bert_net_cfg.use_relative_positions:
split_indices = [30, 90, 150, 210, 270, 330, 390, 421]
else:
split_indices = [38, 93, 148, 203, 258, 313, 368, 397]
optimizer = THOR(network, lr, damping, cfg.Thor.momentum,
if bert_net_cfg.num_hidden_layers == 12 and not bert_net_cfg.use_relative_positions:
split_indices = [28, 55, 77]
elif bert_net_cfg.num_hidden_layers == 24 and not bert_net_cfg.use_relative_positions:
split_indices = [38, 93, 149]
optimizer = thor(network, lr, damping, cfg.Thor.momentum,
cfg.Thor.weight_decay, cfg.Thor.loss_scale, cfg.batch_size,
decay_filter=lambda x: 'layernorm' not in x.name.lower() and 'bias' not in x.name.lower(),
split_indices=split_indices)
split_indices=split_indices, enable_clip_grad=True, frequency=cfg.Thor.frequency)
else:
raise ValueError("Don't support optimizer {}, only support [Lamb, Momentum, AdamWeightDecay, Thor]".
format(cfg.optimizer))
@@ -278,11 +272,10 @@ def run_pretrain():
accumulation_steps=accumulation_steps,
enable_global_norm=enable_global_norm)
else:
net_with_grads = BertTrainOneStepCell(net_with_loss, optimizer=optimizer)
net_with_grads = BertTrainOneStepCell(net_with_loss, optimizer=optimizer, enable_clip_grad=False)

model = Model(net_with_grads)
model = ConvertModelUtils().convert_to_thor_model(model, network=net_with_grads, optimizer=optimizer,
frequency=cfg.Thor.frequency)
model = ConvertModelUtils().convert_to_thor_model(model, network=net_with_grads, optimizer=optimizer)
model.train(new_repeat_count, ds, callbacks=callback,
dataset_sink_mode=(args_opt.enable_data_sink == "true"), sink_size=args_opt.data_sink_steps)
@@ -269,12 +269,14 @@ class BertTrainOneStepCell(nn.TrainOneStepCell):
network (Cell): The training network. Note that loss function should have been added.
optimizer (Optimizer): Optimizer for updating the weights.
sens (Number): The adjust parameter. Default: 1.0.
enable_clip_grad (boolean): If True, clip gradients in BertTrainOneStepCell. Default: False.
"""

def __init__(self, network, optimizer, sens=1.0):
def __init__(self, network, optimizer, sens=1.0, enable_clip_grad=False):
super(BertTrainOneStepCell, self).__init__(network, optimizer, sens)
self.cast = P.Cast()
self.hyper_map = C.HyperMap()
self.enable_clip_grad = enable_clip_grad

def set_sens(self, value):
self.sens = value
@@ -306,7 +308,8 @@ class BertTrainOneStepCell(nn.TrainOneStepCell):
masked_lm_weights,
self.cast(F.tuple_to_array((self.sens,)),
mstype.float32))
grads = self.hyper_map(F.partial(clip_grad, GRADIENT_CLIP_TYPE, GRADIENT_CLIP_VALUE), grads)
if self.enable_clip_grad:
grads = self.hyper_map(F.partial(clip_grad, GRADIENT_CLIP_TYPE, GRADIENT_CLIP_VALUE), grads)
grads = self.grad_reducer(grads)
succ = self.optimizer(grads)
return F.depend(loss, succ)
@@ -28,7 +28,7 @@ from mindspore import log as logger
from mindspore.train.callback import Callback
from mindspore.context import ParallelMode
from mindspore.train.serialization import load_checkpoint, load_param_into_net
from mindspore.nn.optim import THOR
from mindspore.nn.optim import thor
from mindspore.train.model import Model
from mindspore.train.train_thor import ConvertModelUtils
import mindspore.dataset.transforms.c_transforms as C
@@ -166,10 +166,10 @@ def train_process_bert_thor(q, device_id, epoch_size, device_num):

lr = get_bert_thor_lr()
damping = get_bert_thor_damping()
split_indices = [38, 77]
optimizer = THOR(net_with_loss, lr, damping, momentum, weight_decay, loss_scale, batch_size,
split_indices = None
optimizer = thor(net_with_loss, lr, damping, momentum, weight_decay, loss_scale, batch_size,
decay_filter=lambda x: 'layernorm' not in x.name.lower() and 'bias' not in x.name.lower(),
split_indices=split_indices)
split_indices=split_indices, enable_clip_grad=True, frequency=frequency)
time_monitor_callback = TimeMonitor(data_sink_steps)
loss_callback = LossCallback()
callback = [time_monitor_callback, loss_callback]
@@ -178,10 +178,9 @@ def train_process_bert_thor(q, device_id, epoch_size, device_num):
param_dict = load_checkpoint(load_checkpoint_path)
load_param_into_net(net_with_loss, param_dict)

net_with_grads = BertTrainOneStepCell(net_with_loss, optimizer=optimizer)
net_with_grads = BertTrainOneStepCell(net_with_loss, optimizer=optimizer, sens=loss_scale, enable_clip_grad=False)
model = Model(net_with_grads)
model = ConvertModelUtils().convert_to_thor_model(model, network=net_with_grads, optimizer=optimizer,
frequency=frequency)
model = ConvertModelUtils().convert_to_thor_model(model, network=net_with_grads, optimizer=optimizer)
model.train(new_repeat_count, data_set, callbacks=callback, dataset_sink_mode=True, sink_size=data_sink_steps)

loss_list = loss_callback.loss_list
@@ -24,7 +24,7 @@ from mindspore.nn.optim.optimizer import Optimizer
from mindspore.parallel._utils import _get_device_num, _get_gradients_mean
from mindspore import context
from mindspore.context import ParallelMode
from mindspore.nn.layer import Dense_Thor, Conv2d_Thor, Embedding_Thor
from mindspore.nn.layer import DenseThor, Conv2dThor, EmbeddingThor
from mindspore.nn.wrap import DistributedGradReducer
from mindspore.train.train_thor.convert_utils import ConvertNetUntils
from mindspore.parallel._auto_parallel_context import auto_parallel_context
@@ -106,11 +106,11 @@ def find_net_layertype_recur(net, layertype_map):
subcell = cells[name]
if subcell == net:
continue
elif isinstance(subcell, Conv2d_Thor):
elif isinstance(subcell, Conv2dThor):
layertype_map.append(Conv)
elif isinstance(subcell, Dense_Thor):
elif isinstance(subcell, DenseThor):
layertype_map.append(FC)
elif isinstance(subcell, Embedding_Thor):
elif isinstance(subcell, EmbeddingThor):
layertype_map.append(Embedding)
elif isinstance(subcell, nn.LayerNorm):
layertype_map.append(LayerNorm)
@@ -168,10 +168,10 @@ class THOR_Ascend(Optimizer):
self.hyper_map = C.HyperMap()
self.opt = P.ApplyMomentum()
self.net = net
self.matrix_A_cov = ParameterTuple(filter(lambda x: 'matrix_A' in x.name, net.get_parameters()))
self.matrix_G_cov = ParameterTuple(filter(lambda x: 'matrix_G' in x.name, net.get_parameters()))
self.A_normalizer = ParameterTuple(filter(lambda x: 'A_normalizer' in x.name, net.get_parameters()))
self.G_normalizer = ParameterTuple(filter(lambda x: 'G_normalizer' in x.name, net.get_parameters()))
self.matrix_A_cov = ParameterTuple(filter(lambda x: 'matrix_a' in x.name, net.get_parameters()))
self.matrix_G_cov = ParameterTuple(filter(lambda x: 'matrix_g' in x.name, net.get_parameters()))
self.A_normalizer = ParameterTuple(filter(lambda x: 'a_normalizer' in x.name, net.get_parameters()))
self.G_normalizer = ParameterTuple(filter(lambda x: 'g_normalizer' in x.name, net.get_parameters()))
self.cube_matmul_left = P.CusMatMulCubeFraczLeftCast()
self.cube_matmul_left_fc = P.CusMatMulCubeDenseLeft()
self.cube_matmul_right_fc = P.CusMatMulCubeDenseRight()
@@ -188,7 +188,7 @@ class THOR_Ascend(Optimizer):
self.diag_block_dim = 128
self.matrix_A = ()
self.matrix_G = ()
print("matrix_A_cov len is", len(self.matrix_A_cov))
print("matrix_a_cov len is", len(self.matrix_A_cov))
self.thor_layer_count = 0
self.conv_layer_count = 0
self.weight_fim_idx_map = ()