clean codecheck for thor

sl_wang 2021-05-28 15:36:55 +08:00
parent f5fb195f04
commit a140e9ee36
11 changed files with 920 additions and 759 deletions

View File

@ -32,7 +32,7 @@ from .quant import *
from .math import *
from .combined import *
from .timedistributed import *
from .thor_layer import *
from .thor_layer import DenseThor, Conv2dThor, EmbeddingThor
__all__ = []
__all__.extend(activation.__all__)
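With the wildcard import replaced by an explicit export list, downstream code addresses the renamed layers directly. A minimal usage sketch, assuming mindspore.nn re-exports these classes (all values below are illustrative):

import numpy as np
import mindspore.nn as nn
from mindspore import Tensor

# illustrative values; assumes mindspore.nn re-exports the renamed THOR layers
dense = nn.DenseThor(in_channels=3, out_channels=4)
x = Tensor(np.ones([2, 3]).astype(np.float32))
y = dense(x)  # dense forward pass; the layer also accumulates THOR statistics when self.thor is set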

View File

@ -26,10 +26,10 @@ from mindspore.nn.cell import Cell
from mindspore.nn.layer.activation import get_activation
__all__ = ['Dense_Thor', 'Conv2d_Thor', 'Embedding_Thor']
__all__ = ['DenseThor', 'Conv2dThor', 'EmbeddingThor']
class Dense_Thor(Cell):
class DenseThor(Cell):
r"""
The dense connected layer.
@ -77,7 +77,7 @@ class Dense_Thor(Cell):
bias_init='zeros',
has_bias=True,
activation=None):
super(Dense_Thor, self).__init__()
super(DenseThor, self).__init__()
self.thor = True
self.in_channels = Validator.check_positive_int(in_channels)
self.out_channels = Validator.check_positive_int(out_channels)
@ -100,40 +100,45 @@ class Dense_Thor(Cell):
self.activation = get_activation(activation)
self.activation_flag = self.activation is not None
self.matrix_A = Parameter(Tensor(np.zeros([in_channels, in_channels]).astype(np.float32)),
name='matrix_A', requires_grad=False)
self.matrix_a = Parameter(Tensor(np.zeros([in_channels, in_channels]).astype(np.float32)),
name='matrix_a', requires_grad=False)
self.shape = P.Shape()
self.reshape = P.Reshape()
self.transpose = P.Transpose()
self.mul = P.Mul()
self.is_Ascend = True
if context.get_context("device_target") == "Ascend":
if out_channels == 1001:
self.matrix_G = Parameter(Tensor(np.zeros([1024, 1024]).astype(np.float32)),
name='matrix_G', requires_grad=False)
self.pad = P.Pad(((0, 23), (0, 23)))
self.pad1 = P.Pad(((0, 7), (0, 7)))
self.slice = P.Slice()
self.add = P.TensorAdd()
else:
self.matrix_G = Parameter(Tensor(np.eye(out_channels).astype(np.float32)),
name="matrix_G", requires_grad=False)
self.abs = P.Abs()
self.reduce_max = P.ReduceMax(keep_dims=False)
self.neg = P.Neg()
self.reduce_sum = P.ReduceSum()
self.matmul = P.MatMul(transpose_b=True)
self.cube_matmul = P.CusMatMulCube(transpose_a=True)
self.cast = P.Cast()
self.is_nsp_layer = (out_channels == 2)
self._process_ascend_dense_thor(out_channels)
else:
self.is_Ascend = False
self.matrix_G = Parameter(Tensor(np.eye(out_channels).astype(np.float32)),
name="matrix_G", requires_grad=False)
self.matrix_g = Parameter(Tensor(np.eye(out_channels).astype(np.float32)),
name="matrix_g", requires_grad=False)
self.cube_matmul = P.MatMul(transpose_a=True)
self.getG = P.InsertGradientOf(self.save_gradient)
def _process_ascend_dense_thor(self, out_channels):
"""process ascend dense thor"""
if out_channels == 1001:
self.matrix_g = Parameter(Tensor(np.zeros([1024, 1024]).astype(np.float32)),
name='matrix_g', requires_grad=False)
self.pad = P.Pad(((0, 23), (0, 23)))
self.pad1 = P.Pad(((0, 7), (0, 7)))
self.slice = P.Slice()
self.add = P.TensorAdd()
else:
self.matrix_g = Parameter(Tensor(np.eye(out_channels).astype(np.float32)),
name="matrix_g", requires_grad=False)
self.abs = P.Abs()
self.reduce_max = P.ReduceMax(keep_dims=False)
self.neg = P.Neg()
self.reduce_sum = P.ReduceSum()
self.matmul = P.MatMul(transpose_b=True)
self.cube_matmul = P.CusMatMulCube(transpose_a=True)
self.cast = P.Cast()
self.is_nsp_layer = (out_channels == 2)
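For reference, the pads above round the 1001-way classifier dimension up to hardware-friendly sizes; the arithmetic is taken from the code, while the block-size interpretation is an assumption:

# reference arithmetic for the pads above; the block-size reading is an assumption
out_channels = 1001
print(out_channels + 23)  # 1024 = 8 * 128, matching the zero-initialized matrix_g buffer
print(out_channels + 7)   # 1008 = 63 * 16, a multiple of 16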
def save_gradient(self, dout):
"""
this function is only used by the thor optimizer
@ -144,17 +149,17 @@ class Dense_Thor(Cell):
if not self.is_nsp_layer:
shape = self.shape(dout)
normalizer = self.cast(shape[0], mstype.float32)
matrix_G = self.cube_matmul(dout, dout)
matrix_G = self.mul(matrix_G, 1.0 / normalizer)
matrix_g = self.cube_matmul(dout, dout)
matrix_g = self.mul(matrix_g, 1.0 / normalizer)
if self.out_channels == 1001:
matrix_G = P.Pad(((0, 23), (0, 23)))(matrix_G)
self.matrix_G = matrix_G
matrix_g = P.Pad(((0, 23), (0, 23)))(matrix_g)
self.matrix_g = matrix_g
else:
dout_shape = self.shape(dout)
normalizer = dout_shape[0]
matrix_G = self.cube_matmul(dout, dout)
matrix_G = self.mul(matrix_G, 1.0 / normalizer)
self.matrix_G = matrix_G
matrix_g = self.cube_matmul(dout, dout)
matrix_g = self.mul(matrix_g, 1.0 / normalizer)
self.matrix_g = matrix_g
return out
def construct(self, x):
@ -163,14 +168,14 @@ class Dense_Thor(Cell):
inputs = self.cube_matmul(x, x)
shape = self.shape(x)
normalizer = self.cast(shape[0], mstype.float32)
matrix_A = self.mul(inputs, 1.0 / normalizer)
self.matrix_A = matrix_A
matrix_a = self.mul(inputs, 1.0 / normalizer)
self.matrix_a = matrix_a
else:
inputs = self.cube_matmul(x, x)
inputs_shape = self.shape(inputs)
normalizer = inputs_shape[0]
matrix_A = self.mul(inputs, 1.0 / normalizer)
self.matrix_A = matrix_A
matrix_a = self.mul(inputs, 1.0 / normalizer)
self.matrix_a = matrix_a
x = self.matmul(x, self.weight)
x = self.getG(x)
else:
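The quantities stored in construct and save_gradient are the input and gradient second-moment matrices that the THOR optimizer later approximately inverts. A NumPy sketch of the same arithmetic, illustrative only (shapes and values are made up):

import numpy as np

# illustrative only: DenseThor second-moment statistics with made-up shapes
x = np.random.randn(32, 16).astype(np.float32)      # layer input, (batch, in_channels)
dout = np.random.randn(32, 8).astype(np.float32)    # output gradient, (batch, out_channels)
matrix_a = (x.T @ x) / np.float32(x.shape[0])            # input covariance, (in, in)
matrix_g = (dout.T @ dout) / np.float32(dout.shape[0])   # gradient covariance, (out, out)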
@ -226,19 +231,9 @@ class _Conv(Cell):
self.dilation = dilation
self.group = Validator.check_positive_int(group)
self.has_bias = has_bias
if (not isinstance(kernel_size[0], int)) or (not isinstance(kernel_size[1], int)) or \
isinstance(kernel_size[0], bool) or isinstance(kernel_size[1], bool) or \
kernel_size[0] < 1 or kernel_size[1] < 1:
raise ValueError("Attr 'kernel_size' of 'Conv2D' Op passed "
+ str(self.kernel_size) + ", should be a int or tuple and equal to or greater than 1.")
if (not isinstance(stride[0], int)) or (not isinstance(stride[1], int)) or \
isinstance(stride[0], bool) or isinstance(stride[1], bool) or stride[0] < 1 or stride[1] < 1:
raise ValueError("Attr 'stride' of 'Conv2D' Op passed "
+ str(self.stride) + ", should be a int or tuple and equal to or greater than 1.")
if (not isinstance(dilation[0], int)) or (not isinstance(dilation[1], int)) or \
isinstance(dilation[0], bool) or isinstance(dilation[1], bool) or dilation[0] < 1 or dilation[1] < 1:
raise ValueError("Attr 'dilation' of 'Conv2D' Op passed "
+ str(self.dilation) + ", should be a int or tuple and equal to or greater than 1.")
self._validate_kernel_size(kernel_size)
self._validate_stride(stride)
self._validate_dilation(dilation)
if in_channels % group != 0:
raise ValueError("Attr 'in_channels' of 'Conv2D' Op must be divisible by "
"attr 'group' of 'Conv2D' Op.")
@ -258,12 +253,34 @@ class _Conv(Cell):
logger.warning("Value of 'has_bias' is False, value of 'bias_init' will be ignored.")
self.bias = None
def _validate_kernel_size(self, kernel_size):
"""validate kernel size."""
if (not isinstance(kernel_size[0], int)) or (not isinstance(kernel_size[1], int)) or \
isinstance(kernel_size[0], bool) or isinstance(kernel_size[1], bool) or \
kernel_size[0] < 1 or kernel_size[1] < 1:
raise ValueError("Attr 'kernel_size' of 'Conv2D' Op passed "
+ str(self.kernel_size) + ", should be a int or tuple and equal to or greater than 1.")
def _validate_stride(self, stride):
"""validate stride."""
if (not isinstance(stride[0], int)) or (not isinstance(stride[1], int)) or \
isinstance(stride[0], bool) or isinstance(stride[1], bool) or stride[0] < 1 or stride[1] < 1:
raise ValueError("Attr 'stride' of 'Conv2D' Op passed "
+ str(self.stride) + ", should be a int or tuple and equal to or greater than 1.")
def _validate_dilation(self, dilation):
"""validate dilation."""
if (not isinstance(dilation[0], int)) or (not isinstance(dilation[1], int)) or \
isinstance(dilation[0], bool) or isinstance(dilation[1], bool) or dilation[0] < 1 or dilation[1] < 1:
raise ValueError("Attr 'dilation' of 'Conv2D' Op passed "
+ str(self.dilation) + ", should be a int or tuple and equal to or greater than 1.")
def construct(self, *inputs):
"""Must be overridden by all subclasses."""
raise NotImplementedError
class Conv2d_Thor(_Conv):
class Conv2dThor(_Conv):
r"""
2D convolution layer.
@ -370,7 +387,7 @@ class Conv2d_Thor(_Conv):
stride = twice(stride)
self._dilation = dilation
dilation = twice(dilation)
super(Conv2d_Thor, self).__init__(
super(Conv2dThor, self).__init__(
in_channels,
out_channels,
kernel_size,
@ -395,55 +412,58 @@ class Conv2d_Thor(_Conv):
self.thor = True
self.hw = kernel_size[0] * kernel_size[1]
self.matrix_A_dim = self.in_channels * self.kernel_size[0] * self.kernel_size[1]
self.matrix_G_dim = self.out_channels
self.matrix_a_dim = self.in_channels * self.kernel_size[0] * self.kernel_size[1]
self.matrix_g_dim = self.out_channels
self.shape = P.Shape()
self.reshape = P.Reshape()
self.mul = P.Mul()
self.cast = P.Cast()
self.A_normalizer = Parameter(initializer(0, [1], mstype.float32), name="A_normalizer", requires_grad=False)
self.G_normalizer = Parameter(initializer(0, [1], mstype.float32), name="G_normalizer", requires_grad=False)
self.a_normalizer = Parameter(initializer(0, [1], mstype.float32), name="a_normalizer", requires_grad=False)
self.g_normalizer = Parameter(initializer(0, [1], mstype.float32), name="g_normalizer", requires_grad=False)
self.is_Ascend = True
if context.get_context("device_target") == "Ascend":
ksizes = (1, kernel_size[0], kernel_size[1], 1)
strides = (1, stride[0], stride[1], 1)
self.img2col = P.CusImg2Col(ksizes=ksizes, strides=strides)
self.cube_matmul = P.CusMatMulCube(transpose_a=True)
self.transpose02314 = P.CusTranspose02314()
dampingA_dim = self.matrix_A_dim
self.diag_block_dim = 128
if (self.matrix_A_dim % self.diag_block_dim) != 0 and self.matrix_A_dim > self.diag_block_dim:
dampingA_dim = (self.matrix_A_dim // self.diag_block_dim + 1) * self.diag_block_dim
dampingG_dim = self.matrix_G_dim
if (self.matrix_G_dim % self.diag_block_dim) != 0 and self.matrix_G_dim > self.diag_block_dim:
dampingG_dim = (self.matrix_G_dim // self.diag_block_dim + 1) * self.diag_block_dim
self.matrix_A_cov = Parameter(Tensor(np.zeros([dampingA_dim, dampingA_dim]).astype(np.float32)),
name='matrix_A', requires_grad=False)
self.matrix_G_cov = Parameter(Tensor(np.zeros([dampingG_dim, dampingG_dim]).astype(np.float32)),
name='matrix_G', requires_grad=False)
self.channels_slice_flag = False
self.C0 = 16
if self.in_channels % self.C0 != 0:
self.channels_slice_flag = True
self.padA_flag = False
if (self.matrix_A_dim // self.diag_block_dim) * self.diag_block_dim != self.matrix_A_dim \
and self.matrix_A_dim > self.diag_block_dim:
self.padA_flag = True
pad_dim = self.diag_block_dim - self.matrix_A_dim % self.diag_block_dim
self.padA = P.Pad(((0, pad_dim), (0, pad_dim)))
self.slice = P.Slice()
self._process_ascend_conv2d_thor(kernel_size, stride)
else:
self.is_Ascend = False
self.img2col = P.Im2Col(kernel_size=kernel_size, stride=stride, pad_mode="same")
self.matmul = P.MatMul(transpose_b=True)
self.reduce_mean = P.ReduceMean(keep_dims=False)
self.matrix_A_cov = Parameter(Tensor(np.zeros([self.matrix_A_dim, self.matrix_A_dim]).astype(np.float32)),
name='matrix_A', requires_grad=False)
self.matrix_G_cov = Parameter(Tensor(np.zeros([self.matrix_G_dim, self.matrix_G_dim]).astype(np.float32)),
name='matrix_G', requires_grad=False)
self.matrix_a_cov = Parameter(Tensor(np.zeros([self.matrix_a_dim, self.matrix_a_dim]).astype(np.float32)),
name='matrix_a', requires_grad=False)
self.matrix_g_cov = Parameter(Tensor(np.zeros([self.matrix_g_dim, self.matrix_g_dim]).astype(np.float32)),
name='matrix_g', requires_grad=False)
self.getG = P.InsertGradientOf(self.save_gradient)
def _process_ascend_conv2d_thor(self, kernel_size, stride):
"""process ascend conv2d thor"""
ksizes = (1, kernel_size[0], kernel_size[1], 1)
strides = (1, stride[0], stride[1], 1)
self.img2col = P.CusImg2Col(ksizes=ksizes, strides=strides)
self.cube_matmul = P.CusMatMulCube(transpose_a=True)
self.transpose02314 = P.CusTranspose02314()
dampinga_dim = self.matrix_a_dim
self.diag_block_dim = 128
if (self.matrix_a_dim % self.diag_block_dim) != 0 and self.matrix_a_dim > self.diag_block_dim:
dampinga_dim = (self.matrix_a_dim // self.diag_block_dim + 1) * self.diag_block_dim
dampingg_dim = self.matrix_g_dim
if (self.matrix_g_dim % self.diag_block_dim) != 0 and self.matrix_g_dim > self.diag_block_dim:
dampingg_dim = (self.matrix_g_dim // self.diag_block_dim + 1) * self.diag_block_dim
self.matrix_a_cov = Parameter(Tensor(np.zeros([dampinga_dim, dampinga_dim]).astype(np.float32)),
name='matrix_a', requires_grad=False)
self.matrix_g_cov = Parameter(Tensor(np.zeros([dampingg_dim, dampingg_dim]).astype(np.float32)),
name='matrix_g', requires_grad=False)
self.channels_slice_flag = False
self.C0 = 16
if self.in_channels % self.C0 != 0:
self.channels_slice_flag = True
self.pada_flag = False
if (self.matrix_a_dim // self.diag_block_dim) * self.diag_block_dim != self.matrix_a_dim \
and self.matrix_a_dim > self.diag_block_dim:
self.pada_flag = True
pad_dim = self.diag_block_dim - self.matrix_a_dim % self.diag_block_dim
self.pada = P.Pad(((0, pad_dim), (0, pad_dim)))
self.slice = P.Slice()
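On Ascend the covariance buffers are rounded up to multiples of the 128-wide diagonal block; a quick sketch of that rounding with an assumed example dimension:

# illustrative rounding of a covariance dimension to the 128-wide diagonal block
diag_block_dim = 128
matrix_a_dim = 3 * 7 * 7  # assumed example: in_channels * kernel_h * kernel_w = 147
if matrix_a_dim % diag_block_dim != 0 and matrix_a_dim > diag_block_dim:
    dampinga_dim = (matrix_a_dim // diag_block_dim + 1) * diag_block_dim
else:
    dampinga_dim = matrix_a_dim
print(dampinga_dim)  # 256: 147 padded up to the next multiple of 128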
def _init_depthwise_conv2d(self, weight_init):
"""Initialize depthwise conv2d op"""
@ -473,11 +493,11 @@ class Conv2d_Thor(_Conv):
dout = self.transpose02314(dout)
dout_shape = self.shape(dout)
normalizer = dout_shape[0]
matrix_G = self.cube_matmul(dout, dout)
matrix_g = self.cube_matmul(dout, dout)
normalizer = self.cast(normalizer, mstype.float32)
matrix_G = self.mul(matrix_G, 1.0 / normalizer)
self.G_normalizer = normalizer
self.matrix_G_cov = matrix_G
matrix_g = self.mul(matrix_g, 1.0 / normalizer)
self.g_normalizer = normalizer
self.matrix_g_cov = matrix_g
else:
dout = self.reduce_mean(dout, 0)
dout_shape = self.shape(dout)
@ -485,43 +505,42 @@ class Conv2d_Thor(_Conv):
dout_shape = self.shape(dout)
normalizer = dout_shape[1]
dout = self.cast(dout, mstype.float32)
matrix_G = self.matmul(dout, dout)
matrix_G = self.mul(matrix_G, 1.0 / normalizer)
self.G_normalizer = normalizer
self.matrix_G_cov = matrix_G
matrix_g = self.matmul(dout, dout)
matrix_g = self.mul(matrix_g, 1.0 / normalizer)
self.g_normalizer = normalizer
self.matrix_g_cov = matrix_g
return out
def construct(self, x):
if self.thor:
matrix_A = self.img2col(x)
matrix_A_shape = self.shape(matrix_A)
matrix_a = self.img2col(x)
matrix_a_shape = self.shape(matrix_a)
if self.is_Ascend:
normalizer = matrix_A_shape[0]
matrix_A = self.cube_matmul(matrix_A, matrix_A)
normalizer = matrix_a_shape[0]
matrix_a = self.cube_matmul(matrix_a, matrix_a)
if self.channels_slice_flag:
matrix_A = self.reshape(matrix_A, (self.hw, self.C0, self.hw, self.C0))
matrix_A = self.slice(matrix_A, (0, 0, 0, 0),
matrix_a = self.reshape(matrix_a, (self.hw, self.C0, self.hw, self.C0))
matrix_a = self.slice(matrix_a, (0, 0, 0, 0),
(self.hw, self.in_channels, self.hw, self.in_channels))
matrix_A = self.reshape(matrix_A, (self.matrix_A_dim, self.matrix_A_dim))
matrix_a = self.reshape(matrix_a, (self.matrix_a_dim, self.matrix_a_dim))
normalizer = self.cast(normalizer, mstype.float32)
matrix_A = self.mul(matrix_A, 1.0 / normalizer)
if self.padA_flag:
matrix_A = self.padA(matrix_A)
self.A_normalizer = normalizer
self.matrix_A_cov = matrix_A
matrix_a = self.mul(matrix_a, 1.0 / normalizer)
if self.pada_flag:
matrix_a = self.pada(matrix_a)
self.a_normalizer = normalizer
self.matrix_a_cov = matrix_a
else:
matrix_A = self.reshape(matrix_A, (matrix_A_shape[0] * matrix_A_shape[1] * matrix_A_shape[2],
matrix_A_shape[3], -1))
matrix_A = self.reduce_mean(matrix_A, 1)
matrix_A_shape = self.shape(matrix_A)
normalizer = matrix_A_shape[1]
matrix_A = self.cast(matrix_A, mstype.float32)
matrix_A = self.matmul(matrix_A, matrix_A)
matrix_A = self.mul(matrix_A, 1.0 / normalizer)
self.A_normalizer = normalizer
self.matrix_A_cov = matrix_A
matrix_a = self.reshape(matrix_a, (matrix_a_shape[0] * matrix_a_shape[1] * matrix_a_shape[2],
matrix_a_shape[3], -1))
matrix_a = self.reduce_mean(matrix_a, 1)
matrix_a_shape = self.shape(matrix_a)
normalizer = matrix_a_shape[1]
matrix_a = self.cast(matrix_a, mstype.float32)
matrix_a = self.matmul(matrix_a, matrix_a)
matrix_a = self.mul(matrix_a, 1.0 / normalizer)
self.a_normalizer = normalizer
self.matrix_a_cov = matrix_a
output = self.conv2d(x, self.weight)
output = self.getG(output)
else:
@ -549,7 +568,7 @@ class Conv2d_Thor(_Conv):
return s
class Embedding_Thor(Cell):
class EmbeddingThor(Cell):
r"""
A simple lookup table that stores embeddings of a fixed dictionary and size.
@ -590,7 +609,7 @@ class Embedding_Thor(Cell):
def __init__(self, vocab_size, embedding_size, use_one_hot=False, embedding_table='normal',
dtype=mstype.float32, padding_idx=None):
super(Embedding_Thor, self).__init__()
super(EmbeddingThor, self).__init__()
self.vocab_size = Validator.check_value_type('vocab_size', vocab_size, [int], self.cls_name)
self.embedding_size = Validator.check_value_type('embedding_size', embedding_size, [int], self.cls_name)
Validator.check_value_type('use_one_hot', use_one_hot, [bool], self.cls_name)
@ -616,10 +635,10 @@ class Embedding_Thor(Cell):
self.reshape = P.Reshape()
self.get_shp = P.Shape()
self.thor = True
self.matrix_A = Parameter(Tensor(np.zeros([vocab_size]).astype(np.float32)),
name='matrix_A', requires_grad=False)
self.matrix_G = Parameter(Tensor(np.zeros([embedding_size, embedding_size]).astype(np.float32)),
name="matrix_G", requires_grad=False)
self.matrix_a = Parameter(Tensor(np.zeros([vocab_size]).astype(np.float32)),
name='matrix_a', requires_grad=False)
self.matrix_g = Parameter(Tensor(np.zeros([embedding_size, embedding_size]).astype(np.float32)),
name="matrix_g", requires_grad=False)
self.reduce_sum = P.ReduceSum(keep_dims=False)
self.getG = P.InsertGradientOf(self.save_gradient)
self.cast = P.Cast()
@ -638,9 +657,9 @@ class Embedding_Thor(Cell):
out = dout
shape = self.get_shp(dout)
normalizer = self.cast(shape[0], mstype.float32)
matrix_G = self.cube_matmul(dout, dout)
matrix_G = self.mul(matrix_G, 1.0 / normalizer)
self.matrix_G = matrix_G
matrix_g = self.cube_matmul(dout, dout)
matrix_g = self.mul(matrix_g, 1.0 / normalizer)
self.matrix_g = matrix_g
return out
def construct(self, ids):
@ -654,8 +673,8 @@ class Embedding_Thor(Cell):
else:
if self.thor:
one_hot_ids = self.one_hot(flat_ids, self.vocab_size, self.on_value, self.off_value)
matrix_A = self.reduce_sum(one_hot_ids, 0)
self.matrix_A = matrix_A
matrix_a = self.reduce_sum(one_hot_ids, 0)
self.matrix_a = matrix_a
output_for_reshape = self.gather(self.embedding_table, flat_ids, 0)
output_for_reshape = self.getG(output_for_reshape)
else:
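For the embedding layer the input statistic collapses to per-token occurrence counts, since matrix_a is the column sum of the one-hot ids. A NumPy sketch, illustrative only:

import numpy as np

# illustrative only: the embedding-layer input statistic as per-token counts
vocab_size = 6
flat_ids = np.array([1, 3, 3, 5])
one_hot_ids = np.eye(vocab_size, dtype=np.float32)[flat_ids]  # (num_ids, vocab_size)
matrix_a = one_hot_ids.sum(axis=0)                            # [0., 1., 0., 2., 0., 1.]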

View File

@ -29,7 +29,7 @@ from .rmsprop import RMSProp
from .proximal_ada_grad import ProximalAdagrad
from .lazyadam import LazyAdam
from .ada_grad import Adagrad
from .thor import THOR
from .thor import thor
__all__ = ['Optimizer', 'Momentum', 'LARS', 'Adam', 'AdamWeightDecay', 'LazyAdam', 'AdamOffload',
'Lamb', 'SGD', 'FTRL', 'RMSProp', 'ProximalAdagrad', 'Adagrad', 'THOR']
'Lamb', 'SGD', 'FTRL', 'RMSProp', 'ProximalAdagrad', 'Adagrad', 'thor']
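The optimizer is now exposed as the lowercase thor function, and frequency moves onto it. A placeholder sketch mirroring the ResNet training script updated later in this commit (net, lr, damping and every hyperparameter value here are assumptions):

from mindspore import Tensor
from mindspore.nn.optim import thor

# net, lr and damping are placeholders, not values from this commit
opt = thor(net, lr, Tensor(damping),           # network, learning rate, damping schedule
           0.9, 1e-4, 128.0, 32,               # momentum, weight_decay, loss_scale, batch_size
           split_indices=None, frequency=834)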

File diff suppressed because it is too large

View File

@ -43,22 +43,22 @@ class ConvertNetUntils():
if act_name == "fastgelu":
act_name = "fast_gelu"
if subcell.out_channels == 1001:
new_subcell = nn.Dense_Thor(in_channels=subcell.in_channels,
out_channels=subcell.out_channels,
weight_init=weight,
has_bias=subcell.has_bias,
bias_init='zeros',
activation=act_name)
new_subcell = nn.DenseThor(in_channels=subcell.in_channels,
out_channels=subcell.out_channels,
weight_init=weight,
has_bias=subcell.has_bias,
bias_init='zeros',
activation=act_name)
else:
compute_type = mstype.float16
if context.get_context("device_target") == "GPU":
compute_type = mstype.float32
new_subcell = nn.Dense_Thor(in_channels=subcell.in_channels,
out_channels=subcell.out_channels,
weight_init=weight,
has_bias=subcell.has_bias,
bias_init='zeros',
activation=act_name).to_float(compute_type)
new_subcell = nn.DenseThor(in_channels=subcell.in_channels,
out_channels=subcell.out_channels,
weight_init=weight,
has_bias=subcell.has_bias,
bias_init='zeros',
activation=act_name).to_float(compute_type)
if subcell.has_bias:
new_subcell.bias = subcell.bias
@ -69,9 +69,9 @@ class ConvertNetUntils():
"""
convert embedding cell to second_order cell
"""
new_subcell = nn.Embedding_Thor(vocab_size=subcell.vocab_size,
embedding_size=subcell.embedding_size,
use_one_hot=False)
new_subcell = nn.EmbeddingThor(vocab_size=subcell.vocab_size,
embedding_size=subcell.embedding_size,
use_one_hot=False)
new_subcell.embedding_table = subcell.embedding_table
return new_subcell
@ -88,9 +88,9 @@ class ConvertNetUntils():
pad_mode = subcell.pad_mode
has_bias = subcell.has_bias
weight = subcell.weight
new_subcell = nn.Conv2d_Thor(in_channel, out_channel,
kernel_size=kernel_size, stride=stride, padding=padding, pad_mode=pad_mode,
has_bias=has_bias, weight_init=weight)
new_subcell = nn.Conv2dThor(in_channel, out_channel,
kernel_size=kernel_size, stride=stride, padding=padding, pad_mode=pad_mode,
has_bias=has_bias, weight_init=weight)
return new_subcell
@ -104,7 +104,7 @@ class ConvertNetUntils():
subcell = cells[name]
if subcell == net:
continue
elif isinstance(subcell, (nn.Dense_Thor, nn.Conv2d_Thor, nn.Embedding_Thor)):
elif isinstance(subcell, (nn.DenseThor, nn.Conv2dThor, nn.EmbeddingThor)):
continue
elif isinstance(subcell, (nn.Conv2dTranspose, nn.Conv1d, nn.Conv1dTranspose, nn.BatchNorm1d, nn.GroupNorm,
nn.GlobalBatchNorm, nn.LayerNorm, nn.BatchNorm2d, nn.MaxPool2d)):
@ -113,7 +113,7 @@ class ConvertNetUntils():
prefix = subcell.param_prefix
new_subcell = self._convert_method_map[type(subcell)](subcell)
print("subcell name: ", name, "prefix is", prefix, flush=True)
if isinstance(new_subcell, (nn.Dense_Thor, nn.Embedding_Thor, nn.Conv2d_Thor)):
if isinstance(new_subcell, (nn.DenseThor, nn.EmbeddingThor, nn.Conv2dThor)):
print("convert to thor layer success.", flush=True)
new_subcell.update_parameters_name(prefix + '.')
net.insert_child_to_cell(name, new_subcell)
@ -141,19 +141,19 @@ class ConvertModelUtils():
"""
def convert_to_thor_model(self, model, network, loss_fn=None, optimizer=None, metrics=None, amp_level="O0",
loss_scale_manager=None, keep_batchnorm_fp32=False, frequency=834):
loss_scale_manager=None, keep_batchnorm_fp32=False):
"""
api for converting a model to a thor model
"""
optim_name = type(optimizer).__name__
if optim_name in ("THOR_Ascend", "THOR_GPU"):
from .model_thor import Model_Thor
if optim_name in ("ThorAscend", "ThorGpu"):
from .model_thor import ModelThor
if isinstance(network, nn.TrainOneStepCell):
model = Model_Thor(network=network, frequency=frequency)
model = ModelThor(network=network)
else:
model = Model_Thor(network=network, loss_fn=loss_fn, optimizer=optimizer, amp_level=amp_level,
loss_scale_manager=loss_scale_manager,
keep_batchnorm_fp32=keep_batchnorm_fp32, metrics=metrics, frequency=frequency)
model = ModelThor(network=network, loss_fn=loss_fn, optimizer=optimizer, amp_level=amp_level,
loss_scale_manager=loss_scale_manager,
keep_batchnorm_fp32=keep_batchnorm_fp32, metrics=metrics)
return model
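Callers no longer pass frequency here. A placeholder sketch of the updated call, matching the ResNet script changed later in this commit (model, net, loss, opt and loss_scale are placeholders):

from mindspore.train.train_thor import ConvertModelUtils

# placeholder sketch of the updated signature (frequency removed from this API)
model = ConvertModelUtils().convert_to_thor_model(model=model, network=net, loss_fn=loss,
                                                  optimizer=opt, metrics={'acc'},
                                                  amp_level="O2", loss_scale_manager=loss_scale,
                                                  keep_batchnorm_fp32=False)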

View File

@ -17,6 +17,7 @@
import math
from mindspore.train.callback import RunContext
from mindspore import context
from mindspore import nn
from mindspore.context import ParallelMode
from mindspore.train.model import Model
from mindspore.train.dataset_helper import connect_network_with_dataset
@ -25,6 +26,7 @@ from mindspore.common.dtype import pytype_to_dtype
from mindspore._c_expression import init_exec_dataset
from .dataset_helper import DatasetHelper
def _convert_type(types):
"""
Convert from numpy type to tensor type.
@ -66,7 +68,7 @@ def _exec_datagraph(exec_dataset, dataset_size, phase='dataset'):
need_run=False)
class Model_Thor(Model):
class ModelThor(Model):
"""
High-Level API for Training or Testing.
@ -104,10 +106,19 @@ class Model_Thor(Model):
"""
def __init__(self, network, loss_fn=None, optimizer=None, metrics=None, eval_network=None,
eval_indexes=None, amp_level="O0", frequency=834, **kwargs):
super(Model_Thor, self).__init__(network, loss_fn, optimizer, metrics, eval_network,
eval_indexes, amp_level, **kwargs)
self._frequency = frequency
eval_indexes=None, amp_level="O0", **kwargs):
super(ModelThor, self).__init__(network, loss_fn, optimizer, metrics, eval_network,
eval_indexes, amp_level, **kwargs)
if isinstance(network, nn.TrainOneStepCell):
self._frequency = network.optimizer.get_frequency()
else:
self._frequency = optimizer.get_frequency()
# used to stop training early, such as stopAtTime or stopAtStep
self.should_stop = False
self.switch_branch_one = True
self.index_first_order = 0
self.train_network_init_flag = True
self.has_do_dataset_init = False
self._train_network = self._build_train_network()
def _exec_preprocess(self, network, is_train, phase, dataset, dataset_sink_mode, sink_size=-1,
@ -127,6 +138,52 @@ class Model_Thor(Model):
return dataset_helper, network
def _train_gpu_sink_step(self, cb_params, inputs, list_callback, iter_first_order, run_context):
"""train gpu sink step"""
if self.switch_branch_one:
cb_params.cur_step_num += 1
if self.train_network_init_flag:
self._train_network.add_flags_recursive(thor=True)
self._train_network.phase = 'train0'
self.switch_branch_one = not self.switch_branch_one
outputs = self._train_network(*inputs)
cb_params.net_outputs = outputs
list_callback.step_end(run_context)
else:
cb_params.cur_step_num += 1
if self.train_network_init_flag:
self._train_network.add_flags_recursive(thor=False)
self.train_network_init_flag = False
self._train_network.phase = 'train1'
outputs = self._train_network(*inputs)
cb_params.net_outputs = outputs
self.index_first_order += 1
if self.index_first_order == iter_first_order:
self.index_first_order = 0
self.switch_branch_one = not self.switch_branch_one
list_callback.step_end(run_context)
def _train_ascend_sink_step(self, cb_params, train_dataset, iter_first_order, inputs, list_callback, run_context):
"""train ascend sink step"""
if self.switch_branch_one:
cb_params.cur_step_num += 1
if self.train_network_init_flag:
self._train_network.add_flags_recursive(thor=True)
self._train_network.phase = 'train0'
else:
cb_params.cur_step_num += iter_first_order
if self.train_network_init_flag:
self._train_network.add_flags_recursive(thor=False)
self.train_network_init_flag = False
self._train_network.phase = 'train1'
if not self.has_do_dataset_init:
_exec_datagraph(train_dataset, iter_first_order, phase='train1_dataset')
self.has_do_dataset_init = True
self.switch_branch_one = not self.switch_branch_one
outputs = self._train_network(*inputs)
cb_params.net_outputs = outputs
list_callback.step_end(run_context)
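The two helpers above alternate the executed graph between one second-order step ('train0', thor=True) and a run of first-order steps ('train1', thor=False). A toy sketch of the resulting schedule, where iter_first_order is an assumed example value:

# toy illustration of the switching schedule implemented above (values assumed)
iter_first_order = 3        # assumed number of first-order steps between second-order steps
switch_branch_one = True
index_first_order = 0
schedule = []
for _ in range(8):
    if switch_branch_one:
        schedule.append("train0")   # second-order step that refreshes THOR statistics
        switch_branch_one = False
    else:
        schedule.append("train1")   # plain first-order step
        index_first_order += 1
        if index_first_order == iter_first_order:
            index_first_order = 0
            switch_branch_one = True
print(schedule)  # ['train0', 'train1', 'train1', 'train1', 'train0', 'train1', 'train1', 'train1']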
def _train_dataset_sink_process(self, epoch, train_dataset, list_callback=None, cb_params=None, sink_size=-1):
"""
Training process. The data is passed to the network through the dataset channel.
@ -166,13 +223,6 @@ class Model_Thor(Model):
run_context = RunContext(cb_params)
list_callback.begin(run_context)
# used to stop training early, such as stopAtTime or stopAtStep
should_stop = False
switch_branch_one = True
index_first_order = 0
train_network_init_flag = True
has_do_dataset_init = False
for i in range(epoch):
cb_params.cur_epoch_num = i + 1
list_callback.epoch_begin(run_context)
@ -182,55 +232,17 @@ class Model_Thor(Model):
inputs = _to_full_tensor(inputs, self._device_number, self._global_rank)
list_callback.step_begin(run_context)
if context.get_context("device_target") == "GPU":
if switch_branch_one:
cb_params.cur_step_num += 1
if train_network_init_flag:
self._train_network.add_flags_recursive(thor=True)
self._train_network.phase = 'train0'
switch_branch_one = not switch_branch_one
outputs = self._train_network(*inputs)
cb_params.net_outputs = outputs
list_callback.step_end(run_context)
else:
cb_params.cur_step_num += 1
if train_network_init_flag:
self._train_network.add_flags_recursive(thor=False)
train_network_init_flag = False
self._train_network.phase = 'train1'
outputs = self._train_network(*inputs)
cb_params.net_outputs = outputs
index_first_order += 1
if index_first_order == iter_first_order:
index_first_order = 0
switch_branch_one = not switch_branch_one
list_callback.step_end(run_context)
self._train_gpu_sink_step(cb_params, inputs, list_callback, iter_first_order, run_context)
else:
if switch_branch_one:
cb_params.cur_step_num += 1
if train_network_init_flag:
self._train_network.add_flags_recursive(thor=True)
self._train_network.phase = 'train0'
else:
cb_params.cur_step_num += iter_first_order
if train_network_init_flag:
self._train_network.add_flags_recursive(thor=False)
train_network_init_flag = False
self._train_network.phase = 'train1'
if not has_do_dataset_init:
_exec_datagraph(train_dataset, iter_first_order, phase='train1_dataset')
has_do_dataset_init = True
switch_branch_one = not switch_branch_one
outputs = self._train_network(*inputs)
cb_params.net_outputs = outputs
list_callback.step_end(run_context)
self._train_ascend_sink_step(cb_params, train_dataset, iter_first_order, inputs, list_callback,
run_context)
list_callback.epoch_end(run_context)
should_stop = should_stop or run_context.get_stop_requested()
if should_stop:
self.should_stop = self.should_stop or run_context.get_stop_requested()
if self.should_stop:
break
dataset_helper.stop_send()
list_callback.end(run_context)
__all__ = ["Model_Thor"]
__all__ = ["ModelThor"]

View File

@ -18,7 +18,7 @@ import argparse
import ast
from mindspore import context
from mindspore import Tensor
from mindspore.nn.optim import Momentum, THOR
from mindspore.nn.optim import Momentum, thor
from mindspore.train.model import Model
from mindspore.context import ParallelMode
from mindspore.train.train_thor import ConvertModelUtils
@ -235,12 +235,11 @@ if __name__ == '__main__':
from src.lr_generator import get_thor_damping
damping = get_thor_damping(0, config.damping_init, config.damping_decay, 70, step_size)
split_indices = [26, 53]
opt = THOR(net, lr, Tensor(damping), config.momentum, config.weight_decay, config.loss_scale,
config.batch_size, split_indices=split_indices)
opt = thor(net, lr, Tensor(damping), config.momentum, config.weight_decay, config.loss_scale,
config.batch_size, split_indices=split_indices, frequency=config.frequency)
model = ConvertModelUtils().convert_to_thor_model(model=model, network=net, loss_fn=loss, optimizer=opt,
loss_scale_manager=loss_scale, metrics={'acc'},
amp_level="O2", keep_batchnorm_fp32=False,
frequency=config.frequency)
amp_level="O2", keep_batchnorm_fp32=False)
args_opt.run_eval = False
logger.warning("Thor optimizer not support evaluation while training.")

View File

@ -29,7 +29,7 @@ from mindspore.nn.wrap.loss_scale import DynamicLossScaleUpdateCell
from mindspore.train.callback import ModelCheckpoint, CheckpointConfig, TimeMonitor
from mindspore.train.serialization import load_checkpoint, load_param_into_net
from mindspore.train.train_thor import ConvertModelUtils
from mindspore.nn.optim import Lamb, Momentum, AdamWeightDecay, THOR
from mindspore.nn.optim import Lamb, Momentum, AdamWeightDecay, thor
from mindspore import log as logger
from mindspore.common import set_seed
from src import BertNetworkWithLoss, BertTrainOneStepCell, BertTrainOneStepWithLossScaleCell, \
@ -106,20 +106,14 @@ def _get_optimizer(args_opt, network):
damping = get_bert_thor_damping(cfg.Thor.damping_max, cfg.Thor.damping_min, cfg.Thor.damping_power,
cfg.Thor.damping_total_steps)
split_indices = None
if bert_net_cfg.num_hidden_layers == 12:
if bert_net_cfg.use_relative_positions:
split_indices = [29, 58, 87, 116, 145, 174, 203, 217]
else:
split_indices = [28, 55, 82, 109, 136, 163, 190, 205]
elif bert_net_cfg.num_hidden_layers == 24:
if bert_net_cfg.use_relative_positions:
split_indices = [30, 90, 150, 210, 270, 330, 390, 421]
else:
split_indices = [38, 93, 148, 203, 258, 313, 368, 397]
optimizer = THOR(network, lr, damping, cfg.Thor.momentum,
if bert_net_cfg.num_hidden_layers == 12 and not bert_net_cfg.use_relative_positions:
split_indices = [28, 55, 77]
elif bert_net_cfg.num_hidden_layers == 24 and not bert_net_cfg.use_relative_positions:
split_indices = [38, 93, 149]
optimizer = thor(network, lr, damping, cfg.Thor.momentum,
cfg.Thor.weight_decay, cfg.Thor.loss_scale, cfg.batch_size,
decay_filter=lambda x: 'layernorm' not in x.name.lower() and 'bias' not in x.name.lower(),
split_indices=split_indices)
split_indices=split_indices, enable_clip_grad=True, frequency=cfg.Thor.frequency)
else:
raise ValueError("Don't support optimizer {}, only support [Lamb, Momentum, AdamWeightDecay, Thor]".
format(cfg.optimizer))
@ -278,11 +272,10 @@ def run_pretrain():
accumulation_steps=accumulation_steps,
enable_global_norm=enable_global_norm)
else:
net_with_grads = BertTrainOneStepCell(net_with_loss, optimizer=optimizer)
net_with_grads = BertTrainOneStepCell(net_with_loss, optimizer=optimizer, enable_clip_grad=False)
model = Model(net_with_grads)
model = ConvertModelUtils().convert_to_thor_model(model, network=net_with_grads, optimizer=optimizer,
frequency=cfg.Thor.frequency)
model = ConvertModelUtils().convert_to_thor_model(model, network=net_with_grads, optimizer=optimizer)
model.train(new_repeat_count, ds, callbacks=callback,
dataset_sink_mode=(args_opt.enable_data_sink == "true"), sink_size=args_opt.data_sink_steps)

View File

@ -269,12 +269,14 @@ class BertTrainOneStepCell(nn.TrainOneStepCell):
network (Cell): The training network. Note that the loss function should have been added.
optimizer (Optimizer): Optimizer for updating the weights.
sens (Number): The adjust parameter. Default: 1.0.
enable_clip_grad (boolean): If True, clip gradients in BertTrainOneStepCell. Default: False.
"""
def __init__(self, network, optimizer, sens=1.0):
def __init__(self, network, optimizer, sens=1.0, enable_clip_grad=False):
super(BertTrainOneStepCell, self).__init__(network, optimizer, sens)
self.cast = P.Cast()
self.hyper_map = C.HyperMap()
self.enable_clip_grad = enable_clip_grad
def set_sens(self, value):
self.sens = value
@ -306,7 +308,8 @@ class BertTrainOneStepCell(nn.TrainOneStepCell):
masked_lm_weights,
self.cast(F.tuple_to_array((self.sens,)),
mstype.float32))
grads = self.hyper_map(F.partial(clip_grad, GRADIENT_CLIP_TYPE, GRADIENT_CLIP_VALUE), grads)
if self.enable_clip_grad:
grads = self.hyper_map(F.partial(clip_grad, GRADIENT_CLIP_TYPE, GRADIENT_CLIP_VALUE), grads)
grads = self.grad_reducer(grads)
succ = self.optimizer(grads)
return F.depend(loss, succ)
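Clipping is now opt-in at the cell level because the thor optimizer can clip gradients internally. A placeholder sketch of the pairing used by the pretraining and test scripts in this commit (all names and values are placeholders):

# placeholder sketch: thor clips internally (enable_clip_grad=True), so the wrapping
# cell is built with clipping disabled
optimizer = thor(net_with_loss, lr, damping, momentum, weight_decay, loss_scale,
                 batch_size, split_indices=None, enable_clip_grad=True, frequency=frequency)
net_with_grads = BertTrainOneStepCell(net_with_loss, optimizer=optimizer,
                                      sens=loss_scale, enable_clip_grad=False)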

View File

@ -28,7 +28,7 @@ from mindspore import log as logger
from mindspore.train.callback import Callback
from mindspore.context import ParallelMode
from mindspore.train.serialization import load_checkpoint, load_param_into_net
from mindspore.nn.optim import THOR
from mindspore.nn.optim import thor
from mindspore.train.model import Model
from mindspore.train.train_thor import ConvertModelUtils
import mindspore.dataset.transforms.c_transforms as C
@ -166,10 +166,10 @@ def train_process_bert_thor(q, device_id, epoch_size, device_num):
lr = get_bert_thor_lr()
damping = get_bert_thor_damping()
split_indices = [38, 77]
optimizer = THOR(net_with_loss, lr, damping, momentum, weight_decay, loss_scale, batch_size,
split_indices = None
optimizer = thor(net_with_loss, lr, damping, momentum, weight_decay, loss_scale, batch_size,
decay_filter=lambda x: 'layernorm' not in x.name.lower() and 'bias' not in x.name.lower(),
split_indices=split_indices)
split_indices=split_indices, enable_clip_grad=True, frequency=frequency)
time_monitor_callback = TimeMonitor(data_sink_steps)
loss_callback = LossCallback()
callback = [time_monitor_callback, loss_callback]
@ -178,10 +178,9 @@ def train_process_bert_thor(q, device_id, epoch_size, device_num):
param_dict = load_checkpoint(load_checkpoint_path)
load_param_into_net(net_with_loss, param_dict)
net_with_grads = BertTrainOneStepCell(net_with_loss, optimizer=optimizer)
net_with_grads = BertTrainOneStepCell(net_with_loss, optimizer=optimizer, sens=loss_scale, enable_clip_grad=False)
model = Model(net_with_grads)
model = ConvertModelUtils().convert_to_thor_model(model, network=net_with_grads, optimizer=optimizer,
frequency=frequency)
model = ConvertModelUtils().convert_to_thor_model(model, network=net_with_grads, optimizer=optimizer)
model.train(new_repeat_count, data_set, callbacks=callback, dataset_sink_mode=True, sink_size=data_sink_steps)
loss_list = loss_callback.loss_list

View File

@ -24,7 +24,7 @@ from mindspore.nn.optim.optimizer import Optimizer
from mindspore.parallel._utils import _get_device_num, _get_gradients_mean
from mindspore import context
from mindspore.context import ParallelMode
from mindspore.nn.layer import Dense_Thor, Conv2d_Thor, Embedding_Thor
from mindspore.nn.layer import DenseThor, Conv2dThor, EmbeddingThor
from mindspore.nn.wrap import DistributedGradReducer
from mindspore.train.train_thor.convert_utils import ConvertNetUntils
from mindspore.parallel._auto_parallel_context import auto_parallel_context
@ -106,11 +106,11 @@ def find_net_layertype_recur(net, layertype_map):
subcell = cells[name]
if subcell == net:
continue
elif isinstance(subcell, Conv2d_Thor):
elif isinstance(subcell, Conv2dThor):
layertype_map.append(Conv)
elif isinstance(subcell, Dense_Thor):
elif isinstance(subcell, DenseThor):
layertype_map.append(FC)
elif isinstance(subcell, Embedding_Thor):
elif isinstance(subcell, EmbeddingThor):
layertype_map.append(Embedding)
elif isinstance(subcell, nn.LayerNorm):
layertype_map.append(LayerNorm)
@ -168,10 +168,10 @@ class THOR_Ascend(Optimizer):
self.hyper_map = C.HyperMap()
self.opt = P.ApplyMomentum()
self.net = net
self.matrix_A_cov = ParameterTuple(filter(lambda x: 'matrix_A' in x.name, net.get_parameters()))
self.matrix_G_cov = ParameterTuple(filter(lambda x: 'matrix_G' in x.name, net.get_parameters()))
self.A_normalizer = ParameterTuple(filter(lambda x: 'A_normalizer' in x.name, net.get_parameters()))
self.G_normalizer = ParameterTuple(filter(lambda x: 'G_normalizer' in x.name, net.get_parameters()))
self.matrix_A_cov = ParameterTuple(filter(lambda x: 'matrix_a' in x.name, net.get_parameters()))
self.matrix_G_cov = ParameterTuple(filter(lambda x: 'matrix_g' in x.name, net.get_parameters()))
self.A_normalizer = ParameterTuple(filter(lambda x: 'a_normalizer' in x.name, net.get_parameters()))
self.G_normalizer = ParameterTuple(filter(lambda x: 'g_normalizer' in x.name, net.get_parameters()))
self.cube_matmul_left = P.CusMatMulCubeFraczLeftCast()
self.cube_matmul_left_fc = P.CusMatMulCubeDenseLeft()
self.cube_matmul_right_fc = P.CusMatMulCubeDenseRight()
@ -188,7 +188,7 @@ class THOR_Ascend(Optimizer):
self.diag_block_dim = 128
self.matrix_A = ()
self.matrix_G = ()
print("matrix_A_cov len is", len(self.matrix_A_cov))
print("matrix_a_cov len is", len(self.matrix_A_cov))
self.thor_layer_count = 0
self.conv_layer_count = 0
self.weight_fim_idx_map = ()