forked from mindspore-Ecosystem/mindspore
fix resnet50 of thor
parent 40ca285ab3
commit fea69fe0e6
File diff suppressed because it is too large
File diff suppressed because it is too large
@@ -46,46 +46,12 @@ matmul_cube_dense_left_op_info = TBERegOp("CusMatMulCubeDenseLeft") \
.dtype_format(DataType.F16_Default, DataType.F16_FracNZ, DataType.F16_Default, DataType.F16_FracNZ) \
|
||||
.get_op_info()
|
||||
|
||||
|
||||
# @util.check_input_type(dict, dict, (dict, NoneType), dict, bool, bool, str)
|
||||
@op_info_register(matmul_cube_dense_left_op_info)
|
||||
def cus_matmul_cube_dense_left(input_x1, input_x2, bias=None, output_y=None, trans_a=False, trans_b=False,
|
||||
kernel_name="cus_matmul_cube_dense_left"):
|
||||
"""
|
||||
calculating matrix multiplication with bias, C = A*B + bias, support input
|
||||
data with fractal format.
|
||||
|
||||
Parameters:
|
||||
shape_a: list or tuple
|
||||
Shape of the first tensor a with rank > 1
|
||||
shape_b: list or tuple
|
||||
Shape of the second tensor b with the same type with a,
|
||||
and shape_a, shape_b must be 2 dims
|
||||
src_dtype: str
|
||||
The data type of input, support "float32", "float16"
|
||||
dst_dtype: str
|
||||
The data type of output, support "float32", "float16"
|
||||
trans_a: bool
|
||||
If True, shape_a is transposed before multiplication
|
||||
trans_b: bool
|
||||
If True, shape_b is transposed before multiplication
|
||||
is_fractal: bool
|
||||
If True, the input data format of a and b must be fractal format
|
||||
shape_bias: list or tuple
|
||||
Shape of bias, only support the input data format with ND
|
||||
|
||||
Returns
|
||||
-------
|
||||
None
|
||||
"""
|
||||
print("!!!!come into zzt~~~~~~~!!!!")
|
||||
def shape_gen1(input_x1, input_x2, output_y, kernel_name, trans_a, trans_b):
|
||||
"""shape gen1"""
|
||||
shape_a = input_x1.get("ori_shape")
|
||||
shape_b = input_x2.get("ori_shape")
|
||||
shape_output = output_y.get("ori_shape")
|
||||
print("============")
|
||||
print(input_x1.get("format"), input_x2.get("format"))
|
||||
print(shape_a, shape_b)
|
||||
print("============")
|
||||
|
||||
if input_x2.get("format") == "FRACTAL_Z":
|
||||
n, c, h, w = shape_b
|
||||
c0 = 16
@@ -115,7 +81,6 @@ def cus_matmul_cube_dense_left(input_x1, input_x2, bias=None, output_y=None, tra
shape_a = _get_input_shape(shape_a)
|
||||
shape_b = _get_input_shape(shape_b)
|
||||
|
||||
util.check_kernel_name(kernel_name)
|
||||
util.check_shape_rule(shape_a)
|
||||
util.check_shape_rule(shape_b)
@@ -127,7 +92,10 @@ def cus_matmul_cube_dense_left(input_x1, input_x2, bias=None, output_y=None, tra
shape_b = [shape_b[1], shape_b[0]]
|
||||
trans_b = bool(1 - trans_b)
|
||||
return shape_a, shape_b, trans_a, trans_b, shape_output
|
||||
|
||||
def shape_gen2(bias, input_x1, output_y, shape_a, shape_b, trans_a, trans_b):
|
||||
"""shape gen2"""
|
||||
shape_bias = ()
|
||||
if bias is not None and bool(bias):
|
||||
shape_bias = bias.get("shape")
@@ -174,11 +142,106 @@ def cus_matmul_cube_dense_left(input_x1, input_x2, bias=None, output_y=None, tra
format_a = "FRACTAL_NZ"
|
||||
shape_b_temp = (shape_b_temp[0], shape_b_temp[1], shape_b_temp[2], shape_b_temp[3])
|
||||
format_b = "FRACTAL_NZ"
|
||||
return shape_a_temp, format_a, shape_b_temp, format_b, shape_bias, src_dtype, dst_dtype
|
||||
|
||||
print("=======================================")
|
||||
print(shape_a_temp, shape_b_temp)
|
||||
print(format_a, format_b)
|
||||
print("=======================================")
|
||||
def core(shape_a_temp, shape_b_temp, shape_output, kernel_name):
|
||||
"""core func"""
|
||||
if util.get_product_version() == util.VERSION_MINI:
|
||||
tik_instance = tik.Tik(tik.Dprofile("v100", "mini"))
|
||||
else:
|
||||
tik_instance = tik.Tik(tik.Dprofile("v100", "cloud"))
|
||||
|
||||
input_x1 = tik_instance.Tensor("float16", shape_a_temp, name="left_matrix", scope=tik.scope_gm)
|
||||
input_x2 = tik_instance.Tensor("float16", shape_b_temp, name="right_matrix", scope=tik.scope_gm)
|
||||
resmatmul = tik_instance.Tensor("float16", shape_output, name="output", scope=tik.scope_gm)
|
||||
with tik_instance.for_range(0, 32, block_num=32) as block_index:
|
||||
resmatmul_local_ub = tik_instance.Tensor("float16", (128 * 256,), scope=tik.scope_ubuf,
|
||||
name="resmatmul_local_ub")
|
||||
resmatmul_local_ub_local_l0c = tik_instance.Tensor("float32", (128 * 256,), scope=tik.scope_cc,
|
||||
name="resmatmul_local_ub")
|
||||
input_1_local_l1_local_l0a = tik_instance.Tensor("float16", (128 * 128,), scope=tik.scope_ca,
|
||||
name="input_1_local_l1_local_l0a")
|
||||
input_2_local_l1 = tik_instance.Tensor("float16", (128 * 256,), scope=tik.scope_cbuf,
|
||||
name="input_2_local_l1")
|
||||
input_1_local_l1 = tik_instance.Tensor("float16", (128 * 128,), scope=tik.scope_cbuf,
|
||||
name="input_1_local_l1")
|
||||
input_2_local_l1_local_l0b = tik_instance.Tensor("float16", (128 * 256,), scope=tik.scope_cb,
|
||||
name="input_2_local_l1_local_l0b")
|
||||
core_m_idx = block_index % 8
|
||||
core_n_idx = block_index // 8
|
||||
with tik_instance.if_scope(core_m_idx != 7):
|
||||
tik_instance.data_move(input_1_local_l1, input_x1[core_m_idx * (8 * 256 + 128 * 1008)], 0, 8, 128,
|
||||
55 * 16, 0)
|
||||
tik_instance.data_move(input_2_local_l1, input_x2[core_m_idx * 8 * 256 + core_n_idx * 512 * 1008], 0,
|
||||
32, 128, 55 * 16, 0)
|
||||
with tik_instance.for_range(0, 8) as cc12:
|
||||
tik_instance.load2dv1(input_1_local_l1_local_l0a[cc12 * 2048], input_1_local_l1[cc12 * 256], 0, 8,
|
||||
8, 0, False)
|
||||
with tik_instance.for_range(0, 2) as cc6:
|
||||
with tik_instance.for_range(0, 8) as cc121:
|
||||
tik_instance.load2dv1(input_2_local_l1_local_l0b[cc121 * 4096],
|
||||
input_2_local_l1[cc6 * 32768 + cc121 * 256], 0, 16, 8, 0, True)
|
||||
tik_instance.mmad(resmatmul_local_ub_local_l0c, input_1_local_l1_local_l0a,
|
||||
input_2_local_l1_local_l0b, 128, 128, 256, 0)
|
||||
tik_instance.data_move(resmatmul_local_ub, resmatmul_local_ub_local_l0c, 0, 1, 128, 0, 0, 1)
|
||||
tik_instance.data_move(resmatmul[cc6 * 256 * 1008 + core_m_idx * 8 * 256 + core_n_idx * 512 * 1008]
|
||||
, resmatmul_local_ub, 0, 16, 256 // 2, 0, 55 * 16 * 2 // 2)
|
||||
with tik_instance.else_scope():
|
||||
tik_instance.data_move(input_1_local_l1, input_x1[core_m_idx * (8 * 256 + 128 * 1008)], 0, 7, 112,
|
||||
56 * 16, 0)
|
||||
tik_instance.data_move(input_2_local_l1, input_x2[core_m_idx * 8 * 256 + core_n_idx * 512 * 1008], 0,
|
||||
32, 112, 56 * 16, 0)
|
||||
with tik_instance.for_range(0, 7) as cc10:
|
||||
tik_instance.load2dv1(input_1_local_l1_local_l0a[cc10 * 1792], input_1_local_l1[cc10 * 256], 0, 7,
|
||||
7, 0, False)
|
||||
with tik_instance.for_range(0, 2) as cc5:
|
||||
with tik_instance.for_range(0, 7) as cc101:
|
||||
tik_instance.load2dv1(input_2_local_l1_local_l0b[cc101 * 4096],
|
||||
input_2_local_l1[cc5 * 28672 + cc101 * 256], 0, 16, 7, 0, True)
|
||||
tik_instance.mmad(resmatmul_local_ub_local_l0c, input_1_local_l1_local_l0a,
|
||||
input_2_local_l1_local_l0b, 112, 112, 256, 0)
|
||||
tik_instance.data_move(resmatmul_local_ub, resmatmul_local_ub_local_l0c, 0, 1, 112, 0, 0, 1)
|
||||
tik_instance.data_move(resmatmul[cc5 * 256 * 1008 + core_m_idx * 8 * 256 + core_n_idx * 512 * 1008]
|
||||
, resmatmul_local_ub, 0, 16, 224 // 2, 0, 56 * 16 * 2 // 2)
|
||||
tik_instance.BuildCCE(kernel_name=kernel_name, inputs=[input_x1, input_x2], outputs=[resmatmul])
|
||||
return tik_instance
|
||||
|
||||
@op_info_register(matmul_cube_dense_left_op_info)
|
||||
def cus_matmul_cube_dense_left(input_x1, input_x2, bias=None, output_y=None, trans_a=False, trans_b=False,
|
||||
kernel_name="cus_matmul_cube_dense_left"):
|
||||
"""
|
||||
calculating matrix multiplication with bias, C = A*B + bias, support input
|
||||
data with fractal format.
|
||||
|
||||
Parameters:
|
||||
shape_a: list or tuple
|
||||
Shape of the first tensor a with rank > 1
|
||||
shape_b: list or tuple
|
||||
Shape of the second tensor b with the same type with a,
|
||||
and shape_a, shape_b must be 2 dims
|
||||
src_dtype: str
|
||||
The data type of input, support "float32", "float16"
|
||||
dst_dtype: str
|
||||
The data type of output, support "float32", "float16"
|
||||
trans_a: bool
|
||||
If True, shape_a is transposed before multiplication
|
||||
trans_b: bool
|
||||
If True, shape_b is transposed before multiplication
|
||||
is_fractal: bool
|
||||
If True, the input data format of a and b must be fractal format
|
||||
shape_bias: list or tuple
|
||||
Shape of bias, only support the input data format with ND
|
||||
|
||||
Returns
|
||||
-------
|
||||
None
|
||||
"""
|
||||
shape_a, shape_b, trans_a, trans_b, shape_output = shape_gen1(input_x1, input_x2, output_y, kernel_name,
|
||||
trans_a, trans_b)
|
||||
shape_a_temp, format_a, shape_b_temp, format_b, shape_bias, src_dtype, dst_dtype = shape_gen2(bias, input_x1,
|
||||
output_y, shape_a,
|
||||
shape_b, trans_a,
|
||||
trans_b)
|
||||
tensor_bias = None
|
||||
tensor_a = tvm.placeholder(shape_a_temp, name='tensor_a',
|
||||
dtype=src_dtype)
@@ -190,67 +253,9 @@ def cus_matmul_cube_dense_left(input_x1, input_x2, bias=None, output_y=None, tra
dtype=dst_dtype)
|
||||
|
||||
if shape_a_temp[0] == 63 and shape_a_temp[1] == 63 and shape_b_temp[0] == 128 and shape_b_temp[1] == 63:
|
||||
if util.get_product_version() == util.VERSION_MINI:
|
||||
tik_instance = tik.Tik(tik.Dprofile("v100", "mini"))
|
||||
else:
|
||||
tik_instance = tik.Tik(tik.Dprofile("v100", "cloud"))
|
||||
|
||||
input_x1 = tik_instance.Tensor("float16", shape_a_temp, name="left_matrix", scope=tik.scope_gm)
|
||||
input_x2 = tik_instance.Tensor("float16", shape_b_temp, name="right_matrix", scope=tik.scope_gm)
|
||||
resmatmul = tik_instance.Tensor("float16", shape_output, name="output", scope=tik.scope_gm)
|
||||
with tik_instance.for_range(0, 32, block_num=32) as block_index:
|
||||
resmatmul_local_ub = tik_instance.Tensor("float16", (128 * 256,), scope=tik.scope_ubuf,
|
||||
name="resmatmul_local_ub")
|
||||
resmatmul_local_ub_local_l0c = tik_instance.Tensor("float32", (128 * 256,), scope=tik.scope_cc,
|
||||
name="resmatmul_local_ub")
|
||||
input_1_local_l1_local_l0a = tik_instance.Tensor("float16", (128 * 128,), scope=tik.scope_ca,
|
||||
name="input_1_local_l1_local_l0a")
|
||||
input_2_local_l1 = tik_instance.Tensor("float16", (128 * 256,), scope=tik.scope_cbuf,
|
||||
name="input_2_local_l1")
|
||||
input_1_local_l1 = tik_instance.Tensor("float16", (128 * 128,), scope=tik.scope_cbuf,
|
||||
name="input_1_local_l1")
|
||||
input_2_local_l1_local_l0b = tik_instance.Tensor("float16", (128 * 256,), scope=tik.scope_cb,
|
||||
name="input_2_local_l1_local_l0b")
|
||||
core_m_idx = block_index % 8
|
||||
core_n_idx = block_index // 8
|
||||
with tik_instance.if_scope(core_m_idx != 7):
|
||||
tik_instance.data_move(input_1_local_l1, input_x1[core_m_idx * (8 * 256 + 128 * 1008)], 0, 8, 128,
|
||||
55 * 16, 0)
|
||||
tik_instance.data_move(input_2_local_l1, input_x2[core_m_idx * 8 * 256 + core_n_idx * 512 * 1008], 0,
|
||||
32, 128, 55 * 16, 0)
|
||||
with tik_instance.for_range(0, 8) as cc12:
|
||||
tik_instance.load2dv1(input_1_local_l1_local_l0a[cc12 * 2048], input_1_local_l1[cc12 * 256], 0, 8,
|
||||
8, 0, False)
|
||||
with tik_instance.for_range(0, 2) as cc6:
|
||||
with tik_instance.for_range(0, 8) as cc121:
|
||||
tik_instance.load2dv1(input_2_local_l1_local_l0b[cc121 * 4096],
|
||||
input_2_local_l1[cc6 * 32768 + cc121 * 256], 0, 16, 8, 0, True)
|
||||
tik_instance.mmad(resmatmul_local_ub_local_l0c, input_1_local_l1_local_l0a,
|
||||
input_2_local_l1_local_l0b, 128, 128, 256, 0)
|
||||
tik_instance.data_move(resmatmul_local_ub, resmatmul_local_ub_local_l0c, 0, 1, 128, 0, 0, 1)
|
||||
tik_instance.data_move(resmatmul[cc6 * 256 * 1008 + core_m_idx * 8 * 256 + core_n_idx * 512 * 1008]
|
||||
, resmatmul_local_ub, 0, 16, 256 // 2, 0, 55 * 16 * 2 // 2)
|
||||
with tik_instance.else_scope():
|
||||
tik_instance.data_move(input_1_local_l1, input_x1[core_m_idx * (8 * 256 + 128 * 1008)], 0, 7, 112,
|
||||
56 * 16, 0)
|
||||
tik_instance.data_move(input_2_local_l1, input_x2[core_m_idx * 8 * 256 + core_n_idx * 512 * 1008], 0,
|
||||
32, 112, 56 * 16, 0)
|
||||
with tik_instance.for_range(0, 7) as cc10:
|
||||
tik_instance.load2dv1(input_1_local_l1_local_l0a[cc10 * 1792], input_1_local_l1[cc10 * 256], 0, 7,
|
||||
7, 0, False)
|
||||
with tik_instance.for_range(0, 2) as cc5:
|
||||
with tik_instance.for_range(0, 7) as cc101:
|
||||
tik_instance.load2dv1(input_2_local_l1_local_l0b[cc101 * 4096],
|
||||
input_2_local_l1[cc5 * 28672 + cc101 * 256], 0, 16, 7, 0, True)
|
||||
tik_instance.mmad(resmatmul_local_ub_local_l0c, input_1_local_l1_local_l0a,
|
||||
input_2_local_l1_local_l0b, 112, 112, 256, 0)
|
||||
tik_instance.data_move(resmatmul_local_ub, resmatmul_local_ub_local_l0c, 0, 1, 112, 0, 0, 1)
|
||||
tik_instance.data_move(resmatmul[cc5 * 256 * 1008 + core_m_idx * 8 * 256 + core_n_idx * 512 * 1008]
|
||||
, resmatmul_local_ub, 0, 16, 224 // 2, 0, 56 * 16 * 2 // 2)
|
||||
tik_instance.BuildCCE(kernel_name=kernel_name, inputs=[input_x1, input_x2], outputs=[resmatmul])
|
||||
tik_instance = core(shape_a_temp, shape_b_temp, shape_output, kernel_name)
|
||||
return tik_instance
|
||||
|
||||
print("come into tbe, shape is error!")
|
||||
result = te.lang.cce.matmul(tensor_a, tensor_b, trans_a, trans_b, format_a=format_a,
|
||||
format_b=format_b, dst_dtype=dst_dtype, tensor_bias=tensor_bias)
@@ -130,7 +130,7 @@ class CusFusedAbsMax1(PrimitiveWithInfer):
"""Initialize CusFusedAbsMax1"""
|
||||
self.init_prim_io_names(inputs=['x1'], outputs=['y'])
|
||||
self.origin_shape = origin_shape
|
||||
from mindspore.ops._op_impl._custom_op.fused_abs_max1_impl import CusFusedAbsMax1
|
||||
from mindspore.ops._op_impl._custom_op.fused_abs_max1_impl import cus_fused_abs_max1
|
||||
|
||||
def infer_shape(self, data1_shape):
|
||||
ll = []
@@ -169,7 +169,7 @@ class CusImg2Col(PrimitiveWithInfer):
self.strides = strides
|
||||
self.dilates = dilates
|
||||
self.mode = mode
|
||||
from mindspore.ops._op_impl._custom_op.img2col_impl import CusImg2Col
|
||||
from mindspore.ops._op_impl._custom_op.img2col_impl import cus_img2col
|
||||
|
||||
def infer_shape(self, data1_shape):
|
||||
bs, c, h, w = data1_shape
@@ -22,7 +22,7 @@ from mindspore.train.serialization import load_checkpoint, load_param_into_net
from src.crossentropy import CrossEntropy
|
||||
from src.config import config
|
||||
from src.dataset import create_dataset
|
||||
from src.resnet_thor import resnet50 as resnet
|
||||
from src.resnet import resnet50 as resnet
|
||||
|
||||
parser = argparse.ArgumentParser(description='Image classification')
|
||||
parser.add_argument('--checkpoint_path', type=str, default=None, help='Checkpoint file path')
@@ -25,17 +25,18 @@ config = ed({
"momentum": 0.9,
|
||||
"weight_decay": 5e-4,
|
||||
"epoch_size": 45,
|
||||
"pretrain_epoch_size": 0,
|
||||
"save_checkpoint": True,
|
||||
"save_checkpoint_epochs": 1,
|
||||
"save_checkpoint_epochs": 2,
|
||||
"keep_checkpoint_max": 15,
|
||||
"save_checkpoint_path": "./",
|
||||
"use_label_smooth": True,
|
||||
"label_smooth_factor": 0.1,
|
||||
"lr_init": 0.045,
|
||||
"lr_decay": 6,
|
||||
"lr_end_epoch": 70,
|
||||
"damping_init": 0.03,
|
||||
"damping_decay": 0.87,
|
||||
"lr_init": 0.05803,
|
||||
"lr_decay": 4.04839,
|
||||
"lr_end_epoch": 53,
|
||||
"damping_init": 0.02714,
|
||||
"damping_decay": 0.50036,
|
||||
"frequency": 834,
|
||||
"use_dynamic_frequency": False,
|
||||
"first_stage_steps": 835,
@@ -16,12 +16,63 @@
import mindspore.nn as nn
|
||||
from mindspore import Tensor
|
||||
from mindspore.common import dtype as mstype
|
||||
from mindspore.nn.loss.loss import _Loss
|
||||
from mindspore.ops import functional as F
|
||||
from mindspore.ops import operations as P
|
||||
|
||||
|
||||
class CrossEntropy(_Loss):
|
||||
class Loss(nn.Cell):
|
||||
"""
|
||||
Base class for other losses.
|
||||
"""
|
||||
def __init__(self, reduction='mean'):
|
||||
super(Loss, self).__init__()
|
||||
if reduction is None:
|
||||
reduction = 'none'
|
||||
|
||||
if reduction not in ('mean', 'sum', 'none'):
|
||||
raise ValueError(f"reduction method for {reduction.lower()} is not supported")
|
||||
|
||||
self.average = True
|
||||
self.reduce = True
|
||||
if reduction == 'sum':
|
||||
self.average = False
|
||||
if reduction == 'none':
|
||||
self.reduce = False
|
||||
|
||||
self.reduce_mean = P.ReduceMean()
|
||||
self.reduce_sum = P.ReduceSum()
|
||||
self.mul = P.Mul()
|
||||
self.cast = P.Cast()
|
||||
|
||||
def get_axis(self, x):
|
||||
shape = F.shape(x)
|
||||
length = F.tuple_len(shape)
|
||||
perm = F.make_range(0, length)
|
||||
return perm
|
||||
|
||||
def get_loss(self, x, weights=1.0):
|
||||
"""
|
||||
Computes the weighted loss
|
||||
Args:
|
||||
weights: Optional `Tensor` whose rank is either 0, or the same rank as inputs, and must be broadcastable to
|
||||
inputs (i.e., all dimensions must be either `1`, or the same as the corresponding inputs dimension).
|
||||
"""
|
||||
input_dtype = x.dtype
|
||||
x = self.cast(x, mstype.float32)
|
||||
weights = self.cast(weights, mstype.float32)
|
||||
x = self.mul(weights, x)
|
||||
if self.reduce and self.average:
|
||||
x = self.reduce_mean(x, self.get_axis(x))
|
||||
if self.reduce and not self.average:
|
||||
x = self.reduce_sum(x, self.get_axis(x))
|
||||
x = self.cast(x, input_dtype)
|
||||
return x
|
||||
|
||||
def construct(self, base, target):
|
||||
raise NotImplementedError
|
||||
|
||||
|
||||
class CrossEntropy(Loss):
|
||||
"""CrossEntropy"""
|
||||
def __init__(self, smooth_factor=0., num_classes=1000):
|
||||
super(CrossEntropy, self).__init__()
@@ -1,191 +0,0 @@
# Copyright 2020 Huawei Technologies Co., Ltd
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
# ============================================================================
|
||||
"""Dataset help for minddata dataset"""
|
||||
import math
|
||||
import os
|
||||
from mindspore._checkparam import Validator
|
||||
from mindspore import context
|
||||
from mindspore.train._utils import _exec_datagraph, _get_types_and_shapes
|
||||
from mindspore.nn.wrap import GetNextSingleOp
|
||||
from mindspore.parallel._utils import _get_device_num, _need_to_full, _to_full_shapes
|
||||
|
||||
|
||||
def _send_data(dataset, epoch_num):
|
||||
"""Engine dataset to write data to tdt queue."""
|
||||
if not hasattr(dataset, '__has_sent__'):
|
||||
exec_dataset = dataset.__transfer_dataset__
|
||||
exec_dataset.send(epoch_num)
|
||||
dataset.__has_sent__ = True
|
||||
|
||||
|
||||
def _send_data_no_flag(dataset, epoch_num):
|
||||
"""Engine dataset to write data to tdt queue directly."""
|
||||
exec_dataset = dataset.__transfer_dataset__
|
||||
exec_dataset.send(epoch_num)
|
||||
|
||||
|
||||
class DatasetHelper:
|
||||
"""
|
||||
Help function to use the MindData dataset.
|
||||
|
||||
According to different contexts, change the iterations of dataset and use the same iteration for loop in different
|
||||
contexts.
|
||||
|
||||
Note:
|
||||
The iteration of DatasetHelper will provide one epoch data.
|
||||
|
||||
Args:
|
||||
dataset (DataSet): The training dataset iterator.
|
||||
dataset_sink_mode (bool): If true use GetNext to fetch the data, or else feed the data from host. Default: True.
|
||||
sink_size (int): Control the amount of data in each sink.
|
||||
If sink_size=-1, sink the complete dataset for each epoch.
|
||||
If sink_size>0, sink sink_size data for each epoch. Default: -1.
|
||||
epoch_num (int): Control the number of epoch data to send. Default: 1.
|
||||
|
||||
Examples:
|
||||
>>> dataset_helper = DatasetHelper(dataset)
|
||||
>>> for inputs in dataset_helper:
|
||||
>>> outputs = network(*inputs)
|
||||
"""
|
||||
|
||||
def __init__(self, dataset, dataset_sink_mode=True, sink_size=-1, epoch_num=1, iter_first_order=1):
|
||||
dataset_sink_mode = Validator.check_bool(dataset_sink_mode)
|
||||
Validator.check_is_int(sink_size)
|
||||
if sink_size < -1 or sink_size == 0:
|
||||
raise ValueError("The sink_size must be -1 or positive, but got sink_size {}.".format(sink_size))
|
||||
|
||||
if dataset_sink_mode:
|
||||
if context.get_context("device_target") == "Ascend":
|
||||
iterclass = _DatasetIterMSLoopSink
|
||||
self.iter = iterclass(dataset, sink_size, epoch_num, iter_first_order)
|
||||
elif context.get_context("device_target") == "GPU":
|
||||
iterclass = _DatasetIterMS
|
||||
self.iter = iterclass(dataset, sink_size, epoch_num)
|
||||
elif context.get_context("device_target") == "CPU":
|
||||
raise RuntimeError("Currently dataset sink mode is not supported when the device target is CPU.")
|
||||
|
||||
def __iter__(self):
|
||||
return self.iter.__iter__()
|
||||
|
||||
# A temp solution for loop sink. Delete later
|
||||
def types_shapes(self):
|
||||
"""Get the types and shapes from dataset on the current configuration."""
|
||||
return self.iter.types_shapes()
|
||||
|
||||
def sink_size(self):
|
||||
"""Get sink_size for each iteration."""
|
||||
return self.iter.get_sink_size()
|
||||
|
||||
def stop_send(self):
|
||||
"""Free up resources about data sink."""
|
||||
self.iter.stop_send()
|
||||
|
||||
|
||||
class _DatasetIter:
|
||||
"""Base iter for dataset helper"""
|
||||
def __init__(self, dataset, sink_size, epoch_num):
|
||||
self.dataset = dataset
|
||||
self.sink_size = sink_size
|
||||
self.sink_count = 1
|
||||
|
||||
if not hasattr(dataset, '__transfer_dataset__'):
|
||||
if hasattr(dataset, '__loop_size__'):
|
||||
self.sink_size = dataset.__loop_size__
|
||||
dataset.__transfer_dataset__ = _exec_datagraph(dataset, self.sink_size)
|
||||
|
||||
if not hasattr(dataset, '__no_send__'):
|
||||
_send_data(dataset, epoch_num)
|
||||
else:
|
||||
_send_data_no_flag(dataset, epoch_num)
|
||||
|
||||
self.stop_send = dataset.__transfer_dataset__.stop_send
|
||||
self.dataset_types, self.dataset_shapes = _get_types_and_shapes(dataset)
|
||||
|
||||
def __iter__(self):
|
||||
self.index = 0
|
||||
return self
|
||||
|
||||
def __next__(self):
|
||||
if self.index >= self.sink_count:
|
||||
raise StopIteration()
|
||||
self.index += 1
|
||||
return self.op()
|
||||
|
||||
def types_shapes(self):
|
||||
return self.dataset_types, self.dataset_shapes
|
||||
|
||||
def get_sink_count(self, dataset):
|
||||
sink_count = 1
|
||||
if hasattr(dataset, '__loop_size__'):
|
||||
loop_size = dataset.__loop_size__
|
||||
if loop_size <= dataset.get_dataset_size() and dataset.get_dataset_size() % loop_size != 0:
|
||||
raise ValueError(f'Dataset size {dataset.get_dataset_size()} and '
|
||||
f'sink_size {loop_size} are not matched.')
|
||||
sink_count = math.ceil(dataset.get_dataset_size() / loop_size)
|
||||
return sink_count
|
||||
|
||||
def get_sink_size(self):
|
||||
"""get sink_size to device"""
|
||||
sink_size = 1
|
||||
if hasattr(self.dataset, '__loop_size__'):
|
||||
sink_size = self.dataset.__loop_size__
|
||||
else:
|
||||
if context.get_context("enable_ge") or context.get_context("device_target") == "Ascend":
|
||||
if self.sink_size > 0:
|
||||
sink_size = self.sink_size
|
||||
else:
|
||||
sink_size = self.dataset.get_dataset_size()
|
||||
return sink_size
|
||||
|
||||
|
||||
class _DatasetIterMSLoopSink(_DatasetIter):
|
||||
"""Iter for context when device_target is Ascend"""
|
||||
def __init__(self, dataset, sink_size, epoch_num, iter_first_order):
|
||||
super().__init__(dataset, sink_size, epoch_num)
|
||||
sink_count = 1
|
||||
if hasattr(dataset, '__loop_size__'):
|
||||
loop_size = dataset.__loop_size__ + iter_first_order
|
||||
if loop_size <= dataset.get_dataset_size() and dataset.get_dataset_size() % loop_size != 0:
|
||||
raise ValueError(f'Dataset size {dataset.get_dataset_size()} and '
|
||||
f'sink_size {loop_size} are not matched.')
|
||||
sink_count = math.ceil(dataset.get_dataset_size() / loop_size) * 2
|
||||
self.sink_count = sink_count
|
||||
ms_role = os.getenv("MS_ROLE")
|
||||
if ms_role in ("MS_PSERVER", "MS_SCHED"):
|
||||
self.sink_count = 1
|
||||
# for self._parallel_mode equal to semi_auto_parallel or auto_parallel, and not using full_batch,
|
||||
# use a complete tensor to compile, and slice tensor to run. The batch dimension of tensors for
|
||||
# compile is device_number times the batch dimension of tensors for run. Now only support LoopSink.
|
||||
if _need_to_full():
|
||||
device_num = _get_device_num()
|
||||
self.dataset_shapes = _to_full_shapes(self.dataset_shapes, device_num)
|
||||
|
||||
def op():
|
||||
return tuple()
|
||||
|
||||
self.op = op
|
||||
|
||||
|
||||
class _DatasetIterMS(_DatasetIter):
|
||||
"""Iter for MS when enable_loop_sink is False."""
|
||||
def __init__(self, dataset, sink_size, epoch_num):
|
||||
super().__init__(dataset, sink_size, epoch_num)
|
||||
if sink_size > 0:
|
||||
self.sink_count = sink_size
|
||||
else:
|
||||
self.sink_count = dataset.get_dataset_size()
|
||||
|
||||
queue_name = dataset.__transfer_dataset__.queue_name
|
||||
self.op = GetNextSingleOp(self.dataset_types, self.dataset_shapes, queue_name)
@@ -1,135 +0,0 @@
# Copyright 2020 Huawei Technologies Co., Ltd
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
# ============================================================================
|
||||
"""grad reducer cell for distributed training"""
|
||||
from mindspore.nn.cell import Cell
|
||||
from mindspore.communication.management import GlobalComm, get_group_size
|
||||
from mindspore.ops import functional as F, composite as C, operations as P
|
||||
from mindspore.ops.operations.comm_ops import AllReduce
|
||||
import mindspore.common.dtype as mstype
|
||||
|
||||
reduce_opt = C.MultitypeFuncGraph("reduce_opt")
|
||||
|
||||
|
||||
def _init_allreduce_operators(length, split_indices):
|
||||
""" initialize allreduce communication operators"""
|
||||
indices = split_indices[0]
|
||||
fusion = split_indices[1]
|
||||
op_list = ()
|
||||
j = 0
|
||||
for i in range(length):
|
||||
if j <= len(indices)-1:
|
||||
temp = indices[j]
|
||||
else:
|
||||
temp = length
|
||||
if i >= temp:
|
||||
j = j + 1
|
||||
fusion = fusion + 1
|
||||
op = AllReduce('sum', GlobalComm.WORLD_COMM_GROUP)
|
||||
op.add_prim_attr('fusion', fusion)
|
||||
op_list = op_list + (op,)
|
||||
return op_list
|
||||
|
||||
|
||||
@reduce_opt.register("Function", "Number", "Function", "Tensor")
|
||||
def _tensors_allreduce_mean(mul, degree, allreduce, parameters):
|
||||
"""
|
||||
Apply allreduce on parameters.
|
||||
|
||||
Args:
|
||||
mul(Primitive): The mul operator for parameters.
|
||||
degree (int): The mean coefficient.
|
||||
allreduce (Primitive): The communication operator for parameters.
|
||||
parameters (Tensor): The parameters before operation.
|
||||
|
||||
Returns:
|
||||
Tensor, the parameters after operation.
|
||||
"""
|
||||
degree = F.scalar_cast(degree, F.dtype(parameters))
|
||||
parameters = allreduce(parameters)
|
||||
cast_op = P.Cast()
|
||||
return mul(parameters, cast_op(F.scalar_to_array(1.0 / degree), F.dtype(parameters)))
|
||||
|
||||
|
||||
_get_datatype = C.MultitypeFuncGraph("_get_datatype")
|
||||
|
||||
|
||||
@_get_datatype.register("Tensor")
|
||||
def _tensors_get_datatype(parameters):
|
||||
"""
|
||||
Acquire parameters datatype.
|
||||
|
||||
Args:
|
||||
parameters (Tensor): The parameters before operation.
|
||||
|
||||
Returns:
|
||||
mstype, the datatype of parameters.
|
||||
"""
|
||||
return F.dtype(parameters)
|
||||
|
||||
|
||||
_cast_datatype = C.MultitypeFuncGraph("_cast_datatype")
|
||||
|
||||
|
||||
@_cast_datatype.register("TypeType", "Tensor")
|
||||
def _tensors_cast_datatype(datatype, parameters):
|
||||
"""
|
||||
Cast parameters to datatype.
|
||||
|
||||
Args:
|
||||
datatype (mstype): the destination datatype of parameters.
|
||||
parameters (Tensor): The parameters before operation.
|
||||
|
||||
Returns:
|
||||
Tensor, the parameters after operation.
|
||||
"""
|
||||
return F.cast(parameters, datatype)
|
||||
|
||||
|
||||
class DistributedGradReducerThor(Cell):
|
||||
"""
|
||||
A distributed optimizer.
|
||||
|
||||
Constructs a parameters reducer Cell, which applies communication and average operations on
|
||||
single-process parameters values.
|
||||
|
||||
Args:
|
||||
parameter_length (int): length of the parameters to be updated.
|
||||
split_indices(tuple): parameter split indices.
|
||||
mean (bool): When mean is true, the mean coefficient (degree) would apply on parameters. Default: False.
|
||||
degree (int): The mean coefficient. Usually it equals to device number. Default: None.
|
||||
|
||||
Raises:
|
||||
ValueError: If degree is not an int or is less than 0.
|
||||
"""
|
||||
|
||||
def __init__(self, parameter_length, split_indices, mean=True, degree=None):
|
||||
super(DistributedGradReducerThor, self).__init__(auto_prefix=False)
|
||||
self.hyper_map = C.HyperMap()
|
||||
self.mul = P.Mul()
|
||||
if degree is None:
|
||||
self.degree = get_group_size()
|
||||
else:
|
||||
if not isinstance(degree, int) or degree <= 0:
|
||||
raise ValueError("Parameter 'degree' in DistributedGradReducer should large than 0 and be int")
|
||||
self.degree = degree
|
||||
self.mean = mean
|
||||
self.op_list = _init_allreduce_operators(parameter_length, split_indices)
|
||||
|
||||
def construct(self, parameters):
|
||||
datatypes = self.hyper_map(F.partial(_get_datatype), parameters)
|
||||
parameters = self.hyper_map(F.partial(_cast_datatype, mstype.float32), parameters)
|
||||
new_parameters = self.hyper_map(F.partial(reduce_opt, self.mul, self.degree), self.op_list, parameters)
|
||||
new_parameters = self.hyper_map(F.partial(_cast_datatype), datatypes, new_parameters)
|
||||
return new_parameters
@@ -1,267 +0,0 @@
# Copyright 2020 Huawei Technologies Co., Ltd
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
# ============================================================================
|
||||
"""Model."""
|
||||
|
||||
import math
|
||||
from mindspore.train.callback import RunContext
|
||||
from mindspore import context
|
||||
from mindspore.context import ParallelMode
|
||||
from mindspore.train.model import Model
|
||||
from mindspore.train.dataset_helper import connect_network_with_dataset
|
||||
from mindspore.parallel._utils import _need_to_full, _to_full_tensor
|
||||
from mindspore.common.dtype import pytype_to_dtype
|
||||
from mindspore._c_expression import init_exec_dataset
|
||||
|
||||
from src.dataset_helper import DatasetHelper
|
||||
|
||||
def _convert_type(types):
|
||||
"""
|
||||
Convert from numpy type to tensor type.
|
||||
|
||||
Args:
|
||||
types (list): Numpy type list of element in dataset.
|
||||
|
||||
Returns:
|
||||
list, list of element in dataset.
|
||||
"""
|
||||
ms_types = []
|
||||
for np_type in types:
|
||||
ms_type = pytype_to_dtype(np_type)
|
||||
ms_types.append(ms_type)
|
||||
return ms_types
|
||||
|
||||
|
||||
def _get_types_and_shapes(dataset):
|
||||
"""Get dataset types and shapes."""
|
||||
dataset_types = _convert_type(dataset.output_types())
|
||||
dataset_shapes = dataset.output_shapes()
|
||||
return dataset_types, dataset_shapes
|
||||
|
||||
|
||||
def _exec_datagraph(exec_dataset, dataset_size, phase='dataset'):
|
||||
"""Initialize and execute the dataset graph."""
|
||||
batch_size = exec_dataset.get_batch_size()
|
||||
input_indexs = exec_dataset.input_indexs
|
||||
|
||||
# transform data format
|
||||
dataset_types, dataset_shapes = _get_types_and_shapes(exec_dataset)
|
||||
init_exec_dataset(exec_dataset.__transfer_dataset__.queue_name,
|
||||
dataset_size,
|
||||
batch_size,
|
||||
dataset_types,
|
||||
dataset_shapes,
|
||||
input_indexs,
|
||||
phase=phase,
|
||||
need_run=False)
|
||||
|
||||
|
||||
class Model_Thor(Model):
|
||||
"""
|
||||
High-Level API for Training or Testing.
|
||||
|
||||
`Model` groups layers into an object with training and inference features.
|
||||
|
||||
Args:
|
||||
network (Cell): A training or testing network.
|
||||
loss_fn (Cell): Objective function, if loss_fn is None, the
|
||||
network should contain the logic of loss and grads calculation, and the logic
|
||||
of parallel if needed. Default: None.
|
||||
optimizer (Cell): Optimizer for updating the weights. Default: None.
|
||||
metrics (Union[dict, set]): A Dictionary or a set of metrics to be evaluated by the model during
|
||||
training and testing. eg: {'accuracy', 'recall'}. Default: None.
|
||||
eval_network (Cell): Network for evaluation. If not defined, `network` and `loss_fn` would be wrapped as
|
||||
`eval_network`. Default: None.
|
||||
eval_indexes (list): When defining the `eval_network`, if `eval_indexes` is None, all outputs of the
|
||||
`eval_network` would be passed to metrics, otherwise `eval_indexes` must contain three
|
||||
elements, including the positions of loss value, predicted value and label. The loss
|
||||
value would be passed to the `Loss` metric, the predicted value and label would be passed
|
||||
to other metric. Default: None.
|
||||
amp_level (str): Option for argument `level` in `mindspore.amp.build_train_network`, level for mixed
|
||||
precision training. Supports [O0, O2, O3]. Default: "O0".
|
||||
|
||||
- O0: Do not change.
|
||||
- O2: Cast network to float16, keep batchnorm run in float32, using dynamic loss scale.
|
||||
- O3: Cast network to float16, with additional property 'keep_batchnorm_fp32=False'.
|
||||
|
||||
O2 is recommended on GPU, O3 is recommended on Ascend.
|
||||
|
||||
loss_scale_manager (Union[None, LossScaleManager]): If it is None, the loss would not be scaled. Otherwise,
|
||||
scale the loss by LossScaleManager. It is a key argument.
|
||||
e.g. Use `loss_scale_manager=None` to set the value.
|
||||
keep_batchnorm_fp32 (bool): Keep Batchnorm running in `float32`. If it is set to true, the level setting before
|
||||
will be overwritten. Default: True.
|
||||
"""
|
||||
|
||||
def __init__(self, network, loss_fn=None, optimizer=None, metrics=None, eval_network=None,
|
||||
eval_indexes=None, amp_level="O0", frequency=834, use_dynamic_frequency=False,
|
||||
first_stage_steps=5, **kwargs):
|
||||
super(Model_Thor, self).__init__(network, loss_fn, optimizer, metrics, eval_network,
|
||||
eval_indexes, amp_level, **kwargs)
|
||||
self._frequency = frequency
|
||||
self._use_dynamic_frequency = use_dynamic_frequency
|
||||
self._first_stage_steps = first_stage_steps
|
||||
self._train_network = self._build_train_network()
|
||||
|
||||
def _exec_preprocess(self, network, is_train, phase, dataset, dataset_sink_mode, sink_size=-1,
|
||||
epoch_num=1, iter_first_order=1):
|
||||
"""Initializes dataset."""
|
||||
if dataset_sink_mode and not is_train:
|
||||
dataset.__loop_size__ = 1
|
||||
dataset_helper = DatasetHelper(dataset, dataset_sink_mode, sink_size, epoch_num, iter_first_order)
|
||||
|
||||
if dataset_sink_mode and context.get_context("device_target") != "GPU":
|
||||
network = connect_network_with_dataset(network, dataset_helper)
|
||||
network.set_train(is_train)
|
||||
network.phase = phase
|
||||
|
||||
if self._parallel_mode in (ParallelMode.SEMI_AUTO_PARALLEL, ParallelMode.AUTO_PARALLEL):
|
||||
network.set_auto_parallel()
|
||||
|
||||
return dataset_helper, network
|
||||
|
||||
def _get_iter_second_steps(self, cb_params, sink_size):
|
||||
"""get first stage steps for second order."""
|
||||
iter_second_steps = 1
|
||||
if self._use_dynamic_frequency:
|
||||
global_steps = (cb_params.cur_epoch_num - 1) * sink_size + cb_params.cur_step_num
|
||||
if global_steps <= self._first_stage_steps:
|
||||
iter_second_steps = self._first_stage_steps
|
||||
return iter_second_steps
|
||||
|
||||
def _get_ascend_sink_count(self, cb_params, dataset_helper, sink_size, iter_first_order, ori_sink_count):
|
||||
"""get ascend sink count for each epoch."""
|
||||
if context.get_context("device_target") == "Ascend":
|
||||
if self._use_dynamic_frequency and cb_params.cur_epoch_num == 1:
|
||||
fix_fre_sink_size = sink_size - self._first_stage_steps - iter_first_order
|
||||
first_epoch_sink_count = math.ceil(fix_fre_sink_size / self._frequency) * 2 + 2
|
||||
dataset_helper.iter.sink_count = first_epoch_sink_count
|
||||
else:
|
||||
dataset_helper.iter.sink_count = ori_sink_count
|
||||
|
||||
def _train_dataset_sink_process(self, epoch, train_dataset, list_callback=None, cb_params=None, sink_size=-1):
|
||||
"""
|
||||
Training process. The data would be passed to network through dataset channel.
|
||||
|
||||
Args:
|
||||
epoch (int): Total number of iterations on the data.
|
||||
train_dataset (Dataset): A training dataset iterator. If there is no
|
||||
loss_fn, a tuple with multiple data (data1, data2, data3, ...) should be
|
||||
returned and passed to the network. Otherwise, a tuple (data, label) should
|
||||
be returned. The data and label would be passed to the network and loss
|
||||
function respectively.
|
||||
list_callback (Callback): Executor of callback list. Default: None.
|
||||
cb_params (_InternalCallbackParam): Callback parameters. Default: None.
|
||||
sink_size (int): Control the amount of data in each sink. Default: -1.
|
||||
"""
|
||||
if sink_size == -1:
|
||||
epoch_num = epoch
|
||||
else:
|
||||
epoch_num = math.ceil(epoch * sink_size / train_dataset.get_dataset_size())
|
||||
|
||||
iter_first_order = self._frequency - 1
|
||||
iter_second_order = 1
|
||||
train_dataset.__loop_size__ = iter_second_order
|
||||
dataset_helper, train_network = self._exec_preprocess(self._train_network,
|
||||
is_train=True,
|
||||
phase='train',
|
||||
dataset=train_dataset,
|
||||
dataset_sink_mode=True,
|
||||
sink_size=sink_size,
|
||||
epoch_num=epoch_num,
|
||||
iter_first_order=iter_first_order)
|
||||
|
||||
self._train_network = train_network
|
||||
cb_params.train_network = self._train_network
|
||||
cb_params.cur_step_num = 0
|
||||
|
||||
run_context = RunContext(cb_params)
|
||||
list_callback.begin(run_context)
|
||||
|
||||
# used to stop training early, such as stopAtTime or stopAtStep
|
||||
should_stop = False
|
||||
switch_branch_one = True
|
||||
index_first_order = 0
|
||||
train_network_init_flag = True
|
||||
has_do_dataset_init = False
|
||||
|
||||
ori_sink_count = dataset_helper.iter.sink_count
|
||||
for i in range(epoch):
|
||||
cb_params.cur_epoch_num = i + 1
|
||||
list_callback.epoch_begin(run_context)
|
||||
self._get_ascend_sink_count(cb_params, dataset_helper, sink_size, iter_first_order, ori_sink_count)
|
||||
|
||||
# for data sink, dataset_helper only iterates once; otherwise it iterates epoch_size times.
|
||||
for inputs in dataset_helper:
|
||||
if _need_to_full() and context.get_context("device_target") == "GPU":
|
||||
inputs = _to_full_tensor(inputs, self._device_number, self._global_rank)
|
||||
list_callback.step_begin(run_context)
|
||||
if context.get_context("device_target") == "GPU":
|
||||
if switch_branch_one:
|
||||
cb_params.cur_step_num += 1
|
||||
if train_network_init_flag:
|
||||
self._train_network.add_flags_recursive(thor=True)
|
||||
self._train_network.phase = 'train0'
|
||||
outputs = self._train_network(*inputs)
|
||||
cb_params.net_outputs = outputs
|
||||
is_first_stage = self._use_dynamic_frequency and cb_params.cur_epoch_num == 1 \
|
||||
and cb_params.cur_step_num < self._first_stage_steps
|
||||
if is_first_stage:
|
||||
continue
|
||||
else:
|
||||
switch_branch_one = not switch_branch_one
|
||||
list_callback.step_end(run_context)
|
||||
else:
|
||||
cb_params.cur_step_num += 1
|
||||
if train_network_init_flag:
|
||||
self._train_network.add_flags_recursive(thor=False)
|
||||
train_network_init_flag = False
|
||||
self._train_network.phase = 'train1'
|
||||
outputs = self._train_network(*inputs)
|
||||
cb_params.net_outputs = outputs
|
||||
index_first_order += 1
|
||||
if index_first_order == iter_first_order:
|
||||
index_first_order = 0
|
||||
switch_branch_one = not switch_branch_one
|
||||
list_callback.step_end(run_context)
|
||||
else:
|
||||
if switch_branch_one:
|
||||
cb_params.cur_step_num += self._get_iter_second_steps(cb_params, sink_size)
|
||||
if train_network_init_flag:
|
||||
self._train_network.add_flags_recursive(thor=True)
|
||||
self._train_network.phase = 'train0'
|
||||
else:
|
||||
cb_params.cur_step_num += iter_first_order
|
||||
if train_network_init_flag:
|
||||
self._train_network.add_flags_recursive(thor=False)
|
||||
train_network_init_flag = False
|
||||
self._train_network.phase = 'train1'
|
||||
if not has_do_dataset_init:
|
||||
_exec_datagraph(train_dataset, iter_first_order, phase='train1_dataset')
|
||||
has_do_dataset_init = True
|
||||
switch_branch_one = not switch_branch_one
|
||||
outputs = self._train_network(*inputs)
|
||||
cb_params.net_outputs = outputs
|
||||
list_callback.step_end(run_context)
|
||||
|
||||
list_callback.epoch_end(run_context)
|
||||
should_stop = should_stop or run_context.get_stop_requested()
|
||||
if should_stop:
|
||||
break
|
||||
dataset_helper.stop_send()
|
||||
|
||||
list_callback.end(run_context)
|
||||
|
||||
|
||||
__all__ = ["Model_Thor"]
@@ -0,0 +1,573 @@
# Copyright 2020-2021 Huawei Technologies Co., Ltd
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
# ============================================================================
|
||||
"""ResNet."""
|
||||
import math
|
||||
import numpy as np
|
||||
from scipy.stats import truncnorm
|
||||
import mindspore.nn as nn
|
||||
import mindspore.common.dtype as mstype
|
||||
from mindspore.ops import operations as P
|
||||
from mindspore.ops import functional as F
|
||||
from mindspore.common.tensor import Tensor
|
||||
|
||||
|
||||
def _conv_variance_scaling_initializer(in_channel, out_channel, kernel_size):
|
||||
fan_in = in_channel * kernel_size * kernel_size
|
||||
scale = 1.0
|
||||
scale /= max(1., fan_in)
|
||||
stddev = (scale ** 0.5) / .87962566103423978
|
||||
mu, sigma = 0, stddev
|
||||
weight = truncnorm(-2, 2, loc=mu, scale=sigma).rvs(out_channel * in_channel * kernel_size * kernel_size)
|
||||
weight = np.reshape(weight, (out_channel, in_channel, kernel_size, kernel_size))
|
||||
return Tensor(weight, dtype=mstype.float32)
|
||||
|
||||
|
||||
def _weight_variable(shape, factor=0.01):
|
||||
init_value = np.random.randn(*shape).astype(np.float32) * factor
|
||||
return Tensor(init_value)
|
||||
|
||||
|
||||
def calculate_gain(nonlinearity, param=None):
|
||||
"""calculate_gain"""
|
||||
linear_fns = ['linear', 'conv1d', 'conv2d', 'conv3d', 'conv_transpose1d', 'conv_transpose2d', 'conv_transpose3d']
|
||||
res = 0
|
||||
if nonlinearity in linear_fns or nonlinearity == 'sigmoid':
|
||||
res = 1
|
||||
elif nonlinearity == 'tanh':
|
||||
res = 5.0 / 3
|
||||
elif nonlinearity == 'relu':
|
||||
res = math.sqrt(2.0)
|
||||
elif nonlinearity == 'leaky_relu':
|
||||
if param is None:
|
||||
negative_slope = 0.01
|
||||
elif not isinstance(param, bool) and isinstance(param, int) or isinstance(param, float):
|
||||
# True/False are instances of int, hence check above
|
||||
negative_slope = param
|
||||
else:
|
||||
raise ValueError("negative_slope {} not a valid number".format(param))
|
||||
res = math.sqrt(2.0 / (1 + negative_slope ** 2))
|
||||
else:
|
||||
raise ValueError("Unsupported nonlinearity {}".format(nonlinearity))
|
||||
return res
|
||||
|
||||
|
||||
def _calculate_fan_in_and_fan_out(tensor):
|
||||
"""_calculate_fan_in_and_fan_out"""
|
||||
dimensions = len(tensor)
|
||||
if dimensions < 2:
|
||||
raise ValueError("Fan in and fan out can not be computed for tensor with fewer than 2 dimensions")
|
||||
if dimensions == 2: # Linear
|
||||
fan_in = tensor[1]
|
||||
fan_out = tensor[0]
|
||||
else:
|
||||
num_input_fmaps = tensor[1]
|
||||
num_output_fmaps = tensor[0]
|
||||
receptive_field_size = 1
|
||||
if dimensions > 2:
|
||||
receptive_field_size = tensor[2] * tensor[3]
|
||||
fan_in = num_input_fmaps * receptive_field_size
|
||||
fan_out = num_output_fmaps * receptive_field_size
|
||||
return fan_in, fan_out
|
||||
|
||||
|
||||
def _calculate_correct_fan(tensor, mode):
|
||||
mode = mode.lower()
|
||||
valid_modes = ['fan_in', 'fan_out']
|
||||
if mode not in valid_modes:
|
||||
raise ValueError("Mode {} not supported, please use one of {}".format(mode, valid_modes))
|
||||
fan_in, fan_out = _calculate_fan_in_and_fan_out(tensor)
|
||||
return fan_in if mode == 'fan_in' else fan_out
|
||||
|
||||
|
||||
def kaiming_normal(inputs_shape, a=0, mode='fan_in', nonlinearity='leaky_relu'):
|
||||
fan = _calculate_correct_fan(inputs_shape, mode)
|
||||
gain = calculate_gain(nonlinearity, a)
|
||||
std = gain / math.sqrt(fan)
|
||||
return np.random.normal(0, std, size=inputs_shape).astype(np.float32)
|
||||
|
||||
|
||||
def kaiming_uniform(inputs_shape, a=0., mode='fan_in', nonlinearity='leaky_relu'):
|
||||
fan = _calculate_correct_fan(inputs_shape, mode)
|
||||
gain = calculate_gain(nonlinearity, a)
|
||||
std = gain / math.sqrt(fan)
|
||||
bound = math.sqrt(3.0) * std # Calculate uniform bounds from standard deviation
|
||||
return np.random.uniform(-bound, bound, size=inputs_shape).astype(np.float32)
|
||||
|
||||
|
||||
def _conv3x3(in_channel, out_channel, stride=1, use_se=False, res_base=False):
|
||||
if use_se:
|
||||
weight = _conv_variance_scaling_initializer(in_channel, out_channel, kernel_size=3)
|
||||
else:
|
||||
weight_shape = (out_channel, in_channel, 3, 3)
|
||||
weight = Tensor(kaiming_normal(weight_shape, mode="fan_out", nonlinearity='relu'))
|
||||
if res_base:
|
||||
return nn.Conv2d(in_channel, out_channel, kernel_size=3, stride=stride,
|
||||
padding=1, pad_mode='pad', weight_init=weight)
|
||||
return nn.Conv2d(in_channel, out_channel, kernel_size=3, stride=stride,
|
||||
padding=0, pad_mode='same', weight_init=weight)
|
||||
|
||||
|
||||
def _conv1x1(in_channel, out_channel, stride=1, use_se=False, res_base=False):
|
||||
if use_se:
|
||||
weight = _conv_variance_scaling_initializer(in_channel, out_channel, kernel_size=1)
|
||||
else:
|
||||
weight_shape = (out_channel, in_channel, 1, 1)
|
||||
weight = Tensor(kaiming_normal(weight_shape, mode="fan_out", nonlinearity='relu'))
|
||||
if res_base:
|
||||
return nn.Conv2d(in_channel, out_channel, kernel_size=1, stride=stride,
|
||||
padding=0, pad_mode='pad', weight_init=weight)
|
||||
return nn.Conv2d(in_channel, out_channel, kernel_size=1, stride=stride,
|
||||
padding=0, pad_mode='same', weight_init=weight)
|
||||
|
||||
|
||||
def _conv7x7(in_channel, out_channel, stride=1, use_se=False, res_base=False):
|
||||
if use_se:
|
||||
weight = _conv_variance_scaling_initializer(in_channel, out_channel, kernel_size=7)
|
||||
else:
|
||||
weight_shape = (out_channel, in_channel, 7, 7)
|
||||
weight = Tensor(kaiming_normal(weight_shape, mode="fan_out", nonlinearity='relu'))
|
||||
if res_base:
|
||||
return nn.Conv2d(in_channel, out_channel,
|
||||
kernel_size=7, stride=stride, padding=3, pad_mode='pad', weight_init=weight)
|
||||
return nn.Conv2d(in_channel, out_channel,
|
||||
kernel_size=7, stride=stride, padding=0, pad_mode='same', weight_init=weight)
|
||||
|
||||
|
||||
def _bn(channel, res_base=False):
|
||||
if res_base:
|
||||
return nn.BatchNorm2d(channel, eps=1e-5, momentum=0.1,
|
||||
gamma_init=1, beta_init=0, moving_mean_init=0, moving_var_init=1)
|
||||
return nn.BatchNorm2d(channel, eps=1e-4, momentum=0.9,
|
||||
gamma_init=1, beta_init=0, moving_mean_init=0, moving_var_init=1)
|
||||
|
||||
|
||||
def _bn_last(channel):
|
||||
return nn.BatchNorm2d(channel, eps=1e-4, momentum=0.9,
|
||||
gamma_init=0, beta_init=0, moving_mean_init=0, moving_var_init=1)
|
||||
|
||||
|
||||
def _fc(in_channel, out_channel, use_se=False):
|
||||
if use_se:
|
||||
weight = np.random.normal(loc=0, scale=0.01, size=out_channel * in_channel)
|
||||
weight = Tensor(np.reshape(weight, (out_channel, in_channel)), dtype=mstype.float32)
|
||||
else:
|
||||
weight_shape = (out_channel, in_channel)
|
||||
weight = Tensor(kaiming_uniform(weight_shape, a=math.sqrt(5)))
|
||||
return nn.Dense(in_channel, out_channel, has_bias=True, weight_init=weight, bias_init=0)
|
||||
|
||||
|
||||
class ResidualBlock(nn.Cell):
|
||||
"""
|
||||
ResNet V1 residual block definition.
|
||||
|
||||
Args:
|
||||
in_channel (int): Input channel.
|
||||
out_channel (int): Output channel.
|
||||
stride (int): Stride size for the first convolutional layer. Default: 1.
|
||||
use_se (bool): Enable SE-ResNet50 net. Default: False.
|
||||
se_block(bool): Use se block in SE-ResNet50 net. Default: False.
|
||||
|
||||
Returns:
|
||||
Tensor, output tensor.
|
||||
|
||||
Examples:
|
||||
>>> ResidualBlock(3, 256, stride=2)
|
||||
"""
|
||||
expansion = 4
|
||||
|
||||
def __init__(self,
|
||||
in_channel,
|
||||
out_channel,
|
||||
stride=1,
|
||||
use_se=False, se_block=False):
|
||||
super(ResidualBlock, self).__init__()
|
||||
self.stride = stride
|
||||
self.use_se = use_se
|
||||
self.se_block = se_block
|
||||
channel = out_channel // self.expansion
|
||||
self.conv1 = _conv1x1(in_channel, channel, stride=1, use_se=self.use_se)
|
||||
self.bn1 = _bn(channel)
|
||||
if self.use_se and self.stride != 1:
|
||||
self.e2 = nn.SequentialCell([_conv3x3(channel, channel, stride=1, use_se=True), _bn(channel),
|
||||
nn.ReLU(), nn.MaxPool2d(kernel_size=2, stride=2, pad_mode='same')])
|
||||
else:
|
||||
self.conv2 = _conv3x3(channel, channel, stride=stride, use_se=self.use_se)
|
||||
self.bn2 = _bn(channel)
|
||||
|
||||
self.conv3 = _conv1x1(channel, out_channel, stride=1, use_se=self.use_se)
|
||||
self.bn3 = _bn_last(out_channel)
|
||||
if self.se_block:
|
||||
self.se_global_pool = P.ReduceMean(keep_dims=False)
|
||||
self.se_dense_0 = _fc(out_channel, int(out_channel / 4), use_se=self.use_se)
|
||||
self.se_dense_1 = _fc(int(out_channel / 4), out_channel, use_se=self.use_se)
|
||||
self.se_sigmoid = nn.Sigmoid()
|
||||
self.se_mul = P.Mul()
|
||||
self.relu = nn.ReLU()
|
||||
|
||||
self.down_sample = False
|
||||
|
||||
if stride != 1 or in_channel != out_channel:
|
||||
self.down_sample = True
|
||||
self.down_sample_layer = None
|
||||
|
||||
if self.down_sample:
|
||||
if self.use_se:
|
||||
if stride == 1:
|
||||
self.down_sample_layer = nn.SequentialCell([_conv1x1(in_channel, out_channel,
|
||||
stride, use_se=self.use_se), _bn(out_channel)])
|
||||
else:
|
||||
self.down_sample_layer = nn.SequentialCell([nn.MaxPool2d(kernel_size=2, stride=2, pad_mode='same'),
|
||||
_conv1x1(in_channel, out_channel, 1,
|
||||
use_se=self.use_se), _bn(out_channel)])
|
||||
else:
|
||||
self.down_sample_layer = nn.SequentialCell([_conv1x1(in_channel, out_channel, stride,
|
||||
use_se=self.use_se), _bn(out_channel)])
|
||||
|
||||
def construct(self, x):
|
||||
identity = x
|
||||
|
||||
out = self.conv1(x)
|
||||
out = self.bn1(out)
|
||||
out = self.relu(out)
|
||||
if self.use_se and self.stride != 1:
|
||||
out = self.e2(out)
|
||||
else:
|
||||
out = self.conv2(out)
|
||||
out = self.bn2(out)
|
||||
out = self.relu(out)
|
||||
out = self.conv3(out)
|
||||
out = self.bn3(out)
|
||||
if self.se_block:
|
||||
out_se = out
|
||||
out = self.se_global_pool(out, (2, 3))
|
||||
out = self.se_dense_0(out)
|
||||
out = self.relu(out)
|
||||
out = self.se_dense_1(out)
|
||||
out = self.se_sigmoid(out)
|
||||
out = F.reshape(out, F.shape(out) + (1, 1))
|
||||
out = self.se_mul(out, out_se)
|
||||
|
||||
if self.down_sample:
|
||||
identity = self.down_sample_layer(identity)
|
||||
|
||||
out = out + identity
|
||||
out = self.relu(out)
|
||||
|
||||
return out
|
||||
|
||||
|
||||
class ResidualBlockBase(nn.Cell):
|
||||
"""
|
||||
ResNet V1 residual block definition.
|
||||
|
||||
Args:
|
||||
in_channel (int): Input channel.
|
||||
out_channel (int): Output channel.
|
||||
stride (int): Stride size for the first convolutional layer. Default: 1.
|
||||
use_se (bool): Enable SE-ResNet50 net. Default: False.
|
||||
se_block(bool): Use se block in SE-ResNet50 net. Default: False.
|
||||
res_base (bool): Enable parameter setting of resnet18. Default: True.
|
||||
|
||||
Returns:
|
||||
Tensor, output tensor.
|
||||
|
||||
Examples:
|
||||
>>> ResidualBlockBase(3, 256, stride=2)
|
||||
"""
|
||||
|
||||
def __init__(self,
|
||||
in_channel,
|
||||
out_channel,
|
||||
stride=1,
|
||||
use_se=False,
|
||||
se_block=False,
|
||||
res_base=True):
|
||||
super(ResidualBlockBase, self).__init__()
|
||||
self.res_base = res_base
|
||||
self.conv1 = _conv3x3(in_channel, out_channel, stride=stride, res_base=self.res_base)
|
||||
self.bn1d = _bn(out_channel)
|
||||
self.conv2 = _conv3x3(out_channel, out_channel, stride=1, res_base=self.res_base)
|
||||
self.bn2d = _bn(out_channel)
|
||||
self.relu = nn.ReLU()
|
||||
|
||||
self.down_sample = False
|
||||
if stride != 1 or in_channel != out_channel:
|
||||
self.down_sample = True
|
||||
|
||||
self.down_sample_layer = None
|
||||
if self.down_sample:
|
||||
self.down_sample_layer = nn.SequentialCell([_conv1x1(in_channel, out_channel, stride,
|
||||
use_se=use_se, res_base=self.res_base),
|
||||
_bn(out_channel, res_base)])
|
||||
|
||||
def construct(self, x):
|
||||
identity = x
|
||||
|
||||
out = self.conv1(x)
|
||||
out = self.bn1d(out)
|
||||
out = self.relu(out)
|
||||
|
||||
out = self.conv2(out)
|
||||
out = self.bn2d(out)
|
||||
|
||||
if self.down_sample:
|
||||
identity = self.down_sample_layer(identity)
|
||||
|
||||
out = out + identity
|
||||
out = self.relu(out)
|
||||
|
||||
return out
|
||||
|
||||
|
||||
class ResNet(nn.Cell):
|
||||
"""
|
||||
ResNet architecture.
|
||||
|
||||
Args:
|
||||
block (Cell): Block for network.
|
||||
layer_nums (list): Numbers of block in different layers.
|
||||
in_channels (list): Input channel in each layer.
|
||||
out_channels (list): Output channel in each layer.
|
||||
strides (list): Stride size in each layer.
|
||||
num_classes (int): The number of classes that the training images belong to.
|
||||
use_se (bool): Enable SE-ResNet50 net. Default: False.
|
||||
se_block(bool): Use se block in SE-ResNet50 net in layer 3 and layer 4. Default: False.
|
||||
res_base (bool): Enable parameter setting of resnet18. Default: False.
|
||||
|
||||
Returns:
|
||||
Tensor, output tensor.
|
||||
|
||||
Examples:
|
||||
>>> ResNet(ResidualBlock,
|
||||
>>> [3, 4, 6, 3],
|
||||
>>> [64, 256, 512, 1024],
|
||||
>>> [256, 512, 1024, 2048],
|
||||
>>> [1, 2, 2, 2],
|
||||
>>> 10)
|
||||
"""
|
||||
|
||||
def __init__(self,
|
||||
block,
|
||||
layer_nums,
|
||||
in_channels,
|
||||
out_channels,
|
||||
strides,
|
||||
num_classes,
|
||||
use_se=False,
|
||||
res_base=False):
|
||||
super(ResNet, self).__init__()
|
||||
|
||||
if not len(layer_nums) == len(in_channels) == len(out_channels) == 4:
|
||||
raise ValueError("the length of layer_num, in_channels, out_channels list must be 4!")
|
||||
self.use_se = use_se
|
||||
self.res_base = res_base
|
||||
self.se_block = False
|
||||
if self.use_se:
|
||||
self.se_block = True
|
||||
|
||||
if self.use_se:
|
||||
self.conv1_0 = _conv3x3(3, 32, stride=2, use_se=self.use_se)
|
||||
self.bn1_0 = _bn(32)
|
||||
self.conv1_1 = _conv3x3(32, 32, stride=1, use_se=self.use_se)
|
||||
self.bn1_1 = _bn(32)
|
||||
self.conv1_2 = _conv3x3(32, 64, stride=1, use_se=self.use_se)
|
||||
else:
|
||||
self.conv1 = _conv7x7(3, 64, stride=2, res_base=self.res_base)
|
||||
self.bn1 = _bn(64, self.res_base)
|
||||
self.relu = P.ReLU()
|
||||
|
||||
if self.res_base:
|
||||
self.pad = nn.Pad(paddings=((0, 0), (0, 0), (1, 1), (1, 1)))
|
||||
self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, pad_mode="valid")
|
||||
else:
|
||||
self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, pad_mode="same")
|
||||
|
||||
self.layer1 = self._make_layer(block,
|
||||
layer_nums[0],
|
||||
in_channel=in_channels[0],
|
||||
out_channel=out_channels[0],
|
||||
stride=strides[0],
|
||||
use_se=self.use_se)
|
||||
self.layer2 = self._make_layer(block,
|
||||
layer_nums[1],
|
||||
in_channel=in_channels[1],
|
||||
out_channel=out_channels[1],
|
||||
stride=strides[1],
|
||||
use_se=self.use_se)
|
||||
self.layer3 = self._make_layer(block,
|
||||
layer_nums[2],
|
||||
in_channel=in_channels[2],
|
||||
out_channel=out_channels[2],
|
||||
stride=strides[2],
|
||||
use_se=self.use_se,
|
||||
se_block=self.se_block)
|
||||
self.layer4 = self._make_layer(block,
|
||||
layer_nums[3],
|
||||
in_channel=in_channels[3],
|
||||
out_channel=out_channels[3],
|
||||
stride=strides[3],
|
||||
use_se=self.use_se,
|
||||
se_block=self.se_block)
|
||||
|
||||
self.mean = P.ReduceMean(keep_dims=True)
|
||||
self.flatten = nn.Flatten()
|
||||
self.end_point = _fc(out_channels[3], num_classes, use_se=self.use_se)
|
||||
|
||||
def _make_layer(self, block, layer_num, in_channel, out_channel, stride, use_se=False, se_block=False):
|
||||
"""
|
||||
Make stage network of ResNet.
|
||||
|
||||
Args:
|
||||
block (Cell): Resnet block.
|
||||
layer_num (int): Layer number.
|
||||
in_channel (int): Input channel.
|
||||
out_channel (int): Output channel.
|
||||
stride (int): Stride size for the first convolutional layer.
|
||||
use_se (bool): Enable SE-ResNet50 net. Default: False.
se_block (bool): Use SE block in SE-ResNet50 net. Default: False.
|
||||
Returns:
|
||||
SequentialCell, the output layer.
|
||||
|
||||
Examples:
|
||||
>>> _make_layer(ResidualBlock, 3, 128, 256, 2)
|
||||
"""
|
||||
layers = []
|
||||
|
||||
resnet_block = block(in_channel, out_channel, stride=stride, use_se=use_se)
|
||||
layers.append(resnet_block)
|
||||
if se_block:
|
||||
for _ in range(1, layer_num - 1):
|
||||
resnet_block = block(out_channel, out_channel, stride=1, use_se=use_se)
|
||||
layers.append(resnet_block)
|
||||
resnet_block = block(out_channel, out_channel, stride=1, use_se=use_se, se_block=se_block)
|
||||
layers.append(resnet_block)
|
||||
else:
|
||||
for _ in range(1, layer_num):
|
||||
resnet_block = block(out_channel, out_channel, stride=1, use_se=use_se)
|
||||
layers.append(resnet_block)
|
||||
return nn.SequentialCell(layers)
|
||||
|
||||
def construct(self, x):
|
||||
if self.use_se:
|
||||
x = self.conv1_0(x)
|
||||
x = self.bn1_0(x)
|
||||
x = self.relu(x)
|
||||
x = self.conv1_1(x)
|
||||
x = self.bn1_1(x)
|
||||
x = self.relu(x)
|
||||
x = self.conv1_2(x)
|
||||
else:
|
||||
x = self.conv1(x)
|
||||
x = self.bn1(x)
|
||||
x = self.relu(x)
|
||||
if self.res_base:
|
||||
x = self.pad(x)
|
||||
c1 = self.maxpool(x)
|
||||
|
||||
c2 = self.layer1(c1)
|
||||
c3 = self.layer2(c2)
|
||||
c4 = self.layer3(c3)
|
||||
c5 = self.layer4(c4)
|
||||
|
||||
out = self.mean(c5, (2, 3))
|
||||
out = self.flatten(out)
|
||||
out = self.end_point(out)
|
||||
|
||||
return out
|
||||
|
||||
|
||||
def resnet18(class_num=10):
    """
    Get ResNet18 neural network.

    Args:
        class_num (int): Class number.

    Returns:
        Cell, cell instance of ResNet18 neural network.

    Examples:
        >>> net = resnet18(10)
    """
    return ResNet(ResidualBlockBase,
                  [2, 2, 2, 2],
                  [64, 64, 128, 256],
                  [64, 128, 256, 512],
                  [1, 2, 2, 2],
                  class_num,
                  res_base=True)


def resnet50(class_num=10):
    """
    Get ResNet50 neural network.

    Args:
        class_num (int): Class number.

    Returns:
        Cell, cell instance of ResNet50 neural network.

    Examples:
        >>> net = resnet50(10)
    """
    return ResNet(ResidualBlock,
                  [3, 4, 6, 3],
                  [64, 256, 512, 1024],
                  [256, 512, 1024, 2048],
                  [1, 2, 2, 2],
                  class_num)


def se_resnet50(class_num=1001):
    """
    Get SE-ResNet50 neural network.

    Args:
        class_num (int): Class number.

    Returns:
        Cell, cell instance of SE-ResNet50 neural network.

    Examples:
        >>> net = se_resnet50(1001)
    """
    return ResNet(ResidualBlock,
                  [3, 4, 6, 3],
                  [64, 256, 512, 1024],
                  [256, 512, 1024, 2048],
                  [1, 2, 2, 2],
                  class_num,
                  use_se=True)


def resnet101(class_num=1001):
    """
    Get ResNet101 neural network.

    Args:
        class_num (int): Class number.

    Returns:
        Cell, cell instance of ResNet101 neural network.

    Examples:
        >>> net = resnet101(1001)
    """
    return ResNet(ResidualBlock,
                  [3, 4, 23, 3],
                  [64, 256, 512, 1024],
                  [256, 512, 1024, 2048],
                  [1, 2, 2, 2],
                  class_num)
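# Illustrative usage sketch (not part of the original file): how the factory
# functions above are typically exercised. Batch size and input resolution are
# arbitrary placeholders; resnet18/resnet50/se_resnet50/resnet101 are all used
# the same way.
def _resnet_factory_example():
    import numpy as np
    from mindspore import Tensor

    net = resnet50(class_num=10)
    dummy_images = Tensor(np.random.randn(2, 3, 224, 224).astype(np.float32))
    logits = net(dummy_images)   # forward pass through the backbone and classifier
    return logits.shape          # expected: (2, 10)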
@@ -1,409 +0,0 @@
# Copyright 2020 Huawei Technologies Co., Ltd
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
# ============================================================================
|
||||
"""ResNet."""
|
||||
import math
|
||||
import numpy as np
|
||||
import mindspore.nn as nn
|
||||
from mindspore.common.tensor import Tensor
|
||||
from mindspore.ops import operations as P
|
||||
from mindspore import context
|
||||
|
||||
from src.thor_layer import Conv2d_Thor, Dense_Thor, Conv2d_Thor_GPU, Dense_Thor_GPU
|
||||
|
||||
|
||||
def calculate_gain(nonlinearity, param=None):
|
||||
"""calculate_gain"""
|
||||
linear_fns = ['linear', 'conv1d', 'conv2d', 'conv3d', 'conv_transpose1d', 'conv_transpose2d', 'conv_transpose3d']
|
||||
res = 0
|
||||
if nonlinearity in linear_fns or nonlinearity == 'sigmoid':
|
||||
res = 1
|
||||
elif nonlinearity == 'tanh':
|
||||
res = 5.0 / 3
|
||||
elif nonlinearity == 'relu':
|
||||
res = math.sqrt(2.0)
|
||||
elif nonlinearity == 'leaky_relu':
|
||||
if param is None:
|
||||
negative_slope = 0.01
|
||||
elif not isinstance(param, bool) and isinstance(param, int) or isinstance(param, float):
|
||||
# True/False are instances of int, hence check above
|
||||
negative_slope = param
|
||||
else:
|
||||
raise ValueError("negative_slope {} not a valid number".format(param))
|
||||
res = math.sqrt(2.0 / (1 + negative_slope ** 2))
|
||||
else:
|
||||
raise ValueError("Unsupported nonlinearity {}".format(nonlinearity))
|
||||
return res
|
||||
|
||||
|
||||
def _calculate_fan_in_and_fan_out(tensor):
|
||||
"""_calculate_fan_in_and_fan_out"""
|
||||
dimensions = len(tensor)
|
||||
if dimensions < 2:
|
||||
raise ValueError("Fan in and fan out can not be computed for tensor with fewer than 2 dimensions")
|
||||
if dimensions == 2: # Linear
|
||||
fan_in = tensor[1]
|
||||
fan_out = tensor[0]
|
||||
else:
|
||||
num_input_fmaps = tensor[1]
|
||||
num_output_fmaps = tensor[0]
|
||||
receptive_field_size = 1
|
||||
if dimensions > 2:
|
||||
receptive_field_size = tensor[2] * tensor[3]
|
||||
fan_in = num_input_fmaps * receptive_field_size
|
||||
fan_out = num_output_fmaps * receptive_field_size
|
||||
return fan_in, fan_out
|
||||
|
||||
|
||||
def _calculate_correct_fan(tensor, mode):
|
||||
mode = mode.lower()
|
||||
valid_modes = ['fan_in', 'fan_out']
|
||||
if mode not in valid_modes:
|
||||
raise ValueError("Mode {} not supported, please use one of {}".format(mode, valid_modes))
|
||||
fan_in, fan_out = _calculate_fan_in_and_fan_out(tensor)
|
||||
return fan_in if mode == 'fan_in' else fan_out
|
||||
|
||||
|
||||
def kaiming_normal(inputs_shape, a=0, mode='fan_in', nonlinearity='leaky_relu'):
|
||||
fan = _calculate_correct_fan(inputs_shape, mode)
|
||||
gain = calculate_gain(nonlinearity, a)
|
||||
std = gain / math.sqrt(fan)
|
||||
return np.random.normal(0, std, size=inputs_shape).astype(np.float32)
|
||||
|
||||
|
||||
def kaiming_uniform(inputs_shape, a=0., mode='fan_in', nonlinearity='leaky_relu'):
|
||||
fan = _calculate_correct_fan(inputs_shape, mode)
|
||||
gain = calculate_gain(nonlinearity, a)
|
||||
std = gain / math.sqrt(fan)
|
||||
bound = math.sqrt(3.0) * std # Calculate uniform bounds from standard deviation
|
||||
return np.random.uniform(-bound, bound, size=inputs_shape).astype(np.float32)
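# Illustrative sketch (not part of the original file): how the two initialisers
# above are meant to be used. For a 3x3 conv weight of shape (64, 3, 3, 3),
# fan_out = 64 * 3 * 3 = 576, so kaiming_normal(mode="fan_out",
# nonlinearity="relu") samples from N(0, sqrt(2 / 576)); kaiming_uniform with
# a=sqrt(5) matches the usual dense-layer default.
def _kaiming_init_example():
    conv_weight = kaiming_normal((64, 3, 3, 3), mode="fan_out", nonlinearity="relu")
    fc_weight = kaiming_uniform((10, 512), a=math.sqrt(5))
    return conv_weight.shape, fc_weight.shape   # ((64, 3, 3, 3), (10, 512))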
|
||||
|
||||
|
||||
def _weight_variable(shape, factor=0.01):
|
||||
init_value = np.random.randn(*shape).astype(np.float32) * factor
|
||||
return Tensor(init_value)
|
||||
|
||||
|
||||
def _conv3x3(in_channel, out_channel, stride=1, damping=0.03, loss_scale=1, frequency=278, batch_size=32):
|
||||
weight_shape = (out_channel, in_channel, 3, 3)
|
||||
weight = Tensor(kaiming_normal(weight_shape, mode="fan_out", nonlinearity='relu'))
|
||||
if context.get_context('device_target') == "Ascend":
|
||||
layer = Conv2d_Thor(in_channel, out_channel,
|
||||
kernel_size=3, stride=stride, padding=0, pad_mode='same', weight_init=weight,
|
||||
damping=damping, loss_scale=loss_scale, frequency=frequency, batch_size=batch_size)
|
||||
else:
|
||||
layer = Conv2d_Thor_GPU(in_channel, out_channel,
|
||||
kernel_size=3, stride=stride, padding=0, pad_mode='same', weight_init=weight,
|
||||
damping=damping, loss_scale=loss_scale, frequency=frequency, batch_size=batch_size)
|
||||
return layer
|
||||
|
||||
|
||||
def _conv1x1(in_channel, out_channel, stride=1, damping=0.03, loss_scale=1, frequency=278, batch_size=32):
|
||||
weight_shape = (out_channel, in_channel, 1, 1)
|
||||
weight = Tensor(kaiming_normal(weight_shape, mode="fan_out", nonlinearity='relu'))
|
||||
if context.get_context('device_target') == "Ascend":
|
||||
layer = Conv2d_Thor(in_channel, out_channel,
|
||||
kernel_size=1, stride=stride, padding=0, pad_mode='same', weight_init=weight,
|
||||
damping=damping, loss_scale=loss_scale, frequency=frequency, batch_size=batch_size)
|
||||
else:
|
||||
layer = Conv2d_Thor_GPU(in_channel, out_channel,
|
||||
kernel_size=1, stride=stride, padding=0, pad_mode='same', weight_init=weight,
|
||||
damping=damping, loss_scale=loss_scale, frequency=frequency, batch_size=batch_size)
|
||||
return layer
|
||||
|
||||
|
||||
def _conv7x7(in_channel, out_channel, stride=1, damping=0.03, loss_scale=1, frequency=278, batch_size=32):
|
||||
weight_shape = (out_channel, in_channel, 7, 7)
|
||||
weight = Tensor(kaiming_normal(weight_shape, mode="fan_out", nonlinearity='relu'))
|
||||
if context.get_context('device_target') == "Ascend":
|
||||
layer = Conv2d_Thor(in_channel, out_channel,
|
||||
kernel_size=7, stride=stride, padding=0, pad_mode='same', weight_init=weight,
|
||||
damping=damping, loss_scale=loss_scale, frequency=frequency, batch_size=batch_size)
|
||||
else:
|
||||
layer = Conv2d_Thor_GPU(in_channel, out_channel,
|
||||
kernel_size=7, stride=stride, padding=0, pad_mode='same', weight_init=weight,
|
||||
damping=damping, loss_scale=loss_scale, frequency=frequency, batch_size=batch_size)
|
||||
return layer
|
||||
|
||||
|
||||
def _bn(channel):
|
||||
return nn.BatchNorm2d(channel, eps=1e-4, momentum=0.9,
|
||||
gamma_init=1, beta_init=0, moving_mean_init=0, moving_var_init=1)
|
||||
|
||||
|
||||
def _bn_last(channel):
|
||||
return nn.BatchNorm2d(channel, eps=1e-4, momentum=0.9,
|
||||
gamma_init=0, beta_init=0, moving_mean_init=0, moving_var_init=1)
|
||||
|
||||
|
||||
def _fc(in_channel, out_channel, damping, loss_scale, frequency, batch_size=32):
|
||||
weight_shape = (out_channel, in_channel)
|
||||
weight = Tensor(kaiming_uniform(weight_shape, a=math.sqrt(5)))
|
||||
if context.get_context('device_target') == "Ascend":
|
||||
layer = Dense_Thor(in_channel, out_channel, has_bias=False, weight_init=weight,
|
||||
bias_init=0, damping=damping, loss_scale=loss_scale, frequency=frequency,
|
||||
batch_size=batch_size)
|
||||
else:
|
||||
layer = Dense_Thor_GPU(in_channel, out_channel, has_bias=False, weight_init=weight,
|
||||
bias_init=0, damping=damping, loss_scale=loss_scale, frequency=frequency,
|
||||
batch_size=batch_size)
|
||||
return layer
|
||||
|
||||
|
||||
class ResidualBlock(nn.Cell):
|
||||
"""
|
||||
ResNet V1 residual block definition.
|
||||
|
||||
Args:
|
||||
in_channel (int): Input channel.
|
||||
out_channel (int): Output channel.
|
||||
stride (int): Stride size for the first convolutional layer. Default: 1.
|
||||
|
||||
Returns:
|
||||
Tensor, output tensor.
|
||||
|
||||
Examples:
|
||||
>>> ResidualBlock(3, 256, stride=2)
|
||||
"""
|
||||
expansion = 4
|
||||
|
||||
def __init__(self,
|
||||
in_channel,
|
||||
out_channel,
|
||||
stride=1,
|
||||
damping=0.03,
|
||||
loss_scale=1,
|
||||
frequency=278,
|
||||
batch_size=32):
|
||||
super(ResidualBlock, self).__init__()
|
||||
|
||||
channel = out_channel // self.expansion
|
||||
self.conv1 = _conv1x1(in_channel, channel, stride=1, damping=damping, loss_scale=loss_scale,
|
||||
frequency=frequency, batch_size=batch_size)
|
||||
self.bn1 = _bn(channel)
|
||||
|
||||
self.conv2 = _conv3x3(channel, channel, stride=stride, damping=damping, loss_scale=loss_scale,
|
||||
frequency=frequency, batch_size=batch_size)
|
||||
self.bn2 = _bn(channel)
|
||||
|
||||
self.conv3 = _conv1x1(channel, out_channel, stride=1, damping=damping, loss_scale=loss_scale,
|
||||
frequency=frequency, batch_size=batch_size)
|
||||
self.bn3 = _bn_last(out_channel)
|
||||
|
||||
self.relu = nn.ReLU()
|
||||
|
||||
self.down_sample = False
|
||||
|
||||
if stride != 1 or in_channel != out_channel:
|
||||
self.down_sample = True
|
||||
self.down_sample_layer = None
|
||||
|
||||
if self.down_sample:
|
||||
self.down_sample_layer = nn.SequentialCell([_conv1x1(in_channel, out_channel, stride,
|
||||
damping=damping, loss_scale=loss_scale,
|
||||
frequency=frequency,
|
||||
batch_size=batch_size),
|
||||
_bn(out_channel)])
|
||||
self.add = P.Add()
|
||||
|
||||
def construct(self, x):
|
||||
identity = x
|
||||
|
||||
out = self.conv1(x)
|
||||
out = self.bn1(out)
|
||||
out = self.relu(out)
|
||||
|
||||
out = self.conv2(out)
|
||||
out = self.bn2(out)
|
||||
out = self.relu(out)
|
||||
|
||||
out = self.conv3(out)
|
||||
out = self.bn3(out)
|
||||
|
||||
if self.down_sample:
|
||||
identity = self.down_sample_layer(identity)
|
||||
|
||||
out = self.add(out, identity)
|
||||
out = self.relu(out)
|
||||
|
||||
return out
|
||||
|
||||
|
||||
class ResNet(nn.Cell):
|
||||
"""
|
||||
ResNet architecture.
|
||||
|
||||
Args:
|
||||
block (Cell): Block for network.
|
||||
layer_nums (list): Numbers of block in different layers.
|
||||
in_channels (list): Input channel in each layer.
|
||||
out_channels (list): Output channel in each layer.
|
||||
strides (list): Stride size in each layer.
|
||||
num_classes (int): The number of classes that the training images belong to.
|
||||
Returns:
|
||||
Tensor, output tensor.
|
||||
|
||||
Examples:
|
||||
>>> ResNet(ResidualBlock,
|
||||
>>> [3, 4, 6, 3],
|
||||
>>> [64, 256, 512, 1024],
|
||||
>>> [256, 512, 1024, 2048],
|
||||
>>> [1, 2, 2, 2],
|
||||
>>> 10)
|
||||
"""
|
||||
|
||||
def __init__(self,
|
||||
block,
|
||||
layer_nums,
|
||||
in_channels,
|
||||
out_channels,
|
||||
strides,
|
||||
num_classes,
|
||||
damping,
|
||||
loss_scale,
|
||||
frequency,
|
||||
batch_size,
|
||||
include_top=True):
|
||||
super(ResNet, self).__init__()
|
||||
|
||||
if not len(layer_nums) == len(in_channels) == len(out_channels) == 4:
|
||||
raise ValueError("the length of layer_num, in_channels, out_channels list must be 4!")
|
||||
|
||||
self.conv1 = _conv7x7(3, 64, stride=2, damping=damping, loss_scale=loss_scale,
|
||||
frequency=frequency, batch_size=batch_size)
|
||||
self.bn1 = _bn(64)
|
||||
self.relu = P.ReLU()
|
||||
|
||||
self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, pad_mode="same")
|
||||
|
||||
self.layer1 = self._make_layer(block,
|
||||
layer_nums[0],
|
||||
in_channel=in_channels[0],
|
||||
out_channel=out_channels[0],
|
||||
stride=strides[0],
|
||||
damping=damping,
|
||||
loss_scale=loss_scale,
|
||||
frequency=frequency,
|
||||
batch_size=batch_size)
|
||||
self.layer2 = self._make_layer(block,
|
||||
layer_nums[1],
|
||||
in_channel=in_channels[1],
|
||||
out_channel=out_channels[1],
|
||||
stride=strides[1],
|
||||
damping=damping,
|
||||
loss_scale=loss_scale,
|
||||
frequency=frequency,
|
||||
batch_size=batch_size)
|
||||
self.layer3 = self._make_layer(block,
|
||||
layer_nums[2],
|
||||
in_channel=in_channels[2],
|
||||
out_channel=out_channels[2],
|
||||
stride=strides[2], damping=damping,
|
||||
loss_scale=loss_scale,
|
||||
frequency=frequency,
|
||||
batch_size=batch_size)
|
||||
self.layer4 = self._make_layer(block,
|
||||
layer_nums[3],
|
||||
in_channel=in_channels[3],
|
||||
out_channel=out_channels[3],
|
||||
stride=strides[3],
|
||||
damping=damping,
|
||||
loss_scale=loss_scale,
|
||||
frequency=frequency,
|
||||
batch_size=batch_size)
|
||||
self.include_top = include_top
|
||||
if self.include_top:
|
||||
self.mean = P.ReduceMean(keep_dims=True)
|
||||
self.flatten = nn.Flatten()
|
||||
self.end_point = _fc(out_channels[3], num_classes, damping=damping, loss_scale=loss_scale,
|
||||
frequency=frequency, batch_size=batch_size)
|
||||
|
||||
def _make_layer(self, block, layer_num, in_channel, out_channel, stride,
|
||||
damping, loss_scale, frequency, batch_size):
|
||||
"""
|
||||
Make stage network of ResNet.
|
||||
|
||||
Args:
|
||||
block (Cell): Resnet block.
|
||||
layer_num (int): Layer number.
|
||||
in_channel (int): Input channel.
|
||||
out_channel (int): Output channel.
|
||||
stride (int): Stride size for the first convolutional layer.
|
||||
|
||||
Returns:
|
||||
SequentialCell, the output layer.
|
||||
|
||||
Examples:
|
||||
>>> _make_layer(ResidualBlock, 3, 128, 256, 2)
|
||||
"""
|
||||
layers = []
|
||||
|
||||
resnet_block = block(in_channel, out_channel, stride=stride,
|
||||
damping=damping, loss_scale=loss_scale, frequency=frequency,
|
||||
batch_size=batch_size)
|
||||
layers.append(resnet_block)
|
||||
|
||||
for _ in range(1, layer_num):
|
||||
resnet_block = block(out_channel, out_channel, stride=1,
|
||||
damping=damping, loss_scale=loss_scale, frequency=frequency,
|
||||
batch_size=batch_size)
|
||||
layers.append(resnet_block)
|
||||
|
||||
return nn.SequentialCell(layers)
|
||||
|
||||
def construct(self, x):
|
||||
x = self.conv1(x)
|
||||
x = self.bn1(x)
|
||||
x = self.relu(x)
|
||||
c1 = self.maxpool(x)
|
||||
|
||||
c2 = self.layer1(c1)
|
||||
c3 = self.layer2(c2)
|
||||
c4 = self.layer3(c3)
|
||||
c5 = self.layer4(c4)
|
||||
|
||||
if not self.include_top:
|
||||
return x
|
||||
|
||||
out = self.mean(c5, (2, 3))
|
||||
out = self.flatten(out)
|
||||
out = self.end_point(out)
|
||||
|
||||
return out
|
||||
|
||||
|
||||
def resnet50(class_num=10, damping=0.03, loss_scale=1, frequency=278, batch_size=32, include_top=True):
|
||||
"""
|
||||
Get ResNet50 neural network.
|
||||
|
||||
Args:
|
||||
class_num (int): Class number.
|
||||
|
||||
Returns:
|
||||
Cell, cell instance of ResNet50 neural network.
|
||||
|
||||
Examples:
|
||||
>>> net = resnet50(10)
|
||||
"""
|
||||
return ResNet(ResidualBlock,
|
||||
[3, 4, 6, 3],
|
||||
[64, 256, 512, 1024],
|
||||
[256, 512, 1024, 2048],
|
||||
[1, 2, 2, 2],
|
||||
class_num,
|
||||
damping,
|
||||
loss_scale,
|
||||
frequency,
|
||||
batch_size,
|
||||
include_top)
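# Illustrative sketch (not part of the original file): constructing the
# THOR-enabled ResNet50 above. In the training scripts `damping` is a per-step
# decay schedule that the THOR layers index with their covariance step counter;
# the numbers below are placeholders, not tuned hyper-parameters. `np` is the
# numpy import at the top of this file.
def _build_thor_resnet50_example(steps_per_epoch=5004, epochs=45):
    damping_schedule = np.full(steps_per_epoch * epochs, 0.03, dtype=np.float32)
    return resnet50(class_num=1001,
                    damping=damping_schedule,
                    loss_scale=128,
                    frequency=278,
                    batch_size=32)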
@@ -1,301 +0,0 @@
# Copyright 2020 Huawei Technologies Co., Ltd
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
# ============================================================================
|
||||
"""THOR"""
|
||||
from mindspore.ops import functional as F, composite as C, operations as P
|
||||
from mindspore.common.initializer import initializer
|
||||
from mindspore.common.parameter import Parameter, ParameterTuple
|
||||
from mindspore.common.tensor import Tensor
|
||||
import mindspore.common.dtype as mstype
|
||||
from mindspore._checkparam import Validator
|
||||
from mindspore.nn.optim.optimizer import Optimizer
|
||||
from mindspore.parallel._utils import _get_device_num, _get_gradients_mean
|
||||
from src.grad_reducer_thor import DistributedGradReducerThor
|
||||
|
||||
_momentum_opt = C.MultitypeFuncGraph("momentum_opt")
|
||||
|
||||
op_add = P.AddN()
|
||||
apply_decay = C.MultitypeFuncGraph("apply_decay")
|
||||
|
||||
|
||||
@apply_decay.register("Number", "Bool", "Tensor", "Tensor")
|
||||
def _tensor_apply_decay(weight_decay, if_apply, weight, gradient):
|
||||
"""Get grad with weight_decay."""
|
||||
if if_apply:
|
||||
return op_add((weight * weight_decay, gradient))
|
||||
return gradient
|
||||
|
||||
|
||||
@_momentum_opt.register("Function", "Tensor", "Tensor", "Tensor", "Tensor", "Tensor")
|
||||
def _tensor_run_opt_ext(opt, momentum, learning_rate, gradient, weight, moment):
|
||||
"""Apply momentum optimizer to the weight parameter using Tensor."""
|
||||
success = True
|
||||
success = F.depend(success, opt(weight, moment, learning_rate, gradient, momentum))
|
||||
return success
|
||||
|
||||
|
||||
class THOR_GPU(Optimizer):
|
||||
"""
|
||||
THOR
|
||||
"""
|
||||
def __init__(self, params, learning_rate, momentum, matrix_A, matrix_G, A_inv_max, G_inv_max,
|
||||
weight_decay=0.0, loss_scale=1.0, use_nesterov=False, decay_filter=lambda x: x.name not in []):
|
||||
super(THOR_GPU, self).__init__(learning_rate, params, weight_decay, loss_scale)
|
||||
Validator.check_value_type("momentum", momentum, [float], self.cls_name)
|
||||
if isinstance(momentum, float) and momentum < 0.0:
|
||||
raise ValueError("momentum should be at least 0.0, but got momentum {}".format(momentum))
|
||||
self.momentum = Parameter(Tensor(momentum, mstype.float32))
|
||||
self.params = self.parameters
|
||||
self.use_nesterov = Validator.check_bool(use_nesterov)
|
||||
self.moments = self.params.clone(prefix="moments", init='zeros')
|
||||
self.hyper_map = C.HyperMap()
|
||||
self.opt = P.ApplyMomentum(use_nesterov=self.use_nesterov)
|
||||
|
||||
self.feature_map = [1.0 / 12544, 1.0 / 3136, 1.0 / 3136, 1.0 / 3136, 1.0 / 3136, 1.0 / 3136, 1.0 / 3136,
|
||||
1.0 / 3136, 1.0 / 3136, 1.0 / 3136, 1.0 / 3136, 1.0 / 3136,
|
||||
1.0 / 784, 1.0 / 784, 1.0 / 784, 1.0 / 784, 1.0 / 784, 1.0 / 784, 1.0 / 784, 1.0 / 784,
|
||||
1.0 / 784, 1.0 / 784, 1.0 / 784, 1.0 / 784, 1.0 / 784,
|
||||
1.0 / 196, 1.0 / 196, 1.0 / 196, 1.0 / 196, 1.0 / 196, 1.0 / 196, 1.0 / 196, 1.0 / 196,
|
||||
1.0 / 196, 1.0 / 196, 1.0 / 196, 1.0 / 196, 1.0 / 196, 1.0 / 196, 1.0 / 196, 1.0 / 196,
|
||||
1.0 / 196, 1.0 / 196, 1.0 / 196,
|
||||
1.0 / 49, 1.0 / 49, 1.0 / 49, 1.0 / 49, 1.0 / 49, 1.0 / 49, 1.0 / 49, 1.0 / 49, 1.0 / 49,
|
||||
1.0]
|
||||
self.feature_map_new = [x ** 0.5 for x in self.feature_map]
|
||||
self.transpose = P.Transpose()
|
||||
self.shape = P.Shape()
|
||||
self.reshape = P.Reshape()
|
||||
self.matmul = P.MatMul()
|
||||
self.matrix_A = ParameterTuple(matrix_A)
|
||||
self.matrix_G = ParameterTuple(matrix_G)
|
||||
self.A_inv_max = ParameterTuple(A_inv_max)
|
||||
self.G_inv_max = ParameterTuple(G_inv_max)
|
||||
self.assign = P.Assign()
|
||||
self.mul = P.Mul()
|
||||
|
||||
mean = _get_gradients_mean()
|
||||
degree = _get_device_num()
|
||||
|
||||
parameter_length = len(self.feature_map)
|
||||
self.grad_reducer_thorA = DistributedGradReducerThor(parameter_length, ((parameter_length,), 0), mean, degree)
|
||||
self.grad_reducer_thorG = DistributedGradReducerThor(parameter_length, ((parameter_length,), 0), mean, degree)
|
||||
self.weight_decay = weight_decay
|
||||
self.decay_flags = tuple(decay_filter(x) for x in self.parameters)
|
||||
self.update_gradient = P.UpdateThorGradient(split_dim=128)
self.thor = True  # assumed default mirroring the Ascend THOR class below; construct() reads this flag
|
||||
|
||||
def construct(self, gradients):
|
||||
params = self.params
|
||||
moments = self.moments
|
||||
gradients = self.scale_grad(gradients)
|
||||
new_grads = ()
|
||||
if self.thor:
|
||||
matrix_A_allreduce = ()
|
||||
matrix_G_allreduce = ()
|
||||
for i in range(54):
|
||||
g = gradients[i * 3]
|
||||
matrix_A = self.matrix_A[i]
|
||||
matrix_G = self.matrix_G[i]
|
||||
matrix_A = F.depend(matrix_A, g)
|
||||
matrix_G = F.depend(matrix_G, g)
|
||||
matrix_A = self.mul(matrix_A, self.feature_map_new[i])
|
||||
matrix_G = self.mul(matrix_G, self.feature_map_new[i])
|
||||
matrix_A_allreduce = matrix_A_allreduce + (matrix_A,)
|
||||
matrix_G_allreduce = matrix_G_allreduce + (matrix_G,)
|
||||
matrix_A_allreduce = self.grad_reducer_thorA(matrix_A_allreduce)
|
||||
matrix_G_allreduce = self.grad_reducer_thorG(matrix_G_allreduce)
|
||||
for i in range(54):
|
||||
g = gradients[i * 3]
|
||||
g_shape = self.shape(g)
|
||||
g = self.reshape(g, (g_shape[0], -1))
|
||||
matrix_A = matrix_A_allreduce[i]
|
||||
matrix_G = matrix_G_allreduce[i]
|
||||
g = self.update_gradient(matrix_G, g, matrix_A)
|
||||
fake_A = self.assign(self.matrix_A[i], matrix_A)
|
||||
fake_G = self.assign(self.matrix_G[i], matrix_G)
|
||||
g = F.depend(g, fake_A)
|
||||
g = F.depend(g, fake_G)
|
||||
if i == 53:
|
||||
new_grads = new_grads + (g,)
|
||||
else:
|
||||
g = self.reshape(g, g_shape)
|
||||
new_grads = new_grads + (g, gradients[i * 3 + 1], gradients[i * 3 + 2])
|
||||
else:
|
||||
for i in range(54):
|
||||
g = gradients[i * 3]
|
||||
g_shape = self.shape(g)
|
||||
g = self.reshape(g, (g_shape[0], -1))
|
||||
matrix_A = self.matrix_A[i]
|
||||
matrix_G = self.matrix_G[i]
|
||||
g = self.update_gradient(matrix_G, g, matrix_A)
|
||||
if i == 53:
|
||||
new_grads = new_grads + (g,)
|
||||
else:
|
||||
g = self.reshape(g, g_shape)
|
||||
new_grads = new_grads + (g, gradients[i * 3 + 1], gradients[i * 3 + 2])
|
||||
gradients = new_grads
|
||||
if self.weight_decay > 0:
|
||||
gradients = self.hyper_map(F.partial(apply_decay, self.weight_decay), self.decay_flags,
|
||||
params, gradients)
|
||||
lr = self.get_lr()
|
||||
success = self.hyper_map(F.partial(_momentum_opt, self.opt, self.momentum, lr), gradients, params, moments)
|
||||
return success
|
||||
|
||||
class THOR(Optimizer):
|
||||
"""THOR"""
|
||||
def __init__(self, params, learning_rate, momentum, matrix_A, matrix_G, A_inv_max, G_inv_max, weight_decay=0.0,
|
||||
loss_scale=1.0,
|
||||
decay_filter=lambda x: x.name not in []):
|
||||
super(THOR, self).__init__(learning_rate, params, weight_decay, loss_scale)
|
||||
if isinstance(momentum, float) and momentum < 0.0:
|
||||
raise ValueError("momentum should be at least 0.0, but got momentum {}".format(momentum))
|
||||
self.momentum = Parameter(Tensor(momentum, mstype.float32))
|
||||
self.params = self.parameters
|
||||
self.moments = self.params.clone(prefix="moments", init='zeros')
|
||||
self.hyper_map = C.HyperMap()
|
||||
self.opt = P.ApplyMomentum()
|
||||
self.matrix_A = ParameterTuple(matrix_A)
|
||||
self.matrix_G = ParameterTuple(matrix_G)
|
||||
self.A_inv_max = ParameterTuple(A_inv_max)
|
||||
self.G_inv_max = ParameterTuple(G_inv_max)
|
||||
self.cube_matmul_left = P.CusMatMulCubeFraczLeftCast()
|
||||
self.cube_matmul_left_fc = P.CusMatMulCubeDenseLeft()
|
||||
self.cube_matmul_right_fc = P.CusMatMulCubeDenseRight()
|
||||
self.cube_matmul_right_mul = P.CusMatMulCubeFraczRightMul()
|
||||
self.transpose = P.Transpose()
|
||||
self.shape = P.Shape()
|
||||
self.reshape = P.Reshape()
|
||||
self.mul = P.Mul()
|
||||
self.weight_idx = []
|
||||
for i in range(len(self.params)):
|
||||
if "conv" in self.params[i].name or "end_point" in self.params[i].name:
|
||||
self.weight_idx.append(i)
|
||||
self.weight_idx.append(len(self.params))
|
||||
self.feature_map = [1.0 / 12544, 1.0 / 3136, 1.0 / 3136, 1.0 / 3136, 1.0 / 3136, 1.0 / 3136, 1.0 / 3136,
|
||||
1.0 / 3136, 1.0 / 3136, 1.0 / 3136, 1.0 / 3136, 1.0 / 3136,
|
||||
1.0 / 784, 1.0 / 784, 1.0 / 784, 1.0 / 784, 1.0 / 784, 1.0 / 784, 1.0 / 784, 1.0 / 784,
|
||||
1.0 / 784, 1.0 / 784, 1.0 / 784, 1.0 / 784, 1.0 / 784,
|
||||
1.0 / 196, 1.0 / 196, 1.0 / 196, 1.0 / 196, 1.0 / 196, 1.0 / 196, 1.0 / 196, 1.0 / 196,
|
||||
1.0 / 196, 1.0 / 196, 1.0 / 196, 1.0 / 196, 1.0 / 196, 1.0 / 196, 1.0 / 196, 1.0 / 196,
|
||||
1.0 / 196, 1.0 / 196, 1.0 / 196,
|
||||
1.0 / 49, 1.0 / 49, 1.0 / 49, 1.0 / 49, 1.0 / 49, 1.0 / 49, 1.0 / 49, 1.0 / 49, 1.0 / 49,
|
||||
1.0]
|
||||
mean = _get_gradients_mean()
|
||||
degree = _get_device_num()
|
||||
parameter_length = len(self.feature_map)
|
||||
self.grad_reducer_Amax = DistributedGradReducerThor(parameter_length, ((27,), 2), mean, degree)
|
||||
self.grad_reducer_Gmax = DistributedGradReducerThor(parameter_length, ((27,), 4), mean, degree)
|
||||
self.grad_reducer_A = DistributedGradReducerThor(parameter_length, ((27,), 6), mean, degree)
|
||||
self.grad_reducer_G = DistributedGradReducerThor(parameter_length, ((27,), 8), mean, degree)
|
||||
self.matrix_A_inv = ()
|
||||
self.matrix_G_inv = ()
|
||||
self.matrix_max_inv = ()
|
||||
|
||||
for i in range(54):
|
||||
self.matrix_max_inv = self.matrix_max_inv + (
|
||||
Parameter(initializer(1, [1], mstype.float32), name="matrix_max" + str(i), requires_grad=False),)
|
||||
self.log = P.Log()
|
||||
self.exp = P.Exp()
|
||||
self.sqrt = P.Sqrt()
|
||||
self.matrix_max_inv = ParameterTuple(self.matrix_max_inv)
|
||||
self.assign = P.Assign()
|
||||
self.cast = P.Cast()
|
||||
self.thor = True
|
||||
self.weight_decay = weight_decay * loss_scale
|
||||
self.decay_flags = tuple(decay_filter(x) for x in self.parameters)
|
||||
|
||||
def construct(self, gradients):
|
||||
params = self.params
|
||||
moments = self.moments
|
||||
if self.thor:
|
||||
matrix_A_allreduce = ()
|
||||
matrix_G_allreduce = ()
|
||||
matrix_A_max_allreduce = ()
|
||||
matrix_G_max_allreduce = ()
|
||||
for i in range(54):
|
||||
g = gradients[i * 3]
|
||||
matrix_A = self.matrix_A[i]
|
||||
matrix_G = self.matrix_G[i]
|
||||
A_max = self.A_inv_max[i]
|
||||
G_max = self.G_inv_max[i]
|
||||
matrix_A = F.depend(matrix_A, g)
|
||||
matrix_G = F.depend(matrix_G, g)
|
||||
A_max = F.depend(A_max, g)
|
||||
G_max = F.depend(G_max, g)
|
||||
matrix_A_allreduce = matrix_A_allreduce + (matrix_A,)
|
||||
matrix_G_allreduce = matrix_G_allreduce + (matrix_G,)
|
||||
matrix_A_max_allreduce = matrix_A_max_allreduce + (A_max,)
|
||||
matrix_G_max_allreduce = matrix_G_max_allreduce + (G_max,)
|
||||
matrix_A_allreduce = self.grad_reducer_A(matrix_A_allreduce)
|
||||
matrix_G_allreduce = self.grad_reducer_G(matrix_G_allreduce)
|
||||
matrix_A_max_allreduce = self.grad_reducer_Amax(matrix_A_max_allreduce)
|
||||
matrix_G_max_allreduce = self.grad_reducer_Gmax(matrix_G_max_allreduce)
|
||||
new_grads = ()
|
||||
for i in range(54):
|
||||
g = gradients[i * 3]
|
||||
temp_a = matrix_A_allreduce[i]
|
||||
temp_g = matrix_G_allreduce[i]
|
||||
temp_a = self.cast(temp_a, mstype.float32)
|
||||
temp_g = self.cast(temp_g, mstype.float32)
|
||||
matrix_A_inv_max = self.log(matrix_A_max_allreduce[i])
|
||||
matrix_A_inv_max = self.mul(matrix_A_inv_max, -1)
|
||||
matrix_A_inv_max = self.exp(matrix_A_inv_max)
|
||||
temp_a = self.mul(temp_a, matrix_A_inv_max)
|
||||
matrix_G_inv_max = self.log(matrix_G_max_allreduce[i])
|
||||
matrix_G_inv_max = self.mul(matrix_G_inv_max, -1)
|
||||
matrix_G_inv_max = self.exp(matrix_G_inv_max)
|
||||
temp_g = self.mul(temp_g, matrix_G_inv_max)
|
||||
temp_max = self.mul(matrix_A_max_allreduce[i], matrix_G_max_allreduce[i])
|
||||
temp_max = self.mul(temp_max, self.feature_map[i])
|
||||
temp_a = self.cast(temp_a, mstype.float16)
|
||||
temp_g = self.cast(temp_g, mstype.float16)
|
||||
if i == 53:
|
||||
g = self.cube_matmul_left_fc(temp_g, g)
|
||||
g = self.cube_matmul_right_fc(g, temp_a, temp_max)
|
||||
else:
|
||||
g = self.cube_matmul_left(temp_g, g)
|
||||
g = self.cube_matmul_right_mul(g, temp_a, temp_max)
|
||||
fake_A = self.assign(self.matrix_A[i], temp_a)
|
||||
fake_G = self.assign(self.matrix_G[i], temp_g)
|
||||
fake_max = self.assign(self.matrix_max_inv[i], temp_max)
|
||||
g = F.depend(g, fake_A)
|
||||
g = F.depend(g, fake_G)
|
||||
g = F.depend(g, fake_max)
|
||||
if i == 53:
|
||||
new_grads = new_grads + (g,)
|
||||
else:
|
||||
new_grads = new_grads + (g, gradients[i * 3 + 1], gradients[i * 3 + 2])
|
||||
gradients = new_grads
|
||||
else:
|
||||
new_grads = ()
|
||||
for i in range(54):
|
||||
g = gradients[i * 3]
|
||||
matrix_A = self.matrix_A[i]
|
||||
matrix_G = self.matrix_G[i]
|
||||
matrix_max = self.matrix_max_inv[i]
|
||||
if i == 53:
|
||||
g = self.cube_matmul_left_fc(matrix_G, g)
|
||||
g = self.cube_matmul_right_fc(g, matrix_A, matrix_max)
|
||||
new_grads = new_grads + (g,)
|
||||
else:
|
||||
g = self.cube_matmul_left(matrix_G, g)
|
||||
g = self.cube_matmul_right_mul(g, matrix_A, matrix_max)
|
||||
new_grads = new_grads + (g, gradients[i * 3 + 1], gradients[i * 3 + 2])
|
||||
gradients = new_grads
|
||||
|
||||
if self.weight_decay > 0:
|
||||
gradients = self.hyper_map(F.partial(apply_decay, self.weight_decay), self.decay_flags,
|
||||
params, gradients)
|
||||
gradients = self.scale_grad(gradients)
|
||||
lr = self.get_lr()
|
||||
success = self.hyper_map(F.partial(_momentum_opt, self.opt, self.momentum, lr), gradients, params, moments)
|
||||
return success
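# Illustrative wiring sketch (not part of the original file): how a training
# script is expected to build this optimizer. `net` is the THOR-enabled ResNet50
# and `lr` a learning-rate schedule Tensor; the name-based filters collect the
# per-layer A/G statistics that Conv2d_Thor and Dense_Thor register as
# parameters. The hyper-parameter values are placeholders.
def _build_thor_optimizer_example(net, lr, momentum=0.9, weight_decay=5e-4, loss_scale=128):
    return THOR(filter(lambda x: x.requires_grad, net.get_parameters()), lr, momentum,
                filter(lambda x: 'matrix_A' in x.name, net.get_parameters()),
                filter(lambda x: 'matrix_G' in x.name, net.get_parameters()),
                filter(lambda x: 'A_inv_max' in x.name, net.get_parameters()),
                filter(lambda x: 'G_inv_max' in x.name, net.get_parameters()),
                weight_decay, loss_scale)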
@@ -1,771 +0,0 @@
# Copyright 2020 Huawei Technologies Co., Ltd
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
# ============================================================================
|
||||
"""thor_layer"""
|
||||
import numpy as np
|
||||
|
||||
import mindspore.common.dtype as mstype
|
||||
from mindspore._checkparam import Validator, twice
|
||||
from mindspore._extends import cell_attr_register
|
||||
from mindspore.common.initializer import initializer
|
||||
from mindspore.common.parameter import Parameter
|
||||
from mindspore.common.tensor import Tensor
|
||||
from mindspore.nn.cell import Cell
|
||||
from mindspore.nn.layer.activation import get_activation
|
||||
from mindspore.ops import operations as P
from mindspore import log as logger
|
||||
|
||||
C0 = 16
|
||||
|
||||
|
||||
def caculate_device_shape(matrix_dim, channel, is_A):
    """Compute the fractal device shape and (possibly padded) dimension of an A/G matrix."""
    ll = (0)
    if is_A:
        if channel // C0 == 0:
            matrix_dim = (matrix_dim / channel) * C0
        ll = (int(matrix_dim // C0), int(matrix_dim // C0), C0, C0), int(matrix_dim)
    else:
        ll = (int(matrix_dim // C0), int(matrix_dim // C0), C0, C0), int(matrix_dim)
    return ll
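# Worked example (illustrative, not in the original file): for the stem 7x7
# convolution the A matrix has matrix_dim = 3 * 7 * 7 = 147 with channel = 3.
# Since 3 // C0 == 0, the dimension is first padded to (147 / 3) * 16 = 784, so
#     caculate_device_shape(147, 3, True)  -> ((49, 49, 16, 16), 784)
# while a G matrix of dimension 64 needs no padding:
#     caculate_device_shape(64, 64, False) -> ((4, 4, 16, 16), 64)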
|
||||
|
||||
|
||||
def caculate_matmul_shape(matrix_A_dim, matrix_G_dim, split_dim):
    """Compute the blocked shapes used for the A and G inverse matrices, given a split dimension."""
    split_dimA = split_dim
    split_dimG = split_dim
    if matrix_A_dim % split_dim == 0:
        batch_w = matrix_A_dim // split_dim
    else:
        if matrix_A_dim < split_dim:
            batch_w = 1
            split_dimA = matrix_A_dim
        else:
            batch_w = matrix_A_dim // split_dim + 1

    if matrix_G_dim % split_dim == 0:
        batch_h = matrix_G_dim // split_dim
    else:
        if matrix_G_dim < split_dim:
            batch_h = 1
            split_dimG = matrix_G_dim
        else:
            batch_h = matrix_G_dim // split_dim + 1
    matrix_A_shape = (batch_h, batch_w, split_dimA, split_dimA)
    matrix_G_shape = (batch_h, split_dimG, split_dimG)
    return matrix_A_shape, matrix_G_shape
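# Worked example (illustrative, not in the original file): a 3x3 convolution
# with 64 input and 64 output channels has matrix_A_dim = 64 * 9 = 576 and
# matrix_G_dim = 64. With split_dim = 128:
#   A: 576 is larger than 128 and not a multiple of it, so batch_w = 576 // 128 + 1 = 5
#   G: 64 < 128, so batch_h = 1 and split_dimG = 64
#     caculate_matmul_shape(576, 64, 128) -> ((1, 5, 128, 128), (1, 64, 64))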
|
||||
|
||||
class _Conv(Cell):
|
||||
r"""Applies a N-D convolution over an input signal composed of several input
|
||||
planes.
|
||||
"""
|
||||
|
||||
def __init__(self,
|
||||
in_channels,
|
||||
out_channels,
|
||||
kernel_size,
|
||||
stride,
|
||||
pad_mode,
|
||||
padding,
|
||||
dilation,
|
||||
group,
|
||||
data_format,
|
||||
has_bias,
|
||||
weight_init,
|
||||
bias_init,
|
||||
):
|
||||
super(_Conv, self).__init__()
|
||||
self.in_channels = in_channels
|
||||
self.out_channels = out_channels
|
||||
self.kernel_size = kernel_size
|
||||
self.stride = stride
|
||||
self.pad_mode = pad_mode
|
||||
self.padding = padding
|
||||
self.dilation = dilation
|
||||
self.group = group
|
||||
self.data_format = data_format
|
||||
self.has_bias = has_bias
|
||||
if not (isinstance(in_channels, int) and in_channels > 0):
|
||||
raise ValueError('Attr \'in_channels\' of \'Conv2D\' Op passed '
|
||||
+ str(in_channels) + ', should be an int and greater than 0.')
|
||||
if (not isinstance(kernel_size, tuple)) or len(kernel_size) != 2 or \
|
||||
(not isinstance(kernel_size[0], int)) or (not isinstance(kernel_size[1], int)) or \
|
||||
kernel_size[0] < 1 or kernel_size[1] < 1:
|
||||
raise ValueError('Attr \'kernel_size\' of \'Conv2D\' Op passed '
|
||||
+ str(self.kernel_size) + ', should be an int or a tuple and equal to or greater than 1.')
|
||||
if in_channels % group != 0:
|
||||
raise ValueError('Attr \'in_channels\' of \'Conv2D\' Op must be divisible by '
|
||||
'attr \'group\' of \'Conv2D\' Op.')
|
||||
if out_channels % group != 0:
|
||||
raise ValueError('Attr \'out_channels\' of \'Conv2D\' Op must be divisible by '
|
||||
'attr \'group\' of \'Conv2D\' Op.')
|
||||
|
||||
self.weight = Parameter(initializer(
|
||||
weight_init, [out_channels, in_channels // group, *kernel_size]))
|
||||
|
||||
if Validator.check_bool(has_bias):
|
||||
self.bias = Parameter(initializer(bias_init, [out_channels]))
|
||||
else:
|
||||
if bias_init != 'zeros':
|
||||
logger.warning("Value of 'has_bias' is False, value of 'bias_init' will be ignored.")
|
||||
self.bias = None
|
||||
|
||||
def construct(self, *inputs):
|
||||
raise NotImplementedError
|
||||
|
||||
|
||||
class Conv2d_Thor_GPU(_Conv):
|
||||
"""Conv2d_Thor"""
|
||||
def __init__(self,
|
||||
in_channels,
|
||||
out_channels,
|
||||
kernel_size,
|
||||
stride=1,
|
||||
pad_mode='same',
|
||||
padding=0,
|
||||
dilation=1,
|
||||
group=1,
|
||||
data_format='NCHW',
|
||||
has_bias=False,
|
||||
weight_init='normal',
|
||||
damping=0.03,
|
||||
loss_scale=1,
|
||||
frequency=278,
|
||||
batch_size=32,
|
||||
bias_init='zeros'):
|
||||
self.thor = True
|
||||
self.hw = kernel_size * kernel_size
|
||||
kernel_size = twice(kernel_size)
|
||||
super(Conv2d_Thor_GPU, self).__init__(
|
||||
in_channels,
|
||||
out_channels,
|
||||
kernel_size,
|
||||
stride,
|
||||
pad_mode,
|
||||
padding,
|
||||
dilation,
|
||||
group,
|
||||
data_format,
|
||||
has_bias,
|
||||
weight_init,
|
||||
bias_init,
|
||||
)
|
||||
self.conv2d = P.Conv2D(out_channel=self.out_channels,
|
||||
kernel_size=self.kernel_size,
|
||||
mode=1,
|
||||
pad_mode=self.pad_mode,
|
||||
pad=self.padding,
|
||||
stride=self.stride,
|
||||
dilation=self.dilation,
|
||||
group=self.group
|
||||
)
|
||||
|
||||
self.matrix_A_dim = self.in_channels * self.kernel_size[0] * self.kernel_size[1]
|
||||
self.matrix_G_dim = self.out_channels
|
||||
|
||||
split_dim = 128
|
||||
matrix_A_shape, matrix_G_shape = caculate_matmul_shape(self.matrix_A_dim, self.matrix_G_dim, split_dim)
|
||||
self.matrix_A_inv = Parameter(np.zeros(matrix_A_shape).astype(np.float32), requires_grad=False)
|
||||
self.matrix_G_inv = Parameter(np.zeros(matrix_G_shape).astype(np.float32), requires_grad=False)
|
||||
self.broadcast_to = P.BroadcastTo(matrix_A_shape)
|
||||
self.cov_step = Parameter(initializer(0, [1], mstype.int32), requires_grad=False)
|
||||
self.img2col = P.Im2Col(kernel_size=kernel_size, stride=stride, pad_mode="same")
|
||||
self.matmul = P.MatMul(transpose_b=True)
|
||||
self.shape = P.Shape()
|
||||
self.reshape = P.Reshape()
|
||||
self.mul = P.Mul()
|
||||
self.getG = P.InsertGradientOf(self.save_gradient)
|
||||
self.loss_scale = Tensor(1 / loss_scale, mstype.float16)
|
||||
self.batch_size = Tensor(batch_size, mstype.float16)
|
||||
self.transpose = P.Transpose()
|
||||
self.cast = P.Cast()
|
||||
self.gather = P.Gather()
|
||||
self.freq = Tensor(frequency, mstype.int32)
|
||||
self.axis = 0
|
||||
self.sqrt = P.Sqrt()
|
||||
self.reduce_mean = P.ReduceMean(keep_dims=False)
|
||||
self.damping = Parameter(Tensor(damping), requires_grad=False)
|
||||
self.dampingA = Tensor(np.identity(self.matrix_A_dim), mstype.float32)
|
||||
self.dampingG = Tensor(np.identity(self.matrix_G_dim), mstype.float32)
|
||||
self.cholesky = P.CholeskyTrsm(split_dim=split_dim)
|
||||
self.vector_matmul = P.BatchMatMul(transpose_a=True)
|
||||
|
||||
def save_gradient(self, dout):
|
||||
"""save_gradient"""
|
||||
out = dout
|
||||
dout = self.mul(dout, self.loss_scale)
|
||||
dout = self.mul(dout, self.batch_size)
|
||||
dout = self.reduce_mean(dout, 0)
|
||||
dout_shape = self.shape(dout)
|
||||
dout = self.reshape(dout, (dout_shape[0], -1))
|
||||
dout_shape = self.shape(dout)
|
||||
normalizer = dout_shape[1]
|
||||
dout = self.cast(dout, mstype.float32)
|
||||
matrix_G = self.matmul(dout, dout)
|
||||
matrix_G = self.mul(matrix_G, 1.0 / normalizer)
|
||||
damping_step = self.gather(self.damping, self.cov_step, 0)
|
||||
damping_step = self.cast(damping_step, mstype.float32)
|
||||
self.cov_step = self.cov_step + self.freq
|
||||
damping = self.mul(damping_step, 1.0 / normalizer)
|
||||
damping = self.sqrt(damping)
|
||||
matrix_G = matrix_G + damping * self.dampingG
|
||||
matrix_G = self.cholesky(matrix_G)
|
||||
matrix_G = self.vector_matmul(matrix_G, matrix_G)
|
||||
self.matrix_G_inv = matrix_G
|
||||
return out
|
||||
|
||||
def construct(self, x):
|
||||
if self.thor:
|
||||
matrix_A = self.img2col(x)
|
||||
matrix_A_shape = self.shape(matrix_A)
|
||||
matrix_A = self.reshape(matrix_A, (matrix_A_shape[0]*matrix_A_shape[1]*matrix_A_shape[2],
|
||||
matrix_A_shape[3], -1))
|
||||
matrix_A = self.reduce_mean(matrix_A, 1)
|
||||
matrix_A_shape = self.shape(matrix_A)
|
||||
normalizer = matrix_A_shape[1]
|
||||
matrix_A = self.cast(matrix_A, mstype.float32)
|
||||
matrix_A = self.matmul(matrix_A, matrix_A)
|
||||
matrix_A = self.mul(matrix_A, 1.0 / normalizer)
|
||||
damping_step = self.gather(self.damping, self.cov_step, self.axis)
|
||||
damping_step = self.cast(damping_step, mstype.float32)
|
||||
damping = self.mul(damping_step, 1.0 / normalizer)
|
||||
damping = self.sqrt(damping)
|
||||
matrix_A = matrix_A + damping * self.dampingA
|
||||
matrix_A = self.cholesky(matrix_A)
|
||||
matrix_A = self.vector_matmul(matrix_A, matrix_A)
|
||||
matrix_A = self.broadcast_to(matrix_A)
|
||||
self.matrix_A_inv = matrix_A
|
||||
out = self.conv2d(x, self.weight)
|
||||
out = self.getG(out)
|
||||
else:
|
||||
out = self.conv2d(x, self.weight)
|
||||
|
||||
return out
|
||||
|
||||
def extra_repr(self):
|
||||
"""extra_repr"""
|
||||
s = 'input_channels={}, output_channels={}, kernel_size={},' \
|
||||
'stride={}, pad_mode={}, padding={}, dilation={}, ' \
|
||||
'group={}, data_format={}, has_bias={},' \
|
||||
'weight_init={}, bias_init={}'.format(
|
||||
self.in_channels,
|
||||
self.out_channels,
|
||||
self.kernel_size,
|
||||
self.stride,
|
||||
self.pad_mode,
|
||||
self.padding,
|
||||
self.dilation,
|
||||
self.group,
|
||||
self.data_format,
|
||||
self.has_bias,
|
||||
self.weight,
|
||||
self.bias)
|
||||
|
||||
if self.has_bias:
|
||||
s += ', bias={}'.format(self.bias)
|
||||
return s
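# Illustrative sketch (not part of the original file): Conv2d_Thor_GPU is used
# as a drop-in convolution that also maintains the blocked inverse factors read
# by the GPU THOR optimizer through the name-based parameter filters. In
# practice `damping` is a per-step schedule; the scalar default is kept here
# only to keep the sketch short.
def _conv2d_thor_gpu_example():
    return Conv2d_Thor_GPU(64, 64, kernel_size=3, stride=1, pad_mode='same',
                           loss_scale=128, frequency=278, batch_size=32)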
|
||||
|
||||
|
||||
class Dense_Thor_GPU(Cell):
|
||||
"""Dense_Thor"""
|
||||
@cell_attr_register(attrs=['has_bias', 'activation'])
|
||||
def __init__(self,
|
||||
in_channels,
|
||||
out_channels,
|
||||
weight_init='normal',
|
||||
bias_init='zeros',
|
||||
damping=0.03,
|
||||
loss_scale=1,
|
||||
frequency=278,
|
||||
batch_size=32,
|
||||
has_bias=True,
|
||||
activation=None):
|
||||
super(Dense_Thor_GPU, self).__init__()
|
||||
self.in_channels = Validator.check_positive_int(in_channels)
|
||||
self.out_channels = Validator.check_positive_int(out_channels)
|
||||
self.has_bias = Validator.check_bool(has_bias)
|
||||
self.thor = True
|
||||
if isinstance(weight_init, Tensor):
|
||||
if weight_init.ndim != 2 or weight_init.shape[0] != out_channels or \
|
||||
weight_init.shape[1] != in_channels:
|
||||
raise ValueError("weight_init shape error")
|
||||
|
||||
self.weight = Parameter(initializer(weight_init, [out_channels, in_channels]))
|
||||
|
||||
if self.has_bias:
|
||||
if isinstance(bias_init, Tensor):
|
||||
if bias_init.ndim != 1 or bias_init.shape[0] != out_channels:
|
||||
raise ValueError("bias_init shape error")
|
||||
|
||||
self.bias = Parameter(initializer(bias_init, [out_channels]))
|
||||
|
||||
self.matmul = P.MatMul(transpose_b=True)
|
||||
self.bias_add = P.BiasAdd()
|
||||
|
||||
self.activation = get_activation(activation)
|
||||
self.activation_flag = self.activation is not None
|
||||
split_dim = 128
|
||||
matrix_A_shape, matrix_G_shape = caculate_matmul_shape(self.in_channels, self.out_channels, split_dim)
|
||||
self.matrix_A_inv = Parameter(Tensor(np.zeros(matrix_A_shape).astype(np.float32)), requires_grad=False)
|
||||
self.matrix_G_inv = Parameter(Tensor(np.zeros(matrix_G_shape).astype(np.float32)), requires_grad=False)
|
||||
self.broadcast_to = P.BroadcastTo(matrix_A_shape)
|
||||
self.cov_step = Parameter(initializer(0, [1], mstype.int32), requires_grad=False)
|
||||
self.shape = P.Shape()
|
||||
self.reshape = P.Reshape()
|
||||
self.transpose = P.Transpose()
|
||||
self.mul = P.Mul()
|
||||
self.cube_matmul = P.MatMul(transpose_a=True)
|
||||
self.loss_scale = Tensor(1 / loss_scale, mstype.float16)
|
||||
self.batch_size = Tensor(batch_size, mstype.float16)
|
||||
self.getG = P.InsertGradientOf(self.save_gradient)
|
||||
self.damping = Parameter(Tensor(damping), requires_grad=False)
|
||||
self.dampingA = Tensor(np.identity(in_channels), mstype.float32)
|
||||
self.dampingG = Tensor(np.identity(out_channels), mstype.float32)
|
||||
self.cast = P.Cast()
|
||||
self.gather = P.Gather()
|
||||
self.freq = Tensor(frequency, mstype.int32)
|
||||
self.axis = 0
|
||||
self.add = P.Add()
|
||||
self.sqrt = P.Sqrt()
|
||||
self.cholesky = P.CholeskyTrsm(split_dim=split_dim)
|
||||
self.vector_matmul = P.BatchMatMul(transpose_a=True)
|
||||
|
||||
def save_gradient(self, dout):
|
||||
"""save_gradient"""
|
||||
out = dout
|
||||
dout = self.mul(dout, self.loss_scale)
|
||||
dout = self.mul(dout, self.batch_size)
|
||||
dout_shape = self.shape(dout)
|
||||
normalizer = dout_shape[0]
|
||||
dout = self.cast(dout, mstype.float32)
|
||||
matrix_G = self.cube_matmul(dout, dout)
|
||||
matrix_G = self.mul(matrix_G, 1.0 / normalizer)
|
||||
damping_step = self.gather(self.damping, self.cov_step, 0)
|
||||
damping_step = self.cast(damping_step, mstype.float32)
|
||||
self.cov_step = self.cov_step + self.freq
|
||||
damping = self.sqrt(damping_step)
|
||||
matrix_G = matrix_G + damping * self.dampingG
|
||||
matrix_G = self.cholesky(matrix_G)
|
||||
matrix_G = self.vector_matmul(matrix_G, matrix_G)
|
||||
self.matrix_G_inv = matrix_G
|
||||
return out
|
||||
|
||||
def construct(self, x):
|
||||
"""construct"""
|
||||
if self.thor:
|
||||
inputs = self.cast(x, mstype.float32)
|
||||
inputs = self.cube_matmul(inputs, inputs)
|
||||
inputs_shape = self.shape(inputs)
|
||||
normalizer = inputs_shape[0]
|
||||
matrix_A = self.mul(inputs, 1.0 / normalizer)
|
||||
damping_step = self.gather(self.damping, self.cov_step, self.axis)
|
||||
damping_step = self.cast(damping_step, mstype.float32)
|
||||
damping = self.sqrt(damping_step)
|
||||
matrix_A = matrix_A + damping * self.dampingA
|
||||
matrix_A = self.cholesky(matrix_A)
|
||||
matrix_A = self.vector_matmul(matrix_A, matrix_A)
|
||||
matrix_A = self.broadcast_to(matrix_A)
|
||||
self.matrix_A_inv = matrix_A
|
||||
output = self.matmul(x, self.weight)
|
||||
output = self.getG(output)
|
||||
else:
|
||||
output = self.matmul(x, self.weight)
|
||||
|
||||
if self.has_bias:
|
||||
output = self.bias_add(output, self.bias)
|
||||
if self.activation_flag:
|
||||
return self.activation(output)
|
||||
return output
|
||||
|
||||
def extend_repr(self):
|
||||
"""extend_repr"""
|
||||
s = 'in_channels={}, out_channels={}'.format(self.in_channels, self.out_channels)
|
||||
if self.has_bias:
|
||||
s += ', has_bias={}'.format(self.has_bias)
|
||||
if self.activation_flag:
|
||||
s += ', activation={}'.format(self.activation)
|
||||
return s
|
||||
|
||||
|
||||
class Conv2d_Thor(_Conv):
|
||||
"""Conv2d_Thor"""
|
||||
def __init__(self,
|
||||
in_channels,
|
||||
out_channels,
|
||||
kernel_size,
|
||||
stride=1,
|
||||
pad_mode='same',
|
||||
padding=0,
|
||||
dilation=1,
|
||||
group=1,
|
||||
data_format='NCHW',
|
||||
has_bias=False,
|
||||
weight_init='normal',
|
||||
damping=0.03,
|
||||
loss_scale=1,
|
||||
frequency=278,
|
||||
batch_size=32,
|
||||
bias_init='zeros'):
|
||||
self.thor = True
|
||||
ksizes = (1, kernel_size, kernel_size, 1)
|
||||
self.hw = kernel_size * kernel_size
|
||||
strides = (1, stride, stride, 1)
|
||||
kernel_size = twice(kernel_size)
|
||||
super(Conv2d_Thor, self).__init__(
|
||||
in_channels,
|
||||
out_channels,
|
||||
kernel_size,
|
||||
stride,
|
||||
pad_mode,
|
||||
padding,
|
||||
dilation,
|
||||
group,
|
||||
data_format,
|
||||
has_bias,
|
||||
weight_init,
|
||||
bias_init,
|
||||
)
|
||||
self.conv2d = P.Conv2D(out_channel=self.out_channels,
|
||||
kernel_size=self.kernel_size,
|
||||
mode=1,
|
||||
pad_mode=self.pad_mode,
|
||||
pad=self.padding,
|
||||
stride=self.stride,
|
||||
dilation=self.dilation,
|
||||
group=self.group
|
||||
)
|
||||
self.batch_size = batch_size
|
||||
self.img2col = P.CusImg2Col(ksizes=ksizes, strides=strides)
|
||||
self.cube_matmul = P.CusMatMulCube(transpose_a=True)
|
||||
self.matrix_combine = P.CusMatrixCombine()
|
||||
self.cholesky = P.CusCholeskyTrsm()
|
||||
self.transpose02314 = P.CusTranspose02314()
|
||||
self.matrix_A_dim = self.in_channels * self.kernel_size[0] * self.kernel_size[1]
|
||||
self.matrix_G_dim = self.out_channels
|
||||
self.matrix_A_device_shape, self.matrix_A_device_dim = caculate_device_shape(self.matrix_A_dim,
|
||||
self.in_channels, True)
|
||||
self.matrix_G_device_shape, self.matrix_G_device_dim = caculate_device_shape(self.matrix_G_dim,
|
||||
self.in_channels, False)
|
||||
self.matrix_A_device_temp_shape = (
|
||||
self.matrix_A_device_shape[0], self.matrix_A_device_shape[2], self.matrix_A_device_shape[1],
|
||||
self.matrix_A_device_shape[3])
|
||||
self.matrix_G_device_temp_shape = (
|
||||
self.matrix_G_device_shape[0], self.matrix_G_device_shape[2], self.matrix_G_device_shape[1],
|
||||
self.matrix_G_device_shape[3])
|
||||
self.matrix_A_inv = Parameter(
|
||||
Tensor(np.reshape(np.identity(self.matrix_A_device_dim).astype(np.float16), self.matrix_A_device_shape)),
|
||||
requires_grad=False)
|
||||
self.A_inv_max = Parameter(initializer(0, [1], mstype.float32), requires_grad=False)
|
||||
self.matrix_G_inv = Parameter(
|
||||
Tensor(np.reshape(np.identity(self.matrix_G_device_dim).astype(np.float16), self.matrix_G_device_shape)),
|
||||
requires_grad=False)
|
||||
|
||||
self.G_inv_max = Parameter(initializer(0, [1], mstype.float32), requires_grad=False)
|
||||
self.fake_G = Tensor(
|
||||
np.reshape(np.identity(self.matrix_G_device_dim).astype(np.float16), self.matrix_G_device_shape))
|
||||
|
||||
self.shape = P.Shape()
|
||||
self.reshape = P.Reshape()
|
||||
self.transpose = P.Transpose()
|
||||
self.cov_step = Parameter(initializer(0, [1], mstype.int32), requires_grad=False)
|
||||
self.mul = P.Mul()
|
||||
self.cast = P.Cast()
|
||||
self.damping = Tensor(damping)
|
||||
self.vector_matmul = P.CusBatchMatMul()
|
||||
self.diag_block_dim = 128
|
||||
self.channels_slice_flag = False
|
||||
if self.in_channels % C0 != 0:
|
||||
self.channels_slice_flag = True
|
||||
|
||||
self.padA_flag = False
|
||||
if (self.matrix_A_dim // self.diag_block_dim) * self.diag_block_dim != self.matrix_A_dim \
|
||||
and self.matrix_A_dim > self.diag_block_dim:
|
||||
self.padA_flag = True
|
||||
pad_dim = self.diag_block_dim - self.matrix_A_dim % self.diag_block_dim
|
||||
self.padA = P.Pad(((0, pad_dim), (0, pad_dim)))
|
||||
self.device_shape_pad_flag = False
|
||||
if self.matrix_A_dim != self.matrix_A_device_dim:
|
||||
self.device_shape_pad_flag = True
|
||||
self.device_shape_pad = P.Pad(((0, 0), (0, C0 - self.in_channels), (0, 0), (0, C0 - self.in_channels)))
|
||||
self.slice = P.Slice()
|
||||
self.gather = P.Gather()
|
||||
self.freq = Tensor(frequency, mstype.int32)
|
||||
self.loss_scale = Tensor(1 / loss_scale, mstype.float16)
|
||||
self.axis = 0
|
||||
|
||||
dampingA_dim = self.matrix_A_dim
|
||||
if (self.matrix_A_dim % self.diag_block_dim) != 0 and self.matrix_A_dim > self.diag_block_dim:
|
||||
dampingA_dim = (self.matrix_A_dim // self.diag_block_dim + 1) * self.diag_block_dim
|
||||
dampingG_dim = self.matrix_G_dim
|
||||
if (self.matrix_G_dim % self.diag_block_dim) != 0 and self.matrix_G_dim > self.diag_block_dim:
|
||||
dampingG_dim = (self.matrix_G_dim // self.diag_block_dim + 1) * self.diag_block_dim
|
||||
|
||||
self.dampingA = Tensor(np.identity(dampingA_dim), mstype.float32)
|
||||
self.dampingG = Tensor(np.identity(dampingG_dim), mstype.float32)
|
||||
self.fused_abs_max1 = P.CusFusedAbsMax1([self.matrix_A_dim, self.matrix_A_dim])
|
||||
self.fused_abs_max2 = P.CusFusedAbsMax1()
|
||||
self.log = P.Log()
|
||||
self.exp = P.Exp()
|
||||
self.sqrt = P.Sqrt()
|
||||
self.getG = P.InsertGradientOf(self.save_gradient)
|
||||
|
||||
    def save_gradient(self, dout):
        """save_gradient"""
        out = dout
        dout = self.mul(dout, self.loss_scale)
        dout = self.mul(dout, 32.0)
        dout = self.transpose02314(dout)
        dout_shape = self.shape(dout)
        normalizer = dout_shape[0]

        matrix_G = self.cube_matmul(dout, dout)
        normalizer = self.cast(normalizer, mstype.float32)
        matrix_G = self.mul(matrix_G, 1.0 / normalizer)
        damping_step = self.gather(self.damping, self.cov_step, 0)
        self.cov_step = self.cov_step + self.freq
        damping_step = self.cast(damping_step, mstype.float32)
        damping = self.mul(damping_step, 32.0 / normalizer)
        damping = self.sqrt(damping)
        dampingG = self.cast(self.dampingG, mstype.float32)
        matrix_G = matrix_G + damping * dampingG

        matrix_G_inv = self.cholesky(matrix_G)
        matrix_G_inv = self.vector_matmul(matrix_G_inv, matrix_G_inv)
        matrix_G_inv_max = self.fused_abs_max2(matrix_G_inv)
        matrix_G_inv_max = self.fused_abs_max2(matrix_G_inv_max)
        self.G_inv_max = matrix_G_inv_max
        matrix_G_inv = self.matrix_combine(matrix_G_inv)
        matrix_G_inv = self.reshape(matrix_G_inv, self.matrix_G_device_temp_shape)
        matrix_G_inv = self.transpose(matrix_G_inv, (2, 0, 1, 3))
        matrix_G = self.cast(matrix_G_inv, mstype.float16)
        self.matrix_G_inv = matrix_G
        return out

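    # Forward pass: on THOR steps the input covariance A = img2col(x)^T * img2col(x) / N is
    # damped, Cholesky-inverted and cached in self.matrix_A_inv before the ordinary convolution
    # runs; otherwise only the convolution is executed.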
    def construct(self, x):
        if self.thor:
            matrix_A = self.img2col(x)
            matrix_A_shape = self.shape(matrix_A)
            normalizer = matrix_A_shape[0]
            matrix_A = self.cube_matmul(matrix_A, matrix_A)

            if self.channels_slice_flag:
                matrix_A = self.reshape(matrix_A, (self.hw, C0, self.hw, C0))
                matrix_A = self.slice(matrix_A, (0, 0, 0, 0), (self.hw, self.in_channels, self.hw, self.in_channels))
                matrix_A = self.reshape(matrix_A, (self.matrix_A_dim, self.matrix_A_dim))
            normalizer = self.cast(normalizer, mstype.float32)
            matrix_A = self.mul(matrix_A, 1.0 / normalizer)
            if self.padA_flag:
                matrix_A = self.padA(matrix_A)
            damping_step = self.gather(self.damping, self.cov_step, self.axis)
            damping_step = self.cast(damping_step, mstype.float32)
            damping = self.mul(damping_step, 32.0 / normalizer)
            damping = self.sqrt(damping)
            damping_A = self.cast(self.dampingA, mstype.float32)
            matrix_A = matrix_A + damping * damping_A
            matrix_A_inv = self.cholesky(matrix_A)
            matrix_A_inv = self.vector_matmul(matrix_A_inv, matrix_A_inv)
            matrix_A_inv_max = self.fused_abs_max1(matrix_A_inv)
            matrix_A_inv_max = self.fused_abs_max2(matrix_A_inv_max)
            self.A_inv_max = matrix_A_inv_max
            matrix_A_inv = self.matrix_combine(matrix_A_inv)
            matrix_A_inv = self.cast(matrix_A_inv, mstype.float16)
            if self.padA_flag:
                matrix_A_inv = self.slice(matrix_A_inv, (0, 0), (self.matrix_A_dim, self.matrix_A_dim))

            if self.device_shape_pad_flag:
                matrix_A_inv = self.reshape(matrix_A_inv, (self.hw, self.in_channels, self.hw, self.in_channels))
                matrix_A_inv = self.device_shape_pad(matrix_A_inv)
            matrix_A_inv = self.reshape(matrix_A_inv, self.matrix_A_device_temp_shape)
            matrix_A_inv = self.transpose(matrix_A_inv, (2, 0, 1, 3))
            self.matrix_A_inv = matrix_A_inv
            out = self.conv2d(x, self.weight)
            out = self.getG(out)
        else:
            out = self.conv2d(x, self.weight)

        return out

    def extra_repr(self):
        """extra_repr"""
        s = 'input_channels={}, output_channels={}, kernel_size={},' \
            'stride={}, pad_mode={}, padding={}, dilation={}, ' \
            'group={}, data_format={}, has_bias={},' \
            'weight_init={}, bias_init={}'.format(
                self.in_channels,
                self.out_channels,
                self.kernel_size,
                self.stride,
                self.pad_mode,
                self.padding,
                self.dilation,
                self.group,
                self.data_format,
                self.has_bias,
                self.weight,
                self.bias)

        if self.has_bias:
            s += ', bias={}'.format(self.bias)
        return s


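# Fully connected counterpart of Conv2d_Thor. The buffer shapes below (2048-wide A statistic,
# 1001-class G statistic padded for 16/128 alignment) appear to be hard-wired for the
# ResNet-50 classification head.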
class Dense_Thor(Cell):
    """Dense_Thor"""

    @cell_attr_register(attrs=['has_bias', 'activation'])
    def __init__(self,
                 in_channels,
                 out_channels,
                 weight_init='normal',
                 bias_init='zeros',
                 damping=0.03,
                 loss_scale=1,
                 frequency=278,
                 batch_size=32,
                 has_bias=True,
                 activation=None):
        super(Dense_Thor, self).__init__()
        self.in_channels = Validator.check_positive_int(in_channels)
        self.out_channels = Validator.check_positive_int(out_channels)
        self.has_bias = Validator.check_bool(has_bias)
        self.thor = True
        self.batch_size = batch_size
        if isinstance(weight_init, Tensor):
            if weight_init.ndim != 2 or weight_init.shape[0] != out_channels or \
                    weight_init.shape[1] != in_channels:
                raise ValueError("weight_init shape error")

        self.weight = Parameter(initializer(weight_init, [out_channels, in_channels]))

        if self.has_bias:
            if isinstance(bias_init, Tensor):
                if bias_init.ndim != 1 or bias_init.shape[0] != out_channels:
                    raise ValueError("bias_init shape error")

            self.bias = Parameter(initializer(bias_init, [out_channels]))

        self.matmul = P.MatMul(transpose_b=True)
        self.bias_add = P.BiasAdd()

        self.activation = get_activation(activation)
        self.activation_flag = self.activation is not None

        self.matrix_A_inv = Parameter(Tensor(np.zeros([128, 128, 16, 16]).astype(np.float16)), requires_grad=False)
        self.matrix_G_inv = Parameter(Tensor(np.zeros([63, 63, 16, 16]).astype(np.float16)), requires_grad=False)
        self.fake_G = Tensor(np.zeros([63, 63, 16, 16]).astype(np.float16))

        self.matmul = P.MatMul(transpose_b=True)
        self.cube_matmul = P.CusMatMulCube(transpose_a=True)
        self.matrix_combine = P.CusMatrixCombine()
        self.cholesky = P.CusCholeskyTrsm()
        self.shape = P.Shape()
        self.reshape = P.Reshape()
        self.transpose = P.Transpose()
        self.cov_step = Parameter(initializer(0, [1], mstype.int32), requires_grad=False)
        self.mul = P.Mul()
        self.cast = P.Cast()
        self.damping = Tensor(damping)
        self.loss_scale = Tensor(1 / loss_scale, mstype.float16)
        self.vector_matmul = P.CusBatchMatMul()
        self.pad = P.Pad(((0, 23), (0, 23)))
        self.pad1 = P.Pad(((0, 7), (0, 7)))
        self.slice = P.Slice()
        self.gather = P.Gather()
        self.assignadd = P.AssignAdd()
        self.freq = Tensor(frequency, mstype.int32)
        self.axis = 0
        self.A_inv_max = Parameter(initializer(0, [1], mstype.float32), requires_grad=False)
        self.G_inv_max = Parameter(initializer(0, [1], mstype.float32), requires_grad=False)
        self.fused_abs_max1 = P.CusFusedAbsMax1([1001, 1001])
        self.fused_abs_max2 = P.CusFusedAbsMax1()
        self.log = P.Log()
        self.exp = P.Exp()
        self.dampingA = Tensor(np.identity(2048), mstype.float32)
        self.dampingG = Tensor(np.identity(1024), mstype.float32)
        self.add = P.Add()
        self.sqrt = P.Sqrt()
        self.getG = P.InsertGradientOf(self.save_gradient)

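    # Same backward hook as in Conv2d_Thor: builds the damped G statistic from the output
    # gradient, pads 1001 -> 1024 for the Cholesky kernel, inverts block-wise, then slices back
    # to 1001 and re-pads to the 16-aligned 1008 layout cached in self.matrix_G_inv.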
    def save_gradient(self, dout):
        """save_gradient"""
        out = dout
        dout = self.mul(dout, self.loss_scale)
        dout = self.mul(dout, 32.0)
        normalizer = 32
        matrix_G = self.cube_matmul(dout, dout)
        normalizer = self.cast(normalizer, mstype.float32)
        matrix_G = self.mul(matrix_G, 1.0 / normalizer)
        matrix_G = self.pad(matrix_G)
        damping_step = self.gather(self.damping, self.cov_step, 0)
        damping_step = self.cast(damping_step, mstype.float32)
        self.cov_step = self.cov_step + self.freq
        damping = self.sqrt(damping_step)
        dampingG = self.cast(self.dampingG, mstype.float32)
        matrix_G = matrix_G + damping * dampingG
        matrix_G_inv = self.cholesky(matrix_G)
        matrix_G_inv = self.vector_matmul(matrix_G_inv, matrix_G_inv)
        matrix_G_inv_max = self.fused_abs_max1(matrix_G_inv)
        matrix_G_inv_max = self.fused_abs_max2(matrix_G_inv_max)
        self.G_inv_max = matrix_G_inv_max
        matrix_G_inv = self.matrix_combine(matrix_G_inv)
        matrix_G_inv = self.slice(matrix_G_inv, (0, 0), (1001, 1001))
        matrix_G_inv = self.pad1(matrix_G_inv)
        matrix_G_inv_shape = self.shape(matrix_G_inv)
        matrix_G_inv = self.reshape(matrix_G_inv, (matrix_G_inv_shape[0] / 16, 16, matrix_G_inv_shape[0] / 16, 16))
        matrix_G_inv = self.transpose(matrix_G_inv, (2, 0, 1, 3))
        matrix_G_inv = self.cast(matrix_G_inv, mstype.float16)
        self.matrix_G_inv = matrix_G_inv
        return out

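    # Forward pass: on THOR steps the input covariance A = x^T * x / 32 is damped, inverted and
    # cached in self.matrix_A_inv, after which the usual matmul / bias / activation path runs.
    # The constant normalizer 32 appears to match the default per-device batch size.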
    def construct(self, x):
        """construct"""
        if self.thor:
            inputs = self.cube_matmul(x, x)
            normalizer = 32
            normalizer = self.cast(normalizer, mstype.float32)
            matrix_A = self.mul(inputs, 1.0 / normalizer)

            damping_step = self.gather(self.damping, self.cov_step, self.axis)
            damping_step = self.cast(damping_step, mstype.float32)
            damping = self.sqrt(damping_step)
            dampingA = self.cast(self.dampingA, mstype.float32)
            matrix_A = matrix_A + damping * dampingA
            matrix_A_inv = self.cholesky(matrix_A)
            matrix_A_inv = self.vector_matmul(matrix_A_inv, matrix_A_inv)

            matrix_A_inv_max = self.fused_abs_max2(matrix_A_inv)
            matrix_A_inv_max = self.fused_abs_max2(matrix_A_inv_max)
            self.A_inv_max = matrix_A_inv_max

            matrix_A_inv = self.matrix_combine(matrix_A_inv)
            matrix_A_inv_shape = self.shape(matrix_A_inv)
            matrix_A_inv = self.reshape(matrix_A_inv, (matrix_A_inv_shape[0] / 16, 16, matrix_A_inv_shape[0] / 16, 16))
            matrix_A_inv = self.transpose(matrix_A_inv, (2, 0, 1, 3))
            matrix_A_inv = self.cast(matrix_A_inv, mstype.float16)
            self.matrix_A_inv = matrix_A_inv
            output = self.matmul(x, self.weight)
            output = self.getG(output)
        else:
            output = self.matmul(x, self.weight)

        if self.has_bias:
            output = self.bias_add(output, self.bias)
        if self.activation_flag:
            return self.activation(output)
        return output

    def extend_repr(self):
        """extend_repr"""
        s = 'in_channels={}, out_channels={}'.format(self.in_channels, self.out_channels)
        if self.has_bias:
            s += ', has_bias={}'.format(self.has_bias)
        if self.activation_flag:
            s += ', activation={}'.format(self.activation)
        return s

@@ -24,11 +24,14 @@ from mindspore.context import ParallelMode
from mindspore.train.callback import ModelCheckpoint, CheckpointConfig, TimeMonitor, LossMonitor
from mindspore.train.loss_scale_manager import FixedLossScaleManager
from mindspore.communication.management import init, get_rank, get_group_size
from mindspore.parallel import set_algo_parameters
from mindspore.train.train_thor import ConvertModelUtils
from mindspore.nn.optim import thor
from mindspore.train.model import Model

from src.model_thor import Model_Thor as Model
from src.resnet_thor import resnet50
from src.resnet import resnet50 as resnet
from src.dataset import create_dataset
from src.crossentropy import CrossEntropy
from src.crossentropy import CrossEntropy as CrossEntropySmooth

parser = argparse.ArgumentParser(description='Image classification')
parser.add_argument('--run_distribute', type=bool, default=False, help='Run distribute')

@@ -38,16 +41,32 @@ parser.add_argument('--device_num', type=int, default=1, help='Device num')
args_opt = parser.parse_args()

if args_opt.device_target == "Ascend":
    from src.thor import THOR
    from src.config import config
else:
    from src.thor import THOR_GPU as THOR
    from src.config import config_gpu as config

set_seed(1)


def get_model_lr(global_step, lr_init, decay, total_epochs, steps_per_epoch, decay_epochs=100):
def filter_checkpoint_parameter_by_list(origin_dict, param_filter):
    """remove useless parameters according to filter_list"""
    for key in list(origin_dict.keys()):
        for name in param_filter:
            if name in key:
                print("Delete parameter from checkpoint: ", key)
                del origin_dict[key]
                break


def apply_eval(eval_param):
    eval_model = eval_param["model"]
    eval_ds = eval_param["dataset"]
    metrics_name = eval_param["metrics_name"]
    res = eval_model.eval(eval_ds)
    return res[metrics_name]


def get_thor_lr(global_step, lr_init, decay, total_epochs, steps_per_epoch, decay_epochs=100):
    """get_model_lr"""
    lr_each_step = []
    total_steps = steps_per_epoch * total_epochs

@@ -66,7 +85,7 @@ def get_model_lr(global_step, lr_init, decay, total_epochs, steps_per_epoch, decay_epochs=100):
    return learning_rate


def get_model_damping(global_step, damping_init, decay_rate, total_epochs, steps_per_epoch):
def get_thor_damping(global_step, damping_init, decay_rate, total_epochs, steps_per_epoch):
    """get_model_damping"""
    damping_each_step = []
    total_steps = steps_per_epoch * total_epochs

@@ -88,46 +107,50 @@ if __name__ == '__main__':
    context.set_context(mode=context.GRAPH_MODE, device_target=target, save_graphs=False)

    if args_opt.run_distribute:
        # Ascend target
        if target == "Ascend":
            device_id = int(os.getenv('DEVICE_ID'))
            context.set_context(device_id=device_id, enable_auto_mixed_precision=True)
            context.set_auto_parallel_context(device_num=args_opt.device_num, parallel_mode=ParallelMode.DATA_PARALLEL,
                                              gradients_mean=True, all_reduce_fusion_config=[107])
                                              gradients_mean=True)
            set_algo_parameters(elementwise_op_strategy_follow=True)
            context.set_auto_parallel_context(all_reduce_fusion_config=[85, 160])
            init()
        # GPU target
        else:
            init()
            context.set_auto_parallel_context(device_num=get_group_size(), parallel_mode=ParallelMode.DATA_PARALLEL,
                                              gradients_mean=True, all_reduce_fusion_config=[107])
            ckpt_save_dir = ckpt_save_dir + "ckpt_" + str(get_rank()) + "/"
                                              gradients_mean=True)
            context.set_auto_parallel_context(all_reduce_fusion_config=[85, 160])
            ckpt_save_dir = config.save_checkpoint_path + "ckpt_" + str(get_rank()) + "/"

    # create dataset
    dataset = create_dataset(dataset_path=args_opt.dataset_path, do_train=True, repeat_num=1,
                             batch_size=config.batch_size, target=target)
    step_size = dataset.get_dataset_size()

    # define net
    step_size = dataset.get_dataset_size()
    damping = get_model_damping(0, config.damping_init, config.damping_decay, 70, step_size)
    lr = get_model_lr(0, config.lr_init, config.lr_decay, config.lr_end_epoch, step_size, decay_epochs=39)
    net = resnet50(class_num=config.class_num, damping=damping, loss_scale=config.loss_scale,
                   frequency=config.frequency, batch_size=config.batch_size)
    net = resnet(class_num=config.class_num)

    # define loss, model
    # init lr
    lr = get_thor_lr(0, config.lr_init, config.lr_decay, config.lr_end_epoch, step_size, decay_epochs=39)
    lr = Tensor(lr)

    # define loss
    if not config.use_label_smooth:
        config.label_smooth_factor = 0.0
    loss = CrossEntropy(smooth_factor=config.label_smooth_factor, num_classes=config.class_num)
    opt = THOR(filter(lambda x: x.requires_grad, net.get_parameters()), Tensor(lr), config.momentum,
               filter(lambda x: 'matrix_A' in x.name, net.get_parameters()),
               filter(lambda x: 'matrix_G' in x.name, net.get_parameters()),
               filter(lambda x: 'A_inv_max' in x.name, net.get_parameters()),
               filter(lambda x: 'G_inv_max' in x.name, net.get_parameters()),
               config.weight_decay, config.loss_scale)
    loss = CrossEntropySmooth(smooth_factor=config.label_smooth_factor, num_classes=config.class_num)
    loss_scale = FixedLossScaleManager(config.loss_scale, drop_overflow_update=False)
    model = Model(net, loss_fn=loss, optimizer=opt, amp_level='O2', loss_scale_manager=loss_scale,
                  keep_batchnorm_fp32=False, metrics={'acc'}, frequency=config.frequency,
                  use_dynamic_frequency=config.use_dynamic_frequency,
                  first_stage_steps=config.first_stage_steps)
    metrics = {"acc"}
    damping = get_thor_damping(0, config.damping_init, config.damping_decay, 70, step_size)
    split_indices = [26, 53]
    opt = thor(net, lr, Tensor(damping), config.momentum, config.weight_decay, config.loss_scale,
               config.batch_size, split_indices=split_indices, frequency=config.frequency)
    model = Model(net, loss_fn=loss, optimizer=opt, loss_scale_manager=loss_scale, metrics=metrics,
                  amp_level="O2", keep_batchnorm_fp32=False)

    model = ConvertModelUtils().convert_to_thor_model(model=model, network=net, loss_fn=loss, optimizer=opt,
                                                      loss_scale_manager=loss_scale, metrics={'acc'},
                                                      amp_level="O2", keep_batchnorm_fp32=False)

    # define callbacks
    time_cb = TimeMonitor(data_size=step_size)

@@ -140,4 +163,6 @@ if __name__ == '__main__':
    cb += [ckpt_cb]

    # train model
    model.train(config.epoch_size, dataset, callbacks=cb)
    dataset_sink_mode = True
    model.train(config.epoch_size, dataset, callbacks=cb,
                sink_size=dataset.get_dataset_size(), dataset_sink_mode=dataset_sink_mode)

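A condensed view of the training path after this change, assembled only from the added calls in the hunks above (a sketch: `net`, `loss`, `lr` (a Tensor from get_thor_lr), `damping` (from get_thor_damping), `dataset`, `cb`, and `config` are defined elsewhere in the script):

    from mindspore import Tensor
    from mindspore.nn.optim import thor
    from mindspore.train.model import Model
    from mindspore.train.train_thor import ConvertModelUtils
    from mindspore.train.loss_scale_manager import FixedLossScaleManager

    loss_scale = FixedLossScaleManager(config.loss_scale, drop_overflow_update=False)
    # second-order THOR optimizer shipped with MindSpore, replacing the model-zoo's src.thor implementation
    opt = thor(net, lr, Tensor(damping), config.momentum, config.weight_decay, config.loss_scale,
               config.batch_size, split_indices=[26, 53], frequency=config.frequency)
    model = Model(net, loss_fn=loss, optimizer=opt, loss_scale_manager=loss_scale, metrics={"acc"},
                  amp_level="O2", keep_batchnorm_fp32=False)
    # wrap the Model so training uses the THOR-specific train network
    model = ConvertModelUtils().convert_to_thor_model(model=model, network=net, loss_fn=loss,
                                                      optimizer=opt, loss_scale_manager=loss_scale,
                                                      metrics={"acc"}, amp_level="O2",
                                                      keep_batchnorm_fp32=False)
    model.train(config.epoch_size, dataset, callbacks=cb,
                sink_size=dataset.get_dataset_size(), dataset_sink_mode=True)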