fix aicpu migration issues

This commit is contained in:
lilinjie 2023-02-15 14:08:55 +08:00
parent 45b230d727
commit 4aac1ea35f
12 changed files with 1000 additions and 3 deletions

View File

@ -366,3 +366,5 @@ mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel
mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/max_unpool_2d_grad.cc:aicpu::MaxUnpool2DGradCpuKernel::MaxUnpool2DGradCompute
mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/max_pool_3d_with_argmax.cc:aicpu::MaxPool3DWithArgmaxCpuKernel::MaxPool3DWithArgmaxCompute
mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/layer_norm_grad_grad.cc:aicpu::LayerNormGradGradCpuKernel::LayerNormGradGradCompute
mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/batchmatmul.cc:aicpu::BatchMatMulCpuKernel::DoCompute
mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/sparse_to_dense.cc:aicpu::SparseToDenseCpuKernel::ValidParam

View File

@ -79,7 +79,6 @@ MindSpore中 `mindspore.ops.primitive` 接口与上一版本相比,新增、
mindspore.ops.MaxUnpool2D
mindspore.ops.MirrorPad
mindspore.ops.NthElement
mindspore.ops.NuclearNorm
mindspore.ops.Pad
mindspore.ops.Padding
mindspore.ops.PadV3

View File

@ -78,7 +78,6 @@ Neural Network
mindspore.ops.MaxUnpool2D
mindspore.ops.MirrorPad
mindspore.ops.NthElement
mindspore.ops.NuclearNorm
mindspore.ops.Pad
mindspore.ops.EmbeddingLookup
mindspore.ops.Padding

View File

@ -756,6 +756,7 @@ constexpr auto kSparseSparseMaximumOpName = "SparseSparseMaximum";
constexpr auto kSparseTensorDenseMatMulOpName = "SparseTensorDenseMatMul";
constexpr auto kSparseTensorDenseAddOpName = "SparseTensorDenseAdd";
constexpr auto kSparseTensorToCSRSparseMatrixOpName = "SparseTensorToCSRSparseMatrix";
constexpr auto kSparseToDenseOpName = "SparseToDense";
constexpr auto kSplitOpName = "Split";
constexpr auto kSplitDOpName = "SplitD";
constexpr auto kSplitVOpName = "SplitV";

View File

@ -0,0 +1,251 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "batchmatmul.h"
#include <complex>
#include "unsupported/Eigen/CXX11/Tensor"
#include "cpu_kernel_utils.h"
#include "utils/kernel_util.h"
#include "kernel_log.h"
#include "status.h"
#include <iostream>
using namespace std;
namespace {
const char *kBatchMatmul = "BatchMatMul";
const uint32_t kInputNum = 2;
const uint32_t kOutputNum = 1;
const int64_t kParallelDataNum = 1024;
} // namespace
namespace aicpu {
template <typename T>
uint32_t BatchMatMulCpuKernel::DoCompute(CpuKernelContext &ctx) {
auto input0_tensor = ctx.Input(0);
auto input0_tensor_shape = input0_tensor->GetTensorShape();
int32_t input0_tensor_dims = input0_tensor_shape->GetDims();
KERNEL_CHECK_FALSE((input0_tensor_dims > 1), KERNEL_STATUS_PARAM_INVALID, "Input[x1] must be a matrix or higher.")
auto input1_tensor = ctx.Input(1);
auto input1_tensor_shape = input1_tensor->GetTensorShape();
int32_t input1_tensor_dims = input1_tensor_shape->GetDims();
KERNEL_CHECK_FALSE((input1_tensor_dims > 1), KERNEL_STATUS_PARAM_INVALID, "Input[x2] must be a matrix or higher.")
auto output_tensor = ctx.Output(0);
DataType input0_data_type = input0_tensor->GetDataType();
DataType input1_data_type = input1_tensor->GetDataType();
DataType output_data_type = output_tensor->GetDataType();
KERNEL_CHECK_FALSE((input0_data_type == input1_data_type), KERNEL_STATUS_PARAM_INVALID,
"Input[x1] data type[%s] and input[x2] data type[%s] must be same.",
DTypeStr(input0_data_type).c_str(), DTypeStr(input1_data_type).c_str())
KERNEL_CHECK_FALSE((input0_data_type == output_data_type), KERNEL_STATUS_PARAM_INVALID,
"Input data type[%s] and output data type[%s] must be same.", DTypeStr(input0_data_type).c_str(),
DTypeStr(output_data_type).c_str())
bool adj_x = false;
bool adj_y = false;
auto adj_x1 = ctx.GetAttr("adj_x1");
auto adj_x2 = ctx.GetAttr("adj_x2");
if (adj_x1 != nullptr) {
adj_x = adj_x1->GetBool();
}
if (adj_x2 != nullptr) {
adj_y = adj_x2->GetBool();
}
KERNEL_LOG_DEBUG(
"%s Attr[adj_x1] value[%d], "
"Attr[adj_x2] value[%d].",
kBatchMatmul, adj_x, adj_y);
int32_t x1_dim = adj_x ? input0_tensor_dims - 2 : input0_tensor_dims - 1;
int32_t x2_dim = adj_y ? input1_tensor_dims - 1 : input1_tensor_dims - 2;
KERNEL_CHECK_FALSE((input0_tensor_shape->GetDimSize(x1_dim) == input1_tensor_shape->GetDimSize(x2_dim)),
KERNEL_STATUS_PARAM_INVALID,
"Matrix size incompatible, input[x1] dim[%d] value[%lld], "
"input[x2] dim[%d] value[%lld]",
x1_dim, input0_tensor_shape->GetDimSize(x1_dim), x2_dim, input1_tensor_shape->GetDimSize(x2_dim))
KERNEL_CHECK_FALSE((input0_tensor_dims == input1_tensor_dims), KERNEL_STATUS_PARAM_INVALID,
"input0_tensor_dims value[%d] is not equal to "
"input1_tensor_dims value[%d]",
input0_tensor_dims, input1_tensor_dims)
auto input0_shape = input0_tensor_shape->GetDimSizes();
auto input1_shape = input1_tensor_shape->GetDimSizes();
auto output_shape = output_tensor->GetTensorShape()->GetDimSizes();
for (int32_t i = 0; i < input0_tensor_dims - 2; i++) {
KERNEL_CHECK_FALSE((input0_shape[i] == input1_shape[i]), KERNEL_STATUS_PARAM_INVALID,
"input0_shape dim[%d] value[%lld] is not equal to "
"input1_shape dim[%d] value[%lld]",
i, input0_shape[i], i, input1_shape[i])
KERNEL_CHECK_FALSE((input0_shape[i] == output_shape[i]), KERNEL_STATUS_PARAM_INVALID,
"input0_shape dim[%d] value[%lld] is not equal to "
"output_shape dim[%d] value[%lld]",
i, input0_shape[i], i, output_shape[i])
}
int32_t map1_l = input0_shape[input0_tensor_dims - 2];
int32_t map1_r = input0_shape[input0_tensor_dims - 1];
int32_t map2_l = input1_shape[input1_tensor_dims - 2];
int32_t map2_r = input1_shape[input1_tensor_dims - 1];
int32_t rows_dim = adj_x ? input0_tensor_dims - 1 : input0_tensor_dims - 2;
int32_t cols_dim = adj_y ? input1_tensor_dims - 2 : input1_tensor_dims - 1;
int32_t num_rows = input0_tensor_shape->GetDimSize(rows_dim);
int32_t num_cols = input1_tensor_shape->GetDimSize(cols_dim);
uint64_t num_element = output_tensor->NumElements();
uint64_t num_batches = num_element / (num_rows * num_cols);
Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic> map1(map1_l, map1_r);
auto *input0_data = reinterpret_cast<T *>(input0_tensor->GetData());
Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic> map2(map2_l, map2_r);
auto *input1_data = reinterpret_cast<T *>(input1_tensor->GetData());
Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic> map_output(num_rows, num_cols);
auto *output_data = reinterpret_cast<T *>(output_tensor->GetData());
uint64_t input0_size = input0_tensor->GetDataSize();
if (input0_size >= kParallelDataNum) {
uint32_t min_core_num = 1;
uint32_t max_core_num = std::max(min_core_num, aicpu::CpuKernelUtils::GetCPUNum(ctx));
if (max_core_num > num_batches) {
max_core_num = num_batches;
}
Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic> map1[num_batches];
Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic> map2[num_batches];
Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic> map_output[num_batches];
auto shared_batchmatmul = [&](int64_t start, int64_t end) {
for (int64_t batch = start; batch < end; ++batch) {
Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic> matrix1(map1_l, map1_r);
map1[batch].resize(map1_l, map1_r);
for (int64_t i = 0; i < map1_l; i++) {
for (int64_t j = 0; j < map1_r; j++) {
map1[batch](i, j) = input0_data[batch * map1_l * map1_r + i * map1_r + j];
}
}
map2[batch].resize(map2_l, map2_r);
for (int64_t i = 0; i < map2_l; i++) {
for (int64_t j = 0; j < map2_r; j++) {
map2[batch](i, j) = input1_data[batch * map2_l * map2_r + i * map2_r + j];
}
}
if (adj_x) {
if (adj_y) {
map_output[batch] = map1[batch].adjoint() * map2[batch].adjoint();
} else {
map_output[batch] = map1[batch].adjoint() * map2[batch];
}
} else {
if (adj_y) {
map_output[batch] = map1[batch] * map2[batch].adjoint();
} else {
map_output[batch] = map1[batch] * map2[batch];
}
}
map_output[batch].resize(num_rows, num_cols);
for (int64_t i = 0; i < num_rows; ++i) {
for (int64_t j = 0; j < num_cols; ++j) {
output_data[batch * num_rows * num_cols + i * num_cols + j] = map_output[batch](i, j);
}
}
}
};
KERNEL_HANDLE_ERROR(CpuKernelUtils::ParallelFor(ctx, num_batches, num_batches / max_core_num, shared_batchmatmul),
"BatchMatMul Compute failed.");
} else {
for (uint64_t batch = 0; batch < num_batches; ++batch) {
for (int64_t i = 0; i < map1_l; i++) {
for (int64_t j = 0; j < map1_r; j++) {
map1(i, j) = input0_data[batch * map1_l * map1_r + i * map1_r + j];
}
}
for (int64_t i = 0; i < map2_l; i++) {
for (int64_t j = 0; j < map2_r; j++) {
map2(i, j) = input1_data[batch * map2_l * map2_r + i * map2_r + j];
}
}
if (adj_x) {
if (adj_y) {
map_output = map1.adjoint() * map2.adjoint();
} else {
map_output = map1.adjoint() * map2;
}
} else {
if (adj_y) {
map_output = map1 * map2.adjoint();
} else {
map_output = map1 * map2;
}
}
for (int64_t i = 0; i < num_rows; ++i) {
for (int64_t j = 0; j < num_cols; ++j) {
output_data[batch * num_rows * num_cols + i * num_cols + j] = map_output(i, j);
}
}
}
}
return KERNEL_STATUS_OK;
}
uint32_t BatchMatMulCpuKernel::Compute(CpuKernelContext &ctx) {
KERNEL_HANDLE_ERROR(NormalCheck(ctx, kInputNum, kOutputNum), "Check BatchMatMul params failed.");
DataType input0_data_type = ctx.Input(0)->GetDataType();
KERNEL_LOG_DEBUG("%s op input[x1] data type is [%s].", kBatchMatmul, DTypeStr(input0_data_type).c_str());
uint32_t ret = KERNEL_STATUS_OK;
switch (input0_data_type) {
case DT_FLOAT:
ret = DoCompute<float>(ctx);
break;
case DT_DOUBLE:
ret = DoCompute<double>(ctx);
break;
case DT_FLOAT16:
ret = DoCompute<Eigen::half>(ctx);
break;
case DT_INT16:
ret = DoCompute<int16_t>(ctx);
break;
case DT_INT32:
ret = DoCompute<int32_t>(ctx);
break;
case DT_INT64:
ret = DoCompute<int64_t>(ctx);
break;
case DT_UINT16:
ret = DoCompute<uint16_t>(ctx);
break;
case DT_UINT32:
ret = DoCompute<uint32_t>(ctx);
break;
case DT_UINT64:
ret = DoCompute<uint64_t>(ctx);
break;
case DT_COMPLEX64:
ret = DoCompute<std::complex<std::float_t>>(ctx);
break;
case DT_COMPLEX128:
ret = DoCompute<std::complex<std::double_t>>(ctx);
break;
default:
KERNEL_LOG_ERROR("Unsupported input[x1] data type[%s]", DTypeStr(input0_data_type).c_str());
ret = KERNEL_STATUS_PARAM_INVALID;
}
return ret;
}
REGISTER_CPU_KERNEL(kBatchMatmul, BatchMatMulCpuKernel);
} // namespace aicpu

View File

@ -0,0 +1,34 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef AICPU_KERNELS_HOST_BATCHMATMUL_H_
#define AICPU_KERNELS_HOST_BATCHMATMUL_H_
#include "cpu_ops_kernel.h"
namespace aicpu {
class BatchMatMulCpuKernel : public CpuKernel {
public:
BatchMatMulCpuKernel() = default;
~BatchMatMulCpuKernel() = default;
uint32_t Compute(CpuKernelContext &ctx) override;
private:
template <typename T>
uint32_t DoCompute(CpuKernelContext &ctx);
};
} // namespace aicpu
#endif

View File

@ -0,0 +1,266 @@
/**
* Copyright 2022 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "instance_norm_v2_grad.h"
#include "Eigen/Dense"
#include "cpu_kernel_utils.h"
#include "cpu_types.h"
#include "utils/kernel_util.h"
#include "kernel_log.h"
#include "securec.h"
#include "status.h"
#include "utils/eigen_tensor.h"
#include <vector>
#include <map>
#include <iostream>
namespace {
const char *const kInstanceNormV2Grad = "InstanceNormV2Grad";
constexpr uint32_t kOutputNum = 3;
constexpr uint32_t kInputNum = 7;
constexpr int64_t int64_init_one = 1;
constexpr int64_t kGrainSize = 4 * 1024;
constexpr float float_init_zero = 0.0;
constexpr float float_init_one = 1.0;
constexpr double double_init_zero = 0.0;
constexpr auto kDim3 = 3;
constexpr auto kDim4 = 4;
constexpr auto kDim5 = 5;
constexpr auto InstanceNormV2GradInDyIndex = 0;
constexpr auto InstanceNormV2GradInXIndex = 1;
constexpr auto InstanceNormV2GradInGammaIndex = 2;
constexpr auto InstanceNormV2GradInMeanIndex = 3;
constexpr auto InstanceNormV2GradInVarianceIndex = 4;
constexpr auto InstanceNormV2GradInSaveMeanIndex = 5;
constexpr auto InstanceNormV2GradInSaveVarianceIndex = 6;
constexpr auto InstanceNormV2GradOutDxIndex = 0;
constexpr auto InstanceNormV2GradOutPdGammaIndex = 1;
constexpr auto InstanceNormV2GradOutPdBetaIndex = 2;
inline double FloatToDouble(float v) { return static_cast<double>(v); }
inline double LongToDouble(int64_t v) { return static_cast<double>(v); }
} // namespace
namespace aicpu {
uint32_t InstanceNormV2GradCpuKernel::InstanceNormV2GradTypeCheck(const CpuKernelContext &ctx) {
auto dy_type = ctx.Input(InstanceNormV2GradInDyIndex)->GetDataType();
auto x_type = ctx.Input(InstanceNormV2GradInXIndex)->GetDataType();
auto gamma_type = ctx.Input(InstanceNormV2GradInGammaIndex)->GetDataType();
auto mean_type = ctx.Input(InstanceNormV2GradInMeanIndex)->GetDataType();
auto variance_type = ctx.Input(InstanceNormV2GradInVarianceIndex)->GetDataType();
auto save_mean_type = ctx.Input(InstanceNormV2GradInSaveMeanIndex)->GetDataType();
auto save_variance_type = ctx.Input(InstanceNormV2GradInSaveVarianceIndex)->GetDataType();
if (dy_type != x_type) {
KERNEL_LOG_ERROR(
"For primitive[%s]'s input arguments dy and x should have the same "
"data type, but dy type is [%s], x type is [%s].",
kInstanceNormV2Grad, DTypeStr(dy_type).c_str(), DTypeStr(x_type).c_str());
return KERNEL_STATUS_PARAM_INVALID;
}
const std::map<std::string, DataType> types = {{"gamma", gamma_type},
{"mean", mean_type},
{"variance", variance_type},
{"save_mean", save_mean_type},
{"save_variance", save_variance_type}};
return CheckTensorTypeSame(types, DT_FLOAT, kInstanceNormV2Grad);
}
uint32_t InstanceNormV2GradCpuKernel::InstanceNormV2GradShapeCheck(const CpuKernelContext &ctx) {
auto dy_shape_ptr = ctx.Input(InstanceNormV2GradInDyIndex)->GetTensorShape();
auto x_shape_ptr = ctx.Input(InstanceNormV2GradInXIndex)->GetTensorShape();
auto gamma_shape_ptr = ctx.Input(InstanceNormV2GradInGammaIndex)->GetTensorShape();
auto mean_shape_ptr = ctx.Input(InstanceNormV2GradInMeanIndex)->GetTensorShape();
auto variance_shape_ptr = ctx.Input(InstanceNormV2GradInVarianceIndex)->GetTensorShape();
auto save_mean_shape_ptr = ctx.Input(InstanceNormV2GradInSaveMeanIndex)->GetTensorShape();
auto save_variance_shape_ptr = ctx.Input(InstanceNormV2GradInSaveVarianceIndex)->GetTensorShape();
auto pd_gamma_shape_ptr = ctx.Output(InstanceNormV2GradOutPdGammaIndex)->GetTensorShape();
auto pd_beta_shape_ptr = ctx.Output(InstanceNormV2GradOutPdBetaIndex)->GetTensorShape();
auto dy_shape = dy_shape_ptr->GetDimSizes();
auto res = CheckTensorShapeSame({{"input x", x_shape_ptr}}, dy_shape, kInstanceNormV2Grad);
if (res != KERNEL_STATUS_OK) {
return res;
};
auto x_format = x_shape_ptr->GetFormat();
std::vector<int64_t> check_shape;
check_shape = dy_shape;
int64_t image_size = 0;
if (dy_shape.size() == kDim4) {
if (x_format == FORMAT_NCHW || x_format == FORMAT_ND) {
// consider (N, C, H, W) as (N*C, H, W, 1), similar to (N, H, W, C)
check_shape[kFormatNCHWIndexH] = int64_init_one;
check_shape[kFormatNCHWIndexW] = int64_init_one;
image_size = dy_shape[kFormatNCHWIndexH] * dy_shape[kFormatNCHWIndexW];
instance_num = dy_shape[kFormatNCHWIndexN] * dy_shape[kFormatNCHWIndexC];
constexpr int64_t kNumberOne = 1;
dy_shape_4d_ = {dy_shape[kFormatNCHWIndexN] * dy_shape[kFormatNCHWIndexC], dy_shape[kFormatNCHWIndexH],
dy_shape[kFormatNCHWIndexW], kNumberOne};
batch_channels_2d_ = {dy_shape[kFormatNCHWIndexN] * dy_shape[kFormatNCHWIndexC], kNumberOne};
} else if (x_format == FORMAT_NHWC) {
check_shape[kFormatNHWCIndexH] = int64_init_one;
check_shape[kFormatNHWCIndexW] = int64_init_one;
image_size = dy_shape[kFormatNHWCIndexH] * dy_shape[kFormatNHWCIndexW];
instance_num = dy_shape[kFormatNHWCIndexN] * dy_shape[kFormatNHWCIndexC];
dy_shape_4d_ = dy_shape;
batch_channels_2d_ = {dy_shape[kFormatNHWCIndexN], dy_shape[kFormatNHWCIndexC]};
}
} else if (x_format == FORMAT_NC1HWC0 || dy_shape.size() == kDim5) {
// consider (N, C1, H, W, C0) as (N*C1, H, W, C0), similar to (N, H, W, C)
check_shape[kFormatNC1HWC0IndexH] = int64_init_one;
check_shape[kFormatNC1HWC0IndexW] = int64_init_one;
image_size = dy_shape[kFormatNC1HWC0IndexH] * dy_shape[kFormatNC1HWC0IndexW];
instance_num = dy_shape[kFormatNC1HWC0IndexN] * dy_shape[kFormatNC1HWC0IndexC1] * dy_shape[kFormatNC1HWC0IndexC0];
dy_shape_4d_ = {dy_shape[kFormatNC1HWC0IndexN] * dy_shape[kFormatNC1HWC0IndexC1], dy_shape[kFormatNC1HWC0IndexH],
dy_shape[kFormatNC1HWC0IndexW], dy_shape[kFormatNC1HWC0IndexC0]};
batch_channels_2d_ = {dy_shape[kFormatNC1HWC0IndexN] * dy_shape[kFormatNC1HWC0IndexC1],
dy_shape[kFormatNC1HWC0IndexC0]};
} else {
KERNEL_LOG_ERROR(
"For primitive[%s]'s input arguments dy and x only "
"support NHWC, NCHW and NC1HWC0, but get data format [%s]",
kInstanceNormV2Grad, FormatToSerialString(x_format).c_str());
return KERNEL_STATUS_PARAM_INVALID;
}
constexpr int64_t image_min = 1;
if (image_size <= image_min) {
KERNEL_LOG_ERROR(
"For primitive[%s], expected more than 1 value per instance, but get "
"[%ld] value per instance.",
kInstanceNormV2Grad, image_size);
return KERNEL_STATUS_PARAM_INVALID;
}
const std::map<std::string, TensorShapePtr> shapes = {{"gamma", gamma_shape_ptr},
{"mean", mean_shape_ptr},
{"variance", variance_shape_ptr},
{"save_mean", save_mean_shape_ptr},
{"save_variance", save_variance_shape_ptr},
{"pd_gamma", pd_gamma_shape_ptr},
{"pd_beta", pd_beta_shape_ptr}};
return CheckTensorShapeSame(shapes, check_shape, kInstanceNormV2Grad);
}
uint32_t InstanceNormV2GradCpuKernel::InstanceNormV2GradAttrCheck(const CpuKernelContext &ctx) {
constexpr float epsilon_min = 0.0;
constexpr float epsilon_max = 1.0;
auto epsilon_ptr = ctx.GetAttr("epsilon");
if (epsilon_ptr) {
epsilon_ = epsilon_ptr->GetFloat();
}
if (epsilon_ < epsilon_min || epsilon_ >= epsilon_max) {
KERNEL_LOG_ERROR(
"For primitive[%s], attr epsilon value should be in [0, 1), but get "
"[%f].",
kInstanceNormV2Grad, epsilon_);
return KERNEL_STATUS_PARAM_INVALID;
}
auto is_training_ptr = ctx.GetAttr("is_training");
if (is_training_ptr) {
is_training_ = is_training_ptr->GetBool();
}
return KERNEL_STATUS_OK;
}
uint32_t InstanceNormV2GradCpuKernel::InstanceNormV2GradParamCheck(const CpuKernelContext &ctx) {
KERNEL_HANDLE_ERROR(InstanceNormV2GradTypeCheck(ctx), "InstanceNormV2Grad check type failed.");
KERNEL_HANDLE_ERROR(InstanceNormV2GradShapeCheck(ctx), "InstanceNormV2Grad check shape failed.");
KERNEL_HANDLE_ERROR(InstanceNormV2GradAttrCheck(ctx), "InstanceNormV2Grad check attr failed.");
return KERNEL_STATUS_OK;
}
template <typename T>
uint32_t InstanceNormV2GradCpuKernel::DoCompute(CpuKernelContext &ctx) {
const int64_t batch = dy_shape_4d_[kFormatNHWCIndexN];
const int64_t channel = dy_shape_4d_[kFormatNHWCIndexC];
const int64_t image_size = dy_shape_4d_[kFormatNHWCIndexH] * dy_shape_4d_[kFormatNHWCIndexW];
std::vector<int64_t> dy_shape_3d_ = {batch, image_size, channel};
auto dy_3d = EigenTensor(dy_shape_3d_, ctx.Input(InstanceNormV2GradInDyIndex)->GetData()).tensor<T, kDim3>();
auto in_x_3d = EigenTensor(dy_shape_3d_, ctx.Input(InstanceNormV2GradInXIndex)->GetData()).tensor<T, kDim3>();
auto weight_matrix =
EigenTensor(batch_channels_2d_, ctx.Input(InstanceNormV2GradInGammaIndex)->GetData()).matrix<float>();
auto running_mean_matrix =
EigenTensor(batch_channels_2d_, ctx.Input(InstanceNormV2GradInMeanIndex)->GetData()).matrix<float>();
auto running_var_matrix =
EigenTensor(batch_channels_2d_, ctx.Input(InstanceNormV2GradInVarianceIndex)->GetData()).matrix<float>();
auto save_mean_matrix =
EigenTensor(batch_channels_2d_, ctx.Input(InstanceNormV2GradInSaveMeanIndex)->GetData()).matrix<float>();
auto save_invstd_matrix =
EigenTensor(batch_channels_2d_, ctx.Input(InstanceNormV2GradInSaveVarianceIndex)->GetData()).matrix<float>();
auto dx_3d = EigenTensor(dy_shape_3d_, ctx.Output(InstanceNormV2GradOutDxIndex)->GetData()).tensor<T, kDim3>();
auto grad_weight_matrix =
EigenTensor(batch_channels_2d_, ctx.Output(InstanceNormV2GradOutPdGammaIndex)->GetData()).matrix<float>();
auto grad_bias_matrix =
EigenTensor(batch_channels_2d_, ctx.Output(InstanceNormV2GradOutPdBetaIndex)->GetData()).matrix<float>();
auto loop_batch = [&](int64_t begin, int64_t end) {
for (int64_t idx = begin; idx < end; ++idx) {
for (int64_t c_idx = 0; c_idx < channel; ++c_idx) {
float w = weight_matrix(idx, c_idx);
float mean = float_init_zero, invstd = float_init_zero;
mean = is_training_ ? save_mean_matrix(idx, c_idx) : running_mean_matrix(idx, c_idx);
float _invstd_ = std::sqrt(running_var_matrix(idx, c_idx) + epsilon_);
invstd = is_training_ ? save_invstd_matrix(idx, c_idx) : float_init_one / _invstd_;
double sum = double_init_zero, dotp = double_init_zero;
for (int64_t img_idx = 0; img_idx < image_size; ++img_idx) {
sum += static_cast<double>(dy_3d(idx, img_idx, c_idx));
dotp += (static_cast<double>(in_x_3d(idx, img_idx, c_idx)) - FloatToDouble(mean)) *
static_cast<double>(dy_3d(idx, img_idx, c_idx));
}
float k = static_cast<float>(dotp * FloatToDouble(invstd) * FloatToDouble(invstd) / LongToDouble(image_size));
float grad_mean = static_cast<float>(sum / LongToDouble(image_size));
for (int64_t img_idx = 0; img_idx < image_size; ++img_idx) {
float _dx_ = (static_cast<float>(in_x_3d(idx, img_idx, c_idx)) - mean) * k;
dx_3d(idx, img_idx, c_idx) =
is_training_
? static_cast<T>((static_cast<float>(dy_3d(idx, img_idx, c_idx)) - grad_mean - _dx_) * invstd * w)
: static_cast<T>(static_cast<float>(dy_3d(idx, img_idx, c_idx)) * invstd * w);
}
grad_weight_matrix(idx, c_idx) = static_cast<float>(dotp * FloatToDouble(invstd));
grad_bias_matrix(idx, c_idx) = static_cast<float>(sum);
}
}
};
int64_t block_size = std::max(int64_init_one, (kGrainSize / (channel * image_size)));
return CpuKernelUtils::ParallelFor(ctx, batch, block_size, loop_batch);
}
uint32_t InstanceNormV2GradCpuKernel::Compute(CpuKernelContext &ctx) {
// check params
KERNEL_HANDLE_ERROR(NormalCheck(ctx, kInputNum, kOutputNum),
"InstanceNormV2Grad check input and output number failed.");
KERNEL_HANDLE_ERROR(InstanceNormV2GradParamCheck(ctx), "InstanceNormV2Grad check params failed.");
auto data_type = ctx.Input(0)->GetDataType();
uint32_t result;
switch (data_type) {
case (DT_FLOAT16):
result = DoCompute<Eigen::half>(ctx);
break;
case (DT_FLOAT):
result = DoCompute<float>(ctx);
break;
default:
KERNEL_LOG_ERROR("InstanceNormV2Grad kernel data type [%s] not support.", DTypeStr(data_type).c_str());
return KERNEL_STATUS_PARAM_INVALID;
}
if (result != KERNEL_STATUS_OK) {
KERNEL_LOG_ERROR("InstanceNormV2Grad kernel compute failed.");
}
return result;
}
REGISTER_CPU_KERNEL(kInstanceNormV2Grad, InstanceNormV2GradCpuKernel);
} // namespace aicpu

View File

@ -0,0 +1,46 @@
/**
* Copyright 2022 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef AICPU_KERNELS_NORMALIZED_INSTANCE_NORM_V2_GRAD_H_
#define AICPU_KERNELS_NORMALIZED_INSTANCE_NORM_V2_GRAD_H_
#include "cpu_ops_kernel.h"
#include <vector>
namespace aicpu {
class InstanceNormV2GradCpuKernel : public CpuKernel {
public:
InstanceNormV2GradCpuKernel() = default;
uint32_t Compute(CpuKernelContext &ctx) override;
private:
uint32_t InstanceNormV2GradParamCheck(const CpuKernelContext &ctx);
uint32_t InstanceNormV2GradShapeCheck(const CpuKernelContext &ctx);
uint32_t InstanceNormV2GradTypeCheck(const CpuKernelContext &ctx);
uint32_t InstanceNormV2GradAttrCheck(const CpuKernelContext &ctx);
template <typename T>
uint32_t DoCompute(CpuKernelContext &ctx);
bool is_training_ = true;
float epsilon_ = 0.00001;
std::vector<int64_t> dy_shape_4d_;
std::vector<int64_t> batch_channels_2d_;
int64_t instance_num = 0;
};
} // namespace aicpu
#endif

View File

@ -0,0 +1,299 @@
/**
* Copyright (c) Huawei Technologies Co., Ltd. 2021-2021. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "sparse_to_dense.h"
#include <securec.h>
#include "cpu_types.h"
#include "kernel_log.h"
#include "status.h"
#include "unsupported/Eigen/CXX11/Tensor"
#include "cpu_kernel_utils.h"
#include "utils/eigen_tensor.h"
#include "utils/kernel_util.h"
namespace aicpu {
const char *const SPARSETODENSE = "SparseToDense";
constexpr int64_t kParallelDateSize = 16 * 1024;
constexpr int64_t kCopyDataSize = 1024;
constexpr uint32_t kInput0 = 0;
constexpr uint32_t kInput1 = 1;
constexpr uint32_t kInput2 = 2;
constexpr uint32_t kInput3 = 3;
constexpr uint32_t kOutput0 = 0;
constexpr int32_t kRank = 2;
} // namespace aicpu
namespace aicpu {
uint32_t SparseToDenseCpuKernel::SparseToDense(const CpuKernelContext &ctx, SparseTensor &st, const Tensor *indices,
Tensor *output) {
KERNEL_LOG_INFO("Start to execute SparseToDense");
if (indices == nullptr || output == nullptr) {
KERNEL_LOG_ERROR("Indices or output tensor is nullptr.");
return static_cast<uint32_t>(KERNEL_STATUS_PARAM_INVALID);
}
DataType dt = static_cast<DataType>(output->GetDataType());
switch (dt) {
case DT_INT8:
return EigenSparseToDense<int8_t>(ctx, st, indices, output);
case DT_UINT8:
return EigenSparseToDense<uint8_t>(ctx, st, indices, output);
case DT_INT16:
return EigenSparseToDense<int16_t>(ctx, st, indices, output);
case DT_UINT16:
return EigenSparseToDense<uint16_t>(ctx, st, indices, output);
case DT_INT32:
return EigenSparseToDense<int32_t>(ctx, st, indices, output);
case DT_INT64:
return EigenSparseToDense<int64_t>(ctx, st, indices, output);
case DT_FLOAT16:
return EigenSparseToDense<Eigen::half>(ctx, st, indices, output);
case DT_FLOAT:
return EigenSparseToDense<float>(ctx, st, indices, output);
case DT_BOOL:
return EigenSparseToDense<bool>(ctx, st, indices, output);
case DT_DOUBLE:
return EigenSparseToDense<double>(ctx, st, indices, output);
default:
KERNEL_LOG_ERROR("Sparse to dense can't support this data type [%d].", static_cast<int32_t>(dt));
return static_cast<uint32_t>(KERNEL_STATUS_PARAM_INVALID);
}
}
KernelStatus SparseToDenseCpuKernel::ValidParam(const CpuKernelContext &ctx) {
KERNEL_LOG_INFO("Start to execute ValidParam");
// valid input and output nullptr
Tensor *indices_tensor = ctx.Input(0);
Tensor *shape_tensor = ctx.Input(1);
Tensor *sparse_values = ctx.Input(2);
Tensor *default_value_tensor = ctx.Input(3);
Tensor *output_tensor = ctx.Output(0);
bool validNull = ((output_tensor == nullptr) || default_value_tensor == nullptr || (sparse_values == nullptr) ||
(indices_tensor == nullptr) || (shape_tensor == nullptr));
if (validNull) {
KERNEL_LOG_ERROR("Got input or output param is nullptr.");
return KERNEL_STATUS_PARAM_INVALID;
}
// valid shape nullptr
auto output_shape = shape_tensor->GetTensorShape();
auto values_shape = sparse_values->GetTensorShape();
auto default_value_shape = default_value_tensor->GetTensorShape();
auto indices_shape = indices_tensor->GetTensorShape();
bool validShapeNull = ((default_value_shape == nullptr) || values_shape == nullptr || (output_shape == nullptr) ||
(indices_shape == nullptr));
if (validShapeNull) {
KERNEL_LOG_ERROR("Got input shape is nullptr.");
return KERNEL_STATUS_PARAM_INVALID;
}
// sparse_indices
if (indices_shape->GetDims() > kRank) {
KERNEL_LOG_ERROR(
"Sparse_indices should be a scalar, vector, or matrix, got dim "
"size [%d].",
indices_shape->GetDims());
return KERNEL_STATUS_PARAM_INVALID;
}
const int64_t elems_num = indices_shape->GetDims() > 0 ? indices_shape->GetDimSize(0) : 1;
const int64_t dims_num = indices_shape->GetDims() > 1 ? indices_shape->GetDimSize(1) : 1;
// output_shape
if (output_shape->GetDims() != 1) {
KERNEL_LOG_ERROR("Output_shape should be a vector, and got dim size [%d].", output_shape->GetDims());
return KERNEL_STATUS_PARAM_INVALID;
}
if (shape_tensor->NumElements() != dims_num) {
KERNEL_LOG_ERROR("Output_shape has incorrect number of elements [%lld], should be [%lld]",
shape_tensor->NumElements(), dims_num);
return KERNEL_STATUS_PARAM_INVALID;
}
// valid data type
DataType IndiceType = indices_tensor->GetDataType();
DataType outShapeType = shape_tensor->GetDataType();
bool validIndiceType = ((IndiceType != DT_INT32) && (IndiceType != DT_INT64));
bool validShapeType = ((outShapeType != DT_INT32) && (outShapeType != DT_INT64));
if (validShapeType || validIndiceType) {
KERNEL_LOG_ERROR(
"Valid indice or output shape data type failed, indiceType [%d], "
"shapeType [%d].",
static_cast<int>(IndiceType), static_cast<int>(outShapeType));
return KERNEL_STATUS_PARAM_INVALID;
}
// sparse_values
int32_t values_dims_size = values_shape->GetDims();
if ((values_dims_size != 0) && (values_dims_size != 1)) {
KERNEL_LOG_ERROR("Values_shape should be a scalar or a vector, got dim size [%d].", values_shape->GetDims());
return KERNEL_STATUS_PARAM_INVALID;
}
if ((values_dims_size == 1) && (sparse_values->NumElements() != elems_num)) {
KERNEL_LOG_ERROR("Values_shape has incorrect number of elements [%lld], should be [%lld]",
sparse_values->NumElements(), elems_num);
return KERNEL_STATUS_PARAM_INVALID;
}
// default_value
if (default_value_shape->GetDims() != 0) {
KERNEL_LOG_ERROR("Default_value should be a scalar, and got dim size [%d].", default_value_shape->GetDims());
return KERNEL_STATUS_PARAM_INVALID;
}
KERNEL_LOG_INFO("Execute ValidParam end.");
return KERNEL_STATUS_OK;
}
uint32_t SparseToDenseCpuKernel::ParallelSetDefaultValue(const CpuKernelContext &ctx,
const Tensor *default_value_tensor,
const Tensor *output_tensor, int64_t output_size) {
auto type_size = GetSizeByDataType(static_cast<DataType>(output_tensor->GetDataType()));
char *default_value_addr = reinterpret_cast<char *>(default_value_tensor->GetData());
char *output_addr = reinterpret_cast<char *>(output_tensor->GetData());
uint32_t min_core_num = 1;
int64_t max_core_num = std::max(min_core_num, aicpu::CpuKernelUtils::GetCPUNum(ctx) - kResvCpuNum);
auto default_value = [&](std::int64_t begin, std::int64_t end) {
int64_t total = end - begin;
int64_t remainder = total % kCopyDataSize;
int64_t piece = total / kCopyDataSize;
if (piece == 0) {
for (int64_t index = begin; index < end; index++) {
(void)memcpy_s(output_addr + (index * type_size), static_cast<size_t>(type_size), default_value_addr,
static_cast<size_t>(type_size));
}
} else {
for (int64_t index = begin; index < begin + kCopyDataSize; index++) {
(void)memcpy_s(output_addr + (index * type_size), static_cast<size_t>(type_size), default_value_addr,
static_cast<size_t>(type_size));
}
char *temp_addr = output_addr + (begin * type_size);
size_t data_size = static_cast<size_t>(type_size * kCopyDataSize);
for (int64_t loop = 1; loop < piece; loop++) {
(void)memcpy_s(temp_addr + (loop * type_size * kCopyDataSize), data_size, temp_addr, data_size);
}
char *temp_addr1 = output_addr + (begin * type_size) + (piece * type_size * kCopyDataSize);
for (int64_t loop1 = 0; loop1 < remainder; loop1++) {
(void)memcpy_s(temp_addr1 + (loop1 * type_size), static_cast<size_t>(type_size), default_value_addr,
static_cast<size_t>(type_size));
}
}
};
return CpuKernelUtils::ParallelFor(ctx, output_size, output_size / max_core_num, default_value);
}
uint32_t SparseToDenseCpuKernel::SetDefaultValue(const CpuKernelContext &ctx, const Tensor *default_value_tensor,
const Tensor *output_tensor, int64_t output_size) {
auto type_size = GetSizeByDataType(static_cast<DataType>(output_tensor->GetDataType()));
if (type_size < 1) {
KERNEL_LOG_ERROR("Don't support output tensor types");
return KERNEL_STATUS_PARAM_INVALID;
}
char *default_value_addr = reinterpret_cast<char *>(default_value_tensor->GetData());
char *output_addr = reinterpret_cast<char *>(output_tensor->GetData());
if (output_size < kParallelDateSize) {
int64_t remainder = output_size % kCopyDataSize;
int64_t piece = output_size / kCopyDataSize;
if (piece == 0) {
for (int index = 0; index < output_size; index++) {
(void)memcpy_s(output_addr + (index * type_size), static_cast<size_t>(type_size), default_value_addr,
static_cast<size_t>(type_size));
}
} else {
for (int index = 0; index < kCopyDataSize; index++) {
(void)memcpy_s(output_addr + (index * type_size), static_cast<size_t>(type_size), default_value_addr,
static_cast<size_t>(type_size));
}
size_t data_size = static_cast<size_t>(type_size * kCopyDataSize);
for (int loop = 1; loop < piece; loop++) {
(void)memcpy_s(output_addr + (loop * type_size * kCopyDataSize), data_size, output_addr, data_size);
}
char *temp_addr = output_addr + (piece * type_size * kCopyDataSize);
for (int loop1 = 0; loop1 < remainder; loop1++) {
(void)memcpy_s(temp_addr + (loop1 * type_size), static_cast<size_t>(type_size), default_value_addr,
static_cast<size_t>(type_size));
}
}
return KERNEL_STATUS_OK;
} else {
return ParallelSetDefaultValue(ctx, default_value_tensor, output_tensor, output_size);
}
}
uint32_t SparseToDenseCpuKernel::Compute(CpuKernelContext &ctx) {
if (ValidParam(ctx) != KERNEL_STATUS_OK) {
KERNEL_LOG_ERROR("Valid sparse to dense param error.");
return static_cast<uint32_t>(KERNEL_STATUS_PARAM_INVALID);
}
Tensor *indices_tensor = ctx.Input(kInput0);
KERNEL_CHECK_NULLPTR(indices_tensor, static_cast<uint32_t>(KERNEL_STATUS_PARAM_INVALID), "Indices_tensor is null")
Tensor *shape_tensor = ctx.Input(kInput1);
KERNEL_CHECK_NULLPTR(shape_tensor, static_cast<uint32_t>(KERNEL_STATUS_PARAM_INVALID), "Shape_tensor is null")
Tensor *sparse_values = ctx.Input(kInput2);
KERNEL_CHECK_NULLPTR(sparse_values, static_cast<uint32_t>(KERNEL_STATUS_PARAM_INVALID), "Sparse_values is null")
Tensor *default_value_tensor = ctx.Input(kInput3);
KERNEL_CHECK_NULLPTR(default_value_tensor, static_cast<uint32_t>(KERNEL_STATUS_PARAM_INVALID),
"Default_value_tensor is null")
Tensor *output_tensor = ctx.Output(kOutput0);
KERNEL_CHECK_NULLPTR(output_tensor, static_cast<uint32_t>(KERNEL_STATUS_PARAM_INVALID), "Output_tensor is null")
auto output_shape = shape_tensor->GetTensorShape();
std::vector<int64_t> dense_shape;
std::vector<int64_t> order;
int64_t output_size = 1;
size_t output_zero_dim_size = static_cast<size_t>(output_shape->GetDimSize(0));
for (size_t index = 0; index < output_zero_dim_size; ++index) {
if (shape_tensor->GetDataType() == DT_INT32) {
int32_t *temp_dim = reinterpret_cast<int32_t *>(shape_tensor->GetData());
dense_shape.emplace_back(static_cast<int64_t>(temp_dim[index]));
} else {
int64_t *temp_dim = reinterpret_cast<int64_t *>(shape_tensor->GetData());
dense_shape.emplace_back(temp_dim[index]);
}
order.push_back(dense_shape[index]);
output_size *= dense_shape[index];
}
std::iota(order.begin(), order.end(), 0);
SparseTensor st;
if (st.CreateSparseTensor(indices_tensor, sparse_values, dense_shape, order) !=
static_cast<uint32_t>(KERNEL_STATUS_OK)) {
KERNEL_LOG_ERROR("Create sparse tensor failed.");
return static_cast<uint32_t>(KERNEL_STATUS_PARAM_INVALID);
}
AttrValue *validate_indices = ctx.GetAttr("validate_indices");
if (validate_indices == nullptr) {
KERNEL_LOG_ERROR("Get attr:validate_indices failed.");
return static_cast<uint32_t>(KERNEL_STATUS_PARAM_INVALID);
}
if (validate_indices->GetBool()) {
if (st.IndicesValid(ctx) != static_cast<uint32_t>(KERNEL_STATUS_OK)) {
KERNEL_LOG_ERROR("Indices is valid.");
return static_cast<uint32_t>(KERNEL_STATUS_PARAM_INVALID);
}
}
if (SetDefaultValue(ctx, default_value_tensor, output_tensor, output_size) !=
static_cast<uint32_t>(KERNEL_STATUS_OK)) {
KERNEL_LOG_ERROR("Sparse_to_dense set default value failed.");
return static_cast<uint32_t>(KERNEL_STATUS_PARAM_INVALID);
}
if (SparseToDense(ctx, st, indices_tensor, output_tensor) != static_cast<uint32_t>(KERNEL_STATUS_OK)) {
KERNEL_LOG_ERROR("Sparse_to_dense execute failed.");
return static_cast<uint32_t>(KERNEL_STATUS_PARAM_INVALID);
}
return static_cast<uint32_t>(KERNEL_STATUS_OK);
}
REGISTER_CPU_KERNEL(SPARSETODENSE, SparseToDenseCpuKernel);
} // namespace aicpu

View File

@ -0,0 +1,92 @@
/**
* Copyright (c) Huawei Technologies Co., Ltd. 2021. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef AICPU_KERNELS_NORMALIZED_SPARSETODENSE_H_
#define AICPU_KERNELS_NORMALIZED_SPARSETODENSE_H_
#include "cpu_ops_kernel.h"
#include "utils/sparse_tensor.h"
namespace aicpu {
class SparseToDenseCpuKernel : public CpuKernel {
public:
~SparseToDenseCpuKernel() = default;
/*
* compute sparse to dense
* @param ctx: cpu kernel context
* @return uint32_t: 0->success other->failed
*/
uint32_t Compute(CpuKernelContext &ctx) override;
protected:
/*
* valid sparse to dense param
* @param st: sparse tensor
* @param indices: indices tensor
* @param output: output tensor
* @return uint32_t: 0->success other->failed
*/
template <typename ValueT>
uint32_t EigenSparseToDense(const CpuKernelContext &ctx, SparseTensor &st, const Tensor *indices, Tensor *output) {
if (indices->GetDataType() == DT_INT32) {
return st.ToDense<int32_t, ValueT>(ctx, output);
} else {
return st.ToDense<int64_t, ValueT>(ctx, output);
}
}
/*
* valid sparse to dense param
* @param st: sparse tensor
* @param indices: indices tensor
* @param output: output tensor
* @return uint32_t: 0->success other->failed
*/
uint32_t SparseToDense(const CpuKernelContext &ctx, SparseTensor &st, const Tensor *indices, Tensor *output);
/*
* valid sparse to dense param
* @param ctx: cpu kernel context
* @return uint32_t: 0->success other->failed
*/
KernelStatus ValidParam(const CpuKernelContext &ctx);
/*
* parallel set default value to dense
* @param ctx: cpu kernel context
* @param default_value_tensor: default value of dense tensor
* @param output_tensor: output tensor
* @param output_size: output tensor size
* @return uint32_t: 0->success other->failed
*/
uint32_t ParallelSetDefaultValue(const CpuKernelContext &ctx, const Tensor *default_value_tensor,
const Tensor *output_tensor, int64_t output_size);
/*
* set default value to dense
* @param ctx: cpu kernel context
* @param default_value_tensor: default value of dense tensor
* @param output_tensor: output tensor
* @param output_size: output tensor size
* @return uint32_t: 0->success other->failed
*/
uint32_t SetDefaultValue(const CpuKernelContext &ctx, const Tensor *default_value_tensor, const Tensor *output_tensor,
int64_t output_size);
};
} // namespace aicpu
#endif

View File

@ -317,7 +317,11 @@ bool AICpuLibSelectPass::Process(const AnfNodePtr &node) const {
mindspore::kMaskedSelectOpName,
mindspore::kMaskedSelectGradOpName,
mindspore::kMultiMarginLossOpName,
mindspore::kMatrixInverseOpName};
mindspore::kMatrixInverseOpName,
mindspore::kMultiMarginLossGradOpName,
mindspore::kSspaddmmOpName,
mindspore::kBatchMatMulOpName,
mindspore::kSparseToDenseOpName};
static const std::string kEnvOpSoNames = "mindspore_aicpu_kernels";
static const std::string kCpuKernelSoName = "mindspore_cpu_kernels";

View File

@ -410,3 +410,7 @@ from .trace import _trace_aicpu
from .tracegrad import _tracegrad_aicpu
from .tridiagonal_solve import _tridiagonal_solve_aicpu
from .truncated_normal import _truncated_normal_aicpu
from .glu import _glu_aicpu
from .multi_margin_loss import _multi_margin_loss_aicpu
from .multi_margin_loss_grad import _multi_margin_loss_grad_aicpu
from .sparse_to_dense_v2 import _sparse_to_dense_v2_aicpu