!47329 migrates aicpu ops from CANN pr

Merge pull request !47329 from 李林杰/1229_migrates_aicpu_from_CAAN_pr
i-robot 2023-01-03 08:31:37 +00:00 committed by Gitee
commit 76f46b52c4
No known key found for this signature in database
GPG Key ID: 173E9B9CA92EEF8F
22 changed files with 2615 additions and 10 deletions


@ -83,18 +83,10 @@
"mindspore/mindspore/lite/tools/kernel_builder/ascend/tbe_tik/sample/op_proto/matmul_tik.cc" "syntaxError"
# AICPU migration
"mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/" "useStlAlgorithm"
"mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/" "variableScope"
"mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/" "constParameter"
"mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/" "constVariable"
"mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/" "unreadVariable"
"mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/" "redundantAssignment"
"mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/" "constArgument"
"mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/utils/" "useStlAlgorithm"
"mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/utils/" "variableScope"
"mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/utils/" "constParameter"
"mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/utils/" "constVariable"
"mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/utils/" "unreadVariable"
"mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/" "nullPointerRedundantCheck"
"mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/" "variableScope"
"mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/" "unreadVariable"
@ -104,3 +96,4 @@
"mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/" "knownConditionTrueFalse"
"mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/" "passedByValue"
"mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/" "uninitvar"
"mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/" "shadowVariable"


@ -280,3 +280,5 @@ mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel
mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/multinomial.cc:aicpu::Generate
mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/matrix_set_diag_v3.cc:aicpu::MatrixSetDiagV3CpuKernel::DoCompute
mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/max_unpool_2d.cc:aicpu::MaxUnpool2DCpuKernel::MaxUnpool2DCompute
mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/matrix_solve_ls.cc:aicpu::MatrixSolveLsCpuKernel::Compute
mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/col2im.cc:aicpu::Col2imCpuKernel::Col2imParamCheck


@ -163,6 +163,7 @@ constexpr auto kClipBoxesDOpName = "kClipBoxesD";
constexpr auto kClipByNormNoDivSumOpName = "ClipByNormNoDivSum";
constexpr auto kClipByValueOpName = "ClipByValue";
constexpr auto kCoalesceOpName = "Coalesce";
constexpr auto kCol2imOpName = "Col2im";
constexpr auto kCombineMomentumOpName = "CombineMomentum";
constexpr auto kCombineMomentumWeightOpName = "CombineMomentumWeight";
constexpr auto kComputeAccidentalHitsOpName = "ComputeAccidentalHits";
@ -209,6 +210,7 @@ constexpr auto kCumSumOpName = "CumSum";
constexpr auto kDataFormatDimMapOpName = "DataFormatDimMap";
constexpr auto kCumulativeLogsumexpOpName = "CumulativeLogsumexp";
constexpr auto kCumulativeLogsumexpDOpName = "CumulativeLogsumexpD";
constexpr auto kDataFormatVecPermuteOpName = "DataFormatVecPermute";
constexpr auto kDeadNodeName = "DeadNode";
constexpr auto kDenseToDenseSetOperation = "DenseToDenseSetOperation";
constexpr auto kDenseToSparseSetOperation = "DenseToSparseSetOperation";
@ -346,6 +348,8 @@ constexpr auto kInplaceUpdateOpName = "InplaceUpdate";
constexpr auto kInplaceUpdateDOpName = "InplaceUpdateD";
constexpr auto kInstanceNorm = "InstanceNorm";
constexpr auto kInstanceNormGradOpName = "InstanceNormGrad";
constexpr auto kInstanceNormV2OpName = "InstanceNormV2";
constexpr auto kInstanceNormV2GradOpName = "InstanceNormV2Grad";
constexpr auto kInTopKOpName = "InTopK";
constexpr auto kInTopKDOpName = "InTopKD";
constexpr auto kIsInfOpName = "IsInf";
@ -403,6 +407,7 @@ constexpr auto kMatrixLogarithmOpName = "MatrixLogarithm";
constexpr auto kMatrixSetDiagOpName = "MatrixSetDiag";
constexpr auto kMatrixSetDiagDOpName = "MatrixSetDiagD";
constexpr auto kMatrixSetDiagV3OpName = "MatrixSetDiagV3";
constexpr auto kMatrixSolveLsOpName = "MatrixSolveLs";
constexpr auto kMaximumGradOpName = "MaximumGrad";
constexpr auto kMaximumOpName = "Maximum";
constexpr auto kMaxPool3DGradGradOpName = "MaxPool3DGradGrad";
@ -422,6 +427,7 @@ constexpr auto kMedianGradOpName = "MedianGrad";
constexpr auto kMemCpyAsyncOpName = "memcpy_async";
constexpr auto kMinimumGradOpName = "MinimumGrad";
constexpr auto kMinimumOpName = "Minimum";
constexpr auto kMirrorPadOpName = "MirrorPad";
constexpr auto kMomentumOpName = "Momentum";
constexpr auto kMulOpName = "Mul";
constexpr auto kMultinomialOpName = "Multinomial";
@ -439,6 +445,7 @@ constexpr auto kNonZeroOpName = "NonZero";
constexpr auto kNPUAllocFloatStatusOpName = "NPUAllocFloatStatus";
constexpr auto kNPUClearFloatStatusOpName = "NPUClearFloatStatus";
constexpr auto kNPUGetFloatStatusOpName = "NPUGetFloatStatus";
constexpr auto kNuclearNormOpName = "NuclearNorm";
constexpr auto kOneHotOpName = "OneHot";
constexpr auto kOneHotDOpName = "OneHotD";
constexpr auto kPadAndShiftOpName = "PadAndShift";
@ -472,6 +479,7 @@ constexpr auto kPullWeightOpName = "PullWeight";
constexpr auto kPushOpName = "Push";
constexpr auto kQrOpName = "Qr";
constexpr auto kPushWeightOpName = "PushWeight";
constexpr auto kQuantileOpName = "Quantile";
constexpr auto kRandomChoiceWithMaskOpName = "RandomChoiceWithMask";
constexpr auto kRandomShuffleOpName = "RandomShuffle";
constexpr auto kRangeOpName = "Range";
@ -591,6 +599,7 @@ constexpr auto kSparseSliceOpName = "SparseSlice";
constexpr auto kSparseSoftmaxCrossEntropyWithLogitsOpName = "SparseSoftmaxCrossEntropyWithLogits";
constexpr auto kSparseSparseMinimumOpName = "SparseSparseMinimum";
constexpr auto kSparseSparseMaximumOpName = "SparseSparseMaximum";
constexpr auto kSparseTensorDenseMatMulOpName = "SparseTensorDenseMatMul";
constexpr auto kSplitOpName = "Split";
constexpr auto kSplitDOpName = "SplitD";
constexpr auto kSplitVOpName = "SplitV";
@ -604,6 +613,7 @@ constexpr auto kStackDestroyOpName = "StackDestroy";
constexpr auto kStackInitOpName = "StackInit";
constexpr auto kStackOpName = "Stack";
constexpr auto kPackOpName = "Pack";
constexpr auto kSparseSegmentSqrtNOpName = "SparseSegmentSqrtN";
constexpr auto kStackPopOpName = "StackPop";
constexpr auto kStackPushOpName = "StackPush";
constexpr auto kStandardLaplaceOpName = "StandardLaplace";


@ -0,0 +1,236 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "col2im.h"
#include <vector>
#include "cpu_ops_kernel.h"
#include "cpu_kernel_utils.h"
#include "status.h"
#include "utils/eigen_tensor.h"
#include "utils/kernel_util.h"
namespace {
const uint32_t kCol2imInputNum = 2;
const uint32_t kCol2imOutputNum = 1;
constexpr uint32_t kValue0 = 0;
constexpr uint32_t kValue1 = 1;
constexpr uint32_t kValue2 = 2;
constexpr uint32_t kValue4 = 4;
constexpr uint32_t kIndex0 = 0;
constexpr uint32_t kIndex1 = 1;
constexpr uint32_t kIndex2 = 2;
constexpr uint32_t kIndex3 = 3;
const char *kCol2im = "Col2im";
} // namespace
namespace aicpu {
uint32_t Col2imCpuKernel::Compute(CpuKernelContext &ctx) {
KERNEL_HANDLE_ERROR(Col2imParamCheck(ctx), "[%s] check params failed.", kCol2im);
auto data_type = ctx.Input(0)->GetDataType();
uint32_t ret = KERNEL_STATUS_OK;
switch (data_type) {
case DT_FLOAT:
ret = Col2imCompute<float>(ctx);
break;
case DT_FLOAT16:
ret = Col2imCompute<Eigen::half>(ctx);
break;
default:
KERNEL_LOG_ERROR("Range kernel data type [%s] not support.", DTypeStr(data_type).c_str());
ret = KERNEL_STATUS_PARAM_INVALID;
break;
}
return ret;
}
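// div_rtn: integer division whose result is rounded toward negative infinity (floor division),
// matching the convention used when counting sliding-block positions below.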
template <typename T>
static inline T div_rtn(T x, T y) {
T q = x / y;
T r = x % y;
if ((r != 0) && ((r < 0) != (y < 0))) {
--q;
}
return q;
}
uint32_t Col2imCpuKernel::Col2imParamCheck(CpuKernelContext &ctx) {
KERNEL_HANDLE_ERROR(NormalCheck(ctx, kCol2imInputNum, kCol2imOutputNum), "[%s] check params failed.", kCol2im);
Tensor *input_ = ctx.Input(0);
Tensor *output_size_ = ctx.Input(1);
KERNEL_CHECK_NULLPTR(ctx.GetAttr("kernel_size"), KERNEL_STATUS_PARAM_INVALID,
"Get ctx.GetAttr(\"kernel_size\") failed.");
KERNEL_CHECK_NULLPTR(ctx.GetAttr("dilation"), KERNEL_STATUS_PARAM_INVALID, "Get ctx.GetAttr(\"dilation\") failed.");
KERNEL_CHECK_NULLPTR(ctx.GetAttr("padding"), KERNEL_STATUS_PARAM_INVALID, "Get ctx.GetAttr(\"padding\") failed.");
KERNEL_CHECK_NULLPTR(ctx.GetAttr("stride"), KERNEL_STATUS_PARAM_INVALID, "Get ctx.GetAttr(\"stride\") failed.");
std::vector<int64_t> kernel_size = ctx.GetAttr("kernel_size")->GetListInt();
std::vector<int64_t> dilation = ctx.GetAttr("dilation")->GetListInt();
std::vector<int64_t> padding = ctx.GetAttr("padding")->GetListInt();
std::vector<int64_t> stride = ctx.GetAttr("stride")->GetListInt();
auto output_size_shape = output_size_->GetTensorShape()->GetDimSizes();
KERNEL_CHECK_FALSE((output_size_shape.size() == kValue1 && output_size_->NumElements() == kValue2),
KERNEL_STATUS_PARAM_INVALID,
"Expected 1D tensor for output_size with non-zero dimensions for and "
"output_size's size equals to 2, but "
"got %dD tensor for output_size and output_size's size equals to %d.",
output_size_shape.size(), output_size_->NumElements());
KERNEL_CHECK_FALSE(kernel_size.size() == kValue2, KERNEL_STATUS_PARAM_INVALID,
"kernel_size is expected to have 2 elements, but got size %d.", kernel_size.size());
KERNEL_CHECK_FALSE(dilation.size() == kValue2, KERNEL_STATUS_PARAM_INVALID,
"dilation is expected to have 2 elements, but got size %d.", dilation.size());
KERNEL_CHECK_FALSE(padding.size() == kValue2, KERNEL_STATUS_PARAM_INVALID,
"padding is expected to have 2 elements, but got size %d.", padding.size());
KERNEL_CHECK_FALSE(stride.size() == kValue2, KERNEL_STATUS_PARAM_INVALID,
"stride is expected to have 2 elements, but got size %d.", stride.size());
int32_t *output_size_data = reinterpret_cast<int32_t *>(output_size_->GetData());
std::vector<int64_t> output_size(kValue2, kValue0);
output_size[kIndex0] = output_size_data[kIndex0];
output_size[kIndex1] = output_size_data[kIndex1];
const int64_t output_height = output_size.front();
const int64_t output_width = output_size.back();
const int64_t kernel_height = kernel_size.front();
const int64_t kernel_width = kernel_size.back();
const int64_t dilation_height = dilation.front();
const int64_t dilation_width = dilation.back();
const int64_t pad_height = padding.front();
const int64_t pad_width = padding.back();
const int64_t stride_height = stride.front();
const int64_t stride_width = stride.back();
KERNEL_CHECK_FALSE(output_width > kValue0 && output_height > kValue0, KERNEL_STATUS_PARAM_INVALID,
"output should be greater than zero, but got "
"output_height: %d output_width: %d.",
output_height, output_width);
KERNEL_CHECK_FALSE(kernel_width > kValue0 && kernel_height > kValue0, KERNEL_STATUS_PARAM_INVALID,
"kernel should be greater than zero, but got "
"kernel_height: %d kernel_width: %d.",
kernel_height, kernel_width);
KERNEL_CHECK_FALSE(dilation_width > kValue0 && dilation_height > kValue0, KERNEL_STATUS_PARAM_INVALID,
"dilation should be greater than zero, but got "
"dilation_height: %d dilation_width: %d.",
dilation_height, dilation_width);
KERNEL_CHECK_FALSE(pad_width >= kValue0 && pad_height >= kValue0, KERNEL_STATUS_PARAM_INVALID,
"padding should be greater than zero, but got pad_height: "
"%d pad_width: %d.",
pad_height, pad_width);
KERNEL_CHECK_FALSE(stride_width > kValue0 && stride_height > kValue0, KERNEL_STATUS_PARAM_INVALID,
"stride should be greater than zero, but got "
"stride_height: %d stride_width: %d.",
stride_height, stride_width);
auto input_shape = input_->GetTensorShape()->GetDimSizes();
KERNEL_CHECK_FALSE(
(input_shape.size() == kValue4 && input_shape[kIndex0] != kValue0 && input_shape[kIndex1] != kValue0 &&
input_shape[kIndex2] != kValue0 && input_shape[kIndex3] != kValue0),
KERNEL_STATUS_PARAM_INVALID,
"Expected 4D (batch mode) tensor for input with non-zero "
"batch size and non-zero dimensions for input, but got %dD input: (%d %d "
"%d %d).",
input_shape.size(), input_shape[kIndex0], input_shape[kIndex1], input_shape[kIndex2], input_shape[kIndex3]);
KERNEL_CHECK_FALSE(input_shape[kIndex2] == (kernel_width * kernel_height), KERNEL_STATUS_PARAM_INVALID,
"Expected size of input's dimension 2 to match the calculated "
"number of kernel_size, but got input_shape[2]=%d and kernel_size=(%d, "
"%d).",
input_shape[kIndex2], kernel_height, kernel_width);
auto input_length = input_shape[kIndex3];
int64_t n_blocks_height =
div_rtn<int64_t>(output_height + 2 * pad_height - dilation_height * (kernel_height - 1) - 1, stride_height) + 1;
int64_t n_blocks_width =
div_rtn<int64_t>(output_width + 2 * pad_width - dilation_width * (kernel_width - 1) - 1, stride_width) + 1;
KERNEL_CHECK_FALSE(input_length == (n_blocks_height * n_blocks_width), KERNEL_STATUS_PARAM_INVALID,
"Given output_size=(%d, %d), kernel_size=(%d, %d), dilation=(%d, %d",
"), padding=(%d, %d), stride=(%d, %d), expected size of input's "
"dimension 2 to match the calculated "
"number of sliding blocks %d * %d = %d, but got input.size(2)=%d.",
output_height, output_width, kernel_height, kernel_width, dilation_height, dilation_width,
pad_height, pad_width, stride_height, stride_width, n_blocks_height, n_blocks_width,
(n_blocks_height * n_blocks_width), input_length);
return KERNEL_STATUS_OK;
}
template <typename T>
void Col2imCpuKernel::InnerCompute(int64_t c_col, int64_t input_offset, int64_t output_offset, T *input_data,
T *output_data) {
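// Scatter one im2col column channel back into the output image: every sliding-block
// position (h_col, w_col) adds its value to the pixel (h_im, w_im) it was sampled from,
// accumulating where neighbouring blocks overlap (the inverse of im2col).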
int64_t w_offset = c_col % kernel_width;
int64_t h_offset = (c_col / kernel_width) % kernel_height;
int64_t c_im = c_col / kernel_height / kernel_width;
for (int64_t h_col = 0; h_col < height_col; ++h_col) {
int64_t h_im = h_col * stride_height - pad_height + h_offset * dilation_height;
for (int64_t w_col = 0; w_col < width_col; ++w_col) {
int64_t w_im = w_col * stride_width - pad_width + w_offset * dilation_width;
if (h_im >= 0 && h_im < output_height && w_im >= 0 && w_im < output_width) {
output_data[output_offset + (c_im * output_height + h_im) * output_width + w_im] +=
input_data[input_offset + (c_col * height_col + h_col) * width_col + w_col];
}
}
}
}
template <typename T>
uint32_t Col2imCpuKernel::Col2imCompute(CpuKernelContext &ctx) {
Tensor *input_ = ctx.Input(0);
Tensor *output_size_ = ctx.Input(1);
Tensor *output_ = ctx.Output(0);
int32_t *output_size_data = reinterpret_cast<int32_t *>(output_size_->GetData());
std::vector<int64_t> output_size(kValue2, kValue0);
output_size[kIndex0] = output_size_data[kIndex0];
output_size[kIndex1] = output_size_data[kIndex1];
std::vector<int64_t> kernel_size = ctx.GetAttr("kernel_size")->GetListInt();
std::vector<int64_t> dilation = ctx.GetAttr("dilation")->GetListInt();
std::vector<int64_t> padding = ctx.GetAttr("padding")->GetListInt();
std::vector<int64_t> stride = ctx.GetAttr("stride")->GetListInt();
output_height = output_size.front();
output_width = output_size.back();
kernel_height = kernel_size.front();
kernel_width = kernel_size.back();
dilation_height = dilation.front();
dilation_width = dilation.back();
pad_height = padding.front();
pad_width = padding.back();
stride_height = stride.front();
stride_width = stride.back();
auto input_shape = input_->GetTensorShape()->GetDimSizes();
const int64_t batch_size = input_shape[kIndex0];
const int64_t n_input_plane = input_shape[kIndex1];
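// Number of sliding-block positions along each spatial axis, i.e. the usual convolution
// output-size formula: (size + 2 * pad - (dilation * (kernel - 1) + 1)) / stride + 1.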
height_col =
(output_height + kValue2 * pad_height - (dilation_height * (kernel_height - kValue1) + kValue1)) / stride_height +
1;
width_col =
(output_width + kValue2 * pad_width - (dilation_width * (kernel_width - kValue1) + kValue1)) / stride_width + 1;
T *input_data = reinterpret_cast<T *>(input_->GetData());
T *output_data = reinterpret_cast<T *>(output_->GetData());
std::fill_n(output_data, output_->NumElements(), T(0));
channels_col = n_input_plane * kernel_height * kernel_width;
batch_input_size = n_input_plane * kernel_height * kernel_width * height_col * width_col;
batch_output_size = n_input_plane * output_height * output_width;
for (int64_t elt = 0; elt < batch_size; ++elt) {
int64_t input_offset = batch_input_size * elt;
int64_t output_offset = batch_output_size * elt;
for (int64_t c_col = 0; c_col < channels_col; ++c_col) {
InnerCompute<T>(c_col, input_offset, output_offset, input_data, output_data);
}
}
return KERNEL_STATUS_OK;
}
REGISTER_CPU_KERNEL(kCol2im, Col2imCpuKernel);
} // namespace aicpu


@ -0,0 +1,50 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef AICPU_KERNELS_NORMALIZED_COL2IM_H_
#define AICPU_KERNELS_NORMALIZED_COL2IM_H_
#include "cpu_ops_kernel.h"
#include "utils/bcast.h"
namespace aicpu {
class Col2imCpuKernel : public CpuKernel {
public:
Col2imCpuKernel() = default;
~Col2imCpuKernel() override = default;
protected:
uint32_t Compute(CpuKernelContext &ctx) override;
private:
uint32_t Col2imParamCheck(CpuKernelContext &ctx);
template <typename T>
uint32_t Col2imCompute(CpuKernelContext &ctx);
template <typename T>
void InnerCompute(int64_t c_col, int64_t input_offset, int64_t output_offset, T *input_data, T *output_data);
int64_t output_height, output_width;
int64_t kernel_height, kernel_width;
int64_t dilation_height, dilation_width;
int64_t pad_height, pad_width;
int64_t stride_height, stride_width;
int64_t height_col, width_col;
int64_t channels_col, batch_input_size, batch_output_size;
};
} // namespace aicpu
#endif


@ -0,0 +1,211 @@
/**
* Copyright (c) Huawei Technologies Co., Ltd. 2022-2022. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
**/
#include "cumulativelogsumexp.h"
#include "cmath"
#include "cpu_kernel_utils.h"
#include "utils/eigen_tensor.h"
#include "utils/kernel_util.h"
namespace {
const uint32_t KCumulativeLogsumexpInputNum = 2;
const uint32_t KCumulativeLogsumexpOutputNum = 1;
const float float16_exclusive_data = -65504e+0;
const float float_exclusive_data = -3.4028235e+38;
const double double_exclusive_data = -1.7976931348623157e+308;
const int64_t ParallelFor_size_float16 = 16 * 1024;
const int64_t ParallelFor_size_float32 = 32 * 1024;
const int64_t ParallelFor_size_double = 64 * 1024;
const char *KCumulativeLogsumexp = "CumulativeLogsumexp";
#define CUMULATIVELOGSUMEXP_COMPUTE_CASE(DTYPE, IN_TYPE, CTX) \
case (DTYPE): { \
uint32_t result = CumulativeLogsumexpCompute<IN_TYPE>(CTX); \
if (result != KERNEL_STATUS_OK) { \
KERNEL_LOG_ERROR("CumulativeLogsumexp kernel compute failed."); \
return result; \
} \
break; \
}
} // namespace
namespace aicpu {
uint32_t CumulativeLogsumexpCpuKernel::Compute(CpuKernelContext &ctx) {
// check params
KERNEL_HANDLE_ERROR(NormalCheck(ctx, KCumulativeLogsumexpInputNum, KCumulativeLogsumexpOutputNum),
"[%s] check input and output failed,", KCumulativeLogsumexp);
KERNEL_HANDLE_ERROR(CumulativeLogsumexpCheck(ctx), "[%s] check params failed.", KCumulativeLogsumexp);
auto data_type = ctx.Input(0)->GetDataType();
switch (data_type) {
CUMULATIVELOGSUMEXP_COMPUTE_CASE(DT_FLOAT16, Eigen::half, ctx)
CUMULATIVELOGSUMEXP_COMPUTE_CASE(DT_FLOAT, float, ctx)
CUMULATIVELOGSUMEXP_COMPUTE_CASE(DT_DOUBLE, double, ctx)
default:
KERNEL_LOG_ERROR("CumulativeLogsumexp kernel data type [%s] not support.", DTypeStr(data_type).c_str());
return KERNEL_STATUS_PARAM_INVALID;
}
return KERNEL_STATUS_OK;
}
uint32_t CumulativeLogsumexpCpuKernel::CumulativeLogsumexpCheck(CpuKernelContext &ctx) {
KERNEL_CHECK_FALSE((ctx.Input(1)->GetDataType() == DT_INT16 || ctx.Input(1)->GetDataType() == DT_INT32),
KERNEL_STATUS_PARAM_INVALID, "Data type of axis is not support, axis data type is [%u].",
ctx.Input(1)->GetDataType())
KERNEL_CHECK_FALSE(ctx.Input(1)->NumElements() == 1, KERNEL_STATUS_PARAM_INVALID, "axis must contain exactly one element");
auto axis_data = static_cast<int32_t *>(ctx.Input(1)->GetData());
int64_t axis = *axis_data;
KERNEL_CHECK_FALSE((axis < ctx.Input(0)->GetTensorShape()->GetDims()), KERNEL_STATUS_PARAM_INVALID,
"axis is larger than input dims - 1")
KERNEL_CHECK_FALSE((axis >= -ctx.Input(0)->GetTensorShape()->GetDims()), KERNEL_STATUS_PARAM_INVALID,
"axis is lower than -input dims")
std::vector<int64_t> shape_input = ctx.Input(0)->GetTensorShape()->GetDimSizes();
std::vector<int64_t> shape_output = ctx.Output(0)->GetTensorShape()->GetDimSizes();
KERNEL_CHECK_FALSE((shape_input.size() != 0), KERNEL_STATUS_PARAM_INVALID,
"Input must be at least rank 1, got [%zu].", shape_input.size())
KERNEL_CHECK_FALSE((shape_input.size() == shape_output.size()), KERNEL_STATUS_PARAM_INVALID,
"The output shape size should be the same as the input shape size")
DataType input0_type = ctx.Input(0)->GetDataType();
DataType output0_type = ctx.Output(0)->GetDataType();
KERNEL_CHECK_FALSE((input0_type == output0_type), KERNEL_STATUS_PARAM_INVALID,
"The data type of input0 [%s] need be same with output0 [%s] ", DTypeStr(input0_type).c_str(),
DTypeStr(output0_type).c_str())
return KERNEL_STATUS_OK;
}
template <typename t>
void CumulativeProcess(uint32_t outer, uint32_t inner, uint32_t depth, bool reverse, bool exclusive, t *input_data,
t *output_data, DataType data_type) {
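// Iterate the tensor as (outer, depth, inner) slices along the cumulation axis and keep a
// running log-sum-exp in temp, using the numerically stable identity
// log(exp(a) + exp(b)) = max(a, b) + log(1 + exp(min(a, b) - max(a, b))).
// In exclusive mode the first position is seeded with log(0), approximated by the most
// negative finite value of the element type.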
for (size_t outer_index = 0; outer_index < outer; ++outer_index) {
size_t outer_index_adj;
if (reverse) {
outer_index_adj = (outer - 1) - outer_index;
} else {
outer_index_adj = outer_index;
}
for (size_t inner_index = 0; inner_index < inner; ++inner_index) {
double one = 1;
double temp = 0;
size_t inner_index_adj;
if (reverse) {
inner_index_adj = (inner - 1) - inner_index;
} else {
inner_index_adj = inner_index;
}
for (size_t depth_index = 0; depth_index < depth; ++depth_index) {
size_t depth_index_adj;
if (reverse) {
depth_index_adj = (depth - 1) - depth_index;
} else {
depth_index_adj = depth_index;
}
size_t index = outer_index_adj;
index += inner_index_adj * depth * outer;
index += depth_index_adj * outer;
if (exclusive) {
if (depth_index == 0) {
if (data_type == DT_FLOAT16) {
output_data[index] = static_cast<t>(float16_exclusive_data);
} else if (data_type == DT_FLOAT) {
output_data[index] = static_cast<t>(float_exclusive_data);
} else {
output_data[index] = static_cast<t>(double_exclusive_data);
}
temp = static_cast<double>(input_data[index]);
} else {
output_data[index] = static_cast<t>(temp);
double a = temp;
double b, min0, max0;
b = static_cast<double>(input_data[index]);
min0 = (a < b) ? a : b;
max0 = (a > b) ? a : b;
temp = log(one + exp(min0 - max0)) + max0;
}
} else {
if (depth_index == 0) {
output_data[index] = input_data[index];
temp = static_cast<double>(input_data[index]);
} else {
double a = temp;
double b, min0, max0;
b = static_cast<double>(input_data[index]);
min0 = (a < b) ? a : b;
max0 = (a > b) ? a : b;
output_data[index] = static_cast<t>(log(one + exp(min0 - max0)) + max0);
temp = log(one + exp(min0 - max0)) + max0;
}
}
}
}
}
}
template <typename T>
uint32_t CumulativeLogsumexpCpuKernel::CumulativeLogsumexpCompute(CpuKernelContext &ctx) {
auto input_data = static_cast<T *>(ctx.Input(0)->GetData());
auto axis_data = static_cast<int32_t *>(ctx.Input(1)->GetData());
bool exclusive = false;
bool reverse = false;
AttrValue *exclusive_attr = ctx.GetAttr("exclusive");
if (exclusive_attr != nullptr) {
exclusive = exclusive_attr->GetBool();
}
AttrValue *reverse_attr = ctx.GetAttr("reverse");
if (reverse_attr != nullptr) {
reverse = reverse_attr->GetBool();
}
int32_t axis = 0;
if (axis_data != nullptr) {
axis = *axis_data;
}
auto output_data = static_cast<T *>(ctx.Output(0)->GetData());
auto shape = ctx.Input(0)->GetTensorShape();
const int64_t rank = shape->GetDims();
if (axis < 0) {
axis += shape->GetDims();
}
uint32_t inner = 1;
uint32_t outer = 1;
uint32_t depth = 1;
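// Decompose the shape around the cumulation axis: inner collects the dims before the axis,
// depth is the axis dimension itself, and outer collects the dims after it.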
for (int32_t i = 0; i < rank; ++i) {
if (i < axis) {
inner *= shape->GetDimSize(i);
} else if (i > axis) {
outer *= shape->GetDimSize(i);
} else {
depth = shape->GetDimSize(i);
}
} // end for
auto data_type = ctx.Input(0)->GetDataType();
int64_t data_num = ctx.Input(0)->NumElements();
int64_t data_size = data_num * sizeof(T);
if ((data_type == DT_FLOAT16 && data_size <= ParallelFor_size_float16) ||
(data_type == DT_FLOAT && data_size <= ParallelFor_size_float32) ||
(data_type == DT_DOUBLE && data_size <= ParallelFor_size_double)) {
CumulativeProcess<T>(outer, inner, depth, reverse, exclusive, input_data, output_data, data_type);
} else {
uint32_t min_core_num = 1;
uint32_t max_core_num = std::max(min_core_num, aicpu::CpuKernelUtils::GetCPUNum(ctx) - 2);
if (max_core_num > outer) {
max_core_num = outer;
}
auto shard_cumulativelogsumexp = [&](size_t start, size_t end) {
CumulativeProcess<T>(outer, inner, depth, reverse, exclusive, input_data, output_data, data_type);
};
if (max_core_num == 0) {
return KERNEL_STATUS_PARAM_INVALID;
}
KERNEL_HANDLE_ERROR(CpuKernelUtils::ParallelFor(ctx, outer, outer / max_core_num, shard_cumulativelogsumexp),
"CumulativeLogsumexp Compute failed.");
} // end else
return KERNEL_STATUS_OK;
}
REGISTER_CPU_KERNEL(KCumulativeLogsumexp, CumulativeLogsumexpCpuKernel);
} // namespace aicpu


@ -0,0 +1,38 @@
/**
* Copyright (c) Huawei Technologies Co., Ltd. 2022-2022. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef AICPU_KERNELS_NORMALIZED_CUMULATIVELOGSUMEXP_H_
#define AICPU_KERNELS_NORMALIZED_CUMULATIVELOGSUMEXP_H_
#include "cpu_ops_kernel.h"
namespace aicpu {
class CumulativeLogsumexpCpuKernel : public CpuKernel {
public:
CumulativeLogsumexpCpuKernel() = default;
~CumulativeLogsumexpCpuKernel() override = default;
protected:
uint32_t Compute(CpuKernelContext &ctx) override;
private:
uint32_t CumulativeLogsumexpCheck(CpuKernelContext &ctx);
template <typename T>
uint32_t CumulativeLogsumexpCompute(CpuKernelContext &ctx);
};
} // namespace aicpu
#endif


@ -0,0 +1,126 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "data_format_vec_permute.h"
#include <string>
#include "cpu_kernel_utils.h"
#include "cpu_types.h"
#include "kernel_log.h"
#include "status.h"
#include "utils/kernel_util.h"
using namespace std;
namespace {
const uint32_t kOutputNum = 1;
const uint32_t kInputNum = 1;
const char *kDataFormatVecPermute = "DataFormatVecPermute";
#define DATAFORMATVECPERMUTE_COMPUTE_CASE(DTYPE, TYPE, DIM, SRC_FORMAT_STR, DST_FORMAT_STR, X, Y, CTX) \
case (DTYPE): { \
uint32_t result = DataFormatVecPermuteCompute<TYPE>(DIM, SRC_FORMAT_STR, DST_FORMAT_STR, X, Y, CTX); \
if (result != KERNEL_STATUS_OK) { \
KERNEL_LOG_ERROR("DataFormatVecPermute kernel compute failed."); \
return result; \
} \
break; \
}
} // namespace
namespace aicpu {
uint32_t DataFormatVecPermute::Compute(CpuKernelContext &ctx) {
KERNEL_HANDLE_ERROR(NormalCheck(ctx, kInputNum, kOutputNum), "Check DataFormatVecPermute params failed.");
AttrValue *src_format = ctx.GetAttr("src_format");
std::string src_format_str = src_format->GetString();
KERNEL_CHECK_FALSE((src_format_str.size() == 4), KERNEL_STATUS_PARAM_INVALID,
"src_format must be of length 4, but the length of src_format = [%d].", src_format_str.size());
AttrValue *dst_format = ctx.GetAttr("dst_format");
std::string dst_format_str = dst_format->GetString();
KERNEL_CHECK_FALSE((dst_format_str.size() == 4), KERNEL_STATUS_PARAM_INVALID,
"dst_format must be of length 4, but the length of dst_format = [%d].", dst_format_str.size());
Tensor *x = ctx.Input(0);
auto x_shape = x->GetTensorShape();
int32_t dim = x_shape->GetDims();
KERNEL_CHECK_FALSE((dim == 1 || dim == 2), KERNEL_STATUS_PARAM_INVALID,
"Input dimension must be 1 or 2, but got dimension = [%d].", dim);
Tensor *y = ctx.Output(0);
auto y_shape = y->GetTensorShape();
if (dim == 1) {
KERNEL_CHECK_FALSE((x_shape->GetDimSize(0) == 4), KERNEL_STATUS_PARAM_INVALID,
"1D Input must be of size 4, but got size %lld.", x_shape->GetDimSize(0));
KERNEL_CHECK_FALSE((y_shape->GetDimSize(0) == 4), KERNEL_STATUS_PARAM_INVALID,
"1D Output must be of size 4, but got size %lld.", y_shape->GetDimSize(0));
} else if (dim == 2) {
KERNEL_CHECK_FALSE((x_shape->GetDimSize(0) == 4), KERNEL_STATUS_PARAM_INVALID,
"First dimension of 2D Input must be of size 4, but got size %lld.", x_shape->GetDimSize(0));
KERNEL_CHECK_FALSE((x_shape->GetDimSize(1) == 2), KERNEL_STATUS_PARAM_INVALID,
"Second dimension of 2D Input must be of size 2, but got size %lld.", x_shape->GetDimSize(1));
KERNEL_CHECK_FALSE((y_shape->GetDimSize(0) == 4), KERNEL_STATUS_PARAM_INVALID,
"First dimension of 2D Output must be of size 4, but got size %lld.", y_shape->GetDimSize(0));
KERNEL_CHECK_FALSE((y_shape->GetDimSize(1) == 2), KERNEL_STATUS_PARAM_INVALID,
"Second dimension of 2D Output must be of size 2, but got size %lld.", y_shape->GetDimSize(1));
}
auto x_type = x->GetDataType();
auto y_type = y->GetDataType();
KERNEL_CHECK_FALSE((x_type == y_type), KERNEL_STATUS_PARAM_INVALID,
"Input[%s] and output[%s] must have the same DataType.", DTypeStr(x_type).c_str(),
DTypeStr(y_type).c_str());
switch (x_type) {
DATAFORMATVECPERMUTE_COMPUTE_CASE(DT_INT32, int32_t, dim, src_format_str, dst_format_str, x, y, ctx)
DATAFORMATVECPERMUTE_COMPUTE_CASE(DT_INT64, int64_t, dim, src_format_str, dst_format_str, x, y, ctx)
default:
KERNEL_LOG_ERROR("[%s] Data type of input is not support, input data type is [%s].", ctx.GetOpType().c_str(),
DTypeStr(x_type).c_str());
return KERNEL_STATUS_PARAM_INVALID;
}
return KERNEL_STATUS_OK;
}
template <typename T>
uint32_t DataFormatVecPermute::DataFormatVecPermuteCompute(const int32_t dim, const string &src_format_str,
const string &dst_format_str, Tensor *x, Tensor *y,
CpuKernelContext &ctx) {
T *x_addrs = reinterpret_cast<T *>(x->GetData());
T *y_addrs = reinterpret_cast<T *>(y->GetData());
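// For every character of dst_format, locate the matching character in src_format and copy
// the corresponding element (1D case) or pair of values (2D case) into the output.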
if (dim == 1) {
for (uint64_t i = 0; i < dst_format_str.size(); i++) {
for (uint64_t j = 0; j < src_format_str.size(); j++) {
if (dst_format_str[i] == src_format_str[j]) {
y_addrs[i] = x_addrs[j];
break;
}
}
}
} else if (dim == 2) {
for (uint64_t i = 0; i < dst_format_str.size(); i++) {
for (uint64_t j = 0; j < src_format_str.size(); j++) {
if (dst_format_str[i] == src_format_str[j]) {
y_addrs[i * 2] = x_addrs[j * 2];
y_addrs[i * 2 + 1] = x_addrs[j * 2 + 1];
break;
}
}
}
}
return KERNEL_STATUS_OK;
}
REGISTER_CPU_KERNEL(kDataFormatVecPermute, DataFormatVecPermute);
} // namespace aicpu


@ -0,0 +1,35 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef AICPU_KERNELS_NORMALIZED_DATA_FORMAT_VEC_PERMUTE_H_
#define AICPU_KERNELS_NORMALIZED_DATA_FORMAT_VEC_PERMUTE_H_
#include <string>
#include "cpu_ops_kernel.h"
namespace aicpu {
class DataFormatVecPermute : public CpuKernel {
public:
DataFormatVecPermute() = default;
~DataFormatVecPermute() override = default;
uint32_t Compute(CpuKernelContext &ctx) override;
private:
template <typename T>
uint32_t DataFormatVecPermuteCompute(const int32_t dim, const std::string &src_format_str,
const std::string &dst_format_str, Tensor *x, Tensor *y, CpuKernelContext &ctx);
};
} // namespace aicpu
#endif


@ -0,0 +1,455 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "matrix_solve_ls.h"
#include <Eigen/Cholesky>
#include <Eigen/Dense>
#include <algorithm>
#include <iostream>
#include "cpu_kernel_utils.h"
#include "utils/eigen_tensor.h"
#include "utils/kernel_util.h"
namespace {
const uint32_t kInputNum = 3;
const uint32_t kOutputNum = 1;
const char *MatrixSolveLs = "MatrixSolveLs";
const int64_t kNum2 = 2;
} // namespace
namespace aicpu {
uint32_t MatrixSolveLsCpuKernel::Compute(CpuKernelContext &ctx) {
bool qr_chole = (ctx.GetAttr("fast") == nullptr) ? true : ctx.GetAttr("fast")->GetBool();
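// The "fast" attribute (default true) selects the Cholesky / normal-equations path with
// l2 regularization; when false, the column-pivoted QR path (which ignores l2) is used.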
KERNEL_HANDLE_ERROR(NormalCheck(ctx, kInputNum, kOutputNum), "MatrixSolveLs check input and output number failed.");
Tensor *matrix = ctx.Input(kFirstInputIndex);
Tensor *b = ctx.Input(kSecondInputIndex);
Tensor *l2 = ctx.Input(2);
Tensor *x = ctx.Output(0);
if ((matrix->GetDataSize() == 0) || (b->GetDataSize() == 0)) {
KERNEL_LOG_ERROR("[%s] Input is empty tensor.", ctx.GetOpType().c_str());
return KERNEL_STATUS_PARAM_INVALID;
}
auto shapea = matrix->GetTensorShape();
auto shapeb = b->GetTensorShape();
auto shapel2 = l2->GetTensorShape();
auto shapex = x->GetTensorShape();
auto dims = shapea->GetDims();
if (ctx.Input(1)->GetTensorShape()->GetDims() == 1) {
if (shapea->GetDimSize(dims - kNum2) != shapeb->GetDimSize(0)) {
KERNEL_LOG_ERROR(
"[%s] #Rows mismatch between A and rhs."
"#Rows of A = [%llu], #Rows of rhs = [%llu]",
ctx.GetOpType().c_str(), shapea->GetDimSize(dims - kNum2), shapeb->GetDimSize(0));
return KERNEL_STATUS_PARAM_INVALID;
}
} else {
if (shapea->GetDimSize(dims - kNum2) != shapeb->GetDimSize(dims - kNum2)) {
KERNEL_LOG_ERROR(
"[%s] #Rows mismatch between A and rhs."
"#Rows of A = [%llu], #Rows of rhs = [%llu]",
ctx.GetOpType().c_str(), shapea->GetDimSize(dims - kNum2), shapeb->GetDimSize(dims - kNum2));
return KERNEL_STATUS_PARAM_INVALID;
}
}
if (shapel2->GetDims() != 0) {
KERNEL_LOG_ERROR("[%s] Tensor l2 should be a scalar.", ctx.GetOpType().c_str());
return KERNEL_STATUS_PARAM_INVALID;
}
if (ctx.Input(1)->GetTensorShape()->GetDims() == 1) {
if (shapex->GetDims() != shapeb->GetDims() || shapea->GetDimSize(dims - 1) != shapex->GetDimSize(0) ||
shapex->GetDimSize(shapex->GetDims() - 1) != shapeb->GetDimSize(0)) {
KERNEL_LOG_ERROR("[%s] Tensor y shape mismatch.", ctx.GetOpType().c_str());
return KERNEL_STATUS_PARAM_INVALID;
}
} else {
if (shapex->GetDims() != shapeb->GetDims() ||
shapea->GetDimSize(dims - 1) != shapex->GetDimSize(shapex->GetDims() - kNum2) ||
shapex->GetDimSize(shapex->GetDims() - 1) != shapeb->GetDimSize(shapeb->GetDims() - 1)) {
KERNEL_LOG_ERROR("[%s] Tensor y shape mismatch.", ctx.GetOpType().c_str());
return KERNEL_STATUS_PARAM_INVALID;
}
}
auto a_data_type = matrix->GetDataType();
auto b_data_type = b->GetDataType();
if (a_data_type != b_data_type) {
KERNEL_LOG_ERROR("[%s] Tensor data type mismatch.", ctx.GetOpType().c_str());
return KERNEL_STATUS_PARAM_INVALID;
}
if (a_data_type != DT_FLOAT && a_data_type != DT_DOUBLE && a_data_type != DT_COMPLEX64 &&
a_data_type != DT_COMPLEX128) {
KERNEL_LOG_ERROR("MatrixSolveLs kernel data type [%s] not support.", DTypeStr(a_data_type).c_str());
return KERNEL_STATUS_PARAM_INVALID;
}
if (qr_chole) {
if (a_data_type == DT_COMPLEX64) {
return ComplexCholesky<float>(ctx);
}
if (a_data_type == DT_COMPLEX128) {
return ComplexCholesky<double>(ctx);
}
if (a_data_type == DT_DOUBLE) {
return RealCholesky<double>(ctx);
}
if (a_data_type == DT_FLOAT) {
return RealCholesky<float>(ctx);
}
} else {
if (a_data_type == DT_COMPLEX64) {
return ComplexQr<float>(ctx);
}
if (a_data_type == DT_COMPLEX128) {
return ComplexQr<double>(ctx);
}
if (a_data_type == DT_DOUBLE) {
return RealQr<double>(ctx);
}
if (a_data_type == DT_FLOAT) {
return RealQr<float>(ctx);
}
}
return KERNEL_STATUS_OK;
}
REGISTER_CPU_KERNEL(MatrixSolveLs, MatrixSolveLsCpuKernel);
template <typename T>
void MatrixSolveLsCpuKernel::RealCholeskySingleCompute(T *aptr, T *bptr, T *xptr, double *l2, int64_t m, int64_t k,
int64_t n) {
Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor> a(m, k);
Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor> x(k, n);
Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor> b(m, n);
Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor> a_copy;
Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor> a_b;
for (int i = 0; i < m * k; i++) {
*(a.data() + i) = *(aptr + i);
}
for (int i = 0; i < m * n; i++) {
*(b.data() + i) = *(bptr + i);
}
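// Regularized normal equations: for m >= k solve (a^T a + l2 * I) x = a^T b with LDLT;
// for m < k solve (a a^T) z = b and recover the minimum-norm solution x = a^T z.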
if (m >= k) {
a_copy =
a.transpose() * a + ((T)*l2) * Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>::Identity(k, k);
a_b = a.transpose() * b;
} else {
a_copy = a * a.transpose();
a_b = b;
}
for (int64_t i = 0; i < n; i++) {
Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor> xi = a_copy.ldlt().solve(a_b.col(i));
if (m < k) {
xi = a.transpose() * xi;
}
x.col(i) = xi;
}
for (int64_t i = 0; i < k * n; i++) {
*(xptr + i) = *(x.data() + i);
}
}
template <typename T>
uint32_t MatrixSolveLsCpuKernel::RealCholesky(CpuKernelContext &ctx) {
auto dims = ctx.Input(0)->GetTensorShape()->GetDims();
auto aptr = reinterpret_cast<T *>(ctx.Input(0)->GetData());
auto bptr = reinterpret_cast<T *>(ctx.Input(1)->GetData());
auto xptr = reinterpret_cast<T *>(ctx.Output(0)->GetData());
auto l2 = reinterpret_cast<double *>(ctx.Input(2)->GetData());
int64_t m = ctx.Input(0)->GetTensorShape()->GetDimSize(dims - 2);
int64_t k = ctx.Input(0)->GetTensorShape()->GetDimSize(dims - 1);
int64_t n = 1;
if (ctx.Input(1)->GetTensorShape()->GetDims() > 1) {
n = ctx.Input(1)->GetTensorShape()->GetDimSize(dims - 1);
}
int64_t data_num = ctx.Input(0)->NumElements();
const int64_t mat_size = m * k;
const int64_t rhs_size = m * n;
const int64_t res_size = n * k;
const int64_t batch = data_num / mat_size;
const int64_t kParallelDataNum = 16 * mat_size;
const int64_t kParallelDataNumMid = 72 * mat_size;
if (data_num >= kParallelDataNum) {
uint32_t min_core_num = 1;
uint32_t max_core_num = std::max(min_core_num, aicpu::CpuKernelUtils::GetCPUNum(ctx) - kResvCpuNum);
if (data_num <= kParallelDataNumMid) {
max_core_num = std::min(max_core_num, 4U); // up to 4 cpu cores
}
auto sharder_matrix_solve_ls = [&](int64_t start, int64_t end) {
for (int64_t i = start; i < end; i++) {
RealCholeskySingleCompute(aptr + i * mat_size, bptr + i * rhs_size, xptr + i * res_size, l2, m, k, n);
}
};
KERNEL_HANDLE_ERROR(CpuKernelUtils::ParallelFor(ctx, batch, batch / max_core_num, sharder_matrix_solve_ls),
"MatrixSolveLs Compute failed.");
} else {
for (int64_t i = 0; i < batch; i++) {
RealCholeskySingleCompute(aptr + i * mat_size, bptr + i * rhs_size, xptr + i * res_size, l2, m, k, n);
}
}
return KERNEL_STATUS_OK;
}
template <typename T>
void MatrixSolveLsCpuKernel::ComplexCholeskySingleCompute(std::complex<T> *aptr, std::complex<T> *bptr,
std::complex<T> *xptr, double *l2, int64_t m, int64_t k,
int64_t n) {
Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor> A(kNum2 * m, kNum2 * k);
Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor> x(kNum2 * k, n);
Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor> b(kNum2 * m, n);
Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor> a_copy;
Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor> a_b;
auto l2value = abs(*l2);
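// Embed the complex least-squares problem into a real one of twice the size:
// A = [Re(a) -Im(a); Im(a) Re(a)] and b = [Re(b); Im(b)], so the real LDLT path can be
// reused; the real and imaginary parts of the solution are read back from the two halves of x.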
for (int64_t i = 0; i < k; i++) {
for (int64_t j = 0; j < m; j++) {
*(A.data() + i + j * kNum2 * k) = std::real(*(aptr + i + j * k));
}
for (int64_t j = 0; j < m; j++) {
*(A.data() + (i + k) + (j + m) * kNum2 * k) = std::real(*(aptr + i + j * k));
}
for (int64_t j = 0; j < m; j++) {
*(A.data() + (i + k) + j * kNum2 * k) = -std::imag(*(aptr + i + j * k));
}
for (int64_t j = 0; j < m; j++) {
*(A.data() + i + (j + m) * kNum2 * k) = std::imag(*(aptr + i + j * k));
}
}
for (int64_t i = 0; i < n; i++) {
for (int64_t j = 0; j < m; j++) {
*(b.data() + i + j * n) = std::real(*(bptr + i + j * n));
*(b.data() + i + (j + m) * n) = std::imag(*(bptr + i + j * n));
}
}
if (m >= k) {
a_copy =
A.transpose() * A +
((T)l2value) * Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>::Identity(kNum2 * k, kNum2 * k);
a_b = A.transpose() * b;
} else {
a_copy =
A * A.transpose() +
((T)l2value) * Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>::Identity(kNum2 * m, kNum2 * m);
a_b = b;
}
Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor> xi;
for (int64_t i = 0; i < n; i++) {
xi = a_copy.ldlt().solve(a_b.col(i));
if (m < k) {
xi = A.transpose() * xi;
}
x.col(i) = xi;
for (int64_t j = 0; j < k; j++) {
(xptr + i + j * n)->real(*(x.data() + i + j * n));
(xptr + i + j * n)->imag(*(x.data() + i + (j + k) * n));
}
}
}
template <typename T>
uint32_t MatrixSolveLsCpuKernel::ComplexCholesky(CpuKernelContext &ctx) {
auto dims = ctx.Input(0)->GetTensorShape()->GetDims();
auto l2 = reinterpret_cast<double *>(ctx.Input(2)->GetData());
auto aptr = reinterpret_cast<std::complex<T> *>(ctx.Input(0)->GetData());
auto bptr = reinterpret_cast<std::complex<T> *>(ctx.Input(1)->GetData());
auto xptr = reinterpret_cast<std::complex<T> *>(ctx.Output(0)->GetData());
int64_t m = ctx.Input(0)->GetTensorShape()->GetDimSize(dims - 2);
int64_t k = ctx.Input(0)->GetTensorShape()->GetDimSize(dims - 1);
int64_t n = 1;
if (ctx.Input(1)->GetTensorShape()->GetDims() > 1) {
n = ctx.Input(1)->GetTensorShape()->GetDimSize(dims - 1);
}
int64_t data_num = ctx.Input(0)->NumElements();
const int64_t mat_size = m * k;
const int64_t rhs_size = m * n;
const int64_t res_size = n * k;
const int64_t batch = data_num / mat_size;
const int64_t kParallelDataNum = 16 * mat_size;
const int64_t kParallelDataNumMid = 72 * mat_size;
if (data_num >= kParallelDataNum) {
uint32_t min_core_num = 1;
uint32_t max_core_num = std::max(min_core_num, aicpu::CpuKernelUtils::GetCPUNum(ctx) - kResvCpuNum);
if (data_num <= kParallelDataNumMid) {
max_core_num = std::min(max_core_num, 4U); // up to 4 cpu cores
}
auto sharder_matrix_solve_ls = [&](int64_t start, int64_t end) {
for (int64_t i = start; i < end; i++) {
ComplexCholeskySingleCompute(aptr + i * mat_size, bptr + i * rhs_size, xptr + i * res_size, l2, m, k, n);
}
};
KERNEL_HANDLE_ERROR(CpuKernelUtils::ParallelFor(ctx, batch, batch / max_core_num, sharder_matrix_solve_ls),
"MatrixSolveLs Compute failed.");
} else {
for (int64_t i = 0; i < batch; i++) {
ComplexCholeskySingleCompute(aptr + i * mat_size, bptr + i * rhs_size, xptr + i * res_size, l2, m, k, n);
}
}
return KERNEL_STATUS_OK;
}
template <typename T>
void MatrixSolveLsCpuKernel::RealQrSingleCompute(T *aptr, T *bptr, T *xptr, int64_t m, int64_t k, int64_t n) {
Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor> a(m, k);
Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor> x(k, n);
Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor> b(m, n);
for (int i = 0; i < m * k; i++) {
*(a.data() + i) = *(aptr + i);
}
for (int i = 0; i < m * n; i++) {
*(b.data() + i) = *(bptr + i);
}
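// Column-pivoted Householder QR yields a least-squares solution of a * x = b for each
// right-hand-side column; no l2 regularization is applied on the QR path.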
Eigen::ColPivHouseholderQR<Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>> qr_solve(a);
for (int64_t i = 0; i < n; i++) {
Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor> xi = qr_solve.solve(b.col(i));
x.col(i) = xi;
}
for (int64_t i = 0; i < k * n; i++) {
*(xptr + i) = *(x.data() + i);
}
}
template <typename T>
uint32_t MatrixSolveLsCpuKernel::RealQr(CpuKernelContext &ctx) {
auto dims = ctx.Input(0)->GetTensorShape()->GetDims();
auto aptr = reinterpret_cast<T *>(ctx.Input(0)->GetData());
auto bptr = reinterpret_cast<T *>(ctx.Input(1)->GetData());
auto xptr = reinterpret_cast<T *>(ctx.Output(0)->GetData());
int64_t m = ctx.Input(0)->GetTensorShape()->GetDimSize(dims - 2);
int64_t k = ctx.Input(0)->GetTensorShape()->GetDimSize(dims - 1);
int64_t n = 1;
if (ctx.Input(1)->GetTensorShape()->GetDims() > 1) {
n = ctx.Input(1)->GetTensorShape()->GetDimSize(dims - 1);
}
int64_t data_num = ctx.Input(0)->NumElements();
const int64_t mat_size = m * k;
const int64_t rhs_size = m * n;
const int64_t res_size = n * k;
const int64_t batch = data_num / mat_size;
const int64_t kParallelDataNum = 16 * mat_size;
const int64_t kParallelDataNumMid = 72 * mat_size;
if (data_num >= kParallelDataNum) {
uint32_t min_core_num = 1;
uint32_t max_core_num = std::max(min_core_num, aicpu::CpuKernelUtils::GetCPUNum(ctx) - kResvCpuNum);
if (data_num <= kParallelDataNumMid) {
max_core_num = std::min(max_core_num, 4U); // up to 4 cpu cores
}
auto sharder_matrix_solve_ls = [&](int64_t start, int64_t end) {
for (int64_t i = start; i < end; i++) {
RealQrSingleCompute(aptr + i * mat_size, bptr + i * rhs_size, xptr + i * res_size, m, k, n);
}
};
KERNEL_HANDLE_ERROR(CpuKernelUtils::ParallelFor(ctx, batch, batch / max_core_num, sharder_matrix_solve_ls),
"MatrixSolveLs Compute failed.");
} else {
for (int64_t i = 0; i < batch; i++) {
RealQrSingleCompute(aptr + i * mat_size, bptr + i * rhs_size, xptr + i * res_size, m, k, n);
}
}
return KERNEL_STATUS_OK;
}
template <typename T>
void MatrixSolveLsCpuKernel::ComplexQrSingleCompute(std::complex<T> *aptr, std::complex<T> *bptr, std::complex<T> *xptr,
int64_t m, int64_t k, int64_t n) {
Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor> A(kNum2 * m, kNum2 * k);
Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor> x(kNum2 * k, n);
Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor> b(kNum2 * m, n);
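// Same real-valued embedding as in ComplexCholeskySingleCompute:
// A = [Re(a) -Im(a); Im(a) Re(a)], b = [Re(b); Im(b)].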
for (int64_t i = 0; i < k; i++) {
for (int64_t j = 0; j < m; j++) {
*(A.data() + i + j * kNum2 * k) = std::real(*(aptr + i + j * k));
}
for (int64_t j = 0; j < m; j++) {
*(A.data() + (i + k) + (j + m) * kNum2 * k) = std::real(*(aptr + i + j * k));
}
for (int64_t j = 0; j < m; j++) {
*(A.data() + (i + k) + j * kNum2 * k) = -std::imag(*(aptr + i + j * k));
}
for (int64_t j = 0; j < m; j++) {
*(A.data() + i + (j + m) * kNum2 * k) = std::imag(*(aptr + i + j * k));
}
}
for (int64_t i = 0; i < n; i++) {
for (int64_t j = 0; j < m; j++) {
*(b.data() + i + j * n) = std::real(*(bptr + i + j * n));
*(b.data() + i + (j + m) * n) = std::imag(*(bptr + i + j * n));
}
}
Eigen::ColPivHouseholderQR<Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>> qr_solve(A);
for (int64_t i = 0; i < n; i++) {
Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor> xi = qr_solve.solve(b.col(i));
x.col(i) = xi;
for (int64_t j = 0; j < k; j++) {
(xptr + i + j * n)->real(*(x.data() + i + j * n));
(xptr + i + j * n)->imag(*(x.data() + i + (j + k) * n));
}
}
}
template <typename T>
uint32_t MatrixSolveLsCpuKernel::ComplexQr(CpuKernelContext &ctx) {
auto dims = ctx.Input(0)->GetTensorShape()->GetDims();
int64_t m = ctx.Input(0)->GetTensorShape()->GetDimSize(dims - 2);
int64_t k = ctx.Input(0)->GetTensorShape()->GetDimSize(dims - 1);
int64_t n = 1;
if (ctx.Input(1)->GetTensorShape()->GetDims() > 1) {
n = ctx.Input(1)->GetTensorShape()->GetDimSize(dims - 1);
}
int64_t data_num = ctx.Input(0)->NumElements();
const int64_t mat_size = m * k;
const int64_t rhs_size = m * n;
const int64_t res_size = n * k;
const int64_t batch = data_num / mat_size;
const int64_t kParallelDataNum = 16 * mat_size;
const int64_t kParallelDataNumMid = 72 * mat_size;
auto aptr = reinterpret_cast<std::complex<T> *>(ctx.Input(0)->GetData());
auto bptr = reinterpret_cast<std::complex<T> *>(ctx.Input(1)->GetData());
auto xptr = reinterpret_cast<std::complex<T> *>(ctx.Output(0)->GetData());
if (data_num >= kParallelDataNum) {
uint32_t min_core_num = 1;
uint32_t max_core_num = std::max(min_core_num, aicpu::CpuKernelUtils::GetCPUNum(ctx) - kResvCpuNum);
if (data_num <= kParallelDataNumMid) {
max_core_num = std::min(max_core_num, 4U); // up to 4 cpu cores
}
auto sharder_matrix_solve_ls = [&](int64_t start, int64_t end) {
for (int64_t i = start; i < end; i++) {
ComplexQrSingleCompute(aptr + i * mat_size, bptr + i * rhs_size, xptr + i * res_size, m, k, n);
}
};
KERNEL_HANDLE_ERROR(CpuKernelUtils::ParallelFor(ctx, batch, batch / max_core_num, sharder_matrix_solve_ls),
"MatrixSolveLs Compute failed.");
} else {
for (int64_t i = 0; i < batch; i++) {
ComplexQrSingleCompute(aptr + i * mat_size, bptr + i * rhs_size, xptr + i * res_size, m, k, n);
}
}
return KERNEL_STATUS_OK;
}
} // namespace aicpu


@ -0,0 +1,62 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef AICPU_KERNELS_NORMALIZED_MATRIX_SOLVE_LS_H_
#define AICPU_KERNELS_NORMALIZED_MATRIX_SOLVE_LS_H_
#include <complex>
#include "cpu_ops_kernel.h"
#include "cpu_types.h"
#include "utils/bcast.h"
namespace aicpu {
class MatrixSolveLsCpuKernel : public CpuKernel {
public:
MatrixSolveLsCpuKernel() = default;
~MatrixSolveLsCpuKernel() override = default;
protected:
uint32_t Compute(CpuKernelContext &ctx) override;
private:
template <typename T>
void RealCholeskySingleCompute(T *aptr, T *bptr, T *xptr, double *l2, int64_t m, int64_t k, int64_t n);
template <typename T>
uint32_t RealCholesky(CpuKernelContext &ctx);
template <typename T>
void RealQrSingleCompute(T *aptr, T *bptr, T *xptr, int64_t m, int64_t k, int64_t n);
template <typename T>
uint32_t RealQr(CpuKernelContext &ctx);
template <typename T>
void ComplexCholeskySingleCompute(std::complex<T> *aptr, std::complex<T> *bptr, std::complex<T> *xptr, double *l2,
int64_t m, int64_t k, int64_t n);
template <typename T>
uint32_t ComplexCholesky(CpuKernelContext &ctx);
template <typename T>
void ComplexQrSingleCompute(std::complex<T> *aptr, std::complex<T> *bptr, std::complex<T> *xptr, int64_t m, int64_t k,
int64_t n);
template <typename T>
uint32_t ComplexQr(CpuKernelContext &ctx);
};
} // namespace aicpu
#endif


@ -0,0 +1,440 @@
/**
* Copyright (c) Huawei Technologies Co., Ltd. 2021-2021. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "nuclear_norm.h"
#include <string.h>
#include <Eigen/Dense>
#include <cmath>
#include <cstdlib>
#include <iomanip>
#include <iostream>
#include <unsupported/Eigen/CXX11/Tensor>
#include "kernel_util.h"
#include "utils/kernel_util.h"
#define NoneN 1000
using namespace Eigen;
using namespace std;
namespace {
const char *kNuclearNorm = "NuclearNorm";
const size_t kNuclearNormInputNum = 1;
const size_t kNuclearNormOutputNum = 1;
constexpr int64_t kParallelDataNums = 1 * 1024;
const size_t DIM_SIZE1 = 1;
const size_t DIM_SIZE2 = 2;
const size_t DIM_SIZE3 = 3;
const size_t DIM_SIZE4 = 4;
const size_t DIM_SIZE5 = 5;
const size_t DIM_SIZE6 = 6;
const size_t DIM_SIZE7 = 7;
const size_t DIM_SIZE8 = 8;
const size_t DIM_INDEX0 = 0;
const size_t DIM_INDEX1 = 1;
const size_t DIM_INDEX2 = 2;
} // namespace
namespace aicpu {
uint32_t NuclearNormCpuKernel::Compute(CpuKernelContext &ctx) {
KERNEL_HANDLE_ERROR(NormalCheck(ctx, kNuclearNormInputNum, kNuclearNormOutputNum),
"NuclearNorm Check input and output number failed.");
KERNEL_HANDLE_ERROR(NuclearNormParamCheck(ctx), "NuclearNorm Check params failed.");
auto data_type = ctx.Input(0)->GetDataType();
uint32_t res = KERNEL_STATUS_OK;
switch (data_type) {
case (DT_FLOAT): {
res = NuclearNormCompute<float>(ctx);
break;
}
case (DT_DOUBLE): {
res = NuclearNormCompute<double>(ctx);
break;
}
default:
KERNEL_LOG_ERROR("NuclearNorm kernel data type [%s] not support.", DTypeStr(data_type).c_str());
return KERNEL_STATUS_PARAM_INVALID;
}
if (res != KERNEL_STATUS_OK) {
return KERNEL_STATUS_INNER_ERROR;
}
return KERNEL_STATUS_OK;
}
uint32_t NuclearNormCpuKernel::NuclearNormParamCheck(CpuKernelContext &ctx) {
Tensor *input = ctx.Input(0);
Tensor *output = ctx.Output(0);
KERNEL_CHECK_FALSE((input->GetDataType() == output->GetDataType()), KERNEL_STATUS_PARAM_INVALID,
"The data type of the input [%s] need be the same as the output [%s]",
DTypeStr(input->GetDataType()).c_str(), DTypeStr(output->GetDataType()).c_str());
const size_t input_dimnum = input->GetTensorShape()->GetDims();
KERNEL_CHECK_FALSE((input_dimnum >= DIM_SIZE2 && input_dimnum <= DIM_SIZE8), KERNEL_STATUS_PARAM_INVALID,
"The range of the dimension of the input tensor should be "
"[%lld,%lld], but got input's dimension=%lld",
DIM_SIZE2, DIM_SIZE8, input_dimnum);
AttrValue *dim_ptr = ctx.GetAttr("dim");
std::vector<int64_t> dim_temp = {0, 1};
std::vector<int64_t> dim = (dim_ptr == nullptr) ? dim_temp : dim_ptr->GetListInt();
if (dim_ptr == nullptr) {
KERNEL_CHECK_FALSE((input_dimnum == DIM_SIZE2), KERNEL_STATUS_PARAM_INVALID,
"When Attr dim is none, NuclearNorm expected a tensor with 2 "
"dimensions, but got a tensor with [%lld] dimensions instead.",
input_dimnum);
}
if (dim.size() == 1 && dim[0] == NoneN) {
dim.clear();
dim.push_back(0);
dim.push_back(1);
}
KERNEL_CHECK_FALSE((dim.size() == DIM_SIZE2), KERNEL_STATUS_PARAM_INVALID,
"Attr dim'size must equal to 2, but got dim's size : [%lld]", dim.size());
int64_t lower_bound = 0 - input_dimnum;
int64_t upper_bound = input_dimnum - 1;
KERNEL_CHECK_FALSE((dim[0] >= lower_bound && dim[0] <= upper_bound), KERNEL_STATUS_PARAM_INVALID,
"The range of dim[0] should be [%lld,%lld], but got input dim[0]=%lld", lower_bound, upper_bound,
dim[0]);
KERNEL_CHECK_FALSE((dim[1] >= lower_bound && dim[1] <= upper_bound), KERNEL_STATUS_PARAM_INVALID,
"The range of dim[1] should be [%lld,%lld], but got input dim[1]=%lld", lower_bound, upper_bound,
dim[1]);
dim[0] = (dim[0] < 0) ? dim[0] + input_dimnum : dim[0];
dim[1] = (dim[1] < 0) ? dim[1] + input_dimnum : dim[1];
KERNEL_CHECK_FALSE((dim[0] != dim[1]), KERNEL_STATUS_PARAM_INVALID,
"The values in attr dim point to the same dimension.");
KERNEL_LOG_DEBUG("NuclearNormCpuKernel[%s], input: size[%llu], output: size[%llu].", ctx.GetOpType().c_str(),
input->GetDataSize(), output->GetDataSize());
return KERNEL_STATUS_OK;
}
template <typename T>
uint32_t NuclearNormCpuKernel::NuclearNormCompute(CpuKernelContext &ctx) {
Tensor *input_ptr = ctx.Input(0);
auto input_shape = input_ptr->GetTensorShape();
std::vector<int64_t> input_dims = input_shape->GetDimSizes();
uint32_t res = KERNEL_STATUS_OK;
switch (input_dims.size()) {
case DIM_SIZE2:
res = ComputeTensorNuclearNorm<T, DIM_SIZE2>(ctx);
break;
case DIM_SIZE3:
res = ComputeTensorNuclearNorm<T, DIM_SIZE3>(ctx);
break;
case DIM_SIZE4:
res = ComputeTensorNuclearNorm<T, DIM_SIZE4>(ctx);
break;
case DIM_SIZE5:
res = ComputeTensorNuclearNorm<T, DIM_SIZE5>(ctx);
break;
case DIM_SIZE6:
res = ComputeTensorNuclearNorm<T, DIM_SIZE6>(ctx);
break;
case DIM_SIZE7:
res = ComputeTensorNuclearNorm<T, DIM_SIZE7>(ctx);
break;
case DIM_SIZE8:
res = ComputeTensorNuclearNorm<T, DIM_SIZE8>(ctx);
break;
default:
KERNEL_LOG_ERROR(
"Only tensors with ranks between 2 and 8 are currently supported."
"Tensor rank: [%d]",
input_dims.size());
return KERNEL_STATUS_PARAM_INVALID;
}
if (res != KERNEL_STATUS_OK) {
return KERNEL_STATUS_INNER_ERROR;
}
return KERNEL_STATUS_OK;
}
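// ComputeTensorNuclearNorm: reshape the flat input back to its real rank, permute the two reduced
// dimensions (attr "dim") to the end, view the data as [batch, dim0, dim1] and compute one nuclear
// norm per batch slice, in parallel when the batch is large enough.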
template <typename T, int32_t RANK>
uint32_t NuclearNormCpuKernel::ComputeTensorNuclearNorm(const CpuKernelContext &ctx) {
Tensor *input_ptr = ctx.Input(0);
auto input_shape = input_ptr->GetTensorShape();
void *data_ptr = input_ptr->GetData();
int64_t value_num_ = input_ptr->NumElements();
T *input_data_ptr = reinterpret_cast<T *>(data_ptr);
int64_t total_copy_size = value_num_ * static_cast<int64_t>(sizeof(T));
Eigen::Tensor<T, 1, Eigen::RowMajor> eigen_tensor(value_num_);
int memcpy_ret = memcpy_s(&eigen_tensor(0), total_copy_size, input_data_ptr, total_copy_size);
if (memcpy_ret != 0) {
KERNEL_LOG_ERROR("memcpy_s error!");
}
std::vector<int64_t> input_dims = input_shape->GetDimSizes();
std::array<Eigen::DenseIndex, RANK> dim_array;
const int64_t input_dimnum = static_cast<int64_t>(input_shape->GetDims());
for (int64_t i = 0; i < input_dimnum; i++) {
dim_array.at(i) = input_dims[i];
}
Eigen::Tensor<T, RANK, Eigen::RowMajor> reshaped_tensor = eigen_tensor.reshape(dim_array);
AttrValue *dim_ptr = ctx.GetAttr("dim");
std::vector<int64_t> dim_temp = {0, 1};
std::vector<int64_t> dim = (dim_ptr == nullptr) ? dim_temp : dim_ptr->GetListInt();
if (dim.size() == 1 && dim[0] == NoneN) {
dim.clear();
dim.push_back(0);
dim.push_back(1);
}
dim[0] = (dim[0] < 0) ? dim[0] + input_dimnum : dim[0];
dim[1] = (dim[1] < 0) ? dim[1] + input_dimnum : dim[1];
int64_t j = 0;
for (int64_t i = 0; i < input_dimnum; i++) {
if (i != dim[0] && i != dim[1]) {
dim_array.at(j) = i;
j++;
}
}
dim_array.at(j) = dim[0];
dim_array.at(j + 1) = dim[1];
Eigen::Tensor<T, RANK, Eigen::RowMajor> shuffled_tensor = reshaped_tensor.shuffle(dim_array);
int64_t dimsize0 = input_shape->GetDimSize(dim[0]);
int64_t dimsize1 = input_shape->GetDimSize(dim[1]);
int64_t iter_number = value_num_ / (dimsize0 * dimsize1);
std::array<Eigen::DenseIndex, DIM_SIZE3> dim_array_last;
dim_array_last.at(DIM_INDEX0) = iter_number;
dim_array_last.at(DIM_INDEX1) = dimsize0;
dim_array_last.at(DIM_INDEX2) = dimsize1;
Eigen::Tensor<T, DIM_SIZE3, Eigen::RowMajor> permuted_tensor = shuffled_tensor.reshape(dim_array_last);
auto output_data_ptr = reinterpret_cast<T *>(ctx.Output(0)->GetData());
int64_t copy_size = (dimsize0 * dimsize1) * static_cast<int64_t>(sizeof(T));
if (iter_number <= kParallelDataNums) {
for (int64_t i = 0; i < iter_number; i++) {
T *mat = new T[dimsize0 * dimsize1];
memcpy(mat, &permuted_tensor(i, 0, 0), copy_size);
T nuclear_norm = matrix_nuclear_norm<T>(mat, dimsize0, dimsize1);
*(output_data_ptr + i) = nuclear_norm;
delete[] mat;
}
} else {
uint32_t min_core_num = 1;
uint64_t max_core_num = std::max(min_core_num, aicpu::CpuKernelUtils::GetCPUNum(ctx) - 2);
if (max_core_num > static_cast<uint64_t>(iter_number)) {
max_core_num = static_cast<uint64_t>(iter_number);
}
auto shared_nuclear_norm = [&](size_t start, size_t end) {
for (size_t i = start; i < end; i++) {
T *mat = new T[dimsize0 * dimsize1];
memcpy(mat, &permuted_tensor(i, 0, 0), copy_size);
T nuclear_norm = matrix_nuclear_norm<T>(mat, dimsize0, dimsize1);
*(output_data_ptr + i) = nuclear_norm;
delete[] mat;
}
};
if (max_core_num != 0) {
KERNEL_HANDLE_ERROR(
CpuKernelUtils::ParallelFor(ctx, static_cast<uint64_t>(iter_number),
static_cast<uint64_t>(iter_number) / max_core_num, shared_nuclear_norm),
"NuclearNorm Compute failed.");
}
}
return KERNEL_STATUS_OK;
}
template <typename T>
std::vector<std::vector<T>> NuclearNormCpuKernel::matrix_multiply(std::vector<std::vector<T>> const arrL,
std::vector<std::vector<T>> const arrR) {
size_t rowL = arrL.size();
size_t colL = arrL[0].size();
size_t colR = arrR[0].size();
std::vector<std::vector<T>> res(rowL);
for (size_t i = 0; i < res.size(); i++) {
res[i].resize(colR);
}
for (size_t i = 0; i < rowL; i++) {
for (size_t j = 0; j < colR; j++) {
for (size_t k = 0; k < colL; k++) {
res[i][j] += arrL[i][k] * arrR[k][j];
}
}
}
return res;
}
template <typename T>
std::vector<std::vector<T>> NuclearNormCpuKernel::transpose(std::vector<std::vector<T>> const arr) {
size_t row = arr.size();
size_t col = arr[0].size();
std::vector<std::vector<T>> trans(col);
for (size_t i = 0; i < col; i++) {
trans[i].resize(row);
}
for (size_t i = 0; i < col; i++) {
for (size_t j = 0; j < row; j++) {
trans[i][j] = arr[j][i];
}
}
return trans;
}
template <typename T>
std::vector<size_t> NuclearNormCpuKernel::argsort(const std::vector<T> &array) {
const size_t array_len(array.size());
std::vector<size_t> array_index(array_len, 0);
for (size_t i = 0; i < array_len; ++i) array_index[i] = i;
sort(array_index.begin(), array_index.end(),
[&array](size_t pos1, size_t pos2) { return (array[pos1] > array[pos2]); });
return array_index;
}
template <typename T>
void NuclearNormCpuKernel::get_row_col(std::vector<std::vector<T>> arr, T *max, size_t *row, size_t *col) {
size_t n = arr.size();
for (size_t i = 0; i < n; i++) {
for (size_t j = 0; j < n; j++) {
if (i != j && fabs(arr[i][j]) > *max) {
*max = fabs(arr[i][j]);
*row = i;
*col = j;
}
}
}
}
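// Despite its name, svd() runs a classical Jacobi eigenvalue iteration on the symmetric input:
// each rotation zeroes the currently largest off-diagonal entry; the eigenvalues end up on the
// diagonal (returned in e, sorted in descending order) and the accumulated rotations form E.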
template <typename T>
void NuclearNormCpuKernel::svd(std::vector<std::vector<T>> arr, std::vector<std::vector<T>> &E, std::vector<T> &e) {
size_t n = arr.size();
size_t row = 0;
size_t col = 0;
size_t iter_max_num = 10000;
size_t iter_num = 0;
T eps = 1e-40;
T max = eps;
T dot5 = 0.5;
E.resize(n);
e.resize(n);
for (size_t i = 0; i < n; i++) {
E[i].resize(n, 0);
E[i][i] = 1;
}
while (iter_num < iter_max_num && max >= eps) {
max = fabs(arr[0][1]);
row = 0;
col = 1;
get_row_col<T>(arr, &max, &row, &col);
T theta = dot5 * atan2(-2 * arr[row][col], -(arr[row][row] - arr[col][col]));
T aii = arr[row][row];
T ajj = arr[col][col];
T aij = arr[row][col];
T sin_theta = sin(theta);
T cos_theta = cos(theta);
T sin_2theta = sin(2 * theta);
T cos_2theta = cos(2 * theta);
arr[row][row] = aii * cos_theta * cos_theta + ajj * sin_theta * sin_theta + aij * sin_2theta;
arr[col][col] = aii * sin_theta * sin_theta + ajj * cos_theta * cos_theta - aij * sin_2theta;
arr[row][col] = dot5 * (ajj - aii) * sin_2theta + aij * cos_2theta;
arr[col][row] = arr[row][col];
for (size_t k = 0; k < n; k++) {
if (k != row && k != col) {
T arowk = arr[row][k];
T acolk = arr[col][k];
arr[row][k] = arowk * cos_theta + acolk * sin_theta;
arr[k][row] = arr[row][k];
arr[col][k] = acolk * cos_theta - arowk * sin_theta;
arr[k][col] = arr[col][k];
}
}
T Eki;
T Ekj;
for (size_t k = 0; k < n; k++) {
Eki = E[k][row];
Ekj = E[k][col];
E[k][row] = Eki * cos_theta + Ekj * sin_theta;
E[k][col] = Ekj * cos_theta - Eki * sin_theta;
}
iter_num++;
}
for (size_t i = 0; i < n; i++) {
e[i] = arr[i][i];
}
std::vector<size_t> sort_index;
sort_index = argsort<T>(e);
std::vector<std::vector<T>> E_sorted(n);
for (size_t i = 0; i < n; i++) {
E_sorted[i].resize(n);
}
std::vector<T> e_sorted(n);
for (size_t i = 0; i < n; i++) {
e_sorted[i] = e[sort_index[i]];
for (size_t j = 0; j < n; j++) {
E_sorted[i][j] = E[i][sort_index[j]];
}
}
E = E_sorted;
e = e_sorted;
}
template <typename T>
T NuclearNormCpuKernel::matrix_nuclear_norm(T *mat, size_t dim0, size_t dim1) {
if (dim1 == DIM_SIZE1) {
T nuclear_norm = 0.0;
T temp = 0.0;
for (size_t j = 0; j < dim0; j++) {
temp = mat[j];
temp = temp * temp;
nuclear_norm += temp;
}
nuclear_norm = sqrt(nuclear_norm);
return nuclear_norm;
}
std::vector<std::vector<double>> arr(dim0);
size_t S_dim_size = dim0 < dim1 ? dim0 : dim1;
for (size_t i = 0; i < arr.size(); i++) {
arr[i].resize(dim1);
}
for (size_t i = 0; i < dim0; i++) {
for (size_t j = 0; j < dim1; j++) {
arr[i][j] = mat[i * dim1 + j];
}
}
std::vector<std::vector<double>> ATA;
std::vector<std::vector<double>> E;
std::vector<double> e;
ATA = matrix_multiply<double>(transpose(arr), arr);
svd<double>(ATA, E, e);
double nuclear_norm = 0.0;
for (size_t i = DIM_INDEX0; i < S_dim_size; i++) {
if (e[i] > 0) {
nuclear_norm += sqrt(e[i]);
}
}
return nuclear_norm;
}
REGISTER_CPU_KERNEL(kNuclearNorm, NuclearNormCpuKernel);
} // namespace aicpu
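For reference, matrix_nuclear_norm above returns the sum of the singular values of one [dimsize0, dimsize1] slice, obtained by Jacobi-rotating A^T * A and summing the square roots of its non-negative eigenvalues. A minimal host-side cross-check of that quantity, assuming only the <Eigen/Dense> header the kernel already includes (main() and the sample matrix are illustrative, not part of the kernel), could look like:
#include <Eigen/Dense>
#include <iostream>

int main() {
  Eigen::MatrixXd a(2, 3);
  a << 1, 2, 3,
       4, 5, 6;
  // Nuclear norm == sum of singular values, the same value the kernel writes per slice.
  Eigen::JacobiSVD<Eigen::MatrixXd> svd(a);
  std::cout << svd.singularValues().sum() << std::endl;
  return 0;
}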

View File

@ -0,0 +1,66 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef AICPU_KERNELS_NORMALIZED_NUCLEARNORM_H_
#define AICPU_KERNELS_NORMALIZED_NUCLEARNORM_H_
#include <memory>
#include <vector>
#include "cpu_ops_kernel.h"
#include "cpu_kernel_utils.h"
#include "kernel_log.h"
#include "securec.h"
#include "status.h"
#include "utils/bcast.h"
namespace aicpu {
class NuclearNormCpuKernel : public CpuKernel {
public:
NuclearNormCpuKernel() = default;
~NuclearNormCpuKernel() override = default;
protected:
uint32_t Compute(CpuKernelContext &ctx) override;
private:
uint32_t NuclearNormParamCheck(CpuKernelContext &ctx);
template <typename T>
uint32_t NuclearNormCompute(CpuKernelContext &ctx);
template <typename T, int32_t RANK>
uint32_t ComputeTensorNuclearNorm(const CpuKernelContext &ctx);
template <typename T>
std::vector<std::vector<T>> matrix_multiply(std::vector<std::vector<T>> const arrL,
std::vector<std::vector<T>> const arrR);
template <typename T>
std::vector<std::vector<T>> transpose(std::vector<std::vector<T>> const arr);
template <typename T>
std::vector<size_t> argsort(const std::vector<T> &array);
template <typename T>
void get_row_col(std::vector<std::vector<T>> arr, T *max, size_t *row, size_t *col);
template <typename T>
void svd(std::vector<std::vector<T>> arr, std::vector<std::vector<T>> &E, std::vector<T> &e);
template <typename T>
T matrix_nuclear_norm(T *mat, size_t dim0, size_t dim1);
};
} // namespace aicpu
#endif

View File

@ -0,0 +1,410 @@
/**
* Copyright (c) Huawei Technologies Co., Ltd. 2022. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "quantile.h"
#include <algorithm>
#include <cmath>
#include <iostream>
#include <vector>
#include "cpu_kernel_utils.h"
#include "utils/eigen_tensor.h"
#include "utils/kernel_util.h"
namespace {
constexpr uint32_t kQuantileInputNum = 2;
constexpr uint32_t kQuantileOutputNum = 1;
const int64_t paralled_data_size = 64 * 1024;
const int64_t kQuantileAttrDefaultDim = 10000;
const char *kQuantile = "Quantile";
} // namespace
namespace aicpu {
template <typename T>
uint32_t QuantileCpuKernel::GetInputAndCheck(CpuKernelContext &ctx) {
input_ = ctx.Input(0);
DataType input_type = input_->GetDataType();
int64_t input_dim = input_->GetTensorShape()->GetDims();
int64_t input_size = input_->GetTensorShape()->NumElements();
q_ = ctx.Input(1);
int64_t q_size = q_->GetTensorShape()->NumElements();
T *q_addrs = reinterpret_cast<T *>(q_->GetData());
DataType q_type = q_->GetDataType();
int64_t q_dim = q_->GetTensorShape()->GetDims();
int64_t min = -input_dim;
int64_t max = input_dim - 1;
auto dim_attr = ctx.GetAttr("dim");
dim_ = (dim_attr == nullptr) ? kQuantileAttrDefaultDim : dim_attr->GetInt();
auto keep_dims_attr = ctx.GetAttr("keep_dims");
keep_dims_ = (keep_dims_attr == nullptr) ? false : keep_dims_attr->GetBool();
auto ignore_attr = ctx.GetAttr("ignore_nan");
ignore_nan_ = (ignore_attr == nullptr) ? false : ignore_attr->GetBool();
KERNEL_CHECK_FALSE(input_size > 0, KERNEL_STATUS_PARAM_INVALID, "quantile() input tensor must be non-empty");
KERNEL_CHECK_FALSE(q_dim <= 1, KERNEL_STATUS_PARAM_INVALID,
"quantile() q must be a scalar or 1D tensor,but got dimension = [%d].", q_dim);
KERNEL_CHECK_FALSE(input_type == q_type, KERNEL_STATUS_PARAM_INVALID,
"quantile() q tensor must be same dtype as the input tensor");
for (int64_t j = 0; j < q_size; ++j) {
KERNEL_CHECK_FALSE(q_addrs[j] <= 1 && q_addrs[j] >= 0, KERNEL_STATUS_PARAM_INVALID,
"quantile() q values must be in the range [0, 1]");
}
DataType out_type = ctx.Output(0)->GetDataType();
output_ = ctx.Output(0);
KERNEL_CHECK_FALSE(out_type == input_type, KERNEL_STATUS_PARAM_INVALID,
"quantile() out tensor must be same dtype as the input tensor");
if (dim_ != kQuantileAttrDefaultDim) {
KERNEL_CHECK_FALSE(dim_ >= min && dim_ <= max, KERNEL_STATUS_PARAM_INVALID,
"Dimension out of range (expected to be in range of [%d] and [%d]).", min, max);
}
dim_ = MaybeWrapDim(dim_, input_dim);
return KERNEL_STATUS_OK;
}
uint32_t QuantileCpuKernel::MaybeWrapDim(int64_t dim, int64_t dim_post_expr) {
if (dim == kQuantileAttrDefaultDim) {
return dim;
}
if (dim_post_expr <= 0) {
dim_post_expr = 1;
}
int64_t min = -dim_post_expr;
int64_t max = dim_post_expr - 1;
KERNEL_CHECK_FALSE(dim >= min && dim <= max, KERNEL_STATUS_PARAM_INVALID,
"Dimension out of range (expected to be in range of [%d] and [%d]).", min, max)
if (dim < 0) {
dim += dim_post_expr;
}
return dim;
}
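// Helper: returns a copy of the flat tensor f with dimension `index` swapped to the innermost
// position (shape permuted accordingly), so the quantile reduction can always run over the last,
// contiguous dimension.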
template <typename T>
std::vector<T> transpose(std::vector<T> &f, std::vector<int64_t> &shape, int index) {
int element_count = f.size();
int m = shape.size();
int i;
int *indexA = (int *)malloc(sizeof(int) * m);
if (indexA == nullptr) {
return {};
}
std::vector<int> pos(m);
for (int i = 0; i < m; i++) pos[i] = i;
if (m != 0) {
std::swap(pos[m - 1], pos[((index + m) % m)]);
}
int *indexB = (int *)malloc(sizeof(int) * m);
if (indexB == nullptr) {
free(indexA);
return {};
}
std::vector<T> b(element_count);
std::vector<int64_t> shapeb(shape);
for (int i = 0; i < m; i++) {
shapeb[i] = shape[pos[i]];
}
for (int src = 0; src < element_count; src++) {
int temp = src;
for (i = m - 1; i >= 0; i--) {
indexA[i] = temp % shape[i];
temp = temp / shape[i];
}
for (i = 0; i < m; i++) {
indexB[i] = indexA[pos[i]];
}
int dst = 0;
temp = 1;
for (i = m - 1; i >= 0; i--) {
dst = dst + indexB[i] * temp;
temp = temp * shapeb[i];
}
b[dst] = f[src];
}
free(indexA);
free(indexB);
return b;
}
template <typename T>
void QuantileCpuKernel::QuantileComputeParallelFunc(size_t start, size_t end, int64_t last_shape_size,
std::vector<T> &sorted) {
uint64_t q_size = q_->GetTensorShape()->NumElements();
T *output_addr = reinterpret_cast<T *>(output_->GetData());
T *q_addrs = reinterpret_cast<T *>(q_->GetData());
for (uint64_t i = start; i < end; i++) {
std::vector<T> tmp;
std::sort(sorted.begin() + i * last_shape_size, sorted.begin() + (i + 1) * last_shape_size);
bool has_nan = false;
bool all_nan = true;
for (uint64_t j = i * last_shape_size; j < (i + 1) * last_shape_size; j++) {
if (std::isnan(sorted[j])) {
has_nan = true;
} else {
all_nan = false;
}
}
if ((has_nan && !ignore_nan_) || all_nan) {
for (uint64_t j = 0; j < q_size; ++j) {
output_addr[i * q_size + j] = NAN;
}
continue;
}
for (auto k = i * last_shape_size; k < (i + 1) * last_shape_size; k++) {
auto x = sorted[k];
if (!std::isnan(x)) {
tmp.push_back(x);
}
}
std::sort(tmp.begin(), tmp.end());
for (uint64_t j = 0; j < q_size; ++j) {
T index = (tmp.size() - 1) * q_addrs[j];
int32_t idx = index;
if (idx == (int32_t)tmp.size() - 1) {
output_addr[i * q_size + j] = tmp[idx];
continue;
}
output_addr[i * q_size + j] = tmp[idx] + (tmp[idx + 1] - tmp[idx]) * (index - idx);
}
}
}
template <typename T>
void QuantileCpuKernel::QuantileComputeSerialFunc(int64_t last_shape_size, std::vector<T> &sorted) {
uint64_t n = input_->GetTensorShape()->NumElements();
uint64_t q_size = q_->GetTensorShape()->NumElements();
T *output_addr = reinterpret_cast<T *>(output_->GetData());
T *q_addrs = reinterpret_cast<T *>(q_->GetData());
for (uint64_t i = 0; i < n; i += last_shape_size) {
std::vector<T> tmp;
std::sort(sorted.begin() + i, sorted.begin() + i + last_shape_size);
bool has_nan = false;
bool all_nan = true;
for (auto j = i; j < i + last_shape_size; j++) {
if (!std::isnan(sorted[j])) {
tmp.push_back(sorted[j]);
all_nan = false;
} else {
has_nan = true;
}
}
std::sort(tmp.begin(), tmp.end());
for (uint64_t j = 0; j < q_size; ++j) {
if ((has_nan && !ignore_nan_) || all_nan) {
output_addr[i * q_size / last_shape_size + j] = NAN;
continue;
}
T index = (tmp.size() - 1) * q_addrs[j];
int32_t idx = index;
if (idx == (int32_t)tmp.size() - 1) {
output_addr[i * q_size / last_shape_size + j] = tmp[idx];
continue;
}
output_addr[i * q_size / last_shape_size + j] = tmp[idx] + (tmp[idx + 1] - tmp[idx]) * (index - idx);
}
}
}
template <typename T>
void QuantileCpuKernel::QuantileComputeDefaultFunc(std::vector<T> &sorted) {
uint64_t q_size = q_->GetTensorShape()->NumElements();
T *output_addr = reinterpret_cast<T *>(output_->GetData());
T *q_addrs = reinterpret_cast<T *>(q_->GetData());
std::sort(sorted.begin(), sorted.end());
bool all_nan = true;
std::vector<T> tmp;
for (auto &x : sorted) {
if (!std::isnan(x)) {
tmp.push_back(x);
all_nan = false;
}
}
std::sort(tmp.begin(), tmp.end());
for (uint64_t i = 0; i < q_size; ++i) {
if ((has_nan_ && !ignore_nan_) || all_nan) {
output_addr[i] = NAN;
continue;
}
T index = (tmp.size() - 1) * q_addrs[i];
int32_t idx = index;
if (idx == (int32_t)tmp.size() - 1) {
output_addr[i] = tmp[idx];
continue;
}
output_addr[i] = tmp[idx] + (tmp[idx + 1] - tmp[idx]) * (index - idx);
}
}
std::vector<int64_t> QuantileCpuKernel::SetQuantileOutputShape() {
std::vector<int64_t> out_shape;
int64_t q_dim = q_->GetTensorShape()->NumElements();
int64_t input_dim = input_->GetTensorShape()->GetDims();
uint64_t q_size = q_->GetTensorShape()->NumElements();
std::vector<int64_t> input_shapesize = input_->GetTensorShape()->GetDimSizes();
if (dim_ != kQuantileAttrDefaultDim && input_dim > 0) {
out_shape = input_shapesize;
if (keep_dims_) {
out_shape[dim_] = 1;
} else {
out_shape.erase(out_shape.begin() + dim_);
}
} else if (keep_dims_) {
out_shape = std::vector<int64_t>(input_dim, 1);
}
if (q_dim > 0) {
out_shape.insert(out_shape.begin(), q_size);
}
return out_shape;
}
template <typename T>
uint32_t QuantileCpuKernel::QuantileCompute(CpuKernelContext &ctx) {
T *input_addrs = reinterpret_cast<T *>(ctx.Input(0)->GetData());
size_t data_size = input_->GetTensorShape()->NumElements() * sizeof(T);
std::vector<int64_t> out_shape = SetQuantileOutputShape();
std::vector<int64_t> input_dims = input_->GetTensorShape()->GetDimSizes();
int64_t input_shape_size = input_->GetTensorShape()->GetDims();
std::vector<T> sorted;
int64_t n = input_->GetTensorShape()->NumElements();
for (int64_t i = 0; i < n; i++) {
sorted.push_back(input_addrs[i]);
if (std::isnan(input_addrs[i])) {
has_nan_ = true;
}
}
if (data_size <= paralled_data_size) {
if (dim_ == kQuantileAttrDefaultDim) {
QuantileComputeDefaultFunc<T>(sorted);
} else if (dim_ == input_shape_size - 1) {
QuantileComputeSerialFunc<T>(input_dims[input_dims.size() - 1], sorted);
} else {
input_dims.push_back(1);
sorted = transpose<T>(sorted, input_dims, dim_);
int32_t m = input_dims.size();
if (m != 0) {
std::swap(input_dims[m - 1], input_dims[((dim_ + m) % m)]);
}
QuantileComputeSerialFunc<T>(input_dims[input_dims.size() - 1], sorted);
}
} else {
DoParallelQuantile(ctx, sorted, input_dims);
}
SetOutput<T>(out_shape);
return KERNEL_STATUS_OK;
}
template <typename T>
uint32_t QuantileCpuKernel::DoParallelQuantile(CpuKernelContext &ctx, std::vector<T> sorted,
std::vector<int64_t> input_dims) {
int64_t input_shape_size = input_->GetTensorShape()->GetDims();
std::vector<int64_t> input_shape_dims = input_->GetTensorShape()->GetDimSizes();
int64_t n = input_->GetTensorShape()->NumElements();
if (dim_ == kQuantileAttrDefaultDim) {
QuantileComputeDefaultFunc<T>(sorted);
} else if (dim_ == input_shape_size - 1) {
int64_t last_shape_size = input_dims[input_dims.size() - 1];
auto shard_quantile = [&](size_t start, size_t end) {
QuantileComputeParallelFunc<T>(start, end, last_shape_size, sorted);
};
KERNEL_HANDLE_ERROR(CpuKernelUtils::ParallelFor(ctx, n / last_shape_size, last_shape_size, shard_quantile),
"Quantile Compute failed.");
} else {
input_shape_dims.push_back(1);
sorted = transpose<T>(sorted, input_shape_dims, dim_);
int32_t m = input_shape_dims.size();
if (m != 0) {
std::swap(input_shape_dims[m - 1], input_shape_dims[((dim_ + m) % m)]);
}
int64_t last_shape_size = input_shape_dims[input_shape_dims.size() - 1];
auto shard_quantile = [&](size_t start, size_t end) {
QuantileComputeParallelFunc<T>(start, end, last_shape_size, sorted);
};
KERNEL_HANDLE_ERROR(CpuKernelUtils::ParallelFor(ctx, n / last_shape_size, last_shape_size, shard_quantile),
"Quantile Compute failed.");
}
return KERNEL_STATUS_OK;
}
template <typename T>
void QuantileCpuKernel::SetOutput(std::vector<int64_t> &out_shape) {
auto output_addr = reinterpret_cast<T *>(output_->GetData());
int64_t l = output_->GetTensorShape()->NumElements();
std::vector<T> out;
int64_t q_dim = q_->GetTensorShape()->GetDims();
std::vector<int64_t> tmp(out_shape);
if (q_dim > 0) {
for (int i = 0; i < l; i++) {
out.push_back(*(output_addr + i));
}
int64_t out_end_shape = out_shape[out_shape.size() - 1];
out_shape.push_back(out_end_shape);
std::swap(out_shape[0], out_shape[out_shape.size() - 1]);
out_shape.erase(out_shape.begin());
out_shape.insert(out_shape.begin(), 1);
out = transpose<T>(out, out_shape, 0);
for (int i = 0; i < l; i++) {
output_addr[i] = out[i];
}
}
output_->GetTensorShape()->SetDimSizes(tmp);
}
uint32_t QuantileCpuKernel::Compute(CpuKernelContext &ctx) {
KERNEL_HANDLE_ERROR(NormalCheck(ctx, kQuantileInputNum, kQuantileOutputNum), "[%s] check params failed.", kQuantile);
uint32_t res = KERNEL_STATUS_OK;
auto data_type = ctx.Input(0)->GetDataType();
switch (data_type) {
case DT_FLOAT:
res = GetInputAndCheck<float>(ctx);
break;
case DT_DOUBLE:
res = GetInputAndCheck<double>(ctx);
break;
default:
KERNEL_LOG_ERROR("Quantile invalid input type [%s]", DTypeStr(data_type).c_str());
return KERNEL_STATUS_PARAM_INVALID;
}
KERNEL_CHECK_FALSE((res == KERNEL_STATUS_OK), res, "GetInputAndCheck failed.");
switch (data_type) {
case DT_FLOAT:
res = QuantileCompute<float>(ctx);
break;
case DT_DOUBLE:
res = QuantileCompute<double>(ctx);
break;
default:
KERNEL_LOG_ERROR("Quantile invalid input type [%s]", DTypeStr(data_type).c_str());
return KERNEL_STATUS_PARAM_INVALID;
}
if (res != KERNEL_STATUS_OK) {
return KERNEL_STATUS_INNER_ERROR;
}
return KERNEL_STATUS_OK;
}
REGISTER_CPU_KERNEL(kQuantile, QuantileCpuKernel);
} // namespace aicpu
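For reference, the three compute paths above all apply the same per-q rule after sorting a slice and dropping NaNs: index = q * (n - 1), then a linear blend of the two neighbouring order statistics. A standalone sketch of that rule, assuming a non-empty, already-sorted, NaN-free slice (LinearInterpQuantile is an illustrative name, not part of the kernel):
#include <cstddef>
#include <vector>

template <typename T>
T LinearInterpQuantile(const std::vector<T> &sorted_no_nan, T q) {
  // index = q * (n - 1); blend the two neighbouring order statistics.
  T index = q * (sorted_no_nan.size() - 1);
  size_t idx = static_cast<size_t>(index);
  if (idx == sorted_no_nan.size() - 1) {
    return sorted_no_nan[idx];
  }
  return sorted_no_nan[idx] + (sorted_no_nan[idx + 1] - sorted_no_nan[idx]) * (index - idx);
}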

View File

@ -0,0 +1,61 @@
/**
* Copyright (c) Huawei Technologies Co., Ltd. 2022. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef AICPU_KERNELS_NORMALIZED_QUANTILE_H_
#define AICPU_KERNELS_NORMALIZED_QUANTILE_H_
#include <vector>
#include "cpu_ops_kernel.h"
namespace aicpu {
class QuantileCpuKernel : public CpuKernel {
public:
QuantileCpuKernel() = default;
~QuantileCpuKernel() override = default;
protected:
uint32_t Compute(CpuKernelContext &ctx) override;
private:
template <typename T>
uint32_t GetInputAndCheck(CpuKernelContext &ctx);
template <typename T>
uint32_t QuantileCompute(CpuKernelContext &ctx);
uint32_t MaybeWrapDim(int64_t dim, int64_t dim_post_expr);
template <typename T>
void QuantileComputeSerialFunc(int64_t last_shape_size, std::vector<T> &sorted);
template <typename T>
void QuantileComputeParallelFunc(size_t start, size_t end, int64_t last_shape_size, std::vector<T> &sorted);
template <typename T>
void QuantileComputeDefaultFunc(std::vector<T> &sorted);
std::vector<int64_t> SetQuantileOutputShape();
template <typename T>
void SetOutput(std::vector<int64_t> &out_shape);
template <typename T>
uint32_t DoParallelQuantile(CpuKernelContext &ctx, std::vector<T> sorted, std::vector<int64_t> input_dims);
int64_t last_shape_size_ = 0;
bool ignore_nan_ = false;
bool keep_dims_ = false;
int dim_ = 0;
int64_t input_dim_ = 0;
Tensor *input_ = nullptr;
Tensor *output_ = nullptr;
Tensor *q_ = nullptr;
bool has_nan_ = false;
};
} // namespace aicpu
#endif

View File

@ -0,0 +1,154 @@
/**
* Copyright (c) Huawei Technologies Co., Ltd. 2022-2022. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "sparse_segment_sqrt_n.h"
#include <math.h>
#include "Eigen/Core"
#include "utils/kernel_util.h"
namespace aicpu {
const uint32_t kInputNum = 3;
const uint32_t kOutputNum = 1;
const char *SparseSegmentSqrtN = "SparseSegmentSqrtN";
#define COMPUTE_CASE(DTYPE, TYPE, DTYPE_1, DTYPE_2, CTX) \
case (DTYPE): \
if ((DTYPE_1) == DT_INT32) { \
if ((DTYPE_2) == DT_INT32) { \
return ComputeKernal<TYPE, int32_t, int32_t>(CTX); \
} else { \
return ComputeKernal<TYPE, int32_t, int64_t>(CTX); \
} \
} else { \
if ((DTYPE_2) == DT_INT32) { \
return ComputeKernal<TYPE, int64_t, int32_t>(CTX); \
} else { \
return ComputeKernal<TYPE, int64_t, int64_t>(CTX); \
} \
} \
break;
} // namespace aicpu
namespace aicpu {
uint32_t SparseSegmentSqrtNCpuKernel::Compute(CpuKernelContext &ctx) {
KERNEL_HANDLE_ERROR(NormalCheck(ctx, kInputNum, kOutputNum), "SparseSegmentSqrtN normalcheck failed.");
Tensor *x = ctx.Input(0);
Tensor *indices = ctx.Input(1);
Tensor *segment_ids = ctx.Input(2);
auto x_shape = x->GetTensorShape();
auto indices_shape = indices->GetTensorShape();
auto segment_ids_shape = segment_ids->GetTensorShape();
if (x_shape->GetDims() < 1) {
KERNEL_LOG_ERROR("[%s] Tensor input0's rank less than 1.", ctx.GetOpType().c_str());
return KERNEL_STATUS_PARAM_INVALID;
}
if (indices_shape->NumElements() != segment_ids_shape->NumElements()) {
KERNEL_LOG_ERROR("[%s] Tensor input1&input2's ranks mismatch.", ctx.GetOpType().c_str());
return KERNEL_STATUS_PARAM_INVALID;
}
auto x_data_type = x->GetDataType();
auto indices_data_type = indices->GetDataType();
auto segment_ids_data_type = segment_ids->GetDataType();
if (x_data_type != DT_FLOAT && x_data_type != DT_DOUBLE && x_data_type != DT_FLOAT16) {
KERNEL_LOG_ERROR("SparseSegmentSqrtN kernel data type [%s] not support.", DTypeStr(x_data_type).c_str());
return KERNEL_STATUS_PARAM_INVALID;
}
if ((indices_data_type != DT_INT32 && indices_data_type != DT_INT64) ||
(segment_ids_data_type != DT_INT32 && segment_ids_data_type != DT_INT64)) {
KERNEL_LOG_ERROR("SparseSegmentSqrtN kernel data type [%s] not support.", DTypeStr(indices_data_type).c_str());
return KERNEL_STATUS_PARAM_INVALID;
}
switch (x_data_type) {
COMPUTE_CASE(DT_FLOAT16, Eigen::half, indices_data_type, segment_ids_data_type, ctx)
COMPUTE_CASE(DT_FLOAT, float, indices_data_type, segment_ids_data_type, ctx)
COMPUTE_CASE(DT_DOUBLE, double, indices_data_type, segment_ids_data_type, ctx)
default:
KERNEL_LOG_ERROR("SparseSegmentSqrtN kernel data type [%s] not support.", DTypeStr(x_data_type).c_str());
return KERNEL_STATUS_PARAM_INVALID;
}
return KERNEL_STATUS_OK;
}
template <typename T1, typename T2, typename T3>
uint32_t SparseSegmentSqrtNCpuKernel::ComputeKernal(CpuKernelContext &ctx) {
size_t n = ctx.Input(0)->GetTensorShape()->NumElements() / ctx.Input(0)->GetTensorShape()->GetDimSize(0);
size_t m = ctx.Input(2)->GetTensorShape()->NumElements();
size_t k = ctx.Output(0)->GetTensorShape()->NumElements();
auto x_addr = reinterpret_cast<T1 *>(ctx.Input(0)->GetData());
auto indices_addr = reinterpret_cast<T2 *>(ctx.Input(1)->GetData());
auto segment_ids_addr = reinterpret_cast<T3 *>(ctx.Input(2)->GetData());
auto y_addr = reinterpret_cast<T1 *>(ctx.Output(0)->GetData());
std::vector<int64_t> x_shape_list = ctx.Input(0)->GetTensorShape()->GetDimSizes();
x_shape_list[0] = segment_ids_addr[m - 1] + 1;
ctx.Output(0)->GetTensorShape()->SetDimSizes(x_shape_list);
for (size_t i = 0; i < k; i++) {
y_addr[i] = (T1)0;
}
if (segment_ids_addr[0] != 0) {
KERNEL_LOG_ERROR("segment_ids can't miss ids.");
return KERNEL_STATUS_PARAM_INVALID;
}
for (size_t i = 1; i < m; i++) {
if (segment_ids_addr[i] < segment_ids_addr[i - 1]) {
KERNEL_LOG_ERROR("segment_ids should be sorted.");
return KERNEL_STATUS_PARAM_INVALID;
}
if (segment_ids_addr[i] - segment_ids_addr[i - 1] > 1) {
KERNEL_LOG_ERROR("segment_ids can't miss ids.");
return KERNEL_STATUS_PARAM_INVALID;
}
}
for (size_t i = 0; i < m; i++) {
if (indices_addr[i] < 0 || indices_addr[i] >= ctx.Input(0)->GetTensorShape()->GetDimSize(0)) {
KERNEL_LOG_ERROR("indices out of range.");
return KERNEL_STATUS_PARAM_INVALID;
}
}
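// Accumulate the gathered rows into their segment; whenever the segment id changes, rescale the
// finished segment by 1 / sqrt(number of rows it received). The final segment is rescaled after
// the loop.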
int oldindex = -1;
int countnum = 0;
for (size_t i = 0; i < m; i++) {
if (oldindex == segment_ids_addr[i]) {
countnum++;
} else if (countnum != 0) {
for (size_t j = 0; j < n; j++) {
y_addr[j + oldindex * n] /= (T1)(sqrt(countnum));
}
countnum = 1;
oldindex = segment_ids_addr[i];
} else {
countnum = 1;
oldindex = segment_ids_addr[i];
}
for (size_t j = 0; j < n; j++) {
y_addr[j + oldindex * n] += x_addr[j + indices_addr[i] * n];
}
}
if (countnum != 0) {
for (size_t j = 0; j < n; j++) {
y_addr[j + oldindex * n] /= (T1)(sqrt(countnum));
}
}
return KERNEL_STATUS_OK;
}
REGISTER_CPU_KERNEL(SparseSegmentSqrtN, SparseSegmentSqrtNCpuKernel);
} // namespace aicpu
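For reference, ComputeKernal above implements the usual SparseSegmentSqrtN contract: rows of x gathered by indices are summed per segment, and each segment sum is divided by the square root of the number of rows it received. A scalar sketch of that contract for 1-D x, assuming sorted, gap-free segment_ids (SparseSegmentSqrtNRef is an illustrative name, not part of the kernel):
#include <cmath>
#include <cstdint>
#include <vector>

std::vector<double> SparseSegmentSqrtNRef(const std::vector<double> &x, const std::vector<int64_t> &indices,
                                          const std::vector<int64_t> &segment_ids) {
  // y[s] = sum of x[indices[i]] over i with segment_ids[i] == s, divided by sqrt(count of such i).
  int64_t num_segments = segment_ids.empty() ? 0 : segment_ids.back() + 1;
  std::vector<double> y(num_segments, 0.0);
  std::vector<int64_t> count(num_segments, 0);
  for (size_t i = 0; i < indices.size(); ++i) {
    y[segment_ids[i]] += x[indices[i]];
    ++count[segment_ids[i]];
  }
  for (int64_t s = 0; s < num_segments; ++s) {
    if (count[s] > 0) {
      y[s] /= std::sqrt(static_cast<double>(count[s]));
    }
  }
  return y;
}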

View File

@ -0,0 +1,38 @@
/**
* Copyright (c) Huawei Technologies Co., Ltd. 2022-2022. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef AICPU_KERNELS_NORMALIZED_SPARSE_SEGMENT_SQRT_N_H_
#define AICPU_KERNELS_NORMALIZED_SPARSE_SEGMENT_SQRT_N_H_
#include "cpu_ops_kernel.h"
#include "cpu_types.h"
#include "utils/bcast.h"
#include "utils/sparse_tensor.h"
namespace aicpu {
class SparseSegmentSqrtNCpuKernel : public CpuKernel {
public:
SparseSegmentSqrtNCpuKernel() = default;
~SparseSegmentSqrtNCpuKernel() override = default;
protected:
uint32_t Compute(CpuKernelContext &ctx) override;
private:
template <typename T1, typename T2, typename T3>
uint32_t ComputeKernal(CpuKernelContext &ctx);
};
} // namespace aicpu
#endif

View File

@ -0,0 +1,171 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "unsorted_segment_prod.h"
#include <string>
#include "cpu_kernel_utils.h"
#include "cpu_types.h"
#include "utils/eigen_tensor.h"
#include "utils/kernel_util.h"
namespace {
const char *kUnsortedSegmentProd = "UnsortedSegmentProd";
const uint32_t input_num = 3;
const uint32_t output_num = 1;
constexpr int64_t kParallelDataNums = 64 * 1024;
} // namespace
namespace aicpu {
template <typename input_t, typename segment_ids_t, typename num_segments_t>
uint32_t UnsortedSegmentProdCpuKernel::UnsortedSegmentProdComputeTemplate(CpuKernelContext &ctx) {
KERNEL_HANDLE_ERROR(NormalCheck(ctx, input_num, output_num),
"UnsortedSegmentProd check input and output number failed.");
if (ctx.Input(0)->GetDataType() != ctx.Output(0)->GetDataType()) {
KERNEL_LOG_ERROR("The data type of the input [%s] need be the same as the output [%s]",
DTypeStr(ctx.Input(0)->GetDataType()).c_str(), DTypeStr(ctx.Output(0)->GetDataType()).c_str());
return KERNEL_STATUS_PARAM_INVALID;
}
if (ctx.Input(0)->GetDataSize() != ctx.Output(0)->GetDataSize()) {
KERNEL_LOG_ERROR(
"The data size of the input [%llu] need be the same as the output "
"[%llu]",
ctx.Input(0)->GetDataSize(), ctx.Output(0)->GetDataSize());
return KERNEL_STATUS_PARAM_INVALID;
}
int64_t data_size = ctx.Input(0)->NumElements();
int64_t id_size = ctx.Input(1)->NumElements();
auto input_x = reinterpret_cast<input_t *>(ctx.Input(0)->GetData());
KERNEL_CHECK_NULLPTR(input_x, KERNEL_STATUS_PARAM_INVALID, "Get input data failed")
auto output_y = reinterpret_cast<input_t *>(ctx.Output(0)->GetData());
KERNEL_CHECK_NULLPTR(output_y, KERNEL_STATUS_PARAM_INVALID, "Get output data failed")
auto segmentids = reinterpret_cast<segment_ids_t *>(ctx.Input(1)->GetData());
KERNEL_CHECK_NULLPTR(segmentids, KERNEL_STATUS_PARAM_INVALID, "Get segment_ids failed")
auto numsegments = reinterpret_cast<num_segments_t *>(ctx.Input(2)->GetData());
KERNEL_CHECK_NULLPTR(numsegments, KERNEL_STATUS_PARAM_INVALID, "Get num_segments failed")
if (id_size <= 0) {
KERNEL_LOG_ERROR("segment_ids num elements should great than 0");
return KERNEL_STATUS_PARAM_INVALID;
}
int64_t reshapesize = data_size / id_size;
// Initialized to 1
for (int64_t k = 0; k < data_size; k++) {
*(output_y + k) = static_cast<input_t>(1);
}
if (data_size <= kParallelDataNums) {
// calculation process
for (int64_t i = 0; i < id_size; i++) {
if (*(segmentids + i) < *numsegments) {
for (int64_t j = 0; j < reshapesize; j++) {
*(output_y + *(segmentids + i) * reshapesize + j) *= *(input_x + i * reshapesize + j);
}
}
}
} else {
uint32_t min_core_num = 1;
uint32_t max_core_num = std::max(min_core_num, aicpu::CpuKernelUtils::GetCPUNum(ctx) - 2);
if (max_core_num > reshapesize) {
max_core_num = reshapesize;
}
// calculation process
auto shard_unsorted_segment_prod = [&](int64_t start, int64_t end) {
for (int64_t i = 0; i < id_size; i++) {
if (*(segmentids + i) < *numsegments) {
for (int64_t j = start; j < end; j++) {
*(output_y + *(segmentids + i) * reshapesize + j) *= *(input_x + i * reshapesize + j);
}
}
}
};
KERNEL_HANDLE_ERROR(
CpuKernelUtils::ParallelFor(ctx, reshapesize, reshapesize / max_core_num, shard_unsorted_segment_prod),
"CpuKernelUtils::ParallelFor failed.");
}
return KERNEL_STATUS_OK;
}
template <typename input_t, typename segment_ids_t>
uint32_t UnsortedSegmentProdCpuKernel::DoComputeWithNumSegmentsType(CpuKernelContext &ctx, DataType num_segments_type) {
switch (num_segments_type) {
case DT_INT32:
return UnsortedSegmentProdComputeTemplate<input_t, segment_ids_t, int32_t>(ctx);
case DT_INT64:
return UnsortedSegmentProdComputeTemplate<input_t, segment_ids_t, int64_t>(ctx);
default:
KERNEL_LOG_ERROR("UnsortedSegmentProd invalid num_segments_type type [%s]", DTypeStr(num_segments_type).c_str());
return KERNEL_STATUS_PARAM_INVALID;
}
}
template <typename input_t>
uint32_t UnsortedSegmentProdCpuKernel::DoComputeWithSegmentIdsType(CpuKernelContext &ctx, DataType segment_ids_type) {
auto num_segments_type = ctx.Input(2)->GetDataType();
switch (segment_ids_type) {
case DT_INT32:
return DoComputeWithNumSegmentsType<input_t, int32_t>(ctx, num_segments_type);
case DT_INT64:
return DoComputeWithNumSegmentsType<input_t, int64_t>(ctx, num_segments_type);
default:
KERNEL_LOG_ERROR("UnsortedSegmentProd invalid segment_ids_type type [%s]", DTypeStr(segment_ids_type).c_str());
return KERNEL_STATUS_PARAM_INVALID;
}
}
uint32_t UnsortedSegmentProdCpuKernel::Compute(CpuKernelContext &ctx) {
auto input_type = ctx.Input(0)->GetDataType();
auto segment_ids_type = ctx.Input(1)->GetDataType();
switch (input_type) {
case DT_INT32:
return DoComputeWithSegmentIdsType<int32_t>(ctx, segment_ids_type);
case DT_INT16:
return DoComputeWithSegmentIdsType<int16_t>(ctx, segment_ids_type);
case DT_FLOAT:
return DoComputeWithSegmentIdsType<float>(ctx, segment_ids_type);
case DT_DOUBLE:
return DoComputeWithSegmentIdsType<double>(ctx, segment_ids_type);
case DT_FLOAT16:
return DoComputeWithSegmentIdsType<Eigen::half>(ctx, segment_ids_type);
case DT_INT8:
return DoComputeWithSegmentIdsType<int8_t>(ctx, segment_ids_type);
case DT_INT64:
return DoComputeWithSegmentIdsType<int64_t>(ctx, segment_ids_type);
case DT_UINT8:
return DoComputeWithSegmentIdsType<uint8_t>(ctx, segment_ids_type);
case DT_UINT16:
return DoComputeWithSegmentIdsType<uint16_t>(ctx, segment_ids_type);
case DT_UINT32:
return DoComputeWithSegmentIdsType<uint32_t>(ctx, segment_ids_type);
case DT_UINT64:
return DoComputeWithSegmentIdsType<uint64_t>(ctx, segment_ids_type);
case DT_COMPLEX64:
return DoComputeWithSegmentIdsType<std::complex<float>>(ctx, segment_ids_type);
case DT_COMPLEX128:
return DoComputeWithSegmentIdsType<std::complex<double>>(ctx, segment_ids_type);
default:
KERNEL_LOG_ERROR("UnsortedSegmentProd invalid input type [%s]", DTypeStr(input_type).c_str());
return KERNEL_STATUS_PARAM_INVALID;
}
return KERNEL_STATUS_OK;
}
REGISTER_CPU_KERNEL(kUnsortedSegmentProd, UnsortedSegmentProdCpuKernel);
} // namespace aicpu
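For reference, the kernel above initialises every output element to 1 and multiplies in each input row whose segment id is smaller than num_segments, so empty segments stay at 1. A scalar sketch of that behaviour for 1-D input (UnsortedSegmentProdRef is an illustrative name, not part of the kernel):
#include <cstdint>
#include <vector>

std::vector<double> UnsortedSegmentProdRef(const std::vector<double> &input, const std::vector<int64_t> &segment_ids,
                                           int64_t num_segments) {
  // y[s] = product of input[i] over i with segment_ids[i] == s; segments that receive nothing stay at 1.
  std::vector<double> y(num_segments, 1.0);
  for (size_t i = 0; i < input.size(); ++i) {
    if (segment_ids[i] >= 0 && segment_ids[i] < num_segments) {
      y[segment_ids[i]] *= input[i];
    }
  }
  return y;
}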

View File

@ -0,0 +1,37 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef AICPU_KERNELS_NORMALIZED_UNSORTED_SEGMENT_PROD_H
#define AICPU_KERNELS_NORMALIZED_UNSORTED_SEGMENT_PROD_H
#include "cpu_ops_kernel.h"
namespace aicpu {
class UnsortedSegmentProdCpuKernel : public CpuKernel {
public:
~UnsortedSegmentProdCpuKernel() = default;
uint32_t Compute(CpuKernelContext &ctx) override;
private:
template <typename input_t, typename segment_ids_t, typename num_segments_t>
uint32_t UnsortedSegmentProdComputeTemplate(CpuKernelContext &ctx);
template <typename input_t, typename segment_ids_t>
uint32_t DoComputeWithNumSegmentsType(CpuKernelContext &ctx, DataType num_segments_type);
template <typename input_t>
uint32_t DoComputeWithSegmentIdsType(CpuKernelContext &ctx, DataType segment_ids_type);
};
} // namespace aicpu
#endif

View File

@ -41,7 +41,7 @@ uint32_t EqualCalculate(const CpuKernelContext &ctx, BCalcInfo &calcInfo, bool f
output_y[i] = (flag == true) ? (*x_index == *y_index) : (*x_index != *y_index);
}
};
KERNEL_HANDLE_ERROR(CpuKernelUtils::ParallelFor(ctx, data_num, 1, shard_equal), "Equal calculate failed.")
KERNEL_HANDLE_ERROR(CpuKernelUtils::ParallelFor(ctx, data_num, 1, shard_equal), "Equal calculate failed.");
return KERNEL_STATUS_OK;
}
/**
@ -69,7 +69,7 @@ uint32_t EqualCompute(const CpuKernelContext &ctx, bool flag) {
calcInfo.input_1->GetDataSize(), calcInfo.output->GetData(), calcInfo.output->GetDataSize());
Bcast bcast;
KERNEL_HANDLE_ERROR(bcast.GenerateBcastInfo(calcInfo), "Generate broadcast info failed.")
KERNEL_HANDLE_ERROR(bcast.GenerateBcastInfo(calcInfo), "Generate broadcast info failed.");
bcast.BCastIndexes(calcInfo.x_indexes, calcInfo.y_indexes);
bcast.GetBcastVec(calcInfo);

View File

@ -51,8 +51,12 @@ const AnfNodePtr AICpuLibSelectPass::Process(const FuncGraphPtr &graph, const An
static const std::set<std::string> kMigrateAicpuKernelOps = {mindspore::kAdaptiveAvgPool2dOpName,
mindspore::kAdaptiveAvgPool2dGradOpName,
mindspore::kCacheSwapTableOpName,
mindspore::kCol2imOpName,
mindspore::kCumulativeLogsumexpOpName,
mindspore::kDataFormatVecPermuteOpName,
mindspore::kFillOpName,
mindspore::kLogMatrixDeterminantOpName,
mindspore::kMatrixSolveLsOpName,
mindspore::kMaskedSelectOpName,
mindspore::kMaskedSelectGradOpName,
mindspore::kMedianOpName,
@ -71,6 +75,10 @@ const AnfNodePtr AICpuLibSelectPass::Process(const FuncGraphPtr &graph, const An
mindspore::kNanToNumOpName,
mindspore::kQrOpName,
mindspore::kResizeBicubicOpName};
mindspore::kResizeBicubicOpName,
mindspore::kNuclearNormOpName,
mindspore::kQuantileOpName,
mindspore::kSparseSegmentSqrtNOpName,
mindspore::kUnsortedSegmentProdOpName};
static const std::string kEnvOpSoNames = "mindspore_aicpu_kernels";
static const std::string kCpuKernelSoName = "mindspore_cpu_kernels";

View File

@ -171,3 +171,5 @@ from .median_grad import _median_grad_aicpu
from .reduce_sum import _reduce_sum_aicpu
from .adaptive_avg_pool_2d_v1 import _adaptive_avg_pool_2d_v1_aicpu
from .fill_v2 import _fill_v2_aicpu
from .data_format_vec_permute import _data_format_vec_permute_aicpu
from .quantile import _quantile_aicpu