forked from mindspore-Ecosystem/mindspore
!47329 migrates aicpu ops from CANN pr
Merge pull request !47329 from 李林杰/1229_migrates_aicpu_from_CAAN_pr
commit 76f46b52c4
@@ -83,18 +83,10 @@
"mindspore/mindspore/lite/tools/kernel_builder/ascend/tbe_tik/sample/op_proto/matmul_tik.cc" "syntaxError"

# AICPU migration
"mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/" "useStlAlgorithm"
"mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/" "variableScope"
"mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/" "constParameter"
"mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/" "constVariable"
"mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/" "unreadVariable"
"mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/" "redundantAssignment"
"mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/" "constArgument"
"mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/utils/" "useStlAlgorithm"
"mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/utils/" "variableScope"
"mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/utils/" "constParameter"
"mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/utils/" "constVariable"
"mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/utils/" "unreadVariable"
"mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/" "nullPointerRedundantCheck"
"mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/" "variableScope"
"mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/" "unreadVariable"
@@ -104,3 +96,4 @@
"mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/" "knownConditionTrueFalse"
"mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/" "passedByValue"
"mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/" "uninitvar"
"mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/" "shadowVariable"
@@ -280,3 +280,5 @@ mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel
mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/multinomial.cc:aicpu::Generate
mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/matrix_set_diag_v3.cc:aicpu::MatrixSetDiagV3CpuKernel::DoCompute
mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/max_unpool_2d.cc:aicpu::MaxUnpool2DCpuKernel::MaxUnpool2DCompute
mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/matrix_solve_ls.cc:aicpu::MatrixSolveLsCpuKernel::Compute
mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/col2im.cc:aicpu::Col2imCpuKernel::Col2imParamCheck
@@ -163,6 +163,7 @@ constexpr auto kClipBoxesDOpName = "kClipBoxesD";
constexpr auto kClipByNormNoDivSumOpName = "ClipByNormNoDivSum";
constexpr auto kClipByValueOpName = "ClipByValue";
constexpr auto kCoalesceOpName = "Coalesce";
constexpr auto kCol2imOpName = "Col2im";
constexpr auto kCombineMomentumOpName = "CombineMomentum";
constexpr auto kCombineMomentumWeightOpName = "CombineMomentumWeight";
constexpr auto kComputeAccidentalHitsOpName = "ComputeAccidentalHits";
@@ -209,6 +210,7 @@ constexpr auto kCumSumOpName = "CumSum";
constexpr auto kDataFormatDimMapOpName = "DataFormatDimMap";
constexpr auto kCumulativeLogsumexpOpName = "CumulativeLogsumexp";
constexpr auto kCumulativeLogsumexpDOpName = "CumulativeLogsumexpD";
constexpr auto kDataFormatVecPermuteOpName = "DataFormatVecPermute";
constexpr auto kDeadNodeName = "DeadNode";
constexpr auto kDenseToDenseSetOperation = "DenseToDenseSetOperation";
constexpr auto kDenseToSparseSetOperation = "DenseToSparseSetOperation";
@@ -346,6 +348,8 @@ constexpr auto kInplaceUpdateOpName = "InplaceUpdate";
constexpr auto kInplaceUpdateDOpName = "InplaceUpdateD";
constexpr auto kInstanceNorm = "InstanceNorm";
constexpr auto kInstanceNormGradOpName = "InstanceNormGrad";
constexpr auto kInstanceNormV2OpName = "InstanceNormV2";
constexpr auto kInstanceNormV2GradOpName = "InstanceNormV2Grad";
constexpr auto kInTopKOpName = "InTopK";
constexpr auto kInTopKDOpName = "InTopKD";
constexpr auto kIsInfOpName = "IsInf";
@@ -403,6 +407,7 @@ constexpr auto kMatrixLogarithmOpName = "MatrixLogarithm";
constexpr auto kMatrixSetDiagOpName = "MatrixSetDiag";
constexpr auto kMatrixSetDiagDOpName = "MatrixSetDiagD";
constexpr auto kMatrixSetDiagV3OpName = "MatrixSetDiagV3";
constexpr auto kMatrixSolveLsOpName = "MatrixSolveLs";
constexpr auto kMaximumGradOpName = "MaximumGrad";
constexpr auto kMaximumOpName = "Maximum";
constexpr auto kMaxPool3DGradGradOpName = "MaxPool3DGradGrad";
@@ -422,6 +427,7 @@ constexpr auto kMedianGradOpName = "MedianGrad";
constexpr auto kMemCpyAsyncOpName = "memcpy_async";
constexpr auto kMinimumGradOpName = "MinimumGrad";
constexpr auto kMinimumOpName = "Minimum";
constexpr auto kMirrorPadOpName = "MirrorPad";
constexpr auto kMomentumOpName = "Momentum";
constexpr auto kMulOpName = "Mul";
constexpr auto kMultinomialOpName = "Multinomial";
@@ -439,6 +445,7 @@ constexpr auto kNonZeroOpName = "NonZero";
constexpr auto kNPUAllocFloatStatusOpName = "NPUAllocFloatStatus";
constexpr auto kNPUClearFloatStatusOpName = "NPUClearFloatStatus";
constexpr auto kNPUGetFloatStatusOpName = "NPUGetFloatStatus";
constexpr auto kNuclearNormOpName = "NuclearNorm";
constexpr auto kOneHotOpName = "OneHot";
constexpr auto kOneHotDOpName = "OneHotD";
constexpr auto kPadAndShiftOpName = "PadAndShift";
@@ -472,6 +479,7 @@ constexpr auto kPullWeightOpName = "PullWeight";
constexpr auto kPushOpName = "Push";
constexpr auto kQrOpName = "Qr";
constexpr auto kPushWeightOpName = "PushWeight";
constexpr auto kQuantileOpName = "Quantile";
constexpr auto kRandomChoiceWithMaskOpName = "RandomChoiceWithMask";
constexpr auto kRandomShuffleOpName = "RandomShuffle";
constexpr auto kRangeOpName = "Range";
@@ -591,6 +599,7 @@ constexpr auto kSparseSliceOpName = "SparseSlice";
constexpr auto kSparseSoftmaxCrossEntropyWithLogitsOpName = "SparseSoftmaxCrossEntropyWithLogits";
constexpr auto kSparseSparseMinimumOpName = "SparseSparseMinimum";
constexpr auto kSparseSparseMaximumOpName = "SparseSparseMaximum";
constexpr auto kSparseTensorDenseMatMulOpName = "SparseTensorDenseMatMul";
constexpr auto kSplitOpName = "Split";
constexpr auto kSplitDOpName = "SplitD";
constexpr auto kSplitVOpName = "SplitV";
@@ -604,6 +613,7 @@ constexpr auto kStackDestroyOpName = "StackDestroy";
constexpr auto kStackInitOpName = "StackInit";
constexpr auto kStackOpName = "Stack";
constexpr auto kPackOpName = "Pack";
constexpr auto kSparseSegmentSqrtNOpName = "SparseSegmentSqrtN";
constexpr auto kStackPopOpName = "StackPop";
constexpr auto kStackPushOpName = "StackPush";
constexpr auto kStandardLaplaceOpName = "StandardLaplace";
@@ -0,0 +1,236 @@
/**
 * Copyright 2021 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include "col2im.h"

#include <algorithm>
#include <vector>

#include "cpu_ops_kernel.h"
#include "cpu_kernel_utils.h"
#include "status.h"
#include "utils/eigen_tensor.h"
#include "utils/kernel_util.h"

namespace {
const uint32_t kCol2imInputNum = 2;
const uint32_t kCol2imOutputNum = 1;
constexpr uint32_t kValue0 = 0;
constexpr uint32_t kValue1 = 1;
constexpr uint32_t kValue2 = 2;
constexpr uint32_t kValue4 = 4;
constexpr uint32_t kIndex0 = 0;
constexpr uint32_t kIndex1 = 1;
constexpr uint32_t kIndex2 = 2;
constexpr uint32_t kIndex3 = 3;
const char *kCol2im = "Col2im";
}  // namespace

namespace aicpu {
uint32_t Col2imCpuKernel::Compute(CpuKernelContext &ctx) {
  KERNEL_HANDLE_ERROR(Col2imParamCheck(ctx), "[%s] check params failed.", kCol2im);
  auto data_type = ctx.Input(0)->GetDataType();
  uint32_t ret = KERNEL_STATUS_OK;
  switch (data_type) {
    case DT_FLOAT:
      ret = Col2imCompute<float>(ctx);
      break;
    case DT_FLOAT16:
      ret = Col2imCompute<Eigen::half>(ctx);
      break;
    default:
      KERNEL_LOG_ERROR("Col2im kernel data type [%s] not supported.", DTypeStr(data_type).c_str());
      ret = KERNEL_STATUS_PARAM_INVALID;
      break;
  }

  return ret;
}

template <typename T>
static inline T div_rtn(T x, T y) {
  // Floor division: round the quotient toward negative infinity.
  T q = x / y;
  T r = x % y;
  if ((r != 0) && ((r < 0) != (y < 0))) {
    --q;
  }
  return q;
}

uint32_t Col2imCpuKernel::Col2imParamCheck(CpuKernelContext &ctx) {
  KERNEL_HANDLE_ERROR(NormalCheck(ctx, kCol2imInputNum, kCol2imOutputNum), "[%s] check params failed.", kCol2im);
  Tensor *input_ = ctx.Input(0);
  Tensor *output_size_ = ctx.Input(1);
  KERNEL_CHECK_NULLPTR(ctx.GetAttr("kernel_size"), KERNEL_STATUS_PARAM_INVALID,
                       "Get ctx.GetAttr(\"kernel_size\") failed.");
  KERNEL_CHECK_NULLPTR(ctx.GetAttr("dilation"), KERNEL_STATUS_PARAM_INVALID, "Get ctx.GetAttr(\"dilation\") failed.");
  KERNEL_CHECK_NULLPTR(ctx.GetAttr("padding"), KERNEL_STATUS_PARAM_INVALID, "Get ctx.GetAttr(\"padding\") failed.");
  KERNEL_CHECK_NULLPTR(ctx.GetAttr("stride"), KERNEL_STATUS_PARAM_INVALID, "Get ctx.GetAttr(\"stride\") failed.");
  std::vector<int64_t> kernel_size = ctx.GetAttr("kernel_size")->GetListInt();
  std::vector<int64_t> dilation = ctx.GetAttr("dilation")->GetListInt();
  std::vector<int64_t> padding = ctx.GetAttr("padding")->GetListInt();
  std::vector<int64_t> stride = ctx.GetAttr("stride")->GetListInt();
  auto output_size_shape = output_size_->GetTensorShape()->GetDimSizes();
  KERNEL_CHECK_FALSE((output_size_shape.size() == kValue1 && output_size_->NumElements() == kValue2),
                     KERNEL_STATUS_PARAM_INVALID,
                     "Expected a 1D output_size tensor with non-zero dimensions and "
                     "2 elements, but got a %dD tensor for output_size with %d elements.",
                     output_size_shape.size(), output_size_->NumElements());
  KERNEL_CHECK_FALSE(kernel_size.size() == kValue2, KERNEL_STATUS_PARAM_INVALID,
                     "It is expected kernel_size's size equals to 2, but got size %d.", kernel_size.size());
  KERNEL_CHECK_FALSE(dilation.size() == kValue2, KERNEL_STATUS_PARAM_INVALID,
                     "It is expected dilation's size equals to 2, but got size %d.", dilation.size());
  KERNEL_CHECK_FALSE(padding.size() == kValue2, KERNEL_STATUS_PARAM_INVALID,
                     "It is expected padding's size equals to 2, but got size %d.", padding.size());
  KERNEL_CHECK_FALSE(stride.size() == kValue2, KERNEL_STATUS_PARAM_INVALID,
                     "It is expected stride's size equals to 2, but got size %d.", stride.size());
  int32_t *output_size_data = reinterpret_cast<int32_t *>(output_size_->GetData());
  std::vector<int64_t> output_size(kValue2, kValue0);
  output_size[kIndex0] = output_size_data[kIndex0];
  output_size[kIndex1] = output_size_data[kIndex1];
  const int64_t output_height = output_size.front();
  const int64_t output_width = output_size.back();
  const int64_t kernel_height = kernel_size.front();
  const int64_t kernel_width = kernel_size.back();
  const int64_t dilation_height = dilation.front();
  const int64_t dilation_width = dilation.back();
  const int64_t pad_height = padding.front();
  const int64_t pad_width = padding.back();
  const int64_t stride_height = stride.front();
  const int64_t stride_width = stride.back();
  KERNEL_CHECK_FALSE(output_width > kValue0 && output_height > kValue0, KERNEL_STATUS_PARAM_INVALID,
                     "output should be greater than zero, but got "
                     "output_height: %d output_width: %d.",
                     output_height, output_width);
  KERNEL_CHECK_FALSE(kernel_width > kValue0 && kernel_height > kValue0, KERNEL_STATUS_PARAM_INVALID,
                     "kernel should be greater than zero, but got "
                     "kernel_height: %d kernel_width: %d.",
                     kernel_height, kernel_width);
  KERNEL_CHECK_FALSE(dilation_width > kValue0 && dilation_height > kValue0, KERNEL_STATUS_PARAM_INVALID,
                     "dilation should be greater than zero, but got "
                     "dilation_height: %d dilation_width: %d.",
                     dilation_height, dilation_width);
  KERNEL_CHECK_FALSE(pad_width >= kValue0 && pad_height >= kValue0, KERNEL_STATUS_PARAM_INVALID,
                     "padding should be non-negative, but got pad_height: "
                     "%d pad_width: %d.",
                     pad_height, pad_width);
  KERNEL_CHECK_FALSE(stride_width > kValue0 && stride_height > kValue0, KERNEL_STATUS_PARAM_INVALID,
                     "stride should be greater than zero, but got "
                     "stride_height: %d stride_width: %d.",
                     stride_height, stride_width);
  auto input_shape = input_->GetTensorShape()->GetDimSizes();
  KERNEL_CHECK_FALSE(
    (input_shape.size() == kValue4 && input_shape[kIndex0] != kValue0 && input_shape[kIndex1] != kValue0 &&
     input_shape[kIndex2] != kValue0 && input_shape[kIndex3] != kValue0),
    KERNEL_STATUS_PARAM_INVALID,
    "Expected 4D (batch mode) tensor for input with non-zero "
    "batch size and non-zero dimensions for input, but got %dD input: (%d %d "
    "%d %d).",
    input_shape.size(), input_shape[kIndex0], input_shape[kIndex1], input_shape[kIndex2], input_shape[kIndex3]);
  KERNEL_CHECK_FALSE(input_shape[kIndex2] == (kernel_width * kernel_height), KERNEL_STATUS_PARAM_INVALID,
                     "Expected size of input's dimension 2 to match the calculated "
                     "number of kernel_size, but got input_shape[2]=%d and kernel_size=(%d, "
                     "%d).",
                     input_shape[kIndex2], kernel_height, kernel_width);
  auto input_length = input_shape[kIndex3];
  int64_t n_blocks_height =
    div_rtn<int64_t>(output_height + 2 * pad_height - dilation_height * (kernel_height - 1) - 1, stride_height) + 1;
  int64_t n_blocks_width =
    div_rtn<int64_t>(output_width + 2 * pad_width - dilation_width * (kernel_width - 1) - 1, stride_width) + 1;
  KERNEL_CHECK_FALSE(input_length == (n_blocks_height * n_blocks_width), KERNEL_STATUS_PARAM_INVALID,
                     "Given output_size=(%d, %d), kernel_size=(%d, %d), dilation=(%d, %d"
                     "), padding=(%d, %d), stride=(%d, %d), expected size of input's "
                     "dimension 3 to match the calculated "
                     "number of sliding blocks %d * %d = %d, but got input.size(3)=%d.",
                     output_height, output_width, kernel_height, kernel_width, dilation_height, dilation_width,
                     pad_height, pad_width, stride_height, stride_width, n_blocks_height, n_blocks_width,
                     (n_blocks_height * n_blocks_width), input_length);
  return KERNEL_STATUS_OK;
}

template <typename T>
void Col2imCpuKernel::InnerCompute(int64_t c_col, int64_t input_offset, int64_t output_offset, T *input_data,
                                   T *output_data) {
  int64_t w_offset = c_col % kernel_width;
  int64_t h_offset = (c_col / kernel_width) % kernel_height;
  int64_t c_im = c_col / kernel_height / kernel_width;
  for (int64_t h_col = 0; h_col < height_col; ++h_col) {
    int64_t h_im = h_col * stride_height - pad_height + h_offset * dilation_height;
    for (int64_t w_col = 0; w_col < width_col; ++w_col) {
      int64_t w_im = w_col * stride_width - pad_width + w_offset * dilation_width;
      // Scatter-add each column element back into its image position, skipping padded positions.
      if (h_im >= 0 && h_im < output_height && w_im >= 0 && w_im < output_width) {
        output_data[output_offset + (c_im * output_height + h_im) * output_width + w_im] +=
          input_data[input_offset + (c_col * height_col + h_col) * width_col + w_col];
      }
    }
  }
}

template <typename T>
uint32_t Col2imCpuKernel::Col2imCompute(CpuKernelContext &ctx) {
  Tensor *input_ = ctx.Input(0);
  Tensor *output_size_ = ctx.Input(1);
  Tensor *output_ = ctx.Output(0);
  int32_t *output_size_data = reinterpret_cast<int32_t *>(output_size_->GetData());
  std::vector<int64_t> output_size(kValue2, kValue0);
  output_size[kIndex0] = output_size_data[kIndex0];
  output_size[kIndex1] = output_size_data[kIndex1];

  std::vector<int64_t> kernel_size = ctx.GetAttr("kernel_size")->GetListInt();
  std::vector<int64_t> dilation = ctx.GetAttr("dilation")->GetListInt();
  std::vector<int64_t> padding = ctx.GetAttr("padding")->GetListInt();
  std::vector<int64_t> stride = ctx.GetAttr("stride")->GetListInt();

  output_height = output_size.front();
  output_width = output_size.back();
  kernel_height = kernel_size.front();
  kernel_width = kernel_size.back();
  dilation_height = dilation.front();
  dilation_width = dilation.back();
  pad_height = padding.front();
  pad_width = padding.back();
  stride_height = stride.front();
  stride_width = stride.back();

  auto input_shape = input_->GetTensorShape()->GetDimSizes();
  const int64_t batch_size = input_shape[kIndex0];
  const int64_t n_input_plane = input_shape[kIndex1];

  height_col =
    (output_height + kValue2 * pad_height - (dilation_height * (kernel_height - kValue1) + kValue1)) / stride_height +
    1;
  width_col =
    (output_width + kValue2 * pad_width - (dilation_width * (kernel_width - kValue1) + kValue1)) / stride_width + 1;

  T *input_data = reinterpret_cast<T *>(input_->GetData());
  T *output_data = reinterpret_cast<T *>(output_->GetData());
  std::fill_n(output_data, output_->NumElements(), T(0));
  channels_col = n_input_plane * kernel_height * kernel_width;
  batch_input_size = n_input_plane * kernel_height * kernel_width * height_col * width_col;
  batch_output_size = n_input_plane * output_height * output_width;
  for (int64_t elt = 0; elt < batch_size; ++elt) {
    int64_t input_offset = batch_input_size * elt;
    int64_t output_offset = batch_output_size * elt;
    for (int64_t c_col = 0; c_col < channels_col; ++c_col) {
      InnerCompute<T>(c_col, input_offset, output_offset, input_data, output_data);
    }
  }

  return KERNEL_STATUS_OK;
}

REGISTER_CPU_KERNEL(kCol2im, Col2imCpuKernel);
}  // namespace aicpu
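
The parameter check above follows the usual sliding-window bookkeeping: along each spatial axis the number of column blocks the input must carry is floor((output + 2*pad - dilation*(kernel - 1) - 1) / stride) + 1, and InnerCompute then scatters column (c_col, h_col, w_col) back into image position (c_im, h_im, w_im), accumulating overlapping contributions. A minimal standalone sketch of that block count, assuming plain C++ outside the CpuKernel framework (the helper name is illustrative, not part of the kernel):

#include <cstdint>

// Number of sliding blocks along one spatial axis; matches the div_rtn-based
// expression in Col2imParamCheck when the numerator is non-negative.
inline int64_t NumSlidingBlocks(int64_t output, int64_t pad, int64_t dilation, int64_t kernel, int64_t stride) {
  return (output + 2 * pad - dilation * (kernel - 1) - 1) / stride + 1;
}

// Example: output=4, pad=0, dilation=1, kernel=2, stride=1 gives 3 blocks per axis,
// so a valid Col2im input has input.size(3) == 3 * 3 = 9.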
@@ -0,0 +1,50 @@
/**
 * Copyright 2021 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#ifndef AICPU_KERNELS_NORMALIZED_COL2IM_H_
#define AICPU_KERNELS_NORMALIZED_COL2IM_H_

#include "cpu_ops_kernel.h"
#include "utils/bcast.h"

namespace aicpu {
class Col2imCpuKernel : public CpuKernel {
 public:
  Col2imCpuKernel() = default;
  ~Col2imCpuKernel() override = default;

 protected:
  uint32_t Compute(CpuKernelContext &ctx) override;

 private:
  uint32_t Col2imParamCheck(CpuKernelContext &ctx);
  template <typename T>
  uint32_t Col2imCompute(CpuKernelContext &ctx);
  template <typename T>
  void InnerCompute(int64_t c_col, int64_t input_offset, int64_t output_offset, T *input_data, T *output_data);

  int64_t output_height, output_width;
  int64_t kernel_height, kernel_width;
  int64_t dilation_height, dilation_width;
  int64_t pad_height, pad_width;
  int64_t stride_height, stride_width;

  int64_t height_col, width_col;

  int64_t channels_col, batch_input_size, batch_output_size;
};
}  // namespace aicpu
#endif
@@ -0,0 +1,211 @@
/**
 * Copyright (c) Huawei Technologies Co., Ltd. 2022-2022. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 **/
#include "cumulativelogsumexp.h"

#include <cmath>

#include "cpu_kernel_utils.h"
#include "utils/eigen_tensor.h"
#include "utils/kernel_util.h"

namespace {
const uint32_t KCumulativeLogsumexpInputNum = 2;
const uint32_t KCumulativeLogsumexpOutputNum = 1;
const float float16_exclusive_data = -65504e+0;
const float float_exclusive_data = -3.4028235e+38;
const double double_exclusive_data = -1.7976931348623157e+308;
const int64_t ParallelFor_size_float16 = 16 * 1024;
const int64_t ParallelFor_size_float32 = 32 * 1024;
const int64_t ParallelFor_size_double = 64 * 1024;
const char *KCumulativeLogsumexp = "CumulativeLogsumexp";
#define CUMULATIVELOGSUMEXP_COMPUTE_CASE(DTYPE, IN_TYPE, CTX)           \
  case (DTYPE): {                                                       \
    uint32_t result = CumulativeLogsumexpCompute<IN_TYPE>(CTX);         \
    if (result != KERNEL_STATUS_OK) {                                   \
      KERNEL_LOG_ERROR("CumulativeLogsumexp kernel compute failed.");   \
      return result;                                                    \
    }                                                                   \
    break;                                                              \
  }
}  // namespace
namespace aicpu {
uint32_t CumulativeLogsumexpCpuKernel::Compute(CpuKernelContext &ctx) {
  // check params
  KERNEL_HANDLE_ERROR(NormalCheck(ctx, KCumulativeLogsumexpInputNum, KCumulativeLogsumexpOutputNum),
                      "[%s] check input and output failed.", KCumulativeLogsumexp);
  KERNEL_HANDLE_ERROR(CumulativeLogsumexpCheck(ctx), "[%s] check params failed.", KCumulativeLogsumexp);
  auto data_type = ctx.Input(0)->GetDataType();
  switch (data_type) {
    CUMULATIVELOGSUMEXP_COMPUTE_CASE(DT_FLOAT16, Eigen::half, ctx)
    CUMULATIVELOGSUMEXP_COMPUTE_CASE(DT_FLOAT, float, ctx)
    CUMULATIVELOGSUMEXP_COMPUTE_CASE(DT_DOUBLE, double, ctx)
    default:
      KERNEL_LOG_ERROR("CumulativeLogsumexp kernel data type [%s] not supported.", DTypeStr(data_type).c_str());
      return KERNEL_STATUS_PARAM_INVALID;
  }
  return KERNEL_STATUS_OK;
}
uint32_t CumulativeLogsumexpCpuKernel::CumulativeLogsumexpCheck(CpuKernelContext &ctx) {
  KERNEL_CHECK_FALSE((ctx.Input(1)->GetDataType() == DT_INT16 || ctx.Input(1)->GetDataType() == DT_INT32),
                     KERNEL_STATUS_PARAM_INVALID, "Data type of axis is not supported, axis data type is [%u].",
                     ctx.Input(1)->GetDataType())
  KERNEL_CHECK_FALSE(ctx.Input(1)->NumElements() == 1, KERNEL_STATUS_PARAM_INVALID, "axis is out of shape");
  auto axis_data = static_cast<int32_t *>(ctx.Input(1)->GetData());
  int64_t axis = *axis_data;
  KERNEL_CHECK_FALSE((axis < ctx.Input(0)->GetTensorShape()->GetDims()), KERNEL_STATUS_PARAM_INVALID,
                     "axis is larger than input dims - 1")
  KERNEL_CHECK_FALSE((axis >= -ctx.Input(0)->GetTensorShape()->GetDims()), KERNEL_STATUS_PARAM_INVALID,
                     "axis is lower than -input dims")
  std::vector<int64_t> shape_input = ctx.Input(0)->GetTensorShape()->GetDimSizes();
  std::vector<int64_t> shape_output = ctx.Output(0)->GetTensorShape()->GetDimSizes();
  KERNEL_CHECK_FALSE((shape_input.size() != 0), KERNEL_STATUS_PARAM_INVALID,
                     "Input must be at least rank 1, got [%zu].", shape_input.size())
  KERNEL_CHECK_FALSE((shape_input.size() == shape_output.size()), KERNEL_STATUS_PARAM_INVALID,
                     "The output shape size should be same as the input shape size")
  DataType input0_type = ctx.Input(0)->GetDataType();
  DataType output0_type = ctx.Output(0)->GetDataType();
  KERNEL_CHECK_FALSE((input0_type == output0_type), KERNEL_STATUS_PARAM_INVALID,
                     "The data type of input0 [%s] need be same with output0 [%s].", DTypeStr(input0_type).c_str(),
                     DTypeStr(output0_type).c_str())
  return KERNEL_STATUS_OK;
}
template <typename T>
void CumulativeProcess(size_t outer_start, size_t outer_end, uint32_t outer, uint32_t inner, uint32_t depth,
                       bool reverse, bool exclusive, T *input_data, T *output_data, DataType data_type) {
  // Process the [outer_start, outer_end) slice of the outer dimension; each (outer, inner)
  // pair owns an independent cumulative scan along depth, so slices can run in parallel.
  for (size_t outer_index = outer_start; outer_index < outer_end; ++outer_index) {
    size_t outer_index_adj;
    if (reverse) {
      outer_index_adj = (outer - 1) - outer_index;
    } else {
      outer_index_adj = outer_index;
    }
    for (size_t inner_index = 0; inner_index < inner; ++inner_index) {
      double one = 1;
      double temp = 0;
      size_t inner_index_adj;
      if (reverse) {
        inner_index_adj = (inner - 1) - inner_index;
      } else {
        inner_index_adj = inner_index;
      }
      for (size_t depth_index = 0; depth_index < depth; ++depth_index) {
        size_t depth_index_adj;
        if (reverse) {
          depth_index_adj = (depth - 1) - depth_index;
        } else {
          depth_index_adj = depth_index;
        }
        size_t index = outer_index_adj;
        index += inner_index_adj * depth * outer;
        index += depth_index_adj * outer;
        if (exclusive) {
          if (depth_index == 0) {
            if (data_type == DT_FLOAT16) {
              output_data[index] = static_cast<T>(float16_exclusive_data);
            } else if (data_type == DT_FLOAT) {
              output_data[index] = static_cast<T>(float_exclusive_data);
            } else {
              output_data[index] = static_cast<T>(double_exclusive_data);
            }
            temp = static_cast<double>(input_data[index]);
          } else {
            output_data[index] = static_cast<T>(temp);
            double a = temp;
            double b, min0, max0;
            b = static_cast<double>(input_data[index]);
            min0 = (a < b) ? a : b;
            max0 = (a > b) ? a : b;
            temp = std::log(one + std::exp(min0 - max0)) + max0;
          }
        } else {
          if (depth_index == 0) {
            output_data[index] = input_data[index];
            temp = static_cast<double>(input_data[index]);
          } else {
            double a = temp;
            double b, min0, max0;
            b = static_cast<double>(input_data[index]);
            min0 = (a < b) ? a : b;
            max0 = (a > b) ? a : b;
            output_data[index] = static_cast<T>(std::log(one + std::exp(min0 - max0)) + max0);
            temp = std::log(one + std::exp(min0 - max0)) + max0;
          }
        }
      }
    }
  }
}
template <typename T>
uint32_t CumulativeLogsumexpCpuKernel::CumulativeLogsumexpCompute(CpuKernelContext &ctx) {
  auto input_data = static_cast<T *>(ctx.Input(0)->GetData());
  auto axis_data = static_cast<int32_t *>(ctx.Input(1)->GetData());
  bool exclusive = false;
  bool reverse = false;
  AttrValue *exclusive_attr = ctx.GetAttr("exclusive");
  if (exclusive_attr != nullptr) {
    exclusive = exclusive_attr->GetBool();
  }
  AttrValue *reverse_attr = ctx.GetAttr("reverse");
  if (reverse_attr != nullptr) {
    reverse = reverse_attr->GetBool();
  }
  int32_t axis = 0;
  if (axis_data != nullptr) {
    axis = *axis_data;
  }
  auto output_data = static_cast<T *>(ctx.Output(0)->GetData());
  auto shape = ctx.Input(0)->GetTensorShape();
  const int64_t rank = shape->GetDims();
  if (axis < 0) {
    axis += shape->GetDims();
  }
  uint32_t inner = 1;
  uint32_t outer = 1;
  uint32_t depth = 1;
  for (int32_t i = 0; i < rank; ++i) {
    if (i < axis) {
      inner *= shape->GetDimSize(i);
    } else if (i > axis) {
      outer *= shape->GetDimSize(i);
    } else {
      depth = shape->GetDimSize(i);
    }
  }  // end for
  auto data_type = ctx.Input(0)->GetDataType();
  int64_t data_num = ctx.Input(0)->NumElements();
  int64_t data_size = data_num * sizeof(T);
  if ((data_type == DT_FLOAT16 && data_size <= ParallelFor_size_float16) ||
      (data_type == DT_FLOAT && data_size <= ParallelFor_size_float32) ||
      (data_type == DT_DOUBLE && data_size <= ParallelFor_size_double)) {
    CumulativeProcess<T>(0, outer, outer, inner, depth, reverse, exclusive, input_data, output_data, data_type);
  } else {
    uint32_t min_core_num = 1;
    uint32_t max_core_num = std::max(min_core_num, aicpu::CpuKernelUtils::GetCPUNum(ctx) - 2);
    if (max_core_num > outer) {
      max_core_num = outer;
    }
    auto shard_cumulativelogsumexp = [&](size_t start, size_t end) {
      // Each shard handles only its own [start, end) range of the outer dimension.
      CumulativeProcess<T>(start, end, outer, inner, depth, reverse, exclusive, input_data, output_data, data_type);
    };
    if (max_core_num == 0) {
      return KERNEL_STATUS_PARAM_INVALID;
    }
    KERNEL_HANDLE_ERROR(CpuKernelUtils::ParallelFor(ctx, outer, outer / max_core_num, shard_cumulativelogsumexp),
                        "CumulativeLogsumexp Compute failed.");
  }  // end else
  return KERNEL_STATUS_OK;
}
REGISTER_CPU_KERNEL(KCumulativeLogsumexp, CumulativeLogsumexpCpuKernel);
}  // namespace aicpu
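
The inner update in CumulativeProcess is the standard numerically stable log-add-exp recurrence, y_i = log(exp(y_{i-1}) + exp(x_i)), evaluated as max + log(1 + exp(min - max)) so the larger operand never overflows. A minimal 1-D sketch of the inclusive, forward case, assuming plain C++ outside the kernel framework (function names are illustrative only):

#include <algorithm>
#include <cmath>
#include <vector>

// Stable pairwise log-add-exp, the same identity used element-wise above.
inline double LogAddExp(double a, double b) {
  const double hi = std::max(a, b);
  const double lo = std::min(a, b);
  return hi + std::log1p(std::exp(lo - hi));
}

// Inclusive forward cumulative logsumexp over a 1-D vector; the kernel additionally
// handles the exclusive and reverse modes and an arbitrary axis.
std::vector<double> CumulativeLogSumExp1D(const std::vector<double> &x) {
  std::vector<double> y(x.size());
  double acc = 0.0;
  for (size_t i = 0; i < x.size(); ++i) {
    acc = (i == 0) ? x[i] : LogAddExp(acc, x[i]);
    y[i] = acc;
  }
  return y;
}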
@@ -0,0 +1,38 @@
/**
 * Copyright (c) Huawei Technologies Co., Ltd. 2022-2022. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#ifndef AICPU_KERNELS_NORMALIZED_CUMULATIVELOGSUMEXP_H_
#define AICPU_KERNELS_NORMALIZED_CUMULATIVELOGSUMEXP_H_

#include "cpu_ops_kernel.h"

namespace aicpu {
class CumulativeLogsumexpCpuKernel : public CpuKernel {
 public:
  CumulativeLogsumexpCpuKernel() = default;
  ~CumulativeLogsumexpCpuKernel() override = default;

 protected:
  uint32_t Compute(CpuKernelContext &ctx) override;

 private:
  uint32_t CumulativeLogsumexpCheck(CpuKernelContext &ctx);

  template <typename T>
  uint32_t CumulativeLogsumexpCompute(CpuKernelContext &ctx);
};
}  // namespace aicpu
#endif
@@ -0,0 +1,126 @@
/**
 * Copyright 2021 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
#include "data_format_vec_permute.h"

#include <string>
#include "cpu_kernel_utils.h"
#include "cpu_types.h"
#include "kernel_log.h"
#include "status.h"
#include "utils/kernel_util.h"
using namespace std;

namespace {
const uint32_t kOutputNum = 1;
const uint32_t kInputNum = 1;
const char *kDataFormatVecPermute = "DataFormatVecPermute";

#define DATAFORMATVECPERMUTE_COMPUTE_CASE(DTYPE, TYPE, DIM, SRC_FORMAT_STR, DST_FORMAT_STR, X, Y, CTX)    \
  case (DTYPE): {                                                                                         \
    uint32_t result = DataFormatVecPermuteCompute<TYPE>(DIM, SRC_FORMAT_STR, DST_FORMAT_STR, X, Y, CTX);  \
    if (result != KERNEL_STATUS_OK) {                                                                     \
      KERNEL_LOG_ERROR("DataFormatVecPermute kernel compute failed.");                                    \
      return result;                                                                                      \
    }                                                                                                     \
    break;                                                                                                \
  }

}  // namespace

namespace aicpu {
uint32_t DataFormatVecPermute::Compute(CpuKernelContext &ctx) {
  KERNEL_HANDLE_ERROR(NormalCheck(ctx, kInputNum, kOutputNum), "Check DataFormatVecPermute params failed.");
  AttrValue *src_format = ctx.GetAttr("src_format");
  std::string src_format_str = src_format->GetString();
  KERNEL_CHECK_FALSE((src_format_str.size() == 4), KERNEL_STATUS_PARAM_INVALID,
                     "src_format must be of length 4, but the length of src_format = [%d].", src_format_str.size());
  AttrValue *dst_format = ctx.GetAttr("dst_format");
  std::string dst_format_str = dst_format->GetString();
  KERNEL_CHECK_FALSE((dst_format_str.size() == 4), KERNEL_STATUS_PARAM_INVALID,
                     "dst_format must be of length 4, but the length of dst_format = [%d].", dst_format_str.size());
  Tensor *x = ctx.Input(0);
  auto x_shape = x->GetTensorShape();
  int32_t dim = x_shape->GetDims();
  KERNEL_CHECK_FALSE((dim == 1 || dim == 2), KERNEL_STATUS_PARAM_INVALID,
                     "Input dimension must be 1 or 2, but got dimension = [%d].", dim);
  Tensor *y = ctx.Output(0);
  auto y_shape = y->GetTensorShape();
  if (dim == 1) {
    KERNEL_CHECK_FALSE((x_shape->GetDimSize(0) == 4), KERNEL_STATUS_PARAM_INVALID,
                       "1D Input must be of size 4, but got size %lld.", x_shape->GetDimSize(0));
    KERNEL_CHECK_FALSE((y_shape->GetDimSize(0) == 4), KERNEL_STATUS_PARAM_INVALID,
                       "1D Output must be of size 4, but got size %lld.", y_shape->GetDimSize(0));
  } else if (dim == 2) {
    KERNEL_CHECK_FALSE((x_shape->GetDimSize(0) == 4), KERNEL_STATUS_PARAM_INVALID,
                       "First dimension of 2D Input must be of size 4, but got size %lld.", x_shape->GetDimSize(0));
    KERNEL_CHECK_FALSE((x_shape->GetDimSize(1) == 2), KERNEL_STATUS_PARAM_INVALID,
                       "Second dimension of 2D Input must be of size 2, but got size %lld.", x_shape->GetDimSize(1));
    KERNEL_CHECK_FALSE((y_shape->GetDimSize(0) == 4), KERNEL_STATUS_PARAM_INVALID,
                       "First dimension of 2D Output must be of size 4, but got size %lld.", y_shape->GetDimSize(0));
    KERNEL_CHECK_FALSE((y_shape->GetDimSize(1) == 2), KERNEL_STATUS_PARAM_INVALID,
                       "Second dimension of 2D Output must be of size 2, but got size %lld.", y_shape->GetDimSize(1));
  }

  auto x_type = x->GetDataType();
  auto y_type = y->GetDataType();
  KERNEL_CHECK_FALSE((x_type == y_type), KERNEL_STATUS_PARAM_INVALID,
                     "Input[%s] and output[%s] must have the same DataType.", DTypeStr(x_type).c_str(),
                     DTypeStr(y_type).c_str());
  switch (x_type) {
    DATAFORMATVECPERMUTE_COMPUTE_CASE(DT_INT32, int32_t, dim, src_format_str, dst_format_str, x, y, ctx)
    DATAFORMATVECPERMUTE_COMPUTE_CASE(DT_INT64, int64_t, dim, src_format_str, dst_format_str, x, y, ctx)
    default:
      KERNEL_LOG_ERROR("[%s] Data type of input is not supported, input data type is [%s].", ctx.GetOpType().c_str(),
                       DTypeStr(x_type).c_str());
      return KERNEL_STATUS_PARAM_INVALID;
  }
  return KERNEL_STATUS_OK;
}

template <typename T>
uint32_t DataFormatVecPermute::DataFormatVecPermuteCompute(const int32_t dim, const string &src_format_str,
                                                           const string &dst_format_str, Tensor *x, Tensor *y,
                                                           CpuKernelContext &ctx) {
  T *x_addrs = reinterpret_cast<T *>(x->GetData());
  T *y_addrs = reinterpret_cast<T *>(y->GetData());

  if (dim == 1) {
    for (uint64_t i = 0; i < dst_format_str.size(); i++) {
      for (uint64_t j = 0; j < src_format_str.size(); j++) {
        if (dst_format_str[i] == src_format_str[j]) {
          y_addrs[i] = x_addrs[j];
          break;
        }
      }
    }
  } else if (dim == 2) {
    for (uint64_t i = 0; i < dst_format_str.size(); i++) {
      for (uint64_t j = 0; j < src_format_str.size(); j++) {
        if (dst_format_str[i] == src_format_str[j]) {
          y_addrs[i * 2] = x_addrs[j * 2];
          y_addrs[i * 2 + 1] = x_addrs[j * 2 + 1];
          break;
        }
      }
    }
  }

  return KERNEL_STATUS_OK;
}

REGISTER_CPU_KERNEL(kDataFormatVecPermute, DataFormatVecPermute);
}  // namespace aicpu
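
DataFormatVecPermuteCompute simply reorders entries by matching the characters of dst_format against src_format; in the 2-D case each matched row moves as a (begin, end) pair. A minimal 1-D sketch, assuming a plain std::vector input instead of Tensor (the function name is illustrative):

#include <cstdint>
#include <string>
#include <vector>

// Reorder a length-4 vector from the src layout to the dst layout by matching format characters.
std::vector<int64_t> PermuteByFormat(const std::vector<int64_t> &x, const std::string &src, const std::string &dst) {
  std::vector<int64_t> y(x.size());
  for (size_t i = 0; i < dst.size(); ++i) {
    for (size_t j = 0; j < src.size(); ++j) {
      if (dst[i] == src[j]) {
        y[i] = x[j];
        break;
      }
    }
  }
  return y;
}

// Example: PermuteByFormat({2, 224, 224, 3}, "NHWC", "NCHW") returns {2, 3, 224, 224}.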
@@ -0,0 +1,35 @@
/**
 * Copyright 2021 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
#ifndef AICPU_KERNELS_NORMALIZED_DATA_FORMAT_VEC_PERMUTE_H_
#define AICPU_KERNELS_NORMALIZED_DATA_FORMAT_VEC_PERMUTE_H_

#include <string>
#include "cpu_ops_kernel.h"

namespace aicpu {
class DataFormatVecPermute : public CpuKernel {
 public:
  DataFormatVecPermute() = default;
  ~DataFormatVecPermute() override = default;
  uint32_t Compute(CpuKernelContext &ctx) override;

 private:
  template <typename T>
  uint32_t DataFormatVecPermuteCompute(const int32_t dim, const std::string &src_format_str,
                                       const std::string &dst_format_str, Tensor *x, Tensor *y, CpuKernelContext &ctx);
};
}  // namespace aicpu
#endif
@@ -0,0 +1,455 @@
/**
 * Copyright 2021 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
#include "matrix_solve_ls.h"

#include <Eigen/Cholesky>
#include <Eigen/Dense>
#include <algorithm>
#include <cmath>
#include <complex>
#include <iostream>

#include "cpu_kernel_utils.h"
#include "utils/eigen_tensor.h"
#include "utils/kernel_util.h"

namespace {
const uint32_t kInputNum = 3;
const uint32_t kOutputNum = 1;
const char *MatrixSolveLs = "MatrixSolveLs";
const int64_t kNum2 = 2;
}  // namespace

namespace aicpu {
uint32_t MatrixSolveLsCpuKernel::Compute(CpuKernelContext &ctx) {
  bool qr_chole = (ctx.GetAttr("fast") == nullptr) ? true : ctx.GetAttr("fast")->GetBool();
  KERNEL_HANDLE_ERROR(NormalCheck(ctx, kInputNum, kOutputNum), "MatrixSolveLs check input and output number failed.");

  Tensor *matrix = ctx.Input(kFirstInputIndex);
  Tensor *b = ctx.Input(kSecondInputIndex);
  Tensor *l2 = ctx.Input(2);
  Tensor *x = ctx.Output(0);
  if ((matrix->GetDataSize() == 0) || (b->GetDataSize() == 0)) {
    KERNEL_LOG_ERROR("[%s] Input is empty tensor.", ctx.GetOpType().c_str());
    return KERNEL_STATUS_PARAM_INVALID;
  }

  auto shapea = matrix->GetTensorShape();
  auto shapeb = b->GetTensorShape();
  auto shapel2 = l2->GetTensorShape();
  auto shapex = x->GetTensorShape();
  auto dims = shapea->GetDims();

  if (ctx.Input(1)->GetTensorShape()->GetDims() == 1) {
    if (shapea->GetDimSize(dims - kNum2) != shapeb->GetDimSize(0)) {
      KERNEL_LOG_ERROR(
        "[%s] #Rows mismatch between A and rhs. "
        "#Rows of A = [%llu], #Rows of rhs = [%llu]",
        ctx.GetOpType().c_str(), shapea->GetDimSize(dims - kNum2), shapeb->GetDimSize(0));
      return KERNEL_STATUS_PARAM_INVALID;
    }
  } else {
    if (shapea->GetDimSize(dims - kNum2) != shapeb->GetDimSize(dims - kNum2)) {
      KERNEL_LOG_ERROR(
        "[%s] #Rows mismatch between A and rhs. "
        "#Rows of A = [%llu], #Rows of rhs = [%llu]",
        ctx.GetOpType().c_str(), shapea->GetDimSize(dims - kNum2), shapeb->GetDimSize(dims - kNum2));
      return KERNEL_STATUS_PARAM_INVALID;
    }
  }
  if (shapel2->GetDims() != 0) {
    KERNEL_LOG_ERROR("[%s] Tensor l2 should be a scalar.", ctx.GetOpType().c_str());
    return KERNEL_STATUS_PARAM_INVALID;
  }
  if (ctx.Input(1)->GetTensorShape()->GetDims() == 1) {
    if (shapex->GetDims() != shapeb->GetDims() || shapea->GetDimSize(dims - 1) != shapex->GetDimSize(0) ||
        shapex->GetDimSize(shapex->GetDims() - 1) != shapeb->GetDimSize(0)) {
      KERNEL_LOG_ERROR("[%s] Tensor y shape mismatch.", ctx.GetOpType().c_str());
      return KERNEL_STATUS_PARAM_INVALID;
    }
  } else {
    if (shapex->GetDims() != shapeb->GetDims() ||
        shapea->GetDimSize(dims - 1) != shapex->GetDimSize(shapex->GetDims() - kNum2) ||
        shapex->GetDimSize(shapex->GetDims() - 1) != shapeb->GetDimSize(shapeb->GetDims() - 1)) {
      KERNEL_LOG_ERROR("[%s] Tensor y shape mismatch.", ctx.GetOpType().c_str());
      return KERNEL_STATUS_PARAM_INVALID;
    }
  }

  auto a_data_type = matrix->GetDataType();
  auto b_data_type = b->GetDataType();
  if (a_data_type != b_data_type) {
    KERNEL_LOG_ERROR("[%s] Tensor data type mismatch.", ctx.GetOpType().c_str());
    return KERNEL_STATUS_PARAM_INVALID;
  }
  if (a_data_type != DT_FLOAT && a_data_type != DT_DOUBLE && a_data_type != DT_COMPLEX64 &&
      a_data_type != DT_COMPLEX128) {
    KERNEL_LOG_ERROR("MatrixSolveLs kernel data type [%s] not supported.", DTypeStr(a_data_type).c_str());
    return KERNEL_STATUS_PARAM_INVALID;
  }

  if (qr_chole) {
    if (a_data_type == DT_COMPLEX64) {
      return ComplexCholesky<float>(ctx);
    }
    if (a_data_type == DT_COMPLEX128) {
      return ComplexCholesky<double>(ctx);
    }
    if (a_data_type == DT_DOUBLE) {
      return RealCholesky<double>(ctx);
    }
    if (a_data_type == DT_FLOAT) {
      return RealCholesky<float>(ctx);
    }
  } else {
    if (a_data_type == DT_COMPLEX64) {
      return ComplexQr<float>(ctx);
    }
    if (a_data_type == DT_COMPLEX128) {
      return ComplexQr<double>(ctx);
    }
    if (a_data_type == DT_DOUBLE) {
      return RealQr<double>(ctx);
    }
    if (a_data_type == DT_FLOAT) {
      return RealQr<float>(ctx);
    }
  }
  return KERNEL_STATUS_OK;
}

REGISTER_CPU_KERNEL(MatrixSolveLs, MatrixSolveLsCpuKernel);

template <typename T>
void MatrixSolveLsCpuKernel::RealCholeskySingleCompute(T *aptr, T *bptr, T *xptr, double *l2, int64_t m, int64_t k,
                                                       int64_t n) {
  Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor> a(m, k);
  Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor> x(k, n);
  Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor> b(m, n);
  Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor> a_copy;
  Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor> a_b;

  for (int i = 0; i < m * k; i++) {
    *(a.data() + i) = *(aptr + i);
  }
  for (int i = 0; i < m * n; i++) {
    *(b.data() + i) = *(bptr + i);
  }

  if (m >= k) {
    a_copy =
      a.transpose() * a + ((T)*l2) * Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>::Identity(k, k);
    a_b = a.transpose() * b;
  } else {
    a_copy = a * a.transpose();
    a_b = b;
  }
  for (int64_t i = 0; i < n; i++) {
    Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor> xi = a_copy.ldlt().solve(a_b.col(i));
    if (m < k) {
      xi = a.transpose() * xi;
    }
    x.col(i) = xi;
  }
  for (int64_t i = 0; i < k * n; i++) {
    *(xptr + i) = *(x.data() + i);
  }
}

template <typename T>
uint32_t MatrixSolveLsCpuKernel::RealCholesky(CpuKernelContext &ctx) {
  auto dims = ctx.Input(0)->GetTensorShape()->GetDims();
  auto aptr = reinterpret_cast<T *>(ctx.Input(0)->GetData());
  auto bptr = reinterpret_cast<T *>(ctx.Input(1)->GetData());
  auto xptr = reinterpret_cast<T *>(ctx.Output(0)->GetData());
  auto l2 = reinterpret_cast<double *>(ctx.Input(2)->GetData());
  int64_t m = ctx.Input(0)->GetTensorShape()->GetDimSize(dims - 2);
  int64_t k = ctx.Input(0)->GetTensorShape()->GetDimSize(dims - 1);
  int64_t n = 1;
  if (ctx.Input(1)->GetTensorShape()->GetDims() > 1) {
    n = ctx.Input(1)->GetTensorShape()->GetDimSize(dims - 1);
  }
  int64_t data_num = ctx.Input(0)->NumElements();
  const int64_t mat_size = m * k;
  const int64_t rhs_size = m * n;
  const int64_t res_size = n * k;
  const int64_t batch = data_num / mat_size;
  const int64_t kParallelDataNum = 16 * mat_size;
  const int64_t kParallelDataNumMid = 72 * mat_size;
  if (data_num >= kParallelDataNum) {
    uint32_t min_core_num = 1;
    uint32_t max_core_num = std::max(min_core_num, aicpu::CpuKernelUtils::GetCPUNum(ctx) - kResvCpuNum);

    if (data_num <= kParallelDataNumMid) {
      max_core_num = std::min(max_core_num, 4U);  // up to 4 cpu cores
    }
    auto sharder_matrix_solve_ls = [&](int64_t start, int64_t end) {
      for (int64_t i = start; i < end; i++) {
        RealCholeskySingleCompute(aptr + i * mat_size, bptr + i * rhs_size, xptr + i * res_size, l2, m, k, n);
      }
    };
    KERNEL_HANDLE_ERROR(CpuKernelUtils::ParallelFor(ctx, batch, batch / max_core_num, sharder_matrix_solve_ls),
                        "MatrixSolveLs Compute failed.");
  } else {
    for (int64_t i = 0; i < batch; i++) {
      RealCholeskySingleCompute(aptr + i * mat_size, bptr + i * rhs_size, xptr + i * res_size, l2, m, k, n);
    }
  }
  return KERNEL_STATUS_OK;
}

template <typename T>
void MatrixSolveLsCpuKernel::ComplexCholeskySingleCompute(std::complex<T> *aptr, std::complex<T> *bptr,
                                                          std::complex<T> *xptr, double *l2, int64_t m, int64_t k,
                                                          int64_t n) {
  Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor> A(kNum2 * m, kNum2 * k);
  Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor> x(kNum2 * k, n);
  Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor> b(kNum2 * m, n);
  Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor> a_copy;
  Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor> a_b;
  auto l2value = std::abs(*l2);

  for (int64_t i = 0; i < k; i++) {
    for (int64_t j = 0; j < m; j++) {
      *(A.data() + i + j * kNum2 * k) = std::real(*(aptr + i + j * k));
    }
    for (int64_t j = 0; j < m; j++) {
      *(A.data() + (i + k) + (j + m) * kNum2 * k) = std::real(*(aptr + i + j * k));
    }
    for (int64_t j = 0; j < m; j++) {
      *(A.data() + (i + k) + j * kNum2 * k) = -std::imag(*(aptr + i + j * k));
    }
    for (int64_t j = 0; j < m; j++) {
      *(A.data() + i + (j + m) * kNum2 * k) = std::imag(*(aptr + i + j * k));
    }
  }
  for (int64_t i = 0; i < n; i++) {
    for (int64_t j = 0; j < m; j++) {
      *(b.data() + i + j * n) = std::real(*(bptr + i + j * n));
      *(b.data() + i + (j + m) * n) = std::imag(*(bptr + i + j * n));
    }
  }

  if (m >= k) {
    a_copy =
      A.transpose() * A +
      ((T)l2value) * Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>::Identity(kNum2 * k, kNum2 * k);
    a_b = A.transpose() * b;
  } else {
    a_copy =
      A * A.transpose() +
      ((T)l2value) * Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>::Identity(kNum2 * m, kNum2 * m);
    a_b = b;
  }

  Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor> xi;
  for (int64_t i = 0; i < n; i++) {
    xi = a_copy.ldlt().solve(a_b.col(i));
    if (m < k) {
      xi = A.transpose() * xi;
    }
    x.col(i) = xi;
    for (int64_t j = 0; j < k; j++) {
      (xptr + i + j * n)->real(*(x.data() + i + j * n));
      (xptr + i + j * n)->imag(*(x.data() + i + (j + k) * n));
    }
  }
}

template <typename T>
uint32_t MatrixSolveLsCpuKernel::ComplexCholesky(CpuKernelContext &ctx) {
  auto dims = ctx.Input(0)->GetTensorShape()->GetDims();
  auto l2 = reinterpret_cast<double *>(ctx.Input(2)->GetData());
  auto aptr = reinterpret_cast<std::complex<T> *>(ctx.Input(0)->GetData());
  auto bptr = reinterpret_cast<std::complex<T> *>(ctx.Input(1)->GetData());
  auto xptr = reinterpret_cast<std::complex<T> *>(ctx.Output(0)->GetData());
  int64_t m = ctx.Input(0)->GetTensorShape()->GetDimSize(dims - 2);
  int64_t k = ctx.Input(0)->GetTensorShape()->GetDimSize(dims - 1);
  int64_t n = 1;
  if (ctx.Input(1)->GetTensorShape()->GetDims() > 1) {
    n = ctx.Input(1)->GetTensorShape()->GetDimSize(dims - 1);
  }
  int64_t data_num = ctx.Input(0)->NumElements();
  const int64_t mat_size = m * k;
  const int64_t rhs_size = m * n;
  const int64_t res_size = n * k;
  const int64_t batch = data_num / mat_size;
  const int64_t kParallelDataNum = 16 * mat_size;
  const int64_t kParallelDataNumMid = 72 * mat_size;
  if (data_num >= kParallelDataNum) {
    uint32_t min_core_num = 1;
    uint32_t max_core_num = std::max(min_core_num, aicpu::CpuKernelUtils::GetCPUNum(ctx) - kResvCpuNum);
    if (data_num <= kParallelDataNumMid) {
      max_core_num = std::min(max_core_num, 4U);  // up to 4 cpu cores
    }
    auto sharder_matrix_solve_ls = [&](int64_t start, int64_t end) {
      for (int64_t i = start; i < end; i++) {
        ComplexCholeskySingleCompute(aptr + i * mat_size, bptr + i * rhs_size, xptr + i * res_size, l2, m, k, n);
      }
    };
    KERNEL_HANDLE_ERROR(CpuKernelUtils::ParallelFor(ctx, batch, batch / max_core_num, sharder_matrix_solve_ls),
                        "MatrixSolveLs Compute failed.");
  } else {
    for (int64_t i = 0; i < batch; i++) {
      ComplexCholeskySingleCompute(aptr + i * mat_size, bptr + i * rhs_size, xptr + i * res_size, l2, m, k, n);
    }
  }
  return KERNEL_STATUS_OK;
}

template <typename T>
void MatrixSolveLsCpuKernel::RealQrSingleCompute(T *aptr, T *bptr, T *xptr, int64_t m, int64_t k, int64_t n) {
  Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor> a(m, k);
  Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor> x(k, n);
  Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor> b(m, n);

  for (int i = 0; i < m * k; i++) {
    *(a.data() + i) = *(aptr + i);
  }
  for (int i = 0; i < m * n; i++) {
    *(b.data() + i) = *(bptr + i);
  }

  Eigen::ColPivHouseholderQR<Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>> qr_solve(a);

  for (int64_t i = 0; i < n; i++) {
    Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor> xi = qr_solve.solve(b.col(i));
    x.col(i) = xi;
  }
  for (int64_t i = 0; i < k * n; i++) {
    *(xptr + i) = *(x.data() + i);
  }
}

template <typename T>
uint32_t MatrixSolveLsCpuKernel::RealQr(CpuKernelContext &ctx) {
  auto dims = ctx.Input(0)->GetTensorShape()->GetDims();
  auto aptr = reinterpret_cast<T *>(ctx.Input(0)->GetData());
  auto bptr = reinterpret_cast<T *>(ctx.Input(1)->GetData());
  auto xptr = reinterpret_cast<T *>(ctx.Output(0)->GetData());
  int64_t m = ctx.Input(0)->GetTensorShape()->GetDimSize(dims - 2);
  int64_t k = ctx.Input(0)->GetTensorShape()->GetDimSize(dims - 1);
  int64_t n = 1;
  if (ctx.Input(1)->GetTensorShape()->GetDims() > 1) {
    n = ctx.Input(1)->GetTensorShape()->GetDimSize(dims - 1);
  }
  int64_t data_num = ctx.Input(0)->NumElements();
  const int64_t mat_size = m * k;
  const int64_t rhs_size = m * n;
  const int64_t res_size = n * k;
  const int64_t batch = data_num / mat_size;
  const int64_t kParallelDataNum = 16 * mat_size;
  const int64_t kParallelDataNumMid = 72 * mat_size;
  if (data_num >= kParallelDataNum) {
    uint32_t min_core_num = 1;
    uint32_t max_core_num = std::max(min_core_num, aicpu::CpuKernelUtils::GetCPUNum(ctx) - kResvCpuNum);
    if (data_num <= kParallelDataNumMid) {
      max_core_num = std::min(max_core_num, 4U);  // up to 4 cpu cores
    }
    auto sharder_matrix_solve_ls = [&](int64_t start, int64_t end) {
      for (int64_t i = start; i < end; i++) {
        RealQrSingleCompute(aptr + i * mat_size, bptr + i * rhs_size, xptr + i * res_size, m, k, n);
      }
    };
    KERNEL_HANDLE_ERROR(CpuKernelUtils::ParallelFor(ctx, batch, batch / max_core_num, sharder_matrix_solve_ls),
                        "MatrixSolveLs Compute failed.");
  } else {
    for (int64_t i = 0; i < batch; i++) {
      RealQrSingleCompute(aptr + i * mat_size, bptr + i * rhs_size, xptr + i * res_size, m, k, n);
    }
  }
  return KERNEL_STATUS_OK;
}

template <typename T>
void MatrixSolveLsCpuKernel::ComplexQrSingleCompute(std::complex<T> *aptr, std::complex<T> *bptr, std::complex<T> *xptr,
                                                    int64_t m, int64_t k, int64_t n) {
  Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor> A(kNum2 * m, kNum2 * k);
  Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor> x(kNum2 * k, n);
  Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor> b(kNum2 * m, n);
  for (int64_t i = 0; i < k; i++) {
    for (int64_t j = 0; j < m; j++) {
      *(A.data() + i + j * kNum2 * k) = std::real(*(aptr + i + j * k));
    }
    for (int64_t j = 0; j < m; j++) {
      *(A.data() + (i + k) + (j + m) * kNum2 * k) = std::real(*(aptr + i + j * k));
    }
    for (int64_t j = 0; j < m; j++) {
      *(A.data() + (i + k) + j * kNum2 * k) = -std::imag(*(aptr + i + j * k));
    }
    for (int64_t j = 0; j < m; j++) {
      *(A.data() + i + (j + m) * kNum2 * k) = std::imag(*(aptr + i + j * k));
    }
  }
  for (int64_t i = 0; i < n; i++) {
    for (int64_t j = 0; j < m; j++) {
      *(b.data() + i + j * n) = std::real(*(bptr + i + j * n));
      *(b.data() + i + (j + m) * n) = std::imag(*(bptr + i + j * n));
    }
  }

  Eigen::ColPivHouseholderQR<Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>> qr_solve(A);

  for (int64_t i = 0; i < n; i++) {
    Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor> xi = qr_solve.solve(b.col(i));
    x.col(i) = xi;

    for (int64_t j = 0; j < k; j++) {
      (xptr + i + j * n)->real(*(x.data() + i + j * n));
      (xptr + i + j * n)->imag(*(x.data() + i + (j + k) * n));
    }
  }
}

template <typename T>
uint32_t MatrixSolveLsCpuKernel::ComplexQr(CpuKernelContext &ctx) {
  auto dims = ctx.Input(0)->GetTensorShape()->GetDims();
  int64_t m = ctx.Input(0)->GetTensorShape()->GetDimSize(dims - 2);
  int64_t k = ctx.Input(0)->GetTensorShape()->GetDimSize(dims - 1);
  int64_t n = 1;
  if (ctx.Input(1)->GetTensorShape()->GetDims() > 1) {
    n = ctx.Input(1)->GetTensorShape()->GetDimSize(dims - 1);
  }
  int64_t data_num = ctx.Input(0)->NumElements();
  const int64_t mat_size = m * k;
  const int64_t rhs_size = m * n;
  const int64_t res_size = n * k;
  const int64_t batch = data_num / mat_size;
  const int64_t kParallelDataNum = 16 * mat_size;
  const int64_t kParallelDataNumMid = 72 * mat_size;
  auto aptr = reinterpret_cast<std::complex<T> *>(ctx.Input(0)->GetData());
  auto bptr = reinterpret_cast<std::complex<T> *>(ctx.Input(1)->GetData());
  auto xptr = reinterpret_cast<std::complex<T> *>(ctx.Output(0)->GetData());
  if (data_num >= kParallelDataNum) {
    uint32_t min_core_num = 1;
    uint32_t max_core_num = std::max(min_core_num, aicpu::CpuKernelUtils::GetCPUNum(ctx) - kResvCpuNum);
    if (data_num <= kParallelDataNumMid) {
      max_core_num = std::min(max_core_num, 4U);  // up to 4 cpu cores
    }
    auto sharder_matrix_solve_ls = [&](int64_t start, int64_t end) {
      for (int64_t i = start; i < end; i++) {
        ComplexQrSingleCompute(aptr + i * mat_size, bptr + i * rhs_size, xptr + i * res_size, m, k, n);
      }
    };
    KERNEL_HANDLE_ERROR(CpuKernelUtils::ParallelFor(ctx, batch, batch / max_core_num, sharder_matrix_solve_ls),
                        "MatrixSolveLs Compute failed.");
  } else {
    for (int64_t i = 0; i < batch; i++) {
      ComplexQrSingleCompute(aptr + i * mat_size, bptr + i * rhs_size, xptr + i * res_size, m, k, n);
    }
  }
  return KERNEL_STATUS_OK;
}

}  // namespace aicpu
|
@ -0,0 +1,62 @@
|
|||
/**
|
||||
* Copyright 2021 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#ifndef AICPU_KERNELS_NORMALIZED_MATRIX_SOLVE_LS_H_
|
||||
#define AICPU_KERNELS_NORMALIZED_MATRIX_SOLVE_LS_H_
|
||||
|
||||
#include <complex>
|
||||
#include "cpu_ops_kernel.h"
|
||||
#include "cpu_types.h"
|
||||
#include "utils/bcast.h"
|
||||
|
||||
namespace aicpu {
|
||||
class MatrixSolveLsCpuKernel : public CpuKernel {
|
||||
public:
|
||||
MatrixSolveLsCpuKernel() = default;
|
||||
~MatrixSolveLsCpuKernel() override = default;
|
||||
|
||||
protected:
|
||||
uint32_t Compute(CpuKernelContext &ctx) override;
|
||||
|
||||
private:
|
||||
template <typename T>
|
||||
void RealCholeskySingleCompute(T *aptr, T *bptr, T *xptr, double *l2, int64_t m, int64_t k, int64_t n);
|
||||
|
||||
template <typename T>
|
||||
uint32_t RealCholesky(CpuKernelContext &ctx);
|
||||
|
||||
template <typename T>
|
||||
void RealQrSingleCompute(T *aptr, T *bptr, T *xptr, int64_t m, int64_t k, int64_t n);
|
||||
|
||||
template <typename T>
|
||||
uint32_t RealQr(CpuKernelContext &ctx);
|
||||
|
||||
template <typename T>
|
||||
void ComplexCholeskySingleCompute(std::complex<T> *aptr, std::complex<T> *bptr, std::complex<T> *xptr, double *l2,
|
||||
int64_t m, int64_t k, int64_t n);
|
||||
|
||||
template <typename T>
|
||||
uint32_t ComplexCholesky(CpuKernelContext &ctx);
|
||||
|
||||
template <typename T>
|
||||
void ComplexQrSingleCompute(std::complex<T> *aptr, std::complex<T> *bptr, std::complex<T> *xptr, int64_t m, int64_t k,
|
||||
int64_t n);
|
||||
|
||||
template <typename T>
|
||||
uint32_t ComplexQr(CpuKernelContext &ctx);
|
||||
};
|
||||
} // namespace aicpu
|
||||
#endif
|
|
@ -0,0 +1,440 @@
|
|||
/**
|
||||
* Copyright (c) Huawei Technologies Co., Ltd. 2021-2021. All rights reserved.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
#include "nuclear_norm.h"
|
||||
#include <string.h>
|
||||
#include <Eigen/Dense>
|
||||
#include <cmath>
|
||||
#include <cstdlib>
|
||||
#include <iomanip>
|
||||
#include <iostream>
|
||||
#include <unsupported/Eigen/CXX11/Tensor>
|
||||
#include "kernel_util.h"
|
||||
#include "utils/kernel_util.h"
|
||||
#define NoneN 1000
|
||||
using namespace Eigen;
|
||||
using namespace std;
|
||||
|
||||
namespace {
|
||||
const char *kNuclearNorm = "NuclearNorm";
|
||||
const size_t kNuclearNormInputNum = 1;
|
||||
const size_t kNuclearNormOutputNum = 1;
|
||||
constexpr int64_t kParallelDataNums = 1 * 1024;
|
||||
const size_t DIM_SIZE1 = 1;
|
||||
const size_t DIM_SIZE2 = 2;
|
||||
const size_t DIM_SIZE3 = 3;
|
||||
const size_t DIM_SIZE4 = 4;
|
||||
const size_t DIM_SIZE5 = 5;
|
||||
const size_t DIM_SIZE6 = 6;
|
||||
const size_t DIM_SIZE7 = 7;
|
||||
const size_t DIM_SIZE8 = 8;
|
||||
const size_t DIM_INDEX0 = 0;
|
||||
const size_t DIM_INDEX1 = 1;
|
||||
const size_t DIM_INDEX2 = 2;
|
||||
} // namespace
|
||||
|
||||
namespace aicpu {
|
||||
uint32_t NuclearNormCpuKernel::Compute(CpuKernelContext &ctx) {
|
||||
KERNEL_HANDLE_ERROR(NormalCheck(ctx, kNuclearNormInputNum, kNuclearNormOutputNum),
|
||||
"NuclearNorm Check input and output number failed.");
|
||||
KERNEL_HANDLE_ERROR(NuclearNormParamCheck(ctx), "NuclearNorm Check params failed.");
|
||||
|
||||
auto data_type = ctx.Input(0)->GetDataType();
|
||||
uint32_t res = KERNEL_STATUS_OK;
|
||||
|
||||
switch (data_type) {
|
||||
case (DT_FLOAT): {
|
||||
res = NuclearNormCompute<float>(ctx);
|
||||
break;
|
||||
}
|
||||
case (DT_DOUBLE): {
|
||||
res = NuclearNormCompute<double>(ctx);
|
||||
break;
|
||||
}
|
||||
default:
|
||||
KERNEL_LOG_ERROR("NuclearNorm kernel data type [%s] not support.", DTypeStr(data_type).c_str());
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
if (res != KERNEL_STATUS_OK) {
|
||||
return KERNEL_STATUS_INNER_ERROR;
|
||||
}
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
|
||||
uint32_t NuclearNormCpuKernel::NuclearNormParamCheck(CpuKernelContext &ctx) {
|
||||
Tensor *input = ctx.Input(0);
|
||||
Tensor *output = ctx.Output(0);
|
||||
KERNEL_CHECK_FALSE((input->GetDataType() == output->GetDataType()), KERNEL_STATUS_PARAM_INVALID,
|
||||
"The data type of the input [%s] need be the same as the output [%s]",
|
||||
DTypeStr(input->GetDataType()).c_str(), DTypeStr(output->GetDataType()).c_str());
|
||||
const size_t input_dimnum = input->GetTensorShape()->GetDims();
|
||||
KERNEL_CHECK_FALSE((input_dimnum >= DIM_SIZE2 && input_dimnum <= DIM_SIZE8), KERNEL_STATUS_PARAM_INVALID,
|
||||
"The range of the dimension of the input tensor should be "
|
||||
"[%lld,%lld], but got input's dimension=%lld",
|
||||
DIM_SIZE2, DIM_SIZE8, input_dimnum);
|
||||
AttrValue *dim_ptr = ctx.GetAttr("dim");
|
||||
std::vector<int64_t> dim_temp = {0, 1};
|
||||
std::vector<int64_t> dim = (dim_ptr == nullptr) ? dim_temp : dim_ptr->GetListInt();
|
||||
if (dim_ptr == nullptr) {
|
||||
KERNEL_CHECK_FALSE((input_dimnum == DIM_SIZE2), KERNEL_STATUS_PARAM_INVALID,
|
||||
"When Attr dim is none, NuclearNorm expected a tensor with 2 "
|
||||
"dimensions, but got a tensor with [%lld] dimensions instead.",
|
||||
input_dimnum);
|
||||
}
|
||||
if (dim.size() == 1 && dim[0] == NoneN) {
|
||||
dim.clear();
|
||||
dim.push_back(0);
|
||||
dim.push_back(1);
|
||||
}
|
||||
KERNEL_CHECK_FALSE((dim.size() == DIM_SIZE2), KERNEL_STATUS_PARAM_INVALID,
|
||||
"Attr dim'size must equal to 2, but got dim's size : [%lld]", dim.size());
|
||||
int64_t lower_bound = -static_cast<int64_t>(input_dimnum);
|
||||
int64_t upper_bound = input_dimnum - 1;
|
||||
KERNEL_CHECK_FALSE((dim[0] >= lower_bound && dim[0] <= upper_bound), KERNEL_STATUS_PARAM_INVALID,
|
||||
"The range of dim[0] should be [%lld,%lld], but got input dim[0]=%lld", lower_bound, upper_bound,
|
||||
dim[0]);
|
||||
KERNEL_CHECK_FALSE((dim[1] >= lower_bound && dim[1] <= upper_bound), KERNEL_STATUS_PARAM_INVALID,
|
||||
"The range of dim[1] should be [%lld,%lld], but got input dim[1]=%lld", lower_bound, upper_bound,
|
||||
dim[1]);
|
||||
dim[0] = (dim[0] < 0) ? dim[0] + input_dimnum : dim[0];
|
||||
dim[1] = (dim[1] < 0) ? dim[1] + input_dimnum : dim[1];
|
||||
KERNEL_CHECK_FALSE((dim[0] != dim[1]), KERNEL_STATUS_PARAM_INVALID,
|
||||
"The values in attr dim point to the same dimension.");
|
||||
KERNEL_LOG_DEBUG("NuclearNormCpuKernel[%s], input: size[%llu], output: size[%llu].", ctx.GetOpType().c_str(),
|
||||
input->GetDataSize(), output->GetDataSize());
|
||||
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
uint32_t NuclearNormCpuKernel::NuclearNormCompute(CpuKernelContext &ctx) {
|
||||
Tensor *input_ptr = ctx.Input(0);
|
||||
auto input_shape = input_ptr->GetTensorShape();
|
||||
std::vector<int64_t> input_dims = input_shape->GetDimSizes();
|
||||
uint32_t res = KERNEL_STATUS_OK;
|
||||
switch (input_dims.size()) {
|
||||
case DIM_SIZE2:
|
||||
res = ComputeTensorNuclearNorm<T, DIM_SIZE2>(ctx);
|
||||
break;
|
||||
case DIM_SIZE3:
|
||||
res = ComputeTensorNuclearNorm<T, DIM_SIZE3>(ctx);
|
||||
break;
|
||||
case DIM_SIZE4:
|
||||
res = ComputeTensorNuclearNorm<T, DIM_SIZE4>(ctx);
|
||||
break;
|
||||
case DIM_SIZE5:
|
||||
res = ComputeTensorNuclearNorm<T, DIM_SIZE5>(ctx);
|
||||
break;
|
||||
case DIM_SIZE6:
|
||||
res = ComputeTensorNuclearNorm<T, DIM_SIZE6>(ctx);
|
||||
break;
|
||||
case DIM_SIZE7:
|
||||
res = ComputeTensorNuclearNorm<T, DIM_SIZE7>(ctx);
|
||||
break;
|
||||
case DIM_SIZE8:
|
||||
res = ComputeTensorNuclearNorm<T, DIM_SIZE8>(ctx);
|
||||
break;
|
||||
default:
|
||||
KERNEL_LOG_ERROR(
|
||||
"Only tensors with ranks between 2 and 8 are currently supported."
|
||||
"Tensor rank: [%d]",
|
||||
input_dims.size());
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
if (res != KERNEL_STATUS_OK) {
|
||||
return KERNEL_STATUS_INNER_ERROR;
|
||||
}
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
|
||||
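// Computes the nuclear norm over the two dimensions selected by attr "dim": the input is
// reshaped to its static RANK, the two reduction axes are shuffled to the end, the result is
// viewed as (batch, d0, d1), and each d0 x d1 matrix is reduced to the sum of its singular
// values. Illustrative example (hypothetical input): a 2 x 2 identity matrix has singular
// values {1, 1}, so its nuclear norm is 2.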
template <typename T, int32_t RANK>
|
||||
uint32_t NuclearNormCpuKernel::ComputeTensorNuclearNorm(const CpuKernelContext &ctx) {
|
||||
Tensor *input_ptr = ctx.Input(0);
|
||||
auto input_shape = input_ptr->GetTensorShape();
|
||||
void *data_ptr = input_ptr->GetData();
|
||||
int64_t value_num_ = input_ptr->NumElements();
|
||||
|
||||
T *input_data_ptr = reinterpret_cast<T *>(data_ptr);
|
||||
int64_t total_copy_size = value_num_ * static_cast<int64_t>(sizeof(T));
|
||||
Eigen::Tensor<T, 1, Eigen::RowMajor> eigen_tensor(value_num_);
|
||||
int memcpy_ret = memcpy_s(&eigen_tensor(0), total_copy_size, input_data_ptr, total_copy_size);
|
||||
|
||||
if (memcpy_ret != 0) {
KERNEL_LOG_ERROR("memcpy_s error!");
return KERNEL_STATUS_INNER_ERROR;
}
|
||||
std::vector<int64_t> input_dims = input_shape->GetDimSizes();
|
||||
std::array<Eigen::DenseIndex, RANK> dim_array;
|
||||
const int64_t input_dimnum = static_cast<int64_t>(input_shape->GetDims());
|
||||
for (int64_t i = 0; i < input_dimnum; i++) {
|
||||
dim_array.at(i) = input_dims[i];
|
||||
}
|
||||
Eigen::Tensor<T, RANK, Eigen::RowMajor> reshaped_tensor = eigen_tensor.reshape(dim_array);
|
||||
|
||||
AttrValue *dim_ptr = ctx.GetAttr("dim");
|
||||
std::vector<int64_t> dim_temp = {0, 1};
|
||||
std::vector<int64_t> dim = (dim_ptr == nullptr) ? dim_temp : dim_ptr->GetListInt();
|
||||
if (dim.size() == 1 && dim[0] == NoneN) {
|
||||
dim.clear();
|
||||
dim.push_back(0);
|
||||
dim.push_back(1);
|
||||
}
|
||||
dim[0] = (dim[0] < 0) ? dim[0] + input_dimnum : dim[0];
|
||||
dim[1] = (dim[1] < 0) ? dim[1] + input_dimnum : dim[1];
|
||||
|
||||
int64_t j = 0;
|
||||
for (int64_t i = 0; i < input_dimnum; i++) {
|
||||
if (i != dim[0] && i != dim[1]) {
|
||||
dim_array.at(j) = i;
|
||||
j++;
|
||||
}
|
||||
}
|
||||
dim_array.at(j) = dim[0];
|
||||
dim_array.at(j + 1) = dim[1];
|
||||
Eigen::Tensor<T, RANK, Eigen::RowMajor> shuffled_tensor = reshaped_tensor.shuffle(dim_array);
|
||||
|
||||
int64_t dimsize0 = input_shape->GetDimSize(dim[0]);
|
||||
int64_t dimsize1 = input_shape->GetDimSize(dim[1]);
|
||||
int64_t iter_number = value_num_ / (dimsize0 * dimsize1);
|
||||
|
||||
std::array<Eigen::DenseIndex, DIM_SIZE3> dim_array_last;
|
||||
dim_array_last.at(DIM_INDEX0) = iter_number;
|
||||
dim_array_last.at(DIM_INDEX1) = dimsize0;
|
||||
dim_array_last.at(DIM_INDEX2) = dimsize1;
|
||||
Eigen::Tensor<T, DIM_SIZE3, Eigen::RowMajor> permuted_tensor = shuffled_tensor.reshape(dim_array_last);
|
||||
|
||||
auto output_data_ptr = reinterpret_cast<T *>(ctx.Output(0)->GetData());
|
||||
int64_t copy_size = (dimsize0 * dimsize1) * static_cast<int64_t>(sizeof(T));
|
||||
if (iter_number <= kParallelDataNums) {
|
||||
for (int64_t i = 0; i < iter_number; i++) {
|
||||
T *mat = new T[dimsize0 * dimsize1];
memcpy(mat, &permuted_tensor(i, 0, 0), copy_size);
T nuclear_norm = matrix_nuclear_norm<T>(mat, dimsize0, dimsize1);
*(output_data_ptr + i) = nuclear_norm;
delete[] mat;  // release the per-matrix scratch buffer
}
|
||||
} else {
|
||||
uint32_t min_core_num = 1;
|
||||
uint64_t max_core_num = std::max(min_core_num, aicpu::CpuKernelUtils::GetCPUNum(ctx) - 2);
|
||||
if (max_core_num > static_cast<uint64_t>(iter_number)) {
|
||||
max_core_num = static_cast<uint64_t>(iter_number);
|
||||
}
|
||||
|
||||
auto shared_nuclear_norm = [&](size_t start, size_t end) {
|
||||
for (size_t i = start; i < end; i++) {
|
||||
T *mat = new T[dimsize0 * dimsize1];
memcpy(mat, &permuted_tensor(i, 0, 0), copy_size);
T nuclear_norm = matrix_nuclear_norm<T>(mat, dimsize0, dimsize1);
*(output_data_ptr + i) = nuclear_norm;
delete[] mat;  // release the per-matrix scratch buffer
}
|
||||
};
|
||||
if (max_core_num != 0) {
|
||||
KERNEL_HANDLE_ERROR(
|
||||
CpuKernelUtils::ParallelFor(ctx, static_cast<uint64_t>(iter_number),
|
||||
static_cast<uint64_t>(iter_number) / max_core_num, shared_nuclear_norm),
|
||||
"NuclearNorm Compute failed.");
|
||||
}
|
||||
}
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
std::vector<std::vector<T>> NuclearNormCpuKernel::matrix_multiply(std::vector<std::vector<T>> const arrL,
|
||||
std::vector<std::vector<T>> const arrR) {
|
||||
size_t rowL = arrL.size();
|
||||
size_t colL = arrL[0].size();
|
||||
size_t colR = arrR[0].size();
|
||||
|
||||
std::vector<std::vector<T>> res(rowL);
|
||||
for (size_t i = 0; i < res.size(); i++) {
|
||||
res[i].resize(colR);
|
||||
}
|
||||
|
||||
for (size_t i = 0; i < rowL; i++) {
|
||||
for (size_t j = 0; j < colR; j++) {
|
||||
for (size_t k = 0; k < colL; k++) {
|
||||
res[i][j] += arrL[i][k] * arrR[k][j];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return res;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
std::vector<std::vector<T>> NuclearNormCpuKernel::transpose(std::vector<std::vector<T>> const arr) {
|
||||
size_t row = arr.size();
|
||||
size_t col = arr[0].size();
|
||||
|
||||
std::vector<std::vector<T>> trans(col);
|
||||
for (size_t i = 0; i < col; i++) {
|
||||
trans[i].resize(row);
|
||||
}
|
||||
|
||||
for (size_t i = 0; i < col; i++) {
|
||||
for (size_t j = 0; j < row; j++) {
|
||||
trans[i][j] = arr[j][i];
|
||||
}
|
||||
}
|
||||
return trans;
|
||||
}
|
||||
|
||||
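// Returns the indices that sort `array` in descending order (largest value first).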
template <typename T>
|
||||
std::vector<size_t> NuclearNormCpuKernel::argsort(const std::vector<T> &array) {
|
||||
const size_t array_len(array.size());
|
||||
std::vector<size_t> array_index(array_len, 0);
|
||||
for (size_t i = 0; i < array_len; ++i) array_index[i] = i;
|
||||
|
||||
sort(array_index.begin(), array_index.end(),
|
||||
[&array](size_t pos1, size_t pos2) { return (array[pos1] > array[pos2]); });
|
||||
|
||||
return array_index;
|
||||
}
|
||||
|
||||
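// Finds the largest-magnitude off-diagonal entry of the symmetric matrix and reports its
// position; used to pick the pivot for each Jacobi rotation in svd().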
template <typename T>
|
||||
void NuclearNormCpuKernel::get_row_col(std::vector<std::vector<T>> arr, T *max, size_t *row, size_t *col) {
|
||||
size_t n = arr.size();
|
||||
for (size_t i = 0; i < n; i++) {
|
||||
for (size_t j = 0; j < n; j++) {
|
||||
if (i != j && fabs(arr[i][j]) > *max) {
|
||||
*max = fabs(arr[i][j]);
|
||||
*row = i;
|
||||
*col = j;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
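// Classical Jacobi eigenvalue iteration on a symmetric matrix (here A^T * A): the largest
// off-diagonal entry is rotated away until it falls below eps or the iteration limit is hit.
// On return, e holds the eigenvalues (squared singular values of A) and E the eigenvectors,
// both sorted in descending eigenvalue order.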
template <typename T>
|
||||
void NuclearNormCpuKernel::svd(std::vector<std::vector<T>> arr, std::vector<std::vector<T>> &E, std::vector<T> &e) {
|
||||
size_t n = arr.size();
|
||||
size_t row = 0;
|
||||
size_t col = 0;
|
||||
size_t iter_max_num = 10000;
|
||||
size_t iter_num = 0;
|
||||
T eps = 1e-40;
|
||||
T max = eps;
|
||||
T dot5 = 0.5;
|
||||
|
||||
E.resize(n);
|
||||
e.resize(n);
|
||||
for (size_t i = 0; i < n; i++) {
|
||||
E[i].resize(n, 0);
|
||||
E[i][i] = 1;
|
||||
}
|
||||
|
||||
while (iter_num < iter_max_num && max >= eps) {
|
||||
max = fabs(arr[0][1]);
|
||||
row = 0;
|
||||
col = 1;
|
||||
|
||||
get_row_col<T>(arr, &max, &row, &col);
|
||||
T theta = dot5 * atan2(-2 * arr[row][col], -(arr[row][row] - arr[col][col]));
|
||||
|
||||
T aii = arr[row][row];
|
||||
T ajj = arr[col][col];
|
||||
T aij = arr[row][col];
|
||||
T sin_theta = sin(theta);
|
||||
T cos_theta = cos(theta);
|
||||
T sin_2theta = sin(2 * theta);
|
||||
T cos_2theta = cos(2 * theta);
|
||||
arr[row][row] = aii * cos_theta * cos_theta + ajj * sin_theta * sin_theta + aij * sin_2theta;
|
||||
arr[col][col] = aii * sin_theta * sin_theta + ajj * cos_theta * cos_theta - aij * sin_2theta;
|
||||
arr[row][col] = dot5 * (ajj - aii) * sin_2theta + aij * cos_2theta;
|
||||
arr[col][row] = arr[row][col];
|
||||
for (size_t k = 0; k < n; k++) {
|
||||
if (k != row && k != col) {
|
||||
T arowk = arr[row][k];
|
||||
T acolk = arr[col][k];
|
||||
arr[row][k] = arowk * cos_theta + acolk * sin_theta;
|
||||
arr[k][row] = arr[row][k];
|
||||
arr[col][k] = acolk * cos_theta - arowk * sin_theta;
|
||||
arr[k][col] = arr[col][k];
|
||||
}
|
||||
}
|
||||
|
||||
T Eki;
|
||||
T Ekj;
|
||||
for (size_t k = 0; k < n; k++) {
|
||||
Eki = E[k][row];
|
||||
Ekj = E[k][col];
|
||||
E[k][row] = Eki * cos_theta + Ekj * sin_theta;
|
||||
E[k][col] = Ekj * cos_theta - Eki * sin_theta;
|
||||
}
|
||||
iter_num++;
|
||||
}
|
||||
|
||||
for (size_t i = 0; i < n; i++) {
|
||||
e[i] = arr[i][i];
|
||||
}
|
||||
|
||||
std::vector<size_t> sort_index;
|
||||
sort_index = argsort<T>(e);
|
||||
|
||||
std::vector<std::vector<T>> E_sorted(n);
|
||||
for (size_t i = 0; i < n; i++) {
|
||||
E_sorted[i].resize(n);
|
||||
}
|
||||
std::vector<T> e_sorted(n);
|
||||
for (size_t i = 0; i < n; i++) {
|
||||
e_sorted[i] = e[sort_index[i]];
|
||||
for (size_t j = 0; j < n; j++) {
|
||||
E_sorted[i][j] = E[i][sort_index[j]];
|
||||
}
|
||||
}
|
||||
E = E_sorted;
|
||||
e = e_sorted;
|
||||
}
|
||||
|
||||
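// Nuclear norm of a single dim0 x dim1 matrix. A single column degenerates to its L2 norm;
// otherwise the singular values are obtained as the square roots of the positive eigenvalues
// of mat^T * mat (via the Jacobi svd() above) and summed.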
template <typename T>
|
||||
T NuclearNormCpuKernel::matrix_nuclear_norm(T *mat, size_t dim0, size_t dim1) {
|
||||
if (dim1 == DIM_SIZE1) {
|
||||
T nuclear_norm = 0.0;
|
||||
T temp = 0.0;
|
||||
for (size_t j = 0; j < dim0; j++) {
|
||||
temp = mat[j];
|
||||
temp = temp * temp;
|
||||
nuclear_norm += temp;
|
||||
}
|
||||
nuclear_norm = sqrt(nuclear_norm);
|
||||
return nuclear_norm;
|
||||
}
|
||||
std::vector<std::vector<double>> arr(dim0);
|
||||
size_t S_dim_size = dim0 < dim1 ? dim0 : dim1;
|
||||
for (size_t i = 0; i < arr.size(); i++) {
|
||||
arr[i].resize(dim1);
|
||||
}
|
||||
for (size_t i = 0; i < dim0; i++) {
|
||||
for (size_t j = 0; j < dim1; j++) {
|
||||
arr[i][j] = mat[i * dim1 + j];
|
||||
}
|
||||
}
|
||||
|
||||
std::vector<std::vector<double>> ATA;
|
||||
std::vector<std::vector<double>> E;
|
||||
std::vector<double> e;
|
||||
|
||||
ATA = matrix_multiply<double>(transpose(arr), arr);
|
||||
svd<double>(ATA, E, e);
|
||||
|
||||
double nuclear_norm = 0.0;
|
||||
for (size_t i = DIM_INDEX0; i < S_dim_size; i++) {
|
||||
if (e[i] > 0) {
|
||||
nuclear_norm += sqrt(e[i]);
|
||||
}
|
||||
}
|
||||
|
||||
return nuclear_norm;
|
||||
}
|
||||
REGISTER_CPU_KERNEL(kNuclearNorm, NuclearNormCpuKernel);
|
||||
} // namespace aicpu
|
|
@ -0,0 +1,66 @@
|
|||
/**
|
||||
* Copyright 2021 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
#ifndef AICPU_KERNELS_NORMALIZED_NUCLEARNORM_H_
|
||||
#define AICPU_KERNELS_NORMALIZED_NUCLEARNORM_H_
|
||||
#include <memory>
|
||||
#include <vector>
|
||||
|
||||
#include "cpu_ops_kernel.h"
|
||||
#include "cpu_kernel_utils.h"
|
||||
#include "kernel_log.h"
|
||||
#include "securec.h"
|
||||
#include "status.h"
|
||||
#include "utils/bcast.h"
|
||||
|
||||
namespace aicpu {
|
||||
class NuclearNormCpuKernel : public CpuKernel {
|
||||
public:
|
||||
NuclearNormCpuKernel() = default;
|
||||
~NuclearNormCpuKernel() override = default;
|
||||
|
||||
protected:
|
||||
uint32_t Compute(CpuKernelContext &ctx) override;
|
||||
|
||||
private:
|
||||
uint32_t NuclearNormParamCheck(CpuKernelContext &ctx);
|
||||
|
||||
template <typename T>
|
||||
uint32_t NuclearNormCompute(CpuKernelContext &ctx);
|
||||
|
||||
template <typename T, int32_t RANK>
|
||||
uint32_t ComputeTensorNuclearNorm(const CpuKernelContext &ctx);
|
||||
|
||||
template <typename T>
|
||||
std::vector<std::vector<T>> matrix_multiply(std::vector<std::vector<T>> const arrL,
|
||||
std::vector<std::vector<T>> const arrR);
|
||||
|
||||
template <typename T>
|
||||
std::vector<std::vector<T>> transpose(std::vector<std::vector<T>> const arr);
|
||||
|
||||
template <typename T>
|
||||
std::vector<size_t> argsort(const std::vector<T> &array);
|
||||
|
||||
template <typename T>
|
||||
void get_row_col(std::vector<std::vector<T>> arr, T *max, size_t *row, size_t *col);
|
||||
|
||||
template <typename T>
|
||||
void svd(std::vector<std::vector<T>> arr, std::vector<std::vector<T>> &E, std::vector<T> &e);
|
||||
|
||||
template <typename T>
|
||||
T matrix_nuclear_norm(T *mat, size_t dim0, size_t dim1);
|
||||
};
|
||||
} // namespace aicpu
|
||||
#endif
|
|
@ -0,0 +1,410 @@
|
|||
/**
|
||||
* Copyright (c) Huawei Technologies Co., Ltd. 2022. All rights reserved.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
#include "quantile.h"
|
||||
|
||||
#include <algorithm>
|
||||
#include <cmath>
|
||||
#include <iostream>
|
||||
#include <vector>
|
||||
|
||||
#include "cpu_kernel_utils.h"
|
||||
#include "utils/eigen_tensor.h"
|
||||
#include "utils/kernel_util.h"
|
||||
|
||||
namespace {
|
||||
constexpr uint32_t kQuantileInputNum = 2;
|
||||
constexpr uint32_t kQuantileOutputNum = 1;
|
||||
const int64_t parallel_data_size = 64 * 1024;
|
||||
const int64_t kQuantileAttrDefaultDim = 10000;
|
||||
const char *kQuantile = "Quantile";
|
||||
} // namespace
|
||||
|
||||
namespace aicpu {
|
||||
template <typename T>
|
||||
uint32_t QuantileCpuKernel::GetInputAndCheck(CpuKernelContext &ctx) {
|
||||
input_ = ctx.Input(0);
|
||||
DataType input_type = input_->GetDataType();
|
||||
int64_t input_dim = input_->GetTensorShape()->GetDims();
|
||||
int64_t input_size = input_->GetTensorShape()->NumElements();
|
||||
q_ = ctx.Input(1);
|
||||
int64_t q_size = q_->GetTensorShape()->NumElements();
|
||||
T *q_addrs = reinterpret_cast<T *>(q_->GetData());
|
||||
DataType q_type = q_->GetDataType();
|
||||
int64_t q_dim = q_->GetTensorShape()->GetDims();
|
||||
int64_t min = -input_dim;
|
||||
int64_t max = input_dim - 1;
|
||||
auto dim_attr = ctx.GetAttr("dim");
|
||||
dim_ = (dim_attr == nullptr) ? kQuantileAttrDefaultDim : dim_attr->GetInt();
|
||||
auto keep_dims_attr = ctx.GetAttr("keep_dims");
|
||||
keep_dims_ = (keep_dims_attr == nullptr) ? false : keep_dims_attr->GetBool();
|
||||
auto ignore_attr = ctx.GetAttr("ignore_nan");
|
||||
ignore_nan_ = (ignore_attr == nullptr) ? false : ignore_attr->GetBool();
|
||||
|
||||
KERNEL_CHECK_FALSE(input_size > 0, KERNEL_STATUS_PARAM_INVALID, "quantile() input tensor must be non-empty");
|
||||
KERNEL_CHECK_FALSE(q_dim <= 1, KERNEL_STATUS_PARAM_INVALID,
|
||||
"quantile() q must be a scalar or 1D tensor,but got dimension = [%d].", q_dim);
|
||||
KERNEL_CHECK_FALSE(input_type == q_type, KERNEL_STATUS_PARAM_INVALID,
|
||||
"quantile() q tensor must be same dtype as the input tensor");
|
||||
|
||||
for (int64_t j = 0; j < q_size; ++j) {
|
||||
KERNEL_CHECK_FALSE(q_addrs[j] <= 1 && q_addrs[j] >= 0, KERNEL_STATUS_PARAM_INVALID,
|
||||
"quantile() q values must be in the range [0, 1]");
|
||||
}
|
||||
DataType out_type = ctx.Output(0)->GetDataType();
|
||||
output_ = ctx.Output(0);
|
||||
KERNEL_CHECK_FALSE(out_type == input_type, KERNEL_STATUS_PARAM_INVALID,
|
||||
"quantile() out tensor must be same dtype as the input tensor");
|
||||
if (dim_ != kQuantileAttrDefaultDim) {
|
||||
KERNEL_CHECK_FALSE(dim_ >= min && dim_ <= max, KERNEL_STATUS_PARAM_INVALID,
|
||||
"Dimension out of range (expected to be in range of [%d] and [%d]).", min, max);
|
||||
}
|
||||
dim_ = MaybeWrapDim(dim_, input_dim);
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
|
||||
uint32_t QuantileCpuKernel::MaybeWrapDim(int64_t dim, int64_t dim_post_expr) {
|
||||
if (dim == kQuantileAttrDefaultDim) {
|
||||
return dim;
|
||||
}
|
||||
if (dim_post_expr <= 0) {
|
||||
dim_post_expr = 1;
|
||||
}
|
||||
int64_t min = -dim_post_expr;
|
||||
int64_t max = dim_post_expr - 1;
|
||||
KERNEL_CHECK_FALSE(dim >= min && dim <= max, KERNEL_STATUS_PARAM_INVALID,
|
||||
"Dimension out of range (expected to be in range of [%d] and [%d]).", min, max)
|
||||
if (dim < 0) {
|
||||
dim += dim_post_expr;
|
||||
}
|
||||
return dim;
|
||||
}
|
||||
|
||||
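// Moves dimension `index` of a flattened tensor to the last axis: the target axis is swapped
// with the last one and every element is copied to its offset under the permuted shape.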
template <typename T>
|
||||
std::vector<T> transpose(std::vector<T> &f, std::vector<int64_t> &shape, int index) {
|
||||
int element_count = f.size();
|
||||
int m = shape.size();
|
||||
int i;
|
||||
int *indexA = (int *)malloc(sizeof(int) * m);
|
||||
if (indexA == nullptr) {
|
||||
return {};
|
||||
}
|
||||
|
||||
std::vector<int> pos(m);
|
||||
for (int i = 0; i < m; i++) pos[i] = i;
|
||||
if (m != 0) {
|
||||
std::swap(pos[m - 1], pos[((index + m) % m)]);
|
||||
}
|
||||
|
||||
int *indexB = (int *)malloc(sizeof(int) * m);
|
||||
if (indexB == nullptr) {
|
||||
free(indexA);
|
||||
return {};
|
||||
}
|
||||
|
||||
std::vector<T> b(element_count);
|
||||
std::vector<int64_t> shapeb(shape);
|
||||
for (int i = 0; i < m; i++) {
|
||||
shapeb[i] = shape[pos[i]];
|
||||
}
|
||||
|
||||
for (int src = 0; src < element_count; src++) {
|
||||
int temp = src;
|
||||
for (i = m - 1; i >= 0; i--) {
|
||||
indexA[i] = temp % shape[i];
|
||||
temp = temp / shape[i];
|
||||
}
|
||||
|
||||
for (i = 0; i < m; i++) {
|
||||
indexB[i] = indexA[pos[i]];
|
||||
}
|
||||
|
||||
int dst = 0;
|
||||
temp = 1;
|
||||
for (i = m - 1; i >= 0; i--) {
|
||||
dst = dst + indexB[i] * temp;
|
||||
temp = temp * shapeb[i];
|
||||
}
|
||||
b[dst] = f[src];
|
||||
}
|
||||
free(indexA);
|
||||
free(indexB);
|
||||
|
||||
return b;
|
||||
}
|
||||
|
||||
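// Parallel shard body: rows [start, end) of the (batch, last_shape_size) view are processed
// independently; NaN handling follows ignore_nan_, and each requested quantile is obtained by
// linear interpolation between the two neighbouring order statistics.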
template <typename T>
|
||||
void QuantileCpuKernel::QuantileComputeParallelFunc(size_t start, size_t end, int64_t last_shape_size,
|
||||
std::vector<T> &sorted) {
|
||||
uint64_t q_size = q_->GetTensorShape()->NumElements();
|
||||
T *output_addr = reinterpret_cast<T *>(output_->GetData());
|
||||
T *q_addrs = reinterpret_cast<T *>(q_->GetData());
|
||||
for (uint64_t i = start; i < end; i++) {
|
||||
std::vector<T> tmp;
|
||||
std::sort(sorted.begin() + i * last_shape_size, sorted.begin() + (i + 1) * last_shape_size);
|
||||
bool has_nan = false;
|
||||
bool all_nan = true;
|
||||
|
||||
for (uint64_t j = i * last_shape_size; j < (i + 1) * last_shape_size; j++) {
|
||||
if (std::isnan(sorted[j])) {
|
||||
has_nan = true;
|
||||
} else {
|
||||
all_nan = false;
|
||||
}
|
||||
}
|
||||
|
||||
if ((has_nan && !ignore_nan_) || all_nan) {
|
||||
for (uint64_t j = 0; j < q_size; ++j) {
|
||||
output_addr[i * q_size + j] = NAN;
|
||||
}
|
||||
continue;
|
||||
}
|
||||
for (auto k = i * last_shape_size; k < (i + 1) * last_shape_size; k++) {
|
||||
auto x = sorted[k];
|
||||
if (!isnan(x)) {
|
||||
tmp.push_back(x);
|
||||
}
|
||||
}
|
||||
std::sort(tmp.begin(), tmp.end());
|
||||
for (uint64_t j = 0; j < q_size; ++j) {
|
||||
T index = (tmp.size() - 1) * q_addrs[j];
|
||||
int32_t idx = index;
|
||||
if (idx == (int32_t)tmp.size() - 1) {
|
||||
output_addr[i * q_size + j] = tmp[idx];
|
||||
continue;
|
||||
}
|
||||
output_addr[i * q_size + j] = tmp[idx] + (tmp[idx + 1] - tmp[idx]) * (index - idx);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
void QuantileCpuKernel::QuantileComputeSerialFunc(int64_t last_shape_size, std::vector<T> &sorted) {
|
||||
uint64_t n = input_->GetTensorShape()->NumElements();
|
||||
uint64_t q_size = q_->GetTensorShape()->NumElements();
|
||||
T *output_addr = reinterpret_cast<T *>(output_->GetData());
|
||||
T *q_addrs = reinterpret_cast<T *>(q_->GetData());
|
||||
for (uint64_t i = 0; i < n; i += last_shape_size) {
|
||||
std::vector<T> tmp;
|
||||
sort(sorted.begin() + i, sorted.begin() + i + last_shape_size);
|
||||
bool has_nan = false;
|
||||
bool all_nan = true;
|
||||
for (auto j = i; j < i + last_shape_size; j++) {
|
||||
if (!isnan(sorted[j])) {
|
||||
tmp.push_back(sorted[j]);
|
||||
all_nan = false;
|
||||
} else {
|
||||
has_nan = true;
|
||||
}
|
||||
}
|
||||
sort(tmp.begin(), tmp.end());
|
||||
for (uint64_t j = 0; j < q_size; ++j) {
|
||||
if ((has_nan && !ignore_nan_) || all_nan) {
|
||||
output_addr[i * q_size / last_shape_size + j] = NAN;
|
||||
continue;
|
||||
}
|
||||
|
||||
T index = (tmp.size() - 1) * q_addrs[j];
|
||||
int32_t idx = index;
|
||||
if (idx == (int32_t)tmp.size() - 1) {
|
||||
output_addr[i * q_size / last_shape_size + j] = tmp[idx];
|
||||
continue;
|
||||
}
|
||||
output_addr[i * q_size / last_shape_size + j] = tmp[idx] + (tmp[idx + 1] - tmp[idx]) * (index - idx);
|
||||
}
|
||||
}
|
||||
}
|
||||
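// Reduction over the flattened tensor (attr "dim" not set): NaNs are dropped when ignore_nan_
// is true, otherwise any NaN makes every output NaN. Illustrative example (hypothetical input):
// input = {1, 2, 3, 4}, q = {0.5} -> output = {2.5}.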
template <typename T>
|
||||
void QuantileCpuKernel::QuantileComputeDefaultFunc(std::vector<T> &sorted) {
|
||||
uint64_t q_size = q_->GetTensorShape()->NumElements();
|
||||
T *output_addr = reinterpret_cast<T *>(output_->GetData());
|
||||
T *q_addrs = reinterpret_cast<T *>(q_->GetData());
|
||||
std::sort(sorted.begin(), sorted.end());
|
||||
bool all_nan = true;
|
||||
std::vector<T> tmp;
|
||||
for (auto &x : sorted) {
|
||||
if (!isnan(x)) {
|
||||
tmp.push_back(x);
|
||||
all_nan = false;
|
||||
}
|
||||
}
|
||||
std::sort(tmp.begin(), tmp.end());
|
||||
for (uint64_t i = 0; i < q_size; ++i) {
|
||||
if ((has_nan_ && !ignore_nan_) || all_nan) {
|
||||
output_addr[i] = NAN;
|
||||
continue;
|
||||
}
|
||||
T index = (tmp.size() - 1) * q_addrs[i];
|
||||
int32_t idx = index;
|
||||
if (idx == (int32_t)tmp.size() - 1) {
|
||||
output_addr[i] = tmp[idx];
|
||||
continue;
|
||||
}
|
||||
output_addr[i] = tmp[idx] + (tmp[idx + 1] - tmp[idx]) * (index - idx);
|
||||
}
|
||||
}
|
||||
|
||||
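// Builds the output shape: reduced dimensions are dropped (or kept as size 1 when keep_dims_
// is set), and when q is a 1-D tensor its length is prepended as the leading output axis.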
std::vector<int64_t> QuantileCpuKernel::SetQuantileOutputShape() {
|
||||
std::vector<int64_t> out_shape;
|
||||
int64_t q_dim = q_->GetTensorShape()->NumElements();
|
||||
int64_t input_dim = input_->GetTensorShape()->GetDims();
|
||||
uint64_t q_size = q_->GetTensorShape()->NumElements();
|
||||
std::vector<int64_t> input_shapesize = input_->GetTensorShape()->GetDimSizes();
|
||||
if (dim_ != kQuantileAttrDefaultDim && input_dim > 0) {
|
||||
out_shape = input_shapesize;
|
||||
if (keep_dims_) {
|
||||
out_shape[dim_] = 1;
|
||||
} else {
|
||||
out_shape.erase(out_shape.begin() + dim_);
|
||||
}
|
||||
} else if (keep_dims_) {
|
||||
out_shape = std::vector<int64_t>(input_dim, 1);
|
||||
}
|
||||
if (q_dim > 0) {
|
||||
out_shape.insert(out_shape.begin(), q_size);
|
||||
}
|
||||
return out_shape;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
uint32_t QuantileCpuKernel::QuantileCompute(CpuKernelContext &ctx) {
|
||||
T *input_addrs = reinterpret_cast<T *>(ctx.Input(0)->GetData());
|
||||
size_t data_size = input_->GetTensorShape()->NumElements() * sizeof(T);
|
||||
|
||||
std::vector<int64_t> out_shape = SetQuantileOutputShape();
|
||||
std::vector<int64_t> input_dims = input_->GetTensorShape()->GetDimSizes();
|
||||
int64_t input_shape_size = input_->GetTensorShape()->GetDims();
|
||||
std::vector<T> sorted;
|
||||
int64_t n = input_->GetTensorShape()->NumElements();
|
||||
for (int64_t i = 0; i < n; i++) {
|
||||
sorted.push_back(input_addrs[i]);
|
||||
if (isnan(input_addrs[i])) {
|
||||
has_nan_ = true;
|
||||
}
|
||||
}
|
||||
|
||||
if (data_size <= parallel_data_size) {
|
||||
if (dim_ == kQuantileAttrDefaultDim) {
|
||||
QuantileComputeDefaultFunc<T>(sorted);
|
||||
} else if (dim_ == input_shape_size - 1) {
|
||||
QuantileComputeSerialFunc<T>(input_dims[input_dims.size() - 1], sorted);
|
||||
} else {
|
||||
input_dims.push_back(1);
|
||||
sorted = transpose<T>(sorted, input_dims, dim_);
|
||||
int32_t m = input_dims.size();
|
||||
if (m != 0) {
|
||||
std::swap(input_dims[m - 1], input_dims[((dim_ + m) % m)]);
|
||||
}
|
||||
QuantileComputeSerialFunc<T>(input_dims[input_dims.size() - 1], sorted);
|
||||
}
|
||||
} else {
|
||||
DoParallelQuantile(ctx, sorted, input_dims);
|
||||
}
|
||||
SetOutput<T>(out_shape);
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
template <typename T>
|
||||
uint32_t QuantileCpuKernel::DoParallelQuantile(CpuKernelContext &ctx, std::vector<T> sorted,
|
||||
std::vector<int64_t> input_dims) {
|
||||
int64_t input_shape_size = input_->GetTensorShape()->GetDims();
|
||||
std::vector<int64_t> input_shape_dims = input_->GetTensorShape()->GetDimSizes();
|
||||
int64_t n = input_->GetTensorShape()->NumElements();
|
||||
if (dim_ == kQuantileAttrDefaultDim) {
|
||||
QuantileComputeDefaultFunc<T>(sorted);
|
||||
} else if (dim_ == input_shape_size - 1) {
|
||||
int64_t last_shape_size = input_dims[input_dims.size() - 1];
|
||||
auto shard_quantile = [&](size_t start, size_t end) {
|
||||
QuantileComputeParallelFunc<T>(start, end, last_shape_size, sorted);
|
||||
};
|
||||
KERNEL_HANDLE_ERROR(CpuKernelUtils::ParallelFor(ctx, n / last_shape_size, last_shape_size, shard_quantile),
|
||||
"Quantile Compute failed.");
|
||||
} else {
|
||||
input_shape_dims.push_back(1);
|
||||
sorted = transpose<T>(sorted, input_shape_dims, dim_);
|
||||
int32_t m = input_shape_dims.size();
|
||||
if (m != 0) {
|
||||
std::swap(input_shape_dims[m - 1], input_shape_dims[((dim_ + m) % m)]);
|
||||
}
|
||||
int64_t last_shape_size = input_shape_dims[input_shape_dims.size() - 1];
|
||||
auto shard_quantile = [&](size_t start, size_t end) {
|
||||
QuantileComputeParallelFunc<T>(start, end, last_shape_size, sorted);
|
||||
};
|
||||
KERNEL_HANDLE_ERROR(CpuKernelUtils::ParallelFor(ctx, n / last_shape_size, last_shape_size, shard_quantile),
|
||||
"Quantile Compute failed.");
|
||||
}
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
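// The quantile values are computed with q as the innermost (fastest-varying) axis; when q is a
// 1-D tensor this transposes the buffer so the q axis becomes the leading output axis, then
// writes the final output shape.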
template <typename T>
|
||||
void QuantileCpuKernel::SetOutput(std::vector<int64_t> &out_shape) {
|
||||
auto output_addr = reinterpret_cast<T *>(output_->GetData());
|
||||
|
||||
int64_t l = output_->GetTensorShape()->NumElements();
|
||||
std::vector<T> out;
|
||||
int64_t q_dim = q_->GetTensorShape()->GetDims();
|
||||
std::vector<int64_t> tmp(out_shape);
|
||||
if (q_dim > 0) {
|
||||
for (int i = 0; i < l; i++) {
|
||||
out.push_back(*(output_addr + i));
|
||||
}
|
||||
|
||||
int64_t out_end_shape = out_shape[out_shape.size() - 1];
|
||||
out_shape.push_back(out_end_shape);
|
||||
std::swap(out_shape[0], out_shape[out_shape.size() - 1]);
|
||||
out_shape.erase(out_shape.begin());
|
||||
out_shape.insert(out_shape.begin(), 1);
|
||||
out = transpose<T>(out, out_shape, 0);
|
||||
for (int i = 0; i < l; i++) {
|
||||
output_addr[i] = out[i];
|
||||
}
|
||||
}
|
||||
output_->GetTensorShape()->SetDimSizes(tmp);
|
||||
}
|
||||
|
||||
uint32_t QuantileCpuKernel::Compute(CpuKernelContext &ctx) {
|
||||
KERNEL_HANDLE_ERROR(NormalCheck(ctx, kQuantileInputNum, kQuantileOutputNum), "[%s] check params failed.", kQuantile);
|
||||
uint32_t res = KERNEL_STATUS_OK;
|
||||
|
||||
auto data_type = ctx.Input(0)->GetDataType();
|
||||
switch (data_type) {
|
||||
case DT_FLOAT:
|
||||
res = GetInputAndCheck<float>(ctx);
|
||||
break;
|
||||
case DT_DOUBLE:
|
||||
res = GetInputAndCheck<double>(ctx);
|
||||
break;
|
||||
default:
|
||||
KERNEL_LOG_ERROR("Quantile invalid input type [%s]", DTypeStr(data_type).c_str());
|
||||
break;
|
||||
}
|
||||
KERNEL_CHECK_FALSE((res == KERNEL_STATUS_OK), res, "GetInputAndCheck failed.");
|
||||
switch (data_type) {
|
||||
case DT_FLOAT:
|
||||
res = QuantileCompute<float>(ctx);
|
||||
break;
|
||||
case DT_DOUBLE:
|
||||
res = QuantileCompute<double>(ctx);
|
||||
break;
|
||||
default:
|
||||
KERNEL_LOG_ERROR("Quantile invalid input type [%s]", DTypeStr(data_type).c_str());
|
||||
break;
|
||||
}
|
||||
if (res != KERNEL_STATUS_OK) {
|
||||
return KERNEL_STATUS_INNER_ERROR;
|
||||
}
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
|
||||
REGISTER_CPU_KERNEL(kQuantile, QuantileCpuKernel);
|
||||
} // namespace aicpu
|
|
@ -0,0 +1,61 @@
|
|||
/**
|
||||
* Copyright (c) Huawei Technologies Co., Ltd. 2022. All rights reserved.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
#ifndef AICPU_KERNELS_NORMALIZED_QUANTILE_H_
|
||||
#define AICPU_KERNELS_NORMALIZED_QUANTILE_H_
|
||||
|
||||
#include <vector>
|
||||
|
||||
#include "cpu_ops_kernel.h"
|
||||
namespace aicpu {
|
||||
class QuantileCpuKernel : public CpuKernel {
|
||||
public:
|
||||
QuantileCpuKernel() = default;
|
||||
|
||||
~QuantileCpuKernel() override = default;
|
||||
|
||||
protected:
|
||||
uint32_t Compute(CpuKernelContext &ctx) override;
|
||||
|
||||
private:
|
||||
template <typename T>
|
||||
uint32_t GetInputAndCheck(CpuKernelContext &ctx);
|
||||
template <typename T>
|
||||
uint32_t QuantileCompute(CpuKernelContext &ctx);
|
||||
uint32_t MaybeWrapDim(int64_t dim, int64_t dim_post_expr);
|
||||
template <typename T>
|
||||
void QuantileComputeSerialFunc(int64_t last_shape_size, std::vector<T> &sorted);
|
||||
template <typename T>
|
||||
void QuantileComputeParallelFunc(size_t start, size_t end, int64_t last_shape_size, std::vector<T> &sorted);
|
||||
|
||||
template <typename T>
|
||||
void QuantileComputeDefaultFunc(std::vector<T> &sorted);
|
||||
std::vector<int64_t> SetQuantileOutputShape();
|
||||
template <typename T>
|
||||
void SetOutput(std::vector<int64_t> &out_shape);
|
||||
template <typename T>
|
||||
uint32_t DoParallelQuantile(CpuKernelContext &ctx, std::vector<T> sorted, std::vector<int64_t> input_dims);
|
||||
int64_t last_shape_size_ = 0;
|
||||
bool ignore_nan_ = false;
|
||||
bool keep_dims_ = false;
|
||||
int dim_ = 0;
|
||||
int64_t input_dim_ = 0;
|
||||
Tensor *input_ = nullptr;
|
||||
Tensor *output_ = nullptr;
|
||||
Tensor *q_ = nullptr;
|
||||
bool has_nan_ = false;
|
||||
};
|
||||
} // namespace aicpu
|
||||
#endif
|
|
@ -0,0 +1,154 @@
|
|||
/**
|
||||
* Copyright (c) Huawei Technologies Co., Ltd. 2022-2022. All rights reserved.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
#include "sparse_segment_sqrt_n.h"
|
||||
|
||||
#include <math.h>
|
||||
|
||||
#include "Eigen/Core"
|
||||
#include "utils/kernel_util.h"
|
||||
|
||||
namespace aicpu {
|
||||
const uint32_t kInputNum = 3;
|
||||
const uint32_t kOutputNum = 1;
|
||||
const char *SparseSegmentSqrtN = "SparseSegmentSqrtN";
|
||||
|
||||
#define COMPUTE_CASE(DTYPE, TYPE, DTYPE_1, DTYPE_2, CTX) \
|
||||
case (DTYPE): \
|
||||
if ((DTYPE_1) == DT_INT32) { \
|
||||
if ((DTYPE_2) == DT_INT32) { \
|
||||
return ComputeKernel<TYPE, int32_t, int32_t>(CTX);               \
|
||||
} else { \
|
||||
return ComputeKernel<TYPE, int32_t, int64_t>(CTX);               \
|
||||
} \
|
||||
} else { \
|
||||
if ((DTYPE_2) == DT_INT32) { \
|
||||
return ComputeKernel<TYPE, int64_t, int32_t>(CTX);               \
|
||||
} else { \
|
||||
return ComputeKernel<TYPE, int64_t, int64_t>(CTX);               \
|
||||
} \
|
||||
} \
|
||||
break;
|
||||
} // namespace aicpu
|
||||
|
||||
namespace aicpu {
|
||||
uint32_t SparseSegmentSqrtNCpuKernel::Compute(CpuKernelContext &ctx) {
|
||||
KERNEL_HANDLE_ERROR(NormalCheck(ctx, kInputNum, kOutputNum), "SparseSegmentSqrtN normalcheck failed.");
|
||||
Tensor *x = ctx.Input(0);
|
||||
Tensor *indices = ctx.Input(1);
|
||||
Tensor *segment_ids = ctx.Input(2);
|
||||
|
||||
auto x_shape = x->GetTensorShape();
|
||||
auto indices_shape = indices->GetTensorShape();
|
||||
auto segment_ids_shape = segment_ids->GetTensorShape();
|
||||
|
||||
if (x_shape->GetDims() < 1) {
|
||||
KERNEL_LOG_ERROR("[%s] Tensor input0's rank less than 1.", ctx.GetOpType().c_str());
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
|
||||
if (indices_shape->NumElements() != segment_ids_shape->NumElements()) {
|
||||
KERNEL_LOG_ERROR("[%s] Tensor input1&input2's ranks mismatch.", ctx.GetOpType().c_str());
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
|
||||
auto x_data_type = x->GetDataType();
|
||||
auto indices_data_type = indices->GetDataType();
|
||||
auto segment_ids_data_type = segment_ids->GetDataType();
|
||||
|
||||
if (x_data_type != DT_FLOAT && x_data_type != DT_DOUBLE && x_data_type != DT_FLOAT16) {
|
||||
KERNEL_LOG_ERROR("SparseSegmentSqrtN kernel data type [%s] not support.", DTypeStr(x_data_type).c_str());
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
if ((indices_data_type != DT_INT32 && indices_data_type != DT_INT64) ||
|
||||
(segment_ids_data_type != DT_INT32 && segment_ids_data_type != DT_INT64)) {
|
||||
KERNEL_LOG_ERROR("SparseSegmentSqrtN kernel data type [%s] not support.", DTypeStr(indices_data_type).c_str());
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
|
||||
switch (x_data_type) {
|
||||
COMPUTE_CASE(DT_FLOAT16, Eigen::half, indices_data_type, segment_ids_data_type, ctx)
|
||||
COMPUTE_CASE(DT_FLOAT, float, indices_data_type, segment_ids_data_type, ctx)
|
||||
COMPUTE_CASE(DT_DOUBLE, double, indices_data_type, segment_ids_data_type, ctx)
|
||||
default:
|
||||
KERNEL_LOG_ERROR("SparseSegmentSqrtN kernel data type [%s] not support.", DTypeStr(x_data_type).c_str());
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
|
||||
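// Gathers rows of x selected by indices, accumulates them into their segment rows, and finally
// divides each segment sum by sqrt(number of rows in that segment). Illustrative example
// (hypothetical input): x = {{1, 2}, {3, 4}}, indices = {0, 1}, segment_ids = {0, 0}
// -> y = {{(1 + 3) / sqrt(2), (2 + 4) / sqrt(2)}}.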
template <typename T1, typename T2, typename T3>
|
||||
uint32_t SparseSegmentSqrtNCpuKernel::ComputeKernel(CpuKernelContext &ctx) {
|
||||
size_t n = ctx.Input(0)->GetTensorShape()->NumElements() / ctx.Input(0)->GetTensorShape()->GetDimSize(0);
|
||||
size_t m = ctx.Input(2)->GetTensorShape()->NumElements();
|
||||
size_t k = ctx.Output(0)->GetTensorShape()->NumElements();
|
||||
auto x_addr = reinterpret_cast<T1 *>(ctx.Input(0)->GetData());
|
||||
auto indices_addr = reinterpret_cast<T2 *>(ctx.Input(1)->GetData());
|
||||
auto segment_ids_addr = reinterpret_cast<T3 *>(ctx.Input(2)->GetData());
|
||||
auto y_addr = reinterpret_cast<T1 *>(ctx.Output(0)->GetData());
|
||||
std::vector<int64_t> x_shape_list = ctx.Input(0)->GetTensorShape()->GetDimSizes();
|
||||
x_shape_list[0] = segment_ids_addr[m - 1] + 1;
|
||||
ctx.Output(0)->GetTensorShape()->SetDimSizes(x_shape_list);
|
||||
for (size_t i = 0; i < k; i++) {
|
||||
y_addr[i] = (T1)0;
|
||||
}
|
||||
if (segment_ids_addr[0] != 0) {
|
||||
KERNEL_LOG_ERROR("segment_ids can't miss ids.");
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
for (size_t i = 1; i < m; i++) {
|
||||
if (segment_ids_addr[i] < segment_ids_addr[i - 1]) {
|
||||
KERNEL_LOG_ERROR("segment_ids should be sorted.");
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
if (segment_ids_addr[i] - segment_ids_addr[i - 1] > 1) {
|
||||
KERNEL_LOG_ERROR("segment_ids can't miss ids.");
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
}
|
||||
for (size_t i = 0; i < m; i++) {
|
||||
if (indices_addr[i] >= ctx.Input(0)->GetTensorShape()->GetDimSize(0)) {
|
||||
KERNEL_LOG_ERROR("indices out of range.");
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
}
|
||||
int oldindex = -1;
|
||||
int countnum = 0;
|
||||
for (size_t i = 0; i < m; i++) {
|
||||
if (oldindex == segment_ids_addr[i]) {
|
||||
countnum++;
|
||||
} else if (countnum != 0) {
|
||||
for (size_t j = 0; j < n; j++) {
|
||||
y_addr[j + oldindex * n] /= (T1)(sqrt(countnum));
|
||||
}
|
||||
countnum = 1;
|
||||
oldindex = segment_ids_addr[i];
|
||||
} else {
|
||||
countnum = 1;
|
||||
oldindex = segment_ids_addr[i];
|
||||
}
|
||||
for (size_t j = 0; j < n; j++) {
|
||||
y_addr[j + oldindex * n] += x_addr[j + indices_addr[i] * n];
|
||||
}
|
||||
}
|
||||
if (countnum != 0) {
|
||||
for (size_t j = 0; j < n; j++) {
|
||||
y_addr[j + oldindex * n] /= (T1)(sqrt(countnum));
|
||||
}
|
||||
}
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
REGISTER_CPU_KERNEL(SparseSegmentSqrtN, SparseSegmentSqrtNCpuKernel);
|
||||
} // namespace aicpu
|
|
@ -0,0 +1,38 @@
|
|||
/**
|
||||
* Copyright (c) Huawei Technologies Co., Ltd. 2022-2022. All rights reserved.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
#ifndef AICPU_KERNELS_NORMALIZED_SPARSE_SEGMENT_SQRT_N_H_
|
||||
#define AICPU_KERNELS_NORMALIZED_SPARSE_SEGMENT_SQRT_N_H_
|
||||
|
||||
#include "cpu_ops_kernel.h"
|
||||
#include "cpu_types.h"
|
||||
#include "utils/bcast.h"
|
||||
#include "utils/sparse_tensor.h"
|
||||
|
||||
namespace aicpu {
|
||||
class SparseSegmentSqrtNCpuKernel : public CpuKernel {
|
||||
public:
|
||||
SparseSegmentSqrtNCpuKernel() = default;
|
||||
~SparseSegmentSqrtNCpuKernel() override = default;
|
||||
|
||||
protected:
|
||||
uint32_t Compute(CpuKernelContext &ctx) override;
|
||||
|
||||
private:
|
||||
template <typename T1, typename T2, typename T3>
|
||||
uint32_t ComputeKernel(CpuKernelContext &ctx);
|
||||
};
|
||||
} // namespace aicpu
|
||||
#endif
|
|
@ -0,0 +1,171 @@
|
|||
/**
|
||||
* Copyright 2021 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include "unsorted_segment_prod.h"
|
||||
|
||||
#include <string>
|
||||
#include "cpu_kernel_utils.h"
|
||||
#include "cpu_types.h"
|
||||
#include "utils/eigen_tensor.h"
|
||||
#include "utils/kernel_util.h"
|
||||
|
||||
namespace {
|
||||
const char *kUnsortedSegmentProd = "UnsortedSegmentProd";
|
||||
const uint32_t input_num = 3;
|
||||
const uint32_t output_num = 1;
|
||||
constexpr int64_t kParallelDataNums = 64 * 1024;
|
||||
} // namespace
|
||||
|
||||
namespace aicpu {
|
||||
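// Output is initialized to 1, then each input row is multiplied into the row selected by its
// segment id; rows whose segment id is >= num_segments are skipped. Illustrative example
// (hypothetical input): x = {{1, 2}, {3, 4}, {5, 6}}, segment_ids = {0, 0, 1}, num_segments = 3
// -> y = {{3, 8}, {5, 6}, {1, 1}}.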
template <typename input_t, typename segment_ids_t, typename num_segments_t>
|
||||
uint32_t UnsortedSegmentProdCpuKernel::UnsortedSegmentProdComputeTemplate(CpuKernelContext &ctx) {
|
||||
KERNEL_HANDLE_ERROR(NormalCheck(ctx, input_num, output_num),
"UnsortedSegmentProd check input and output number failed.");
|
||||
if (ctx.Input(0)->GetDataType() != ctx.Output(0)->GetDataType()) {
|
||||
KERNEL_LOG_ERROR("The data type of the input [%s] need be the same as the output [%s]",
|
||||
DTypeStr(ctx.Input(0)->GetDataType()).c_str(), DTypeStr(ctx.Output(0)->GetDataType()).c_str());
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
if (ctx.Input(0)->GetDataSize() != ctx.Output(0)->GetDataSize()) {
|
||||
KERNEL_LOG_ERROR(
|
||||
"The data size of the input [%llu] need be the same as the output "
|
||||
"[%llu]",
|
||||
ctx.Input(0)->GetDataSize(), ctx.Output(0)->GetDataSize());
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
int64_t data_size = ctx.Input(0)->NumElements();
|
||||
int64_t id_size = ctx.Input(1)->NumElements();
|
||||
|
||||
auto input_x = reinterpret_cast<input_t *>(ctx.Input(0)->GetData());
|
||||
KERNEL_CHECK_NULLPTR(input_x, KERNEL_STATUS_PARAM_INVALID, "Get input data failed")
|
||||
auto output_y = reinterpret_cast<input_t *>(ctx.Output(0)->GetData());
|
||||
KERNEL_CHECK_NULLPTR(output_y, KERNEL_STATUS_PARAM_INVALID, "Get output data failed")
|
||||
auto segmentids = reinterpret_cast<segment_ids_t *>(ctx.Input(1)->GetData());
|
||||
  KERNEL_CHECK_NULLPTR(segmentids, KERNEL_STATUS_PARAM_INVALID, "Get segment_ids failed")
  auto numsegments = reinterpret_cast<num_segments_t *>(ctx.Input(2)->GetData());
  KERNEL_CHECK_NULLPTR(numsegments, KERNEL_STATUS_PARAM_INVALID, "Get num_segments failed")

  if (id_size <= 0) {
    KERNEL_LOG_ERROR("segment_ids num elements should be greater than 0");
    return KERNEL_STATUS_PARAM_INVALID;
  }

  int64_t reshapesize = data_size / id_size;
  // Initialize every output element to 1 (the multiplicative identity) so that
  // segments that receive no input keep a well-defined value.
  for (int64_t k = 0; k < data_size; k++) {
    *(output_y + k) = static_cast<input_t>(1);
  }
  if (data_size <= kParallelDataNums) {
    // Small inputs: single-threaded calculation.
    for (int64_t i = 0; i < id_size; i++) {
      if (*(segmentids + i) < *numsegments) {
        for (int64_t j = 0; j < reshapesize; j++) {
          *(output_y + *(segmentids + i) * reshapesize + j) *= *(input_x + i * reshapesize + j);
        }
      }
    }
  } else {
    uint32_t min_core_num = 1;
    uint32_t max_core_num = std::max(min_core_num, aicpu::CpuKernelUtils::GetCPUNum(ctx) - 2);
    if (max_core_num > reshapesize) {
      max_core_num = reshapesize;
    }
    // Large inputs: shard along the inner (reshapesize) dimension so that
    // concurrent workers never multiply into the same output element.
    auto shard_unsorted_segment_prod = [&](int64_t start, int64_t end) {
      for (int64_t i = 0; i < id_size; i++) {
        if (*(segmentids + i) < *numsegments) {
          for (int64_t j = start; j < end; j++) {
            *(output_y + *(segmentids + i) * reshapesize + j) *= *(input_x + i * reshapesize + j);
          }
        }
      }
    };
    KERNEL_HANDLE_ERROR(
      CpuKernelUtils::ParallelFor(ctx, reshapesize, reshapesize / max_core_num, shard_unsorted_segment_prod),
      "CpuKernelUtils::ParallelFor failed.");
  }
  return KERNEL_STATUS_OK;
}

template <typename input_t, typename segment_ids_t>
uint32_t UnsortedSegmentProdCpuKernel::DoComputeWithNumSegmentsType(CpuKernelContext &ctx, DataType num_segments_type) {
  switch (num_segments_type) {
    case DT_INT32:
      return UnsortedSegmentProdComputeTemplate<input_t, segment_ids_t, int32_t>(ctx);
    case DT_INT64:
      return UnsortedSegmentProdComputeTemplate<input_t, segment_ids_t, int64_t>(ctx);
    default:
      KERNEL_LOG_ERROR("UnsortedSegmentProd invalid num_segments_type type [%s]", DTypeStr(num_segments_type).c_str());
      return KERNEL_STATUS_PARAM_INVALID;
  }
}

template <typename input_t>
uint32_t UnsortedSegmentProdCpuKernel::DoComputeWithSegmentIdsType(CpuKernelContext &ctx, DataType segment_ids_type) {
  auto num_segments_type = ctx.Input(2)->GetDataType();
  switch (segment_ids_type) {
    case DT_INT32:
      return DoComputeWithNumSegmentsType<input_t, int32_t>(ctx, num_segments_type);
    case DT_INT64:
      return DoComputeWithNumSegmentsType<input_t, int64_t>(ctx, num_segments_type);
    default:
      KERNEL_LOG_ERROR("UnsortedSegmentProd invalid segment_ids_type type [%s]", DTypeStr(segment_ids_type).c_str());
      return KERNEL_STATUS_PARAM_INVALID;
  }
}

uint32_t UnsortedSegmentProdCpuKernel::Compute(CpuKernelContext &ctx) {
  auto input_type = ctx.Input(0)->GetDataType();
  auto segment_ids_type = ctx.Input(1)->GetDataType();
  switch (input_type) {
    case DT_INT32:
      return DoComputeWithSegmentIdsType<int32_t>(ctx, segment_ids_type);
    case DT_INT16:
      return DoComputeWithSegmentIdsType<int16_t>(ctx, segment_ids_type);
    case DT_FLOAT:
      return DoComputeWithSegmentIdsType<float>(ctx, segment_ids_type);
    case DT_DOUBLE:
      return DoComputeWithSegmentIdsType<double>(ctx, segment_ids_type);
    case DT_FLOAT16:
      return DoComputeWithSegmentIdsType<Eigen::half>(ctx, segment_ids_type);
    case DT_INT8:
      return DoComputeWithSegmentIdsType<int8_t>(ctx, segment_ids_type);
    case DT_INT64:
      return DoComputeWithSegmentIdsType<int64_t>(ctx, segment_ids_type);
    case DT_UINT8:
      return DoComputeWithSegmentIdsType<uint8_t>(ctx, segment_ids_type);
    case DT_UINT16:
      return DoComputeWithSegmentIdsType<uint16_t>(ctx, segment_ids_type);
    case DT_UINT32:
      return DoComputeWithSegmentIdsType<uint32_t>(ctx, segment_ids_type);
    case DT_UINT64:
      return DoComputeWithSegmentIdsType<uint64_t>(ctx, segment_ids_type);
    case DT_COMPLEX64:
      return DoComputeWithSegmentIdsType<std::complex<float>>(ctx, segment_ids_type);
    case DT_COMPLEX128:
      return DoComputeWithSegmentIdsType<std::complex<double>>(ctx, segment_ids_type);
    default:
      KERNEL_LOG_ERROR("UnsortedSegmentProd invalid input type [%s]", DTypeStr(input_type).c_str());
      return KERNEL_STATUS_PARAM_INVALID;
  }
  return KERNEL_STATUS_OK;
}

REGISTER_CPU_KERNEL(kUnsortedSegmentProd, UnsortedSegmentProdCpuKernel);
} // namespace aicpu
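
For readers new to this op, the kernel above implements the standard unsorted segment product: the output is initialized to 1 and each input slice i is multiplied into the segment named by segment_ids[i], while out-of-range ids are skipped. The following is a minimal standalone sketch of that semantics on flat double data; the function and variable names are illustrative and not part of the migrated kernel, and the negative-id check follows the op's usual definition rather than this kernel's code.

#include <cstdint>
#include <iostream>
#include <vector>

// Reference semantics on flat data: every segment starts at 1 (the
// multiplicative identity) and input element i is multiplied into segment
// segment_ids[i]; ids outside [0, num_segments) are dropped.
std::vector<double> UnsortedSegmentProdRef(const std::vector<double> &input,
                                           const std::vector<int64_t> &segment_ids,
                                           int64_t num_segments) {
  std::vector<double> output(static_cast<size_t>(num_segments), 1.0);
  for (size_t i = 0; i < input.size(); ++i) {
    const int64_t id = segment_ids[i];
    if (id >= 0 && id < num_segments) {
      output[static_cast<size_t>(id)] *= input[i];
    }
  }
  return output;
}

int main() {
  // input = [2, 3, 4, 5], segment_ids = [0, 0, 1, 3], num_segments = 4
  // -> output = [6, 4, 1, 5]; segment 2 receives no input and stays at 1.
  for (double v : UnsortedSegmentProdRef({2, 3, 4, 5}, {0, 0, 1, 3}, 4)) {
    std::cout << v << " ";
  }
  std::cout << std::endl;
  return 0;
}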

@@ -0,0 +1,37 @@
/**
 * Copyright 2020 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#ifndef AICPU_KERNELS_NORMALIZED_UNSORTED_SEGMENT_PROD_H
#define AICPU_KERNELS_NORMALIZED_UNSORTED_SEGMENT_PROD_H

#include "cpu_ops_kernel.h"

namespace aicpu {
class UnsortedSegmentProdCpuKernel : public CpuKernel {
 public:
  ~UnsortedSegmentProdCpuKernel() = default;
  uint32_t Compute(CpuKernelContext &ctx) override;

 private:
  template <typename input_t, typename segment_ids_t, typename num_segments_t>
  uint32_t UnsortedSegmentProdComputeTemplate(CpuKernelContext &ctx);
  template <typename input_t, typename segment_ids_t>
  uint32_t DoComputeWithNumSegmentsType(CpuKernelContext &ctx, DataType num_segments_type);
  template <typename input_t>
  uint32_t DoComputeWithSegmentIdsType(CpuKernelContext &ctx, DataType segment_ids_type);
};
} // namespace aicpu
#endif  // AICPU_KERNELS_NORMALIZED_UNSORTED_SEGMENT_PROD_H
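
The declaration above mirrors the staged type dispatch used in the .cc file: Compute resolves the input dtype, DoComputeWithSegmentIdsType resolves the segment_ids dtype, and DoComputeWithNumSegmentsType resolves the num_segments dtype before the fully typed template does the work. Below is a stripped-down sketch of that pattern with a toy enum and toy function names; it is not the framework's API, only an illustration of the dispatch shape.

#include <cstdint>
#include <iostream>

// Toy runtime "dtype" tags standing in for the framework's DataType enum.
enum class DType { kInt32, kInt64 };

// Innermost worker: fully typed once both runtime tags have been resolved.
template <typename InputT, typename IdT>
void DoWork() {
  std::cout << "resolved to sizeof(InputT)=" << sizeof(InputT)
            << ", sizeof(IdT)=" << sizeof(IdT) << std::endl;
}

// Second stage: resolve the id dtype, forwarding the already-resolved InputT.
template <typename InputT>
void DispatchIdType(DType id_type) {
  switch (id_type) {
    case DType::kInt32: return DoWork<InputT, int32_t>();
    case DType::kInt64: return DoWork<InputT, int64_t>();
  }
}

// First stage: resolve the input dtype.
void Dispatch(DType input_type, DType id_type) {
  switch (input_type) {
    case DType::kInt32: return DispatchIdType<int32_t>(id_type);
    case DType::kInt64: return DispatchIdType<int64_t>(id_type);
  }
}

int main() {
  Dispatch(DType::kInt32, DType::kInt64);  // resolves to DoWork<int32_t, int64_t>()
  return 0;
}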

@@ -41,7 +41,7 @@ uint32_t EqualCalculate(const CpuKernelContext &ctx, BCalcInfo &calcInfo, bool f
output_y[i] = (flag == true) ? (*x_index == *y_index) : (*x_index != *y_index);
}
};
KERNEL_HANDLE_ERROR(CpuKernelUtils::ParallelFor(ctx, data_num, 1, shard_equal), "Equal calculate failed.")
KERNEL_HANDLE_ERROR(CpuKernelUtils::ParallelFor(ctx, data_num, 1, shard_equal), "Equal calculate failed.");
return KERNEL_STATUS_OK;
}
/**

@@ -69,7 +69,7 @@ uint32_t EqualCompute(const CpuKernelContext &ctx, bool flag) {
calcInfo.input_1->GetDataSize(), calcInfo.output->GetData(), calcInfo.output->GetDataSize());

Bcast bcast;
KERNEL_HANDLE_ERROR(bcast.GenerateBcastInfo(calcInfo), "Generate broadcast info failed.")
KERNEL_HANDLE_ERROR(bcast.GenerateBcastInfo(calcInfo), "Generate broadcast info failed.");
bcast.BCastIndexes(calcInfo.x_indexes, calcInfo.y_indexes);
bcast.GetBcastVec(calcInfo);
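
The two equal.cc hunks above only add the trailing semicolons that were missing after the KERNEL_HANDLE_ERROR(...) calls. Whether that semicolon is strictly required depends on how the macro is defined; the conventional way to make it mandatory and safe inside if/else is the do { ... } while (0) idiom, sketched below with an illustrative stand-in macro (the real KERNEL_HANDLE_ERROR in this code base may be defined differently).

#include <cstdio>

// Illustrative stand-in for a status-checking macro; not the framework's definition.
#define CHECK_STATUS(expr, msg)     \
  do {                              \
    if ((expr) != 0) {              \
      std::printf("%s\n", (msg));   \
      return 1;                     \
    }                               \
  } while (0)

int main() {
  // Because the macro expands to a single do { ... } while (0) statement, the
  // trailing ';' is always required and the call nests safely inside if/else.
  if (true)
    CHECK_STATUS(0, "never printed");
  else
    std::printf("never reached\n");
  CHECK_STATUS(1, "non-zero status reported, returning early");
  std::printf("not reached\n");
  return 0;
}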

@@ -51,8 +51,12 @@ const AnfNodePtr AICpuLibSelectPass::Process(const FuncGraphPtr &graph, const An
static const std::set<std::string> kMigrateAicpuKernelOps = {mindspore::kAdaptiveAvgPool2dOpName,
mindspore::kAdaptiveAvgPool2dGradOpName,
mindspore::kCacheSwapTableOpName,
mindspore::kCol2imOpName,
mindspore::kCumulativeLogsumexpOpName,
mindspore::kDataFormatVecPermuteOpName,
mindspore::kFillOpName,
mindspore::kLogMatrixDeterminantOpName,
mindspore::kMatrixSolveLsOpName,
mindspore::kMaskedSelectOpName,
mindspore::kMaskedSelectGradOpName,
mindspore::kMedianOpName,

@@ -71,6 +75,10 @@ const AnfNodePtr AICpuLibSelectPass::Process(const FuncGraphPtr &graph, const An
mindspore::kNanToNumOpName,
mindspore::kQrOpName,
mindspore::kResizeBicubicOpName};
mindspore::kNuclearNormOpName,
mindspore::kQuantileOpName,
mindspore::kSparseSegmentSqrtNOpName,
mindspore::kUnsortedSegmentProdOpName};
static const std::string kEnvOpSoNames = "mindspore_aicpu_kernels";
static const std::string kCpuKernelSoName = "mindspore_cpu_kernels";
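
These two hunks register the newly migrated ops (Col2im, MatrixSolveLs, NuclearNorm, Quantile, SparseSegmentSqrtN, UnsortedSegmentProd) in kMigrateAicpuKernelOps, the set the pass consults when deciding which kernel shared object an AICPU node should use. A minimal sketch of that membership check follows; the helper name is illustrative, the op-name strings are assumed from the constant names, and the actual mapping to the two .so names is decided inside the pass itself.

#include <iostream>
#include <set>
#include <string>

// Illustrative helper only, not the pass's real code: the pass keeps a set of
// op names whose AICPU kernels have been migrated and branches on membership
// when choosing the kernel shared object to record on the node.
bool IsMigratedAicpuOp(const std::string &op_name) {
  static const std::set<std::string> kMigratedOps = {"Col2im",   "MatrixSolveLs",      "NuclearNorm",
                                                     "Quantile", "SparseSegmentSqrtN", "UnsortedSegmentProd"};
  return kMigratedOps.count(op_name) > 0;
}

int main() {
  std::cout << std::boolalpha << IsMigratedAicpuOp("UnsortedSegmentProd") << std::endl;  // true
  std::cout << std::boolalpha << IsMigratedAicpuOp("Conv2D") << std::endl;               // false
  return 0;
}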

@@ -171,3 +171,5 @@ from .median_grad import _median_grad_aicpu
from .reduce_sum import _reduce_sum_aicpu
from .adaptive_avg_pool_2d_v1 import _adaptive_avg_pool_2d_v1_aicpu
from .fill_v2 import _fill_v2_aicpu
from .data_format_vec_permute import _data_format_vec_permute_aicpu
from .quantile import _quantile_aicpu