From 08b0e47e507cfb1e82ec97566822531db88b7d23 Mon Sep 17 00:00:00 2001
From: lilinjie
Date: Thu, 29 Dec 2022 15:00:11 +0800
Subject: [PATCH] aicpu migration from pzh

---
 .jenkins/check/config/filter_cppcheck.txt     |   9 +-
 .jenkins/check/config/whitelizard.txt         |   2 +
 mindspore/ccsrc/include/common/utils/utils.h  |  10 +
 .../aicpu_ops/cpu_kernel/ms_kernel/col2im.cc  | 236 +++++
 .../aicpu_ops/cpu_kernel/ms_kernel/col2im.h   |  50 ++
 .../ms_kernel/cumulativelogsumexp.cc          | 211 ++++++++
 .../ms_kernel/cumulativelogsumexp.h           |  38 ++
 .../ms_kernel/data_format_vec_permute.cc      | 126 +++++
 .../ms_kernel/data_format_vec_permute.h       |  35 ++
 .../cpu_kernel/ms_kernel/matrix_solve_ls.cc   | 455 ++++++++++++++++++
 .../cpu_kernel/ms_kernel/matrix_solve_ls.h    |  62 +++
 .../cpu_kernel/ms_kernel/nuclear_norm.cc      | 440 +++++++++++++++++
 .../cpu_kernel/ms_kernel/nuclear_norm.h       |  66 +++
 .../cpu_kernel/ms_kernel/quantile.cc          | 410 ++++++++++++++++
 .../aicpu_ops/cpu_kernel/ms_kernel/quantile.h |  61 +++
 .../ms_kernel/sparse_segment_sqrt_n.cc        | 154 ++++++
 .../ms_kernel/sparse_segment_sqrt_n.h         |  38 ++
 .../ms_kernel/unsorted_segment_prod.cc        | 171 +++++++
 .../ms_kernel/unsorted_segment_prod.h         |  37 ++
 .../aicpu_ops/cpu_kernel/utils/equal_util.h   |   4 +-
 .../optimizer/mindir/aicpu_lib_select.cc      |   8 +
 .../mindspore/ops/_op_impl/aicpu/__init__.py  |   2 +
 22 files changed, 2615 insertions(+), 10 deletions(-)
 create mode 100644 mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/col2im.cc
 create mode 100644 mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/col2im.h
 create mode 100644 mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/cumulativelogsumexp.cc
 create mode 100644 mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/cumulativelogsumexp.h
 create mode 100644 mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/data_format_vec_permute.cc
 create mode 100644 mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/data_format_vec_permute.h
 create mode 100644 mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/matrix_solve_ls.cc
 create mode 100644 mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/matrix_solve_ls.h
 create mode 100644 mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/nuclear_norm.cc
 create mode 100644 mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/nuclear_norm.h
 create mode 100644 mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/quantile.cc
 create mode 100644 mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/quantile.h
 create mode 100644 mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/sparse_segment_sqrt_n.cc
 create mode 100644 mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/sparse_segment_sqrt_n.h
 create mode 100644 mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/unsorted_segment_prod.cc
 create mode 100644 mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/unsorted_segment_prod.h

diff --git a/.jenkins/check/config/filter_cppcheck.txt b/.jenkins/check/config/filter_cppcheck.txt
index fa0cfa41781..755c0bf6e28 100644
--- a/.jenkins/check/config/filter_cppcheck.txt
+++ b/.jenkins/check/config/filter_cppcheck.txt
@@ -83,18 +83,10 @@
 "mindspore/mindspore/lite/tools/kernel_builder/ascend/tbe_tik/sample/op_proto/matmul_tik.cc" "syntaxError"

 # AICPU migration
-"mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/" "useStlAlgorithm"
-"mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/" "variableScope"
-"mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/" "constParameter"
 "mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/" "constVariable"
-"mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/" "unreadVariable"
 "mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/" "redundantAssignment"
 "mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/" "constArgument"
-"mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/utils/" "useStlAlgorithm"
-"mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/utils/" "variableScope"
-"mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/utils/" "constParameter"
 "mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/utils/" "constVariable"
-"mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/utils/" "unreadVariable"
 "mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/" "nullPointerRedundantCheck"
 "mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/" "variableScope"
 "mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/" "unreadVariable"
@@ -104,3 +96,4 @@
 "mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/" "knownConditionTrueFalse"
 "mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/" "passedByValue"
 "mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/" "uninitvar"
+"mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/" "shadowVariable"
diff --git a/.jenkins/check/config/whitelizard.txt b/.jenkins/check/config/whitelizard.txt
index 930dfd0e2aa..151a8efc3e1 100644
--- a/.jenkins/check/config/whitelizard.txt
+++ b/.jenkins/check/config/whitelizard.txt
@@ -280,3 +280,5 @@ mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel
 mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/multinomial.cc:aicpu::Generate
 mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/matrix_set_diag_v3.cc:aicpu::MatrixSetDiagV3CpuKernel::DoCompute
 mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/max_unpool_2d.cc:aicpu::MaxUnpool2DCpuKernel::MaxUnpool2DCompute
+mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/matrix_solve_ls.cc:aicpu::MatrixSolveLsCpuKernel::Compute
+mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/col2im.cc:aicpu::Col2imCpuKernel::Col2imParamCheck
diff --git a/mindspore/ccsrc/include/common/utils/utils.h b/mindspore/ccsrc/include/common/utils/utils.h
index 7c826527afb..d2f6588cb26 100644
--- a/mindspore/ccsrc/include/common/utils/utils.h
+++ b/mindspore/ccsrc/include/common/utils/utils.h
@@ -163,6 +163,7 @@ constexpr auto kClipBoxesDOpName = "kClipBoxesD";
 constexpr auto kClipByNormNoDivSumOpName = "ClipByNormNoDivSum";
 constexpr auto kClipByValueOpName = "ClipByValue";
 constexpr auto kCoalesceOpName = "Coalesce";
+constexpr auto kCol2imOpName = "Col2im";
 constexpr auto kCombineMomentumOpName = "CombineMomentum";
 constexpr auto kCombineMomentumWeightOpName = "CombineMomentumWeight";
 constexpr auto kComputeAccidentalHitsOpName = "ComputeAccidentalHits";
@@ -209,6 +210,7 @@ constexpr auto kCumSumOpName = "CumSum";
 constexpr auto kDataFormatDimMapOpName = "DataFormatDimMap";
 constexpr auto kCumulativeLogsumexpOpName = "CumulativeLogsumexp";
 constexpr auto kCumulativeLogsumexpDOpName = "CumulativeLogsumexpD";
+constexpr auto kDataFormatVecPermuteOpName = "DataFormatVecPermute";
 constexpr auto kDeadNodeName = "DeadNode";
 constexpr auto kDenseToDenseSetOperation = "DenseToDenseSetOperation";
 constexpr auto kDenseToSparseSetOperation = "DenseToSparseSetOperation";
@@ -346,6 +348,8 @@ constexpr auto kInplaceUpdateOpName = "InplaceUpdate";
 constexpr auto kInplaceUpdateDOpName = "InplaceUpdateD";
 constexpr auto kInstanceNorm = "InstanceNorm";
 constexpr auto kInstanceNormGradOpName = "InstanceNormGrad";
+constexpr auto kInstanceNormV2OpName = "InstanceNormV2";
+constexpr auto kInstanceNormV2GradOpName = "InstanceNormV2Grad";
 constexpr auto kInTopKOpName = "InTopK";
 constexpr auto kInTopKDOpName = "InTopKD";
 constexpr auto kIsInfOpName = "IsInf";
@@ -403,6 +407,7 @@ constexpr auto kMatrixLogarithmOpName = "MatrixLogarithm";
 constexpr auto kMatrixSetDiagOpName = "MatrixSetDiag";
 constexpr auto kMatrixSetDiagDOpName = "MatrixSetDiagD";
 constexpr auto kMatrixSetDiagV3OpName = "MatrixSetDiagV3";
+constexpr auto kMatrixSolveLsOpName = "MatrixSolveLs";
 constexpr auto kMaximumGradOpName = "MaximumGrad";
 constexpr auto kMaximumOpName = "Maximum";
 constexpr auto kMaxPool3DGradGradOpName = "MaxPool3DGradGrad";
@@ -422,6 +427,7 @@ constexpr auto kMedianGradOpName = "MedianGrad";
 constexpr auto kMemCpyAsyncOpName = "memcpy_async";
 constexpr auto kMinimumGradOpName = "MinimumGrad";
 constexpr auto kMinimumOpName = "Minimum";
+constexpr auto kMirrorPadOpName = "MirrorPad";
 constexpr auto kMomentumOpName = "Momentum";
 constexpr auto kMulOpName = "Mul";
 constexpr auto kMultinomialOpName = "Multinomial";
@@ -439,6 +445,7 @@ constexpr auto kNonZeroOpName = "NonZero";
 constexpr auto kNPUAllocFloatStatusOpName = "NPUAllocFloatStatus";
 constexpr auto kNPUClearFloatStatusOpName = "NPUClearFloatStatus";
 constexpr auto kNPUGetFloatStatusOpName = "NPUGetFloatStatus";
+constexpr auto kNuclearNormOpName = "NuclearNorm";
 constexpr auto kOneHotOpName = "OneHot";
 constexpr auto kOneHotDOpName = "OneHotD";
 constexpr auto kPadAndShiftOpName = "PadAndShift";
@@ -472,6 +479,7 @@ constexpr auto kPullWeightOpName = "PullWeight";
 constexpr auto kPushOpName = "Push";
 constexpr auto kQrOpName = "Qr";
 constexpr auto kPushWeightOpName = "PushWeight";
+constexpr auto kQuantileOpName = "Quantile";
 constexpr auto kRandomChoiceWithMaskOpName = "RandomChoiceWithMask";
 constexpr auto kRandomShuffleOpName = "RandomShuffle";
 constexpr auto kRangeOpName = "Range";
@@ -591,6 +599,7 @@ constexpr auto kSparseSliceOpName = "SparseSlice";
 constexpr auto kSparseSoftmaxCrossEntropyWithLogitsOpName = "SparseSoftmaxCrossEntropyWithLogits";
 constexpr auto kSparseSparseMinimumOpName = "SparseSparseMinimum";
 constexpr auto kSparseSparseMaximumOpName = "SparseSparseMaximum";
+constexpr auto kSparseTensorDenseMatMulOpName = "SparseTensorDenseMatMul";
 constexpr auto kSplitOpName = "Split";
 constexpr auto kSplitDOpName
= "SplitD"; constexpr auto kSplitVOpName = "SplitV"; @@ -604,6 +613,7 @@ constexpr auto kStackDestroyOpName = "StackDestroy"; constexpr auto kStackInitOpName = "StackInit"; constexpr auto kStackOpName = "Stack"; constexpr auto kPackOpName = "Pack"; +constexpr auto kSparseSegmentSqrtNOpName = "SparseSegmentSqrtN"; constexpr auto kStackPopOpName = "StackPop"; constexpr auto kStackPushOpName = "StackPush"; constexpr auto kStandardLaplaceOpName = "StandardLaplace"; diff --git a/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/col2im.cc b/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/col2im.cc new file mode 100644 index 00000000000..4cf6c99870f --- /dev/null +++ b/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/col2im.cc @@ -0,0 +1,236 @@ +/** + * Copyright 2021 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "col2im.h" + +#include + +#include "cpu_ops_kernel.h" +#include "cpu_kernel_utils.h" +#include "status.h" +#include "utils/eigen_tensor.h" +#include "utils/kernel_util.h" + +namespace { +const uint32_t kCol2imInputNum = 2; +const uint32_t kCol2imOutputNum = 1; +constexpr uint32_t kValue0 = 0; +constexpr uint32_t kValue1 = 1; +constexpr uint32_t kValue2 = 2; +constexpr uint32_t kValue4 = 4; +constexpr uint32_t kIndex0 = 0; +constexpr uint32_t kIndex1 = 1; +constexpr uint32_t kIndex2 = 2; +constexpr uint32_t kIndex3 = 3; +const char *kCol2im = "Col2im"; +} // namespace + +namespace aicpu { +uint32_t Col2imCpuKernel::Compute(CpuKernelContext &ctx) { + KERNEL_HANDLE_ERROR(Col2imParamCheck(ctx), "[%s] check params failed.", kCol2im); + auto data_type = ctx.Input(0)->GetDataType(); + uint32_t ret = KERNEL_STATUS_OK; + switch (data_type) { + case DT_FLOAT: + ret = Col2imCompute(ctx); + break; + case DT_FLOAT16: + ret = Col2imCompute(ctx); + break; + default: + KERNEL_LOG_ERROR("Range kernel data type [%s] not support.", DTypeStr(data_type).c_str()); + ret = KERNEL_STATUS_PARAM_INVALID; + break; + } + + return ret; +} + +template +static inline T div_rtn(T x, T y) { + int q = x / y; + int r = x % y; + if ((r != 0) && ((r < 0) != (y < 0))) { + --q; + } + return q; +} + +uint32_t Col2imCpuKernel::Col2imParamCheck(CpuKernelContext &ctx) { + KERNEL_HANDLE_ERROR(NormalCheck(ctx, kCol2imInputNum, kCol2imOutputNum), "[%s] check params failed.", kCol2im); + Tensor *input_ = ctx.Input(0); + Tensor *output_size_ = ctx.Input(1); + KERNEL_CHECK_NULLPTR(ctx.GetAttr("kernel_size"), KERNEL_STATUS_PARAM_INVALID, + "Get ctx.GetAttr(\"kernel_size\") failed."); + KERNEL_CHECK_NULLPTR(ctx.GetAttr("dilation"), KERNEL_STATUS_PARAM_INVALID, "Get ctx.GetAttr(\"dilation\") failed."); + KERNEL_CHECK_NULLPTR(ctx.GetAttr("padding"), KERNEL_STATUS_PARAM_INVALID, "Get ctx.GetAttr(\"padding\") failed."); + KERNEL_CHECK_NULLPTR(ctx.GetAttr("stride"), KERNEL_STATUS_PARAM_INVALID, "Get ctx.GetAttr(\"stride\") failed."); + std::vector kernel_size = 
ctx.GetAttr("kernel_size")->GetListInt(); + std::vector dilation = ctx.GetAttr("dilation")->GetListInt(); + std::vector padding = ctx.GetAttr("padding")->GetListInt(); + std::vector stride = ctx.GetAttr("stride")->GetListInt(); + auto output_size_shape = output_size_->GetTensorShape()->GetDimSizes(); + KERNEL_CHECK_FALSE((output_size_shape.size() == kValue1 && output_size_->NumElements() == kValue2), + KERNEL_STATUS_PARAM_INVALID, + "Expected 1D tensor for output_size with non-zero dimensions for and " + "output_size's size equals to 2, but " + "got %dD tensor for output_size and output_size's size equals to %d.", + output_size_shape.size(), output_size_->NumElements()); + KERNEL_CHECK_FALSE(kernel_size.size() == kValue2, KERNEL_STATUS_PARAM_INVALID, + "It is expected kernel_size's size equals to 2, but got size %d.", kernel_size.size()); + KERNEL_CHECK_FALSE(dilation.size() == kValue2, KERNEL_STATUS_PARAM_INVALID, + "It is expected dilation_size equals to 2, but got size %d.", dilation.size()); + KERNEL_CHECK_FALSE(padding.size() == kValue2, KERNEL_STATUS_PARAM_INVALID, + "It is expected padding_size equals to 2, but got size %d.", padding.size()); + KERNEL_CHECK_FALSE(stride.size() == kValue2, KERNEL_STATUS_PARAM_INVALID, + "It is expected stride_size equals to 2, but got size %d.", stride.size()); + int32_t *output_size_data = reinterpret_cast(output_size_->GetData()); + std::vector output_size(kValue2, kValue0); + output_size[kIndex0] = output_size_data[kIndex0]; + output_size[kIndex1] = output_size_data[kIndex1]; + const int64_t output_height = output_size.front(); + const int64_t output_width = output_size.back(); + const int64_t kernel_height = kernel_size.front(); + const int64_t kernel_width = kernel_size.back(); + const int64_t dilation_height = dilation.front(); + const int64_t dilation_width = dilation.back(); + const int64_t pad_height = padding.front(); + const int64_t pad_width = padding.back(); + const int64_t stride_height = stride.front(); + const int64_t stride_width = stride.back(); + KERNEL_CHECK_FALSE(output_width > kValue0 && output_height > kValue0, KERNEL_STATUS_PARAM_INVALID, + "output should be greater than zero, but got " + "output_height: %d output_width: %d.", + output_height, output_width); + KERNEL_CHECK_FALSE(kernel_width > kValue0 && kernel_height > kValue0, KERNEL_STATUS_PARAM_INVALID, + "kernel should be greater than zero, but got " + "kernel_height: %d kernel_width: %d.", + kernel_height, kernel_width); + KERNEL_CHECK_FALSE(dilation_width > kValue0 && dilation_height > kValue0, KERNEL_STATUS_PARAM_INVALID, + "dilation should be greater than zero, but got " + "dilation_height: %d dilation_width: %d.", + dilation_height, dilation_width); + KERNEL_CHECK_FALSE(pad_width >= kValue0 && pad_height >= kValue0, KERNEL_STATUS_PARAM_INVALID, + "padding should be greater than zero, but got pad_height: " + "%d pad_width: %d.", + pad_height, pad_width); + KERNEL_CHECK_FALSE(stride_width > kValue0 && stride_height > kValue0, KERNEL_STATUS_PARAM_INVALID, + "stride should be greater than zero, but got " + "stride_height: %d stride_width: %d.", + stride_height, stride_width); + auto input_shape = input_->GetTensorShape()->GetDimSizes(); + KERNEL_CHECK_FALSE( + (input_shape.size() == kValue4 && input_shape[kIndex0] != kValue0 && input_shape[kIndex1] != kValue0 && + input_shape[kIndex2] != kValue0 && input_shape[kIndex3] != kValue0), + KERNEL_STATUS_PARAM_INVALID, + "Expected 4D (batch mode) tensor for input with non-zero " + "batch size and non-zero dimensions for input, 
but got %dD input: (%d %d " + "%d %d).", + input_shape.size(), input_shape[kIndex0], input_shape[kIndex1], input_shape[kIndex2], input_shape[kIndex3]); + KERNEL_CHECK_FALSE(input_shape[kIndex2] == (kernel_width * kernel_height), KERNEL_STATUS_PARAM_INVALID, + "Expected size of input's dimension 2 to match the calculated " + "number of kernel_size, but got input_shape[2]=%d and kernel_size=(%d, " + "%d).", + input_shape[kIndex2], kernel_height, kernel_width); + auto input_length = input_shape[kIndex3]; + int64_t n_blocks_height = + div_rtn(output_height + 2 * pad_height - dilation_height * (kernel_height - 1) - 1, stride_height) + 1; + int64_t n_blocks_width = + div_rtn(output_width + 2 * pad_width - dilation_width * (kernel_width - 1) - 1, stride_width) + 1; + KERNEL_CHECK_FALSE(input_length == (n_blocks_height * n_blocks_width), KERNEL_STATUS_PARAM_INVALID, + "Given output_size=(%d, %d), kernel_size=(%d, %d), dilation=(%d, %d", + "), padding=(%d, %d), stride=(%d, %d), expected size of input's " + "dimension 2 to match the calculated " + "number of sliding blocks %d * %d = %d, but got input.size(2)=%d.", + output_height, output_width, kernel_height, kernel_width, dilation_height, dilation_width, + pad_height, pad_width, stride_height, stride_width, n_blocks_height, n_blocks_width, + (n_blocks_height * n_blocks_width), input_length); + return KERNEL_STATUS_OK; +} + +template +void Col2imCpuKernel::InnerCompute(int64_t c_col, int64_t input_offset, int64_t output_offset, T *input_data, + T *output_data) { + int64_t w_offset = c_col % kernel_width; + int64_t h_offset = (c_col / kernel_width) % kernel_height; + int64_t c_im = c_col / kernel_height / kernel_width; + for (int64_t h_col = 0; h_col < height_col; ++h_col) { + int64_t h_im = h_col * stride_height - pad_height + h_offset * dilation_height; + for (int64_t w_col = 0; w_col < width_col; ++w_col) { + int64_t w_im = w_col * stride_width - pad_width + w_offset * dilation_width; + if (h_im >= 0 && h_im < output_height && w_im >= 0 && w_im < output_width) { + output_data[output_offset + (c_im * output_height + h_im) * output_width + w_im] += + input_data[input_offset + (c_col * height_col + h_col) * width_col + w_col]; + } + } + } +} + +template +uint32_t Col2imCpuKernel::Col2imCompute(CpuKernelContext &ctx) { + Tensor *input_ = ctx.Input(0); + Tensor *output_size_ = ctx.Input(1); + Tensor *output_ = ctx.Output(0); + int32_t *output_size_data = reinterpret_cast(output_size_->GetData()); + std::vector output_size(kValue2, kValue0); + output_size[kIndex0] = output_size_data[kIndex0]; + output_size[kIndex1] = output_size_data[kIndex1]; + + std::vector kernel_size = ctx.GetAttr("kernel_size")->GetListInt(); + std::vector dilation = ctx.GetAttr("dilation")->GetListInt(); + std::vector padding = ctx.GetAttr("padding")->GetListInt(); + std::vector stride = ctx.GetAttr("stride")->GetListInt(); + + output_height = output_size.front(); + output_width = output_size.back(); + kernel_height = kernel_size.front(); + kernel_width = kernel_size.back(); + dilation_height = dilation.front(); + dilation_width = dilation.back(); + pad_height = padding.front(); + pad_width = padding.back(); + stride_height = stride.front(); + stride_width = stride.back(); + + auto input_shape = input_->GetTensorShape()->GetDimSizes(); + const int64_t batch_size = input_shape[kIndex0]; + const int64_t n_input_plane = input_shape[kIndex1]; + + height_col = + (output_height + kValue2 * pad_height - (dilation_height * (kernel_height - kValue1) + kValue1)) / stride_height + + 1; + 
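+  // height_col / width_col give the number of sliding-block positions along each spatial
+  // dimension: (output + 2 * pad - (dilation * (kernel - 1) + 1)) / stride + 1, the same
+  // output-size formula that Col2imParamCheck uses when validating input dimension 3.
+  // For example, output_height = 4, kernel_height = 2, pad_height = 0, dilation_height = 1,
+  // stride_height = 1 gives height_col = (4 + 0 - 2) / 1 + 1 = 3 block rows.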
width_col = + (output_width + kValue2 * pad_width - (dilation_width * (kernel_width - kValue1) + kValue1)) / stride_width + 1; + + T *input_data = reinterpret_cast(input_->GetData()); + T *output_data = reinterpret_cast(output_->GetData()); + std::fill_n(output_data, output_->NumElements(), T(0)); + channels_col = n_input_plane * kernel_height * kernel_width; + batch_input_size = n_input_plane * kernel_height * kernel_width * height_col * width_col; + batch_output_size = n_input_plane * output_height * output_width; + for (int64_t elt = 0; elt < batch_size; ++elt) { + int64_t input_offset = batch_input_size * elt; + int64_t output_offset = batch_output_size * elt; + for (int64_t c_col = 0; c_col < channels_col; ++c_col) { + InnerCompute(c_col, input_offset, output_offset, input_data, output_data); + } + } + + return KERNEL_STATUS_OK; +} + +REGISTER_CPU_KERNEL(kCol2im, Col2imCpuKernel); +} // namespace aicpu diff --git a/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/col2im.h b/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/col2im.h new file mode 100644 index 00000000000..1cc38019160 --- /dev/null +++ b/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/col2im.h @@ -0,0 +1,50 @@ +/** + * Copyright 2021 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef AICPU_KERNELS_NORMALIZED_COL2IM_H_ +#define AICPU_KERNELS_NORMALIZED_COL2IM_H_ + +#include "cpu_ops_kernel.h" +#include "utils/bcast.h" + +namespace aicpu { +class Col2imCpuKernel : public CpuKernel { + public: + Col2imCpuKernel() = default; + ~Col2imCpuKernel() override = default; + + protected: + uint32_t Compute(CpuKernelContext &ctx) override; + + private: + uint32_t Col2imParamCheck(CpuKernelContext &ctx); + template + uint32_t Col2imCompute(CpuKernelContext &ctx); + template + void InnerCompute(int64_t c_col, int64_t input_offset, int64_t output_offset, T *input_data, T *output_data); + + int64_t output_height, output_width; + int64_t kernel_height, kernel_width; + int64_t dilation_height, dilation_width; + int64_t pad_height, pad_width; + int64_t stride_height, stride_width; + + int64_t height_col, width_col; + + int64_t channels_col, batch_input_size, batch_output_size; +}; +} // namespace aicpu +#endif diff --git a/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/cumulativelogsumexp.cc b/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/cumulativelogsumexp.cc new file mode 100644 index 00000000000..d341ec3f987 --- /dev/null +++ b/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/cumulativelogsumexp.cc @@ -0,0 +1,211 @@ +/** + * Copyright (c) Huawei Technologies Co., Ltd. 2022-2022. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + **/ +#include "cumulativelogsumexp.h" + +#include "cmath" +#include "cpu_kernel_utils.h" +#include "utils/eigen_tensor.h" +#include "utils/kernel_util.h" + +namespace { +const uint32_t KCumulativeLogsumexpInputNum = 2; +const uint32_t KCumulativeLogsumexpOutputNum = 1; +const float float16_exclusive_data = -65504e+0; +const float float_exclusive_data = -3.4028235e+38; +const double double_exclusive_data = -1.7976931348623157e+308; +const int64_t ParallelFor_size_float16 = 16 * 1024; +const int64_t ParallelFor_size_float32 = 32 * 1024; +const int64_t ParallelFor_size_double = 64 * 1024; +const char *KCumulativeLogsumexp = "CumulativeLogsumexp"; +#define CUMULATIVELOGSUMEXP_COMPUTE_CASE(DTYPE, IN_TYPE, CTX) \ + case (DTYPE): { \ + uint32_t result = CumulativeLogsumexpCompute(CTX); \ + if (result != KERNEL_STATUS_OK) { \ + KERNEL_LOG_ERROR("CumulativeLogsumexp kernel compute failed."); \ + return result; \ + } \ + break; \ + } +} // namespace +namespace aicpu { +uint32_t CumulativeLogsumexpCpuKernel::Compute(CpuKernelContext &ctx) { + // check params + KERNEL_HANDLE_ERROR(NormalCheck(ctx, KCumulativeLogsumexpInputNum, KCumulativeLogsumexpOutputNum), + "[%s] check input and output failed,", KCumulativeLogsumexp); + KERNEL_HANDLE_ERROR(CumulativeLogsumexpCheck(ctx), "[%s] check params failed.", KCumulativeLogsumexp); + auto data_type = ctx.Input(0)->GetDataType(); + switch (data_type) { + CUMULATIVELOGSUMEXP_COMPUTE_CASE(DT_FLOAT16, Eigen::half, ctx) + CUMULATIVELOGSUMEXP_COMPUTE_CASE(DT_FLOAT, float, ctx) + CUMULATIVELOGSUMEXP_COMPUTE_CASE(DT_DOUBLE, double, ctx) + default: + KERNEL_LOG_ERROR("CumulativeLogsumexp kernel data type [%s] not support.", DTypeStr(data_type).c_str()); + return KERNEL_STATUS_PARAM_INVALID; + } + return KERNEL_STATUS_OK; +} +uint32_t CumulativeLogsumexpCpuKernel::CumulativeLogsumexpCheck(CpuKernelContext &ctx) { + KERNEL_CHECK_FALSE((ctx.Input(1)->GetDataType() == DT_INT16 || ctx.Input(1)->GetDataType() == DT_INT32), + KERNEL_STATUS_PARAM_INVALID, "Data type of axis is not support, axis data type is [%u].", + ctx.Input(1)->GetDataType()) + KERNEL_CHECK_FALSE(ctx.Input(1)->NumElements() == 1, KERNEL_STATUS_PARAM_INVALID, "axis is out of shape"); + auto axis_data = static_cast(ctx.Input(1)->GetData()); + int64_t axis = *axis_data; + KERNEL_CHECK_FALSE((axis < ctx.Input(0)->GetTensorShape()->GetDims()), KERNEL_STATUS_PARAM_INVALID, + "axis is larger than input dims - 1") + KERNEL_CHECK_FALSE((axis >= -ctx.Input(0)->GetTensorShape()->GetDims()), KERNEL_STATUS_PARAM_INVALID, + "axis is lower than -input dims") + std::vector shape_input = ctx.Input(0)->GetTensorShape()->GetDimSizes(); + std::vector shape_output = ctx.Output(0)->GetTensorShape()->GetDimSizes(); + KERNEL_CHECK_FALSE((shape_input.size() != 0), KERNEL_STATUS_PARAM_INVALID, + "Input must be at least rank 1, got [%zu].", shape_input.size()) + KERNEL_CHECK_FALSE((shape_input.size() == shape_output.size()), KERNEL_STATUS_PARAM_INVALID, + "The output shape size should be same as the output shape size") + DataType input0_type = ctx.Input(0)->GetDataType(); + DataType output0_type = 
ctx.Output(0)->GetDataType(); + KERNEL_CHECK_FALSE((input0_type == output0_type), KERNEL_STATUS_PARAM_INVALID, + "The data type of input0 [%s] need be same with output0 [%s] ", DTypeStr(input0_type).c_str(), + DTypeStr(output0_type).c_str()) + return KERNEL_STATUS_OK; +} +template +void CumulativeProcess(uint32_t outer, uint32_t inner, uint32_t depth, bool reverse, bool exclusive, t *input_data, + t *output_data, DataType data_type) { + for (size_t outer_index = 0; outer_index < outer; ++outer_index) { + size_t outer_index_adj; + if (reverse) { + outer_index_adj = (outer - 1) - outer_index; + } else { + outer_index_adj = outer_index; + } + for (size_t inner_index = 0; inner_index < inner; ++inner_index) { + double one = 1; + double temp = 0; + size_t inner_index_adj; + if (reverse) { + inner_index_adj = (inner - 1) - inner_index; + } else { + inner_index_adj = inner_index; + } + for (size_t depth_index = 0; depth_index < depth; ++depth_index) { + size_t depth_index_adj; + if (reverse) { + depth_index_adj = (depth - 1) - depth_index; + } else { + depth_index_adj = depth_index; + } + size_t index = outer_index_adj; + index += inner_index_adj * depth * outer; + index += depth_index_adj * outer; + if (exclusive) { + if (depth_index == 0) { + if (data_type == DT_FLOAT16) { + output_data[index] = static_cast(float16_exclusive_data); + } else if (data_type == DT_FLOAT) { + output_data[index] = static_cast(float_exclusive_data); + } else { + output_data[index] = static_cast(double_exclusive_data); + } + temp = static_cast(input_data[index]); + } else { + output_data[index] = static_cast(temp); + double a = temp; + double b, min0, max0; + b = static_cast(input_data[index]); + min0 = (a < b) ? a : b; + max0 = (a > b) ? a : b; + temp = log(one + exp(min0 - max0)) + max0; + } + } else { + if (depth_index == 0) { + output_data[index] = input_data[index]; + temp = static_cast(input_data[index]); + } else { + double a = temp; + double b, min0, max0; + b = static_cast(input_data[index]); + min0 = (a < b) ? a : b; + max0 = (a > b) ? 
a : b; + output_data[index] = static_cast(log(one + exp(min0 - max0)) + max0); + temp = log(one + exp(min0 - max0)) + max0; + } + } + } + } + } +} +template +uint32_t CumulativeLogsumexpCpuKernel::CumulativeLogsumexpCompute(CpuKernelContext &ctx) { + auto input_data = static_cast(ctx.Input(0)->GetData()); + auto axis_data = static_cast(ctx.Input(1)->GetData()); + bool exclusive = false; + bool reverse = false; + AttrValue *exclusive_attr = ctx.GetAttr("exclusive"); + if (exclusive_attr != nullptr) { + exclusive = exclusive_attr->GetBool(); + } + AttrValue *reverse_attr = ctx.GetAttr("reverse"); + if (reverse_attr != nullptr) { + reverse = reverse_attr->GetBool(); + } + int32_t axis = 0; + if (axis_data != nullptr) { + axis = *axis_data; + } + auto output_data = static_cast(ctx.Output(0)->GetData()); + auto shape = ctx.Input(0)->GetTensorShape(); + const int64_t rank = shape->GetDims(); + if (axis < 0) { + axis += shape->GetDims(); + } + uint32_t inner = 1; + uint32_t outer = 1; + uint32_t depth = 1; + for (int32_t i = 0; i < rank; ++i) { + if (i < axis) { + inner *= shape->GetDimSize(i); + } else if (i > axis) { + outer *= shape->GetDimSize(i); + } else { + depth = shape->GetDimSize(i); + } + } // end for + auto data_type = ctx.Input(0)->GetDataType(); + int64_t data_num = ctx.Input(0)->NumElements(); + int64_t data_size = data_num * sizeof(T); + if ((data_type == DT_FLOAT16 && data_size <= ParallelFor_size_float16) || + (data_type == DT_FLOAT && data_size <= ParallelFor_size_float32) || + (data_type == DT_DOUBLE && data_size <= ParallelFor_size_double)) { + CumulativeProcess(outer, inner, depth, reverse, exclusive, input_data, output_data, data_type); + } else { + uint32_t min_core_num = 1; + uint32_t max_core_num = std::max(min_core_num, aicpu::CpuKernelUtils::GetCPUNum(ctx) - 2); + if (max_core_num > outer) { + max_core_num = outer; + } + auto shard_cumulativelogsumexp = [&](size_t start, size_t end) { + CumulativeProcess(outer, inner, depth, reverse, exclusive, input_data, output_data, data_type); + }; + if (max_core_num == 0) { + return KERNEL_STATUS_PARAM_INVALID; + } + KERNEL_HANDLE_ERROR(CpuKernelUtils::ParallelFor(ctx, outer, outer / max_core_num, shard_cumulativelogsumexp), + "CumulativeLogsumexp Compute failed."); + } // end else + return KERNEL_STATUS_OK; +} +REGISTER_CPU_KERNEL(KCumulativeLogsumexp, CumulativeLogsumexpCpuKernel); +} // namespace aicpu \ No newline at end of file diff --git a/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/cumulativelogsumexp.h b/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/cumulativelogsumexp.h new file mode 100644 index 00000000000..3fa4faf7534 --- /dev/null +++ b/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/cumulativelogsumexp.h @@ -0,0 +1,38 @@ +/** + * Copyright (c) Huawei Technologies Co., Ltd. 2022-2022. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef AICPU_KERNELS_NORMALIZED_CUMULATIVELOGSUMEXP_H_ +#define AICPU_KERNELS_NORMALIZED_CUMULATIVELOGSUMEXP_H_ + +#include "cpu_ops_kernel.h" + +namespace aicpu { +class CumulativeLogsumexpCpuKernel : public CpuKernel { + public: + CumulativeLogsumexpCpuKernel() = default; + ~CumulativeLogsumexpCpuKernel() override = default; + + protected: + uint32_t Compute(CpuKernelContext &ctx) override; + + private: + uint32_t CumulativeLogsumexpCheck(CpuKernelContext &ctx); + + template + uint32_t CumulativeLogsumexpCompute(CpuKernelContext &ctx); +}; +} // namespace aicpu +#endif diff --git a/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/data_format_vec_permute.cc b/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/data_format_vec_permute.cc new file mode 100644 index 00000000000..11ff3393e83 --- /dev/null +++ b/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/data_format_vec_permute.cc @@ -0,0 +1,126 @@ +/** + * Copyright 2021 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include "data_format_vec_permute.h" + +#include +#include "cpu_kernel_utils.h" +#include "cpu_types.h" +#include "kernel_log.h" +#include "status.h" +#include "utils/kernel_util.h" +using namespace std; + +namespace { +const uint32_t kOutputNum = 1; +const uint32_t kInputNum = 1; +const char *kDataFormatVecPermute = "DataFormatVecPermute"; + +#define DATAFORMATVECPERMUTE_COMPUTE_CASE(DTYPE, TYPE, DIM, SRC_FORMAT_STR, DST_FORMAT_STR, X, Y, CTX) \ + case (DTYPE): { \ + uint32_t result = DataFormatVecPermuteCompute(DIM, SRC_FORMAT_STR, DST_FORMAT_STR, X, Y, CTX); \ + if (result != KERNEL_STATUS_OK) { \ + KERNEL_LOG_ERROR("DataFormatVecPermute kernel compute failed."); \ + return result; \ + } \ + break; \ + } + +} // namespace + +namespace aicpu { +uint32_t DataFormatVecPermute::Compute(CpuKernelContext &ctx) { + KERNEL_HANDLE_ERROR(NormalCheck(ctx, kInputNum, kOutputNum), "Check DataFormatVecPermute params failed."); + AttrValue *src_format = ctx.GetAttr("src_format"); + std::string src_format_str = src_format->GetString(); + KERNEL_CHECK_FALSE((src_format_str.size() == 4), KERNEL_STATUS_PARAM_INVALID, + "src_format must be of length 4, but the length of src_format = [%d].", src_format_str.size()); + AttrValue *dst_format = ctx.GetAttr("dst_format"); + std::string dst_format_str = dst_format->GetString(); + KERNEL_CHECK_FALSE((dst_format_str.size() == 4), KERNEL_STATUS_PARAM_INVALID, + "dst_format must be of length 4, but the length of dst_format = [%d].", dst_format_str.size()); + Tensor *x = ctx.Input(0); + auto x_shape = x->GetTensorShape(); + int32_t dim = x_shape->GetDims(); + KERNEL_CHECK_FALSE((dim == 1 || dim == 2), KERNEL_STATUS_PARAM_INVALID, + "Input dimension must be 1 or 2, but got dimension = [%d].", dim); + Tensor *y = ctx.Output(0); + auto y_shape = y->GetTensorShape(); + if (dim == 1) { + KERNEL_CHECK_FALSE((x_shape->GetDimSize(0) == 4), 
KERNEL_STATUS_PARAM_INVALID, + "1D Input must be of size 4, but got size %lld.", x_shape->GetDimSize(0)); + KERNEL_CHECK_FALSE((y_shape->GetDimSize(0) == 4), KERNEL_STATUS_PARAM_INVALID, + "1D Output must be of size 4, but got size %lld.", y_shape->GetDimSize(0)); + } else if (dim == 2) { + KERNEL_CHECK_FALSE((x_shape->GetDimSize(0) == 4), KERNEL_STATUS_PARAM_INVALID, + "First dimension of 2D Input must be of size 4, but got size %lld.", x_shape->GetDimSize(0)); + KERNEL_CHECK_FALSE((x_shape->GetDimSize(1) == 2), KERNEL_STATUS_PARAM_INVALID, + "Second dimension of 2D Input must be of size 2, but got size %lld.", x_shape->GetDimSize(1)); + KERNEL_CHECK_FALSE((y_shape->GetDimSize(0) == 4), KERNEL_STATUS_PARAM_INVALID, + "First dimension of 2D Output must be of size 4, but got size %lld.", y_shape->GetDimSize(0)); + KERNEL_CHECK_FALSE((y_shape->GetDimSize(1) == 2), KERNEL_STATUS_PARAM_INVALID, + "Second dimension of 2D Output must be of size 2, but got size %lld.", y_shape->GetDimSize(1)); + } + + auto x_type = x->GetDataType(); + auto y_type = y->GetDataType(); + KERNEL_CHECK_FALSE((x_type == y_type), KERNEL_STATUS_PARAM_INVALID, + "Input[%s] and output[%s] must have the same DataType.", DTypeStr(x_type).c_str(), + DTypeStr(y_type).c_str()); + switch (x_type) { + DATAFORMATVECPERMUTE_COMPUTE_CASE(DT_INT32, int32_t, dim, src_format_str, dst_format_str, x, y, ctx) + DATAFORMATVECPERMUTE_COMPUTE_CASE(DT_INT64, int64_t, dim, src_format_str, dst_format_str, x, y, ctx) + default: + KERNEL_LOG_ERROR("[%s] Data type of input is not support, input data type is [%s].", ctx.GetOpType().c_str(), + DTypeStr(x_type).c_str()); + return KERNEL_STATUS_PARAM_INVALID; + } + return KERNEL_STATUS_OK; +} + +template +uint32_t DataFormatVecPermute::DataFormatVecPermuteCompute(const int32_t dim, const string &src_format_str, + const string &dst_format_str, Tensor *x, Tensor *y, + CpuKernelContext &ctx) { + T *x_addrs = reinterpret_cast(x->GetData()); + T *y_addrs = reinterpret_cast(y->GetData()); + + if (dim == 1) { + for (uint64_t i = 0; i < dst_format_str.size(); i++) { + for (uint64_t j = 0; j < src_format_str.size(); j++) { + if (dst_format_str[i] == src_format_str[j]) { + y_addrs[i] = x_addrs[j]; + break; + } + } + } + } else if (dim == 2) { + for (uint64_t i = 0; i < dst_format_str.size(); i++) { + for (uint64_t j = 0; j < src_format_str.size(); j++) { + if (dst_format_str[i] == src_format_str[j]) { + y_addrs[i * 2] = x_addrs[j * 2]; + y_addrs[i * 2 + 1] = x_addrs[j * 2 + 1]; + break; + } + } + } + } + + return KERNEL_STATUS_OK; +} + +REGISTER_CPU_KERNEL(kDataFormatVecPermute, DataFormatVecPermute); +} // namespace aicpu diff --git a/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/data_format_vec_permute.h b/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/data_format_vec_permute.h new file mode 100644 index 00000000000..09fbb858466 --- /dev/null +++ b/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/data_format_vec_permute.h @@ -0,0 +1,35 @@ +/** + * Copyright 2021 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef AICPU_KERNELS_NORMALIZED_DATA_FORMAT_VEC_PERMUTE_H_ +#define AICPU_KERNELS_NORMALIZED_DATA_FORMAT_VEC_PERMUTE_H_ + +#include +#include "cpu_ops_kernel.h" + +namespace aicpu { +class DataFormatVecPermute : public CpuKernel { + public: + DataFormatVecPermute() = default; + ~DataFormatVecPermute() override = default; + uint32_t Compute(CpuKernelContext &ctx) override; + + private: + template + uint32_t DataFormatVecPermuteCompute(const int32_t dim, const std::string &src_format_str, + const std::string &dst_format_str, Tensor *x, Tensor *y, CpuKernelContext &ctx); +}; +} // namespace aicpu +#endif diff --git a/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/matrix_solve_ls.cc b/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/matrix_solve_ls.cc new file mode 100644 index 00000000000..95babf88938 --- /dev/null +++ b/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/matrix_solve_ls.cc @@ -0,0 +1,455 @@ +/** + * Copyright 2021 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include "matrix_solve_ls.h" + +#include +#include +#include +#include + +#include "cpu_kernel_utils.h" +#include "utils/eigen_tensor.h" +#include "utils/kernel_util.h" + +namespace { +const uint32_t kInputNum = 3; +const uint32_t kOutputNum = 1; +const char *MatrixSolveLs = "MatrixSolveLs"; +const int64_t kNum2 = 2; +} // namespace + +namespace aicpu { +uint32_t MatrixSolveLsCpuKernel::Compute(CpuKernelContext &ctx) { + bool qr_chole = (ctx.GetAttr("fast") == nullptr) ? true : ctx.GetAttr("fast")->GetBool(); + KERNEL_HANDLE_ERROR(NormalCheck(ctx, kInputNum, kOutputNum), "MatrixSolveLs check input and output number failed."); + + Tensor *matrix = ctx.Input(kFirstInputIndex); + Tensor *b = ctx.Input(kSecondInputIndex); + Tensor *l2 = ctx.Input(2); + Tensor *x = ctx.Output(0); + if ((matrix->GetDataSize() == 0) || (b->GetDataSize() == 0)) { + KERNEL_LOG_ERROR("[%s] Input is empty tensor.", ctx.GetOpType().c_str()); + return KERNEL_STATUS_PARAM_INVALID; + } + + auto shapea = matrix->GetTensorShape(); + auto shapeb = b->GetTensorShape(); + auto shapel2 = l2->GetTensorShape(); + auto shapex = x->GetTensorShape(); + auto dims = shapea->GetDims(); + + if (ctx.Input(1)->GetTensorShape()->GetDims() == 1) { + if (shapea->GetDimSize(dims - kNum2) != shapeb->GetDimSize(0)) { + KERNEL_LOG_ERROR( + "[%s] #Rows mismatch between A and rhs." 
+ "#Rows of A = [%llu], #Rows of rhs = [%llu]", + ctx.GetOpType().c_str(), shapea->GetDimSize(dims - kNum2), shapeb->GetDimSize(0)); + return KERNEL_STATUS_PARAM_INVALID; + } + } else { + if (shapea->GetDimSize(dims - kNum2) != shapeb->GetDimSize(dims - kNum2)) { + KERNEL_LOG_ERROR( + "[%s] #Rows mismatch between A and rhs." + "#Rows of A = [%llu], #Rows of rhs = [%llu]", + ctx.GetOpType().c_str(), shapea->GetDimSize(dims - kNum2), shapeb->GetDimSize(dims - kNum2)); + return KERNEL_STATUS_PARAM_INVALID; + } + } + if (shapel2->GetDims() != 0) { + KERNEL_LOG_ERROR("[%s] Tensor l2 should be a scalar.", ctx.GetOpType().c_str()); + return KERNEL_STATUS_PARAM_INVALID; + } + if (ctx.Input(1)->GetTensorShape()->GetDims() == 1) { + if (shapex->GetDims() != shapeb->GetDims() || shapea->GetDimSize(dims - 1) != shapex->GetDimSize(0) || + shapex->GetDimSize(shapex->GetDims() - 1) != shapeb->GetDimSize(0)) { + KERNEL_LOG_ERROR("[%s] Tensor y shape mismatch.", ctx.GetOpType().c_str()); + return KERNEL_STATUS_PARAM_INVALID; + } + } else { + if (shapex->GetDims() != shapeb->GetDims() || + shapea->GetDimSize(dims - 1) != shapex->GetDimSize(shapex->GetDims() - kNum2) || + shapex->GetDimSize(shapex->GetDims() - 1) != shapeb->GetDimSize(shapeb->GetDims() - 1)) { + KERNEL_LOG_ERROR("[%s] Tensor y shape mismatch.", ctx.GetOpType().c_str()); + return KERNEL_STATUS_PARAM_INVALID; + } + } + + auto a_data_type = matrix->GetDataType(); + auto b_data_type = b->GetDataType(); + if (a_data_type != b_data_type) { + KERNEL_LOG_ERROR("[%s] Tensor data type mismatch.", ctx.GetOpType().c_str()); + return KERNEL_STATUS_PARAM_INVALID; + } + if (a_data_type != DT_FLOAT && a_data_type != DT_DOUBLE && a_data_type != DT_COMPLEX64 && + a_data_type != DT_COMPLEX128) { + KERNEL_LOG_ERROR("MatrixSolveLs kernel data type [%s] not support.", DTypeStr(a_data_type).c_str()); + return KERNEL_STATUS_PARAM_INVALID; + } + + if (qr_chole) { + if (a_data_type == DT_COMPLEX64) { + return ComplexCholesky(ctx); + } + if (a_data_type == DT_COMPLEX128) { + return ComplexCholesky(ctx); + } + if (a_data_type == DT_DOUBLE) { + return RealCholesky(ctx); + } + if (a_data_type == DT_FLOAT) { + return RealCholesky(ctx); + } + } else { + if (a_data_type == DT_COMPLEX64) { + return ComplexQr(ctx); + } + if (a_data_type == DT_COMPLEX128) { + return ComplexQr(ctx); + } + if (a_data_type == DT_DOUBLE) { + return RealQr(ctx); + } + if (a_data_type == DT_FLOAT) { + return RealQr(ctx); + } + } + return KERNEL_STATUS_OK; +} + +REGISTER_CPU_KERNEL(MatrixSolveLs, MatrixSolveLsCpuKernel); + +template +void MatrixSolveLsCpuKernel::RealCholeskySingleCompute(T *aptr, T *bptr, T *xptr, double *l2, int64_t m, int64_t k, + int64_t n) { + Eigen::Matrix a(m, k); + Eigen::Matrix x(k, n); + Eigen::Matrix b(m, n); + Eigen::Matrix a_copy; + Eigen::Matrix a_b; + + for (int i = 0; i < m * k; i++) { + *(a.data() + i) = *(aptr + i); + } + for (int i = 0; i < m * n; i++) { + *(b.data() + i) = *(bptr + i); + } + + if (m >= k) { + a_copy = + a.transpose() * a + ((T)*l2) * Eigen::Matrix::Identity(k, k); + a_b = a.transpose() * b; + } else { + a_copy = a * a.transpose(); + a_b = b; + } + for (int64_t i = 0; i < n; i++) { + Eigen::Matrix xi = a_copy.ldlt().solve(a_b.col(i)); + if (m < k) { + xi = a.transpose() * xi; + } + x.col(i) = xi; + } + for (int64_t i = 0; i < k * n; i++) { + *(xptr + i) = *(x.data() + i); + } +} + +template +uint32_t MatrixSolveLsCpuKernel::RealCholesky(CpuKernelContext &ctx) { + auto dims = ctx.Input(0)->GetTensorShape()->GetDims(); + auto aptr = 
reinterpret_cast(ctx.Input(0)->GetData()); + auto bptr = reinterpret_cast(ctx.Input(1)->GetData()); + auto xptr = reinterpret_cast(ctx.Output(0)->GetData()); + auto l2 = reinterpret_cast(ctx.Input(2)->GetData()); + int64_t m = ctx.Input(0)->GetTensorShape()->GetDimSize(dims - 2); + int64_t k = ctx.Input(0)->GetTensorShape()->GetDimSize(dims - 1); + int64_t n = 1; + if (ctx.Input(1)->GetTensorShape()->GetDims() > 1) { + n = ctx.Input(1)->GetTensorShape()->GetDimSize(dims - 1); + } + int64_t data_num = ctx.Input(0)->NumElements(); + const int64_t mat_size = m * k; + const int64_t rhs_size = m * n; + const int64_t res_size = n * k; + const int64_t batch = data_num / mat_size; + const int64_t kParallelDataNum = 16 * mat_size; + const int64_t kParallelDataNumMid = 72 * mat_size; + if (data_num >= kParallelDataNum) { + uint32_t min_core_num = 1; + uint32_t max_core_num = std::max(min_core_num, aicpu::CpuKernelUtils::GetCPUNum(ctx) - kResvCpuNum); + + if (data_num <= kParallelDataNumMid) { + max_core_num = std::min(max_core_num, 4U); // up to 4 cpu cores + } + auto sharder_matrix_solve_ls = [&](int64_t start, int64_t end) { + for (int64_t i = start; i < end; i++) { + RealCholeskySingleCompute(aptr + i * mat_size, bptr + i * rhs_size, xptr + i * res_size, l2, m, k, n); + } + }; + KERNEL_HANDLE_ERROR(CpuKernelUtils::ParallelFor(ctx, batch, batch / max_core_num, sharder_matrix_solve_ls), + "MatrixSolveLs Compute failed."); + } else { + for (int64_t i = 0; i < batch; i++) { + RealCholeskySingleCompute(aptr + i * mat_size, bptr + i * rhs_size, xptr + i * res_size, l2, m, k, n); + } + } + return KERNEL_STATUS_OK; +} + +template +void MatrixSolveLsCpuKernel::ComplexCholeskySingleCompute(std::complex *aptr, std::complex *bptr, + std::complex *xptr, double *l2, int64_t m, int64_t k, + int64_t n) { + Eigen::Matrix A(kNum2 * m, kNum2 * k); + Eigen::Matrix x(kNum2 * k, n); + Eigen::Matrix b(kNum2 * m, n); + Eigen::Matrix a_copy; + Eigen::Matrix a_b; + auto l2value = abs(*l2); + + for (int64_t i = 0; i < k; i++) { + for (int64_t j = 0; j < m; j++) { + *(A.data() + i + j * kNum2 * k) = std::real(*(aptr + i + j * k)); + } + for (int64_t j = 0; j < m; j++) { + *(A.data() + (i + k) + (j + m) * kNum2 * k) = std::real(*(aptr + i + j * k)); + } + for (int64_t j = 0; j < m; j++) { + *(A.data() + (i + k) + j * kNum2 * k) = -std::imag(*(aptr + i + j * k)); + } + for (int64_t j = 0; j < m; j++) { + *(A.data() + i + (j + m) * kNum2 * k) = std::imag(*(aptr + i + j * k)); + } + } + for (int64_t i = 0; i < n; i++) { + for (int64_t j = 0; j < m; j++) { + *(b.data() + i + j * n) = std::real(*(bptr + i + j * n)); + *(b.data() + i + (j + m) * n) = std::imag(*(bptr + i + j * n)); + } + } + + if (m >= k) { + a_copy = + A.transpose() * A + + ((T)l2value) * Eigen::Matrix::Identity(kNum2 * k, kNum2 * k); + a_b = A.transpose() * b; + } else { + a_copy = + A * A.transpose() + + ((T)l2value) * Eigen::Matrix::Identity(kNum2 * m, kNum2 * m); + a_b = b; + } + + Eigen::Matrix xi; + for (int64_t i = 0; i < n; i++) { + xi = a_copy.ldlt().solve(a_b.col(i)); + if (m < k) { + xi = A.transpose() * xi; + } + x.col(i) = xi; + for (int64_t j = 0; j < k; j++) { + (xptr + i + j * n)->real(*(x.data() + i + j * n)); + (xptr + i + j * n)->imag(*(x.data() + i + (j + k) * n)); + } + } +} + +template +uint32_t MatrixSolveLsCpuKernel::ComplexCholesky(CpuKernelContext &ctx) { + auto dims = ctx.Input(0)->GetTensorShape()->GetDims(); + auto l2 = reinterpret_cast(ctx.Input(2)->GetData()); + auto aptr = reinterpret_cast *>(ctx.Input(0)->GetData()); + auto bptr = 
reinterpret_cast *>(ctx.Input(1)->GetData()); + auto xptr = reinterpret_cast *>(ctx.Output(0)->GetData()); + int64_t m = ctx.Input(0)->GetTensorShape()->GetDimSize(dims - 2); + int64_t k = ctx.Input(0)->GetTensorShape()->GetDimSize(dims - 1); + int64_t n = 1; + if (ctx.Input(1)->GetTensorShape()->GetDims() > 1) { + n = ctx.Input(1)->GetTensorShape()->GetDimSize(dims - 1); + } + int64_t data_num = ctx.Input(0)->NumElements(); + const int64_t mat_size = m * k; + const int64_t rhs_size = m * n; + const int64_t res_size = n * k; + const int64_t batch = data_num / mat_size; + const int64_t kParallelDataNum = 16 * mat_size; + const int64_t kParallelDataNumMid = 72 * mat_size; + if (data_num >= kParallelDataNum) { + uint32_t min_core_num = 1; + uint32_t max_core_num = std::max(min_core_num, aicpu::CpuKernelUtils::GetCPUNum(ctx) - kResvCpuNum); + if (data_num <= kParallelDataNumMid) { + max_core_num = std::min(max_core_num, 4U); // up to 4 cpu cores + } + auto sharder_matrix_solve_ls = [&](int64_t start, int64_t end) { + for (int64_t i = start; i < end; i++) { + ComplexCholeskySingleCompute(aptr + i * mat_size, bptr + i * rhs_size, xptr + i * res_size, l2, m, k, n); + } + }; + KERNEL_HANDLE_ERROR(CpuKernelUtils::ParallelFor(ctx, batch, batch / max_core_num, sharder_matrix_solve_ls), + "MatrixSolveLs Compute failed."); + } else { + for (int64_t i = 0; i < batch; i++) { + ComplexCholeskySingleCompute(aptr + i * mat_size, bptr + i * rhs_size, xptr + i * res_size, l2, m, k, n); + } + } + return KERNEL_STATUS_OK; +} + +template +void MatrixSolveLsCpuKernel::RealQrSingleCompute(T *aptr, T *bptr, T *xptr, int64_t m, int64_t k, int64_t n) { + Eigen::Matrix a(m, k); + Eigen::Matrix x(k, n); + Eigen::Matrix b(m, n); + + for (int i = 0; i < m * k; i++) { + *(a.data() + i) = *(aptr + i); + } + for (int i = 0; i < m * n; i++) { + *(b.data() + i) = *(bptr + i); + } + + Eigen::ColPivHouseholderQR> qr_solve(a); + + for (int64_t i = 0; i < n; i++) { + Eigen::Matrix xi = qr_solve.solve(b.col(i)); + x.col(i) = xi; + } + for (int64_t i = 0; i < k * n; i++) { + *(xptr + i) = *(x.data() + i); + } +} + +template +uint32_t MatrixSolveLsCpuKernel::RealQr(CpuKernelContext &ctx) { + auto dims = ctx.Input(0)->GetTensorShape()->GetDims(); + auto aptr = reinterpret_cast(ctx.Input(0)->GetData()); + auto bptr = reinterpret_cast(ctx.Input(1)->GetData()); + auto xptr = reinterpret_cast(ctx.Output(0)->GetData()); + int64_t m = ctx.Input(0)->GetTensorShape()->GetDimSize(dims - 2); + int64_t k = ctx.Input(0)->GetTensorShape()->GetDimSize(dims - 1); + int64_t n = 1; + if (ctx.Input(1)->GetTensorShape()->GetDims() > 1) { + n = ctx.Input(1)->GetTensorShape()->GetDimSize(dims - 1); + } + int64_t data_num = ctx.Input(0)->NumElements(); + const int64_t mat_size = m * k; + const int64_t rhs_size = m * n; + const int64_t res_size = n * k; + const int64_t batch = data_num / mat_size; + const int64_t kParallelDataNum = 16 * mat_size; + const int64_t kParallelDataNumMid = 72 * mat_size; + if (data_num >= kParallelDataNum) { + uint32_t min_core_num = 1; + uint32_t max_core_num = std::max(min_core_num, aicpu::CpuKernelUtils::GetCPUNum(ctx) - kResvCpuNum); + if (data_num <= kParallelDataNumMid) { + max_core_num = std::min(max_core_num, 4U); // up to 4 cpu cores + } + auto sharder_matrix_solve_ls = [&](int64_t start, int64_t end) { + for (int64_t i = start; i < end; i++) { + RealQrSingleCompute(aptr + i * mat_size, bptr + i * rhs_size, xptr + i * res_size, m, k, n); + } + }; + KERNEL_HANDLE_ERROR(CpuKernelUtils::ParallelFor(ctx, batch, batch / 
max_core_num, sharder_matrix_solve_ls), + "MatrixSolveLs Compute failed."); + } else { + for (int64_t i = 0; i < batch; i++) { + RealQrSingleCompute(aptr + i * mat_size, bptr + i * rhs_size, xptr + i * res_size, m, k, n); + } + } + return KERNEL_STATUS_OK; +} + +template +void MatrixSolveLsCpuKernel::ComplexQrSingleCompute(std::complex *aptr, std::complex *bptr, std::complex *xptr, + int64_t m, int64_t k, int64_t n) { + Eigen::Matrix A(kNum2 * m, kNum2 * k); + Eigen::Matrix x(kNum2 * k, n); + Eigen::Matrix b(kNum2 * m, n); + for (int64_t i = 0; i < k; i++) { + for (int64_t j = 0; j < m; j++) { + *(A.data() + i + j * kNum2 * k) = std::real(*(aptr + i + j * k)); + } + for (int64_t j = 0; j < m; j++) { + *(A.data() + (i + k) + (j + m) * kNum2 * k) = std::real(*(aptr + i + j * k)); + } + for (int64_t j = 0; j < m; j++) { + *(A.data() + (i + k) + j * kNum2 * k) = -std::imag(*(aptr + i + j * k)); + } + for (int64_t j = 0; j < m; j++) { + *(A.data() + i + (j + m) * kNum2 * k) = std::imag(*(aptr + i + j * k)); + } + } + for (int64_t i = 0; i < n; i++) { + for (int64_t j = 0; j < m; j++) { + *(b.data() + i + j * n) = std::real(*(bptr + i + j * n)); + *(b.data() + i + (j + m) * n) = std::imag(*(bptr + i + j * n)); + } + } + + Eigen::ColPivHouseholderQR> qr_solve(A); + + for (int64_t i = 0; i < n; i++) { + Eigen::Matrix xi = qr_solve.solve(b.col(i)); + x.col(i) = xi; + + for (int64_t j = 0; j < k; j++) { + (xptr + i + j * n)->real(*(x.data() + i + j * n)); + (xptr + i + j * n)->imag(*(x.data() + i + (j + k) * n)); + } + } +} + +template +uint32_t MatrixSolveLsCpuKernel::ComplexQr(CpuKernelContext &ctx) { + auto dims = ctx.Input(0)->GetTensorShape()->GetDims(); + int64_t m = ctx.Input(0)->GetTensorShape()->GetDimSize(dims - 2); + int64_t k = ctx.Input(0)->GetTensorShape()->GetDimSize(dims - 1); + int64_t n = 1; + if (ctx.Input(1)->GetTensorShape()->GetDims() > 1) { + n = ctx.Input(1)->GetTensorShape()->GetDimSize(dims - 1); + } + int64_t data_num = ctx.Input(0)->NumElements(); + const int64_t mat_size = m * k; + const int64_t rhs_size = m * n; + const int64_t res_size = n * k; + const int64_t batch = data_num / mat_size; + const int64_t kParallelDataNum = 16 * mat_size; + const int64_t kParallelDataNumMid = 72 * mat_size; + auto aptr = reinterpret_cast *>(ctx.Input(0)->GetData()); + auto bptr = reinterpret_cast *>(ctx.Input(1)->GetData()); + auto xptr = reinterpret_cast *>(ctx.Output(0)->GetData()); + if (data_num >= kParallelDataNum) { + uint32_t min_core_num = 1; + uint32_t max_core_num = std::max(min_core_num, aicpu::CpuKernelUtils::GetCPUNum(ctx) - kResvCpuNum); + if (data_num <= kParallelDataNumMid) { + max_core_num = std::min(max_core_num, 4U); // up to 4 cpu cores + } + auto sharder_matrix_solve_ls = [&](int64_t start, int64_t end) { + for (int64_t i = start; i < end; i++) { + ComplexQrSingleCompute(aptr + i * mat_size, bptr + i * rhs_size, xptr + i * res_size, m, k, n); + } + }; + KERNEL_HANDLE_ERROR(CpuKernelUtils::ParallelFor(ctx, batch, batch / max_core_num, sharder_matrix_solve_ls), + "MatrixSolveLs Compute failed."); + } else { + for (int64_t i = 0; i < batch; i++) { + ComplexQrSingleCompute(aptr + i * mat_size, bptr + i * rhs_size, xptr + i * res_size, m, k, n); + } + } + return KERNEL_STATUS_OK; +} + +} // namespace aicpu \ No newline at end of file diff --git a/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/matrix_solve_ls.h b/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/matrix_solve_ls.h new file mode 100644 index 
00000000000..e706e8853c5 --- /dev/null +++ b/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/matrix_solve_ls.h @@ -0,0 +1,62 @@ +/** + * Copyright 2021 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef AICPU_KERNELS_NORMALIZED_MATRIX_SOLVE_LS_H_ +#define AICPU_KERNELS_NORMALIZED_MATRIX_SOLVE_LS_H_ + +#include +#include "cpu_ops_kernel.h" +#include "cpu_types.h" +#include "utils/bcast.h" + +namespace aicpu { +class MatrixSolveLsCpuKernel : public CpuKernel { + public: + MatrixSolveLsCpuKernel() = default; + ~MatrixSolveLsCpuKernel() override = default; + + protected: + uint32_t Compute(CpuKernelContext &ctx) override; + + private: + template + void RealCholeskySingleCompute(T *aptr, T *bptr, T *xptr, double *l2, int64_t m, int64_t k, int64_t n); + + template + uint32_t RealCholesky(CpuKernelContext &ctx); + + template + void RealQrSingleCompute(T *aptr, T *bptr, T *xptr, int64_t m, int64_t k, int64_t n); + + template + uint32_t RealQr(CpuKernelContext &ctx); + + template + void ComplexCholeskySingleCompute(std::complex *aptr, std::complex *bptr, std::complex *xptr, double *l2, + int64_t m, int64_t k, int64_t n); + + template + uint32_t ComplexCholesky(CpuKernelContext &ctx); + + template + void ComplexQrSingleCompute(std::complex *aptr, std::complex *bptr, std::complex *xptr, int64_t m, int64_t k, + int64_t n); + + template + uint32_t ComplexQr(CpuKernelContext &ctx); +}; +} // namespace aicpu +#endif diff --git a/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/nuclear_norm.cc b/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/nuclear_norm.cc new file mode 100644 index 00000000000..a94fbcc8dea --- /dev/null +++ b/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/nuclear_norm.cc @@ -0,0 +1,440 @@ +/** + * Copyright (c) Huawei Technologies Co., Ltd. 2021-2021. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#include "nuclear_norm.h" +#include +#include +#include +#include +#include +#include +#include +#include "kernel_util.h" +#include "utils/kernel_util.h" +#define NoneN 1000 +using namespace Eigen; +using namespace std; + +namespace { +const char *kNuclearNorm = "NuclearNorm"; +const size_t kNuclearNormInputNum = 1; +const size_t kNuclearNormOutputNum = 1; +constexpr int64_t kParallelDataNums = 1 * 1024; +const size_t DIM_SIZE1 = 1; +const size_t DIM_SIZE2 = 2; +const size_t DIM_SIZE3 = 3; +const size_t DIM_SIZE4 = 4; +const size_t DIM_SIZE5 = 5; +const size_t DIM_SIZE6 = 6; +const size_t DIM_SIZE7 = 7; +const size_t DIM_SIZE8 = 8; +const size_t DIM_INDEX0 = 0; +const size_t DIM_INDEX1 = 1; +const size_t DIM_INDEX2 = 2; +} // namespace + +namespace aicpu { +uint32_t NuclearNormCpuKernel::Compute(CpuKernelContext &ctx) { + KERNEL_HANDLE_ERROR(NormalCheck(ctx, kNuclearNormInputNum, kNuclearNormOutputNum), + "NuclearNorm Check input and output number failed."); + KERNEL_HANDLE_ERROR(NuclearNormParamCheck(ctx), "NuclearNorm Check params failed."); + + auto data_type = ctx.Input(0)->GetDataType(); + uint32_t res = KERNEL_STATUS_OK; + + switch (data_type) { + case (DT_FLOAT): { + res = NuclearNormCompute(ctx); + break; + } + case (DT_DOUBLE): { + res = NuclearNormCompute(ctx); + break; + } + default: + KERNEL_LOG_ERROR("NuclearNorm kernel data type [%s] not support.", DTypeStr(data_type).c_str()); + return KERNEL_STATUS_PARAM_INVALID; + } + if (res != KERNEL_STATUS_OK) { + return KERNEL_STATUS_INNER_ERROR; + } + return KERNEL_STATUS_OK; +} + +uint32_t NuclearNormCpuKernel::NuclearNormParamCheck(CpuKernelContext &ctx) { + Tensor *input = ctx.Input(0); + Tensor *output = ctx.Output(0); + KERNEL_CHECK_FALSE((input->GetDataType() == output->GetDataType()), KERNEL_STATUS_PARAM_INVALID, + "The data type of the input [%s] need be the same as the output [%s]", + DTypeStr(input->GetDataType()).c_str(), DTypeStr(output->GetDataType()).c_str()); + const size_t input_dimnum = input->GetTensorShape()->GetDims(); + KERNEL_CHECK_FALSE((input_dimnum >= DIM_SIZE2 && input_dimnum <= DIM_SIZE8), KERNEL_STATUS_PARAM_INVALID, + "The range of the dimension of the input tensor should be " + "[%lld,%lld], but got input's dimension=%lld", + DIM_SIZE2, DIM_SIZE8, input_dimnum); + AttrValue *dim_ptr = ctx.GetAttr("dim"); + std::vector dim_temp = {0, 1}; + std::vector dim = (dim_ptr == nullptr) ? dim_temp : dim_ptr->GetListInt(); + if (dim_ptr == nullptr) { + KERNEL_CHECK_FALSE((input_dimnum == DIM_SIZE2), KERNEL_STATUS_PARAM_INVALID, + "When Attr dim is none, NuclearNorm expected a tensor with 2 " + "dimensions, but got a tensor with [%lld] dimensions instead.", + input_dimnum); + } + if (dim.size() == 1 && dim[0] == NoneN) { + dim.clear(); + dim.push_back(0); + dim.push_back(1); + } + KERNEL_CHECK_FALSE((dim.size() == DIM_SIZE2), KERNEL_STATUS_PARAM_INVALID, + "Attr dim'size must equal to 2, but got dim's size : [%lld]", dim.size()); + int64_t lower_bound = 0 - input_dimnum; + int64_t upper_bound = input_dimnum - 1; + KERNEL_CHECK_FALSE((dim[0] >= lower_bound && dim[0] <= upper_bound), KERNEL_STATUS_PARAM_INVALID, + "The range of dim[0] should be [%lld,%lld], but got input dim[0]=%lld", lower_bound, upper_bound, + dim[0]); + KERNEL_CHECK_FALSE((dim[1] >= lower_bound && dim[1] <= upper_bound), KERNEL_STATUS_PARAM_INVALID, + "The range of dim[1] should be [%lld,%lld], but got input dim[1]=%lld", lower_bound, upper_bound, + dim[1]); + dim[0] = (dim[0] < 0) ? dim[0] + input_dimnum : dim[0]; + dim[1] = (dim[1] < 0) ? 
dim[1] + input_dimnum : dim[1]; + KERNEL_CHECK_FALSE((dim[0] != dim[1]), KERNEL_STATUS_PARAM_INVALID, + "The values in attr dim point to the same dimension."); + KERNEL_LOG_DEBUG("NuclearNormCpuKernel[%s], input: size[%llu], output: size[%llu].", ctx.GetOpType().c_str(), + input->GetDataSize(), output->GetDataSize()); + + return KERNEL_STATUS_OK; +} + +template +uint32_t NuclearNormCpuKernel::NuclearNormCompute(CpuKernelContext &ctx) { + Tensor *input_ptr = ctx.Input(0); + auto input_shape = input_ptr->GetTensorShape(); + std::vector input_dims = input_shape->GetDimSizes(); + uint32_t res = KERNEL_STATUS_OK; + switch (input_dims.size()) { + case DIM_SIZE2: + res = ComputeTensorNuclearNorm(ctx); + break; + case DIM_SIZE3: + res = ComputeTensorNuclearNorm(ctx); + break; + case DIM_SIZE4: + res = ComputeTensorNuclearNorm(ctx); + break; + case DIM_SIZE5: + res = ComputeTensorNuclearNorm(ctx); + break; + case DIM_SIZE6: + res = ComputeTensorNuclearNorm(ctx); + break; + case DIM_SIZE7: + res = ComputeTensorNuclearNorm(ctx); + break; + case DIM_SIZE8: + res = ComputeTensorNuclearNorm(ctx); + break; + default: + KERNEL_LOG_ERROR( + "Only tensors with ranks between 2 and 8 are currently supported." + "Tensor rank: [%d]", + input_dims.size()); + return KERNEL_STATUS_PARAM_INVALID; + } + if (res != KERNEL_STATUS_OK) { + return KERNEL_STATUS_INNER_ERROR; + } + return KERNEL_STATUS_OK; +} + +template +uint32_t NuclearNormCpuKernel::ComputeTensorNuclearNorm(const CpuKernelContext &ctx) { + Tensor *input_ptr = ctx.Input(0); + auto input_shape = input_ptr->GetTensorShape(); + void *data_ptr = input_ptr->GetData(); + int64_t value_num_ = input_ptr->NumElements(); + + T *input_data_ptr = reinterpret_cast(data_ptr); + int64_t total_copy_size = value_num_ * static_cast(sizeof(T)); + Eigen::Tensor eigen_tensor(value_num_); + int memcpy_ret = memcpy_s(&eigen_tensor(0), total_copy_size, input_data_ptr, total_copy_size); + + if (memcpy_ret != 0) { + KERNEL_LOG_ERROR("memcpy_s error!"); + } + std::vector input_dims = input_shape->GetDimSizes(); + std::array dim_array; + const int64_t input_dimnum = static_cast(input_shape->GetDims()); + for (int64_t i = 0; i < input_dimnum; i++) { + dim_array.at(i) = input_dims[i]; + } + Eigen::Tensor reshaped_tensor = eigen_tensor.reshape(dim_array); + + AttrValue *dim_ptr = ctx.GetAttr("dim"); + std::vector dim_temp = {0, 1}; + std::vector dim = (dim_ptr == nullptr) ? dim_temp : dim_ptr->GetListInt(); + if (dim.size() == 1 && dim[0] == NoneN) { + dim.clear(); + dim.push_back(0); + dim.push_back(1); + } + dim[0] = (dim[0] < 0) ? dim[0] + input_dimnum : dim[0]; + dim[1] = (dim[1] < 0) ? 
dim[1] + input_dimnum : dim[1]; + + int64_t j = 0; + for (int64_t i = 0; i < input_dimnum; i++) { + if (i != dim[0] && i != dim[1]) { + dim_array.at(j) = i; + j++; + } + } + dim_array.at(j) = dim[0]; + dim_array.at(j + 1) = dim[1]; + Eigen::Tensor shuffled_tensor = reshaped_tensor.shuffle(dim_array); + + int64_t dimsize0 = input_shape->GetDimSize(dim[0]); + int64_t dimsize1 = input_shape->GetDimSize(dim[1]); + int64_t iter_number = value_num_ / (dimsize0 * dimsize1); + + std::array dim_array_last; + dim_array_last.at(DIM_INDEX0) = iter_number; + dim_array_last.at(DIM_INDEX1) = dimsize0; + dim_array_last.at(DIM_INDEX2) = dimsize1; + Eigen::Tensor permuted_tensor = shuffled_tensor.reshape(dim_array_last); + + auto output_data_ptr = reinterpret_cast(ctx.Output(0)->GetData()); + int64_t copy_size = (dimsize0 * dimsize1) * static_cast(sizeof(T)); + if (iter_number <= kParallelDataNums) { + for (int64_t i = 0; i < iter_number; i++) { + T *mat = new T[dimsize0 * dimsize1]; + memcpy(mat, &permuted_tensor(i, 0, 0), copy_size); + T nuclear_norm = matrix_nuclear_norm(mat, dimsize0, dimsize1); + *(output_data_ptr + i) = nuclear_norm; + } + } else { + uint32_t min_core_num = 1; + uint64_t max_core_num = std::max(min_core_num, aicpu::CpuKernelUtils::GetCPUNum(ctx) - 2); + if (max_core_num > static_cast(iter_number)) { + max_core_num = static_cast(iter_number); + } + + auto shared_nuclear_norm = [&](size_t start, size_t end) { + for (size_t i = start; i < end; i++) { + T *mat = new T[dimsize0 * dimsize1]; + memcpy(mat, &permuted_tensor(i, 0, 0), copy_size); + T nuclear_norm = matrix_nuclear_norm(mat, dimsize0, dimsize1); + *(output_data_ptr + i) = nuclear_norm; + } + }; + if (max_core_num != 0) { + KERNEL_HANDLE_ERROR( + CpuKernelUtils::ParallelFor(ctx, static_cast(iter_number), + static_cast(iter_number) / max_core_num, shared_nuclear_norm), + "NuclearNorm Compute failed."); + } + } + return KERNEL_STATUS_OK; +} + +template +std::vector> NuclearNormCpuKernel::matrix_multiply(std::vector> const arrL, + std::vector> const arrR) { + size_t rowL = arrL.size(); + size_t colL = arrL[0].size(); + size_t colR = arrR[0].size(); + + std::vector> res(rowL); + for (size_t i = 0; i < res.size(); i++) { + res[i].resize(colR); + } + + for (size_t i = 0; i < rowL; i++) { + for (size_t j = 0; j < colR; j++) { + for (size_t k = 0; k < colL; k++) { + res[i][j] += arrL[i][k] * arrR[k][j]; + } + } + } + + return res; +} + +template +std::vector> NuclearNormCpuKernel::transpose(std::vector> const arr) { + size_t row = arr.size(); + size_t col = arr[0].size(); + + std::vector> trans(col); + for (size_t i = 0; i < col; i++) { + trans[i].resize(row); + } + + for (size_t i = 0; i < col; i++) { + for (size_t j = 0; j < row; j++) { + trans[i][j] = arr[j][i]; + } + } + return trans; +} + +template +std::vector NuclearNormCpuKernel::argsort(const std::vector &array) { + const size_t array_len(array.size()); + std::vector array_index(array_len, 0); + for (size_t i = 0; i < array_len; ++i) array_index[i] = i; + + sort(array_index.begin(), array_index.end(), + [&array](size_t pos1, size_t pos2) { return (array[pos1] > array[pos2]); }); + + return array_index; +} + +template +void NuclearNormCpuKernel::get_row_col(std::vector> arr, T *max, size_t *row, size_t *col) { + size_t n = arr.size(); + for (size_t i = 0; i < n; i++) { + for (size_t j = 0; j < n; j++) { + if (i != j && fabs(arr[i][j]) > *max) { + *max = fabs(arr[i][j]); + *row = i; + *col = j; + } + } + } +} + +template +void NuclearNormCpuKernel::svd(std::vector> arr, 
std::vector> &E, std::vector &e) { + size_t n = arr.size(); + size_t row = 0; + size_t col = 0; + size_t iter_max_num = 10000; + size_t iter_num = 0; + T eps = 1e-40; + T max = eps; + T dot5 = 0.5; + + E.resize(n); + e.resize(n); + for (size_t i = 0; i < n; i++) { + E[i].resize(n, 0); + E[i][i] = 1; + } + + while (iter_num < iter_max_num && max >= eps) { + max = fabs(arr[0][1]); + row = 0; + col = 1; + + get_row_col(arr, &max, &row, &col); + T theta = dot5 * atan2(-2 * arr[row][col], -(arr[row][row] - arr[col][col])); + + T aii = arr[row][row]; + T ajj = arr[col][col]; + T aij = arr[row][col]; + T sin_theta = sin(theta); + T cos_theta = cos(theta); + T sin_2theta = sin(2 * theta); + T cos_2theta = cos(2 * theta); + arr[row][row] = aii * cos_theta * cos_theta + ajj * sin_theta * sin_theta + aij * sin_2theta; + arr[col][col] = aii * sin_theta * sin_theta + ajj * cos_theta * cos_theta - aij * sin_2theta; + arr[row][col] = dot5 * (ajj - aii) * sin_2theta + aij * cos_2theta; + arr[col][row] = arr[row][col]; + for (size_t k = 0; k < n; k++) { + if (k != row && k != col) { + T arowk = arr[row][k]; + T acolk = arr[col][k]; + arr[row][k] = arowk * cos_theta + acolk * sin_theta; + arr[k][row] = arr[row][k]; + arr[col][k] = acolk * cos_theta - arowk * sin_theta; + arr[k][col] = arr[col][k]; + } + } + + T Eki; + T Ekj; + for (size_t k = 0; k < n; k++) { + Eki = E[k][row]; + Ekj = E[k][col]; + E[k][row] = Eki * cos_theta + Ekj * sin_theta; + E[k][col] = Ekj * cos_theta - Eki * sin_theta; + } + iter_num++; + } + + for (size_t i = 0; i < n; i++) { + e[i] = arr[i][i]; + } + + std::vector sort_index; + sort_index = argsort(e); + + std::vector> E_sorted(n); + for (size_t i = 0; i < n; i++) { + E_sorted[i].resize(n); + } + std::vector e_sorted(n); + for (size_t i = 0; i < n; i++) { + e_sorted[i] = e[sort_index[i]]; + for (size_t j = 0; j < n; j++) { + E_sorted[i][j] = E[i][sort_index[j]]; + } + } + E = E_sorted; + e = e_sorted; +} + +template +T NuclearNormCpuKernel::matrix_nuclear_norm(T *mat, size_t dim0, size_t dim1) { + if (dim1 == DIM_SIZE1) { + T nuclear_norm = 0.0; + T temp = 0.0; + for (size_t j = 0; j < dim0; j++) { + temp = mat[j]; + temp = temp * temp; + nuclear_norm += temp; + } + nuclear_norm = sqrt(nuclear_norm); + return nuclear_norm; + } + std::vector> arr(dim0); + size_t S_dim_size = dim0 < dim1 ? dim0 : dim1; + for (size_t i = 0; i < arr.size(); i++) { + arr[i].resize(dim1); + } + for (size_t i = 0; i < dim0; i++) { + for (size_t j = 0; j < dim1; j++) { + arr[i][j] = mat[i * dim1 + j]; + } + } + + std::vector> ATA; + std::vector> E; + std::vector e; + + ATA = matrix_multiply(transpose(arr), arr); + svd(ATA, E, e); + + double nuclear_norm = 0.0; + for (size_t i = DIM_INDEX0; i < S_dim_size; i++) { + if (e[i] > 0) { + nuclear_norm += sqrt(e[i]); + } + } + + return nuclear_norm; +} +REGISTER_CPU_KERNEL(kNuclearNorm, NuclearNormCpuKernel); +} // namespace aicpu diff --git a/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/nuclear_norm.h b/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/nuclear_norm.h new file mode 100644 index 00000000000..e0333e96663 --- /dev/null +++ b/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/nuclear_norm.h @@ -0,0 +1,66 @@ +/** + * Copyright 2021 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef AICPU_KERNELS_NORMALIZED_NUCLEARNORM_H_ +#define AICPU_KERNELS_NORMALIZED_NUCLEARNORM_H_ +#include +#include + +#include "cpu_ops_kernel.h" +#include "cpu_kernel_utils.h" +#include "kernel_log.h" +#include "securec.h" +#include "status.h" +#include "utils/bcast.h" + +namespace aicpu { +class NuclearNormCpuKernel : public CpuKernel { + public: + NuclearNormCpuKernel() = default; + ~NuclearNormCpuKernel() override = default; + + protected: + uint32_t Compute(CpuKernelContext &ctx) override; + + private: + uint32_t NuclearNormParamCheck(CpuKernelContext &ctx); + + template + uint32_t NuclearNormCompute(CpuKernelContext &ctx); + + template + uint32_t ComputeTensorNuclearNorm(const CpuKernelContext &ctx); + + template + std::vector> matrix_multiply(std::vector> const arrL, + std::vector> const arrR); + + template + std::vector> transpose(std::vector> const arr); + + template + std::vector argsort(const std::vector &array); + + template + void get_row_col(std::vector> arr, T *max, size_t *row, size_t *col); + + template + void svd(std::vector> arr, std::vector> &E, std::vector &e); + + template + T matrix_nuclear_norm(T *mat, size_t dim0, size_t dim1); +}; +} // namespace aicpu +#endif diff --git a/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/quantile.cc b/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/quantile.cc new file mode 100644 index 00000000000..97c18fb1d04 --- /dev/null +++ b/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/quantile.cc @@ -0,0 +1,410 @@ +/** + * Copyright (c) Huawei Technologies Co., Ltd. 2022. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#include "quantile.h" + +#include +#include +#include +#include + +#include "cpu_kernel_utils.h" +#include "utils/eigen_tensor.h" +#include "utils/kernel_util.h" + +namespace { +constexpr uint32_t kQuantileInputNum = 2; +constexpr uint32_t kQuantileOutputNum = 1; +const int64_t paralled_data_size = 64 * 1024; +const int64_t kQuantileAttrDefaultDim = 10000; +const char *kQuantile = "Quantile"; +} // namespace + +namespace aicpu { +template +uint32_t QuantileCpuKernel::GetInputAndCheck(CpuKernelContext &ctx) { + input_ = ctx.Input(0); + DataType input_type = input_->GetDataType(); + int64_t input_dim = input_->GetTensorShape()->GetDims(); + int64_t input_size = input_->GetTensorShape()->NumElements(); + q_ = ctx.Input(1); + int64_t q_size = q_->GetTensorShape()->NumElements(); + T *q_addrs = reinterpret_cast(q_->GetData()); + DataType q_type = q_->GetDataType(); + int64_t q_dim = q_->GetTensorShape()->GetDims(); + int64_t min = -input_dim; + int64_t max = input_dim - 1; + auto dim_attr = ctx.GetAttr("dim"); + dim_ = (dim_attr == nullptr) ? kQuantileAttrDefaultDim : dim_attr->GetInt(); + auto keep_dims_attr = ctx.GetAttr("keep_dims"); + keep_dims_ = (keep_dims_attr == nullptr) ? false : keep_dims_attr->GetBool(); + auto ignore_attr = ctx.GetAttr("ignore_nan"); + ignore_nan_ = (ignore_attr == nullptr) ? false : ignore_attr->GetBool(); + + KERNEL_CHECK_FALSE(input_size > 0, KERNEL_STATUS_PARAM_INVALID, "quantile() input tensor must be non-empty"); + KERNEL_CHECK_FALSE(q_dim <= 1, KERNEL_STATUS_PARAM_INVALID, + "quantile() q must be a scalar or 1D tensor,but got dimension = [%d].", q_dim); + KERNEL_CHECK_FALSE(input_type == q_type, KERNEL_STATUS_PARAM_INVALID, + "quantile() q tensor must be same dtype as the input tensor"); + + for (int64_t j = 0; j < q_size; ++j) { + KERNEL_CHECK_FALSE(q_addrs[j] <= 1 && q_addrs[j] >= 0, KERNEL_STATUS_PARAM_INVALID, + "quantile() q values must be in the range [0, 1]"); + } + DataType out_type = ctx.Output(0)->GetDataType(); + output_ = ctx.Output(0); + KERNEL_CHECK_FALSE(out_type == input_type, KERNEL_STATUS_PARAM_INVALID, + "quantile() out tensor must be same dtype as the input tensor"); + if (dim_ != kQuantileAttrDefaultDim) { + KERNEL_CHECK_FALSE(dim_ >= min && dim_ <= max, KERNEL_STATUS_PARAM_INVALID, + "Dimension out of range (expected to be in range of [%d] and [%d]).", min, max); + } + dim_ = MaybeWrapDim(dim_, input_dim); + return KERNEL_STATUS_OK; +} + +uint32_t QuantileCpuKernel::MaybeWrapDim(int64_t dim, int64_t dim_post_expr) { + if (dim == kQuantileAttrDefaultDim) { + return dim; + } + if (dim_post_expr <= 0) { + dim_post_expr = 1; + } + int64_t min = -dim_post_expr; + int64_t max = dim_post_expr - 1; + KERNEL_CHECK_FALSE(dim >= min && dim <= max, KERNEL_STATUS_PARAM_INVALID, + "Dimension out of range (expected to be in range of [%d] and [%d]).", min, max) + if (dim < 0) { + dim += dim_post_expr; + } + return dim; +} + +template +std::vector transpose(std::vector &f, std::vector &shape, int index) { + int element_count = f.size(); + int m = shape.size(); + int i; + int *indexA = (int *)malloc(sizeof(int) * m); + if (indexA == nullptr) { + return {}; + } + + std::vector pos(m); + for (int i = 0; i < m; i++) pos[i] = i; + if (m != 0) { + std::swap(pos[m - 1], pos[((index + m) % m)]); + } + + int *indexB = (int *)malloc(sizeof(int) * m); + if (indexB == nullptr) { + free(indexA); + return {}; + } + + std::vector b(element_count); + std::vector shapeb(shape); + for (int i = 0; i < m; i++) { + shapeb[i] = shape[pos[i]]; + } + + for (int src = 0; 
src < element_count; src++) { + int temp = src; + for (i = m - 1; i >= 0; i--) { + indexA[i] = temp % shape[i]; + temp = temp / shape[i]; + } + + for (i = 0; i < m; i++) { + indexB[i] = indexA[pos[i]]; + } + + int dst = 0; + temp = 1; + for (i = m - 1; i >= 0; i--) { + dst = dst + indexB[i] * temp; + temp = temp * shapeb[i]; + } + b[dst] = f[src]; + } + free(indexA); + free(indexB); + + return b; +} + +template +void QuantileCpuKernel::QuantileComputeParallelFunc(size_t start, size_t end, int64_t last_shape_size, + std::vector &sorted) { + uint64_t q_size = q_->GetTensorShape()->NumElements(); + T *output_addr = reinterpret_cast(output_->GetData()); + T *q_addrs = reinterpret_cast(q_->GetData()); + for (u_int64_t i = start; i < end; i++) { + std::vector tmp; + std::sort(sorted.begin() + i * last_shape_size, sorted.begin() + (i + 1) * last_shape_size); + bool has_nan = false; + bool all_nan = true; + + for (u_int64_t j = i * last_shape_size; j < (i + 1) * last_shape_size; j++) { + if (std::isnan(sorted[j])) { + has_nan = true; + } else { + all_nan = false; + } + } + + if ((has_nan && !ignore_nan_) || all_nan) { + for (uint64_t j = 0; j < q_size; ++j) { + output_addr[i * q_size + j] = NAN; + } + continue; + } + for (auto k = i * last_shape_size; k < (i + 1) * last_shape_size; k++) { + auto x = sorted[k]; + if (!isnan(x)) { + tmp.push_back(x); + } + } + std::sort(tmp.begin(), tmp.end()); + for (uint64_t j = 0; j < q_size; ++j) { + T index = (tmp.size() - 1) * q_addrs[j]; + int32_t idx = index; + if (idx == (int32_t)tmp.size() - 1) { + output_addr[i * q_size + j] = tmp[idx]; + continue; + } + output_addr[i * q_size + j] = tmp[idx] + (tmp[idx + 1] - tmp[idx]) * (index - idx); + } + } +} + +template +void QuantileCpuKernel::QuantileComputeSerialFunc(int64_t last_shape_size, std::vector &sorted) { + uint64_t n = input_->GetTensorShape()->NumElements(); + uint64_t q_size = q_->GetTensorShape()->NumElements(); + T *output_addr = reinterpret_cast(output_->GetData()); + T *q_addrs = reinterpret_cast(q_->GetData()); + for (u_int64_t i = 0; i < n; i += last_shape_size) { + std::vector tmp; + sort(sorted.begin() + i, sorted.begin() + i + last_shape_size); + bool has_nan = false; + bool all_nan = true; + for (auto j = i; j < i + last_shape_size; j++) { + if (!isnan(sorted[j])) { + tmp.push_back(sorted[j]); + all_nan = false; + } else { + has_nan = true; + } + } + sort(tmp.begin(), tmp.end()); + for (uint64_t j = 0; j < q_size; ++j) { + if ((has_nan && !ignore_nan_) || all_nan) { + output_addr[i * q_size / last_shape_size + j] = NAN; + continue; + } + + T index = (tmp.size() - 1) * q_addrs[j]; + int32_t idx = index; + if (idx == (int32_t)tmp.size() - 1) { + output_addr[i * q_size / last_shape_size + j] = tmp[idx]; + continue; + } + output_addr[i * q_size / last_shape_size + j] = tmp[idx] + (tmp[idx + 1] - tmp[idx]) * (index - idx); + } + } +} +template +void QuantileCpuKernel::QuantileComputeDefaultFunc(std::vector &sorted) { + uint64_t q_size = q_->GetTensorShape()->NumElements(); + T *output_addr = reinterpret_cast(output_->GetData()); + T *q_addrs = reinterpret_cast(q_->GetData()); + std::sort(sorted.begin(), sorted.end()); + bool all_nan = true; + std::vector tmp; + for (auto &x : sorted) { + if (!isnan(x)) { + tmp.push_back(x); + all_nan = false; + } + } + std::sort(tmp.begin(), tmp.end()); + for (uint64_t i = 0; i < q_size; ++i) { + if ((has_nan_ && !ignore_nan_) || all_nan) { + output_addr[i] = NAN; + continue; + } + T index = (tmp.size() - 1) * q_addrs[i]; + int32_t idx = index; + if (idx == 
(int32_t)tmp.size() - 1) { + output_addr[i] = tmp[idx]; + continue; + } + output_addr[i] = tmp[idx] + (tmp[idx + 1] - tmp[idx]) * (index - idx); + } +} + +std::vector QuantileCpuKernel::SetQuantileOutputShape() { + std::vector out_shape; + int64_t q_dim = q_->GetTensorShape()->NumElements(); + int64_t input_dim = input_->GetTensorShape()->GetDims(); + uint64_t q_size = q_->GetTensorShape()->NumElements(); + std::vector input_shapesize = input_->GetTensorShape()->GetDimSizes(); + if (dim_ != kQuantileAttrDefaultDim && input_dim > 0) { + out_shape = input_shapesize; + if (keep_dims_) { + out_shape[dim_] = 1; + } else { + out_shape.erase(out_shape.begin() + dim_); + } + } else if (keep_dims_) { + out_shape = std::vector(input_dim, 1); + } + if (q_dim > 0) { + out_shape.insert(out_shape.begin(), q_size); + } + return out_shape; +} + +template +uint32_t QuantileCpuKernel::QuantileCompute(CpuKernelContext &ctx) { + T *input_addrs = reinterpret_cast(ctx.Input(0)->GetData()); + size_t data_size = input_->GetTensorShape()->NumElements() * sizeof(T); + + std::vector out_shape = SetQuantileOutputShape(); + std::vector input_dims = input_->GetTensorShape()->GetDimSizes(); + int64_t input_shape_size = input_->GetTensorShape()->GetDims(); + std::vector sorted; + int64_t n = input_->GetTensorShape()->NumElements(); + for (int64_t i = 0; i < n; i++) { + sorted.push_back(input_addrs[i]); + if (isnan(input_addrs[i])) { + has_nan_ = true; + } + } + + if (data_size <= paralled_data_size) { + if (dim_ == kQuantileAttrDefaultDim) { + QuantileComputeDefaultFunc(sorted); + } else if (dim_ == input_shape_size - 1) { + QuantileComputeSerialFunc(input_dims[input_dims.size() - 1], sorted); + } else { + input_dims.push_back(1); + sorted = transpose(sorted, input_dims, dim_); + int32_t m = input_dims.size(); + if (m != 0) { + std::swap(input_dims[m - 1], input_dims[((dim_ + m) % m)]); + } + QuantileComputeSerialFunc(input_dims[input_dims.size() - 1], sorted); + } + } else { + DoParallelQuantile(ctx, sorted, input_dims); + } + SetOutput(out_shape); + return KERNEL_STATUS_OK; +} +template +uint32_t QuantileCpuKernel::DoParallelQuantile(CpuKernelContext &ctx, std::vector sorted, + std::vector input_dims) { + int64_t input_shape_size = input_->GetTensorShape()->GetDims(); + std::vector input_shape_dims = input_->GetTensorShape()->GetDimSizes(); + int64_t n = input_->GetTensorShape()->NumElements(); + if (dim_ == kQuantileAttrDefaultDim) { + QuantileComputeDefaultFunc(sorted); + } else if (dim_ == input_shape_size - 1) { + int64_t last_shape_size = input_dims[input_dims.size() - 1]; + auto shard_quantile = [&](size_t start, size_t end) { + QuantileComputeParallelFunc(start, end, last_shape_size, sorted); + }; + KERNEL_HANDLE_ERROR(CpuKernelUtils::ParallelFor(ctx, n / last_shape_size, last_shape_size, shard_quantile), + "Quantile Compute failed."); + } else { + input_shape_dims.push_back(1); + sorted = transpose(sorted, input_shape_dims, dim_); + int32_t m = input_shape_dims.size(); + if (m != 0) { + std::swap(input_shape_dims[m - 1], input_shape_dims[((dim_ + m) % m)]); + } + int64_t last_shape_size = input_shape_dims[input_shape_dims.size() - 1]; + auto shard_quantile = [&](size_t start, size_t end) { + QuantileComputeParallelFunc(start, end, last_shape_size, sorted); + }; + KERNEL_HANDLE_ERROR(CpuKernelUtils::ParallelFor(ctx, n / last_shape_size, last_shape_size, shard_quantile), + "Quantile Compute failed."); + } + return 0; +} +template +void QuantileCpuKernel::SetOutput(std::vector &out_shape) { + auto output_addr = 
reinterpret_cast(output_->GetData()); + + int64_t l = output_->GetTensorShape()->NumElements(); + std::vector out; + int64_t q_dim = q_->GetTensorShape()->GetDims(); + std::vector tmp(out_shape); + if (q_dim > 0) { + for (int i = 0; i < l; i++) { + out.push_back(*(output_addr + i)); + } + + int64_t out_end_shape = out_shape[out_shape.size() - 1]; + out_shape.push_back(out_end_shape); + std::swap(out_shape[0], out_shape[out_shape.size() - 1]); + out_shape.erase(out_shape.begin()); + out_shape.insert(out_shape.begin(), 1); + out = transpose(out, out_shape, 0); + for (int i = 0; i < l; i++) { + output_addr[i] = out[i]; + } + } + output_->GetTensorShape()->SetDimSizes(tmp); +} + +uint32_t QuantileCpuKernel::Compute(CpuKernelContext &ctx) { + KERNEL_HANDLE_ERROR(NormalCheck(ctx, kQuantileInputNum, kQuantileOutputNum), "[%s] check params failed.", kQuantile); + uint32_t res = KERNEL_STATUS_OK; + + auto data_type = ctx.Input(0)->GetDataType(); + switch (data_type) { + case DT_FLOAT: + res = GetInputAndCheck(ctx); + break; + case DT_DOUBLE: + res = GetInputAndCheck(ctx); + break; + default: + KERNEL_LOG_ERROR("Quantile invalid input type [%s]", DTypeStr(data_type).c_str()); + break; + } + KERNEL_CHECK_FALSE((res == KERNEL_STATUS_OK), res, "GetInputAndCheck failed."); + switch (data_type) { + case DT_FLOAT: + res = QuantileCompute(ctx); + break; + case DT_DOUBLE: + res = QuantileCompute(ctx); + break; + default: + KERNEL_LOG_ERROR("Quantile invalid input type [%s]", DTypeStr(data_type).c_str()); + break; + } + if (res != KERNEL_STATUS_OK) { + return KERNEL_STATUS_INNER_ERROR; + } + return KERNEL_STATUS_OK; +} + +REGISTER_CPU_KERNEL(kQuantile, QuantileCpuKernel); +} // namespace aicpu diff --git a/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/quantile.h b/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/quantile.h new file mode 100644 index 00000000000..855dde4fe19 --- /dev/null +++ b/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/quantile.h @@ -0,0 +1,61 @@ +/** + * Copyright (c) Huawei Technologies Co., Ltd. 2022. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#ifndef AICPU_KERNELS_NORMALIZED_QUANTILE_H_ +#define AICPU_KERNELS_NORMALIZED_QUANTILE_H_ + +#include + +#include "cpu_ops_kernel.h" +namespace aicpu { +class QuantileCpuKernel : public CpuKernel { + public: + QuantileCpuKernel() = default; + + ~QuantileCpuKernel() override = default; + + protected: + uint32_t Compute(CpuKernelContext &ctx) override; + + private: + template + uint32_t GetInputAndCheck(CpuKernelContext &ctx); + template + uint32_t QuantileCompute(CpuKernelContext &ctx); + uint32_t MaybeWrapDim(int64_t dim, int64_t dim_post_expr); + template + void QuantileComputeSerialFunc(int64_t last_shape_size, std::vector &sorted); + template + void QuantileComputeParallelFunc(size_t start, size_t end, int64_t last_shape_size, std::vector &sorted); + + template + void QuantileComputeDefaultFunc(std::vector &sorted); + std::vector SetQuantileOutputShape(); + template + void SetOutput(std::vector &out_shape); + template + uint32_t DoParallelQuantile(CpuKernelContext &ctx, std::vector sorted, std::vector input_dims); + int64_t last_shape_size_ = 0; + bool ignore_nan_ = false; + bool keep_dims_ = false; + int dim_ = 0; + int64_t input_dim_ = 0; + Tensor *input_ = nullptr; + Tensor *output_ = nullptr; + Tensor *q_ = nullptr; + bool has_nan_ = false; +}; +} // namespace aicpu +#endif \ No newline at end of file diff --git a/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/sparse_segment_sqrt_n.cc b/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/sparse_segment_sqrt_n.cc new file mode 100644 index 00000000000..52f017edce6 --- /dev/null +++ b/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/sparse_segment_sqrt_n.cc @@ -0,0 +1,154 @@ +/** + * Copyright (c) Huawei Technologies Co., Ltd. 2022-2022. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#include "sparse_segment_sqrt_n.h" + +#include + +#include "Eigen/Core" +#include "utils/kernel_util.h" + +namespace aicpu { +const uint32_t kInputNum = 3; +const uint32_t kOutputNum = 1; +const char *SparseSegmentSqrtN = "SparseSegmentSqrtN"; + +#define COMPUTE_CASE(DTYPE, TYPE, DTYPE_1, DTYPE_2, CTX) \ + case (DTYPE): \ + if ((DTYPE_1) == DT_INT32) { \ + if ((DTYPE_2) == DT_INT32) { \ + return ComputeKernal(CTX); \ + } else { \ + return ComputeKernal(CTX); \ + } \ + } else { \ + if ((DTYPE_2) == DT_INT32) { \ + return ComputeKernal(CTX); \ + } else { \ + return ComputeKernal(CTX); \ + } \ + } \ + break; +} // namespace aicpu + +namespace aicpu { +uint32_t SparseSegmentSqrtNCpuKernel::Compute(CpuKernelContext &ctx) { + KERNEL_HANDLE_ERROR(NormalCheck(ctx, kInputNum, kOutputNum), "SparseSegmentSqrtN normalcheck failed."); + Tensor *x = ctx.Input(0); + Tensor *indices = ctx.Input(1); + Tensor *segment_ids = ctx.Input(2); + + auto x_shape = x->GetTensorShape(); + auto indices_shape = indices->GetTensorShape(); + auto segment_ids_shape = segment_ids->GetTensorShape(); + + if (x_shape->GetDims() < 1) { + KERNEL_LOG_ERROR("[%s] Tensor input0's rank less than 1.", ctx.GetOpType().c_str()); + return KERNEL_STATUS_PARAM_INVALID; + } + + if (indices_shape->NumElements() != segment_ids_shape->NumElements()) { + KERNEL_LOG_ERROR("[%s] Tensor input1&input2's ranks mismatch.", ctx.GetOpType().c_str()); + return KERNEL_STATUS_PARAM_INVALID; + } + + auto x_data_type = x->GetDataType(); + auto indices_data_type = indices->GetDataType(); + auto segment_ids_data_type = segment_ids->GetDataType(); + + if (x_data_type != DT_FLOAT && x_data_type != DT_DOUBLE && x_data_type != DT_FLOAT16) { + KERNEL_LOG_ERROR("SparseSegmentSqrtN kernel data type [%s] not support.", DTypeStr(x_data_type).c_str()); + return KERNEL_STATUS_PARAM_INVALID; + } + if ((indices_data_type != DT_INT32 && indices_data_type != DT_INT64) || + (segment_ids_data_type != DT_INT32 && segment_ids_data_type != DT_INT64)) { + KERNEL_LOG_ERROR("SparseSegmentSqrtN kernel data type [%s] not support.", DTypeStr(indices_data_type).c_str()); + return KERNEL_STATUS_PARAM_INVALID; + } + + switch (x_data_type) { + COMPUTE_CASE(DT_FLOAT16, Eigen::half, indices_data_type, segment_ids_data_type, ctx) + COMPUTE_CASE(DT_FLOAT, float, indices_data_type, segment_ids_data_type, ctx) + COMPUTE_CASE(DT_DOUBLE, double, indices_data_type, segment_ids_data_type, ctx) + default: + KERNEL_LOG_ERROR("SparseSegmentSqrtN kernel data type [%s] not support.", DTypeStr(x_data_type).c_str()); + return KERNEL_STATUS_PARAM_INVALID; + } + return KERNEL_STATUS_OK; +} + +template +uint32_t SparseSegmentSqrtNCpuKernel::ComputeKernal(CpuKernelContext &ctx) { + size_t n = ctx.Input(0)->GetTensorShape()->NumElements() / ctx.Input(0)->GetTensorShape()->GetDimSize(0); + size_t m = ctx.Input(2)->GetTensorShape()->NumElements(); + size_t k = ctx.Output(0)->GetTensorShape()->NumElements(); + auto x_addr = reinterpret_cast(ctx.Input(0)->GetData()); + auto indices_addr = reinterpret_cast(ctx.Input(1)->GetData()); + auto segment_ids_addr = reinterpret_cast(ctx.Input(2)->GetData()); + auto y_addr = reinterpret_cast(ctx.Output(0)->GetData()); + std::vector x_shape_list = ctx.Input(0)->GetTensorShape()->GetDimSizes(); + x_shape_list[0] = segment_ids_addr[m - 1] + 1; + ctx.Output(0)->GetTensorShape()->SetDimSizes(x_shape_list); + for (size_t i = 0; i < k; i++) { + y_addr[i] = (T1)0; + } + if (segment_ids_addr[0] != 0) { + KERNEL_LOG_ERROR("segment_ids can't miss ids."); + return 
KERNEL_STATUS_PARAM_INVALID; + } + for (size_t i = 1; i < m; i++) { + if (segment_ids_addr[i] < segment_ids_addr[i - 1]) { + KERNEL_LOG_ERROR("segment_ids should be sorted."); + return KERNEL_STATUS_PARAM_INVALID; + } + if (segment_ids_addr[i] - segment_ids_addr[i - 1] > 1) { + KERNEL_LOG_ERROR("segment_ids can't miss ids."); + return KERNEL_STATUS_PARAM_INVALID; + } + } + for (size_t i = 0; i < m; i++) { + if (indices_addr[i] >= ctx.Input(0)->GetTensorShape()->GetDimSize(0)) { + KERNEL_LOG_ERROR("indices out of range."); + return KERNEL_STATUS_PARAM_INVALID; + } + } + int oldindex = -1; + int countnum = 0; + for (size_t i = 0; i < m; i++) { + if (oldindex == segment_ids_addr[i]) { + countnum++; + } else if (countnum != 0) { + for (size_t j = 0; j < n; j++) { + y_addr[j + oldindex * n] /= (T1)(sqrt(countnum)); + } + countnum = 1; + oldindex = segment_ids_addr[i]; + } else { + countnum = 1; + oldindex = segment_ids_addr[i]; + } + for (size_t j = 0; j < n; j++) { + y_addr[j + oldindex * n] += x_addr[j + indices_addr[i] * n]; + } + } + if (countnum != 0) { + for (size_t j = 0; j < n; j++) { + y_addr[j + oldindex * n] /= (T1)(sqrt(countnum)); + } + } + return KERNEL_STATUS_OK; +} +REGISTER_CPU_KERNEL(SparseSegmentSqrtN, SparseSegmentSqrtNCpuKernel); +} // namespace aicpu diff --git a/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/sparse_segment_sqrt_n.h b/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/sparse_segment_sqrt_n.h new file mode 100644 index 00000000000..04884d14280 --- /dev/null +++ b/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/sparse_segment_sqrt_n.h @@ -0,0 +1,38 @@ +/** + * Copyright (c) Huawei Technologies Co., Ltd. 2022-2022. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#ifndef AICPU_KERNELS_NORMALIZED_SPARSE_SEGMENT_SQRT_N_H_ +#define AICPU_KERNELS_NORMALIZED_SPARSE_SEGMENT_SQRT_N_H_ + +#include "cpu_ops_kernel.h" +#include "cpu_types.h" +#include "utils/bcast.h" +#include "utils/sparse_tensor.h" + +namespace aicpu { +class SparseSegmentSqrtNCpuKernel : public CpuKernel { + public: + SparseSegmentSqrtNCpuKernel() = default; + ~SparseSegmentSqrtNCpuKernel() override = default; + + protected: + uint32_t Compute(CpuKernelContext &ctx) override; + + private: + template + uint32_t ComputeKernal(CpuKernelContext &ctx); +}; +} // namespace aicpu +#endif diff --git a/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/unsorted_segment_prod.cc b/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/unsorted_segment_prod.cc new file mode 100644 index 00000000000..613f2f5b6d3 --- /dev/null +++ b/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/unsorted_segment_prod.cc @@ -0,0 +1,171 @@ +/** + * Copyright 2021 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "unsorted_segment_prod.h" + +#include +#include "cpu_kernel_utils.h" +#include "cpu_types.h" +#include "utils/eigen_tensor.h" +#include "utils/kernel_util.h" + +namespace { +const char *kUnsortedSegmentProd = "UnsortedSegmentProd"; +const uint32_t input_num = 3; +const uint32_t output_num = 1; +constexpr int64_t kParallelDataNums = 64 * 1024; +} // namespace + +namespace aicpu { +template +uint32_t UnsortedSegmentProdCpuKernel::UnsortedSegmentProdComputeTemplate(CpuKernelContext &ctx) { + KERNEL_HANDLE_ERROR(NormalCheck(ctx, input_num, output_num), " node input size should be [%llu], get [%llu]", + input_num, ctx.GetInputsSize(), " node output size should be [%llu], get [%llu]", output_num, + ctx.GetOutputsSize()); + if (ctx.Input(0)->GetDataType() != ctx.Output(0)->GetDataType()) { + KERNEL_LOG_ERROR("The data type of the input [%s] need be the same as the output [%s]", + DTypeStr(ctx.Input(0)->GetDataType()).c_str(), DTypeStr(ctx.Output(0)->GetDataType()).c_str()); + return KERNEL_STATUS_PARAM_INVALID; + } + if (ctx.Input(0)->GetDataSize() != ctx.Output(0)->GetDataSize()) { + KERNEL_LOG_ERROR( + "The data size of the input [%llu] need be the same as the output " + "[%llu]", + ctx.Input(0)->GetDataSize(), ctx.Output(0)->GetDataSize()); + return KERNEL_STATUS_PARAM_INVALID; + } + int64_t data_size = ctx.Input(0)->NumElements(); + int64_t id_size = ctx.Input(1)->NumElements(); + + auto input_x = reinterpret_cast(ctx.Input(0)->GetData()); + KERNEL_CHECK_NULLPTR(input_x, KERNEL_STATUS_PARAM_INVALID, "Get input data failed") + auto output_y = reinterpret_cast(ctx.Output(0)->GetData()); + KERNEL_CHECK_NULLPTR(output_y, KERNEL_STATUS_PARAM_INVALID, "Get output data failed") + auto segmentids = reinterpret_cast(ctx.Input(1)->GetData()); + KERNEL_CHECK_NULLPTR(segmentids, KERNEL_STATUS_PARAM_INVALID, "Get segment_ids failed") + auto numsegments = 
reinterpret_cast(ctx.Input(2)->GetData()); + KERNEL_CHECK_NULLPTR(numsegments, KERNEL_STATUS_PARAM_INVALID, "Get num_segments failed") + + if (id_size <= 0) { + KERNEL_LOG_ERROR("segment_ids num elements should great than 0"); + return KERNEL_STATUS_PARAM_INVALID; + } + + int64_t reshapesize = data_size / id_size; + // Initialized to 1 + for (int64_t k = 0; k < data_size; k++) { + *(output_y + k) = static_cast(1); + } + if (data_size <= kParallelDataNums) { + // calculation process + for (int64_t i = 0; i < id_size; i++) { + if (*(segmentids + i) < *numsegments) { + for (int64_t j = 0; j < reshapesize; j++) { + *(output_y + *(segmentids + i) * reshapesize + j) *= *(input_x + i * reshapesize + j); + } + } + } + } else { + uint32_t min_core_num = 1; + uint32_t max_core_num = std::max(min_core_num, aicpu::CpuKernelUtils::GetCPUNum(ctx) - 2); + if (max_core_num > reshapesize) { + max_core_num = reshapesize; + } + // calculation process + auto shard_unsorted_segment_prod = [&](int64_t start, int64_t end) { + for (int64_t i = 0; i < id_size; i++) { + if (*(segmentids + i) < *numsegments) { + for (int64_t j = start; j < end; j++) { + *(output_y + *(segmentids + i) * reshapesize + j) *= *(input_x + i * reshapesize + j); + } + } + } + }; + KERNEL_HANDLE_ERROR( + CpuKernelUtils::ParallelFor(ctx, reshapesize, reshapesize / max_core_num, shard_unsorted_segment_prod), + "CpuKernelUtils::ParallelFor failed."); + } + return KERNEL_STATUS_OK; +} + +template +uint32_t UnsortedSegmentProdCpuKernel::DoComputeWithNumSegmentsType(CpuKernelContext &ctx, DataType num_segments_type) { + switch (num_segments_type) { + case DT_INT32: + return UnsortedSegmentProdComputeTemplate(ctx); + case DT_INT64: + return UnsortedSegmentProdComputeTemplate(ctx); + + default: + KERNEL_LOG_ERROR("UnsortedSegmentProd invalid num_segments_type type [%s]", DTypeStr(num_segments_type).c_str()); + return KERNEL_STATUS_PARAM_INVALID; + } +} + +template +uint32_t UnsortedSegmentProdCpuKernel::DoComputeWithSegmentIdsType(CpuKernelContext &ctx, DataType segment_ids_type) { + auto num_segments_type = ctx.Input(2)->GetDataType(); + switch (segment_ids_type) { + case DT_INT32: + return DoComputeWithNumSegmentsType(ctx, num_segments_type); + case DT_INT64: + return DoComputeWithNumSegmentsType(ctx, num_segments_type); + + default: + KERNEL_LOG_ERROR("UnsortedSegmentProd invalid segment_ids_type type [%s]", DTypeStr(segment_ids_type).c_str()); + return KERNEL_STATUS_PARAM_INVALID; + } +} + +uint32_t UnsortedSegmentProdCpuKernel::Compute(CpuKernelContext &ctx) { + auto input_type = ctx.Input(0)->GetDataType(); + auto segment_ids_type = ctx.Input(1)->GetDataType(); + switch (input_type) { + case DT_INT32: + return DoComputeWithSegmentIdsType(ctx, segment_ids_type); + case DT_INT16: + return DoComputeWithSegmentIdsType(ctx, segment_ids_type); + case DT_FLOAT: + return DoComputeWithSegmentIdsType(ctx, segment_ids_type); + case DT_DOUBLE: + return DoComputeWithSegmentIdsType(ctx, segment_ids_type); + case DT_FLOAT16: + return DoComputeWithSegmentIdsType(ctx, segment_ids_type); + case DT_INT8: + return DoComputeWithSegmentIdsType(ctx, segment_ids_type); + case DT_INT64: + return DoComputeWithSegmentIdsType(ctx, segment_ids_type); + case DT_UINT8: + return DoComputeWithSegmentIdsType(ctx, segment_ids_type); + case DT_UINT16: + return DoComputeWithSegmentIdsType(ctx, segment_ids_type); + case DT_UINT32: + return DoComputeWithSegmentIdsType(ctx, segment_ids_type); + case DT_UINT64: + return DoComputeWithSegmentIdsType(ctx, segment_ids_type); + case 
DT_COMPLEX64: + return DoComputeWithSegmentIdsType>(ctx, segment_ids_type); + case DT_COMPLEX128: + return DoComputeWithSegmentIdsType>(ctx, segment_ids_type); + default: + KERNEL_LOG_ERROR("UnsortedSegmentProd invalid input type [%s]", DTypeStr(input_type).c_str()); + return KERNEL_STATUS_PARAM_INVALID; + } + return KERNEL_STATUS_OK; +} + +REGISTER_CPU_KERNEL(kUnsortedSegmentProd, UnsortedSegmentProdCpuKernel); +} // namespace aicpu diff --git a/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/unsorted_segment_prod.h b/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/unsorted_segment_prod.h new file mode 100644 index 00000000000..0b0df24982d --- /dev/null +++ b/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/unsorted_segment_prod.h @@ -0,0 +1,37 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef AICPU_KERNELS_NORMALIZED_UNSORTED_SEGMENT_PROD_H +#define AICPU_KERNELS_NORMALIZED_UNSORTED_SEGMENT_PROD_H + +#include "cpu_ops_kernel.h" + +namespace aicpu { +class UnsortedSegmentProdCpuKernel : public CpuKernel { + public: + ~UnsortedSegmentProdCpuKernel() = default; + uint32_t Compute(CpuKernelContext &ctx) override; + + private: + template + uint32_t UnsortedSegmentProdComputeTemplate(CpuKernelContext &ctx); + template + uint32_t DoComputeWithNumSegmentsType(CpuKernelContext &ctx, DataType num_segments_type); + template + uint32_t DoComputeWithSegmentIdsType(CpuKernelContext &ctx, DataType segment_ids_type); +}; +} // namespace aicpu +#endif diff --git a/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/utils/equal_util.h b/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/utils/equal_util.h index 3bae061e034..65ffcc0f482 100644 --- a/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/utils/equal_util.h +++ b/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/utils/equal_util.h @@ -41,7 +41,7 @@ uint32_t EqualCalculate(const CpuKernelContext &ctx, BCalcInfo &calcInfo, bool f output_y[i] = (flag == true) ? 
(*x_index == *y_index) : (*x_index != *y_index); } }; - KERNEL_HANDLE_ERROR(CpuKernelUtils::ParallelFor(ctx, data_num, 1, shard_equal), "Equal calculate failed.") + KERNEL_HANDLE_ERROR(CpuKernelUtils::ParallelFor(ctx, data_num, 1, shard_equal), "Equal calculate failed."); return KERNEL_STATUS_OK; } /** @@ -69,7 +69,7 @@ uint32_t EqualCompute(const CpuKernelContext &ctx, bool flag) { calcInfo.input_1->GetDataSize(), calcInfo.output->GetData(), calcInfo.output->GetDataSize()); Bcast bcast; - KERNEL_HANDLE_ERROR(bcast.GenerateBcastInfo(calcInfo), "Generate broadcast info failed.") + KERNEL_HANDLE_ERROR(bcast.GenerateBcastInfo(calcInfo), "Generate broadcast info failed."); bcast.BCastIndexes(calcInfo.x_indexes, calcInfo.y_indexes); bcast.GetBcastVec(calcInfo); diff --git a/mindspore/ccsrc/plugin/device/ascend/optimizer/mindir/aicpu_lib_select.cc b/mindspore/ccsrc/plugin/device/ascend/optimizer/mindir/aicpu_lib_select.cc index 1ba5183d798..c42b37201cb 100644 --- a/mindspore/ccsrc/plugin/device/ascend/optimizer/mindir/aicpu_lib_select.cc +++ b/mindspore/ccsrc/plugin/device/ascend/optimizer/mindir/aicpu_lib_select.cc @@ -51,8 +51,12 @@ const AnfNodePtr AICpuLibSelectPass::Process(const FuncGraphPtr &graph, const An static const std::set<std::string> kMigrateAicpuKernelOps = {mindspore::kAdaptiveAvgPool2dOpName, mindspore::kAdaptiveAvgPool2dGradOpName, mindspore::kCacheSwapTableOpName, + mindspore::kCol2imOpName, + mindspore::kCumulativeLogsumexpOpName, + mindspore::kDataFormatVecPermuteOpName, mindspore::kFillOpName, mindspore::kLogMatrixDeterminantOpName, + mindspore::kMatrixSolveLsOpName, mindspore::kMaskedSelectOpName, mindspore::kMaskedSelectGradOpName, mindspore::kMedianOpName, @@ -71,6 +75,10 @@ const AnfNodePtr AICpuLibSelectPass::Process(const FuncGraphPtr &graph, const An mindspore::kNanToNumOpName, mindspore::kQrOpName, - mindspore::kResizeBicubicOpName}; + mindspore::kResizeBicubicOpName, + mindspore::kNuclearNormOpName, + mindspore::kQuantileOpName, + mindspore::kSparseSegmentSqrtNOpName, + mindspore::kUnsortedSegmentProdOpName}; static const std::string kEnvOpSoNames = "mindspore_aicpu_kernels"; static const std::string kCpuKernelSoName = "mindspore_cpu_kernels"; diff --git a/mindspore/python/mindspore/ops/_op_impl/aicpu/__init__.py b/mindspore/python/mindspore/ops/_op_impl/aicpu/__init__.py index 609a7bdfde2..9f442b85dcb 100644 --- a/mindspore/python/mindspore/ops/_op_impl/aicpu/__init__.py +++ b/mindspore/python/mindspore/ops/_op_impl/aicpu/__init__.py @@ -171,3 +171,5 @@ from .median_grad import _median_grad_aicpu from .reduce_sum import _reduce_sum_aicpu from .adaptive_avg_pool_2d_v1 import _adaptive_avg_pool_2d_v1_aicpu from .fill_v2 import _fill_v2_aicpu +from .data_format_vec_permute import _data_format_vec_permute_aicpu +from .quantile import _quantile_aicpu
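
The real QR path of MatrixSolveLs (RealQrSingleCompute) copies each batch element into Eigen matrices and solves the least-squares problem with a column-pivoted Householder QR, one right-hand-side column at a time. A minimal standalone sketch of that call, not part of the patch; it assumes Eigen/Dense is on the include path and uses dynamic-size double matrices instead of the kernel's templated scalar type:

// matrix_solve_ls_qr_sketch.cc -- illustrative only, not the kernel itself.
#include <iostream>
#include <Eigen/Dense>

int main() {
  Eigen::MatrixXd a(3, 2);  // overdetermined system: 3 equations, 2 unknowns
  a << 1, 1,
       1, 2,
       1, 3;
  Eigen::VectorXd b(3);
  b << 6, 0, 0;
  // Same decomposition the kernel builds once per batch element and reuses per column of b.
  Eigen::ColPivHouseholderQR<Eigen::MatrixXd> qr(a);
  Eigen::VectorXd x = qr.solve(b);          // least-squares solution of min ||a * x - b||
  std::cout << x.transpose() << std::endl;  // approximately: 8 -3
  return 0;
}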
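NuclearNormCpuKernel does not call Eigen's SVD; matrix_nuclear_norm forms A^T * A, runs a hand-written Jacobi eigenvalue iteration on it, and sums the square roots of the non-negative eigenvalues. A small standalone cross-check of that identity against a reference SVD, not part of the patch and assuming Eigen/Dense; the matrix values are made up:

// nuclear_norm_check_sketch.cc -- illustrative only, not the kernel itself.
#include <iostream>
#include <Eigen/Dense>

int main() {
  Eigen::MatrixXd a(3, 2);
  a << 1, 2,
       3, 4,
       5, 6;
  Eigen::JacobiSVD<Eigen::MatrixXd> svd(a);
  double via_svd = svd.singularValues().sum();  // nuclear norm = sum of singular values
  Eigen::SelfAdjointEigenSolver<Eigen::MatrixXd> es(a.transpose() * a);
  double via_ata = es.eigenvalues().array().max(0.0).sqrt().sum();  // kernel's route
  std::cout << via_svd << " " << via_ata << std::endl;  // both ~10.04
  return 0;
}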
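The Quantile kernel maps each probability q to the fractional rank (n - 1) * q within the sorted, NaN-filtered slice and linearly interpolates between the two neighbouring order statistics. A standalone sketch of just that interpolation rule, not part of the patch; it assumes a NaN-free input, and the helper name Quantile is illustrative rather than the kernel's API:

// quantile_sketch.cc -- illustrative only; the kernel additionally handles NaNs and batching.
#include <algorithm>
#include <cstdio>
#include <vector>

double Quantile(std::vector<double> v, double q) {  // q in [0, 1]
  std::sort(v.begin(), v.end());
  const double index = (v.size() - 1) * q;        // fractional rank
  const size_t idx = static_cast<size_t>(index);  // lower neighbour
  if (idx + 1 == v.size()) {
    return v[idx];                                // q == 1 lands exactly on the maximum
  }
  return v[idx] + (v[idx + 1] - v[idx]) * (index - idx);  // linear interpolation
}

int main() {
  std::vector<double> x = {3.0, 1.0, 2.0, 4.0};
  std::printf("%f\n", Quantile(x, 0.5));  // prints 2.500000
  return 0;
}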
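SparseSegmentSqrtNCpuKernel gathers rows of x by indices, accumulates them per segment id (which must be sorted, start at 0, and contain no gaps), and divides each segment's sum by sqrt(N), where N is the number of rows in that segment. A standalone sketch of that reduction with scalar rows, not part of the patch; values and names are made up:

// sparse_segment_sqrt_n_sketch.cc -- illustrative only; assumes inner row size n = 1.
#include <cmath>
#include <cstdio>
#include <vector>

int main() {
  std::vector<double> x = {1.0, 2.0, 3.0, 4.0};  // source rows
  std::vector<int> indices = {0, 1, 2, 3};       // rows gathered from x
  std::vector<int> segment_ids = {0, 0, 1, 1};   // sorted, no gaps, starts at 0
  std::vector<double> y(segment_ids.back() + 1, 0.0);
  std::vector<int> count(y.size(), 0);
  for (size_t i = 0; i < indices.size(); ++i) {
    y[segment_ids[i]] += x[indices[i]];          // accumulate each segment's sum
    ++count[segment_ids[i]];
  }
  for (size_t s = 0; s < y.size(); ++s) {
    y[s] /= std::sqrt(static_cast<double>(count[s]));  // divide by sqrt(segment size)
  }
  std::printf("%f %f\n", y[0], y[1]);  // (1+2)/sqrt(2) and (3+4)/sqrt(2)
  return 0;
}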