aicpu migration 35 ops, 0105 branch

lilinjie 2023-01-05 19:11:49 +08:00
parent 551bcec327
commit 66cfa84dce
87 changed files with 11161 additions and 28 deletions

View File

@ -98,3 +98,8 @@
"mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/" "uninitvar"
"mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/" "shadowVariable"
"mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/" "unsignedPositive"
"mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/" "zerodivcond"
"mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/" "redundantInitialization"
"mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/" "noConstructor"
"mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/" "noExplicitConstructor"
"mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/" "identicalConditionAfterEarlyExit"

View File

@ -282,3 +282,44 @@ mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel
mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/max_unpool_2d.cc:aicpu::MaxUnpool2DCpuKernel::MaxUnpool2DCompute
mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/matrix_solve_ls.cc:aicpu::MatrixSolveLsCpuKernel::Compute
mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/col2im.cc:aicpu::Col2imCpuKernel::Col2imParamCheck
mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/scatter_nd_update.cc:aicpu::ScatterNdUpdateCpuKernel::Compute
mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/ragged_tensor_to_sparse.cc:aicpu::RaggedTensorToSparseCpuKernel::Compute
mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/max_unpool_3d_grad.cc:aicpu::MaxUnpool3DGradCpuKernel::MaxUnpool3DGradCompute
mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/reduce_mean.cc:aicpu::ReduceMeanCpuKernel::ReduceMeanCompute
mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/reduce_mean.cc:aicpu::ReduceMeanCpuKernel::ReduceMeanCompute_Complex
mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/ragged_tensor_to_tensor.cc:aicpu::RaggedTensorToTensorCpuKernel::Compute
mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/segment_mean.cc:aicpu::SegmentMeanCpuKernel::SegmentMeanCompute
mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/segment_mean.cc:aicpu::SegmentMeanCpuKernel::SegmentMeanCompute_Complex
mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/sample_distorted_bounding_box_ext2.cc:aicpu::SDBBExt2CpuKernel::GenerateRandomCrop
mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/sample_distorted_bounding_box_ext2.cc:aicpu::SDBBExt2CpuKernel::SDBBExt2Compute
mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/max_unpool_3d.cc:aicpu::MaxUnpool3DCpuKernel::MaxUnpool3DCompute
mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/segment_prod.cc:aicpu::SegmentProdCpuKernel::SegmentProdCompute
mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/segment_prod.cc:aicpu::SegmentProdCpuKernel::SegmentProdCompute_Complex
mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/maxpool_grad.cc:aicpu::SpatialMaxPoolWithArgMaxHelper
mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/reduce_prod.cc:aicpu::ReduceProdCpuKernel::ReduceProdCompute
mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/reduce_prod.cc:aicpu::ReduceProdCpuKernel::ReduceProdCompute_Complex
mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/parameterized_truncated_normal.cc:aicpu::Generate
mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/scatter_nd.cc:aicpu::ScatterNdCpuKernel::Compute
mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/multi_margin_loss.cc:aicpu::MultiMarginLossCpuKernel::MultiMarginLossCompute
mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/multi_margin_loss.cc:aicpu::MultiMarginLossCpuKernel::MultiMarginLossComputeFP
mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/max_unpool_3d_grad.cc:aicpu::MaxUnpool3DGradCpuKernel::MaxUnpool3DGradCompute
mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/maxpool.cc:aicpu::SpacialMaxPool
mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/reduce_mean.cc:aicpu::ReduceMeanCpuKernel::ReduceMeanCompute
mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/reduce_mean.cc:aicpu::ReduceMeanCpuKernel::ReduceMeanCompute_Complex
mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/segment_mean.cc:aicpu::SegmentMeanCpuKernel::SegmentMeanCompute
mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/segment_mean.cc:aicpu::SegmentMeanCpuKernel::SegmentMeanCompute_Complex
mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/sample_distorted_bounding_box_ext2.cc:aicpu::SDBBExt2CpuKernel::SDBBExt2Compute
mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/max_unpool_3d.cc:aicpu::MaxUnpool3DCpuKernel::MaxUnpool3DCompute
mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/slice.cc:aicpu::SliceCpuKernel::SliceCompute
mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/segment_prod.cc:aicpu::SegmentProdCpuKernel::SegmentProdCompute
mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/segment_prod.cc:aicpu::SegmentProdCpuKernel::SegmentProdCompute_Complex
mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/maxpool_grad.cc:aicpu::SpatialMaxPoolWithArgMaxHelper
mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/reduce_prod.cc:aicpu::ReduceProdCpuKernel::ReduceProdCompute
mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/reduce_prod.cc:aicpu::ReduceProdCpuKernel::ReduceProdCompute_Complex
mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/multi_margin_loss_grad.cc:aicpu::MultiMarginLossGradCpuKernel::MultiMarginLossGradC
mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/parameterized_truncated_normal.cc:aicpu::Generate
mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/multi_margin_loss.cc:aicpu::MultiMarginLossCpuKernel::MultiMarginLossCompute
mindspore/mindspore/ccsrc/plugin/device/ascend/optimizer/mindir/aicpu_lib_select.cc:mindspore::opt::AICpuLibSelectPass::Process
mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/multi_margin_loss_grad.cc:aicpu::MultiMarginLossGradCpuKernel::MultiMarginLossGradComputeFP16
mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/multi_margin_loss_grad.cc:aicpu::MultiMarginLossGradCpuKernel::MultiMarginLossGradCompute
mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/multi_margin_loss.cc:aicpu::MultiMarginLossCpuKernel::MultiMarginLossComputeFP16

View File

@ -1,26 +0,0 @@
mindspore.ops.PSROIPooling
==========================
.. py:class:: mindspore.ops.PSROIPooling(spatial_scale, group_size, output_dim)
Applies Position Sensitive ROI-Pooling to the input Tensor.

Parameters:
- **spatial_scale** (float) - The scaling factor that maps box coordinates to input coordinates. For example, if the boxes are defined on a 224x224 image and the input is a 112x112 feature map (produced by scaling the original image by 0.5), this should be set to 0.5.
- **group_size** (int) - The size of the output (in pixels) after pooling, given as (height, width).
- **output_dim** (int) - The dimension of the output after pooling.

Inputs:
- **features** (Tensor) - The input feature Tensor, whose shape must be :math:`(N, C, H, W)`. The dimensions must satisfy :math:`(C == output\_dim * group\_size * group\_size)`. The data type is float16 or float32.
- **rois** (Tensor) - A Tensor of shape :math:`(batch, 5, rois_n)` with data type float16 or float32. The first dimension, batch, is the batch size. The size of the second dimension must be 5. The third dimension, rois_n, is the number of ROIs. Each ROI is given in the format (index, x1, y1, x2, y2), where the first element is the index of the ROI and the box coordinates are in (x1, y1, x2, y2) format; the regions selected by these boxes are extracted. The coordinates must satisfy 0 <= x1 < x2 and 0 <= y1 < y2.

Outputs:
- **out** (Tensor) - The pooled output, whose shape is :math:`(rois.shape[0] * rois.shape[2], output\_dim, group\_size, group\_size)`.

Raises:
- **TypeError** - If `spatial_scale` is not a float.
- **TypeError** - If `group_size` or `output_dim` is not an int.
- **TypeError** - If `features` or `rois` is not a Tensor.
- **TypeError** - If the data type of `rois` is neither float16 nor float32.
- **ValueError** - If the shape of `features` does not satisfy :math:`(C == output\_dim * group\_size * group\_size)`.
- **ValueError** - If `spatial_scale` is negative.

View File

@ -157,6 +157,7 @@ constexpr auto kCastOpName = "Cast";
constexpr auto kCentralizationOpName = "Centralization";
constexpr auto kCeLUOpName = "CeLU";
constexpr auto kCeluV2OpName = "CeluV2";
constexpr auto kCheckNumericsOpName = "CheckNumerics";
constexpr auto kClearZeroOpName = "ClearZero";
constexpr auto kClipBoxesOpName = "kClipBoxes";
constexpr auto kClipBoxesDOpName = "kClipBoxesD";
@ -277,6 +278,7 @@ constexpr auto kFillV2DOpName = "FillV2D";
constexpr auto kFSEDecodeOpName = "FSEDecode";
constexpr auto kFive2FourOpName = "Five2Four";
constexpr auto kFlattenGradOpName = "FlattenGrad";
constexpr auto kFloorDivOpName = "FloorDiv";
constexpr auto kFour2FiveOpName = "Four2Five";
constexpr auto kFractionalAvgPoolGradOpName = "FractionalAvgPoolGrad";
constexpr auto kFusedAdaFactorName = "FusedAdaFactor";
@ -386,6 +388,7 @@ constexpr auto kLinSpaceDOpName = "LinSpaceD";
constexpr auto kListDiffOpName = "ListDiff";
constexpr auto kLogMatrixDeterminantOpName = "LogMatrixDeterminant";
constexpr auto kLogOpName = "Log";
constexpr auto kLog1pOpName = "Log1p";
constexpr auto kLogSoftmaxOpName = "LogSoftmax";
constexpr auto kLogSoftmaxV2OpName = "LogSoftmaxV2";
constexpr auto kLogSoftmaxGradOpName = "LogSoftmaxGrad";
@ -409,6 +412,8 @@ constexpr auto kMatrixSetDiagOpName = "MatrixSetDiag";
constexpr auto kMatrixSetDiagDOpName = "MatrixSetDiagD";
constexpr auto kMatrixSetDiagV3OpName = "MatrixSetDiagV3";
constexpr auto kMatrixSolveLsOpName = "MatrixSolveLs";
constexpr auto kMatrixTriangularSolveOpName = "MatrixTriangularSolve";
constexpr auto kMaximumGradGradOpName = "MaximumGradGrad";
constexpr auto kMaximumGradOpName = "MaximumGrad";
constexpr auto kMaximumOpName = "Maximum";
constexpr auto kMaxPool3DGradGradOpName = "MaxPool3DGradGrad";
@ -422,15 +427,22 @@ constexpr auto kMaxPoolExt2OpName = "MaxPoolExt2";
constexpr auto kMaxPoolWithArgmaxOpName = "MaxPoolWithArgmax";
constexpr auto kMaxUnpool2DOpName = "MaxUnpool2D";
constexpr auto kMaxUnpool2DGradOpName = "MaxUnpool2DGrad";
constexpr auto kMaxUnpool3DOpName = "MaxUnpool3D";
constexpr auto kMaxUnpool3DGradOpName = "MaxUnpool3DGrad";
constexpr auto kMeanGradOpName = "MeanGrad";
constexpr auto kMedianOpName = "Median";
constexpr auto kMedianGradOpName = "MedianGrad";
constexpr auto kMemCpyAsyncOpName = "memcpy_async";
constexpr auto kMinimumGradGradOpName = "MinimumGradGrad";
constexpr auto kMinimumGradOpName = "MinimumGrad";
constexpr auto kMinimumOpName = "Minimum";
constexpr auto kMirrorPadOpName = "MirrorPad";
constexpr auto kMomentumOpName = "Momentum";
constexpr auto kMulOpName = "Mul";
constexpr auto kMulNoNanOpName = "MulNoNan";
constexpr auto kMultilabelMarginLossGradOpName = "MultilabelMarginLossGrad";
constexpr auto kMultiMarginLossGradOpName = "MultiMarginLossGrad";
constexpr auto kMultiMarginLossOpName = "MultiMarginLoss";
constexpr auto kMultinomialOpName = "Multinomial";
constexpr auto kMuxReceiveOpName = "MuxReceive";
constexpr auto kMuxSendOpName = "MuxSend";
@ -438,17 +450,21 @@ constexpr auto kNanToNumOpName = "NanToNum";
constexpr auto kNegOpName = "Neg";
constexpr auto kIm2ColOpName = "Im2Col";
constexpr auto kNewIm2ColOpName = "NewIm2Col";
constexpr auto kNextAfterOpName = "NextAfter";
constexpr auto kIm2colOpName = "Im2col";
constexpr auto kNMSWithMaskOpName = "NMSWithMask";
constexpr auto kNonDeterministicInts = "NonDeterministicInts";
constexpr auto kNonDeterministicIntsOpName = "NonDeterministicInts";
constexpr auto kNonMaxSuppressionV3OpName = "NonMaxSuppressionV3";
constexpr auto kNonZeroOpName = "NonZero";
constexpr auto kNPUAllocFloatStatusOpName = "NPUAllocFloatStatus";
constexpr auto kNPUClearFloatStatusOpName = "NPUClearFloatStatus";
constexpr auto kNPUGetFloatStatusOpName = "NPUGetFloatStatus";
constexpr auto kNthElementOpName = "NthElement";
constexpr auto kNuclearNormOpName = "NuclearNorm";
constexpr auto kOneHotOpName = "OneHot";
constexpr auto kOneHotDOpName = "OneHotD";
constexpr auto kOrgqrOpName = "Orgqr";
constexpr auto kPadAndShiftOpName = "PadAndShift";
constexpr auto kPaddingOpName = "Padding";
constexpr auto kPadOpName = "Pad";
@ -457,8 +473,11 @@ constexpr auto kParallelResizeBilinearOpName = "ParallelResizeBilinear";
constexpr auto kSyncResizeBilinearV2OpName = "SyncResizeBilinearV2";
constexpr auto kParallelResizeBilinearGradOpName = "ParallelResizeBilinearGrad";
constexpr auto kSyncResizeBilinearV2GradOpName = "SyncResizeBilinearV2Grad";
constexpr auto kParameterizedTruncatedNormalOpName = "ParameterizedTruncatedNormal";
constexpr auto kPartialOpName = "partial";
constexpr auto kPdistGradOpName = "PdistGrad";
constexpr auto kPoissonOpName = "Poisson";
constexpr auto kPolarOpName = "Polar";
constexpr auto kPoolingOpName = "Pooling";
constexpr auto kPSROIPoolingOpName = "PSROIPooling";
constexpr auto kPSROIPoolingV2OpName = "PSROIPoolingV2";
@ -481,13 +500,18 @@ constexpr auto kPushOpName = "Push";
constexpr auto kQrOpName = "Qr";
constexpr auto kPushWeightOpName = "PushWeight";
constexpr auto kQuantileOpName = "Quantile";
constexpr auto kRaggedRangeOpName = "RaggedRange";
constexpr auto kRaggedTensorToSparseOpName = "RaggedTensorToSparse";
constexpr auto kRaggedTensorToTensorOpName = "RaggedTensorToTensor";
constexpr auto kRandomChoiceWithMaskOpName = "RandomChoiceWithMask";
constexpr auto kRandomPoissonOpName = "RandomPoisson";
constexpr auto kRandomShuffleOpName = "RandomShuffle";
constexpr auto kRangeOpName = "Range";
constexpr auto kRangeDOpName = "RangeD";
constexpr auto kQuantDTypeCastOpName = "QuantDTypeCast";
constexpr auto kRealDivOpName = "RealDiv";
constexpr auto kReciprocalOpName = "Reciprocal";
constexpr auto kReciprocalGradOpName = "ReciprocalGrad";
constexpr auto kRecvOpName = "StreamRecv";
constexpr auto kReduceAllOpName = "ReduceAll";
constexpr auto kReduceAllDOpName = "ReduceAllD";
@ -536,6 +560,7 @@ constexpr auto kResizeNearestNeighborV2DOpName = "ResizeNearestNeighborV2D";
constexpr auto kReverseV2OpName = "ReverseV2";
constexpr auto kReverseV2DOpName = "ReverseV2D";
constexpr auto kReturnOpName = "Return";
constexpr auto kRGBToHSVOpName = "RGBToHSV";
constexpr auto kROIAlignGradName = "ROIAlignGrad";
constexpr auto kRpcRecvOpName = "RpcRecv";
constexpr auto kRpcSendOpName = "RpcSend";
@ -543,6 +568,9 @@ constexpr auto kRpnProposalsOpName = "RpnProposals";
constexpr auto kRpnProposalsDOpName = "RpnProposalsD";
constexpr auto kRsqrtGradOpName = "RsqrtGrad";
constexpr auto kRsqrtOpName = "Rsqrt";
constexpr auto kSampleDistortedBoundingBoxExt2OpName = "SampleDistortedBoundingBoxExt2";
constexpr auto kScaleAndTranslateOpName = "ScaleAndTranslate";
constexpr auto kScaleAndTranslateGradOpName = "ScaleAndTranslateGrad";
constexpr auto kScatterAddOpName = "ScatterAdd";
constexpr auto kScatterNdOpName = "ScatterNd";
constexpr auto kScatterNdDOpName = "ScatterNdD";
@ -554,13 +582,19 @@ constexpr auto kSegmentMinOpName = "SegmentMin";
constexpr auto kSegmentProdOpName = "SegmentProd";
constexpr auto kSegmentSumOpName = "SegmentSum";
constexpr auto kSelectOpName = "Select";
constexpr auto kSelfAdjointEigOpName = "SelfAdjointEig";
constexpr auto kSeLUOpName = "SeLU";
constexpr auto kSeluOpName = "Selu";
constexpr auto kSendOpName = "StreamSend";
constexpr auto kSetSizeOpName = "SetSize";
constexpr auto kSGDName = "SGD";
constexpr auto kSigmoidOpName = "Sigmoid";
constexpr auto kSigmoidCrossEntropyWithLogitsV2OpName = "SigmoidCrossEntropyWithLogitsV2";
constexpr auto kSignOpName = "Sign";
constexpr auto kSimpleMeanGradOpName = "SimpleMeanGrad";
constexpr auto kSinOpName = "Sin";
constexpr auto kSincOpName = "Sinc";
constexpr auto kSinhOpName = "Sinh";
constexpr auto kSliceGradOpName = "SliceGrad";
constexpr auto kSliceOpName = "Slice";
constexpr auto kSliceDV2OpName = "SliceDV2";

View File

@ -0,0 +1,134 @@
/**
* Copyright 2021 Jilin University
* Copyright (c) Huawei Technologies Co., Ltd. 2020. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "check_numerics.h"
#include <securec.h>
#include "unsupported/Eigen/CXX11/Tensor"
#include "cpu_kernel_utils.h"
#include "cpu_types.h"
#include "kernel_log.h"
#include "status.h"
#include "utils/kernel_util.h"
namespace {
const std::uint32_t kCheckNumericsInputNum{1};
const std::uint32_t kCheckNumericsOutputNum{1};
const char *const kCheckNumerics{"CheckNumerics"};
const std::int64_t kCheckNumericsParallelNum{64 * 1024};
} // namespace
namespace aicpu {
namespace detail {
template <typename T>
inline bool ScalarCheckNumerics(const T x) {
return !std::isfinite(x);
}
template <>
inline bool ScalarCheckNumerics(const Eigen::half x) {
return !Eigen::half_impl::isfinite(x);
}
inline std::uint32_t ParallelForCheckNumerics(const CpuKernelContext &ctx, std::int64_t total,
std::int64_t per_unit_size,
const std::function<void(std::int64_t, std::int64_t)> &work) {
if (total > kCheckNumericsParallelNum)
return aicpu::CpuKernelUtils::ParallelFor(ctx, total, per_unit_size, work);
else
work(0, total);
return KERNEL_STATUS_OK;
}
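// Scans input 0 for NaN/Inf values. Shards that see only finite values copy
// their slice of the input straight to the output; if any element is
// non-finite the kernel returns KERNEL_STATUS_PARAM_INVALID.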
template <typename T>
inline std::uint32_t ComputeCheckNumericsKernel(const CpuKernelContext &ctx) {
T *input0{static_cast<T *>(ctx.Input(0)->GetData())};
T *output{static_cast<T *>(ctx.Output(0)->GetData())};
std::int64_t total{ctx.Input(0)->NumElements()};
std::uint32_t core_num{aicpu::CpuKernelUtils::GetCPUNum(ctx)};
std::int64_t per_unit_size{total / std::min(std::max(1L, core_num - 2L), total)};
bool flag = false;
std::uint32_t ret = ParallelForCheckNumerics(ctx, total, per_unit_size, [&](std::int64_t begin, std::int64_t end) {
flag = flag || std::any_of(input0 + begin, input0 + end, ScalarCheckNumerics<T>);
if (!flag) {
auto ret = memcpy_s(output + begin, static_cast<size_t>((end - begin) * sizeof(T)), input0 + begin,
static_cast<size_t>((end - begin) * sizeof(T)));
if (ret != EOK) {
KERNEL_LOG_ERROR("memcpy_s error");
}
}
});
return flag ? KERNEL_STATUS_PARAM_INVALID : ret;
}
template <typename T>
inline std::uint32_t ComputeCheckNumerics(const CpuKernelContext &ctx) {
std::uint32_t result{ComputeCheckNumericsKernel<T>(ctx)};
if (result != KERNEL_STATUS_OK) {
KERNEL_LOG_ERROR("CheckNumerics compute failed.");
}
return result;
}
inline std::uint32_t ExtraCheckCheckNumerics(const CpuKernelContext &ctx) {
if (ctx.Input(0)->GetData() == nullptr) {
KERNEL_LOG_ERROR("Get input data failed.");
return KERNEL_STATUS_PARAM_INVALID;
}
if (ctx.Output(0)->GetData() == nullptr) {
KERNEL_LOG_ERROR("Get output data failed.");
return KERNEL_STATUS_PARAM_INVALID;
}
if (ctx.Input(0)->GetDataType() != ctx.Output(0)->GetDataType()) {
KERNEL_LOG_ERROR("The data type of the input [%s] need be the same as the output [%s].",
DTypeStr(ctx.Input(0)->GetDataType()).c_str(), DTypeStr(ctx.Output(0)->GetDataType()).c_str());
return KERNEL_STATUS_PARAM_INVALID;
}
if (ctx.Input(0)->GetDataSize() != ctx.Output(0)->GetDataSize()) {
KERNEL_LOG_ERROR(
"The data size of the input [%llu] need be the same as the output "
"[%llu].",
ctx.Input(0)->GetDataSize(), ctx.Output(0)->GetDataSize());
return KERNEL_STATUS_PARAM_INVALID;
}
return KERNEL_STATUS_OK;
}
inline std::uint32_t CheckCheckNumerics(CpuKernelContext &ctx) {
return NormalCheck(ctx, kCheckNumericsInputNum, kCheckNumericsOutputNum) ? KERNEL_STATUS_PARAM_INVALID
: ExtraCheckCheckNumerics(ctx);
}
inline std::uint32_t ComputeCheckNumerics(const CpuKernelContext &ctx) {
DataType input_type{ctx.Input(0)->GetDataType()};
switch (input_type) {
case DT_FLOAT16:
return ComputeCheckNumerics<Eigen::half>(ctx);
case DT_FLOAT:
return ComputeCheckNumerics<std::float_t>(ctx);
case DT_DOUBLE:
return ComputeCheckNumerics<std::double_t>(ctx);
default:
KERNEL_LOG_ERROR("Unsupported input data type [%s].", DTypeStr(input_type).c_str());
return KERNEL_STATUS_PARAM_INVALID;
}
}
} // namespace detail
std::uint32_t CheckNumericsCpuKernel::Compute(CpuKernelContext &ctx) {
return detail::CheckCheckNumerics(ctx) ? KERNEL_STATUS_PARAM_INVALID : detail::ComputeCheckNumerics(ctx);
}
REGISTER_CPU_KERNEL(kCheckNumerics, CheckNumericsCpuKernel);
} // namespace aicpu

View File

@ -0,0 +1,29 @@
/**
* Copyright 2021 Jilin University
* Copyright (c) Huawei Technologies Co., Ltd. 2020-2021. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef AICPU_KERNELS_NORMALIZED_CHECK_NUMERICS_H
#define AICPU_KERNELS_NORMALIZED_CHECK_NUMERICS_H
#include "cpu_ops_kernel.h"
namespace aicpu {
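// CheckNumerics: copies the input tensor to the output and fails with
// KERNEL_STATUS_PARAM_INVALID if the input contains NaN or Inf values.
// Supports float16, float32 and float64 inputs.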
class CheckNumericsCpuKernel final : public CpuKernel {
public:
std::uint32_t Compute(CpuKernelContext &ctx) override;
};
} // namespace aicpu
#endif

View File

@ -0,0 +1,296 @@
/**
* Copyright (c) Huawei Technologies Co., Ltd. 2021. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "floordiv.h"
#include "cpu_kernel_utils.h"
#include "utils/eigen_tensor.h"
#include "utils/kernel_util.h"
namespace {
const uint32_t kOutputNum = 1;
const uint32_t kInputNum = 2;
const char *const kFloorDiv = "FloorDiv";
// when input data size is more than kParallelDataNum, use Parallel func
const int64_t kParallelDataNum = 2 * 1024;
const int64_t kParallelDataNumMid = 4 * 1024;
const int64_t kParallelDataNumSameShape = 16 * 1024;
const int64_t kParallelDataNumSameShapeMid = 32 * 1024;
#define FLOORDIV_COMPUTE_CASE(DTYPE, TYPE, CTX) \
case (DTYPE): { \
uint32_t result = FloorDivCompute<TYPE>(CTX); \
if (result != KERNEL_STATUS_OK) { \
KERNEL_LOG_ERROR("FloorDiv kernel compute failed."); \
return result; \
} \
break; \
}
} // namespace
namespace aicpu {
uint32_t FloorDivCpuKernel::Compute(CpuKernelContext &ctx) {
// check params
KERNEL_HANDLE_ERROR(NormalCheck(ctx, kInputNum, kOutputNum), "[%s] check input and output failed.", kFloorDiv);
KERNEL_HANDLE_ERROR(FloorDivParamCheck(ctx), "[%s] check params failed.", kFloorDiv);
auto data_type = ctx.Input(0)->GetDataType();
switch (data_type) {
FLOORDIV_COMPUTE_CASE(DT_INT8, int8_t, ctx)
FLOORDIV_COMPUTE_CASE(DT_INT16, int16_t, ctx)
FLOORDIV_COMPUTE_CASE(DT_INT32, int32_t, ctx)
FLOORDIV_COMPUTE_CASE(DT_INT64, int64_t, ctx)
FLOORDIV_COMPUTE_CASE(DT_UINT8, uint8_t, ctx)
FLOORDIV_COMPUTE_CASE(DT_UINT16, uint16_t, ctx)
FLOORDIV_COMPUTE_CASE(DT_FLOAT16, Eigen::half, ctx)
FLOORDIV_COMPUTE_CASE(DT_FLOAT, float, ctx)
FLOORDIV_COMPUTE_CASE(DT_DOUBLE, double, ctx)
default:
KERNEL_LOG_ERROR("FloorDiv kernel data type [%s] not support.", DTypeStr(data_type).c_str());
return KERNEL_STATUS_PARAM_INVALID;
}
return KERNEL_STATUS_OK;
}
uint32_t FloorDivCpuKernel::FloorDivParamCheck(const CpuKernelContext &ctx) const {
// the non null of input_0, input_1, output has been verified in NormalCheck
Tensor *input_0 = ctx.Input(0);
Tensor *input_1 = ctx.Input(1);
Tensor *output = ctx.Output(0);
KERNEL_CHECK_NULLPTR(input_0->GetData(), KERNEL_STATUS_PARAM_INVALID, "Get input 0 data failed.")
KERNEL_CHECK_NULLPTR(input_1->GetData(), KERNEL_STATUS_PARAM_INVALID, "Get input 1 data failed.")
KERNEL_CHECK_NULLPTR(output->GetData(), KERNEL_STATUS_PARAM_INVALID, "Get output data failed")
DataType input0_type = input_0->GetDataType();
DataType input1_type = input_1->GetDataType();
KERNEL_CHECK_FALSE((input0_type == input1_type), KERNEL_STATUS_PARAM_INVALID,
"The data type of input0 [%s] need be same with "
"input1 [%s].",
DTypeStr(input0_type).c_str(), DTypeStr(input1_type).c_str())
return KERNEL_STATUS_OK;
}
template <typename T>
T DivCal(const T &x_i, const T &y_i) {
return static_cast<T>(Eigen::numext::floor(x_i / y_i));
}
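// For signed integer types the specializations below round the quotient toward
// negative infinity: when the operands have opposite signs the result is
// -(|x| + |y| - 1) / |y|, e.g. DivCal(-7, 2) = -(7 + 2 - 1) / 2 = -4 = floor(-3.5).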
template <>
int8_t DivCal(const int8_t &x_i, const int8_t &y_i) {
if ((x_i < 0) != (y_i < 0)) {
int8_t abs_x_i = x_i < 0 ? -x_i : x_i;
int8_t abs_y_i = y_i < 0 ? -y_i : y_i;
return (-(abs_x_i + abs_y_i - 1) / abs_y_i);
} else {
return (x_i / y_i);
}
}
template <>
int16_t DivCal(const int16_t &x_i, const int16_t &y_i) {
if ((x_i < 0) != (y_i < 0)) {
int16_t abs_x_i = x_i < 0 ? -x_i : x_i;
int16_t abs_y_i = y_i < 0 ? -y_i : y_i;
return (-(abs_x_i + abs_y_i - 1) / abs_y_i);
} else {
return (x_i / y_i);
}
}
template <>
int32_t DivCal(const int32_t &x_i, const int32_t &y_i) {
if ((x_i < 0) != (y_i < 0)) {
int32_t abs_x_i = x_i < 0 ? -x_i : x_i;
int32_t abs_y_i = y_i < 0 ? -y_i : y_i;
return (-(abs_x_i + abs_y_i - 1) / abs_y_i);
} else {
return (x_i / y_i);
}
}
template <>
int64_t DivCal(const int64_t &x_i, const int64_t &y_i) {
if ((x_i < 0) != (y_i < 0)) {
int64_t abs_x_i = x_i < 0 ? -x_i : x_i;
int64_t abs_y_i = y_i < 0 ? -y_i : y_i;
return (-(abs_x_i + abs_y_i - 1) / abs_y_i);
} else {
return (x_i / y_i);
}
}
// SpecialCompute handles the cases where no index broadcasting is required:
// 1. the shapes of input1 and input2 are the same
// 2. input1 is a 1D tensor with only one element, or input1 is a scalar
// 3. input2 is a 1D tensor with only one element, or input2 is a scalar
// When the shapes differ beyond these cases, BcastCompute performs the broadcast.
template <typename T>
uint32_t FloorDivCpuKernel::SpecialCompute(BcastShapeType type, int64_t start, int64_t end, const T *input1,
const T *input2, T *output) {
switch (type) {
case BcastShapeType::SAME_SHAPE:
for (int64_t i = start; i < end; ++i) {
if (*(input2 + i) == static_cast<T>(0)) {
KERNEL_LOG_ERROR("Invalid argumengt: Division by zero.");
return KERNEL_STATUS_INNER_ERROR;
}
*(output + i) = DivCal<T>(*(input1 + i), *(input2 + i));
}
break;
case BcastShapeType::X_ONE_ELEMENT:
for (int64_t i = start; i < end; ++i) {
if (*(input2 + i) == static_cast<T>(0)) {
KERNEL_LOG_ERROR("Invalid argumengt: Division by zero.");
return KERNEL_STATUS_INNER_ERROR;
}
*(output + i) = DivCal<T>(*input1, *(input2 + i));
}
break;
case BcastShapeType::Y_ONE_ELEMENT:
for (int64_t i = start; i < end; ++i) {
if (*input2 == static_cast<T>(0)) {
KERNEL_LOG_ERROR("Invalid argumengt: Division by zero.");
return KERNEL_STATUS_INNER_ERROR;
}
*(output + i) = DivCal<T>(*(input1 + i), *input2);
}
break;
default:
KERNEL_LOG_WARN("Invalid type [%d]", static_cast<int32_t>(type));
break;
}
return KERNEL_STATUS_OK;
}
template <typename T>
uint32_t FloorDivCpuKernel::NoBcastCompute(const CpuKernelContext &ctx) {
auto in0 = reinterpret_cast<T *>(ctx.Input(0)->GetData());
auto in1 = reinterpret_cast<T *>(ctx.Input(1)->GetData());
auto out = reinterpret_cast<T *>(ctx.Output(0)->GetData());
int64_t in0_elements_nums = ctx.Input(0)->NumElements();
int64_t in1_elements_nums = ctx.Input(1)->NumElements();
int64_t data_num = ctx.Output(0)->NumElements();
BcastShapeType type = in0_elements_nums == in1_elements_nums
? BcastShapeType::SAME_SHAPE
: (in0_elements_nums == 1 ? BcastShapeType::X_ONE_ELEMENT : BcastShapeType::Y_ONE_ELEMENT);
if (data_num >= kParallelDataNumSameShape) {
uint32_t min_core_num = 1;
int64_t max_core_num = std::max(min_core_num, aicpu::CpuKernelUtils::GetCPUNum(ctx) - kResvCpuNum);
if (data_num <= kParallelDataNumSameShapeMid) {
max_core_num = std::min(max_core_num, static_cast<int64_t>(4)); // up to 4 cpu cores
}
if (max_core_num > data_num) {
max_core_num = data_num;
}
uint32_t status = KERNEL_STATUS_OK;
auto sharder_floor_div = [&](int64_t start, int64_t end) {
uint32_t status_sharder = SpecialCompute<T>(type, start, end, in0, in1, out);
if (status_sharder != KERNEL_STATUS_OK) {
status = status_sharder;
}
};
KERNEL_HANDLE_ERROR(CpuKernelUtils::ParallelFor(ctx, data_num, data_num / max_core_num, sharder_floor_div),
"FloorDiv Compute failed.");
return status;
}
return SpecialCompute<T>(type, 0, data_num, in0, in1, out);
}
template <typename T>
uint32_t FloorDivCpuKernel::BcastParallelCompute(const CpuKernelContext &ctx, const Bcast &bcast) {
auto in0 = reinterpret_cast<T *>(ctx.Input(0)->GetData());
auto in1 = reinterpret_cast<T *>(ctx.Input(1)->GetData());
auto out = reinterpret_cast<T *>(ctx.Output(0)->GetData());
uint32_t min_core_num = 1;
int64_t max_core_num = std::max(min_core_num, aicpu::CpuKernelUtils::GetCPUNum(ctx) - kResvCpuNum);
int64_t data_num = ctx.Output(0)->NumElements();
if (data_num <= kParallelDataNumMid) {
max_core_num = std::min(max_core_num, static_cast<int64_t>(4)); // up to 4 cpu cores
}
if (max_core_num > data_num) {
max_core_num = data_num;
}
uint32_t status = KERNEL_STATUS_OK;
auto sharder_floor_div = [&](int64_t start, int64_t end) {
for (int64_t i = start; i < end; ++i) {
if (*(in1 + bcast.GetBroadcastYIndex(i)) == static_cast<T>(0)) {
KERNEL_LOG_ERROR("Invalid argumengt: Division by zero.");
status = KERNEL_STATUS_INNER_ERROR;
break;
}
*(out + i) = DivCal<T>(*(in0 + bcast.GetBroadcastXIndex(i)), *(in1 + bcast.GetBroadcastYIndex(i)));
}
};
KERNEL_HANDLE_ERROR(CpuKernelUtils::ParallelFor(ctx, data_num, data_num / max_core_num, sharder_floor_div),
"FloorDiv Compute failed.");
return status;
}
template <typename T>
uint32_t FloorDivCpuKernel::BcastCompute(const CpuKernelContext &ctx, const Bcast &bcast) {
auto in0 = reinterpret_cast<T *>(ctx.Input(0)->GetData());
auto in1 = reinterpret_cast<T *>(ctx.Input(1)->GetData());
auto out = reinterpret_cast<T *>(ctx.Output(0)->GetData());
int64_t data_num = ctx.Output(0)->NumElements();
if (data_num >= kParallelDataNum) {
return BcastParallelCompute<T>(ctx, bcast);
} else {
for (int64_t i = 0; i < data_num; ++i) {
if (*(in1 + bcast.GetBroadcastYIndex(i)) == static_cast<T>(0)) {
KERNEL_LOG_ERROR("Invalid argumengt: Division by zero.");
return KERNEL_STATUS_INNER_ERROR;
}
*(out + i) = DivCal<T>(*(in0 + bcast.GetBroadcastXIndex(i)), *(in1 + bcast.GetBroadcastYIndex(i)));
}
}
return KERNEL_STATUS_OK;
}
template <typename T>
uint32_t FloorDivCpuKernel::FloorDivCompute(const CpuKernelContext &ctx) {
Tensor *input0_tensor = ctx.Input(0);
auto input0_shape = input0_tensor->GetTensorShape()->GetDimSizes();
int64_t input0_elements_nums = input0_tensor->NumElements();
Tensor *input1_tensor = ctx.Input(1);
auto input1_shape = input1_tensor->GetTensorShape()->GetDimSizes();
int64_t input1_elements_nums = input1_tensor->NumElements();
bool no_bcast = (input0_shape == input1_shape) || (input0_elements_nums == 1) || (input1_elements_nums == 1);
if (no_bcast) {
return NoBcastCompute<T>(ctx);
} else {
Bcast bcast(input0_shape, input1_shape);
if (!bcast.IsValid()) {
KERNEL_LOG_ERROR("[%s] broadcast failed.", ctx.GetOpType().c_str());
return KERNEL_STATUS_PARAM_INVALID;
}
return BcastCompute<T>(ctx, bcast);
}
}
REGISTER_CPU_KERNEL(kFloorDiv, FloorDivCpuKernel);
} // namespace aicpu

View File

@ -0,0 +1,49 @@
/**
* Copyright (c) Huawei Technologies Co., Ltd. 2021-2021. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef AICPU_KERNELS_NORMALIZED_FLOORDIV_H_
#define AICPU_KERNELS_NORMALIZED_FLOORDIV_H_
#include "cpu_ops_kernel.h"
#include "utils/bcast.h"
namespace aicpu {
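// FloorDiv: element-wise floor division of two tensors with broadcasting;
// division by zero is reported as KERNEL_STATUS_INNER_ERROR.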
class FloorDivCpuKernel : public CpuKernel {
public:
FloorDivCpuKernel() = default;
~FloorDivCpuKernel() override = default;
uint32_t Compute(CpuKernelContext &ctx) override;
private:
uint32_t FloorDivParamCheck(const CpuKernelContext &ctx) const;
template <typename T>
uint32_t SpecialCompute(BcastShapeType type, int64_t start, int64_t end, const T *input1, const T *input2, T *output);
template <typename T>
uint32_t NoBcastCompute(const CpuKernelContext &ctx);
template <typename T>
uint32_t BcastCompute(const CpuKernelContext &ctx, const Bcast &bcast);
template <typename T>
uint32_t BcastParallelCompute(const CpuKernelContext &ctx, const Bcast &bcast);
template <typename T>
uint32_t FloorDivCompute(const CpuKernelContext &ctx);
};
} // namespace aicpu
#endif

View File

@ -0,0 +1,162 @@
/**
* Copyright (c) Huawei Technologies Co., Ltd. 2021-2021. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "log1p.h"
#include "cpu_kernel_utils.h"
#include "utils/eigen_tensor.h"
#include "utils/kernel_util.h"
namespace {
const uint32_t kOutputNum = 1;
const uint32_t kInputNum = 1;
const char *const kLog1p = "Log1p";
constexpr int64_t kParallelDataNums = 16 * 1024;
#define LOG1P_COMPUTE_CASE(DTYPE, TYPE, CTX) \
case (DTYPE): { \
uint32_t result = Log1pCompute<TYPE>(CTX); \
if (result != KERNEL_STATUS_OK) { \
KERNEL_LOG_ERROR("Log1p kernel compute failed."); \
return result; \
} \
break; \
}
#define LOG1P_COMPUTE_CASE2(DTYPE, TYPE, CTX) \
case (DTYPE): { \
uint32_t result = Log1pComputeComplex<TYPE>(CTX); \
if (result != KERNEL_STATUS_OK) { \
KERNEL_LOG_ERROR("Log1p kernel compute failed."); \
return result; \
} \
break; \
}
} // namespace
namespace aicpu {
uint32_t Log1pCpuKernel::Compute(CpuKernelContext &ctx) {
KERNEL_HANDLE_ERROR(NormalCheck(ctx, kInputNum, kOutputNum), "[%s] check input and output failed.", kLog1p);
KERNEL_HANDLE_ERROR(Log1pCheck(ctx), "[%s] check params failed.", kLog1p);
DataType data_type = ctx.Input(0)->GetDataType();
switch (data_type) {
LOG1P_COMPUTE_CASE(DT_FLOAT16, Eigen::half, ctx)
LOG1P_COMPUTE_CASE(DT_FLOAT, float, ctx)
LOG1P_COMPUTE_CASE(DT_DOUBLE, double, ctx)
LOG1P_COMPUTE_CASE2(DT_COMPLEX64, std::complex<float>, ctx)
LOG1P_COMPUTE_CASE2(DT_COMPLEX128, std::complex<double>, ctx)
default:
KERNEL_LOG_ERROR("Log1p kernel data type [%s] not support.", DTypeStr(data_type).c_str());
return KERNEL_STATUS_PARAM_INVALID;
}
return KERNEL_STATUS_OK;
}
uint32_t Log1pCpuKernel::Log1pCheck(const CpuKernelContext &ctx) const {
auto input_0 = ctx.Input(0);
auto output_0 = ctx.Output(0);
KERNEL_CHECK_NULLPTR(input_0->GetData(), KERNEL_STATUS_PARAM_INVALID, "Get input data failed.")
KERNEL_CHECK_NULLPTR(output_0->GetData(), KERNEL_STATUS_PARAM_INVALID, "Get output data failed")
KERNEL_CHECK_NULLPTR(input_0->GetTensorShape(), KERNEL_STATUS_PARAM_INVALID, "Get input tensor shape failed.")
return KERNEL_STATUS_OK;
}
template <typename T>
uint32_t Log1pCpuKernel::Log1pCompute(const CpuKernelContext &ctx) {
auto input_x = reinterpret_cast<T *>(ctx.Input(0)->GetData());
auto output_y = reinterpret_cast<T *>(ctx.Output(0)->GetData());
int64_t data_num = ctx.Input(0)->NumElements();
int64_t data_size = data_num * static_cast<int64_t>(sizeof(T));
if (data_size <= kParallelDataNums) {
for (int64_t i = 0; i < data_num; i++) {
KERNEL_CHECK_FALSE(*(input_x + i) >= static_cast<T>(-1), KERNEL_STATUS_PARAM_INVALID,
"[%llu] must be at least more than -1.", i);
*(output_y + i) = Eigen::numext::log1p(*(input_x + i));
}
} else {
uint32_t min_core_num = 1;
int64_t max_core_num = std::max(min_core_num, aicpu::CpuKernelUtils::GetCPUNum(ctx) - kResvCpuNum);
if (max_core_num > data_num) {
max_core_num = data_num;
}
auto shard_log1p = [&](size_t start, size_t end) {
for (size_t i = start; i < end; i++) {
KERNEL_CHECK_FALSE(*(input_x + i) >= static_cast<T>(-1), KERNEL_STATUS_PARAM_INVALID,
"[%llu] must be at least more than -1.", i);
*(output_y + i) = Eigen::numext::log1p(*(input_x + i));
}
return KERNEL_STATUS_OK;
};
KERNEL_HANDLE_ERROR(CpuKernelUtils::ParallelFor(ctx, data_num, data_num / max_core_num, shard_log1p),
"Log1p Compute failed.");
}
return KERNEL_STATUS_OK;
}
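// Complex variant: the real part of every element is validated against -1
// before Eigen::numext::log1p is applied element-wise; the serial/parallel
// split uses the same kParallelDataNums threshold as the real-valued path.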
template <typename T>
uint32_t Log1pCpuKernel::Log1pComputeComplex(const CpuKernelContext &ctx) {
auto input_x = reinterpret_cast<T *>(ctx.Input(0)->GetData());
auto output_y = reinterpret_cast<T *>(ctx.Output(0)->GetData());
auto data_type = ctx.Input(0)->GetDataType();
int64_t data_num = ctx.Input(0)->NumElements();
int64_t data_size = data_num * static_cast<int64_t>(sizeof(T));
typedef Eigen::Array<T, Eigen::Dynamic, Eigen::Dynamic> ArrayxXd;
ArrayxXd array_x(1, data_num);
if (data_size <= kParallelDataNums) {
if (data_type == DT_COMPLEX64) {
for (int64_t i = 0; i < data_num; i++) {
array_x(0, i) = *(input_x + i);
KERNEL_CHECK_FALSE(array_x(0, i).real() >= static_cast<float>(-1), KERNEL_STATUS_PARAM_INVALID,
"[%llu] must be at least more than -1.", i);
*(output_y + i) = Eigen::numext::log1p(*(input_x + i));
}
} else {
for (int64_t i = 0; i < data_num; i++) {
array_x(0, i) = *(input_x + i);
KERNEL_CHECK_FALSE(array_x(0, i).real() >= static_cast<double>(-1), KERNEL_STATUS_PARAM_INVALID,
"[%llu] must be at least more than -1.", i);
*(output_y + i) = Eigen::numext::log1p(*(input_x + i));
}
}
return KERNEL_STATUS_OK;
} else {
uint32_t min_core_num = 1;
int64_t max_core_num = std::max(min_core_num, aicpu::CpuKernelUtils::GetCPUNum(ctx) - kResvCpuNum);
if (max_core_num > data_num) {
max_core_num = data_num;
}
auto shard_log1p = [&](size_t start, size_t end) {
for (size_t i = start; i < end; i++) {
if (data_type == DT_COMPLEX64) {
array_x(0, i) = *(input_x + i);
KERNEL_CHECK_FALSE(array_x(0, i).real() >= static_cast<float>(-1), KERNEL_STATUS_PARAM_INVALID,
"[%llu] must be at least more than -1.", i);
*(output_y + i) = Eigen::numext::log1p(*(input_x + i));
} else {
array_x(0, i) = *(input_x + i);
KERNEL_CHECK_FALSE(array_x(0, i).real() >= static_cast<double>(-1), KERNEL_STATUS_PARAM_INVALID,
"[%llu] must be at least more than -1.", i);
*(output_y + i) = Eigen::numext::log1p(*(input_x + i));
}
}
return KERNEL_STATUS_OK;
};
KERNEL_HANDLE_ERROR(CpuKernelUtils::ParallelFor(ctx, data_num, data_num / max_core_num, shard_log1p),
"Log1p Compute failed.");
return KERNEL_STATUS_OK;
}
}
REGISTER_CPU_KERNEL(kLog1p, Log1pCpuKernel);
} // namespace aicpu

View File

@ -0,0 +1,38 @@
/**
* Copyright (c) Huawei Technologies Co., Ltd. 2020-2021. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef AICPU_KERNELS_NORMALIZED_LOG1P_H
#define AICPU_KERNELS_NORMALIZED_LOG1P_H
#include "cpu_ops_kernel.h"
namespace aicpu {
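// Log1p: computes log(1 + x) element-wise for floating-point and complex inputs.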
class Log1pCpuKernel : public CpuKernel {
public:
Log1pCpuKernel() = default;
~Log1pCpuKernel() override = default;
uint32_t Compute(CpuKernelContext &ctx) override;
private:
uint32_t Log1pCheck(const CpuKernelContext &ctx) const;
template <typename T>
uint32_t Log1pCompute(const CpuKernelContext &ctx);
template <typename T>
uint32_t Log1pComputeComplex(const CpuKernelContext &ctx);
};
} // namespace aicpu
#endif

View File

@ -0,0 +1,180 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "matrix_triangular_solve.h"
#include <chrono>
#include <fstream>
#include <iostream>
#include "Eigen/Core"
#include "complex"
#include "cpu_kernel_utils.h"
#include "utils/eigen_tensor.h"
#include "utils/kernel_util.h"
#include "kernel_log.h"
using namespace Eigen;
using namespace std;
namespace {
const uint32_t kOutputNum = 1;
const uint32_t kInputNum = 2;
const char *kMatrixTriangularSolve = "MatrixTriangularSolve";
constexpr int64_t kParallelDataNums = 16 * 1024;
#define MATRIXTRIANGULARSOLVE_COMPUTE_CASE(DTYPE, TYPE, CTX) \
case (DTYPE): { \
uint32_t result = MatrixTriangularSolveCompute<TYPE>(CTX); \
if (result != KERNEL_STATUS_OK) { \
KERNEL_LOG_ERROR("MatrixTriangularSolve kernel compute failed."); \
return result; \
} \
break; \
}
} // namespace
namespace aicpu {
uint32_t MatrixTriangularSolveCpuKernel::Compute(CpuKernelContext &ctx) {
KERNEL_HANDLE_ERROR(NormalCheck(ctx, kInputNum, kOutputNum),
"MatrixTriangularSolve check input and output number failed.");
KERNEL_HANDLE_ERROR(MatrixTriangularSolveCheck(ctx), "MatrixTriangularSolve check params failed.");
// check the data type of the inputs
auto data_type = ctx.Input(0)->GetDataType();
switch (data_type) {
MATRIXTRIANGULARSOLVE_COMPUTE_CASE(DT_FLOAT, float, ctx)
MATRIXTRIANGULARSOLVE_COMPUTE_CASE(DT_DOUBLE, double, ctx)
MATRIXTRIANGULARSOLVE_COMPUTE_CASE(DT_COMPLEX64, std::complex<float>, ctx)
MATRIXTRIANGULARSOLVE_COMPUTE_CASE(DT_COMPLEX128, std::complex<double>, ctx)
default:
KERNEL_LOG_ERROR("MatrixTriangularSolve kernel data type [%s] not support.", DTypeStr(data_type).c_str());
return KERNEL_STATUS_PARAM_INVALID;
}
return KERNEL_STATUS_OK;
}
uint32_t MatrixTriangularSolveCpuKernel::MatrixTriangularSolveCheck(CpuKernelContext &ctx) {
Tensor *in_matrix = ctx.Input(0);
Tensor *in_rhs = ctx.Input(1);
// check same data type constraint
auto in_type0 = in_matrix->GetDataType();
auto in_type1 = in_rhs->GetDataType();
KERNEL_CHECK_FALSE((in_type0 == in_type1), KERNEL_STATUS_PARAM_INVALID,
"The data type of input1 [%s] need be same with "
"input0 [%s].",
DTypeStr(in_type1).c_str(), DTypeStr(in_type0).c_str())
// check the number of matrix
auto in_shape0 = in_matrix->GetTensorShape();
auto in_shape1 = in_rhs->GetTensorShape();
std::vector<int64_t> dims0 = in_shape0->GetDimSizes();
std::vector<int64_t> dims1 = in_shape1->GetDimSizes();
// Check the shape of two inputs
if (dims0[0] != dims1[0]) {
KERNEL_LOG_ERROR("The shapes of two inputs are not matched");
return KERNEL_STATUS_PARAM_INVALID;
}
// check square
int m = dims0.size();
if (dims0[m - 2] != dims0[m - 1] || dims0[m - 1] == 0) {
KERNEL_LOG_ERROR("The input0 must be one or more squares.");
return KERNEL_STATUS_PARAM_INVALID;
}
return KERNEL_STATUS_OK;
}
template <typename T>
uint32_t MatrixTriangularSolveCpuKernel::MatrixTriangularSolveCompute(CpuKernelContext &ctx) {
Tensor *matrix_tensor = ctx.Input(0);
Tensor *rhs_tensor = ctx.Input(1);
Tensor *y_tensor = ctx.Output(0);
auto input_matrix = reinterpret_cast<T *>(matrix_tensor->GetData());
KERNEL_CHECK_NULLPTR(input_matrix, KERNEL_STATUS_PARAM_INVALID, "Get input data0 failed.")
auto input_rhs = reinterpret_cast<T *>(rhs_tensor->GetData());
KERNEL_CHECK_NULLPTR(input_rhs, KERNEL_STATUS_PARAM_INVALID, "Get input data1 failed.")
auto output_y = reinterpret_cast<T *>(y_tensor->GetData());
KERNEL_CHECK_NULLPTR(output_y, KERNEL_STATUS_PARAM_INVALID, "Get output data failed.")
AttrValue *lower_attr = ctx.GetAttr("lower");
KERNEL_CHECK_NULLPTR(lower_attr, KERNEL_STATUS_PARAM_INVALID, "Get attr [lower] failed.");
AttrValue *adjoint_attr = ctx.GetAttr("adjoint");
KERNEL_CHECK_NULLPTR(adjoint_attr, KERNEL_STATUS_PARAM_INVALID, "Get attr [adjoint] failed.");
bool lower_data = lower_attr->GetBool();
bool adjoint_data = adjoint_attr->GetBool();
auto matrix_shape = matrix_tensor->GetTensorShape();
auto rhs_shape = rhs_tensor->GetTensorShape();
auto y_shape = y_tensor->GetTensorShape();
// Get the number of elements
auto input1_num = matrix_tensor->NumElements();
// slice
std::vector<int64_t> matrix_dims = matrix_shape->GetDimSizes();
auto last_matrix_dims = *(matrix_dims.end() - 1);
size_t matrix_size = last_matrix_dims * last_matrix_dims; // size of a matrix
size_t matrix_num = input1_num / matrix_size; // number of matrices
std::vector<int64_t> rhs_dims = rhs_shape->GetDimSizes();
auto last_rhs_dims = *(rhs_dims.end() - 1);
size_t rhs_size = last_matrix_dims * last_rhs_dims;
auto data_size = matrix_num * matrix_size;
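// Each batch element is a last_matrix_dims x last_matrix_dims triangular system
// solved against a last_matrix_dims x last_rhs_dims right-hand side; the "lower"
// and "adjoint" attributes select the triangle and whether its (conjugate)
// transpose is used.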
auto shard_matrix_triangular_solve = [&](size_t start, size_t end) {
for (size_t k = start; k < end; ++k) {
Eigen::Map<Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>> eigen_input(
input_matrix + k * matrix_size, last_matrix_dims, last_matrix_dims);
Eigen::Map<Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>> eigen_rhs(
input_rhs + k * rhs_size, last_matrix_dims, last_rhs_dims);
Eigen::Map<Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>> eigen_output(
output_y + k * rhs_size, last_matrix_dims, last_rhs_dims);
if (lower_data) {
auto triangle = eigen_input.template triangularView<Eigen::Lower>();
if (adjoint_data) {
eigen_output.noalias() = triangle.adjoint().solve(eigen_rhs);
} else {
eigen_output.noalias() = triangle.solve(eigen_rhs);
}
} else {
auto triangle = eigen_input.template triangularView<Eigen::Upper>();
if (adjoint_data) {
eigen_output.noalias() = triangle.adjoint().solve(eigen_rhs);
} else {
eigen_output.noalias() = triangle.solve(eigen_rhs);
}
}
}
};
if (data_size < kParallelDataNums) {
shard_matrix_triangular_solve(0, matrix_num);
} else {
uint32_t min_core_num = 1;
uint64_t max_core_num = std::max(min_core_num, aicpu::CpuKernelUtils::GetCPUNum(ctx) - 2);
if (max_core_num > matrix_num) {
max_core_num = matrix_num;
}
KERNEL_HANDLE_ERROR(
CpuKernelUtils::ParallelFor(ctx, matrix_num, matrix_num / max_core_num, shard_matrix_triangular_solve),
"MatrixTriangularSolve Compute failed.");
}
return KERNEL_STATUS_OK;
}
REGISTER_CPU_KERNEL(kMatrixTriangularSolve, MatrixTriangularSolveCpuKernel);
} // namespace aicpu

View File

@ -0,0 +1,42 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef AICPU_KERNELS_NORMALIZED_MATRIXTRIANGULARSOLVE_H_
#define AICPU_KERNELS_NORMALIZED_MATRIXTRIANGULARSOLVE_H_
#include "cpu_ops_kernel.h"
#include "Eigen/Core"
namespace aicpu {
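// MatrixTriangularSolve: solves a batch of triangular linear systems for
// float, double, complex64 and complex128 inputs.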
class MatrixTriangularSolveCpuKernel : public CpuKernel {
public:
MatrixTriangularSolveCpuKernel() = default;
~MatrixTriangularSolveCpuKernel() override = default;
protected:
uint32_t Compute(CpuKernelContext &ctx) override;
private:
bool lower;
bool adjoint;
template <typename T>
static uint32_t MatrixTriangularSolveCompute(CpuKernelContext &ctx);
static uint32_t MatrixTriangularSolveCheck(CpuKernelContext &ctx);
};
} // namespace aicpu
#endif

View File

@ -0,0 +1,127 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "maximum_grad_grad.h"
#include <fstream>
#include <iostream>
#include "cpu_kernel_utils.h"
#include "utils/eigen_tensor.h"
#include "utils/kernel_util.h"
namespace {
constexpr uint32_t kMaximumGradGradInputNum = 4;
constexpr uint32_t kMaximumGradGradOutputNum = 3;
const char *kMaximumGradGrad = "MaximumGradGrad";
#define MAXIMUMGRADGRAD_COMPUTE_CASE(DTYPE, TYPE, CTX) \
case (DTYPE): { \
uint32_t result = MaximumGradGradCompute<TYPE>(CTX); \
if (result != KERNEL_STATUS_OK) { \
KERNEL_LOG_ERROR("MaximumGradGrad kernel compute failed."); \
return result; \
} \
break; \
}
} // namespace
namespace aicpu {
uint32_t MaximumGradGradCpuKernel::Compute(CpuKernelContext &ctx) {
// check params
KERNEL_HANDLE_ERROR(NormalCheck(ctx, kMaximumGradGradInputNum, kMaximumGradGradOutputNum),
"MaximumGradGrad check input and output number failed.");
KERNEL_HANDLE_ERROR(MaximumGradGradParamCheck(ctx), "MaximumGradGrad check params failed.");
auto data_type = ctx.Input(0)->GetDataType();
switch (data_type) {
MAXIMUMGRADGRAD_COMPUTE_CASE(DT_INT32, int32_t, ctx)
MAXIMUMGRADGRAD_COMPUTE_CASE(DT_FLOAT, float, ctx)
MAXIMUMGRADGRAD_COMPUTE_CASE(DT_FLOAT16, Eigen::half, ctx)
default:
KERNEL_LOG_ERROR("The data type of input is not support, input data type is [%s].", DTypeStr(data_type).c_str());
return KERNEL_STATUS_PARAM_INVALID;
}
return KERNEL_STATUS_OK;
}
uint32_t MaximumGradGradCpuKernel::MaximumGradGradParamCheck(CpuKernelContext &ctx) {
// the non null of inputs and outputs has been verified in NormalCheck
Tensor *x1 = ctx.Input(0);
Tensor *x2 = ctx.Input(1);
Tensor *grad_y1 = ctx.Input(2);
Tensor *grad_y2 = ctx.Input(3);
// type check
DataType grad_y1_type = grad_y1->GetDataType();
DataType grad_y2_type = grad_y2->GetDataType();
DataType x1_type = x1->GetDataType();
DataType x2_type = x2->GetDataType();
KERNEL_CHECK_FALSE(((grad_y1_type == grad_y2_type) && (grad_y2_type == x1_type) && (x1_type == x2_type)),
KERNEL_STATUS_PARAM_INVALID,
"The data type of grad_y1 [%s], grad_y2 [%s], x1 [%s] and "
"x2 [%s] need to be same.",
DTypeStr(grad_y1_type).c_str(), DTypeStr(grad_y2_type).c_str(), DTypeStr(x1_type).c_str(),
DTypeStr(x2_type).c_str())
// shape check
auto grad_y1_shape = grad_y1->GetTensorShape()->GetDimSizes();
auto grad_y2_shape = grad_y2->GetTensorShape()->GetDimSizes();
auto x1_shape = x1->GetTensorShape()->GetDimSizes();
auto x2_shape = x2->GetTensorShape()->GetDimSizes();
KERNEL_CHECK_FALSE(grad_y1_shape == x1_shape, KERNEL_STATUS_PARAM_INVALID, "Mismatch in shape of grad_y1 and x1.");
KERNEL_CHECK_FALSE(grad_y2_shape == x2_shape, KERNEL_STATUS_PARAM_INVALID, "Mismatch in shape of grad_y2 and x2.");
return KERNEL_STATUS_OK;
}
template <typename T>
uint32_t MaximumGradGradCpuKernel::MaximumGradGradCompute(CpuKernelContext &ctx) {
Tensor *input0_tensor = ctx.Input(0);
Tensor *input1_tensor = ctx.Input(1);
auto input0_shape = input0_tensor->GetTensorShape()->GetDimSizes();
auto input1_shape = input1_tensor->GetTensorShape()->GetDimSizes();
Bcast bcast(input0_shape, input1_shape);
if (!bcast.IsValid()) {
KERNEL_LOG_ERROR("[%s] broadcast failed.", ctx.GetOpType().c_str());
return KERNEL_STATUS_PARAM_INVALID;
}
return BcastCompute<T>(ctx, bcast);
}
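// Replays the branch taken by the forward Maximum: where x1 >= x2 the
// second-order gradient comes from grad_y1 (input 2), otherwise from
// grad_y2 (input 3); the first element of outputs 0 and 1 is set to zero.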
template <typename T>
uint32_t MaximumGradGradCpuKernel::BcastCompute(CpuKernelContext &ctx, Bcast &bcast) {
auto in0 = reinterpret_cast<T *>(ctx.Input(0)->GetData());
auto in1 = reinterpret_cast<T *>(ctx.Input(1)->GetData());
auto in2 = reinterpret_cast<T *>(ctx.Input(2)->GetData());
auto in3 = reinterpret_cast<T *>(ctx.Input(3)->GetData());
auto out0 = reinterpret_cast<T *>(ctx.Output(0)->GetData());
auto out1 = reinterpret_cast<T *>(ctx.Output(1)->GetData());
auto out2 = reinterpret_cast<T *>(ctx.Output(2)->GetData());
*out0 = static_cast<T>(0);
*out1 = static_cast<T>(0);
int64_t data_num = ctx.Output(2)->NumElements();
for (int64_t i = 0; i < data_num; ++i) {
if (*(in0 + bcast.GetBroadcastXIndex(i)) >= *(in1 + bcast.GetBroadcastYIndex(i))) {
*(out2 + i) = *(in2 + bcast.GetBroadcastXIndex(i));
} else {
*(out2 + i) = *(in3 + bcast.GetBroadcastYIndex(i));
}
}
return KERNEL_STATUS_OK;
}
REGISTER_CPU_KERNEL(kMaximumGradGrad, MaximumGradGradCpuKernel);
} // namespace aicpu

View File

@ -0,0 +1,41 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef AICPU_KERNELS_NORMALIZED_MAXIMUM_GRAD_GRAD_H_
#define AICPU_KERNELS_NORMALIZED_MAXIMUM_GRAD_GRAD_H_
#include "cpu_ops_kernel.h"
#include "utils/bcast.h"
namespace aicpu {
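// MaximumGradGrad: computes the second-order gradient of Maximum for int32,
// float32 and float16 inputs, broadcasting x1 against x2.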
class MaximumGradGradCpuKernel : public CpuKernel {
public:
MaximumGradGradCpuKernel() = default;
~MaximumGradGradCpuKernel() override = default;
protected:
uint32_t Compute(CpuKernelContext &ctx) override;
private:
uint32_t MaximumGradGradParamCheck(CpuKernelContext &ctx);
template <typename T>
uint32_t MaximumGradGradCompute(CpuKernelContext &ctx);
template <typename T>
uint32_t BcastCompute(CpuKernelContext &ctx, Bcast &bcast);
};
} // namespace aicpu
#endif

View File

@ -0,0 +1,286 @@
/**
* Copyright 2021 Harbin Institute of Technology
* Copyright 2021 Huawei Technologies Co., Ltd.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "maxpool.h"
#include <Eigen/Dense>
#include <string>
#include <vector>
#include "cpu_kernel_utils.h"
#include "utils/eigen_tensor.h"
#include "utils/kernel_util.h"
namespace {
const char *MAXPOOL = "MaxPool";
constexpr uint32_t kMaxPoolInputNum = 1;
constexpr uint32_t kMaxPoolOutputNum = 1;
constexpr int64_t kParallelNum = 64 * 1024;
struct PoolParams {
int depth;
int tensor_cols;
int tensor_rows;
int tensor_batch;
int ksize_rows;
int ksize_cols;
int ksize_depth;
int strides_rows;
int strides_cols;
int strides_depth;
int64_t out_height;
int64_t out_width;
int out_depth;
int64_t pad_top;
int64_t pad_bottom;
int64_t pad_left;
int64_t pad_right;
};
} // namespace
namespace aicpu {
uint32_t GetOutputSize(int input_size, int kernel_size, int stride, const std::string &padding, int64_t *output_size,
int64_t *padding_before, int64_t *padding_after) {
KERNEL_CHECK_FALSE(stride > 0, KERNEL_STATUS_PARAM_INVALID, "[MaxPool] Stride must be positive.");
std::string same("SAME"), valid("VALID");
if (valid == padding) {
*output_size = (input_size - kernel_size + stride) / stride;
*padding_before = 0;
*padding_after = 0;
} else if (same == padding) {
*output_size = (input_size + stride - 1) / stride;
const int64_t padding_need =
std::max(static_cast<int64_t>(0), (*output_size - 1) * stride + kernel_size - input_size);
*padding_before = padding_need / 2;
*padding_after = padding_need - *padding_before;
} else {
KERNEL_LOG_ERROR("[MaxPool] Padding is invalid.");
return KERNEL_STATUS_PARAM_INVALID;
}
if (*output_size < 0) {
KERNEL_LOG_ERROR("[MaxPool] Computed output size is negative.");
return KERNEL_STATUS_PARAM_INVALID;
}
return KERNEL_STATUS_OK;
}
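// Worked example (illustrative only, using the formulas above): for input_size = 5,
// kernel_size = 3, stride = 2:
//   VALID: *output_size = (5 - 3 + 2) / 2 = 2, with no padding.
//   SAME : *output_size = (5 + 2 - 1) / 2 = 3,
//          padding_need = max(0, (3 - 1) * 2 + 3 - 5) = 2,
//          *padding_before = 1, *padding_after = 1.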
uint32_t ConstructPoolParams(aicpu::CpuKernelContext &ctx, const aicpu::TensorShape &data_format, PoolParams &params) {
Format format = data_format.GetFormat();
KERNEL_CHECK_FALSE((format == FORMAT_NHWC || format == FORMAT_NCHW), KERNEL_STATUS_PARAM_INVALID,
"[MaxPool] Format is not NHWC or NCHW.");
std::vector<int64_t> tensor_in_shapes = data_format.GetDimSizes();
if (tensor_in_shapes.size() != 4) {
KERNEL_LOG_ERROR("[MaxPool] Input tensor must have 2 spacial dimensions.");
return KERNEL_STATUS_PARAM_INVALID;
}
std::vector<int64_t> ksize = ctx.GetAttr("ksize")->GetListInt(), strides = ctx.GetAttr("strides")->GetListInt();
std::string padding = ctx.GetAttr("padding")->GetString();
std::string data_format_str = "";
if (ctx.GetAttr("data_format") == nullptr) {
KERNEL_LOG_INFO("[MaxPool] Attr data_format is empty, using default value NHWC.");
format = FORMAT_NHWC;
} else {
std::map<std::string, aicpu::Format> format_str_to_enum_map = {{"NHWC", FORMAT_NHWC}, {"NCHW", FORMAT_NCHW}};
data_format_str = ctx.GetAttr("data_format")->GetString();
KERNEL_HANDLE_ERROR(format_str_to_enum_map.find(data_format_str) == format_str_to_enum_map.end(),
"[MaxPool] data_format string is invalid.");
format = format_str_to_enum_map[data_format_str];
}
switch (format) {
case FORMAT_NHWC:
params.depth = tensor_in_shapes[kFormatNHWCIndexC];
params.tensor_rows = tensor_in_shapes[kFormatNHWCIndexH];
params.tensor_cols = tensor_in_shapes[kFormatNHWCIndexW];
params.tensor_batch = tensor_in_shapes[kFormatNHWCIndexN];
params.ksize_rows = ksize[kFormatNHWCIndexH];
params.ksize_cols = ksize[kFormatNHWCIndexW];
params.ksize_depth = ksize[kFormatNHWCIndexC];
params.strides_rows = strides[kFormatNHWCIndexH];
params.strides_cols = strides[kFormatNHWCIndexW];
params.strides_depth = strides[kFormatNHWCIndexC];
break;
case FORMAT_NCHW:
params.depth = tensor_in_shapes[kFormatNCHWIndexC];
params.tensor_rows = tensor_in_shapes[kFormatNCHWIndexH];
params.tensor_cols = tensor_in_shapes[kFormatNCHWIndexW];
params.tensor_batch = tensor_in_shapes[kFormatNCHWIndexN];
params.ksize_rows = ksize[kFormatNCHWIndexH];
params.ksize_cols = ksize[kFormatNCHWIndexW];
params.ksize_depth = ksize[kFormatNCHWIndexC];
params.strides_rows = strides[kFormatNCHWIndexH];
params.strides_cols = strides[kFormatNCHWIndexW];
params.strides_depth = strides[kFormatNCHWIndexC];
break;
default:
KERNEL_LOG_ERROR("[MaxPool] Format is not NHWC or NCHW, current is [%s].", FormatToSerialString(format).c_str());
return KERNEL_STATUS_PARAM_INVALID;
}
auto ret1 = GetOutputSize(params.tensor_rows, params.ksize_rows, params.strides_rows, padding, &params.out_height,
&params.pad_top, &params.pad_bottom),
ret2 = GetOutputSize(params.tensor_cols, params.ksize_cols, params.strides_cols, padding, &params.out_width,
&params.pad_left, &params.pad_right);
KERNEL_CHECK_FALSE(ret1 == KERNEL_STATUS_OK && ret2 == KERNEL_STATUS_OK, KERNEL_STATUS_PARAM_INVALID,
"[MaxPool] An error occurred while calculating output size.");
params.out_depth = params.depth;
return KERNEL_STATUS_OK;
}
template <class T>
uint32_t SpatialMaxPool(CpuKernelContext &ctx, const PoolParams &params) {
Tensor *input = ctx.Input(kFirstInputIndex);
Tensor *output = ctx.Output(kFirstOutputIndex);
const T *raw_input_data = static_cast<T *>(input->GetData());
T *raw_output_data = static_cast<T *>(output->GetData());
auto shard_NCHW = [&params, &raw_input_data, &raw_output_data](int64_t start, int64_t limit) {
typedef Eigen::Map<const Eigen::Array<T, Eigen::Dynamic, Eigen::Dynamic>> ConstEigenArrayMap;
typedef Eigen::Map<Eigen::Array<T, Eigen::Dynamic, Eigen::Dynamic>> EigenArrayMap;
const int64_t batch_size = limit;
const int64_t X_W = static_cast<int64_t>(params.tensor_cols), X_H = static_cast<int64_t>(params.tensor_rows);
const int64_t Y_W = params.out_width, Y_H = params.out_height;
const int64_t X_HxW = X_H * X_W, Y_HxW = Y_H * Y_W;
const int64_t X_stride = X_HxW, Y_stride = Y_HxW;
const int64_t stride_h = static_cast<int64_t>(params.strides_rows),
stride_w = static_cast<int64_t>(params.strides_cols);
const int64_t pad_t = params.pad_top, pad_l = params.pad_left;
const int64_t kernel_h = static_cast<int64_t>(params.ksize_rows),
kernel_w = static_cast<int64_t>(params.ksize_cols);
const T *x_ptr = raw_input_data + start * X_stride;
T *y_ptr = raw_output_data + start * Y_stride;
for (int64_t i = start; i < batch_size; ++i) {
ConstEigenArrayMap x_arr(x_ptr, X_W, X_H);
EigenArrayMap y_arr(y_ptr, Y_W, Y_H);
for (int64_t h = 0; h < Y_H; ++h) {
const int64_t t = std::max(h * stride_h - pad_t, static_cast<int64_t>(0));
const int64_t b = std::min(h * stride_h - pad_t + kernel_h, X_H);
for (int64_t w = 0; w < Y_W; ++w) {
const int64_t l = std::max(w * stride_w - pad_l, static_cast<int64_t>(0));
const int64_t r = std::min(w * stride_w - pad_l + kernel_w, X_W);
const int64_t y = h * Y_W + w;
y_arr(y) = x_arr.block(l, t, r - l, b - t).maxCoeff();
}
}
x_ptr += X_stride;
y_ptr += Y_stride;
}
};
auto shard_NHWC = [&params, &raw_input_data, &raw_output_data](int64_t start, int64_t limit) {
typedef Eigen::Map<const Eigen::Array<T, Eigen::Dynamic, Eigen::Dynamic>> ConstEigenArrayMap;
typedef Eigen::Map<Eigen::Array<T, Eigen::Dynamic, Eigen::Dynamic>> EigenArrayMap;
const int64_t batch_size = limit;
const int64_t X_W = static_cast<int64_t>(params.tensor_cols), X_H = static_cast<int64_t>(params.tensor_rows);
const int64_t Y_W = params.out_width, Y_H = params.out_height;
const int64_t X_HxW = X_H * X_W, Y_HxW = Y_H * Y_W;
const int64_t C = static_cast<int64_t>(params.depth);
const int64_t X_stride = X_HxW * C, Y_stride = Y_HxW * C;
const int64_t stride_h = static_cast<int64_t>(params.strides_rows),
stride_w = static_cast<int64_t>(params.strides_cols);
const int64_t pad_t = params.pad_top, pad_l = params.pad_left;
const int64_t kernel_h = static_cast<int64_t>(params.ksize_rows),
kernel_w = static_cast<int64_t>(params.ksize_cols);
const T *x_ptr = raw_input_data + start * X_stride;
T *y_ptr = raw_output_data + start * Y_stride;
for (int64_t i = start; i < batch_size; ++i) {
ConstEigenArrayMap x_arr(x_ptr, C, X_HxW);
EigenArrayMap y_arr(y_ptr, C, Y_HxW);
for (int64_t h = 0; h < Y_H; ++h) {
const int64_t t = std::max(h * stride_h - pad_t, static_cast<int64_t>(0));
const int64_t b = std::min(h * stride_h - pad_t + kernel_h, X_H);
for (int64_t w = 0; w < Y_W; ++w) {
const int64_t l = std::max(w * stride_w - pad_l, static_cast<int64_t>(0));
const int64_t r = std::min(w * stride_w - pad_l + kernel_w, X_W);
const int64_t y = h * Y_W + w;
y_arr.col(y).setConstant(Eigen::NumTraits<T>::lowest());
for (int64_t xi = t; xi < b; ++xi) {
for (int64_t yj = l; yj < r; ++yj) {
y_arr.col(y) = y_arr.col(y).max(x_arr.col(xi * X_W + yj));
}
}
}
}
x_ptr += X_stride;
y_ptr += Y_stride;
}
};
int64_t total_elements = params.tensor_batch * params.tensor_cols * params.tensor_rows * params.depth;
if (ctx.GetAttr("data_format") != nullptr && ctx.GetAttr("data_format")->GetString() == "NCHW") {
int64_t total_images = params.tensor_batch * params.depth;
KERNEL_LOG_INFO("[MaxPool] Calling new shard_NCHW");
if (total_elements <= kParallelNum) {
shard_NCHW(0, total_images);
return KERNEL_STATUS_OK;
} else {
uint32_t max_core_num = aicpu::CpuKernelUtils::GetCPUNum(ctx);
max_core_num = std::min(total_images, static_cast<int64_t>(max_core_num));
return CpuKernelUtils::ParallelFor(ctx, total_images, total_images / max_core_num, shard_NCHW);
}
} else {
int64_t total_images_with_chann = params.tensor_batch;
KERNEL_LOG_INFO("[MaxPool] Calling new shard_NHWC");
if (total_elements <= kParallelNum) {
shard_NHWC(0, total_images_with_chann);
return KERNEL_STATUS_OK;
} else {
uint32_t max_core_num = aicpu::CpuKernelUtils::GetCPUNum(ctx);
max_core_num = std::min(total_images_with_chann, static_cast<int64_t>(max_core_num));
return CpuKernelUtils::ParallelFor(ctx, total_images_with_chann, total_images_with_chann / max_core_num,
shard_NHWC);
}
}
}
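// Layout note (illustrative, based on Eigen's default column-major storage): mapping a
// row-major H x W image with ConstEigenArrayMap x_arr(x_ptr, X_W, X_H) gives
// x_arr(w, h) == x_ptr[h * X_W + w]. In shard_NCHW, x_arr.block(l, t, r - l, b - t)
// therefore selects the pooling window spanning widths [l, r) and heights [t, b), and
// maxCoeff() reduces it to the single output value written at y = h * Y_W + w.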
template <class T>
uint32_t ComputeMaxPoolImpl(CpuKernelContext &ctx) {
TensorShape ts = *(ctx.Input(kFirstInputIndex)->GetTensorShape());
PoolParams params;
KERNEL_CHECK_FALSE(ConstructPoolParams(ctx, ts, params) == KERNEL_STATUS_OK, KERNEL_STATUS_PARAM_INVALID,
"[MaxPool] Pooling parameters construct failed.")
  return SpatialMaxPool<T>(ctx, params);
}
uint32_t MaxPoolCpuKernel::Compute(CpuKernelContext &ctx) {
const std::vector<std::string> required_attrs = {"ksize", "strides", "padding"};
KERNEL_HANDLE_ERROR(NormalCheck(ctx, kMaxPoolInputNum, kMaxPoolOutputNum, required_attrs),
"[MaxPool] Check input and output number failed.");
DataType input_type = ctx.Input(kFirstInputIndex)->GetDataType();
switch (input_type) {
case DT_FLOAT16:
return ComputeMaxPoolImpl<Eigen::half>(ctx);
case DT_FLOAT:
return ComputeMaxPoolImpl<float>(ctx);
case DT_DOUBLE:
return ComputeMaxPoolImpl<double>(ctx);
case DT_INT8:
return ComputeMaxPoolImpl<int8_t>(ctx);
case DT_INT16:
return ComputeMaxPoolImpl<int16_t>(ctx);
case DT_INT32:
return ComputeMaxPoolImpl<int32_t>(ctx);
case DT_INT64:
return ComputeMaxPoolImpl<int64_t>(ctx);
case DT_UINT8:
return ComputeMaxPoolImpl<uint8_t>(ctx);
case DT_UINT16:
return ComputeMaxPoolImpl<uint16_t>(ctx);
default:
KERNEL_LOG_ERROR("[MaxPool] Data type [%s] is not supported.", DTypeStr(input_type).c_str());
return KERNEL_STATUS_PARAM_INVALID;
}
return KERNEL_STATUS_OK;
}
REGISTER_CPU_KERNEL(MAXPOOL, MaxPoolCpuKernel);
} // namespace aicpu

View File

@ -0,0 +1,27 @@
/**
* Copyright 2021 Harbin Institute of Technology
* Copyright 2021 Huawei Technologies Co., Ltd.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "cpu_ops_kernel.h"
#include "cpu_types.h"
namespace aicpu {
class MaxPoolCpuKernel : public CpuKernel {
public:
~MaxPoolCpuKernel() = default;
uint32_t Compute(CpuKernelContext &ctx) override;
};
} // namespace aicpu
#endif

View File

@ -0,0 +1,129 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "minimum_grad_grad.h"
#include <fstream>
#include <iostream>
#include "cpu_kernel_utils.h"
#include "utils/eigen_tensor.h"
#include "utils/kernel_util.h"
namespace {
constexpr uint32_t kMinimumGradGradInputNum = 4;
constexpr uint32_t kMinimumGradGradOutputNum = 3;
const char *kMinimumGradGrad = "MinimumGradGrad";
#define MINIMUMGRADGRAD_COMPUTE_CASE(DTYPE, TYPE, CTX) \
case (DTYPE): { \
uint32_t result = MinimumGradGradCompute<TYPE>(CTX); \
if (result != KERNEL_STATUS_OK) { \
KERNEL_LOG_ERROR("MinimumGradGrad kernel compute failed."); \
return result; \
} \
break; \
}
} // namespace
namespace aicpu {
uint32_t MinimumGradGradCpuKernel::Compute(CpuKernelContext &ctx) {
// check params
KERNEL_HANDLE_ERROR(NormalCheck(ctx, kMinimumGradGradInputNum, kMinimumGradGradOutputNum),
"MinimumGradGrad check input and output number failed.");
KERNEL_HANDLE_ERROR(MinimumGradGradParamCheck(ctx), "MinimumGradGrad check params failed.");
auto data_type = ctx.Input(0)->GetDataType();
switch (data_type) {
MINIMUMGRADGRAD_COMPUTE_CASE(DT_INT32, int32_t, ctx)
MINIMUMGRADGRAD_COMPUTE_CASE(DT_FLOAT, float, ctx)
MINIMUMGRADGRAD_COMPUTE_CASE(DT_FLOAT16, Eigen::half, ctx)
default:
KERNEL_LOG_ERROR("The data type of input is not support, input data type is [%s].", DTypeStr(data_type).c_str());
return KERNEL_STATUS_PARAM_INVALID;
}
return KERNEL_STATUS_OK;
}
uint32_t MinimumGradGradCpuKernel::MinimumGradGradParamCheck(CpuKernelContext &ctx) {
// the non null of inputs and outputs has been verified in
// NormalCheck
Tensor *x1 = ctx.Input(0);
Tensor *x2 = ctx.Input(1);
Tensor *grad_y1 = ctx.Input(2);
Tensor *grad_y2 = ctx.Input(3);
// type check
DataType grad_y1_type = grad_y1->GetDataType();
DataType grad_y2_type = grad_y2->GetDataType();
DataType x1_type = x1->GetDataType();
DataType x2_type = x2->GetDataType();
KERNEL_CHECK_FALSE(((grad_y1_type == grad_y2_type) && (grad_y2_type == x1_type) && (x1_type == x2_type)),
KERNEL_STATUS_PARAM_INVALID,
"The data type of grad_y1 [%s], grad_y2 [%s], x1 [%s] and "
"x2 [%s] need to be same.",
DTypeStr(grad_y1_type).c_str(), DTypeStr(grad_y2_type).c_str(), DTypeStr(x1_type).c_str(),
DTypeStr(x2_type).c_str())
// shape check
auto grad_y1_shape = grad_y1->GetTensorShape()->GetDimSizes();
auto grad_y2_shape = grad_y2->GetTensorShape()->GetDimSizes();
auto x1_shape = x1->GetTensorShape()->GetDimSizes();
auto x2_shape = x2->GetTensorShape()->GetDimSizes();
KERNEL_CHECK_FALSE(grad_y1_shape == x1_shape, KERNEL_STATUS_PARAM_INVALID, "Mismatch in shape of grad_y1 and x1.");
KERNEL_CHECK_FALSE(grad_y2_shape == x2_shape, KERNEL_STATUS_PARAM_INVALID, "Mismatch in shape of grad_y2 and x2.");
return KERNEL_STATUS_OK;
}
template <typename T>
uint32_t MinimumGradGradCpuKernel::MinimumGradGradCompute(CpuKernelContext &ctx) {
Tensor *input0_tensor = ctx.Input(0);
Tensor *input1_tensor = ctx.Input(1);
auto input0_shape = input0_tensor->GetTensorShape()->GetDimSizes();
auto input1_shape = input1_tensor->GetTensorShape()->GetDimSizes();
Bcast bcast(input0_shape, input1_shape);
if (!bcast.IsValid()) {
KERNEL_LOG_ERROR("[%s] broadcast failed.", ctx.GetOpType().c_str());
return KERNEL_STATUS_PARAM_INVALID;
}
return BcastCompute<T>(ctx, bcast);
}
template <typename T>
uint32_t MinimumGradGradCpuKernel::BcastCompute(CpuKernelContext &ctx, Bcast &bcast) {
auto in0 = reinterpret_cast<T *>(ctx.Input(0)->GetData());
auto in1 = reinterpret_cast<T *>(ctx.Input(1)->GetData());
auto in2 = reinterpret_cast<T *>(ctx.Input(2)->GetData());
auto in3 = reinterpret_cast<T *>(ctx.Input(3)->GetData());
auto out0 = reinterpret_cast<T *>(ctx.Output(0)->GetData());
auto out1 = reinterpret_cast<T *>(ctx.Output(1)->GetData());
auto out2 = reinterpret_cast<T *>(ctx.Output(2)->GetData());
*out0 = static_cast<T>(0);
*out1 = static_cast<T>(0);
int64_t data_num = ctx.Output(2)->NumElements();
for (int64_t i = 0; i < data_num; ++i) {
if (*(in0 + bcast.GetBroadcastXIndex(i)) <= *(in1 + bcast.GetBroadcastYIndex(i))) {
*(out2 + i) = *(in2 + bcast.GetBroadcastXIndex(i));
} else {
*(out2 + i) = *(in3 + bcast.GetBroadcastYIndex(i));
}
}
return KERNEL_STATUS_OK;
}
REGISTER_CPU_KERNEL(kMinimumGradGrad, MinimumGradGradCpuKernel);
} // namespace aicpu

View File

@ -0,0 +1,44 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef AICPU_KERNELS_NORMALIZED_MINIMUM_GRAD_GRAD_H_
#define AICPU_KERNELS_NORMALIZED_MINIMUM_GRAD_GRAD_H_
#include "cpu_ops_kernel.h"
#include "utils/bcast.h"
namespace aicpu {
class MinimumGradGradCpuKernel : public CpuKernel {
public:
MinimumGradGradCpuKernel() = default;
~MinimumGradGradCpuKernel() override = default;
protected:
uint32_t Compute(CpuKernelContext &ctx) override;
private:
uint32_t MinimumGradGradParamCheck(CpuKernelContext &ctx);
template <typename T>
uint32_t MinimumGradGradCompute(CpuKernelContext &ctx);
template <typename T>
uint32_t NoBcastCompute(CpuKernelContext &ctx);
template <typename T>
uint32_t BcastCompute(CpuKernelContext &ctx, Bcast &bcast);
};
} // namespace aicpu
#endif

View File

@ -0,0 +1,249 @@
/**
* Copyright (c) Huawei Technologies Co., Ltd. 2021-2021. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "mul_no_nan.h"
#include "cpu_kernel_utils.h"
#include "utils/eigen_tensor.h"
#include "utils/kernel_util.h"
namespace {
const uint32_t kOutputNum = 1;
const uint32_t kInputNum = 2;
const char *kMulNoNan = "MulNoNan";
// when input data size is more than kParallelDataNum, use Parallel func
const int64_t kParallelDataNum = 8 * 1024;
const int64_t kParallelDataNumMid = 64 * 1024;
const int64_t kParallelDataNumSameShape = 32 * 1024;
const int64_t kParallelDataNumSameShapeMid = 256 * 1024;
#define MULNONAN_COMPUTE_CASE(DTYPE, TYPE, CTX) \
case (DTYPE): { \
uint32_t result = MulNoNanCompute<TYPE>(CTX); \
if (result != KERNEL_STATUS_OK) { \
KERNEL_LOG_ERROR("MulNoNan kernel compute failed."); \
return result; \
} \
break; \
}
} // namespace
namespace aicpu {
uint32_t MulNoNanCpuKernel::Compute(CpuKernelContext &ctx) {
// check params
KERNEL_HANDLE_ERROR(NormalCheck(ctx, kInputNum, kOutputNum), "MulNoNan check input and output number failed.");
KERNEL_HANDLE_ERROR(MulNoNanParamCheck(ctx), "MulNoNan check params failed.");
auto data_type = ctx.Input(0)->GetDataType();
switch (data_type) {
MULNONAN_COMPUTE_CASE(DT_INT8, int8_t, ctx)
MULNONAN_COMPUTE_CASE(DT_INT16, int16_t, ctx)
MULNONAN_COMPUTE_CASE(DT_INT32, int32_t, ctx)
MULNONAN_COMPUTE_CASE(DT_INT64, int64_t, ctx)
MULNONAN_COMPUTE_CASE(DT_UINT8, uint8_t, ctx)
MULNONAN_COMPUTE_CASE(DT_UINT16, uint16_t, ctx)
MULNONAN_COMPUTE_CASE(DT_UINT32, uint32_t, ctx)
MULNONAN_COMPUTE_CASE(DT_UINT64, uint64_t, ctx)
MULNONAN_COMPUTE_CASE(DT_FLOAT16, Eigen::half, ctx)
MULNONAN_COMPUTE_CASE(DT_FLOAT, float, ctx)
MULNONAN_COMPUTE_CASE(DT_DOUBLE, double, ctx)
MULNONAN_COMPUTE_CASE(DT_COMPLEX64, std::complex<float>, ctx)
MULNONAN_COMPUTE_CASE(DT_COMPLEX128, std::complex<double>, ctx)
default:
KERNEL_LOG_ERROR("MulNoNan kernel data type [%s] not support.", DTypeStr(data_type).c_str());
return KERNEL_STATUS_PARAM_INVALID;
}
return KERNEL_STATUS_OK;
}
uint32_t MulNoNanCpuKernel::MulNoNanParamCheck(CpuKernelContext &ctx) {
// the non null of input_0, input_1, output has been verified in NormalCheck
Tensor *input_0 = ctx.Input(0);
Tensor *input_1 = ctx.Input(1);
Tensor *output = ctx.Output(0);
DataType input0_type = input_0->GetDataType();
DataType input1_type = input_1->GetDataType();
KERNEL_CHECK_FALSE((input0_type == input1_type), KERNEL_STATUS_PARAM_INVALID,
"The data type of input0 [%s] need be same with "
"input1 [%s].",
DTypeStr(input0_type).c_str(), DTypeStr(input1_type).c_str())
KERNEL_LOG_DEBUG(
"LessCpuKernel[%s], input0: size[%llu];"
"input1: size[%llu], output: size[%llu].",
ctx.GetOpType().c_str(), input_0->GetDataSize(), input_1->GetDataSize(), output->GetDataSize());
return KERNEL_STATUS_OK;
}
// SpecialCompute is used in the following situations:
// 1. the shapes of input1 and input2 are the same
// 2. input1 is a 1-D tensor with only one element, or input1 is a scalar
// 3. input2 is a 1-D tensor with only one element, or input2 is a scalar
// Shapes that differ in any other way are handled by BcastCompute instead.
template <typename T>
void MulNoNanCpuKernel::SpecialCompute(BcastShapeType type, int64_t start, int64_t end, const T *input1,
const T *input2, T *output) {
switch (type) {
case BcastShapeType::SAME_SHAPE:
for (int64_t i = start; i < end; ++i) {
if (*(input2 + i) == (T)0) {
*(output + i) = (T)0;
} else {
*(output + i) = *(input1 + i) * *(input2 + i);
}
}
break;
case BcastShapeType::X_ONE_ELEMENT:
for (int64_t i = start; i < end; ++i) {
if (*(input2 + i) == (T)0) {
*(output + i) = (T)0;
} else {
*(output + i) = *input1 * *(input2 + i);
}
}
break;
case BcastShapeType::Y_ONE_ELEMENT:
if (*input2 == (T)0) {
for (int64_t i = start; i < end; ++i) {
*(output + i) = (T)0;
}
} else {
for (int64_t i = start; i < end; ++i) {
*(output + i) = *(input1 + i) * *input2;
}
}
break;
default:
KERNEL_LOG_WARN("Invalid type [%d]", static_cast<int32_t>(type));
break;
}
}
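// Semantics sketch (illustrative, assuming IEEE floating-point inputs): MulNoNan forces
// the product to 0 whenever the second operand is 0, even if the first operand is Inf or
// NaN. For example, with input1 = {NaN, 2.0f} and input2 = {0.0f, 3.0f} under SAME_SHAPE,
// the output is {0.0f, 6.0f} rather than {NaN, 6.0f}.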
template <typename T>
uint32_t MulNoNanCpuKernel::NoBcastCompute(CpuKernelContext &ctx) {
auto in0 = reinterpret_cast<T *>(ctx.Input(0)->GetData());
auto in1 = reinterpret_cast<T *>(ctx.Input(1)->GetData());
auto out = reinterpret_cast<T *>(ctx.Output(0)->GetData());
int64_t in0_elements_nums = ctx.Input(0)->NumElements();
int64_t in1_elements_nums = ctx.Input(1)->NumElements();
int64_t data_num = ctx.Output(0)->NumElements();
BcastShapeType type = in0_elements_nums == in1_elements_nums
? BcastShapeType::SAME_SHAPE
: (in0_elements_nums == 1 ? BcastShapeType::X_ONE_ELEMENT : BcastShapeType::Y_ONE_ELEMENT);
if (data_num >= kParallelDataNumSameShape) {
uint32_t min_core_num = 1;
uint32_t max_core_num = std::max(min_core_num, aicpu::CpuKernelUtils::GetCPUNum(ctx) - kResvCpuNum);
if (data_num <= kParallelDataNumSameShapeMid) {
max_core_num = std::min(max_core_num, 4U); // up to 4 cpu cores
}
if (max_core_num > data_num) {
max_core_num = data_num;
}
auto sharder_mul_no_nan = [&](int64_t start, int64_t end) { SpecialCompute<T>(type, start, end, in0, in1, out); };
if (max_core_num == 0) {
KERNEL_LOG_ERROR("Divisor max_core_num is 0");
} else {
KERNEL_HANDLE_ERROR(CpuKernelUtils::ParallelFor(ctx, data_num, data_num / max_core_num, sharder_mul_no_nan),
"MulNoNan Compute failed.");
}
} else {
SpecialCompute<T>(type, 0, data_num, in0, in1, out);
}
return KERNEL_STATUS_OK;
}
template <typename T>
uint32_t MulNoNanCpuKernel::BcastCompute(CpuKernelContext &ctx, Bcast &bcast) {
auto in0 = reinterpret_cast<T *>(ctx.Input(0)->GetData());
auto in1 = reinterpret_cast<T *>(ctx.Input(1)->GetData());
auto out = reinterpret_cast<T *>(ctx.Output(0)->GetData());
int64_t data_num = ctx.Output(0)->NumElements();
if (data_num >= kParallelDataNum) {
uint32_t min_core_num = 1;
uint32_t max_core_num = std::max(min_core_num, aicpu::CpuKernelUtils::GetCPUNum(ctx) - kResvCpuNum);
if (data_num <= kParallelDataNumMid) {
max_core_num = std::min(max_core_num, 4U); // up to 4 cpu cores
}
if (max_core_num > data_num) {
max_core_num = data_num;
}
auto sharder_mul_no_nan = [&](int64_t start, int64_t end) {
for (int64_t i = start; i < end; ++i) {
if (*(in1 + bcast.GetBroadcastYIndex(i)) == (T)0) {
*(out + i) = (T)0;
} else {
*(out + i) = *(in0 + bcast.GetBroadcastXIndex(i)) * *(in1 + bcast.GetBroadcastYIndex(i));
}
}
};
if (max_core_num == 0) {
KERNEL_LOG_ERROR("Divisor max_core_num is 0");
} else {
KERNEL_HANDLE_ERROR(CpuKernelUtils::ParallelFor(ctx, data_num, data_num / max_core_num, sharder_mul_no_nan),
"MulNoNan Compute failed.");
}
} else {
for (int64_t i = 0; i < data_num; ++i) {
if (*(in1 + bcast.GetBroadcastYIndex(i)) == (T)0) {
*(out + i) = (T)0;
} else {
*(out + i) = *(in0 + bcast.GetBroadcastXIndex(i)) * *(in1 + bcast.GetBroadcastYIndex(i));
}
}
}
return KERNEL_STATUS_OK;
}
template <typename T>
uint32_t MulNoNanCpuKernel::MulNoNanCompute(CpuKernelContext &ctx) {
Tensor *input0_tensor = ctx.Input(0);
auto input0_shape = input0_tensor->GetTensorShape()->GetDimSizes();
int64_t input0_elements_nums = input0_tensor->NumElements();
Tensor *input1_tensor = ctx.Input(1);
auto input1_shape = input1_tensor->GetTensorShape()->GetDimSizes();
int64_t input1_elements_nums = input1_tensor->NumElements();
bool noNeedBcast = (input0_shape == input1_shape) || (input0_elements_nums == 1) || (input1_elements_nums == 1);
if (noNeedBcast) {
return NoBcastCompute<T>(ctx);
} else {
Bcast bcast(input0_shape, input1_shape);
if (!bcast.IsValid()) {
KERNEL_LOG_ERROR("[%s] broadcast failed.", ctx.GetOpType().c_str());
return KERNEL_STATUS_PARAM_INVALID;
}
return BcastCompute<T>(ctx, bcast);
}
return KERNEL_STATUS_OK;
}
REGISTER_CPU_KERNEL(kMulNoNan, MulNoNanCpuKernel);
} // namespace aicpu

View File

@ -0,0 +1,48 @@
/**
* Copyright (c) Huawei Technologies Co., Ltd. 2021-2021. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef AICPU_KERNELS_NORMALIZED_MUL_NO_NAN_H_
#define AICPU_KERNELS_NORMALIZED_MUL_NO_NAN_H_
#include "cpu_ops_kernel.h"
#include "utils/bcast.h"
namespace aicpu {
class MulNoNanCpuKernel : public CpuKernel {
public:
MulNoNanCpuKernel() = default;
~MulNoNanCpuKernel() override = default;
protected:
uint32_t Compute(CpuKernelContext &ctx) override;
private:
uint32_t MulNoNanParamCheck(CpuKernelContext &ctx);
template <typename T>
void SpecialCompute(BcastShapeType type, int64_t start, int64_t end, const T *input1, const T *input2, T *output);
template <typename T>
uint32_t NoBcastCompute(CpuKernelContext &ctx);
template <typename T>
uint32_t BcastCompute(CpuKernelContext &ctx, Bcast &bcast);
template <typename T>
uint32_t MulNoNanCompute(CpuKernelContext &ctx);
};
} // namespace aicpu
#endif

View File

@ -0,0 +1,196 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "multilabel_margin_loss_grad.h"
#include <Eigen/Dense>
#include <algorithm>
#include <iostream>
#include <string>
#include <vector>
#include "cpu_kernel_utils.h"
#include "utils/eigen_tensor.h"
#include "utils/kernel_util.h"
namespace {
const uint32_t kOutputNum = 1;
const char *kMultilabelMarginLossGrad = "MultilabelMarginLossGrad";
} // namespace
namespace aicpu {
uint32_t MultilabelMarginLossGradCpuKernel::Compute(CpuKernelContext &ctx) {
// check params
uint32_t kInputNum = 4;
KERNEL_HANDLE_ERROR(NormalCheck(ctx, kInputNum, kOutputNum),
"MultilabelMarginLossGrad check input and output number failed.");
KERNEL_HANDLE_ERROR(MultilabelMarginLossGradCheck(ctx), "MultilabelMarginLossGrad check params failed.");
auto data_type = ctx.Input(1)->GetDataType();
switch (data_type) {
case DT_FLOAT16:
return MultilabelMarginLossGradComputeFP16<Eigen::half>(ctx);
case DT_FLOAT:
return MultilabelMarginLossGradCompute<float>(ctx);
default:
KERNEL_LOG_ERROR("MultilabelMarginLossGrad kernel data type [%s] not support.", DTypeStr(data_type).c_str());
return KERNEL_STATUS_PARAM_INVALID;
}
return KERNEL_STATUS_OK;
}
uint32_t MultilabelMarginLossGradCpuKernel::MultilabelMarginLossGradCheck(CpuKernelContext &ctx) {
auto target = reinterpret_cast<int32_t *>(ctx.Input(2)->GetData());
size_t dims = ctx.Input(1)->GetTensorShape()->GetDims();
int64_t batch_size =
(dims == 2) ? ctx.Input(1)->GetTensorShape()->GetDimSize(1) : ctx.Input(1)->GetTensorShape()->GetDimSize(0);
size_t data_num = ctx.Input(1)->GetTensorShape()->NumElements();
AttrValue *Attr_red = ctx.GetAttr("reduction");
std::string reduction = (Attr_red == nullptr) ? "mean" : Attr_red->GetString();
for (size_t i = 0; i < data_num; i++) {
KERNEL_CHECK_FALSE(*(target + i) >= -1 && (*(target + i) < batch_size), KERNEL_STATUS_PARAM_INVALID,
"[%s]'s target out of range.", ctx.GetOpType().c_str());
}
if (reduction == "none") {
if (dims == 1) {
KERNEL_CHECK_FALSE(ctx.Input(0)->GetTensorShape()->GetDims() == 0, KERNEL_STATUS_PARAM_INVALID,
"[%s]'s y_grad should be a scalar "
"when rank of x is 1.",
ctx.GetOpType().c_str())
} else {
KERNEL_CHECK_FALSE(
ctx.Input(0)->GetTensorShape()->GetDims() == 1 &&
ctx.Input(0)->GetTensorShape()->GetDimSize(0) == ctx.Input(1)->GetTensorShape()->GetDimSize(0),
KERNEL_STATUS_PARAM_INVALID,
"[%s]'s y_grad's shape should be the same as "
"{x_shape[0]} when the rank of x is 2 and reduction is none.",
ctx.GetOpType().c_str())
}
} else {
KERNEL_CHECK_FALSE(ctx.Input(0)->GetTensorShape()->GetDims() == 0, KERNEL_STATUS_PARAM_INVALID,
"[%s]'s y_grad should be a scalar "
"when reduction is mean or sum.",
ctx.GetOpType().c_str())
}
return KERNEL_STATUS_OK;
}
template <typename T>
uint32_t MultilabelMarginLossGradCpuKernel::MultilabelMarginLossGradCompute(CpuKernelContext &ctx) {
auto input_x = reinterpret_cast<T *>(ctx.Input(1)->GetData());
auto input_target = reinterpret_cast<int32_t *>(ctx.Input(2)->GetData());
auto input_istarget = reinterpret_cast<int32_t *>(ctx.Input(3)->GetData());
auto output_x_grad = reinterpret_cast<T *>(ctx.Output(0)->GetData());
AttrValue *Attr_red = ctx.GetAttr("reduction");
std::string reduction = (Attr_red == nullptr) ? "mean" : Attr_red->GetString();
size_t dims = ctx.Input(1)->GetTensorShape()->GetDims();
size_t batch_size =
(dims == 2) ? ctx.Input(1)->GetTensorShape()->GetDimSize(1) : ctx.Input(1)->GetTensorShape()->GetDimSize(0);
size_t data_num = ctx.Input(1)->GetTensorShape()->NumElements();
size_t nframe = data_num / batch_size;
auto g = static_cast<T>(reduction == "mean" ? 1. / data_num : 1. / batch_size);
std::vector<T> output_vector(data_num, 0);
for (size_t t = 0; t < nframe; t++) {
for (size_t m = 0; m < batch_size; m++) {
int32_t target_idx = input_target[m];
if (target_idx < 0) {
break;
}
auto calc_target = input_x[target_idx];
for (size_t n = 0; n < batch_size; n++) {
if (input_istarget[n] == 0) {
float z = 1 - calc_target + input_x[n];
if (z > 0) {
output_vector[t * batch_size + target_idx] -= g;
output_vector[t * batch_size + n] += g;
}
}
}
}
input_x += batch_size;
input_target += batch_size;
input_istarget += batch_size;
}
auto y_grad = ctx.Input(0);
auto y_grad_data = reinterpret_cast<T *>(y_grad->GetData());
size_t y_grad_dims = y_grad->GetTensorShape()->GetDims();
if (reduction != "none" || y_grad_dims == 0) {
for (size_t i = 0; i < data_num; i++) {
*(output_x_grad + i) = output_vector[i] * (*(y_grad_data));
}
} else {
for (size_t i = 0; i < nframe; i++) {
for (size_t j = 0; j < batch_size; j++) {
*(output_x_grad + i * batch_size + j) = output_vector[i * batch_size + j] * (*(y_grad_data + i));
}
}
}
return KERNEL_STATUS_OK;
}
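// Worked example (illustrative only, assumed values): for a single frame with
// batch_size = 3, x = {0.5, 0.2, 0.1}, target = {0, -1, ...} and is_target = {1, 0, 0},
// the margin z = 1 - x[target] + x[n] is evaluated for every non-target class n:
//   n = 1: z = 1 - 0.5 + 0.2 = 0.7 > 0  ->  grad[0] -= g, grad[1] += g
//   n = 2: z = 1 - 0.5 + 0.1 = 0.6 > 0  ->  grad[0] -= g, grad[2] += g
// where g = 1 / batch_size (or 1 / data_num when reduction == "mean"); the accumulated
// grads are then scaled by the incoming y_grad as in the code above.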
template <typename T>
uint32_t MultilabelMarginLossGradCpuKernel::MultilabelMarginLossGradComputeFP16(CpuKernelContext &ctx) {
auto input_x = reinterpret_cast<T *>(ctx.Input(1)->GetData());
auto input_target = reinterpret_cast<int32_t *>(ctx.Input(2)->GetData());
auto input_istarget = reinterpret_cast<int32_t *>(ctx.Input(3)->GetData());
auto output_x_grad = reinterpret_cast<T *>(ctx.Output(0)->GetData());
AttrValue *Attr_red = ctx.GetAttr("reduction");
std::string reduction = (Attr_red == nullptr) ? "mean" : Attr_red->GetString();
size_t dims = ctx.Input(1)->GetTensorShape()->GetDims();
size_t batch_size =
(dims == 2) ? ctx.Input(1)->GetTensorShape()->GetDimSize(1) : ctx.Input(1)->GetTensorShape()->GetDimSize(0);
size_t data_num = ctx.Input(1)->GetTensorShape()->NumElements();
size_t nframe = data_num / batch_size;
float g = static_cast<float>(reduction == "mean" ? 1. / data_num : 1. / batch_size);
std::vector<float> output_vector(data_num, 0);
for (size_t t = 0; t < nframe; t++) {
for (size_t m = 0; m < batch_size; m++) {
int32_t target_idx = input_target[m];
if (target_idx < 0) {
break;
}
float calc_target = static_cast<float>(input_x[target_idx]);
for (size_t n = 0; n < batch_size; n++) {
if (input_istarget[n] == 0) {
float z = 1 - calc_target + static_cast<float>(input_x[n]);
if (z > 0) {
output_vector[t * batch_size + target_idx] -= g;
output_vector[t * batch_size + n] += g;
}
}
}
}
input_x += batch_size;
input_target += batch_size;
input_istarget += batch_size;
}
auto y_grad = ctx.Input(0);
auto y_grad_data = reinterpret_cast<T *>(y_grad->GetData());
size_t y_grad_dims = y_grad->GetTensorShape()->GetDims();
if (reduction != "none" || y_grad_dims == 0) {
for (size_t i = 0; i < data_num; i++) {
*(output_x_grad + i) = static_cast<T>(output_vector[i] * static_cast<float>(*(y_grad_data)));
}
} else {
for (size_t i = 0; i < nframe; i++) {
for (size_t j = 0; j < batch_size; j++) {
*(output_x_grad + i * batch_size + j) =
static_cast<T>(output_vector[i * batch_size + j] * static_cast<float>(*(y_grad_data + i)));
}
}
}
return KERNEL_STATUS_OK;
}
REGISTER_CPU_KERNEL(kMultilabelMarginLossGrad, MultilabelMarginLossGradCpuKernel);
} // namespace aicpu

View File

@ -0,0 +1,39 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef AICPU_KERNELS_NORMALIZED_MULTILABEL_MARGIN_LOSS_GRAD_H_
#define AICPU_KERNELS_NORMALIZED_MULTILABEL_MARGIN_LOSS_GRAD_H_
#include "cpu_ops_kernel.h"
namespace aicpu {
class MultilabelMarginLossGradCpuKernel : public CpuKernel {
public:
MultilabelMarginLossGradCpuKernel() = default;
~MultilabelMarginLossGradCpuKernel() override = default;
protected:
uint32_t Compute(CpuKernelContext &ctx) override;
private:
static uint32_t MultilabelMarginLossGradCheck(CpuKernelContext &ctx);
template <typename T>
static uint32_t MultilabelMarginLossGradCompute(CpuKernelContext &ctx);
template <typename T>
static uint32_t MultilabelMarginLossGradComputeFP16(CpuKernelContext &ctx);
};
} // namespace aicpu
#endif

View File

@ -0,0 +1,168 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "non_max_suppression_with_overlaps.h"
#include <algorithm>
#include <deque>
#include <memory>
#include <queue>
#include "Eigen/Core"
#include "unsupported/Eigen/CXX11/Tensor"
#include "cpu_attr_value.h"
#include "cpu_tensor.h"
#include "cpu_tensor_shape.h"
#include "kernel_log.h"
#include "status.h"
#include "utils/allocator_utils.h"
#include "utils/kernel_util.h"
namespace {
const char *kNonMaxSuppressionWithOverlaps = "NonMaxSuppressionWithOverlaps";
const uint32_t kOutputNum = 1;
const uint32_t kInputNum = 5;
const uint32_t kFirstInputIndex = 0;
const uint32_t kSecondInputIndex = 1;
const uint32_t kThirdInputIndex = 2;
const uint32_t kFourthInputIndex = 3;
const uint32_t kFifthInputIndex = 4;
const uint32_t kFirstOutputIndex = 0;
const uint32_t kOverlapsRank = 2;
} // namespace
namespace aicpu {
uint32_t NonMaxSuppressionWithOverlapsCpuKernel::GetInputAndCheck(CpuKernelContext &ctx) {
KERNEL_HANDLE_ERROR(NormalCheck(ctx, kInputNum, kOutputNum),
"NonMaxSuppressionWithOverlaps check input and output number failed.");
overlaps_ = ctx.Input(kFirstInputIndex);
scores_ = ctx.Input(kSecondInputIndex);
Tensor *max_output_size_tensor = ctx.Input(kThirdInputIndex);
max_output_size_ = *static_cast<int32_t *>(max_output_size_tensor->GetData());
KERNEL_CHECK_FALSE((max_output_size_ >= 0), KERNEL_STATUS_PARAM_INVALID,
"The input max_output_size must be non-negative");
  overlap_threshold_tensor_ = ctx.Input(kFourthInputIndex);
  score_threshold_tensor_ = ctx.Input(kFifthInputIndex);
output_indices_ = ctx.Output(kFirstOutputIndex);
std::shared_ptr<TensorShape> overlaps_shape = overlaps_->GetTensorShape();
int32_t overlaps_rank = overlaps_shape->GetDims();
if (overlaps_rank != kOverlapsRank || overlaps_shape->GetDimSize(0) != overlaps_shape->GetDimSize(1)) {
KERNEL_LOG_ERROR(
"The input dim size of overlaps must be 2-D and must be square, "
"while %d, %lld",
overlaps_rank, overlaps_shape->GetDimSize(1));
return KERNEL_STATUS_PARAM_INVALID;
}
num_boxes_ = overlaps_shape->GetDimSize(0);
std::shared_ptr<TensorShape> scores_shape = scores_->GetTensorShape();
int32_t scores_rank = scores_shape->GetDims();
KERNEL_CHECK_FALSE((scores_rank == 1), KERNEL_STATUS_PARAM_INVALID,
"The input dim size of scores must be 1-D, while %d.", scores_rank);
KERNEL_CHECK_FALSE((scores_shape->GetDimSize(0) == num_boxes_), KERNEL_STATUS_PARAM_INVALID,
"The len of scores must be equal to the number of boxes, "
"while dims[%lld], num_boxes_[%d].",
scores_shape->GetDimSize(0), num_boxes_);
overlaps_dtype_ = static_cast<DataType>(overlaps_->GetDataType());
if (overlaps_dtype_ != DT_FLOAT) {
KERNEL_LOG_ERROR("The dtype of input[0] overlaps must be float.");
return KERNEL_STATUS_PARAM_INVALID;
}
scores_dtype_ = static_cast<DataType>(scores_->GetDataType());
if (scores_dtype_ != DT_FLOAT) {
KERNEL_LOG_ERROR("The dtype of input[1] scores must be float.");
return KERNEL_STATUS_PARAM_INVALID;
}
overlap_threshold_dtype_ = static_cast<DataType>(overlap_threshold_tensor_->GetDataType());
if (overlap_threshold_dtype_ != DT_FLOAT) {
KERNEL_LOG_ERROR("The dtype of input[3] overlap_threshold must be float.");
return KERNEL_STATUS_PARAM_INVALID;
}
score_threshold_dtype_ = static_cast<DataType>(score_threshold_tensor_->GetDataType());
if (score_threshold_dtype_ != DT_FLOAT) {
KERNEL_LOG_ERROR("The dtype of input[4] score_threshold must be float.");
return KERNEL_STATUS_PARAM_INVALID;
}
return KERNEL_STATUS_OK;
}
template <typename T, typename T_threshold>
uint32_t NonMaxSuppressionWithOverlapsCpuKernel::DoNonMaxSuppressionWithOverlapsOp() {
KERNEL_LOG_INFO("DoNonMaxSuppressionWithOverlapsOp start!!");
Eigen::TensorMap<Eigen::Tensor<T, kOverlapsRank, Eigen::RowMajor>> overlaps_map(
reinterpret_cast<T *>(overlaps_->GetData()), num_boxes_, num_boxes_);
std::vector<T> scores_data(num_boxes_);
std::copy_n(reinterpret_cast<T *>(scores_->GetData()), num_boxes_, scores_data.begin());
auto overlap_threshold = static_cast<T>(*(static_cast<T_threshold *>(overlap_threshold_tensor_->GetData())));
auto score_threshold = static_cast<T>(*(static_cast<T_threshold *>(score_threshold_tensor_->GetData())));
std::unique_ptr<int32_t[]> indices_data(new int32_t[max_output_size_]);
if (indices_data == nullptr) {
KERNEL_LOG_ERROR("DoNonMaxSuppressionWithOverlapsOp: new indices_data failed");
return KERNEL_STATUS_INNER_ERROR;
}
struct Candidate {
int box_index;
T score;
int suppress_begin_index;
};
auto cmp = [](const Candidate boxes_i, const Candidate boxes_j) { return boxes_i.score < boxes_j.score; };
std::priority_queue<Candidate, std::deque<Candidate>, decltype(cmp)> candidate_priority_queue(cmp);
for (uint32_t i = 0; i < scores_data.size(); ++i) {
if (scores_data[i] > score_threshold) {
candidate_priority_queue.emplace(Candidate({(int)i, scores_data[i]}));
}
}
T similarity = static_cast<T>(0.0);
Candidate next_candidate = {.box_index = 0, .score = static_cast<T>(0.0), .suppress_begin_index = 0};
int32_t cnt = 0;
while (cnt < max_output_size_ && !candidate_priority_queue.empty()) {
next_candidate = candidate_priority_queue.top();
candidate_priority_queue.pop();
bool should_suppress = false;
for (int j = cnt - 1; j >= next_candidate.suppress_begin_index; --j) {
similarity = overlaps_map(next_candidate.box_index, indices_data[j]);
if (similarity >= overlap_threshold) {
should_suppress = true;
break;
}
}
next_candidate.suppress_begin_index = cnt;
if (!should_suppress) {
indices_data[cnt] = next_candidate.box_index;
cnt += 1;
}
}
auto value = reinterpret_cast<int32_t *>(output_indices_->GetData());
for (int j = 0; j <= std::min(cnt, max_output_size_) - 1; j++) {
*(value + j) = indices_data[j];
}
output_indices_->GetTensorShape()->SetDimSizes({std::min(cnt, max_output_size_)});
KERNEL_LOG_INFO("DoNonMaxSuppressionWithOverlapsOp end!!");
return KERNEL_STATUS_OK;
}
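// Illustrative walk-through (assumed values): with scores = {0.9, 0.8, 0.7}, a
// score_threshold below all scores, overlap_threshold = 0.5, overlaps(1, 0) = 0.6 and
// overlaps(2, 0) = overlaps(2, 1) = 0.3, the greedy loop pops boxes by descending score:
// box 0 is kept, box 1 is suppressed because its overlap with the already selected box 0
// reaches the threshold, and box 2 is kept, so output_indices_ becomes {0, 2}.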
uint32_t NonMaxSuppressionWithOverlapsCpuKernel::Compute(CpuKernelContext &ctx) {
KERNEL_LOG_INFO("NonMaxSuppressionWithOverlaps kernel in.");
uint32_t res = GetInputAndCheck(ctx);
if (res != KERNEL_STATUS_OK) {
return res;
}
res = DoNonMaxSuppressionWithOverlapsOp<float, float>();
KERNEL_CHECK_FALSE((res == KERNEL_STATUS_OK), res, "Compute failed.");
KERNEL_LOG_INFO("Compute end!!");
return KERNEL_STATUS_OK;
}
REGISTER_CPU_KERNEL(kNonMaxSuppressionWithOverlaps, NonMaxSuppressionWithOverlapsCpuKernel);
} // namespace aicpu

View File

@ -0,0 +1,47 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef AICPU_KERNELS_NORMALIZED_NON_MAX_SUPPRESSION_WITH_OVERLAPS_H_
#define AICPU_KERNELS_NORMALIZED_NON_MAX_SUPPRESSION_WITH_OVERLAPS_H_
#include "cpu_ops_kernel.h"
#include "cpu_types.h"
#include "eigen_tensor.h"
namespace aicpu {
class NonMaxSuppressionWithOverlapsCpuKernel : public CpuKernel {
public:
~NonMaxSuppressionWithOverlapsCpuKernel() = default;
uint32_t Compute(CpuKernelContext &ctx) override;
private:
uint32_t GetInputAndCheck(CpuKernelContext &ctx);
template <typename T, typename T_threshold>
uint32_t DoNonMaxSuppressionWithOverlapsOp();
const Tensor *overlaps_ = nullptr;
Tensor *scores_ = nullptr;
Tensor *overlap_threshold_tensor_ = nullptr;
Tensor *score_threshold_tensor_ = nullptr;
Tensor *output_indices_ = nullptr;
int32_t num_boxes_ = 0;
int32_t max_output_size_ = 0;
DataType overlaps_dtype_ = DT_UINT32;
DataType scores_dtype_ = DT_UINT32;
DataType overlap_threshold_dtype_ = DT_UINT32;
DataType score_threshold_dtype_ = DT_UINT32;
};
} // namespace aicpu
#endif // AICPU_KERNELS_NORMALIZED_NON_MAX_SUPPRESSION_WITH_OVERLAPS_H_

View File

@ -0,0 +1,138 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "nth_element.h"
#include <vector>
#include <algorithm>
#include "cpu_kernel_utils.h"
#include "utils/kernel_util.h"
#include "utils/eigen_tensor.h"
#include "kernel_log.h"
#include "status.h"
namespace {
const char *kNthElement = "NthElement";
constexpr uint64_t kParallelDataNums = 32 * 1024;
#define NTHELEMENT_COMPUTE_CASE(DTYPE, TYPE, X, Y, N, LAST_DIM, CTX) \
case (DTYPE): { \
uint32_t result = NthElementCompute<TYPE>(X, Y, N, LAST_DIM, CTX); \
if (result != KERNEL_STATUS_OK) { \
KERNEL_LOG_ERROR("NthElement kernel compute failed."); \
return result; \
} \
break; \
}
} // namespace
namespace aicpu {
uint32_t NthElement::Compute(CpuKernelContext &ctx) {
Tensor *input_n = ctx.Input(1);
KERNEL_CHECK_FALSE((input_n->GetTensorShape()->GetDimSizes().empty()), KERNEL_STATUS_PARAM_INVALID,
"Input n must be a scalar.");
DataType n_type = input_n->GetDataType();
KERNEL_CHECK_FALSE((n_type == DT_INT32), KERNEL_STATUS_PARAM_INVALID, "The type of input n must be int32.");
KERNEL_CHECK_NULLPTR(input_n->GetData(), KERNEL_STATUS_PARAM_INVALID, "NthElement Get input n failed.");
int32_t *n_data = reinterpret_cast<int32_t *>(input_n->GetData());
int32_t n = *n_data;
KERNEL_CHECK_FALSE((n >= 0), KERNEL_STATUS_PARAM_INVALID, "Input n must be non-negative but is [%d].", n);
Tensor *x = ctx.Input(0);
KERNEL_CHECK_NULLPTR(x, KERNEL_STATUS_PARAM_INVALID, "NthElement Get input x failed.");
auto x_shape = x->GetTensorShape();
int32_t dims = x_shape->GetDims();
KERNEL_CHECK_FALSE((dims >= 1), KERNEL_STATUS_PARAM_INVALID, "Input x must be at least rank 1 but is rank [%d]",
dims);
const int32_t last_dim = x_shape->GetDimSize(dims - 1);
KERNEL_CHECK_FALSE((last_dim > n), KERNEL_STATUS_PARAM_INVALID, "Input x must have last dimension = [%d] > n = [%d]",
last_dim, n);
AttrValue *reverse_attr = ctx.GetAttr("reverse");
KERNEL_CHECK_NULLPTR(reverse_attr, KERNEL_STATUS_PARAM_INVALID, "NthElement get attr reverse failed.");
bool reverse = reverse_attr->GetBool();
if (reverse) {
n = last_dim - n - 1;
}
Tensor *y = ctx.Output(0);
auto x_type = x->GetDataType();
switch (x_type) {
NTHELEMENT_COMPUTE_CASE(DT_FLOAT, float, x, y, n, last_dim, ctx)
NTHELEMENT_COMPUTE_CASE(DT_FLOAT16, Eigen::half, x, y, n, last_dim, ctx)
NTHELEMENT_COMPUTE_CASE(DT_UINT8, uint8_t, x, y, n, last_dim, ctx)
NTHELEMENT_COMPUTE_CASE(DT_UINT16, uint16_t, x, y, n, last_dim, ctx)
NTHELEMENT_COMPUTE_CASE(DT_INT8, int8_t, x, y, n, last_dim, ctx)
NTHELEMENT_COMPUTE_CASE(DT_INT16, int16_t, x, y, n, last_dim, ctx)
NTHELEMENT_COMPUTE_CASE(DT_INT32, int32_t, x, y, n, last_dim, ctx)
NTHELEMENT_COMPUTE_CASE(DT_INT64, int64_t, x, y, n, last_dim, ctx)
NTHELEMENT_COMPUTE_CASE(DT_DOUBLE, double, x, y, n, last_dim, ctx)
default:
KERNEL_LOG_ERROR("[%s] Data type of input is not support, input data type is [%s].", ctx.GetOpType().c_str(),
DTypeStr(x_type).c_str());
return KERNEL_STATUS_PARAM_INVALID;
}
return KERNEL_STATUS_OK;
}
template <typename T>
uint32_t NthElement::NthElementCompute(Tensor *x, Tensor *y, const int32_t n, const int32_t last_dim,
CpuKernelContext &ctx) {
T *x_addrs = reinterpret_cast<T *>(x->GetData());
T *y_addrs = reinterpret_cast<T *>(y->GetData());
const uint64_t num_rows = y->NumElements();
const uint64_t num = x->NumElements();
if (num <= kParallelDataNums) {
std::vector<T> buf(last_dim);
for (size_t i = 0; i < num_rows; i++) {
const T *input_start = x_addrs + i * last_dim;
const T *input_end = input_start + last_dim;
std::copy(input_start, input_end, buf.begin());
std::nth_element(buf.begin(), buf.begin() + n, buf.end());
y_addrs[i] = buf[n];
}
} else {
uint32_t min_core_num = 1;
uint64_t max_core_num = std::max(min_core_num, aicpu::CpuKernelUtils::GetCPUNum(ctx) - 2);
if (max_core_num > num_rows) {
max_core_num = num_rows;
}
auto shard_nth_element = [&](size_t start, size_t end) {
std::vector<T> buf(last_dim);
for (size_t i = start; i < end; ++i) {
const T *input_start = x_addrs + i * last_dim;
const T *input_end = input_start + last_dim;
std::copy(input_start, input_end, buf.begin());
std::nth_element(buf.begin(), buf.begin() + n, buf.end());
y_addrs[i] = buf[n];
}
};
if (max_core_num == 0) {
KERNEL_LOG_ERROR("max_core_num could not be 0");
}
KERNEL_HANDLE_ERROR(CpuKernelUtils::ParallelFor(ctx, num_rows, num_rows / max_core_num, shard_nth_element),
"NthElement Parallel Compute failed.");
}
return KERNEL_STATUS_OK;
}
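// Worked example (illustrative only): for one row x = {5, 2, 8, 1} with last_dim = 4 and
// n = 2, std::nth_element leaves the value at sorted position 2 in buf[2], so the output
// is 5 (the 3rd smallest). With attr reverse = true, Compute() first remaps n to
// last_dim - n - 1 = 1, so the same row yields 2, i.e. the 3rd largest value.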
REGISTER_CPU_KERNEL(kNthElement, NthElement);
} // namespace aicpu

View File

@ -0,0 +1,34 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef AICPU_KERNELS_NORMALIZED_NTH_ELEMENT_H_
#define AICPU_KERNELS_NORMALIZED_NTH_ELEMENT_H_
#include "cpu_ops_kernel.h"
#include "cpu_types.h"
namespace aicpu {
class NthElement : public CpuKernel {
public:
NthElement() = default;
~NthElement() override = default;
uint32_t Compute(CpuKernelContext &ctx) override;
private:
template <typename T>
uint32_t NthElementCompute(Tensor *x, Tensor *y, const int32_t n, const int32_t last_dim, CpuKernelContext &ctx);
};
} // namespace aicpu
#endif

View File

@ -0,0 +1,198 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*!
* \file one_hot.cc
* \brief
*/
#include "one_hot.h"
#include <algorithm>
#include <complex>
#include <string>
#include "cpu_kernel_utils.h"
#include "kernel_log.h"
#include "status.h"
#include "cpu_types.h"
#include "utils/kernel_util.h"
#include "utils/eigen_tensor.h"
#include "utils/sparse_tensor.h"
namespace {
const uint32_t kInputNum = 4;
const uint32_t kOutputNum = 1;
const char *kOneHot = "OneHot";
const int64_t kParallelDataNumSameShape = 100 * 1024;
#define ONE_HOT_INPUT_COMPUTE_CASE(DTYPE, TYPE, ODTYPE, CTX) \
case (DTYPE): { \
switch (ODTYPE) { \
ONE_HOT_OUTPUT_COMPUTE_CASE(DTYPE, TYPE, DT_COMPLEX64, std::complex<float>, CTX) \
ONE_HOT_OUTPUT_COMPUTE_CASE(DTYPE, TYPE, DT_COMPLEX128, std::complex<double>, CTX) \
ONE_HOT_OUTPUT_COMPUTE_CASE(DTYPE, TYPE, DT_DOUBLE, double, CTX) \
ONE_HOT_OUTPUT_COMPUTE_CASE(DTYPE, TYPE, DT_FLOAT, float_t, CTX); \
ONE_HOT_OUTPUT_COMPUTE_CASE(DTYPE, TYPE, DT_FLOAT16, Eigen::half, CTX) \
ONE_HOT_OUTPUT_COMPUTE_CASE(DTYPE, TYPE, DT_INT8, int8_t, CTX) \
ONE_HOT_OUTPUT_COMPUTE_CASE(DTYPE, TYPE, DT_INT16, int16_t, CTX) \
ONE_HOT_OUTPUT_COMPUTE_CASE(DTYPE, TYPE, DT_INT32, int32_t, CTX) \
ONE_HOT_OUTPUT_COMPUTE_CASE(DTYPE, TYPE, DT_INT64, int64_t, CTX) \
ONE_HOT_OUTPUT_COMPUTE_CASE(DTYPE, TYPE, DT_UINT8, uint8_t, CTX) \
ONE_HOT_OUTPUT_COMPUTE_CASE(DTYPE, TYPE, DT_UINT16, uint16_t, CTX) \
ONE_HOT_OUTPUT_COMPUTE_CASE(DTYPE, TYPE, DT_UINT32, uint32_t, CTX) \
ONE_HOT_OUTPUT_COMPUTE_CASE(DTYPE, TYPE, DT_UINT64, uint64_t, CTX) \
ONE_HOT_OUTPUT_COMPUTE_CASE(DTYPE, TYPE, DT_BOOL, bool, CTX) \
ONE_HOT_OUTPUT_COMPUTE_CASE(DTYPE, TYPE, DT_STRING, std::string, CTX) \
default: \
KERNEL_LOG_ERROR("OneHot kernel output data type [%s] not support.", DTypeStr(output_data_type).c_str()); \
return KERNEL_STATUS_PARAM_INVALID; \
} \
break; \
}
#define ONE_HOT_OUTPUT_COMPUTE_CASE(DTYPE, TYPE, ODTYPE, OTYPE, CTX) \
case (ODTYPE): { \
uint32_t result = OneHotCompute<OTYPE, TYPE>(CTX); \
if (result != KERNEL_STATUS_OK) { \
KERNEL_LOG_ERROR("OneHot kernel compute failed."); \
return result; \
} \
break; \
}
} // namespace
namespace aicpu {
uint32_t OneHotCpuKernel::Compute(CpuKernelContext &ctx) {
KERNEL_HANDLE_ERROR(NormalCheck(ctx, kInputNum, kOutputNum), "OneHot check input and output number failed.");
KERNEL_HANDLE_ERROR(OneHotParamCheck(ctx), "OneHot check params failed.");
auto input_data_type = ctx.Input(0)->GetDataType();
auto output_data_type = ctx.Output(0)->GetDataType();
switch (input_data_type) {
ONE_HOT_INPUT_COMPUTE_CASE(DT_UINT8, uint8_t, output_data_type, ctx);
ONE_HOT_INPUT_COMPUTE_CASE(DT_INT32, int32_t, output_data_type, ctx);
ONE_HOT_INPUT_COMPUTE_CASE(DT_INT64, int64_t, output_data_type, ctx);
default:
KERNEL_LOG_ERROR("OneHot kernel input data type [%s] not support.", DTypeStr(input_data_type).c_str());
return KERNEL_STATUS_PARAM_INVALID;
}
return KERNEL_STATUS_OK;
}
template <typename T, typename TI>
uint32_t OneHotCpuKernel::OneHotCompute(CpuKernelContext &ctx) {
// Input tensor
Tensor *indices = ctx.Input(0);
// Output tensor
Tensor *output = ctx.Output(0);
// Input tensor data
auto indices_data = reinterpret_cast<TI *>(indices->GetData());
// Output tensor data
auto output_data = reinterpret_cast<T *>(output->GetData());
// depth value
auto depth = reinterpret_cast<int32_t *>(ctx.Input(1)->GetData());
// on_value
auto on_value = reinterpret_cast<T *>(ctx.Input(2)->GetData());
// off_value
auto off_value = reinterpret_cast<T *>(ctx.Input(3)->GetData());
// Input tensor shape
auto indices_shape = indices->GetTensorShape();
// axis value
int64_t axis = ctx.GetAttr("axis") == nullptr ? -1 : ctx.GetAttr("axis")->GetInt();
if (axis == -1) {
axis = indices_shape->GetDims();
}
// Output tensor shape
auto output_shape = output->GetTensorShape();
// Lambda that initializes the output tensor with off_value
auto init_output_func = [&](int64_t start, int64_t end) -> void {
for (int i = start; i < end; ++i) {
*(output_data + i) = *(off_value);
}
};
// Compute the size of the dimensions before axis
int64_t prefix_dim_size = 1;
for (int i = 0; i < axis; ++i) {
prefix_dim_size *= indices_shape->GetDimSize(i);
}
// Compute the size of the dimensions after axis
int64_t suffix_dim_size = indices_shape->NumElements() / prefix_dim_size;
// Total number of elements in the input tensor
int64_t data_num = indices_shape->NumElements();
// depth_value is the concrete value of depth
int32_t depth_value = *(depth);
// View the output tensor as having shape {prefix_dim_size, depth, suffix_dim_size}.
// The position of the hot element is offset = suffix_dim_size == 1 ? (d0 * depth_value + d1)
// : (d0 * depth_value * suffix_dim_size + d1 * suffix_dim_size + d2), where (d0, d1, d2)
// indexes that view; the output at that offset is then set to on_value.
const auto get_output_func = [&](int64_t start, int64_t end) -> void {
for (int64_t i = start; i < end; ++i) {
int64_t d0 = i / suffix_dim_size;
int64_t d1 = i - (d0 * suffix_dim_size);
int64_t depth_v = SubtleMustCopy<int64_t>(*(indices_data + d0 * suffix_dim_size + d1));
if (depth_v < static_cast<int64_t>(depth_value) && depth_v >= 0) {
int64_t offset = suffix_dim_size == 1 ? i * depth_value + depth_v
: d0 * depth_value * suffix_dim_size + depth_v * suffix_dim_size + d1;
*(output_data + offset) = *(on_value);
}
}
};
// Use the CpuKernelUtils::GetCPUNum interface to get the number of AI CPU cores
uint32_t max_core_num = std::max(1U, aicpu::CpuKernelUtils::GetCPUNum(ctx));
// Multi-thread execution status
bool run_state = true;
// For data smaller than 100K elements, run on a single core; otherwise use the available AI CPU cores
if (data_num >= kParallelDataNumSameShape) {
max_core_num = (max_core_num > data_num) ? data_num : max_core_num;
max_core_num = max_core_num == 0 ? 1 : max_core_num;
uint32_t ret1 = CpuKernelUtils::ParallelFor(ctx, output_shape->NumElements(),
(output_shape->NumElements() / max_core_num), init_output_func);
uint32_t ret2 = CpuKernelUtils::ParallelFor(ctx, data_num, (data_num / max_core_num), get_output_func);
run_state = (ret1 == KERNEL_STATUS_OK) && (ret2 == KERNEL_STATUS_OK);
} else {
// Input data size is less than 100K, run on a single core
init_output_func(0, output_shape->NumElements());
get_output_func(0, data_num);
}
return run_state ? KERNEL_STATUS_OK : KERNEL_STATUS_INNER_ERROR;
}
// Parameter check
uint32_t OneHotCpuKernel::OneHotParamCheck(CpuKernelContext &ctx) {
Tensor *indices = ctx.Input(0);
Tensor *depth = ctx.Input(1);
Tensor *on_value = ctx.Input(2);
Tensor *off_value = ctx.Input(3);
int64_t axis = ctx.GetAttr("axis") == nullptr ? -1 : ctx.GetAttr("axis")->GetInt();
DataType on_value_type = on_value->GetDataType();
DataType off_value_type = off_value->GetDataType();
KERNEL_CHECK_FALSE((on_value_type == off_value_type), KERNEL_STATUS_PARAM_INVALID,
"The data type of on_value [%s] must be the same as off_value [%s].",
DTypeStr(on_value_type).c_str(), DTypeStr(off_value_type).c_str())
auto depth_shape = depth->GetTensorShape();
auto on_value_shape = on_value->GetTensorShape();
auto off_value_shape = off_value->GetTensorShape();
KERNEL_CHECK_FALSE((depth_shape->GetDims() == 0), KERNEL_STATUS_PARAM_INVALID,
"Depth must be a scalar, actual dim num is %d.", depth_shape->GetDims())
KERNEL_CHECK_FALSE((on_value_shape->GetDims() == 0), KERNEL_STATUS_PARAM_INVALID,
"On_value must be a scalar, actual dim num is %d.", on_value_shape->GetDims())
KERNEL_CHECK_FALSE((off_value_shape->GetDims() == 0), KERNEL_STATUS_PARAM_INVALID,
"Off_value must be a scalar, actual dim num is %d.", off_value_shape->GetDims())
int32_t output_dims = indices->GetTensorShape()->GetDims() + 1;
KERNEL_CHECK_FALSE(((axis > -2 && axis < output_dims)), KERNEL_STATUS_PARAM_INVALID,
"Expected axis value to be in [-1, %d], but received: %d.", output_dims - 1, axis)
int32_t depth_value = *(reinterpret_cast<int32_t *>(ctx.Input(1)->GetData()));
KERNEL_CHECK_FALSE((depth_value >= 0), KERNEL_STATUS_PARAM_INVALID,
"Depth should be non-negative, but received: %d.", depth_value)
return KERNEL_STATUS_OK;
}
REGISTER_CPU_KERNEL(kOneHot, OneHotCpuKernel);
} // namespace aicpu

View File

@ -0,0 +1,41 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*!
* \file one_hot.h
* \brief
*/
#ifndef AICPU_KERNELS_NORMALIZED_ONE_HOT_H_
#define AICPU_KERNELS_NORMALIZED_ONE_HOT_H_
#include <type_traits>
#include "cpu_ops_kernel.h"
namespace aicpu {
class OneHotCpuKernel : public CpuKernel {
public:
OneHotCpuKernel() = default;
~OneHotCpuKernel() = default;
uint32_t Compute(CpuKernelContext &ctx) override;
private:
template <typename T, typename TI>
uint32_t OneHotCompute(CpuKernelContext &ctx);
uint32_t OneHotParamCheck(CpuKernelContext &ctx);
};
} // namespace aicpu
#endif

View File

@ -0,0 +1,228 @@
/**
* Copyright (c) Huawei Technologies Co., Ltd. 2022-2022. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "orgqr.h"
#include "Eigen/Dense"
#include "cpu_kernel_utils.h"
#include "utils/eigen_tensor.h"
#include "utils/kernel_util.h"
#include <numeric>
#include <iostream>
using namespace Eigen;
namespace {
const uint32_t kOutputNum = 1;
const uint32_t kInputNum = 2;
const char *kOrgqr = "Orgqr";
const double ZERO = 0.;
const uint32_t kTWO = 2;
constexpr int64_t kParallelDataNums = 18 * 1024;
constexpr int64_t kParallelDataNumsMid = 32 * 1024;
#define ORGQR_COMPUTE(DTYPE, TYPE, CTX) \
case (DTYPE): { \
uint32_t result = OrgqrCompute<TYPE>(CTX); \
if (result != KERNEL_STATUS_OK) { \
KERNEL_LOG_ERROR("Orgqr kernel compute failed."); \
return result; \
} \
break; \
}
#define ORGQR_COMPUTE_COMPLEX(DTYPE, TYPE, CTX) \
case (DTYPE): { \
uint32_t result = OrgqrComputeComplex<TYPE>(CTX); \
if (result != KERNEL_STATUS_OK) { \
KERNEL_LOG_ERROR("Orgqr kernel compute failed."); \
return result; \
} \
break; \
}
} // namespace
namespace aicpu {
uint32_t OrgqrCpuKernel::Compute(CpuKernelContext &ctx) {
// check params
KERNEL_HANDLE_ERROR(NormalCheck(ctx, kInputNum, kOutputNum), "Orgqr check input and output number failed.");
KERNEL_HANDLE_ERROR(OrgqrCheck(ctx), "[%s] check params failed.", kOrgqr);
auto data_type = ctx.Input(0)->GetDataType();
switch (data_type) {
ORGQR_COMPUTE(DT_FLOAT, float, ctx)
ORGQR_COMPUTE(DT_DOUBLE, double, ctx)
ORGQR_COMPUTE_COMPLEX(DT_COMPLEX64, std::complex<float_t>, ctx)
ORGQR_COMPUTE_COMPLEX(DT_COMPLEX128, std::complex<double_t>, ctx)
default:
KERNEL_LOG_ERROR("Orgqr kernel data type [%s] not supported.", DTypeStr(data_type).c_str());
return KERNEL_STATUS_PARAM_INVALID;
}
return KERNEL_STATUS_OK;
}
uint32_t OrgqrCpuKernel::OrgqrCheck(CpuKernelContext &ctx) {
std::vector<int64_t> shape_x = ctx.Input(0)->GetTensorShape()->GetDimSizes();
size_t shape_size = shape_x.size();
KERNEL_CHECK_FALSE((shape_size > 1), KERNEL_STATUS_PARAM_INVALID, "Input x must be at least rank 2.")
KERNEL_CHECK_FALSE((shape_x[shape_size - kTWO] > 0), KERNEL_STATUS_PARAM_INVALID,
"Dimension [%zu] of input x must be at least 1, but got [%zu].", shape_size - kTWO,
shape_x[shape_size - kTWO])
KERNEL_CHECK_FALSE((shape_x[shape_size - 1] > 0), KERNEL_STATUS_PARAM_INVALID,
"Dimension [%zu] of input x must be at least 1, but got [%zu].", shape_size - 1,
shape_x[shape_size - 1])
KERNEL_CHECK_FALSE((shape_x[shape_size - kTWO] >= shape_x[shape_size - 1]), KERNEL_STATUS_PARAM_INVALID,
"Dimension [%zu] of input x must be no less than dimension [%zu] when input x has rank [%zu].",
shape_size - kTWO, shape_size - 1, shape_size)
std::vector<int64_t> shape_tau = ctx.Input(1)->GetTensorShape()->GetDimSizes();
size_t shape_tau_size = shape_tau.size();
KERNEL_CHECK_FALSE((shape_x[shape_size - 1] >= shape_tau[shape_tau_size - 1]), KERNEL_STATUS_PARAM_INVALID,
"Dimension [%zu] of input tau must be no greater than [%zu], but got [%zu].", shape_tau_size - 1,
shape_x[shape_size - 1], shape_tau[shape_tau_size - 1])
if (shape_size > kTWO) {
KERNEL_CHECK_FALSE((shape_x[0] == shape_tau[0]), KERNEL_STATUS_PARAM_INVALID,
"Dimension 0 of input tau must equal dimension 0 of input x when input has a batch dimension.")
}
return KERNEL_STATUS_OK;
}
template <typename T>
uint32_t OrgqrCpuKernel::OrgqrCompute(CpuKernelContext &ctx) {
auto *x = reinterpret_cast<T *>(ctx.Input(0)->GetData());
auto *tau = reinterpret_cast<T *>(ctx.Input(1)->GetData());
auto *y = reinterpret_cast<T *>(ctx.Output(0)->GetData());
std::vector<int64_t> shape_x = ctx.Input(0)->GetTensorShape()->GetDimSizes();
size_t shape_size = shape_x.size();
size_t m = shape_x[shape_size - kTWO];
size_t n = shape_x[shape_size - 1];
std::vector<int64_t> shape_tau = ctx.Input(1)->GetTensorShape()->GetDimSizes();
size_t p = *(shape_tau.end() - 1);
size_t size_mn = m * n;
size_t matrix_num = ctx.Input(0)->NumElements() / size_mn;
int64_t data_size = ctx.Input(0)->NumElements() * sizeof(T);
typedef Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor> MartrixXd;
typedef Eigen::Matrix<T, Eigen::Dynamic, 1> VectorXd;
if (data_size <= kParallelDataNums) {
for (size_t i = 0; i < matrix_num; i++) {
Eigen::Map<MartrixXd> martrix_y(y + i * m * n, m, n);
Eigen::Map<MartrixXd> martrix_x(x + i * m * n, m, n);
MartrixXd tmp = MartrixXd::Identity(m, m);
Eigen::Map<VectorXd> vector_tau(tau + i * p, p, 1);
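// The loop below accumulates the Householder reflectors produced by the QR factorization: each step
// right-multiplies tmp by H_k = I - tau(k) * v_k * v_k^T, where v_k is column k of x below the diagonal with
// an implicit leading 1, so after the loop tmp = H_1 * H_2 * ... * H_p and its first n columns form the output.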
for (size_t k = 0; k < p; k++) {
VectorXd vector_v = martrix_x.block(k, k, m - k, 1);
vector_v[0] = 1;
tmp.rightCols(m - k) =
tmp.rightCols(m - k) - vector_tau(k) * (tmp.rightCols(m - k) * vector_v) * vector_v.transpose();
}
martrix_y = tmp.leftCols(n);
}
} else {
uint32_t min_core_num = 1;
uint32_t max_core_num = std::max(min_core_num, aicpu::CpuKernelUtils::GetCPUNum(ctx) - 2);
if (data_size <= kParallelDataNumsMid) {
max_core_num = std::min(max_core_num, 4U); // up to 4 cpu cores
}
if (max_core_num > matrix_num) {
max_core_num = matrix_num;
}
auto shard_qr = [&](size_t start, size_t end) {
for (size_t i = start; i < end; i++) {
Eigen::Map<MartrixXd> martrix_y(y + i * m * n, m, n);
Eigen::Map<MartrixXd> martrix_x(x + i * m * n, m, n);
MartrixXd tmp = MartrixXd::Identity(m, m);
Eigen::Map<VectorXd> vector_tau(tau + i * p, p, 1);
for (size_t k = 0; k < p; k++) {
VectorXd vector_v = martrix_x.block(k, k, m - k, 1);
vector_v[0] = 1;
tmp.rightCols(m - k) =
tmp.rightCols(m - k) - vector_tau(k) * (tmp.rightCols(m - k) * vector_v) * vector_v.transpose();
}
martrix_y = tmp.leftCols(n);
}
};
if (max_core_num == 0) {
KERNEL_LOG_ERROR("max_core_num cannot be 0.");
}
KERNEL_HANDLE_ERROR(CpuKernelUtils::ParallelFor(ctx, matrix_num, matrix_num / max_core_num, shard_qr),
"Orgqr Compute failed.");
}
return KERNEL_STATUS_OK;
}
template <typename T>
uint32_t OrgqrCpuKernel::OrgqrComputeComplex(CpuKernelContext &ctx) {
auto *x = reinterpret_cast<T *>(ctx.Input(0)->GetData());
auto *tau = reinterpret_cast<T *>(ctx.Input(1)->GetData());
auto *y = reinterpret_cast<T *>(ctx.Output(0)->GetData());
std::vector<int64_t> shape_x = ctx.Input(0)->GetTensorShape()->GetDimSizes();
size_t shape_size = shape_x.size();
size_t m = shape_x[shape_size - kTWO];
size_t n = shape_x[shape_size - 1];
std::vector<int64_t> shape_tau = ctx.Input(1)->GetTensorShape()->GetDimSizes();
size_t p = *(shape_tau.end() - 1);
size_t size_mn = m * n;
size_t matrix_num = ctx.Input(0)->NumElements() / size_mn;
int64_t data_size = ctx.Input(0)->NumElements() * sizeof(T);
typedef Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor> MartrixXd;
typedef Eigen::Matrix<T, Eigen::Dynamic, 1> VectorXd;
if (data_size <= kParallelDataNums) {
for (size_t i = 0; i < matrix_num; i++) {
Eigen::Map<MartrixXd> martrix_y(y + i * m * n, m, n);
Eigen::Map<MartrixXd> martrix_x(x + i * m * n, m, n);
MartrixXd tmp = MartrixXd::Identity(m, m);
Eigen::Map<VectorXd> vector_tau(tau + i * p, p, 1);
for (size_t k = 0; k < p; k++) {
VectorXd vector_v = martrix_x.block(k, k, m - k, 1);
vector_v[0] = 1;
tmp.rightCols(m - k) =
tmp.rightCols(m - k) - vector_tau(k) * (tmp.rightCols(m - k) * vector_v) * vector_v.adjoint();
}
martrix_y = tmp.leftCols(n);
}
} else {
uint32_t min_core_num = 1;
uint32_t max_core_num = std::max(min_core_num, aicpu::CpuKernelUtils::GetCPUNum(ctx) - 2);
if (data_size <= kParallelDataNumsMid) {
max_core_num = std::min(max_core_num, 4U); // up to 4 cpu cores
}
if (max_core_num > matrix_num) {
max_core_num = matrix_num;
}
auto shard_qr = [&](size_t start, size_t end) {
for (size_t i = start; i < end; i++) {
Eigen::Map<MartrixXd> martrix_y(y + i * m * n, m, n);
Eigen::Map<MartrixXd> martrix_x(x + i * m * n, m, n);
MartrixXd tmp = MartrixXd::Identity(m, m);
Eigen::Map<VectorXd> vector_tau(tau + i * p, p, 1);
for (size_t k = 0; k < p; k++) {
VectorXd vector_v = martrix_x.block(k, k, m - k, 1);
vector_v[0] = 1;
tmp.rightCols(m - k) =
tmp.rightCols(m - k) - vector_tau(k) * (tmp.rightCols(m - k) * vector_v) * vector_v.adjoint();
}
martrix_y = tmp.leftCols(n);
}
};
if (max_core_num == 0) {
KERNEL_LOG_ERROR("max_core_num cannot be 0.");
}
KERNEL_HANDLE_ERROR(CpuKernelUtils::ParallelFor(ctx, matrix_num, matrix_num / max_core_num, shard_qr),
"Orgqr Compute failed.");
}
return KERNEL_STATUS_OK;
}
REGISTER_CPU_KERNEL(kOrgqr, OrgqrCpuKernel);
} // namespace aicpu

View File

@ -0,0 +1,43 @@
/**
* Copyright (c) Huawei Technologies Co., Ltd. 2022-2022. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef AICPU_KERNELS_NORMALIZED_ORGQR_H_
#define AICPU_KERNELS_NORMALIZED_ORGQR_H_
#include <vector>
#include "cpu_ops_kernel.h"
#include "utils/bcast.h"
namespace aicpu {
class OrgqrCpuKernel : public CpuKernel {
public:
OrgqrCpuKernel() = default;
~OrgqrCpuKernel() = default;
uint32_t Compute(CpuKernelContext &ctx) override;
private:
uint32_t OrgqrCheck(CpuKernelContext &ctx);
template <typename T>
uint32_t OrgqrCompute(CpuKernelContext &ctx);
template <typename T>
uint32_t OrgqrComputeComplex(CpuKernelContext &ctx);
};
} // namespace aicpu
#endif // AICPU_KERNELS_NORMALIZED_ORGQR_H_

View File

@ -0,0 +1,140 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "pack.h"
#include <securec.h>
#include "cpu_types.h"
#include "kernel_log.h"
#include "status.h"
#include "cpu_kernel_utils.h"
#include "utils/eigen_tensor.h"
#include "utils/kernel_util.h"
#include "unsupported/Eigen/CXX11/Tensor"
#include "Eigen/Core"
namespace {
const uint32_t kOutputNum{1u};
const uint32_t kInputNum{aicpu::kDynamicInput};
const char *kPack = "Pack";
// constexpr int64_t kParallelDataNums = 512 * 1024;
#define PACK_COMPUTE_CASE(DTYPE, TYPE, CTX) \
case (DTYPE): { \
uint32_t result = PackCompute<TYPE>(CTX); \
if (result != KERNEL_STATUS_OK) { \
KERNEL_LOG_ERROR("Pack kernel compute failed."); \
return result; \
} \
break; \
}
} // namespace
namespace aicpu {
uint32_t PackCpuKernel::Compute(CpuKernelContext &ctx) {
KERNEL_HANDLE_ERROR(NormalCheck(ctx, kInputNum, kOutputNum), "[%s] check input and output failed.", kPack);
KERNEL_HANDLE_ERROR(PackCheck(ctx), "[%s] check params failed.", kPack);
DataType data_type = ctx.Input(0)->GetDataType();
switch (data_type) {
PACK_COMPUTE_CASE(DT_BOOL, bool, ctx)
PACK_COMPUTE_CASE(DT_FLOAT16, Eigen::half, ctx)
PACK_COMPUTE_CASE(DT_FLOAT, float, ctx)
PACK_COMPUTE_CASE(DT_DOUBLE, double, ctx)
PACK_COMPUTE_CASE(DT_INT8, int8_t, ctx)
PACK_COMPUTE_CASE(DT_INT16, int16_t, ctx)
PACK_COMPUTE_CASE(DT_INT32, int32_t, ctx)
PACK_COMPUTE_CASE(DT_INT64, int64_t, ctx)
PACK_COMPUTE_CASE(DT_UINT8, uint8_t, ctx)
PACK_COMPUTE_CASE(DT_UINT16, uint16_t, ctx)
PACK_COMPUTE_CASE(DT_UINT32, uint32_t, ctx)
PACK_COMPUTE_CASE(DT_UINT64, uint64_t, ctx)
PACK_COMPUTE_CASE(DT_COMPLEX64, std::complex<float>, ctx)
PACK_COMPUTE_CASE(DT_COMPLEX128, std::complex<double>, ctx)
default:
KERNEL_LOG_ERROR("Pack kernel data type [%s] not supported.", DTypeStr(data_type).c_str());
return KERNEL_STATUS_PARAM_INVALID;
}
return KERNEL_STATUS_OK;
}
uint32_t PackCpuKernel::PackCheck(CpuKernelContext &ctx) {
auto *input = ctx.Input(0);
AttrValue *n_attr = ctx.GetAttr("N");
AttrValue *axis_attr = ctx.GetAttr("axis");
int64_t axis = axis_attr->GetInt();
auto expanded_num_dims = input->GetTensorShape()->GetDims() + 1; // first_input.dims() + 1;
if (axis < 0) axis += expanded_num_dims;
if (axis < 0 || axis >= expanded_num_dims) {
KERNEL_LOG_ERROR("Pack attr axis is out of range.");
return KERNEL_STATUS_PARAM_INVALID;
}
int64_t input_num = n_attr->GetInt();
auto x1_dims = input->GetTensorShape()->GetDims();
for (int64_t i = 1; i < input_num; i++) {
auto input_dims = ctx.Input(i)->GetTensorShape()->GetDims();
if (x1_dims != input_dims) {
KERNEL_LOG_ERROR("Pack input dims are not equal.");
return KERNEL_STATUS_PARAM_INVALID;
}
}
return KERNEL_STATUS_OK;
}
template <typename T>
uint32_t PackCpuKernel::PackCompute(CpuKernelContext &ctx) {
AttrValue *axis_attr = ctx.GetAttr("axis");
int64_t axis = axis_attr->GetInt();
AttrValue *n_attr = ctx.GetAttr("N");
int64_t input_num = n_attr->GetInt();
auto *input = ctx.Input(0);
auto *output = ctx.Output(0);
auto expanded_num_dims = input->GetTensorShape()->GetDims() + 1;
if (axis < 0) axis += expanded_num_dims;
std::vector<int64_t> temp_shape = input->GetTensorShape()->GetDimSizes();
temp_shape.insert(temp_shape.begin() + axis, input_num);
auto *y = reinterpret_cast<T *>(output->GetData());
int64_t x_NumElements = input->GetTensorShape()->NumElements();
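// Two copy strategies follow: for axis == 0 the N inputs are copied back to back; otherwise, for every
// flattened position j, the j-th elements of all N inputs are written consecutively, interleaving them along
// the packed axis.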
if (axis == 0) {
int64_t num = 0;
for (int64_t j = 0; j < input_num; j++) {
auto *input_x = reinterpret_cast<T *>(ctx.Input(j)->GetData());
auto input_numelements = ctx.Input(j)->GetTensorShape()->NumElements();
for (int64_t i = 0; i < input_numelements; i++) {
*(y + num) = *(input_x + i);
num++;
}
}
} else {
int64_t num = 0;
for (int64_t j = 0; j < x_NumElements; j++) {
for (int64_t i = 0; i < input_num; i++) {
auto *input_x = reinterpret_cast<T *>(ctx.Input(i)->GetData());
*(y + num) = *(input_x + j);
num++;
}
}
}
return KERNEL_STATUS_OK;
}
REGISTER_CPU_KERNEL(kPack, PackCpuKernel);
} // namespace aicpu

View File

@ -0,0 +1,37 @@
/**
* Copyright 2022 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef AICPU_KERNELS_NORMALIZED_PACK_H_
#define AICPU_KERNELS_NORMALIZED_PACK_H_
#include "cpu_ops_kernel.h"
namespace aicpu {
class PackCpuKernel : public CpuKernel {
public:
~PackCpuKernel() = default;
protected:
uint32_t Compute(CpuKernelContext &ctx) override;
uint32_t PackCheck(CpuKernelContext &ctx);
template <typename T>
uint32_t PackCompute(CpuKernelContext &ctx);
};
} // namespace aicpu
#endif

View File

@ -0,0 +1,379 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "parameterized_truncated_normal.h"
#include "cpu_kernel_utils.h"
#include "utils/eigen_tensor.h"
#include "utils/kernel_util.h"
#include <Eigen/Dense>
#include <algorithm>
#include <cmath>
#include <iostream>
#include <random>
using namespace std;
namespace {
const uint32_t kOutputNum = 1;
const uint32_t kInputNum = 5;
const char *kParameterizedTruncatedNormal = "ParameterizedTruncatedNormal";
using RNG_Engine = std::mt19937;
static constexpr int kMaxIterations = 1000;
#define BATCH_SIZE_CASE(DTYPE, TYPE, CTX) \
case (DTYPE): { \
batch_size = int64_t(GetBatchSizeCheckDims<TYPE>(CTX)); \
break; \
}
// override functions for half
bool isinf(Eigen::half &data) { return Eigen::half_impl::isinf(data); }
void swap(Eigen::half &data1, Eigen::half &data2) {
Eigen::half tmp = data1;
data1 = data2;
data2 = tmp;
}
Eigen::half exp(Eigen::half &data) { return Eigen::half_impl::exp(data); }
Eigen::half log(Eigen::half &data) { return Eigen::half_impl::log(data); }
} // namespace
namespace aicpu {
template <typename T>
T GetBatchSizeCheckDims(CpuKernelContext &ctx) {
auto output_shape = reinterpret_cast<T *>(ctx.Input(0)->GetData());
for (int i = 1; i < ctx.Input(0)->NumElements(); i++) {
KERNEL_CHECK_FALSE((output_shape[i] >= 0), KERNEL_STATUS_PARAM_INVALID, "The output dimension must be >= 0.")
}
return output_shape[0];
}
template <typename T>
void Generate(int64_t size, T mean, T stddev, T minval, T maxval, T **output_ptr, RNG_Engine &rng) {
auto output = *output_ptr;
std::normal_distribution<double> normal_dist(0, 1);
std::uniform_real_distribution<double> unifrom_dist(0, 1);
// Vectorized intermediate calculations for uniform rejection sampling.
const T stddev_inside_bound = T(1.3);
/**
* If possible, make one-sided bound be the lower bound, or make both
* bounds positive. Otherwise, the bounds are on either side of the
* mean.
*/
if ((isinf(minval) && minval < T(0)) || maxval < mean) {
// Reverse all calculations. norm_min and norm_max will be flipped.
swap(minval, maxval);
stddev = -stddev;
}
auto tmp_num = (stddev == static_cast<T>(0)) ? static_cast<T>(1) : stddev;
// Calculate normalized samples, then convert them.
const T norm_min = (minval - mean) / tmp_num;
const T norm_max = (maxval - mean) / tmp_num;
int sample_num = 0;
// Determine the method to use.
const T sqrt_factor = sqrt((norm_min * norm_min) + T(4));
const T cutoff = T(2) * exp(T(0.5) + (norm_min * (norm_min - sqrt_factor)) / T(4)) / (norm_min + sqrt_factor);
const T diff = norm_max - norm_min;
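// Roughly, when the bounds are not far from the mean, diff < cutoff marks intervals narrow enough for plain
// uniform rejection over [norm_min, norm_max] to stay efficient; wider intervals fall through to the
// exponential-proposal sampler further below.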
if (((norm_min < -stddev_inside_bound) && (norm_max >= T(0.))) ||
((norm_max > stddev_inside_bound) && (norm_min <= T(0.)))) {
/**
* If the bounds are at least 3 standard deviations from the mean
* on at least one side then we rejection sample by sampling
* from the normal distribution and rejecting samples outside
* the bounds.
* Under this condition the acceptance rate per iteration should
* always be ~ 50%. This sampler is more efficient (and more
* numerically stable) when one or both bounds are far from the mean.
*/
while (sample_num < size) {
for (int iter = 0; iter <= kMaxIterations;) {
T normal_sample = T(normal_dist(rng));
if ((normal_sample >= norm_min) && (normal_sample <= norm_max)) {
*output = normal_sample * stddev + mean;
if (stddev <= static_cast<T>(0)) {
*output = static_cast<T>(INFINITY);
} else {
output = output + 1;
}
sample_num++;
break;
} else {
iter++;
if (iter > kMaxIterations) {
/**
* This should never occur because this sampler should
* (by the selection criteria above) be used if at least 3
* standard deviations of one side of the distribution
* is within the limits (so acceptance probability per
* iteration is >~ 1/2).
*/
KERNEL_LOG_ERROR(
"TruncatedNormal randn rejection sampler "
"exceeded maximum iterations");
*output_ptr = output;
return;
}
}
}
}
} else if (diff < cutoff) {
// Sample from a uniform distribution on [norm_min, norm_max].
const T plus_Factor = (norm_min < T(0)) ? T(0) : norm_min * norm_min;
while (sample_num < size) {
for (int iter = 0; iter <= kMaxIterations;) {
T uniform_sample = T(unifrom_dist(rng));
T z = uniform_sample * diff + norm_min;
T g = (plus_Factor - z * z) / T(2.0);
bool accept = T(unifrom_dist(rng)) <= exp(g);
if (accept || iter + 1 >= kMaxIterations) {
if (!accept) {
KERNEL_LOG_ERROR(
"TruncatedNormal uniform rejection sampler "
"exceeded max iterations. Sample may contain outliers.");
*output_ptr = output;
return;
}
*output = z * stddev + mean;
if (stddev <= static_cast<T>(0)) {
*output = static_cast<T>(INFINITY);
} else {
output = output + 1;
}
sample_num++;
break;
} else {
iter++;
}
}
}
} else {
/**
* Sample from an exponential distribution with alpha maximizing
* acceptance probability, offset by norm_min from the origin.
* Accept only if less than norm_max.
*/
const T alpha = (norm_min + sqrt((norm_min * norm_min) + T(4))) / T(2);
while (sample_num < size) {
for (int iter = 0; iter <= kMaxIterations;) {
T uniform_sample = T(unifrom_dist(rng));
T z = -log(uniform_sample) / alpha + norm_min;
const T x = norm_min < alpha ? alpha - z : norm_min - alpha;
const T g = exp(-x * x / T(2.0));
const T u = T(unifrom_dist(rng));
bool accept = (u <= g && z < norm_max);
if (accept || iter + 1 >= kMaxIterations) {
if (!accept) {
KERNEL_LOG_ERROR(
"TruncatedNormal exponential distribution "
"rejection sampler exceeds max iterations. "
"Sample may contain outliers.");
*output_ptr = output;
return;
}
*output = z * stddev + mean;
output = output + 1;
sample_num++;
break;
} else {
iter++;
}
}
}
}
*output_ptr = output;
return;
}
template <typename T_shape, typename T_val>
uint32_t BatchGenerate(CpuKernelContext &ctx) {
Tensor *input_0 = ctx.Input(0);
auto output_shape = reinterpret_cast<T_shape *>(input_0->GetData());
// check shape
auto batch_size = output_shape[0];
int sample_size = 1;
for (int i = 1; i < ctx.Input(0)->NumElements(); i++) {
sample_size *= output_shape[i];
}
Tensor *input_3 = ctx.Input(3);
Tensor *input_4 = ctx.Input(4);
Tensor *input_1 = ctx.Input(1);
Tensor *input_2 = ctx.Input(2);
Tensor *output = ctx.Output(0);
auto output_data = reinterpret_cast<T_val *>(output->GetData());
auto means = reinterpret_cast<T_val *>(input_1->GetData());
auto stdevs = reinterpret_cast<T_val *>(input_2->GetData());
auto minvals = reinterpret_cast<T_val *>(input_3->GetData());
auto maxvals = reinterpret_cast<T_val *>(input_4->GetData());
// setup seed
int64_t final_seed = 0;
auto attr_seed = ctx.GetAttr("seed");
if (attr_seed != nullptr) {
final_seed = attr_seed->GetInt();
}
if (final_seed == 0) {
auto attr_seed2 = ctx.GetAttr("seed2");
if (attr_seed2 != nullptr) {
final_seed = attr_seed2->GetInt();
}
}
// setup random engine
std::random_device r;
RNG_Engine rng;
final_seed = final_seed ? final_seed : r();
rng.seed(final_seed);
vector<T_val *> params = {means, stdevs, minvals, maxvals};
vector<int> params_idx;
if (input_1->NumElements() > 1) {
params_idx.push_back(0);
}
if (input_2->NumElements() > 1) {
params_idx.push_back(1);
}
if (input_3->NumElements() > 1) {
params_idx.push_back(2);
}
if (input_4->NumElements() > 1) {
params_idx.push_back(3);
}
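// Parameters with more than one element are per-batch, so their pointers advance after every batch below;
// scalar parameters stay fixed and are broadcast across all batches.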
for (int batch = 0; batch < batch_size; batch++) {
auto maxval = *params[3];
auto minval = *params[2];
KERNEL_CHECK_FALSE((maxval > minval), KERNEL_STATUS_PARAM_INVALID,
"Max value must be greater than min value in each batch")
Generate<T_val>(int64_t(sample_size), *params[0], *params[1], minval, maxval, &output_data, rng);
for (auto i : params_idx) {
params[i] = params[i] + 1;
}
}
return KERNEL_STATUS_OK;
}
uint32_t ParameterizedTruncatedNormalCpuKernel::ParameterizedTruncatedNormalCheck(CpuKernelContext &ctx) {
DataType val_datatype = ctx.Input(1)->GetDataType();
DataType shape_datatype = ctx.Input(0)->GetDataType();
for (uint32_t i = 0; i < kInputNum; i++) {
Tensor *input = ctx.Input(i);
// check input datatype
DataType input_datatype = input->GetDataType();
switch (i) {
case 0:
KERNEL_CHECK_FALSE((input_datatype == DT_INT32 || input_datatype == DT_INT64), KERNEL_STATUS_PARAM_INVALID,
"Input[0] data type must be DT_INT32 or DT_INT64,"
"but got data type[%s].",
DTypeStr(input_datatype).c_str());
break;
case 1:
KERNEL_CHECK_FALSE((input_datatype == DT_FLOAT16 || input_datatype == DT_FLOAT || input_datatype == DT_DOUBLE),
KERNEL_STATUS_PARAM_INVALID,
"Input[1] data type must be DT_FLOAT16, DT_FLOAT or DT_DOUBLE,"
"but got data type[%s].",
DTypeStr(input_datatype).c_str());
break;
default:
KERNEL_CHECK_FALSE((input_datatype == val_datatype), KERNEL_STATUS_PARAM_INVALID,
"The data type of input[%u] [%s] must be the same as input[1] [%s].", i,
DTypeStr(input_datatype).c_str(), DTypeStr(val_datatype).c_str())
}
// check input dimension
auto input_dims = input->GetTensorShape()->GetDims();
int64_t batch_size = 0;
switch (shape_datatype) {
BATCH_SIZE_CASE(DT_INT32, int32_t, ctx)
BATCH_SIZE_CASE(DT_INT64, int64_t, ctx)
default:
KERNEL_LOG_ERROR("Input[0] data type [%u] not supported.", shape_datatype);
return KERNEL_STATUS_PARAM_INVALID;
}
KERNEL_CHECK_FALSE((batch_size >= 0), KERNEL_STATUS_PARAM_INVALID, "The batch size must be >= 0.")
switch (i) {
case 0:
KERNEL_CHECK_FALSE((input_dims == 1), KERNEL_STATUS_PARAM_INVALID,
"Input[0] should be rank 1, but got rank [%d].", input_dims);
break;
default:
KERNEL_CHECK_FALSE((input_dims <= 1), KERNEL_STATUS_PARAM_INVALID,
"Input[%u] should be at most rank 1, but got rank [%d].", i, input_dims);
if (input_dims == 1) {
auto num_of_elems = input->NumElements();
KERNEL_CHECK_FALSE((num_of_elems == 1 || num_of_elems == batch_size), KERNEL_STATUS_PARAM_INVALID,
"Input[%u] length should be 1 or equal to the "
"batch size, got %d.",
i, num_of_elems);
}
}
}
return KERNEL_STATUS_OK;
}
void ParameterizedTruncatedNormalCpuKernel::SetMap() {
calls_[DT_INT32][DT_FLOAT16] = BatchGenerate<int32_t, Eigen::half>;
calls_[DT_INT32][DT_FLOAT] = BatchGenerate<int32_t, float>;
calls_[DT_INT32][DT_DOUBLE] = BatchGenerate<int32_t, double>;
calls_[DT_INT64][DT_FLOAT16] = BatchGenerate<int64_t, Eigen::half>;
calls_[DT_INT64][DT_FLOAT] = BatchGenerate<int64_t, float>;
calls_[DT_INT64][DT_DOUBLE] = BatchGenerate<int64_t, double>;
}
uint32_t ParameterizedTruncatedNormalCpuKernel::Compute(CpuKernelContext &ctx) {
// check params
KERNEL_HANDLE_ERROR(NormalCheck(ctx, kInputNum, kOutputNum),
"ParameterizedTruncatedNormal check input and output number failed.");
KERNEL_HANDLE_ERROR(ParameterizedTruncatedNormalCheck(ctx), "ParameterizedTruncatedNormal check params failed.");
DataType val_datatype = ctx.Input(1)->GetDataType();
DataType shape_datatype = ctx.Input(0)->GetDataType();
SetMap();
calls_[shape_datatype][val_datatype](ctx);
calls_.clear();
return KERNEL_STATUS_OK;
}
REGISTER_CPU_KERNEL(kParameterizedTruncatedNormal, ParameterizedTruncatedNormalCpuKernel);
} // namespace aicpu

View File

@ -0,0 +1,38 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef AICPU_KERNELS_NORMALIZED_PARAMETERIZEDTRUNCATEDNORMAL_H_
#define AICPU_KERNELS_NORMALIZED_PARAMETERIZEDTRUNCATEDNORMAL_H_
#include "cpu_ops_kernel.h"
namespace aicpu {
class ParameterizedTruncatedNormalCpuKernel : public CpuKernel {
public:
ParameterizedTruncatedNormalCpuKernel() = default;
~ParameterizedTruncatedNormalCpuKernel() override = default;
protected:
uint32_t Compute(CpuKernelContext &ctx) override;
private:
static uint32_t ParameterizedTruncatedNormalCheck(CpuKernelContext &ctx);
// use map for 2 template parameter functions
void SetMap();
std::map<int, std::map<int, std::function<void(CpuKernelContext &)>>> calls_;
};
} // namespace aicpu
#endif // AICPU_KERNELS_NORMALIZED_PARAMETERIZEDTRUNCATEDNORMAL_H_

View File

@ -0,0 +1,185 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "pdist_grad.h"
#include <algorithm>
#include <math.h>
#include "cpu_kernel_utils.h"
#include "cpu_types.h"
#include "utils/eigen_tensor.h"
#include "utils/kernel_util.h"
#include "kernel_log.h"
#include "status.h"
namespace {
const char *kPdistGrad = "PdistGrad";
const uint32_t kOutputNum = 1;
const uint32_t kInputNum = 3;
constexpr int64_t kParallelDataNums = 16 * 1024;
constexpr int64_t kParallelDataNumsMid = 7 * 1024;
#define SWITCH_PARALLEL(SHARD, end_num, divisor) \
if (end_num >= (kParallelDataNumsMid / divisor)) { \
uint32_t min_core_num = 1; \
int64_t max_core_num = std::max(min_core_num, aicpu::CpuKernelUtils::GetCPUNum(ctx) - 2); \
if (end_num < (kParallelDataNums / divisor)) { \
max_core_num = std::min(max_core_num, 4L); \
} \
if (max_core_num > end_num) { \
max_core_num = end_num; \
} \
KERNEL_HANDLE_ERROR(CpuKernelUtils::ParallelFor(ctx, end_num, end_num / max_core_num, SHARD), \
"PdistGrad #SHARD Compute failed."); \
} else { \
SHARD(0, end_num); \
}
} // namespace
namespace aicpu {
template <typename T>
struct Grad {
static inline T abs(T x) { return static_cast<T>(std::abs(*((float *)&x))); }
static inline T pow(T x, float p) { return static_cast<T>(std::pow(*((float *)&x), p)); }
static inline T sign(T x) { return x > T{0.0f} ? T{1.0f} : T{-1.0f}; }
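// Each nested struct specializes the backward value sign(diff) * |diff|^(p-1) * grad / dist^(p-1) of the
// pairwise p-norm distance: o_grad handles p == 1, t_grad p == 2, p_grad a general p, and i_grad p == inf
// (only coordinates where |diff| equals the distance contribute). PdistGradComputeKernel selects one based on p.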
struct o_grad {
static inline T backward(T diff, T grad, T dist, float p) { return diff > T{0.0f} ? grad : -grad; }
};
struct t_grad {
static inline float backward(float diff, float grad, float dist, float p) {
return dist == 0.0f ? 0.0f : grad * diff / dist;
}
static inline Eigen::half backward(Eigen::half diff, Eigen::half grad, Eigen::half dist, float p) {
return dist == Eigen::half{0.0f} ? Eigen::half{0.0f}
: sign(diff) * pow(abs(diff), p - 1) * grad / pow(dist, p - 1);
}
};
struct p_grad {
static inline T backward(T diff, T grad, T dist, float p) {
return dist == T{0.0f} ? T{0.0f} : sign(diff) * pow(abs(diff), p - 1) * grad / pow(dist, p - 1);
}
};
struct i_grad {
static inline T backward(T diff, T grad, T dist, float p) {
return (diff == dist || -diff == dist) ? sign(diff) * grad : T{0.0f};
}
};
template <typename S>
static uint32_t ParallelForPdistGrad(T *grad, T *x, T *dist, T *y, float p, CpuKernelContext &ctx) {
int64_t data_num = ctx.Input(1)->NumElements();
int64_t n = ctx.Input(1)->GetTensorShape()->GetDimSize(0);
int64_t m = ctx.Input(1)->GetTensorShape()->GetDimSize(1);
auto shard_pdistgrad = [&](int64_t start, int64_t end) {
int64_t index;
for (int64_t col = start; col < end; col++) {
index = 0;
for (int64_t i = col; i < data_num; i += m) {
for (int64_t j = i + m; j < data_num; j += m) {
T diff = x[i] - x[j];
if (diff == T{0.0f}) {
index++;
continue;
}
T result = S::backward(diff, grad[index], dist[index], p);
*(y + i) += result;
*(y + j) -= result;
index++;
}
}
}
};
SWITCH_PARALLEL(shard_pdistgrad, m, n);
return KERNEL_STATUS_OK;
}
static inline uint32_t PdistGradComputeKernel(T *grad, T *x, T *dist, T *y, float p, CpuKernelContext &ctx) {
int64_t data_num = ctx.Input(1)->NumElements();
T zero = T{0};
auto shard_fill = [&](int64_t start, int64_t end) { std::fill(y + start, y + end, zero); };
SWITCH_PARALLEL(shard_fill, data_num, 1);
if (p == 0.0) {
return KERNEL_STATUS_OK;
} else if (p == 1.0) {
return ParallelForPdistGrad<o_grad>(grad, x, dist, y, p, ctx);
} else if (p == 2.0) {
return ParallelForPdistGrad<t_grad>(grad, x, dist, y, p, ctx);
} else if (std::isinf(p)) {
return ParallelForPdistGrad<i_grad>(grad, x, dist, y, p, ctx);
} else {
return ParallelForPdistGrad<p_grad>(grad, x, dist, y, p, ctx);
}
}
}; // Grad
uint32_t PdistGradCpuKernel::Compute(CpuKernelContext &ctx) {
KERNEL_HANDLE_ERROR(NormalCheck(ctx, kInputNum, kOutputNum), "PdistGrad check input and output number failed.");
DataType input_type = ctx.Input(1)->GetDataType();
DataType output_type = ctx.Output(0)->GetDataType();
KERNEL_CHECK_FALSE((input_type == output_type), KERNEL_STATUS_PARAM_INVALID,
"Input data type[%s] is not equal to output data type[%s].", DTypeStr(input_type).c_str(),
DTypeStr(output_type).c_str());
uint64_t input_size = ctx.Input(1)->GetDataSize();
uint64_t output_size = ctx.Output(0)->GetDataSize();
KERNEL_CHECK_FALSE((input_size == output_size), KERNEL_STATUS_PARAM_INVALID,
"Input data size[%llu] is not equal to output data size[%llu].", input_size, output_size);
switch (input_type) {
case DT_FLOAT16:
return PdistGradCompute<Eigen::half>(ctx);
case DT_FLOAT:
return PdistGradCompute<float>(ctx);
default:
KERNEL_LOG_ERROR("PdistGrad kernel data type [%s] not supported.", DTypeStr(input_type).c_str());
return KERNEL_STATUS_PARAM_INVALID;
}
}
template <typename T>
uint32_t PdistGradCpuKernel::PdistGradCompute(CpuKernelContext &ctx) {
Tensor *grad_tensor = ctx.Input(0);
Tensor *x_tensor = ctx.Input(1);
Tensor *pdist_tensor = ctx.Input(2);
Tensor *y_tensor = ctx.Output(0);
T *grad = reinterpret_cast<T *>(grad_tensor->GetData());
T *x = reinterpret_cast<T *>(x_tensor->GetData());
T *pdist = reinterpret_cast<T *>(pdist_tensor->GetData());
T *y = reinterpret_cast<T *>(y_tensor->GetData());
float p = 2.0;
AttrValue *p_attr = ctx.GetAttr("p");
if (p_attr != nullptr) {
p = p_attr->GetFloat();
}
KERNEL_CHECK_FALSE((p >= 0), KERNEL_STATUS_PARAM_INVALID, "Attr[p] data cannot be less than 0.");
uint32_t ret = Grad<T>::PdistGradComputeKernel(grad, x, pdist, y, p, ctx);
if (ret != KERNEL_STATUS_OK) {
return ret;
}
return KERNEL_STATUS_OK;
}
REGISTER_CPU_KERNEL(kPdistGrad, PdistGradCpuKernel);
} // namespace aicpu

View File

@ -0,0 +1,35 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef AICPU_KERNELS_NORMALIZED_PDIST_GRAD_H_
#define AICPU_KERNELS_NORMALIZED_PDIST_GRAD_H_
#include "cpu_ops_kernel.h"
#include "cpu_types.h"
namespace aicpu {
class PdistGradCpuKernel : public CpuKernel {
public:
PdistGradCpuKernel() = default;
~PdistGradCpuKernel() = default;
uint32_t Compute(CpuKernelContext &ctx) override;
private:
template <typename T>
uint32_t PdistGradCompute(CpuKernelContext &ctx);
};
} // namespace aicpu
#endif

View File

@ -0,0 +1,82 @@
/**
* Copyright (c) Huawei Technologies Co., Ltd. 2022-2022. All right reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "polar.h"
#include "complex"
#include "cpu_kernel_utils.h"
#include "iostream"
#include "utils/kernel_util.h"
namespace {
const uint32_t kInputNum = 2;
const uint32_t kOutputNum = 1;
const char *kPolar = "Polar";
const int64_t kParallelDataNumMid = 35 * 1024;
const int64_t kParallelDataNum = 7 * 1024;
} // namespace
namespace aicpu {
uint32_t PolarCpuKernel::Compute(CpuKernelContext &ctx) {
DataType abs_type = ctx.Input(0)->GetDataType();
DataType angle_type = ctx.Input(1)->GetDataType();
KERNEL_HANDLE_ERROR(NormalCheck(ctx, kInputNum, kOutputNum), "Polar check input and output number failed.");
if (abs_type == DT_FLOAT && angle_type == DT_FLOAT) {
return PolarCompute<float>(ctx);
} else if (abs_type == DT_DOUBLE && angle_type == DT_DOUBLE) {
return PolarCompute<double>(ctx);
} else {
KERNEL_LOG_ERROR("Polar kernel data types [%s], [%s] not supported.", DTypeStr(abs_type).c_str(),
DTypeStr(angle_type).c_str());
return KERNEL_STATUS_PARAM_INVALID;
}
return KERNEL_STATUS_OK;
}
template <typename T>
uint32_t PolarCpuKernel::PolarCompute(CpuKernelContext &ctx) {
auto abs = reinterpret_cast<T *>(ctx.Input(0)->GetData());
auto angle = reinterpret_cast<T *>(ctx.Input(1)->GetData());
auto output = reinterpret_cast<std::complex<T> *>(ctx.Output(0)->GetData());
auto input_shape = ctx.Input(0)->GetTensorShape();
int64_t elements = input_shape->NumElements();
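// Polar-to-Cartesian conversion: each output element is the complex number abs * (cos(angle) + i * sin(angle)),
// i.e. real part abs * cos(angle) and imaginary part abs * sin(angle), as computed in the lambda below.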
auto sharder_polar = [&](int64_t start, int64_t end) {
for (int64_t i = start; i < end; i++) {
output[i].real(abs[i] * cos(angle[i]));
output[i].imag(abs[i] * sin(angle[i]));
}
};
if (elements > kParallelDataNum) {
uint32_t min_core_num = 1;
int64_t max_core_num = std::max(min_core_num, aicpu::CpuKernelUtils::GetCPUNum(ctx) - kResvCpuNum);
if (elements <= kParallelDataNumMid) {
max_core_num = std::min(max_core_num, static_cast<int64_t>(4)); // up to 4 cpu cores
}
if (max_core_num > elements) {
max_core_num = elements;
}
if (max_core_num > 0) {
KERNEL_HANDLE_ERROR(CpuKernelUtils::ParallelFor(ctx, elements, elements / max_core_num, sharder_polar),
"Polar Compute failed.");
}
} else {
sharder_polar(0, elements);
}
return KERNEL_STATUS_OK;
}
REGISTER_CPU_KERNEL(kPolar, PolarCpuKernel);
} // namespace aicpu

View File

@ -0,0 +1,40 @@
/**
* Copyright (c) Huawei Technologies Co., Ltd. 2022. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*!
* \file polar.h
* \brief
*/
#ifndef AICPU_KERNELS_NORMALIZED_POLAR_H_
#define AICPU_KERNELS_NORMALIZED_POLAR_H_
#include "cpu_ops_kernel.h"
namespace aicpu {
class PolarCpuKernel : public CpuKernel {
public:
PolarCpuKernel() = default;
~PolarCpuKernel() override = default;
protected:
uint32_t Compute(CpuKernelContext &ctx) override;
private:
template <typename T>
static uint32_t PolarCompute(CpuKernelContext &ctx);
};
} // namespace aicpu
#endif

View File

@ -0,0 +1,203 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "ragged_range.h"
#include <vector>
#include <cmath>
#include <type_traits>
#include "cpu_kernel_utils.h"
#include "utils/kernel_util.h"
#include "utils/eigen_tensor.h"
#include "kernel_log.h"
#include "status.h"
using namespace std;
namespace {
const uint32_t kOutputNum = 2;
const uint32_t kInputNum = 3;
const char *kRaggedRange = "RaggedRange";
constexpr int64_t kParallelDataNums = 16 * 1024;
#define RAGGEDRANGE_COMPUTE_CASE(DTYPE, TYPE, TSPLITS, NROWS, STARTS, LIMITS, DELTAS, BROADCAST_START, \
BROADCAST_LIMITS, BROADCAST_DELTAS, RT_NESTED_SPLITS, RT_DENSE_VALUE, CTX) \
case (DTYPE): { \
uint32_t result = \
RaggedRangeCompute<TYPE, TSPLITS>(NROWS, STARTS, LIMITS, DELTAS, BROADCAST_START, BROADCAST_LIMITS, \
BROADCAST_DELTAS, RT_NESTED_SPLITS, RT_DENSE_VALUE, CTX); \
if (result != KERNEL_STATUS_OK) { \
KERNEL_LOG_ERROR("RaggedRange kernel compute failed."); \
return result; \
} \
break; \
}
} // namespace
namespace aicpu {
uint32_t RaggedRange::Compute(CpuKernelContext &ctx) {
KERNEL_HANDLE_ERROR(NormalCheck(ctx, kInputNum, kOutputNum), "RaggedRange check params failed.");
Tensor *starts = ctx.Input(0);
auto starts_shape = starts->GetTensorShape();
int32_t starts_dim = starts_shape->GetDims();
Tensor *limits = ctx.Input(1);
auto limits_shape = limits->GetTensorShape();
int32_t limits_dim = limits_shape->GetDims();
Tensor *deltas = ctx.Input(2);
auto deltas_shape = deltas->GetTensorShape();
int32_t deltas_dim = deltas_shape->GetDims();
KERNEL_CHECK_FALSE((starts_dim <= 1), KERNEL_STATUS_PARAM_INVALID, "starts must be a scalar or vector.");
KERNEL_CHECK_FALSE((limits_dim <= 1), KERNEL_STATUS_PARAM_INVALID, "limits must be a scalar or vector.");
KERNEL_CHECK_FALSE((deltas_dim <= 1), KERNEL_STATUS_PARAM_INVALID, "deltas must be a scalar or vector.");
bool broadcast_starts = starts_dim == 0;
bool broadcast_limits = limits_dim == 0;
bool broadcast_deltas = deltas_dim == 0;
vector<int> in_sizes;
if (!broadcast_starts) in_sizes.push_back(starts_shape->GetDimSize(0));
if (!broadcast_limits) in_sizes.push_back(limits_shape->GetDimSize(0));
if (!broadcast_deltas) in_sizes.push_back(deltas_shape->GetDimSize(0));
for (uint32_t i = 1; i < in_sizes.size(); ++i) {
KERNEL_CHECK_FALSE((in_sizes[i] == in_sizes[i - 1]), KERNEL_STATUS_PARAM_INVALID,
"starts, limits, and deltas must have the same shape.");
}
uint32_t nrows = in_sizes.empty() ? 1 : in_sizes[0];
AttrValue *attr = ctx.GetAttr("Tsplits");
KERNEL_CHECK_NULLPTR(attr, KERNEL_STATUS_PARAM_INVALID, "Get attr[Tsplits] failed.");
DataType Tsplits = attr->GetDataType();
KERNEL_CHECK_FALSE((Tsplits == DT_INT32 || Tsplits == DT_INT64), KERNEL_STATUS_PARAM_INVALID,
"The attr Tsplits must be int32 or int64.");
Tensor *rt_nested_splits = ctx.Output(0);
Tensor *rt_dense_values = ctx.Output(1);
auto starts_type = starts->GetDataType();
auto limits_type = limits->GetDataType();
auto deltas_type = deltas->GetDataType();
KERNEL_CHECK_FALSE((starts_type == limits_type && limits_type == deltas_type), KERNEL_STATUS_PARAM_INVALID,
"starts, limits and deltas must have the same type.");
if (Tsplits == DT_INT32) {
switch (starts_type) {
RAGGEDRANGE_COMPUTE_CASE(DT_FLOAT, float, int32_t, nrows, starts, limits, deltas, broadcast_starts,
broadcast_limits, broadcast_deltas, rt_nested_splits, rt_dense_values, ctx)
RAGGEDRANGE_COMPUTE_CASE(DT_DOUBLE, double, int32_t, nrows, starts, limits, deltas, broadcast_starts,
broadcast_limits, broadcast_deltas, rt_nested_splits, rt_dense_values, ctx)
RAGGEDRANGE_COMPUTE_CASE(DT_INT32, int32_t, int32_t, nrows, starts, limits, deltas, broadcast_starts,
broadcast_limits, broadcast_deltas, rt_nested_splits, rt_dense_values, ctx)
RAGGEDRANGE_COMPUTE_CASE(DT_INT64, int64_t, int32_t, nrows, starts, limits, deltas, broadcast_starts,
broadcast_limits, broadcast_deltas, rt_nested_splits, rt_dense_values, ctx)
default:
KERNEL_LOG_ERROR("[%s] Data type of input is not supported, input data type is [%s].", ctx.GetOpType().c_str(),
DTypeStr(starts_type).c_str());
return KERNEL_STATUS_PARAM_INVALID;
}
} else {
switch (starts_type) {
RAGGEDRANGE_COMPUTE_CASE(DT_FLOAT, float, int64_t, nrows, starts, limits, deltas, broadcast_starts,
broadcast_limits, broadcast_deltas, rt_nested_splits, rt_dense_values, ctx)
RAGGEDRANGE_COMPUTE_CASE(DT_DOUBLE, double, int64_t, nrows, starts, limits, deltas, broadcast_starts,
broadcast_limits, broadcast_deltas, rt_nested_splits, rt_dense_values, ctx)
RAGGEDRANGE_COMPUTE_CASE(DT_INT32, int32_t, int64_t, nrows, starts, limits, deltas, broadcast_starts,
broadcast_limits, broadcast_deltas, rt_nested_splits, rt_dense_values, ctx)
RAGGEDRANGE_COMPUTE_CASE(DT_INT64, int64_t, int64_t, nrows, starts, limits, deltas, broadcast_starts,
broadcast_limits, broadcast_deltas, rt_nested_splits, rt_dense_values, ctx)
default:
KERNEL_LOG_ERROR("[%s] Data type of input is not supported, input data type is [%s].", ctx.GetOpType().c_str(),
DTypeStr(starts_type).c_str());
return KERNEL_STATUS_PARAM_INVALID;
}
}
return KERNEL_STATUS_OK;
}
template <typename T, typename TSPLITS>
uint32_t RaggedRange::RaggedRangeCompute(const uint32_t nrows, Tensor *starts, Tensor *limits, Tensor *deltas,
bool broadcast_starts, bool broadcast_limits, bool broadcast_deltas,
Tensor *rt_nested_splits, Tensor *rt_dense_values, CpuKernelContext &ctx) {
T *starts_addr = reinterpret_cast<T *>(starts->GetData());
T *limits_addr = reinterpret_cast<T *>(limits->GetData());
T *deltas_addr = reinterpret_cast<T *>(deltas->GetData());
TSPLITS *rt_nested_splits_addr = reinterpret_cast<TSPLITS *>(rt_nested_splits->GetData());
rt_nested_splits_addr[0] = 0;
for (uint32_t row = 0; row < nrows; ++row) {
T start = broadcast_starts ? starts_addr[0] : starts_addr[row];
T limit = broadcast_limits ? limits_addr[0] : limits_addr[row];
T delta = broadcast_deltas ? deltas_addr[0] : deltas_addr[row];
KERNEL_CHECK_FALSE((delta != 0), KERNEL_STATUS_PARAM_INVALID, "Requires delta != 0.");
rt_nested_splits_addr[row + 1] = rt_nested_splits_addr[row] + RangeSize<T, TSPLITS>(start, limit, delta);
}
T *rt_dense_values_addr = reinterpret_cast<T *>(rt_dense_values->GetData());
if (nrows <= kParallelDataNums) {
int value_index = 0;
for (uint32_t row = 0; row < nrows; ++row) {
TSPLITS row_size = rt_nested_splits_addr[row + 1] - rt_nested_splits_addr[row];
T value = broadcast_starts ? starts_addr[0] : starts_addr[row];
T delta = broadcast_deltas ? deltas_addr[0] : deltas_addr[row];
for (TSPLITS i = 0; i < row_size; ++i) {
rt_dense_values_addr[value_index++] = value;
value += delta;
}
}
} else {
uint32_t min_core_num = 1;
uint32_t max_core_num = std::max(min_core_num, aicpu::CpuKernelUtils::GetCPUNum(ctx));
if (max_core_num > nrows) {
max_core_num = nrows;
}
auto shared_rtvalues = [&](size_t start, size_t end) {
for (size_t row = start; row < end; row++) {
TSPLITS row_size = rt_nested_splits_addr[row + 1] - rt_nested_splits_addr[row];
T value = broadcast_starts ? starts_addr[0] : starts_addr[row];
T delta = broadcast_deltas ? deltas_addr[0] : deltas_addr[row];
TSPLITS y_offset = rt_nested_splits_addr[row];
for (TSPLITS i = 0; i < row_size; ++i) {
rt_dense_values_addr[y_offset++] = value;
value += delta;
}
}
};
uint32_t ret = CpuKernelUtils::ParallelFor(ctx, nrows, nrows / max_core_num, shared_rtvalues);
if (ret != KERNEL_STATUS_OK) {
KERNEL_LOG_ERROR("CpuKernelUtils::ParallelFor failed.");
return KERNEL_STATUS_INNER_ERROR;
}
}
return KERNEL_STATUS_OK;
}
template <typename T, typename TSPLITS>
TSPLITS RaggedRange::RangeSize(T start, T limit, T delta) {
if (((delta > 0) && (limit < start)) || ((delta < 0) && (limit > start))) {
return 0;
}
return (std::is_integral<T>::value ? ((std::abs(limit - start) + std::abs(delta) - 1) / std::abs(delta))
: std::ceil(std::abs((limit - start) / delta)));
}
REGISTER_CPU_KERNEL(kRaggedRange, RaggedRange);
} // namespace aicpu

View File

@ -0,0 +1,40 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef AICPU_KERNELS_NORMALIZED_RAGGED_RANGE_H_
#define AICPU_KERNELS_NORMALIZED_RAGGED_RANGE_H_
#include "cpu_ops_kernel.h"
#include "cpu_types.h"
namespace aicpu {
class RaggedRange : public CpuKernel {
public:
RaggedRange() = default;
~RaggedRange() override = default;
uint32_t Compute(CpuKernelContext &ctx) override;
private:
template <typename T, typename TSPLITS>
uint32_t RaggedRangeCompute(const uint32_t nrows, Tensor *starts, Tensor *limits, Tensor *deltas,
bool broadcast_starts, bool broadcast_limits, bool broadcast_deltas,
Tensor *rt_nested_splits, Tensor *rt_dense_values, CpuKernelContext &ctx);
template <typename T, typename TSPLITS>
TSPLITS RangeSize(T start, T limit, T delta);
};
} // namespace aicpu
#endif

View File

@ -0,0 +1,336 @@
/**
* Copyright (c) Huawei Technologies Co., Ltd. 2021-2022. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "ragged_tensor_to_sparse.h"
namespace {
const std::uint32_t kInputNum{aicpu::kDynamicInput};
const std::uint32_t kOutputNum{3u};
const char *kRaggedTensorToSparse = "RaggedTensorToSparse";
} // namespace
namespace aicpu {
uint32_t RaggedTensorToSparseCpuKernel::CheckAndInitParams(CpuKernelContext &ctx) {
n_ = ctx.GetInputsSize() - 1;
KERNEL_CHECK_FALSE((n_ >= 1), KERNEL_STATUS_PARAM_INVALID,
"Input num must be greater than or equal to 1, "
"but got input num [%u].",
n_);
KERNEL_HANDLE_ERROR(NormalCheck(ctx, kInputNum, kOutputNum),
"RaggedTensorToSparse check input and output number failed.");
Tensor *rt_dense_values_ptr = ctx.Input(n_);
KERNEL_CHECK_NULLPTR(rt_dense_values_ptr, KERNEL_STATUS_PARAM_INVALID, "Get input rt_dense_values failed.");
auto rt_dense_values_shape_ptr = rt_dense_values_ptr->GetTensorShape();
KERNEL_CHECK_NULLPTR(rt_dense_values_shape_ptr, KERNEL_STATUS_PARAM_INVALID,
"Get input rt_dense_values shape failed.");
DataType rt_dense_values_data_type = rt_dense_values_ptr->GetDataType();
KERNEL_CHECK_FALSE((rt_dense_values_data_type == DT_INT32 || rt_dense_values_data_type == DT_INT64 ||
rt_dense_values_data_type == DT_BOOL || rt_dense_values_data_type == DT_INT8 ||
rt_dense_values_data_type == DT_UINT8 || rt_dense_values_data_type == DT_INT16 ||
rt_dense_values_data_type == DT_UINT16 || rt_dense_values_data_type == DT_DOUBLE ||
rt_dense_values_data_type == DT_FLOAT || rt_dense_values_data_type == DT_FLOAT16),
KERNEL_STATUS_PARAM_INVALID,
"Input rt_dense_values data type must {DT_BOOL, DT_INT8, "
"DT_UINT8, DT_INT16, DT_UINT16, DT_INT32, DT_INT64, "
"DT_DOUBLE, DT_FLOAT, DT_FLOAT16},"
"but got data type [%s].",
DTypeStr(rt_dense_values_data_type).c_str());
auto rt_dense_values_data_ptr = rt_dense_values_ptr->GetData();
KERNEL_CHECK_NULLPTR(rt_dense_values_data_ptr, KERNEL_STATUS_PARAM_INVALID, "Get input rt_dense_values data failed.");
return KERNEL_STATUS_OK;
}
// Validate `rt_nested_splits`
template <typename T1>
uint32_t RaggedTensorToSparseCpuKernel::ValidateInputs(std::vector<typename TTypes<T1>::Flat> rt_nested_splits,
const Tensor *rt_dense_values_in) {
for (uint32_t i = 0; i < rt_nested_splits.size(); ++i) {
if (rt_nested_splits[i].size() == 0) {
KERNEL_LOG_ERROR("Ragged splits may not be empty.");
return KERNEL_STATUS_PARAM_INVALID;
}
if (rt_nested_splits[i](0) != 0) {
KERNEL_LOG_ERROR("First value of ragged splits must be 0.");
return KERNEL_STATUS_PARAM_INVALID;
}
for (uint32_t j = 1; j < rt_nested_splits[i].size(); ++j) {
if (rt_nested_splits[i](j) < rt_nested_splits[i](j - 1)) {
KERNEL_LOG_ERROR("Ragged splits should be non-decreasing.");
return KERNEL_STATUS_PARAM_INVALID;
}
}
if (i > 0) {
T1 last_split = rt_nested_splits[i - 1](rt_nested_splits[i - 1].size() - 1);
if (rt_nested_splits[i].size() != last_split + 1) {
KERNEL_LOG_ERROR(
"Final value of ragged splits must match the length "
"of the corresponding ragged values.");
return KERNEL_STATUS_PARAM_INVALID;
}
}
}
if (rt_dense_values_in->GetTensorShape()->GetDimSizes()[0] !=
rt_nested_splits.back()(rt_nested_splits.back().size() - 1)) {
KERNEL_LOG_ERROR(
"Final value of ragged splits must match the length "
"of the corresponding ragged values.");
return KERNEL_STATUS_PARAM_INVALID;
}
return KERNEL_STATUS_OK;
}
std::vector<std::vector<int64_t>> RaggedTensorToSparseCpuKernel::MakeIndexSuffixes(const TensorShape &values_shape) {
std::vector<std::vector<int64_t>> suffixes{{}};
for (int32_t dim = 1; dim < values_shape.GetDims(); ++dim) {
std::vector<std::vector<int64_t>> new_suffixes;
for (const auto &suffix : suffixes) {
for (int64_t i = 0; i < values_shape.GetDimSize(dim); ++i) {
new_suffixes.push_back(suffix);
new_suffixes.back().push_back(i);
}
}
suffixes.swap(new_suffixes);
}
return suffixes;
}
template <typename T1>
bool RaggedTensorToSparseCpuKernel::IsCompleted(const std::vector<int64_t> &pos, int dim,
const std::vector<typename TTypes<T1>::Flat> &rt_nested_splits) {
int64_t current_child = pos[dim + 1];
int64_t limit_child = rt_nested_splits[dim](pos[dim] + 1);
return current_child >= limit_child;
}
void RaggedTensorToSparseCpuKernel::input_list(CpuKernelContext &ctx, OpInputList *list) {
static uint32_t start = 0, stop;
if (ctx.Input(0)->NumElements() > 0) {
stop = start + static_cast<uint32_t>(ctx.Input(0)->NumElements());
*list = OpInputList(&ctx, start, stop);
}
}
template <typename T1, typename T2>
uint32_t RaggedTensorToSparseCpuKernel::DoCompute(CpuKernelContext &ctx) {
// Assemble each value in `sparse_indices` using three parts:
// - `index_prefix` is the index in dimensions up through the last ragged
// dimension.
// - `index_middle` is the index in the last ragged dimension.
// - `index_suffix` is the index in the dense value dimensions.
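  // For example, with a single splits tensor [0, 2, 2, 3] and values [1, 2, 3]
  // (rows [1, 2], [], [3]), sparse_indices is [[0, 0], [0, 1], [2, 0]],
  // sparse_values is [1, 2, 3] and sparse_dense_shape is [3, 2].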
OpInputList rt_nested_splits_in;
input_list(ctx, &rt_nested_splits_in);
const int64_t rt_nested_splits_len = n_;
std::vector<typename TTypes<T1>::Flat> rt_nested_splits;
rt_nested_splits.reserve(n_);
for (int i = 0; i < rt_nested_splits_len; ++i) {
    if (rt_nested_splits_in[i]->NumElements() > 0) {
      EigenTensor indicesET(rt_nested_splits_in[i], rt_nested_splits_in[i]->GetData());
      rt_nested_splits.push_back(indicesET.flat<T1>());
    }
}
const Tensor *rt_dense_values_in = ctx.Input(n_);
KERNEL_CHECK_FALSE((ValidateInputs<T1>(rt_nested_splits, rt_dense_values_in) == KERNEL_STATUS_OK),
KERNEL_STATUS_PARAM_INVALID, "ValidateInputs failed.");
KERNEL_CHECK_FALSE((Update<T1>(ctx, rt_nested_splits) == KERNEL_STATUS_OK), KERNEL_STATUS_PARAM_INVALID,
"Update failed.");
OutPutSparseValues<T2>(ctx);
OutPutSparseDenseShape<T1>(ctx, rt_nested_splits_in, rt_nested_splits);
return KERNEL_STATUS_OK;
}
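// Fills the `sparse_indices` output: walks the innermost splits once and, for every ragged value,
// emits one int64 index row of length n_ + rank(values), assembled as
// index_prefix + middle index + index_suffix.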
template <typename T1>
uint32_t RaggedTensorToSparseCpuKernel::Update(CpuKernelContext &ctx,
std::vector<typename TTypes<T1>::Flat> rt_nested_splits) {
const Tensor *rt_dense_values_in = ctx.Input(n_);
const int64_t rt_nested_splits_len = n_;
std::vector<int64_t> index_prefix(n_);
std::vector<std::vector<int64_t>> index_suffixes = MakeIndexSuffixes(*rt_dense_values_in->GetTensorShape());
// Allocate the `sparse_indices` output tensor.
const int64_t nvals = (rt_nested_splits.back()(rt_nested_splits.back().size() - 1) * index_suffixes.size());
const int64_t indices_len = rt_nested_splits_len + rt_dense_values_in->GetTensorShape()->GetDims();
Tensor *sparse_indices = ctx.Output(0);
KERNEL_CHECK_NULLPTR((sparse_indices), KERNEL_STATUS_PARAM_INVALID, "Get sparse_indices failed.");
sparse_indices->SetDataType(DT_INT64);
auto sparse_indices_ptr = reinterpret_cast<int64_t *>(sparse_indices->GetData());
KERNEL_CHECK_NULLPTR(sparse_indices_ptr, KERNEL_STATUS_PARAM_INVALID, "Get sparse_indices data failed.");
KERNEL_CHECK_NULLPTR(sparse_indices, KERNEL_STATUS_PARAM_INVALID, "Create sparse_indices Flat failed.");
// pos[i] is the current position in rt_nested_splits[i]. final_pos is a
// reference to make it easier to refer to pos[-1].
std::vector<int64_t> pos(n_);
int64_t &final_pos = pos[n_ - 1];
// Each iteration through the loop, we increment pos[-1], and add indices
// for all the values corresponding to
// rt_nested_splits[-1][pos[-1]:pos[-1]+1].
int next_index = 0;
int64_t num = 0;
int max_final_pos = rt_nested_splits.back().size() - 1;
for (; final_pos < max_final_pos; ++final_pos) {
// Update `pos` to skip over completed elements (i.e., elements where
// we have already generated indices for all contained values).
for (int dim = n_ - 2; dim >= 0; --dim) {
while (IsCompleted<T1>(pos, dim, rt_nested_splits)) {
pos[dim] += 1;
}
}
// Update index_prefix.
for (size_t dim = 0; dim < index_prefix.size(); ++dim) {
int start = dim > 0 ? rt_nested_splits[dim - 1](pos[dim - 1]) : 0;
index_prefix[dim] = pos[dim] - start;
}
// Get length of the final-ragged-dimension slice.
const auto &final_splits = rt_nested_splits[n_ - 1];
int64_t slice_len = final_splits(final_pos + 1) - final_splits(final_pos);
// Add sparse_indices for this slice.
for (int64_t i = 0; i < slice_len; ++i) {
for (const auto &index_suffix : index_suffixes) {
int dim = 0;
for (int64_t index : index_prefix) { // index_prefix
sparse_indices_ptr[num++] = index;
dim++;
}
dim++;
sparse_indices_ptr[num++] = i;
for (int64_t index : index_suffix) { // index_suffix
sparse_indices_ptr[num++] = index;
dim++;
}
KERNEL_CHECK_FALSE((dim == indices_len), KERNEL_STATUS_PARAM_INVALID,
"dim should be equal to indices_len,but get %d.", dim);
++next_index;
}
}
}
KERNEL_CHECK_FALSE((next_index == nvals), KERNEL_STATUS_PARAM_INVALID,
"next_index should be equal to nvals,but get %d.", next_index);
return KERNEL_STATUS_OK;
}
template <typename T2>
void RaggedTensorToSparseCpuKernel::OutPutSparseValues(CpuKernelContext &ctx) {
// Output the `sparse_values` Tensor.
const Tensor *rt_dense_values_in = ctx.Input(n_);
Tensor *spares_values_out = ctx.Output(1);
  // Rank-1 and higher-rank dense values are copied element-wise in the same way.
  spares_values_out->SetDataType(rt_dense_values_in->GetDataType());
  auto spares_values_out_ptr = reinterpret_cast<T2 *>(spares_values_out->GetData());
  auto rt_dense_values_in_ptr = reinterpret_cast<T2 *>(rt_dense_values_in->GetData());
  for (int64_t i = 0; i < rt_dense_values_in->NumElements(); i++) {
    spares_values_out_ptr[i] = rt_dense_values_in_ptr[i];
  }
}
template <typename T1>
void RaggedTensorToSparseCpuKernel::OutPutSparseDenseShape(CpuKernelContext &ctx, OpInputList rt_nested_splits_in,
std::vector<typename TTypes<T1>::Flat> rt_nested_splits) {
// Output the `sparse_dense_shape` Tensor.
const Tensor *rt_dense_values_in = ctx.Input(n_);
Tensor *sparse_dense_shape_out = ctx.Output(2);
int64_t *sparse_dense_shape = static_cast<int64_t *>(sparse_dense_shape_out->GetData());
sparse_dense_shape[0] = rt_nested_splits_in[0]->GetTensorShape()->GetDimSizes()[0] - 1;
for (int dim = 0; dim < n_; ++dim) {
const auto &splits = rt_nested_splits[dim];
T1 max_width = 0;
for (int i = 1; i < splits.size(); ++i) {
max_width = std::max(max_width, splits(i) - splits(i - 1));
}
sparse_dense_shape[dim + 1] = max_width;
}
for (int dim = 1; dim < rt_dense_values_in->GetTensorShape()->GetDims(); ++dim) {
sparse_dense_shape[dim + n_] = rt_dense_values_in->GetTensorShape()->GetDimSizes()[dim];
}
}
uint32_t RaggedTensorToSparseCpuKernel::Compute(CpuKernelContext &ctx) {
KERNEL_CHECK_FALSE((CheckAndInitParams(ctx) == KERNEL_STATUS_OK), KERNEL_STATUS_PARAM_INVALID,
"CheckAndInitParams failed.");
DataType type1 = ctx.Input(n_)->GetDataType();
DataType SplitType = ctx.Input(0)->GetDataType();
switch (SplitType) {
case DT_INT32:
switch (type1) {
case DT_DOUBLE:
return DoCompute<int32_t, double>(ctx);
case DT_FLOAT16:
return DoCompute<int32_t, Eigen::half>(ctx);
case DT_FLOAT:
return DoCompute<int32_t, float>(ctx);
case DT_INT8:
return DoCompute<int32_t, int8_t>(ctx);
case DT_INT16:
return DoCompute<int32_t, int16_t>(ctx);
case DT_INT32:
return DoCompute<int32_t, int32_t>(ctx);
case DT_INT64:
return DoCompute<int32_t, int64_t>(ctx);
case DT_UINT8:
return DoCompute<int32_t, uint8_t>(ctx);
case DT_UINT16:
return DoCompute<int32_t, uint16_t>(ctx);
case DT_BOOL:
return DoCompute<int32_t, bool>(ctx);
default:
KERNEL_LOG_ERROR("Unsupported datatype [%s]", DTypeStr(type1).c_str());
return KERNEL_STATUS_PARAM_INVALID;
};
break;
case DT_INT64:
switch (type1) {
case DT_DOUBLE:
return DoCompute<int64_t, double>(ctx);
case DT_FLOAT16:
return DoCompute<int64_t, Eigen::half>(ctx);
case DT_FLOAT:
return DoCompute<int64_t, float>(ctx);
case DT_INT8:
return DoCompute<int64_t, int8_t>(ctx);
case DT_INT16:
return DoCompute<int64_t, int16_t>(ctx);
case DT_INT32:
return DoCompute<int64_t, int32_t>(ctx);
case DT_INT64:
return DoCompute<int64_t, int64_t>(ctx);
case DT_UINT8:
return DoCompute<int64_t, uint8_t>(ctx);
case DT_UINT16:
return DoCompute<int64_t, uint16_t>(ctx);
case DT_BOOL:
return DoCompute<int64_t, bool>(ctx);
default:
KERNEL_LOG_ERROR("Unsupported datatype [%s]", DTypeStr(type1).c_str());
return KERNEL_STATUS_PARAM_INVALID;
};
break;
default:
KERNEL_LOG_ERROR("Unsupported datatype [%s]", DTypeStr(SplitType).c_str());
return KERNEL_STATUS_PARAM_INVALID;
}
}
REGISTER_CPU_KERNEL(kRaggedTensorToSparse, RaggedTensorToSparseCpuKernel);
} // namespace aicpu

View File

@ -0,0 +1,87 @@
/**
* Copyright 2022 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef AICPU_KERNELS_NORMALIZED_RAGGED_TENSOR_TO_SPARSE_H_
#define AICPU_KERNELS_NORMALIZED_RAGGED_TENSOR_TO_SPARSE_H_
#include <memory>
#include <vector>
#include "cpu_ops_kernel.h"
#include "cpu_kernel_utils.h"
#include "kernel_log.h"
#include "securec.h"
#include "status.h"
#include "unsupported/Eigen/CXX11/Tensor"
#include "utils/eigen_tensor.h"
#include "utils/kernel_util.h"
namespace aicpu {
class OpInputList {
public:
OpInputList() : ctx_(nullptr), start_(0), stop_(0) {}
OpInputList(CpuKernelContext *ctx, uint32_t start, uint32_t stop) : ctx_(ctx), start_(start), stop_(stop) {}
OpInputList(const OpInputList &) = default;
OpInputList &operator=(const OpInputList &other) = default;
Tensor *operator[](uint32_t i) const { return ctx_->Input(start_ + i); }
uint32_t size() const { return stop_ - start_; }
private:
CpuKernelContext *ctx_; // not owned
uint32_t start_;
uint32_t stop_;
};
class RaggedTensorToSparseCpuKernel : public CpuKernel {
public:
RaggedTensorToSparseCpuKernel() : type1(DT_DOUBLE), n_(1) {}
~RaggedTensorToSparseCpuKernel() = default;
protected:
uint32_t Compute(CpuKernelContext &ctx) override;
private:
uint32_t CheckAndInitParams(CpuKernelContext &ctx);
template <typename T1>
uint32_t ValidateInputs(std::vector<typename TTypes<T1>::Flat> rt_nested_splits, const Tensor *rt_dense_values_in);
std::vector<std::vector<int64_t>> MakeIndexSuffixes(const TensorShape &values_shape);
template <typename T1>
bool IsCompleted(const std::vector<int64_t> &pos, int dim,
const std::vector<typename TTypes<T1>::Flat> &rt_nested_splits);
void input_list(CpuKernelContext &ctx, OpInputList *list);
template <typename T1, typename T2>
uint32_t DoCompute(CpuKernelContext &ctx);
template <typename T1>
uint32_t Update(CpuKernelContext &ctx, std::vector<typename TTypes<T1>::Flat> rt_nested_splits);
template <typename T2>
void OutPutSparseValues(CpuKernelContext &ctx);
template <typename T1>
void OutPutSparseDenseShape(CpuKernelContext &ctx, OpInputList rt_nested_splits_in,
std::vector<typename TTypes<T1>::Flat> rt_nested_splits);
private:
DataType type1;
int64_t n_;
};
} // namespace aicpu
#endif

View File

@ -0,0 +1,617 @@
/**
* Copyright (c) Huawei Technologies Co., Ltd. 2021-2022. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "ragged_tensor_to_tensor.h"
namespace {
constexpr uint32_t kInputNum = 4;
constexpr uint32_t kOutputNum = 1;
const char *kRaggedTensorToTensor = "RaggedTensorToTensor";
} // namespace
namespace aicpu {
uint32_t RaggedTensorToTensorCpuKernel::Compute(CpuKernelContext &ctx) {
KERNEL_HANDLE_ERROR(NormalCheck(ctx, kInputNum, kOutputNum),
"RaggedTensorToTensor check input and output number failed.");
DataType type1 = ctx.Input(1)->GetDataType();
DataType SplitType = ctx.Input(0)->GetDataType();
switch (SplitType) {
case DT_INT32:
switch (type1) {
case DT_DOUBLE:
return DoCompute<int32_t, double>(ctx);
case DT_FLOAT16:
return DoCompute<int32_t, Eigen::half>(ctx);
case DT_FLOAT:
return DoCompute<int32_t, float>(ctx);
case DT_INT8:
return DoCompute<int32_t, int8_t>(ctx);
case DT_INT16:
return DoCompute<int32_t, int16_t>(ctx);
case DT_INT32:
return DoCompute<int32_t, int32_t>(ctx);
case DT_INT64:
return DoCompute<int32_t, int64_t>(ctx);
case DT_UINT8:
return DoCompute<int32_t, uint8_t>(ctx);
case DT_UINT16:
return DoCompute<int32_t, uint16_t>(ctx);
case DT_BOOL:
return DoCompute<int32_t, bool>(ctx);
default: {
KERNEL_LOG_ERROR("Unsupported datatype [%s]", DTypeStr(type1).c_str());
return KERNEL_STATUS_PARAM_INVALID;
}
};
break;
case DT_INT64:
switch (type1) {
case DT_DOUBLE:
return DoCompute<int64_t, double>(ctx);
case DT_FLOAT16:
return DoCompute<int64_t, Eigen::half>(ctx);
case DT_FLOAT:
return DoCompute<int64_t, float>(ctx);
case DT_INT8:
return DoCompute<int64_t, int8_t>(ctx);
case DT_INT16:
return DoCompute<int64_t, int16_t>(ctx);
case DT_INT32:
return DoCompute<int64_t, int32_t>(ctx);
case DT_INT64:
return DoCompute<int64_t, int64_t>(ctx);
case DT_UINT8:
return DoCompute<int64_t, uint8_t>(ctx);
case DT_UINT16:
return DoCompute<int64_t, uint16_t>(ctx);
case DT_BOOL:
return DoCompute<int64_t, bool>(ctx);
default: {
KERNEL_LOG_ERROR("Unsupported datatype [%s]", DTypeStr(type1).c_str());
return KERNEL_STATUS_PARAM_INVALID;
}
};
break;
default: {
KERNEL_LOG_ERROR("Unsupported datatype [%s]", DTypeStr(SplitType).c_str());
return KERNEL_STATUS_PARAM_INVALID;
}
}
}
graphStatus RaggedTensorToTensorCpuKernel::GetRowPartitionTypes(CpuKernelContext &ctx) {
std::vector<std::string> partition_types;
AttrValue *row_part = ctx.GetAttr("row_partition_types");
int64_t N = ctx.Input(0)->GetTensorShape()->GetDims();
row_partition_types_.reserve(N);
partition_types.reserve(N);
if (!row_part) {
KERNEL_LOG_ERROR("row_partition_types error.");
return GRAPH_FAILED;
}
partition_types = row_part->GetListString();
const auto string_to_type =
new std::unordered_map<std::string, RowPartitionType>({{"FIRST_DIM_SIZE", RowPartitionType::FIRST_DIM_SIZE},
{"VALUE_ROWIDS", RowPartitionType::VALUE_ROWIDS},
{"ROW_LENGTHS", RowPartitionType::ROW_LENGTHS},
{"ROW_SPLITS", RowPartitionType::ROW_SPLITS},
{"ROW_LIMITS", RowPartitionType::ROW_LIMITS},
{"ROW_STARTS", RowPartitionType::ROW_STARTS}});
for (const std::string &type_str : partition_types) {
const auto iter = string_to_type->find(type_str);
if (iter == string_to_type->end()) {
delete string_to_type;
KERNEL_LOG_ERROR("Unknown string for partition info type.");
return GRAPH_FAILED;
}
row_partition_types_.push_back(iter->second);
}
delete string_to_type;
return GRAPH_SUCCESS;
}
int32_t RaggedTensorToTensorCpuKernel::GetRaggedRank(const std::vector<RowPartitionType> &partition_types) {
if (partition_types.empty()) {
return 0;
}
if (partition_types[0] == RowPartitionType::FIRST_DIM_SIZE) {
return partition_types.size() - 1;
}
return partition_types.size();
}
RowPartitionType RaggedTensorToTensorCpuKernel::GetRowPartitionTypeByDimension(int dimension) {
if (row_partition_types_[0] == RowPartitionType::FIRST_DIM_SIZE) {
return row_partition_types_[dimension + 1];
} else {
return row_partition_types_[dimension];
}
}
// Returns the relationship between dimension and dimension + 1.
template <typename INDEX_TYPE>
typename TTypes<INDEX_TYPE>::Flat RaggedTensorToTensorCpuKernel::GetRowPartitionTensor(CpuKernelContext &c,
int64_t dimension) {
if (row_partition_types_[0] == RowPartitionType::FIRST_DIM_SIZE) {
Tensor *row_partition = c.Input(dimension + 1 + kFirstPartitionInputIndex);
EigenTensor rowET(row_partition, reinterpret_cast<INDEX_TYPE *>(row_partition->GetData()));
typename TTypes<INDEX_TYPE>::Flat flat_tensor = rowET.flat<INDEX_TYPE>();
return flat_tensor;
} else {
Tensor *row_partition = c.Input(dimension + kFirstPartitionInputIndex);
EigenTensor rowET(row_partition, reinterpret_cast<INDEX_TYPE *>(row_partition->GetData()));
typename TTypes<INDEX_TYPE>::Flat flat_tensor = rowET.flat<INDEX_TYPE>();
return flat_tensor;
}
}
string RaggedTensorToTensorCpuKernel::RowPartitionTypeToString(RowPartitionType row_partition_type) {
switch (row_partition_type) {
case RowPartitionType::FIRST_DIM_SIZE:
return "FIRST_DIM_SIZE";
case RowPartitionType::VALUE_ROWIDS:
return "VALUE_ROWIDS";
case RowPartitionType::ROW_LENGTHS:
return "ROW_LENGTHS";
case RowPartitionType::ROW_SPLITS:
return "ROW_SPLITS";
case RowPartitionType::ROW_LIMITS:
return "ROW_LIMITS";
case RowPartitionType::ROW_STARTS:
return "ROW_STARTS";
default:
return "UNKNOWN ROW PARTITION TYPE";
}
}
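// `default_value` must not have more dimensions than `values`, and each default_value dim i
// must be 1 or equal to values dim i + 1 (unknown ranks and unknown dimension sizes are accepted).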
graphStatus RaggedTensorToTensorCpuKernel::ValidateDefaultValueShape(const TensorShapeProto &default_value_shape,
const TensorShapeProto &value_shape,
const char *op_name) {
if (default_value_shape.unknown_rank || value_shape.unknown_rank) {
return GRAPH_SUCCESS;
}
if (default_value_shape.dims.size() > value_shape.dims.size()) {
KERNEL_LOG_ERROR("default_value must have less dimensions than the values.");
return GRAPH_FAILED;
}
for (size_t i = 0; i < std::min(default_value_shape.dims.size(), value_shape.dims.size() - 1); ++i) {
if (default_value_shape.dims[i].size >= 0 && value_shape.dims[i + 1].size >= 0 &&
default_value_shape.dims[i].size != 1 && default_value_shape.dims[i].size != value_shape.dims[i + 1].size) {
return GRAPH_FAILED;
}
}
return GRAPH_SUCCESS;
}
graphStatus RaggedTensorToTensorCpuKernel::AsProto(Tensor *tshape, TensorShapeProto *proto, std::string name) const {
proto->dims.clear();
if (name == "shape") {
if (tshape->GetTensorShape()) {
      if ((tshape->GetDataType() == DT_INT32 &&
           static_cast<int32_t *>(tshape->GetData())[0] == static_cast<int32_t>(-1)) ||
          (tshape->GetDataType() == DT_INT64 &&
           static_cast<int64_t *>(tshape->GetData())[0] == static_cast<int64_t>(-1))) {
proto->unknown_rank = true;
return KERNEL_STATUS_OK;
}
}
if (tshape->GetDataType() == DT_INT32) {
int64_t dimsnum = tshape->GetTensorShape()->NumElements();
Dim tdim;
proto->dims.reserve(dimsnum);
auto dd = static_cast<int32_t *>(tshape->GetData());
for (int64_t i = 0; i < tshape->GetTensorShape()->NumElements(); i++) {
tdim.size = dd[i];
proto->dims.push_back(tdim);
proto->unknown_rank = false;
}
return KERNEL_STATUS_OK;
} else if (tshape->GetDataType() == DT_INT64) {
int64_t dimsnum = tshape->GetTensorShape()->NumElements();
Dim tdim;
proto->dims.reserve(dimsnum);
for (int64_t i = 0; i < tshape->GetTensorShape()->NumElements(); i++) {
tdim.size = static_cast<int64_t *>(tshape->GetData())[i];
proto->dims.push_back(tdim);
proto->unknown_rank = false;
}
return KERNEL_STATUS_OK;
}
KERNEL_LOG_ERROR("Expected an int32 or int64 shape tensor.");
return KERNEL_STATUS_PARAM_INVALID;
} else {
if (tshape->GetTensorShape()->GetUnknownRank()) {
proto->unknown_rank = true;
} else {
for (int i = 0; i < tshape->GetTensorShape()->GetDims(); i++) {
Dim dim;
dim.size = tshape->GetTensorShape()->GetDimSizes()[i];
proto->dims.push_back(dim);
}
}
return KERNEL_STATUS_OK;
}
}
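// Merges the requested output `shape` with `ragged_rank` and the shape of `values`:
// an unknown `shape` contributes ragged_rank + rank(values) dimensions of size -1,
// and the trailing value dimensions must agree with (or fill in) the corresponding output dimensions.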
graphStatus RaggedTensorToTensorCpuKernel::CombineRaggedTensorToTensorShapes(int32_t ragged_rank,
const TensorShapeProto &shape,
const TensorShapeProto &value_shape,
TensorShapeProto &output_shape,
const char *op_name) {
if (value_shape.unknown_rank && shape.unknown_rank) {
output_shape.dims.clear();
output_shape.unknown_rank = true;
return GRAPH_SUCCESS;
}
if (shape.unknown_rank) {
while (output_shape.dims.size() < ragged_rank + value_shape.dims.size()) {
Dim temp_dim;
temp_dim.size = -1;
output_shape.dims.emplace_back(temp_dim);
}
} else {
output_shape = shape;
}
if (value_shape.unknown_rank) {
return GRAPH_SUCCESS;
}
if (ragged_rank + value_shape.dims.size() != output_shape.dims.size()) {
    KERNEL_LOG_ERROR(
      "ragged_rank plus the number of value_shape dims should be equal to "
      "the number of output_shape dims.");
return GRAPH_FAILED;
}
for (size_t i = 1; i < value_shape.dims.size(); ++i) {
const Dim value_dim = value_shape.dims[i];
    Dim &output_shape_dim = output_shape.dims.at(output_shape.dims.size() - value_shape.dims.size() + i);
if (value_dim.size >= 0) {
if (output_shape_dim.size >= 0 && output_shape_dim.size != value_dim.size) {
KERNEL_LOG_ERROR("Value and shape dimension are inconsistent.");
return GRAPH_FAILED;
}
if (output_shape_dim.size < 0) {
output_shape_dim.size = value_dim.size;
}
}
}
return GRAPH_SUCCESS;
}
template <typename INDEX_TYPE>
uint32_t RaggedTensorToTensorCpuKernel::CalculateOutputSize(INDEX_TYPE first_dim, CpuKernelContext &c,
vector<INDEX_TYPE> *result) {
TensorShapeProto value_shape_proto;
Tensor *value_ptr = c.Input(kValueInputIndex);
AsProto(value_ptr, &value_shape_proto, "value");
TensorShapeProto default_value_shape_proto;
Tensor *default_value_ptr = c.Input(kDefaultValueInputIndex);
AsProto(default_value_ptr, &default_value_shape_proto, "default_value");
TensorShapeProto output_shape_proto;
Tensor *output_ptr = c.Output(0);
KERNEL_CHECK_NULLPTR(output_ptr, KERNEL_STATUS_PARAM_INVALID, "Output error.");
KERNEL_CHECK_FALSE(
(ValidateDefaultValueShape(default_value_shape_proto, value_shape_proto, "RaggedTensorToTensor") != GRAPH_FAILED),
KERNEL_STATUS_PARAM_INVALID, "ValidateDefaultValueShape error.");
TensorShapeProto shape_proto;
{
Tensor *shape_ptr = c.Input(kShapeInputIndex);
AsProto(shape_ptr, &shape_proto, "shape");
}
KERNEL_CHECK_FALSE((CombineRaggedTensorToTensorShapes(ragged_rank_, shape_proto, value_shape_proto,
output_shape_proto, "RaggedTensorToTensor") != GRAPH_FAILED),
KERNEL_STATUS_PARAM_INVALID, "CombineRaggedTensorToTensorShapes error.");
result->reserve(output_shape_proto.dims.size());
for (unsigned int dim = 0; dim < output_shape_proto.dims.size(); dim++) {
// Note that this may be -1 (if dimension size is unknown).
result->push_back(output_shape_proto.dims[dim].size);
}
if ((*result)[0] < 0) {
(*result)[0] = first_dim;
}
for (int i = 1; i <= ragged_rank_; ++i) {
KERNEL_CHECK_FALSE(((*result)[i] >= 0), KERNEL_STATUS_PARAM_INVALID, "Result error.");
}
return KERNEL_STATUS_OK;
}
/**
* The output_index represents the index in the output tensor
* where the first element of a particular dimension would be written.
* If it is -1, it indicates that the index is out of scope.
* Example, given first_dimension = 10, first_dimension_output = 6,
* and output_index_multiplier = 100:
* result = [0 100 200 300 400 500 -1 -1 -1 -1]
* If first_dimension_output = 11 instead, then:
* result = [0 100 200 300 400 500 600 700 800 900]
*/
template <typename INDEX_TYPE>
vector<INDEX_TYPE> RaggedTensorToTensorCpuKernel::CalculateFirstParentOutputIndex(INDEX_TYPE first_dimension,
INDEX_TYPE output_index_multiplier,
INDEX_TYPE first_dimension_output) {
const INDEX_TYPE min_dimension = std::min(first_dimension, first_dimension_output);
vector<INDEX_TYPE> result;
result.reserve(first_dimension);
int current_output_index = 0;
for (INDEX_TYPE i = 0; i < min_dimension; ++i, current_output_index += output_index_multiplier) {
result.push_back(current_output_index);
}
for (INDEX_TYPE i = min_dimension; i < first_dimension; ++i) {
result.push_back(-1);
}
  unsigned int first_dim = static_cast<unsigned int>(first_dimension);
  if (result.size() < first_dim) KERNEL_LOG_ERROR("Result size should be greater than or equal to first dim.");
return result;
}
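// Row-splits variant of the output index calculation. For example, with row_split [0 2 3],
// parent_output_index [100 200], output_index_multiplier 10 and output_size 2,
// the result is [100 110 200]; rows longer than output_size and rows whose parent index
// is -1 are padded with -1 entries.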
template <typename INDEX_TYPE>
uint32_t RaggedTensorToTensorCpuKernel::CalculateOutputIndexRowSplit(const typename TTypes<INDEX_TYPE>::Flat &row_split,
const vector<INDEX_TYPE> &parent_output_index,
INDEX_TYPE output_index_multiplier,
INDEX_TYPE output_size,
vector<INDEX_TYPE> *result) {
INDEX_TYPE row_split_size = row_split.size();
if (row_split_size > 0) {
result->reserve(row_split(row_split_size - 1));
}
for (INDEX_TYPE i = 0; i < row_split_size - 1; ++i) {
INDEX_TYPE row_length = row_split(i + 1) - row_split(i);
INDEX_TYPE real_length = std::min(output_size, row_length);
INDEX_TYPE parent_output_index_current = parent_output_index[i];
if (parent_output_index_current == -1) {
real_length = 0;
}
for (INDEX_TYPE j = 0; j < real_length; ++j) {
result->push_back(parent_output_index_current);
parent_output_index_current += output_index_multiplier;
}
for (INDEX_TYPE j = 0; j < row_length - real_length; ++j) {
result->push_back(-1);
}
}
if (row_split_size > 0) {
unsigned int row_split_size1 = row_split(row_split_size - 1);
KERNEL_CHECK_FALSE((result->size() >= row_split_size1), KERNEL_STATUS_PARAM_INVALID,
"Result size should be greater equal row split size.");
}
return KERNEL_STATUS_OK;
}
// Calculate the output index of the first element of a list.
// The parent_output_index is the same computation for the previous list.
// -1 indicates an element or list that is out of range.
// The output_index_multiplier is the number of output indices one moves
// forward for each column.
// E.g., given:
// value_rowids:[0 1 2 2 2 3 5 5 6]
// parent_output_index:[1000 1100 2000 2100 -1 3000 4000]
// output_index_multiplier: 10
// output_size: 2
// You get:
// result = [1000 1100 2000 2010 -1 2100 -1 -1 3000]
// result[0] = parent_output_index[value_rowids[0]]
// result[1] = parent_output_index[value_rowids[1]]
// result[2] = parent_output_index[value_rowids[2]]
// result[3] = parent_output_index[value_rowids[2] + 10]
// result[4] = -1 because it is the third element the size is 2.
// result[5] = parent_output_index[value_rowids[3]]
// result[6] = -1 because parent_output_index[value_rowids[6]] == -1
// result[7] = -1 because parent_output_index[value_rowids[6]] == -1
// result[8] = parent_output_index[value_rowids[7]]
template <typename INDEX_TYPE>
uint32_t RaggedTensorToTensorCpuKernel::CalculateOutputIndexValueRowID(
const typename TTypes<INDEX_TYPE>::Flat &value_rowids, const vector<INDEX_TYPE> &parent_output_index,
INDEX_TYPE output_index_multiplier, INDEX_TYPE output_size, vector<INDEX_TYPE> *result) {
const INDEX_TYPE index_size = value_rowids.size();
result->reserve(index_size);
KERNEL_CHECK_FALSE((index_size != 0), KERNEL_STATUS_PARAM_INVALID, "Index size should not be zero.");
INDEX_TYPE current_output_column = 0;
unsigned int current_value_rowid = value_rowids(0);
KERNEL_CHECK_FALSE((current_value_rowid < parent_output_index.size()), KERNEL_STATUS_PARAM_INVALID,
"Current value rowid should be less than parent output index size.");
INDEX_TYPE current_output_index = parent_output_index[current_value_rowid];
result->push_back(current_output_index);
for (INDEX_TYPE i = 1; i < index_size; ++i) {
unsigned int next_value_rowid = value_rowids(i);
if (next_value_rowid == current_value_rowid && current_output_index >= 0) {
++current_output_column;
if (current_output_column < output_size) {
current_output_index += output_index_multiplier;
} else {
current_output_index = -1;
}
}
if (next_value_rowid != current_value_rowid) {
current_output_column = 0;
current_value_rowid = next_value_rowid;
if (next_value_rowid >= parent_output_index.size()) {
KERNEL_LOG_ERROR("Next value rowid should be less than parent output index size.");
return KERNEL_STATUS_PARAM_INVALID;
}
current_output_index = parent_output_index[next_value_rowid];
}
result->push_back(current_output_index);
}
size_t result_size = result->size();
size_t value_rowid_size = value_rowids.size();
KERNEL_CHECK_FALSE((result_size == value_rowid_size), KERNEL_STATUS_PARAM_INVALID, "Invalid row ids.");
return KERNEL_STATUS_OK;
}
template <typename INDEX_TYPE>
uint32_t RaggedTensorToTensorCpuKernel::CalculateOutputIndex(CpuKernelContext &ctx, int64_t dimension,
const vector<INDEX_TYPE> &parent_output_index,
INDEX_TYPE output_index_multiplier, INDEX_TYPE output_size,
vector<INDEX_TYPE> *result) {
const typename TTypes<INDEX_TYPE>::Flat row_partition_tensor = GetRowPartitionTensor<INDEX_TYPE>(ctx, dimension);
auto partition_type = GetRowPartitionTypeByDimension(dimension);
switch (partition_type) {
case RowPartitionType::VALUE_ROWIDS:
return CalculateOutputIndexValueRowID(row_partition_tensor, parent_output_index, output_index_multiplier,
output_size, result);
case RowPartitionType::ROW_SPLITS:
return CalculateOutputIndexRowSplit(row_partition_tensor, parent_output_index, output_index_multiplier,
output_size, result);
default:
KERNEL_LOG_ERROR("Unsupported partition type:[%s]", RowPartitionTypeToString(partition_type));
return KERNEL_STATUS_PARAM_INVALID;
}
}
template <typename INDEX_TYPE>
uint32_t RaggedTensorToTensorCpuKernel::GetFirstDimensionSize(CpuKernelContext &ctx, INDEX_TYPE *result) {
const Tensor *first_partition_tensor = ctx.Input(kFirstPartitionInputIndex);
const RowPartitionType first_partition_type = row_partition_types_[0];
switch (first_partition_type) {
case RowPartitionType::FIRST_DIM_SIZE:
*result = static_cast<INDEX_TYPE *>(first_partition_tensor->GetData())[0];
return KERNEL_STATUS_OK;
case RowPartitionType::VALUE_ROWIDS:
KERNEL_LOG_ERROR("Cannot handle VALUE_ROWIDS in first dimension.");
return KERNEL_STATUS_PARAM_INVALID;
case RowPartitionType::ROW_SPLITS:
*result = first_partition_tensor->GetTensorShape()->GetDimSizes()[0] - 1;
return KERNEL_STATUS_OK;
default:
KERNEL_LOG_ERROR("Cannot handle type [%s]", RowPartitionTypeToString(first_partition_type));
return KERNEL_STATUS_INNER_ERROR;
}
}
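// Overall flow: derive the dense output shape, compute a row-major multiplier (stride) for each
// dimension, map every ragged value to a flat output index (or -1 when it falls outside the
// requested shape), then scatter the values into a default-value-filled output tensor.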
template <typename INDEX_TYPE, typename VALUE_TYPE>
uint32_t RaggedTensorToTensorCpuKernel::DoCompute(CpuKernelContext &ctx) {
KERNEL_CHECK_FALSE((GetRowPartitionTypes(ctx) != GRAPH_FAILED), KERNEL_STATUS_PARAM_INVALID,
"GetRowPartitionTypes error");
ragged_rank_ = GetRaggedRank(row_partition_types_);
INDEX_TYPE first_dimension;
KERNEL_CHECK_FALSE((GetFirstDimensionSize(ctx, &first_dimension) == 0), KERNEL_STATUS_PARAM_INVALID,
"GetFirstDimensionSize error.");
vector<INDEX_TYPE> output_size;
KERNEL_CHECK_FALSE((CalculateOutputSize(first_dimension, ctx, &output_size) == 0), KERNEL_STATUS_PARAM_INVALID,
"CalculateOutputSize error.");
vector<INDEX_TYPE> multiplier;
multiplier.resize(output_size.size());
multiplier[multiplier.size() - 1] = 1;
for (int i = output_size.size() - 2; i >= 0; --i) {
multiplier[i] = multiplier[i + 1] * output_size[i + 1];
}
Tensor *output_tensor = nullptr;
output_tensor = ctx.Output(0);
auto output_shape = output_tensor->GetTensorShape();
auto output_shape_dims = output_shape->GetDimSizes();
  for (unsigned int i = 0; i < output_size.size(); i++) {
    output_shape_dims[i] = output_size[i];
  }
  output_shape->SetDimSizes(output_shape_dims);
const INDEX_TYPE full_size = multiplier[0] * output_size[0];
if (full_size > 0) {
vector<INDEX_TYPE> output_index = CalculateFirstParentOutputIndex(first_dimension, multiplier[0], output_size[0]);
for (int i = 1; i <= ragged_rank_; ++i) {
vector<INDEX_TYPE> new_output_index;
KERNEL_CHECK_FALSE(
(CalculateOutputIndex(ctx, i - 1, output_index, multiplier[i], output_size[i], &new_output_index) == 0),
KERNEL_STATUS_PARAM_INVALID, "CalculateOutputIndex error.");
output_index = new_output_index;
}
return SetOutput<INDEX_TYPE, VALUE_TYPE>(ctx, output_index, output_tensor);
}
return KERNEL_STATUS_OK;
}
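// Fills the output tensor with the (broadcast) default_value, then copies each value element whose
// computed output index is non-negative to its flat position in the output buffer.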
template <typename INDEX_TYPE, typename VALUE_TYPE>
uint32_t RaggedTensorToTensorCpuKernel::SetOutput(CpuKernelContext &ctx, const vector<INDEX_TYPE> &output_index,
Tensor *output_tensor) {
EigenTensor outputET(output_tensor, reinterpret_cast<INDEX_TYPE *>(output_tensor->GetData()));
typename aicpu::TTypes<VALUE_TYPE>::Flat output_flat = outputET.flat<VALUE_TYPE>();
const auto value_tensor = ctx.Input(kValueInputIndex);
const auto default_value_tensor = ctx.Input(kDefaultValueInputIndex);
if (value_tensor->GetTensorShape()->GetDims() == 1) {
// Initialize tensor to default_value.
VALUE_TYPE *base_output = output_flat.data();
VALUE_TYPE *default_value_pt = static_cast<VALUE_TYPE *>(default_value_tensor->GetData());
VALUE_TYPE default_value = default_value_pt[0];
std::fill(base_output, base_output + output_flat.size(), default_value);
EigenTensor valuesET(value_tensor, reinterpret_cast<INDEX_TYPE *>(value_tensor->GetData()));
auto values = valuesET.flat<VALUE_TYPE>();
unsigned int values_size = values.size();
KERNEL_CHECK_FALSE((values_size == output_index.size()), KERNEL_STATUS_PARAM_INVALID,
"Values and indices must be equal.");
for (unsigned int i = 0; i < values_size; ++i) {
if (output_index[i] >= 0) {
output_flat(output_index[i]) = values(i);
}
}
} else {
auto output_shape = output_tensor->GetTensorShape()->GetDimSizes();
auto default_value_shape = default_value_tensor->GetTensorShape()->GetDimSizes();
int64_t output_element_size = 1;
for (const int64_t &d : output_shape) {
output_element_size *= d;
}
// Initialize tensor to default_value.
std::vector<int64_t> broadcast_shape;
auto ret = GetBroadcastShape(default_value_shape, output_shape, broadcast_shape);
KERNEL_CHECK_FALSE(ret == KERNEL_STATUS_OK, KERNEL_STATUS_PARAM_INVALID, "Broadcast failed.");
KERNEL_CHECK_FALSE(broadcast_shape == output_shape, KERNEL_STATUS_PARAM_INVALID,
"Unable to broadcast shape of default_value to result.");
BroadcastIterator iter(default_value_shape, output_shape, broadcast_shape);
auto default_value_addr = reinterpret_cast<VALUE_TYPE *>(default_value_tensor->GetData());
auto output_addr = reinterpret_cast<VALUE_TYPE *>(output_tensor->GetData());
iter.SetPos(0);
for (int i = 0; i < output_element_size; ++i) {
output_addr[i] = default_value_addr[iter.GetInputPosA()];
iter.GenNextPos();
}
VALUE_TYPE *base_output = output_flat.data();
EigenTensor valuesET(value_tensor, reinterpret_cast<INDEX_TYPE *>(value_tensor->GetData()));
auto values = valuesET.flat<VALUE_TYPE>();
size_t values_size = values.size();
size_t output_index_size = output_index.size();
// A value "element" is a group of values that are arranged together.
// For example, if the value shape is [3,4,5], then 20 values are in a
// value element.
unsigned int value_element_size;
    if (output_index_size != 0) {
      value_element_size = values_size / output_index_size;
    } else {
      KERNEL_LOG_ERROR("Output index size must not be zero.");
      return KERNEL_STATUS_PARAM_INVALID;
    }
unsigned int value_element_bytesize = value_element_size * sizeof(VALUE_TYPE);
const VALUE_TYPE *values_base = values.data();
unsigned int values_dimsize = value_tensor->GetTensorShape()->GetDimSizes()[0];
KERNEL_CHECK_FALSE((values_dimsize == output_index_size), KERNEL_STATUS_PARAM_INVALID,
"Values and indices must be equal.");
KERNEL_CHECK_FALSE((values_size == output_index_size * value_element_size), KERNEL_STATUS_PARAM_INVALID,
"Values and indices must be equal.");
INDEX_TYPE value_index = 0;
for (unsigned int i = 0; i < output_index_size; ++i, value_index += value_element_size) {
if (output_index[i] >= 0) {
VALUE_TYPE *dst = base_output + output_index[i];
const VALUE_TYPE *src = values_base + value_index;
copy_array<VALUE_TYPE, INDEX_TYPE>(dst, src, value_element_size, value_element_bytesize);
}
}
}
return KERNEL_STATUS_OK;
}
REGISTER_CPU_KERNEL(kRaggedTensorToTensor, RaggedTensorToTensorCpuKernel);
} // namespace aicpu

View File

@ -0,0 +1,150 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef AICPU_KERNELS_NORMALIZED_RAGGEDTENSORTOTENSOR_H_
#define AICPU_KERNELS_NORMALIZED_RAGGEDTENSORTOTENSOR_H_
#include <memory>
#include <vector>
#include <iostream>
#include <string>
#include "cpu_ops_kernel.h"
#include "cpu_kernel_utils.h"
#include "kernel_log.h"
#include "securec.h"
#include "status.h"
#include "utils/eigen_tensor.h"
#include "utils/broadcast_iterator.h"
#include "utils/kernel_util.h"
#include "Eigen/Core"
#include "unsupported/Eigen/CXX11/Tensor"
#include <unordered_map>
using std::string;
using std::vector;
namespace aicpu {
struct DimStruct {
int64_t size = 1;
};
using Dim = DimStruct;
struct TensorShapeProtoStruct {
std::vector<Dim> dims;
bool unknown_rank = false;
};
using TensorShapeProto = TensorShapeProtoStruct;
enum class RowPartitionType { FIRST_DIM_SIZE, VALUE_ROWIDS, ROW_LENGTHS, ROW_SPLITS, ROW_LIMITS, ROW_STARTS };
const int kShapeInputIndex = 0;
const int kValueInputIndex = 1;
const int kDefaultValueInputIndex = 2;
const int kFirstPartitionInputIndex = 3;
using graphStatus = uint32_t;
const graphStatus GRAPH_FAILED = 0xFFFFFFFF;
const graphStatus GRAPH_SUCCESS = 0;
template <typename VALUE_TYPE, typename INDEX_TYPE>
void slow_copy_array(VALUE_TYPE *dst, const VALUE_TYPE *src, INDEX_TYPE size) {
for (INDEX_TYPE index = 0; index < size; ++index) {
dst[index] = src[index];
}
}
template <typename VALUE_TYPE, typename INDEX_TYPE>
void copy_array(VALUE_TYPE *dst, const VALUE_TYPE *src, INDEX_TYPE size, size_t bytes) {
memcpy(dst, src, bytes);
}
template <>
void copy_array<string, int64_t>(std::string *dst, const string *src, int64_t size, size_t bytes) {
slow_copy_array(dst, src, size);
}
template <>
void copy_array<string, int32_t>(string *dst, const string *src, int32_t size, size_t bytes) {
slow_copy_array(dst, src, size);
}
template <>
void copy_array<Eigen::half, int64_t>(Eigen::half *dst, const Eigen::half *src, int64_t size, size_t bytes) {
slow_copy_array(dst, src, size);
}
template <>
void copy_array<Eigen::half, int32_t>(Eigen::half *dst, const Eigen::half *src, int32_t size, size_t bytes) {
slow_copy_array(dst, src, size);
}
class RaggedTensorToTensorCpuKernel : public CpuKernel {
public:
graphStatus GetRowPartitionTypes(CpuKernelContext &ctx);
int32_t GetRaggedRank(const std::vector<RowPartitionType> &partition_types);
RowPartitionType GetRowPartitionTypeByDimension(int dimension);
template <typename INDEX_TYPE>
typename TTypes<INDEX_TYPE>::Flat GetRowPartitionTensor(CpuKernelContext &c, int64_t dimension);
string RowPartitionTypeToString(RowPartitionType row_partition_type);
graphStatus ValidateDefaultValueShape(const TensorShapeProto &default_value_shape,
const TensorShapeProto &value_shape, const char *op_name);
graphStatus AsProto(Tensor *tshape, TensorShapeProto *proto, std::string name) const;
graphStatus CombineRaggedTensorToTensorShapes(int32_t ragged_rank, const TensorShapeProto &shape,
const TensorShapeProto &value_shape, TensorShapeProto &output_shape,
const char *op_name);
template <typename INDEX_TYPE>
uint32_t CalculateOutputSize(INDEX_TYPE first_dim, CpuKernelContext &c, vector<INDEX_TYPE> *result);
template <typename INDEX_TYPE>
vector<INDEX_TYPE> CalculateFirstParentOutputIndex(INDEX_TYPE first_dimension, INDEX_TYPE output_index_multiplier,
INDEX_TYPE first_dimension_output);
template <typename INDEX_TYPE>
uint32_t CalculateOutputIndexRowSplit(const typename TTypes<INDEX_TYPE>::Flat &row_split,
const vector<INDEX_TYPE> &parent_output_index,
INDEX_TYPE output_index_multiplier, INDEX_TYPE output_size,
vector<INDEX_TYPE> *result);
template <typename INDEX_TYPE>
uint32_t CalculateOutputIndexValueRowID(const typename TTypes<INDEX_TYPE>::Flat &value_rowids,
const vector<INDEX_TYPE> &parent_output_index,
INDEX_TYPE output_index_multiplier, INDEX_TYPE output_size,
vector<INDEX_TYPE> *result);
template <typename INDEX_TYPE>
uint32_t CalculateOutputIndex(CpuKernelContext &context, int64_t dimension,
const vector<INDEX_TYPE> &parent_output_index, INDEX_TYPE output_index_multiplier,
INDEX_TYPE output_size, vector<INDEX_TYPE> *result);
template <typename INDEX_TYPE>
uint32_t GetFirstDimensionSize(CpuKernelContext &context, INDEX_TYPE *result);
template <typename INDEX_TYPE, typename VALUE_TYPE>
uint32_t DoCompute(CpuKernelContext &context);
template <typename INDEX_TYPE, typename VALUE_TYPE>
uint32_t SetOutput(CpuKernelContext &context, const vector<INDEX_TYPE> &output_index, Tensor *output_tensor);
protected:
uint32_t Compute(CpuKernelContext &ctx) override;
private:
std::vector<RowPartitionType> row_partition_types_;
int ragged_rank_;
};
}; // namespace aicpu
#endif

View File

@ -0,0 +1,160 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "reciprocal.h"
#include <float.h>
#include <complex>
#include "cpu_kernel_utils.h"
#include "cpu_types.h"
#include "utils/eigen_tensor.h"
#include "utils/kernel_util.h"
namespace {
const char *kReciprocal = "Reciprocal";
const size_t kReciprocalInputNum = 1;
const size_t kReciprocalOutputNum = 1;
constexpr int64_t kParallelDataNums = 32 * 1024;
} // namespace
namespace aicpu {
uint32_t ReciprocalCpuKernel::Compute(CpuKernelContext &ctx) {
Tensor *x = ctx.Input(0);
Tensor *y = ctx.Output(0);
  KERNEL_HANDLE_ERROR(NormalCheck(ctx, kReciprocalInputNum, kReciprocalOutputNum), "Check Reciprocal params failed.");
if (x->GetDataType() != y->GetDataType()) {
KERNEL_LOG_ERROR("The data type of the input [%s] need be the same as the output [%s]",
DTypeStr(x->GetDataType()).c_str(), DTypeStr(y->GetDataType()).c_str());
return KERNEL_STATUS_PARAM_INVALID;
}
if (x->GetDataSize() != y->GetDataSize()) {
KERNEL_LOG_ERROR(
"The data size of the input [%llu] need be the same as the output "
"[%llu]",
x->GetDataSize(), y->GetDataSize());
return KERNEL_STATUS_PARAM_INVALID;
}
uint64_t data_num = x->NumElements();
DataType data_type = x->GetDataType();
uint32_t res = KERNEL_STATUS_OK;
switch (data_type) {
case DT_FLOAT:
res = ReciprocalCompute<float>(x, y, data_num, ctx);
break;
case DT_DOUBLE:
res = ReciprocalCompute<double>(x, y, data_num, ctx);
break;
case DT_FLOAT16:
res = ReciprocalCompute<Eigen::half>(x, y, data_num, ctx);
break;
case DT_COMPLEX64:
res = ReciprocalComputeComplex<std::complex<float>>(x, y, data_num, ctx);
break;
case DT_COMPLEX128:
res = ReciprocalComputeComplex<std::complex<double>>(x, y, data_num, ctx);
break;
default:
KERNEL_LOG_ERROR("Reciprocal kernel data type [%s] not support", DTypeStr(data_type).c_str());
return KERNEL_STATUS_PARAM_INVALID;
}
if (res != KERNEL_STATUS_OK) {
return KERNEL_STATUS_INNER_ERROR;
}
return KERNEL_STATUS_OK;
}
template <typename T>
uint32_t ReciprocalCpuKernel::ReciprocalCompute(Tensor *x, Tensor *y, uint64_t data_num, CpuKernelContext &ctx) {
auto input_x = reinterpret_cast<T *>(x->GetData());
auto output_y = reinterpret_cast<T *>(y->GetData());
if (data_num <= kParallelDataNums) {
for (size_t i = 0; i < data_num; i++) {
if (input_x[i] == static_cast<T>(0)) {
KERNEL_LOG_ERROR("Reciprocal kernel input[%d] cannot be 0", i);
return KERNEL_STATUS_INNER_ERROR;
}
output_y[i] = static_cast<T>(1) / (input_x[i]);
}
} else {
uint32_t min_core_num = 1;
uint64_t max_core_num = std::max(min_core_num, aicpu::CpuKernelUtils::GetCPUNum(ctx) - 2);
if (max_core_num > data_num) {
max_core_num = data_num;
}
auto shared_reciprocal = [&](size_t start, size_t end) {
for (size_t i = start; i < end; i++) {
if (input_x[i] == static_cast<T>(0)) {
KERNEL_LOG_ERROR("Reciprocal kernel input[%d] cannot be 0", i);
return KERNEL_STATUS_INNER_ERROR;
}
output_y[i] = static_cast<T>(1) / (input_x[i]);
}
return KERNEL_STATUS_OK;
};
    if (max_core_num == 0) {
      KERNEL_LOG_ERROR("max_core_num could not be 0.");
      return KERNEL_STATUS_INNER_ERROR;
    }
    // Run the sharded computation once and propagate any failure.
    uint32_t ret = CpuKernelUtils::ParallelFor(ctx, data_num, data_num / max_core_num, shared_reciprocal);
    if (ret != KERNEL_STATUS_OK) {
      KERNEL_LOG_ERROR("CpuKernelUtils::ParallelFor failed");
      return KERNEL_STATUS_INNER_ERROR;
    }
}
return KERNEL_STATUS_OK;
}
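// For complex inputs the reciprocal is computed as 1/z = conj(z) / |z|^2 = conj(z) / (re(z)^2 + im(z)^2).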
template <typename T>
uint32_t ReciprocalCpuKernel::ReciprocalComputeComplex(Tensor *x, Tensor *y, uint64_t data_num, CpuKernelContext &ctx) {
auto input_x = reinterpret_cast<T *>(x->GetData());
auto output_y = reinterpret_cast<T *>(y->GetData());
if (data_num <= kParallelDataNums) {
for (size_t i = 0; i < data_num; i++) {
output_y[i] = conj(input_x[i]) / (input_x[i].real() * input_x[i].real() + input_x[i].imag() * input_x[i].imag());
}
} else {
uint32_t min_core_num = 1;
uint64_t max_core_num = std::max(min_core_num, aicpu::CpuKernelUtils::GetCPUNum(ctx) - 2);
if (max_core_num > data_num) {
max_core_num = data_num;
}
auto shared_reciprocal = [&](size_t start, size_t end) {
for (size_t i = start; i < end; i++) {
output_y[i] =
conj(input_x[i]) / (input_x[i].real() * input_x[i].real() + input_x[i].imag() * input_x[i].imag());
}
};
    if (max_core_num == 0) {
      KERNEL_LOG_ERROR("max_core_num could not be 0.");
      return KERNEL_STATUS_INNER_ERROR;
    }
    // Run the sharded computation once and propagate any failure.
    uint32_t ret = CpuKernelUtils::ParallelFor(ctx, data_num, data_num / max_core_num, shared_reciprocal);
    if (ret != KERNEL_STATUS_OK) {
      KERNEL_LOG_ERROR("CpuKernelUtils::ParallelFor failed");
      return KERNEL_STATUS_INNER_ERROR;
    }
}
return KERNEL_STATUS_OK;
}
REGISTER_CPU_KERNEL(kReciprocal, ReciprocalCpuKernel);
} // namespace aicpu

View File

@ -0,0 +1,35 @@
/**
* Copyright (c) Huawei Technologies Co., Ltd. 2020-2021. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef AICPU_KERNELS_NORMALIZED_RECIPROCAL_H_
#define AICPU_KERNELS_NORMALIZED_RECIPROCAL_H_
#include "cpu_ops_kernel.h"
namespace aicpu {
class ReciprocalCpuKernel : public CpuKernel {
public:
~ReciprocalCpuKernel() = default;
uint32_t Compute(CpuKernelContext &ctx) override;
private:
template <typename T>
uint32_t ReciprocalCompute(Tensor *x, Tensor *y, uint64_t data_num, CpuKernelContext &ctx);
template <typename T>
uint32_t ReciprocalComputeComplex(Tensor *x, Tensor *y, uint64_t data_num, CpuKernelContext &ctx);
};
} // namespace aicpu
#endif

View File

@ -0,0 +1,155 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "reciprocal_grad.h"
#include <float.h>
#include <complex>
#include <math.h>
#include "cpu_kernel_utils.h"
#include "cpu_types.h"
#include "utils/eigen_tensor.h"
#include "utils/kernel_util.h"
namespace {
const char *kReciprocalGrad = "ReciprocalGrad";
const size_t kReciprocalGradInputNum = 2;
const size_t kReciprocalGradOutputNum = 1;
constexpr int64_t kParallelDataNums = 64 * 1024;
constexpr int64_t kParallelComplexDataNums = 16 * 1024;
} // namespace
namespace aicpu {
uint32_t ReciprocalGradCpuKernel::Compute(CpuKernelContext &ctx) {
KERNEL_HANDLE_ERROR(NormalCheck(ctx, kReciprocalGradInputNum, kReciprocalGradOutputNum),
"Check ReciprocalGrad params failed.");
Tensor *y = ctx.Input(0);
Tensor *dy = ctx.Input(1);
Tensor *z = ctx.Output(0);
if (y->GetDataType() != dy->GetDataType()) {
KERNEL_LOG_ERROR("The data type of the input2 [%s] need be the same as the input1 [%s]",
DTypeStr(dy->GetDataType()).c_str(), DTypeStr(y->GetDataType()).c_str());
return KERNEL_STATUS_PARAM_INVALID;
}
if (y->GetDataSize() != dy->GetDataSize()) {
KERNEL_LOG_ERROR(
"The data size of the input2 [%llu] need be the same as the input1 "
"[%llu]",
dy->GetDataSize(), y->GetDataSize());
return KERNEL_STATUS_PARAM_INVALID;
}
uint64_t data_num = y->NumElements();
DataType data_type = y->GetDataType();
uint32_t res = KERNEL_STATUS_OK;
switch (data_type) {
case DT_FLOAT16:
res = ReciprocalGradCompute<Eigen::half>(y, dy, z, data_num, ctx);
break;
case DT_FLOAT:
res = ReciprocalGradCompute<float>(y, dy, z, data_num, ctx);
break;
case DT_DOUBLE:
res = ReciprocalGradCompute<double>(y, dy, z, data_num, ctx);
break;
case DT_COMPLEX64:
res = ReciprocalGradComputeComplex<std::complex<float>>(y, dy, z, data_num, ctx);
break;
case DT_COMPLEX128:
res = ReciprocalGradComputeComplex<std::complex<double>>(y, dy, z, data_num, ctx);
break;
default:
KERNEL_LOG_ERROR("ReciprocalGrad invalid input type [%s]", DTypeStr(data_type).c_str());
return KERNEL_STATUS_PARAM_INVALID;
}
if (res != KERNEL_STATUS_OK) {
return KERNEL_STATUS_INNER_ERROR;
}
return KERNEL_STATUS_OK;
}
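// For y = 1/x the derivative dy/dx is -1/x^2 = -y^2, so the backward pass computes
// dz = -dy * y * y (using the conjugate of y^2 for complex types).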
template <typename T>
uint32_t ReciprocalGradCpuKernel::ReciprocalGradCompute(Tensor *y, Tensor *dy, Tensor *z, uint64_t data_num,
CpuKernelContext &ctx) {
auto input_y = reinterpret_cast<T *>(y->GetData());
auto input_dy = reinterpret_cast<T *>(dy->GetData());
auto output_z = reinterpret_cast<T *>(z->GetData());
if (data_num <= kParallelDataNums) {
for (size_t i = 0; i < data_num; i++) {
output_z[i] = static_cast<T>(-1) * input_dy[i] * input_y[i] * input_y[i];
}
} else {
uint32_t min_core_num = 1;
uint64_t max_core_num = std::max(min_core_num, aicpu::CpuKernelUtils::GetCPUNum(ctx) - 2);
if (max_core_num > data_num) {
max_core_num = data_num;
}
auto shard_ReciprocalGrad = [&](size_t start, size_t end) {
for (size_t i = start; i < end; i++) {
output_z[i] = static_cast<T>(-1) * input_dy[i] * input_y[i] * input_y[i];
}
};
    if (max_core_num == 0) {
      KERNEL_LOG_ERROR("max_core_num could not be 0.");
      return KERNEL_STATUS_INNER_ERROR;
    }
    // Run the sharded computation once and propagate any failure.
    uint32_t ret = CpuKernelUtils::ParallelFor(ctx, data_num, data_num / max_core_num, shard_ReciprocalGrad);
    if (ret != KERNEL_STATUS_OK) {
      KERNEL_LOG_ERROR("CpuKernelUtils::ParallelFor failed");
      return KERNEL_STATUS_INNER_ERROR;
    }
}
return KERNEL_STATUS_OK;
}
template <typename T>
uint32_t ReciprocalGradCpuKernel::ReciprocalGradComputeComplex(Tensor *y, Tensor *dy, Tensor *z, uint64_t data_num,
CpuKernelContext &ctx) {
auto input_y = reinterpret_cast<T *>(y->GetData());
auto input_dy = reinterpret_cast<T *>(dy->GetData());
auto output_z = reinterpret_cast<T *>(z->GetData());
if (data_num <= kParallelComplexDataNums) {
for (size_t i = 0; i < data_num; i++) {
output_z[i] = static_cast<T>(-1) * input_dy[i] * conj(input_y[i] * input_y[i]);
}
} else {
uint32_t min_core_num = 1;
uint64_t max_core_num = std::max(min_core_num, aicpu::CpuKernelUtils::GetCPUNum(ctx) - 2);
if (max_core_num > data_num) {
max_core_num = data_num;
}
auto shard_ReciprocalGrad = [&](size_t start, size_t end) {
for (size_t i = start; i < end; i++) {
output_z[i] = static_cast<T>(-1) * input_dy[i] * conj(input_y[i] * input_y[i]);
}
};
    if (max_core_num == 0) {
      KERNEL_LOG_ERROR("max_core_num could not be 0.");
      return KERNEL_STATUS_INNER_ERROR;
    }
    // Run the sharded computation once and propagate any failure.
    uint32_t ret = CpuKernelUtils::ParallelFor(ctx, data_num, data_num / max_core_num, shard_ReciprocalGrad);
    if (ret != KERNEL_STATUS_OK) {
      KERNEL_LOG_ERROR("CpuKernelUtils::ParallelFor failed");
      return KERNEL_STATUS_INNER_ERROR;
    }
}
return KERNEL_STATUS_OK;
}
REGISTER_CPU_KERNEL(kReciprocalGrad, ReciprocalGradCpuKernel);
} // namespace aicpu

View File

@ -0,0 +1,35 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef AICPU_KERNELS_NORMALIZED_RECIPROCALGRAD_H_
#define AICPU_KERNELS_NORMALIZED_RECIPROCALGRAD_H_
#include "cpu_ops_kernel.h"
namespace aicpu {
class ReciprocalGradCpuKernel : public CpuKernel {
public:
~ReciprocalGradCpuKernel() = default;
uint32_t Compute(CpuKernelContext &ctx) override;
private:
template <typename T>
uint32_t ReciprocalGradCompute(Tensor *y, Tensor *dy, Tensor *z, uint64_t data_num, CpuKernelContext &ctx);
template <typename T>
uint32_t ReciprocalGradComputeComplex(Tensor *y, Tensor *dy, Tensor *z, uint64_t data_num, CpuKernelContext &ctx);
};
} // namespace aicpu
#endif

View File

@ -0,0 +1,487 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "reduce_mean.h"
#include "cpu_kernel_utils.h"
#include "utils/eigen_tensor.h"
#include "utils/kernel_util.h"
#include "algorithm"
#include "iostream"
namespace {
const char *kReduceMean = "ReduceMean";
#define REDUCEMEAN_COMPUTE_CASE(DTYPE, TYPE1, TYPE2, CTX) \
case (DTYPE): { \
uint32_t result = ReduceMeanCompute<TYPE1, TYPE2>(CTX); \
if (result != KERNEL_STATUS_OK) { \
KERNEL_LOG_ERROR("ReduceMean kernel compute failed."); \
return result; \
} \
break; \
}
#define REDUCEMEAN_COMPUTE_CASE_CP(DTYPE, TYPE1, TYPE2, CTX) \
case (DTYPE): { \
uint32_t result = ReduceMeanCompute_Complex<TYPE1, TYPE2>(CTX); \
if (result != KERNEL_STATUS_OK) { \
KERNEL_LOG_ERROR("ReduceMean kernel compute failed."); \
return result; \
} \
break; \
}
#define REDUCEMEAN_COMPUTE_CASE_ALL(TYPE, CTX) \
REDUCEMEAN_COMPUTE_CASE_CP(DT_COMPLEX64, std::complex<float>, TYPE, CTX) \
REDUCEMEAN_COMPUTE_CASE_CP(DT_COMPLEX128, std::complex<double>, TYPE, CTX) \
REDUCEMEAN_COMPUTE_CASE(DT_DOUBLE, double, TYPE, CTX) \
REDUCEMEAN_COMPUTE_CASE(DT_FLOAT, float, TYPE, CTX) \
REDUCEMEAN_COMPUTE_CASE(DT_FLOAT16, Eigen::half, TYPE, CTX) \
REDUCEMEAN_COMPUTE_CASE(DT_INT8, int8_t, TYPE, CTX) \
REDUCEMEAN_COMPUTE_CASE(DT_INT16, int16_t, TYPE, CTX) \
REDUCEMEAN_COMPUTE_CASE(DT_INT32, int32_t, TYPE, CTX) \
REDUCEMEAN_COMPUTE_CASE(DT_INT64, int64_t, TYPE, CTX) \
REDUCEMEAN_COMPUTE_CASE(DT_UINT8, uint8_t, TYPE, CTX) \
REDUCEMEAN_COMPUTE_CASE(DT_UINT16, uint16_t, TYPE, CTX) \
REDUCEMEAN_COMPUTE_CASE(DT_UINT32, uint32_t, TYPE, CTX) \
REDUCEMEAN_COMPUTE_CASE(DT_UINT64, uint64_t, TYPE, CTX)
} // namespace
namespace aicpu {
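// Divides a complex sum by an element count by scaling the real and imaginary parts separately.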
template <typename T>
T ComplexDiv(T sum, int64_t num) {
T res;
auto real = sum.real();
auto imag = sum.imag();
res.real(real / num);
res.imag(imag / num);
return res;
}
uint32_t ReduceMeanCpuKernel::Compute(CpuKernelContext &ctx) {
// check params
uint32_t input_num = ctx.GetInputsSize();
uint32_t output_num = ctx.GetOutputsSize();
if (input_num != 2 || output_num != 1) {
KERNEL_LOG_ERROR("The number of input or output parameters does not match.");
return KERNEL_STATUS_PARAM_INVALID;
}
Tensor *input_data = ctx.Input(0);
KERNEL_CHECK_NULLPTR(input_data->GetData(), KERNEL_STATUS_PARAM_INVALID, "Get input[0] failed.")
Tensor *axes_data = ctx.Input(1);
KERNEL_CHECK_NULLPTR(axes_data->GetData(), KERNEL_STATUS_PARAM_INVALID, "Get input[1] failed.")
Tensor *output_data = ctx.Output(0);
KERNEL_CHECK_NULLPTR(output_data->GetData(), KERNEL_STATUS_PARAM_INVALID, "Get output[0] failed.");
DataType data_type = ctx.Input(0)->GetDataType();
DataType axes_type = ctx.Input(1)->GetDataType();
switch (axes_type) {
case DT_INT32:
switch (data_type) {
REDUCEMEAN_COMPUTE_CASE_ALL(int32_t, ctx)
default:
KERNEL_LOG_ERROR("Input[0] data type[%s] not supported.", DTypeStr(data_type).c_str());
return KERNEL_STATUS_PARAM_INVALID;
}
break;
case DT_INT64:
switch (data_type) {
REDUCEMEAN_COMPUTE_CASE_ALL(int64_t, ctx)
default:
KERNEL_LOG_ERROR("Input[0] data type[%s] not supported.", DTypeStr(data_type).c_str());
return KERNEL_STATUS_PARAM_INVALID;
}
break;
default:
KERNEL_LOG_ERROR("Input[1] data type[%s] not supported.", DTypeStr(axes_type).c_str());
return KERNEL_STATUS_PARAM_INVALID;
}
return KERNEL_STATUS_OK;
}
/*
Calculate the mean of the corresponding dimension data
Rule: except for the specified dimension, a set of data with other
dimensions unchanged participate in the calculation of a mean.
e.g. input_x : float array[2][2][2]={1,2,3,4,5,6,7,8}
axes : [1 , 2]
output:[2.5, 6.5]
2.5 is calculated from array[0][0][0], array[0][0][1],
array[0][1][0] and array[0][1][1]
The same group of data addresses involved in calculating the
mean consists of one same base address and different offset addresses
input_data_address = base_address + offset_address
*/
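/*
Worked example of the addressing scheme (illustrative only): for the input above,
shape [2][2][2] with axes [1, 2], the row-major strides are dims_addr = {4, 2, 1}.
The only unspecified dimension is dim 0, so output element i has base address
base = i * dims_addr[0] = 4 * i, and the reduced elements are visited at offsets
0, 1, 2 and 3 from that base:
output[0] = mean(input[0..3]) = 2.5, output[1] = mean(input[4..7]) = 6.5.
*/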
template <typename T1, typename T2>
uint32_t ReduceMeanCpuKernel::ReduceMeanCompute(CpuKernelContext &ctx) {
Tensor *input_data = ctx.Input(0);
auto input_data_addr = reinterpret_cast<T1 *>(input_data->GetData());
const int64_t input_data_num = input_data->NumElements();
auto input_data_shape = input_data->GetTensorShape();
const int32_t input_data_dims = input_data_shape->GetDims();
std::vector<int64_t> input_data_dimsize = input_data_shape->GetDimSizes();
std::vector<int64_t> dims_addr(input_data_dims);
dims_addr[input_data_dims - 1] = 1;
int64_t addr_tmp = 1;
for (int32_t i = input_data_dims - 2; i > -1; i--) {
addr_tmp *= input_data_dimsize[i + 1];
dims_addr[i] = addr_tmp;
}
Tensor *output_data = ctx.Output(0);
auto output_data_shape = output_data->GetTensorShape();
auto output_data_addr = reinterpret_cast<T1 *>(output_data->GetData());
const int64_t output_data_num = output_data->NumElements();
Tensor *axes_data = ctx.Input(1);
auto axes_data_addr = reinterpret_cast<T2 *>(axes_data->GetData());
int64_t axes_data_num = axes_data->NumElements();
// Check the effectiveness of the value of axes
for (int64_t i = 0; i < axes_data_num; i++) {
if ((*(axes_data_addr + i) >= input_data_dims) || (*(axes_data_addr + i) < -input_data_dims)) {
KERNEL_LOG_ERROR("The value of axes is incorrect.");
return KERNEL_STATUS_PARAM_INVALID;
} else if (*(axes_data_addr + i) < 0) {
*(axes_data_addr + i) += input_data_dims;
}
}
std::sort(axes_data_addr, axes_data_addr + axes_data_num);
std::vector<T2> axes_data_norepeat;
for (int64_t i = 0; i < axes_data_num - 1; i++) {
T2 value = axes_data_addr[i];
if (value == axes_data_addr[i + 1]) {
axes_data_num--;
continue;
}
axes_data_norepeat.push_back(value);
}
axes_data_norepeat.push_back(axes_data_addr[axes_data_num - 1]);
// deal with attr
auto attr_value = ctx.GetAttr("keep_dims");
bool keep_dims;
if (attr_value == nullptr) {
keep_dims = false;
} else {
keep_dims = static_cast<bool>(attr_value->GetBool());
}
if (axes_data_num == input_data_dims) {
if (keep_dims) {
std::vector<int64_t> dims_new(axes_data_num, 1);
output_data_shape->SetDimSizes(dims_new);
} else {
std::vector<int64_t> dims_new(1, 1);
output_data_shape->SetDimSizes(dims_new);
}
T1 data_sum = static_cast<T1>(0);
for (int64_t i = 0; i < input_data_num; i++) {
data_sum += input_data_addr[i];
}
output_data_addr[0] = data_sum / input_data_num;
} else {
std::vector<int64_t> dims_new(input_data_shape->GetDimSizes());
if (keep_dims) {
for (auto iter = axes_data_norepeat.cbegin(); iter != axes_data_norepeat.cend(); iter++) {
dims_new[*iter] = 1;
}
} else {
for (auto iter = axes_data_norepeat.rbegin(); iter != axes_data_norepeat.rend(); iter++) {
dims_new.erase(dims_new.begin() + (*iter));
}
}
output_data_shape->SetDimSizes(dims_new);
// Extract unspecified dimensions
std::vector<T2> dims_base;
const int32_t axes_data_num_const = axes_data_num;
const int32_t dims_base_num = input_data_dims - axes_data_num_const;
for (T2 i = 0; i < (T2)input_data_dims; i++) {
bool cflag = true;
for (int64_t j = 0; j < axes_data_num_const; j++) {
if (axes_data_norepeat[j] == i) {
cflag = false;
break;
}
}
if (cflag) {
dims_base.push_back(i);
}
}
int64_t addr_stride[axes_data_num_const];
addr_tmp = 1;
addr_stride[axes_data_num_const - 1] = addr_tmp;
for (int32_t i = axes_data_num_const - 2; i > -1; i--) {
addr_tmp *= input_data_dimsize[axes_data_norepeat[i + 1]];
addr_stride[i] = addr_tmp;
}
int64_t offset_num = addr_tmp * input_data_dimsize[axes_data_norepeat[0]];
if ((input_data_num > 256 * 1024 && input_data_num / output_data_num > 256) || (output_data_num > 1024)) {
uint32_t min_core_num = 1;
int64_t max_core_num = std::max(min_core_num, aicpu::CpuKernelUtils::GetCPUNum(ctx) - 2);
if (max_core_num > output_data_num) {
max_core_num = output_data_num;
}
auto shard_compute = [&](size_t start, size_t end) {
for (size_t i = start; i < end; i++) {
int64_t output_i_addr = 0;
int64_t seq_tmp = i;
for (int32_t j = dims_base_num - 1; j > -1; j--) {
int64_t next = seq_tmp / input_data_dimsize[dims_base[j]];
int64_t loc = seq_tmp % input_data_dimsize[dims_base[j]];
seq_tmp = next;
output_i_addr += loc * dims_addr[dims_base[j]];
if (seq_tmp == 0) {
break;
}
}
T1 data_sum = input_data_addr[output_i_addr];
// In the array, the actual address of the element participating in the calculation.
int64_t addr_offset = 0;
for (int64_t j = 1; j < offset_num; j++) {
int32_t stride = axes_data_num_const - 1;
for (int32_t k = stride - 1; k > -1; k--) {
if (j % addr_stride[k] == 0) {
addr_offset -=
(input_data_dimsize[axes_data_norepeat[stride]] - 1) * dims_addr[axes_data_norepeat[stride]];
stride = k;
continue;
}
break;
}
addr_offset += dims_addr[axes_data_norepeat[stride]];
data_sum += input_data_addr[output_i_addr + addr_offset];
}
output_data_addr[i] = data_sum / offset_num;
}
};
KERNEL_HANDLE_ERROR(
CpuKernelUtils::ParallelFor(ctx, output_data_num, output_data_num / max_core_num, shard_compute),
"ReduceMean Compute failed.");
} else {
for (int64_t i = 0; i < output_data_num; i++) {
// In the array, the actual address of the output.
int64_t output_i_addr = 0;
int64_t seq_tmp = i;
for (int32_t j = dims_base_num - 1; j > -1; j--) {
int64_t next = seq_tmp / input_data_dimsize[dims_base[j]];
int64_t loc = seq_tmp % input_data_dimsize[dims_base[j]];
seq_tmp = next;
output_i_addr += loc * dims_addr[dims_base[j]];
if (seq_tmp == 0) {
break;
}
}
T1 data_sum = input_data_addr[output_i_addr];
// In the array, the actual address of the element participating in the calculation.
int64_t addr_offset = 0;
for (int64_t j = 1; j < offset_num; j++) {
int32_t stride = axes_data_num_const - 1;
for (int32_t k = stride - 1; k > -1; k--) {
if (j % addr_stride[k] == 0) {
addr_offset -=
(input_data_dimsize[axes_data_norepeat[stride]] - 1) * dims_addr[axes_data_norepeat[stride]];
stride = k;
continue;
}
break;
}
addr_offset += dims_addr[axes_data_norepeat[stride]];
data_sum += input_data_addr[output_i_addr + addr_offset];
}
output_data_addr[i] = data_sum / offset_num;
}
}
}
return KERNEL_STATUS_OK;
}
template <typename T1, typename T2>
uint32_t ReduceMeanCpuKernel::ReduceMeanCompute_Complex(CpuKernelContext &ctx) {
Tensor *input_data = ctx.Input(0);
auto input_data_addr = reinterpret_cast<T1 *>(input_data->GetData());
const int64_t input_data_num = input_data->NumElements();
auto input_data_shape = input_data->GetTensorShape();
const int32_t input_data_dims = input_data_shape->GetDims();
std::vector<int64_t> input_data_dimsize = input_data_shape->GetDimSizes();
std::vector<int64_t> dims_addr(input_data_dims);
dims_addr[input_data_dims - 1] = 1;
int64_t addr_tmp = 1;
for (int32_t i = input_data_dims - 2; i > -1; i--) {
addr_tmp *= input_data_dimsize[i + 1];
dims_addr[i] = addr_tmp;
}
Tensor *output_data = ctx.Output(0);
auto output_data_shape = output_data->GetTensorShape();
auto output_data_addr = reinterpret_cast<T1 *>(output_data->GetData());
const int64_t output_data_num = output_data->NumElements();
Tensor *axes_data = ctx.Input(1);
auto axes_data_addr = reinterpret_cast<T2 *>(axes_data->GetData());
int64_t axes_data_num = axes_data->NumElements();
// Check the effectiveness of the value of axes
for (int64_t i = 0; i < axes_data_num; i++) {
if ((*(axes_data_addr + i) >= input_data_dims) || (*(axes_data_addr + i) < -input_data_dims)) {
KERNEL_LOG_ERROR("The value of axes is incorrect.");
return KERNEL_STATUS_PARAM_INVALID;
} else if (*(axes_data_addr + i) < 0) {
*(axes_data_addr + i) += input_data_dims;
}
}
std::sort(axes_data_addr, axes_data_addr + axes_data_num);
std::vector<T2> axes_data_norepeat;
for (int64_t i = 0; i < axes_data_num - 1; i++) {
T2 value = axes_data_addr[i];
if (value == axes_data_addr[i + 1]) {
axes_data_num--;
continue;
}
axes_data_norepeat.push_back(value);
}
axes_data_norepeat.push_back(axes_data_addr[axes_data_num - 1]);
// deal with attr
auto attr_value = ctx.GetAttr("keep_dims");
bool keep_dims;
if (attr_value == nullptr) {
keep_dims = false;
} else {
keep_dims = static_cast<bool>(attr_value->GetBool());
}
if (axes_data_num == input_data_dims) {
if (keep_dims) {
std::vector<int64_t> dims_new(axes_data_num, 1);
output_data_shape->SetDimSizes(dims_new);
} else {
std::vector<int64_t> dims_new(1, 1);
output_data_shape->SetDimSizes(dims_new);
}
T1 data_sum = static_cast<T1>(0);
for (int64_t i = 0; i < input_data_num; i++) {
data_sum += input_data_addr[i];
}
output_data_addr[0] = ComplexDiv<T1>(data_sum, input_data_num);
} else {
std::vector<int64_t> dims_new(input_data_shape->GetDimSizes());
if (keep_dims) {
for (auto iter = axes_data_norepeat.cbegin(); iter != axes_data_norepeat.cend(); iter++) {
dims_new[*iter] = 1;
}
} else {
for (auto iter = axes_data_norepeat.rbegin(); iter != axes_data_norepeat.rend(); iter++) {
dims_new.erase(dims_new.begin() + (*iter));
}
}
output_data_shape->SetDimSizes(dims_new);
// Extract unspecified dimensions
std::vector<T2> dims_base;
const int32_t axes_data_num_const = axes_data_num;
const int32_t dims_base_num = input_data_dims - axes_data_num_const;
for (T2 i = 0; i < (T2)input_data_dims; i++) {
bool cflag = true;
for (int64_t j = 0; j < axes_data_num_const; j++) {
if (axes_data_norepeat[j] == i) {
cflag = false;
break;
}
}
if (cflag) {
dims_base.push_back(i);
}
}
int64_t addr_stride[axes_data_num_const];
addr_tmp = 1;
addr_stride[axes_data_num_const - 1] = addr_tmp;
for (int32_t i = axes_data_num_const - 2; i > -1; i--) {
addr_tmp *= input_data_dimsize[axes_data_norepeat[i + 1]];
addr_stride[i] = addr_tmp;
}
int64_t offset_num = addr_tmp * input_data_dimsize[axes_data_norepeat[0]];
if ((input_data_num > 256 * 1024 && input_data_num / output_data_num > 256) || (output_data_num > 1024)) {
uint32_t min_core_num = 1;
int64_t max_core_num = std::max(min_core_num, aicpu::CpuKernelUtils::GetCPUNum(ctx) - 2);
if (max_core_num > output_data_num) {
max_core_num = output_data_num;
}
auto shard_compute = [&](size_t start, size_t end) {
for (size_t i = start; i < end; i++) {
int64_t output_i_addr = 0;
int64_t seq_tmp = i;
for (int32_t j = dims_base_num - 1; j > -1; j--) {
int64_t next = seq_tmp / input_data_dimsize[dims_base[j]];
int64_t loc = seq_tmp % input_data_dimsize[dims_base[j]];
seq_tmp = next;
output_i_addr += loc * dims_addr[dims_base[j]];
if (seq_tmp == 0) {
break;
}
}
T1 data_sum = input_data_addr[output_i_addr];
// In the array, the actual address of the element participating in the calculation.
int64_t addr_offset = 0;
for (int64_t j = 1; j < offset_num; j++) {
int32_t stride = axes_data_num_const - 1;
for (int32_t k = stride - 1; k > -1; k--) {
if (j % addr_stride[k] == 0) {
addr_offset -=
(input_data_dimsize[axes_data_norepeat[stride]] - 1) * dims_addr[axes_data_norepeat[stride]];
stride = k;
continue;
}
break;
}
addr_offset += dims_addr[axes_data_norepeat[stride]];
data_sum += input_data_addr[output_i_addr + addr_offset];
}
output_data_addr[i] = ComplexDiv<T1>(data_sum, offset_num);
}
};
KERNEL_HANDLE_ERROR(
CpuKernelUtils::ParallelFor(ctx, output_data_num, output_data_num / max_core_num, shard_compute),
"ReduceMean Compute failed.");
} else {
for (int64_t i = 0; i < output_data_num; i++) {
// In the array, the actual address of the output.
int64_t output_i_addr = 0;
int64_t seq_tmp = i;
for (int32_t j = dims_base_num - 1; j > -1; j--) {
int64_t next = seq_tmp / input_data_dimsize[dims_base[j]];
int64_t loc = seq_tmp % input_data_dimsize[dims_base[j]];
seq_tmp = next;
output_i_addr += loc * dims_addr[dims_base[j]];
if (seq_tmp == 0) {
break;
}
}
T1 data_sum = input_data_addr[output_i_addr];
// In the array, the actual address of the element participating in the calculation.
int64_t addr_offset = 0;
for (int64_t j = 1; j < offset_num; j++) {
int32_t stride = axes_data_num_const - 1;
for (int32_t k = stride - 1; k > -1; k--) {
if (j % addr_stride[k] == 0) {
addr_offset -=
(input_data_dimsize[axes_data_norepeat[stride]] - 1) * dims_addr[axes_data_norepeat[stride]];
stride = k;
continue;
}
break;
}
addr_offset += dims_addr[axes_data_norepeat[stride]];
data_sum += input_data_addr[output_i_addr + addr_offset];
}
output_data_addr[i] = ComplexDiv<T1>(data_sum, offset_num);
}
}
}
return KERNEL_STATUS_OK;
}
REGISTER_CPU_KERNEL(kReduceMean, ReduceMeanCpuKernel);
} // namespace aicpu

View File

@ -0,0 +1,38 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef AICPU_KERNELS_NORMALIZED_REDUCEMEAN_H_
#define AICPU_KERNELS_NORMALIZED_REDUCEMEAN_H_
#include "cpu_ops_kernel.h"
namespace aicpu {
class ReduceMeanCpuKernel : public CpuKernel {
public:
ReduceMeanCpuKernel() = default;
~ReduceMeanCpuKernel() override = default;
protected:
uint32_t Compute(CpuKernelContext &ctx) override;
private:
template <typename T1, typename T2>
static uint32_t ReduceMeanCompute(CpuKernelContext &ctx);
template <typename T1, typename T2>
static uint32_t ReduceMeanCompute_Complex(CpuKernelContext &ctx);
};
} // namespace aicpu
#endif

View File

@ -0,0 +1,496 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "reduce_prod.h"
#include "cpu_kernel_utils.h"
#include "utils/eigen_tensor.h"
#include "utils/kernel_util.h"
#include "algorithm"
#include "iostream"
namespace {
const char *kReduceProd = "ReduceProd";
#define REDUCEPROD_COMPUTE_CASE(DTYPE, TYPE1, TYPE2, CTX) \
case (DTYPE): { \
uint32_t result = ReduceProdCompute<TYPE1, TYPE2>(CTX); \
if (result != KERNEL_STATUS_OK) { \
KERNEL_LOG_ERROR("ReduceProd kernel compute failed."); \
return result; \
} \
break; \
}
#define REDUCEPROD_COMPUTE_CASE_CP(DTYPE, TYPE1, TYPE2, CTX) \
case (DTYPE): { \
uint32_t result = ReduceProdCompute_Complex<TYPE1, TYPE2>(CTX); \
if (result != KERNEL_STATUS_OK) { \
KERNEL_LOG_ERROR("ReduceProd kernel compute failed."); \
return result; \
} \
break; \
}
#define REDUCEPROD_COMPUTE_CASE_ALL(TYPE, CTX) \
REDUCEPROD_COMPUTE_CASE_CP(DT_COMPLEX64, std::complex<float>, TYPE, CTX) \
REDUCEPROD_COMPUTE_CASE_CP(DT_COMPLEX128, std::complex<double>, TYPE, CTX) \
REDUCEPROD_COMPUTE_CASE(DT_DOUBLE, double, TYPE, CTX) \
REDUCEPROD_COMPUTE_CASE(DT_FLOAT, float, TYPE, CTX) \
REDUCEPROD_COMPUTE_CASE(DT_FLOAT16, Eigen::half, TYPE, CTX) \
REDUCEPROD_COMPUTE_CASE(DT_INT8, int8_t, TYPE, CTX) \
REDUCEPROD_COMPUTE_CASE(DT_INT16, int16_t, TYPE, CTX) \
REDUCEPROD_COMPUTE_CASE(DT_INT32, int32_t, TYPE, CTX) \
REDUCEPROD_COMPUTE_CASE(DT_INT64, int64_t, TYPE, CTX) \
REDUCEPROD_COMPUTE_CASE(DT_UINT8, uint8_t, TYPE, CTX) \
REDUCEPROD_COMPUTE_CASE(DT_UINT16, uint16_t, TYPE, CTX) \
REDUCEPROD_COMPUTE_CASE(DT_UINT32, uint32_t, TYPE, CTX) \
REDUCEPROD_COMPUTE_CASE(DT_UINT64, uint64_t, TYPE, CTX)
} // namespace
namespace aicpu {
template <typename T>
T ReduceProdCpuKernel::ComputeMul(T num_1, T num_2) {
T res;
auto a = num_1.real();
auto b = num_1.imag();
auto x = num_2.real();
auto y = num_2.imag();
auto real_res = a * x - b * y;
auto imag_res = b * x + a * y;
res.real(real_res);
res.imag(imag_res);
return res;
}
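// ComputeMul performs the complex multiplication (a + bi)(x + yi) = (ax - by) + (bx + ay)i
// explicitly; it is used below to accumulate the product of complex inputs.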
uint32_t ReduceProdCpuKernel::Compute(CpuKernelContext &ctx) {
// check params
uint32_t input_num = ctx.GetInputsSize();
uint32_t output_num = ctx.GetOutputsSize();
if (input_num != 2 || output_num != 1) {
KERNEL_LOG_ERROR("The number of input or output parameters does not match.");
return KERNEL_STATUS_PARAM_INVALID;
}
Tensor *input_data = ctx.Input(0);
KERNEL_CHECK_NULLPTR(input_data->GetData(), KERNEL_STATUS_PARAM_INVALID, "Get input[0] failed.")
Tensor *axes_data = ctx.Input(1);
KERNEL_CHECK_NULLPTR(axes_data->GetData(), KERNEL_STATUS_PARAM_INVALID, "Get input[1] failed.")
Tensor *output_data = ctx.Output(0);
KERNEL_CHECK_NULLPTR(output_data->GetData(), KERNEL_STATUS_PARAM_INVALID, "Get output[0] failed.");
DataType data_type = ctx.Input(0)->GetDataType();
DataType axes_type = ctx.Input(1)->GetDataType();
switch (axes_type) {
case DT_INT32:
switch (data_type) {
REDUCEPROD_COMPUTE_CASE_ALL(int32_t, ctx)
default:
KERNEL_LOG_ERROR("Input[0] data type[%s] not supported.", DTypeStr(data_type).c_str());
return KERNEL_STATUS_PARAM_INVALID;
}
break;
case DT_INT64:
switch (data_type) {
REDUCEPROD_COMPUTE_CASE_ALL(int64_t, ctx)
default:
KERNEL_LOG_ERROR("Input[0] data type[%s] not supported.", DTypeStr(data_type).c_str());
return KERNEL_STATUS_PARAM_INVALID;
}
break;
default:
KERNEL_LOG_ERROR("Input[1] data type[%s] not supported.", DTypeStr(axes_type).c_str());
return KERNEL_STATUS_PARAM_INVALID;
}
return KERNEL_STATUS_OK;
}
/*
Calculate the prod of the corresponding dimension data
Rule: except for the specified dimension, a set of data with other
dimensions unchanged participate in the calculation of a prod.
e.g. input_x : float array[2][2][2]={1,2,3,4,5,6,7,8}
axes : [1 , 2]
output:[24, 1680]
24 is calculated from array[0][0][0], array[0][0][1],
array[0][1][0] and array[0][1][1]
The same group of data addresses involved in calculating the
prod consists of one same base address and different offset addresses
input_data_address = base_address + offset_address
*/
template <typename T1, typename T2>
uint32_t ReduceProdCpuKernel::ReduceProdCompute(CpuKernelContext &ctx) {
Tensor *input_data = ctx.Input(0);
auto input_data_addr = reinterpret_cast<T1 *>(input_data->GetData());
const int64_t input_data_num = input_data->NumElements();
auto input_data_shape = input_data->GetTensorShape();
const int32_t input_data_dims = input_data_shape->GetDims();
std::vector<int64_t> input_data_dimsize = input_data_shape->GetDimSizes();
std::vector<int64_t> dims_addr(input_data_dims);
dims_addr[input_data_dims - 1] = 1;
int64_t addr_tmp = 1;
for (int32_t i = input_data_dims - 2; i > -1; i--) {
addr_tmp *= input_data_dimsize[i + 1];
dims_addr[i] = addr_tmp;
}
Tensor *output_data = ctx.Output(0);
auto output_data_shape = output_data->GetTensorShape();
auto output_data_addr = reinterpret_cast<T1 *>(output_data->GetData());
const int64_t output_data_num = output_data->NumElements();
Tensor *axes_data = ctx.Input(1);
auto axes_data_addr = reinterpret_cast<T2 *>(axes_data->GetData());
int64_t axes_data_num = axes_data->NumElements();
// Check the effectiveness of the value of axes
for (int64_t i = 0; i < axes_data_num; i++) {
if ((*(axes_data_addr + i) >= input_data_dims) || (*(axes_data_addr + i) < -input_data_dims)) {
KERNEL_LOG_ERROR("The value of axes is incorrect.");
return KERNEL_STATUS_PARAM_INVALID;
} else if (*(axes_data_addr + i) < 0) {
*(axes_data_addr + i) += input_data_dims;
}
}
std::sort(axes_data_addr, axes_data_addr + axes_data_num);
std::vector<T2> axes_data_norepeat;
for (int64_t i = 0; i < axes_data_num - 1; i++) {
T2 value = axes_data_addr[i];
if (value == axes_data_addr[i + 1]) {
axes_data_num--;
continue;
}
axes_data_norepeat.push_back(value);
}
axes_data_norepeat.push_back(axes_data_addr[axes_data_num - 1]);
// deal with attr
auto attr_value = ctx.GetAttr("keep_dims");
bool keep_dims;
if (attr_value == nullptr) {
keep_dims = false;
} else {
keep_dims = static_cast<bool>(attr_value->GetBool());
}
if (axes_data_num == input_data_dims) {
if (keep_dims) {
std::vector<int64_t> dims_new(axes_data_num, 1);
output_data_shape->SetDimSizes(dims_new);
} else {
std::vector<int64_t> dims_new(1, 1);
output_data_shape->SetDimSizes(dims_new);
}
T1 data_prod = static_cast<T1>(1);
for (int64_t i = 0; i < input_data_num; i++) {
data_prod *= input_data_addr[i];
}
output_data_addr[0] = data_prod;
} else {
std::vector<int64_t> dims_new(input_data_shape->GetDimSizes());
if (keep_dims) {
for (auto iter = axes_data_norepeat.cbegin(); iter != axes_data_norepeat.cend(); iter++) {
dims_new[*iter] = 1;
}
} else {
for (auto iter = axes_data_norepeat.rbegin(); iter != axes_data_norepeat.rend(); iter++) {
dims_new.erase(dims_new.begin() + (*iter));
}
}
output_data_shape->SetDimSizes(dims_new);
// Extract unspecified dimensions
std::vector<T2> dims_base;
const int32_t axes_data_num_const = axes_data_num;
const int32_t dims_base_num = input_data_dims - axes_data_num_const;
for (T2 i = 0; i < (T2)input_data_dims; i++) {
bool cflag = true;
for (int64_t j = 0; j < axes_data_num_const; j++) {
if (axes_data_norepeat[j] == i) {
cflag = false;
break;
}
}
if (cflag) {
dims_base.push_back(i);
}
}
int64_t addr_stride[axes_data_num_const];
addr_tmp = 1;
addr_stride[axes_data_num_const - 1] = addr_tmp;
for (int32_t i = axes_data_num_const - 2; i > -1; i--) {
addr_tmp *= input_data_dimsize[axes_data_norepeat[i + 1]];
addr_stride[i] = addr_tmp;
}
int64_t offset_num = addr_tmp * input_data_dimsize[axes_data_norepeat[0]];
if ((input_data_num > 256 * 1024 && input_data_num / output_data_num > 256) || (output_data_num > 1024)) {
uint32_t min_core_num = 1;
int64_t max_core_num = std::max(min_core_num, aicpu::CpuKernelUtils::GetCPUNum(ctx) - 2);
if (max_core_num > output_data_num) {
max_core_num = output_data_num;
}
auto shard_compute = [&](size_t start, size_t end) {
for (size_t i = start; i < end; i++) {
int64_t output_i_addr = 0;
int64_t seq_tmp = i;
for (int32_t j = dims_base_num - 1; j > -1; j--) {
int64_t next = seq_tmp / input_data_dimsize[dims_base[j]];
int64_t loc = seq_tmp % input_data_dimsize[dims_base[j]];
seq_tmp = next;
output_i_addr += loc * dims_addr[dims_base[j]];
if (seq_tmp == 0) {
break;
}
}
T1 data_prod = input_data_addr[output_i_addr];
// In the array, the actual address of the element participating in the calculation.
int64_t addr_offset = 0;
for (int64_t j = 1; j < offset_num; j++) {
int32_t stride = axes_data_num_const - 1;
for (int32_t k = stride - 1; k > -1; k--) {
if (j % addr_stride[k] == 0) {
addr_offset -=
(input_data_dimsize[axes_data_norepeat[stride]] - 1) * dims_addr[axes_data_norepeat[stride]];
stride = k;
continue;
}
break;
}
addr_offset += dims_addr[axes_data_norepeat[stride]];
data_prod *= input_data_addr[output_i_addr + addr_offset];
}
output_data_addr[i] = data_prod;
}
};
KERNEL_HANDLE_ERROR(
CpuKernelUtils::ParallelFor(ctx, output_data_num, output_data_num / max_core_num, shard_compute),
"ReduceProd Compute failed.");
} else {
for (int64_t i = 0; i < output_data_num; i++) {
// In the array, the actual address of the output.
int64_t output_i_addr = 0;
int64_t seq_tmp = i;
for (int32_t j = dims_base_num - 1; j > -1; j--) {
int64_t next = seq_tmp / input_data_dimsize[dims_base[j]];
int64_t loc = seq_tmp % input_data_dimsize[dims_base[j]];
seq_tmp = next;
output_i_addr += loc * dims_addr[dims_base[j]];
if (seq_tmp == 0) {
break;
}
}
T1 data_prod = input_data_addr[output_i_addr];
// In the array, the actual address of the element participating in the calculation.
int64_t addr_offset = 0;
for (int64_t j = 1; j < offset_num; j++) {
int32_t stride = axes_data_num_const - 1;
for (int32_t k = stride - 1; k > -1; k--) {
if (j % addr_stride[k] == 0) {
addr_offset -=
(input_data_dimsize[axes_data_norepeat[stride]] - 1) * dims_addr[axes_data_norepeat[stride]];
stride = k;
continue;
}
break;
}
addr_offset += dims_addr[axes_data_norepeat[stride]];
data_prod *= input_data_addr[output_i_addr + addr_offset];
}
output_data_addr[i] = data_prod;
}
}
}
return KERNEL_STATUS_OK;
}
template <typename T1, typename T2>
uint32_t ReduceProdCpuKernel::ReduceProdCompute_Complex(CpuKernelContext &ctx) {
Tensor *input_data = ctx.Input(0);
auto input_data_addr = reinterpret_cast<T1 *>(input_data->GetData());
const int64_t input_data_num = input_data->NumElements();
auto input_data_shape = input_data->GetTensorShape();
const int32_t input_data_dims = input_data_shape->GetDims();
std::vector<int64_t> input_data_dimsize = input_data_shape->GetDimSizes();
std::vector<int64_t> dims_addr(input_data_dims);
dims_addr[input_data_dims - 1] = 1;
int64_t addr_tmp = 1;
for (int32_t i = input_data_dims - 2; i > -1; i--) {
addr_tmp *= input_data_dimsize[i + 1];
dims_addr[i] = addr_tmp;
}
Tensor *output_data = ctx.Output(0);
auto output_data_shape = output_data->GetTensorShape();
auto output_data_addr = reinterpret_cast<T1 *>(output_data->GetData());
const int64_t output_data_num = output_data->NumElements();
Tensor *axes_data = ctx.Input(1);
auto axes_data_addr = reinterpret_cast<T2 *>(axes_data->GetData());
int64_t axes_data_num = axes_data->NumElements();
// Check the effectiveness of the value of axes
for (int64_t i = 0; i < axes_data_num; i++) {
if ((*(axes_data_addr + i) >= input_data_dims) || (*(axes_data_addr + i) < -input_data_dims)) {
KERNEL_LOG_ERROR("The value of axes is incorrect.");
return KERNEL_STATUS_PARAM_INVALID;
} else if (*(axes_data_addr + i) < 0) {
*(axes_data_addr + i) += input_data_dims;
}
}
std::sort(axes_data_addr, axes_data_addr + axes_data_num);
std::vector<T2> axes_data_norepeat;
for (int64_t i = 0; i < axes_data_num - 1; i++) {
T2 value = axes_data_addr[i];
if (value == axes_data_addr[i + 1]) {
axes_data_num--;
continue;
}
axes_data_norepeat.push_back(value);
}
axes_data_norepeat.push_back(axes_data_addr[axes_data_num - 1]);
// deal with attr
auto attr_value = ctx.GetAttr("keep_dims");
bool keep_dims;
if (attr_value == nullptr) {
keep_dims = false;
} else {
keep_dims = static_cast<bool>(attr_value->GetBool());
}
if (axes_data_num == input_data_dims) {
if (keep_dims) {
std::vector<int64_t> dims_new(axes_data_num, 1);
output_data_shape->SetDimSizes(dims_new);
} else {
std::vector<int64_t> dims_new(1, 1);
output_data_shape->SetDimSizes(dims_new);
}
T1 data_prod;
data_prod.real(1);
data_prod.imag(0);
for (int64_t i = 0; i < input_data_num; i++) {
T1 data_cur = input_data_addr[i];
data_prod = ComputeMul<T1>(data_prod, data_cur);
}
output_data_addr[0] = data_prod;
} else {
std::vector<int64_t> dims_new(input_data_shape->GetDimSizes());
if (keep_dims) {
for (auto iter = axes_data_norepeat.cbegin(); iter != axes_data_norepeat.cend(); iter++) {
dims_new[*iter] = 1;
}
} else {
for (auto iter = axes_data_norepeat.rbegin(); iter != axes_data_norepeat.rend(); iter++) {
dims_new.erase(dims_new.begin() + (*iter));
}
}
output_data_shape->SetDimSizes(dims_new);
// Extract unspecified dimensions
std::vector<T2> dims_base;
const int32_t axes_data_num_const = axes_data_num;
const int32_t dims_base_num = input_data_dims - axes_data_num_const;
for (T2 i = 0; i < (T2)input_data_dims; i++) {
bool cflag = true;
for (int64_t j = 0; j < axes_data_num_const; j++) {
if (axes_data_norepeat[j] == i) {
cflag = false;
break;
}
}
if (cflag) {
dims_base.push_back(i);
}
}
int64_t addr_stride[axes_data_num_const];
addr_tmp = 1;
addr_stride[axes_data_num_const - 1] = addr_tmp;
for (int32_t i = axes_data_num_const - 2; i > -1; i--) {
addr_tmp *= input_data_dimsize[axes_data_norepeat[i + 1]];
addr_stride[i] = addr_tmp;
}
int64_t offset_num = addr_tmp * input_data_dimsize[axes_data_norepeat[0]];
if ((input_data_num > 256 * 1024 && input_data_num / output_data_num > 256) || (output_data_num > 1024)) {
uint32_t min_core_num = 1;
int64_t max_core_num = std::max(min_core_num, aicpu::CpuKernelUtils::GetCPUNum(ctx) - 2);
if (max_core_num > output_data_num) {
max_core_num = output_data_num;
}
auto shard_compute = [&](size_t start, size_t end) {
for (size_t i = start; i < end; i++) {
int64_t output_i_addr = 0;
int64_t seq_tmp = i;
for (int32_t j = dims_base_num - 1; j > -1; j--) {
int64_t next = seq_tmp / input_data_dimsize[dims_base[j]];
int64_t loc = seq_tmp % input_data_dimsize[dims_base[j]];
seq_tmp = next;
output_i_addr += loc * dims_addr[dims_base[j]];
if (seq_tmp == 0) {
break;
}
}
T1 data_prod = input_data_addr[output_i_addr];
// In the array, the actual address of the element participating in the calculation.
int64_t addr_offset = 0;
for (int64_t j = 1; j < offset_num; j++) {
int32_t stride = axes_data_num_const - 1;
for (int32_t k = stride - 1; k > -1; k--) {
if (j % addr_stride[k] == 0) {
addr_offset -=
(input_data_dimsize[axes_data_norepeat[stride]] - 1) * dims_addr[axes_data_norepeat[stride]];
stride = k;
continue;
}
break;
}
addr_offset += dims_addr[axes_data_norepeat[stride]];
T1 data_cur = input_data_addr[output_i_addr + addr_offset];
data_prod = ComputeMul<T1>(data_prod, data_cur);
}
output_data_addr[i] = data_prod;
}
};
KERNEL_HANDLE_ERROR(
CpuKernelUtils::ParallelFor(ctx, output_data_num, output_data_num / max_core_num, shard_compute),
"ReduceProd Compute failed.");
} else {
for (int64_t i = 0; i < output_data_num; i++) {
// In the array, the actual address of the output.
int64_t output_i_addr = 0;
int64_t seq_tmp = i;
for (int32_t j = dims_base_num - 1; j > -1; j--) {
int64_t next = seq_tmp / input_data_dimsize[dims_base[j]];
int64_t loc = seq_tmp % input_data_dimsize[dims_base[j]];
seq_tmp = next;
output_i_addr += loc * dims_addr[dims_base[j]];
if (seq_tmp == 0) {
break;
}
}
T1 data_prod = input_data_addr[output_i_addr];
// In the array, the actual address of the element participating in the calculation.
int64_t addr_offset = 0;
for (int64_t j = 1; j < offset_num; j++) {
int32_t stride = axes_data_num_const - 1;
for (int32_t k = stride - 1; k > -1; k--) {
if (j % addr_stride[k] == 0) {
addr_offset -=
(input_data_dimsize[axes_data_norepeat[stride]] - 1) * dims_addr[axes_data_norepeat[stride]];
stride = k;
continue;
}
break;
}
addr_offset += dims_addr[axes_data_norepeat[stride]];
T1 data_cur = input_data_addr[output_i_addr + addr_offset];
data_prod = ComputeMul<T1>(data_prod, data_cur);
}
output_data_addr[i] = data_prod;
}
}
}
return KERNEL_STATUS_OK;
}
REGISTER_CPU_KERNEL(kReduceProd, ReduceProdCpuKernel);
} // namespace aicpu

View File

@ -0,0 +1,41 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef AICPU_KERNELS_NORMALIZED_REDUCEPROD_H_
#define AICPU_KERNELS_NORMALIZED_REDUCEPROD_H_
#include "cpu_ops_kernel.h"
namespace aicpu {
class ReduceProdCpuKernel : public CpuKernel {
public:
ReduceProdCpuKernel() = default;
~ReduceProdCpuKernel() override = default;
protected:
uint32_t Compute(CpuKernelContext &ctx) override;
private:
template <typename T>
static T ComputeMul(T num_1, T num_2);
template <typename T1, typename T2>
static uint32_t ReduceProdCompute(CpuKernelContext &ctx);
template <typename T1, typename T2>
static uint32_t ReduceProdCompute_Complex(CpuKernelContext &ctx);
};
} // namespace aicpu
#endif

View File

@ -0,0 +1,107 @@
/**
* Copyright (c) Huawei Technologies Co., Ltd. 2022. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "relu.h"
#include "cpu_kernel_utils.h"
#include "utils/eigen_tensor.h"
#include "utils/kernel_util.h"
namespace {
const uint32_t kOutputNum = 1;
const uint32_t kInputNum = 1;
const char *kRelu = "Relu";
// when input data size is more than kParallelDataNum, use Parallel func
const int64_t kParallelDataNum = 2 * 1024;
const int64_t kParallelDataNumMid = 16 * 1024;
const int64_t kParallelDataNumSameShape = 7 * 1024;
const int64_t kParallelDataNumSameShapeMid = 35 * 1024;
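// In ReluCompute below, data sizes under kParallelDataNumSameShape run single-threaded;
// sizes up to kParallelDataNumSameShapeMid run in parallel with at most 4 cores;
// larger sizes may use all available cores minus the reserved ones.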
#define RELU_COMPUTE_CASE(DTYPE, TYPE, CTX) \
case (DTYPE): { \
uint32_t result = ReluCompute<TYPE>(CTX); \
if (result != KERNEL_STATUS_OK) { \
KERNEL_LOG_ERROR("Relu kernel compute failed."); \
return result; \
} \
break; \
}
} // namespace
namespace aicpu {
uint32_t ReluCpuKernel::Compute(CpuKernelContext &ctx) {
// check params
KERNEL_HANDLE_ERROR(NormalCheck(ctx, kInputNum, kOutputNum), "Relu check input and output number failed.");
auto data_type = ctx.Input(0)->GetDataType();
switch (data_type) {
RELU_COMPUTE_CASE(DT_INT8, int8_t, ctx)
RELU_COMPUTE_CASE(DT_INT16, int16_t, ctx)
RELU_COMPUTE_CASE(DT_INT32, int32_t, ctx)
RELU_COMPUTE_CASE(DT_INT64, int64_t, ctx)
RELU_COMPUTE_CASE(DT_UINT8, uint8_t, ctx)
RELU_COMPUTE_CASE(DT_UINT16, uint16_t, ctx)
RELU_COMPUTE_CASE(DT_FLOAT16, Eigen::half, ctx)
RELU_COMPUTE_CASE(DT_FLOAT, float, ctx)
RELU_COMPUTE_CASE(DT_DOUBLE, double, ctx)
default:
KERNEL_LOG_ERROR("Relu kernel data type [%s] not support.", DTypeStr(data_type).c_str());
return KERNEL_STATUS_PARAM_INVALID;
}
return KERNEL_STATUS_OK;
}
template <typename T>
void ReluCpuKernel::DoCompute(int64_t start, int64_t end, const T *input1, T *output) {
for (int64_t i = start; i < end; ++i) {
T v = *(input1 + i);
bool p = v > static_cast<T>(0);
*(output + i) = p ? v : static_cast<T>(0);
}
}
template <typename T>
uint32_t ReluCpuKernel::ReluCompute(CpuKernelContext &ctx) {
auto in0 = reinterpret_cast<T *>(ctx.Input(0)->GetData());
auto out = reinterpret_cast<T *>(ctx.Output(0)->GetData());
int64_t data_num = ctx.Output(0)->NumElements();
if (data_num >= kParallelDataNumSameShape) {
uint32_t min_core_num = 1;
uint32_t max_core_num = std::max(min_core_num, aicpu::CpuKernelUtils::GetCPUNum(ctx) - kResvCpuNum);
if (data_num <= kParallelDataNumSameShapeMid) {
max_core_num = std::min(max_core_num, 4U); // up to 4 cpu cores
}
if (max_core_num > data_num) {
max_core_num = data_num;
}
auto sharder_relu = [&](int64_t start, int64_t end) { DoCompute<T>(start, end, in0, out); };
if (max_core_num == 0) {
KERNEL_LOG_ERROR("max_core_num could not be 0.");
return KERNEL_STATUS_PARAM_INVALID;
}
KERNEL_HANDLE_ERROR(CpuKernelUtils::ParallelFor(ctx, data_num, data_num / max_core_num, sharder_relu),
"Relu Compute failed.");
} else {
DoCompute<T>(0, data_num, in0, out);
}
return KERNEL_STATUS_OK;
}
REGISTER_CPU_KERNEL(kRelu, ReluCpuKernel);
} // namespace aicpu

View File

@ -0,0 +1,39 @@
/**
* Copyright (c) Huawei Technologies Co., Ltd. 2022. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef AICPU_KERNELS_NORMALIZED_RELU_H_
#define AICPU_KERNELS_NORMALIZED_RELU_H_
#include "cpu_ops_kernel.h"
#include "utils/bcast.h"
namespace aicpu {
class ReluCpuKernel : public CpuKernel {
public:
ReluCpuKernel() = default;
~ReluCpuKernel() override = default;
protected:
uint32_t Compute(CpuKernelContext &ctx) override;
private:
template <typename T>
void DoCompute(int64_t start, int64_t end, const T *input1, T *output);
template <typename T>
uint32_t ReluCompute(CpuKernelContext &ctx);
};
} // namespace aicpu
#endif

View File

@ -0,0 +1,186 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "reversev2.h"
#include <securec.h>
#include "Eigen/Core"
#include "cpu_kernel_utils.h"
#include "iostream"
#include "utils/eigen_tensor.h"
#include "utils/kernel_util.h"
using namespace std;
namespace {
const uint32_t kInputNum = 2;
const uint32_t kOutputNum = 1;
const char *kReverseV2 = "ReverseV2";
} // namespace
namespace aicpu {
uint32_t ReverseV2CpuKernel::Compute(CpuKernelContext &ctx) {
int x_max_dim = 8;
KERNEL_HANDLE_ERROR(NormalCheck(ctx, kInputNum, kOutputNum), "ReverseV2 check input or output is failed.");
DataType axis_type = ctx.Input(1)->GetDataType();
KERNEL_CHECK_FALSE((axis_type == DT_INT32 || axis_type == DT_INT64), KERNEL_STATUS_PARAM_INVALID,
"The data type of [axis] need be DT_INT32 or DT_INT64.")
auto x_shape = ctx.Input(0)->GetTensorShape();
auto axis_shape = ctx.Input(1)->GetTensorShape();
DataType data_type = DataType(ctx.Input(0)->GetDataType());
std::vector<int64_t> reverse_shape(x_shape->GetDims(), 0);
// dims check
if (x_shape->GetDims() == 0 || axis_shape->GetDims() == 0) {
uint32_t ret = ComputeDiffType(data_type, reverse_shape, ctx);
if (ret != KERNEL_STATUS_OK) {
return ret;
}
return KERNEL_STATUS_OK;
}
KERNEL_CHECK_FALSE((x_shape->GetDims() > 0 && x_shape->GetDims() <= x_max_dim), KERNEL_STATUS_PARAM_INVALID,
"Shape of x is not supported.")
KERNEL_CHECK_FALSE((axis_shape->GetDims() == 1), KERNEL_STATUS_PARAM_INVALID, "Shape of axis is not supported.")
auto input0_datasize = ctx.Input(0)->GetDataSize();
auto output_datasize = ctx.Output(0)->GetDataSize();
KERNEL_CHECK_FALSE((input0_datasize == output_datasize), KERNEL_STATUS_PARAM_INVALID,
"The data size of input0 [%llu] must be the same as "
"output0 [%llu].",
input0_datasize, output_datasize)
int64_t dim = x_shape->GetDims();
auto axis_data = ctx.Input(1)->GetData();
int64_t axis_element = axis_shape->NumElements();
for (int64_t j = 0; j < axis_element; j++) {
// Read each axis value according to its actual data type (DT_INT32 or DT_INT64 per the check above).
int64_t axis_value = (axis_type == DT_INT32) ? static_cast<int64_t>(reinterpret_cast<int32_t *>(axis_data)[j])
: reinterpret_cast<int64_t *>(axis_data)[j];
int64_t realdim = axis_value < 0 ? dim + axis_value : axis_value;
KERNEL_CHECK_FALSE((realdim >= 0 && realdim < dim), KERNEL_STATUS_PARAM_INVALID, "[%ld] is invalid", realdim)
KERNEL_CHECK_FALSE((!reverse_shape[realdim]), KERNEL_STATUS_PARAM_INVALID, "axis [%ld], specified more than once.",
realdim)
reverse_shape[realdim] = true;
}
uint32_t ret = ComputeDiffType(data_type, reverse_shape, ctx);
if (ret != KERNEL_STATUS_OK) {
return ret;
}
return KERNEL_STATUS_OK;
}
uint32_t ReverseV2CpuKernel::ComputeDiffType(DataType data_type, std::vector<int64_t> reverse_shape,
CpuKernelContext &ctx) {
switch (data_type) {
case DT_FLOAT16:
return ComputeReverseV2<Eigen::half>(reverse_shape, ctx);
case DT_FLOAT:
return ComputeReverseV2<float>(reverse_shape, ctx);
case DT_DOUBLE:
return ComputeReverseV2<double>(reverse_shape, ctx);
case DT_UINT8:
return ComputeReverseV2<uint8_t>(reverse_shape, ctx);
case DT_INT8:
return ComputeReverseV2<int8_t>(reverse_shape, ctx);
case DT_UINT16:
return ComputeReverseV2<uint16_t>(reverse_shape, ctx);
case DT_INT16:
return ComputeReverseV2<int16_t>(reverse_shape, ctx);
case DT_INT32:
return ComputeReverseV2<int32_t>(reverse_shape, ctx);
case DT_INT64:
return ComputeReverseV2<int64_t>(reverse_shape, ctx);
case DT_BOOL:
return ComputeReverseV2<bool>(reverse_shape, ctx);
case DT_COMPLEX64:
return ComputeReverseV2<std::complex<float>>(reverse_shape, ctx);
case DT_COMPLEX128:
return ComputeReverseV2<std::complex<double>>(reverse_shape, ctx);
case DT_STRING:
return ComputeReverseV2<string>(reverse_shape, ctx);
default:
KERNEL_LOG_ERROR("ReverseV2 invalid input type[%s]", DTypeStr(data_type).c_str());
return KERNEL_STATUS_PARAM_INVALID;
}
return KERNEL_STATUS_OK;
}
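/*
Sketch of the reversal strategy used below: reverse_shape flags the axes to flip.
For every flagged axis except the innermost one, the data is viewed as
front / dims[j] blocks of dims[j] rows (each row holding shape_element / front
elements), and the rows of each block are memcpy'd from input to output in
reverse order. The innermost axis, if flagged, is reversed element by element.
When more than one axis is flagged, the partially reversed output is copied back
into the input buffer (the `redo` path) so the next pass composes with it.
*/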
template <typename T>
uint32_t ReverseV2CpuKernel::ComputeReverseV2(std::vector<int64_t> reverse_shape, CpuKernelContext &ctx) {
auto x_shape = ctx.Input(0)->GetTensorShape();
auto input_data = reinterpret_cast<T *>(ctx.Input(0)->GetData());
auto output_data = reinterpret_cast<T *>(ctx.Output(0)->GetData());
if (x_shape->GetDims() == 0) {
*(output_data) = *(input_data);
return KERNEL_STATUS_OK;
}
auto axis_shape = ctx.Input(1)->GetTensorShape();
if (axis_shape->GetDims() == 0) {
for (int i = 0; i < x_shape->NumElements(); i++) {
*(output_data + i) = *(input_data + i);
}
return KERNEL_STATUS_OK;
}
int64_t front = 1;
int64_t shape_element = x_shape->NumElements();
int64_t dim = x_shape->GetDims();
std::vector<int64_t> dims = x_shape->GetDimSizes();
bool redo = false;
for (int j = 0; j < dim; j++) {
front = front * dims[j];
if (j != dim - 1 && reverse_shape[j] == true) {
if (redo == true) {
auto copy_size = shape_element * sizeof(T);
auto ret_mem = memcpy_s(input_data, copy_size, output_data, copy_size);
KERNEL_CHECK_FALSE(ret_mem == EOK, KERNEL_STATUS_INNER_ERROR, "Memcpy failed, size = [%zu].", copy_size);
}
int64_t row_size = shape_element / front;
int64_t input_forward = (dims[j] - 1) * row_size;
int64_t save = input_forward;
int64_t output_forward = 0;
int64_t behind = shape_element / (front / dims[j]);
for (int k = 0; k < front / dims[j]; k++) {
int64_t remain = dims[j];
while (remain > 0) {
auto copy_size = row_size * sizeof(T);
auto cur_output = output_data + output_forward;
auto cur_input = input_data + input_forward;
auto ret_mem = memcpy_s(cur_output, copy_size, cur_input, copy_size);
KERNEL_CHECK_FALSE(ret_mem == EOK, KERNEL_STATUS_INNER_ERROR, "Memcpy size[%zu] from input to output failed.",
copy_size);
input_forward = input_forward - row_size;
output_forward = output_forward + row_size;
remain--;
}
save = save + behind;
input_forward = save;
}
redo = true;
} else if (j == dim - 1 && reverse_shape[j] == true) {
if (redo == true) {
auto copy_size = shape_element * sizeof(T);
auto ret_mem = memcpy_s(input_data, copy_size, output_data, copy_size);
KERNEL_CHECK_FALSE(ret_mem == EOK, KERNEL_STATUS_INNER_ERROR, "Memcpy failed, size = [%zu].", copy_size);
}
int64_t output_forward = 0;
for (int k = 0; k < shape_element / dims[j]; k++) {
for (int i = dims[j] - 1; i >= 0; i--) {
*(output_data + output_forward) = *(input_data + i + k * dims[j]);
output_forward++;
}
}
}
}
return KERNEL_STATUS_OK;
}
REGISTER_CPU_KERNEL(kReverseV2, ReverseV2CpuKernel);
} // namespace aicpu

View File

@ -0,0 +1,39 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef AICPU_KERNELS_NORMALIZED_REVERSEV2_H_
#define AICPU_KERNELS_NORMALIZED_REVERSEV2_H_
#include "cpu_ops_kernel.h"
#include "utils/bcast.h"
namespace aicpu {
class ReverseV2CpuKernel : public CpuKernel {
public:
ReverseV2CpuKernel() = default;
~ReverseV2CpuKernel() override = default;
protected:
uint32_t Compute(CpuKernelContext &ctx) override;
private:
uint32_t ComputeDiffType(DataType data_type, std::vector<int64_t> reverse_shape, CpuKernelContext &ctx);
template <typename T>
uint32_t ComputeReverseV2(std::vector<int64_t> reverse_shape, CpuKernelContext &ctx);
};
} // namespace aicpu
#endif

View File

@ -0,0 +1,161 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "rgb_to_hsv.h"
#include <iostream>
#include "cpu_kernel_utils.h"
#include "utils/eigen_tensor.h"
#include "utils/kernel_util.h"
namespace {
constexpr size_t kInputShapeRank = 3;
constexpr size_t kOutputShapeRank = 3;
constexpr int64_t kImageChannels = 3;
const char *kInputStr = "input";
const char *kOutputStr = "output";
const char *kRGBToHSV = "RGBToHSV";
// when input data size is more than kParallelDataNum, use Parallel func
} // namespace
namespace aicpu {
const std::map<std::string, RGBToHSVCpuKernel::KernelFunction> RGBToHSVCpuKernel::kernels_ = {
{"(DT_FLOAT16,DT_FLOAT16)", &RGBToHSVCpuKernel::DoCompute<Eigen::half, Eigen::half>},
{"(DT_FLOAT,DT_FLOAT)", &RGBToHSVCpuKernel::DoCompute<float, float>},
{"(DT_DOUBLE,DT_DOUBLE)", &RGBToHSVCpuKernel::DoCompute<double, double>}};
const std::vector<std::string> RGBToHSVCpuKernel::kernels_name_ = {"(DT_FLOAT16,DT_FLOAT16)", "(DT_FLOAT,DT_FLOAT)",
"(DT_DOUBLE,DT_DOUBLE)"};
template <typename T1, typename T2>
uint32_t RGBToHSVCpuKernel::DoCompute(CpuKernelContext &ctx) {
Tensor *input_tensor = ctx.Input(0);
Tensor *output_tensor = ctx.Output(0);
auto input_shape = input_tensor->GetTensorShape()->GetDimSizes();
int64_t input0_elements_nums = input_tensor->NumElements();
auto output_shape = output_tensor->GetTensorShape()->GetDimSizes();
auto input_data = reinterpret_cast<T1 *>(ctx.Input(0)->GetData());
auto out = reinterpret_cast<T2 *>(ctx.Output(0)->GetData());
for (int64_t i = 0; i < input0_elements_nums; i = i + 3) {
auto t_red = *(input_data + i);
auto t_green = *(input_data + i + 1);
auto t_blue = *(input_data + i + 2);
auto t_value = std::max(std::max(t_red, t_blue), t_green);
auto t_minimum = std::min(std::min(t_red, t_blue), t_green);
auto range = t_value - t_minimum;
auto t_saturation = t_value > static_cast<T1>(0) ? (range / t_value) : static_cast<T1>(0);
auto norm = static_cast<T1>(1.0) / static_cast<T1>(6.0) / range;
auto t_hue = t_green == t_value ? (norm * (t_blue - t_red) + static_cast<T1>(2.0) / static_cast<T1>(6.0))
: (norm * (t_red - t_green) + static_cast<T1>(4.0) / static_cast<T1>(6.0));
t_hue = t_red == t_value ? (norm * (t_green - t_blue)) : t_hue;
t_hue = range > static_cast<T1>(0) ? t_hue : static_cast<T1>(0);
t_hue = t_hue < static_cast<T1>(0) ? (t_hue + static_cast<T1>(1)) : t_hue;
*(out + i) = t_hue;
*(out + i + 1) = t_saturation;
*(out + i + 2) = t_value;
}
return KERNEL_STATUS_OK;
}
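/*
Illustrative example of the conversion above (not part of the kernel): for one pixel
(r, g, b) = (0.5, 0.25, 0.25): value = 0.5, minimum = 0.25, range = 0.25,
saturation = range / value = 0.5; red is the maximum channel, so hue = norm * (g - b) = 0,
giving (h, s, v) = (0.0, 0.5, 0.5).
*/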
uint32_t RGBToHSVCpuKernel::CheckParam(CpuKernelContext &ctx, const std::string &in_or_out, uint32_t index,
size_t rank) {
Tensor *param = nullptr;
if (in_or_out == kInputStr) {
param = ctx.Input(index);
} else if (in_or_out == kOutputStr) {
param = ctx.Output(index);
}
std::string err_header = ConcatString(kRGBToHSV, " op ", in_or_out, "[", index, "]");
KERNEL_CHECK_NULLPTR(param, KERNEL_STATUS_PARAM_INVALID, "%s tensor is nullptr.", err_header.c_str());
auto param_shape = param->GetTensorShape();
KERNEL_CHECK_NULLPTR(param_shape, KERNEL_STATUS_PARAM_INVALID, "%s tensor shape is nullptr.", err_header.c_str());
auto param_dim_sizes = param_shape->GetDimSizes();
if (param_dim_sizes.size() < 1) {
KERNEL_LOG_ERROR("%s shape rank must be at least 1, but got shape[%zu].", err_header.c_str(),
VectorToString(param_dim_sizes).c_str());
return KERNEL_STATUS_PARAM_INVALID;
}
if (param->GetData() == nullptr) {
KERNEL_LOG_ERROR("%s tensor data is nullptr.", err_header.c_str());
return KERNEL_STATUS_PARAM_INVALID;
}
return KERNEL_STATUS_OK;
}
uint32_t RGBToHSVCpuKernel::CheckShapes(CpuKernelContext &ctx) {
auto input0_shape = ctx.Input(kFirstInputIndex)->GetTensorShape()->GetDimSizes();
if (input0_shape.back() != kImageChannels) {
KERNEL_LOG_ERROR(
"%s op input[0] shape last dim should be [%d], but got "
"shape[%s].",
kRGBToHSV, kImageChannels, VectorToString(input0_shape).c_str());
return KERNEL_STATUS_PARAM_INVALID;
}
return KERNEL_STATUS_OK;
}
uint32_t RGBToHSVCpuKernel::CheckParams(CpuKernelContext &ctx) {
auto ret = CheckParam(ctx, kInputStr, kFirstInputIndex, kInputShapeRank);
if (ret != KERNEL_STATUS_OK) {
return ret;
}
ret = CheckShapes(ctx);
if (ret != KERNEL_STATUS_OK) {
return ret;
}
return KERNEL_STATUS_OK;
}
uint32_t RGBToHSVCpuKernel::Compute(CpuKernelContext &ctx) {
auto input0 = ctx.Input(kFirstInputIndex);
KERNEL_CHECK_NULLPTR(input0, KERNEL_STATUS_PARAM_INVALID, "%s input[0] tensor is nullptr.", kRGBToHSV);
DataType input0_data_type = input0->GetDataType();
KERNEL_LOG_DEBUG("%s op input[0] data type is [%s].", kRGBToHSV, DTypeStr(input0_data_type).c_str());
auto output = ctx.Output(kFirstOutputIndex);
KERNEL_CHECK_NULLPTR(output, KERNEL_STATUS_PARAM_INVALID, "%s output[0] tensor is nullptr.", kRGBToHSV);
DataType output_data_type = output->GetDataType();
KERNEL_LOG_DEBUG("%s op output[0] data type is [%s].", kRGBToHSV, DTypeStr(output_data_type).c_str());
std::string kernel_name = ConcatString("(", DTypeStr(input0_data_type), ",", DTypeStr(output_data_type), ")");
auto it = kernels_.find(kernel_name);
if (it != kernels_.end()) {
auto ret = CheckParams(ctx);
if (ret != KERNEL_STATUS_OK) {
return ret;
}
auto kernel = it->second;
ret = kernel(ctx);
KERNEL_LOG_DEBUG("%s op end.", kRGBToHSV);
return ret;
}
KERNEL_LOG_ERROR("%s op only support data type [%s], but got [%s].", kRGBToHSV, VectorToString(kernels_name_).c_str(),
kernel_name.c_str());
return KERNEL_STATUS_PARAM_INVALID;
}
REGISTER_CPU_KERNEL(kRGBToHSV, RGBToHSVCpuKernel);
} // namespace aicpu

View File

@ -0,0 +1,51 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef AICPU_KERNELS_NORMALIZED_RGBToHSV_H_
#define AICPU_KERNELS_NORMALIZED_RGBToHSV_H_
#include <map>
#include <string>
#include <vector>
#include "cpu_ops_kernel.h"
namespace aicpu {
class RGBToHSVCpuKernel : public CpuKernel {
public:
RGBToHSVCpuKernel() = default;
~RGBToHSVCpuKernel() = default;
uint32_t Compute(CpuKernelContext &ctx) override;
private:
template <typename TInput, typename TOutput>
static uint32_t DoCompute(CpuKernelContext &ctx);
uint32_t CheckParams(CpuKernelContext &ctx);
uint32_t CheckParam(CpuKernelContext &ctx, const std::string &in_or_out, uint32_t index, size_t rank);
uint32_t CheckShapes(CpuKernelContext &ctx);
private:
using KernelFunction = uint32_t (*)(CpuKernelContext &ctx);
static const std::map<std::string, KernelFunction> kernels_;
static const std::vector<std::string> kernels_name_;
};
} // namespace aicpu
#endif

View File

@ -0,0 +1,163 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "rsqrt_grad.h"
#include <algorithm>
#include <complex>
#include <iostream>
#include "utils/eigen_tensor.h"
namespace {
const char *kRsqrtGrad = "RsqrtGrad";
constexpr uint32_t kOutputNum = 1;
constexpr uint32_t kInputNum = 2;
} // namespace
namespace aicpu {
uint32_t RsqrtGradCpuKernel::Compute(CpuKernelContext &ctx) {
if (NormalCheck(ctx, kInputNum, kOutputNum) != KERNEL_STATUS_OK) {
return KERNEL_STATUS_PARAM_INVALID;
}
Tensor *input_0 = ctx.Input(kFirstInputIndex);
Tensor *input_1 = ctx.Input(kSecondInputIndex);
if ((input_0->GetDataSize() == 0) || (input_1->GetDataSize() == 0)) {
KERNEL_LOG_INFO("[%s] Input is empty tensor.", ctx.GetOpType().c_str());
return KERNEL_STATUS_OK;
}
// choose compute function depend on dataType
auto data_type = ctx.Input(0)->GetDataType();
switch (data_type) {
case DT_FLOAT16:
return RsqrtGradComputeFP16<Eigen::half>(ctx);
case DT_FLOAT:
return RsqrtGradCompute<float>(ctx);
case DT_DOUBLE:
return RsqrtGradCompute<double>(ctx);
case DT_INT8:
return RsqrtGradCompute<int8_t>(ctx);
case DT_INT32:
return RsqrtGradCompute<int32_t>(ctx);
case DT_COMPLEX128:
return RsqrtGradComputeComplex<std::complex<double>>(ctx);
case DT_COMPLEX64:
return RsqrtGradComputeComplex<std::complex<float>>(ctx);
default:
KERNEL_LOG_ERROR("[%s] Data type of input is not support, input data type is [%s].", ctx.GetOpType().c_str(),
aicpu::DTypeStr(data_type).c_str());
return KERNEL_STATUS_PARAM_INVALID;
}
}
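/*
Derivation sketch (added for reference): with y = rsqrt(x) = x^(-1/2),
dy/dx = -(1/2) * x^(-3/2) = -(1/2) * y^3, so the gradient propagated to x is
dz = dy * y^3 / (-2), which is what the compute functions below evaluate.
For complex inputs the kernel uses conj(y)^3 in place of y^3.
*/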
template <typename T>
uint32_t RsqrtGradCpuKernel::RsqrtGradComputeFP16(CpuKernelContext &ctx) {
Tensor *y = ctx.Input(0);
Tensor *dy = ctx.Input(1);
Tensor *z = ctx.Output(0);
auto y_ptr = reinterpret_cast<T *>(y->GetData());
auto dy_ptr = reinterpret_cast<T *>(dy->GetData());
auto z_ptr = reinterpret_cast<T *>(z->GetData());
int32_t input_0_num = y->GetTensorShape()->NumElements();
int32_t input_1_num = dy->GetTensorShape()->NumElements();
if (input_0_num >= input_1_num) {
for (int32_t i = 0; i < input_1_num; i++) {
z_ptr[i] =
static_cast<T>((static_cast<double>(y_ptr[i]) * static_cast<double>(y_ptr[i]) * static_cast<double>(y_ptr[i])) *
(static_cast<double>(dy_ptr[i]) / (static_cast<double>(-2))));
}
for (int32_t i = input_1_num; i < input_0_num; i++) {
z_ptr[i] = (T)(0);
}
} else {
for (int32_t i = 0; i < input_0_num; i++) {
z_ptr[i] =
static_cast<T>((static_cast<double>(y_ptr[i]) * static_cast<double>(y_ptr[i]) * static_cast<double>(y_ptr[i])) *
(static_cast<double>(dy_ptr[i]) / (static_cast<double>(-2))));
}
}
return KERNEL_STATUS_OK;
}
template <typename T>
uint32_t RsqrtGradCpuKernel::RsqrtGradCompute(CpuKernelContext &ctx) {
Tensor *y = ctx.Input(0);
Tensor *dy = ctx.Input(1);
Tensor *z = ctx.Output(0);
KERNEL_CHECK_NULLPTR(z->GetData(), KERNEL_STATUS_PARAM_INVALID, "[%s] Get output data failed",
ctx.GetOpType().c_str())
KERNEL_LOG_INFO(
"[%s] Input[0] data size is [%llu], input[1] data size is [%llu], output "
"data size is [%llu].",
ctx.GetOpType().c_str(), y->GetDataSize(), dy->GetDataSize(), z->GetDataSize());
auto y_ptr = reinterpret_cast<T *>(y->GetData());
auto dy_ptr = reinterpret_cast<T *>(dy->GetData());
auto z_ptr = reinterpret_cast<T *>(z->GetData());
int32_t input_0_num = y->GetTensorShape()->NumElements();
int32_t input_1_num = dy->GetTensorShape()->NumElements();
if (input_0_num >= input_1_num) {
for (int32_t i = 0; i < input_1_num; i++) {
z_ptr[i] = (dy_ptr[i] * y_ptr[i] * y_ptr[i] * y_ptr[i]) / (static_cast<T>(-2));
}
for (int32_t i = input_1_num; i < input_0_num; i++) {
      z_ptr[i] = static_cast<T>(0);
}
} else {
for (int32_t i = 0; i < input_0_num; i++) {
z_ptr[i] = (dy_ptr[i] * y_ptr[i] * y_ptr[i] * y_ptr[i]) / (static_cast<T>(-2));
}
}
return KERNEL_STATUS_OK;
}
template <typename T>
uint32_t RsqrtGradCpuKernel::RsqrtGradComputeComplex(CpuKernelContext &ctx) {
Tensor *y = ctx.Input(0);
Tensor *dy = ctx.Input(1);
Tensor *z = ctx.Output(0);
KERNEL_CHECK_NULLPTR(z->GetData(), KERNEL_STATUS_PARAM_INVALID, "[%s] Get output data failed",
ctx.GetOpType().c_str())
KERNEL_LOG_INFO(
"[%s] Input[0] data size is [%llu], input[1] data size is [%llu], output "
"data size is [%llu].",
ctx.GetOpType().c_str(), y->GetDataSize(), dy->GetDataSize(), z->GetDataSize());
auto y_ptr = reinterpret_cast<T *>(y->GetData());
auto dy_ptr = reinterpret_cast<T *>(dy->GetData());
auto z_ptr = reinterpret_cast<T *>(z->GetData());
int32_t input_0_num = y->GetTensorShape()->NumElements();
int32_t input_1_num = dy->GetTensorShape()->NumElements();
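  // For complex inputs dz = -0.5 * conj(y)^3 * dy; extra elements of y are zero-filled in the output.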
if (input_0_num >= input_1_num) {
for (int32_t i = 0; i < input_1_num; i++) {
z_ptr[i] = (dy_ptr[i] * conj(y_ptr[i]) * conj(y_ptr[i]) * conj(y_ptr[i])) * (static_cast<T>(-0.5));
}
for (int32_t i = input_1_num; i < input_0_num; i++) {
z_ptr[i] = static_cast<T>(0);
}
} else {
for (int32_t i = 0; i < input_0_num; i++) {
z_ptr[i] = (dy_ptr[i] * conj(y_ptr[i]) * conj(y_ptr[i]) * conj(y_ptr[i])) * (static_cast<T>(-0.5));
}
}
return KERNEL_STATUS_OK;
}
REGISTER_CPU_KERNEL(kRsqrtGrad, RsqrtGradCpuKernel);
} // namespace aicpu

View File

@ -0,0 +1,48 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef AICPU_KERNELS_NORMALIZED_RSQRT_GRAD_H_
#define AICPU_KERNELS_NORMALIZED_RSQRT_GRAD_H_
#define EIGEN_USE_THREADS
#define EIGEN_USE_SIMPLE_THREAD_POOL
#include "cpu_ops_kernel.h"
#include "cpu_types.h"
#include "utils/bcast.h"
#include "cpu_kernel_utils.h"
#include "utils/eigen_tensor.h"
#include "utils/kernel_util.h"
#include <Eigen/Dense>
namespace aicpu {
class RsqrtGradCpuKernel : public CpuKernel {
public:
RsqrtGradCpuKernel() = default;
~RsqrtGradCpuKernel() = default;
uint32_t Compute(CpuKernelContext &ctx) override;
private:
template <typename T>
uint32_t RsqrtGradCompute(CpuKernelContext &ctx);
template <typename T>
uint32_t RsqrtGradComputeComplex(CpuKernelContext &ctx);
template <typename T>
uint32_t RsqrtGradComputeFP16(CpuKernelContext &ctx);
};
} // namespace aicpu
#endif  // AICPU_KERNELS_NORMALIZED_RSQRT_GRAD_H_

View File

@ -0,0 +1,421 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "sample_distorted_bounding_box_ext2.h"
#include <random>
#include "cpu_kernel_utils.h"
#include "utils/eigen_tensor.h"
#include "utils/kernel_util.h"
namespace {
const uint32_t kOutputNum = 3;
const uint32_t kInputNum = 3;
const char *kSDBBExt2 = "SampleDistortedBoundingBoxExt2";
#define SDBBExt2CpuKernel_COMPUTE_CASE(DTYPE, TYPE, CTX) \
case (DTYPE): { \
uint32_t result = SDBBExt2Compute<TYPE>(CTX); \
if (result != KERNEL_STATUS_OK) { \
KERNEL_LOG_ERROR("SampleDistortedBoundingBoxExt2 kernel compute failed."); \
return result; \
} \
break; \
}
} // namespace
namespace aicpu {
uint64_t SDBBExt2CpuKernel::New64() {
std::random_device device("/dev/urandom");
static std::mt19937_64 rng = std::mt19937_64(device());
return (rng)();
}
void SDBBExt2CpuKernel::InitPhiloxRandom(int64_t seed, int64_t seed2) {
if (seed == 0 && seed2 == 0) {
seed = New64();
seed2 = New64();
}
generator_ = PhiloxRandom(seed, seed2);
}
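// Produces a uniform float in [0, 1): 23 random mantissa bits are packed under
// exponent 127 to form a value in [1, 2), then 1.0 is subtracted.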
float SDBBExt2CpuKernel::RandFloat() {
uint32_t x = GenerateSingle();
const uint32_t man = x & 0x7fffffu; // 23 bit mantissa
const uint32_t exp = static_cast<uint32_t>(127);
const uint32_t val = (exp << 23) | man;
float result;
memcpy(&result, &val, sizeof(val));
return result - 1.0f;
}
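// Returns a pseudo-random integer in [0, n). Powers of two use a bit mask; other
// sizes reject some raw samples before taking the remainder to reduce modulo bias.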
uint32_t SDBBExt2CpuKernel::Uniform(uint32_t n) {
if (n == 0) {
return GenerateSingle() * n;
} else if (0 == (n & (n - 1))) {
return GenerateSingle() & (n - 1);
} else {
const uint32_t range = ~static_cast<uint32_t>(0);
const uint32_t rem = (range % n) + 1;
uint32_t rnd;
do {
rnd = GenerateSingle();
} while (rnd < rem);
return rnd % n;
}
}
SDBBExt2CpuKernel::ResultElementType SDBBExt2CpuKernel::GenerateSingle() {
if (used_result_index_ == PhiloxRandom::kResultElementCount) {
unused_results_ = generator_();
used_result_index_ = 0;
}
return unused_results_[used_result_index_++];
}
bool SDBBExt2CpuKernel::SatisfiesOverlapConstraints(const Rectangle &crop, float minimum_object_covered,
const std::vector<Rectangle> &bounding_boxes) {
const float kMinArea = 1.0;
if (crop.Area() < kMinArea) {
return false;
}
bool is_object_covered = false;
for (const auto &bbox : bounding_boxes) {
const float object_area = bbox.Area();
if (object_area < kMinArea) {
continue;
}
if (object_area == 0) {
continue;
}
const float object_covered = crop.Intersect(bbox).Area() / object_area;
if (object_covered >= minimum_object_covered) {
is_object_covered = true;
break;
}
}
return is_object_covered;
}
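// Samples a crop rectangle whose area lies within [min_relative_crop_area, max_relative_crop_area]
// of the image area at the requested aspect ratio; returns false when no such crop can be generated.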
bool SDBBExt2CpuKernel::GenerateRandomCrop(int original_width, int original_height, float min_relative_crop_area,
float max_relative_crop_area, float aspect_ratio, Rectangle *crop_rect) {
if (max_relative_crop_area <= 0.0 || aspect_ratio <= 0.0 || original_width <= 0 || original_height <= 0 ||
min_relative_crop_area > max_relative_crop_area) {
return false;
}
const float min_area = min_relative_crop_area * original_width * original_height;
const float max_area = max_relative_crop_area * original_width * original_height;
if (aspect_ratio == 0) {
return false;
}
int height = static_cast<int>(lrintf(std::sqrt(min_area / aspect_ratio)));
if (aspect_ratio == 0) {
return false;
}
int max_height = static_cast<int>(lrintf(std::sqrt(max_area / aspect_ratio)));
if (lrintf(max_height * aspect_ratio) > original_width) {
const float kEps = 0.0000001;
const float kBias = 0.5;
if (aspect_ratio == 0) {
return false;
}
max_height = static_cast<int>((original_width + kBias - kEps) / aspect_ratio);
if (lrintf(max_height * aspect_ratio) > original_width) {
max_height -= 1;
}
}
if (max_height > original_height) {
max_height = original_height;
}
if (height >= max_height) {
height = max_height;
}
if (height < max_height) {
height += Uniform(max_height - height + 1);
}
int width = static_cast<int>(lrintf(height * aspect_ratio));
float area = static_cast<float>(width * height);
if (area < min_area) {
height += 1;
width = static_cast<int>(lrintf(height * aspect_ratio));
area = width * height;
}
if (area > max_area) {
height -= 1;
width = static_cast<int>(lrintf(height * aspect_ratio));
area = width * height;
}
if (area < min_area || area > max_area || width > original_width || height > original_height || width <= 0 ||
height <= 0) {
return false;
}
int y = 0;
if (height < original_height) {
y = Uniform(original_height - height);
}
int x = 0;
if (width < original_width) {
x = Uniform(original_width - width);
}
crop_rect->min_x_ = x;
crop_rect->min_y_ = y;
crop_rect->max_x_ = x + width;
crop_rect->max_y_ = y + height;
return true;
}
uint32_t SDBBExt2CpuKernel::Compute(CpuKernelContext &ctx) {
// check params
KERNEL_HANDLE_ERROR(NormalCheck(ctx, kInputNum, kOutputNum),
"SampleDistortedBoundingBoxExt2 check input and output number failed.");
KERNEL_HANDLE_ERROR(SDBBExt2Check(ctx), "SampleDistortedBoundingBoxExt2 check params or bcast failed.");
auto data_type = ctx.Input(0)->GetDataType();
switch (data_type) {
    SDBBExt2CpuKernel_COMPUTE_CASE(DT_UINT8, uint8_t, ctx)
    SDBBExt2CpuKernel_COMPUTE_CASE(DT_INT8, int8_t, ctx)
    SDBBExt2CpuKernel_COMPUTE_CASE(DT_INT16, int16_t, ctx)
    SDBBExt2CpuKernel_COMPUTE_CASE(DT_INT32, int32_t, ctx)
    SDBBExt2CpuKernel_COMPUTE_CASE(DT_INT64, int64_t, ctx)
    default:
      KERNEL_LOG_ERROR("SampleDistortedBoundingBoxExt2 kernel data type [%s] not supported.",
                       DTypeStr(data_type).c_str());
      return KERNEL_STATUS_PARAM_INVALID;
}
return KERNEL_STATUS_OK;
}
uint32_t SDBBExt2CpuKernel::SDBBExt2Check(CpuKernelContext &ctx) {
auto image_size = ctx.Input(0);
auto bounding_boxes = ctx.Input(1);
auto min_object_covered = ctx.Input(2);
auto begin = ctx.Output(0);
auto size = ctx.Output(1);
auto bboxes = ctx.Output(2);
KERNEL_CHECK_NULLPTR(image_size->GetData(), KERNEL_STATUS_PARAM_INVALID, "Get input 0 data failed.")
KERNEL_CHECK_NULLPTR(bounding_boxes->GetData(), KERNEL_STATUS_PARAM_INVALID, "Get input 1 data failed.")
KERNEL_CHECK_NULLPTR(min_object_covered->GetData(), KERNEL_STATUS_PARAM_INVALID, "Get input 2 data failed.")
KERNEL_CHECK_NULLPTR(begin->GetData(), KERNEL_STATUS_PARAM_INVALID, "Get output 0 data failed")
KERNEL_CHECK_NULLPTR(size->GetData(), KERNEL_STATUS_PARAM_INVALID, "Get output 1 data failed")
KERNEL_CHECK_NULLPTR(bboxes->GetData(), KERNEL_STATUS_PARAM_INVALID, "Get output 2 data failed")
auto attr_seed = ctx.GetAttr("seed");
KERNEL_CHECK_NULLPTR(attr_seed, KERNEL_STATUS_PARAM_INVALID, "Get seed attr failed.")
seed = attr_seed->GetInt();
auto attr_seed2 = ctx.GetAttr("seed2");
KERNEL_CHECK_NULLPTR(attr_seed2, KERNEL_STATUS_PARAM_INVALID, "Get seed2 attr failed.")
seed2 = attr_seed2->GetInt();
auto attr_aspect_ratio_range = ctx.GetAttr("aspect_ratio_range");
KERNEL_CHECK_NULLPTR(attr_aspect_ratio_range, KERNEL_STATUS_PARAM_INVALID, "Get aspect_ratio_range attr failed.")
aspect_ratio_range = attr_aspect_ratio_range->GetListFloat();
auto attr_area_range = ctx.GetAttr("area_range");
KERNEL_CHECK_NULLPTR(attr_area_range, KERNEL_STATUS_PARAM_INVALID, "Get area_range attr failed.")
area_range = attr_area_range->GetListFloat();
auto attr_max_attempts = ctx.GetAttr("max_attempts");
KERNEL_CHECK_NULLPTR(attr_max_attempts, KERNEL_STATUS_PARAM_INVALID, "Get max_attempts attr failed.")
max_attempts = attr_max_attempts->GetInt();
auto attr_use_image_if_no_bounding_boxes = ctx.GetAttr("use_image_if_no_bounding_boxes");
KERNEL_CHECK_NULLPTR(attr_use_image_if_no_bounding_boxes, KERNEL_STATUS_PARAM_INVALID,
"Get use_image_if_no_bounding_boxes attr failed.")
use_image_if_no_bounding_boxes = attr_use_image_if_no_bounding_boxes->GetBool();
KERNEL_CHECK_NULLPTR(image_size->GetTensorShape(), KERNEL_STATUS_PARAM_INVALID, "Get input image_size shape failed.")
KERNEL_CHECK_NULLPTR(bounding_boxes->GetTensorShape(), KERNEL_STATUS_PARAM_INVALID,
"Get input bounding_boxes shape failed.")
KERNEL_CHECK_NULLPTR(min_object_covered->GetTensorShape(), KERNEL_STATUS_PARAM_INVALID,
"Get input min_object_covered shape failed.")
std::vector<int64_t> shape_image_size = image_size->GetTensorShape()->GetDimSizes();
std::vector<int64_t> shape_bounding_boxes = bounding_boxes->GetTensorShape()->GetDimSizes();
KERNEL_CHECK_FALSE((shape_image_size.size() == 1), KERNEL_STATUS_PARAM_INVALID,
"image_size must be 1-dimensional, got: [%d].", shape_image_size.size())
const int image_size_num = 3;
KERNEL_CHECK_FALSE((shape_image_size.at(0) == image_size_num), KERNEL_STATUS_PARAM_INVALID,
"image_size must contain 3 elements, got: [%d].", shape_image_size.size())
const int shape_bounding_boxes_size = 3;
KERNEL_CHECK_FALSE((shape_bounding_boxes.size() == shape_bounding_boxes_size), KERNEL_STATUS_PARAM_INVALID,
"input boxes must be 3-dimensional [batch, num_boxes, "
"coords], got: [%d].",
shape_bounding_boxes.size())
const int bounding_boxes_size = 4;
KERNEL_CHECK_FALSE((shape_bounding_boxes.at(shape_bounding_boxes.size() - 1) == bounding_boxes_size),
KERNEL_STATUS_PARAM_INVALID, "bounding boxes must have shape [4], got: [%d].",
shape_bounding_boxes.at(shape_bounding_boxes.size() - 1))
const int aspect_ratio_range_size = 2;
KERNEL_CHECK_FALSE((aspect_ratio_range.size() == aspect_ratio_range_size), KERNEL_STATUS_PARAM_INVALID,
"Aspect ratio range field must specify 2 dimensions.")
KERNEL_CHECK_FALSE((aspect_ratio_range[0] > 0 && aspect_ratio_range[1] > 0), KERNEL_STATUS_PARAM_INVALID,
"Aspect ratio range must be positive: [%f], [%f].", aspect_ratio_range[0], aspect_ratio_range[1])
const int area_range_size = 2;
KERNEL_CHECK_FALSE((area_range.size() == area_range_size), KERNEL_STATUS_PARAM_INVALID,
"Area range field must specify 2 dimensions.")
KERNEL_CHECK_FALSE((area_range[0] > 0 && area_range[1] > 0), KERNEL_STATUS_PARAM_INVALID,
"Area range must be positive: [%f], [%f].", area_range[0], area_range[1])
KERNEL_CHECK_FALSE((area_range[0] <= 1 && area_range[1] <= 1), KERNEL_STATUS_PARAM_INVALID,
"Area range must be less then or equal to 1.0: [%f], [%f].", area_range[0], area_range[1])
KERNEL_CHECK_FALSE((max_attempts > 0), KERNEL_STATUS_PARAM_INVALID, "Max attempts must be positive: [%d]",
max_attempts)
return KERNEL_STATUS_OK;
}
template <typename T>
uint32_t SDBBExt2CpuKernel::SDBBExt2Compute(CpuKernelContext &ctx) {
auto image_size = reinterpret_cast<T *>(ctx.Input(0)->GetData());
auto bounding_boxes = reinterpret_cast<float *>(ctx.Input(1)->GetData());
auto min_object_covered = reinterpret_cast<float *>(ctx.Input(2)->GetData());
auto begin = reinterpret_cast<T *>(ctx.Output(0)->GetData());
auto size = reinterpret_cast<T *>(ctx.Output(1)->GetData());
auto bboxes = reinterpret_cast<float *>(ctx.Output(2)->GetData());
const int32_t height = static_cast<int32_t>(image_size[0]);
const int32_t width = static_cast<int32_t>(image_size[1]);
if (!(height > 0 && width > 0)) {
KERNEL_LOG_ERROR("Image height and width must be positive, got: [%d] and [%d]", height, width);
return KERNEL_STATUS_INNER_ERROR;
}
float min_object_covered_val = 0.0;
min_object_covered_val = *min_object_covered;
if (min_object_covered_val < 0.0 || min_object_covered_val > 1.0) {
KERNEL_LOG_ERROR("min_object_covered must be in [0.0, 1.0], got: [%f]", min_object_covered_val);
return KERNEL_STATUS_INNER_ERROR;
}
const int index_y_min = 0;
const int index_x_min = 1;
const int index_y_max = 2;
const int index_x_max = 3;
const int kBBoxSize = 4;
std::vector<Rectangle> boxes;
int64_t size_bounding_boxes = ctx.Input(1)->NumElements();
if (size_bounding_boxes > 0) {
for (int b = 0; b < size_bounding_boxes / kBBoxSize; ++b) {
if (!(bounding_boxes[b * kBBoxSize + index_x_min] < bounding_boxes[b * kBBoxSize + index_x_max])) {
KERNEL_LOG_ERROR("x_min must be less than x_max, got: [%f] and [%f]",
bounding_boxes[b * kBBoxSize + index_x_min], bounding_boxes[b * kBBoxSize + index_x_max]);
return KERNEL_STATUS_INNER_ERROR;
}
if (!(bounding_boxes[b * kBBoxSize + index_y_min] < bounding_boxes[b * kBBoxSize + index_y_max])) {
KERNEL_LOG_ERROR("y_min must be less than y_max, got: [%f] and [%f]",
bounding_boxes[b * kBBoxSize + index_y_min], bounding_boxes[b * kBBoxSize + index_y_max]);
return KERNEL_STATUS_INNER_ERROR;
}
for (int i = 0; i < kBBoxSize; ++i) {
if (bounding_boxes[b * kBBoxSize + i] < 0.0 || bounding_boxes[b * kBBoxSize + i] > 1.0) {
KERNEL_LOG_ERROR("All bounding box coordinates must be in [0.0, 1.0], got: [%f]",
bounding_boxes[b * kBBoxSize + i]);
return KERNEL_STATUS_INNER_ERROR;
}
}
const int32_t x_min = static_cast<int32_t>(bounding_boxes[b * kBBoxSize + index_x_min] * width);
const int32_t y_min = static_cast<int32_t>(bounding_boxes[b * kBBoxSize + index_y_min] * height);
const int32_t x_max = static_cast<int32_t>(bounding_boxes[b * kBBoxSize + index_x_max] * width);
const int32_t y_max = static_cast<int32_t>(bounding_boxes[b * kBBoxSize + index_y_max] * height);
boxes.push_back(Rectangle(x_min, y_min, x_max, y_max));
}
}
const Rectangle image_rect(0, 0, width, height);
if (boxes.empty()) {
if (!use_image_if_no_bounding_boxes) {
KERNEL_LOG_ERROR(
"No bounding boxes provided as input. One must "
"enable use_image_if_no_bounding_boxes if you wish "
"to not provide any bounding boxes.");
return KERNEL_STATUS_INNER_ERROR;
}
boxes.push_back(image_rect);
}
const float min_sample_area = area_range[0];
const float max_sample_area = area_range[1];
const float min_sample_aspect_ratio = aspect_ratio_range[0];
const float max_sample_aspect_ratio = aspect_ratio_range[1];
InitPhiloxRandom(seed, seed2);
Rectangle crop_rect;
bool sample_generated = false;
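  // Try up to max_attempts random crops and keep the first one that covers at least
  // min_object_covered of some bounding box; otherwise fall back to the full image.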
for (int i = 0; i < max_attempts; ++i) {
const float sample_aspect_ratio =
RandFloat() * (max_sample_aspect_ratio - min_sample_aspect_ratio) + min_sample_aspect_ratio;
if (GenerateRandomCrop(width, height, min_sample_area, max_sample_area, sample_aspect_ratio, &crop_rect)) {
if (SatisfiesOverlapConstraints(crop_rect, min_object_covered_val, boxes)) {
sample_generated = true;
break;
}
}
}
if (!sample_generated) {
crop_rect = image_rect;
}
// Determine the cropping parameters from the bounding box.
const int target_width = crop_rect.max_x_ - crop_rect.min_x_;
const int target_height = crop_rect.max_y_ - crop_rect.min_y_;
const int offset_width = crop_rect.min_x_;
const int offset_height = crop_rect.min_y_;
if (width < target_width + offset_width) {
KERNEL_LOG_ERROR("width must be >= target_width + offset_width: [%d] vs [%d] + [%d]", width, target_width,
offset_width);
return KERNEL_STATUS_INNER_ERROR;
}
if (height < target_height + offset_height) {
KERNEL_LOG_ERROR("height must be >= target_height + offset_height: [%d] vs [%d] + [%d]", height, target_height,
offset_height);
return KERNEL_STATUS_INNER_ERROR;
}
begin[0] = static_cast<T>(offset_height);
size[0] = static_cast<T>(target_height);
begin[1] = static_cast<T>(offset_width);
size[1] = static_cast<T>(target_width);
bboxes[index_y_min] = static_cast<float>(crop_rect.min_y_) / static_cast<float>(height);
bboxes[index_x_min] = static_cast<float>(crop_rect.min_x_) / static_cast<float>(width);
bboxes[index_y_max] = static_cast<float>(crop_rect.max_y_) / static_cast<float>(height);
bboxes[index_x_max] = static_cast<float>(crop_rect.max_x_) / static_cast<float>(width);
// Retain all of the channels.
const int32_t begin_channels = 3;
const int32_t size_channels = 3;
begin[begin_channels - 1] = static_cast<T>(0);
size[size_channels - 1] = static_cast<T>(-1);
return KERNEL_STATUS_OK;
}
REGISTER_CPU_KERNEL(kSDBBExt2, SDBBExt2CpuKernel);
} // namespace aicpu

View File

@ -0,0 +1,101 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef AICPU_KERNELS_NORMALIZED_SAMPLE_DISTORTED_BOUNDING_BOX_EXT2_H_
#define AICPU_KERNELS_NORMALIZED_SAMPLE_DISTORTED_BOUNDING_BOX_EXT2_H_
#include "cpu_ops_kernel.h"
#include "utils/philox_random.h"
class Rectangle {
public:
Rectangle() { Set(0, 0, 0, 0); }
Rectangle(int xmin, int ymin, int xmax, int ymax) { Set(xmin, ymin, xmax, ymax); }
void Set(int xmin, int ymin, int xmax, int ymax) {
min_x_ = xmin;
min_y_ = ymin;
max_x_ = xmax;
max_y_ = ymax;
}
bool IsEmpty() const { return min_x_ > max_x_ || min_y_ > max_y_; }
float Area() const { return static_cast<float>((max_x_ - min_x_) * (max_y_ - min_y_)); }
Rectangle Intersect(const Rectangle &r) const {
const int pmin_x = std::max(min_x_, r.min_x_);
const int pmin_y = std::max(min_y_, r.min_y_);
const int pmax_x = std::min(max_x_, r.max_x_);
const int pmax_y = std::min(max_y_, r.max_y_);
if (pmin_x > pmax_x || pmin_y > pmax_y) {
return Rectangle();
} else {
return Rectangle(pmin_x, pmin_y, pmax_x, pmax_y);
}
}
int min_x_;
int min_y_;
int max_x_;
int max_y_;
};
namespace aicpu {
class SDBBExt2CpuKernel : public CpuKernel {
public:
SDBBExt2CpuKernel() = default;
~SDBBExt2CpuKernel() override = default;
static const int kResultTypeNum = 4;
static const int kKeyNum = 2;
using ResultType = Array<uint32_t, kResultTypeNum>;
using ResultElementType = uint32_t;
using Key = Array<uint32_t, kKeyNum>;
protected:
uint32_t Compute(CpuKernelContext &ctx) override;
private:
int seed;
int seed2;
std::vector<float> aspect_ratio_range;
std::vector<float> area_range;
int max_attempts;
bool use_image_if_no_bounding_boxes;
PhiloxRandom generator_;
float RandFloat();
uint32_t Uniform(uint32_t n);
uint64_t New64();
void InitPhiloxRandom(int64_t seed, int64_t seed2);
ResultType unused_results_;
int used_result_index_ = PhiloxRandom::kResultElementCount;
ResultElementType GenerateSingle();
// Image
bool SatisfiesOverlapConstraints(const Rectangle &crop, float minimum_object_covered,
const std::vector<Rectangle> &bounding_boxes);
bool GenerateRandomCrop(int original_width, int original_height, float min_relative_crop_area,
float max_relative_crop_area, float aspect_ratio, Rectangle *crop_rect);
uint32_t SDBBExt2Check(CpuKernelContext &ctx);
template <typename T>
uint32_t SDBBExt2Compute(CpuKernelContext &ctx);
};
} // namespace aicpu
#endif

View File

@ -0,0 +1,196 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "scatter_nd.h"
#include <complex>
#include "eigen_tensor.h"
#include "utils/kernel_util.h"
namespace {
const uint32_t kInputNum = 3;
const uint32_t kOutputNum = 1;
const char *kScatterNd = "ScatterNd";
} // namespace
namespace aicpu {
uint32_t ScatterNdCpuKernel::Compute(CpuKernelContext &ctx) {
KERNEL_HANDLE_ERROR(NormalCheck(ctx, kInputNum, kOutputNum), "Check ScatterNd Input and Output failed.");
Tensor *input_indices = ctx.Input(0);
Tensor *input_x = ctx.Input(1);
Tensor *input_shape = ctx.Input(2);
auto shape_x = input_x->GetTensorShape();
auto shape_indices = input_indices->GetTensorShape();
auto shape_shape = input_shape->GetTensorShape();
int64_t indices_shape_m = shape_indices->GetDimSize(shape_indices->GetDims() - 1);
if (shape_x->GetDims() < 1) {
KERNEL_LOG_ERROR("[%s] Tensor input_x's rank less than 1.", ctx.GetOpType().c_str());
return KERNEL_STATUS_PARAM_INVALID;
}
if (shape_indices->GetDims() < 1) {
KERNEL_LOG_ERROR("[%s] Tensor input_indices's rank less than 1.", ctx.GetOpType().c_str());
return KERNEL_STATUS_PARAM_INVALID;
}
if (shape_shape->GetDims() < 1) {
KERNEL_LOG_ERROR("[%s] Tensor input_shape's rank less than 1.", ctx.GetOpType().c_str());
return KERNEL_STATUS_PARAM_INVALID;
}
if (indices_shape_m > shape_shape->NumElements()) {
KERNEL_LOG_ERROR("[%s] Tensor input_shape&input_indices ranks mismatch.", ctx.GetOpType().c_str());
return KERNEL_STATUS_PARAM_INVALID;
}
for (int64_t i = 0; i < shape_indices->GetDims() - 1; i++) {
if (shape_indices->GetDimSize(i) != shape_x->GetDimSize(i)) {
KERNEL_LOG_ERROR("[%s], shape_indices and shape_updates mismatch.", ctx.GetOpType().c_str());
return KERNEL_STATUS_PARAM_INVALID;
}
}
auto data_type_x = input_x->GetDataType();
auto data_type_indices = input_indices->GetDataType();
auto data_type_shape = input_shape->GetDataType();
if (data_type_shape != DT_INT32 && data_type_shape != DT_INT64) {
KERNEL_LOG_ERROR("ScatterNd kernel data type [%s] not support.", DTypeStr(data_type_shape).c_str());
return KERNEL_STATUS_PARAM_INVALID;
}
if (data_type_indices != DT_INT32 && data_type_indices != DT_INT64) {
KERNEL_LOG_ERROR("ScatterNd kernel data type [%s] not support.", DTypeStr(data_type_indices).c_str());
return KERNEL_STATUS_PARAM_INVALID;
}
if (data_type_indices != data_type_shape) {
KERNEL_LOG_ERROR("Indices and shape must have the same type.");
return KERNEL_STATUS_PARAM_INVALID;
}
switch (data_type_x) {
case DT_INT8:
return DTYPE_CHOOSE<int8_t>(ctx);
case DT_INT16:
return DTYPE_CHOOSE<int16_t>(ctx);
case DT_INT32:
return DTYPE_CHOOSE<int32_t>(ctx);
case DT_INT64:
return DTYPE_CHOOSE<int64_t>(ctx);
case DT_UINT8:
return DTYPE_CHOOSE<uint8_t>(ctx);
case DT_UINT16:
return DTYPE_CHOOSE<uint16_t>(ctx);
case DT_UINT32:
return DTYPE_CHOOSE<uint32_t>(ctx);
case DT_UINT64:
return DTYPE_CHOOSE<uint64_t>(ctx);
case DT_FLOAT16:
return DTYPE_CHOOSE<Eigen::half>(ctx);
case DT_FLOAT:
return DTYPE_CHOOSE<float>(ctx);
case DT_DOUBLE:
return DTYPE_CHOOSE<double>(ctx);
case DT_COMPLEX64:
return DTYPE_CHOOSE<std::complex<float>>(ctx);
case DT_COMPLEX128:
return DTYPE_CHOOSE<std::complex<double>>(ctx);
default:
KERNEL_LOG_ERROR("ScatterNd kernel data type [%s] not support.", DTypeStr(data_type_x).c_str());
return KERNEL_STATUS_PARAM_INVALID;
}
}
template <typename data_type_x>
uint32_t ScatterNdCpuKernel::DTYPE_CHOOSE(CpuKernelContext &ctx) {
auto indices_type = static_cast<DataType>(ctx.Input(0)->GetDataType());
switch (indices_type) {
case DT_INT32:
return ScatterNdComputeRealKernel<int32_t, data_type_x>(ctx);
case DT_INT64:
return ScatterNdComputeRealKernel<int64_t, data_type_x>(ctx);
default:
KERNEL_LOG_ERROR("[%s] Data type of input is not supported, input data type is [%s].", ctx.GetOpType().c_str(),
DTypeStr(indices_type).c_str());
return KERNEL_STATUS_PARAM_INVALID;
}
return KERNEL_STATUS_OK;
}
template <typename indices_type, typename data_type_x>
uint32_t ScatterNdCpuKernel::ScatterNdComputeRealKernel(CpuKernelContext &ctx) {
int64_t n_slices = 1;
int64_t slice_size = 1;
const int64_t outer_dims = ctx.Input(0)->GetTensorShape()->GetDims() - 1;
const int64_t indices_nd = ctx.Input(0)->GetTensorShape()->GetDimSize(outer_dims);
const int64_t updates_dims = ctx.Input(1)->GetTensorShape()->GetDims();
auto shape_indices = ctx.Input(0)->GetTensorShape();
auto data_shape = reinterpret_cast<indices_type *>(ctx.Input(2)->GetData());
auto dims_shape = ctx.Input(2)->GetTensorShape()->NumElements();
auto updates_shape = ctx.Input(1)->GetTensorShape();
for (int64_t i = 0; i < dims_shape - indices_nd; i++) {
if (updates_shape->GetDimSize(i + shape_indices->GetDims() - 1) != data_shape[i + indices_nd]) {
KERNEL_LOG_ERROR("[%s], shape_indices and shape_updates mismatch.", ctx.GetOpType().c_str());
return KERNEL_STATUS_PARAM_INVALID;
}
}
for (int64_t i = 0; i < outer_dims; ++i) {
n_slices *= ctx.Input(0)->GetTensorShape()->GetDimSize(i);
}
for (int64_t i = outer_dims; i < updates_dims; ++i) {
slice_size *= ctx.Input(1)->GetTensorShape()->GetDimSize(i);
}
const int kNumberInputTwo = 2;
int64_t output_flat_size = 1;
int64_t num_shape = ctx.Input(kNumberInputTwo)->NumElements();
for (int64_t i = 0; i < num_shape; i++) {
output_flat_size *= data_shape[i];
}
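  // dims_to_count[j] is the row-major stride of output dimension j, so an index row
  // (i_0, ..., i_{nd-1}) maps to the flat offset sum(i_j * dims_to_count[j]).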
int64_t remain_flat_size = output_flat_size;
std::vector<int64_t> dims_to_count(indices_nd, 0);
for (int64_t i = 0; i < indices_nd; ++i) {
dims_to_count[i] = remain_flat_size / data_shape[i];
remain_flat_size = dims_to_count[i];
}
auto Indices_data = reinterpret_cast<indices_type *>(ctx.Input(0)->GetData());
auto Updates_data = reinterpret_cast<data_type_x *>(ctx.Input(1)->GetData());
auto Output_data = reinterpret_cast<data_type_x *>(ctx.Output(0)->GetData());
memset(Output_data, 0, sizeof(data_type_x) * output_flat_size);
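  // Each of the n_slices update slices is accumulated into the zero-initialized
  // output at the offset computed from its index row.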
for (int64_t i = 0; i < n_slices; ++i) {
int64_t to_pos = 0;
for (int64_t j = 0; j < indices_nd; ++j) {
int64_t idx = Indices_data[i * indices_nd + j];
if (idx < 0 || idx >= data_shape[j]) {
KERNEL_LOG_ERROR("The indices[%d] is so big or small", idx);
return KERNEL_STATUS_PARAM_INVALID;
}
to_pos += idx * dims_to_count[j];
}
for (int64_t j = 0; j < slice_size; j++) {
Output_data[to_pos + j] += Updates_data[i * slice_size + j];
}
}
return KERNEL_STATUS_OK;
}
REGISTER_CPU_KERNEL(kScatterNd, ScatterNdCpuKernel);
} // namespace aicpu

View File

@ -0,0 +1,41 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef AICPU_KERNELS_NORMALIZED_SCATTERND_H_
#define AICPU_KERNELS_NORMALIZED_SCATTERND_H_
#include <string.h>
#include "cpu_ops_kernel.h"
#include "cpu_types.h"
#include "utils/bcast.h"
namespace aicpu {
class ScatterNdCpuKernel : public CpuKernel {
public:
ScatterNdCpuKernel() = default;
~ScatterNdCpuKernel() override = default;
uint32_t Compute(CpuKernelContext &ctx) override;
private:
template <typename data_type0>
uint32_t DTYPE_CHOOSE(CpuKernelContext &ctx);
template <typename indices_type, typename data_type0>
uint32_t ScatterNdComputeRealKernel(CpuKernelContext &ctx);
};
} // namespace aicpu
#endif

View File

@ -0,0 +1,211 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "scatter_nd_update.h"
#include <string.h>
#include <algorithm>
#include <complex>
#include <iostream>
#include <map>
#include "eigen_tensor.h"
#include "utils/kernel_util.h"
namespace {
const uint32_t kInputNum = 3;
const uint32_t kOutputNum = 1;
const char *kScatterNdUpdate = "ScatterNdUpdate";
} // namespace
namespace aicpu {
uint32_t ScatterNdUpdateCpuKernel::Compute(CpuKernelContext &ctx) {
KERNEL_HANDLE_ERROR(NormalCheck(ctx, kInputNum, kOutputNum), "Check ScatterNdUpdate Input and Output failed.");
Tensor *input_var = ctx.Input(0);
Tensor *input_indices = ctx.Input(1);
Tensor *input_updates = ctx.Input(2);
auto shape_var = input_var->GetTensorShape();
auto shape_indices = input_indices->GetTensorShape();
auto shape_updates = input_updates->GetTensorShape();
if (shape_var->GetDims() < 1) {
KERNEL_LOG_ERROR("[%s] Tensor input_var's rank less than 1.", ctx.GetOpType().c_str());
return KERNEL_STATUS_PARAM_INVALID;
}
if (shape_indices->GetDims() < 2) {
KERNEL_LOG_ERROR("[%s] Tensor input_indices's rank less than 2.", ctx.GetOpType().c_str());
return KERNEL_STATUS_PARAM_INVALID;
}
if (shape_updates->GetDims() < 1) {
KERNEL_LOG_ERROR("[%s] Tensor input_updates's rank less than 1.", ctx.GetOpType().c_str());
return KERNEL_STATUS_PARAM_INVALID;
}
auto index_size = shape_indices->GetDims() - 1;
auto index_depth = shape_indices->GetDimSize(index_size);
if (index_depth > shape_var->GetDims()) {
KERNEL_LOG_ERROR("[%s] Tensor input_var&input_indices ranks mismatch.", ctx.GetOpType().c_str());
return KERNEL_STATUS_PARAM_INVALID;
}
std::vector<int64_t> batch_shape;
for (int64_t i = 0; i < index_size; ++i) {
batch_shape.push_back(shape_indices->GetDimSize(i));
}
for (int64_t i = index_depth; i <= shape_var->GetDims() - 1; ++i) {
batch_shape.push_back(shape_var->GetDimSize(i));
}
if (batch_shape != shape_updates->GetDimSizes()) {
KERNEL_LOG_ERROR("[%s] Tensor indices's & updates' and var's shape are dismatch .", ctx.GetOpType().c_str());
return KERNEL_STATUS_PARAM_INVALID;
}
for (int64_t i = 0; i < index_size; i++) {
if (shape_indices->GetDimSize(i) != shape_updates->GetDimSize(i)) {
KERNEL_LOG_ERROR("[%s], Tensor indices and updates should have the same batch number.", ctx.GetOpType().c_str());
return KERNEL_STATUS_PARAM_INVALID;
}
}
auto data_type_var = input_var->GetDataType();
auto data_type_indices = input_indices->GetDataType();
if (data_type_indices != DT_INT32 && data_type_indices != DT_INT64) {
KERNEL_LOG_ERROR("ScatterNdUpdate kernel data type [%s] not support.", DTypeStr(data_type_indices).c_str());
return KERNEL_STATUS_PARAM_INVALID;
}
switch (data_type_var) {
case DT_INT8:
return DTYPE_CHOOSE<int8_t>(ctx);
case DT_INT16:
return DTYPE_CHOOSE<int16_t>(ctx);
case DT_INT32:
return DTYPE_CHOOSE<int32_t>(ctx);
case DT_INT64:
return DTYPE_CHOOSE<int64_t>(ctx);
case DT_UINT8:
return DTYPE_CHOOSE<uint8_t>(ctx);
case DT_UINT16:
return DTYPE_CHOOSE<uint16_t>(ctx);
case DT_UINT32:
return DTYPE_CHOOSE<uint32_t>(ctx);
case DT_UINT64:
return DTYPE_CHOOSE<uint64_t>(ctx);
case DT_FLOAT16:
return DTYPE_CHOOSE<Eigen::half>(ctx);
case DT_FLOAT:
return DTYPE_CHOOSE<float>(ctx);
case DT_DOUBLE:
return DTYPE_CHOOSE<double>(ctx);
case DT_COMPLEX64:
return DTYPE_CHOOSE<std::complex<float>>(ctx);
case DT_COMPLEX128:
return DTYPE_CHOOSE<std::complex<double>>(ctx);
default:
KERNEL_LOG_ERROR("ScatterNdUpdate kernel data type [%s] not support.", DTypeStr(data_type_var).c_str());
return KERNEL_STATUS_PARAM_INVALID;
}
return KERNEL_STATUS_OK;
}
template <typename var_type>
uint32_t ScatterNdUpdateCpuKernel::DTYPE_CHOOSE(CpuKernelContext &ctx) {
auto indices_type = static_cast<DataType>(ctx.Input(1)->GetDataType());
switch (indices_type) {
case DT_INT32:
return ScatterNdUpdateComputeRealKernel<var_type, int32_t>(ctx);
case DT_INT64:
return ScatterNdUpdateComputeRealKernel<var_type, int64_t>(ctx);
default:
KERNEL_LOG_ERROR("[%s] Data type of input is not supported, input data type is [%s].", ctx.GetOpType().c_str(),
DTypeStr(indices_type).c_str());
return KERNEL_STATUS_PARAM_INVALID;
}
}
template <typename var_type, typename indices_type>
uint32_t ScatterNdUpdateCpuKernel::ScatterNdUpdateComputeRealKernel(CpuKernelContext &ctx) {
int64_t n_slices = 1;
int64_t slice_size = 1;
const int64_t indices_dims = ctx.Input(1)->GetTensorShape()->GetDims() - 1;
const int64_t indices_nd = ctx.Input(1)->GetTensorShape()->GetDimSize(indices_dims);
const int64_t updates_dims = ctx.Input(2)->GetTensorShape()->GetDims();
auto shape_var = ctx.Input(0)->GetTensorShape()->GetDimSizes();
auto shape_indices = ctx.Input(1)->GetTensorShape();
auto dims_shape = ctx.Input(0)->GetTensorShape()->GetDims();
for (int64_t i = 0; i < dims_shape - indices_nd; i++) {
if (ctx.Input(2)->GetTensorShape()->GetDimSize(i + shape_indices->GetDims() - 1) != shape_var[i + indices_nd]) {
KERNEL_LOG_ERROR("[%s] shape_indices and shape_updates mismatch.", ctx.GetOpType().c_str());
return KERNEL_STATUS_PARAM_INVALID;
}
}
for (int64_t i = 0; i < indices_dims; ++i) {
n_slices *= ctx.Input(1)->GetTensorShape()->GetDimSize(i);
}
for (int i = indices_dims; i < updates_dims; ++i) {
slice_size *= ctx.Input(2)->GetTensorShape()->GetDimSize(i);
}
const int64_t var_flat_size = ctx.Input(0)->GetTensorShape()->NumElements();
std::vector<int64_t> output_shape = ctx.Input(0)->GetTensorShape()->GetDimSizes();
int64_t remain_flat_size = var_flat_size;
std::vector<int64_t> dims_to_count(indices_nd, 0);
for (int64_t i = 0; i < indices_nd; ++i) {
dims_to_count[i] = remain_flat_size / output_shape[i];
remain_flat_size = dims_to_count[i];
}
auto Var_data = reinterpret_cast<var_type *>(ctx.Input(0)->GetData());
auto Indices_data = reinterpret_cast<indices_type *>(ctx.Input(1)->GetData());
auto Updates_data = reinterpret_cast<var_type *>(ctx.Input(2)->GetData());
auto Output_data = reinterpret_cast<var_type *>(ctx.Output(0)->GetData());
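  // Copy var into the output, then overwrite the addressed slices with the update values.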
for (int64_t i = 0; i < var_flat_size; ++i) {
Output_data[i] = Var_data[i];
}
for (int64_t i = 0; i < n_slices; ++i) {
int64_t to_pos = 0;
for (int64_t j = 0; j < indices_nd; ++j) {
int64_t idx = Indices_data[i * indices_nd + j];
if (idx < 0 || idx >= output_shape[j]) {
KERNEL_LOG_ERROR("The indices[%d] is so big or small", idx);
return KERNEL_STATUS_PARAM_INVALID;
}
to_pos += idx * dims_to_count[j];
}
for (int64_t j = 0; j < slice_size; j++) {
Output_data[to_pos + j] = Updates_data[i * slice_size + j];
}
}
return KERNEL_STATUS_OK;
}
REGISTER_CPU_KERNEL(kScatterNdUpdate, ScatterNdUpdateCpuKernel);
} // namespace aicpu

View File

@ -0,0 +1,40 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef AICPU_KERNELS_NORMALIZED_SCATTERNDUPDATE_H_
#define AICPU_KERNELS_NORMALIZED_SCATTERNDUPDATE_H_
#include "cpu_ops_kernel.h"
#include "cpu_types.h"
#include "utils/bcast.h"
#include <string.h>
namespace aicpu {
class ScatterNdUpdateCpuKernel : public CpuKernel {
public:
ScatterNdUpdateCpuKernel() = default;
~ScatterNdUpdateCpuKernel() override = default;
uint32_t Compute(CpuKernelContext &ctx) override;
private:
template <typename var_type>
uint32_t DTYPE_CHOOSE(CpuKernelContext &ctx);
template <typename var_type, typename indices_type>
uint32_t ScatterNdUpdateComputeRealKernel(CpuKernelContext &ctx);
};
} // namespace aicpu
#endif

View File

@ -0,0 +1,151 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "select.h"
#include "cpu_kernel_utils.h"
#include "utils/broadcast_iterator.h"
#include "utils/eigen_tensor.h"
#include "utils/kernel_util.h"
namespace {
const uint32_t kOutputNum = 1;
const uint32_t kInputNum = 3;
const char *kSelect = "Select";
#define SELECT_COMPUTE_CASE(DTYPE, TYPE, CTX) \
case (DTYPE): { \
uint32_t result = SelectCompute<TYPE>(CTX); \
if (result != KERNEL_STATUS_OK) { \
KERNEL_LOG_ERROR("Select kernel compute failed."); \
return result; \
} \
break; \
}
} // namespace
namespace aicpu {
uint32_t SelectCpuKernel::Compute(CpuKernelContext &ctx) {
// check params
KERNEL_HANDLE_ERROR(NormalCheck(ctx, kInputNum, kOutputNum), "Select check input and output number failed.");
KERNEL_HANDLE_ERROR(SelectParamCheck(ctx), "Select check params failed.");
auto data_type = ctx.Input(1)->GetDataType();
switch (data_type) {
SELECT_COMPUTE_CASE(DT_INT8, int8_t, ctx)
SELECT_COMPUTE_CASE(DT_INT16, int16_t, ctx)
SELECT_COMPUTE_CASE(DT_INT32, int32_t, ctx)
SELECT_COMPUTE_CASE(DT_INT64, int64_t, ctx)
SELECT_COMPUTE_CASE(DT_UINT8, uint8_t, ctx)
SELECT_COMPUTE_CASE(DT_UINT16, uint16_t, ctx)
SELECT_COMPUTE_CASE(DT_UINT32, uint32_t, ctx)
SELECT_COMPUTE_CASE(DT_UINT64, uint64_t, ctx)
    SELECT_COMPUTE_CASE(DT_BOOL, bool, ctx)
SELECT_COMPUTE_CASE(DT_FLOAT16, Eigen::half, ctx)
SELECT_COMPUTE_CASE(DT_FLOAT, float, ctx)
SELECT_COMPUTE_CASE(DT_DOUBLE, double, ctx)
    SELECT_COMPUTE_CASE(DT_COMPLEX128, std::complex<double>, ctx)
    SELECT_COMPUTE_CASE(DT_COMPLEX64, std::complex<float>, ctx)
default:
KERNEL_LOG_ERROR("Select kernel data type [%s] not support.", DTypeStr(data_type).c_str());
return KERNEL_STATUS_PARAM_INVALID;
}
return KERNEL_STATUS_OK;
}
uint32_t SelectCpuKernel::SelectParamCheck(CpuKernelContext &ctx) {
// the non null of input_0, input_1, output has been verified in NormalCheck
Tensor *input_0 = ctx.Input(0);
Tensor *input_1 = ctx.Input(1);
Tensor *input_2 = ctx.Input(2);
Tensor *output = ctx.Output(0);
DataType input0_type = input_0->GetDataType();
DataType input1_type = input_1->GetDataType();
DataType input2_type = input_2->GetDataType();
auto input_shape_a = ctx.Input(1)->GetTensorShape()->GetDimSizes();
auto input_shape_b = ctx.Input(2)->GetTensorShape()->GetDimSizes();
if (input0_type != DT_BOOL) {
KERNEL_LOG_ERROR("[%s] Data type of mask requires bool, but got data type [%s].", ctx.GetOpType().c_str(),
DTypeStr(input0_type).c_str());
return KERNEL_STATUS_PARAM_INVALID;
}
KERNEL_CHECK_FALSE((input1_type == input2_type), KERNEL_STATUS_PARAM_INVALID,
"The data type of input1 [%s] need be same with "
"input2 [%s].",
DTypeStr(input1_type).c_str(), DTypeStr(input2_type).c_str())
if (input_shape_a != input_shape_b) {
KERNEL_LOG_ERROR("The shape of X1 must equal X2.");
return KERNEL_STATUS_PARAM_INVALID;
}
KERNEL_LOG_DEBUG(
"SelectCpuKernel[%s], input0: size[%llu];"
"input1: size[%llu], input2: size[%llu], output: size[%llu].",
ctx.GetOpType().c_str(), input_0->GetDataSize(), input_1->GetDataSize(), input_2->GetDataSize(),
output->GetDataSize());
return KERNEL_STATUS_OK;
}
template <typename T>
uint32_t SelectCpuKernel::SelectCompute(CpuKernelContext &ctx) {
bool *condition = static_cast<bool *>(ctx.Input(0)->GetData());
T *x1 = static_cast<T *>(ctx.Input(1)->GetData());
T *x2 = static_cast<T *>(ctx.Input(2)->GetData());
T *y = static_cast<T *>(ctx.Output(0)->GetData());
auto input_shape_a = ctx.Input(1)->GetTensorShape()->GetDimSizes();
auto input_shape_b = ctx.Input(2)->GetTensorShape()->GetDimSizes();
auto input_shape_mask = ctx.Input(0)->GetTensorShape()->GetDimSizes();
std::vector<int64_t> output_shape;
int64_t tensor_size = 1;
int64_t position = 0;
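  // If the mask shape matches x1/x2, pick elementwise; otherwise the mask is
  // broadcast against x1's shape before selecting.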
if (input_shape_a == input_shape_mask) {
for (const int64_t &d : input_shape_a) {
tensor_size *= d;
}
for (int64_t i = 0; i < tensor_size; ++i) {
if (condition[i]) {
y[position++] = x1[i];
} else {
y[position++] = x2[i];
}
}
} else {
auto ret = GetBroadcastShape(input_shape_a, input_shape_mask, output_shape);
KERNEL_CHECK_FALSE(ret == KERNEL_STATUS_OK, KERNEL_STATUS_PARAM_INVALID, "Shape of x and mask can't be broadcast.");
for (const int64_t &d : output_shape) {
tensor_size *= d;
}
BroadcastIterator iter(input_shape_a, input_shape_mask, output_shape);
iter.SetPos(0);
for (int64_t i = 0; i < tensor_size; ++i) {
if (condition[iter.GetInputPosB()]) {
y[position++] = x1[i];
} else {
y[position++] = x2[i];
}
iter.GenNextPos();
}
}
ctx.Output(0)->GetTensorShape()->SetDimSizes({position});
return KERNEL_STATUS_OK;
}
REGISTER_CPU_KERNEL(kSelect, SelectCpuKernel);
} // namespace aicpu

View File

@ -0,0 +1,39 @@
/**
* Copyright 2022 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef AICPU_KERNELS_NORMALIZED_SELECT_H_
#define AICPU_KERNELS_NORMALIZED_SELECT_H_
#include "cpu_ops_kernel.h"
#include "utils/bcast.h"
namespace aicpu {
class SelectCpuKernel : public CpuKernel {
public:
SelectCpuKernel() = default;
~SelectCpuKernel() override = default;
protected:
uint32_t Compute(CpuKernelContext &ctx) override;
private:
uint32_t SelectParamCheck(CpuKernelContext &ctx);
template <typename T>
uint32_t SelectCompute(CpuKernelContext &ctx);
};
} // namespace aicpu
#endif

View File

@ -0,0 +1,127 @@
/**
* Copyright 2022 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "self_adjoint_eig.h"
#include "cpu_kernel_utils.h"
#include "kernel_util.h"
#include <complex>
#include "utils/kernel_util.h"
#include "Eigen/Core"
#include <iostream>
#include <Eigen/Dense>
using namespace std;
namespace {
const char *kSelfAdjointEig = "SelfAdjointEig";
const uint32_t kInputNum = 1;
const uint32_t kOutputNum = 2;
} // namespace
namespace aicpu {
uint32_t SelfAdjointEigCpuKernel::Compute(CpuKernelContext &ctx) {
if (NormalCheck(ctx, kInputNum, kOutputNum) != KERNEL_STATUS_OK) {
return KERNEL_STATUS_PARAM_INVALID;
}
Tensor *input0 = ctx.Input(0);
if ((input0->GetDataSize() == 0)) {
KERNEL_LOG_INFO("[%s] Input is empty tensor.", ctx.GetOpType().c_str());
return KERNEL_STATUS_OK;
}
uint32_t ret = KERNEL_STATUS_OK;
auto data_type = ctx.Input(0)->GetDataType();
switch (data_type) {
case DT_FLOAT:
ret = SelfAdjointEigCompute<float>(ctx);
break;
case DT_DOUBLE:
ret = SelfAdjointEigCompute<double>(ctx);
break;
case DT_COMPLEX64:
ret = SelfAdjointEigCompute<complex<float>>(ctx);
break;
case DT_COMPLEX128:
ret = SelfAdjointEigCompute<complex<double>>(ctx);
break;
default:
KERNEL_LOG_ERROR("[%s] Data type of input is not support, input data type is [%s].", ctx.GetOpType().c_str(),
DTypeStr(data_type).c_str());
ret = KERNEL_STATUS_PARAM_INVALID;
}
return ret;
}
template <typename T>
uint32_t SelfAdjointEigCpuKernel::SelfAdjointEigCompute(CpuKernelContext &ctx) {
auto input_tensor = ctx.Input(0);
auto output_tensor0 = ctx.Output(0);
auto output_tensor1 = ctx.Output(1);
auto input_tensor_shape = input_tensor->GetTensorShape();
auto inputData = reinterpret_cast<T *>(input_tensor->GetData());
int64_t rank = input_tensor_shape->GetDims();
std::vector<int64_t> input_dims = input_tensor_shape->GetDimSizes();
const int32_t m = input_dims[rank - 1];
int64_t num_array = input_tensor_shape->NumElements() / (m * m);
using MatrixMap = Eigen::Map<Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>>;
if (rank <= 2) {
MatrixMap input0(inputData, m, m);
MatrixMap output0(reinterpret_cast<T *>(output_tensor0->GetData()), m, 1);
MatrixMap output1(reinterpret_cast<T *>(output_tensor1->GetData()), m, m);
AttrValue *attr = ctx.GetAttr("compute_v");
bool attr_ = (attr == nullptr) ? true : attr->GetBool();
Eigen::SelfAdjointEigenSolver<Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic>> es(
input0, attr_ ? Eigen::ComputeEigenvectors : Eigen::EigenvaluesOnly);
output0 = es.eigenvalues().template cast<T>();
if (attr_) {
output1 = es.eigenvectors();
}
} else {
auto outputData0 = reinterpret_cast<T *>(output_tensor0->GetData());
auto outputData1 = reinterpret_cast<T *>(output_tensor1->GetData());
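    // Batched case: copy each m x m matrix into scratch buffers, run Eigen's
    // SelfAdjointEigenSolver on it, then write the eigenvalues (and eigenvectors
    // when compute_v is set) back to the flat outputs.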
for (int64_t batch = 0; batch < num_array; ++batch) {
AttrValue *attr = ctx.GetAttr("compute_v");
bool attr_ = (attr == nullptr) ? true : attr->GetBool();
      T *inputDataMap = new T[m * m];
      T *outputDataMap0 = new T[m];
      T *outputDataMap1 = new T[m * m];
for (int64_t i = 0; i < m * m; ++i) {
inputDataMap[i] = inputData[batch * m * m + i];
outputDataMap1[i] = outputData1[batch * m * m + i];
}
for (int64_t i = 0; i < m; ++i) {
outputDataMap0[i] = outputData0[batch * m + i];
}
MatrixMap input0(inputDataMap, m, m);
MatrixMap output0(outputDataMap0, m, 1);
MatrixMap output1(outputDataMap1, m, m);
Eigen::SelfAdjointEigenSolver<Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic>> es(
input0, attr_ ? Eigen::ComputeEigenvectors : Eigen::EigenvaluesOnly);
output0 = es.eigenvalues().template cast<T>();
for (int64_t i = 0; i < m; i++) {
*(outputData0 + batch * m + i) = output0(i, 0);
}
if (attr_) {
output1 = es.eigenvectors();
for (int64_t i = 0; i < m; i++) {
for (int64_t j = 0; j < m; j++) {
*(outputData1 + batch * m * m + i * m + j) = output1(i, j);
}
}
      }
      // Release the per-batch scratch buffers allocated above.
      delete[] inputDataMap;
      delete[] outputDataMap0;
      delete[] outputDataMap1;
    }
}
return KERNEL_STATUS_OK;
}
REGISTER_CPU_KERNEL(kSelfAdjointEig, SelfAdjointEigCpuKernel);
} // namespace aicpu

View File

@ -0,0 +1,35 @@
/**
* Copyright 2022 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef AICPU_KERNELS_NORMALIZED_SELFADJOINTEIG_H_
#define AICPU_KERNELS_NORMALIZED_SELFADJOINTEIG_H_
#include "cpu_ops_kernel.h"
#include "Eigen/Eigenvalues"
#include <iostream>
namespace aicpu {
class SelfAdjointEigCpuKernel : public CpuKernel {
public:
SelfAdjointEigCpuKernel() = default;
~SelfAdjointEigCpuKernel() = default;
uint32_t Compute(CpuKernelContext &ctx) override;
private:
template <typename T>
uint32_t SelfAdjointEigCompute(CpuKernelContext &ctx);
};
} // namespace aicpu
#endif  // AICPU_KERNELS_NORMALIZED_SELFADJOINTEIG_H_

View File

@ -0,0 +1,152 @@
/**
* Copyright (c) Huawei Technologies Co., Ltd. 2022. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "sign.h"
#include "cpu_kernel_utils.h"
#include "utils/eigen_tensor.h"
#include "utils/kernel_util.h"
namespace {
const uint32_t kOutputNum = 1;
const uint32_t kInputNum = 1;
const char *const kSign = "Sign";
constexpr int64_t kParallelDataNums = 128 * 1024;
#define SIGN_COMPUTE_CASE(DTYPE, TYPE, CTX) \
case (DTYPE): { \
uint32_t result = SignCompute<TYPE>(CTX); \
if (result != static_cast<uint32_t>(KERNEL_STATUS_OK)) { \
KERNEL_LOG_ERROR("Sign kernel compute failed."); \
return result; \
} \
break; \
}
#define SIGN_COMPUTE_CASE2(DTYPE, TYPE, CTX) \
case (DTYPE): { \
uint32_t result = SignComputeComplex<TYPE>(CTX); \
if (result != static_cast<uint32_t>(KERNEL_STATUS_OK)) { \
KERNEL_LOG_ERROR("Sign kernel compute failed."); \
return result; \
} \
break; \
}
} // namespace
namespace aicpu {
uint32_t SignCpuKernel::Compute(CpuKernelContext &ctx) {
KERNEL_HANDLE_ERROR(NormalCheck(ctx, kInputNum, kOutputNum), "[%s] check input and output failed.", kSign);
KERNEL_HANDLE_ERROR(static_cast<uint32_t>(SignCheck(ctx)), "[%s] check params failed.", kSign);
DataType data_type = ctx.Input(0)->GetDataType();
switch (data_type) {
SIGN_COMPUTE_CASE(DT_FLOAT16, Eigen::half, ctx)
SIGN_COMPUTE_CASE(DT_FLOAT, float, ctx)
SIGN_COMPUTE_CASE(DT_DOUBLE, double, ctx)
SIGN_COMPUTE_CASE(DT_INT32, int32_t, ctx)
SIGN_COMPUTE_CASE(DT_INT64, int64_t, ctx)
SIGN_COMPUTE_CASE2(DT_COMPLEX64, std::complex<float>, ctx)
SIGN_COMPUTE_CASE2(DT_COMPLEX128, std::complex<double>, ctx)
default:
KERNEL_LOG_ERROR("Sign kernel data type [%s] not support.", DTypeStr(data_type).c_str());
return static_cast<uint32_t>(KERNEL_STATUS_PARAM_INVALID);
}
return static_cast<uint32_t>(KERNEL_STATUS_OK);
}
KernelStatus SignCpuKernel::SignCheck(const CpuKernelContext &ctx) const {
auto input_0 = ctx.Input(0);
auto output_0 = ctx.Output(0);
KERNEL_CHECK_NULLPTR(input_0->GetData(), KERNEL_STATUS_PARAM_INVALID, "Get input data failed.")
KERNEL_CHECK_NULLPTR(output_0->GetData(), KERNEL_STATUS_PARAM_INVALID, "Get output data failed")
KERNEL_CHECK_NULLPTR(input_0->GetTensorShape(), KERNEL_STATUS_PARAM_INVALID, "Get input tensor shape failed.")
return KERNEL_STATUS_OK;
}
template <typename T>
uint32_t SignCpuKernel::SignCompute(const CpuKernelContext &ctx) {
auto input_x = reinterpret_cast<T *>(ctx.Input(0)->GetData());
auto output_y = reinterpret_cast<T *>(ctx.Output(0)->GetData());
int64_t data_num = ctx.Input(0)->NumElements();
int64_t data_size = data_num * static_cast<int64_t>(sizeof(T));
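  // Inputs of at most 128 KiB are processed serially; larger ones are sharded
  // across the available cores with ParallelFor.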
if (data_size <= kParallelDataNums) {
for (int64_t i = 0; i < data_num; i++) {
if (*(input_x + i) > static_cast<T>(0)) {
*(output_y + i) = static_cast<T>(1);
} else if (*(input_x + i) == static_cast<T>(0)) {
*(output_y + i) = static_cast<T>(0);
} else {
*(output_y + i) = static_cast<T>(-1);
}
}
} else {
uint32_t min_core_num = 1;
int64_t max_core_num = std::max(min_core_num, aicpu::CpuKernelUtils::GetCPUNum(ctx) - kResvCpuNum);
if (max_core_num > data_num) {
max_core_num = data_num;
}
auto shard_sign = [&](size_t start, size_t end) {
for (size_t i = start; i < end; i++) {
if (*(input_x + i) > static_cast<T>(0)) {
*(output_y + i) = static_cast<T>(1);
} else if (*(input_x + i) == static_cast<T>(0)) {
*(output_y + i) = static_cast<T>(0);
} else {
*(output_y + i) = static_cast<T>(-1);
}
}
};
KERNEL_HANDLE_ERROR(CpuKernelUtils::ParallelFor(ctx, data_num, data_num / max_core_num, shard_sign),
"Sign Compute failed.");
}
return static_cast<uint32_t>(KERNEL_STATUS_OK);
}
template <typename T>
uint32_t SignCpuKernel::SignComputeComplex(const CpuKernelContext &ctx) {
auto input_x = reinterpret_cast<T *>(ctx.Input(0)->GetData());
auto output_y = reinterpret_cast<T *>(ctx.Output(0)->GetData());
int64_t data_num = ctx.Input(0)->NumElements();
int64_t data_size = data_num * static_cast<int64_t>(sizeof(T));
if (data_size <= kParallelDataNums) {
for (int64_t i = 0; i < data_num; i++) {
if (*(input_x + i) != static_cast<T>(0)) {
*(output_y + i) = (*(input_x + i) / Eigen::numext::abs(*(input_x + i)));
} else {
*(output_y + i) = static_cast<T>(0);
}
}
} else {
uint32_t min_num = 1;
int64_t max_core_num = std::max(min_num, aicpu::CpuKernelUtils::GetCPUNum(ctx) - kResvCpuNum);
if (max_core_num > data_num) {
max_core_num = data_num;
}
auto shard_sign = [&](size_t start, size_t end) {
for (size_t i = start; i < end; i++) {
if (*(input_x + i) != static_cast<T>(0)) {
*(output_y + i) = (*(input_x + i) / Eigen::numext::abs(*(input_x + i)));
} else {
*(output_y + i) = static_cast<T>(0);
}
}
};
KERNEL_HANDLE_ERROR(CpuKernelUtils::ParallelFor(ctx, data_num, data_num / max_core_num, shard_sign),
"Sign Compute failed.");
}
return static_cast<uint32_t>(KERNEL_STATUS_OK);
}
REGISTER_CPU_KERNEL(kSign, SignCpuKernel);
} // namespace aicpu

View File

@ -0,0 +1,40 @@
/**
* Copyright (c) Huawei Technologies Co., Ltd. 2022. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef AICPU_KERNELS_NORMALIZED_SIGN_H
#define AICPU_KERNELS_NORMALIZED_SIGN_H
#include "cpu_ops_kernel.h"
#include "cpu_kernel/common/status.h"
namespace aicpu {
class SignCpuKernel : public CpuKernel {
public:
SignCpuKernel() = default;
~SignCpuKernel() override = default;
uint32_t Compute(CpuKernelContext &ctx) override;
private:
KernelStatus SignCheck(const CpuKernelContext &ctx) const;
template <typename T>
uint32_t SignCompute(const CpuKernelContext &ctx);
template <typename T>
uint32_t SignComputeComplex(const CpuKernelContext &ctx);
};
} // namespace aicpu
#endif

View File

@ -0,0 +1,148 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "sin.h"
#include <complex>
#include <unsupported/Eigen/CXX11/Tensor>
#include "cpu_kernel_utils.h"
#include "cpu_types.h"
#include "kernel_log.h"
#include "status.h"
#include "utils/kernel_util.h"
namespace {
const std::uint32_t kSinInputNum{1};
const std::uint32_t kSinOutputNum{1};
const char *kSin{"Sin"};
} // namespace
namespace internal {
template <typename T>
inline T ScalarSin(T x) {
return std::sin(x);
}
template <>
inline Eigen::half ScalarSin(Eigen::half x) {
const Eigen::half val{static_cast<Eigen::half>(Eigen::numext::sin(x))};
return val;
}
} // namespace internal
namespace aicpu {
namespace detail {
template <typename T>
inline std::uint32_t ComputeSinKernel(const CpuKernelContext &ctx) {
using i64 = std::int64_t;
const auto ParallelFor = aicpu::CpuKernelUtils::ParallelFor;
const auto ScalarSin = internal::ScalarSin<T>;
auto input = static_cast<T *>(ctx.Input(0)->GetData());
auto output = static_cast<T *>(ctx.Output(0)->GetData());
i64 total = ctx.Input(0)->NumElements();
uint32_t cores = aicpu::CpuKernelUtils::GetCPUNum(ctx);
i64 num = 1024;
if (total > num) {
i64 per_unit_size{total / std::min(std::max(1L, cores - 2L), total)};
return ParallelFor(ctx, total, per_unit_size, [&](i64 begin, i64 end) {
std::transform(input + begin, input + end, output + begin, ScalarSin);
});
} else if (cores != 0) {
std::transform(input, input + total, output, ScalarSin);
} else {
return KERNEL_STATUS_INNER_ERROR;
}
return KERNEL_STATUS_OK;
}
template <typename T>
inline std::uint32_t ComputeSin(const CpuKernelContext &ctx) {
uint32_t result = ComputeSinKernel<T>(ctx);
if (result != 0) {
KERNEL_LOG_ERROR("Sin compute failed.");
}
return result;
}
inline std::uint32_t SinExtraCheck(const CpuKernelContext &ctx) {
if (ctx.Input(0)->GetData() == nullptr) {
KERNEL_LOG_ERROR("Get input data failed.");
return KERNEL_STATUS_PARAM_INVALID;
}
if (ctx.Output(0)->GetData() == nullptr) {
KERNEL_LOG_ERROR("Get output data failed.");
return KERNEL_STATUS_PARAM_INVALID;
}
if (ctx.Input(0)->GetDataType() != ctx.Output(0)->GetDataType()) {
KERNEL_LOG_ERROR("The data type of the input [%s] need be the same as the output [%s].",
DTypeStr(ctx.Input(0)->GetDataType()).c_str(), DTypeStr(ctx.Output(0)->GetDataType()).c_str());
return KERNEL_STATUS_PARAM_INVALID;
}
if (ctx.Input(0)->GetDataSize() != ctx.Output(0)->GetDataSize()) {
KERNEL_LOG_ERROR(
"The data size of the input [%llu] need be the same as the output "
"[%llu].",
ctx.Input(0)->GetDataSize(), ctx.Output(0)->GetDataSize());
return KERNEL_STATUS_PARAM_INVALID;
}
std::vector<int64_t> input_dims = ctx.Input(0)->GetTensorShape()->GetDimSizes();
std::vector<int64_t> output_dims = ctx.Output(0)->GetTensorShape()->GetDimSizes();
if (input_dims.size() != output_dims.size()) {
KERNEL_LOG_ERROR(
"The data dim size of the input [%llu] need be the same as the output "
"[%llu].",
input_dims.size(), output_dims.size());
return KERNEL_STATUS_PARAM_INVALID;
}
for (size_t index = 0; index < input_dims.size(); index++) {
if (input_dims[index] != output_dims[index]) {
KERNEL_LOG_ERROR("The data dim of the input need be the same as the output.");
return KERNEL_STATUS_PARAM_INVALID;
}
}
return KERNEL_STATUS_OK;
}
std::uint32_t SinCheck(CpuKernelContext &ctx, uint32_t inputs_num, uint32_t outputs_num) {
  return NormalCheck(ctx, inputs_num, outputs_num) ? KERNEL_STATUS_PARAM_INVALID : SinExtraCheck(ctx);
}
std::uint32_t SinCompute(const CpuKernelContext &ctx) {
DataType input_type{ctx.Input(0)->GetDataType()};
switch (input_type) {
case DT_FLOAT16:
return ComputeSin<Eigen::half>(ctx);
case DT_FLOAT:
return ComputeSin<std::float_t>(ctx);
case DT_DOUBLE:
return ComputeSin<std::double_t>(ctx);
case DT_COMPLEX64:
return ComputeSin<std::complex<std::float_t>>(ctx);
case DT_COMPLEX128:
return ComputeSin<std::complex<std::double_t>>(ctx);
default:
KERNEL_LOG_ERROR("Unsupported input data type [%s].", DTypeStr(input_type).c_str());
return KERNEL_STATUS_PARAM_INVALID;
}
}
} // namespace detail
std::uint32_t SinCpuKernel::Compute(CpuKernelContext &ctx) {
return detail::SinCheck(ctx, kSinInputNum, kSinOutputNum) ? KERNEL_STATUS_PARAM_INVALID : detail::SinCompute(ctx);
}
REGISTER_CPU_KERNEL(kSin, SinCpuKernel);
} // namespace aicpu
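The parallel path above sizes each shard as total / min(max(1, cores - 2), total): two cores are kept in reserve and the number of workers never exceeds the number of elements. A minimal sketch of that sizing rule follows; the helper name PerUnitSize is hypothetical.

// shard_size_demo.cc -- illustrative only; reproduces the per-unit sizing used by ComputeSinKernel above.
#include <algorithm>
#include <cstdint>
#include <iostream>

int64_t PerUnitSize(int64_t total, int64_t cores) {
  // Reserve two cores; use at least one worker and at most one worker per element.
  return total / std::min(std::max(int64_t{1}, cores - 2), total);
}

int main() {
  std::cout << PerUnitSize(100000, 8) << "\n";  // 100000 / 6 -> 16666
  std::cout << PerUnitSize(4, 8) << "\n";       // 4 / 4 -> 1
  return 0;
}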

View File

@ -0,0 +1,26 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef AICPU_KERNELS_NORMALIZED_SIN_H_
#define AICPU_KERNELS_NORMALIZED_SIN_H_
#include "cpu_ops_kernel.h"
namespace aicpu {
class SinCpuKernel final : public CpuKernel {
 public:
  std::uint32_t Compute(CpuKernelContext &ctx) override;
};
} // namespace aicpu
#endif

View File

@ -0,0 +1,267 @@
/**
* Copyright (c) Huawei Technologies Co., Ltd. 2022. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "sinc.h"
#include <complex>
#include <set>
#include "cpu_kernel_utils.h"
#include "utils/eigen_tensor.h"
#include "utils/kernel_util.h"
namespace {
constexpr double kPI = 3.14159265358979323846L;
constexpr uint32_t kSincInputNum = 1;
constexpr uint32_t kSincOutputNum = 1;
const int64_t kParallelDataSize = 64 * 1024;
const char *kSinc = "Sinc";
} // namespace
namespace aicpu {
template <typename T>
uint32_t SincCpuKernel::SincTypeSameCompute(CpuKernelContext &ctx) {
T *x_addr = static_cast<T *>(ctx.Input(0)->GetData());
auto y_addr = static_cast<T *>(ctx.Output(0)->GetData());
size_t x_size = ctx.Input(0)->NumElements();
  size_t data_size = x_size * sizeof(T);
  if (data_size <= kParallelDataSize) {
for (size_t i = 0; i < x_size; i++) {
if (x_addr[i] == T(0.0f)) {
y_addr[i] = T(1.0f);
} else {
T product = T(kPI) * x_addr[i];
y_addr[i] = sin(product) / product;
}
}
} else {
auto shard_sinc = [&](size_t start, size_t end) {
for (size_t i = start; i < end; i++) {
if (x_addr[i] == T(0.0f)) {
y_addr[i] = T(1.0f);
} else {
T product = T(kPI) * x_addr[i];
y_addr[i] = sin(product) / product;
}
}
};
uint32_t min_core_num = 1;
size_t max_core_num = std::max(min_core_num, aicpu::CpuKernelUtils::GetCPUNum(ctx) - kResvCpuNum);
if (max_core_num == 0) {
return KERNEL_STATUS_PARAM_INVALID;
}
    if (max_core_num > data_size) {
      max_core_num = data_size;
}
KERNEL_HANDLE_ERROR(CpuKernelUtils::ParallelFor(ctx, x_size, x_size / max_core_num, shard_sinc),
"Sinc Compute failed.");
}
return KERNEL_STATUS_OK;
}
template <typename T>
uint32_t SincCpuKernel::SincTypeChangeCompute(CpuKernelContext &ctx) {
T *x_addr = static_cast<T *>(ctx.Input(0)->GetData());
auto y_addr = static_cast<float *>(ctx.Output(0)->GetData());
size_t x_size = ctx.Input(0)->NumElements();
  size_t data_size = x_size * sizeof(T);
  if (data_size <= kParallelDataSize) {
for (size_t i = 0; i < x_size; i++) {
if (x_addr[i] == T(0.0f)) {
y_addr[i] = float(1.0f);
} else {
float product = static_cast<float>(kPI) * x_addr[i];
y_addr[i] = sin(product) / product;
}
}
} else {
auto shard_sinc = [&](size_t start, size_t end) {
for (size_t i = start; i < end; i++) {
if (x_addr[i] == T(0.0f)) {
y_addr[i] = float(1.0f);
} else {
float product = static_cast<float>(kPI) * x_addr[i];
y_addr[i] = sin(product) / product;
}
}
};
uint32_t min_core_num = 1;
size_t max_core_num = std::max(min_core_num, aicpu::CpuKernelUtils::GetCPUNum(ctx) - kResvCpuNum);
if (max_core_num == 0) {
return KERNEL_STATUS_PARAM_INVALID;
}
    if (max_core_num > data_size) {
      max_core_num = data_size;
}
KERNEL_HANDLE_ERROR(CpuKernelUtils::ParallelFor(ctx, x_size, x_size / max_core_num, shard_sinc),
"Sinc Compute failed.");
}
return KERNEL_STATUS_OK;
}
template <typename T>
uint32_t SincCpuKernel::SincBoolCompute(CpuKernelContext &ctx) {
bool *x_addr = static_cast<bool *>(ctx.Input(0)->GetData());
auto y_addr = static_cast<float *>(ctx.Output(0)->GetData());
size_t x_size = ctx.Input(0)->NumElements();
  size_t data_size = x_size * sizeof(T);
  if (data_size <= kParallelDataSize) {
for (size_t i = 0; i < x_size; i++) {
float tmp;
if (x_addr[i] == true) {
tmp = 1.0f;
} else {
tmp = 0.0f;
}
float product = static_cast<float>(kPI) * tmp;
y_addr[i] = sin(product) / product;
}
} else {
auto shard_sinc = [&](size_t start, size_t end) {
for (size_t i = start; i < end; i++) {
float tmp;
if (x_addr[i] == true) {
tmp = 1.0f;
} else {
tmp = 0.0f;
}
float product = static_cast<float>(kPI) * tmp;
y_addr[i] = sin(product) / product;
}
};
uint32_t min_core_num = 1;
size_t max_core_num = std::max(min_core_num, aicpu::CpuKernelUtils::GetCPUNum(ctx) - kResvCpuNum);
if (max_core_num == 0) {
return KERNEL_STATUS_PARAM_INVALID;
}
    if (max_core_num > data_size) {
      max_core_num = data_size;
}
KERNEL_HANDLE_ERROR(CpuKernelUtils::ParallelFor(ctx, x_size, x_size / max_core_num, shard_sinc),
"Sinc Compute failed.");
}
return KERNEL_STATUS_OK;
}
inline std::uint32_t SincExtraCheck(const CpuKernelContext &ctx) {
if (ctx.Input(0)->GetData() == nullptr) {
KERNEL_LOG_ERROR("Get input data failed.");
return KERNEL_STATUS_PARAM_INVALID;
}
if (ctx.Output(0)->GetData() == nullptr) {
KERNEL_LOG_ERROR("Get output data failed.");
return KERNEL_STATUS_PARAM_INVALID;
}
DataType in_dtype = ctx.Input(0)->GetDataType();
DataType out_dtype = ctx.Output(0)->GetDataType();
std::set<DataType> dtypes;
dtypes.insert(DT_FLOAT16);
dtypes.insert(DT_FLOAT);
dtypes.insert(DT_DOUBLE);
dtypes.insert(DT_COMPLEX64);
dtypes.insert(DT_COMPLEX128);
if (dtypes.count(in_dtype) == 1) {
if (out_dtype != in_dtype) {
KERNEL_LOG_ERROR("The data type of the output need be the same as the input when input is [%s], but got [%s].",
DTypeStr(in_dtype).c_str(), DTypeStr(out_dtype).c_str());
return KERNEL_STATUS_PARAM_INVALID;
}
} else {
if (out_dtype != DT_FLOAT) {
KERNEL_LOG_ERROR("The data type of the output must be float32 when the dtype of input is [%s], but got [%s].",
DTypeStr(in_dtype).c_str(), DTypeStr(out_dtype).c_str());
return KERNEL_STATUS_PARAM_INVALID;
}
}
std::vector<int64_t> input_dims = ctx.Input(0)->GetTensorShape()->GetDimSizes();
std::vector<int64_t> output_dims = ctx.Output(0)->GetTensorShape()->GetDimSizes();
if (input_dims.size() != output_dims.size()) {
KERNEL_LOG_ERROR(
"The data dim size of the input [%llu] need be the same as the output "
"[%llu].",
input_dims.size(), output_dims.size());
return KERNEL_STATUS_PARAM_INVALID;
}
for (size_t index = 0; index < input_dims.size(); index++) {
if (input_dims[index] != output_dims[index]) {
KERNEL_LOG_ERROR("The data dim of the input need be the same as the output.");
return KERNEL_STATUS_PARAM_INVALID;
}
}
return KERNEL_STATUS_OK;
}
uint32_t SincCpuKernel::Compute(CpuKernelContext &ctx) {
KERNEL_HANDLE_ERROR(NormalCheck(ctx, kSincInputNum, kSincOutputNum), "[%s] check params failed.", kSinc);
uint32_t res = KERNEL_STATUS_OK;
res = SincExtraCheck(ctx);
if (res != KERNEL_STATUS_OK) {
return res;
}
auto data_type = ctx.Input(0)->GetDataType();
switch (data_type) {
case DT_FLOAT16:
res = SincTypeSameCompute<Eigen::half>(ctx);
break;
case DT_FLOAT:
res = SincTypeSameCompute<float>(ctx);
break;
case DT_DOUBLE:
res = SincTypeSameCompute<double>(ctx);
break;
case DT_INT8:
res = SincTypeChangeCompute<int8_t>(ctx);
break;
case DT_UINT8:
res = SincTypeChangeCompute<uint8_t>(ctx);
break;
case DT_INT16:
res = SincTypeChangeCompute<int16_t>(ctx);
break;
case DT_UINT16:
res = SincTypeChangeCompute<uint16_t>(ctx);
break;
case DT_INT32:
res = SincTypeChangeCompute<int32_t>(ctx);
break;
case DT_UINT32:
res = SincTypeChangeCompute<uint32_t>(ctx);
break;
case DT_INT64:
res = SincTypeChangeCompute<int64_t>(ctx);
break;
case DT_UINT64:
res = SincTypeChangeCompute<uint64_t>(ctx);
break;
case DT_COMPLEX64:
res = SincTypeSameCompute<std::complex<float>>(ctx);
break;
case DT_COMPLEX128:
res = SincTypeSameCompute<std::complex<double>>(ctx);
break;
case DT_BOOL:
res = SincBoolCompute<bool>(ctx);
break;
default:
KERNEL_LOG_ERROR("Sinc invalid input type [%s]", DTypeStr(data_type).c_str());
return KERNEL_STATUS_PARAM_INVALID;
}
if (res != KERNEL_STATUS_OK) {
return KERNEL_STATUS_INNER_ERROR;
}
return KERNEL_STATUS_OK;
}
REGISTER_CPU_KERNEL(kSinc, SincCpuKernel);
} // namespace aicpu
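For reference, the scalar function evaluated above is the normalized sinc, sinc(x) = sin(pi * x) / (pi * x), with sinc(0) defined as 1. A minimal standalone check follows; the names are illustrative only and not part of the kernel.

// sinc_demo.cc -- illustrative only.
#include <cmath>
#include <iostream>

double Sinc(double x) {
  const double kPi = 3.14159265358979323846;
  if (x == 0.0) return 1.0;  // the removable singularity handled explicitly above
  return std::sin(kPi * x) / (kPi * x);
}

int main() {
  std::cout << Sinc(0.0) << " " << Sinc(0.5) << " " << Sinc(1.0) << "\n";  // 1, 2/pi ~= 0.6366, ~0
  return 0;
}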

View File

@ -0,0 +1,41 @@
/**
* Copyright (c) Huawei Technologies Co., Ltd. 2022. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef AICPU_KERNELS_NORMALIZED_SINC_H_
#define AICPU_KERNELS_NORMALIZED_SINC_H_
#include "cpu_ops_kernel.h"
namespace aicpu {
class SincCpuKernel : public CpuKernel {
public:
SincCpuKernel() = default;
~SincCpuKernel() override = default;
protected:
uint32_t Compute(CpuKernelContext &ctx) override;
private:
template <typename T>
uint32_t SincTypeSameCompute(CpuKernelContext &ctx);
template <typename T>
uint32_t SincTypeChangeCompute(CpuKernelContext &ctx);
template <typename T>
uint32_t SincBoolCompute(CpuKernelContext &ctx);
};
} // namespace aicpu
#endif

View File

@ -0,0 +1,148 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "sinh.h"
#include <complex>
#include <unsupported/Eigen/CXX11/Tensor>
#include "cpu_kernel_utils.h"
#include "cpu_types.h"
#include "kernel_log.h"
#include "status.h"
#include "utils/kernel_util.h"
namespace {
const std::uint32_t kSinhInputNum{1};
const std::uint32_t kSinhOutputNum{1};
const std::uint32_t ParallelNum{4096};
const char *kSinh{"Sinh"};
} // namespace
namespace internal {
template <typename T>
inline T ScalarSinh(T x) {
return Eigen::numext::sinh(x);
}
template <>
inline Eigen::half ScalarSinh(Eigen::half x) {
const Eigen::half val{Eigen::numext::sinh(static_cast<float>(x))};
return Eigen::half_impl::isnan(val) ? Eigen::half{0.0f} : val;
}
} // namespace internal
namespace aicpu {
namespace detail {
template <typename T>
inline std::uint32_t ComputeSinhKernel(const CpuKernelContext &ctx) {
const auto ParallelFor = aicpu::CpuKernelUtils::ParallelFor;
const auto ScalarSinh = internal::ScalarSinh<T>;
auto input = static_cast<T *>(ctx.Input(0)->GetData());
auto output = static_cast<T *>(ctx.Output(0)->GetData());
std::int64_t total = ctx.Input(0)->NumElements();
std::uint64_t total_size = ctx.Input(0)->GetDataSize();
uint32_t cores = aicpu::CpuKernelUtils::GetCPUNum(ctx);
if (total_size > ParallelNum * sizeof(T)) {
std::int64_t per_unit_size{total / std::min(std::max(1L, cores - 2L), total)};
return ParallelFor(ctx, total, per_unit_size, [&](std::int64_t begin, std::int64_t end) {
std::transform(input + begin, input + end, output + begin, ScalarSinh);
});
} else if (cores != 0) {
std::transform(input, input + total, output, ScalarSinh);
} else {
return KERNEL_STATUS_INNER_ERROR;
}
return KERNEL_STATUS_OK;
}
template <typename T>
inline std::uint32_t ComputeSinh(const CpuKernelContext &ctx) {
uint32_t result = ComputeSinhKernel<T>(ctx);
if (result != 0) {
KERNEL_LOG_ERROR("Sinh compute failed.");
}
return result;
}
inline std::uint32_t SinhExtraCheck(const CpuKernelContext &ctx) {
if (ctx.Input(0)->GetData() == nullptr) {
KERNEL_LOG_ERROR("Get input data failed.");
return KERNEL_STATUS_PARAM_INVALID;
}
if (ctx.Output(0)->GetData() == nullptr) {
KERNEL_LOG_ERROR("Get output data failed.");
return KERNEL_STATUS_PARAM_INVALID;
}
if (ctx.Input(0)->GetDataType() != ctx.Output(0)->GetDataType()) {
KERNEL_LOG_ERROR("The data type of the input [%s] need be the same as the output [%s].",
DTypeStr(ctx.Input(0)->GetDataType()).c_str(), DTypeStr(ctx.Output(0)->GetDataType()).c_str());
return KERNEL_STATUS_PARAM_INVALID;
}
if (ctx.Input(0)->GetDataSize() != ctx.Output(0)->GetDataSize()) {
KERNEL_LOG_ERROR(
"The data size of the input [%llu] need be the same as the output "
"[%llu].",
ctx.Input(0)->GetDataSize(), ctx.Output(0)->GetDataSize());
return KERNEL_STATUS_PARAM_INVALID;
}
std::vector<int64_t> input_dims = ctx.Input(0)->GetTensorShape()->GetDimSizes();
std::vector<int64_t> output_dims = ctx.Output(0)->GetTensorShape()->GetDimSizes();
if (input_dims.size() != output_dims.size()) {
KERNEL_LOG_ERROR(
"The data dim size of the input [%llu] need be the same as the output "
"[%llu].",
input_dims.size(), output_dims.size());
return KERNEL_STATUS_PARAM_INVALID;
}
for (size_t index = 0; index < input_dims.size(); index++) {
if (input_dims[index] != output_dims[index]) {
KERNEL_LOG_ERROR("The data dim of the input need be the same as the output.");
return KERNEL_STATUS_PARAM_INVALID;
}
}
return KERNEL_STATUS_OK;
}
std::uint32_t SinhCheck(CpuKernelContext &ctx, uint32_t inputs_num, uint32_t outputs_num) {
  return NormalCheck(ctx, inputs_num, outputs_num) ? KERNEL_STATUS_PARAM_INVALID : SinhExtraCheck(ctx);
}
std::uint32_t SinhCompute(const CpuKernelContext &ctx) {
DataType input_type{ctx.Input(0)->GetDataType()};
switch (input_type) {
case DT_FLOAT16:
return ComputeSinh<Eigen::half>(ctx);
case DT_FLOAT:
return ComputeSinh<std::float_t>(ctx);
case DT_DOUBLE:
return ComputeSinh<std::double_t>(ctx);
case DT_COMPLEX64:
return ComputeSinh<std::complex<std::float_t> >(ctx);
case DT_COMPLEX128:
return ComputeSinh<std::complex<std::double_t> >(ctx);
default:
KERNEL_LOG_ERROR("Unsupported input data type [%s].", DTypeStr(input_type).c_str());
return KERNEL_STATUS_PARAM_INVALID;
}
}
} // namespace detail
std::uint32_t SinhCpuKernel::Compute(CpuKernelContext &ctx) {
return detail::SinhCheck(ctx, kSinhInputNum, kSinhOutputNum) ? KERNEL_STATUS_PARAM_INVALID : detail::SinhCompute(ctx);
}
REGISTER_CPU_KERNEL(kSinh, SinhCpuKernel);
} // namespace aicpu
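As a sanity check on the scalar rule used above, sinh(x) equals (exp(x) - exp(-x)) / 2. The small sketch below verifies the identity with the standard library; it is illustrative only and not part of the kernel.

// sinh_demo.cc -- illustrative only; checks sinh(x) == (exp(x) - exp(-x)) / 2.
#include <cmath>
#include <iostream>

int main() {
  const double x = 1.25;
  const double by_std = std::sinh(x);
  const double by_def = (std::exp(x) - std::exp(-x)) / 2.0;
  std::cout << by_std << " " << by_def << "\n";  // both ~1.60192
  return 0;
}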

View File

@ -0,0 +1,26 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef AICPU_KERNELS_NORMALIZED_SINH_H_
#define AICPU_KERNELS_NORMALIZED_SINH_H_
#include "cpu_ops_kernel.h"
namespace aicpu {
class SinhCpuKernel final : public CpuKernel {
 public:
  std::uint32_t Compute(CpuKernelContext &ctx) override;
};
} // namespace aicpu
#endif

View File

@ -0,0 +1,387 @@
/**
* Copyright (c) Huawei Technologies Co., Ltd. 2022-2022. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "smooth_l1_loss_grad_v2.h"
#include <mutex>
#include "Eigen/Core"
#include "cpu_kernel_utils.h"
#include "kernel_log.h"
#include "utils/kernel_util.h"
namespace {
const uint32_t kInputNum = 3;
const uint32_t kOutputNum = 1;
const char *kSmoothL1LossGradV2 = "SmoothL1LossGradV2";
const int64_t kParallelDataNum = 2 * 1024;
const int64_t kParallelDataNumMid = 16 * 1024;
const int64_t kParallelDataNumSameShape = 7 * 1024;
const int64_t kParallelDataNumSameShapeMid = 35 * 1024;
float sigma = 1.0;
std::string reduction = "mean";
std::mutex mtx;
#define SmoothL1LossGradV2_COMPUTE_CASE(DTYPE, REDUCTION, TYPE, CTX) \
case (DTYPE): { \
KERNEL_LOG_INFO("Compute [%s]", DTypeStr(data_type).c_str()); \
uint32_t result = KERNEL_STATUS_PARAM_INVALID; \
if ((REDUCTION) == "mean") { \
result = ComputeMean<TYPE>(CTX); \
} else if ((REDUCTION) == "sum") { \
result = ComputeSum<TYPE>(CTX); \
} else if ((REDUCTION) == "none") { \
result = ComputeNone<TYPE>(CTX); \
} \
if (result != KERNEL_STATUS_OK) { \
KERNEL_LOG_ERROR("SmoothL1LossGradV2 kernel compute failed."); \
return result; \
} \
break; \
}
} // namespace
namespace aicpu {
uint32_t SmoothL1LossGradV2CpuKernel::Compute(CpuKernelContext &ctx) {
KERNEL_HANDLE_ERROR(NormalCheck(ctx, kInputNum, kOutputNum),
"SmoothL1LossGradV2 check input and output number failed.");
KERNEL_HANDLE_ERROR(ParamCheck(ctx), "SmoothL1LossGradV2 check params failed.");
auto data_type = ctx.Input(0)->GetDataType();
switch (data_type) {
SmoothL1LossGradV2_COMPUTE_CASE(DT_FLOAT16, reduction, Eigen::half, ctx)
SmoothL1LossGradV2_COMPUTE_CASE(DT_FLOAT, reduction, float, ctx)
    SmoothL1LossGradV2_COMPUTE_CASE(DT_DOUBLE, reduction, double, ctx)
    default:
      KERNEL_LOG_ERROR("SmoothL1LossGradV2 kernel data type [%s] not supported.", DTypeStr(data_type).c_str());
      return KERNEL_STATUS_PARAM_INVALID;
}
return KERNEL_STATUS_OK;
}
uint32_t SmoothL1LossGradV2CpuKernel::ParamCheck(CpuKernelContext &ctx) {
Tensor *predict_tensor = ctx.Input(0);
Tensor *label_tensor = ctx.Input(1);
Tensor *dout_tensor = ctx.Input(2);
Tensor *gradient_tensor = ctx.Output(0);
DataType predict_type = predict_tensor->GetDataType();
DataType label_type = label_tensor->GetDataType();
DataType dout_type = dout_tensor->GetDataType();
DataType gradient_type = gradient_tensor->GetDataType();
KERNEL_CHECK_FALSE((predict_type == label_type), KERNEL_STATUS_PARAM_INVALID,
"The data type of predict [%s] need be same with "
"label [%s].",
DTypeStr(predict_type).c_str(), DTypeStr(label_type).c_str());
KERNEL_CHECK_FALSE((predict_type == dout_type), KERNEL_STATUS_PARAM_INVALID,
"The data type of predict [%s] need be same with "
"dout [%s].",
DTypeStr(predict_type).c_str(), DTypeStr(dout_type).c_str());
KERNEL_CHECK_FALSE((predict_type == gradient_type), KERNEL_STATUS_PARAM_INVALID,
"The data type of predict [%s] need be same with "
"gradient [%s].",
DTypeStr(predict_type).c_str(), DTypeStr(gradient_type).c_str());
auto predict_shape = predict_tensor->GetTensorShape();
auto label_shape = label_tensor->GetTensorShape();
auto gradient_shape = gradient_tensor->GetTensorShape();
int32_t predict_dims = predict_shape->GetDims();
int32_t label_dims = label_shape->GetDims();
int32_t gradient_dims = gradient_shape->GetDims();
KERNEL_CHECK_FALSE((predict_dims == label_dims), KERNEL_STATUS_PARAM_INVALID,
"the input shape dim of predict [%d] need be same with "
"label [%d].",
predict_dims, label_dims);
KERNEL_CHECK_FALSE((predict_dims == gradient_dims), KERNEL_STATUS_PARAM_INVALID,
"the input shape dim of predict [%d] need be same with "
"gradient [%d].",
predict_dims, gradient_dims);
for (int32_t i = 0; i < predict_dims; i++) {
KERNEL_CHECK_FALSE((predict_shape->GetDimSize(i) == label_shape->GetDimSize(i)), KERNEL_STATUS_PARAM_INVALID,
"the every input shape dim of predict [%d] need be same with "
"label [%d] where dim in [%d].",
predict_shape->GetDimSize(i), label_shape->GetDimSize(i), i);
KERNEL_CHECK_FALSE((predict_shape->GetDimSize(i) == gradient_shape->GetDimSize(i)), KERNEL_STATUS_PARAM_INVALID,
"the every input shape dim of predict [%d] need be same with "
"gradient [%d] where dim in [%d].",
predict_shape->GetDimSize(i), gradient_shape->GetDimSize(i), i);
}
KERNEL_LOG_DEBUG(
"SmoothL1LossGradV2CpuKernel[%s], predict: size[%llu];"
"label: size[%llu], dout: size[%llu], gradient: size[%llu].",
ctx.GetOpType().c_str(), predict_tensor->GetDataSize(), label_tensor->GetDataSize(), dout_tensor->GetDataSize(),
gradient_tensor->GetDataSize());
return AttributesCheck(ctx);
}
uint32_t SmoothL1LossGradV2CpuKernel::AttributesCheck(CpuKernelContext &ctx) {
Tensor *predict_tensor = ctx.Input(0);
Tensor *dout_tensor = ctx.Input(2);
Tensor *gradient_tensor = ctx.Output(0);
auto predict_shape = predict_tensor->GetTensorShape();
auto dout_shape = dout_tensor->GetTensorShape();
auto gradient_shape = gradient_tensor->GetTensorShape();
int32_t predict_dims = predict_shape->GetDims();
int32_t dout_dims = dout_shape->GetDims();
int32_t gradient_dims = gradient_shape->GetDims();
auto sigma_attr = ctx.GetAttr("sigma");
auto reduction_attr = ctx.GetAttr("reduction");
sigma = sigma_attr == nullptr ? 1.0 : sigma_attr->GetFloat();
reduction = reduction_attr == nullptr ? "mean" : reduction_attr->GetString();
KERNEL_CHECK_FALSE(sigma >= 0, KERNEL_STATUS_PARAM_INVALID,
"the sigma value must greater than or equal to 0 "
"when value of input sigma is [%f].",
sigma);
  KERNEL_CHECK_FALSE((reduction == "none" || reduction == "mean" || reduction == "sum"), KERNEL_STATUS_PARAM_INVALID,
                     "the reduction value must be one of ['none','mean','sum'], but got [%s].", reduction.c_str());
if (reduction == "none" || reduction == "mean" || reduction == "sum") {
KERNEL_CHECK_FALSE((predict_dims == gradient_dims), KERNEL_STATUS_PARAM_INVALID,
"the input shape dim of predict [%d] need be same with "
"gradient [%d].",
predict_dims, gradient_dims);
for (int32_t i = 0; i < predict_dims; i++) {
KERNEL_CHECK_FALSE((predict_shape->GetDimSize(i) == gradient_shape->GetDimSize(i)), KERNEL_STATUS_PARAM_INVALID,
"the input shape dim of predict [%d] must be same with "
"gradient [%d] where dim in [%d].",
predict_shape->GetDimSize(i), gradient_shape->GetDimSize(i), i);
}
}
if (reduction == "none") {
KERNEL_CHECK_FALSE((predict_dims == dout_dims), KERNEL_STATUS_PARAM_INVALID,
"the input shape dim of predict [%d] need be same with "
"dout [%d].",
predict_dims, dout_dims);
for (int32_t i = 0; i < predict_dims; i++) {
KERNEL_CHECK_FALSE((predict_shape->GetDimSize(i) == dout_shape->GetDimSize(i)), KERNEL_STATUS_PARAM_INVALID,
"the every input shape dim of predict [%d] need be same with "
"dout [%d] where dim in [%d].",
predict_shape->GetDimSize(i), dout_shape->GetDimSize(i), i);
}
} else if (reduction == "sum" || reduction == "mean") {
KERNEL_CHECK_FALSE((dout_dims == 0) || ((dout_dims == 1) && (dout_tensor->NumElements() == 1)),
KERNEL_STATUS_PARAM_INVALID, "the dout shape dim of dout [%d] need be a scalar.", dout_dims);
}
return KERNEL_STATUS_OK;
}
// 1 * dout if x >= sigma
// -1 * dout if x <= -sigma
// x / sigma * dout if |x| < sigma
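// Worked example (illustrative, sigma = 1.0, dout = 2.0):
//   x =  1.5 -> grad =  1 * dout        =  2.0
//   x = -1.5 -> grad = -1 * dout        = -2.0
//   x =  0.4 -> grad = x / sigma * dout =  0.8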
template <typename T>
uint32_t SmoothL1LossGradV2CpuKernel::ComputeSum(CpuKernelContext &ctx) {
KERNEL_LOG_INFO("SmoothL1LossGradV2CpuKernel::ComputeSum start");
Tensor *predict_tensor = ctx.Input(0);
Tensor *label_tensor = ctx.Input(1);
Tensor *dout_tensor = ctx.Input(2);
Tensor *gradient_tensor = ctx.Output(0);
T *predict_val = static_cast<T *>(predict_tensor->GetData());
T *label_val = static_cast<T *>(label_tensor->GetData());
T *dout_val = static_cast<T *>(dout_tensor->GetData());
T *gradient_val = static_cast<T *>(gradient_tensor->GetData());
int64_t data_num = predict_tensor->NumElements();
int64_t data_size = data_num * sizeof(T);
T *result = gradient_val;
if (data_size <= kParallelDataNum) {
for (int64_t i = 0; i < data_num; i++) {
T predict = *(predict_val + i);
T label = *(label_val + i);
T dout = *dout_val;
T x = predict - label;
if (x == T(0)) {
*(result + i) = T(0) * dout;
} else if (x <= -T(sigma)) {
*(result + i) = T(-1) * dout;
} else if (x >= T(sigma)) {
*(result + i) = T(1) * dout;
} else if (sigma == 0) {
KERNEL_LOG_ERROR("attribute sigma could not be 0.");
} else {
*(result + i) = x / T(sigma) * dout;
}
}
return KERNEL_STATUS_OK;
} else {
uint32_t min_core_num = 1;
uint32_t max_core_num = std::max(min_core_num, aicpu::CpuKernelUtils::GetCPUNum(ctx));
if (max_core_num == 0) {
KERNEL_LOG_ERROR("max_core_num cannot be 0.");
}
if (max_core_num > data_num) {
max_core_num = data_num;
}
auto shared_smoothl1lossgradv2 = [&](size_t start, size_t end) {
for (size_t i = start; i < end; i++) {
T predict = *(predict_val + i);
T label = *(label_val + i);
T dout = *dout_val;
T x = predict - label;
if (x == T(0)) {
*(result + i) = T(0) * dout;
} else if (x <= -T(sigma)) {
*(result + i) = T(-1) * dout;
} else if (x >= T(sigma)) {
*(result + i) = T(1) * dout;
} else if (sigma == 0) {
KERNEL_LOG_ERROR("attribute sigma could not be 0.");
} else {
*(result + i) = x / T(sigma) * dout;
}
}
};
return CpuKernelUtils::ParallelFor(ctx, data_num, data_num / max_core_num, shared_smoothl1lossgradv2);
}
KERNEL_LOG_INFO("SmoothL1LossGradV2CpuKernel::ComputeSum end");
}
// Mean's result is Sum's result divided by the total number of elements per
// element
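// Worked example (illustrative): with N = 4 elements, sigma = 1.0, dout = 2.0 and x = 1.5 for one
// element, the "sum" gradient of that element is 2.0, so its "mean" gradient is 2.0 / 4 = 0.5.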
template <typename T>
uint32_t SmoothL1LossGradV2CpuKernel::ComputeMean(CpuKernelContext &ctx) {
KERNEL_LOG_INFO("SmoothL1LossGradV2CpuKernel::ComputeMean start");
Tensor *predict_tensor = ctx.Input(0);
Tensor *label_tensor = ctx.Input(1);
Tensor *dout_tensor = ctx.Input(2);
Tensor *gradient_tensor = ctx.Output(0);
T *predict_val = static_cast<T *>(predict_tensor->GetData());
T *label_val = static_cast<T *>(label_tensor->GetData());
T *dout_val = static_cast<T *>(dout_tensor->GetData());
T *gradient_val = static_cast<T *>(gradient_tensor->GetData());
int64_t data_num = predict_tensor->NumElements();
  if (data_num == 0) {
    KERNEL_LOG_ERROR("data_num cannot be 0.");
    return KERNEL_STATUS_PARAM_INVALID;
  }
int64_t data_size = data_num * sizeof(T);
T *result = gradient_val;
if (data_size <= kParallelDataNum) {
for (int64_t i = 0; i < data_num; i++) {
T predict = *(predict_val + i);
T label = *(label_val + i);
T dout = *dout_val;
T x = predict - label;
if (x == T(0)) {
*(result + i) = T(0) * dout;
} else if (x <= -T(sigma)) {
*(result + i) = T(-1) / data_num * dout;
} else if (x >= T(sigma)) {
*(result + i) = T(1) / data_num * dout;
} else if (sigma == 0) {
KERNEL_LOG_ERROR("attribute sigma could not be 0.");
} else {
*(result + i) = x / T(sigma) / data_num * dout;
}
}
return KERNEL_STATUS_OK;
} else {
uint32_t min_core_num = 1;
uint32_t max_core_num = std::max(min_core_num, aicpu::CpuKernelUtils::GetCPUNum(ctx));
if (max_core_num == 0) {
KERNEL_LOG_ERROR("max_core_num cannot be 0.");
}
if (max_core_num > data_num) {
max_core_num = data_num;
}
auto shared_smoothl1lossgradv2 = [&](size_t start, size_t end) {
for (size_t i = start; i < end; i++) {
T predict = *(predict_val + i);
T label = *(label_val + i);
T dout = *dout_val;
T x = predict - label;
if (x == T(0)) {
*(result + i) = T(0) * dout;
} else if (x <= -T(sigma)) {
*(result + i) = T(-1) / data_num * dout;
} else if (x >= T(sigma)) {
*(result + i) = T(1) / data_num * dout;
} else if (sigma == 0) {
KERNEL_LOG_ERROR("attribute sigma could not be 0.");
} else {
*(result + i) = x / T(sigma) / data_num * dout;
}
}
};
return CpuKernelUtils::ParallelFor(ctx, data_num, data_num / max_core_num, shared_smoothl1lossgradv2);
}
KERNEL_LOG_INFO("SmoothL1LossGradV2CpuKernel::ComputeMean end");
}
// "None" takes grad_output as a parameter,
// and the end result is that result of "Sum" is multiplied by the grad_output
// one by one, that is, the weight is increased
template <typename T>
uint32_t SmoothL1LossGradV2CpuKernel::ComputeNone(CpuKernelContext &ctx) {
KERNEL_LOG_INFO("SmoothL1LossGradV2CpuKernel::ComputeNone start");
Tensor *predict_tensor = ctx.Input(0);
Tensor *label_tensor = ctx.Input(1);
Tensor *dout_tensor = ctx.Input(2);
Tensor *gradient_tensor = ctx.Output(0);
T *predict_val = static_cast<T *>(predict_tensor->GetData());
T *label_val = static_cast<T *>(label_tensor->GetData());
T *dout_val = static_cast<T *>(dout_tensor->GetData());
T *gradient_val = static_cast<T *>(gradient_tensor->GetData());
int64_t data_num = predict_tensor->NumElements();
int64_t data_size = data_num * sizeof(T);
T *result = gradient_val;
if (data_size <= kParallelDataNum) {
for (int64_t i = 0; i < data_num; i++) {
T predict = *(predict_val + i);
T label = *(label_val + i);
T x = predict - label;
T dout = *(dout_val + i);
if (x == T(0)) {
*(result + i) = T(0) * dout;
} else if (x <= -T(sigma)) {
*(result + i) = T(-1) * dout;
} else if (x >= T(sigma)) {
*(result + i) = T(1) * dout;
} else if (sigma == 0) {
KERNEL_LOG_ERROR("attribute sigma could not be 0.");
} else {
*(result + i) = dout * x / T(sigma);
}
}
return KERNEL_STATUS_OK;
} else {
uint32_t min_core_num = 1;
uint32_t max_core_num = std::max(min_core_num, aicpu::CpuKernelUtils::GetCPUNum(ctx));
if (max_core_num == 0) {
KERNEL_LOG_ERROR("max_core_num cannot be 0.");
}
if (max_core_num > data_num) {
max_core_num = data_num;
}
auto shared_smoothl1lossgradv2 = [&](size_t start, size_t end) {
for (size_t i = start; i < end; i++) {
T predict = *(predict_val + i);
T label = *(label_val + i);
T x = predict - label;
T dout = *(dout_val + i);
if (x == T(0)) {
*(result + i) = T(0) * dout;
} else if (x <= -T(sigma)) {
*(result + i) = T(-1) * dout;
} else if (x >= T(sigma)) {
*(result + i) = T(1) * dout;
} else if (sigma == 0) {
KERNEL_LOG_ERROR("attribute sigma could not be 0.");
} else {
*(result + i) = dout * x / T(sigma);
}
}
};
return CpuKernelUtils::ParallelFor(ctx, data_num, data_num / max_core_num, shared_smoothl1lossgradv2);
}
KERNEL_LOG_INFO("SmoothL1LossGradV2CpuKernel::ComputeNone end");
}
REGISTER_CPU_KERNEL(kSmoothL1LossGradV2, SmoothL1LossGradV2CpuKernel);
} // namespace aicpu

View File

@ -0,0 +1,44 @@
/**
* Copyright (c) Huawei Technologies Co., Ltd. 2022-2022. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef AICPU_KERNELS_NORMALIZED_SMOOTH_L1_LOSS_GRAD_V2_H_
#define AICPU_KERNELS_NORMALIZED_SMOOTH_L1_LOSS_GRAD_V2_H_
#include <string>
#include "cpu_ops_kernel.h"
#include "utils/bcast.h"
namespace aicpu {
class SmoothL1LossGradV2CpuKernel : public CpuKernel {
public:
SmoothL1LossGradV2CpuKernel() = default;
~SmoothL1LossGradV2CpuKernel() = default;
protected:
uint32_t Compute(CpuKernelContext &ctx) override;
private:
uint32_t ParamCheck(CpuKernelContext &ctx);
uint32_t AttributesCheck(CpuKernelContext &ctx);
template <typename T>
uint32_t ComputeMean(CpuKernelContext &ctx);
template <typename T>
uint32_t ComputeSum(CpuKernelContext &ctx);
template <typename T>
uint32_t ComputeNone(CpuKernelContext &ctx);
};
} // namespace aicpu
#endif // AICPU_KERNELS_NORMALIZED_SMOOTH_L1_LOSS_GRAD_V2_H_

View File

@ -0,0 +1,278 @@
/**
* Copyright (c) Huawei Technologies Co., Ltd. 2022-2022. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "smooth_l1_loss_v2.h"
#include <mutex>
#include "Eigen/Core"
#include "cpu_kernel_utils.h"
#include "kernel_log.h"
#include "utils/kernel_util.h"
namespace {
const char *SmoothL1LossV2 = "SmoothL1LossV2";
const uint32_t kInputNum = 2;
const uint32_t kOutputNum = 1;
constexpr int64_t kParallelDataNums = 16 * 1024;
const float opHalf = 0.5;
float sigma = 1.0;
std::string reduction = "mean";
std::mutex mtx;
#define COMPUTE_CASE(DTYPE, REDUCTION, TYPE, CTX) \
case (DTYPE): { \
KERNEL_LOG_DEBUG("Compute [%s]", DTypeStr(data_type).c_str()); \
uint32_t result = KERNEL_STATUS_PARAM_INVALID; \
if ((REDUCTION) == "mean") { \
result = ComputeMean<TYPE>(CTX); \
} else if ((REDUCTION) == "sum") { \
result = ComputeSum<TYPE>(CTX); \
} else if ((REDUCTION) == "none") { \
result = ComputeNone<TYPE>(CTX); \
} \
if (result != KERNEL_STATUS_OK) { \
KERNEL_LOG_ERROR("SmoothL1LossV2 compute failed."); \
return result; \
} \
break; \
}
} // namespace
namespace aicpu {
uint32_t SmoothL1LossV2CpuKernel::Compute(CpuKernelContext &ctx) {
KERNEL_HANDLE_ERROR(NormalCheck(ctx, kInputNum, kOutputNum), "Check SmoothL1LossV2 params failed.");
KERNEL_HANDLE_ERROR(ParamCheck(ctx), "Check SmoothL1LossV2 params failed.");
auto data_type = ctx.Input(0)->GetDataType();
switch (data_type) {
COMPUTE_CASE(DT_FLOAT16, reduction, Eigen::half, ctx)
COMPUTE_CASE(DT_FLOAT, reduction, float, ctx)
COMPUTE_CASE(DT_DOUBLE, reduction, double, ctx)
default:
KERNEL_LOG_ERROR("SmoothL1LossV2 data type [%s] not support.", DTypeStr(data_type).c_str());
return KERNEL_STATUS_PARAM_INVALID;
}
return KERNEL_STATUS_OK;
}
uint32_t SmoothL1LossV2CpuKernel::ParamCheck(CpuKernelContext &ctx) {
Tensor *input_0 = ctx.Input(0);
Tensor *input_1 = ctx.Input(1);
Tensor *output_0 = ctx.Output(0);
DataType input0_type = input_0->GetDataType();
DataType input1_type = input_1->GetDataType();
DataType output0_type = output_0->GetDataType();
KERNEL_CHECK_FALSE((input0_type == input1_type), KERNEL_STATUS_PARAM_INVALID,
"The data type of input0 [%s] need be same with "
"input1 [%s].",
DTypeStr(input0_type).c_str(), DTypeStr(input1_type).c_str());
KERNEL_CHECK_FALSE((input0_type == output0_type), KERNEL_STATUS_PARAM_INVALID,
"The data type of input0 [%s] need be same with "
"output0 [%s].",
DTypeStr(input0_type).c_str(), DTypeStr(output0_type).c_str());
auto input0_shape = input_0->GetTensorShape();
auto input1_shape = input_1->GetTensorShape();
int32_t input0_dims = input0_shape->GetDims();
int32_t input1_dims = input1_shape->GetDims();
KERNEL_CHECK_FALSE((input0_dims == input1_dims), KERNEL_STATUS_PARAM_INVALID,
"the input shape dim of input0 [%d] need be same with "
"input1 [%d].",
input0_dims, input1_dims);
for (int32_t i = 0; i < input0_dims; i++) {
KERNEL_CHECK_FALSE((input0_shape->GetDimSize(i) == input1_shape->GetDimSize(i)), KERNEL_STATUS_PARAM_INVALID,
"the every input shape dim of input0 [%d] need be same with "
"input1 [%d] where dim in [%d].",
input0_shape->GetDimSize(i), input1_shape->GetDimSize(i), i);
}
KERNEL_LOG_DEBUG(
"SmoothL1LossV2CpuKernel[%s], input0: size[%llu];"
"input1: size[%llu], output: size[%llu].",
ctx.GetOpType().c_str(), input_0->GetDataSize(), input_1->GetDataSize(), output_0->GetDataSize());
return AttributeCheck(ctx);
}
uint32_t SmoothL1LossV2CpuKernel::AttributeCheck(CpuKernelContext &ctx) {
Tensor *input_0 = ctx.Input(0);
Tensor *output_0 = ctx.Output(0);
auto input0_shape = input_0->GetTensorShape();
auto output0_shape = output_0->GetTensorShape();
int32_t input0_dims = input0_shape->GetDims();
int32_t output0_dims = output0_shape->GetDims();
auto sigma_attr = ctx.GetAttr("sigma");
auto reduction_attr = ctx.GetAttr("reduction");
sigma = sigma_attr == nullptr ? 1.0 : sigma_attr->GetFloat();
reduction = reduction_attr == nullptr ? "mean" : reduction_attr->GetString();
KERNEL_CHECK_FALSE(sigma >= 0, KERNEL_STATUS_PARAM_INVALID,
"the sigma value need to greater than or equal to 0 "
"when input sigma value is [%f].",
sigma);
  KERNEL_CHECK_FALSE((reduction == "none" || reduction == "mean" || reduction == "sum"), KERNEL_STATUS_PARAM_INVALID,
                     "the reduction value must be one of ['none','mean','sum'] "
                     "when the input reduction value is [%s].",
                     reduction.c_str());
if (reduction == "none") {
KERNEL_CHECK_FALSE((input0_dims == output0_dims), KERNEL_STATUS_PARAM_INVALID,
"the input shape dim of input0 [%d] need be same with "
"output0 [%d].",
input0_dims, output0_dims);
for (int32_t i = 0; i < input0_dims; i++) {
KERNEL_CHECK_FALSE((input0_shape->GetDimSize(i) == output0_shape->GetDimSize(i)), KERNEL_STATUS_PARAM_INVALID,
"the every input shape dim of input0 [%d] need be same with "
"output0 [%d] where dim in [%d].",
input0_shape->GetDimSize(i), output0_shape->GetDimSize(i), i);
}
} else if (reduction == "sum" || reduction == "mean") {
KERNEL_CHECK_FALSE((output0_dims == 0) || ((output0_dims == 1) && (output_0->NumElements() == 1)),
KERNEL_STATUS_PARAM_INVALID, "the output shape dim of output0 [%d] need be [1] or a scalar.",
output0_dims);
}
return KERNEL_STATUS_OK;
}
template <typename T>
uint32_t SmoothL1LossV2CpuKernel::ComputeMean(CpuKernelContext &ctx) {
uint32_t compute_sum_res = ComputeSum<T>(ctx);
if (compute_sum_res != KERNEL_STATUS_OK) {
return compute_sum_res;
}
Tensor *predict_tensor = ctx.Input(0);
int64_t data_num = predict_tensor->NumElements();
Tensor *loss_tensor = ctx.Output(0);
T *loss_val = reinterpret_cast<T *>(loss_tensor->GetData());
T *res = loss_val;
  if (data_num == 0) {
    *(res) = T(0);
    return KERNEL_STATUS_OK;
  }
  *(res) = *(res) / data_num;
return KERNEL_STATUS_OK;
}
template <typename T>
uint32_t SmoothL1LossV2CpuKernel::ComputeSum(CpuKernelContext &ctx) {
Tensor *predict_tensor = ctx.Input(0);
Tensor *label_tensor = ctx.Input(1);
Tensor *loss_tensor = ctx.Output(0);
T *predict_val = reinterpret_cast<T *>(predict_tensor->GetData());
T *label_val = reinterpret_cast<T *>(label_tensor->GetData());
T *loss_val = reinterpret_cast<T *>(loss_tensor->GetData());
int64_t data_num = predict_tensor->NumElements();
int64_t data_size = data_num * sizeof(T);
double res = 0;
if (data_size <= kParallelDataNums) {
for (int64_t i = 0; i < data_num; i++) {
T predict = *(predict_val + i);
T label = *(label_val + i);
T z = predict - label > T(0) ? predict - label : label - predict;
if (sigma == 0) {
res += static_cast<double>(z);
} else {
res += static_cast<double>(z < T(sigma) ? T(opHalf) * z * z / T(sigma) : z - T(opHalf) * T(sigma));
}
}
*(loss_val) = static_cast<T>(res);
return KERNEL_STATUS_OK;
} else {
    auto shared_smoothl1lossv2 = [&](size_t start, size_t end) {
      double sum = 0;
      for (size_t i = start; i < end; i++) {
        T predict = *(predict_val + i);
        T label = *(label_val + i);
        T z = predict - label > T(0) ? predict - label : label - predict;
        if (sigma == 0) {
          sum += static_cast<double>(z);
        } else {
          sum += static_cast<double>(z < T(sigma) ? T(opHalf) * z * z / T(sigma) : z - T(opHalf) * T(sigma));
        }
      }
      mtx.lock();
      res = res + sum;
      mtx.unlock();
    };
uint32_t min_core_num = 1;
uint32_t max_core_num = std::max(min_core_num, aicpu::CpuKernelUtils::GetCPUNum(ctx));
if (max_core_num > data_num) {
max_core_num = data_num;
}
if (max_core_num == 0) {
KERNEL_LOG_ERROR("max_core_num could not be 0.");
}
auto result = CpuKernelUtils::ParallelFor(ctx, data_num, data_num / max_core_num, shared_smoothl1lossv2);
*(loss_val) = static_cast<T>(res);
return result;
}
}
template <typename T>
uint32_t SmoothL1LossV2CpuKernel::ComputeNone(CpuKernelContext &ctx) {
Tensor *predict_tensor = ctx.Input(0);
Tensor *label_tensor = ctx.Input(1);
Tensor *loss_tensor = ctx.Output(0);
T *predict_val = reinterpret_cast<T *>(predict_tensor->GetData());
T *label_val = reinterpret_cast<T *>(label_tensor->GetData());
T *loss_val = reinterpret_cast<T *>(loss_tensor->GetData());
int64_t data_num = predict_tensor->NumElements();
T *res = loss_val;
int64_t data_size = data_num * sizeof(T);
if (data_size <= kParallelDataNums) {
for (int64_t i = 0; i < data_num; i++) {
T predict = *(predict_val + i);
T label = *(label_val + i);
T z = predict - label > T(0) ? predict - label : label - predict;
if (sigma == 0) {
*(res + i) = z;
} else {
*(res + i) = z < T(sigma) ? T(opHalf) * z * z / T(sigma) : z - T(opHalf) * T(sigma);
}
}
return KERNEL_STATUS_OK;
} else {
auto shared_smoothl1lossv2 = [&](size_t start, size_t end) {
for (size_t i = start; i < end; i++) {
T predict = *(predict_val + i);
T label = *(label_val + i);
T z = predict - label > T(0) ? predict - label : label - predict;
if (sigma == 0) {
*(res + i) = z;
} else {
*(res + i) = z < T(sigma) ? T(opHalf) * z * z / T(sigma) : z - T(opHalf) * T(sigma);
}
}
};
uint32_t min_core_num = 1;
uint32_t max_core_num = std::max(min_core_num, aicpu::CpuKernelUtils::GetCPUNum(ctx));
if (max_core_num > data_num) {
max_core_num = data_num;
}
if (max_core_num == 0) {
KERNEL_LOG_ERROR("max_core_num could not be 0.");
}
return CpuKernelUtils::ParallelFor(ctx, data_num, data_num / max_core_num, shared_smoothl1lossv2);
}
}
REGISTER_CPU_KERNEL(SmoothL1LossV2, SmoothL1LossV2CpuKernel);
} // namespace aicpu
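For reference, the per-element loss accumulated above is 0.5 * z * z / sigma when |z| < sigma and |z| - 0.5 * sigma otherwise (plain |z| when sigma == 0), where z = |predict - label|. A minimal scalar sketch of that rule follows; the names are illustrative only.

// smooth_l1_demo.cc -- illustrative only.
#include <cmath>
#include <iostream>

double SmoothL1(double predict, double label, double sigma) {
  const double z = std::abs(predict - label);
  if (sigma == 0.0) return z;  // degenerate case handled explicitly above
  return z < sigma ? 0.5 * z * z / sigma : z - 0.5 * sigma;
}

int main() {
  std::cout << SmoothL1(1.2, 1.0, 1.0) << "\n";  // 0.5 * 0.04 = 0.02
  std::cout << SmoothL1(3.0, 1.0, 1.0) << "\n";  // 2 - 0.5 = 1.5
  return 0;
}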

View File

@ -0,0 +1,42 @@
/**
* Copyright (c) Huawei Technologies Co., Ltd. 2022-2022. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef AICPU_KERNELS_NORMALIZED_SMOOTH_L1_LOSS_V2_H_
#define AICPU_KERNELS_NORMALIZED_SMOOTH_L1_LOSS_V2_H_
#include "cpu_ops_kernel.h"
namespace aicpu {
class SmoothL1LossV2CpuKernel : public CpuKernel {
public:
SmoothL1LossV2CpuKernel() = default;
~SmoothL1LossV2CpuKernel() = default;
protected:
uint32_t Compute(CpuKernelContext &ctx) override;
private:
uint32_t ParamCheck(CpuKernelContext &ctx);
uint32_t AttributeCheck(CpuKernelContext &ctx);
template <typename T>
uint32_t ComputeMean(CpuKernelContext &ctx);
template <typename T>
uint32_t ComputeSum(CpuKernelContext &ctx);
template <typename T>
uint32_t ComputeNone(CpuKernelContext &ctx);
};
} // namespace aicpu
#endif // AICPU_KERNELS_NORMALIZED_SMOOTH_L1_LOSS_V2_H_

View File

@ -0,0 +1,185 @@
/**
* Copyright (c) Huawei Technologies Co., Ltd. 2020-2021. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef _AICPU_AICPU_DEVICE_CPU_KERNELS_UTILS_PHILOX_RANDOM_H
#define _AICPU_AICPU_DEVICE_CPU_KERNELS_UTILS_PHILOX_RANDOM_H
#include <stdint.h>
#include "cpu_kernel/common/status.h"
/**
* A class that represents an inline array.
* Arguments:
* T: the array element type;
* ElementCount: the fixed size of the array;
*/
template <typename T, int ElementCount>
class Array {
public:
static constexpr int kElementCount = ElementCount;
Array() {
for (int i = 0; i < ElementCount; ++i) {
data_[i] = T(0);
}
}
const T &operator[](int index) const { return data_[index]; }
T &operator[](int index) { return data_[index]; }
size_t size() const { return ElementCount; }
private:
T data_[ElementCount];
};
class PhiloxRandom {
public:
using ResultType = Array<uint32_t, 4>;
using ResultElementType = uint32_t;
// The number of elements that will be returned.
static constexpr int kResultElementCount = 4;
// Cost of generation of a single element (in cycles).
static constexpr int kElementCost = 10;
/*
* The type for the 64-bit key stored in the form of two 32-bit uint
* that are used in the diffusion process.
*/
using Key = Array<uint32_t, 2>;
PhiloxRandom() {}
PhiloxRandom(int64_t seed, uint64_t offset) {
const uint32_t seed_low_index = 0;
const uint32_t seed_high_index = 1;
const uint32_t offset_low_index = 2;
const uint32_t offset_high_index = 3;
key_[seed_low_index] = static_cast<uint32_t>(seed);
key_[seed_high_index] = static_cast<uint32_t>(seed >> 32);
counter_[offset_low_index] = static_cast<uint32_t>(offset);
counter_[offset_high_index] = static_cast<uint32_t>(offset >> 32);
}
ResultType const &counter() const { return counter_; }
Key const &key() const { return key_; }
// Skip the specified number of samples of 128-bits in the current stream.
void Skip(uint64_t count) {
const uint32_t count_lo = static_cast<uint32_t>(count);
uint32_t count_hi = static_cast<uint32_t>(count >> 32);
counter_[0] += count_lo;
if (counter_[0] < count_lo) {
++count_hi;
}
counter_[1] += count_hi;
if (counter_[1] < count_hi) {
if (++counter_[2] == 0) {
++counter_[3];
}
}
}
/*
* Returns a group of four random numbers using the underlying Philox
* algorithm.
*/
ResultType operator()() {
ResultType counter = counter_;
Key key = key_;
/*
* Run the single rounds for ten times. Manually unrolling the loop
* for better performance.
*/
counter = ComputeSingleRound(counter, key);
RaiseKey(&key);
counter = ComputeSingleRound(counter, key);
RaiseKey(&key);
counter = ComputeSingleRound(counter, key);
RaiseKey(&key);
counter = ComputeSingleRound(counter, key);
RaiseKey(&key);
counter = ComputeSingleRound(counter, key);
RaiseKey(&key);
counter = ComputeSingleRound(counter, key);
RaiseKey(&key);
counter = ComputeSingleRound(counter, key);
RaiseKey(&key);
counter = ComputeSingleRound(counter, key);
RaiseKey(&key);
counter = ComputeSingleRound(counter, key);
RaiseKey(&key);
counter = ComputeSingleRound(counter, key);
SkipOne();
return counter;
}
private:
// We use the same constants as recommended by the original paper.
static constexpr uint32_t kPhiloxW32A = 0x9E3779B9;
static constexpr uint32_t kPhiloxW32B = 0xBB67AE85;
static constexpr uint32_t kPhiloxM4x32A = 0xD2511F53;
static constexpr uint32_t kPhiloxM4x32B = 0xCD9E8D57;
// Helper function to skip the next sample of 128-bits in the current stream.
void SkipOne() {
if (++counter_[0] == 0) {
if (++counter_[1] == 0) {
if (++counter_[2] == 0) {
++counter_[3];
}
}
}
}
/*
* Helper function to return the lower and higher 32-bits from two 32-bit
* integer multiplications.
*/
static void MultiplyHighLow(uint32_t a, uint32_t b, uint32_t *result_low, uint32_t *result_high) {
const uint64_t product = static_cast<uint64_t>(a) * b;
*result_low = static_cast<uint32_t>(product);
*result_high = static_cast<uint32_t>(product >> 32);
}
// Helper function for a single round of the underlying Philox algorithm.
static ResultType ComputeSingleRound(const ResultType &counter, const Key &key) {
uint32_t lo0;
uint32_t hi0;
MultiplyHighLow(kPhiloxM4x32A, counter[0], &lo0, &hi0);
uint32_t lo1;
uint32_t hi1;
MultiplyHighLow(kPhiloxM4x32B, counter[2], &lo1, &hi1);
ResultType result;
result[0] = hi1 ^ counter[1] ^ key[0];
result[1] = lo1;
result[2] = hi0 ^ counter[3] ^ key[1];
result[3] = lo0;
return result;
}
void RaiseKey(Key *key) {
(*key)[0] += kPhiloxW32A;
(*key)[1] += kPhiloxW32B;
}
private:
ResultType counter_;
Key key_;
};
#endif // _AICPU_AICPU_DEVICE_CPU_KERNELS_UTILS_PHILOX_RANDOM_H
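A minimal usage sketch of the generator above: each call to operator() produces four 32-bit values from the current counter, and Skip(n) advances the counter by n 128-bit blocks within the same stream. The include path below is an assumption about this tree's layout.

// philox_demo.cc -- illustrative only; assumes the header above is available on the include path.
#include <cstdint>
#include <iostream>
#include "utils/philox_random.h"

int main() {
  PhiloxRandom gen(/*seed=*/42, /*offset=*/0);
  PhiloxRandom::ResultType block = gen();  // four 32-bit outputs per call
  for (int i = 0; i < PhiloxRandom::kResultElementCount; ++i) {
    std::cout << block[i] << " ";
  }
  std::cout << "\n";
  gen.Skip(100);  // jump ahead 100 blocks in the same stream
  return 0;
}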

View File

@ -0,0 +1,35 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "sampling_kernels.h"
#include <algorithm>
#include "kernel_log.h"
#include "status.h"
using namespace std;
namespace aicpu {
SamplingKernelType SamplingKernelTypeFromString(std::string str) {
if (str == "lanczos1") return Lanczos1Kernel;
if (str == "lanczos3") return Lanczos3Kernel;
if (str == "lanczos5") return Lanczos5Kernel;
if (str == "gaussian") return GaussianKernel;
if (str == "box") return BoxKernel;
if (str == "triangle") return TriangleKernel;
if (str == "keyscubic") return KeysCubicKernel;
if (str == "mitchellcubic") return MitchellCubicKernel;
return SamplingKernelTypeEnd;
}
} // namespace aicpu

View File

@ -0,0 +1,199 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef AICPU_UTILS_SAMPLING_KERNELS_H_
#define AICPU_UTILS_SAMPLING_KERNELS_H_
#include <cmath>
#include <stdio.h>
#include "cpu_context.h"
namespace aicpu {
// Defines functions for different types of sampling kernels.
enum SamplingKernelType {
// Lanczos kernel with radius 1. Aliases but does not ring.
Lanczos1Kernel,
/**
* Lanczos kernel with radius 3. High-quality practical filter but may have
* some ringing especially on synthetic images.
*/
Lanczos3Kernel,
/**
* Lanczos kernel with radius 5. Very-high-quality filter but may have
* stronger ringing.
*/
Lanczos5Kernel,
// Gaussian kernel with radius 3, sigma = 1.5 / 3. Less commonly used.
GaussianKernel,
/**
* Rectangle function. Equivalent to "nearest" sampling when upscaling.
* Has value 1 in interval (-0.5, 0.5), value 0.5 on edge, and 0 elsewhere.
*/
BoxKernel,
/**
* Hat/tent function with radius 1. Equivalent to "bilinear" reconstruction
* when upsampling.
* Has value zero at -1.0 and 1.0.
*/
TriangleKernel,
/**
* Cubic interpolant of Keys. Equivalent to Catmull-Rom kernel. Reasonably
* good quality and faster than Lanczos3Kernel.
*/
KeysCubicKernel,
/**
* Cubic non-interpolating scheme. For synthetic images (especially those
* lacking proper prefiltering), less ringing than Keys cubic kernel but less
* sharp.
*/
MitchellCubicKernel,
// Always insert new kernel types before this.
SamplingKernelTypeEnd
};
/**
 * Converts a string into the corresponding kernel type.
* Returns SamplingKernelTypeEnd if the string couldn't be converted.
*/
SamplingKernelType SamplingKernelTypeFromString(std::string str);
// A function object for a Lanczos kernel.
struct LanczosKernelFunc {
// Pass 1 for Lanczos1 kernel, 3 for Lanczos3 etc.
explicit LanczosKernelFunc(float _radius) : radius(_radius) {}
float operator()(float x) const {
constexpr float kPI = 3.14159265359;
x = std::abs(x);
if (x > radius) {
return 0.0;
}
// Need to special case the limit case of sin(x) / x when x is zero.
if (x <= 1e-3) {
return 1.0;
}
return radius * std::sin(kPI * x) * std::sin(kPI * x / radius) / (kPI * kPI * x * x);
}
float Radius() const { return radius; }
const float radius;
};
struct GaussianKernelFunc {
static constexpr float kRadiusMultiplier = 3.0f;
/**
* https://en.wikipedia.org/wiki/Gaussian_function
* We use sigma = 0.5, as suggested on p. 4 of Ken Turkowski's "Filters
* for Common Resampling Tasks" for kernels with a support of 3 pixels:
* www.realitypixels.com/turk/computergraphics/ResamplingFilters.pdf
* This implies a radius of 1.5 (radius = kRadiusMultiplier * sigma).
*/
explicit GaussianKernelFunc(float _radius = 1.5f) : radius(_radius), sigma(_radius / kRadiusMultiplier) {}
float operator()(float x) const {
x = std::abs(x);
if (x >= radius) {
return 0.0;
}
return std::exp(-x * x / (2.0 * sigma * sigma));
}
float Radius() const { return radius; }
const float radius;
// Gaussian standard deviation
const float sigma;
};
struct BoxKernelFunc {
float operator()(float x) const {
x = std::abs(x);
return x < 0.5f ? 1.0f : x == 0.5f ? 0.5f : 0.0f;
}
float Radius() const { return 1.f; }
};
struct TriangleKernelFunc {
// https://en.wikipedia.org/wiki/Triangle_function
float operator()(float x) const {
x = std::abs(x);
return x < 1.0f ? 1.0f - x : 0.0f;
}
float Radius() const { return 1.f; }
};
struct KeysCubicKernelFunc {
/**
* http://ieeexplore.ieee.org/document/1163711/
* R. G. Keys. Cubic convolution interpolation for digital image
* processing. IEEE Transactions on Acoustics, Speech, and Signal
* Processing, 29(6):1153-1160, 1981.
*/
float operator()(float x) const {
x = std::abs(x);
if (x >= 2.0f) {
return 0.0f;
} else if (x >= 1.0f) {
return ((-0.5f * x + 2.5f) * x - 4.0f) * x + 2.0f;
} else {
return ((1.5f * x - 2.5f) * x) * x + 1.0f;
}
}
float Radius() const { return 2.f; }
};
struct MitchellCubicKernelFunc {
/**
* https://doi.org/10.1145/378456.378514
* D. P. Mitchell and A. N. Netravali. Reconstruction filters in computer
* graphics. Computer Graphics (Proceedings of ACM SIGGRAPH 1988),
* 22(4):221-228, 1988.
*/
float operator()(float x) const {
x = std::abs(x);
if (x >= 2.0f) {
return 0.0f;
} else if (x >= 1.0f) {
return (((-7.0f / 18.0f) * x + 2.0f) * x - 10.0f / 3.0f) * x + 16.0f / 9.0f;
} else {
return (((7.0f / 6.0f) * x - 2.0f) * x) * x + 8.0f / 9.0f;
}
}
float Radius() const { return 2.f; }
};
inline LanczosKernelFunc CreateLanczos1Kernel() { return LanczosKernelFunc(1.0); }
inline LanczosKernelFunc CreateLanczos3Kernel() { return LanczosKernelFunc(3.0); }
inline LanczosKernelFunc CreateLanczos5Kernel() { return LanczosKernelFunc(5.0); }
inline GaussianKernelFunc CreateGaussianKernel() { return GaussianKernelFunc(1.5); }
inline BoxKernelFunc CreateBoxKernel() { return BoxKernelFunc(); }
inline TriangleKernelFunc CreateTriangleKernel() { return TriangleKernelFunc(); }
inline KeysCubicKernelFunc CreateKeysCubicKernel() { return KeysCubicKernelFunc(); }
inline MitchellCubicKernelFunc CreateMitchellCubicKernel() { return MitchellCubicKernelFunc(); }
} // namespace aicpu
#endif  // AICPU_UTILS_SAMPLING_KERNELS_H_
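The functors above are the building blocks of separable resampling: for each output sample the kernel is evaluated at every input sample inside its (possibly scaled) radius and the resulting weights are normalized. A minimal sketch of that weight computation, assuming a hypothetical ComputeSpanWeights helper that is not part of the header:
#include <algorithm>
#include <cmath>
#include <vector>

// Illustrative only: contribution weights of the input samples to one output
// sample centered at `center` (in input coordinates). `kernel_scale` stretches
// the kernel (> 1 when downscaling); `span_start` receives the first
// contributing input index.
template <typename KernelFunc>
std::vector<float> ComputeSpanWeights(const KernelFunc &kernel, float center, float kernel_scale, int input_size,
                                      int *span_start) {
  const float radius = kernel.Radius() * kernel_scale;
  const int start = std::max(0, static_cast<int>(std::ceil(center - radius - 0.5f)));
  const int end = std::min(input_size - 1, static_cast<int>(std::floor(center + radius - 0.5f)));
  std::vector<float> weights;
  float total = 0.0f;
  for (int i = start; i <= end; ++i) {
    // Input sample i covers [i, i + 1); its center sits at i + 0.5.
    const float w = kernel((i + 0.5f - center) / kernel_scale);
    weights.push_back(w);
    total += w;
  }
  // Normalize so that constant inputs are reproduced exactly.
  if (total > 0.0f) {
    for (float &w : weights) w /= total;
  }
  *span_start = start;
  return weights;
}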

View File

@ -81,8 +81,49 @@ const AnfNodePtr AICpuLibSelectPass::Process(const FuncGraphPtr &graph, const An
mindspore::kQuantileOpName,
mindspore::kSparseSegmentSqrtNOpName,
mindspore::kUnsortedSegmentProdOpName,
mindspore::kMulOpName,
mindspore::kExpOpName};
mindspore::kExpOpName,
mindspore::kMatrixTriangularSolveOpName,
mindspore::kMaximumGradGradOpName,
mindspore::kMaxPoolOpName,
mindspore::kMinimumGradGradOpName,
mindspore::kMulNoNanOpName,
mindspore::kMultilabelMarginLossGradOpName,
mindspore::kNthElementOpName,
mindspore::kNonMaxSuppressionWithOverlapsOpName,
mindspore::kOneHotOpName,
mindspore::kOrgqrOpName,
mindspore::kPackOpName,
mindspore::kParameterizedTruncatedNormalOpName,
mindspore::kPolarOpName,
mindspore::kPdistGradOpName,
mindspore::kRaggedRangeOpName,
mindspore::kRaggedTensorToSparseOpName,
mindspore::kRaggedTensorToTensorOpName,
mindspore::kReciprocalOpName,
mindspore::kReciprocalGradOpName,
mindspore::kReduceMeanOpName,
mindspore::kReduceProdOpName,
mindspore::kReluOpName,
mindspore::kReverseV2OpName,
mindspore::kRGBToHSVOpName,
mindspore::kRsqrtGradOpName,
mindspore::kSampleDistortedBoundingBoxExt2OpName,
mindspore::kScaleAndTranslateGradOpName,
mindspore::kScatterNdOpName,
mindspore::kScatterNdUpdateOpName,
mindspore::kSelectOpName,
mindspore::kSelfAdjointEigOpName,
mindspore::kSinOpName,
mindspore::kSincOpName,
mindspore::kSinhOpName,
mindspore::kSmoothL1LossGradV2OpName,
mindspore::kSmoothL1LossV2OpName,
mindspore::kSignOpName,
mindspore::kCheckNumericsOpName,
mindspore::kFloorDivOpName,
mindspore::kLog1pOpName,
mindspore::kMulOpName};
static const std::string kEnvOpSoNames = "mindspore_aicpu_kernels";
static const std::string kCpuKernelSoName = "mindspore_cpu_kernels";
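For context, the set above lists the AICPU ops migrated in this commit; a hedged sketch of how such a membership test could pick between the two shared objects is below. The SelectKernelSo helper, the example op names, and the direction of the mapping are assumptions for illustration, not the actual logic of AICpuLibSelectPass.
#include <set>
#include <string>

// Hedged sketch only: which library a listed op actually resolves to is an
// assumption here, not the verified behavior of AICpuLibSelectPass::Process.
std::string SelectKernelSo(const std::string &op_name) {
  static const std::set<std::string> kMigratedOps = {"ReduceMean", "ScatterNdUpdate", "RaggedTensorToSparse"};
  static const std::string kEnvOpSoNames = "mindspore_aicpu_kernels";
  static const std::string kCpuKernelSoName = "mindspore_cpu_kernels";
  return kMigratedOps.count(op_name) > 0 ? kCpuKernelSoName : kEnvOpSoNames;
}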

View File

@ -185,3 +185,38 @@ from .qr import _qr_aicpu
from .col2im import _col2im_aicpu
from .matrix_solve_ls import _matrix_solve_ls_aicpu
from .exp import _exp_aicpu
from .matrix_triangular_solve import _matrix_triangular_solve_aicpu
from .maximum_grad_grad import _maximum_grad_grad_aicpu
from .maxpool_v1 import _maxpool_v1_aicpu
from .minimum_grad_grad import _minimum_grad_grad_aicpu
from .mul_no_nan import _mul_no_nan_aicpu
from .multilabel_margin_loss_grad import _multilabel_margin_loss_grad_aicpu
from .nth_element import _nth_element_aicpu
from .non_max_suppression_with_overlaps import _non_max_suppression_with_overlaps_aicpu
from .one_hot import _one_hot_aicpu
from .orgqr import _orgqr_aicpu
from .parameterized_truncated_normal import _parameterized_truncated_normal_aicpu
from .polar import _polar_aicpu
from .pdist_grad import _pdist_grad_aicpu
from .ragged_range import _raggedrange_aicpu
from .ragged_tensor_to_sparse import _ragged_tensor_to_sparse_aicpu
from .ragged_tensor_to_tensor import _ragged_tensor_to_tensor_aicpu
from .reciprocal import _reciprocal_aicpu
from .reciprocal_grad import _reciprocal_grad_aicpu
from .reduce_mean import _reduce_mean_aicpu
from .reduce_prod import _reduce_prod_aicpu
from .relu_v3 import _relu_v3_aicpu
from .reversev2 import _reversev2_aicpu
from .rgb_to_hsv import _rgb_to_hsv_aicpu
from .rsqrt_grad import _rsqrt_grad_aicpu
from .sample_distorted_bounding_box_v2 import _sample_distorted_bounding_box_v2_aicpu
from .scale_and_translate_grad import _scale_and_translate_grad_aicpu
from .scatter_nd import _scatter_nd_aicpu
from .scatter_nd_update import _scatter_nd_update_aicpu
from .select import _select_aicpu
from .self_adjoint_eig import _self_adjoint_eig_aicpu
from .sin import _sin_aicpu
from .sinc import _sinc_aicpu
from .sinh import _sinh_aicpu
from .smooth_l1_loss_grad import _smooth_l1_loss_grad_aicpu
from .smooth_l1_loss import _smooth_l1_loss_aicpu