0103 aicpu migration first half

lilinjie 2022-12-30 17:20:58 +08:00
parent a023825aae
commit 540665dbbc
40 changed files with 4020 additions and 133 deletions

View File

@ -100,7 +100,8 @@
"mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/" "shadowVariable"
"mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/" "unsignedPositive"
"mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/" "zerodivcond"
"mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/" "redundantInitialization"
"mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/" "noConstructor"
"mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/" "noExplicitConstructor"
"mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/" "identicalConditionAfterEarlyExit"
"mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/" "uninitMemberVar"
"mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/" "redundantInitialization"

View File

@ -292,30 +292,15 @@ mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel
mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/scatter_nd_update.cc:aicpu::ScatterNdUpdateCpuKernel::Compute
mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/ragged_tensor_to_sparse.cc:aicpu::RaggedTensorToSparseCpuKernel::Compute
mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/max_unpool_3d_grad.cc:aicpu::MaxUnpool3DGradCpuKernel::MaxUnpool3DGradCompute
mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/reduce_mean.cc:aicpu::ReduceMeanCpuKernel::ReduceMeanCompute
mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/reduce_mean.cc:aicpu::ReduceMeanCpuKernel::ReduceMeanCompute_Complex
mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/ragged_tensor_to_tensor.cc:aicpu::RaggedTensorToTensorCpuKernel::Compute
mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/segment_mean.cc:aicpu::SegmentMeanCpuKernel::SegmentMeanCompute
mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/segment_mean.cc:aicpu::SegmentMeanCpuKernel::SegmentMeanCompute_Complex
mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/sample_distorted_bounding_box_ext2.cc:aicpu::SDBBExt2CpuKernel::GenerateRandomCrop
mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/sample_distorted_bounding_box_ext2.cc:aicpu::SDBBExt2CpuKernel::SDBBExt2Compute
mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/max_unpool_3d.cc:aicpu::MaxUnpool3DCpuKernel::MaxUnpool3DCompute
mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/segment_prod.cc:aicpu::SegmentProdCpuKernel::SegmentProdCompute
mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/segment_prod.cc:aicpu::SegmentProdCpuKernel::SegmentProdCompute_Complex
mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/maxpool_grad.cc:aicpu::SpatialMaxPoolWithArgMaxHelper
mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/reduce_prod.cc:aicpu::ReduceProdCpuKernel::ReduceProdCompute
mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/reduce_prod.cc:aicpu::ReduceProdCpuKernel::ReduceProdCompute_Complex
mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/scatter_nd.cc:aicpu::ScatterNdCpuKernel::Compute
mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/parameterized_truncated_normal.cc:aicpu::Generate
mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/multi_margin_loss.cc:aicpu::MultiMarginLossCpuKernel::MultiMarginLossCompute
mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/multi_margin_loss.cc:aicpu::MultiMarginLossCpuKernel::MultiMarginLossComputeFP
mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/max_unpool_3d_grad.cc:aicpu::MaxUnpool3DGradCpuKernel::MaxUnpool3DGradCompute
mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/maxpool.cc:aicpu::SpacialMaxPool
mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/reduce_mean.cc:aicpu::ReduceMeanCpuKernel::ReduceMeanCompute
mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/reduce_mean.cc:aicpu::ReduceMeanCpuKernel::ReduceMeanCompute_Complex
mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/segment_mean.cc:aicpu::SegmentMeanCpuKernel::SegmentMeanCompute
mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/segment_mean.cc:aicpu::SegmentMeanCpuKernel::SegmentMeanCompute_Complex
mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/sample_distorted_bounding_box_ext2.cc:aicpu::SDBBExt2CpuKernel::SDBBExt2Compute
mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/max_unpool_3d.cc:aicpu::MaxUnpool3DCpuKernel::MaxUnpool3DCompute
mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/slice.cc:aicpu::SliceCpuKernel::SliceCompute
mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/segment_prod.cc:aicpu::SegmentProdCpuKernel::SegmentProdCompute
@ -323,11 +308,26 @@ mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel
mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/maxpool_grad.cc:aicpu::SpatialMaxPoolWithArgMaxHelper
mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/reduce_prod.cc:aicpu::ReduceProdCpuKernel::ReduceProdCompute
mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/reduce_prod.cc:aicpu::ReduceProdCpuKernel::ReduceProdCompute_Complex
mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/multi_margin_loss_grad.cc:aicpu::MultiMarginLossGradCpuKernel::MultiMarginLossGradC
mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/parameterized_truncated_normal.cc:aicpu::Generate
mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/multi_margin_loss.cc:aicpu::MultiMarginLossCpuKernel::MultiMarginLossCompute
mindspore/mindspore/ccsrc/plugin/device/ascend/optimizer/mindir/aicpu_lib_select.cc:mindspore::opt::AICpuLibSelectPass::Process
mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/multi_margin_loss_grad.cc:aicpu::MultiMarginLossGradCpuKernel::MultiMarginLossGradComputeFP16
mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/multi_margin_loss_grad.cc:aicpu::MultiMarginLossGradCpuKernel::MultiMarginLossGradCompute
mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/multi_margin_loss.cc:aicpu::MultiMarginLossCpuKernel::MultiMarginLossComputeFP16
mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/matrix_band_part.cc:aicpu::MatrixBandPartCpuKernel::BandCompute
mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/minimum.cc:aicpu::MinimumCpuKernel::SpecialComputeSameShape
mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/minimum.cc:aicpu::MinimumCpuKernel::SpecialComputeXOneElement
mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/minimum.cc:aicpu::MinimumCpuKernel::SpecialComputeYOneElement
mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/minimum.cc:aicpu::MinimumCpuKernel::BcastComputeMultiKernel
mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/minimum.cc:aicpu::MinimumCpuKernel::BcastComputeOneKernel
mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/maximum.cc:aicpu::MaximumCpuKernel::SpecialComputeSameShape
mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/maximum.cc:aicpu::MaximumCpuKernel::SpecialComputeXOneElement
mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/maximum.cc:aicpu::MaximumCpuKernel::SpecialComputeYOneElement
mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/maximum.cc:aicpu::MaximumCpuKernel::BcastComputeMultiKernel
mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/maximum.cc:aicpu::MaximumCpuKernel::BcastComputeOneKernel
mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/lu_unpack.cc:aicpu::LuUnpackCpuKernel::Compute
mindspore/mindspore/ccsrc/plugin/device/ascend/optimizer/mindir/aicpu_lib_select.cc:mindspore::opt::AICpuLibSelectPass::Process
mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/fractional_max_pool_grad.cc:aicpu::FractionalMaxPoolGradCpuKernel::DoCompute
mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/fractional_avg_pool_grad.cc:aicpu::FractionalAvgPoolGradCpuKernel::DoCompute
mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/fractional_max_pool.cc:aicpu::FractionalMaxPoolCpuKernel::DoCompute
mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/fractional_avg_pool.cc:aicpu::FractionalAvgPoolCpuKernel::DoCompute
mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/densetosparsesetoperation.cc:aicpu::DenseToSparseSetOperationCpuKernel::ComputeDenseToSparse

View File

@ -226,6 +226,7 @@ constexpr auto kCumulativeLogsumexpOpName = "CumulativeLogsumexp";
constexpr auto kCumulativeLogsumexpDOpName = "CumulativeLogsumexpD";
constexpr auto kDataFormatVecPermuteOpName = "DataFormatVecPermute";
constexpr auto kDeadNodeName = "DeadNode";
constexpr auto kDenseToCSRSparseMatrixOpName = "DenseToCSRSparseMatrix";
constexpr auto kDenseToDenseSetOperation = "DenseToDenseSetOperation";
constexpr auto kDenseToSparseSetOperation = "DenseToSparseSetOperation";
constexpr auto kDepthwiseConv2dNativeBackpropFilterOpName = "DepthwiseConv2dNativeBackpropFilter";
@ -242,7 +243,9 @@ constexpr auto kDiagPartOpName = "DiagPart";
constexpr auto kDiagPartDOpName = "DiagPartD";
constexpr auto kDiagOpName = "Diag";
constexpr auto kDiagDOpName = "DiagD";
constexpr auto kDiagonalOpName = "Diagonal";
constexpr auto kDivOpName = "Div";
constexpr auto kDivNoNanOpName = "DivNoNan";
constexpr auto kDropoutDoMaskOpName = "DropoutDoMask";
constexpr auto kDropOutDoMaskOpName = "DropOutDoMask";
constexpr auto kDropoutDoMaskV3OpName = "DropoutDoMaskV3";
@ -257,6 +260,7 @@ constexpr auto kDynamicAtomicAddrCleanOpName = "DynamicAtomicAddrClean";
constexpr auto kDynamicGRUV2OpName = "DynamicGRUV2";
constexpr auto kDynamicRNNOpName = "DynamicRNN";
constexpr auto kDynamicStitchOpName = "DynamicStitch";
constexpr auto kEigOpName = "Eig";
constexpr auto kEmbeddingLookupCommGradOpName = "EmbeddingLookupCommGrad";
constexpr auto kEmbeddingLookupOpName = "EmbeddingLookup";
constexpr auto kEmbeddingLookupProxyOpName = "EmbeddingLookupProxy";
@ -293,7 +297,12 @@ constexpr auto kFive2FourOpName = "Five2Four";
constexpr auto kFlattenGradOpName = "FlattenGrad";
constexpr auto kFloorDivOpName = "FloorDiv";
constexpr auto kFour2FiveOpName = "Four2Five";
constexpr auto kFractionalAvgPoolOpName = "FractionalAvgPool";
constexpr auto kFractionalAvgPoolGradOpName = "FractionalAvgPoolGrad";
constexpr auto kFractionalMaxPoolOpName = "FractionalMaxPool";
constexpr auto kFractionalMaxPoolGradOpName = "FractionalMaxPoolGrad";
constexpr auto kFractionalMaxPoolGradWithFixedKsizeOpName = "FractionalMaxPoolGradWithFixedKsize";
constexpr auto kFractionalMaxPoolWithFixedKsizeOpName = "FractionalMaxPoolWithFixedKsize";
constexpr auto kFusedAdaFactorName = "FusedAdaFactor";
constexpr auto kFusedAdaFactorWithGlobalNormName = "FusedAdaFactorWithGlobalNorm";
constexpr auto kFusedAdamName = "FusedAdam";
@ -327,10 +336,12 @@ constexpr auto kGatherOpName = "Gather";
constexpr auto kGatherNdOpName = "GatherNd";
constexpr auto kGatherV2OpName = "GatherV2";
constexpr auto kGatherV2DOpName = "GatherV2D";
constexpr auto kGcdOpName = "Gcd";
constexpr auto kGeLUOpName = "GeLU";
constexpr auto kGeluOpName = "Gelu";
constexpr auto kGeLUGradOpName = "GeLUGrad";
constexpr auto kGeluGradOpName = "GeluGrad";
constexpr auto kGeqrfOpName = "Geqrf";
constexpr auto kGetNextOpName = "GetNext";
constexpr auto kGreaterEqualOpName = "GreaterEqual";
constexpr auto kGreaterOpName = "Greater";
@ -346,13 +357,21 @@ constexpr auto kHSigmoidOpName = "HSigmoid";
constexpr auto kHardSigmoidOpName = "HardSigmoid";
constexpr auto kHSigmoidGradOpName = "HSigmoidGrad";
constexpr auto kHardSigmoidGradOpName = "HardSigmoidGrad";
constexpr auto kHSVToRGBOpName = "HSVToRGB";
constexpr auto kHSwishOpName = "HSwish";
constexpr auto kHardSwishOpName = "HardSwish";
constexpr auto kHistogramDOpName = "HistogramD";
constexpr auto kHSwishGradOpName = "HSwishGrad";
constexpr auto kHardSwishGradOpName = "HardSwishGrad";
constexpr auto kHeavisideOpName = "Heaviside";
constexpr auto kHostAllGatherOpName = "HostAllGather";
constexpr auto kHostReduceScatterOpName = "HostReduceScatter";
constexpr auto kHypotOpName = "Hypot";
constexpr auto kIdentityNOpName = "IdentityN";
constexpr auto kIgammaOpName = "Igamma";
constexpr auto kIgammacOpName = "Igammac";
constexpr auto kIgammaGradAOpName = "IgammaGradA";
constexpr auto kIndexFillOpName = "IndexFill";
constexpr auto kInitDatasetQueueOpName = "InitDataSetQueue";
constexpr auto kIOUOpName = "IOU";
constexpr auto kIouOpName = "Iou";
@ -369,7 +388,6 @@ constexpr auto kInstanceNormV2OpName = "InstanceNormV2";
constexpr auto kInstanceNormV2GradOpName = "InstanceNormV2Grad";
constexpr auto kInTopKOpName = "InTopK";
constexpr auto kInTopKDOpName = "InTopKD";
constexpr auto kIsInfOpName = "IsInf";
constexpr auto kIsNanOpName = "IsNan";
constexpr auto kKLDivLossOpName = "KLDivLoss";
constexpr auto kKLDivOpName = "KLDiv";
@ -395,6 +413,7 @@ constexpr auto kLayerNormBetaGammaBackpropV2OpName = "LayerNormBetaGammaBackprop
constexpr auto kLayerNormGradOpName = "LayerNormGrad";
constexpr auto kLayerNormXBackpropOpName = "LayerNormXBackprop";
constexpr auto kLayerNormXBackpropV2OpName = "LayerNormXBackpropV2";
constexpr auto kLcmOpName = "Lcm";
constexpr auto kLessEqualOpName = "LessEqual";
constexpr auto kLessOpName = "Less";
constexpr auto kLinSpaceOpName = "LinSpace";
@ -403,14 +422,21 @@ constexpr auto kListDiffOpName = "ListDiff";
constexpr auto kLogMatrixDeterminantOpName = "LogMatrixDeterminant";
constexpr auto kLogOpName = "Log";
constexpr auto kLog1pOpName = "Log1p";
constexpr auto kLogicalXorOpName = "LogicalXor";
constexpr auto kLogitOpName = "Logit";
constexpr auto kLogitGradOpName = "LogitGrad";
constexpr auto kLogNormalReverseOpName = "LogNormalReverse";
constexpr auto kLogSoftmaxOpName = "LogSoftmax";
constexpr auto kLogSoftmaxV2OpName = "LogSoftmaxV2";
constexpr auto kLogSoftmaxGradOpName = "LogSoftmaxGrad";
constexpr auto kLowerBoundOpName = "LowerBound";
constexpr auto kLpNormOpName = "LpNorm";
constexpr auto kLSTMGradOpName = "LSTMGrad";
constexpr auto kLSTMInputGradOpName = "LSTMInputGrad";
constexpr auto kLSTMOpName = "LSTM";
constexpr auto kLstsqOpName = "Lstsq";
constexpr auto kLuUnpackOpName = "LuUnpack";
constexpr auto kLuUnpackGradOpName = "LuUnpackGrad";
constexpr auto kMaskedFillOpName = "MaskedFill";
constexpr auto kMaskedSelectOpName = "MaskedSelect";
constexpr auto kMaskedSelectGradOpName = "MaskedSelectGrad";
@ -423,6 +449,7 @@ constexpr auto kMatrixDiagDOpName = "MatrixDiagD";
constexpr auto kMatrixDiagPartOpName = "MatrixDiagPart";
constexpr auto kMatrixDiagPartDOpName = "MatrixDiagPartD";
constexpr auto kMatrixDiagPartV3OpName = "MatrixDiagPartV3";
constexpr auto kMatrixExpOpName = "MatrixExp";
constexpr auto kMatrixLogarithmOpName = "MatrixLogarithm";
constexpr auto kMatrixSetDiagOpName = "MatrixSetDiag";
constexpr auto kMatrixSetDiagDOpName = "MatrixSetDiagD";

View File

@ -0,0 +1,150 @@
/**
* Copyright (c) Huawei Technologies Co., Ltd. 2022-2022. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "dense_to_csr_sparse_matrix.h"
#include <complex>
#include <numeric>
#include "cpu_kernel_utils.h"
#include "utils/kernel_util.h"
namespace {
const uint32_t kInputNum = 2;
const uint32_t kOutputNum = 5;
const char *DenseToCSRSparseMatrix = "DenseToCSRSparseMatrix";
} // namespace
namespace aicpu {
uint32_t DenseToCSRSparseMatrixCpuKernel::Compute(CpuKernelContext &ctx) {
KERNEL_HANDLE_ERROR(NormalCheck(ctx, kInputNum, kOutputNum), "DenseToCSRSparseMatrix normal check failed.");
DataType value_type = ctx.Input(0)->GetDataType();
DataType indice_type = ctx.Input(1)->GetDataType();
uint32_t status;
switch (indice_type) {
case DT_INT32:
switch (value_type) {
case DT_FLOAT:
status = ComputeKernel<int32_t, float>(ctx);
break;
case DT_DOUBLE:
status = ComputeKernel<int32_t, double>(ctx);
break;
case DT_COMPLEX64:
status = ComputeKernel<int32_t, std::complex<float>>(ctx);
break;
case DT_COMPLEX128:
status = ComputeKernel<int32_t, std::complex<double>>(ctx);
break;
default:
KERNEL_LOG_ERROR("DenseToCSRSparseMatrix value type [%s] not support.", DTypeStr(value_type).c_str());
return KERNEL_STATUS_PARAM_INVALID;
}
break;
case DT_INT64:
switch (value_type) {
case DT_FLOAT:
status = ComputeKernel<int64_t, float>(ctx);
break;
case DT_DOUBLE:
status = ComputeKernel<int64_t, double>(ctx);
break;
case DT_COMPLEX64:
status = ComputeKernel<int64_t, std::complex<float>>(ctx);
break;
case DT_COMPLEX128:
status = ComputeKernel<int64_t, std::complex<double>>(ctx);
break;
default:
KERNEL_LOG_ERROR("DenseToCSRSparseMatrix value type [%s] not support.", DTypeStr(value_type).c_str());
return KERNEL_STATUS_PARAM_INVALID;
}
break;
default:
KERNEL_LOG_ERROR("DenseToCSRSparseMatrix indices type [%s] not support.", DTypeStr(indice_type).c_str());
return KERNEL_STATUS_PARAM_INVALID;
}
KERNEL_HANDLE_ERROR(status, "DenseToCSRSparseMatrix kernel compute failed.");
return KERNEL_STATUS_OK;
}
REGISTER_CPU_KERNEL(DenseToCSRSparseMatrix, DenseToCSRSparseMatrixCpuKernel);
template <typename indiceT, typename valueT>
uint32_t DenseToCSRSparseMatrixCpuKernel::ComputeKernel(CpuKernelContext &ctx) {
auto dense_input_ptr = reinterpret_cast<valueT *>(ctx.Input(0)->GetData());
auto indices_ptr = reinterpret_cast<indiceT *>(ctx.Input(1)->GetData());
auto y_dense_shape_ptr = reinterpret_cast<indiceT *>(ctx.Output(0)->GetData());
auto y_batch_pointers_ptr = reinterpret_cast<indiceT *>(ctx.Output(1)->GetData());
auto y_row_pointers_ptr = reinterpret_cast<indiceT *>(ctx.Output(2)->GetData());
auto y_col_indices_ptr = reinterpret_cast<indiceT *>(ctx.Output(3)->GetData());
auto y_values_ptr = reinterpret_cast<valueT *>(ctx.Output(4)->GetData());
// Copy the CSRSparseMatrix's dense_shape and values from the dense input.
const int64_t rank = ctx.Input(1)->GetTensorShape()->GetDimSize(1);
const int64_t total_nnz = ctx.Input(1)->GetTensorShape()->GetDimSize(0);
const int64_t batch_size = (rank == 2) ? 1 : ctx.Input(0)->GetTensorShape()->GetDimSize(0);
const int64_t num_rows = ctx.Input(0)->GetTensorShape()->GetDimSize((rank == 2) ? 0 : 1);
const int64_t num_cols = ctx.Input(0)->GetTensorShape()->GetDimSize((rank == 2) ? 1 : 2);
for (int64_t i = 0; i < rank; i++) {
y_dense_shape_ptr[i] = ctx.Input(0)->GetTensorShape()->GetDimSize(i);
}
for (int64_t i = 0; i < total_nnz; i++) {
if (rank == 2) {
int64_t cur_idx = indices_ptr[i * rank] * num_cols + indices_ptr[i * rank + 1];
y_values_ptr[i] = dense_input_ptr[cur_idx];
} else {
int64_t cur_idx = indices_ptr[i * rank] * num_rows * num_cols;
cur_idx = cur_idx + indices_ptr[i * rank + 1] * num_cols + indices_ptr[i * rank + 2];
y_values_ptr[i] = dense_input_ptr[cur_idx];
}
}
for (int64_t i = 0; i < batch_size * (num_rows + 1); i++) {
y_row_pointers_ptr[i] = 0;
}
int prev_batch = -1;
if (rank == 2) {
// For a single batch, the batch_ptrs are {0, total_nnz}.
y_batch_pointers_ptr[0] = 0;
++prev_batch;
for (int64_t i = 0; i < total_nnz; ++i) {
// For now, the row pointers store the corresponding row counts.
y_row_pointers_ptr[indices_ptr[i * rank] + 1] += 1;
y_col_indices_ptr[i] = indices_ptr[i * rank + 1];
}
} else { // rank == 3
for (int64_t i = 0; i < total_nnz; ++i) {
const int cur_batch = indices_ptr[i * rank];
// For now, the row pointers store the corresponding row counts.
y_row_pointers_ptr[cur_batch * (num_rows + 1) + indices_ptr[i * rank + 1] + 1] += 1;
y_col_indices_ptr[i] = indices_ptr[i * rank + 2];
// We're at a new batch and might have skipped over empty batches.
while (prev_batch < cur_batch) {
// The previous batch ends at position i.
y_batch_pointers_ptr[prev_batch + 1] = i;
++prev_batch;
}
}
}
// Set the last element of batch_ptr and account for trailing empty batches.
while (prev_batch < batch_size) {
y_batch_pointers_ptr[prev_batch + 1] = total_nnz;
++prev_batch;
}
// Compute the cumulative row counts for each batch.
for (int batch_idx = 0; batch_idx < batch_size; ++batch_idx) {
auto *row_ptr_batch = y_row_pointers_ptr + batch_idx * (num_rows + 1);
std::partial_sum(row_ptr_batch, row_ptr_batch + num_rows + 1, row_ptr_batch);
}
return KERNEL_STATUS_OK;
}
} // namespace aicpu
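Illustrative only, not part of the commit: the core of ComputeKernel above is a two-pass CSR construction, where the row pointers first accumulate per-row nonzero counts and std::partial_sum then turns the counts into offsets. A minimal standalone sketch of that idea for the rank-2 case, with hypothetical container names:
#include <cstdint>
#include <numeric>
#include <utility>
#include <vector>
int main() {
// 3 x 4 dense matrix with nonzeros at (0, 1), (2, 0) and (2, 3).
const int64_t num_rows = 3;
const std::vector<std::pair<int64_t, int64_t>> indices = {{0, 1}, {2, 0}, {2, 3}};
std::vector<int64_t> row_ptrs(num_rows + 1, 0);
std::vector<int64_t> col_inds(indices.size());
// First pass: row_ptrs[r + 1] accumulates the nonzero count of row r.
for (size_t i = 0; i < indices.size(); ++i) {
row_ptrs[indices[i].first + 1] += 1;
col_inds[i] = indices[i].second;
}
// Second pass: the prefix sum turns the counts into CSR offsets {0, 1, 1, 3}.
std::partial_sum(row_ptrs.begin(), row_ptrs.end(), row_ptrs.begin());
return 0;
}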

View File

@ -0,0 +1,35 @@
/**
* Copyright (c) Huawei Technologies Co., Ltd. 2022-2022. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef AICPU_KERNELS_NORMALIZED_DENSE_TO_CSR_SPARSE_MATRIX_H_
#define AICPU_KERNELS_NORMALIZED_DENSE_TO_CSR_SPARSE_MATRIX_H_
#include "cpu_ops_kernel.h"
namespace aicpu {
class DenseToCSRSparseMatrixCpuKernel : public CpuKernel {
public:
~DenseToCSRSparseMatrixCpuKernel() = default;
protected:
uint32_t Compute(CpuKernelContext &ctx) override;
private:
template <typename indiceT, typename valueT>
uint32_t ComputeKernel(CpuKernelContext &ctx);
};
} // namespace aicpu
#endif

View File

@ -0,0 +1,429 @@
/**
* Copyright (c) Huawei Technologies Co., Ltd. 2022. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "densetosparsesetoperation.h"
#include <algorithm>
#include <atomic>
#include <mutex>
#include <numeric>
#include <set>
#include <string>
#include <vector>
#include "cpu_kernel_utils.h"
#include "utils/allocator_utils.h"
#include "utils/eigen_tensor.h"
#include "utils/kernel_util.h"
#include "kernel_log.h"
#include "status.h"
namespace {
const char *kDenseToSparseSetOperation = "DenseToSparseSetOperation";
const uint32_t kOutputNum = 3;
const uint32_t kInputNum = 4;
constexpr int64_t kIndex0 = 0;
constexpr int64_t kIndex1 = 1;
constexpr int64_t kIndex2 = 2;
constexpr int64_t kIndex3 = 3;
const int64_t kParallelNum{64};
} // namespace
// Define the aicpu namespace
namespace aicpu {
const std::vector<int64_t> Strides(const std::vector<int64_t> &shape) {
std::vector<int64_t> result(shape.size());
int64_t product = 1;
for (int64_t i = static_cast<int64_t>(shape.size()) - 1; i >= 0; --i) {
result[i] = product;
product *= shape[i];
}
return result;
}
uint32_t GroupShape(const std::vector<int64_t> input_shape, std::vector<int64_t> &grouped_shape) {
if (input_shape.size() < 2) {
return KERNEL_STATUS_PARAM_INVALID;
}
// grouped_shape is input_shape[:-1]
grouped_shape.assign(input_shape.begin(), input_shape.end() - 1);
return KERNEL_STATUS_OK;
}
uint32_t CheckShapesMatch(const std::vector<int64_t> &shape1, const std::vector<int64_t> &shape2) {
if (shape1.size() != shape2.size()) {
return KERNEL_STATUS_PARAM_INVALID;
}
for (size_t i = 0; i < shape1.size(); i++) {
if (shape1[i] != shape2[i]) return KERNEL_STATUS_PARAM_INVALID;
}
return KERNEL_STATUS_OK;
}
uint32_t GroupShapeFromInputs(const std::vector<int64_t> &shape1, const std::vector<int64_t> &shape2,
std::vector<int64_t> &group_shape) {
std::vector<int64_t> group_shape_1;
KERNEL_HANDLE_ERROR(GroupShape(shape1, group_shape_1), "X1_Shape rank is less than 2.");
std::vector<int64_t> group_shape_2;
KERNEL_HANDLE_ERROR(GroupShape(shape2, group_shape_2), "X2_Shape rank is less than 2.");
KERNEL_HANDLE_ERROR(CheckShapesMatch(group_shape_1, group_shape_2), "The group shapes of the two inputs do not match.");
group_shape.assign(group_shape_1.begin(), group_shape_1.end());
return KERNEL_STATUS_OK;
}
uint32_t GetNumElements(const std::vector<int64_t> input_shape, int64_t &res) {
int64_t result = 1;
for (uint32_t i = 0; i < input_shape.size(); i++) {
KERNEL_CHECK_FALSE(MulWithoutOverflow(input_shape[i], result, result), KERNEL_STATUS_PARAM_INVALID,
"Overflow when calculate shape size.");
}
res = result;
return KERNEL_STATUS_OK;
}
void DenseToSparseSetOperationCpuKernel::PopulateGroupIndices(const int64_t flat_group_index,
const std::vector<int64_t> &group_shape,
std::vector<int64_t> &group_indices) {
group_indices.clear();
int64_t running_flat_group_index = flat_group_index;
for (int64_t group_dim_index = static_cast<int64_t>(group_shape.size()) - 1; group_dim_index >= 0;
--group_dim_index) {
const auto group_dim = group_shape[group_dim_index];
group_indices.insert(group_indices.begin(), running_flat_group_index % group_dim);
running_flat_group_index /= group_dim;
}
}
template <typename T>
uint32_t DenseToSparseSetOperationCpuKernel::PopulateFromDenseGroup(Tensor *input_tensor,
const std::vector<int64_t> &input_strides,
const std::vector<int64_t> &group_indices,
std::set<T> &result) {
result.clear();
EigenTensor input_tensor_eigen(input_tensor, input_tensor->GetData());
auto input_flat = input_tensor_eigen.flat<T>();
const auto start = std::inner_product(group_indices.begin(), group_indices.end(), input_strides.begin(), 0LL);
auto input_shape = input_tensor->GetTensorShape();
const auto end = start + input_shape->GetDimSize(input_shape->GetDims() - 1);
for (int64_t i = start; i < end; ++i) {
result.insert(input_flat(i));
}
return KERNEL_STATUS_OK;
}
template <typename T>
uint32_t DenseToSparseSetOperationCpuKernel::PopulateFromSparseGroup(const Group &group,
const std::vector<int64_t> &sparse_tensor_shape,
std::set<T> &result) {
KERNEL_HANDLE_ERROR(CheckGroup<T>(group, sparse_tensor_shape), "PopulateFromSparseGroup check error.");
result.clear();
const auto &group_values = group.values<T>();
for (int64_t i = 0; i < group_values.size(); ++i) {
result.insert(group_values(i));
}
return KERNEL_STATUS_OK;
}
template <typename T>
uint32_t DenseToSparseSetOperationCpuKernel::CheckGroup(const Group &group,
const std::vector<int64_t> &sparse_tensor_shape) {
const auto &indices = group.indices();
const auto &values = group.values<T>();
const auto num_values = values.dimension(0);
// Sanity check: valid indices.
const uint32_t expected_rank = sparse_tensor_shape.size();
for (uint32_t j = 0; j < expected_rank; ++j) {
const auto dim_size = sparse_tensor_shape[j];
KERNEL_CHECK_FALSE(dim_size > 0, KERNEL_STATUS_PARAM_INVALID, "Invalid dim_size [%d] for index [%d]", dim_size, j);
for (int64_t i = 0; i < num_values; ++i) {
const auto index = indices(i, j);
KERNEL_CHECK_FALSE(dim_size > index, KERNEL_STATUS_PARAM_INVALID,
"indices index ([%d],[%d]) expected < [%d], got [%d].", i, j, dim_size, index);
}
}
return KERNEL_STATUS_OK;
}
template <typename T>
void DenseToSparseSetOperationCpuKernel::ApplySetOperation(const std::set<T> &set1, const std::set<T> &set2,
std::set<T> &result, SetOperation set_operation_) {
switch (set_operation_) {
case A_MINUS_B:
std::set_difference(set1.begin(), set1.end(), set2.begin(), set2.end(), std::inserter(result, result.begin()));
break;
case B_MINUS_A:
std::set_difference(set2.begin(), set2.end(), set1.begin(), set1.end(), std::inserter(result, result.begin()));
break;
case INTERSECTION:
std::set_intersection(set1.begin(), set1.end(), set2.begin(), set2.end(), std::inserter(result, result.begin()));
break;
case UNION:
std::set_union(set1.begin(), set1.end(), set2.begin(), set2.end(), std::inserter(result, result.begin()));
break;
}
}
template <typename T>
uint32_t DenseToSparseSetOperationCpuKernel::OutputSparseTensor(
DataBank &databank, const std::vector<int64_t> &output_shape, const int64_t num_values,
const std::map<std::vector<int64_t>, std::set<T>> &sets) {
Tensor *out_indices, *out_values, *out_shape;
out_indices = databank.result_indices;
out_values = databank.result_values;
out_shape = databank.result_shape;
EigenTensor out_indices_t(out_indices, out_indices->GetData());
auto out_indices_mat = out_indices_t.matrix<int64_t>();
EigenTensor out_values_t(out_values, out_values->GetData());
auto out_values_flat = out_values_t.vec<T>();
EigenTensor out_shape_t(out_shape, out_shape->GetData());
auto out_shape_flat = out_shape_t.vec<int64_t>();
int64_t value_index = 0;
for (auto it = sets.begin(); it != sets.end(); ++it) {
const auto &group_indices = it->first;
KERNEL_CHECK_FALSE(group_indices.size() == output_shape.size() - 1, KERNEL_STATUS_PARAM_INVALID,
"Invalid number of indices [%d] expected [%].", group_indices.size(), output_shape.size() - 1)
const auto &set = it->second;
// For each set item, write its indices and value to output tensors.
int64_t group_value_index = 0;
for (auto value = set.begin(); value != set.end(); ++value, ++value_index, ++group_value_index) {
// First n-1 dimensions are the group, last dimension is the position in
// the set.
for (uint32_t i = 0; i < group_indices.size(); ++i) {
out_indices_mat(value_index, i) = group_indices[i];
}
out_indices_mat(value_index, group_indices.size()) = group_value_index;
out_values_flat(value_index) = *value;
}
}
for (uint32_t i = 0; i < output_shape.size(); ++i) {
out_shape_flat(i) = output_shape[i];
}
out_indices->GetTensorShape()->SetDimSizes({num_values, static_cast<int64_t>(output_shape.size())});
out_values->GetTensorShape()->SetDimSizes({num_values});
out_shape->GetTensorShape()->SetDimSizes({static_cast<int64_t>(output_shape.size())});
return KERNEL_STATUS_OK;
}
uint32_t DenseToSparseSetOperationCpuKernel::NullptrAndMatVecCheck(CpuKernelContext &ctx, DataBank &databank) {
databank.set1 = ctx.Input(kIndex0);
databank.set2_indices = ctx.Input(kIndex1);
databank.set2_values = ctx.Input(kIndex2);
databank.set2_shape = ctx.Input(kIndex3);
databank.result_indices = ctx.Output(kIndex0);
databank.result_values = ctx.Output(kIndex1);
databank.result_shape = ctx.Output(kIndex2);
databank.ctx = &ctx;
AttrValue *validate_indices = ctx.GetAttr("validate_indices");
if (validate_indices == nullptr) {
databank.validate_indices_ = true;
} else {
databank.validate_indices_ = validate_indices->GetBool();
}
AttrValue *set_operation = ctx.GetAttr("set_operation");
KERNEL_CHECK_NULLPTR(set_operation, KERNEL_STATUS_PARAM_INVALID, "Missing set_operation.")
std::string set_operation_str = set_operation->GetString();
std::transform(set_operation_str.begin(), set_operation_str.end(), set_operation_str.begin(), ::tolower);
if ("a-b" == set_operation_str) {
databank.set_operation_ = A_MINUS_B;
} else if ("b-a" == set_operation_str) {
databank.set_operation_ = B_MINUS_A;
} else if ("intersection" == set_operation_str) {
databank.set_operation_ = INTERSECTION;
} else if ("union" == set_operation_str) {
databank.set_operation_ = UNION;
} else {
KERNEL_LOG_ERROR("Invalid set_operation.");
return KERNEL_STATUS_PARAM_INVALID;
}
return KERNEL_STATUS_OK;
}
template <typename T>
uint32_t DenseToSparseSetOperationCpuKernel::ComputeDenseToSparse(DataBank &databank) {
EigenTensor set2_shape_e(databank.set2_shape, databank.set2_shape->GetData());
auto set2_shape = set2_shape_e.vec<int64_t>();
std::vector<int64_t> shape2(set2_shape.size());
for (int64_t i = 0; i < set2_shape.size(); ++i) {
shape2[i] = set2_shape(i);
}
const auto rank = shape2.size();
std::vector<int64_t> order(rank);
std::iota(order.begin(), order.end(), 0);
SparseTensor set2;
Tensor *set1_t = databank.set1;
SparseTensor *set2_st = &set2;
KERNEL_HANDLE_ERROR(set2_st->CreateSparseTensor(databank.set2_indices, databank.set2_values, shape2, order),
"create sparse tenser fail.");
if (databank.validate_indices_) {
KERNEL_HANDLE_ERROR(set2_st->IndicesValid(*databank.ctx), "IndicesValid failed.");
}
std::vector<int64_t> group_shape;
const auto shape1 = set1_t->GetTensorShape()->GetDimSizes();
KERNEL_HANDLE_ERROR(GroupShapeFromInputs(shape1, shape2, group_shape), "GroupShapeFromInputs error.");
const std::vector<int64_t> set1_strides = Strides(shape1);
std::map<std::vector<int64_t>, std::set<T>> group_sets;
int64_t num_result_values = 0;
int64_t max_set_size = 0;
int64_t num_elements;
KERNEL_HANDLE_ERROR(GetNumElements(group_shape, num_elements), "NumElements error.");
if (num_elements <= kParallelNum) {
std::set<T> set1_group_set;
std::set<T> set2_group_set;
const std::vector<int64_t> subspan(order.begin(), order.end() - 1);
auto set2_grouper = set2_st->group(subspan);
auto set2_group_it = set2_grouper.begin();
std::vector<int64_t> group_indices;
for (int64_t flat_group_index = 0; flat_group_index < num_elements; ++flat_group_index) {
PopulateGroupIndices(flat_group_index, group_shape, group_indices);
// Get values from set1.
PopulateFromDenseGroup<T>(set1_t, set1_strides, group_indices, set1_group_set);
// Get values from set2, if applicable.
set2_group_set.clear();
if (set2_group_it != set2_grouper.end()) {
const auto &group = *set2_group_it;
const auto set2_group_indices = group.group();
bool group_match = true;
for (uint32_t i = 0; group_match && (i < set2_group_indices.size()); ++i) {
if (set2_group_indices[i] != group_indices[i]) {
group_match = false;
}
}
if (group_match) {
KERNEL_HANDLE_ERROR(PopulateFromSparseGroup<T>(group, shape2, set2_group_set),
"PopulateFromSparseGroup error.");
++set2_group_it;
}
}
std::set<T> group_set;
ApplySetOperation(set1_group_set, set2_group_set, group_set, databank.set_operation_);
if (!group_set.empty()) {
group_sets[group_indices] = group_set;
int64_t set_size = group_set.size();
if (set_size > max_set_size) {
max_set_size = set_size;
}
num_result_values += set_size;
}
}
} else {
std::mutex mt;
int64_t total = num_elements;
uint32_t cores = CpuKernelUtils::GetCPUNum(*databank.ctx);
int64_t per_unit_size = (total / std::min(std::max(1L, cores - 2L), total));
uint32_t ret =
CpuKernelUtils::ParallelFor(*databank.ctx, total, per_unit_size, [&](int64_t begin, int64_t end) -> uint32_t {
std::set<T> set1_group_set;
std::set<T> set2_group_set;
const std::vector<int64_t> subspan(order.begin(), order.end() - 1);
auto set2_grouper = set2_st->group(subspan);
auto set2_group_it = set2_grouper.begin();
std::vector<int64_t> group_indices;
for (int64_t flat_group_index = begin; flat_group_index < end; ++flat_group_index) {
PopulateGroupIndices(flat_group_index, group_shape, group_indices);
// Get values from set1.
PopulateFromDenseGroup<T>(set1_t, set1_strides, group_indices, set1_group_set);
// Get values from set2, if applicable.
set2_group_set.clear();
if (set2_group_it != set2_grouper.end()) {
const auto &group = *set2_group_it;
const auto set2_group_indices = group.group();
bool group_match = true;
for (uint32_t i = 0; group_match && (i < set2_group_indices.size()); ++i) {
if (set2_group_indices[i] != group_indices[i]) {
group_match = false;
}
}
if (group_match) {
KERNEL_HANDLE_ERROR(PopulateFromSparseGroup<T>(group, shape2, set2_group_set),
"PopulateFromSparseGroup error.");
++set2_group_it;
}
}
std::set<T> group_set;
ApplySetOperation(set1_group_set, set2_group_set, group_set, databank.set_operation_);
if (!group_set.empty()) {
std::lock_guard<std::mutex> lck(mt);
group_sets[group_indices] = group_set;
int64_t set_size = group_set.size();
if (set_size > max_set_size) {
max_set_size = set_size;
}
num_result_values += set_size;
}
}
return KERNEL_STATUS_OK;
});
KERNEL_CHECK_FALSE((ret == KERNEL_STATUS_OK), KERNEL_STATUS_INNER_ERROR,
"DenseToSparseSetOperation compute failed.");
}
group_shape.push_back(max_set_size);
return OutputSparseTensor<T>(databank, group_shape, num_result_values, group_sets);
}
uint32_t DenseToSparseSetOperationCpuKernel::Compute(CpuKernelContext &ctx) {
KERNEL_HANDLE_ERROR(NormalCheck(ctx, kInputNum, kOutputNum),
"DenseToSparseSetOperation check input and output number failed.");
DataBank databank;
KERNEL_HANDLE_ERROR(NullptrAndMatVecCheck(ctx, databank), "DenseToSparseSetOperation check params failed.");
DataType dt = reinterpret_cast<DataType>(databank.set2_values->GetDataType());
uint32_t KERNEL_STATUS;
switch (dt) {
case DT_INT8:
KERNEL_STATUS = ComputeDenseToSparse<int8_t>(databank);
break;
case DT_UINT8:
KERNEL_STATUS = ComputeDenseToSparse<uint8_t>(databank);
break;
case DT_INT16:
KERNEL_STATUS = ComputeDenseToSparse<int16_t>(databank);
break;
case DT_UINT16:
KERNEL_STATUS = ComputeDenseToSparse<uint16_t>(databank);
break;
case DT_INT32:
KERNEL_STATUS = ComputeDenseToSparse<int32_t>(databank);
break;
case DT_INT64:
KERNEL_STATUS = ComputeDenseToSparse<int64_t>(databank);
break;
case DT_STRING:
KERNEL_STATUS = ComputeDenseToSparse<std::string>(databank);
break;
default:
KERNEL_LOG_ERROR("DenseToSparseSetOperation can't support this data type [%s].", DTypeStr(dt).c_str());
return KERNEL_STATUS_PARAM_INVALID;
}
if (KERNEL_STATUS != KERNEL_STATUS_OK) {
KERNEL_LOG_ERROR("DenseToSparseSetOperation failed.");
return KERNEL_STATUS_PARAM_INVALID;
}
return KERNEL_STATUS_OK;
}
REGISTER_CPU_KERNEL(kDenseToSparseSetOperation, DenseToSparseSetOperationCpuKernel);
} // namespace aicpu
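Illustrative only, not part of the commit: ApplySetOperation above maps the lowercased set_operation attribute directly onto the standard <algorithm> set routines. A minimal sketch with toy inputs and illustrative variable names:
#include <algorithm>
#include <iterator>
#include <set>
int main() {
const std::set<int> a = {1, 2, 3};
const std::set<int> b = {2, 3, 4};
std::set<int> a_minus_b, intersection, union_result;
// "a-b"          -> std::set_difference   -> {1}
std::set_difference(a.begin(), a.end(), b.begin(), b.end(),
std::inserter(a_minus_b, a_minus_b.begin()));
// "intersection" -> std::set_intersection -> {2, 3}
std::set_intersection(a.begin(), a.end(), b.begin(), b.end(),
std::inserter(intersection, intersection.begin()));
// "union"        -> std::set_union        -> {1, 2, 3, 4}
std::set_union(a.begin(), a.end(), b.begin(), b.end(),
std::inserter(union_result, union_result.begin()));
// "b-a" swaps the argument order of std::set_difference -> {4}
return 0;
}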

View File

@ -0,0 +1,80 @@
/**
* Copyright (c) Huawei Technologies Co., Ltd. 2022. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef AICPU_KERNELS_NORMALIZED_DENSE_TO_SPARSE_SET_OPERATION_H_
#define AICPU_KERNELS_NORMALIZED_DENSE_TO_SPARSE_SET_OPERATION_H_
#include <map>
#include <set>
#include <vector>
#include "cpu_ops_kernel.h"
#include "utils/sparse_group.h"
#include "utils/sparse_tensor.h"
// Define the aicpu namespace
namespace aicpu {
enum SetOperation { A_MINUS_B = 0, B_MINUS_A = 1, INTERSECTION = 2, UNION = 3 };
struct DataBank {
DataBank()
: set1(nullptr),
set2_indices(nullptr),
set2_values(nullptr),
set2_shape(nullptr),
result_indices(nullptr),
result_values(nullptr),
result_shape(nullptr) {}
Tensor *set1;
Tensor *set2_indices;
Tensor *set2_values;
Tensor *set2_shape;
Tensor *result_indices;
Tensor *result_values;
Tensor *result_shape;
SetOperation set_operation_;
bool validate_indices_;
CpuKernelContext *ctx;
};
// The operator class inherits from the CpuKernel base class
class DenseToSparseSetOperationCpuKernel : public CpuKernel {
public:
~DenseToSparseSetOperationCpuKernel() = default;
DenseToSparseSetOperationCpuKernel() = default;
// Declare the Compute function, which must be overridden
uint32_t Compute(CpuKernelContext &ctx) override;
private:
uint32_t NullptrAndMatVecCheck(CpuKernelContext &ctx, DataBank &calc_info);
template <typename T>
uint32_t ComputeDenseToSparse(DataBank &databank);
template <typename T>
uint32_t CheckGroup(const Group &group, const std::vector<int64_t> &sparse_tensor_shape);
template <typename T>
uint32_t PopulateFromSparseGroup(const Group &group, const std::vector<int64_t> &sparse_tensor_shape,
std::set<T> &result);
template <typename T>
uint32_t PopulateFromDenseGroup(Tensor *input_tensor, const std::vector<int64_t> &input_strides,
const std::vector<int64_t> &group_indices, std::set<T> &result);
void PopulateGroupIndices(const int64_t flat_group_index, const std::vector<int64_t> &group_shape,
std::vector<int64_t> &group_indices);
template <typename T>
void ApplySetOperation(const std::set<T> &set1, const std::set<T> &set2, std::set<T> &result,
SetOperation set_operation_);
template <typename T>
uint32_t OutputSparseTensor(DataBank &databank, const std::vector<int64_t> &output_shape, const int64_t num_values,
const std::map<std::vector<int64_t>, std::set<T>> &sets);
};
} // namespace aicpu
#endif

View File

@ -0,0 +1,121 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "diag.h"
#include "cpu_kernel_utils.h"
#include "utils/eigen_tensor.h"
#include "utils/kernel_util.h"
namespace {
const uint32_t kOutputNum = 1;
const uint32_t kInputNum = 1;
const char *kDiag = "Diag";
constexpr int64_t kParallelDataNums = 80 * 32;
constexpr int64_t kParallelDataNumsMid = 8 * 1024;
#define DIAG_COMPUTE_CASE(DTYPE, TYPE, CTX) \
case (DTYPE): { \
uint32_t result = DiagCompute<TYPE>(CTX); \
if (result != KERNEL_STATUS_OK) { \
KERNEL_LOG_ERROR("Diag kernel compute failed."); \
return result; \
} \
break; \
}
} // namespace
namespace aicpu {
uint32_t DiagCpuKernel::Compute(CpuKernelContext &ctx) {
// check params
KERNEL_HANDLE_ERROR(NormalCheck(ctx, kInputNum, kOutputNum), "[%s] check input and output failed.", kDiag);
KERNEL_HANDLE_ERROR(DiagCheck(ctx), "[%s] check params failed.", kDiag);
auto data_type = ctx.Input(0)->GetDataType();
switch (data_type) {
DIAG_COMPUTE_CASE(DT_FLOAT16, Eigen::half, ctx)
DIAG_COMPUTE_CASE(DT_FLOAT, float, ctx)
DIAG_COMPUTE_CASE(DT_DOUBLE, double, ctx)
DIAG_COMPUTE_CASE(DT_INT32, int32_t, ctx)
DIAG_COMPUTE_CASE(DT_INT64, int64_t, ctx)
DIAG_COMPUTE_CASE(DT_COMPLEX64, std::complex<float>, ctx)
DIAG_COMPUTE_CASE(DT_COMPLEX128, std::complex<double>, ctx)
default:
KERNEL_LOG_ERROR("Diag kernel data type [%s] not support.", DTypeStr(data_type).c_str());
return KERNEL_STATUS_PARAM_INVALID;
}
return KERNEL_STATUS_OK;
}
uint32_t DiagCpuKernel::DiagCheck(CpuKernelContext &ctx) {
KERNEL_CHECK_NULLPTR(ctx.Input(0)->GetData(), KERNEL_STATUS_PARAM_INVALID, "Get input data failed.")
KERNEL_CHECK_NULLPTR(ctx.Output(0)->GetData(), KERNEL_STATUS_PARAM_INVALID, "Get output data failed.")
KERNEL_CHECK_NULLPTR(ctx.Input(0)->GetTensorShape(), KERNEL_STATUS_PARAM_INVALID, "Get input tensor shape failed.")
KERNEL_CHECK_NULLPTR(ctx.Output(0)->GetTensorShape(), KERNEL_STATUS_PARAM_INVALID, "Get output tensor shape failed.")
std::vector<int64_t> shape_input = ctx.Input(0)->GetTensorShape()->GetDimSizes();
std::vector<int64_t> shape_output = ctx.Output(0)->GetTensorShape()->GetDimSizes();
KERNEL_CHECK_FALSE((shape_input.size() != 0), KERNEL_STATUS_PARAM_INVALID,
"Input must be at least rank 1, got [%zu].", shape_input.size())
KERNEL_CHECK_FALSE((shape_input.size() * 2 == shape_output.size()), KERNEL_STATUS_PARAM_INVALID,
"The output shape size should be twice the input shape size, "
"but the input shape size is [%zu] and the output shape size is [%zu].",
shape_input.size(), shape_output.size())
for (size_t i = 0; i < shape_output.size(); ++i) {
KERNEL_CHECK_FALSE((shape_input[i % shape_input.size()] == shape_output[i]), KERNEL_STATUS_PARAM_INVALID,
"Invalid shape: the input dimension [%zu] size [%zu] does not match "
"the output dimension [%zu] size [%zu].",
i % shape_input.size(), shape_input[i % shape_input.size()], i, shape_output[i])
}
return KERNEL_STATUS_OK;
}
template <typename T>
uint32_t DiagCpuKernel::DiagCompute(CpuKernelContext &ctx) {
auto input = reinterpret_cast<T *>(ctx.Input(0)->GetData());
auto output = reinterpret_cast<T *>(ctx.Output(0)->GetData());
int64_t size = ctx.Input(0)->NumElements();
int64_t data_size = size * sizeof(T);
if (data_size <= kParallelDataNums) {
std::fill(output, output + size * size, T());
for (int64_t index = 0; index < size; index++) {
*(output + (1 + size) * index) = *(input + index);
}
} else {
uint32_t min_core_num = 1;
uint32_t max_core_num = std::max(min_core_num, aicpu::CpuKernelUtils::GetCPUNum(ctx) - 2);
if (data_size <= kParallelDataNumsMid) {
max_core_num = std::min(max_core_num, 4U); // up to 4 cpu cores
}
if (max_core_num > size) {
max_core_num = size;
}
auto shard_diag = [&](int64_t start, int64_t end) {
std::fill(output + size * start, output + size * end, T());
for (int64_t index = start; index < end; index++) {
*(output + (1 + size) * index) = *(input + index);
}
};
KERNEL_HANDLE_ERROR(CpuKernelUtils::ParallelFor(ctx, size, size / max_core_num, shard_diag),
"Diag Compute failed.");
}
return KERNEL_STATUS_OK;
}
REGISTER_CPU_KERNEL(kDiag, DiagCpuKernel);
} // namespace aicpu
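Illustrative only, not part of the commit: DiagCompute above (and DiagPartCompute in the next file) both rely on the same row-major identity, namely that element (i, i) of an n x n matrix sits at flat offset (n + 1) * i. A minimal standalone sketch with illustrative names:
#include <cstdint>
#include <vector>
int main() {
const std::vector<float> input = {1.0f, 2.0f, 3.0f};
const int64_t n = static_cast<int64_t>(input.size());
std::vector<float> diag(n * n, 0.0f);  // n x n output, zero off the diagonal
for (int64_t i = 0; i < n; ++i) {
diag[(n + 1) * i] = input[i];  // writes position (i, i), as DiagCompute does
}
std::vector<float> diag_part(n);
for (int64_t i = 0; i < n; ++i) {
diag_part[i] = diag[(n + 1) * i];  // reads it back, as DiagPartCompute does
}
return 0;
}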

View File

@ -1,5 +1,5 @@
/**
* Copyright (c) Huawei Technologies Co., Ltd. 2020-2021. All rights reserved.
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@ -13,23 +13,25 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef AICPU_KERNELS_NORMALIZED_IS_INF_H_
#define AICPU_KERNELS_NORMALIZED_IS_INF_H_
#ifndef AICPU_KERNELS_NORMALIZED_DIAG_H_
#define AICPU_KERNELS_NORMALIZED_DIAG_H_
#include "cpu_ops_kernel.h"
namespace aicpu {
class IsInfCpuKernel : public CpuKernel {
class DiagCpuKernel : public CpuKernel {
public:
IsInfCpuKernel() = default;
~IsInfCpuKernel() override = default;
DiagCpuKernel() = default;
~DiagCpuKernel() override = default;
protected:
uint32_t Compute(CpuKernelContext &ctx) override;
private:
uint32_t IsInfCheck(const CpuKernelContext &ctx) const;
uint32_t DiagCheck(CpuKernelContext &ctx);
template <typename T>
uint32_t IsInfCompute(const CpuKernelContext &ctx);
uint32_t DiagCompute(CpuKernelContext &ctx);
};
} // namespace aicpu
#endif

View File

@ -0,0 +1,87 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "diag_part.h"
#include "cpu_kernel_utils.h"
#include "utils/eigen_tensor.h"
#include "utils/kernel_util.h"
namespace {
const uint32_t kOutputNum = 1;
const uint32_t kInputNum = 1;
const char *kDiagPart = "DiagPart";
#define DIAGPART_COMPUTE_CASE(DTYPE, TYPE, CTX) \
case (DTYPE): { \
uint32_t result = DiagPartCompute<TYPE>(CTX); \
if (result != KERNEL_STATUS_OK) { \
KERNEL_LOG_ERROR("DiagPart kernel compute failed."); \
return result; \
} \
break; \
}
} // namespace
namespace aicpu {
uint32_t DiagPartCpuKernel::Compute(CpuKernelContext &ctx) {
// check params
KERNEL_HANDLE_ERROR(NormalCheck(ctx, kInputNum, kOutputNum), "[%s] check input and output failed.", kDiagPart);
KERNEL_HANDLE_ERROR(DiagPartCheck(ctx), "[%s] check params failed.", kDiagPart);
auto data_type = ctx.Input(0)->GetDataType();
switch (data_type) {
DIAGPART_COMPUTE_CASE(DT_FLOAT16, Eigen::half, ctx)
DIAGPART_COMPUTE_CASE(DT_FLOAT, float, ctx)
DIAGPART_COMPUTE_CASE(DT_DOUBLE, double, ctx)
DIAGPART_COMPUTE_CASE(DT_INT32, int32_t, ctx)
DIAGPART_COMPUTE_CASE(DT_INT64, int64_t, ctx)
DIAGPART_COMPUTE_CASE(DT_COMPLEX64, std::complex<float>, ctx)
DIAGPART_COMPUTE_CASE(DT_COMPLEX128, std::complex<double>, ctx)
default:
KERNEL_LOG_ERROR("DiagPart kernel data type [%s] not supports.", DTypeStr(data_type).c_str());
return KERNEL_STATUS_PARAM_INVALID;
}
return KERNEL_STATUS_OK;
}
uint32_t DiagPartCpuKernel::DiagPartCheck(CpuKernelContext &ctx) {
std::vector<int64_t> shape_input = ctx.Input(0)->GetTensorShape()->GetDimSizes();
std::vector<int64_t> shape_output = ctx.Output(0)->GetTensorShape()->GetDimSizes();
KERNEL_CHECK_FALSE((shape_input.size() % 2 == 0), KERNEL_STATUS_PARAM_INVALID,
"The rank of the tensor should be even and positive.");
for (size_t i = 0; i < shape_output.size(); i++) {
KERNEL_CHECK_FALSE((shape_input[i] == shape_input[i + shape_output.size()]), KERNEL_STATUS_PARAM_INVALID,
"Invalid shape: the input dimension [%zu] size [%zu] does not match "
"the input dimension [%zu] size [%zu].",
i, shape_input[i], i + shape_output.size(), shape_input[i + shape_output.size()]);
}
return KERNEL_STATUS_OK;
}
template <typename T>
uint32_t DiagPartCpuKernel::DiagPartCompute(CpuKernelContext &ctx) {
auto input = reinterpret_cast<T *>(ctx.Input(0)->GetData());
auto output = reinterpret_cast<T *>(ctx.Output(0)->GetData());
uint64_t size = ctx.Output(0)->NumElements();
for (size_t index = 0; index < size; index++) {
*(output + index) = *(input + (1 + size) * index);
}
return KERNEL_STATUS_OK;
}
REGISTER_CPU_KERNEL(kDiagPart, DiagPartCpuKernel);
} // namespace aicpu

View File

@ -0,0 +1,37 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef AICPU_KERNELS_NORMALIZED_DIAG_PART_H_
#define AICPU_KERNELS_NORMALIZED_DIAG_PART_H_
#include "cpu_ops_kernel.h"
namespace aicpu {
class DiagPartCpuKernel : public CpuKernel {
public:
DiagPartCpuKernel() = default;
~DiagPartCpuKernel() override = default;
protected:
uint32_t Compute(CpuKernelContext &ctx) override;
private:
uint32_t DiagPartCheck(CpuKernelContext &ctx);
template <typename T>
uint32_t DiagPartCompute(CpuKernelContext &ctx);
};
} // namespace aicpu
#endif

View File

@ -0,0 +1,227 @@
/**
* Copyright (c) Huawei Technologies Co., Ltd. 2021-2022. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "diagonal.h"
#include "cpu_kernel_utils.h"
#include "kernel_log.h"
#include "utils/eigen_tensor.h"
#include "utils/kernel_util.h"
#define N2 2
#define N3 3
#define N4 4
using namespace std;
namespace {
const size_t kOutputNum = 1;
const size_t kInputNum = 1;
const char *kDiagonal = "Diagonal";
// when input data size is more than kParallelDataNum, use Parallel func
const int64_t kParallelDataNum = 400;
const int64_t kParallelDataNumMid = 2 * 1024;
const uint32_t min_core_num = 1;
template <typename T>
T mul_sum(std::vector<T> v1, std::vector<T> v2) {
T output = 0;
if (v1.size() != v2.size()) {
return false;
} else {
for (unsigned int i = 0; i < v1.size(); i++) {
output += v1[i] * v2[i];
}
return output;
}
}
template <typename T>
std::vector<T> construct_stride(std::vector<T> t_shape) {
std::vector<T> t_stride(t_shape.size(), 1);
int initial = 1;
for (unsigned int i = t_shape.size(); i > 0; i--) {
t_stride[i - 1] = initial;
initial = initial * t_shape[i - 1];
}
return t_stride;
}
int64_t diag_size(const int64_t &offset, const int64_t &dim1, const int64_t &dim2, std::vector<int64_t> x_shape) {
int64_t dsize = 0;
if (offset >= 0) {
dsize = std::max<int64_t>(std::min(x_shape.at(dim1), x_shape.at(dim2) - offset), 0);
} else {
dsize = std::max<int64_t>(std::min(x_shape.at(dim1) + offset, x_shape.at(dim2)), 0);
}
return dsize;
}
int64_t maybe_wrap_dim(int64_t dim, int64_t dim_post_expr) {
if (dim < 0) {
dim += dim_post_expr;
}
return dim;
}
template <typename T>
T get_data(int64_t basepos, int64_t offset, int64_t *ar, T *dptr) {
if (offset >= 0) {
return dptr[basepos + offset * ar[1]];
} else {
return dptr[basepos - offset * ar[0]];
}
}
template <typename T>
std::vector<T> construct_index(int num, std::vector<T> &stride) {
std::vector<T> idx;
int tmp_num = num;
for (uint32_t i = 0; i < stride.size(); i++) {
idx.push_back(tmp_num / stride[i]);
tmp_num = tmp_num % stride[i];
}
return idx;
}
} // namespace
namespace aicpu {
template <typename T>
void DiagonalCpuKernel::set_output(int64_t *ar, T *dptr, T *y_dptr) {
for (int i = 0; i < dsize; i++) {
y_dptr[ar[N3] + i] = get_data(ar[N2] + i * (ar[0] + ar[1]), offset_, ar, dptr);
}
}
template <typename T>
uint32_t DiagonalCpuKernel::DoComputeType(CpuKernelContext &ctx) {
// Get the input and output
Tensor *input_x = ctx.Input(0);
// Get some information about the input
int32_t x_NumE = input_x->NumElements();
auto x_shape = input_x->GetTensorShape();
std::vector<int64_t> x_shape_ = x_shape->GetDimSizes();
const int64_t x_dim = x_shape->GetDims();
auto dataptr = reinterpret_cast<T *>(ctx.Input(0)->GetData());
auto y_dataptr = reinterpret_cast<T *>(ctx.Output(0)->GetData());
// Compute
dsize = diag_size(offset_, dim1_, dim2_, x_shape_);
std::vector<int64_t> x_stride = construct_stride<int64_t>(x_shape_);
if (x_dim != N2 && x_NumE > 0) {
// Set vx_shape and vx_stride, which are x_shape_ and x_stride with the
// entries at positions dim1_ and dim2_ removed.
std::vector<int64_t> vx_shape, vx_stride;
for (unsigned int tmp_dim = 0; tmp_dim < x_shape_.size(); tmp_dim++) {
if (tmp_dim != dim1_ && tmp_dim != dim2_) {
vx_shape.push_back(x_shape_[tmp_dim]);
vx_stride.push_back(x_stride[tmp_dim]);
}
}
// Set y_shape (the output shape), y_stride (the output stride), and
// vy_stride (y_stride without the last dim).
std::vector<int64_t> y_shape = vx_shape;
y_shape.push_back(dsize);
std::vector<int64_t> y_stride = construct_stride<int64_t>(y_shape);
std::vector<int64_t> vy_stride = y_stride;
vy_stride.pop_back();
// diagonal
int32_t task_num = x_NumE / x_shape_[dim1_] / x_shape_[dim2_];
if (task_num >= kParallelDataNum) {
int64_t max_core_num = std::max(min_core_num, aicpu::CpuKernelUtils::GetCPUNum(ctx) - kResvCpuNum);
if (task_num <= kParallelDataNumMid) {
max_core_num = std::min(max_core_num, static_cast<int64_t>(N4));
}
max_core_num = max_core_num > task_num ? task_num : max_core_num;
auto sharder_diagonal = [&](int64_t start, int64_t end) {
for (int64_t j = start; j < end; j++) {
std::vector<int64_t> v_s_stride = construct_stride<int64_t>(vx_shape);
auto p = construct_index<int64_t>(j, v_s_stride);
int64_t arr[N4] = {x_stride[dim1_], x_stride[dim2_], mul_sum<int64_t>(p, vx_stride),
mul_sum<int64_t>(p, vy_stride)};
set_output(arr, dataptr, y_dataptr);
}
};
if (max_core_num != 0) {
int64_t per_unit = task_num / max_core_num;
KERNEL_HANDLE_ERROR(CpuKernelUtils::ParallelFor(ctx, task_num, per_unit, sharder_diagonal), "Diagonal failed.");
}
} else {
for (int64_t j = 0; j < task_num; j++) {
std::vector<int64_t> v_s_stride = construct_stride<int64_t>(vx_shape);
auto p = construct_index<int64_t>(j, v_s_stride);
int64_t arr[N4] = {x_stride[dim1_], x_stride[dim2_], mul_sum<int64_t>(p, vx_stride),
mul_sum<int64_t>(p, vy_stride)};
set_output(arr, dataptr, y_dataptr);
}
}
} else if (x_dim == N2) {
int64_t arr[N4] = {x_stride[dim1_], x_stride[dim2_], 0, 0};
set_output(arr, dataptr, y_dataptr);
} else {
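    // Nothing to compute: the input (and therefore the output) has no elements.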
y_dataptr = dataptr;
}
return KERNEL_STATUS_OK;
}
uint32_t DiagonalCpuKernel::Compute(CpuKernelContext &ctx) {
// Check params
KERNEL_HANDLE_ERROR(NormalCheck(ctx, kInputNum, kOutputNum), "Diagonal check input and output number failed.");
  // Get the input
Tensor *input_x = ctx.Input(0);
auto input_size = input_x->GetTensorShape()->GetDims();
// Check the input dims
if (input_size < N2) {
KERNEL_LOG_ERROR("[Diagonal]: the input tensor must is at least 2-dimensional.");
return KERNEL_STATUS_PARAM_INVALID;
}
// Get the attr
AttrValue *offset = ctx.GetAttr("offset");
offset_ = (offset == nullptr) ? 0 : (offset->GetInt());
AttrValue *dim1 = ctx.GetAttr("dim1");
dim1_ = (dim1 == nullptr) ? 0 : (dim1->GetInt());
AttrValue *dim2 = ctx.GetAttr("dim2");
dim2_ = (dim2 == nullptr) ? 1 : (dim2->GetInt());
int64_t min_d = -input_size;
int64_t max_d = input_size - 1;
// Check the attr
if (dim1_ < min_d || dim1_ > max_d || dim2_ < min_d || dim2_ > max_d) {
    KERNEL_LOG_ERROR(
      "[Diagonal]: Dimension out of range (expected to be in range of [%ld, "
      "%ld]).",
      min_d, max_d);
return KERNEL_STATUS_PARAM_INVALID;
}
  // Normalize the dims to their non-negative form and check them
dim1_ = maybe_wrap_dim(dim1_, input_size);
dim2_ = maybe_wrap_dim(dim2_, input_size);
if (dim1_ == dim2_) {
KERNEL_LOG_ERROR("[Diagonal]:Diagonal dimensions cannot be identical.");
return KERNEL_STATUS_PARAM_INVALID;
}
auto data_type = input_x->GetDataType();
switch (data_type) {
case DT_FLOAT:
return DoComputeType<float>(ctx);
case DT_DOUBLE:
return DoComputeType<double>(ctx);
default:
KERNEL_LOG_ERROR("[Diagonal]: Diagonal kernel data type [%s] not support.", DTypeStr(data_type).c_str());
return KERNEL_STATUS_PARAM_INVALID;
}
}
REGISTER_CPU_KERNEL(kDiagonal, DiagonalCpuKernel);
} // namespace aicpu

View File

@ -0,0 +1,43 @@
/**
* Copyright (c) Huawei Technologies Co., Ltd. 2021-2022. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include <Eigen/Dense>
#include <array>
#include <iostream>
#include "cpu_ops_kernel.h"
namespace aicpu {
class DiagonalCpuKernel final : public CpuKernel {
public:
DiagonalCpuKernel() = default;
~DiagonalCpuKernel() override = default;
uint32_t Compute(CpuKernelContext &ctx) override;
template <typename T>
uint32_t DoComputeType(CpuKernelContext &ctx);
template <typename T>
void set_output(int64_t *ar, T *dptr, T *y_dptr);
private:
int64_t offset_ = 0;
int64_t dim1_ = 0;
int64_t dim2_ = 1;
int64_t dsize = 0;
};
} // namespace aicpu

View File

@ -0,0 +1,95 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "eig.h"
#include <Eigen/Dense>
#include <Eigen/Eigenvalues>
#include <algorithm>
#include <complex>
#include <iostream>
#include <map>
#include "utils/eigen_tensor.h"
#include "utils/kernel_util.h"
namespace {
const uint32_t kInputNum = 1;
const uint32_t kOutputNum = 2;
const char *Eig = "Eig";
} // namespace
namespace aicpu {
uint32_t EigCpuKernel::Compute(CpuKernelContext &ctx) {
KERNEL_HANDLE_ERROR(NormalCheck(ctx, kInputNum, kOutputNum), "Eig check input and output failed.");
Tensor *input = ctx.Input(0);
auto input_dtype = static_cast<DataType>(input->GetDataType());
switch (input_dtype) {
case DT_FLOAT:
return ComputeKernel<float, std::complex<float>>(ctx);
case DT_DOUBLE:
return ComputeKernel<double, std::complex<double>>(ctx);
case DT_COMPLEX64:
return ComputeKernel<std::complex<float>, std::complex<float>>(ctx);
case DT_COMPLEX128:
return ComputeKernel<std::complex<double>, std::complex<double>>(ctx);
default:
KERNEL_LOG_ERROR("Eig kernel data type [%s] not support.", DTypeStr(input_dtype).c_str());
return KERNEL_STATUS_PARAM_INVALID;
}
}
REGISTER_CPU_KERNEL(kEig, EigCpuKernel);
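// Solves the eigenproblem batch by batch: each x_dim x x_dim slice is copied into a row-major
// Eigen matrix, decomposed with ComplexEigenSolver, and the eigenvalues (and, when compute_v
// is set, the eigenvectors) are written back to the outputs.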
template <typename T, typename C>
uint32_t EigCpuKernel::ComputeKernel(CpuKernelContext &ctx) {
auto xptr = reinterpret_cast<T *>(ctx.Input(0)->GetData());
auto valptr = reinterpret_cast<C *>(ctx.Output(0)->GetData());
auto vecptr = reinterpret_cast<C *>(ctx.Output(1)->GetData());
std::vector<int64_t> dims = ctx.Input(0)->GetTensorShape()->GetDimSizes();
int64_t rank = ctx.Input(0)->GetTensorShape()->GetDims();
int64_t x_dim = ctx.Input(0)->GetTensorShape()->GetDimSize(rank - 1);
int64_t batch_size = 1;
if (rank > 2) {
for (int64_t i = 0; i < rank - 2; i++) {
batch_size *= dims[i];
}
}
AttrValue *compute_v = ctx.GetAttr("compute_v");
bool compute_v_ = (compute_v == nullptr) ? false : compute_v->GetBool();
Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor> A(x_dim, x_dim);
for (int64_t k = 0; k < batch_size; k++) {
for (int64_t i = 0; i < x_dim * x_dim; i++) {
A.data()[i] = xptr[k * x_dim * x_dim + i];
}
if (!compute_v_) {
Eigen::ComplexEigenSolver<Eigen::Matrix<C, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>> es(A, false);
Eigen::Matrix<C, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor> D = es.eigenvalues();
for (int64_t i = 0; i < x_dim; i++) {
valptr[k * x_dim + i] = D.data()[i];
}
} else {
Eigen::ComplexEigenSolver<Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>> es(A);
Eigen::Matrix<C, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor> D = es.eigenvalues();
Eigen::Matrix<C, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor> V = es.eigenvectors();
for (int64_t i = 0; i < x_dim; i++) {
valptr[k * x_dim + i] = D.data()[i];
}
for (int64_t i = 0; i < x_dim * x_dim; i++) {
vecptr[k * x_dim * x_dim + i] = V.data()[i];
}
}
}
return KERNEL_STATUS_OK;
}
} // namespace aicpu

View File

@ -0,0 +1,38 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef AICPU_KERNELS_NORMALIZED_EIG_H_
#define AICPU_KERNELS_NORMALIZED_EIG_H_
#include "cpu_ops_kernel.h"
#include "cpu_types.h"
#include "utils/bcast.h"
namespace aicpu {
class EigCpuKernel : public CpuKernel {
public:
EigCpuKernel() = default;
~EigCpuKernel() override = default;
protected:
uint32_t Compute(CpuKernelContext &ctx) override;
private:
template <typename T, typename C>
uint32_t ComputeKernel(CpuKernelContext &ctx);
};
} // namespace aicpu
#endif  // AICPU_KERNELS_NORMALIZED_EIG_H_

View File

@ -0,0 +1,114 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "eye.h"
#include <string.h>
#include "Eigen/Dense"
#include "cpu_kernel_utils.h"
#include "utils/eigen_tensor.h"
#include "utils/kernel_util.h"
namespace {
const char *kEye = "Eye";
#define EYE_COMPUTE_CASE(DTYPE, TYPE, CTX) \
case (DTYPE): { \
uint32_t result = EyePartCompute<TYPE>(CTX); \
if (result != KERNEL_STATUS_OK) { \
KERNEL_LOG_ERROR("Eye kernel compute failed."); \
return result; \
} \
break; \
}
} // namespace
namespace aicpu {
uint32_t EyeCpuKernel::Compute(CpuKernelContext &ctx) {
Tensor *output = ctx.Output(0);
KERNEL_CHECK_NULLPTR(output, KERNEL_STATUS_PARAM_INVALID, "Get output failed")
auto data_type = ctx.Output(0)->GetDataType();
switch (data_type) {
EYE_COMPUTE_CASE(DT_FLOAT, float, ctx)
EYE_COMPUTE_CASE(DT_DOUBLE, double, ctx)
EYE_COMPUTE_CASE(DT_FLOAT16, Eigen::half, ctx)
EYE_COMPUTE_CASE(DT_INT8, int8_t, ctx)
EYE_COMPUTE_CASE(DT_INT16, int16_t, ctx)
EYE_COMPUTE_CASE(DT_INT32, int32_t, ctx)
EYE_COMPUTE_CASE(DT_INT64, int64_t, ctx)
EYE_COMPUTE_CASE(DT_UINT8, uint8_t, ctx)
EYE_COMPUTE_CASE(DT_UINT16, uint16_t, ctx)
EYE_COMPUTE_CASE(DT_UINT32, uint32_t, ctx)
EYE_COMPUTE_CASE(DT_UINT64, uint64_t, ctx)
EYE_COMPUTE_CASE(DT_COMPLEX64, std::complex<std::float_t>, ctx)
EYE_COMPUTE_CASE(DT_COMPLEX128, std::complex<std::double_t>, ctx)
default:
KERNEL_LOG_ERROR("Eye kernel data type [%s] not support.", DTypeStr(data_type).c_str());
return KERNEL_STATUS_PARAM_INVALID;
}
return KERNEL_STATUS_OK;
}
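// Fills the output with batched identity matrices: the buffer is zeroed, then a 1 is written
// every (num_col + 1) elements within each min(num_rows, num_columns) x max(num_rows, num_columns) block.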
template <typename T>
uint32_t EyeCpuKernel::EyePartCompute(CpuKernelContext &ctx) {
int64_t num_rows_value1 = 0;
int64_t num_columns_value = -1;
int64_t dim_value = 1;
int32_t out_size_size = 0;
AttrValue *num_rows = ctx.GetAttr("num_rows");
KERNEL_CHECK_NULLPTR(num_rows, KERNEL_STATUS_PARAM_INVALID, "get num_rows failed.");
num_rows_value1 = num_rows->GetInt();
int64_t min_value = num_rows_value1;
int64_t max_value = -1;
int64_t num_col = num_rows_value1;
AttrValue *num_columns = ctx.GetAttr("num_columns");
if (num_columns) {
num_columns_value = num_columns->GetInt();
min_value = num_columns_value < num_rows_value1 ? num_columns_value : num_rows_value1;
max_value = num_columns_value > num_rows_value1 ? num_columns_value : num_rows_value1;
num_col = num_columns_value;
}
if (max_value == -1) {
max_value = num_rows_value1;
}
AttrValue *batch_shape = ctx.GetAttr("batch_shape");
if (batch_shape) {
    std::vector<int64_t> output_size = batch_shape->GetListInt();
out_size_size = output_size.size();
int64_t batch_shape_value = 1;
for (int32_t t = 0; t < out_size_size; t++) {
batch_shape_value = output_size[t];
dim_value = dim_value * batch_shape_value;
}
}
KERNEL_CHECK_NULLPTR(ctx.Output(0)->GetData(), KERNEL_STATUS_PARAM_INVALID, "Get output data failed.")
auto output_y = reinterpret_cast<T *>(ctx.Output(0)->GetData());
int64_t data_num = ctx.Output(0)->NumElements();
int64_t data_size = data_num * sizeof(T);
Tensor *y = ctx.Output(0);
auto y_addr = y->GetData();
  memset(y_addr, 0, data_size);
T num = static_cast<T>(1);
  int64_t block_size = min_value * max_value;
  for (int64_t dim = 0; dim < dim_value; dim++) {
    for (int64_t i = 0; i < min_value; i++) {
      *(output_y + (dim * block_size) + (num_col + 1) * i) = num;
    }
  }
}
return KERNEL_STATUS_OK;
}
REGISTER_CPU_KERNEL(kEye, EyeCpuKernel);
} // namespace aicpu

View File

@ -0,0 +1,35 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef AICPU_KERNELS_NORMALIZED_EYE_H_
#define AICPU_KERNELS_NORMALIZED_EYE_H_
#include "cpu_ops_kernel.h"
namespace aicpu {
class EyeCpuKernel : public CpuKernel {
public:
EyeCpuKernel() = default;
~EyeCpuKernel() override = default;
protected:
uint32_t Compute(CpuKernelContext &ctx) override;
private:
template <typename T>
uint32_t EyePartCompute(CpuKernelContext &ctx);
};
} // namespace aicpu
#endif  // AICPU_KERNELS_NORMALIZED_EYE_H_

View File

@ -0,0 +1,294 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "fractional_avg_pool.h"
#include "Eigen/Dense"
#include "cpu_kernel_utils.h"
#include "utils/eigen_tensor.h"
#include "utils/kernel_util.h"
namespace {
const char *kFractionalAvgPool = "FractionalAvgPool";
const uint32_t k_InputNum = 1;
const uint32_t k_OutputNum = 3;
const int64_t kParallelDataNum = 1024 * 1024;
constexpr uint32_t tensor_in_and_out_dims = 4;
} // namespace
namespace aicpu {
uint32_t FractionalAvgPoolCpuKernel::FractionalAvgPoolParamCheck(CpuKernelContext &ctx) {
KERNEL_HANDLE_ERROR(NormalCheck(ctx, k_InputNum, k_OutputNum),
"FractionalAvgPool Check input and output number failed.");
Tensor *input = ctx.Input(0);
if (ctx.Input(0)->GetDataType() != ctx.Output(0)->GetDataType()) {
KERNEL_LOG_ERROR("The data type of the output [%s] need be the same as the input [%s]",
DTypeStr(ctx.Output(0)->GetDataType()).c_str(), DTypeStr(ctx.Input(0)->GetDataType()).c_str());
return KERNEL_STATUS_PARAM_INVALID;
}
auto input_shape = input->GetTensorShape();
int32_t input_dims = input_shape->GetDims();
for (int32_t i = 0; i < input_dims; i++) {
KERNEL_CHECK_FALSE((input_shape->GetDimSize(i) > 0), KERNEL_STATUS_PARAM_INVALID,
"FractionalAvgPool: expected input to have non-empty spatial "
"dimensions, "
"but input has sizes [%d] with dimension [%d] being empty.",
input_dims, i);
}
KERNEL_CHECK_FALSE((input_dims == tensor_in_and_out_dims), KERNEL_STATUS_PARAM_INVALID,
"tensor_in must be 4-dimensional.");
AttrValue *pooling_ratio = ctx.GetAttr("pooling_ratio");
KERNEL_CHECK_NULLPTR(pooling_ratio, KERNEL_STATUS_PARAM_INVALID, "[%s] get attr:pooling_ratio failed.",
kFractionalAvgPool);
int32_t pooling_ratio_size = pooling_ratio->ListFloatSize();
KERNEL_CHECK_FALSE((pooling_ratio_size == tensor_in_and_out_dims), KERNEL_STATUS_PARAM_INVALID,
"pooling_ratio field must specify 4 dimensions.");
  std::vector<float> pooling_ratio_data = pooling_ratio->GetListFloat();
KERNEL_CHECK_FALSE((pooling_ratio_data[0] == 1.0 && pooling_ratio_data[3] == 1.0), KERNEL_STATUS_PARAM_INVALID,
"FractionalAvgPool is not yet supported on the batch nor channel "
"dimension.The first and last elements of pooling ratio must be 1.0.");
return KERNEL_STATUS_OK;
}
static std::vector<int64_t> GeneratePoolingSequencePseudoRandom(int input_length, int output_length, int seed) {
// generate a random number u which is in (0,1)
std::vector<int64_t> cum_seq(output_length + 1, 0);
std::vector<int64_t> diff(output_length, 0);
double alpha = static_cast<double>(input_length) / output_length;
int k = input_length / output_length;
double u_max1 = (k + 2) / alpha - 1;
double u_max2 = (input_length + 1 - k) / alpha - (output_length - 1);
double max_u = std::min(u_max1, u_max2);
std::default_random_engine random(seed);
std::uniform_real_distribution<double> dis2(0.0, 1.0);
const double u = dis2(random) * max_u;
cum_seq[0] = 1;
cum_seq[output_length] = input_length + 1;
for (int i = 1; i < output_length; ++i) {
cum_seq[i] = static_cast<int>(ceil(alpha * (i + u)));
}
for (int i = 0; i < output_length; ++i) {
diff[i] = cum_seq[i + 1] - cum_seq[i];
}
return diff;
}
static std::vector<int64_t> GeneratePoolingSequenceRandom(int input_length, int output_length, int seed) {
int k = input_length / output_length;
int num_random_spot = input_length % output_length;
std::vector<int64_t> diff(output_length, k);
for (int i = 0; i < num_random_spot; ++i) {
diff[i] += 1;
}
std::srand(seed);
random_shuffle(diff.begin(), diff.end());
return diff;
}
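// Returns cumulative pooling boundaries of length output_length + 1 built from the per-cell sizes;
// e.g. input_length = 10 and output_length = 3 give sizes such as {4, 3, 3} and boundaries {0, 4, 7, 10}.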
std::vector<int64_t> GeneratePoolingSequence(int input_length, int output_length, bool pseudo_random, int seed) {
std::vector<int64_t> diff;
  if (input_length % output_length == 0) {
    diff = std::vector<int64_t>(output_length, input_length / output_length);
  } else if (pseudo_random) {
    diff = GeneratePoolingSequencePseudoRandom(input_length, output_length, seed);
  } else {
    diff = GeneratePoolingSequenceRandom(input_length, output_length, seed);
  }
int k = input_length / output_length;
for (int i = 0; i < output_length; i++) {
if (diff[i] < k || diff[i] > k + 1) {
KERNEL_LOG_ERROR("FractionalAvgPool kernel GeneratePoolingSequence diff[%d] is error");
}
}
std::vector<int64_t> cum_seq(output_length + 1, 0);
for (size_t i = 1; i < cum_seq.size(); ++i) {
cum_seq[i] = cum_seq[i - 1] + diff[i - 1];
}
return cum_seq;
}
template <typename T>
uint32_t FractionalAvgPoolCpuKernel::DoCompute(CpuKernelContext &ctx) {
Tensor *input = ctx.Input(0);
Tensor *output = ctx.Output(0);
Tensor *row_pooling_sequence = ctx.Output(1);
Tensor *col_pooling_sequence = ctx.Output(2);
std::vector<float> pooling_ratio = ctx.GetAttr("pooling_ratio")->GetListFloat();
AttrValue *pseudo_random_ = ctx.GetAttr("pseudo_random");
bool pseudo_random = (pseudo_random_ == nullptr) ? false : (pseudo_random_->GetBool());
AttrValue *overlapping_ = ctx.GetAttr("overlapping");
bool overlapping = (overlapping_ == nullptr) ? false : (overlapping_->GetBool());
AttrValue *deterministic_ = ctx.GetAttr("deterministic");
bool deterministic = (deterministic_ == nullptr) ? false : (deterministic_->GetBool());
AttrValue *seed_ = ctx.GetAttr("seed");
int seed = (seed_ == nullptr) ? 0 : (seed_->GetInt());
AttrValue *seed2_ = ctx.GetAttr("seed2");
int seed2 = (seed2_ == nullptr) ? 0 : (seed2_->GetInt());
auto input_shape = input->GetTensorShape();
std::vector<int> input_size(tensor_in_and_out_dims);
std::vector<int> output_size(tensor_in_and_out_dims);
for (uint32_t i = 0; i < tensor_in_and_out_dims; ++i) {
input_size[i] = input_shape->GetDimSize(i);
}
for (uint32_t i = 0; i < tensor_in_and_out_dims; ++i) {
output_size[i] = static_cast<int>(std::floor(input_size[i] / pooling_ratio[i]));
    KERNEL_CHECK_FALSE((output_size[i] > 0), KERNEL_STATUS_PARAM_INVALID,
                       "FractionalAvgPool kernel output_size[%u] cannot be 0.", i);
}
auto input_data = static_cast<T *>(ctx.Input(0)->GetData());
auto output_data = static_cast<T *>(output->GetData());
auto output_height_seq_tensor = static_cast<int64_t *>(row_pooling_sequence->GetData());
auto output_width_seq_tensor = static_cast<int64_t *>(col_pooling_sequence->GetData());
std::random_device rd;
std::mt19937 generator(rd());
if (deterministic) {
// If both seeds are not set when deterministic is true, force set seeds.
if ((seed == 0) && (seed2 == 0)) {
seed = generator();
seed2 = generator();
}
} else {
KERNEL_CHECK_FALSE(((seed == 0) && (seed2 == 0)), KERNEL_STATUS_PARAM_INVALID,
"Both seed and seed2 should be 0 if deterministic is false.");
}
if (seed == 0 && seed2 != 0) {
seed = seed2;
}
// Generate pooling sequence.
std::vector<int64_t> height_cum_seq;
std::vector<int64_t> width_cum_seq;
height_cum_seq = GeneratePoolingSequence(input_size[1], output_size[1], pseudo_random, seed);
width_cum_seq = GeneratePoolingSequence(input_size[2], output_size[2], pseudo_random, seed);
for (uint32_t i = 0; i < height_cum_seq.size(); ++i) {
*(output_height_seq_tensor + i) = height_cum_seq[i];
}
for (uint32_t i = 0; i < width_cum_seq.size(); ++i) {
*(output_width_seq_tensor + i) = width_cum_seq[i];
}
const int64_t height_max = input_size[1] - 1;
const int64_t width_max = input_size[2] - 1;
const int64_t depth_max = input_size[3] - 1;
uint64_t data_num = input->NumElements();
/**
* For both input and output,
* 0: batch
* 1: height / row
* 2: width / col
* 3: depth / channel
*/
if (data_num < kParallelDataNum) {
for (int64_t b = 0; b < input_size[0]; ++b) {
// height sequence.
for (uint32_t hs = 0; hs < height_cum_seq.size() - 1; ++hs) {
// height start and end.
const int64_t height_start = height_cum_seq[hs];
int64_t height_end = overlapping ? height_cum_seq[hs + 1] : height_cum_seq[hs + 1] - 1;
height_end = std::min(height_end, height_max);
// width sequence.
for (uint32_t ws = 0; ws < width_cum_seq.size() - 1; ++ws) {
for (int64_t c = 0; c <= depth_max; ++c) {
const int64_t out_offset = ((b * output_size[1] + hs) * output_size[2] + ws) * output_size[3] + c;
// Initializes the output tensor with 0.
T sum = static_cast<T>(0);
T avg = static_cast<T>(0);
int count = 0;
// width start and end.
const int64_t width_start = width_cum_seq[ws];
int64_t width_end = overlapping ? width_cum_seq[ws + 1] : width_cum_seq[ws + 1] - 1;
width_end = std::min(width_end, width_max);
for (int64_t h = height_start; h <= height_end; ++h) {
for (int64_t w = width_start; w <= width_end; ++w) {
const int64_t in_offset = ((b * input_size[1] + h) * input_size[2] + w) * output_size[3] + c;
sum += input_data[in_offset];
count++;
}
}
avg = sum / static_cast<T>(count);
*(output_data + out_offset) = avg;
}
}
}
}
} else {
uint64_t height_cum_len = height_cum_seq.size() - 1;
uint32_t min_core_num = 1;
uint32_t max_core_num = std::max(min_core_num, aicpu::CpuKernelUtils::GetCPUNum(ctx));
if (max_core_num > height_cum_len) {
max_core_num = height_cum_len;
}
for (int64_t b = 0; b < input_size[0]; ++b) {
// height sequence.
auto sharder_fractionalavgpool_index = [&](size_t start, size_t end) {
for (uint32_t hs = start; hs < end; ++hs) {
// height start and end.
const int64_t height_start = height_cum_seq[hs];
int64_t height_end = overlapping ? height_cum_seq[hs + 1] : height_cum_seq[hs + 1] - 1;
height_end = std::min(height_end, height_max);
// width sequence.
for (uint32_t ws = 0; ws < width_cum_seq.size() - 1; ++ws) {
for (int64_t c = 0; c <= depth_max; ++c) {
const int64_t out_offset = ((b * output_size[1] + hs) * output_size[2] + ws) * output_size[3] + c;
// Initializes the output tensor with 0.
T sum = static_cast<T>(0);
T avg = static_cast<T>(0);
int count = 0;
// width start and end.
const int64_t width_start = width_cum_seq[ws];
int64_t width_end = overlapping ? width_cum_seq[ws + 1] : width_cum_seq[ws + 1] - 1;
width_end = std::min(width_end, width_max);
for (int64_t h = height_start; h <= height_end; ++h) {
for (int64_t w = width_start; w <= width_end; ++w) {
const int64_t in_offset = ((b * input_size[1] + h) * input_size[2] + w) * output_size[3] + c;
sum += input_data[in_offset];
count++;
}
}
avg = sum / static_cast<T>(count);
*(output_data + out_offset) = avg;
}
}
}
};
KERNEL_HANDLE_ERROR(CpuKernelUtils::ParallelFor(ctx, height_cum_len, height_cum_len / max_core_num,
sharder_fractionalavgpool_index),
"FractionalAvgPool Index Compute failed");
}
}
return KERNEL_STATUS_OK;
}
uint32_t FractionalAvgPoolCpuKernel::Compute(CpuKernelContext &ctx) {
KERNEL_HANDLE_ERROR(FractionalAvgPoolParamCheck(ctx), "Check FractionalAvgPool params failed.");
Tensor *input = ctx.Input(0);
auto data_type = input->GetDataType();
switch (data_type) {
case DT_FLOAT:
return DoCompute<float>(ctx);
case DT_DOUBLE:
return DoCompute<double>(ctx);
case DT_INT32:
return DoCompute<int32_t>(ctx);
case DT_INT64:
return DoCompute<int64_t>(ctx);
default:
KERNEL_LOG_ERROR("FractionalAvgPool kernel data type [%s] not support.", DTypeStr(data_type).c_str());
return KERNEL_STATUS_PARAM_INVALID;
}
return KERNEL_STATUS_OK;
}
REGISTER_CPU_KERNEL(kFractionalAvgPool, FractionalAvgPoolCpuKernel);
} // namespace aicpu

View File

@ -0,0 +1,20 @@
#ifndef AICPU_KERNELS_NORMALIZED_FRACTIONAL_AVG_POOL_H_
#define AICPU_KERNELS_NORMALIZED_FRACTIONAL_AVG_POOL_H_
#include "cpu_ops_kernel.h"
#include "cpu_types.h"
namespace aicpu {
class FractionalAvgPoolCpuKernel : public CpuKernel {
public:
FractionalAvgPoolCpuKernel() = default;
~FractionalAvgPoolCpuKernel() = default;
uint32_t Compute(CpuKernelContext &ctx) override;
private:
template <typename T>
uint32_t DoCompute(CpuKernelContext &ctx);
uint32_t FractionalAvgPoolParamCheck(CpuKernelContext &ctx);
};
} // namespace aicpu
#endif // AICPU_KERNELS_NORMALIZED_FRACTIONAL_AVG_POOL_H_

View File

@ -0,0 +1,208 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "fractional_avg_pool_grad.h"
#include <iostream>
#include "Eigen/Dense"
#include "cpu_kernel_utils.h"
#include "utils/eigen_tensor.h"
#include "utils/kernel_util.h"
namespace {
const char *kFractionalAvgPoolGrad = "FractionalAvgPoolGrad";
const uint32_t k_InputNum = 4;
const uint32_t k_OutputNum = 1;
const int64_t kParallelDataNum = 32 * 1024;
} // namespace
namespace aicpu {
uint32_t FractionalAvgPoolGradCpuKernel::FractionalAvgPoolGradParamCheck(CpuKernelContext &ctx) {
KERNEL_HANDLE_ERROR(NormalCheck(ctx, k_InputNum, k_OutputNum),
"FractionalAvgPoolGrad check input and output number failed.");
Tensor *orig_input_tensor_shape = ctx.Input(0);
Tensor *out_backprop = ctx.Input(1);
Tensor *output = ctx.Output(0);
auto orig_input_shape = orig_input_tensor_shape->GetTensorShape();
int32_t orig_input_dims = orig_input_shape->GetDims();
int32_t orig_input_shape_nums = orig_input_tensor_shape->NumElements();
if (out_backprop->GetDataType() != output->GetDataType()) {
KERNEL_LOG_ERROR(
"The data type of the output [%s] need be the same as the out_backprop "
"[%s]",
DTypeStr(output->GetDataType()).c_str(), DTypeStr(out_backprop->GetDataType()).c_str());
return KERNEL_STATUS_PARAM_INVALID;
}
KERNEL_CHECK_FALSE((orig_input_dims == 1 && orig_input_shape_nums == 4), KERNEL_STATUS_PARAM_INVALID,
"original input tensor shape must be 1-dimensional and 4 elements.");
return KERNEL_STATUS_OK;
}
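// Evenly distributes every out_backprop element over its pooling cell, accumulating the
// per-element contributions in double precision before casting the result back to T.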
template <typename T>
uint32_t FractionalAvgPoolGradCpuKernel::DoCompute(CpuKernelContext &ctx) {
typedef Eigen::Map<const Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic>> ConstEigenMatrixMap;
typedef Eigen::Map<Eigen::Matrix<double, Eigen::Dynamic, Eigen::Dynamic>> EigenDoubleMatrixMap;
const Tensor *orig_input_tensor_shape = ctx.Input(0);
const Tensor *out_backprop = ctx.Input(1);
const Tensor *row_pooling_sequence = ctx.Input(2);
const Tensor *col_pooling_sequence = ctx.Input(3);
Tensor *output = ctx.Output(0);
auto output_data = static_cast<T *>(output->GetData());
AttrValue *overlapping_ = ctx.GetAttr("overlapping");
bool overlapping = (overlapping_ == nullptr) ? false : (overlapping_->GetBool());
  int64_t row_seq_nums = row_pooling_sequence->NumElements();
  int64_t col_seq_nums = col_pooling_sequence->NumElements();
auto out_backprop_shape = out_backprop->GetTensorShape();
const int64_t out_batch = out_backprop_shape->GetDimSize(0);
const int64_t out_rows = out_backprop_shape->GetDimSize(1);
const int64_t out_cols = out_backprop_shape->GetDimSize(2);
const int64_t out_depth = out_backprop_shape->GetDimSize(3);
KERNEL_CHECK_FALSE((row_seq_nums > out_rows), KERNEL_STATUS_PARAM_INVALID,
"Given out_backprop shape [%ld,%ld,%ld,%ld], row_seq_tensor must"
" have at least [%ld] elements, but got[%ld].",
out_batch, out_rows, out_cols, out_depth, out_rows + 1, row_seq_nums);
KERNEL_CHECK_FALSE((col_seq_nums > out_cols), KERNEL_STATUS_PARAM_INVALID,
"Given out_backprop shape [%ld,%ld,%ld,%ld], col_seq_tensor must"
" have at least [%ld] elements, but got[%ld].",
out_batch, out_rows, out_cols, out_depth, out_cols + 1, col_seq_nums);
auto row_seq_data = static_cast<int64_t *>(row_pooling_sequence->GetData());
auto col_seq_data = static_cast<int64_t *>(col_pooling_sequence->GetData());
auto orig_input_tensor_shape_data = static_cast<int64_t *>(orig_input_tensor_shape->GetData());
const int64_t in_batch = *(orig_input_tensor_shape_data);
const int64_t in_rows = *(orig_input_tensor_shape_data + 1);
const int64_t in_cols = *(orig_input_tensor_shape_data + 2);
const int64_t in_depth = *(orig_input_tensor_shape_data + 3);
int32_t input_nums = orig_input_tensor_shape->NumElements();
std::vector<int64_t> out_put_dims;
for (int i = 0; i < input_nums; i++) {
KERNEL_CHECK_FALSE((*(orig_input_tensor_shape_data + i) > 0), KERNEL_STATUS_PARAM_INVALID,
"Each dimension of input must be > 0.");
out_put_dims.push_back(orig_input_tensor_shape_data[i]);
}
int64_t output_nums = in_batch * in_rows * in_cols * in_depth;
// Create intermediate in_backprop.
std::vector<double> in_backprop_tensor_temp(output_nums);
  for (int64_t i = 0; i < output_nums; i++) {
in_backprop_tensor_temp[i] = 0;
*(output_data + i) = 0;
}
EigenDoubleMatrixMap in_backprop_tensor_temp_mat(in_backprop_tensor_temp.data(), in_depth,
in_cols * in_rows * in_batch);
ConstEigenMatrixMap out_backprop_mat(reinterpret_cast<T *>(out_backprop->GetData()), out_depth,
out_cols * out_rows * out_batch);
// Loop through each element of out_backprop and evenly distribute the
// element to the corresponding pooling cell.
const int64_t in_max_row_index = in_rows - 1;
const int64_t in_max_col_index = in_cols - 1;
if (output_nums < kParallelDataNum) {
for (int64_t b = 0; b < out_batch; ++b) {
for (int64_t r = 0; r < out_rows; ++r) {
const int64_t in_row_start = *(row_seq_data + r);
int64_t in_row_end = overlapping ? *(row_seq_data + r + 1) : *(row_seq_data + r + 1) - 1;
in_row_end = std::min(in_row_end, in_max_row_index);
for (int64_t c = 0; c < out_cols; ++c) {
const int64_t in_col_start = *(col_seq_data + c);
int64_t in_col_end = overlapping ? *(col_seq_data + c + 1) : *(col_seq_data + c + 1) - 1;
in_col_end = std::min(in_col_end, in_max_col_index);
const int64_t num_elements_in_pooling_cell =
(in_row_end - in_row_start + 1) * (in_col_end - in_col_start + 1);
const int64_t out_index = (b * out_rows + r) * out_cols + c;
// Now we can evenly distribute out_backprop(b, h, w, *) to
// in_backprop(b, hs:he, ws:we, *).
for (int64_t in_r = in_row_start; in_r <= in_row_end; ++in_r) {
for (int64_t in_c = in_col_start; in_c <= in_col_end; ++in_c) {
const int64_t in_index = (b * in_rows + in_r) * in_cols + in_c;
// Walk through each channel (depth).
for (int64_t d = 0; d < out_depth; ++d) {
const double out_backprop_element = static_cast<double>(out_backprop_mat.coeffRef(d, out_index));
double &in_backprop_ref = in_backprop_tensor_temp_mat.coeffRef(d, in_index);
in_backprop_ref += out_backprop_element / num_elements_in_pooling_cell;
}
}
}
}
}
}
} else {
uint64_t row_len = out_rows;
uint32_t min_core_num = 1;
uint32_t max_core_num = std::max(min_core_num, aicpu::CpuKernelUtils::GetCPUNum(ctx));
if (max_core_num > row_len) {
max_core_num = row_len;
}
for (int64_t b = 0; b < out_batch; ++b) {
auto sharder_fractionalavgpoolgrad_index = [&](size_t start, size_t end) {
for (size_t r = start; r < end; ++r) {
const int64_t in_row_start = *(row_seq_data + r);
int64_t in_row_end = overlapping ? *(row_seq_data + r + 1) : *(row_seq_data + r + 1) - 1;
in_row_end = std::min(in_row_end, in_max_row_index);
for (int64_t c = 0; c < out_cols; ++c) {
const int64_t in_col_start = *(col_seq_data + c);
int64_t in_col_end = overlapping ? *(col_seq_data + c + 1) : *(col_seq_data + c + 1) - 1;
in_col_end = std::min(in_col_end, in_max_col_index);
const int64_t num_elements_in_pooling_cell =
(in_row_end - in_row_start + 1) * (in_col_end - in_col_start + 1);
const int64_t out_index = (b * out_rows + r) * out_cols + c;
// Now we can evenly distribute out_backprop(b, h, w, *) to
// in_backprop(b, hs:he, ws:we, *).
for (int64_t in_r = in_row_start; in_r <= in_row_end; ++in_r) {
for (int64_t in_c = in_col_start; in_c <= in_col_end; ++in_c) {
const int64_t in_index = (b * in_rows + in_r) * in_cols + in_c;
// Walk through each channel (depth).
for (int64_t d = 0; d < out_depth; ++d) {
const double out_backprop_element = static_cast<double>(out_backprop_mat.coeffRef(d, out_index));
double &in_backprop_ref = in_backprop_tensor_temp_mat.coeffRef(d, in_index);
in_backprop_ref += out_backprop_element / num_elements_in_pooling_cell;
}
}
}
}
}
};
KERNEL_HANDLE_ERROR(
CpuKernelUtils::ParallelFor(ctx, row_len, row_len / max_core_num, sharder_fractionalavgpoolgrad_index),
"FractionalAvgPoolGrad Index Compute failed.");
}
}
// Depending on the type, cast double to type T.
for (int64_t i = 0; i < output_nums; ++i) {
*(output_data + i) = static_cast<T>(in_backprop_tensor_temp[i]);
}
output->GetTensorShape()->SetDimSizes(out_put_dims);
return KERNEL_STATUS_OK;
}
uint32_t FractionalAvgPoolGradCpuKernel::Compute(CpuKernelContext &ctx) {
KERNEL_HANDLE_ERROR(FractionalAvgPoolGradParamCheck(ctx), "Check FractionalAvgPoolGrad params failed.");
Tensor *out_backprop = ctx.Input(1);
auto data_type = out_backprop->GetDataType();
switch (data_type) {
case DT_FLOAT:
return DoCompute<float>(ctx);
case DT_DOUBLE:
return DoCompute<double>(ctx);
case DT_INT32:
return DoCompute<int32_t>(ctx);
case DT_INT64:
return DoCompute<int64_t>(ctx);
default:
KERNEL_LOG_ERROR("FractionalAvgPoolGrad kernel data type [%s] not support.", DTypeStr(data_type).c_str());
return KERNEL_STATUS_PARAM_INVALID;
}
return KERNEL_STATUS_OK;
}
REGISTER_CPU_KERNEL(kFractionalAvgPoolGrad, FractionalAvgPoolGradCpuKernel);
} // namespace aicpu

View File

@ -0,0 +1,20 @@
#ifndef AICPU_KERNELS_NORMALIZED_FRACTIONAL_AVG_POOL_GRAD_H_
#define AICPU_KERNELS_NORMALIZED_FRACTIONAL_AVG_POOL_GRAD_H_
#include "cpu_ops_kernel.h"
#include "cpu_types.h"
namespace aicpu {
class FractionalAvgPoolGradCpuKernel : public CpuKernel {
public:
FractionalAvgPoolGradCpuKernel() = default;
~FractionalAvgPoolGradCpuKernel() = default;
uint32_t Compute(CpuKernelContext &ctx) override;
private:
template <typename T>
uint32_t DoCompute(CpuKernelContext &ctx);
uint32_t FractionalAvgPoolGradParamCheck(CpuKernelContext &ctx);
};
} // namespace aicpu
#endif  // AICPU_KERNELS_NORMALIZED_FRACTIONAL_AVG_POOL_GRAD_H_

View File

@ -0,0 +1,285 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "fractional_max_pool.h"
#include "cpu_kernel_utils.h"
#include "utils/eigen_tensor.h"
#include "utils/kernel_util.h"
namespace {
const char *kFractionalMaxPool = "FractionalMaxPool";
const uint32_t k_InputNum = 1;
const uint32_t k_OutputNum = 3;
const int64_t kParallelDataNum = 1024 * 1024;
const uint32_t tensor_in_and_out_dims = 4;
} // namespace
namespace aicpu {
uint32_t FractionalMaxPoolCpuKernel::FractionalMaxPoolParamCheck(CpuKernelContext &ctx) {
KERNEL_HANDLE_ERROR(NormalCheck(ctx, k_InputNum, k_OutputNum),
"FractionalMaxPool Check input and output number failed.");
Tensor *input = ctx.Input(0);
if (ctx.Input(0)->GetDataType() != ctx.Output(0)->GetDataType()) {
KERNEL_LOG_ERROR("The data type of the output [%s] need be the same as the input [%s]",
DTypeStr(ctx.Output(0)->GetDataType()).c_str(), DTypeStr(ctx.Input(0)->GetDataType()).c_str());
return KERNEL_STATUS_PARAM_INVALID;
}
auto input_shape = input->GetTensorShape();
int32_t input_dims = input_shape->GetDims();
for (int32_t i = 0; i < input_dims; i++) {
KERNEL_CHECK_FALSE((input_shape->GetDimSize(i) > 0), KERNEL_STATUS_PARAM_INVALID,
"FractionalMaxPool: expected input to have non-empty spatial "
"dimensions, "
"but input has sizes [%d] with dimension [%d] being empty.",
input_dims, i);
}
KERNEL_CHECK_FALSE((input_dims == tensor_in_and_out_dims), KERNEL_STATUS_PARAM_INVALID,
"tensor_in must be 4-dimensional.");
AttrValue *pooling_ratio = ctx.GetAttr("pooling_ratio");
KERNEL_CHECK_NULLPTR(pooling_ratio, KERNEL_STATUS_PARAM_INVALID, "[%s] get attr:pooling_ratio failed.",
kFractionalMaxPool);
int32_t pooling_ratio_size = pooling_ratio->ListFloatSize();
KERNEL_CHECK_FALSE((pooling_ratio_size == tensor_in_and_out_dims), KERNEL_STATUS_PARAM_INVALID,
"The size of pooling_ratio must be 4, but got [%d].", pooling_ratio_size);
  std::vector<float> pooling_ratio_data = pooling_ratio->GetListFloat();
KERNEL_CHECK_FALSE((pooling_ratio_data[0] == 1.0 && pooling_ratio_data[3] == 1.0), KERNEL_STATUS_PARAM_INVALID,
"FractionalMaxPool is not yet supported on the batch nor channel "
"dimension.The first and last elements of pooling ratio must be 1.0.");
return KERNEL_STATUS_OK;
}
static std::vector<int64_t> GeneratePoolingSequencePseudoRandom(int input_length, int output_length, int64_t seed) {
// generate a random number which is in (0,1)
std::vector<int64_t> cum_seq(output_length + 1, 0);
std::vector<int64_t> diff(output_length, 0);
double alpha = static_cast<double>(input_length) / output_length;
int k = input_length / output_length;
double u_max1 = (k + 2) / alpha - 1;
double u_max2 = (input_length + 1 - k) / alpha - (output_length - 1);
double max_u = std::min(u_max1, u_max2);
std::default_random_engine random(seed);
std::uniform_real_distribution<double> dis2(0.0, 1.0);
const double u = dis2(random) * max_u;
cum_seq[0] = 1;
cum_seq[output_length] = input_length + 1;
for (int i = 1; i < output_length; ++i) {
cum_seq[i] = static_cast<int>(ceil(alpha * (i + u)));
}
for (int i = 0; i < output_length; ++i) {
diff[i] = cum_seq[i + 1] - cum_seq[i];
}
return diff;
}
static std::vector<int64_t> GeneratePoolingSequenceRandom(int input_length, int output_length, int64_t seed) {
int k = input_length / output_length;
int num_random_spot = input_length % output_length;
std::vector<int64_t> diff(output_length, k);
for (int i = 0; i < num_random_spot; ++i) {
diff[i] += 1;
}
std::srand(seed);
random_shuffle(diff.begin(), diff.end());
return diff;
}
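// Same scheme as in FractionalAvgPool: converts the per-cell pooling sizes into
// cumulative boundaries of length output_length + 1.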
std::vector<int64_t> GeneratePoolingSequence(int input_length, int output_length, bool pseudo_random, int64_t seed) {
std::vector<int64_t> diff;
  if (input_length % output_length == 0) {
    diff = std::vector<int64_t>(output_length, input_length / output_length);
  } else if (pseudo_random) {
    diff = GeneratePoolingSequencePseudoRandom(input_length, output_length, seed);
  } else {
    diff = GeneratePoolingSequenceRandom(input_length, output_length, seed);
  }
int k = input_length / output_length;
for (int i = 0; i < output_length; i++) {
if (diff[i] < k || diff[i] > k + 1) {
KERNEL_LOG_ERROR("FractionalMaxPool kernel GeneratePoolingSequence diff[%d] is error");
}
}
std::vector<int64_t> cum_seq(output_length + 1, 0);
for (size_t i = 1; i < cum_seq.size(); ++i) {
cum_seq[i] = cum_seq[i - 1] + diff[i - 1];
}
return cum_seq;
}
template <typename T>
uint32_t FractionalMaxPoolCpuKernel::DoCompute(CpuKernelContext &ctx) {
Tensor *input = ctx.Input(0);
Tensor *output = ctx.Output(0);
Tensor *row_pooling_sequence = ctx.Output(1);
Tensor *col_pooling_sequence = ctx.Output(2);
std::vector<float> pooling_ratio = ctx.GetAttr("pooling_ratio")->GetListFloat();
AttrValue *pseudo_random_ = ctx.GetAttr("pseudo_random");
bool pseudo_random = (pseudo_random_ == nullptr) ? false : (pseudo_random_->GetBool());
AttrValue *overlapping_ = ctx.GetAttr("overlapping");
bool overlapping = (overlapping_ == nullptr) ? false : (overlapping_->GetBool());
AttrValue *deterministic_ = ctx.GetAttr("deterministic");
bool deterministic = (deterministic_ == nullptr) ? false : (deterministic_->GetBool());
AttrValue *seed_ = ctx.GetAttr("seed");
int seed = (seed_ == nullptr) ? 0 : (seed_->GetInt());
AttrValue *seed2_ = ctx.GetAttr("seed2");
int seed2 = (seed2_ == nullptr) ? 0 : (seed2_->GetInt());
auto input_shape = input->GetTensorShape();
std::vector<int> input_size(tensor_in_and_out_dims);
std::vector<int> output_size(tensor_in_and_out_dims);
for (size_t i = 0; i < tensor_in_and_out_dims; ++i) {
input_size[i] = input_shape->GetDimSize(i);
}
for (size_t i = 0; i < tensor_in_and_out_dims; ++i) {
output_size[i] = static_cast<int>(std::floor(input_size[i] / pooling_ratio[i]));
    KERNEL_CHECK_FALSE((output_size[i] > 0), KERNEL_STATUS_PARAM_INVALID,
                       "FractionalMaxPool kernel output_size[%zu] cannot be 0.", i);
}
auto input_data = static_cast<T *>(ctx.Input(0)->GetData());
auto output_data = static_cast<T *>(output->GetData());
auto output_height_seq_tensor = static_cast<int64_t *>(row_pooling_sequence->GetData());
auto output_width_seq_tensor = static_cast<int64_t *>(col_pooling_sequence->GetData());
std::random_device rd;
std::mt19937 generator(rd());
if (deterministic) {
// If both seeds are not set when deterministic is true, force set seeds.
if ((seed == 0) && (seed2 == 0)) {
seed = generator();
seed2 = generator();
}
} else {
KERNEL_CHECK_FALSE(((seed == 0) && (seed2 == 0)), KERNEL_STATUS_PARAM_INVALID,
"Both seed and seed2 should be 0 if deterministic is false.");
}
if (seed == 0 && seed2 != 0) {
seed = seed2;
}
// Generate pooling sequence.
std::vector<int64_t> height_cum_seq;
std::vector<int64_t> width_cum_seq;
height_cum_seq = GeneratePoolingSequence(input_size[1], output_size[1], pseudo_random, seed);
width_cum_seq = GeneratePoolingSequence(input_size[2], output_size[2], pseudo_random, seed);
for (size_t i = 0; i < height_cum_seq.size(); ++i) {
*(output_height_seq_tensor + i) = height_cum_seq[i];
}
for (size_t i = 0; i < width_cum_seq.size(); ++i) {
*(output_width_seq_tensor + i) = width_cum_seq[i];
}
const int64_t height_max = input_size[1] - 1;
const int64_t width_max = input_size[2] - 1;
const int64_t depth_max = input_size[3] - 1;
uint64_t data_num = input->NumElements();
/**
* For both input and output,
* 0: batch
* 1: height / row
* 2: width / col
* 3: depth / channel
*/
if (data_num < kParallelDataNum) {
for (int64_t b = 0; b < input_size[0]; ++b) {
// height sequence.
for (size_t hs = 0; hs < height_cum_seq.size() - 1; ++hs) {
// height start and end.
const int64_t height_start = height_cum_seq[hs];
int64_t height_end = overlapping ? height_cum_seq[hs + 1] : height_cum_seq[hs + 1] - 1;
height_end = std::min(height_end, height_max);
// width sequence.
for (size_t ws = 0; ws < width_cum_seq.size() - 1; ++ws) {
for (int64_t c = 0; c <= depth_max; ++c) {
const int64_t out_offset = ((b * output_size[1] + hs) * output_size[2] + ws) * output_size[3] + c;
// Initializes the output tensor with MIN<T>.
T max = std::numeric_limits<T>::lowest();
// width start and end.
const int64_t width_start = width_cum_seq[ws];
int64_t width_end = overlapping ? width_cum_seq[ws + 1] : width_cum_seq[ws + 1] - 1;
width_end = std::min(width_end, width_max);
for (int64_t h = height_start; h <= height_end; ++h) {
for (int64_t w = width_start; w <= width_end; ++w) {
const int64_t in_offset = ((b * input_size[1] + h) * input_size[2] + w) * output_size[3] + c;
max = max > input_data[in_offset] ? max : input_data[in_offset];
}
}
*(output_data + out_offset) = max;
}
}
}
}
} else {
uint64_t height_cum_len = height_cum_seq.size() - 1;
uint32_t min_core_num = 1;
uint32_t max_core_num = std::max(min_core_num, aicpu::CpuKernelUtils::GetCPUNum(ctx));
if (max_core_num > height_cum_len) {
max_core_num = height_cum_len;
}
for (int64_t b = 0; b < input_size[0]; ++b) {
// height sequence.
auto sharder_fractionalmaxpool_index = [&](size_t start, size_t end) {
for (size_t hs = start; hs < end; ++hs) {
// height start and end.
const int64_t height_start = height_cum_seq[hs];
int64_t height_end = overlapping ? height_cum_seq[hs + 1] : height_cum_seq[hs + 1] - 1;
height_end = std::min(height_end, height_max);
// width sequence.
for (size_t ws = 0; ws < width_cum_seq.size() - 1; ++ws) {
for (int64_t c = 0; c <= depth_max; ++c) {
const int64_t out_offset = ((b * output_size[1] + hs) * output_size[2] + ws) * output_size[3] + c;
// Initializes the output tensor with MIN<T>.
T max = std::numeric_limits<T>::lowest();
// width start and end.
const int64_t width_start = width_cum_seq[ws];
int64_t width_end = overlapping ? width_cum_seq[ws + 1] : width_cum_seq[ws + 1] - 1;
width_end = std::min(width_end, width_max);
for (int64_t h = height_start; h <= height_end; ++h) {
for (int64_t w = width_start; w <= width_end; ++w) {
const int64_t in_offset = ((b * input_size[1] + h) * input_size[2] + w) * output_size[3] + c;
max = max > input_data[in_offset] ? max : input_data[in_offset];
}
}
*(output_data + out_offset) = max;
}
}
}
};
KERNEL_HANDLE_ERROR(CpuKernelUtils::ParallelFor(ctx, height_cum_len, height_cum_len / max_core_num,
sharder_fractionalmaxpool_index),
"FractionalMaxPool Index Compute failed");
}
}
return KERNEL_STATUS_OK;
}
uint32_t FractionalMaxPoolCpuKernel::Compute(CpuKernelContext &ctx) {
KERNEL_HANDLE_ERROR(FractionalMaxPoolParamCheck(ctx), "FractionalMaxPool check params failed.");
Tensor *input = ctx.Input(0);
auto data_type = input->GetDataType();
switch (data_type) {
case DT_FLOAT:
return DoCompute<float>(ctx);
case DT_DOUBLE:
return DoCompute<double>(ctx);
case DT_INT32:
return DoCompute<int32_t>(ctx);
case DT_INT64:
return DoCompute<int64_t>(ctx);
default:
KERNEL_LOG_ERROR("FractionalMaxPool kernel data type [%s] not support.", DTypeStr(data_type).c_str());
return KERNEL_STATUS_PARAM_INVALID;
}
return KERNEL_STATUS_OK;
}
REGISTER_CPU_KERNEL(kFractionalMaxPool, FractionalMaxPoolCpuKernel);
} // namespace aicpu

View File

@ -0,0 +1,20 @@
#ifndef AICPU_KERNELS_NORMALIZED_FRACTIONAL_MAX_POOL_H_
#define AICPU_KERNELS_NORMALIZED_FRACTIONAL_MAX_POOL_H_
#include "cpu_ops_kernel.h"
#include "cpu_types.h"
namespace aicpu {
class FractionalMaxPoolCpuKernel : public CpuKernel {
public:
FractionalMaxPoolCpuKernel() = default;
~FractionalMaxPoolCpuKernel() = default;
uint32_t Compute(CpuKernelContext &ctx) override;
private:
template <typename T>
uint32_t DoCompute(CpuKernelContext &ctx);
uint32_t FractionalMaxPoolParamCheck(CpuKernelContext &ctx);
};
} // namespace aicpu
#endif // AICPU_KERNELS_NORMALIZED_FRACTIONAL_MAX_POOL_H_

View File

@ -0,0 +1,242 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "fractional_max_pool_grad.h"
#include "Eigen/Dense"
#include "cpu_kernel_utils.h"
#include "utils/eigen_tensor.h"
#include "utils/kernel_util.h"
namespace {
const char *kFractionalMaxPoolGrad = "FractionalMaxPoolGrad";
const uint32_t k_InputNum = 5;
const uint32_t k_OutputNum = 1;
static const int kInvalidMaxPoolingIndex = -1;
const int64_t kParallelDataNum = 32 * 1024;
const uint32_t tensor_in_and_out_dims = 4;
} // namespace
namespace aicpu {
uint32_t FractionalMaxPoolGradCpuKernel::FractionalMaxPoolGradParamCheck(CpuKernelContext &ctx) {
KERNEL_HANDLE_ERROR(NormalCheck(ctx, k_InputNum, k_OutputNum),
"FractionalMaxPoolGrad check input and output number failed.");
Tensor *orig_input = ctx.Input(0);
Tensor *orig_output = ctx.Input(1);
Tensor *out_backprop = ctx.Input(2);
auto orig_input_shape = orig_input->GetTensorShape();
int32_t orig_input_dims = orig_input_shape->GetDims();
auto orig_output_shape = orig_output->GetTensorShape();
int32_t orig_output_dims = orig_output_shape->GetDims();
auto out_backprop_shape = out_backprop->GetTensorShape();
int32_t out_backprop_dims = out_backprop_shape->GetDims();
if (orig_input->GetDataType() != orig_output->GetDataType()) {
KERNEL_LOG_ERROR(
"The data type of the orig_output [%s] need be the same as the "
"orig_input [%s].",
DTypeStr(orig_output->GetDataType()).c_str(), DTypeStr(orig_input->GetDataType()).c_str());
return KERNEL_STATUS_PARAM_INVALID;
}
if (orig_input->GetDataType() != out_backprop->GetDataType()) {
KERNEL_LOG_ERROR(
"The data type of the out_backprop [%s] need be the same as the "
"orig_input [%s].",
DTypeStr(out_backprop->GetDataType()).c_str(), DTypeStr(orig_input->GetDataType()).c_str());
return KERNEL_STATUS_PARAM_INVALID;
}
KERNEL_CHECK_FALSE((orig_input_dims == tensor_in_and_out_dims), KERNEL_STATUS_PARAM_INVALID,
"orig_input should be a tensor of rank 4.");
KERNEL_CHECK_FALSE((orig_output_dims == tensor_in_and_out_dims), KERNEL_STATUS_PARAM_INVALID,
"orig_output should be a tensor of rank 4.");
KERNEL_CHECK_FALSE((out_backprop_dims == tensor_in_and_out_dims), KERNEL_STATUS_PARAM_INVALID,
"out_backprop should be a tensor of rank 4.");
return KERNEL_STATUS_OK;
}
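// Re-runs the forward fractional max pooling on the original input to recover the arg-max
// position of every output cell, then scatters each out_backprop element onto the input
// position that produced that maximum.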
template <typename T>
uint32_t FractionalMaxPoolGradCpuKernel::DoCompute(CpuKernelContext &ctx) {
typedef Eigen::Map<const Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic>> ConstEigenMatrixMap;
typedef Eigen::Map<Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic>> EigenMatrixMap;
typedef Eigen::Map<Eigen::Matrix<int64_t, Eigen::Dynamic, Eigen::Dynamic>> EigenIndexMatrixMap;
const Tensor *tensor_in = ctx.Input(0);
const Tensor *tensor_out = ctx.Input(1);
const Tensor *out_backprop = ctx.Input(2);
const Tensor *height_seq_tensor = ctx.Input(3);
const Tensor *width_seq_tensor = ctx.Input(4);
Tensor *output = ctx.Output(0);
auto output_data = static_cast<T *>(output->GetData());
AttrValue *overlapping_ = ctx.GetAttr("overlapping");
bool overlapping = (overlapping_ == nullptr) ? false : (overlapping_->GetBool());
auto tensor_in_shape = tensor_in->GetTensorShape();
auto tensor_out_shape = tensor_out->GetTensorShape();
std::vector<int64_t> input_size(tensor_in_and_out_dims);
std::vector<int64_t> output_size(tensor_in_and_out_dims);
for (uint32_t i = 0; i < tensor_in_and_out_dims; ++i) {
input_size[i] = tensor_in_shape->GetDimSize(i);
}
for (uint32_t i = 0; i < tensor_in_and_out_dims; ++i) {
output_size[i] = tensor_out_shape->GetDimSize(i);
}
int64_t tensor_in_num = tensor_in->NumElements();
int64_t tensor_out_num = tensor_out->NumElements();
std::vector<T> tensor_out_dup(tensor_out_num);
std::vector<int64_t> tensor_out_arg_max(tensor_out_num);
  for (int64_t i = 0; i < tensor_out_num; i++) {
tensor_out_dup[i] = std::numeric_limits<T>::lowest();
tensor_out_arg_max[i] = -1;
}
// Find arg_max for each tensor_out
ConstEigenMatrixMap tensor_in_mat(reinterpret_cast<T *>(tensor_in->GetData()), input_size[3],
input_size[2] * input_size[1] * input_size[0]);
EigenMatrixMap tensor_out_dup_mat(tensor_out_dup.data(), output_size[3],
output_size[2] * output_size[1] * output_size[0]);
EigenIndexMatrixMap tensor_out_arg_max_mat(tensor_out_arg_max.data(), output_size[3],
output_size[2] * output_size[1] * output_size[0]);
auto height_seq_tensor_shape = height_seq_tensor->GetTensorShape();
auto width_seq_tensor_shape = width_seq_tensor->GetTensorShape();
auto height_seq_tensor_data = static_cast<int64_t *>(height_seq_tensor->GetData());
auto width_seq_tensor_data = static_cast<int64_t *>(width_seq_tensor->GetData());
/**
* Now walk through the process of fractional max pooling again.
* For both input and output,
* 0: batch
* 1: height / row
* 2: width / col
* 3: depth / channel
*/
if (tensor_in_num < kParallelDataNum) {
const int64_t height_max = input_size[1] - 1;
const int64_t width_max = input_size[2] - 1;
for (int64_t b = 0; b < input_size[0]; ++b) {
// height sequence.
for (int64_t hs = 0; hs < height_seq_tensor_shape->GetDimSize(0) - 1; ++hs) {
// height start and end.
const int64_t height_start = *(height_seq_tensor_data + hs);
int64_t height_end = overlapping ? *(height_seq_tensor_data + hs + 1) : *(height_seq_tensor_data + hs + 1) - 1;
height_end = std::min(height_end, height_max);
// width sequence.
for (int64_t ws = 0; ws < width_seq_tensor_shape->GetDimSize(0) - 1; ++ws) {
const int64_t out_index = (b * output_size[1] + hs) * output_size[2] + ws;
// width start and end.
const int64_t width_start = *(width_seq_tensor_data + ws);
int64_t width_end = overlapping ? *(width_seq_tensor_data + ws + 1) : *(width_seq_tensor_data + ws + 1) - 1;
width_end = std::min(width_end, width_max);
for (int64_t h = height_start; h <= height_end; ++h) {
for (int64_t w = width_start; w <= width_end; ++w) {
const int64_t in_index = (b * input_size[1] + h) * input_size[2] + w;
// Walk through each channel (depth).
for (int64_t d = 0; d < input_size[3]; ++d) {
const T &input_ref = tensor_in_mat.coeffRef(d, in_index);
T &output_ref = tensor_out_dup_mat.coeffRef(d, out_index);
int64_t &out_arg_max_ref = tensor_out_arg_max_mat.coeffRef(d, out_index);
if (output_ref < input_ref || out_arg_max_ref == kInvalidMaxPoolingIndex) {
output_ref = input_ref;
                  int64_t input_offset = in_index * input_size[3] + d;
out_arg_max_ref = input_offset;
}
}
}
}
}
}
}
} else {
uint64_t height_seq_len = height_seq_tensor_shape->GetDimSize(0) - 1;
uint32_t min_core_num = 1;
uint32_t max_core_num = std::max(min_core_num, aicpu::CpuKernelUtils::GetCPUNum(ctx));
if (max_core_num > height_seq_len) {
max_core_num = height_seq_len;
}
const int64_t height_max = input_size[1] - 1;
const int64_t width_max = input_size[2] - 1;
for (int64_t b = 0; b < input_size[0]; ++b) {
// height sequence.
auto sharder_fractionalmaxpoolgrad_index = [&](size_t start, size_t end) {
for (size_t hs = start; hs < end; ++hs) {
// height start and end.
const int64_t height_start = *(height_seq_tensor_data + hs);
int64_t height_end =
overlapping ? *(height_seq_tensor_data + hs + 1) : *(height_seq_tensor_data + hs + 1) - 1;
height_end = std::min(height_end, height_max);
// width sequence.
for (int64_t ws = 0; ws < width_seq_tensor_shape->GetDimSize(0) - 1; ++ws) {
const int64_t out_index = (b * output_size[1] + hs) * output_size[2] + ws;
// width start and end.
const int64_t width_start = *(width_seq_tensor_data + ws);
int64_t width_end = overlapping ? *(width_seq_tensor_data + ws + 1) : *(width_seq_tensor_data + ws + 1) - 1;
width_end = std::min(width_end, width_max);
for (int64_t h = height_start; h <= height_end; ++h) {
for (int64_t w = width_start; w <= width_end; ++w) {
const int64_t in_index = (b * input_size[1] + h) * input_size[2] + w;
// Walk through each channel (depth).
for (int64_t d = 0; d < input_size[3]; ++d) {
const T &input_ref = tensor_in_mat.coeffRef(d, in_index);
T &output_ref = tensor_out_dup_mat.coeffRef(d, out_index);
int64_t &out_arg_max_ref = tensor_out_arg_max_mat.coeffRef(d, out_index);
if (output_ref < input_ref || out_arg_max_ref == kInvalidMaxPoolingIndex) {
output_ref = input_ref;
                    int64_t input_offset = in_index * input_size[3] + d;
out_arg_max_ref = input_offset;
}
}
}
}
}
}
};
KERNEL_HANDLE_ERROR(CpuKernelUtils::ParallelFor(ctx, height_seq_len, height_seq_len / max_core_num,
sharder_fractionalmaxpoolgrad_index),
"FractionalMaxPoolGrad Index Compute failed.");
}
}
  for (int64_t i = 0; i < tensor_in_num; i++) {
*(output_data + i) = 0;
}
auto out_backprop_data = static_cast<T *>(out_backprop->GetData());
  int64_t num_total_outputs = out_backprop->NumElements();
  int64_t num_total_inputs = output->NumElements();
  for (int64_t index = 0; index < num_total_outputs; ++index) {
    int64_t input_backprop_index = tensor_out_arg_max[index];
    KERNEL_CHECK_FALSE((input_backprop_index >= 0 && input_backprop_index < num_total_inputs),
                       KERNEL_STATUS_PARAM_INVALID,
                       "Invalid input backprop index [%ld], it must be in the range [0, %ld).",
                       input_backprop_index, num_total_inputs);
*(output_data + input_backprop_index) += *(out_backprop_data + index);
}
return KERNEL_STATUS_OK;
}
uint32_t FractionalMaxPoolGradCpuKernel::Compute(CpuKernelContext &ctx) {
KERNEL_HANDLE_ERROR(FractionalMaxPoolGradParamCheck(ctx), "Check FractionalMaxPoolGrad params failed.");
Tensor *input = ctx.Input(0);
auto data_type = input->GetDataType();
switch (data_type) {
case DT_FLOAT:
return DoCompute<float>(ctx);
case DT_DOUBLE:
return DoCompute<double>(ctx);
case DT_INT32:
return DoCompute<int32_t>(ctx);
case DT_INT64:
return DoCompute<int64_t>(ctx);
default:
KERNEL_LOG_ERROR("FractionalMaxPoolGrad kernel data type [%s] not support.", DTypeStr(data_type).c_str());
return KERNEL_STATUS_PARAM_INVALID;
}
return KERNEL_STATUS_OK;
}
REGISTER_CPU_KERNEL(kFractionalMaxPoolGrad, FractionalMaxPoolGradCpuKernel);
} // namespace aicpu
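The final accumulation above is a scatter-add keyed by the stored argmax indices. A minimal standalone sketch of that step, assuming plain C++ and a hypothetical name (ScatterAddGrad is illustrative, not part of the kernel API):

#include <cstdint>
#include <vector>

// Accumulate each upstream gradient into the input slot recorded by the
// forward max search, mirroring the tensor_out_arg_max loop in DoCompute.
std::vector<float> ScatterAddGrad(const std::vector<float> &out_backprop,
                                  const std::vector<int64_t> &arg_max,
                                  int64_t input_elements) {
  std::vector<float> input_backprop(input_elements, 0.0f);
  for (size_t i = 0; i < out_backprop.size(); ++i) {
    const int64_t idx = arg_max[i];
    if (idx >= 0 && idx < input_elements) {  // same bounds check as the kernel
      input_backprop[idx] += out_backprop[i];
    }
  }
  return input_backprop;
}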

View File

@ -0,0 +1,20 @@
#ifndef AICPU_KERNELS_NORMALIZED_FRACTIONAL_MAX_POOL_GRAD_H_
#define AICPU_KERNELS_NORMALIZED_FRACTIONAL_MAX_POOL_GRAD_H_
#include "cpu_ops_kernel.h"
#include "cpu_types.h"
namespace aicpu {
class FractionalMaxPoolGradCpuKernel : public CpuKernel {
public:
FractionalMaxPoolGradCpuKernel() = default;
~FractionalMaxPoolGradCpuKernel() = default;
uint32_t Compute(CpuKernelContext &ctx) override;
private:
template <typename T>
uint32_t DoCompute(CpuKernelContext &ctx);
uint32_t FractionalMaxPoolGradParamCheck(CpuKernelContext &ctx);
};
} // namespace aicpu
#endif // AICPU_KERNELS_NORMALIZED_FRACTIONAL_MAX_POOL_GRAD_H_

View File

@ -0,0 +1,198 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "fractional_max_pool_grad_with_fixed_ksize.h"
#include <cmath>
#include <limits>
#include <vector>
#include "Eigen/Dense"
#include "cpu_kernel_utils.h"
#include "kernel_log.h"
#include "status.h"
#include "utils/eigen_tensor.h"
#include "utils/kernel_util.h"
namespace {
const uint32_t kInputNum = 3;
const uint32_t kOutputNum = 1;
const char *kFractionalMaxPoolGradWithFixedKsize = "FractionalMaxPoolGradWithFixedKsize";
constexpr int64_t kParallelDataNums = 128 * 1024;
#define FRACTIONALMAXPOOLGRADWITHFIXEDKSIZE_COMPUTE_CASE(DTYPE, TYPE, OUT_BACKPROP, ARGMAX, DATA_NUMS, N_SIZE, C_SIZE, \
INPUT_H, INPUT_W, OUTPUT_H, OUTPUT_W, CTX) \
case (DTYPE): { \
uint32_t result = FractionalMaxPoolGradWithFixedKsizeCompute<TYPE>( \
OUT_BACKPROP, ARGMAX, DATA_NUMS, N_SIZE, C_SIZE, INPUT_H, INPUT_W, OUTPUT_H, OUTPUT_W, CTX); \
if (result != KERNEL_STATUS_OK) { \
KERNEL_LOG_ERROR("FractionalMaxPoolGradWithFixedKsize kernel compute failed."); \
return result; \
} \
break; \
}
} // namespace
namespace aicpu {
uint32_t FractionalMaxPoolGradWithFixedKsize::Compute(CpuKernelContext &ctx) {
KERNEL_HANDLE_ERROR(NormalCheck(ctx, kInputNum, kOutputNum),
"FractionalMaxPoolGradWithFixedKsize check input and "
"output number failed.");
Tensor *origin_input = ctx.Input(0);
int64_t data_nums = origin_input->NumElements();
auto origin_input_shape = origin_input->GetTensorShape();
int32_t origin_input_dim = origin_input_shape->GetDims();
KERNEL_CHECK_FALSE(origin_input_dim == 4, KERNEL_STATUS_PARAM_INVALID,
"The dim of input[origin_input] must be 4, but got [%d].", origin_input_dim);
Tensor *out_backprop = ctx.Input(1);
auto out_backprop_shape = out_backprop->GetTensorShape();
int32_t out_backprop_dim = out_backprop_shape->GetDims();
KERNEL_CHECK_FALSE(out_backprop_dim == 4, KERNEL_STATUS_PARAM_INVALID,
"The dim of input[out_backprop] must be 4, but got [%d].", out_backprop_dim);
Tensor *argmax = ctx.Input(2);
auto argmax_shape = argmax->GetTensorShape();
int32_t argmax_dim = argmax_shape->GetDims();
KERNEL_CHECK_FALSE(argmax_dim == 4, KERNEL_STATUS_PARAM_INVALID, "The dim of input[argmax] must be 4, but got [%d].",
argmax_dim);
std::vector<int64_t> out_backprop_dim_sizes = out_backprop_shape->GetDimSizes();
std::vector<int64_t> argmax_dim_sizes = argmax_shape->GetDimSizes();
KERNEL_CHECK_FALSE(out_backprop_dim_sizes == argmax_dim_sizes, KERNEL_STATUS_PARAM_INVALID,
"The shape of input[out_backprop] and input[argmax] must be equal.");
int64_t n_size = out_backprop_dim_sizes[0];
int64_t c_size = out_backprop_dim_sizes[1];
int64_t input_h = out_backprop_dim_sizes[2];
int64_t input_w = out_backprop_dim_sizes[3];
std::vector<int64_t> origin_input_dim_sizes = origin_input_shape->GetDimSizes();
KERNEL_CHECK_FALSE(origin_input_dim_sizes[0] == n_size, KERNEL_STATUS_PARAM_INVALID,
"The first dim of input[origin_input] and "
"input[out_backprop] must be equal,"
"but got origin_input=[%d] and out_backprop=[%d].",
origin_input_dim_sizes[0], n_size);
KERNEL_CHECK_FALSE(origin_input_dim_sizes[1] == c_size, KERNEL_STATUS_PARAM_INVALID,
"The second dim of input[origin_input] and "
"input[out_backprop] must be equal,"
"but got origin_input=[%d] and out_backprop=[%d].",
origin_input_dim_sizes[1], c_size);
int64_t output_h = origin_input_dim_sizes[2];
int64_t output_w = origin_input_dim_sizes[3];
auto data_type = out_backprop->GetDataType();
switch (data_type) {
FRACTIONALMAXPOOLGRADWITHFIXEDKSIZE_COMPUTE_CASE(DT_FLOAT16, Eigen::half, out_backprop, argmax, data_nums, n_size,
c_size, input_h, input_w, output_h, output_w, ctx)
FRACTIONALMAXPOOLGRADWITHFIXEDKSIZE_COMPUTE_CASE(DT_FLOAT, float, out_backprop, argmax, data_nums, n_size, c_size,
input_h, input_w, output_h, output_w, ctx)
FRACTIONALMAXPOOLGRADWITHFIXEDKSIZE_COMPUTE_CASE(DT_DOUBLE, double, out_backprop, argmax, data_nums, n_size, c_size,
input_h, input_w, output_h, output_w, ctx)
FRACTIONALMAXPOOLGRADWITHFIXEDKSIZE_COMPUTE_CASE(DT_INT32, int32_t, out_backprop, argmax, data_nums, n_size, c_size,
input_h, input_w, output_h, output_w, ctx)
FRACTIONALMAXPOOLGRADWITHFIXEDKSIZE_COMPUTE_CASE(DT_INT64, int64_t, out_backprop, argmax, data_nums, n_size, c_size,
input_h, input_w, output_h, output_w, ctx)
default:
KERNEL_LOG_ERROR(
"FractionalMaxPoolGradWithFixedKsize kernel input[out_backprop] type "
"[%s] not support.",
DTypeStr(data_type).c_str());
return KERNEL_STATUS_PARAM_INVALID;
}
return KERNEL_STATUS_OK;
}
template <typename T>
uint32_t FractionalMaxPoolGradWithFixedKsize::FractionalMaxPoolGradWithFixedKsizeCompute(
Tensor *out_backprop, Tensor *argmax, const int64_t data_nums, const int n_size, const int c_size, const int input_h,
const int input_w, const int output_h, const int output_w, CpuKernelContext &ctx) {
T *out_backprop_addr = reinterpret_cast<T *>(out_backprop->GetData());
int64_t *argmax_addr = reinterpret_cast<int64_t *>(argmax->GetData());
Tensor *y = ctx.Output(0);
T *y_addr = reinterpret_cast<T *>(y->GetData());
if (data_nums < kParallelDataNums || n_size == 1) {
for (int n = 0; n < n_size; n++) {
T *out_backprop_single_batch_addr = out_backprop_addr + n * c_size * input_h * input_w;
int64_t *argmax_single_batch_addr = argmax_addr + n * c_size * input_h * input_w;
T *y_single_batch_addr = y_addr + n * c_size * output_h * output_w;
ComputeSingleBatch<T>(out_backprop_single_batch_addr, argmax_single_batch_addr, y_single_batch_addr, c_size,
input_h, input_w, output_h, output_w);
}
} else {
uint32_t min_core_num = 1;
uint32_t max_core_num = std::max(min_core_num, aicpu::CpuKernelUtils::GetCPUNum(ctx));
if (max_core_num > (uint32_t)n_size) {
max_core_num = n_size;
}
auto shared_computeN = [&](size_t start, size_t end) {
for (size_t n = start; n < end; n++) {
T *out_backprop_single_batch_addr = out_backprop_addr + n * c_size * input_h * input_w;
int64_t *argmax_single_batch_addr = argmax_addr + n * c_size * input_h * input_w;
T *y_single_batch_addr = y_addr + n * c_size * output_h * output_w;
ComputeSingleBatch<T>(out_backprop_single_batch_addr, argmax_single_batch_addr, y_single_batch_addr, c_size,
input_h, input_w, output_h, output_w);
}
};
uint32_t ret = CpuKernelUtils::ParallelFor(ctx, n_size, n_size / max_core_num, shared_computeN);
if (ret != KERNEL_STATUS_OK) {
KERNEL_LOG_ERROR("CpuKernelUtils::ParallelFor shared_computeN failed.");
return KERNEL_STATUS_INNER_ERROR;
}
}
return KERNEL_STATUS_OK;
}
template <typename T>
uint32_t FractionalMaxPoolGradWithFixedKsize::ComputeSingleBatch(T *out_backprop_single_batch_addr,
int64_t *argmax_single_batch_addr,
T *y_single_batch_addr, const int c_size,
const int input_h, const int input_w,
const int output_h, const int output_w) {
for (int plane = 0; plane < c_size; plane++) {
T *out_backprop_plane_addr = out_backprop_single_batch_addr + plane * input_h * input_w;
int64_t *argmax_plane_addr = argmax_single_batch_addr + plane * input_h * input_w;
T *y_plane_addr = y_single_batch_addr + plane * output_h * output_w;
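// Zero this gradient plane of y first, then scatter-add each out_backprop value to the
// position recorded in argmax for the same plane.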
for (int i = 0; i < output_h; i++) {
for (int j = 0; j < output_w; j++) {
y_plane_addr[i * output_w + j] = static_cast<T>(0);
}
}
for (int h = 0; h < input_h; h++) {
for (int w = 0; w < input_w; w++) {
int input_index = h * input_w + w;
KERNEL_CHECK_FALSE((input_index >= 0 && input_index < input_h * input_w), KERNEL_STATUS_PARAM_INVALID,
"The input_index[%d] out of the length of argmax.", input_index);
int output_index = argmax_plane_addr[input_index];
KERNEL_CHECK_FALSE((output_index >= 0 && output_index < output_h * output_w), KERNEL_STATUS_PARAM_INVALID,
"The output_index[%d] out of the length of y.", output_index);
y_plane_addr[output_index] += out_backprop_plane_addr[input_index];
}
}
}
return KERNEL_STATUS_OK;
}
REGISTER_CPU_KERNEL(kFractionalMaxPoolGradWithFixedKsize, FractionalMaxPoolGradWithFixedKsize);
} // namespace aicpu

View File

@ -0,0 +1,42 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef AICPU_KERNELS_NORMALIZED_FRACTIONAL_MAX_POOL_GRAD_WITH_FIXED_KSIZE_H_
#define AICPU_KERNELS_NORMALIZED_FRACTIONAL_MAX_POOL_GRAD_WITH_FIXED_KSIZE_H_
#include <vector>
#include "cpu_ops_kernel.h"
#include "cpu_types.h"
namespace aicpu {
class FractionalMaxPoolGradWithFixedKsize : public CpuKernel {
public:
FractionalMaxPoolGradWithFixedKsize() = default;
~FractionalMaxPoolGradWithFixedKsize() override = default;
uint32_t Compute(CpuKernelContext &ctx) override;
private:
template <typename T>
uint32_t FractionalMaxPoolGradWithFixedKsizeCompute(Tensor *out_backprop, Tensor *argmax, const int64_t data_nums,
const int n_size, const int c_size, const int input_h,
const int input_w, const int output_h, const int output_w,
CpuKernelContext &ctx);
template <typename T>
uint32_t ComputeSingleBatch(T *out_backprop_single_batch_addr, int64_t *argmax_single_batch_addr,
T *y_single_batch_addr, const int c_size, const int input_h, const int input_w,
const int output_h, const int output_w);
};
} // namespace aicpu
#endif

View File

@ -0,0 +1,160 @@
/**
* Copyright (c) Huawei Technologies Co., Ltd. 2022. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "gcd.h"
#include <set>
#include "cpu_kernel_utils.h"
#include "utils/eigen_tensor.h"
#include "utils/kernel_util.h"
namespace {
const uint32_t kGcdOutputNum = 1;
const uint32_t kGcdInputNum = 2;
const char *kGcd = "Gcd";
// when input data size is more than kParallelDataNum, use Parallel func
const int64_t kParallelDataNum = 2 * 1024;
const int64_t kParallelDataNumMid = 16 * 1024;
const int32_t kInput_32_32 = 3;
const int32_t kInput_32_64 = 2;
const int32_t kInput_64_32 = 1;
const int32_t kInput_64_64 = 0;
} // namespace
namespace aicpu {
// Simple recursive Gcd.
template <class T>
T elewise_gcd(T a, T b) {
if (b == 0) {
return a;
}
return elewise_gcd(b, a % b);
}
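// Encode the input dtypes into two bits (bit 1: x1 is int32, bit 0: x2 is int32), giving
// kInput_64_64 = 0 up to kInput_32_32 = 3; the output must be int32 only when both inputs
// are int32, otherwise int64.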
uint32_t GcdIOTypeCheck(CpuKernelContext &ctx, int32_t &dual_types) {
Tensor *x1 = ctx.Input(kFirstInputIndex);
Tensor *x2 = ctx.Input(kSecondInputIndex);
Tensor *y = ctx.Output(kFirstOutputIndex);
const std::set<DataType> supported_types{DT_INT32, DT_INT64};
auto x1_type = x1->GetDataType();
auto x2_type = x2->GetDataType();
auto y_type = y->GetDataType();
KERNEL_CHECK_FALSE(supported_types.count(x1_type) != 0, KERNEL_STATUS_PARAM_INVALID,
"[Gcd] input x1 data type [%s] is not supported.", DTypeStr(x1_type).c_str());
KERNEL_CHECK_FALSE(supported_types.count(x2_type) != 0, KERNEL_STATUS_PARAM_INVALID,
"[Gcd] input x2 data type [%s] is not supported.", DTypeStr(x2_type).c_str());
int32_t x1_is_i32 = static_cast<int32_t>(x1_type == DT_INT32) << 1;
int32_t x2_is_i32 = static_cast<int32_t>(x2_type == DT_INT32);
int32_t _dual_types = x1_is_i32 | x2_is_i32;
switch (_dual_types) {
case kInput_64_64:
case kInput_64_32:
case kInput_32_64:
KERNEL_CHECK_FALSE(y_type == DT_INT64, KERNEL_STATUS_PARAM_INVALID,
"[Gcd] output y data type [%s] is not supported.", DTypeStr(y_type).c_str());
dual_types = _dual_types;
break;
case kInput_32_32:
KERNEL_CHECK_FALSE(y_type == DT_INT32, KERNEL_STATUS_PARAM_INVALID,
"[Gcd] output y data type [%s] is not supported.", DTypeStr(y_type).c_str());
dual_types = _dual_types;
break;
default:
KERNEL_LOG_ERROR("[Gcd] input data type tuple is not supported.");
return KERNEL_STATUS_PARAM_INVALID;
}
return KERNEL_STATUS_OK;
}
template <class T1, class T2, class T3>
uint32_t GcdElewiseCompute(CpuKernelContext &ctx, const T1 *x1_ptr, const T2 *x2_ptr, T3 *y_ptr, Bcast &bcast) {
int64_t data_num = ctx.Output(kFirstOutputIndex)->NumElements();
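// For each flat output index, map back to the (possibly broadcast) x1/x2 positions, take
// absolute values after promotion to the output type, and reduce with elewise_gcd.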
auto gcd_shard = [&](int64_t start, int64_t end) {
for (int64_t i = start; i < end; ++i) {
T3 x1_ele_abs = std::abs(static_cast<T3>(x1_ptr[bcast.GetBroadcastXIndex(i)]));
T3 x2_ele_abs = std::abs(static_cast<T3>(x2_ptr[bcast.GetBroadcastYIndex(i)]));
y_ptr[i] = elewise_gcd(x1_ele_abs, x2_ele_abs);
}
};
if (data_num >= kParallelDataNum) {
uint32_t min_core_num = 1;
uint32_t max_core_num = std::max(min_core_num, aicpu::CpuKernelUtils::GetCPUNum(ctx) - kResvCpuNum);
if (data_num <= kParallelDataNumMid) {
max_core_num = std::min(max_core_num, 4U); // up to 4 cpu cores
}
if (max_core_num > data_num) {
max_core_num = data_num;
}
if (max_core_num == 0) {
KERNEL_LOG_ERROR("[Gcd] max_core_num is 0, please check the cpu num.");
return KERNEL_STATUS_PARAM_INVALID;
}
uint32_t ret = CpuKernelUtils::ParallelFor(ctx, data_num, data_num / max_core_num, gcd_shard);
if (ret != KERNEL_STATUS_OK) {
KERNEL_LOG_ERROR("[Gcd] Gcd Compute failed.");
return ret;
}
} else {
gcd_shard(0, data_num);
}
return KERNEL_STATUS_OK;
}
template <class T1, class T2, class T3>
uint32_t GcdCompute(CpuKernelContext &ctx) {
Tensor *x1 = ctx.Input(kFirstInputIndex);
Tensor *x2 = ctx.Input(kSecondInputIndex);
Tensor *y = ctx.Output(kFirstOutputIndex);
const T1 *x1_ptr = reinterpret_cast<const T1 *>(x1->GetData());
const T2 *x2_ptr = reinterpret_cast<const T2 *>(x2->GetData());
T3 *y_ptr = reinterpret_cast<T3 *>(y->GetData());
auto x1_shape = x1->GetTensorShape()->GetDimSizes();
auto x2_shape = x2->GetTensorShape()->GetDimSizes();
Bcast bcast(x1_shape, x2_shape);
if (bcast.IsValid()) {
return GcdElewiseCompute<T1, T2, T3>(ctx, x1_ptr, x2_ptr, y_ptr, bcast);
} else {
KERNEL_LOG_ERROR("[Gcd] broadcast failed.");
return KERNEL_STATUS_PARAM_INVALID;
}
}
uint32_t GcdCpuKernel::Compute(CpuKernelContext &ctx) {
// check params
KERNEL_HANDLE_ERROR(NormalCheck(ctx, kGcdInputNum, kGcdOutputNum), "[Gcd] check input and output number failed.");
int32_t dual_types = static_cast<int32_t>(-1);
KERNEL_HANDLE_ERROR(GcdIOTypeCheck(ctx, dual_types), "[Gcd] check data type failed.");
switch (dual_types) {
case kInput_64_64:
return GcdCompute<int64_t, int64_t, int64_t>(ctx);
break;
case kInput_64_32:
return GcdCompute<int64_t, int32_t, int64_t>(ctx);
break;
case kInput_32_64:
return GcdCompute<int32_t, int64_t, int64_t>(ctx);
break;
case kInput_32_32:
return GcdCompute<int32_t, int32_t, int32_t>(ctx);
break;
default:
KERNEL_LOG_ERROR("[Gcd] input data type tuple is not supported.");
return KERNEL_STATUS_PARAM_INVALID;
}
return KERNEL_STATUS_OK;
}
REGISTER_CPU_KERNEL(kGcd, GcdCpuKernel);
} // namespace aicpu
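Per element, after broadcasting and promotion to the output type, the kernel reduces |x1| and |x2| with the recursive elewise_gcd above. A minimal iterative sketch of the same reduction, with a hypothetical name (ElementGcd) and plain C++ only:

#include <cstdint>

// Iterative equivalent of the recursive elewise_gcd, applied to the absolute
// values of the already broadcast and promoted inputs.
int64_t ElementGcd(int64_t a, int64_t b) {
  a = a < 0 ? -a : a;
  b = b < 0 ? -b : b;
  while (b != 0) {
    const int64_t r = a % b;
    a = b;
    b = r;
  }
  return a;  // ElementGcd(0, 0) == 0, matching elewise_gcd
}
// Example: ElementGcd(-12, 18) == 6.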

View File

@ -0,0 +1,32 @@
/**
* Copyright (c) Huawei Technologies Co., Ltd. 2022. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef AICPU_KERNELS_NORMALIZED_GCD_H_
#define AICPU_KERNELS_NORMALIZED_GCD_H_
#include "cpu_ops_kernel.h"
#include "utils/bcast.h"
namespace aicpu {
class GcdCpuKernel : public CpuKernel {
public:
~GcdCpuKernel() override = default;
protected:
uint32_t Compute(CpuKernelContext &ctx) override;
};
} // namespace aicpu
#endif

View File

@ -0,0 +1,268 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "geqrf.h"
#include <cmath>
#include <complex>
#include "cpu_kernel_utils.h"
#include "kernel_log.h"
#include "status.h"
#include "utils/eigen_tensor.h"
#include "utils/kernel_util.h"
using namespace std;
namespace {
const char *kGeqrf = "Geqrf";
const uint32_t kInputNum = 1;
const uint32_t kOutputNum = 2;
} // namespace
namespace aicpu {
uint32_t GeqrfCpuKernel::Compute(CpuKernelContext &ctx) {
if (NormalCheck(ctx, kInputNum, kOutputNum) != KERNEL_STATUS_OK) {
return KERNEL_STATUS_PARAM_INVALID;
}
DataType input0_data_type = ctx.Input(0)->GetDataType();
uint32_t ret = KERNEL_STATUS_PARAM_INVALID;
switch (input0_data_type) {
case DT_FLOAT16:
ret = DoCompute<Eigen::half>(ctx);
break;
case DT_FLOAT:
ret = DoCompute<float>(ctx);
break;
case DT_DOUBLE:
ret = DoCompute<double>(ctx);
break;
case DT_COMPLEX64:
ret = DoComputeC<float>(ctx);
break;
case DT_COMPLEX128:
ret = DoComputeC<double>(ctx);
break;
default:
KERNEL_LOG_ERROR("Unsupported input data type[%s]", DTypeStr(input0_data_type).c_str());
return KERNEL_STATUS_PARAM_INVALID;
}
return ret;
}
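// Larfg generates an elementary Householder reflector for the column segment starting at
// A[vm][vn] (LAPACK ?larfg style): on return A[vm][vn] holds beta, the entries below it hold
// the scaled reflector vector, and *tau holds the scalar factor.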
template <typename T>
void GeqrfCpuKernel::Larfg(int n, int vm, int vn, T **A, T *tau) {
T zero = static_cast<T>(0);
if (n <= 1) {
*tau = zero;
return;
}
T xnorm = zero;
for (int i = vm + 1; i < vm + n; i++) {
xnorm = xnorm + A[i][vn] * A[i][vn];
}
xnorm = sqrt(xnorm);
if (xnorm == zero) {
*tau = zero;
return;
} else {
T beta = sqrt(A[vm][vn] * A[vm][vn] + xnorm * xnorm);
if (A[vm][vn] > zero) {
beta = -beta;
}
*tau = (beta - (A[vm][vn])) / beta;
auto scal = (A[vm][vn]) - beta;
for (int i = vm + 1; i < vm + n; i++) {
A[i][vn] /= scal;
}
A[vm][vn] = beta;
}
}
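// Larf applies the reflector stored in column cn - 1 (rows cm .. cm + m - 1) to the trailing
// block A[cm.., cn..] from the left: work = v^T * C, then C -= tau * v * work^T
// (LAPACK ?larf with side = 'Left').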
template <typename T>
void GeqrfCpuKernel::Larf(int m, int n, T **A, T *tau, int cm, int cn) {
if (m <= 0 || n <= 0) {
return;
}
T *work = new T[n]();
for (int i = 0; i < m; i++) {
for (int j = 0; j < n; j++) {
work[j] += A[cm + i][cn - 1] * A[cm + i][cn + j];
}
}
for (int i = 0; i < m; i++) {
for (int j = 0; j < n; j++) {
A[i + cm][j + cn] -= (*tau) * A[cm + i][cn - 1] * work[j];
}
}
delete[] work;
}
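// Geqrf is the unblocked Householder QR factorization (LAPACK ?geqr2): for each of the
// k = min(m, n) columns, generate a reflector with Larfg and apply it to the trailing
// submatrix with Larf, temporarily setting the diagonal entry to 1 while the reflector is applied.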
template <typename T>
void GeqrfCpuKernel::Geqrf(int m, int n, T **A, T *tau) {
if (m < 0 || n < 0) {
return;
}
int k = std::min(m, n);
T one = static_cast<T>(1);
for (int i = 0; i < k; i++) {
Larfg<T>(m - i, i, i, A, tau + i);
T aii = A[i][i];
A[i][i] = one;
Larf<T>(m - i, n - i - 1, A, tau + i, i, i + 1);
A[i][i] = aii;
}
}
template <typename T>
void GeqrfCpuKernel::CLarfg(int n, int vm, int vn, complex<T> **A, complex<T> *tau) {
complex<T> one = complex<T>(1, 0);
complex<T> zero = complex<T>(0, 0);
if (n <= 0) {
*tau = zero;
return;
}
T xnorm = 0;
for (int i = vm + 1; i < vm + n; i++) {
xnorm = xnorm + norm(A[i][vn]);
}
xnorm = sqrt(xnorm);
T alphr = A[vm][vn].real();
T alphi = A[vm][vn].imag();
if (xnorm == 0 && alphi == 0) {
*tau = zero;
} else {
T beta;
beta = sqrt(alphr * alphr + alphi * alphi + xnorm * xnorm);
if (A[vm][vn].real() > 0) {
beta = -beta;
}
*tau = complex<T>((beta - alphr) / beta, -alphi / beta);
A[vm][vn] = one / (A[vm][vn] - beta);
for (int i = vm + 1; i < vm + n; i++) {
A[i][vn] *= A[vm][vn];
}
A[vm][vn] = beta;
}
}
template <typename T>
void GeqrfCpuKernel::CLarf(int m, int n, complex<T> **A, complex<T> *tau, int cm, int cn) {
if (m <= 0 || n <= 0) {
return;
}
complex<T> zero = complex<T>(0, 0);
complex<T> *work = new complex<T>[n];
complex<T> temp = zero;
for (int j = 0; j < n; j++) {
for (int i = 0; i < m; i++) {
temp = temp + conj(A[i + cm][j + cn]) * A[cm + i][cn - 1];
}
work[j] = temp;
temp = zero;
}
for (int j = 0; j < n; j++) {
for (int i = 0; i < m; i++) {
A[i + cm][j + cn] = A[i + cm][j + cn] - conj(*tau) * A[cm + i][cn - 1] * conj(work[j]);
}
}
delete[] work;
}
template <typename T>
void GeqrfCpuKernel::CGeqrf(int m, int n, complex<T> **A, complex<T> *tau) {
if (m < 0 || n < 0) {
return;
}
int k = std::min(m, n);
complex<T> one = complex<T>(1, 0);
complex<T> aii;
for (int i = 0; i < k; i++) {
CLarfg<T>(m - i, i, i, A, (tau + i));
aii = A[i][i];
A[i][i] = one;
CLarf<T>(m - i, n - i - 1, A, tau + i, i, i + 1);
A[i][i] = aii;
}
}
template <typename T>
uint32_t GeqrfCpuKernel::DoCompute(CpuKernelContext &ctx) {
auto input0_tensor = ctx.Input(0);
auto input0_tensor_shape = input0_tensor->GetTensorShape();
int32_t dim = input0_tensor_shape->GetDims();
if (dim != kOutputNum) {
KERNEL_LOG_ERROR("The input matrix must have dimension = 2");
return KERNEL_STATUS_PARAM_INVALID;
}
std::vector<int64_t> input0_dims = input0_tensor_shape->GetDimSizes();
const int32_t m = input0_dims[0];
const int32_t n = input0_dims[1];
auto input_m = reinterpret_cast<T *>(ctx.Input(0)->GetData());
auto output_r = reinterpret_cast<T *>(ctx.Output(0)->GetData());
auto output_tau = reinterpret_cast<T *>(ctx.Output(1)->GetData());
T **A = new T *[m];
for (int i = 0; i < m; i++) {
A[i] = new T[n];
}
for (int i = 0; i < m; i++) {
for (int j = 0; j < n; j++) {
A[i][j] = *(input_m + i * n + j);
}
}
Geqrf<T>(m, n, A, output_tau);
for (int i = 0; i < m; i++) {
for (int j = 0; j < n; j++) {
*(output_r + i * n + j) = A[i][j];
}
}
// Release the temporary working copy of the matrix.
for (int i = 0; i < m; i++) {
delete[] A[i];
}
delete[] A;
return KERNEL_STATUS_OK;
}
template <typename T>
uint32_t GeqrfCpuKernel::DoComputeC(CpuKernelContext &ctx) {
auto input0_tensor = ctx.Input(0);
auto input0_tensor_shape = input0_tensor->GetTensorShape();
int32_t dim = input0_tensor_shape->GetDims();
if (dim != kOutputNum) {
KERNEL_LOG_ERROR("The input matrix must have dimension = 2");
return KERNEL_STATUS_PARAM_INVALID;
}
std::vector<int64_t> input0_dims = input0_tensor_shape->GetDimSizes();
const int32_t m = input0_dims[0];
const int32_t n = input0_dims[1];
auto input_m = reinterpret_cast<complex<T> *>(ctx.Input(0)->GetData());
auto output_r = reinterpret_cast<complex<T> *>(ctx.Output(0)->GetData());
auto output_tau = reinterpret_cast<complex<T> *>(ctx.Output(1)->GetData());
complex<T> **A = new complex<T> *[m];
for (int i = 0; i < m; i++) {
A[i] = new complex<T>[n];
}
for (int i = 0; i < m; i++) {
for (int j = 0; j < n; j++) {
A[i][j] = *(input_m + i * n + j);
}
}
CGeqrf<T>(m, n, A, output_tau);
for (int i = 0; i < m; i++) {
for (int j = 0; j < n; j++) {
*(output_r + i * n + j) = A[i][j];
}
}
// Release the temporary working copy of the matrix.
for (int i = 0; i < m; i++) {
delete[] A[i];
}
delete[] A;
return KERNEL_STATUS_OK;
}
REGISTER_CPU_KERNEL(kGeqrf, GeqrfCpuKernel);
} // namespace aicpu

View File

@ -0,0 +1,55 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef AICPU_KERNELS_NORMALIZED_GEQRF_H_
#define AICPU_KERNELS_NORMALIZED_GEQRF_H_
#include <complex>
#include "cpu_ops_kernel.h"
namespace aicpu {
class GeqrfCpuKernel : public CpuKernel {
public:
GeqrfCpuKernel() = default;
~GeqrfCpuKernel() = default;
uint32_t Compute(CpuKernelContext &ctx) override;
private:
template <typename T>
void Larfg(int n, int vm, int vn, T **A, T *tau);
template <typename T>
void Larf(int m, int n, T **A, T *tau, int cm, int cn);
template <typename T>
void Geqrf(int m, int n, T **A, T *tau);
template <typename T>
void CLarfg(int n, int vm, int vn, std::complex<T> **A, std::complex<T> *tau);
template <typename T>
void CLarf(int m, int n, std::complex<T> **A, std::complex<T> *tau, int cm, int cn);
template <typename T>
void CGeqrf(int m, int n, std::complex<T> **A, std::complex<T> *tau);
template <typename T>
uint32_t DoCompute(CpuKernelContext &ctx);
template <typename T>
uint32_t DoComputeC(CpuKernelContext &ctx);
};
} // namespace aicpu
#endif // AICPU_KERNELS_NORMALIZED_GEQRF_H_

View File

@ -0,0 +1,89 @@
/**
* Copyright (c) Huawei Technologies Co., Ltd. 2022. All right reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "hard_sigmoid.h"
#include <algorithm>
#include "cpu_kernel_utils.h"
#include "utils/eigen_tensor.h"
#include "utils/kernel_util.h"
namespace {
const uint32_t kOutputNum = 1;
const uint32_t kInputNum = 1;
const char *const kHardSigmoid = "HardSigmoid";
const int64_t kParallelDataNums = 16 * 1024;
const float alpha = 0.16666666;
const float beta = 0.5;
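// HardSigmoid(x) = clamp(alpha * x + beta, 0, 1) with alpha = 1/6 and beta = 1/2; the compute
// below uses the equivalent min(max(x + 3, 0), 6) / 6 form.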
#define HARD_SIGMOID_COMPUTE_CASE(DTYPE, TYPE, CTX) \
case (DTYPE): { \
uint32_t result = HardSigmoidCompute<TYPE>(CTX); \
if (result != KERNEL_STATUS_OK) { \
KERNEL_LOG_ERROR("HardSigmoid kernel compute failed."); \
return result; \
} \
break; \
}
} // namespace
namespace aicpu {
uint32_t HardSigmoidCpuKernel::Compute(CpuKernelContext &ctx) {
KERNEL_HANDLE_ERROR(NormalCheck(ctx, kInputNum, kOutputNum), "[%s] check input and output failed.", kHardSigmoid);
DataType data_type = ctx.Input(0)->GetDataType();
switch (data_type) {
HARD_SIGMOID_COMPUTE_CASE(DT_FLOAT16, Eigen::half, ctx)
HARD_SIGMOID_COMPUTE_CASE(DT_FLOAT, float, ctx)
HARD_SIGMOID_COMPUTE_CASE(DT_DOUBLE, double, ctx)
default:
KERNEL_LOG_ERROR("HardSigmoid kernel data type [%s] not support.", DTypeStr(data_type).c_str());
return KERNEL_STATUS_PARAM_INVALID;
}
return KERNEL_STATUS_OK;
}
template <typename T>
uint32_t HardSigmoidCpuKernel::HardSigmoidCompute(const CpuKernelContext &ctx) {
auto input_x = reinterpret_cast<T *>(ctx.Input(0)->GetData());
auto output_y = reinterpret_cast<T *>(ctx.Output(0)->GetData());
int64_t data_num = ctx.Input(0)->NumElements();
int64_t data_size = data_num * static_cast<int64_t>(sizeof(T));
const T zero = static_cast<T>(0);
const T three = static_cast<T>(3);
const T six = static_cast<T>(6);
if (data_size <= kParallelDataNums) {
for (int64_t i = 0; i < data_num; i++) {
*(output_y + i) = std::min(std::max(*(input_x + i) + three, zero), six) / six;
}
} else {
uint32_t min_core_num = 1;
int64_t max_core_num = std::max(min_core_num, aicpu::CpuKernelUtils::GetCPUNum(ctx) - kResvCpuNum);
if (max_core_num > data_num) {
max_core_num = data_num;
}
int64_t perUnitSize = max_core_num > 0 ? data_num / max_core_num : data_num;
auto shard_hard_sigmoid = [&](size_t start, size_t end) {
for (size_t i = start; i < end; i++) {
*(output_y + i) = std::min(std::max(*(input_x + i) + three, zero), six) / six;
}
};
KERNEL_HANDLE_ERROR(CpuKernelUtils::ParallelFor(ctx, data_num, perUnitSize, shard_hard_sigmoid),
"HardSigmoid Compute failed.");
}
return KERNEL_STATUS_OK;
}
REGISTER_CPU_KERNEL(kHardSigmoid, HardSigmoidCpuKernel);
} // namespace aicpu
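For reference, a minimal sketch of the activation in both the clamp form used by the compute above and the alpha * x + beta form the constants describe; the function names are hypothetical and this assumes plain C++:

#include <algorithm>

// Clamp form used by the kernel.
float HardSigmoidRef(float x) {
  return std::min(std::max(x + 3.0f, 0.0f), 6.0f) / 6.0f;
}

// Equivalent affine form: clamp(alpha * x + beta, 0, 1).
float HardSigmoidAffine(float x) {
  const float alpha = 1.0f / 6.0f;
  const float beta = 0.5f;
  return std::min(std::max(alpha * x + beta, 0.0f), 1.0f);
}

// e.g. HardSigmoidRef(0.0f) == 0.5f and HardSigmoidRef(3.0f) == 1.0f.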

View File

@ -0,0 +1,35 @@
/**
* Copyright (c) Huawei Technologies Co., Ltd. 2022. All right reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef AICPU_KERNELS_NORMALIZED_HARD_SIGMOID_H
#define AICPU_KERNELS_NORMALIZED_HARD_SIGMOID_H
#include "cpu_ops_kernel.h"
#include "utils/bcast.h"
namespace aicpu {
class HardSigmoidCpuKernel : public CpuKernel {
public:
HardSigmoidCpuKernel() = default;
~HardSigmoidCpuKernel() override = default;
uint32_t Compute(CpuKernelContext &ctx) override;
private:
template <typename T>
uint32_t HardSigmoidCompute(const CpuKernelContext &ctx);
};
} // namespace aicpu
#endif

View File

@ -0,0 +1,95 @@
/**
* Copyright (c) Huawei Technologies Co., Ltd. 2022. All right reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "hard_sigmoid_grad.h"
#include "cpu_kernel_utils.h"
#include "utils/eigen_tensor.h"
#include "utils/kernel_util.h"
namespace {
const uint32_t kOutputNum = 1;
const uint32_t kInputNum = 2;
const char *const kHardSigmoidGrad = "HardSigmoidGrad";
const int64_t kParallelDataNums = 16 * 1024;
#define HARD_SIGMOID_GRAD_COMPUTE_CASE(DTYPE1, TYPE1, TYPE2, CTX) \
case (DTYPE1): { \
uint32_t result = HardSigmoidGradCompute<TYPE1, TYPE2>(CTX); \
if (result != KERNEL_STATUS_OK) { \
KERNEL_LOG_ERROR("HardSigmoidGrad kernel compute failed."); \
return result; \
} \
break; \
}
} // namespace
namespace aicpu {
uint32_t HardSigmoidGradCpuKernel::Compute(CpuKernelContext &ctx) {
KERNEL_HANDLE_ERROR(NormalCheck(ctx, kInputNum, kOutputNum), "[%s] check input and output failed.", kHardSigmoidGrad);
DataType grads_type = ctx.Input(0)->GetDataType();
DataType x_type = ctx.Input(1)->GetDataType();
if (grads_type != x_type) {
KERNEL_LOG_ERROR("HardSigmoidGrad kernel input[0] data type [%s] must be the same as input[1] data type [%s].",
DTypeStr(grads_type).c_str(), DTypeStr(x_type).c_str());
return KERNEL_STATUS_PARAM_INVALID;
}
switch (grads_type) {
HARD_SIGMOID_GRAD_COMPUTE_CASE(DT_FLOAT16, Eigen::half, Eigen::half, ctx)
HARD_SIGMOID_GRAD_COMPUTE_CASE(DT_FLOAT, float, float, ctx)
HARD_SIGMOID_GRAD_COMPUTE_CASE(DT_DOUBLE, double, double, ctx)
default:
KERNEL_LOG_ERROR("HardSigmoidGrad kernel inputs data type [%s] not support.", DTypeStr(grads_type).c_str());
return KERNEL_STATUS_PARAM_INVALID;
}
return KERNEL_STATUS_OK;
}
template <typename T1, typename T2>
uint32_t HardSigmoidGradCpuKernel::HardSigmoidGradCompute(const CpuKernelContext &ctx) {
auto grads = reinterpret_cast<T1 *>(ctx.Input(0)->GetData());
auto input_x = reinterpret_cast<T2 *>(ctx.Input(1)->GetData());
auto y = reinterpret_cast<T2 *>(ctx.Output(0)->GetData());
int64_t data_num = ctx.Input(1)->NumElements();
int64_t data_size = data_num * static_cast<int64_t>(sizeof(T2));
const T2 zero = static_cast<T2>(0);
const T2 three = static_cast<T2>(3);
const T2 neg_three = static_cast<T2>(-3);
const T2 one_sixth = static_cast<T2>(1.0f / 6.0f);
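// HardSigmoid'(x) = 1/6 for x in (-3, 3) and 0 elsewhere, so each incoming gradient is either
// scaled by one_sixth or dropped.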
if (data_size <= kParallelDataNums) {
for (int64_t i = 0; i < data_num; i++) {
*(y + i) =
(*(input_x + i) > neg_three && *(input_x + i) < three) ? static_cast<T2>(*(grads + i)) * one_sixth : zero;
}
} else {
uint32_t min_core_num = 1;
int64_t max_core_num = std::max(min_core_num, aicpu::CpuKernelUtils::GetCPUNum(ctx) - kResvCpuNum);
if (max_core_num > data_num) {
max_core_num = data_num;
}
int64_t perUnitSize = max_core_num > 0 ? data_num / max_core_num : data_num;
auto shard_hard_sigmoid_grad = [&](size_t start, size_t end) {
for (size_t i = start; i < end; i++) {
*(y + i) =
(*(input_x + i) > neg_three && *(input_x + i) < three) ? static_cast<T2>(*(grads + i)) * one_sixth : zero;
}
};
KERNEL_HANDLE_ERROR(CpuKernelUtils::ParallelFor(ctx, data_num, perUnitSize, shard_hard_sigmoid_grad),
"HardSigmoidGrad Compute failed.");
}
return KERNEL_STATUS_OK;
}
REGISTER_CPU_KERNEL(kHardSigmoidGrad, HardSigmoidGradCpuKernel);
} // namespace aicpu

View File

@ -0,0 +1,35 @@
/**
* Copyright (c) Huawei Technologies Co., Ltd. 2022. All right reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef AICPU_KERNELS_NORMALIZED_HARD_SIGMOID_GRAD_H
#define AICPU_KERNELS_NORMALIZED_HARD_SIGMOID_GRAD_H
#include "cpu_ops_kernel.h"
#include "utils/bcast.h"
namespace aicpu {
class HardSigmoidGradCpuKernel : public CpuKernel {
public:
HardSigmoidGradCpuKernel() = default;
~HardSigmoidGradCpuKernel() override = default;
uint32_t Compute(CpuKernelContext &ctx) override;
private:
template <typename T1, typename T2>
uint32_t HardSigmoidGradCompute(const CpuKernelContext &ctx);
};
} // namespace aicpu
#endif

View File

@ -0,0 +1,237 @@
/**
* Copyright (c) Huawei Technologies Co., Ltd. 2022.All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "heaviside.h"
#include "cpu_kernel_utils.h"
#include "utils/eigen_tensor.h"
#include "utils/kernel_util.h"
namespace {
const uint32_t kOutputNum = 1;
const uint32_t kInputNum = 2;
const char *kHeaviside = "Heaviside";
const int64_t kParallelDataNum = 2 * 1024;
const int64_t kParallelDataNumMid = 16 * 1024;
const int64_t kParallelDataNumSameShape = 7 * 1024;
const int64_t kParallelDataNumSameShapeMid = 35 * 1024;
#define HEAVISIDE_COMPUTE_CASE(DTYPE, TYPE, CTX) \
case (DTYPE): { \
uint32_t result = HeavisideCompute<TYPE>(CTX); \
if (result != KERNEL_STATUS_OK) { \
KERNEL_LOG_ERROR("Heaviside kernel compute failed."); \
return result; \
} \
break; \
}
} // namespace
namespace aicpu {
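// Heaviside step: 0 for a < 0, 1 for a > 0, and the matching element of the second input
// (the user-supplied values tensor) when a == 0.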
template <typename T>
T heaviside(T a, T b) {
return a == static_cast<T>(0) ? b : static_cast<T>(a > static_cast<T>(0));
}
uint32_t HeavisideCpuKernel::Compute(CpuKernelContext &ctx) {
KERNEL_HANDLE_ERROR(NormalCheck(ctx, kInputNum, kOutputNum), "Heaviside check input and output number failed.");
KERNEL_HANDLE_ERROR(HeavisideParamCheck(ctx), "Heaviside check params failed.");
auto data_type = ctx.Input(0)->GetDataType();
switch (data_type) {
HEAVISIDE_COMPUTE_CASE(DT_DOUBLE, double, ctx)
HEAVISIDE_COMPUTE_CASE(DT_FLOAT, float, ctx)
HEAVISIDE_COMPUTE_CASE(DT_FLOAT16, Eigen::half, ctx)
HEAVISIDE_COMPUTE_CASE(DT_INT16, int16_t, ctx)
HEAVISIDE_COMPUTE_CASE(DT_INT32, int32_t, ctx)
HEAVISIDE_COMPUTE_CASE(DT_INT64, int64_t, ctx)
HEAVISIDE_COMPUTE_CASE(DT_INT8, int8_t, ctx)
HEAVISIDE_COMPUTE_CASE(DT_UINT16, uint16_t, ctx)
HEAVISIDE_COMPUTE_CASE(DT_UINT32, uint32_t, ctx)
HEAVISIDE_COMPUTE_CASE(DT_UINT64, uint64_t, ctx)
HEAVISIDE_COMPUTE_CASE(DT_UINT8, uint8_t, ctx)
default:
KERNEL_LOG_ERROR("Heaviside kernel data type [%s] not support.", DTypeStr(data_type).c_str());
return KERNEL_STATUS_PARAM_INVALID;
}
return KERNEL_STATUS_OK;
}
uint32_t HeavisideCpuKernel::HeavisideParamCheck(CpuKernelContext &ctx) {
Tensor *input_0 = ctx.Input(0);
Tensor *input_1 = ctx.Input(1);
Tensor *output = ctx.Output(0);
DataType input0_type = input_0->GetDataType();
DataType input1_type = input_1->GetDataType();
KERNEL_CHECK_FALSE((input0_type == input1_type), KERNEL_STATUS_PARAM_INVALID,
"The data type of input0 [%s] need be same with "
"input1 [%s].",
DTypeStr(input0_type).c_str(), DTypeStr(input1_type).c_str())
KERNEL_LOG_DEBUG(
"HeavisideCpuKernel[%s], input0: size[%llu];"
"input1: size[%llu], output: size[%llu].",
ctx.GetOpType().c_str(), input_0->GetDataSize(), input_1->GetDataSize(), output->GetDataSize());
return KERNEL_STATUS_OK;
}
template <typename T>
uint32_t HeavisideCpuKernel::NoBcastCompute(CpuKernelContext &ctx) {
auto in0 = reinterpret_cast<T *>(ctx.Input(0)->GetData());
auto in1 = reinterpret_cast<T *>(ctx.Input(1)->GetData());
auto out = reinterpret_cast<T *>(ctx.Output(0)->GetData());
int64_t in0_elements_nums = ctx.Input(0)->NumElements();
int64_t in1_elements_nums = ctx.Input(1)->NumElements();
int64_t data_num = ctx.Output(0)->NumElements();
BcastShapeType type;
if (in0_elements_nums == in1_elements_nums) {
type = BcastShapeType::SAME_SHAPE;
} else {
type = (in0_elements_nums == 1 ? BcastShapeType::X_ONE_ELEMENT : BcastShapeType::Y_ONE_ELEMENT);
}
if (data_num >= kParallelDataNumSameShape) {
uint32_t min_core_num = 1;
uint32_t max_core_num = std::max(min_core_num, aicpu::CpuKernelUtils::GetCPUNum(ctx) - kResvCpuNum);
if (data_num <= kParallelDataNumSameShapeMid) {
max_core_num = std::min(max_core_num, 4U); // up to 4 cpu cores
}
if (max_core_num > data_num) {
max_core_num = data_num;
}
auto sharder_heaviside = [&](int64_t start, int64_t end) {
switch (type) {
case BcastShapeType::SAME_SHAPE:
for (int64_t i = start; i < end; ++i) {
*(out + i) = heaviside<T>(*(in0 + i), *(in1 + i));
}
break;
case BcastShapeType::X_ONE_ELEMENT:
for (int64_t i = start; i < end; ++i) {
*(out + i) = heaviside<T>(*in0, *(in1 + i));
}
break;
case BcastShapeType::Y_ONE_ELEMENT:
for (int64_t i = start; i < end; ++i) {
*(out + i) = heaviside<T>(*(in0 + i), *in1);
}
break;
default:
KERNEL_LOG_ERROR("Invalid type [%d]", static_cast<int32_t>(type));
break;
}
};
if (max_core_num == 0) {
KERNEL_LOG_ERROR("max_core_num could not be 0");
return KERNEL_STATUS_PARAM_INVALID;
}
KERNEL_HANDLE_ERROR(CpuKernelUtils::ParallelFor(ctx, data_num, data_num / max_core_num, sharder_heaviside),
"Heaviside Compute failed.");
} else {
switch (type) {
case BcastShapeType::SAME_SHAPE:
for (int64_t i = static_cast<int64_t>(0); i < data_num; ++i) {
*(out + i) = heaviside<T>(*(in0 + i), *(in1 + i));
}
break;
case BcastShapeType::X_ONE_ELEMENT:
for (int64_t i = static_cast<int64_t>(0); i < data_num; ++i) {
*(out + i) = heaviside<T>(*in0, *(in1 + i));
}
break;
case BcastShapeType::Y_ONE_ELEMENT:
for (int64_t i = static_cast<int64_t>(0); i < data_num; ++i) {
*(out + i) = heaviside<T>(*(in0 + i), *in1);
}
break;
default:
KERNEL_LOG_WARN("Invalid type [%d]", static_cast<int32_t>(type));
break;
}
}
return KERNEL_STATUS_OK;
}
template <typename T>
uint32_t HeavisideCpuKernel::BcastCompute(CpuKernelContext &ctx, Bcast &bcast) {
T *in0 = reinterpret_cast<T *>(ctx.Input(0)->GetData());
T *in1 = reinterpret_cast<T *>(ctx.Input(1)->GetData());
T *out = reinterpret_cast<T *>(ctx.Output(0)->GetData());
int64_t data_num = ctx.Output(0)->NumElements();
if (data_num >= kParallelDataNum) {
uint32_t min_core_num = 1;
uint32_t max_core_num = std::max(min_core_num, aicpu::CpuKernelUtils::GetCPUNum(ctx) - kResvCpuNum);
if (data_num <= kParallelDataNumMid) {
max_core_num = std::min(max_core_num, 4U); // up to 4 cpu cores
}
if (max_core_num > data_num) {
max_core_num = data_num;
}
auto sharder_heaviside = [&](int64_t start, int64_t end) {
for (int64_t i = start; i < end; ++i) {
*(out + i) = heaviside<T>(*(in0 + bcast.GetBroadcastXIndex(i)), *(in1 + bcast.GetBroadcastYIndex(i)));
}
};
if (max_core_num == 0) {
KERNEL_LOG_ERROR("max_core_num could not be 0");
return KERNEL_STATUS_PARAM_INVALID;
}
KERNEL_HANDLE_ERROR(CpuKernelUtils::ParallelFor(ctx, data_num, data_num / max_core_num, sharder_heaviside),
"Heaviside Compute failed.");
} else {
for (int64_t i = 0; i < data_num; ++i) {
*(out + i) = heaviside<T>(*(in0 + bcast.GetBroadcastXIndex(i)), *(in1 + bcast.GetBroadcastYIndex(i)));
}
}
return KERNEL_STATUS_OK;
}
template <typename T>
uint32_t HeavisideCpuKernel::HeavisideCompute(CpuKernelContext &ctx) {
Tensor *input0_tensor = ctx.Input(0);
auto input0_shape = input0_tensor->GetTensorShape()->GetDimSizes();
int64_t input0_elements_nums = input0_tensor->NumElements();
Tensor *input1_tensor = ctx.Input(1);
auto input1_shape = input1_tensor->GetTensorShape()->GetDimSizes();
int64_t input1_elements_nums = input1_tensor->NumElements();
bool no_bcast = (input0_shape == input1_shape) || (input0_elements_nums == 1) || (input1_elements_nums == 1);
if (no_bcast) {
return NoBcastCompute<T>(ctx);
} else {
Bcast bcast(input0_shape, input1_shape);
if (!bcast.IsValid()) {
KERNEL_LOG_ERROR("[%s] broadcast failed.", ctx.GetOpType().c_str());
return KERNEL_STATUS_PARAM_INVALID;
}
return BcastCompute<T>(ctx, bcast);
}
return KERNEL_STATUS_OK;
}
REGISTER_CPU_KERNEL(kHeaviside, HeavisideCpuKernel);
} // namespace aicpu

View File

@ -0,0 +1,43 @@
/**
* Copyright (c) Huawei Technologies Co., Ltd. 2022.All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef AICPU_KERNELS_NORMALIZED_HEAVISIDE_H_
#define AICPU_KERNELS_NORMALIZED_HEAVISIDE_H_
#include "cpu_ops_kernel.h"
#include "utils/bcast.h"
namespace aicpu {
class HeavisideCpuKernel : public CpuKernel {
public:
HeavisideCpuKernel() = default;
~HeavisideCpuKernel() override = default;
protected:
uint32_t Compute(CpuKernelContext &ctx) override;
private:
uint32_t HeavisideParamCheck(CpuKernelContext &ctx);
template <typename T>
uint32_t NoBcastCompute(CpuKernelContext &ctx);
template <typename T>
uint32_t BcastCompute(CpuKernelContext &ctx, Bcast &bcast);
template <typename T>
uint32_t HeavisideCompute(CpuKernelContext &ctx);
};
} // namespace aicpu
#endif

View File

@ -1,103 +0,0 @@
/**
* Copyright (c) Huawei Technologies Co., Ltd. 2021-2021. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "is_inf.h"
#include "Eigen/Dense"
#include "unsupported/Eigen/CXX11/Tensor"
#include "cpu_kernel_utils.h"
#include "utils/kernel_util.h"
#include "cpu_types.h"
#include "kernel_log.h"
#include "status.h"
namespace {
const char *const kIsInf = "IsInf";
const uint32_t kOutputNum = 1;
const uint32_t kInputNum = 1;
constexpr int64_t kParallelDataNumsFloat16 = 128 * 1024;
constexpr int64_t kParallelDataNumsFloat = 128 * 1024;
constexpr int64_t kParallelDataNumsDouble = 300 * 1024;
#define ISINF_COMPUTE_CASE(DTYPE, TYPE, CTX) \
case (DTYPE): { \
uint32_t result = IsInfCompute<TYPE>(CTX); \
if (result != static_cast<uint32_t>(KERNEL_STATUS_OK)) { \
KERNEL_LOG_ERROR("IsInf kernel compute failed."); \
return result; \
} \
break; \
}
} // namespace
namespace aicpu {
uint32_t IsInfCpuKernel::Compute(CpuKernelContext &ctx) {
// check params
KERNEL_HANDLE_ERROR(NormalCheck(ctx, kInputNum, kOutputNum), "[%s] check input and output failed.", kIsInf);
KERNEL_HANDLE_ERROR(IsInfCheck(ctx), "[%s] check params failed.", kIsInf);
auto data_type = ctx.Input(0)->GetDataType();
switch (data_type) {
ISINF_COMPUTE_CASE(DT_FLOAT16, Eigen::half, ctx)
ISINF_COMPUTE_CASE(DT_FLOAT, float, ctx)
ISINF_COMPUTE_CASE(DT_DOUBLE, double, ctx)
default:
KERNEL_LOG_ERROR("IsInf kernel data type [%s] not supports.", DTypeStr(data_type).c_str());
return KERNEL_STATUS_PARAM_INVALID;
}
return static_cast<uint32_t>(KERNEL_STATUS_OK);
}
uint32_t IsInfCpuKernel::IsInfCheck(const CpuKernelContext &ctx) const {
KERNEL_CHECK_NULLPTR(ctx.Input(0)->GetData(), KERNEL_STATUS_PARAM_INVALID, "Get input data failed.")
KERNEL_CHECK_NULLPTR(ctx.Output(0)->GetData(), KERNEL_STATUS_PARAM_INVALID, "Get output data failed.")
return KERNEL_STATUS_OK;
}
template <typename T>
uint32_t IsInfCpuKernel::IsInfCompute(const CpuKernelContext &ctx) {
auto input = reinterpret_cast<T *>(ctx.Input(0)->GetData());
auto output = reinterpret_cast<bool *>(ctx.Output(0)->GetData());
auto data_type = ctx.Input(0)->GetDataType();
int64_t data_num = ctx.Output(0)->NumElements();
int64_t data_size = data_num * static_cast<int64_t>(sizeof(T));
if ((data_type == DT_FLOAT16 && data_size <= kParallelDataNumsFloat16) ||
(data_type == DT_FLOAT && data_size <= kParallelDataNumsFloat) ||
(data_type == DT_DOUBLE && data_size <= kParallelDataNumsDouble)) {
for (int64_t index = 0; index < data_num; index++) {
*(output + index) = Eigen::numext::isinf(*(input + index));
}
} else {
uint32_t min_core_num = 1;
int64_t max_core_num = std::max(min_core_num, aicpu::CpuKernelUtils::GetCPUNum(ctx) - kResvCpuNum);
if (max_core_num > data_num) {
max_core_num = data_num;
}
auto shard_isinf = [&](size_t start, size_t end) {
for (size_t index = start; index < end; index++) {
*(output + index) = Eigen::numext::isinf(*(input + index));
}
};
KERNEL_HANDLE_ERROR(CpuKernelUtils::ParallelFor(ctx, data_num, data_num / max_core_num, shard_isinf),
"IsInf Compute failed.");
}
return static_cast<uint32_t>(KERNEL_STATUS_OK);
}
REGISTER_CPU_KERNEL(kIsInf, IsInfCpuKernel);
} // namespace aicpu

View File

@ -88,7 +88,6 @@ const AnfNodePtr AICpuLibSelectPass::Process(const FuncGraphPtr &graph, const An
mindspore::kScatterNdOpName,
mindspore::kScatterNdUpdateOpName,
mindspore::kTensorScatterUpdateOpName,
mindspore::kIsInfOpName,
mindspore::kIsNanOpName,
mindspore::kMatrixDeterminantOpName,
mindspore::kMatrixLogarithmOpName,
@ -145,7 +144,44 @@ const AnfNodePtr AICpuLibSelectPass::Process(const FuncGraphPtr &graph, const An
mindspore::kMulOpName,
mindspore::kConjOpName,
mindspore::kZerosLikeOpName,
mindspore::kMatrixBandPartOpName};
mindspore::kMatrixBandPartOpName,
mindspore::kDenseToCSRSparseMatrixOpName,
mindspore::kDenseToSparseSetOperation,
mindspore::kDiagOpName,
mindspore::kDiagonalOpName,
mindspore::kDiagPartOpName,
mindspore::kEigOpName,
mindspore::kEyeOpName,
mindspore::kMaximumOpName,
mindspore::kMinimumOpName,
mindspore::kFractionalAvgPoolOpName,
mindspore::kFractionalAvgPoolGradOpName,
mindspore::kFractionalMaxPoolOpName,
mindspore::kFractionalMaxPoolGradOpName,
mindspore::kFractionalMaxPoolGradWithFixedKsizeOpName,
mindspore::kGatherNdOpName,
mindspore::kGcdOpName,
mindspore::kGeqrfOpName,
mindspore::kHardSigmoidOpName,
mindspore::kHardSigmoidGradOpName,
mindspore::kHeavisideOpName,
mindspore::kHypotOpName,
mindspore::kIdentityNOpName,
mindspore::kIndexFillOpName,
mindspore::kKLDivOpName,
mindspore::kKlDivLossGradOpName,
mindspore::kLcmOpName,
mindspore::kLessEqualOpName,
mindspore::kLogicalXorOpName,
mindspore::kLogitOpName,
mindspore::kLogitGradOpName,
mindspore::kLogNormalReverseOpName,
mindspore::kLowerBoundOpName,
mindspore::kLstsqOpName,
mindspore::kLuUnpackOpName,
mindspore::kLuUnpackGradOpName,
mindspore::kMatMulOpName,
mindspore::kMatrixExpOpName};
static const std::string kEnvOpSoNames = "mindspore_aicpu_kernels";
static const std::string kCpuKernelSoName = "mindspore_cpu_kernels";

View File

@ -238,7 +238,42 @@ from .smooth_l1_loss import _smooth_l1_loss_aicpu
from .cumulative_logsumexp import _cumulative_logsumexp_aicpu
from .nuclear_norm import _nuclear_norm_aicpu
from .sparse_segment_sqrt_n import _sparse_segment_sqrt_n_aicpu
from .unsorted_segment_prod import _unsorted_segment_prod_aicpu
from .scale_and_translate import _scale_and_translate_aicpu
from .quant_dtype_cast import _quant_dtype_cast_aicpu
from .fse_decode import _fse_decode_aicpu
from .unsorted_segment_prod import _unsorted_segment_prod_aicpu
from .dense_to_csr_sparse_matrix import _dense_to_csr_sparse_matrix_aicpu
from .dense_to_sparse_set_operation import _dense_to_sparse_set_operation_aicpu
from .diag import _diag_aicpu
from .diagonal import _diagonal_aicpu
from .diag_part import _diag_part_aicpu
from .eig import _eig_aicpu
from .eye import _eye_aicpu
from .fmax import _fmax_aicpu
from .fmin import _fmin_aicpu
from .fractional_avg_pool import _fractional_avg_pool_aicpu
from .fractional_avg_pool_grad import _fractional_avg_pool_grad_aicpu
from .fractional_max_pool import _fractional_max_pool_aicpu
from .fractional_max_pool_grad import _fractional_max_pool_grad_aicpu
from .fractional_max_pool_grad_with_fixed_ksize import _fractional_max_pool_grad_with_fixed_ksize_aicpu
from .gcd import _gcd_aicpu
from .geqrf import _geqrf_aicpu
from .hard_sigmoid import _hard_sigmoid_aicpu
from .hard_sigmoid_grad import _hard_sigmoid_grad_aicpu
from .heaviside import _heaviside_aicpu
from .hypot import _hypot_aicpu
from .identity_n import _identity_n_aicpu
from .index_fill import _index_fill_aicpu
from .kldivloss import _kldiv_loss_aicpu
from .kldivlossgrad import _kldiv_loss_grad_aicpu
from .lcm import _lcm_aicpu
from .less_equal import _less_equal_aicpu
from .logical_xor import _logical_xor_aicpu
from .logit import _logit_aicpu
from .logit_grad import _logit_grad_aicpu
from .log_normal_reverse import _log_normal_reverse_aicpu
from .lower_bound import _lower_bound_aicpu
from .lstsq import _lstsq_aicpu
from .lu_unpack import _lu_unpack_aicpu
from .lu_unpack_grad import _lu_unpack_grad_aicpu
from .matrix_exp import _matrix_exp_aicpu