forked from mindspore-Ecosystem/mindspore
0103 aicpu migration first half
This commit is contained in:
parent
a023825aae
commit
540665dbbc
@@ -100,7 +100,8 @@
"mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/" "shadowVariable"
"mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/" "unsignedPositive"
"mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/" "zerodivcond"
"mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/" "redundantInitialization"
"mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/" "noConstructor"
"mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/" "noExplicitConstructor"
"mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/" "identicalConditionAfterEarlyExit"
"mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/" "uninitMemberVar"
"mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/" "redundantInitialization"
@@ -292,30 +292,15 @@ mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel
|
|||
mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/scatter_nd_update.cc:aicpu::ScatterNdUpdateCpuKernel::Compute
|
||||
mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/ragged_tensor_to_sparse.cc:aicpu::RaggedTensorToSparseCpuKernel::Compute
|
||||
mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/max_unpool_3d_grad.cc:aicpu::MaxUnpool3DGradCpuKernel::MaxUnpool3DGradCompute
|
||||
mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/reduce_mean.cc:aicpu::ReduceMeanCpuKernel::ReduceMeanCompute
|
||||
mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/reduce_mean.cc:aicpu::ReduceMeanCpuKernel::ReduceMeanCompute_Complex
|
||||
mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/ragged_tensor_to_tensor.cc:aicpu::RaggedTensorToTensorCpuKernel::Compute
|
||||
mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/segment_mean.cc:aicpu::SegmentMeanCpuKernel::SegmentMeanCompute
|
||||
mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/segment_mean.cc:aicpu::SegmentMeanCpuKernel::SegmentMeanCompute_Complex
|
||||
mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/sample_distorted_bounding_box_ext2.cc:aicpu::SDBBExt2CpuKernel::GenerateRandomCrop
|
||||
mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/sample_distorted_bounding_box_ext2.cc:aicpu::SDBBExt2CpuKernel::SDBBExt2Compute
|
||||
mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/max_unpool_3d.cc:aicpu::MaxUnpool3DCpuKernel::MaxUnpool3DCompute
|
||||
mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/segment_prod.cc:aicpu::SegmentProdCpuKernel::SegmentProdCompute
|
||||
mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/segment_prod.cc:aicpu::SegmentProdCpuKernel::SegmentProdCompute_Complex
|
||||
mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/maxpool_grad.cc:aicpu::SpatialMaxPoolWithArgMaxHelper
|
||||
mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/reduce_prod.cc:aicpu::ReduceProdCpuKernel::ReduceProdCompute
|
||||
mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/reduce_prod.cc:aicpu::ReduceProdCpuKernel::ReduceProdCompute_Complex
|
||||
mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/scatter_nd.cc:aicpu::ScatterNdCpuKernel::Compute
|
||||
mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/parameterized_truncated_normal.cc:aicpu::Generate
|
||||
mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/multi_margin_loss.cc:aicpu::MultiMarginLossCpuKernel::MultiMarginLossCompute
|
||||
mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/multi_margin_loss.cc:aicpu::MultiMarginLossCpuKernel::MultiMarginLossComputeFP
|
||||
mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/max_unpool_3d_grad.cc:aicpu::MaxUnpool3DGradCpuKernel::MaxUnpool3DGradCompute
|
||||
mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/maxpool.cc:aicpu::SpacialMaxPool
|
||||
mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/reduce_mean.cc:aicpu::ReduceMeanCpuKernel::ReduceMeanCompute
|
||||
mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/reduce_mean.cc:aicpu::ReduceMeanCpuKernel::ReduceMeanCompute_Complex
|
||||
mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/segment_mean.cc:aicpu::SegmentMeanCpuKernel::SegmentMeanCompute
|
||||
mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/segment_mean.cc:aicpu::SegmentMeanCpuKernel::SegmentMeanCompute_Complex
|
||||
mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/sample_distorted_bounding_box_ext2.cc:aicpu::SDBBExt2CpuKernel::SDBBExt2Compute
|
||||
mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/max_unpool_3d.cc:aicpu::MaxUnpool3DCpuKernel::MaxUnpool3DCompute
|
||||
mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/slice.cc:aicpu::SliceCpuKernel::SliceCompute
|
||||
mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/segment_prod.cc:aicpu::SegmentProdCpuKernel::SegmentProdCompute
|
||||
|
@@ -323,11 +308,26 @@ mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel
|
|||
mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/maxpool_grad.cc:aicpu::SpatialMaxPoolWithArgMaxHelper
|
||||
mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/reduce_prod.cc:aicpu::ReduceProdCpuKernel::ReduceProdCompute
|
||||
mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/reduce_prod.cc:aicpu::ReduceProdCpuKernel::ReduceProdCompute_Complex
|
||||
mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/multi_margin_loss_grad.cc:aicpu::MultiMarginLossGradCpuKernel::MultiMarginLossGradC
|
||||
mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/parameterized_truncated_normal.cc:aicpu::Generate
|
||||
mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/multi_margin_loss.cc:aicpu::MultiMarginLossCpuKernel::MultiMarginLossCompute
|
||||
mindspore/mindspore/ccsrc/plugin/device/ascend/optimizer/mindir/aicpu_lib_select.cc:mindspore::opt::AICpuLibSelectPass::Process
|
||||
mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/multi_margin_loss_grad.cc:aicpu::MultiMarginLossGradCpuKernel::MultiMarginLossGradComputeFP16
|
||||
mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/multi_margin_loss_grad.cc:aicpu::MultiMarginLossGradCpuKernel::MultiMarginLossGradCompute
|
||||
mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/multi_margin_loss.cc:aicpu::MultiMarginLossCpuKernel::MultiMarginLossComputeFP16
|
||||
mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/matrix_band_part.cc:aicpu::MatrixBandPartCpuKernel::BandCompute
|
||||
mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/minimum.cc:aicpu::MinimumCpuKernel::SpecialComputeSameShape
|
||||
mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/minimum.cc:aicpu::MinimumCpuKernel::SpecialComputeXOneElement
|
||||
mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/minimum.cc:aicpu::MinimumCpuKernel::SpecialComputeYOneElement
|
||||
mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/minimum.cc:aicpu::MinimumCpuKernel::BcastComputeMultiKernel
|
||||
mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/minimum.cc:aicpu::MinimumCpuKernel::BcastComputeOneKernel
|
||||
mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/maximum.cc:aicpu::MaximumCpuKernel::SpecialComputeSameShape
|
||||
mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/maximum.cc:aicpu::MaximumCpuKernel::SpecialComputeXOneElement
|
||||
mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/maximum.cc:aicpu::MaximumCpuKernel::SpecialComputeYOneElement
|
||||
mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/maximum.cc:aicpu::MaximumCpuKernel::BcastComputeMultiKernel
|
||||
mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/maximum.cc:aicpu::MaximumCpuKernel::BcastComputeOneKernel
|
||||
mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/lu_unpack.cc:aicpu::LuUnpackCpuKernel::Compute
|
||||
mindspore/mindspore/ccsrc/plugin/device/ascend/optimizer/mindir/aicpu_lib_select.cc:mindspore::opt::AICpuLibSelectPass::Process
|
||||
mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/fractional_max_pool_grad.cc:aicpu::FractionalMaxPoolGradCpuKernel::DoCompute
|
||||
mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/fractional_avg_pool_grad.cc:aicpu::FractionalAvgPoolGradCpuKernel::DoCompute
|
||||
mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/fractional_max_pool.cc:aicpu::FractionalMaxPoolCpuKernel::DoCompute
|
||||
mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/fractional_avg_pool.cc:aicpu::FractionalAvgPoolCpuKernel::DoCompute
|
||||
mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/densetosparsesetoperation.cc:aicpu::DenseToSparseSetOperationCpuKernel::ComputeDenseToSparse
|
||||
|
|
|
@@ -226,6 +226,7 @@ constexpr auto kCumulativeLogsumexpOpName = "CumulativeLogsumexp";
|
|||
constexpr auto kCumulativeLogsumexpDOpName = "CumulativeLogsumexpD";
|
||||
constexpr auto kDataFormatVecPermuteOpName = "DataFormatVecPermute";
|
||||
constexpr auto kDeadNodeName = "DeadNode";
|
||||
constexpr auto kDenseToCSRSparseMatrixOpName = "DenseToCSRSparseMatrix";
|
||||
constexpr auto kDenseToDenseSetOperation = "DenseToDenseSetOperation";
|
||||
constexpr auto kDenseToSparseSetOperation = "DenseToSparseSetOperation";
|
||||
constexpr auto kDepthwiseConv2dNativeBackpropFilterOpName = "DepthwiseConv2dNativeBackpropFilter";
|
||||
|
@@ -242,7 +243,9 @@ constexpr auto kDiagPartOpName = "DiagPart";
|
|||
constexpr auto kDiagPartDOpName = "DiagPartD";
|
||||
constexpr auto kDiagOpName = "Diag";
|
||||
constexpr auto kDiagDOpName = "DiagD";
|
||||
constexpr auto kDiagonalOpName = "Diagonal";
|
||||
constexpr auto kDivOpName = "Div";
|
||||
constexpr auto kDivNoNanOpName = "DivNoNan";
|
||||
constexpr auto kDropoutDoMaskOpName = "DropoutDoMask";
|
||||
constexpr auto kDropOutDoMaskOpName = "DropOutDoMask";
|
||||
constexpr auto kDropoutDoMaskV3OpName = "DropoutDoMaskV3";
|
||||
|
@@ -257,6 +260,7 @@ constexpr auto kDynamicAtomicAddrCleanOpName = "DynamicAtomicAddrClean";
|
|||
constexpr auto kDynamicGRUV2OpName = "DynamicGRUV2";
|
||||
constexpr auto kDynamicRNNOpName = "DynamicRNN";
|
||||
constexpr auto kDynamicStitchOpName = "DynamicStitch";
|
||||
constexpr auto kEigOpName = "Eig";
|
||||
constexpr auto kEmbeddingLookupCommGradOpName = "EmbeddingLookupCommGrad";
|
||||
constexpr auto kEmbeddingLookupOpName = "EmbeddingLookup";
|
||||
constexpr auto kEmbeddingLookupProxyOpName = "EmbeddingLookupProxy";
|
||||
|
@@ -293,7 +297,12 @@ constexpr auto kFive2FourOpName = "Five2Four";
|
|||
constexpr auto kFlattenGradOpName = "FlattenGrad";
|
||||
constexpr auto kFloorDivOpName = "FloorDiv";
|
||||
constexpr auto kFour2FiveOpName = "Four2Five";
|
||||
constexpr auto kFractionalAvgPoolOpName = "FractionalAvgPool";
|
||||
constexpr auto kFractionalAvgPoolGradOpName = "FractionalAvgPoolGrad";
|
||||
constexpr auto kFractionalMaxPoolOpName = "FractionalMaxPool";
|
||||
constexpr auto kFractionalMaxPoolGradOpName = "FractionalMaxPoolGrad";
|
||||
constexpr auto kFractionalMaxPoolGradWithFixedKsizeOpName = "FractionalMaxPoolGradWithFixedKsize";
|
||||
constexpr auto kFractionalMaxPoolWithFixedKsizeOpName = "FractionalMaxPoolWithFixedKsize";
|
||||
constexpr auto kFusedAdaFactorName = "FusedAdaFactor";
|
||||
constexpr auto kFusedAdaFactorWithGlobalNormName = "FusedAdaFactorWithGlobalNorm";
|
||||
constexpr auto kFusedAdamName = "FusedAdam";
|
||||
|
@@ -327,10 +336,12 @@ constexpr auto kGatherOpName = "Gather";
|
|||
constexpr auto kGatherNdOpName = "GatherNd";
|
||||
constexpr auto kGatherV2OpName = "GatherV2";
|
||||
constexpr auto kGatherV2DOpName = "GatherV2D";
|
||||
constexpr auto kGcdOpName = "Gcd";
|
||||
constexpr auto kGeLUOpName = "GeLU";
|
||||
constexpr auto kGeluOpName = "Gelu";
|
||||
constexpr auto kGeLUGradOpName = "GeLUGrad";
|
||||
constexpr auto kGeluGradOpName = "GeluGrad";
|
||||
constexpr auto kGeqrfOpName = "Geqrf";
|
||||
constexpr auto kGetNextOpName = "GetNext";
|
||||
constexpr auto kGreaterEqualOpName = "GreaterEqual";
|
||||
constexpr auto kGreaterOpName = "Greater";
|
||||
|
@@ -346,13 +357,21 @@ constexpr auto kHSigmoidOpName = "HSigmoid";
|
|||
constexpr auto kHardSigmoidOpName = "HardSigmoid";
|
||||
constexpr auto kHSigmoidGradOpName = "HSigmoidGrad";
|
||||
constexpr auto kHardSigmoidGradOpName = "HardSigmoidGrad";
|
||||
constexpr auto kHSVToRGBOpName = "HSVToRGB";
|
||||
constexpr auto kHSwishOpName = "HSwish";
|
||||
constexpr auto kHardSwishOpName = "HardSwish";
|
||||
constexpr auto kHistogramDOpName = "HistogramD";
|
||||
constexpr auto kHSwishGradOpName = "HSwishGrad";
|
||||
constexpr auto kHardSwishGradOpName = "HardSwishGrad";
|
||||
constexpr auto kHeavisideOpName = "Heaviside";
|
||||
constexpr auto kHostAllGatherOpName = "HostAllGather";
|
||||
constexpr auto kHostReduceScatterOpName = "HostReduceScatter";
|
||||
constexpr auto kHypotOpName = "Hypot";
|
||||
constexpr auto kIdentityNOpName = "IdentityN";
|
||||
constexpr auto kIgammaOpName = "Igamma";
|
||||
constexpr auto kIgammacOpName = "Igammac";
|
||||
constexpr auto kIgammaGradAOpName = "IgammaGradA";
|
||||
constexpr auto kIndexFillOpName = "IndexFill";
|
||||
constexpr auto kInitDatasetQueueOpName = "InitDataSetQueue";
|
||||
constexpr auto kIOUOpName = "IOU";
|
||||
constexpr auto kIouOpName = "Iou";
|
||||
|
@@ -369,7 +388,6 @@ constexpr auto kInstanceNormV2OpName = "InstanceNormV2";
|
|||
constexpr auto kInstanceNormV2GradOpName = "InstanceNormV2Grad";
|
||||
constexpr auto kInTopKOpName = "InTopK";
|
||||
constexpr auto kInTopKDOpName = "InTopKD";
|
||||
constexpr auto kIsInfOpName = "IsInf";
|
||||
constexpr auto kIsNanOpName = "IsNan";
|
||||
constexpr auto kKLDivLossOpName = "KLDivLoss";
|
||||
constexpr auto kKLDivOpName = "KLDiv";
|
||||
|
@@ -395,6 +413,7 @@ constexpr auto kLayerNormBetaGammaBackpropV2OpName = "LayerNormBetaGammaBackprop
|
|||
constexpr auto kLayerNormGradOpName = "LayerNormGrad";
|
||||
constexpr auto kLayerNormXBackpropOpName = "LayerNormXBackprop";
|
||||
constexpr auto kLayerNormXBackpropV2OpName = "LayerNormXBackpropV2";
|
||||
constexpr auto kLcmOpName = "Lcm";
|
||||
constexpr auto kLessEqualOpName = "LessEqual";
|
||||
constexpr auto kLessOpName = "Less";
|
||||
constexpr auto kLinSpaceOpName = "LinSpace";
|
||||
|
@@ -403,14 +422,21 @@ constexpr auto kListDiffOpName = "ListDiff";
|
|||
constexpr auto kLogMatrixDeterminantOpName = "LogMatrixDeterminant";
|
||||
constexpr auto kLogOpName = "Log";
|
||||
constexpr auto kLog1pOpName = "Log1p";
|
||||
constexpr auto kLogicalXorOpName = "LogicalXor";
|
||||
constexpr auto kLogitOpName = "Logit";
|
||||
constexpr auto kLogitGradOpName = "LogitGrad";
|
||||
constexpr auto kLogNormalReverseOpName = "LogNormalReverse";
|
||||
constexpr auto kLogSoftmaxOpName = "LogSoftmax";
|
||||
constexpr auto kLogSoftmaxV2OpName = "LogSoftmaxV2";
|
||||
constexpr auto kLogSoftmaxGradOpName = "LogSoftmaxGrad";
|
||||
constexpr auto kLowerBoundOpName = "LowerBound";
|
||||
constexpr auto kLpNormOpName = "LpNorm";
|
||||
constexpr auto kLSTMGradOpName = "LSTMGrad";
|
||||
constexpr auto kLSTMInputGradOpName = "LSTMInputGrad";
|
||||
constexpr auto kLSTMOpName = "LSTM";
|
||||
constexpr auto kLstsqOpName = "Lstsq";
|
||||
constexpr auto kLuUnpackOpName = "LuUnpack";
|
||||
constexpr auto kLuUnpackGradOpName = "LuUnpackGrad";
|
||||
constexpr auto kMaskedFillOpName = "MaskedFill";
|
||||
constexpr auto kMaskedSelectOpName = "MaskedSelect";
|
||||
constexpr auto kMaskedSelectGradOpName = "MaskedSelectGrad";
|
||||
|
@@ -423,6 +449,7 @@ constexpr auto kMatrixDiagDOpName = "MatrixDiagD";
|
|||
constexpr auto kMatrixDiagPartOpName = "MatrixDiagPart";
|
||||
constexpr auto kMatrixDiagPartDOpName = "MatrixDiagPartD";
|
||||
constexpr auto kMatrixDiagPartV3OpName = "MatrixDiagPartV3";
|
||||
constexpr auto kMatrixExpOpName = "MatrixExp";
|
||||
constexpr auto kMatrixLogarithmOpName = "MatrixLogarithm";
|
||||
constexpr auto kMatrixSetDiagOpName = "MatrixSetDiag";
|
||||
constexpr auto kMatrixSetDiagDOpName = "MatrixSetDiagD";
|
||||
|
|
|
@@ -0,0 +1,150 @@
|
|||
/**
|
||||
* Copyright (c) Huawei Technologies Co., Ltd. 2022-2022. All rights reserved.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
#include "dense_to_csr_sparse_matrix.h"
|
||||
#include <complex>
|
||||
#include <numeric>
|
||||
#include "cpu_kernel_utils.h"
|
||||
#include "utils/kernel_util.h"
|
||||
|
||||
namespace {
|
||||
const uint32_t kInputNum = 2;
|
||||
const uint32_t kOutputNum = 5;
|
||||
const char *DenseToCSRSparseMatrix = "DenseToCSRSparseMatrix";
|
||||
} // namespace
|
||||
|
||||
namespace aicpu {
|
||||
uint32_t DenseToCSRSparseMatrixCpuKernel::Compute(CpuKernelContext &ctx) {
|
||||
KERNEL_HANDLE_ERROR(NormalCheck(ctx, kInputNum, kOutputNum), "DenseToCSRSparseMatrix normal check failed.");
|
||||
DataType value_type = ctx.Input(0)->GetDataType();
|
||||
DataType indice_type = ctx.Input(1)->GetDataType();
|
||||
uint32_t status;
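// Dispatch on the indices dtype (int32 / int64) first, then on the values dtype.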
|
||||
switch (indice_type) {
|
||||
case DT_INT32:
|
||||
switch (value_type) {
|
||||
case DT_FLOAT:
|
||||
status = ComputeKernel<int32_t, float>(ctx);
|
||||
break;
|
||||
case DT_DOUBLE:
|
||||
status = ComputeKernel<int32_t, double>(ctx);
|
||||
break;
|
||||
case DT_COMPLEX64:
|
||||
status = ComputeKernel<int32_t, std::complex<float>>(ctx);
|
||||
break;
|
||||
case DT_COMPLEX128:
|
||||
status = ComputeKernel<int32_t, std::complex<double>>(ctx);
|
||||
break;
|
||||
default:
|
||||
KERNEL_LOG_ERROR("DenseToCSRSparseMatrix value type [%s] not support.", DTypeStr(value_type).c_str());
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
break;
|
||||
case DT_INT64:
|
||||
switch (value_type) {
|
||||
case DT_FLOAT:
|
||||
status = ComputeKernel<int64_t, float>(ctx);
|
||||
break;
|
||||
case DT_DOUBLE:
|
||||
status = ComputeKernel<int64_t, double>(ctx);
|
||||
break;
|
||||
case DT_COMPLEX64:
|
||||
status = ComputeKernel<int64_t, std::complex<float>>(ctx);
|
||||
break;
|
||||
case DT_COMPLEX128:
|
||||
status = ComputeKernel<int64_t, std::complex<double>>(ctx);
|
||||
break;
|
||||
default:
|
||||
KERNEL_LOG_ERROR("DenseToCSRSparseMatrix value type [%s] not support.", DTypeStr(value_type).c_str());
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
break;
|
||||
default:
|
||||
KERNEL_LOG_ERROR("DenseToCSRSparseMatrix indices type [%s] not support.", DTypeStr(indice_type).c_str());
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
KERNEL_HANDLE_ERROR(status, "DenseToCSRSparseMatrix kernel compute failed.");
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
|
||||
REGISTER_CPU_KERNEL(DenseToCSRSparseMatrix, DenseToCSRSparseMatrixCpuKernel);
|
||||
|
||||
template <typename indiceT, typename valueT>
|
||||
uint32_t DenseToCSRSparseMatrixCpuKernel::ComputeKernel(CpuKernelContext &ctx) {
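// Outputs: y_dense_shape, y_batch_pointers, y_row_pointers, y_col_indices and y_values of the CSR matrix.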
|
||||
auto dense_input_ptr = reinterpret_cast<valueT *>(ctx.Input(0)->GetData());
|
||||
auto indices_ptr = reinterpret_cast<indiceT *>(ctx.Input(1)->GetData());
|
||||
auto y_dense_shape_ptr = reinterpret_cast<indiceT *>(ctx.Output(0)->GetData());
|
||||
auto y_batch_pointers_ptr = reinterpret_cast<indiceT *>(ctx.Output(1)->GetData());
|
||||
auto y_row_pointers_ptr = reinterpret_cast<indiceT *>(ctx.Output(2)->GetData());
|
||||
auto y_col_indices_ptr = reinterpret_cast<indiceT *>(ctx.Output(3)->GetData());
|
||||
auto y_values_ptr = reinterpret_cast<valueT *>(ctx.Output(4)->GetData());
|
||||
// Copy the CSRSparseMatrix's dense_shape and values from the Dense.
|
||||
const int64_t rank = ctx.Input(1)->GetTensorShape()->GetDimSize(1);
|
||||
const int64_t total_nnz = ctx.Input(1)->GetTensorShape()->GetDimSize(0);
|
||||
const int64_t batch_size = (rank == 2) ? 1 : ctx.Input(0)->GetTensorShape()->GetDimSize(0);
|
||||
const int64_t num_rows = ctx.Input(0)->GetTensorShape()->GetDimSize((rank == 2) ? 0 : 1);
|
||||
const int64_t num_cols = ctx.Input(0)->GetTensorShape()->GetDimSize((rank == 2) ? 1 : 2);
|
||||
for (int64_t i = 0; i < rank; i++) {
|
||||
y_dense_shape_ptr[i] = ctx.Input(0)->GetTensorShape()->GetDimSize(i);
|
||||
}
|
||||
for (int64_t i = 0; i < total_nnz; i++) {
|
||||
if (rank == 2) {
|
||||
int64_t cur_idx = indices_ptr[i * rank] * num_cols + indices_ptr[i * rank + 1];
|
||||
y_values_ptr[i] = dense_input_ptr[cur_idx];
|
||||
} else {
|
||||
int64_t cur_idx = indices_ptr[i * rank] * num_rows * num_cols;
|
||||
cur_idx = cur_idx + indices_ptr[i * rank + 1] * num_cols + indices_ptr[i * rank + 2];
|
||||
y_values_ptr[i] = dense_input_ptr[cur_idx];
|
||||
}
|
||||
}
|
||||
for (int64_t i = 0; i < batch_size * (num_rows + 1); i++) {
|
||||
y_row_pointers_ptr[i] = 0;
|
||||
}
|
||||
int prev_batch = -1;
|
||||
if (rank == 2) {
|
||||
// For a single batch, the batch_ptrs are {0, total_nnz}.
|
||||
y_batch_pointers_ptr[0] = 0;
|
||||
++prev_batch;
|
||||
for (int64_t i = 0; i < total_nnz; ++i) {
|
||||
// For now, the rows pointers store the corresponding row counts.
|
||||
y_row_pointers_ptr[indices_ptr[i * rank] + 1] += 1;
|
||||
y_col_indices_ptr[i] = indices_ptr[i * rank + 1];
|
||||
}
|
||||
} else { // rank == 3
|
||||
for (int64_t i = 0; i < total_nnz; ++i) {
|
||||
const int cur_batch = indices_ptr[i * rank];
|
||||
// For now, the rows pointers store the corresponding row counts.
|
||||
y_row_pointers_ptr[cur_batch * (num_rows + 1) + indices_ptr[i * rank + 1] + 1] += 1;
|
||||
y_col_indices_ptr[i] = indices_ptr[i * rank + 2];
|
||||
// We're at a new batch and might have skipped over empty batches.
|
||||
while (prev_batch < cur_batch) {
|
||||
// The previous batch ends at position i.
|
||||
y_batch_pointers_ptr[prev_batch + 1] = i;
|
||||
++prev_batch;
|
||||
}
|
||||
}
|
||||
}
|
||||
// Set the last element of batch_ptr and account for trailing empty batches.
|
||||
while (prev_batch < batch_size) {
|
||||
y_batch_pointers_ptr[prev_batch + 1] = total_nnz;
|
||||
++prev_batch;
|
||||
}
|
||||
// Compute the cumulative row counts for each batch.
|
||||
for (int batch_idx = 0; batch_idx < batch_size; ++batch_idx) {
|
||||
auto *row_ptr_batch = y_row_pointers_ptr + batch_idx * (num_rows + 1);
|
||||
std::partial_sum(row_ptr_batch, row_ptr_batch + num_rows + 1, row_ptr_batch);
|
||||
}
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
} // namespace aicpu
|
|
@@ -0,0 +1,35 @@
|
|||
/**
|
||||
* Copyright (c) Huawei Technologies Co., Ltd. 2022-2022. All rights reserved.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
#ifndef AICPU_KERNELS_NORMALIZED_DENSE_TO_CSR_SPARSE_MATRIX_H_
|
||||
#define AICPU_KERNELS_NORMALIZED_DENSE_TO_CSR_SPARSE_MATRIX_H_
|
||||
|
||||
#include "cpu_ops_kernel.h"
|
||||
|
||||
namespace aicpu {
|
||||
|
||||
class DenseToCSRSparseMatrixCpuKernel : public CpuKernel {
|
||||
public:
|
||||
~DenseToCSRSparseMatrixCpuKernel() = default;
|
||||
|
||||
protected:
|
||||
uint32_t Compute(CpuKernelContext &ctx) override;
|
||||
|
||||
private:
|
||||
template <typename indiceT, typename valueT>
|
||||
uint32_t ComputeKernel(CpuKernelContext &ctx);
|
||||
};
|
||||
} // namespace aicpu
|
||||
#endif
|
|
@@ -0,0 +1,429 @@
|
|||
/**
|
||||
* Copyright (c) Huawei Technologies Co., Ltd. 2022. All rights reserved.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
#include "densetosparsesetoperation.h"
|
||||
#include <algorithm>
|
||||
#include <atomic>
|
||||
#include <mutex>
|
||||
#include <numeric>
|
||||
#include <set>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
#include "cpu_kernel_utils.h"
|
||||
#include "utils/allocator_utils.h"
|
||||
#include "utils/eigen_tensor.h"
|
||||
#include "utils/kernel_util.h"
|
||||
#include "kernel_log.h"
|
||||
#include "status.h"
|
||||
|
||||
namespace {
|
||||
const char *kDenseToSparseSetOperation = "DenseToSparseSetOperation";
|
||||
const uint32_t kOutputNum = 3;
|
||||
const uint32_t kInputNum = 4;
|
||||
constexpr int64_t kIndex0 = 0;
|
||||
constexpr int64_t kIndex1 = 1;
|
||||
constexpr int64_t kIndex2 = 2;
|
||||
constexpr int64_t kIndex3 = 3;
|
||||
const int64_t kParallelNum{64};
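// Groups are processed serially when their count does not exceed kParallelNum; larger workloads use ParallelFor.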
|
||||
} // namespace
|
||||
// Define namespace aicpu.
|
||||
namespace aicpu {
|
||||
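// Returns the row-major strides of a dense shape, e.g. {2, 3, 4} -> {12, 4, 1}.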
const std::vector<int64_t> Strides(const std::vector<int64_t> &shape) {
|
||||
std::vector<int64_t> result(shape.size());
|
||||
int64_t product = 1;
|
||||
for (int64_t i = static_cast<int64_t>(shape.size()) - 1; i >= 0; --i) {
|
||||
result[i] = product;
|
||||
product *= shape[i];
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
uint32_t GroupShape(const std::vector<int64_t> input_shape, std::vector<int64_t> &grouped_shape) {
|
||||
if (input_shape.size() < 2) {
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
// grouped_shape is input_shape[:-1]
|
||||
grouped_shape.assign(input_shape.begin(), input_shape.end() - 1);
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
|
||||
uint32_t CheckShapesMatch(const std::vector<int64_t> &shape1, const std::vector<int64_t> &shape2) {
|
||||
if (shape1.size() != shape2.size()) {
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
for (size_t i = 0; i < shape1.size(); i++) {
|
||||
if (shape1[i] != shape2[i]) return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
|
||||
uint32_t GroupShapeFromInputs(const std::vector<int64_t> &shape1, const std::vector<int64_t> &shape2,
|
||||
std::vector<int64_t> &group_shape) {
|
||||
std::vector<int64_t> group_shape_1;
|
||||
KERNEL_HANDLE_ERROR(GroupShape(shape1, group_shape_1), "X1_Shape rank is less than 2.");
|
||||
std::vector<int64_t> group_shape_2;
|
||||
KERNEL_HANDLE_ERROR(GroupShape(shape2, group_shape_2), "X2_Shape rank is less than 2.");
|
||||
KERNEL_HANDLE_ERROR(CheckShapesMatch(group_shape_1, group_shape_2), "Two shapes mismatch with each other.");
|
||||
group_shape.assign(group_shape_1.begin(), group_shape_1.end());
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
|
||||
uint32_t GetNumElements(const std::vector<int64_t> input_shape, int64_t &res) {
|
||||
int64_t result = 1;
|
||||
for (uint32_t i = 0; i < input_shape.size(); i++) {
|
||||
KERNEL_CHECK_FALSE(MulWithoutOverflow(input_shape[i], result, result), KERNEL_STATUS_PARAM_INVALID,
|
||||
"Overflow when calculate shape size.");
|
||||
}
|
||||
res = result;
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
|
||||
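// Decomposes a flat group index into per-dimension group indices (row-major order).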
void DenseToSparseSetOperationCpuKernel::PopulateGroupIndices(const int64_t flat_group_index,
|
||||
const std::vector<int64_t> &group_shape,
|
||||
std::vector<int64_t> &group_indices) {
|
||||
group_indices.clear();
|
||||
int64_t running_flat_group_index = flat_group_index;
|
||||
for (int64_t group_dim_index = static_cast<int64_t>(group_shape.size()) - 1; group_dim_index >= 0;
|
||||
--group_dim_index) {
|
||||
const auto group_dim = group_shape[group_dim_index];
|
||||
group_indices.insert(group_indices.begin(), running_flat_group_index % group_dim);
|
||||
running_flat_group_index /= group_dim;
|
||||
}
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
uint32_t DenseToSparseSetOperationCpuKernel::PopulateFromDenseGroup(Tensor *input_tensor,
|
||||
const std::vector<int64_t> &input_strides,
|
||||
const std::vector<int64_t> &group_indices,
|
||||
std::set<T> &result) {
|
||||
result.clear();
|
||||
EigenTensor input_tensor_eigen(input_tensor, input_tensor->GetData());
|
||||
auto input_flat = input_tensor_eigen.flat<T>();
|
||||
const auto start = std::inner_product(group_indices.begin(), group_indices.end(), input_strides.begin(), 0LL);
|
||||
auto input_shape = input_tensor->GetTensorShape();
|
||||
const auto end = start + input_shape->GetDimSize(input_shape->GetDims() - 1);
|
||||
for (int64_t i = start; i < end; ++i) {
|
||||
result.insert(input_flat(i));
|
||||
}
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
uint32_t DenseToSparseSetOperationCpuKernel::PopulateFromSparseGroup(const Group &group,
|
||||
const std::vector<int64_t> &sparse_tensor_shape,
|
||||
std::set<T> &result) {
|
||||
KERNEL_HANDLE_ERROR(CheckGroup<T>(group, sparse_tensor_shape), "PopulateFromSparseGroup check error.");
|
||||
result.clear();
|
||||
const auto &group_values = group.values<T>();
|
||||
for (int64_t i = 0; i < group_values.size(); ++i) {
|
||||
result.insert(group_values(i));
|
||||
}
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
uint32_t DenseToSparseSetOperationCpuKernel::CheckGroup(const Group &group,
|
||||
const std::vector<int64_t> &sparse_tensor_shape) {
|
||||
const auto &indices = group.indices();
|
||||
const auto &values = group.values<T>();
|
||||
const auto num_values = values.dimension(0);
|
||||
|
||||
// Sanity check: valid indices.
|
||||
const uint32_t expected_rank = sparse_tensor_shape.size();
|
||||
for (uint32_t j = 0; j < expected_rank; ++j) {
|
||||
const auto dim_size = sparse_tensor_shape[j];
|
||||
KERNEL_CHECK_FALSE(dim_size > 0, KERNEL_STATUS_PARAM_INVALID, "Invalid dim_size [%d] for index [%d]", dim_size, j);
|
||||
for (int64_t i = 0; i < num_values; ++i) {
|
||||
const auto index = indices(i, j);
|
||||
KERNEL_CHECK_FALSE(dim_size > index, KERNEL_STATUS_PARAM_INVALID,
|
||||
"indices index ([%d],[%d]) expected < [%d], got [%d].", i, j, dim_size, index);
|
||||
}
|
||||
}
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
void DenseToSparseSetOperationCpuKernel::ApplySetOperation(const std::set<T> &set1, const std::set<T> &set2,
|
||||
std::set<T> &result, SetOperation set_operation_) {
|
||||
switch (set_operation_) {
|
||||
case A_MINUS_B:
|
||||
std::set_difference(set1.begin(), set1.end(), set2.begin(), set2.end(), std::inserter(result, result.begin()));
|
||||
break;
|
||||
case B_MINUS_A:
|
||||
std::set_difference(set2.begin(), set2.end(), set1.begin(), set1.end(), std::inserter(result, result.begin()));
|
||||
break;
|
||||
case INTERSECTION:
|
||||
std::set_intersection(set1.begin(), set1.end(), set2.begin(), set2.end(), std::inserter(result, result.begin()));
|
||||
break;
|
||||
case UNION:
|
||||
std::set_union(set1.begin(), set1.end(), set2.begin(), set2.end(), std::inserter(result, result.begin()));
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
uint32_t DenseToSparseSetOperationCpuKernel::OutputSparseTensor(
|
||||
DataBank &databank, const std::vector<int64_t> &output_shape, const int64_t num_values,
|
||||
const std::map<std::vector<int64_t>, std::set<T>> &sets) {
|
||||
Tensor *out_indices, *out_values, *out_shape;
|
||||
out_indices = databank.result_indices;
|
||||
out_values = databank.result_values;
|
||||
out_shape = databank.result_shape;
|
||||
|
||||
EigenTensor out_indices_t(out_indices, out_indices->GetData());
|
||||
auto out_indices_mat = out_indices_t.matrix<int64_t>();
|
||||
EigenTensor out_values_t(out_values, out_values->GetData());
|
||||
auto out_values_flat = out_values_t.vec<T>();
|
||||
EigenTensor out_shape_t(out_shape, out_shape->GetData());
|
||||
auto out_shape_flat = out_shape_t.vec<int64_t>();
|
||||
|
||||
int64_t value_index = 0;
|
||||
for (auto it = sets.begin(); it != sets.end(); ++it) {
|
||||
const auto &group_indices = it->first;
|
||||
KERNEL_CHECK_FALSE(group_indices.size() == output_shape.size() - 1, KERNEL_STATUS_PARAM_INVALID,
|
||||
"Invalid number of indices [%d] expected [%].", group_indices.size(), output_shape.size() - 1)
|
||||
const auto &set = it->second;
|
||||
|
||||
// For each set item, write its indices and value to output tensors.
|
||||
int64_t group_value_index = 0;
|
||||
for (auto value = set.begin(); value != set.end(); ++value, ++value_index, ++group_value_index) {
|
||||
// First n-1 dimensions are the group, last dimension is the position in
|
||||
// the set.
|
||||
for (uint32_t i = 0; i < group_indices.size(); ++i) {
|
||||
out_indices_mat(value_index, i) = group_indices[i];
|
||||
}
|
||||
out_indices_mat(value_index, group_indices.size()) = group_value_index;
|
||||
|
||||
out_values_flat(value_index) = *value;
|
||||
}
|
||||
}
|
||||
|
||||
for (uint32_t i = 0; i < output_shape.size(); ++i) {
|
||||
out_shape_flat(i) = output_shape[i];
|
||||
}
|
||||
|
||||
out_indices->GetTensorShape()->SetDimSizes({num_values, static_cast<int64_t>(output_shape.size())});
|
||||
out_values->GetTensorShape()->SetDimSizes({num_values});
|
||||
out_shape->GetTensorShape()->SetDimSizes({static_cast<int64_t>(output_shape.size())});
|
||||
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
|
||||
uint32_t DenseToSparseSetOperationCpuKernel::NullptrAndMatVecCheck(CpuKernelContext &ctx, DataBank &databank) {
|
||||
databank.set1 = ctx.Input(kIndex0);
|
||||
databank.set2_indices = ctx.Input(kIndex1);
|
||||
databank.set2_values = ctx.Input(kIndex2);
|
||||
databank.set2_shape = ctx.Input(kIndex3);
|
||||
databank.result_indices = ctx.Output(kIndex0);
|
||||
databank.result_values = ctx.Output(kIndex1);
|
||||
databank.result_shape = ctx.Output(kIndex2);
|
||||
databank.ctx = &ctx;
|
||||
AttrValue *validate_indices = ctx.GetAttr("validate_indices");
|
||||
if (validate_indices == nullptr) {
|
||||
databank.validate_indices_ = true;
|
||||
} else {
|
||||
databank.validate_indices_ = validate_indices->GetBool();
|
||||
}
|
||||
AttrValue *set_operation = ctx.GetAttr("set_operation");
|
||||
KERNEL_CHECK_NULLPTR(set_operation, KERNEL_STATUS_PARAM_INVALID, "Missing set_operation.")
|
||||
std::string set_operation_str = set_operation->GetString();
|
||||
std::transform(set_operation_str.begin(), set_operation_str.end(), set_operation_str.begin(), ::tolower);
|
||||
if ("a-b" == set_operation_str) {
|
||||
databank.set_operation_ = A_MINUS_B;
|
||||
} else if ("b-a" == set_operation_str) {
|
||||
databank.set_operation_ = B_MINUS_A;
|
||||
} else if ("intersection" == set_operation_str) {
|
||||
databank.set_operation_ = INTERSECTION;
|
||||
} else if ("union" == set_operation_str) {
|
||||
databank.set_operation_ = UNION;
|
||||
} else {
|
||||
KERNEL_LOG_ERROR("Invalid set_operation.");
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
uint32_t DenseToSparseSetOperationCpuKernel::ComputeDenseToSparse(DataBank &databank) {
|
||||
EigenTensor set2_shape_e(databank.set2_shape, databank.set2_shape->GetData());
|
||||
auto set2_shape = set2_shape_e.vec<int64_t>();
|
||||
std::vector<int64_t> shape2(set2_shape.size());
|
||||
for (int64_t i = 0; i < set2_shape.size(); ++i) {
|
||||
shape2[i] = set2_shape(i);
|
||||
}
|
||||
const auto rank = shape2.size();
|
||||
std::vector<int64_t> order(rank);
|
||||
std::iota(order.begin(), order.end(), 0);
|
||||
SparseTensor set2;
|
||||
|
||||
Tensor *set1_t = databank.set1;
|
||||
SparseTensor *set2_st = &set2;
|
||||
KERNEL_HANDLE_ERROR(set2_st->CreateSparseTensor(databank.set2_indices, databank.set2_values, shape2, order),
|
||||
"create sparse tenser fail.");
|
||||
if (databank.validate_indices_) {
|
||||
KERNEL_HANDLE_ERROR(set2_st->IndicesValid(*databank.ctx), "IndicesValid failed.");
|
||||
}
|
||||
std::vector<int64_t> group_shape;
|
||||
const auto shape1 = set1_t->GetTensorShape()->GetDimSizes();
|
||||
|
||||
KERNEL_HANDLE_ERROR(GroupShapeFromInputs(shape1, shape2, group_shape), "GroupShapeFromInputs error.");
|
||||
const std::vector<int64_t> set1_strides = Strides(shape1);
|
||||
std::map<std::vector<int64_t>, std::set<T>> group_sets;
|
||||
int64_t num_result_values = 0;
|
||||
int64_t max_set_size = 0;
|
||||
int64_t num_elements;
|
||||
KERNEL_HANDLE_ERROR(GetNumElements(group_shape, num_elements), "NumElements error.");
|
||||
if (num_elements <= kParallelNum) {
|
||||
std::set<T> set1_group_set;
|
||||
std::set<T> set2_group_set;
|
||||
const std::vector<int64_t> subspan(order.begin(), order.end() - 1);
|
||||
auto set2_grouper = set2_st->group(subspan);
|
||||
auto set2_group_it = set2_grouper.begin();
|
||||
std::vector<int64_t> group_indices;
|
||||
for (int64_t flat_group_index = 0; flat_group_index < num_elements; ++flat_group_index) {
|
||||
PopulateGroupIndices(flat_group_index, group_shape, group_indices);
|
||||
|
||||
// Get values from set1.
|
||||
PopulateFromDenseGroup<T>(set1_t, set1_strides, group_indices, set1_group_set);
|
||||
// Get values from set2, if applicable.
|
||||
set2_group_set.clear();
|
||||
if (set2_group_it != set2_grouper.end()) {
|
||||
const auto &group = *set2_group_it;
|
||||
const auto set2_group_indices = group.group();
|
||||
bool group_match = true;
|
||||
for (uint32_t i = 0; group_match && (i < set2_group_indices.size()); ++i) {
|
||||
if (set2_group_indices[i] != group_indices[i]) {
|
||||
group_match = false;
|
||||
}
|
||||
}
|
||||
if (group_match) {
|
||||
KERNEL_HANDLE_ERROR(PopulateFromSparseGroup<T>(group, shape2, set2_group_set),
|
||||
"PopulateFromSparseGroup error.");
|
||||
++set2_group_it;
|
||||
}
|
||||
}
|
||||
|
||||
std::set<T> group_set;
|
||||
ApplySetOperation(set1_group_set, set2_group_set, group_set, databank.set_operation_);
|
||||
if (!group_set.empty()) {
|
||||
group_sets[group_indices] = group_set;
|
||||
int64_t set_size = group_set.size();
|
||||
if (set_size > max_set_size) {
|
||||
max_set_size = set_size;
|
||||
}
|
||||
num_result_values += set_size;
|
||||
}
|
||||
}
|
||||
} else {
|
||||
std::mutex mt;
|
||||
int64_t total = num_elements;
|
||||
uint32_t cores = CpuKernelUtils::GetCPUNum(*databank.ctx);
|
||||
int64_t per_unit_size = (total / std::min(std::max(1L, cores - 2L), total));
|
||||
uint32_t ret =
|
||||
CpuKernelUtils::ParallelFor(*databank.ctx, total, per_unit_size, [&](int64_t begin, int64_t end) -> uint32_t {
|
||||
std::set<T> set1_group_set;
|
||||
std::set<T> set2_group_set;
|
||||
const std::vector<int64_t> subspan(order.begin(), order.end() - 1);
|
||||
auto set2_grouper = set2_st->group(subspan);
|
||||
auto set2_group_it = set2_grouper.begin();
|
||||
std::vector<int64_t> group_indices;
|
||||
for (int64_t flat_group_index = begin; flat_group_index < end; ++flat_group_index) {
|
||||
PopulateGroupIndices(flat_group_index, group_shape, group_indices);
|
||||
|
||||
// Get values from set1.
|
||||
PopulateFromDenseGroup<T>(set1_t, set1_strides, group_indices, set1_group_set);
|
||||
// Get values from set2, if applicable.
|
||||
set2_group_set.clear();
|
||||
if (set2_group_it != set2_grouper.end()) {
|
||||
const auto &group = *set2_group_it;
|
||||
const auto set2_group_indices = group.group();
|
||||
bool group_match = true;
|
||||
for (uint32_t i = 0; group_match && (i < set2_group_indices.size()); ++i) {
|
||||
if (set2_group_indices[i] != group_indices[i]) {
|
||||
group_match = false;
|
||||
}
|
||||
}
|
||||
if (group_match) {
|
||||
KERNEL_HANDLE_ERROR(PopulateFromSparseGroup<T>(group, shape2, set2_group_set),
|
||||
"PopulateFromSparseGroup error.");
|
||||
++set2_group_it;
|
||||
}
|
||||
}
|
||||
|
||||
std::set<T> group_set;
|
||||
ApplySetOperation(set1_group_set, set2_group_set, group_set, databank.set_operation_);
|
||||
if (!group_set.empty()) {
|
||||
std::lock_guard<std::mutex> lck(mt);
|
||||
group_sets[group_indices] = group_set;
|
||||
int64_t set_size = group_set.size();
|
||||
if (set_size > max_set_size) {
|
||||
max_set_size = set_size;
|
||||
}
|
||||
num_result_values += set_size;
|
||||
}
|
||||
}
|
||||
return KERNEL_STATUS_OK;
|
||||
});
|
||||
KERNEL_CHECK_FALSE((ret == KERNEL_STATUS_OK), KERNEL_STATUS_INNER_ERROR,
|
||||
"DenseToSparseSetOperation compute failed.");
|
||||
}
|
||||
|
||||
group_shape.push_back(max_set_size);
|
||||
return OutputSparseTensor<T>(databank, group_shape, num_result_values, group_sets);
|
||||
}
|
||||
|
||||
uint32_t DenseToSparseSetOperationCpuKernel::Compute(CpuKernelContext &ctx) {
|
||||
KERNEL_HANDLE_ERROR(NormalCheck(ctx, kInputNum, kOutputNum),
|
||||
"DenseToSparseSetOperation check input and output number failed.");
|
||||
DataBank databank;
|
||||
KERNEL_HANDLE_ERROR(NullptrAndMatVecCheck(ctx, databank), "DenseToSparseSetOperation check params failed.");
|
||||
DataType dt = static_cast<DataType>(databank.set2_values->GetDataType());
|
||||
|
||||
uint32_t KERNEL_STATUS;
|
||||
switch (dt) {
|
||||
case DT_INT8:
|
||||
KERNEL_STATUS = ComputeDenseToSparse<int8_t>(databank);
|
||||
break;
|
||||
case DT_UINT8:
|
||||
KERNEL_STATUS = ComputeDenseToSparse<uint8_t>(databank);
|
||||
break;
|
||||
case DT_INT16:
|
||||
KERNEL_STATUS = ComputeDenseToSparse<int16_t>(databank);
|
||||
break;
|
||||
case DT_UINT16:
|
||||
KERNEL_STATUS = ComputeDenseToSparse<uint16_t>(databank);
|
||||
break;
|
||||
case DT_INT32:
|
||||
KERNEL_STATUS = ComputeDenseToSparse<int32_t>(databank);
|
||||
break;
|
||||
case DT_INT64:
|
||||
KERNEL_STATUS = ComputeDenseToSparse<int64_t>(databank);
|
||||
break;
|
||||
case DT_STRING:
|
||||
KERNEL_STATUS = ComputeDenseToSparse<std::string>(databank);
|
||||
break;
|
||||
default:
|
||||
KERNEL_LOG_ERROR("DenseToSparseSetOperation can't support this data type [%s].", DTypeStr(dt).c_str());
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
if (KERNEL_STATUS != KERNEL_STATUS_OK) {
|
||||
KERNEL_LOG_ERROR("DenseToSparseSetOperation failed.");
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
|
||||
REGISTER_CPU_KERNEL(kDenseToSparseSetOperation, DenseToSparseSetOperationCpuKernel);
|
||||
} // namespace aicpu
|
|
@@ -0,0 +1,80 @@
|
|||
/**
|
||||
* Copyright (c) Huawei Technologies Co., Ltd. 2022. All rights reserved.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
#include <set>
|
||||
#include "cpu_ops_kernel.h"
|
||||
#include "utils/sparse_group.h"
|
||||
#include "utils/sparse_tensor.h"
|
||||
// Define namespace aicpu.
|
||||
|
||||
namespace aicpu {
|
||||
enum SetOperation { A_MINUS_B = 0, B_MINUS_A = 1, INTERSECTION = 2, UNION = 3 };
|
||||
struct DataBank {
|
||||
DataBank()
|
||||
: set1(nullptr),
|
||||
set2_indices(nullptr),
|
||||
set2_values(nullptr),
|
||||
set2_shape(nullptr),
|
||||
result_indices(nullptr),
|
||||
result_values(nullptr),
|
||||
result_shape(nullptr) {}
|
||||
Tensor *set1;
|
||||
Tensor *set2_indices;
|
||||
Tensor *set2_values;
|
||||
Tensor *set2_shape;
|
||||
Tensor *result_indices;
|
||||
Tensor *result_values;
|
||||
Tensor *result_shape;
|
||||
SetOperation set_operation_;
|
||||
bool validate_indices_;
|
||||
CpuKernelContext *ctx;
|
||||
};
|
||||
|
||||
// The operator class inherits from the CpuKernel base class.
|
||||
class DenseToSparseSetOperationCpuKernel : public CpuKernel {
|
||||
public:
|
||||
~DenseToSparseSetOperationCpuKernel() = default;
|
||||
DenseToSparseSetOperationCpuKernel() = default;
|
||||
// Declare the Compute function; it must be overridden.
|
||||
uint32_t Compute(CpuKernelContext &ctx) override;
|
||||
|
||||
private:
|
||||
uint32_t NullptrAndMatVecCheck(CpuKernelContext &ctx, DataBank &calc_info);
|
||||
|
||||
template <typename T>
|
||||
uint32_t ComputeDenseToSparse(DataBank &databank);
|
||||
|
||||
template <typename T>
|
||||
uint32_t CheckGroup(const Group &group, const std::vector<int64_t> &sparse_tensor_shape);
|
||||
|
||||
template <typename T>
|
||||
uint32_t PopulateFromSparseGroup(const Group &group, const std::vector<int64_t> &sparse_tensor_shape,
|
||||
std::set<T> &result);
|
||||
template <typename T>
|
||||
uint32_t PopulateFromDenseGroup(Tensor *input_tensor, const std::vector<int64_t> &input_strides,
|
||||
const std::vector<int64_t> &group_indices, std::set<T> &result);
|
||||
|
||||
void PopulateGroupIndices(const int64_t flat_group_index, const std::vector<int64_t> &group_shape,
|
||||
std::vector<int64_t> &group_indices);
|
||||
|
||||
template <typename T>
|
||||
void ApplySetOperation(const std::set<T> &set1, const std::set<T> &set2, std::set<T> &result,
|
||||
SetOperation set_operation_);
|
||||
|
||||
template <typename T>
|
||||
uint32_t OutputSparseTensor(DataBank &databank, const std::vector<int64_t> &output_shape, const int64_t num_values,
|
||||
const std::map<std::vector<int64_t>, std::set<T>> &sets);
|
||||
};
|
||||
} // namespace aicpu
|
|
@@ -0,0 +1,121 @@
|
|||
/**
|
||||
* Copyright 2021 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
#include "diag.h"
|
||||
|
||||
#include "cpu_kernel_utils.h"
|
||||
#include "utils/eigen_tensor.h"
|
||||
#include "utils/kernel_util.h"
|
||||
|
||||
namespace {
|
||||
const uint32_t kOutputNum = 1;
|
||||
const uint32_t kInputNum = 1;
|
||||
const char *kDiag = "Diag";
|
||||
constexpr int64_t kParallelDataNums = 80 * 32;
|
||||
constexpr int64_t kParallelDataNumsMid = 8 * 1024;
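// Thresholds in bytes: inputs up to kParallelDataNums run single-threaded; up to kParallelDataNumsMid use at most 4 cores.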
|
||||
|
||||
#define DIAG_COMPUTE_CASE(DTYPE, TYPE, CTX) \
|
||||
case (DTYPE): { \
|
||||
uint32_t result = DiagCompute<TYPE>(CTX); \
|
||||
if (result != KERNEL_STATUS_OK) { \
|
||||
KERNEL_LOG_ERROR("Diag kernel compute failed."); \
|
||||
return result; \
|
||||
} \
|
||||
break; \
|
||||
}
|
||||
} // namespace
|
||||
|
||||
namespace aicpu {
|
||||
uint32_t DiagCpuKernel::Compute(CpuKernelContext &ctx) {
|
||||
// check params
|
||||
KERNEL_HANDLE_ERROR(NormalCheck(ctx, kInputNum, kOutputNum), "[%s] check input and output failed.", kDiag);
|
||||
KERNEL_HANDLE_ERROR(DiagCheck(ctx), "[%s] check params failed.", kDiag);
|
||||
auto data_type = ctx.Input(0)->GetDataType();
|
||||
switch (data_type) {
|
||||
DIAG_COMPUTE_CASE(DT_FLOAT16, Eigen::half, ctx)
|
||||
DIAG_COMPUTE_CASE(DT_FLOAT, float, ctx)
|
||||
DIAG_COMPUTE_CASE(DT_DOUBLE, double, ctx)
|
||||
DIAG_COMPUTE_CASE(DT_INT32, int32_t, ctx)
|
||||
DIAG_COMPUTE_CASE(DT_INT64, int64_t, ctx)
|
||||
DIAG_COMPUTE_CASE(DT_COMPLEX64, std::complex<float>, ctx)
|
||||
DIAG_COMPUTE_CASE(DT_COMPLEX128, std::complex<double>, ctx)
|
||||
default:
|
||||
KERNEL_LOG_ERROR("Diag kernel data type [%s] not support.", DTypeStr(data_type).c_str());
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
|
||||
uint32_t DiagCpuKernel::DiagCheck(CpuKernelContext &ctx) {
|
||||
KERNEL_CHECK_NULLPTR(ctx.Input(0)->GetData(), KERNEL_STATUS_PARAM_INVALID, "Get input data failed.")
|
||||
KERNEL_CHECK_NULLPTR(ctx.Output(0)->GetData(), KERNEL_STATUS_PARAM_INVALID, "Get output data failed.")
|
||||
KERNEL_CHECK_NULLPTR(ctx.Input(0)->GetTensorShape(), KERNEL_STATUS_PARAM_INVALID, "Get input tensor shape failed.")
|
||||
KERNEL_CHECK_NULLPTR(ctx.Output(0)->GetTensorShape(), KERNEL_STATUS_PARAM_INVALID, "Get output tensor shape failed.")
|
||||
|
||||
std::vector<int64_t> shape_input = ctx.Input(0)->GetTensorShape()->GetDimSizes();
|
||||
std::vector<int64_t> shape_output = ctx.Output(0)->GetTensorShape()->GetDimSizes();
|
||||
KERNEL_CHECK_FALSE((shape_input.size() != 0), KERNEL_STATUS_PARAM_INVALID,
|
||||
"Input must be at least rank 1, got [%zu].", shape_input.size())
|
||||
KERNEL_CHECK_FALSE((shape_output.size() == shape_input.size() * 2), KERNEL_STATUS_PARAM_INVALID,
                   "The output shape size should be twice the input shape size, "
                   "but the input shape size is [%zu] and the output shape size is [%zu].",
                   shape_input.size(), shape_output.size())
|
||||
for (size_t i = 0; i < shape_output.size(); ++i) {
|
||||
KERNEL_CHECK_FALSE((shape_input[i % shape_input.size()] == shape_output[i]), KERNEL_STATUS_PARAM_INVALID,
|
||||
"Invalid shape: the input dimension [%zu] size [%zu] does not match "
|
||||
"the output dimension [%zu] size [%zu].",
|
||||
i % shape_input.size(), shape_input[i % shape_input.size()], i, shape_output[i])
|
||||
}
|
||||
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
uint32_t DiagCpuKernel::DiagCompute(CpuKernelContext &ctx) {
|
||||
auto input = reinterpret_cast<T *>(ctx.Input(0)->GetData());
|
||||
auto output = reinterpret_cast<T *>(ctx.Output(0)->GetData());
|
||||
|
||||
int64_t size = ctx.Input(0)->NumElements();
|
||||
int64_t data_size = size * sizeof(T);
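// The output is a size x size matrix: zero-fill it and copy the input onto the main diagonal (stride size + 1).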
|
||||
|
||||
if (data_size <= kParallelDataNums) {
|
||||
std::fill(output, output + size * size, T());
|
||||
for (int64_t index = 0; index < size; index++) {
|
||||
*(output + (1 + size) * index) = *(input + index);
|
||||
}
|
||||
} else {
|
||||
uint32_t min_core_num = 1;
|
||||
uint32_t max_core_num = std::max(min_core_num, aicpu::CpuKernelUtils::GetCPUNum(ctx) - 2);
|
||||
if (data_size <= kParallelDataNumsMid) {
|
||||
max_core_num = std::min(max_core_num, 4U); // up to 4 cpu cores
|
||||
}
|
||||
if (max_core_num > size) {
|
||||
max_core_num = size;
|
||||
}
|
||||
auto shard_diag = [&](int64_t start, int64_t end) {
|
||||
std::fill(output + size * start, output + size * end, T());
|
||||
for (int64_t index = start; index < end; index++) {
|
||||
*(output + (1 + size) * index) = *(input + index);
|
||||
}
|
||||
};
|
||||
KERNEL_HANDLE_ERROR(CpuKernelUtils::ParallelFor(ctx, size, size / max_core_num, shard_diag),
|
||||
"Diag Compute failed.");
|
||||
}
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
|
||||
REGISTER_CPU_KERNEL(kDiag, DiagCpuKernel);
|
||||
} // namespace aicpu
|
|
@@ -1,5 +1,5 @@
/**
 * Copyright (c) Huawei Technologies Co., Ltd. 2020-2021. All rights reserved.
 * Copyright 2021 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -13,23 +13,25 @@
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
#ifndef AICPU_KERNELS_NORMALIZED_IS_INF_H_
#define AICPU_KERNELS_NORMALIZED_IS_INF_H_
#ifndef AICPU_KERNELS_NORMALIZED_DIAG_H_
#define AICPU_KERNELS_NORMALIZED_DIAG_H_

#include "cpu_ops_kernel.h"

namespace aicpu {
class IsInfCpuKernel : public CpuKernel {
class DiagCpuKernel : public CpuKernel {
 public:
  IsInfCpuKernel() = default;
  ~IsInfCpuKernel() override = default;
  DiagCpuKernel() = default;
  ~DiagCpuKernel() override = default;

 protected:
  uint32_t Compute(CpuKernelContext &ctx) override;

 private:
  uint32_t IsInfCheck(const CpuKernelContext &ctx) const;
  uint32_t DiagCheck(CpuKernelContext &ctx);

  template <typename T>
  uint32_t IsInfCompute(const CpuKernelContext &ctx);
  uint32_t DiagCompute(CpuKernelContext &ctx);
};
} // namespace aicpu
#endif
@@ -0,0 +1,87 @@
|
|||
/**
|
||||
* Copyright 2021 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
#include "diag_part.h"
|
||||
#include "cpu_kernel_utils.h"
|
||||
#include "utils/eigen_tensor.h"
|
||||
#include "utils/kernel_util.h"
|
||||
|
||||
namespace {
|
||||
const uint32_t kOutputNum = 1;
|
||||
const uint32_t kInputNum = 1;
|
||||
const char *kDiagPart = "DiagPart";
|
||||
|
||||
#define DIAGPART_COMPUTE_CASE(DTYPE, TYPE, CTX) \
|
||||
case (DTYPE): { \
|
||||
uint32_t result = DiagPartCompute<TYPE>(CTX); \
|
||||
if (result != KERNEL_STATUS_OK) { \
|
||||
KERNEL_LOG_ERROR("DiagPart kernel compute failed."); \
|
||||
return result; \
|
||||
} \
|
||||
break; \
|
||||
}
|
||||
} // namespace
|
||||
|
||||
namespace aicpu {
|
||||
uint32_t DiagPartCpuKernel::Compute(CpuKernelContext &ctx) {
|
||||
// check params
|
||||
KERNEL_HANDLE_ERROR(NormalCheck(ctx, kInputNum, kOutputNum), "[%s] check input and output failed.", kDiagPart);
|
||||
KERNEL_HANDLE_ERROR(DiagPartCheck(ctx), "[%s] check params failed.", kDiagPart);
|
||||
auto data_type = ctx.Input(0)->GetDataType();
|
||||
switch (data_type) {
|
||||
DIAGPART_COMPUTE_CASE(DT_FLOAT16, Eigen::half, ctx)
|
||||
DIAGPART_COMPUTE_CASE(DT_FLOAT, float, ctx)
|
||||
DIAGPART_COMPUTE_CASE(DT_DOUBLE, double, ctx)
|
||||
DIAGPART_COMPUTE_CASE(DT_INT32, int32_t, ctx)
|
||||
DIAGPART_COMPUTE_CASE(DT_INT64, int64_t, ctx)
|
||||
DIAGPART_COMPUTE_CASE(DT_COMPLEX64, std::complex<float>, ctx)
|
||||
DIAGPART_COMPUTE_CASE(DT_COMPLEX128, std::complex<double>, ctx)
|
||||
default:
|
||||
KERNEL_LOG_ERROR("DiagPart kernel data type [%s] not supports.", DTypeStr(data_type).c_str());
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
|
||||
uint32_t DiagPartCpuKernel::DiagPartCheck(CpuKernelContext &ctx) {
|
||||
std::vector<int64_t> shape_input = ctx.Input(0)->GetTensorShape()->GetDimSizes();
|
||||
std::vector<int64_t> shape_output = ctx.Output(0)->GetTensorShape()->GetDimSizes();
|
||||
KERNEL_CHECK_FALSE((shape_input.size() % 2 == 0), KERNEL_STATUS_PARAM_INVALID,
|
||||
"The rank of the tensor should be even and positive.");
|
||||
for (size_t i = 0; i < shape_output.size(); i++) {
|
||||
KERNEL_CHECK_FALSE((shape_input[i] == shape_input[i + shape_output.size()]), KERNEL_STATUS_PARAM_INVALID,
|
||||
"Invalid shape: the input dimension [%zu] size [%zu] does not match "
|
||||
"the input dimension [%zu] size [%zu].",
|
||||
i, shape_input[i], i + shape_output.size(), shape_input[i + shape_output.size()]);
|
||||
}
|
||||
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
uint32_t DiagPartCpuKernel::DiagPartCompute(CpuKernelContext &ctx) {
|
||||
auto input = reinterpret_cast<T *>(ctx.Input(0)->GetData());
|
||||
auto output = reinterpret_cast<T *>(ctx.Output(0)->GetData());
|
||||
|
||||
uint64_t size = ctx.Output(0)->NumElements();
|
||||
for (size_t index = 0; index < size; index++) {
|
||||
*(output + index) = *(input + (1 + size) * index);
|
||||
}
|
||||
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
|
||||
REGISTER_CPU_KERNEL(kDiagPart, DiagPartCpuKernel);
|
||||
} // namespace aicpu
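
The Diag and DiagPart kernels above are inverses over flattened indices: Diag scatters input element i to flat offset (1 + size) * i of a zero-filled output, and DiagPart gathers the same offsets back, with size taken from the input element count (Diag) or the output element count (DiagPart). A minimal standalone sketch of that mapping, assuming plain std::vector buffers rather than the AICPU Tensor API:

#include <cstdint>
#include <vector>

// Diag: scatter in[i] onto the main diagonal of an n*n zero-filled buffer.
std::vector<double> DiagSketch(const std::vector<double> &in) {
  const int64_t n = static_cast<int64_t>(in.size());
  std::vector<double> out(n * n, 0.0);  // mirrors the std::fill step in the kernel
  for (int64_t i = 0; i < n; ++i) {
    out[(1 + n) * i] = in[i];
  }
  return out;
}

// DiagPart: gather the diagonal back; n is the output element count.
std::vector<double> DiagPartSketch(const std::vector<double> &in, int64_t n) {
  std::vector<double> out(n);
  for (int64_t i = 0; i < n; ++i) {
    out[i] = in[(1 + n) * i];
  }
  return out;
}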
|
|
@@ -0,0 +1,37 @@
|
|||
/**
|
||||
* Copyright 2021 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
#ifndef AICPU_KERNELS_NORMALIZED_DIAG_PART_H_
|
||||
#define AICPU_KERNELS_NORMALIZED_DIAG_PART_H_
|
||||
|
||||
#include "cpu_ops_kernel.h"
|
||||
|
||||
namespace aicpu {
|
||||
class DiagPartCpuKernel : public CpuKernel {
|
||||
public:
|
||||
DiagPartCpuKernel() = default;
|
||||
~DiagPartCpuKernel() override = default;
|
||||
|
||||
protected:
|
||||
uint32_t Compute(CpuKernelContext &ctx) override;
|
||||
|
||||
private:
|
||||
uint32_t DiagPartCheck(CpuKernelContext &ctx);
|
||||
|
||||
template <typename T>
|
||||
uint32_t DiagPartCompute(CpuKernelContext &ctx);
|
||||
};
|
||||
} // namespace aicpu
|
||||
#endif
|
|
@@ -0,0 +1,227 @@
|
|||
/**
|
||||
* Copyright (c) Huawei Technologies Co., Ltd. 2021-2022. All rights reserved.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include "diagonal.h"
|
||||
|
||||
#include "cpu_kernel_utils.h"
|
||||
#include "kernel_log.h"
|
||||
#include "utils/eigen_tensor.h"
|
||||
#include "utils/kernel_util.h"
|
||||
#define N2 2
|
||||
#define N3 3
|
||||
#define N4 4
|
||||
|
||||
using namespace std;
|
||||
|
||||
namespace {
|
||||
const size_t kOutputNum = 1;
|
||||
const size_t kInputNum = 1;
|
||||
const char *kDiagonal = "Diagonal";
|
||||
// when input data size is more than kParallelDataNum, use Parallel func
|
||||
const int64_t kParallelDataNum = 400;
|
||||
const int64_t kParallelDataNumMid = 2 * 1024;
|
||||
const uint32_t min_core_num = 1;
|
||||
|
||||
template <typename T>
|
||||
T mul_sum(std::vector<T> v1, std::vector<T> v2) {
|
||||
T output = 0;
|
||||
if (v1.size() != v2.size()) {
|
||||
return static_cast<T>(0);
|
||||
} else {
|
||||
for (unsigned int i = 0; i < v1.size(); i++) {
|
||||
output += v1[i] * v2[i];
|
||||
}
|
||||
return output;
|
||||
}
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
std::vector<T> construct_stride(std::vector<T> t_shape) {
|
||||
std::vector<T> t_stride(t_shape.size(), 1);
|
||||
int initial = 1;
|
||||
for (unsigned int i = t_shape.size(); i > 0; i--) {
|
||||
t_stride[i - 1] = initial;
|
||||
initial = initial * t_shape[i - 1];
|
||||
}
|
||||
return t_stride;
|
||||
}
|
||||
|
||||
int64_t diag_size(const int64_t &offset, const int64_t &dim1, const int64_t &dim2, std::vector<int64_t> x_shape) {
|
||||
int64_t dsize = 0;
|
||||
if (offset >= 0) {
|
||||
dsize = std::max<int64_t>(std::min(x_shape.at(dim1), x_shape.at(dim2) - offset), 0);
|
||||
} else {
|
||||
dsize = std::max<int64_t>(std::min(x_shape.at(dim1) + offset, x_shape.at(dim2)), 0);
|
||||
}
|
||||
return dsize;
|
||||
}
|
||||
|
||||
int64_t maybe_wrap_dim(int64_t dim, int64_t dim_post_expr) {
|
||||
if (dim < 0) {
|
||||
dim += dim_post_expr;
|
||||
}
|
||||
return dim;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
T get_data(int64_t basepos, int64_t offset, int64_t *ar, T *dptr) {
|
||||
if (offset >= 0) {
|
||||
return dptr[basepos + offset * ar[1]];
|
||||
} else {
|
||||
return dptr[basepos - offset * ar[0]];
|
||||
}
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
std::vector<T> construct_index(int num, std::vector<T> &stride) {
|
||||
std::vector<T> idx;
|
||||
int tmp_num = num;
|
||||
for (uint32_t i = 0; i < stride.size(); i++) {
|
||||
idx.push_back(tmp_num / stride[i]);
|
||||
tmp_num = tmp_num % stride[i];
|
||||
}
|
||||
return idx;
|
||||
}
|
||||
} // namespace
|
||||
|
||||
namespace aicpu {
|
||||
template <typename T>
|
||||
void DiagonalCpuKernel::set_output(int64_t *ar, T *dptr, T *y_dptr) {
|
||||
for (int i = 0; i < dsize; i++) {
|
||||
y_dptr[ar[N3] + i] = get_data(ar[N2] + i * (ar[0] + ar[1]), offset_, ar, dptr);
|
||||
}
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
uint32_t DiagonalCpuKernel::DoComputeType(CpuKernelContext &ctx) {
|
||||
// Get the input and output
|
||||
Tensor *input_x = ctx.Input(0);
|
||||
// Get some information of input
|
||||
int32_t x_NumE = input_x->NumElements();
|
||||
auto x_shape = input_x->GetTensorShape();
|
||||
std::vector<int64_t> x_shape_ = x_shape->GetDimSizes();
|
||||
const int64_t x_dim = x_shape->GetDims();
|
||||
auto dataptr = reinterpret_cast<T *>(ctx.Input(0)->GetData());
|
||||
auto y_dataptr = reinterpret_cast<T *>(ctx.Output(0)->GetData());
|
||||
// Compute
|
||||
dsize = diag_size(offset_, dim1_, dim2_, x_shape_);
|
||||
std::vector<int64_t> x_stride = construct_stride<int64_t>(x_shape_);
|
||||
if (x_dim != N2 && x_NumE > 0) {
|
||||
// set the vx_shape and vx_stride, which is x_shape_ and x_stride of
|
||||
// position dim1_ and dim2_ removed.
|
||||
std::vector<int64_t> vx_shape, vx_stride;
|
||||
for (unsigned int tmp_dim = 0; tmp_dim < x_shape_.size(); tmp_dim++) {
|
||||
if (tmp_dim != dim1_ && tmp_dim != dim2_) {
|
||||
vx_shape.push_back(x_shape_[tmp_dim]);
|
||||
vx_stride.push_back(x_stride[tmp_dim]);
|
||||
}
|
||||
}
|
||||
// set the y_shape (the output shape), y_stride(the output stride),
|
||||
// vy_stride(the y_stride without the last dim)
|
||||
std::vector<int64_t> y_shape = vx_shape;
|
||||
y_shape.push_back(dsize);
|
||||
std::vector<int64_t> y_stride = construct_stride<int64_t>(y_shape);
|
||||
std::vector<int64_t> vy_stride = y_stride;
|
||||
vy_stride.pop_back();
|
||||
// diagonal
|
||||
int32_t task_num = x_NumE / x_shape_[dim1_] / x_shape_[dim2_];
|
||||
if (task_num >= kParallelDataNum) {
|
||||
int64_t max_core_num = std::max(min_core_num, aicpu::CpuKernelUtils::GetCPUNum(ctx) - kResvCpuNum);
|
||||
if (task_num <= kParallelDataNumMid) {
|
||||
max_core_num = std::min(max_core_num, static_cast<int64_t>(N4));
|
||||
}
|
||||
max_core_num = max_core_num > task_num ? task_num : max_core_num;
|
||||
auto sharder_diagonal = [&](int64_t start, int64_t end) {
|
||||
for (int64_t j = start; j < end; j++) {
|
||||
std::vector<int64_t> v_s_stride = construct_stride<int64_t>(vx_shape);
|
||||
auto p = construct_index<int64_t>(j, v_s_stride);
|
||||
int64_t arr[N4] = {x_stride[dim1_], x_stride[dim2_], mul_sum<int64_t>(p, vx_stride),
|
||||
mul_sum<int64_t>(p, vy_stride)};
|
||||
set_output(arr, dataptr, y_dataptr);
|
||||
}
|
||||
};
|
||||
if (max_core_num != 0) {
|
||||
int64_t per_unit = task_num / max_core_num;
|
||||
KERNEL_HANDLE_ERROR(CpuKernelUtils::ParallelFor(ctx, task_num, per_unit, sharder_diagonal), "Diagonal failed.");
|
||||
}
|
||||
} else {
|
||||
for (int64_t j = 0; j < task_num; j++) {
|
||||
std::vector<int64_t> v_s_stride = construct_stride<int64_t>(vx_shape);
|
||||
auto p = construct_index<int64_t>(j, v_s_stride);
|
||||
int64_t arr[N4] = {x_stride[dim1_], x_stride[dim2_], mul_sum<int64_t>(p, vx_stride),
|
||||
mul_sum<int64_t>(p, vy_stride)};
|
||||
set_output(arr, dataptr, y_dataptr);
|
||||
}
|
||||
}
|
||||
} else if (x_dim == N2) {
|
||||
int64_t arr[N4] = {x_stride[dim1_], x_stride[dim2_], 0, 0};
|
||||
set_output(arr, dataptr, y_dataptr);
|
||||
} else {
|
||||
y_dataptr = dataptr;
|
||||
}
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
|
||||
uint32_t DiagonalCpuKernel::Compute(CpuKernelContext &ctx) {
|
||||
// Check params
|
||||
KERNEL_HANDLE_ERROR(NormalCheck(ctx, kInputNum, kOutputNum), "Diagonal check input and output number failed.");
|
||||
// Get the input
|
||||
Tensor *input_x = ctx.Input(0);
|
||||
auto input_size = input_x->GetTensorShape()->GetDims();
|
||||
// Check the input dims
|
||||
if (input_size < N2) {
|
||||
KERNEL_LOG_ERROR("[Diagonal]: the input tensor must is at least 2-dimensional.");
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
// Get the attr
|
||||
AttrValue *offset = ctx.GetAttr("offset");
|
||||
offset_ = (offset == nullptr) ? 0 : (offset->GetInt());
|
||||
AttrValue *dim1 = ctx.GetAttr("dim1");
|
||||
dim1_ = (dim1 == nullptr) ? 0 : (dim1->GetInt());
|
||||
AttrValue *dim2 = ctx.GetAttr("dim2");
|
||||
dim2_ = (dim2 == nullptr) ? 1 : (dim2->GetInt());
|
||||
int64_t min_d = -input_size;
|
||||
int64_t max_d = input_size - 1;
|
||||
// Check the attr
|
||||
if (dim1_ < min_d || dim1_ > max_d || dim2_ < min_d || dim2_ > max_d) {
|
||||
KERNEL_LOG_ERROR(
|
||||
"[Diagonal]: Dimension out of range (expected to be in range of [%d, "
|
||||
"%d]).",
|
||||
min_d, max_d);
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
// Represent the dim in uniform standard form and Check the dim
|
||||
dim1_ = maybe_wrap_dim(dim1_, input_size);
|
||||
dim2_ = maybe_wrap_dim(dim2_, input_size);
|
||||
if (dim1_ == dim2_) {
|
||||
KERNEL_LOG_ERROR("[Diagonal]:Diagonal dimensions cannot be identical.");
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
|
||||
auto data_type = input_x->GetDataType();
|
||||
switch (data_type) {
|
||||
case DT_FLOAT:
|
||||
return DoComputeType<float>(ctx);
|
||||
case DT_DOUBLE:
|
||||
return DoComputeType<double>(ctx);
|
||||
default:
|
||||
KERNEL_LOG_ERROR("[Diagonal]: Diagonal kernel data type [%s] not support.", DTypeStr(data_type).c_str());
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
}
|
||||
|
||||
REGISTER_CPU_KERNEL(kDiagonal, DiagonalCpuKernel);
|
||||
} // namespace aicpu
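
Most of the Diagonal bookkeeping above is stride arithmetic: construct_stride builds row-major strides from a shape, construct_index decomposes a flat task id into per-dimension coordinates, and mul_sum of coordinates with strides maps coordinates back to a flat offset. A self-contained sketch of that round trip (standalone C++ with illustrative helper names, not the kernel helpers themselves):

#include <cstdint>
#include <vector>

// Row-major strides: the last dimension is contiguous.
std::vector<int64_t> StridesOf(const std::vector<int64_t> &shape) {
  std::vector<int64_t> stride(shape.size(), 1);
  for (size_t i = shape.size(); i > 1; --i) {
    stride[i - 2] = stride[i - 1] * shape[i - 1];
  }
  return stride;
}

// Coordinates -> flat offset (what mul_sum(p, stride) computes).
int64_t FlatOffset(const std::vector<int64_t> &coords, const std::vector<int64_t> &stride) {
  int64_t off = 0;
  for (size_t i = 0; i < coords.size(); ++i) off += coords[i] * stride[i];
  return off;
}

// Flat offset -> coordinates (what construct_index computes).
std::vector<int64_t> CoordsOf(int64_t flat, const std::vector<int64_t> &stride) {
  std::vector<int64_t> coords;
  for (int64_t s : stride) {
    coords.push_back(flat / s);
    flat %= s;
  }
  return coords;  // CoordsOf(FlatOffset(c, s), s) == c for valid coordinates
}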
|
|
@@ -0,0 +1,43 @@
|
|||
/**
|
||||
* Copyright (c) Huawei Technologies Co., Ltd. 2021-2022. All rights reserved.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include <Eigen/Dense>
|
||||
#include <array>
|
||||
#include <iostream>
|
||||
|
||||
#include "cpu_ops_kernel.h"
|
||||
|
||||
namespace aicpu {
|
||||
class DiagonalCpuKernel final : public CpuKernel {
|
||||
public:
|
||||
DiagonalCpuKernel() = default;
|
||||
~DiagonalCpuKernel() override = default;
|
||||
|
||||
uint32_t Compute(CpuKernelContext &ctx) override;
|
||||
|
||||
template <typename T>
|
||||
uint32_t DoComputeType(CpuKernelContext &ctx);
|
||||
|
||||
template <typename T>
|
||||
void set_output(int64_t *ar, T *dptr, T *y_dptr);
|
||||
|
||||
private:
|
||||
int64_t offset_ = 0;
|
||||
int64_t dim1_ = 0;
|
||||
int64_t dim2_ = 1;
|
||||
int64_t dsize = 0;
|
||||
};
|
||||
} // namespace aicpu
|
|
@@ -0,0 +1,95 @@
|
|||
/**
|
||||
* Copyright 2021 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
#include "eig.h"
|
||||
#include <Eigen/Dense>
|
||||
#include <Eigen/Eigenvalues>
|
||||
#include <algorithm>
|
||||
#include <complex>
|
||||
#include <iostream>
|
||||
#include <map>
|
||||
#include "utils/eigen_tensor.h"
|
||||
#include "utils/kernel_util.h"
|
||||
|
||||
namespace {
|
||||
const uint32_t kInputNum = 1;
|
||||
const uint32_t kOutputNum = 2;
|
||||
const char *kEig = "Eig";
|
||||
} // namespace
|
||||
|
||||
namespace aicpu {
|
||||
uint32_t EigCpuKernel::Compute(CpuKernelContext &ctx) {
|
||||
KERNEL_HANDLE_ERROR(NormalCheck(ctx, kInputNum, kOutputNum), "Eig check input and output failed.");
|
||||
Tensor *input = ctx.Input(0);
|
||||
auto input_dtype = static_cast<DataType>(input->GetDataType());
|
||||
switch (input_dtype) {
|
||||
case DT_FLOAT:
|
||||
return ComputeKernel<float, std::complex<float>>(ctx);
|
||||
case DT_DOUBLE:
|
||||
return ComputeKernel<double, std::complex<double>>(ctx);
|
||||
case DT_COMPLEX64:
|
||||
return ComputeKernel<std::complex<float>, std::complex<float>>(ctx);
|
||||
case DT_COMPLEX128:
|
||||
return ComputeKernel<std::complex<double>, std::complex<double>>(ctx);
|
||||
default:
|
||||
KERNEL_LOG_ERROR("Eig kernel data type [%s] not support.", DTypeStr(input_dtype).c_str());
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
}
|
||||
|
||||
REGISTER_CPU_KERNEL(kEig, EigCpuKernel);
|
||||
|
||||
template <typename T, typename C>
|
||||
uint32_t EigCpuKernel::ComputeKernel(CpuKernelContext &ctx) {
|
||||
auto xptr = reinterpret_cast<T *>(ctx.Input(0)->GetData());
|
||||
auto valptr = reinterpret_cast<C *>(ctx.Output(0)->GetData());
|
||||
auto vecptr = reinterpret_cast<C *>(ctx.Output(1)->GetData());
|
||||
std::vector<int64_t> dims = ctx.Input(0)->GetTensorShape()->GetDimSizes();
|
||||
int64_t rank = ctx.Input(0)->GetTensorShape()->GetDims();
|
||||
int64_t x_dim = ctx.Input(0)->GetTensorShape()->GetDimSize(rank - 1);
|
||||
int64_t batch_size = 1;
|
||||
if (rank > 2) {
|
||||
for (int64_t i = 0; i < rank - 2; i++) {
|
||||
batch_size *= dims[i];
|
||||
}
|
||||
}
|
||||
AttrValue *compute_v = ctx.GetAttr("compute_v");
|
||||
bool compute_v_ = (compute_v == nullptr) ? false : compute_v->GetBool();
|
||||
Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor> A(x_dim, x_dim);
|
||||
for (int64_t k = 0; k < batch_size; k++) {
|
||||
for (int64_t i = 0; i < x_dim * x_dim; i++) {
|
||||
A.data()[i] = xptr[k * x_dim * x_dim + i];
|
||||
}
|
||||
if (!compute_v_) {
|
||||
Eigen::ComplexEigenSolver<Eigen::Matrix<C, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>> es(A, false);
|
||||
Eigen::Matrix<C, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor> D = es.eigenvalues();
|
||||
for (int64_t i = 0; i < x_dim; i++) {
|
||||
valptr[k * x_dim + i] = D.data()[i];
|
||||
}
|
||||
} else {
|
||||
Eigen::ComplexEigenSolver<Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>> es(A);
|
||||
Eigen::Matrix<C, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor> D = es.eigenvalues();
|
||||
Eigen::Matrix<C, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor> V = es.eigenvectors();
|
||||
for (int64_t i = 0; i < x_dim; i++) {
|
||||
valptr[k * x_dim + i] = D.data()[i];
|
||||
}
|
||||
for (int64_t i = 0; i < x_dim * x_dim; i++) {
|
||||
vecptr[k * x_dim * x_dim + i] = V.data()[i];
|
||||
}
|
||||
}
|
||||
}
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
} // namespace aicpu
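
The per-matrix step inside ComputeKernel is a direct use of Eigen's dense eigensolver: load one row-major x_dim by x_dim block, run Eigen::ComplexEigenSolver, and copy out eigenvalues, plus eigenvectors when the compute_v attribute is set. A minimal single-matrix sketch, assuming Eigen3 is available and using a hand-filled 2x2 complex matrix:

#include <Eigen/Dense>
#include <Eigen/Eigenvalues>
#include <complex>
#include <iostream>

int main() {
  using CMat = Eigen::Matrix<std::complex<float>, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>;
  CMat a(2, 2);
  a << std::complex<float>(0, 0), std::complex<float>(1, 0),
       std::complex<float>(-2, 0), std::complex<float>(-3, 0);

  bool compute_v = true;  // mirrors the "compute_v" attribute
  Eigen::ComplexEigenSolver<CMat> es(a, compute_v);
  std::cout << "eigenvalues:\n" << es.eigenvalues() << "\n";
  if (compute_v) {
    std::cout << "eigenvectors:\n" << es.eigenvectors() << "\n";
  }
  return 0;
}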
|
|
@@ -0,0 +1,38 @@
|
|||
/**
|
||||
* Copyright 2021 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#ifndef AICPU_KERNELS_NORMALIZED_EIG_H_
|
||||
#define AICPU_KERNELS_NORMALIZED_EIG_H_
|
||||
|
||||
#include "cpu_ops_kernel.h"
|
||||
#include "cpu_types.h"
|
||||
#include "utils/bcast.h"
|
||||
|
||||
namespace aicpu {
|
||||
class EigCpuKernel : public CpuKernel {
|
||||
public:
|
||||
EigCpuKernel() = default;
|
||||
~EigCpuKernel() override = default;
|
||||
|
||||
protected:
|
||||
uint32_t Compute(CpuKernelContext &ctx) override;
|
||||
|
||||
private:
|
||||
template <typename T, typename C>
|
||||
uint32_t ComputeKernel(CpuKernelContext &ctx);
|
||||
};
|
||||
} // namespace aicpu
|
||||
#endif
|
|
@@ -0,0 +1,114 @@
|
|||
/**
|
||||
* Copyright 2021 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
#include "eye.h"
|
||||
|
||||
#include <string.h>
|
||||
#include "Eigen/Dense"
|
||||
#include "cpu_kernel_utils.h"
|
||||
#include "utils/eigen_tensor.h"
|
||||
#include "utils/kernel_util.h"
|
||||
|
||||
namespace {
|
||||
const char *kEye = "Eye";
|
||||
|
||||
#define EYE_COMPUTE_CASE(DTYPE, TYPE, CTX) \
|
||||
case (DTYPE): { \
|
||||
uint32_t result = EyePartCompute<TYPE>(CTX); \
|
||||
if (result != KERNEL_STATUS_OK) { \
|
||||
KERNEL_LOG_ERROR("Eye kernel compute failed."); \
|
||||
return result; \
|
||||
} \
|
||||
break; \
|
||||
}
|
||||
} // namespace
|
||||
|
||||
namespace aicpu {
|
||||
uint32_t EyeCpuKernel::Compute(CpuKernelContext &ctx) {
|
||||
Tensor *output = ctx.Output(0);
|
||||
KERNEL_CHECK_NULLPTR(output, KERNEL_STATUS_PARAM_INVALID, "Get output failed")
|
||||
auto data_type = ctx.Output(0)->GetDataType();
|
||||
switch (data_type) {
|
||||
EYE_COMPUTE_CASE(DT_FLOAT, float, ctx)
|
||||
EYE_COMPUTE_CASE(DT_DOUBLE, double, ctx)
|
||||
EYE_COMPUTE_CASE(DT_FLOAT16, Eigen::half, ctx)
|
||||
EYE_COMPUTE_CASE(DT_INT8, int8_t, ctx)
|
||||
EYE_COMPUTE_CASE(DT_INT16, int16_t, ctx)
|
||||
EYE_COMPUTE_CASE(DT_INT32, int32_t, ctx)
|
||||
EYE_COMPUTE_CASE(DT_INT64, int64_t, ctx)
|
||||
EYE_COMPUTE_CASE(DT_UINT8, uint8_t, ctx)
|
||||
EYE_COMPUTE_CASE(DT_UINT16, uint16_t, ctx)
|
||||
EYE_COMPUTE_CASE(DT_UINT32, uint32_t, ctx)
|
||||
EYE_COMPUTE_CASE(DT_UINT64, uint64_t, ctx)
|
||||
EYE_COMPUTE_CASE(DT_COMPLEX64, std::complex<std::float_t>, ctx)
|
||||
EYE_COMPUTE_CASE(DT_COMPLEX128, std::complex<std::double_t>, ctx)
|
||||
default:
|
||||
KERNEL_LOG_ERROR("Eye kernel data type [%s] not support.", DTypeStr(data_type).c_str());
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
uint32_t EyeCpuKernel::EyePartCompute(CpuKernelContext &ctx) {
|
||||
int64_t num_rows_value1 = 0;
|
||||
int64_t num_columns_value = -1;
|
||||
int64_t dim_value = 1;
|
||||
int32_t out_size_size = 0;
|
||||
AttrValue *num_rows = ctx.GetAttr("num_rows");
|
||||
KERNEL_CHECK_NULLPTR(num_rows, KERNEL_STATUS_PARAM_INVALID, "get num_rows failed.");
|
||||
num_rows_value1 = num_rows->GetInt();
|
||||
int64_t min_value = num_rows_value1;
|
||||
int64_t max_value = -1;
|
||||
int64_t num_col = num_rows_value1;
|
||||
AttrValue *num_columns = ctx.GetAttr("num_columns");
|
||||
if (num_columns) {
|
||||
num_columns_value = num_columns->GetInt();
|
||||
min_value = num_columns_value < num_rows_value1 ? num_columns_value : num_rows_value1;
|
||||
max_value = num_columns_value > num_rows_value1 ? num_columns_value : num_rows_value1;
|
||||
num_col = num_columns_value;
|
||||
}
|
||||
if (max_value == -1) {
|
||||
max_value = num_rows_value1;
|
||||
}
|
||||
AttrValue *batch_shape = ctx.GetAttr("batch_shape");
|
||||
if (batch_shape) {
|
||||
std::vector<int64_t> output_size = ctx.GetAttr("batch_shape")->GetListInt();
|
||||
out_size_size = output_size.size();
|
||||
int64_t batch_shape_value = 1;
|
||||
for (int32_t t = 0; t < out_size_size; t++) {
|
||||
batch_shape_value = output_size[t];
|
||||
dim_value = dim_value * batch_shape_value;
|
||||
}
|
||||
}
|
||||
KERNEL_CHECK_NULLPTR(ctx.Output(0)->GetData(), KERNEL_STATUS_PARAM_INVALID, "Get output data failed.")
|
||||
auto output_y = reinterpret_cast<T *>(ctx.Output(0)->GetData());
|
||||
int64_t data_num = ctx.Output(0)->NumElements();
|
||||
int64_t data_size = data_num * sizeof(T);
|
||||
Tensor *y = ctx.Output(0);
|
||||
auto y_addr = y->GetData();
|
||||
memset(y_addr, 0, data_size);
|
||||
T num = static_cast<T>(1);
|
||||
int64_t block_size = min_value * max_value;
|
||||
for (int64_t dim = 0; dim < dim_value; dim++) {
|
||||
for (int64_t i = 0; i < min_value; i++) {
|
||||
*(output_y + (dim * block_size) + (num_col + 1) * i) = num;
|
||||
}
|
||||
}
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
|
||||
REGISTER_CPU_KERNEL(kEye, EyeCpuKernel);
|
||||
} // namespace aicpu
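
EyePartCompute above reduces to a simple fill pattern: zero the whole output, then for each batch block of num_rows * num_columns elements write a one every (num_columns + 1) positions, min(num_rows, num_columns) times. A standalone sketch of that pattern over a plain buffer (illustrative helper, not the AICPU Tensor API):

#include <cstdint>
#include <vector>

std::vector<float> EyeSketch(int64_t batch, int64_t rows, int64_t cols) {
  std::vector<float> out(batch * rows * cols, 0.0f);   // matches the memset-to-zero step
  const int64_t diag_len = rows < cols ? rows : cols;  // min_value in the kernel
  const int64_t block = rows * cols;                   // block_size in the kernel
  for (int64_t b = 0; b < batch; ++b) {
    for (int64_t i = 0; i < diag_len; ++i) {
      out[b * block + (cols + 1) * i] = 1.0f;          // walk the main diagonal of each block
    }
  }
  return out;
}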
|
|
@@ -0,0 +1,35 @@
|
|||
/**
|
||||
* Copyright 2021 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
#ifndef AICPU_KERNELS_NORMALIZED_EYE_H_
|
||||
#define AICPU_KERNELS_NORMALIZED_EYE_H_
|
||||
|
||||
#include "cpu_ops_kernel.h"
|
||||
|
||||
namespace aicpu {
|
||||
class EyeCpuKernel : public CpuKernel {
|
||||
public:
|
||||
EyeCpuKernel() = default;
|
||||
~EyeCpuKernel() override = default;
|
||||
|
||||
protected:
|
||||
uint32_t Compute(CpuKernelContext &ctx) override;
|
||||
|
||||
private:
|
||||
template <typename T>
|
||||
uint32_t EyePartCompute(CpuKernelContext &ctx);
|
||||
};
|
||||
} // namespace aicpu
|
||||
#endif
|
|
@@ -0,0 +1,294 @@
|
|||
/**
|
||||
* Copyright 2021 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include "fractional_avg_pool.h"
|
||||
|
||||
#include "Eigen/Dense"
|
||||
#include "cpu_kernel_utils.h"
|
||||
#include "utils/eigen_tensor.h"
|
||||
#include "utils/kernel_util.h"
|
||||
|
||||
namespace {
|
||||
const char *kFractionalAvgPool = "FractionalAvgPool";
|
||||
const uint32_t k_InputNum = 1;
|
||||
const uint32_t k_OutputNum = 3;
|
||||
const int64_t kParallelDataNum = 1024 * 1024;
|
||||
constexpr uint32_t tensor_in_and_out_dims = 4;
|
||||
} // namespace
|
||||
|
||||
namespace aicpu {
|
||||
uint32_t FractionalAvgPoolCpuKernel::FractionalAvgPoolParamCheck(CpuKernelContext &ctx) {
|
||||
KERNEL_HANDLE_ERROR(NormalCheck(ctx, k_InputNum, k_OutputNum),
|
||||
"FractionalAvgPool Check input and output number failed.");
|
||||
Tensor *input = ctx.Input(0);
|
||||
if (ctx.Input(0)->GetDataType() != ctx.Output(0)->GetDataType()) {
|
||||
KERNEL_LOG_ERROR("The data type of the output [%s] need be the same as the input [%s]",
|
||||
DTypeStr(ctx.Output(0)->GetDataType()).c_str(), DTypeStr(ctx.Input(0)->GetDataType()).c_str());
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
auto input_shape = input->GetTensorShape();
|
||||
int32_t input_dims = input_shape->GetDims();
|
||||
for (int32_t i = 0; i < input_dims; i++) {
|
||||
KERNEL_CHECK_FALSE((input_shape->GetDimSize(i) > 0), KERNEL_STATUS_PARAM_INVALID,
|
||||
"FractionalAvgPool: expected input to have non-empty spatial "
|
||||
"dimensions, "
|
||||
"but input has sizes [%d] with dimension [%d] being empty.",
|
||||
input_dims, i);
|
||||
}
|
||||
KERNEL_CHECK_FALSE((input_dims == tensor_in_and_out_dims), KERNEL_STATUS_PARAM_INVALID,
|
||||
"tensor_in must be 4-dimensional.");
|
||||
AttrValue *pooling_ratio = ctx.GetAttr("pooling_ratio");
|
||||
KERNEL_CHECK_NULLPTR(pooling_ratio, KERNEL_STATUS_PARAM_INVALID, "[%s] get attr:pooling_ratio failed.",
|
||||
kFractionalAvgPool);
|
||||
int32_t pooling_ratio_size = pooling_ratio->ListFloatSize();
|
||||
KERNEL_CHECK_FALSE((pooling_ratio_size == tensor_in_and_out_dims), KERNEL_STATUS_PARAM_INVALID,
|
||||
"pooling_ratio field must specify 4 dimensions.");
|
||||
std::vector<float> pooling_ratio_data = ctx.GetAttr("pooling_ratio")->GetListFloat();
|
||||
KERNEL_CHECK_FALSE((pooling_ratio_data[0] == 1.0 && pooling_ratio_data[3] == 1.0), KERNEL_STATUS_PARAM_INVALID,
|
||||
"FractionalAvgPool is not yet supported on the batch nor channel "
|
||||
"dimension.The first and last elements of pooling ratio must be 1.0.");
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
|
||||
static std::vector<int64_t> GeneratePoolingSequencePseudoRandom(int input_length, int output_length, int seed) {
|
||||
// generate a random offset u in [0, max_u)
|
||||
std::vector<int64_t> cum_seq(output_length + 1, 0);
|
||||
std::vector<int64_t> diff(output_length, 0);
|
||||
double alpha = static_cast<double>(input_length) / output_length;
|
||||
int k = input_length / output_length;
|
||||
double u_max1 = (k + 2) / alpha - 1;
|
||||
double u_max2 = (input_length + 1 - k) / alpha - (output_length - 1);
|
||||
double max_u = std::min(u_max1, u_max2);
|
||||
std::default_random_engine random(seed);
|
||||
std::uniform_real_distribution<double> dis2(0.0, 1.0);
|
||||
const double u = dis2(random) * max_u;
|
||||
cum_seq[0] = 1;
|
||||
cum_seq[output_length] = input_length + 1;
|
||||
for (int i = 1; i < output_length; ++i) {
|
||||
cum_seq[i] = static_cast<int>(ceil(alpha * (i + u)));
|
||||
}
|
||||
for (int i = 0; i < output_length; ++i) {
|
||||
diff[i] = cum_seq[i + 1] - cum_seq[i];
|
||||
}
|
||||
return diff;
|
||||
}
|
||||
|
||||
static std::vector<int64_t> GeneratePoolingSequenceRandom(int input_length, int output_length, int seed) {
|
||||
int k = input_length / output_length;
|
||||
int num_random_spot = input_length % output_length;
|
||||
std::vector<int64_t> diff(output_length, k);
|
||||
for (int i = 0; i < num_random_spot; ++i) {
|
||||
diff[i] += 1;
|
||||
}
|
||||
std::srand(seed);
|
||||
random_shuffle(diff.begin(), diff.end());
|
||||
return diff;
|
||||
}
|
||||
|
||||
std::vector<int64_t> GeneratePoolingSequence(int input_length, int output_length, bool pseudo_random, int seed) {
|
||||
std::vector<int64_t> diff;
|
||||
if (input_length % output_length == 0) {
|
||||
diff = std::vector<int64_t>(output_length, input_length / output_length);
|
||||
} else if (pseudo_random) {
|
||||
diff = GeneratePoolingSequencePseudoRandom(input_length, output_length, seed);
|
||||
} else {
|
||||
diff = GeneratePoolingSequenceRandom(input_length, output_length, seed);
|
||||
}
|
||||
int k = input_length / output_length;
|
||||
for (int i = 0; i < output_length; i++) {
|
||||
if (diff[i] < k || diff[i] > k + 1) {
|
||||
KERNEL_LOG_ERROR("FractionalAvgPool kernel GeneratePoolingSequence diff[%d] is error");
|
||||
}
|
||||
}
|
||||
std::vector<int64_t> cum_seq(output_length + 1, 0);
|
||||
for (size_t i = 1; i < cum_seq.size(); ++i) {
|
||||
cum_seq[i] = cum_seq[i - 1] + diff[i - 1];
|
||||
}
|
||||
return cum_seq;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
uint32_t FractionalAvgPoolCpuKernel::DoCompute(CpuKernelContext &ctx) {
|
||||
Tensor *input = ctx.Input(0);
|
||||
Tensor *output = ctx.Output(0);
|
||||
Tensor *row_pooling_sequence = ctx.Output(1);
|
||||
Tensor *col_pooling_sequence = ctx.Output(2);
|
||||
std::vector<float> pooling_ratio = ctx.GetAttr("pooling_ratio")->GetListFloat();
|
||||
AttrValue *pseudo_random_ = ctx.GetAttr("pseudo_random");
|
||||
bool pseudo_random = (pseudo_random_ == nullptr) ? false : (pseudo_random_->GetBool());
|
||||
AttrValue *overlapping_ = ctx.GetAttr("overlapping");
|
||||
bool overlapping = (overlapping_ == nullptr) ? false : (overlapping_->GetBool());
|
||||
AttrValue *deterministic_ = ctx.GetAttr("deterministic");
|
||||
bool deterministic = (deterministic_ == nullptr) ? false : (deterministic_->GetBool());
|
||||
AttrValue *seed_ = ctx.GetAttr("seed");
|
||||
int seed = (seed_ == nullptr) ? 0 : (seed_->GetInt());
|
||||
AttrValue *seed2_ = ctx.GetAttr("seed2");
|
||||
int seed2 = (seed2_ == nullptr) ? 0 : (seed2_->GetInt());
|
||||
auto input_shape = input->GetTensorShape();
|
||||
std::vector<int> input_size(tensor_in_and_out_dims);
|
||||
std::vector<int> output_size(tensor_in_and_out_dims);
|
||||
for (uint32_t i = 0; i < tensor_in_and_out_dims; ++i) {
|
||||
input_size[i] = input_shape->GetDimSize(i);
|
||||
}
|
||||
for (uint32_t i = 0; i < tensor_in_and_out_dims; ++i) {
|
||||
output_size[i] = static_cast<int>(std::floor(input_size[i] / pooling_ratio[i]));
|
||||
KERNEL_CHECK_FALSE((output_size[i] > 0), KERNEL_STATUS_PARAM_INVALID,
|
||||
"FractionalAvgPool kernel outputsize[%d] cannot be 0");
|
||||
}
|
||||
auto input_data = static_cast<T *>(ctx.Input(0)->GetData());
|
||||
auto output_data = static_cast<T *>(output->GetData());
|
||||
auto output_height_seq_tensor = static_cast<int64_t *>(row_pooling_sequence->GetData());
|
||||
auto output_width_seq_tensor = static_cast<int64_t *>(col_pooling_sequence->GetData());
|
||||
std::random_device rd;
|
||||
std::mt19937 generator(rd());
|
||||
if (deterministic) {
|
||||
// If both seeds are not set when deterministic is true, force set seeds.
|
||||
if ((seed == 0) && (seed2 == 0)) {
|
||||
seed = generator();
|
||||
seed2 = generator();
|
||||
}
|
||||
} else {
|
||||
KERNEL_CHECK_FALSE(((seed == 0) && (seed2 == 0)), KERNEL_STATUS_PARAM_INVALID,
|
||||
"Both seed and seed2 should be 0 if deterministic is false.");
|
||||
}
|
||||
if (seed == 0 && seed2 != 0) {
|
||||
seed = seed2;
|
||||
}
|
||||
// Generate pooling sequence.
|
||||
std::vector<int64_t> height_cum_seq;
|
||||
std::vector<int64_t> width_cum_seq;
|
||||
height_cum_seq = GeneratePoolingSequence(input_size[1], output_size[1], pseudo_random, seed);
|
||||
width_cum_seq = GeneratePoolingSequence(input_size[2], output_size[2], pseudo_random, seed);
|
||||
for (uint32_t i = 0; i < height_cum_seq.size(); ++i) {
|
||||
*(output_height_seq_tensor + i) = height_cum_seq[i];
|
||||
}
|
||||
for (uint32_t i = 0; i < width_cum_seq.size(); ++i) {
|
||||
*(output_width_seq_tensor + i) = width_cum_seq[i];
|
||||
}
|
||||
const int64_t height_max = input_size[1] - 1;
|
||||
const int64_t width_max = input_size[2] - 1;
|
||||
const int64_t depth_max = input_size[3] - 1;
|
||||
uint64_t data_num = input->NumElements();
|
||||
/**
|
||||
* For both input and output,
|
||||
* 0: batch
|
||||
* 1: height / row
|
||||
* 2: width / col
|
||||
* 3: depth / channel
|
||||
*/
|
||||
if (data_num < kParallelDataNum) {
|
||||
for (int64_t b = 0; b < input_size[0]; ++b) {
|
||||
// height sequence.
|
||||
for (uint32_t hs = 0; hs < height_cum_seq.size() - 1; ++hs) {
|
||||
// height start and end.
|
||||
const int64_t height_start = height_cum_seq[hs];
|
||||
int64_t height_end = overlapping ? height_cum_seq[hs + 1] : height_cum_seq[hs + 1] - 1;
|
||||
height_end = std::min(height_end, height_max);
|
||||
// width sequence.
|
||||
for (uint32_t ws = 0; ws < width_cum_seq.size() - 1; ++ws) {
|
||||
for (int64_t c = 0; c <= depth_max; ++c) {
|
||||
const int64_t out_offset = ((b * output_size[1] + hs) * output_size[2] + ws) * output_size[3] + c;
|
||||
// Initializes the output tensor with 0.
|
||||
T sum = static_cast<T>(0);
|
||||
T avg = static_cast<T>(0);
|
||||
int count = 0;
|
||||
// width start and end.
|
||||
const int64_t width_start = width_cum_seq[ws];
|
||||
int64_t width_end = overlapping ? width_cum_seq[ws + 1] : width_cum_seq[ws + 1] - 1;
|
||||
width_end = std::min(width_end, width_max);
|
||||
for (int64_t h = height_start; h <= height_end; ++h) {
|
||||
for (int64_t w = width_start; w <= width_end; ++w) {
|
||||
const int64_t in_offset = ((b * input_size[1] + h) * input_size[2] + w) * output_size[3] + c;
|
||||
sum += input_data[in_offset];
|
||||
count++;
|
||||
}
|
||||
}
|
||||
avg = sum / static_cast<T>(count);
|
||||
*(output_data + out_offset) = avg;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
} else {
|
||||
uint64_t height_cum_len = height_cum_seq.size() - 1;
|
||||
uint32_t min_core_num = 1;
|
||||
uint32_t max_core_num = std::max(min_core_num, aicpu::CpuKernelUtils::GetCPUNum(ctx));
|
||||
if (max_core_num > height_cum_len) {
|
||||
max_core_num = height_cum_len;
|
||||
}
|
||||
for (int64_t b = 0; b < input_size[0]; ++b) {
|
||||
// height sequence.
|
||||
auto sharder_fractionalavgpool_index = [&](size_t start, size_t end) {
|
||||
for (uint32_t hs = start; hs < end; ++hs) {
|
||||
// height start and end.
|
||||
const int64_t height_start = height_cum_seq[hs];
|
||||
int64_t height_end = overlapping ? height_cum_seq[hs + 1] : height_cum_seq[hs + 1] - 1;
|
||||
height_end = std::min(height_end, height_max);
|
||||
// width sequence.
|
||||
for (uint32_t ws = 0; ws < width_cum_seq.size() - 1; ++ws) {
|
||||
for (int64_t c = 0; c <= depth_max; ++c) {
|
||||
const int64_t out_offset = ((b * output_size[1] + hs) * output_size[2] + ws) * output_size[3] + c;
|
||||
// Initializes the output tensor with 0.
|
||||
T sum = static_cast<T>(0);
|
||||
T avg = static_cast<T>(0);
|
||||
int count = 0;
|
||||
// width start and end.
|
||||
const int64_t width_start = width_cum_seq[ws];
|
||||
int64_t width_end = overlapping ? width_cum_seq[ws + 1] : width_cum_seq[ws + 1] - 1;
|
||||
width_end = std::min(width_end, width_max);
|
||||
for (int64_t h = height_start; h <= height_end; ++h) {
|
||||
for (int64_t w = width_start; w <= width_end; ++w) {
|
||||
const int64_t in_offset = ((b * input_size[1] + h) * input_size[2] + w) * output_size[3] + c;
|
||||
sum += input_data[in_offset];
|
||||
count++;
|
||||
}
|
||||
}
|
||||
avg = sum / static_cast<T>(count);
|
||||
*(output_data + out_offset) = avg;
|
||||
}
|
||||
}
|
||||
}
|
||||
};
|
||||
KERNEL_HANDLE_ERROR(CpuKernelUtils::ParallelFor(ctx, height_cum_len, height_cum_len / max_core_num,
|
||||
sharder_fractionalavgpool_index),
|
||||
"FractionalAvgPool Index Compute failed");
|
||||
}
|
||||
}
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
|
||||
uint32_t FractionalAvgPoolCpuKernel::Compute(CpuKernelContext &ctx) {
|
||||
KERNEL_HANDLE_ERROR(FractionalAvgPoolParamCheck(ctx), "Check FractionalAvgPool params failed.");
|
||||
Tensor *input = ctx.Input(0);
|
||||
auto data_type = input->GetDataType();
|
||||
switch (data_type) {
|
||||
case DT_FLOAT:
|
||||
return DoCompute<float>(ctx);
|
||||
case DT_DOUBLE:
|
||||
return DoCompute<double>(ctx);
|
||||
case DT_INT32:
|
||||
return DoCompute<int32_t>(ctx);
|
||||
case DT_INT64:
|
||||
return DoCompute<int64_t>(ctx);
|
||||
default:
|
||||
KERNEL_LOG_ERROR("FractionalAvgPool kernel data type [%s] not support.", DTypeStr(data_type).c_str());
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
REGISTER_CPU_KERNEL(kFractionalAvgPool, FractionalAvgPoolCpuKernel);
|
||||
} // namespace aicpu
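
GeneratePoolingSequence above returns cumulative boundaries (output_length + 1 entries): output cell i covers input rows [cum_seq[i], cum_seq[i+1]), extended by one row when the overlapping attribute is set, and the kernel averages over that window. A 1-D sketch of how those boundaries are consumed, using a fixed diff vector instead of the random generators (standalone C++, illustrative names):

#include <cstdint>
#include <vector>

// Turn per-cell widths into cumulative boundaries, as the tail of GeneratePoolingSequence does.
std::vector<int64_t> CumSeqFromDiff(const std::vector<int64_t> &diff) {
  std::vector<int64_t> cum_seq(diff.size() + 1, 0);
  for (size_t i = 1; i < cum_seq.size(); ++i) cum_seq[i] = cum_seq[i - 1] + diff[i - 1];
  return cum_seq;
}

// Average each pooling cell of a 1-D row, mirroring the height/width loops above.
std::vector<double> PoolCells1D(const std::vector<double> &in, const std::vector<int64_t> &cum_seq,
                                bool overlapping) {
  std::vector<double> out(cum_seq.size() - 1, 0.0);
  const int64_t in_max = static_cast<int64_t>(in.size()) - 1;
  for (size_t i = 0; i + 1 < cum_seq.size(); ++i) {
    const int64_t start = cum_seq[i];
    int64_t end = overlapping ? cum_seq[i + 1] : cum_seq[i + 1] - 1;
    end = end < in_max ? end : in_max;
    double sum = 0.0;
    int64_t count = 0;
    for (int64_t r = start; r <= end; ++r) {
      sum += in[r];
      ++count;
    }
    out[i] = count > 0 ? sum / static_cast<double>(count) : 0.0;
  }
  return out;
}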
|
|
@@ -0,0 +1,20 @@
|
|||
#ifndef AICPU_KERNELS_NORMALIZED_FRACTIONAL_AVG_POOL_H_
|
||||
#define AICPU_KERNELS_NORMALIZED_FRACTIONAL_AVG_POOL_H_
|
||||
|
||||
#include "cpu_ops_kernel.h"
|
||||
#include "cpu_types.h"
|
||||
|
||||
namespace aicpu {
|
||||
class FractionalAvgPoolCpuKernel : public CpuKernel {
|
||||
public:
|
||||
FractionalAvgPoolCpuKernel() = default;
|
||||
~FractionalAvgPoolCpuKernel() = default;
|
||||
uint32_t Compute(CpuKernelContext &ctx) override;
|
||||
|
||||
private:
|
||||
template <typename T>
|
||||
uint32_t DoCompute(CpuKernelContext &ctx);
|
||||
uint32_t FractionalAvgPoolParamCheck(CpuKernelContext &ctx);
|
||||
};
|
||||
} // namespace aicpu
|
||||
#endif // AICPU_KERNELS_NORMALIZED_FRACTIONAL_AVG_POOL_H_
|
|
@@ -0,0 +1,208 @@
|
|||
/**
|
||||
* Copyright 2021 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include "fractional_avg_pool_grad.h"
|
||||
|
||||
#include <iostream>
|
||||
#include "Eigen/Dense"
|
||||
#include "cpu_kernel_utils.h"
|
||||
#include "utils/eigen_tensor.h"
|
||||
#include "utils/kernel_util.h"
|
||||
|
||||
namespace {
|
||||
const char *kFractionalAvgPoolGrad = "FractionalAvgPoolGrad";
|
||||
const uint32_t k_InputNum = 4;
|
||||
const uint32_t k_OutputNum = 1;
|
||||
const int64_t kParallelDataNum = 32 * 1024;
|
||||
} // namespace
|
||||
|
||||
namespace aicpu {
|
||||
uint32_t FractionalAvgPoolGradCpuKernel::FractionalAvgPoolGradParamCheck(CpuKernelContext &ctx) {
|
||||
KERNEL_HANDLE_ERROR(NormalCheck(ctx, k_InputNum, k_OutputNum),
|
||||
"FractionalAvgPoolGrad check input and output number failed.");
|
||||
Tensor *orig_input_tensor_shape = ctx.Input(0);
|
||||
Tensor *out_backprop = ctx.Input(1);
|
||||
Tensor *output = ctx.Output(0);
|
||||
auto orig_input_shape = orig_input_tensor_shape->GetTensorShape();
|
||||
int32_t orig_input_dims = orig_input_shape->GetDims();
|
||||
int32_t orig_input_shape_nums = orig_input_tensor_shape->NumElements();
|
||||
if (out_backprop->GetDataType() != output->GetDataType()) {
|
||||
KERNEL_LOG_ERROR(
|
||||
"The data type of the output [%s] need be the same as the out_backprop "
|
||||
"[%s]",
|
||||
DTypeStr(output->GetDataType()).c_str(), DTypeStr(out_backprop->GetDataType()).c_str());
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
KERNEL_CHECK_FALSE((orig_input_dims == 1 && orig_input_shape_nums == 4), KERNEL_STATUS_PARAM_INVALID,
|
||||
"original input tensor shape must be 1-dimensional and 4 elements.");
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
uint32_t FractionalAvgPoolGradCpuKernel::DoCompute(CpuKernelContext &ctx) {
|
||||
typedef Eigen::Map<const Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic>> ConstEigenMatrixMap;
|
||||
typedef Eigen::Map<Eigen::Matrix<double, Eigen::Dynamic, Eigen::Dynamic>> EigenDoubleMatrixMap;
|
||||
const Tensor *orig_input_tensor_shape = ctx.Input(0);
|
||||
const Tensor *out_backprop = ctx.Input(1);
|
||||
const Tensor *row_pooling_sequence = ctx.Input(2);
|
||||
const Tensor *col_pooling_sequence = ctx.Input(3);
|
||||
Tensor *output = ctx.Output(0);
|
||||
auto output_data = static_cast<T *>(output->GetData());
|
||||
AttrValue *overlapping_ = ctx.GetAttr("overlapping");
|
||||
bool overlapping = (overlapping_ == nullptr) ? false : (overlapping_->GetBool());
|
||||
int32_t row_seq_nums = row_pooling_sequence->NumElements();
|
||||
int32_t col_seq_nums = col_pooling_sequence->NumElements();
|
||||
auto out_backprop_shape = out_backprop->GetTensorShape();
|
||||
const int64_t out_batch = out_backprop_shape->GetDimSize(0);
|
||||
const int64_t out_rows = out_backprop_shape->GetDimSize(1);
|
||||
const int64_t out_cols = out_backprop_shape->GetDimSize(2);
|
||||
const int64_t out_depth = out_backprop_shape->GetDimSize(3);
|
||||
KERNEL_CHECK_FALSE((row_seq_nums > out_rows), KERNEL_STATUS_PARAM_INVALID,
|
||||
"Given out_backprop shape [%ld,%ld,%ld,%ld], row_seq_tensor must"
|
||||
" have at least [%ld] elements, but got[%ld].",
|
||||
out_batch, out_rows, out_cols, out_depth, out_rows + 1, row_seq_nums);
|
||||
KERNEL_CHECK_FALSE((col_seq_nums > out_cols), KERNEL_STATUS_PARAM_INVALID,
|
||||
"Given out_backprop shape [%ld,%ld,%ld,%ld], col_seq_tensor must"
|
||||
" have at least [%ld] elements, but got[%ld].",
|
||||
out_batch, out_rows, out_cols, out_depth, out_cols + 1, col_seq_nums);
|
||||
auto row_seq_data = static_cast<int64_t *>(row_pooling_sequence->GetData());
|
||||
auto col_seq_data = static_cast<int64_t *>(col_pooling_sequence->GetData());
|
||||
auto orig_input_tensor_shape_data = static_cast<int64_t *>(orig_input_tensor_shape->GetData());
|
||||
const int64_t in_batch = *(orig_input_tensor_shape_data);
|
||||
const int64_t in_rows = *(orig_input_tensor_shape_data + 1);
|
||||
const int64_t in_cols = *(orig_input_tensor_shape_data + 2);
|
||||
const int64_t in_depth = *(orig_input_tensor_shape_data + 3);
|
||||
int32_t input_nums = orig_input_tensor_shape->NumElements();
|
||||
std::vector<int64_t> out_put_dims;
|
||||
for (int i = 0; i < input_nums; i++) {
|
||||
KERNEL_CHECK_FALSE((*(orig_input_tensor_shape_data + i) > 0), KERNEL_STATUS_PARAM_INVALID,
|
||||
"Each dimension of input must be > 0.");
|
||||
out_put_dims.push_back(orig_input_tensor_shape_data[i]);
|
||||
}
|
||||
int64_t output_nums = in_batch * in_rows * in_cols * in_depth;
|
||||
// Create intermediate in_backprop.
|
||||
std::vector<double> in_backprop_tensor_temp(output_nums);
|
||||
for (int64_t i = 0; i < output_nums; i++) {
|
||||
in_backprop_tensor_temp[i] = 0;
|
||||
*(output_data + i) = 0;
|
||||
}
|
||||
EigenDoubleMatrixMap in_backprop_tensor_temp_mat(in_backprop_tensor_temp.data(), in_depth,
|
||||
in_cols * in_rows * in_batch);
|
||||
ConstEigenMatrixMap out_backprop_mat(reinterpret_cast<T *>(out_backprop->GetData()), out_depth,
|
||||
out_cols * out_rows * out_batch);
|
||||
// Loop through each element of out_backprop and evenly distribute the
|
||||
// element to the corresponding pooling cell.
|
||||
const int64_t in_max_row_index = in_rows - 1;
|
||||
const int64_t in_max_col_index = in_cols - 1;
|
||||
if (output_nums < kParallelDataNum) {
|
||||
for (int64_t b = 0; b < out_batch; ++b) {
|
||||
for (int64_t r = 0; r < out_rows; ++r) {
|
||||
const int64_t in_row_start = *(row_seq_data + r);
|
||||
int64_t in_row_end = overlapping ? *(row_seq_data + r + 1) : *(row_seq_data + r + 1) - 1;
|
||||
in_row_end = std::min(in_row_end, in_max_row_index);
|
||||
for (int64_t c = 0; c < out_cols; ++c) {
|
||||
const int64_t in_col_start = *(col_seq_data + c);
|
||||
int64_t in_col_end = overlapping ? *(col_seq_data + c + 1) : *(col_seq_data + c + 1) - 1;
|
||||
in_col_end = std::min(in_col_end, in_max_col_index);
|
||||
const int64_t num_elements_in_pooling_cell =
|
||||
(in_row_end - in_row_start + 1) * (in_col_end - in_col_start + 1);
|
||||
const int64_t out_index = (b * out_rows + r) * out_cols + c;
|
||||
// Now we can evenly distribute out_backprop(b, h, w, *) to
|
||||
// in_backprop(b, hs:he, ws:we, *).
|
||||
for (int64_t in_r = in_row_start; in_r <= in_row_end; ++in_r) {
|
||||
for (int64_t in_c = in_col_start; in_c <= in_col_end; ++in_c) {
|
||||
const int64_t in_index = (b * in_rows + in_r) * in_cols + in_c;
|
||||
// Walk through each channel (depth).
|
||||
for (int64_t d = 0; d < out_depth; ++d) {
|
||||
const double out_backprop_element = static_cast<double>(out_backprop_mat.coeffRef(d, out_index));
|
||||
double &in_backprop_ref = in_backprop_tensor_temp_mat.coeffRef(d, in_index);
|
||||
in_backprop_ref += out_backprop_element / num_elements_in_pooling_cell;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
} else {
|
||||
uint64_t row_len = out_rows;
|
||||
uint32_t min_core_num = 1;
|
||||
uint32_t max_core_num = std::max(min_core_num, aicpu::CpuKernelUtils::GetCPUNum(ctx));
|
||||
if (max_core_num > row_len) {
|
||||
max_core_num = row_len;
|
||||
}
|
||||
for (int64_t b = 0; b < out_batch; ++b) {
|
||||
auto sharder_fractionalavgpoolgrad_index = [&](size_t start, size_t end) {
|
||||
for (size_t r = start; r < end; ++r) {
|
||||
const int64_t in_row_start = *(row_seq_data + r);
|
||||
int64_t in_row_end = overlapping ? *(row_seq_data + r + 1) : *(row_seq_data + r + 1) - 1;
|
||||
in_row_end = std::min(in_row_end, in_max_row_index);
|
||||
for (int64_t c = 0; c < out_cols; ++c) {
|
||||
const int64_t in_col_start = *(col_seq_data + c);
|
||||
int64_t in_col_end = overlapping ? *(col_seq_data + c + 1) : *(col_seq_data + c + 1) - 1;
|
||||
in_col_end = std::min(in_col_end, in_max_col_index);
|
||||
const int64_t num_elements_in_pooling_cell =
|
||||
(in_row_end - in_row_start + 1) * (in_col_end - in_col_start + 1);
|
||||
const int64_t out_index = (b * out_rows + r) * out_cols + c;
|
||||
// Now we can evenly distribute out_backprop(b, h, w, *) to
|
||||
// in_backprop(b, hs:he, ws:we, *).
|
||||
for (int64_t in_r = in_row_start; in_r <= in_row_end; ++in_r) {
|
||||
for (int64_t in_c = in_col_start; in_c <= in_col_end; ++in_c) {
|
||||
const int64_t in_index = (b * in_rows + in_r) * in_cols + in_c;
|
||||
// Walk through each channel (depth).
|
||||
for (int64_t d = 0; d < out_depth; ++d) {
|
||||
const double out_backprop_element = static_cast<double>(out_backprop_mat.coeffRef(d, out_index));
|
||||
double &in_backprop_ref = in_backprop_tensor_temp_mat.coeffRef(d, in_index);
|
||||
in_backprop_ref += out_backprop_element / num_elements_in_pooling_cell;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
};
|
||||
KERNEL_HANDLE_ERROR(
|
||||
CpuKernelUtils::ParallelFor(ctx, row_len, row_len / max_core_num, sharder_fractionalavgpoolgrad_index),
|
||||
"FractionalAvgPoolGrad Index Compute failed.");
|
||||
}
|
||||
}
|
||||
// Depending on the type, cast double to type T.
|
||||
for (int64_t i = 0; i < output_nums; ++i) {
|
||||
*(output_data + i) = static_cast<T>(in_backprop_tensor_temp[i]);
|
||||
}
|
||||
output->GetTensorShape()->SetDimSizes(out_put_dims);
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
|
||||
uint32_t FractionalAvgPoolGradCpuKernel::Compute(CpuKernelContext &ctx) {
|
||||
KERNEL_HANDLE_ERROR(FractionalAvgPoolGradParamCheck(ctx), "Check FractionalAvgPoolGrad params failed.");
|
||||
Tensor *out_backprop = ctx.Input(1);
|
||||
auto data_type = out_backprop->GetDataType();
|
||||
switch (data_type) {
|
||||
case DT_FLOAT:
|
||||
return DoCompute<float>(ctx);
|
||||
case DT_DOUBLE:
|
||||
return DoCompute<double>(ctx);
|
||||
case DT_INT32:
|
||||
return DoCompute<int32_t>(ctx);
|
||||
case DT_INT64:
|
||||
return DoCompute<int64_t>(ctx);
|
||||
default:
|
||||
KERNEL_LOG_ERROR("FractionalAvgPoolGrad kernel data type [%s] not support.", DTypeStr(data_type).c_str());
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
REGISTER_CPU_KERNEL(kFractionalAvgPoolGrad, FractionalAvgPoolGradCpuKernel);
|
||||
} // namespace aicpu
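
The grad kernel above inverts the averaging step: each out_backprop element is split evenly over the input positions of its pooling cell, accumulated into a double-precision temporary, and only cast back to T at the end so repeated additions do not lose precision. A 1-D, single-batch, single-channel sketch of that redistribution (standalone C++, illustrative names):

#include <cstdint>
#include <vector>

std::vector<float> AvgPoolGrad1D(const std::vector<float> &out_grad,
                                 const std::vector<int64_t> &cum_seq,  // cell boundaries
                                 int64_t in_len, bool overlapping) {
  std::vector<double> in_grad(in_len, 0.0);  // plays the role of in_backprop_tensor_temp
  const int64_t in_max = in_len - 1;
  for (size_t i = 0; i + 1 < cum_seq.size() && i < out_grad.size(); ++i) {
    const int64_t start = cum_seq[i];
    int64_t end = overlapping ? cum_seq[i + 1] : cum_seq[i + 1] - 1;
    end = end < in_max ? end : in_max;
    const int64_t cell = end - start + 1;
    if (cell <= 0) continue;
    for (int64_t r = start; r <= end; ++r) {
      in_grad[r] += static_cast<double>(out_grad[i]) / static_cast<double>(cell);
    }
  }
  return std::vector<float>(in_grad.begin(), in_grad.end());  // final cast back to the output type
}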
|
|
@@ -0,0 +1,20 @@
|
|||
#ifndef AICPU_KERNELS_NORMALIZED_FRACTIONAL_AVG_GRAD_POOL_H_
|
||||
#define AICPU_KERNELS_NORMALIZED_FRACTIONAL_AVG_GRAD_POOL_H_
|
||||
|
||||
#include "cpu_ops_kernel.h"
|
||||
#include "cpu_types.h"
|
||||
|
||||
namespace aicpu {
|
||||
class FractionalAvgPoolGradCpuKernel : public CpuKernel {
|
||||
public:
|
||||
FractionalAvgPoolGradCpuKernel() = default;
|
||||
~FractionalAvgPoolGradCpuKernel() = default;
|
||||
uint32_t Compute(CpuKernelContext &ctx) override;
|
||||
|
||||
private:
|
||||
template <typename T>
|
||||
uint32_t DoCompute(CpuKernelContext &ctx);
|
||||
uint32_t FractionalAvgPoolGradParamCheck(CpuKernelContext &ctx);
|
||||
};
|
||||
} // namespace aicpu
|
||||
#endif // AICPU_KERNELS_NORMALIZED_FRACTIONAL_AVG_GRAD_POOL_H_
|
|
@@ -0,0 +1,285 @@
|
|||
/**
|
||||
* Copyright 2021 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include "fractional_max_pool.h"
|
||||
|
||||
#include "cpu_kernel_utils.h"
|
||||
#include "utils/eigen_tensor.h"
|
||||
#include "utils/kernel_util.h"
|
||||
|
||||
namespace {
|
||||
const char *kFractionalMaxPool = "FractionalMaxPool";
|
||||
const uint32_t k_InputNum = 1;
|
||||
const uint32_t k_OutputNum = 3;
|
||||
const int64_t kParallelDataNum = 1024 * 1024;
|
||||
const uint32_t tensor_in_and_out_dims = 4;
|
||||
} // namespace
|
||||
|
||||
namespace aicpu {
|
||||
uint32_t FractionalMaxPoolCpuKernel::FractionalMaxPoolParamCheck(CpuKernelContext &ctx) {
|
||||
KERNEL_HANDLE_ERROR(NormalCheck(ctx, k_InputNum, k_OutputNum),
|
||||
"FractionalMaxPool Check input and output number failed.");
|
||||
Tensor *input = ctx.Input(0);
|
||||
if (ctx.Input(0)->GetDataType() != ctx.Output(0)->GetDataType()) {
|
||||
KERNEL_LOG_ERROR("The data type of the output [%s] need be the same as the input [%s]",
|
||||
DTypeStr(ctx.Output(0)->GetDataType()).c_str(), DTypeStr(ctx.Input(0)->GetDataType()).c_str());
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
auto input_shape = input->GetTensorShape();
|
||||
int32_t input_dims = input_shape->GetDims();
|
||||
for (int32_t i = 0; i < input_dims; i++) {
|
||||
KERNEL_CHECK_FALSE((input_shape->GetDimSize(i) > 0), KERNEL_STATUS_PARAM_INVALID,
|
||||
"FractionalMaxPool: expected input to have non-empty spatial "
|
||||
"dimensions, "
|
||||
"but input has sizes [%d] with dimension [%d] being empty.",
|
||||
input_dims, i);
|
||||
}
|
||||
KERNEL_CHECK_FALSE((input_dims == tensor_in_and_out_dims), KERNEL_STATUS_PARAM_INVALID,
|
||||
"tensor_in must be 4-dimensional.");
|
||||
AttrValue *pooling_ratio = ctx.GetAttr("pooling_ratio");
|
||||
KERNEL_CHECK_NULLPTR(pooling_ratio, KERNEL_STATUS_PARAM_INVALID, "[%s] get attr:pooling_ratio failed.",
|
||||
kFractionalMaxPool);
|
||||
int32_t pooling_ratio_size = pooling_ratio->ListFloatSize();
|
||||
KERNEL_CHECK_FALSE((pooling_ratio_size == tensor_in_and_out_dims), KERNEL_STATUS_PARAM_INVALID,
|
||||
"The size of pooling_ratio must be 4, but got [%d].", pooling_ratio_size);
|
||||
std::vector<float> pooling_ratio_data = ctx.GetAttr("pooling_ratio")->GetListFloat();
|
||||
KERNEL_CHECK_FALSE((pooling_ratio_data[0] == 1.0 && pooling_ratio_data[3] == 1.0), KERNEL_STATUS_PARAM_INVALID,
|
||||
"FractionalMaxPool is not yet supported on the batch nor channel "
|
||||
"dimension.The first and last elements of pooling ratio must be 1.0.");
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
|
||||
static std::vector<int64_t> GeneratePoolingSequencePseudoRandom(int input_length, int output_length, int64_t seed) {
|
||||
// generate a random offset u in [0, max_u)
|
||||
std::vector<int64_t> cum_seq(output_length + 1, 0);
|
||||
std::vector<int64_t> diff(output_length, 0);
|
||||
double alpha = static_cast<double>(input_length) / output_length;
|
||||
int k = input_length / output_length;
|
||||
double u_max1 = (k + 2) / alpha - 1;
|
||||
double u_max2 = (input_length + 1 - k) / alpha - (output_length - 1);
|
||||
double max_u = std::min(u_max1, u_max2);
|
||||
std::default_random_engine random(seed);
|
||||
std::uniform_real_distribution<double> dis2(0.0, 1.0);
|
||||
const double u = dis2(random) * max_u;
|
||||
cum_seq[0] = 1;
|
||||
cum_seq[output_length] = input_length + 1;
|
||||
for (int i = 1; i < output_length; ++i) {
|
||||
cum_seq[i] = static_cast<int>(ceil(alpha * (i + u)));
|
||||
}
|
||||
for (int i = 0; i < output_length; ++i) {
|
||||
diff[i] = cum_seq[i + 1] - cum_seq[i];
|
||||
}
|
||||
return diff;
|
||||
}
|
||||
|
||||
static std::vector<int64_t> GeneratePoolingSequenceRandom(int input_length, int output_length, int64_t seed) {
|
||||
int k = input_length / output_length;
|
||||
int num_random_spot = input_length % output_length;
|
||||
std::vector<int64_t> diff(output_length, k);
|
||||
for (int i = 0; i < num_random_spot; ++i) {
|
||||
diff[i] += 1;
|
||||
}
|
||||
std::srand(seed);
|
||||
random_shuffle(diff.begin(), diff.end());
|
||||
return diff;
|
||||
}
|
||||
|
||||
std::vector<int64_t> GeneratePoolingSequence(int input_length, int output_length, bool pseudo_random, int64_t seed) {
|
||||
std::vector<int64_t> diff;
|
||||
if (input_length % output_length == 0) {
|
||||
diff = std::vector<int64_t>(output_length, input_length / output_length);
|
||||
} else if (pseudo_random) {
|
||||
diff = GeneratePoolingSequencePseudoRandom(input_length, output_length, seed);
|
||||
} else {
|
||||
diff = GeneratePoolingSequenceRandom(input_length, output_length, seed);
|
||||
}
|
||||
int k = input_length / output_length;
|
||||
for (int i = 0; i < output_length; i++) {
|
||||
if (diff[i] < k || diff[i] > k + 1) {
|
||||
KERNEL_LOG_ERROR("FractionalMaxPool kernel GeneratePoolingSequence diff[%d] is error");
|
||||
}
|
||||
}
|
||||
std::vector<int64_t> cum_seq(output_length + 1, 0);
|
||||
for (size_t i = 1; i < cum_seq.size(); ++i) {
|
||||
cum_seq[i] = cum_seq[i - 1] + diff[i - 1];
|
||||
}
|
||||
return cum_seq;
|
||||
}
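// Usage sketch (illustrative, mirroring how DoCompute consumes this helper below): the returned
// cumulative sequence starts at 0 and ends at input_length, e.g.
//   std::vector<int64_t> rows = GeneratePoolingSequence(10, 4, /*pseudo_random=*/false, /*seed=*/1);
//   // rows might be {0, 3, 5, 8, 10}; with overlapping=false, row region i spans [rows[i], rows[i + 1]).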
|
||||
|
||||
template <typename T>
|
||||
uint32_t FractionalMaxPoolCpuKernel::DoCompute(CpuKernelContext &ctx) {
|
||||
Tensor *input = ctx.Input(0);
|
||||
Tensor *output = ctx.Output(0);
|
||||
Tensor *row_pooling_sequence = ctx.Output(1);
|
||||
Tensor *col_pooling_sequence = ctx.Output(2);
|
||||
std::vector<float> pooling_ratio = ctx.GetAttr("pooling_ratio")->GetListFloat();
|
||||
AttrValue *pseudo_random_ = ctx.GetAttr("pseudo_random");
|
||||
bool pseudo_random = (pseudo_random_ == nullptr) ? false : (pseudo_random_->GetBool());
|
||||
AttrValue *overlapping_ = ctx.GetAttr("overlapping");
|
||||
bool overlapping = (overlapping_ == nullptr) ? false : (overlapping_->GetBool());
|
||||
AttrValue *deterministic_ = ctx.GetAttr("deterministic");
|
||||
bool deterministic = (deterministic_ == nullptr) ? false : (deterministic_->GetBool());
|
||||
AttrValue *seed_ = ctx.GetAttr("seed");
|
||||
int seed = (seed_ == nullptr) ? 0 : (seed_->GetInt());
|
||||
AttrValue *seed2_ = ctx.GetAttr("seed2");
|
||||
int seed2 = (seed2_ == nullptr) ? 0 : (seed2_->GetInt());
|
||||
auto input_shape = input->GetTensorShape();
|
||||
std::vector<int> input_size(tensor_in_and_out_dims);
|
||||
std::vector<int> output_size(tensor_in_and_out_dims);
|
||||
for (size_t i = 0; i < tensor_in_and_out_dims; ++i) {
|
||||
input_size[i] = input_shape->GetDimSize(i);
|
||||
}
|
||||
for (size_t i = 0; i < tensor_in_and_out_dims; ++i) {
|
||||
output_size[i] = static_cast<int>(std::floor(input_size[i] / pooling_ratio[i]));
|
||||
KERNEL_CHECK_FALSE((output_size[i] > 0), KERNEL_STATUS_PARAM_INVALID,
"FractionalMaxPool kernel output_size[%d] must be greater than 0.", i);
|
||||
}
|
||||
auto input_data = static_cast<T *>(ctx.Input(0)->GetData());
|
||||
auto output_data = static_cast<T *>(output->GetData());
|
||||
auto output_height_seq_tensor = static_cast<int64_t *>(row_pooling_sequence->GetData());
|
||||
auto output_width_seq_tensor = static_cast<int64_t *>(col_pooling_sequence->GetData());
|
||||
std::random_device rd;
|
||||
std::mt19937 generator(rd());
|
||||
if (deterministic) {
|
||||
// If both seeds are not set when deterministic is true, force set seeds.
|
||||
if ((seed == 0) && (seed2 == 0)) {
|
||||
seed = generator();
|
||||
seed2 = generator();
|
||||
}
|
||||
} else {
|
||||
KERNEL_CHECK_FALSE(((seed == 0) && (seed2 == 0)), KERNEL_STATUS_PARAM_INVALID,
|
||||
"Both seed and seed2 should be 0 if deterministic is false.");
|
||||
}
|
||||
if (seed == 0 && seed2 != 0) {
|
||||
seed = seed2;
|
||||
}
|
||||
// Generate pooling sequence.
|
||||
std::vector<int64_t> height_cum_seq;
|
||||
std::vector<int64_t> width_cum_seq;
|
||||
height_cum_seq = GeneratePoolingSequence(input_size[1], output_size[1], pseudo_random, seed);
|
||||
width_cum_seq = GeneratePoolingSequence(input_size[2], output_size[2], pseudo_random, seed);
|
||||
for (size_t i = 0; i < height_cum_seq.size(); ++i) {
|
||||
*(output_height_seq_tensor + i) = height_cum_seq[i];
|
||||
}
|
||||
for (size_t i = 0; i < width_cum_seq.size(); ++i) {
|
||||
*(output_width_seq_tensor + i) = width_cum_seq[i];
|
||||
}
|
||||
const int64_t height_max = input_size[1] - 1;
|
||||
const int64_t width_max = input_size[2] - 1;
|
||||
const int64_t depth_max = input_size[3] - 1;
|
||||
uint64_t data_num = input->NumElements();
|
||||
/**
|
||||
* For both input and output,
|
||||
* 0: batch
|
||||
* 1: height / row
|
||||
* 2: width / col
|
||||
* 3: depth / channel
|
||||
*/
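/**
 * Offset sketch (illustrative numbers, not from the kernel): with output_size = {N, 4, 4, 3},
 * the flat NHWC offset used below is ((b * 4 + hs) * 4 + ws) * 3 + c, so b = 1, hs = 2, ws = 0,
 * c = 1 lands at element 73 of the output buffer; in_offset is formed the same way from the
 * input height/width indices.
 */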
|
||||
if (data_num < kParallelDataNum) {
|
||||
for (int64_t b = 0; b < input_size[0]; ++b) {
|
||||
// height sequence.
|
||||
for (size_t hs = 0; hs < height_cum_seq.size() - 1; ++hs) {
|
||||
// height start and end.
|
||||
const int64_t height_start = height_cum_seq[hs];
|
||||
int64_t height_end = overlapping ? height_cum_seq[hs + 1] : height_cum_seq[hs + 1] - 1;
|
||||
height_end = std::min(height_end, height_max);
|
||||
// width sequence.
|
||||
for (size_t ws = 0; ws < width_cum_seq.size() - 1; ++ws) {
|
||||
for (int64_t c = 0; c <= depth_max; ++c) {
|
||||
const int64_t out_offset = ((b * output_size[1] + hs) * output_size[2] + ws) * output_size[3] + c;
|
||||
// Initializes the output tensor with MIN<T>.
|
||||
T max = std::numeric_limits<T>::lowest();
|
||||
// width start and end.
|
||||
const int64_t width_start = width_cum_seq[ws];
|
||||
int64_t width_end = overlapping ? width_cum_seq[ws + 1] : width_cum_seq[ws + 1] - 1;
|
||||
width_end = std::min(width_end, width_max);
|
||||
for (int64_t h = height_start; h <= height_end; ++h) {
|
||||
for (int64_t w = width_start; w <= width_end; ++w) {
|
||||
const int64_t in_offset = ((b * input_size[1] + h) * input_size[2] + w) * output_size[3] + c;
|
||||
max = max > input_data[in_offset] ? max : input_data[in_offset];
|
||||
}
|
||||
}
|
||||
*(output_data + out_offset) = max;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
} else {
|
||||
uint64_t height_cum_len = height_cum_seq.size() - 1;
|
||||
uint32_t min_core_num = 1;
|
||||
uint32_t max_core_num = std::max(min_core_num, aicpu::CpuKernelUtils::GetCPUNum(ctx));
|
||||
if (max_core_num > height_cum_len) {
|
||||
max_core_num = height_cum_len;
|
||||
}
|
||||
for (int64_t b = 0; b < input_size[0]; ++b) {
|
||||
// height sequence.
|
||||
auto sharder_fractionalmaxpool_index = [&](size_t start, size_t end) {
|
||||
for (size_t hs = start; hs < end; ++hs) {
|
||||
// height start and end.
|
||||
const int64_t height_start = height_cum_seq[hs];
|
||||
int64_t height_end = overlapping ? height_cum_seq[hs + 1] : height_cum_seq[hs + 1] - 1;
|
||||
height_end = std::min(height_end, height_max);
|
||||
// width sequence.
|
||||
for (size_t ws = 0; ws < width_cum_seq.size() - 1; ++ws) {
|
||||
for (int64_t c = 0; c <= depth_max; ++c) {
|
||||
const int64_t out_offset = ((b * output_size[1] + hs) * output_size[2] + ws) * output_size[3] + c;
|
||||
// Initializes the output tensor with MIN<T>.
|
||||
T max = std::numeric_limits<T>::lowest();
|
||||
// width start and end.
|
||||
const int64_t width_start = width_cum_seq[ws];
|
||||
int64_t width_end = overlapping ? width_cum_seq[ws + 1] : width_cum_seq[ws + 1] - 1;
|
||||
width_end = std::min(width_end, width_max);
|
||||
for (int64_t h = height_start; h <= height_end; ++h) {
|
||||
for (int64_t w = width_start; w <= width_end; ++w) {
|
||||
const int64_t in_offset = ((b * input_size[1] + h) * input_size[2] + w) * output_size[3] + c;
|
||||
max = max > input_data[in_offset] ? max : input_data[in_offset];
|
||||
}
|
||||
}
|
||||
*(output_data + out_offset) = max;
|
||||
}
|
||||
}
|
||||
}
|
||||
};
|
||||
KERNEL_HANDLE_ERROR(CpuKernelUtils::ParallelFor(ctx, height_cum_len, height_cum_len / max_core_num,
|
||||
sharder_fractionalmaxpool_index),
|
||||
"FractionalMaxPool Index Compute failed");
|
||||
}
|
||||
}
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
|
||||
uint32_t FractionalMaxPoolCpuKernel::Compute(CpuKernelContext &ctx) {
|
||||
KERNEL_HANDLE_ERROR(FractionalMaxPoolParamCheck(ctx), "FractionalMaxPool check params failed.");
|
||||
Tensor *input = ctx.Input(0);
|
||||
auto data_type = input->GetDataType();
|
||||
switch (data_type) {
|
||||
case DT_FLOAT:
|
||||
return DoCompute<float>(ctx);
|
||||
case DT_DOUBLE:
|
||||
return DoCompute<double>(ctx);
|
||||
case DT_INT32:
|
||||
return DoCompute<int32_t>(ctx);
|
||||
case DT_INT64:
|
||||
return DoCompute<int64_t>(ctx);
|
||||
default:
|
||||
KERNEL_LOG_ERROR("FractionalMaxPool kernel data type [%s] not support.", DTypeStr(data_type).c_str());
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
REGISTER_CPU_KERNEL(kFractionalMaxPool, FractionalMaxPoolCpuKernel);
|
||||
} // namespace aicpu
@@ -0,0 +1,20 @@
#ifndef AICPU_KERNELS_NORMALIZED_FRACTIONAL_MAX_POOL_H_
|
||||
#define AICPU_KERNELS_NORMALIZED_FRACTIONAL_MAX_POOL_H_
|
||||
|
||||
#include "cpu_ops_kernel.h"
|
||||
#include "cpu_types.h"
|
||||
|
||||
namespace aicpu {
|
||||
class FractionalMaxPoolCpuKernel : public CpuKernel {
|
||||
public:
|
||||
FractionalMaxPoolCpuKernel() = default;
|
||||
~FractionalMaxPoolCpuKernel() = default;
|
||||
uint32_t Compute(CpuKernelContext &ctx) override;
|
||||
|
||||
private:
|
||||
template <typename T>
|
||||
uint32_t DoCompute(CpuKernelContext &ctx);
|
||||
uint32_t FractionalMaxPoolParamCheck(CpuKernelContext &ctx);
|
||||
};
|
||||
} // namespace aicpu
|
||||
#endif // AICPU_KERNELS_NORMALIZED_FRACTIONAL_MAX_POOL_H_
@@ -0,0 +1,242 @@
/**
|
||||
* Copyright 2021 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
#include "fractional_max_pool_grad.h"
|
||||
|
||||
#include "Eigen/Dense"
|
||||
#include "cpu_kernel_utils.h"
|
||||
#include "utils/eigen_tensor.h"
|
||||
#include "utils/kernel_util.h"
|
||||
|
||||
namespace {
|
||||
const char *kFractionalMaxPoolGrad = "FractionalMaxPoolGrad";
|
||||
const uint32_t k_InputNum = 5;
|
||||
const uint32_t k_OutputNum = 1;
|
||||
static const int kInvalidMaxPoolingIndex = -1;
|
||||
const int64_t kParallelDataNum = 32 * 1024;
|
||||
const uint32_t tensor_in_and_out_dims = 4;
|
||||
} // namespace
|
||||
|
||||
namespace aicpu {
|
||||
uint32_t FractionalMaxPoolGradCpuKernel::FractionalMaxPoolGradParamCheck(CpuKernelContext &ctx) {
|
||||
KERNEL_HANDLE_ERROR(NormalCheck(ctx, k_InputNum, k_OutputNum),
|
||||
"FractionalMaxPoolGrad check input and output number failed.");
|
||||
Tensor *orig_input = ctx.Input(0);
|
||||
Tensor *orig_output = ctx.Input(1);
|
||||
Tensor *out_backprop = ctx.Input(2);
|
||||
auto orig_input_shape = orig_input->GetTensorShape();
|
||||
int32_t orig_input_dims = orig_input_shape->GetDims();
|
||||
auto orig_output_shape = orig_output->GetTensorShape();
|
||||
int32_t orig_output_dims = orig_output_shape->GetDims();
|
||||
auto out_backprop_shape = out_backprop->GetTensorShape();
|
||||
int32_t out_backprop_dims = out_backprop_shape->GetDims();
|
||||
if (orig_input->GetDataType() != orig_output->GetDataType()) {
|
||||
KERNEL_LOG_ERROR(
|
||||
"The data type of the orig_output [%s] need be the same as the "
|
||||
"orig_input [%s].",
|
||||
DTypeStr(orig_output->GetDataType()).c_str(), DTypeStr(orig_input->GetDataType()).c_str());
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
if (orig_input->GetDataType() != out_backprop->GetDataType()) {
|
||||
KERNEL_LOG_ERROR(
|
||||
"The data type of the out_backprop [%s] need be the same as the "
|
||||
"orig_input [%s].",
|
||||
DTypeStr(out_backprop->GetDataType()).c_str(), DTypeStr(orig_input->GetDataType()).c_str());
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
KERNEL_CHECK_FALSE((orig_input_dims == tensor_in_and_out_dims), KERNEL_STATUS_PARAM_INVALID,
|
||||
"orig_input should be a tensor of rank 4.");
|
||||
KERNEL_CHECK_FALSE((orig_output_dims == tensor_in_and_out_dims), KERNEL_STATUS_PARAM_INVALID,
|
||||
"orig_output should be a tensor of rank 4.");
|
||||
KERNEL_CHECK_FALSE((out_backprop_dims == tensor_in_and_out_dims), KERNEL_STATUS_PARAM_INVALID,
|
||||
"out_backprop should be a tensor of rank 4.");
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
uint32_t FractionalMaxPoolGradCpuKernel::DoCompute(CpuKernelContext &ctx) {
|
||||
typedef Eigen::Map<const Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic>> ConstEigenMatrixMap;
|
||||
typedef Eigen::Map<Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic>> EigenMatrixMap;
|
||||
typedef Eigen::Map<Eigen::Matrix<int64_t, Eigen::Dynamic, Eigen::Dynamic>> EigenIndexMatrixMap;
|
||||
const Tensor *tensor_in = ctx.Input(0);
|
||||
const Tensor *tensor_out = ctx.Input(1);
|
||||
const Tensor *out_backprop = ctx.Input(2);
|
||||
const Tensor *height_seq_tensor = ctx.Input(3);
|
||||
const Tensor *width_seq_tensor = ctx.Input(4);
|
||||
Tensor *output = ctx.Output(0);
|
||||
auto output_data = static_cast<T *>(output->GetData());
|
||||
AttrValue *overlapping_ = ctx.GetAttr("overlapping");
|
||||
bool overlapping = (overlapping_ == nullptr) ? false : (overlapping_->GetBool());
|
||||
auto tensor_in_shape = tensor_in->GetTensorShape();
|
||||
auto tensor_out_shape = tensor_out->GetTensorShape();
|
||||
std::vector<int64_t> input_size(tensor_in_and_out_dims);
|
||||
std::vector<int64_t> output_size(tensor_in_and_out_dims);
|
||||
for (uint32_t i = 0; i < tensor_in_and_out_dims; ++i) {
|
||||
input_size[i] = tensor_in_shape->GetDimSize(i);
|
||||
}
|
||||
for (uint32_t i = 0; i < tensor_in_and_out_dims; ++i) {
|
||||
output_size[i] = tensor_out_shape->GetDimSize(i);
|
||||
}
|
||||
int64_t tensor_in_num = tensor_in->NumElements();
|
||||
int64_t tensor_out_num = tensor_out->NumElements();
|
||||
std::vector<T> tensor_out_dup(tensor_out_num);
|
||||
std::vector<int64_t> tensor_out_arg_max(tensor_out_num);
|
||||
for (int i = 0; i < tensor_out_num; i++) {
|
||||
tensor_out_dup[i] = std::numeric_limits<T>::lowest();
|
||||
tensor_out_arg_max[i] = -1;
|
||||
}
|
||||
// Find arg_max for each tensor_out
|
||||
ConstEigenMatrixMap tensor_in_mat(reinterpret_cast<T *>(tensor_in->GetData()), input_size[3],
|
||||
input_size[2] * input_size[1] * input_size[0]);
|
||||
EigenMatrixMap tensor_out_dup_mat(tensor_out_dup.data(), output_size[3],
|
||||
output_size[2] * output_size[1] * output_size[0]);
|
||||
EigenIndexMatrixMap tensor_out_arg_max_mat(tensor_out_arg_max.data(), output_size[3],
|
||||
output_size[2] * output_size[1] * output_size[0]);
|
||||
auto height_seq_tensor_shape = height_seq_tensor->GetTensorShape();
|
||||
auto width_seq_tensor_shape = width_seq_tensor->GetTensorShape();
|
||||
auto height_seq_tensor_data = static_cast<int64_t *>(height_seq_tensor->GetData());
|
||||
auto width_seq_tensor_data = static_cast<int64_t *>(width_seq_tensor->GetData());
|
||||
/**
|
||||
* Now walk through the process of fractional max pooling again.
|
||||
* For both input and output,
|
||||
* 0: batch
|
||||
* 1: height / row
|
||||
* 2: width / col
|
||||
* 3: depth / channel
|
||||
*/
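/**
 * Note on the argmax bookkeeping below (a sketch of the existing logic, not new behaviour): for
 * each output cell the code records input_offset = in_index * input_size[3] + d, i.e. the flat
 * NHWC index of the winning input element, so the scatter-add at the end of this function can
 * route each out_backprop value straight back to that element.
 */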
|
||||
if (tensor_in_num < kParallelDataNum) {
|
||||
const int64_t height_max = input_size[1] - 1;
|
||||
const int64_t width_max = input_size[2] - 1;
|
||||
for (int64_t b = 0; b < input_size[0]; ++b) {
|
||||
// height sequence.
|
||||
for (int64_t hs = 0; hs < height_seq_tensor_shape->GetDimSize(0) - 1; ++hs) {
|
||||
// height start and end.
|
||||
const int64_t height_start = *(height_seq_tensor_data + hs);
|
||||
int64_t height_end = overlapping ? *(height_seq_tensor_data + hs + 1) : *(height_seq_tensor_data + hs + 1) - 1;
|
||||
height_end = std::min(height_end, height_max);
|
||||
// width sequence.
|
||||
for (int64_t ws = 0; ws < width_seq_tensor_shape->GetDimSize(0) - 1; ++ws) {
|
||||
const int64_t out_index = (b * output_size[1] + hs) * output_size[2] + ws;
|
||||
// width start and end.
|
||||
const int64_t width_start = *(width_seq_tensor_data + ws);
|
||||
int64_t width_end = overlapping ? *(width_seq_tensor_data + ws + 1) : *(width_seq_tensor_data + ws + 1) - 1;
|
||||
width_end = std::min(width_end, width_max);
|
||||
for (int64_t h = height_start; h <= height_end; ++h) {
|
||||
for (int64_t w = width_start; w <= width_end; ++w) {
|
||||
const int64_t in_index = (b * input_size[1] + h) * input_size[2] + w;
|
||||
// Walk through each channel (depth).
|
||||
for (int64_t d = 0; d < input_size[3]; ++d) {
|
||||
const T &input_ref = tensor_in_mat.coeffRef(d, in_index);
|
||||
T &output_ref = tensor_out_dup_mat.coeffRef(d, out_index);
|
||||
int64_t &out_arg_max_ref = tensor_out_arg_max_mat.coeffRef(d, out_index);
|
||||
if (output_ref < input_ref || out_arg_max_ref == kInvalidMaxPoolingIndex) {
|
||||
output_ref = input_ref;
|
||||
int input_offset = in_index * input_size[3] + d;
|
||||
out_arg_max_ref = input_offset;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
} else {
|
||||
uint64_t height_seq_len = height_seq_tensor_shape->GetDimSize(0) - 1;
|
||||
uint32_t min_core_num = 1;
|
||||
uint32_t max_core_num = std::max(min_core_num, aicpu::CpuKernelUtils::GetCPUNum(ctx));
|
||||
if (max_core_num > height_seq_len) {
|
||||
max_core_num = height_seq_len;
|
||||
}
|
||||
const int64_t height_max = input_size[1] - 1;
|
||||
const int64_t width_max = input_size[2] - 1;
|
||||
for (int64_t b = 0; b < input_size[0]; ++b) {
|
||||
// height sequence.
|
||||
auto sharder_fractionalmaxpoolgrad_index = [&](size_t start, size_t end) {
|
||||
for (size_t hs = start; hs < end; ++hs) {
|
||||
// height start and end.
|
||||
const int64_t height_start = *(height_seq_tensor_data + hs);
|
||||
int64_t height_end =
|
||||
overlapping ? *(height_seq_tensor_data + hs + 1) : *(height_seq_tensor_data + hs + 1) - 1;
|
||||
height_end = std::min(height_end, height_max);
|
||||
// width sequence.
|
||||
for (int64_t ws = 0; ws < width_seq_tensor_shape->GetDimSize(0) - 1; ++ws) {
|
||||
const int64_t out_index = (b * output_size[1] + hs) * output_size[2] + ws;
|
||||
// width start and end.
|
||||
const int64_t width_start = *(width_seq_tensor_data + ws);
|
||||
int64_t width_end = overlapping ? *(width_seq_tensor_data + ws + 1) : *(width_seq_tensor_data + ws + 1) - 1;
|
||||
width_end = std::min(width_end, width_max);
|
||||
for (int64_t h = height_start; h <= height_end; ++h) {
|
||||
for (int64_t w = width_start; w <= width_end; ++w) {
|
||||
const int64_t in_index = (b * input_size[1] + h) * input_size[2] + w;
|
||||
// Walk through each channel (depth).
|
||||
for (int64_t d = 0; d < input_size[3]; ++d) {
|
||||
const T &input_ref = tensor_in_mat.coeffRef(d, in_index);
|
||||
T &output_ref = tensor_out_dup_mat.coeffRef(d, out_index);
|
||||
int64_t &out_arg_max_ref = tensor_out_arg_max_mat.coeffRef(d, out_index);
|
||||
if (output_ref < input_ref || out_arg_max_ref == kInvalidMaxPoolingIndex) {
|
||||
output_ref = input_ref;
|
||||
int input_offset = in_index * input_size[3] + d;
|
||||
out_arg_max_ref = input_offset;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
};
|
||||
KERNEL_HANDLE_ERROR(CpuKernelUtils::ParallelFor(ctx, height_seq_len, height_seq_len / max_core_num,
|
||||
sharder_fractionalmaxpoolgrad_index),
|
||||
"FractionalMaxPoolGrad Index Compute failed.");
|
||||
}
|
||||
}
|
||||
for (int i = 0; i < tensor_in_num; i++) {
|
||||
*(output_data + i) = 0;
|
||||
}
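// The loop below scatters the incoming gradient: each out_backprop element is added to the input
// position recorded in tensor_out_arg_max during the forward re-computation above.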
|
||||
auto out_backprop_data = static_cast<T *>(out_backprop->GetData());
|
||||
int num_total_outputs = out_backprop->NumElements();
|
||||
int num_total_inputs = output->NumElements();
|
||||
for (int index = 0; index < num_total_outputs; ++index) {
|
||||
int input_backprop_index = tensor_out_arg_max[index];
|
||||
KERNEL_CHECK_FALSE((input_backprop_index >= 0 && input_backprop_index < num_total_inputs),
|
||||
KERNEL_STATUS_PARAM_INVALID,
|
||||
"Invalid input backprop index:[%d], The maximum number of output is: "
|
||||
"[%d].",
|
||||
input_backprop_index, num_total_inputs);
|
||||
*(output_data + input_backprop_index) += *(out_backprop_data + index);
|
||||
}
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
|
||||
uint32_t FractionalMaxPoolGradCpuKernel::Compute(CpuKernelContext &ctx) {
|
||||
KERNEL_HANDLE_ERROR(FractionalMaxPoolGradParamCheck(ctx), "Check FractionalMaxPoolGrad params failed.");
|
||||
Tensor *input = ctx.Input(0);
|
||||
auto data_type = input->GetDataType();
|
||||
switch (data_type) {
|
||||
case DT_FLOAT:
|
||||
return DoCompute<float>(ctx);
|
||||
case DT_DOUBLE:
|
||||
return DoCompute<double>(ctx);
|
||||
case DT_INT32:
|
||||
return DoCompute<int32_t>(ctx);
|
||||
case DT_INT64:
|
||||
return DoCompute<int64_t>(ctx);
|
||||
default:
|
||||
KERNEL_LOG_ERROR("FractionalMaxPoolGrad kernel data type [%s] not support.", DTypeStr(data_type).c_str());
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
REGISTER_CPU_KERNEL(kFractionalMaxPoolGrad, FractionalMaxPoolGradCpuKernel);
|
||||
} // namespace aicpu
@@ -0,0 +1,20 @@
#ifndef AICPU_KERNELS_NORMALIZED_FRACTIONAL_MAX_POOL_GRAD_H_
|
||||
#define AICPU_KERNELS_NORMALIZED_FRACTIONAL_MAX_POOL_GRAD_H_
|
||||
|
||||
#include "cpu_ops_kernel.h"
|
||||
#include "cpu_types.h"
|
||||
|
||||
namespace aicpu {
|
||||
class FractionalMaxPoolGradCpuKernel : public CpuKernel {
|
||||
public:
|
||||
FractionalMaxPoolGradCpuKernel() = default;
|
||||
~FractionalMaxPoolGradCpuKernel() = default;
|
||||
uint32_t Compute(CpuKernelContext &ctx) override;
|
||||
|
||||
private:
|
||||
template <typename T>
|
||||
uint32_t DoCompute(CpuKernelContext &ctx);
|
||||
uint32_t FractionalMaxPoolGradParamCheck(CpuKernelContext &ctx);
|
||||
};
|
||||
} // namespace aicpu
|
||||
#endif // AICPU_KERNELS_NORMALIZED_FRACTIONAL_MAX_POOL_GRAD_H_
@@ -0,0 +1,198 @@
/**
|
||||
* Copyright 2021 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include "fractional_max_pool_grad_with_fixed_ksize.h"
|
||||
|
||||
#include <cmath>
|
||||
#include <limits>
|
||||
#include <vector>
|
||||
#include "Eigen/Dense"
|
||||
|
||||
#include "cpu_kernel_utils.h"
|
||||
#include "kernel_log.h"
|
||||
#include "status.h"
|
||||
#include "utils/eigen_tensor.h"
|
||||
#include "utils/kernel_util.h"
|
||||
|
||||
namespace {
|
||||
const uint32_t kInputNum = 3;
|
||||
const uint32_t kOutputNum = 1;
|
||||
const char *kFractionalMaxPoolGradWithFixedKsize = "FractionalMaxPoolGradWithFixedKsize";
|
||||
constexpr int64_t kParallelDataNums = 128 * 1024;
|
||||
|
||||
#define FRACTIONALMAXPOOLGRADWITHFIXEDKSIZE_COMPUTE_CASE(DTYPE, TYPE, OUT_BACKPROP, ARGMAX, DATA_NUMS, N_SIZE, C_SIZE, \
|
||||
INPUT_H, INPUT_W, OUTPUT_H, OUTPUT_W, CTX) \
|
||||
case (DTYPE): { \
|
||||
uint32_t result = FractionalMaxPoolGradWithFixedKsizeCompute<TYPE>( \
|
||||
OUT_BACKPROP, ARGMAX, DATA_NUMS, N_SIZE, C_SIZE, INPUT_H, INPUT_W, OUTPUT_H, OUTPUT_W, CTX); \
|
||||
if (result != KERNEL_STATUS_OK) { \
|
||||
KERNEL_LOG_ERROR("FractionalMaxPoolGradWithFixedKsize kernel compute failed."); \
|
||||
return result; \
|
||||
} \
|
||||
break; \
|
||||
}
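// Expansion sketch (assuming DT_FLOAT as an example): the macro above unfolds to a switch case that
// calls FractionalMaxPoolGradWithFixedKsizeCompute<float>(out_backprop, argmax, data_nums, n_size,
// c_size, input_h, input_w, output_h, output_w, ctx), logs and returns on failure, and otherwise
// breaks out of the switch.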
|
||||
|
||||
} // namespace
|
||||
|
||||
namespace aicpu {
|
||||
uint32_t FractionalMaxPoolGradWithFixedKsize::Compute(CpuKernelContext &ctx) {
|
||||
KERNEL_HANDLE_ERROR(NormalCheck(ctx, kInputNum, kOutputNum),
|
||||
"FractionalMaxPoolGradWithFixedKsize check input and "
|
||||
"output number failed.");
|
||||
|
||||
Tensor *origin_input = ctx.Input(0);
|
||||
int64_t data_nums = origin_input->NumElements();
|
||||
auto origin_input_shape = origin_input->GetTensorShape();
|
||||
int32_t origin_input_dim = origin_input_shape->GetDims();
|
||||
KERNEL_CHECK_FALSE(origin_input_dim == 4, KERNEL_STATUS_PARAM_INVALID,
|
||||
"The dim of input[origin_input] must be 4, but got [%d].", origin_input_dim);
|
||||
|
||||
Tensor *out_backprop = ctx.Input(1);
|
||||
auto out_backprop_shape = out_backprop->GetTensorShape();
|
||||
int32_t out_backprop_dim = out_backprop_shape->GetDims();
|
||||
KERNEL_CHECK_FALSE(out_backprop_dim == 4, KERNEL_STATUS_PARAM_INVALID,
|
||||
"The dim of input[out_backprop] must be 4, but got [%d].", out_backprop_dim);
|
||||
Tensor *argmax = ctx.Input(2);
|
||||
auto argmax_shape = argmax->GetTensorShape();
|
||||
int32_t argmax_dim = argmax_shape->GetDims();
|
||||
KERNEL_CHECK_FALSE(argmax_dim == 4, KERNEL_STATUS_PARAM_INVALID, "The dim of input[argmax] must be 4, but got [%d].",
|
||||
argmax_dim);
|
||||
std::vector<int64_t> out_backprop_dim_sizes = out_backprop_shape->GetDimSizes();
|
||||
std::vector<int64_t> argmax_dim_sizes = argmax_shape->GetDimSizes();
|
||||
KERNEL_CHECK_FALSE(out_backprop_dim_sizes == argmax_dim_sizes, KERNEL_STATUS_PARAM_INVALID,
|
||||
"The shape of input[out_backprop] and input[argmax] must be equal.");
|
||||
int64_t n_size = out_backprop_dim_sizes[0];
|
||||
int64_t c_size = out_backprop_dim_sizes[1];
|
||||
int64_t input_h = out_backprop_dim_sizes[2];
|
||||
int64_t input_w = out_backprop_dim_sizes[3];
|
||||
|
||||
std::vector<int64_t> origin_input_dim_sizes = origin_input_shape->GetDimSizes();
|
||||
KERNEL_CHECK_FALSE(origin_input_dim_sizes[0] == n_size, KERNEL_STATUS_PARAM_INVALID,
|
||||
"The first dim of input[origin_input] and "
|
||||
"input[out_backprop] must be equal,"
|
||||
"but got origin_input=[%d] and out_backprop=[%d].",
|
||||
origin_input_dim_sizes[0], n_size);
|
||||
KERNEL_CHECK_FALSE(origin_input_dim_sizes[1] == c_size, KERNEL_STATUS_PARAM_INVALID,
|
||||
"The second dim of input[origin_input] and "
|
||||
"input[out_backprop] must be equal,"
|
||||
"but got origin_input=[%d] and out_backprop=[%d].",
|
||||
origin_input_dim_sizes[1], c_size);
|
||||
int64_t output_h = origin_input_dim_sizes[2];
|
||||
int64_t output_w = origin_input_dim_sizes[3];
|
||||
|
||||
auto data_type = out_backprop->GetDataType();
|
||||
switch (data_type) {
|
||||
FRACTIONALMAXPOOLGRADWITHFIXEDKSIZE_COMPUTE_CASE(DT_FLOAT16, Eigen::half, out_backprop, argmax, data_nums, n_size,
|
||||
c_size, input_h, input_w, output_h, output_w, ctx)
|
||||
FRACTIONALMAXPOOLGRADWITHFIXEDKSIZE_COMPUTE_CASE(DT_FLOAT, float, out_backprop, argmax, data_nums, n_size, c_size,
|
||||
input_h, input_w, output_h, output_w, ctx)
|
||||
FRACTIONALMAXPOOLGRADWITHFIXEDKSIZE_COMPUTE_CASE(DT_DOUBLE, double, out_backprop, argmax, data_nums, n_size, c_size,
|
||||
input_h, input_w, output_h, output_w, ctx)
|
||||
FRACTIONALMAXPOOLGRADWITHFIXEDKSIZE_COMPUTE_CASE(DT_INT32, int32_t, out_backprop, argmax, data_nums, n_size, c_size,
|
||||
input_h, input_w, output_h, output_w, ctx)
|
||||
FRACTIONALMAXPOOLGRADWITHFIXEDKSIZE_COMPUTE_CASE(DT_INT64, int64_t, out_backprop, argmax, data_nums, n_size, c_size,
|
||||
input_h, input_w, output_h, output_w, ctx)
|
||||
default:
|
||||
KERNEL_LOG_ERROR(
|
||||
"FractionalMaxPoolGradWithFixedKsize kernel input[out_backprop] type "
|
||||
"[%s] not support.",
|
||||
DTypeStr(data_type).c_str());
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
uint32_t FractionalMaxPoolGradWithFixedKsize::FractionalMaxPoolGradWithFixedKsizeCompute(
|
||||
Tensor *out_backprop, Tensor *argmax, const int64_t data_nums, const int n_size, const int c_size, const int input_h,
|
||||
const int input_w, const int output_h, const int output_w, CpuKernelContext &ctx) {
|
||||
T *out_backprop_addr = reinterpret_cast<T *>(out_backprop->GetData());
|
||||
int64_t *argmax_addr = reinterpret_cast<int64_t *>(argmax->GetData());
|
||||
|
||||
Tensor *y = ctx.Output(0);
|
||||
T *y_addr = reinterpret_cast<T *>(y->GetData());
|
||||
|
||||
if (data_nums < kParallelDataNums || n_size == 1) {
|
||||
for (int n = 0; n < n_size; n++) {
|
||||
T *out_backprop_single_batch_addr = out_backprop_addr + n * c_size * input_h * input_w;
|
||||
int64_t *argmax_single_batch_addr = argmax_addr + n * c_size * input_h * input_w;
|
||||
T *y_single_batch_addr = y_addr + n * c_size * output_h * output_w;
|
||||
|
||||
ComputeSingleBatch<T>(out_backprop_single_batch_addr, argmax_single_batch_addr, y_single_batch_addr, c_size,
|
||||
input_h, input_w, output_h, output_w);
|
||||
}
|
||||
} else {
|
||||
uint32_t min_core_num = 1;
|
||||
uint32_t max_core_num = std::max(min_core_num, aicpu::CpuKernelUtils::GetCPUNum(ctx));
|
||||
if (max_core_num > (uint32_t)n_size) {
|
||||
max_core_num = n_size;
|
||||
}
|
||||
auto shared_computeN = [&](size_t start, size_t end) {
|
||||
for (size_t n = start; n < end; n++) {
|
||||
T *out_backprop_single_batch_addr = out_backprop_addr + n * c_size * input_h * input_w;
|
||||
int64_t *argmax_single_batch_addr = argmax_addr + n * c_size * input_h * input_w;
|
||||
T *y_single_batch_addr = y_addr + n * c_size * output_h * output_w;
|
||||
|
||||
ComputeSingleBatch<T>(out_backprop_single_batch_addr, argmax_single_batch_addr, y_single_batch_addr, c_size,
|
||||
input_h, input_w, output_h, output_w);
|
||||
}
|
||||
};
|
||||
uint32_t ret = CpuKernelUtils::ParallelFor(ctx, n_size, n_size / max_core_num, shared_computeN);
|
||||
if (ret != KERNEL_STATUS_OK) {
|
||||
KERNEL_LOG_ERROR("CpuKernelUtils::ParallelFor shared_computeN failed.");
|
||||
return KERNEL_STATUS_INNER_ERROR;
|
||||
}
|
||||
}
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
uint32_t FractionalMaxPoolGradWithFixedKsize::ComputeSingleBatch(T *out_backprop_single_batch_addr,
|
||||
int64_t *argmax_single_batch_addr,
|
||||
T *y_single_batch_addr, const int c_size,
|
||||
const int input_h, const int input_w,
|
||||
const int output_h, const int output_w) {
|
||||
for (int plane = 0; plane < c_size; plane++) {
|
||||
T *out_backprop_plane_addr = out_backprop_single_batch_addr + plane * input_h * input_w;
|
||||
int64_t *argmax_plane_addr = argmax_single_batch_addr + plane * input_h * input_w;
|
||||
T *y_plane_addr = y_single_batch_addr + plane * output_h * output_w;
|
||||
|
||||
for (int i = 0; i < output_h; i++) {
|
||||
for (int j = 0; j < output_w; j++) {
|
||||
y_plane_addr[i * output_w + j] = static_cast<T>(0);
|
||||
}
|
||||
}
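// In the loop below, argmax_plane_addr[h * input_w + w] is treated as a flat index into the
// output_h * output_w plane of y, so each out_backprop value is scatter-added onto the position
// of its pooling maximum.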
|
||||
|
||||
for (int h = 0; h < input_h; h++) {
|
||||
for (int w = 0; w < input_w; w++) {
|
||||
int input_index = h * input_w + w;
|
||||
KERNEL_CHECK_FALSE((input_index >= 0 && input_index < input_h * input_w), KERNEL_STATUS_PARAM_INVALID,
|
||||
"The input_index[%d] out of the length of argmax.", input_index);
|
||||
int output_index = argmax_plane_addr[input_index];
|
||||
KERNEL_CHECK_FALSE((output_index >= 0 && output_index < output_h * output_w), KERNEL_STATUS_PARAM_INVALID,
|
||||
"The output_index[%d] out of the length of y.", output_index);
|
||||
|
||||
y_plane_addr[output_index] += out_backprop_plane_addr[input_index];
|
||||
}
|
||||
}
|
||||
}
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
|
||||
REGISTER_CPU_KERNEL(kFractionalMaxPoolGradWithFixedKsize, FractionalMaxPoolGradWithFixedKsize);
|
||||
} // namespace aicpu
@@ -0,0 +1,42 @@
/**
|
||||
* Copyright 2021 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
#ifndef AICPU_KERNELS_NORMALIZED_FRACTIONAL_MAX_POOL_GRAD_WITH_FIXED_KSIZE_H_
|
||||
#define AICPU_KERNELS_NORMALIZED_FRACTIONAL_MAX_POOL_GRAD_WITH_FIXED_KSIZE_H_
|
||||
|
||||
#include <vector>
|
||||
#include "cpu_ops_kernel.h"
|
||||
#include "cpu_types.h"
|
||||
|
||||
namespace aicpu {
|
||||
class FractionalMaxPoolGradWithFixedKsize : public CpuKernel {
|
||||
public:
|
||||
FractionalMaxPoolGradWithFixedKsize() = default;
|
||||
~FractionalMaxPoolGradWithFixedKsize() override = default;
|
||||
uint32_t Compute(CpuKernelContext &ctx) override;
|
||||
|
||||
private:
|
||||
template <typename T>
|
||||
uint32_t FractionalMaxPoolGradWithFixedKsizeCompute(Tensor *out_backprop, Tensor *argmax, const int64_t data_nums,
|
||||
const int n_size, const int c_size, const int input_h,
|
||||
const int input_w, const int output_h, const int output_w,
|
||||
CpuKernelContext &ctx);
|
||||
template <typename T>
|
||||
uint32_t ComputeSingleBatch(T *out_backprop_single_batch_addr, int64_t *argmax_single_batch_addr,
|
||||
T *y_single_batch_addr, const int c_size, const int input_h, const int input_w,
|
||||
const int output_h, const int output_w);
|
||||
};
|
||||
} // namespace aicpu
|
||||
#endif
@@ -0,0 +1,160 @@
/**
|
||||
* Copyright (c) Huawei Technologies Co., Ltd. 2022. All rights reserved.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include "gcd.h"
|
||||
|
||||
#include <set>
|
||||
|
||||
#include "cpu_kernel_utils.h"
|
||||
#include "utils/eigen_tensor.h"
|
||||
#include "utils/kernel_util.h"
|
||||
|
||||
namespace {
|
||||
const uint32_t kGcdOutputNum = 1;
|
||||
const uint32_t kGcdInputNum = 2;
|
||||
const char *kGcd = "Gcd";
|
||||
// when input data size is more than kParallelDataNum, use Parallel func
|
||||
const int64_t kParallelDataNum = 2 * 1024;
|
||||
const int64_t kParallelDataNumMid = 16 * 1024;
|
||||
const int32_t kInput_32_32 = 3;
|
||||
const int32_t kInput_32_64 = 2;
|
||||
const int32_t kInput_64_32 = 1;
|
||||
const int32_t kInput_64_64 = 0;
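// The kInput_* values encode (x1 is int32) << 1 | (x2 is int32); e.g. an int32 x1 with an int64 x2
// yields 0b10 == kInput_32_64, which is why the mixed cases below promote the output to int64.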
|
||||
} // namespace
|
||||
|
||||
namespace aicpu {
|
||||
// Simple recursive Gcd.
|
||||
template <class T>
|
||||
T elewise_gcd(T a, T b) {
|
||||
if (b == 0) {
|
||||
return a;
|
||||
}
|
||||
return elewise_gcd(b, a % b);
|
||||
}
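// Worked example (illustrative): elewise_gcd(18, 12) recurses as (18, 12) -> (12, 6) -> (6, 0) and
// returns 6; the callers below take std::abs of both operands first, so negative inputs reduce to
// the same case.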
|
||||
|
||||
uint32_t GcdIOTypeCheck(CpuKernelContext &ctx, int32_t &dual_types) {
|
||||
Tensor *x1 = ctx.Input(kFirstInputIndex);
|
||||
Tensor *x2 = ctx.Input(kSecondInputIndex);
|
||||
Tensor *y = ctx.Output(kFirstOutputIndex);
|
||||
const std::set<DataType> supported_types{DT_INT32, DT_INT64};
|
||||
auto x1_type = x1->GetDataType();
|
||||
auto x2_type = x2->GetDataType();
|
||||
auto y_type = y->GetDataType();
|
||||
KERNEL_CHECK_FALSE(supported_types.count(x1_type) != 0, KERNEL_STATUS_PARAM_INVALID,
|
||||
"[Gcd] input x1 data type [%s] is not supported.", DTypeStr(x1_type).c_str());
|
||||
KERNEL_CHECK_FALSE(supported_types.count(x2_type) != 0, KERNEL_STATUS_PARAM_INVALID,
|
||||
"[Gcd] input x2 data type [%s] is not supported.", DTypeStr(x2_type).c_str());
|
||||
int32_t x1_is_i32 = static_cast<int32_t>(x1_type == DT_INT32) << 1;
|
||||
int32_t x2_is_i32 = static_cast<int32_t>(x2_type == DT_INT32);
|
||||
int32_t _dual_types = x1_is_i32 | x2_is_i32;
|
||||
switch (_dual_types) {
|
||||
case kInput_64_64:
|
||||
case kInput_64_32:
|
||||
case kInput_32_64:
|
||||
KERNEL_CHECK_FALSE(y_type == DT_INT64, KERNEL_STATUS_PARAM_INVALID,
|
||||
"[Gcd] output y data type [%s] is not supported.", DTypeStr(y_type).c_str());
|
||||
dual_types = _dual_types;
|
||||
break;
|
||||
case kInput_32_32:
|
||||
KERNEL_CHECK_FALSE(y_type == DT_INT32, KERNEL_STATUS_PARAM_INVALID,
|
||||
"[Gcd] output y data type [%s] is not supported.", DTypeStr(y_type).c_str());
|
||||
dual_types = _dual_types;
|
||||
break;
|
||||
default:
|
||||
KERNEL_LOG_ERROR("[Gcd] input data type tuple is not supported.");
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
template <class T1, class T2, class T3>
|
||||
uint32_t GcdElewiseCompute(CpuKernelContext &ctx, const T1 *x1_ptr, const T2 *x2_ptr, T3 *y_ptr, Bcast &bcast) {
|
||||
int64_t data_num = ctx.Output(kFirstOutputIndex)->NumElements();
|
||||
auto gcd_shard = [&](int64_t start, int64_t end) {
|
||||
for (int64_t i = start; i < end; ++i) {
|
||||
T3 x1_ele_abs = std::abs(static_cast<T3>(x1_ptr[bcast.GetBroadcastXIndex(i)]));
|
||||
T3 x2_ele_abs = std::abs(static_cast<T3>(x2_ptr[bcast.GetBroadcastYIndex(i)]));
|
||||
y_ptr[i] = elewise_gcd(x1_ele_abs, x2_ele_abs);
|
||||
}
|
||||
};
|
||||
if (data_num >= kParallelDataNum) {
|
||||
uint32_t min_core_num = 1;
|
||||
uint32_t max_core_num = std::max(min_core_num, aicpu::CpuKernelUtils::GetCPUNum(ctx) - kResvCpuNum);
|
||||
if (data_num <= kParallelDataNumMid) {
|
||||
max_core_num = std::min(max_core_num, 4U); // up to 4 cpu cores
|
||||
}
|
||||
if (max_core_num > data_num) {
|
||||
max_core_num = data_num;
|
||||
}
|
||||
if (max_core_num == 0) {
|
||||
KERNEL_LOG_ERROR("[Gcd] max_core_num is 0, please check the cpu num.");
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
uint32_t ret = CpuKernelUtils::ParallelFor(ctx, data_num, data_num / max_core_num, gcd_shard);
|
||||
if (ret != KERNEL_STATUS_OK) {
|
||||
KERNEL_LOG_ERROR("[Gcd] Gcd Compute failed.");
|
||||
return ret;
|
||||
}
|
||||
} else {
|
||||
gcd_shard(0, data_num);
|
||||
}
|
||||
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
template <class T1, class T2, class T3>
|
||||
uint32_t GcdCompute(CpuKernelContext &ctx) {
|
||||
Tensor *x1 = ctx.Input(kFirstInputIndex);
|
||||
Tensor *x2 = ctx.Input(kSecondInputIndex);
|
||||
Tensor *y = ctx.Output(kFirstOutputIndex);
|
||||
const T1 *x1_ptr = reinterpret_cast<const T1 *>(x1->GetData());
|
||||
const T2 *x2_ptr = reinterpret_cast<const T2 *>(x2->GetData());
|
||||
T3 *y_ptr = reinterpret_cast<T3 *>(y->GetData());
|
||||
auto x1_shape = x1->GetTensorShape()->GetDimSizes();
|
||||
auto x2_shape = x2->GetTensorShape()->GetDimSizes();
|
||||
Bcast bcast(x1_shape, x2_shape);
|
||||
if (bcast.IsValid()) {
|
||||
return GcdElewiseCompute<T1, T2, T3>(ctx, x1_ptr, x2_ptr, y_ptr, bcast);
|
||||
} else {
|
||||
KERNEL_LOG_ERROR("[Gcd] broadcast failed.");
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
}
|
||||
uint32_t GcdCpuKernel::Compute(CpuKernelContext &ctx) {
|
||||
// check params
|
||||
KERNEL_HANDLE_ERROR(NormalCheck(ctx, kGcdInputNum, kGcdOutputNum), "[Gcd] check input and output number failed.");
|
||||
int32_t dual_types = static_cast<int32_t>(-1);
|
||||
KERNEL_HANDLE_ERROR(GcdIOTypeCheck(ctx, dual_types), "[Gcd] check data type failed.");
|
||||
switch (dual_types) {
|
||||
case kInput_64_64:
|
||||
return GcdCompute<int64_t, int64_t, int64_t>(ctx);
|
||||
break;
|
||||
case kInput_64_32:
|
||||
return GcdCompute<int64_t, int32_t, int64_t>(ctx);
|
||||
break;
|
||||
case kInput_32_64:
|
||||
return GcdCompute<int32_t, int64_t, int64_t>(ctx);
|
||||
break;
|
||||
case kInput_32_32:
|
||||
return GcdCompute<int32_t, int32_t, int32_t>(ctx);
|
||||
break;
|
||||
default:
|
||||
KERNEL_LOG_ERROR("[Gcd] input data type tuple is not supported.");
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
|
||||
REGISTER_CPU_KERNEL(kGcd, GcdCpuKernel);
|
||||
} // namespace aicpu
@@ -0,0 +1,32 @@
/**
|
||||
* Copyright (c) Huawei Technologies Co., Ltd. 2022. All rights reserved.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#ifndef AICPU_KERNELS_NORMALIZED_GCD_H_
|
||||
#define AICPU_KERNELS_NORMALIZED_GCD_H_
|
||||
|
||||
#include "cpu_ops_kernel.h"
|
||||
#include "utils/bcast.h"
|
||||
|
||||
namespace aicpu {
|
||||
class GcdCpuKernel : public CpuKernel {
|
||||
public:
|
||||
~GcdCpuKernel() override = default;
|
||||
|
||||
protected:
|
||||
uint32_t Compute(CpuKernelContext &ctx) override;
|
||||
};
|
||||
} // namespace aicpu
|
||||
#endif
@@ -0,0 +1,268 @@
/**
|
||||
* Copyright 2021 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
#include "geqrf.h"
|
||||
#include <cmath>
|
||||
#include <complex>
|
||||
#include "cpu_kernel_utils.h"
|
||||
#include "kernel_log.h"
|
||||
#include "status.h"
|
||||
#include "utils/eigen_tensor.h"
|
||||
#include "utils/kernel_util.h"
|
||||
|
||||
using namespace std;
|
||||
|
||||
namespace {
|
||||
const char *kGeqrf = "Geqrf";
|
||||
const uint32_t kInputNum = 1;
|
||||
const uint32_t kOutputNum = 2;
|
||||
} // namespace
|
||||
|
||||
namespace aicpu {
|
||||
uint32_t GeqrfCpuKernel::Compute(CpuKernelContext &ctx) {
|
||||
if (NormalCheck(ctx, kInputNum, kOutputNum) != KERNEL_STATUS_OK) {
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
DataType input0_data_type = ctx.Input(0)->GetDataType();
|
||||
uint32_t ret = KERNEL_STATUS_PARAM_INVALID;
|
||||
switch (input0_data_type) {
|
||||
case DT_FLOAT16:
|
||||
ret = DoCompute<Eigen::half>(ctx);
|
||||
break;
|
||||
case DT_FLOAT:
|
||||
ret = DoCompute<float>(ctx);
|
||||
break;
|
||||
case DT_DOUBLE:
|
||||
ret = DoCompute<double>(ctx);
|
||||
break;
|
||||
case DT_COMPLEX64:
|
||||
ret = DoComputeC<float>(ctx);
|
||||
break;
|
||||
case DT_COMPLEX128:
|
||||
ret = DoComputeC<double>(ctx);
|
||||
break;
|
||||
default:
|
||||
KERNEL_LOG_ERROR("Unsupported input data type[%s]", DTypeStr(input0_data_type).c_str());
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
void GeqrfCpuKernel::Larfg(int n, int vm, int vn, T **A, T *tau) {
|
||||
T zero = static_cast<T>(0);
|
||||
if (n <= 1) {
|
||||
*tau = zero;
|
||||
return;
|
||||
}
|
||||
T xnorm = zero;
|
||||
for (int i = vm + 1; i < vm + n; i++) {
|
||||
xnorm = xnorm + A[i][vn] * A[i][vn];
|
||||
}
|
||||
xnorm = sqrt(xnorm);
|
||||
if (xnorm == zero) {
|
||||
*tau = zero;
|
||||
return;
|
||||
} else {
|
||||
T beta = sqrt(A[vm][vn] * A[vm][vn] + xnorm * xnorm);
|
||||
if (A[vm][vn] > zero) {
|
||||
beta = -beta;
|
||||
}
|
||||
*tau = (beta - (A[vm][vn])) / beta;
|
||||
auto scal = (A[vm][vn]) - beta;
|
||||
for (int i = vm + 1; i < vm + n; i++) {
|
||||
A[i][vn] /= scal;
|
||||
}
|
||||
A[vm][vn] = beta;
|
||||
}
|
||||
}
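// The reflector above follows the usual Householder construction (a sketch of the math, not extra
// code): beta = ±sqrt(a11^2 + ||x||^2) with the sign chosen opposite to a11, tau = (beta - a11) / beta,
// and the trailing entries of the column are scaled by 1 / (a11 - beta), where a11 = A[vm][vn] on entry.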
|
||||
|
||||
template <typename T>
|
||||
void GeqrfCpuKernel::Larf(int m, int n, T **A, T *tau, int cm, int cn) {
|
||||
if (m <= 0 || n <= 0) {
|
||||
return;
|
||||
}
|
||||
T *work = new T[n]();
|
||||
for (int i = 0; i < m; i++) {
|
||||
for (int j = 0; j < n; j++) {
|
||||
work[j] += A[cm + i][cn - 1] * A[cm + i][cn + j];
|
||||
}
|
||||
}
|
||||
|
||||
for (int i = 0; i < m; i++) {
|
||||
for (int j = 0; j < n; j++) {
|
||||
A[i + cm][j + cn] -= (*tau) * A[cm + i][cn - 1] * work[j];
|
||||
}
|
||||
}
|
||||
delete[] work;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
void GeqrfCpuKernel::Geqrf(int m, int n, T **A, T *tau) {
|
||||
if (m < 0 || n < 0) {
|
||||
return;
|
||||
}
|
||||
int k = std::min(m, n);
|
||||
T one = static_cast<T>(1);
|
||||
for (int i = 0; i < k; i++) {
|
||||
Larfg<T>(m - i, i, i, A, tau + i);
|
||||
T aii = A[i][i];
|
||||
A[i][i] = one;
|
||||
Larf<T>(m - i, n - i - 1, A, tau + i, i, i + 1);
|
||||
A[i][i] = aii;
|
||||
}
|
||||
}
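// Illustrative walk-through (assumed shapes, not from the kernel): for a 3 x 2 matrix the loop runs
// k = 2 times; iteration i builds the Householder reflector for column i via Larfg, temporarily sets
// A[i][i] = 1 so the reflector's implicit leading 1 is in place, applies it to the trailing
// n - i - 1 columns with Larf, and then restores the diagonal entry.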
|
||||
|
||||
template <typename T>
|
||||
void GeqrfCpuKernel::CLarfg(int n, int vm, int vn, complex<T> **A, complex<T> *tau) {
|
||||
complex<T> one = complex<T>(1, 0);
|
||||
complex<T> zero = complex<T>(0, 0);
|
||||
if (n <= 0) {
|
||||
*tau = zero;
|
||||
return;
|
||||
}
|
||||
T xnorm = 0;
|
||||
for (int i = vm + 1; i < vm + n; i++) {
|
||||
xnorm = xnorm + norm(A[i][vn]);
|
||||
}
|
||||
xnorm = sqrt(xnorm);
|
||||
T alphr = A[vm][vn].real();
|
||||
T alphi = A[vm][vn].imag();
|
||||
if (xnorm == 0 && alphi == 0) {
|
||||
*tau = zero;
|
||||
} else {
|
||||
T beta;
|
||||
beta = sqrt(alphr * alphr + alphi * alphi + xnorm * xnorm);
|
||||
if (A[vm][vn].real() > 0) {
|
||||
beta = -beta;
|
||||
}
|
||||
*tau = complex<T>((beta - alphr) / beta, -alphi / beta);
|
||||
A[vm][vn] = one / (A[vm][vn] - beta);
|
||||
for (int i = vm + 1; i < vm + n; i++) {
|
||||
A[i][vn] *= A[vm][vn];
|
||||
}
|
||||
A[vm][vn] = beta;
|
||||
}
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
void GeqrfCpuKernel::CLarf(int m, int n, complex<T> **A, complex<T> *tau, int cm, int cn) {
|
||||
if (m <= 0 || n <= 0) {
|
||||
return;
|
||||
}
|
||||
complex<T> zero = complex<T>(0, 0);
|
||||
complex<T> *work = new complex<T>[n];
|
||||
complex<T> temp = zero;
|
||||
for (int j = 0; j < n; j++) {
|
||||
for (int i = 0; i < m; i++) {
|
||||
temp = temp + conj(A[i + cm][j + cn]) * A[cm + i][cn - 1];
|
||||
}
|
||||
work[j] = temp;
|
||||
temp = zero;
|
||||
}
|
||||
for (int j = 0; j < n; j++) {
|
||||
for (int i = 0; i < m; i++) {
|
||||
A[i + cm][j + cn] = A[i + cm][j + cn] - conj(*tau) * A[cm + i][cn - 1] * conj(work[j]);
|
||||
}
|
||||
}
|
||||
delete[] work;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
void GeqrfCpuKernel::CGeqrf(int m, int n, complex<T> **A, complex<T> *tau) {
|
||||
if (m < 0 || n < 0) {
|
||||
return;
|
||||
}
|
||||
int k = std::min(m, n);
|
||||
complex<T> one = complex<T>(1, 0);
|
||||
complex<T> aii;
|
||||
for (int i = 0; i < k; i++) {
|
||||
CLarfg<T>(m - i, i, i, A, (tau + i));
|
||||
aii = A[i][i];
|
||||
A[i][i] = one;
|
||||
CLarf<T>(m - i, n - i - 1, A, tau + i, i, i + 1);
|
||||
A[i][i] = aii;
|
||||
}
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
uint32_t GeqrfCpuKernel::DoCompute(CpuKernelContext &ctx) {
|
||||
auto input0_tensor = ctx.Input(0);
|
||||
auto input0_tensor_shape = input0_tensor->GetTensorShape();
|
||||
int32_t dim = input0_tensor_shape->GetDims();
|
||||
if (dim != kOutputNum) {
|
||||
KERNEL_LOG_ERROR("The input matrix must have dimension = 2");
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
std::vector<int64_t> input0_dims = input0_tensor_shape->GetDimSizes();
|
||||
const int32_t m = input0_dims[0];
|
||||
const int32_t n = input0_dims[1];
|
||||
auto input_m = reinterpret_cast<T *>(ctx.Input(0)->GetData());
|
||||
auto output_r = reinterpret_cast<T *>(ctx.Output(0)->GetData());
|
||||
auto output_tau = reinterpret_cast<T *>(ctx.Output(1)->GetData());
|
||||
|
||||
T **A = new T *[m];
|
||||
for (int i = 0; i < m; i++) {
|
||||
A[i] = new T[n];
|
||||
}
|
||||
for (int i = 0; i < m; i++) {
|
||||
for (int j = 0; j < n; j++) {
|
||||
A[i][j] = *(input_m + i * n + j);
|
||||
}
|
||||
}
|
||||
Geqrf<T>(m, n, A, output_tau);
|
||||
for (int i = 0; i < m; i++) {
|
||||
for (int j = 0; j < n; j++) {
|
||||
*(output_r + i * n + j) = A[i][j];
|
||||
}
|
||||
}
for (int i = 0; i < m; i++) {
delete[] A[i];
}
delete[] A;
return KERNEL_STATUS_OK;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
uint32_t GeqrfCpuKernel::DoComputeC(CpuKernelContext &ctx) {
|
||||
auto input0_tensor = ctx.Input(0);
|
||||
auto input0_tensor_shape = input0_tensor->GetTensorShape();
|
||||
int32_t dim = input0_tensor_shape->GetDims();
|
||||
if (dim != kOutputNum) {
|
||||
KERNEL_LOG_ERROR("The input matrix must have dimension = 2");
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
std::vector<int64_t> input0_dims = input0_tensor_shape->GetDimSizes();
|
||||
const int32_t m = input0_dims[0];
|
||||
const int32_t n = input0_dims[1];
|
||||
auto input_m = reinterpret_cast<complex<T> *>(ctx.Input(0)->GetData());
|
||||
auto output_r = reinterpret_cast<complex<T> *>(ctx.Output(0)->GetData());
|
||||
auto output_tau = reinterpret_cast<complex<T> *>(ctx.Output(1)->GetData());
|
||||
|
||||
complex<T> **A = new complex<T> *[m];
|
||||
for (int i = 0; i < m; i++) {
|
||||
A[i] = new complex<T>[n];
|
||||
}
|
||||
for (int i = 0; i < m; i++) {
|
||||
for (int j = 0; j < n; j++) {
|
||||
A[i][j] = *(input_m + i * n + j);
|
||||
}
|
||||
}
|
||||
CGeqrf<T>(m, n, A, output_tau);
|
||||
for (int i = 0; i < m; i++) {
|
||||
for (int j = 0; j < n; j++) {
|
||||
*(output_r + i * n + j) = A[i][j];
|
||||
}
|
||||
}
for (int i = 0; i < m; i++) {
delete[] A[i];
}
delete[] A;
return KERNEL_STATUS_OK;
|
||||
}
|
||||
REGISTER_CPU_KERNEL(kGeqrf, GeqrfCpuKernel);
|
||||
} // namespace aicpu
@@ -0,0 +1,55 @@
/**
|
||||
* Copyright 2021 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
#ifndef AICPU_KERNELS_NORMALIZED_GEQRF_H_
|
||||
#define AICPU_KERNELS_NORMALIZED_GEQRF_H_
|
||||
|
||||
#include <complex>
|
||||
#include "cpu_ops_kernel.h"
|
||||
|
||||
namespace aicpu {
|
||||
class GeqrfCpuKernel : public CpuKernel {
|
||||
public:
|
||||
GeqrfCpuKernel() = default;
|
||||
~GeqrfCpuKernel() = default;
|
||||
uint32_t Compute(CpuKernelContext &ctx) override;
|
||||
|
||||
private:
|
||||
template <typename T>
|
||||
void Larfg(int n, int vm, int vn, T **A, T *tau);
|
||||
|
||||
template <typename T>
|
||||
void Larf(int m, int n, T **A, T *tau, int cm, int cn);
|
||||
|
||||
template <typename T>
|
||||
void Geqrf(int m, int n, T **A, T *tau);
|
||||
|
||||
template <typename T>
|
||||
void CLarfg(int n, int vm, int vn, std::complex<T> **A, std::complex<T> *tau);
|
||||
|
||||
template <typename T>
|
||||
void CLarf(int m, int n, std::complex<T> **A, std::complex<T> *tau, int cm, int cn);
|
||||
|
||||
template <typename T>
|
||||
void CGeqrf(int m, int n, std::complex<T> **A, std::complex<T> *tau);
|
||||
|
||||
template <typename T>
|
||||
uint32_t DoCompute(CpuKernelContext &ctx);
|
||||
|
||||
template <typename T>
|
||||
uint32_t DoComputeC(CpuKernelContext &ctx);
|
||||
};
|
||||
} // namespace aicpu
|
||||
#endif // AICPU_KERNELS_NORMALIZED_GEQRF_H_
@@ -0,0 +1,89 @@
/**
|
||||
* Copyright (c) Huawei Technologies Co., Ltd. 2022. All right reserved.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
#include "hard_sigmoid.h"
|
||||
|
||||
#include <algorithm>
|
||||
|
||||
#include "cpu_kernel_utils.h"
|
||||
#include "utils/eigen_tensor.h"
|
||||
#include "utils/kernel_util.h"
|
||||
|
||||
namespace {
|
||||
const uint32_t kOutputNum = 1;
|
||||
const uint32_t kInputNum = 1;
|
||||
const char *const kHardSigmoid = "HardSigmoid";
|
||||
const int64_t kParallelDataNums = 16 * 1024;
|
||||
const float alpha = 0.16666666;
|
||||
const float beta = 0.5;
|
||||
|
||||
#define HARD_SIGMOID_COMPUTE_CASE(DTYPE, TYPE, CTX) \
|
||||
case (DTYPE): { \
|
||||
uint32_t result = HardSigmoidCompute<TYPE>(CTX); \
|
||||
if (result != KERNEL_STATUS_OK) { \
|
||||
KERNEL_LOG_ERROR("HardSigmoid kernel compute failed."); \
|
||||
return result; \
|
||||
} \
|
||||
break; \
|
||||
}
|
||||
} // namespace
|
||||
|
||||
namespace aicpu {
|
||||
uint32_t HardSigmoidCpuKernel::Compute(CpuKernelContext &ctx) {
|
||||
KERNEL_HANDLE_ERROR(NormalCheck(ctx, kInputNum, kOutputNum), "[%s] check input and output failed.", kHardSigmoid);
|
||||
DataType data_type = ctx.Input(0)->GetDataType();
|
||||
switch (data_type) {
|
||||
HARD_SIGMOID_COMPUTE_CASE(DT_FLOAT16, Eigen::half, ctx)
|
||||
HARD_SIGMOID_COMPUTE_CASE(DT_FLOAT, float, ctx)
|
||||
HARD_SIGMOID_COMPUTE_CASE(DT_DOUBLE, double, ctx)
|
||||
default:
|
||||
KERNEL_LOG_ERROR("HardSigmoid kernel data type [%s] not support.", DTypeStr(data_type).c_str());
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
uint32_t HardSigmoidCpuKernel::HardSigmoidCompute(const CpuKernelContext &ctx) {
|
||||
auto input_x = reinterpret_cast<T *>(ctx.Input(0)->GetData());
|
||||
auto output_y = reinterpret_cast<T *>(ctx.Output(0)->GetData());
|
||||
int64_t data_num = ctx.Input(0)->NumElements();
|
||||
int64_t data_size = data_num * static_cast<int64_t>(sizeof(T));
|
||||
const T zero = static_cast<T>(0);
|
||||
const T three = static_cast<T>(3);
|
||||
const T six = static_cast<T>(6);
|
||||
if (data_size <= kParallelDataNums) {
|
||||
for (int64_t i = 0; i < data_num; i++) {
|
||||
*(output_y + i) = std::min(std::max(*(input_x + i) + three, zero), six) / six;
|
||||
}
|
||||
} else {
|
||||
uint32_t min_core_num = 1;
|
||||
int64_t max_core_num = std::max(min_core_num, aicpu::CpuKernelUtils::GetCPUNum(ctx) - kResvCpuNum);
|
||||
if (max_core_num > data_num) {
|
||||
max_core_num = data_num;
|
||||
}
|
||||
int64_t perUnitSize = max_core_num > 0 ? data_num / max_core_num : data_num;
|
||||
auto shard_hard_sigmoid = [&](size_t start, size_t end) {
|
||||
for (size_t i = start; i < end; i++) {
|
||||
*(output_y + i) = std::min(std::max(*(input_x + i) + three, zero), six) / six;
|
||||
}
|
||||
};
|
||||
KERNEL_HANDLE_ERROR(CpuKernelUtils::ParallelFor(ctx, data_num, perUnitSize, shard_hard_sigmoid),
|
||||
"HardSigmoid Compute failed.");
|
||||
}
|
||||
return KERNEL_STATUS_OK;
|
||||
}
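// Spot check of the formula above (illustrative values): hard_sigmoid(x) = min(max(x + 3, 0), 6) / 6,
// which is clamp(x / 6 + 0.5, 0, 1); e.g. x = 1.5 gives 4.5 / 6 = 0.75, x = -4 gives 0, x = 9 gives 1.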
|
||||
REGISTER_CPU_KERNEL(kHardSigmoid, HardSigmoidCpuKernel);
|
||||
} // namespace aicpu
@@ -0,0 +1,35 @@
/**
|
||||
* Copyright (c) Huawei Technologies Co., Ltd. 2022. All right reserved.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
#ifndef AICPU_KERNELS_NORMALIZED_HARD_SIGMOID_H
|
||||
#define AICPU_KERNELS_NORMALIZED_HARD_SIGMOID_H
|
||||
|
||||
#include "cpu_ops_kernel.h"
|
||||
#include "utils/bcast.h"
|
||||
|
||||
namespace aicpu {
|
||||
class HardSigmoidCpuKernel : public CpuKernel {
|
||||
public:
|
||||
HardSigmoidCpuKernel() = default;
|
||||
~HardSigmoidCpuKernel() override = default;
|
||||
|
||||
uint32_t Compute(CpuKernelContext &ctx) override;
|
||||
|
||||
private:
|
||||
template <typename T>
|
||||
uint32_t HardSigmoidCompute(const CpuKernelContext &ctx);
|
||||
};
|
||||
} // namespace aicpu
|
||||
#endif
@@ -0,0 +1,95 @@
/**
|
||||
* Copyright (c) Huawei Technologies Co., Ltd. 2022. All right reserved.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
#include "hard_sigmoid_grad.h"
|
||||
|
||||
#include "cpu_kernel_utils.h"
|
||||
#include "utils/eigen_tensor.h"
|
||||
#include "utils/kernel_util.h"
|
||||
|
||||
namespace {
|
||||
const uint32_t kOutputNum = 1;
|
||||
const uint32_t kInputNum = 2;
|
||||
const char *const kHardSigmoidGrad = "HardSigmoidGrad";
|
||||
const int64_t kParallelDataNums = 16 * 1024;
|
||||
|
||||
#define HARD_SIGMOID_GRAD_COMPUTE_CASE(DTYPE1, TYPE1, TYPE2, CTX) \
|
||||
case (DTYPE1): { \
|
||||
uint32_t result = HardSigmoidGradCompute<TYPE1, TYPE2>(CTX); \
|
||||
if (result != KERNEL_STATUS_OK) { \
|
||||
KERNEL_LOG_ERROR("HardSigmoidGrad kernel compute failed."); \
|
||||
return result; \
|
||||
} \
|
||||
break; \
|
||||
}
|
||||
} // namespace
|
||||
|
||||
namespace aicpu {
|
||||
uint32_t HardSigmoidGradCpuKernel::Compute(CpuKernelContext &ctx) {
|
||||
KERNEL_HANDLE_ERROR(NormalCheck(ctx, kInputNum, kOutputNum), "[%s] check input and output failed.", kHardSigmoidGrad);
|
||||
DataType grads_type = ctx.Input(0)->GetDataType();
|
||||
DataType x_type = ctx.Input(1)->GetDataType();
|
||||
if (grads_type != x_type) {
|
||||
KERNEL_LOG_ERROR("HardSigmoidGrad kernel input[0] data type [%s] must be the same as input[1] data type [%s].",
|
||||
DTypeStr(grads_type).c_str(), DTypeStr(x_type).c_str());
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
switch (grads_type) {
|
||||
HARD_SIGMOID_GRAD_COMPUTE_CASE(DT_FLOAT16, Eigen::half, Eigen::half, ctx)
|
||||
HARD_SIGMOID_GRAD_COMPUTE_CASE(DT_FLOAT, float, float, ctx)
|
||||
HARD_SIGMOID_GRAD_COMPUTE_CASE(DT_DOUBLE, double, double, ctx)
|
||||
default:
|
||||
KERNEL_LOG_ERROR("HardSigmoidGrad kernel inputs data type [%s] not support.", DTypeStr(grads_type).c_str());
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
|
||||
template <typename T1, typename T2>
|
||||
uint32_t HardSigmoidGradCpuKernel::HardSigmoidGradCompute(const CpuKernelContext &ctx) {
|
||||
auto grads = reinterpret_cast<T1 *>(ctx.Input(0)->GetData());
|
||||
auto input_x = reinterpret_cast<T2 *>(ctx.Input(1)->GetData());
|
||||
auto y = reinterpret_cast<T2 *>(ctx.Output(0)->GetData());
|
||||
int64_t data_num = ctx.Input(1)->NumElements();
|
||||
int64_t data_size = data_num * static_cast<int64_t>(sizeof(T2));
|
||||
const T2 zero = static_cast<T2>(0);
|
||||
const T2 three = static_cast<T2>(3);
|
||||
const T2 neg_three = static_cast<T2>(-3);
|
||||
const T2 one_sixth = static_cast<T2>(1.0f / 6.0f);
|
||||
if (data_size <= kParallelDataNums) {
|
||||
for (int64_t i = 0; i < data_num; i++) {
|
||||
*(y + i) =
|
||||
(*(input_x + i) > neg_three && *(input_x + i) < three) ? static_cast<T2>(*(grads + i)) * one_sixth : zero;
|
||||
}
|
||||
} else {
|
||||
uint32_t min_core_num = 1;
|
||||
int64_t max_core_num = std::max(min_core_num, aicpu::CpuKernelUtils::GetCPUNum(ctx) - kResvCpuNum);
|
||||
if (max_core_num > data_num) {
|
||||
max_core_num = data_num;
|
||||
}
|
||||
int64_t perUnitSize = max_core_num > 0 ? data_num / max_core_num : data_num;
|
||||
auto shard_hard_sigmoid_grad = [&](size_t start, size_t end) {
|
||||
for (size_t i = start; i < end; i++) {
|
||||
*(y + i) =
|
||||
(*(input_x + i) > neg_three && *(input_x + i) < three) ? static_cast<T2>(*(grads + i)) * one_sixth : zero;
|
||||
}
|
||||
};
|
||||
KERNEL_HANDLE_ERROR(CpuKernelUtils::ParallelFor(ctx, data_num, perUnitSize, shard_hard_sigmoid_grad),
|
||||
"HardSigmoidGrad Compute failed.");
|
||||
}
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
REGISTER_CPU_KERNEL(kHardSigmoidGrad, HardSigmoidGradCpuKernel);
|
||||
} // namespace aicpu
|
|
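The gradient rule used above is the piecewise-constant derivative of hard sigmoid: the incoming gradient is scaled by 1/6 where -3 < x < 3 and zeroed elsewhere (boundaries excluded). A standalone sketch of that rule in plain C++ (illustrative names, not part of the patch):

#include <cassert>
#include <cmath>
#include <vector>

// Element-wise HardSigmoidGrad: grads * 1/6 inside (-3, 3), 0 elsewhere,
// mirroring the ternary expression in HardSigmoidGradCompute above.
std::vector<float> HardSigmoidGradReference(const std::vector<float> &grads,
                                            const std::vector<float> &x) {
  std::vector<float> y(x.size());
  for (size_t i = 0; i < x.size(); ++i) {
    y[i] = (x[i] > -3.0f && x[i] < 3.0f) ? grads[i] * (1.0f / 6.0f) : 0.0f;
  }
  return y;
}

int main() {
  const std::vector<float> grads = {6.0f, 6.0f, 6.0f};
  const std::vector<float> x = {-4.0f, 0.0f, 3.0f};
  const std::vector<float> y = HardSigmoidGradReference(grads, x);
  // Expected: 0 (outside the interval), ~1 (inside), 0 (boundary excluded).
  assert(y[0] == 0.0f && std::fabs(y[1] - 1.0f) < 1e-6f && y[2] == 0.0f);
  return 0;
}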
@@ -0,0 +1,35 @@
/**
 * Copyright (c) Huawei Technologies Co., Ltd. 2022. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
#ifndef AICPU_KERNELS_NORMALIZED_HARD_SIGMOID_GRAD_H
#define AICPU_KERNELS_NORMALIZED_HARD_SIGMOID_GRAD_H

#include "cpu_ops_kernel.h"
#include "utils/bcast.h"

namespace aicpu {
class HardSigmoidGradCpuKernel : public CpuKernel {
 public:
  HardSigmoidGradCpuKernel() = default;
  ~HardSigmoidGradCpuKernel() override = default;

  uint32_t Compute(CpuKernelContext &ctx) override;

 private:
  template <typename T1, typename T2>
  uint32_t HardSigmoidGradCompute(const CpuKernelContext &ctx);
};
} // namespace aicpu
#endif
@@ -0,0 +1,237 @@
/**
 * Copyright (c) Huawei Technologies Co., Ltd. 2022. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include "heaviside.h"

#include "cpu_kernel_utils.h"
#include "utils/eigen_tensor.h"
#include "utils/kernel_util.h"

namespace {
const uint32_t kOutputNum = 1;
const uint32_t kInputNum = 2;
const char *kHeaviside = "Heaviside";
const int64_t kParallelDataNum = 2 * 1024;
const int64_t kParallelDataNumMid = 16 * 1024;
const int64_t kParallelDataNumSameShape = 7 * 1024;
const int64_t kParallelDataNumSameShapeMid = 35 * 1024;

#define HEAVISIDE_COMPUTE_CASE(DTYPE, TYPE, CTX)            \
  case (DTYPE): {                                           \
    uint32_t result = HeavisideCompute<TYPE>(CTX);          \
    if (result != KERNEL_STATUS_OK) {                       \
      KERNEL_LOG_ERROR("Heaviside kernel compute failed."); \
      return result;                                        \
    }                                                       \
    break;                                                  \
  }
} // namespace

namespace aicpu {
template <typename T>
T heaviside(T a, T b) {
  return a == static_cast<T>(0) ? b : static_cast<T>(a > static_cast<T>(0));
}

uint32_t HeavisideCpuKernel::Compute(CpuKernelContext &ctx) {
  KERNEL_HANDLE_ERROR(NormalCheck(ctx, kInputNum, kOutputNum), "Heaviside check input and output number failed.");
  KERNEL_HANDLE_ERROR(HeavisideParamCheck(ctx), "Heaviside check params failed.");
  auto data_type = ctx.Input(0)->GetDataType();
  switch (data_type) {
    HEAVISIDE_COMPUTE_CASE(DT_DOUBLE, double, ctx)
    HEAVISIDE_COMPUTE_CASE(DT_FLOAT, float, ctx)
    HEAVISIDE_COMPUTE_CASE(DT_FLOAT16, Eigen::half, ctx)
    HEAVISIDE_COMPUTE_CASE(DT_INT16, int16_t, ctx)
    HEAVISIDE_COMPUTE_CASE(DT_INT32, int32_t, ctx)
    HEAVISIDE_COMPUTE_CASE(DT_INT64, int64_t, ctx)
    HEAVISIDE_COMPUTE_CASE(DT_INT8, int8_t, ctx)
    HEAVISIDE_COMPUTE_CASE(DT_UINT16, uint16_t, ctx)
    HEAVISIDE_COMPUTE_CASE(DT_UINT32, uint32_t, ctx)
    HEAVISIDE_COMPUTE_CASE(DT_UINT64, uint64_t, ctx)
    HEAVISIDE_COMPUTE_CASE(DT_UINT8, uint8_t, ctx)
    default:
      KERNEL_LOG_ERROR("Heaviside kernel data type [%s] not support.", DTypeStr(data_type).c_str());
      return KERNEL_STATUS_PARAM_INVALID;
  }

  return KERNEL_STATUS_OK;
}

uint32_t HeavisideCpuKernel::HeavisideParamCheck(CpuKernelContext &ctx) {
  Tensor *input_0 = ctx.Input(0);
  Tensor *input_1 = ctx.Input(1);
  Tensor *output = ctx.Output(0);
  DataType input0_type = input_0->GetDataType();
  DataType input1_type = input_1->GetDataType();
  KERNEL_CHECK_FALSE((input0_type == input1_type), KERNEL_STATUS_PARAM_INVALID,
                     "The data type of input0 [%s] need be same with "
                     "input1 [%s].",
                     DTypeStr(input0_type).c_str(), DTypeStr(input1_type).c_str())
  KERNEL_LOG_DEBUG(
    "HeavisideCpuKernel[%s], input0: size[%llu];"
    "input1: size[%llu], output: size[%llu].",
    ctx.GetOpType().c_str(), input_0->GetDataSize(), input_1->GetDataSize(), output->GetDataSize());

  return KERNEL_STATUS_OK;
}

template <typename T>
uint32_t HeavisideCpuKernel::NoBcastCompute(CpuKernelContext &ctx) {
  auto in0 = reinterpret_cast<T *>(ctx.Input(0)->GetData());
  auto in1 = reinterpret_cast<T *>(ctx.Input(1)->GetData());
  auto out = reinterpret_cast<T *>(ctx.Output(0)->GetData());
  int64_t in0_elements_nums = ctx.Input(0)->NumElements();
  int64_t in1_elements_nums = ctx.Input(1)->NumElements();
  int64_t data_num = ctx.Output(0)->NumElements();

  BcastShapeType type;
  if (in0_elements_nums == in1_elements_nums) {
    type = BcastShapeType::SAME_SHAPE;
  } else {
    type = (in0_elements_nums == 1 ? BcastShapeType::X_ONE_ELEMENT : BcastShapeType::Y_ONE_ELEMENT);
  }

  if (data_num >= kParallelDataNumSameShape) {
    uint32_t min_core_num = 1;
    uint32_t max_core_num = std::max(min_core_num, aicpu::CpuKernelUtils::GetCPUNum(ctx) - kResvCpuNum);

    if (data_num <= kParallelDataNumSameShapeMid) {
      max_core_num = std::min(max_core_num, 4U); // up to 4 cpu cores
    }

    if (max_core_num > data_num) {
      max_core_num = data_num;
    }

    auto sharder_heaviside = [&](int64_t start, int64_t end) {
      switch (type) {
        case BcastShapeType::SAME_SHAPE:
          for (int64_t i = start; i < end; ++i) {
            *(out + i) = heaviside<T>(*(in0 + i), *(in1 + i));
          }
          break;
        case BcastShapeType::X_ONE_ELEMENT:
          for (int64_t i = start; i < end; ++i) {
            *(out + i) = heaviside<T>(*in0, *(in1 + i));
          }
          break;
        case BcastShapeType::Y_ONE_ELEMENT:
          for (int64_t i = start; i < end; ++i) {
            *(out + i) = heaviside<T>(*(in0 + i), *in1);
          }
          break;
        default:
          KERNEL_LOG_ERROR("Invalid type [%d]", static_cast<int32_t>(type));
          break;
      }
    };

    if (max_core_num == 0) {
      KERNEL_LOG_ERROR("max_core_num could not be 0");
    }
    KERNEL_HANDLE_ERROR(CpuKernelUtils::ParallelFor(ctx, data_num, data_num / max_core_num, sharder_heaviside),
                        "Heaviside Compute failed.");
  } else {
    switch (type) {
      case BcastShapeType::SAME_SHAPE:
        for (int64_t i = static_cast<int64_t>(0); i < data_num; ++i) {
          *(out + i) = heaviside<T>(*(in0 + i), *(in1 + i));
        }
        break;
      case BcastShapeType::X_ONE_ELEMENT:
        for (int64_t i = static_cast<int64_t>(0); i < data_num; ++i) {
          *(out + i) = heaviside<T>(*in0, *(in1 + i));
        }
        break;
      case BcastShapeType::Y_ONE_ELEMENT:
        for (int64_t i = static_cast<int64_t>(0); i < data_num; ++i) {
          *(out + i) = heaviside<T>(*(in0 + i), *in1);
        }
        break;
      default:
        KERNEL_LOG_WARN("Invalid type [%d]", static_cast<int32_t>(type));
        break;
    }
  }

  return KERNEL_STATUS_OK;
}

template <typename T>
uint32_t HeavisideCpuKernel::BcastCompute(CpuKernelContext &ctx, Bcast &bcast) {
  T *in0 = reinterpret_cast<T *>(ctx.Input(0)->GetData());
  T *in1 = reinterpret_cast<T *>(ctx.Input(1)->GetData());
  T *out = reinterpret_cast<T *>(ctx.Output(0)->GetData());
  int64_t data_num = ctx.Output(0)->NumElements();
  if (data_num >= kParallelDataNum) {
    uint32_t min_core_num = 1;
    uint32_t max_core_num = std::max(min_core_num, aicpu::CpuKernelUtils::GetCPUNum(ctx) - kResvCpuNum);

    if (data_num <= kParallelDataNumMid) {
      max_core_num = std::min(max_core_num, 4U); // up to 4 cpu cores
    }

    if (max_core_num > data_num) {
      max_core_num = data_num;
    }

    auto sharder_heaviside = [&](int64_t start, int64_t end) {
      for (int64_t i = start; i < end; ++i) {
        *(out + i) = heaviside<T>(*(in0 + bcast.GetBroadcastXIndex(i)), *(in1 + bcast.GetBroadcastYIndex(i)));
      }
    };

    if (max_core_num == 0) {
      KERNEL_LOG_ERROR("max_core_num could not be 0");
    }
    KERNEL_HANDLE_ERROR(CpuKernelUtils::ParallelFor(ctx, data_num, data_num / max_core_num, sharder_heaviside),
                        "Heaviside Compute failed.");
  } else {
    for (int64_t i = 0; i < data_num; ++i) {
      *(out + i) = heaviside<T>(*(in0 + bcast.GetBroadcastXIndex(i)), *(in1 + bcast.GetBroadcastYIndex(i)));
    }
  }
  return KERNEL_STATUS_OK;
}

template <typename T>
uint32_t HeavisideCpuKernel::HeavisideCompute(CpuKernelContext &ctx) {
  Tensor *input0_tensor = ctx.Input(0);
  auto input0_shape = input0_tensor->GetTensorShape()->GetDimSizes();
  int64_t input0_elements_nums = input0_tensor->NumElements();

  Tensor *input1_tensor = ctx.Input(1);
  auto input1_shape = input1_tensor->GetTensorShape()->GetDimSizes();
  int64_t input1_elements_nums = input1_tensor->NumElements();

  bool isNeedBcast = (input0_shape == input1_shape) || (input0_elements_nums == 1) || (input1_elements_nums == 1);
  if (isNeedBcast) {
    return NoBcastCompute<T>(ctx);
  } else {
    Bcast bcast(input0_shape, input1_shape);
    if (!bcast.IsValid()) {
      KERNEL_LOG_ERROR("[%s] broadcast failed.", ctx.GetOpType().c_str());
      return KERNEL_STATUS_PARAM_INVALID;
    }

    return BcastCompute<T>(ctx, bcast);
  }

  return KERNEL_STATUS_OK;
}

REGISTER_CPU_KERNEL(kHeaviside, HeavisideCpuKernel);
} // namespace aicpu
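The scalar rule in heaviside<T>(a, b) above is the Heaviside step with a user-supplied value at zero: 0 for a < 0, b for a == 0, and 1 for a > 0; the surrounding NoBcastCompute/BcastCompute paths only decide how the two inputs are broadcast and sharded. A standalone sketch of the scalar rule in plain C++ (illustrative only, not part of the patch):

#include <cassert>

// Heaviside step with user-supplied value at zero, matching
// heaviside<T>(a, b) in heaviside.cc above:
//   a == 0 -> b, otherwise -> (a > 0 ? 1 : 0).
template <typename T>
T HeavisideReference(T a, T b) {
  return a == static_cast<T>(0) ? b : static_cast<T>(a > static_cast<T>(0));
}

int main() {
  assert(HeavisideReference(-2.0, 0.5) == 0.0);
  assert(HeavisideReference(0.0, 0.5) == 0.5);
  assert(HeavisideReference(3.0, 0.5) == 1.0);
  return 0;
}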
@@ -0,0 +1,43 @@
/**
 * Copyright (c) Huawei Technologies Co., Ltd. 2022. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
#ifndef AICPU_KERNELS_NORMALIZED_HEAVISIDE_H_
#define AICPU_KERNELS_NORMALIZED_HEAVISIDE_H_

#include "cpu_ops_kernel.h"
#include "utils/bcast.h"
namespace aicpu {
class HeavisideCpuKernel : public CpuKernel {
 public:
  HeavisideCpuKernel() = default;
  ~HeavisideCpuKernel() override = default;

 protected:
  uint32_t Compute(CpuKernelContext &ctx) override;

 private:
  uint32_t HeavisideParamCheck(CpuKernelContext &ctx);

  template <typename T>
  uint32_t NoBcastCompute(CpuKernelContext &ctx);

  template <typename T>
  uint32_t BcastCompute(CpuKernelContext &ctx, Bcast &bcast);

  template <typename T>
  uint32_t HeavisideCompute(CpuKernelContext &ctx);
};
} // namespace aicpu
#endif
@@ -1,103 +0,0 @@
/**
 * Copyright (c) Huawei Technologies Co., Ltd. 2021-2021. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
#include "is_inf.h"

#include "Eigen/Dense"
#include "unsupported/Eigen/CXX11/Tensor"
#include "cpu_kernel_utils.h"
#include "utils/kernel_util.h"
#include "cpu_types.h"
#include "kernel_log.h"
#include "status.h"

namespace {
const char *const kIsInf = "IsInf";
const uint32_t kOutputNum = 1;
const uint32_t kInputNum = 1;
constexpr int64_t kParallelDataNumsFloat16 = 128 * 1024;
constexpr int64_t kParallelDataNumsFloat = 128 * 1024;
constexpr int64_t kParallelDataNumsDouble = 300 * 1024;

#define ISINF_COMPUTE_CASE(DTYPE, TYPE, CTX)                \
  case (DTYPE): {                                           \
    uint32_t result = IsInfCompute<TYPE>(CTX);              \
    if (result != static_cast<uint32_t>(KERNEL_STATUS_OK)) { \
      KERNEL_LOG_ERROR("IsInf kernel compute failed.");     \
      return result;                                        \
    }                                                       \
    break;                                                  \
  }
} // namespace

namespace aicpu {
uint32_t IsInfCpuKernel::Compute(CpuKernelContext &ctx) {
  // check params
  KERNEL_HANDLE_ERROR(NormalCheck(ctx, kInputNum, kOutputNum), "[%s] check input and output failed.", kIsInf);
  KERNEL_HANDLE_ERROR(IsInfCheck(ctx), "[%s] check params failed.", kIsInf);
  auto data_type = ctx.Input(0)->GetDataType();
  switch (data_type) {
    ISINF_COMPUTE_CASE(DT_FLOAT16, Eigen::half, ctx)
    ISINF_COMPUTE_CASE(DT_FLOAT, float, ctx)
    ISINF_COMPUTE_CASE(DT_DOUBLE, double, ctx)
    default:
      KERNEL_LOG_ERROR("IsInf kernel data type [%s] not supports.", DTypeStr(data_type).c_str());
      return KERNEL_STATUS_PARAM_INVALID;
  }
  return static_cast<uint32_t>(KERNEL_STATUS_OK);
}

uint32_t IsInfCpuKernel::IsInfCheck(const CpuKernelContext &ctx) const {
  KERNEL_CHECK_NULLPTR(ctx.Input(0)->GetData(), KERNEL_STATUS_PARAM_INVALID, "Get input data failed.")
  KERNEL_CHECK_NULLPTR(ctx.Output(0)->GetData(), KERNEL_STATUS_PARAM_INVALID, "Get output data failed.")
  return KERNEL_STATUS_OK;
}

template <typename T>
uint32_t IsInfCpuKernel::IsInfCompute(const CpuKernelContext &ctx) {
  auto input = reinterpret_cast<T *>(ctx.Input(0)->GetData());
  auto output = reinterpret_cast<bool *>(ctx.Output(0)->GetData());

  auto data_type = ctx.Input(0)->GetDataType();
  int64_t data_num = ctx.Output(0)->NumElements();
  int64_t data_size = data_num * static_cast<int64_t>(sizeof(T));

  if ((data_type == DT_FLOAT16 && data_size <= kParallelDataNumsFloat16) ||
      (data_type == DT_FLOAT && data_size <= kParallelDataNumsFloat) ||
      (data_type == DT_DOUBLE && data_size <= kParallelDataNumsDouble)) {
    for (int64_t index = 0; index < data_num; index++) {
      *(output + index) = Eigen::numext::isinf(*(input + index));
    }
  } else {
    uint32_t min_core_num = 1;
    int64_t max_core_num = std::max(min_core_num, aicpu::CpuKernelUtils::GetCPUNum(ctx) - kResvCpuNum);
    if (max_core_num > data_num) {
      max_core_num = data_num;
    }

    auto shard_isinf = [&](size_t start, size_t end) {
      for (size_t index = start; index < end; index++) {
        *(output + index) = Eigen::numext::isinf(*(input + index));
      }
    };
    KERNEL_HANDLE_ERROR(CpuKernelUtils::ParallelFor(ctx, data_num, data_num / max_core_num, shard_isinf),
                        "IsInf Compute failed.");
  }

  return static_cast<uint32_t>(KERNEL_STATUS_OK);
}

REGISTER_CPU_KERNEL(kIsInf, IsInfCpuKernel);
} // namespace aicpu
@@ -88,7 +88,6 @@ const AnfNodePtr AICpuLibSelectPass::Process(const FuncGraphPtr &graph, const An
    mindspore::kScatterNdOpName,
    mindspore::kScatterNdUpdateOpName,
    mindspore::kTensorScatterUpdateOpName,
    mindspore::kIsInfOpName,
    mindspore::kIsNanOpName,
    mindspore::kMatrixDeterminantOpName,
    mindspore::kMatrixLogarithmOpName,
@@ -145,7 +144,44 @@ const AnfNodePtr AICpuLibSelectPass::Process(const FuncGraphPtr &graph, const An
    mindspore::kMulOpName,
    mindspore::kConjOpName,
    mindspore::kZerosLikeOpName,
    mindspore::kMatrixBandPartOpName};
    mindspore::kMatrixBandPartOpName,
    mindspore::kDenseToCSRSparseMatrixOpName,
    mindspore::kDenseToSparseSetOperation,
    mindspore::kDiagOpName,
    mindspore::kDiagonalOpName,
    mindspore::kDiagPartOpName,
    mindspore::kEigOpName,
    mindspore::kEyeOpName,
    mindspore::kMaximumOpName,
    mindspore::kMinimumOpName,
    mindspore::kFractionalAvgPoolOpName,
    mindspore::kFractionalAvgPoolGradOpName,
    mindspore::kFractionalMaxPoolOpName,
    mindspore::kFractionalMaxPoolGradOpName,
    mindspore::kFractionalMaxPoolGradWithFixedKsizeOpName,
    mindspore::kGatherNdOpName,
    mindspore::kGcdOpName,
    mindspore::kGeqrfOpName,
    mindspore::kHardSigmoidOpName,
    mindspore::kHardSigmoidGradOpName,
    mindspore::kHeavisideOpName,
    mindspore::kHypotOpName,
    mindspore::kIdentityNOpName,
    mindspore::kIndexFillOpName,
    mindspore::kKLDivOpName,
    mindspore::kKlDivLossGradOpName,
    mindspore::kLcmOpName,
    mindspore::kLessEqualOpName,
    mindspore::kLogicalXorOpName,
    mindspore::kLogitOpName,
    mindspore::kLogitGradOpName,
    mindspore::kLogNormalReverseOpName,
    mindspore::kLowerBoundOpName,
    mindspore::kLstsqOpName,
    mindspore::kLuUnpackOpName,
    mindspore::kLuUnpackGradOpName,
    mindspore::kMatMulOpName,
    mindspore::kMatrixExpOpName};

  static const std::string kEnvOpSoNames = "mindspore_aicpu_kernels";
  static const std::string kCpuKernelSoName = "mindspore_cpu_kernels";
@@ -238,7 +238,42 @@ from .smooth_l1_loss import _smooth_l1_loss_aicpu
from .cumulative_logsumexp import _cumulative_logsumexp_aicpu
from .nuclear_norm import _nuclear_norm_aicpu
from .sparse_segment_sqrt_n import _sparse_segment_sqrt_n_aicpu
from .unsorted_segment_prod import _unsorted_segment_prod_aicpu
from .scale_and_translate import _scale_and_translate_aicpu
from .quant_dtype_cast import _quant_dtype_cast_aicpu
from .fse_decode import _fse_decode_aicpu
from .unsorted_segment_prod import _unsorted_segment_prod_aicpu
from .dense_to_csr_sparse_matrix import _dense_to_csr_sparse_matrix_aicpu
from .dense_to_sparse_set_operation import _dense_to_sparse_set_operation_aicpu
from .diag import _diag_aicpu
from .diagonal import _diagonal_aicpu
from .diag_part import _diag_part_aicpu
from .eig import _eig_aicpu
from .eye import _eye_aicpu
from .fmax import _fmax_aicpu
from .fmin import _fmin_aicpu
from .fractional_avg_pool import _fractional_avg_pool_aicpu
from .fractional_avg_pool_grad import _fractional_avg_pool_grad_aicpu
from .fractional_max_pool import _fractional_max_pool_aicpu
from .fractional_max_pool_grad import _fractional_max_pool_grad_aicpu
from .fractional_max_pool_grad_with_fixed_ksize import _fractional_max_pool_grad_with_fixed_ksize_aicpu
from .gcd import _gcd_aicpu
from .geqrf import _geqrf_aicpu
from .hard_sigmoid import _hard_sigmoid_aicpu
from .hard_sigmoid_grad import _hard_sigmoid_grad_aicpu
from .heaviside import _heaviside_aicpu
from .hypot import _hypot_aicpu
from .identity_n import _identity_n_aicpu
from .index_fill import _index_fill_aicpu
from .kldivloss import _kldiv_loss_aicpu
from .kldivlossgrad import _kldiv_loss_grad_aicpu
from .lcm import _lcm_aicpu
from .less_equal import _less_equal_aicpu
from .logical_xor import _logical_xor_aicpu
from .logit import _logit_aicpu
from .logit_grad import _logit_grad_aicpu
from .log_normal_reverse import _log_normal_reverse_aicpu
from .lower_bound import _lower_bound_aicpu
from .lstsq import _lstsq_aicpu
from .lu_unpack import _lu_unpack_aicpu
from .lu_unpack_grad import _lu_unpack_grad_aicpu
from .matrix_exp import _matrix_exp_aicpu