forked from mindspore-Ecosystem/mindspore
aicpu migration 35 ops, 0105 branch
This commit is contained in:
parent 551bcec327
commit 66cfa84dce
@@ -98,3 +98,8 @@
"mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/" "uninitvar"
"mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/" "shadowVariable"
"mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/" "unsignedPositive"
"mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/" "zerodivcond"
"mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/" "redundantInitialization"
"mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/" "noConstructor"
"mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/" "noExplicitConstructor"
"mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/" "identicalConditionAfterEarlyExit"
@@ -282,3 +282,44 @@ mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel
mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/max_unpool_2d.cc:aicpu::MaxUnpool2DCpuKernel::MaxUnpool2DCompute
mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/matrix_solve_ls.cc:aicpu::MatrixSolveLsCpuKernel::Compute
mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/col2im.cc:aicpu::Col2imCpuKernel::Col2imParamCheck
mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/scatter_nd_update.cc:aicpu::ScatterNdUpdateCpuKernel::Compute
mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/ragged_tensor_to_sparse.cc:aicpu::RaggedTensorToSparseCpuKernel::Compute
mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/max_unpool_3d_grad.cc:aicpu::MaxUnpool3DGradCpuKernel::MaxUnpool3DGradCompute
mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/reduce_mean.cc:aicpu::ReduceMeanCpuKernel::ReduceMeanCompute
mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/reduce_mean.cc:aicpu::ReduceMeanCpuKernel::ReduceMeanCompute_Complex
mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/ragged_tensor_to_tensor.cc:aicpu::RaggedTensorToTensorCpuKernel::Compute
mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/segment_mean.cc:aicpu::SegmentMeanCpuKernel::SegmentMeanCompute
mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/segment_mean.cc:aicpu::SegmentMeanCpuKernel::SegmentMeanCompute_Complex
mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/sample_distorted_bounding_box_ext2.cc:aicpu::SDBBExt2CpuKernel::GenerateRandomCrop
mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/sample_distorted_bounding_box_ext2.cc:aicpu::SDBBExt2CpuKernel::SDBBExt2Compute
mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/max_unpool_3d.cc:aicpu::MaxUnpool3DCpuKernel::MaxUnpool3DCompute
mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/segment_prod.cc:aicpu::SegmentProdCpuKernel::SegmentProdCompute
mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/segment_prod.cc:aicpu::SegmentProdCpuKernel::SegmentProdCompute_Complex
mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/maxpool_grad.cc:aicpu::SpatialMaxPoolWithArgMaxHelper
mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/reduce_prod.cc:aicpu::ReduceProdCpuKernel::ReduceProdCompute
mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/reduce_prod.cc:aicpu::ReduceProdCpuKernel::ReduceProdCompute_Complex
mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/parameterized_truncated_normal.cc:aicpu::Generate
mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/scatter_nd.cc:aicpu::ScatterNdCpuKernel::Compute
mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/multi_margin_loss.cc:aicpu::MultiMarginLossCpuKernel::MultiMarginLossCompute
mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/multi_margin_loss.cc:aicpu::MultiMarginLossCpuKernel::MultiMarginLossComputeFP
mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/max_unpool_3d_grad.cc:aicpu::MaxUnpool3DGradCpuKernel::MaxUnpool3DGradCompute
mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/maxpool.cc:aicpu::SpacialMaxPool
mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/reduce_mean.cc:aicpu::ReduceMeanCpuKernel::ReduceMeanCompute
mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/reduce_mean.cc:aicpu::ReduceMeanCpuKernel::ReduceMeanCompute_Complex
mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/segment_mean.cc:aicpu::SegmentMeanCpuKernel::SegmentMeanCompute
mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/segment_mean.cc:aicpu::SegmentMeanCpuKernel::SegmentMeanCompute_Complex
mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/sample_distorted_bounding_box_ext2.cc:aicpu::SDBBExt2CpuKernel::SDBBExt2Compute
mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/max_unpool_3d.cc:aicpu::MaxUnpool3DCpuKernel::MaxUnpool3DCompute
mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/slice.cc:aicpu::SliceCpuKernel::SliceCompute
mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/segment_prod.cc:aicpu::SegmentProdCpuKernel::SegmentProdCompute
mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/segment_prod.cc:aicpu::SegmentProdCpuKernel::SegmentProdCompute_Complex
mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/maxpool_grad.cc:aicpu::SpatialMaxPoolWithArgMaxHelper
mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/reduce_prod.cc:aicpu::ReduceProdCpuKernel::ReduceProdCompute
mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/reduce_prod.cc:aicpu::ReduceProdCpuKernel::ReduceProdCompute_Complex
mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/multi_margin_loss_grad.cc:aicpu::MultiMarginLossGradCpuKernel::MultiMarginLossGradC
mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/parameterized_truncated_normal.cc:aicpu::Generate
mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/multi_margin_loss.cc:aicpu::MultiMarginLossCpuKernel::MultiMarginLossCompute
mindspore/mindspore/ccsrc/plugin/device/ascend/optimizer/mindir/aicpu_lib_select.cc:mindspore::opt::AICpuLibSelectPass::Process
mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/multi_margin_loss_grad.cc:aicpu::MultiMarginLossGradCpuKernel::MultiMarginLossGradComputeFP16
mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/multi_margin_loss_grad.cc:aicpu::MultiMarginLossGradCpuKernel::MultiMarginLossGradCompute
mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/multi_margin_loss.cc:aicpu::MultiMarginLossCpuKernel::MultiMarginLossComputeFP16
@@ -1,26 +0,0 @@
mindspore.ops.PSROIPooling
==========================

.. py:class:: mindspore.ops.PSROIPooling(spatial_scale, group_size, output_dim)

    Applies Position Sensitive ROI-Pooling to the input Tensor.

    Args:
        - **spatial_scale** (float) - The scale factor that maps box coordinates to the input coordinates. For example, if your boxes are defined on a 224x224 image and your input is a 112x112 feature map (produced by scaling the original image by 0.5), this value should be set to 0.5.
        - **group_size** (int) - The size (in pixels) of the output after pooling, in the format (height, width).
        - **output_dim** (int) - The dimension of the output after pooling.

    Inputs:
        - **features** (Tensor) - The input feature Tensor, whose shape must be :math:`(N, C, H, W)`. The dimensions must satisfy :math:`(C == output\_dim * group\_size * group\_size)`. The data type is float16 or float32.
        - **rois** (Tensor) - A Tensor of shape :math:`(batch, 5, rois_n)` with data type float16 or float32. The first dimension, batch, is the batch size. The size of the second dimension must be 5. The third dimension, rois_n, is the number of rois. Each rois value has the format (index, x1, y1, x2, y2), where the first element is the index of the rois. The box coordinates are given as (x1, y1, x2, y2), and the regions selected by these boxes are extracted afterwards. The region coordinates must satisfy 0 <= x1 < x2 and 0 <= y1 < y2.

    Outputs:
        - **out** (Tensor) - The pooled output, with shape :math:`(rois.shape[0] * rois.shape[2], output\_dim, group\_size, group\_size)`.

    Raises:
        - **TypeError** - `spatial_scale` is not a float.
        - **TypeError** - `group_size` or `output_dim` is not an int.
        - **TypeError** - `features` or `rois` is not a Tensor.
        - **TypeError** - The data type of `rois` is not float16 or float32.
        - **ValueError** - The shape of `features` does not satisfy :math:`(C == output\_dim * group\_size * group\_size)`.
        - **ValueError** - `spatial_scale` is negative.
@@ -157,6 +157,7 @@ constexpr auto kCastOpName = "Cast";
constexpr auto kCentralizationOpName = "Centralization";
constexpr auto kCeLUOpName = "CeLU";
constexpr auto kCeluV2OpName = "CeluV2";
constexpr auto kCheckNumericsOpName = "CheckNumerics";
constexpr auto kClearZeroOpName = "ClearZero";
constexpr auto kClipBoxesOpName = "kClipBoxes";
constexpr auto kClipBoxesDOpName = "kClipBoxesD";
@@ -277,6 +278,7 @@ constexpr auto kFillV2DOpName = "FillV2D";
constexpr auto kFSEDecodeOpName = "FSEDecode";
constexpr auto kFive2FourOpName = "Five2Four";
constexpr auto kFlattenGradOpName = "FlattenGrad";
constexpr auto kFloorDivOpName = "FloorDiv";
constexpr auto kFour2FiveOpName = "Four2Five";
constexpr auto kFractionalAvgPoolGradOpName = "FractionalAvgPoolGrad";
constexpr auto kFusedAdaFactorName = "FusedAdaFactor";
@@ -386,6 +388,7 @@ constexpr auto kLinSpaceDOpName = "LinSpaceD";
constexpr auto kListDiffOpName = "ListDiff";
constexpr auto kLogMatrixDeterminantOpName = "LogMatrixDeterminant";
constexpr auto kLogOpName = "Log";
constexpr auto kLog1pOpName = "Log1p";
constexpr auto kLogSoftmaxOpName = "LogSoftmax";
constexpr auto kLogSoftmaxV2OpName = "LogSoftmaxV2";
constexpr auto kLogSoftmaxGradOpName = "LogSoftmaxGrad";
@@ -409,6 +412,8 @@ constexpr auto kMatrixSetDiagOpName = "MatrixSetDiag";
constexpr auto kMatrixSetDiagDOpName = "MatrixSetDiagD";
constexpr auto kMatrixSetDiagV3OpName = "MatrixSetDiagV3";
constexpr auto kMatrixSolveLsOpName = "MatrixSolveLs";
constexpr auto kMatrixTriangularSolveOpName = "MatrixTriangularSolve";
constexpr auto kMaximumGradGradOpName = "MaximumGradGrad";
constexpr auto kMaximumGradOpName = "MaximumGrad";
constexpr auto kMaximumOpName = "Maximum";
constexpr auto kMaxPool3DGradGradOpName = "MaxPool3DGradGrad";
@@ -422,15 +427,22 @@ constexpr auto kMaxPoolExt2OpName = "MaxPoolExt2";
constexpr auto kMaxPoolWithArgmaxOpName = "MaxPoolWithArgmax";
constexpr auto kMaxUnpool2DOpName = "MaxUnpool2D";
constexpr auto kMaxUnpool2DGradOpName = "MaxUnpool2DGrad";
constexpr auto kMaxUnpool3DOpName = "MaxUnpool3D";
constexpr auto kMaxUnpool3DGradOpName = "MaxUnpool3DGrad";
constexpr auto kMeanGradOpName = "MeanGrad";
constexpr auto kMedianOpName = "Median";
constexpr auto kMedianGradOpName = "MedianGrad";
constexpr auto kMemCpyAsyncOpName = "memcpy_async";
constexpr auto kMinimumGradGradOpName = "MinimumGradGrad";
constexpr auto kMinimumGradOpName = "MinimumGrad";
constexpr auto kMinimumOpName = "Minimum";
constexpr auto kMirrorPadOpName = "MirrorPad";
constexpr auto kMomentumOpName = "Momentum";
constexpr auto kMulOpName = "Mul";
constexpr auto kMulNoNanOpName = "MulNoNan";
constexpr auto kMultilabelMarginLossGradOpName = "MultilabelMarginLossGrad";
constexpr auto kMultiMarginLossGradOpName = "MultiMarginLossGrad";
constexpr auto kMultiMarginLossOpName = "MultiMarginLoss";
constexpr auto kMultinomialOpName = "Multinomial";
constexpr auto kMuxReceiveOpName = "MuxReceive";
constexpr auto kMuxSendOpName = "MuxSend";
@@ -438,17 +450,21 @@ constexpr auto kNanToNumOpName = "NanToNum";
constexpr auto kNegOpName = "Neg";
constexpr auto kIm2ColOpName = "Im2Col";
constexpr auto kNewIm2ColOpName = "NewIm2Col";
constexpr auto kNextAfterOpName = "NextAfter";
constexpr auto kIm2colOpName = "Im2col";
constexpr auto kNMSWithMaskOpName = "NMSWithMask";
constexpr auto kNonDeterministicInts = "NonDeterministicInts";
constexpr auto kNonDeterministicIntsOpName = "NonDeterministicInts";
constexpr auto kNonMaxSuppressionV3OpName = "NonMaxSuppressionV3";
constexpr auto kNonZeroOpName = "NonZero";
constexpr auto kNPUAllocFloatStatusOpName = "NPUAllocFloatStatus";
constexpr auto kNPUClearFloatStatusOpName = "NPUClearFloatStatus";
constexpr auto kNPUGetFloatStatusOpName = "NPUGetFloatStatus";
constexpr auto kNthElementOpName = "NthElement";
constexpr auto kNuclearNormOpName = "NuclearNorm";
constexpr auto kOneHotOpName = "OneHot";
constexpr auto kOneHotDOpName = "OneHotD";
constexpr auto kOrgqrOpName = "Orgqr";
constexpr auto kPadAndShiftOpName = "PadAndShift";
constexpr auto kPaddingOpName = "Padding";
constexpr auto kPadOpName = "Pad";
@@ -457,8 +473,11 @@ constexpr auto kParallelResizeBilinearOpName = "ParallelResizeBilinear";
constexpr auto kSyncResizeBilinearV2OpName = "SyncResizeBilinearV2";
constexpr auto kParallelResizeBilinearGradOpName = "ParallelResizeBilinearGrad";
constexpr auto kSyncResizeBilinearV2GradOpName = "SyncResizeBilinearV2Grad";
constexpr auto kParameterizedTruncatedNormalOpName = "ParameterizedTruncatedNormal";
constexpr auto kPartialOpName = "partial";
constexpr auto kPdistGradOpName = "PdistGrad";
constexpr auto kPoissonOpName = "Poisson";
constexpr auto kPolarOpName = "Polar";
constexpr auto kPoolingOpName = "Pooling";
constexpr auto kPSROIPoolingOpName = "PSROIPooling";
constexpr auto kPSROIPoolingV2OpName = "PSROIPoolingV2";
@@ -481,13 +500,18 @@ constexpr auto kPushOpName = "Push";
constexpr auto kQrOpName = "Qr";
constexpr auto kPushWeightOpName = "PushWeight";
constexpr auto kQuantileOpName = "Quantile";
constexpr auto kRaggedRangeOpName = "RaggedRange";
constexpr auto kRaggedTensorToSparseOpName = "RaggedTensorToSparse";
constexpr auto kRaggedTensorToTensorOpName = "RaggedTensorToTensor";
constexpr auto kRandomChoiceWithMaskOpName = "RandomChoiceWithMask";
constexpr auto kRandomPoissonOpName = "RandomPoisson";
constexpr auto kRandomShuffleOpName = "RandomShuffle";
constexpr auto kRangeOpName = "Range";
constexpr auto kRangeDOpName = "RangeD";
constexpr auto kQuantDTypeCastOpName = "QuantDTypeCast";
constexpr auto kRealDivOpName = "RealDiv";
constexpr auto kReciprocalOpName = "Reciprocal";
constexpr auto kReciprocalGradOpName = "ReciprocalGrad";
constexpr auto kRecvOpName = "StreamRecv";
constexpr auto kReduceAllOpName = "ReduceAll";
constexpr auto kReduceAllDOpName = "ReduceAllD";
@@ -536,6 +560,7 @@ constexpr auto kResizeNearestNeighborV2DOpName = "ResizeNearestNeighborV2D";
constexpr auto kReverseV2OpName = "ReverseV2";
constexpr auto kReverseV2DOpName = "ReverseV2D";
constexpr auto kReturnOpName = "Return";
constexpr auto kRGBToHSVOpName = "RGBToHSV";
constexpr auto kROIAlignGradName = "ROIAlignGrad";
constexpr auto kRpcRecvOpName = "RpcRecv";
constexpr auto kRpcSendOpName = "RpcSend";
@@ -543,6 +568,9 @@ constexpr auto kRpnProposalsOpName = "RpnProposals";
constexpr auto kRpnProposalsDOpName = "RpnProposalsD";
constexpr auto kRsqrtGradOpName = "RsqrtGrad";
constexpr auto kRsqrtOpName = "Rsqrt";
constexpr auto kSampleDistortedBoundingBoxExt2OpName = "SampleDistortedBoundingBoxExt2";
constexpr auto kScaleAndTranslateOpName = "ScaleAndTranslate";
constexpr auto kScaleAndTranslateGradOpName = "ScaleAndTranslateGrad";
constexpr auto kScatterAddOpName = "ScatterAdd";
constexpr auto kScatterNdOpName = "ScatterNd";
constexpr auto kScatterNdDOpName = "ScatterNdD";
@@ -554,13 +582,19 @@ constexpr auto kSegmentMinOpName = "SegmentMin";
constexpr auto kSegmentProdOpName = "SegmentProd";
constexpr auto kSegmentSumOpName = "SegmentSum";
constexpr auto kSelectOpName = "Select";
constexpr auto kSelfAdjointEigOpName = "SelfAdjointEig";
constexpr auto kSeLUOpName = "SeLU";
constexpr auto kSeluOpName = "Selu";
constexpr auto kSendOpName = "StreamSend";
constexpr auto kSetSizeOpName = "SetSize";
constexpr auto kSGDName = "SGD";
constexpr auto kSigmoidOpName = "Sigmoid";
constexpr auto kSigmoidCrossEntropyWithLogitsV2OpName = "SigmoidCrossEntropyWithLogitsV2";
constexpr auto kSignOpName = "Sign";
constexpr auto kSimpleMeanGradOpName = "SimpleMeanGrad";
constexpr auto kSinOpName = "Sin";
constexpr auto kSincOpName = "Sinc";
constexpr auto kSinhOpName = "Sinh";
constexpr auto kSliceGradOpName = "SliceGrad";
constexpr auto kSliceOpName = "Slice";
constexpr auto kSliceDV2OpName = "SliceDV2";
@@ -0,0 +1,134 @@
|
|||
/**
|
||||
* Copyright 2021 Jilin University
|
||||
* Copyright (c) Huawei Technologies Co., Ltd. 2020. All rights reserved.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include "check_numerics.h"
|
||||
|
||||
#include <securec.h>
|
||||
#include "unsupported/Eigen/CXX11/Tensor"
|
||||
|
||||
#include "cpu_kernel_utils.h"
|
||||
#include "cpu_types.h"
|
||||
#include "kernel_log.h"
|
||||
#include "status.h"
|
||||
#include "utils/kernel_util.h"
|
||||
|
||||
namespace {
|
||||
const std::uint32_t kCheckNumericsInputNum{1};
|
||||
const std::uint32_t kCheckNumericsOutputNum{1};
|
||||
const char *const kCheckNumerics{"CheckNumerics"};
|
||||
const std::int64_t kCheckNumericsParallelNum{64 * 1024};
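// Threshold for dispatching to ParallelFor: inputs with at most this many elements are
// processed serially (see ParallelForCheckNumerics below).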
|
||||
} // namespace
|
||||
|
||||
namespace aicpu {
|
||||
namespace detail {
|
||||
template <typename T>
|
||||
inline bool ScalarCheckNumerics(const T x) {
|
||||
return !std::isfinite(x);
|
||||
}
|
||||
template <>
|
||||
inline bool ScalarCheckNumerics(const Eigen::half x) {
|
||||
return !Eigen::half_impl::isfinite(x);
|
||||
}
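// Specialization for Eigen::half: the finiteness check goes through Eigen's
// half_impl::isfinite instead of std::isfinite.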
|
||||
inline std::uint32_t ParallelForCheckNumerics(const CpuKernelContext &ctx, std::int64_t total,
|
||||
std::int64_t per_unit_size,
|
||||
const std::function<void(std::int64_t, std::int64_t)> &work) {
|
||||
if (total > kCheckNumericsParallelNum)
|
||||
return aicpu::CpuKernelUtils::ParallelFor(ctx, total, per_unit_size, work);
|
||||
else
|
||||
work(0, total);
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
template <typename T>
|
||||
inline std::uint32_t ComputeCheckNumericsKernel(const CpuKernelContext &ctx) {
|
||||
T *input0{static_cast<T *>(ctx.Input(0)->GetData())};
|
||||
T *output{static_cast<T *>(ctx.Output(0)->GetData())};
|
||||
std::int64_t total{ctx.Input(0)->NumElements()};
|
||||
std::uint32_t core_num{aicpu::CpuKernelUtils::GetCPUNum(ctx)};
|
||||
std::int64_t per_unit_size{total / std::min(std::max(1L, core_num - 2L), total)};
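// per_unit_size splits the work into roughly min(max(1, core_num - 2), total) chunks,
// so there are never more chunks than elements.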
|
||||
bool flag = false;
|
||||
std::uint32_t ret = ParallelForCheckNumerics(ctx, total, per_unit_size, [&](std::int64_t begin, std::int64_t end) {
|
||||
flag = flag || std::any_of(input0 + begin, input0 + end, ScalarCheckNumerics<T>);
|
||||
if (!flag) {
|
||||
auto ret = memcpy_s(output + begin, static_cast<size_t>((end - begin) * sizeof(T)), input0 + begin,
|
||||
static_cast<size_t>((end - begin) * sizeof(T)));
|
||||
if (ret != EOK) {
|
||||
KERNEL_LOG_ERROR("memcpy_s error");
|
||||
}
|
||||
}
|
||||
});
|
||||
return flag ? KERNEL_STATUS_PARAM_INVALID : ret;
|
||||
}
|
||||
template <typename T>
|
||||
inline std::uint32_t ComputeCheckNumerics(const CpuKernelContext &ctx) {
|
||||
std::uint32_t result{ComputeCheckNumericsKernel<T>(ctx)};
|
||||
if (result != KERNEL_STATUS_OK) {
|
||||
KERNEL_LOG_ERROR("CheckNumerics compute failed.");
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
inline std::uint32_t ExtraCheckCheckNumerics(const CpuKernelContext &ctx) {
|
||||
if (ctx.Input(0)->GetData() == nullptr) {
|
||||
KERNEL_LOG_ERROR("Get input data failed.");
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
if (ctx.Output(0)->GetData() == nullptr) {
|
||||
KERNEL_LOG_ERROR("Get output data failed.");
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
if (ctx.Input(0)->GetDataType() != ctx.Output(0)->GetDataType()) {
|
||||
KERNEL_LOG_ERROR("The data type of the input [%s] need be the same as the output [%s].",
|
||||
DTypeStr(ctx.Input(0)->GetDataType()).c_str(), DTypeStr(ctx.Output(0)->GetDataType()).c_str());
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
if (ctx.Input(0)->GetDataSize() != ctx.Output(0)->GetDataSize()) {
|
||||
KERNEL_LOG_ERROR(
|
||||
"The data size of the input [%llu] need be the same as the output "
|
||||
"[%llu].",
|
||||
ctx.Input(0)->GetDataSize(), ctx.Output(0)->GetDataSize());
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
|
||||
inline std::uint32_t CheckCheckNumerics(CpuKernelContext &ctx) {
|
||||
return NormalCheck(ctx, kCheckNumericsInputNum, kCheckNumericsOutputNum) ? KERNEL_STATUS_PARAM_INVALID
|
||||
: ExtraCheckCheckNumerics(ctx);
|
||||
}
|
||||
|
||||
inline std::uint32_t ComputeCheckNumerics(const CpuKernelContext &ctx) {
|
||||
DataType input_type{ctx.Input(0)->GetDataType()};
|
||||
switch (input_type) {
|
||||
case DT_FLOAT16:
|
||||
return ComputeCheckNumerics<Eigen::half>(ctx);
|
||||
case DT_FLOAT:
|
||||
return ComputeCheckNumerics<std::float_t>(ctx);
|
||||
case DT_DOUBLE:
|
||||
return ComputeCheckNumerics<std::double_t>(ctx);
|
||||
default:
|
||||
KERNEL_LOG_ERROR("Unsupported input data type [%s].", DTypeStr(input_type).c_str());
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
}
|
||||
} // namespace detail
|
||||
|
||||
std::uint32_t CheckNumericsCpuKernel::Compute(CpuKernelContext &ctx) {
|
||||
return detail::CheckCheckNumerics(ctx) ? KERNEL_STATUS_PARAM_INVALID : detail::ComputeCheckNumerics(ctx);
|
||||
}
|
||||
|
||||
REGISTER_CPU_KERNEL(kCheckNumerics, CheckNumericsCpuKernel);
|
||||
} // namespace aicpu
|
|
@@ -0,0 +1,29 @@
|
|||
/**
|
||||
* Copyright 2021 Jilin University
|
||||
* Copyright (c) Huawei Technologies Co., Ltd. 2020-2021. All rights reserved.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#ifndef AICPU_KERNELS_NORMALIZED_CHECK_NUMERICS_H
|
||||
#define AICPU_KERNELS_NORMALIZED_CHECK_NUMERICS_H
|
||||
|
||||
#include "cpu_ops_kernel.h"
|
||||
|
||||
namespace aicpu {
|
||||
class CheckNumericsCpuKernel final : public CpuKernel {
|
||||
public:
|
||||
std::uint32_t Compute(CpuKernelContext &ctx) override;
|
||||
};
|
||||
} // namespace aicpu
|
||||
#endif
|
|
@@ -0,0 +1,296 @@
|
|||
/**
|
||||
* Copyright (c) Huawei Technologies Co., Ltd. 2021. All rights reserved.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
#include "floordiv.h"
|
||||
|
||||
#include "cpu_kernel_utils.h"
|
||||
#include "utils/eigen_tensor.h"
|
||||
#include "utils/kernel_util.h"
|
||||
|
||||
namespace {
|
||||
const uint32_t kOutputNum = 1;
|
||||
const uint32_t kInputNum = 2;
|
||||
const char *const kFloorDiv = "FloorDiv";
|
||||
// when input data size is more than kParallelDataNum, use Parallel func
|
||||
const int64_t kParallelDataNum = 2 * 1024;
|
||||
const int64_t kParallelDataNumMid = 4 * 1024;
|
||||
const int64_t kParallelDataNumSameShape = 16 * 1024;
|
||||
const int64_t kParallelDataNumSameShapeMid = 32 * 1024;
|
||||
|
||||
#define FLOORDIV_COMPUTE_CASE(DTYPE, TYPE, CTX) \
|
||||
case (DTYPE): { \
|
||||
uint32_t result = FloorDivCompute<TYPE>(CTX); \
|
||||
if (result != KERNEL_STATUS_OK) { \
|
||||
KERNEL_LOG_ERROR("FloorDiv kernel compute failed."); \
|
||||
return result; \
|
||||
} \
|
||||
break; \
|
||||
}
|
||||
} // namespace
|
||||
|
||||
namespace aicpu {
|
||||
uint32_t FloorDivCpuKernel::Compute(CpuKernelContext &ctx) {
|
||||
// check params
|
||||
KERNEL_HANDLE_ERROR(NormalCheck(ctx, kInputNum, kOutputNum), "[%s] check input and output failed.", kFloorDiv);
|
||||
KERNEL_HANDLE_ERROR(FloorDivParamCheck(ctx), "[%s] check params failed.", kFloorDiv);
|
||||
auto data_type = ctx.Input(0)->GetDataType();
|
||||
switch (data_type) {
|
||||
FLOORDIV_COMPUTE_CASE(DT_INT8, int8_t, ctx)
|
||||
FLOORDIV_COMPUTE_CASE(DT_INT16, int16_t, ctx)
|
||||
FLOORDIV_COMPUTE_CASE(DT_INT32, int32_t, ctx)
|
||||
FLOORDIV_COMPUTE_CASE(DT_INT64, int64_t, ctx)
|
||||
FLOORDIV_COMPUTE_CASE(DT_UINT8, uint8_t, ctx)
|
||||
FLOORDIV_COMPUTE_CASE(DT_UINT16, uint16_t, ctx)
|
||||
FLOORDIV_COMPUTE_CASE(DT_FLOAT16, Eigen::half, ctx)
|
||||
FLOORDIV_COMPUTE_CASE(DT_FLOAT, float, ctx)
|
||||
FLOORDIV_COMPUTE_CASE(DT_DOUBLE, double, ctx)
|
||||
default:
|
||||
KERNEL_LOG_ERROR("FloorDiv kernel data type [%s] not support.", DTypeStr(data_type).c_str());
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
|
||||
uint32_t FloorDivCpuKernel::FloorDivParamCheck(const CpuKernelContext &ctx) const {
|
||||
// the non null of input_0, input_1, output has been verified in NormalCheck
|
||||
Tensor *input_0 = ctx.Input(0);
|
||||
Tensor *input_1 = ctx.Input(1);
|
||||
Tensor *output = ctx.Output(0);
|
||||
KERNEL_CHECK_NULLPTR(input_0->GetData(), KERNEL_STATUS_PARAM_INVALID, "Get input 0 data failed.")
|
||||
KERNEL_CHECK_NULLPTR(input_1->GetData(), KERNEL_STATUS_PARAM_INVALID, "Get input 1 data failed.")
|
||||
KERNEL_CHECK_NULLPTR(output->GetData(), KERNEL_STATUS_PARAM_INVALID, "Get output data failed")
|
||||
DataType input0_type = input_0->GetDataType();
|
||||
DataType input1_type = input_1->GetDataType();
|
||||
KERNEL_CHECK_FALSE((input0_type == input1_type), KERNEL_STATUS_PARAM_INVALID,
|
||||
"The data type of input0 [%s] need be same with "
|
||||
"input1 [%s].",
|
||||
DTypeStr(input0_type).c_str(), DTypeStr(input1_type).c_str())
|
||||
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
T DivCal(const T &x_i, const T &y_i) {
|
||||
return static_cast<T>(Eigen::numext::floor(x_i / y_i));
|
||||
}
|
||||
|
||||
template <>
|
||||
int8_t DivCal(const int8_t &x_i, const int8_t &y_i) {
|
||||
if ((x_i < 0) != (y_i < 0)) {
|
||||
int8_t abs_x_i = x_i < 0 ? -x_i : x_i;
|
||||
int8_t abs_y_i = y_i < 0 ? -y_i : y_i;
|
||||
return (-(abs_x_i + abs_y_i - 1) / abs_y_i);
|
||||
} else {
|
||||
return (x_i / y_i);
|
||||
}
|
||||
}
|
||||
|
||||
template <>
|
||||
int16_t DivCal(const int16_t &x_i, const int16_t &y_i) {
|
||||
if ((x_i < 0) != (y_i < 0)) {
|
||||
int16_t abs_x_i = x_i < 0 ? -x_i : x_i;
|
||||
int16_t abs_y_i = y_i < 0 ? -y_i : y_i;
|
||||
return (-(abs_x_i + abs_y_i - 1) / abs_y_i);
|
||||
} else {
|
||||
return (x_i / y_i);
|
||||
}
|
||||
}
|
||||
|
||||
template <>
|
||||
int32_t DivCal(const int32_t &x_i, const int32_t &y_i) {
|
||||
if ((x_i < 0) != (y_i < 0)) {
|
||||
int32_t abs_x_i = x_i < 0 ? -x_i : x_i;
|
||||
int32_t abs_y_i = y_i < 0 ? -y_i : y_i;
|
||||
return (-(abs_x_i + abs_y_i - 1) / abs_y_i);
|
||||
} else {
|
||||
return (x_i / y_i);
|
||||
}
|
||||
}
|
||||
|
||||
template <>
|
||||
int64_t DivCal(const int64_t &x_i, const int64_t &y_i) {
|
||||
if ((x_i < 0) != (y_i < 0)) {
|
||||
int64_t abs_x_i = x_i < 0 ? -x_i : x_i;
|
||||
int64_t abs_y_i = y_i < 0 ? -y_i : y_i;
|
||||
return (-(abs_x_i + abs_y_i - 1) / abs_y_i);
|
||||
} else {
|
||||
return (x_i / y_i);
|
||||
}
|
||||
}
|
||||
|
||||
// special compute is used in the following situations.
|
||||
// 1. the shapes of input1 and input2 are the same
|
||||
// 2. input1 is a 1D tensor with only one element or input1 is scalar
|
||||
// 3. input2 is a 1D tensor with only one element or input2 is scalar
|
||||
// 4. the shapes of input1 and input2 are different
|
||||
template <typename T>
|
||||
uint32_t FloorDivCpuKernel::SpecialCompute(BcastShapeType type, int64_t start, int64_t end, const T *input1,
|
||||
const T *input2, T *output) {
|
||||
switch (type) {
|
||||
case BcastShapeType::SAME_SHAPE:
|
||||
for (int64_t i = start; i < end; ++i) {
|
||||
if (*(input2 + i) == static_cast<T>(0)) {
|
||||
KERNEL_LOG_ERROR("Invalid argumengt: Division by zero.");
|
||||
return KERNEL_STATUS_INNER_ERROR;
|
||||
}
|
||||
*(output + i) = DivCal<T>(*(input1 + i), *(input2 + i));
|
||||
}
|
||||
break;
|
||||
case BcastShapeType::X_ONE_ELEMENT:
|
||||
for (int64_t i = start; i < end; ++i) {
|
||||
if (*(input2 + i) == static_cast<T>(0)) {
|
||||
KERNEL_LOG_ERROR("Invalid argumengt: Division by zero.");
|
||||
return KERNEL_STATUS_INNER_ERROR;
|
||||
}
|
||||
*(output + i) = DivCal<T>(*input1, *(input2 + i));
|
||||
}
|
||||
break;
|
||||
case BcastShapeType::Y_ONE_ELEMENT:
|
||||
for (int64_t i = start; i < end; ++i) {
|
||||
if (*input2 == static_cast<T>(0)) {
|
||||
KERNEL_LOG_ERROR("Invalid argumengt: Division by zero.");
|
||||
return KERNEL_STATUS_INNER_ERROR;
|
||||
}
|
||||
*(output + i) = DivCal<T>(*(input1 + i), *input2);
|
||||
}
|
||||
break;
|
||||
default:
|
||||
KERNEL_LOG_WARN("Invalid type [%d]", static_cast<int32_t>(type));
|
||||
break;
|
||||
}
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
uint32_t FloorDivCpuKernel::NoBcastCompute(const CpuKernelContext &ctx) {
|
||||
auto in0 = reinterpret_cast<T *>(ctx.Input(0)->GetData());
|
||||
auto in1 = reinterpret_cast<T *>(ctx.Input(1)->GetData());
|
||||
auto out = reinterpret_cast<T *>(ctx.Output(0)->GetData());
|
||||
int64_t in0_elements_nums = ctx.Input(0)->NumElements();
|
||||
int64_t in1_elements_nums = ctx.Input(1)->NumElements();
|
||||
int64_t data_num = ctx.Output(0)->NumElements();
|
||||
BcastShapeType type = in0_elements_nums == in1_elements_nums
|
||||
? BcastShapeType::SAME_SHAPE
|
||||
: (in0_elements_nums == 1 ? BcastShapeType::X_ONE_ELEMENT : BcastShapeType::Y_ONE_ELEMENT);
|
||||
|
||||
if (data_num >= kParallelDataNumSameShape) {
|
||||
uint32_t min_core_num = 1;
|
||||
int64_t max_core_num = std::max(min_core_num, aicpu::CpuKernelUtils::GetCPUNum(ctx) - kResvCpuNum);
|
||||
|
||||
if (data_num <= kParallelDataNumSameShapeMid) {
|
||||
max_core_num = std::min(max_core_num, static_cast<int64_t>(4)); // up to 4 cpu cores
|
||||
}
|
||||
|
||||
if (max_core_num > data_num) {
|
||||
max_core_num = data_num;
|
||||
}
|
||||
uint32_t status = KERNEL_STATUS_OK;
|
||||
auto sharder_floor_div = [&](int64_t start, int64_t end) {
|
||||
uint32_t status_sharder = SpecialCompute<T>(type, start, end, in0, in1, out);
|
||||
if (status_sharder != KERNEL_STATUS_OK) {
|
||||
status = status_sharder;
|
||||
}
|
||||
};
|
||||
|
||||
KERNEL_HANDLE_ERROR(CpuKernelUtils::ParallelFor(ctx, data_num, data_num / max_core_num, sharder_floor_div),
|
||||
"FloorDiv Compute failed.");
|
||||
return status;
|
||||
}
|
||||
|
||||
return SpecialCompute<T>(type, 0, data_num, in0, in1, out);
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
uint32_t FloorDivCpuKernel::BcastParallelCompute(const CpuKernelContext &ctx, const Bcast &bcast) {
|
||||
auto in0 = reinterpret_cast<T *>(ctx.Input(0)->GetData());
|
||||
auto in1 = reinterpret_cast<T *>(ctx.Input(1)->GetData());
|
||||
auto out = reinterpret_cast<T *>(ctx.Output(0)->GetData());
|
||||
|
||||
uint32_t min_core_num = 1;
|
||||
int64_t max_core_num = std::max(min_core_num, aicpu::CpuKernelUtils::GetCPUNum(ctx) - kResvCpuNum);
|
||||
|
||||
int64_t data_num = ctx.Output(0)->NumElements();
|
||||
if (data_num <= kParallelDataNumMid) {
|
||||
max_core_num = std::min(max_core_num, static_cast<int64_t>(4)); // up to 4 cpu cores
|
||||
}
|
||||
|
||||
if (max_core_num > data_num) {
|
||||
max_core_num = data_num;
|
||||
}
|
||||
uint32_t status = KERNEL_STATUS_OK;
|
||||
auto sharder_floor_div = [&](int64_t start, int64_t end) {
|
||||
for (int64_t i = start; i < end; ++i) {
|
||||
if (*(in1 + bcast.GetBroadcastYIndex(i)) == static_cast<T>(0)) {
|
||||
KERNEL_LOG_ERROR("Invalid argumengt: Division by zero.");
|
||||
status = KERNEL_STATUS_INNER_ERROR;
|
||||
break;
|
||||
}
|
||||
*(out + i) = DivCal<T>(*(in0 + bcast.GetBroadcastXIndex(i)), *(in1 + bcast.GetBroadcastYIndex(i)));
|
||||
}
|
||||
};
|
||||
|
||||
KERNEL_HANDLE_ERROR(CpuKernelUtils::ParallelFor(ctx, data_num, data_num / max_core_num, sharder_floor_div),
|
||||
"FloorDiv Compute failed.");
|
||||
return status;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
uint32_t FloorDivCpuKernel::BcastCompute(const CpuKernelContext &ctx, const Bcast &bcast) {
|
||||
auto in0 = reinterpret_cast<T *>(ctx.Input(0)->GetData());
|
||||
auto in1 = reinterpret_cast<T *>(ctx.Input(1)->GetData());
|
||||
auto out = reinterpret_cast<T *>(ctx.Output(0)->GetData());
|
||||
|
||||
int64_t data_num = ctx.Output(0)->NumElements();
|
||||
if (data_num >= kParallelDataNum) {
|
||||
return BcastParallelCompute<T>(ctx, bcast);
|
||||
} else {
|
||||
for (int64_t i = 0; i < data_num; ++i) {
|
||||
if (*(in1 + bcast.GetBroadcastYIndex(i)) == static_cast<T>(0)) {
|
||||
KERNEL_LOG_ERROR("Invalid argumengt: Division by zero.");
|
||||
return KERNEL_STATUS_INNER_ERROR;
|
||||
}
|
||||
*(out + i) = DivCal<T>(*(in0 + bcast.GetBroadcastXIndex(i)), *(in1 + bcast.GetBroadcastYIndex(i)));
|
||||
}
|
||||
}
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
uint32_t FloorDivCpuKernel::FloorDivCompute(const CpuKernelContext &ctx) {
|
||||
Tensor *input0_tensor = ctx.Input(0);
|
||||
auto input0_shape = input0_tensor->GetTensorShape()->GetDimSizes();
|
||||
int64_t input0_elements_nums = input0_tensor->NumElements();
|
||||
|
||||
Tensor *input1_tensor = ctx.Input(1);
|
||||
auto input1_shape = input1_tensor->GetTensorShape()->GetDimSizes();
|
||||
int64_t input1_elements_nums = input1_tensor->NumElements();
|
||||
|
||||
bool isNeedBcast = (input0_shape == input1_shape) || (input0_elements_nums == 1) || (input1_elements_nums == 1);
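// Note: despite its name, isNeedBcast == true means the fast element-wise path applies
// (identical shapes or a single-element operand) and no index broadcasting is required.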
|
||||
if (isNeedBcast) {
|
||||
return NoBcastCompute<T>(ctx);
|
||||
} else {
|
||||
Bcast bcast(input0_shape, input1_shape);
|
||||
if (!bcast.IsValid()) {
|
||||
KERNEL_LOG_ERROR("[%s] broadcast failed.", ctx.GetOpType().c_str());
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
|
||||
return BcastCompute<T>(ctx, bcast);
|
||||
}
|
||||
}
|
||||
|
||||
REGISTER_CPU_KERNEL(kFloorDiv, FloorDivCpuKernel);
|
||||
} // namespace aicpu
|
|
@@ -0,0 +1,49 @@
|
|||
/**
|
||||
* Copyright (c) Huawei Technologies Co., Ltd. 2021-2021. All rights reserved.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
#ifndef AICPU_KERNELS_NORMALIZED_FLOORDIV_H_
|
||||
#define AICPU_KERNELS_NORMALIZED_FLOORDIV_H_
|
||||
|
||||
#include "cpu_ops_kernel.h"
|
||||
#include "utils/bcast.h"
|
||||
|
||||
namespace aicpu {
|
||||
class FloorDivCpuKernel : public CpuKernel {
|
||||
public:
|
||||
FloorDivCpuKernel() = default;
|
||||
~FloorDivCpuKernel() override = default;
|
||||
|
||||
uint32_t Compute(CpuKernelContext &ctx) override;
|
||||
|
||||
private:
|
||||
uint32_t FloorDivParamCheck(const CpuKernelContext &ctx) const;
|
||||
|
||||
template <typename T>
|
||||
uint32_t SpecialCompute(BcastShapeType type, int64_t start, int64_t end, const T *input1, const T *input2, T *output);
|
||||
|
||||
template <typename T>
|
||||
uint32_t NoBcastCompute(const CpuKernelContext &ctx);
|
||||
|
||||
template <typename T>
|
||||
uint32_t BcastCompute(const CpuKernelContext &ctx, const Bcast &bcast);
|
||||
|
||||
template <typename T>
|
||||
uint32_t BcastParallelCompute(const CpuKernelContext &ctx, const Bcast &bcast);
|
||||
|
||||
template <typename T>
|
||||
uint32_t FloorDivCompute(const CpuKernelContext &ctx);
|
||||
};
|
||||
} // namespace aicpu
|
||||
#endif
|
|
@@ -0,0 +1,162 @@
|
|||
/**
|
||||
* Copyright (c) Huawei Technologies Co., Ltd. 2021-2021. All rights reserved.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
#include "log1p.h"
|
||||
|
||||
#include "cpu_kernel_utils.h"
|
||||
#include "utils/eigen_tensor.h"
|
||||
#include "utils/kernel_util.h"
|
||||
|
||||
namespace {
|
||||
const uint32_t kOutputNum = 1;
|
||||
const uint32_t kInputNum = 1;
|
||||
const char *const kLog1p = "Log1p";
|
||||
constexpr int64_t kParallelDataNums = 16 * 1024;
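// Inputs whose byte size is at most kParallelDataNums are processed serially; larger inputs use ParallelFor.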
|
||||
|
||||
#define LOG1P_COMPUTE_CASE(DTYPE, TYPE, CTX) \
|
||||
case (DTYPE): { \
|
||||
uint32_t result = Log1pCompute<TYPE>(CTX); \
|
||||
if (result != KERNEL_STATUS_OK) { \
|
||||
KERNEL_LOG_ERROR("Log1p kernel compute failed."); \
|
||||
return result; \
|
||||
} \
|
||||
break; \
|
||||
}
|
||||
|
||||
#define LOG1P_COMPUTE_CASE2(DTYPE, TYPE, CTX) \
|
||||
case (DTYPE): { \
|
||||
uint32_t result = Log1pComputeComplex<TYPE>(CTX); \
|
||||
if (result != KERNEL_STATUS_OK) { \
|
||||
KERNEL_LOG_ERROR("Log1p kernel compute failed."); \
|
||||
return result; \
|
||||
} \
|
||||
break; \
|
||||
}
|
||||
} // namespace
|
||||
|
||||
namespace aicpu {
|
||||
uint32_t Log1pCpuKernel::Compute(CpuKernelContext &ctx) {
|
||||
KERNEL_HANDLE_ERROR(NormalCheck(ctx, kInputNum, kOutputNum), "[%s] check input and output failed.", kLog1p);
|
||||
KERNEL_HANDLE_ERROR(Log1pCheck(ctx), "[%s] check params failed.", kLog1p);
|
||||
DataType data_type = ctx.Input(0)->GetDataType();
|
||||
switch (data_type) {
|
||||
LOG1P_COMPUTE_CASE(DT_FLOAT16, Eigen::half, ctx)
|
||||
LOG1P_COMPUTE_CASE(DT_FLOAT, float, ctx)
|
||||
LOG1P_COMPUTE_CASE(DT_DOUBLE, double, ctx)
|
||||
LOG1P_COMPUTE_CASE2(DT_COMPLEX64, std::complex<float>, ctx)
|
||||
LOG1P_COMPUTE_CASE2(DT_COMPLEX128, std::complex<double>, ctx)
|
||||
default:
|
||||
KERNEL_LOG_ERROR("Log1p kernel data type [%s] not support.", DTypeStr(data_type).c_str());
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
|
||||
uint32_t Log1pCpuKernel::Log1pCheck(const CpuKernelContext &ctx) const {
|
||||
auto input_0 = ctx.Input(0);
|
||||
auto output_0 = ctx.Output(0);
|
||||
KERNEL_CHECK_NULLPTR(input_0->GetData(), KERNEL_STATUS_PARAM_INVALID, "Get input data failed.")
|
||||
KERNEL_CHECK_NULLPTR(output_0->GetData(), KERNEL_STATUS_PARAM_INVALID, "Get output data failed")
|
||||
KERNEL_CHECK_NULLPTR(input_0->GetTensorShape(), KERNEL_STATUS_PARAM_INVALID, "Get input tensor shape failed.")
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
uint32_t Log1pCpuKernel::Log1pCompute(const CpuKernelContext &ctx) {
|
||||
auto input_x = reinterpret_cast<T *>(ctx.Input(0)->GetData());
|
||||
auto output_y = reinterpret_cast<T *>(ctx.Output(0)->GetData());
|
||||
int64_t data_num = ctx.Input(0)->NumElements();
|
||||
int64_t data_size = data_num * static_cast<int64_t>(sizeof(T));
|
||||
if (data_size <= kParallelDataNums) {
|
||||
for (int64_t i = 0; i < data_num; i++) {
|
||||
KERNEL_CHECK_FALSE(*(input_x + i) >= static_cast<T>(-1), KERNEL_STATUS_PARAM_INVALID,
|
||||
"[%llu] must be at least more than -1.", i);
|
||||
*(output_y + i) = Eigen::numext::log1p(*(input_x + i));
|
||||
}
|
||||
} else {
|
||||
uint32_t min_core_num = 1;
|
||||
int64_t max_core_num = std::max(min_core_num, aicpu::CpuKernelUtils::GetCPUNum(ctx) - kResvCpuNum);
|
||||
if (max_core_num > data_num) {
|
||||
max_core_num = data_num;
|
||||
}
|
||||
auto shard_log1p = [&](size_t start, size_t end) {
|
||||
for (size_t i = start; i < end; i++) {
|
||||
KERNEL_CHECK_FALSE(*(input_x + i) >= static_cast<T>(-1), KERNEL_STATUS_PARAM_INVALID,
|
||||
"[%llu] must be at least more than -1.", i);
|
||||
*(output_y + i) = Eigen::numext::log1p(*(input_x + i));
|
||||
}
|
||||
return KERNEL_STATUS_OK;
|
||||
};
|
||||
KERNEL_HANDLE_ERROR(CpuKernelUtils::ParallelFor(ctx, data_num, data_num / max_core_num, shard_log1p),
|
||||
"Log1p Compute failed.");
|
||||
}
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
uint32_t Log1pCpuKernel::Log1pComputeComplex(const CpuKernelContext &ctx) {
|
||||
auto input_x = reinterpret_cast<T *>(ctx.Input(0)->GetData());
|
||||
auto output_y = reinterpret_cast<T *>(ctx.Output(0)->GetData());
|
||||
auto data_type = ctx.Input(0)->GetDataType();
|
||||
int64_t data_num = ctx.Input(0)->NumElements();
|
||||
int64_t data_size = data_num * static_cast<int64_t>(sizeof(T));
|
||||
typedef Eigen::Array<T, Eigen::Dynamic, Eigen::Dynamic> ArrayxXd;
|
||||
ArrayxXd array_x(1, data_num);
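// array_x only buffers the complex inputs so that their real part can be read for the domain check below.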
|
||||
if (data_size <= kParallelDataNums) {
|
||||
if (data_type == DT_COMPLEX64) {
|
||||
for (int64_t i = 0; i < data_num; i++) {
|
||||
array_x(0, i) = *(input_x + i);
|
||||
KERNEL_CHECK_FALSE(array_x(0, i).real() >= static_cast<float>(-1), KERNEL_STATUS_PARAM_INVALID,
|
||||
"[%llu] must be at least more than -1.", i);
|
||||
*(output_y + i) = Eigen::numext::log1p(*(input_x + i));
|
||||
}
|
||||
} else {
|
||||
for (int64_t i = 0; i < data_num; i++) {
|
||||
array_x(0, i) = *(input_x + i);
|
||||
KERNEL_CHECK_FALSE(array_x(0, i).real() >= static_cast<double>(-1), KERNEL_STATUS_PARAM_INVALID,
|
||||
"[%llu] must be at least more than -1.", i);
|
||||
*(output_y + i) = Eigen::numext::log1p(*(input_x + i));
|
||||
}
|
||||
}
|
||||
return KERNEL_STATUS_OK;
|
||||
} else {
|
||||
uint32_t min_core_num = 1;
|
||||
int64_t max_core_num = std::max(min_core_num, aicpu::CpuKernelUtils::GetCPUNum(ctx) - kResvCpuNum);
|
||||
if (max_core_num > data_num) {
|
||||
max_core_num = data_num;
|
||||
}
|
||||
auto shard_log1p = [&](size_t start, size_t end) {
|
||||
for (size_t i = start; i < end; i++) {
|
||||
if (data_type == DT_COMPLEX64) {
|
||||
array_x(0, i) = *(input_x + i);
|
||||
KERNEL_CHECK_FALSE(array_x(0, i).real() >= static_cast<float>(-1), KERNEL_STATUS_PARAM_INVALID,
|
||||
"[%llu] must be at least more than -1.", i);
|
||||
*(output_y + i) = Eigen::numext::log1p(*(input_x + i));
|
||||
} else {
|
||||
array_x(0, i) = *(input_x + i);
|
||||
KERNEL_CHECK_FALSE(array_x(0, i).real() >= static_cast<double>(-1), KERNEL_STATUS_PARAM_INVALID,
|
||||
"[%llu] must be at least more than -1.", i);
|
||||
*(output_y + i) = Eigen::numext::log1p(*(input_x + i));
|
||||
}
|
||||
}
|
||||
return KERNEL_STATUS_OK;
|
||||
};
|
||||
KERNEL_HANDLE_ERROR(CpuKernelUtils::ParallelFor(ctx, data_num, data_num / max_core_num, shard_log1p),
|
||||
"Log1p Compute failed.");
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
}
|
||||
REGISTER_CPU_KERNEL(kLog1p, Log1pCpuKernel);
|
||||
} // namespace aicpu
|
|
@@ -0,0 +1,38 @@
|
|||
/**
|
||||
* Copyright (c) Huawei Technologies Co., Ltd. 2020-2021. All rights reserved.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
#ifndef AICPU_KERNELS_NORMALIZED_LOG1P_H
|
||||
#define AICPU_KERNELS_NORMALIZED_LOG1P_H
|
||||
|
||||
#include "cpu_ops_kernel.h"
|
||||
|
||||
namespace aicpu {
|
||||
class Log1pCpuKernel : public CpuKernel {
|
||||
public:
|
||||
Log1pCpuKernel() = default;
|
||||
~Log1pCpuKernel() override = default;
|
||||
uint32_t Compute(CpuKernelContext &ctx) override;
|
||||
|
||||
private:
|
||||
uint32_t Log1pCheck(const CpuKernelContext &ctx) const;
|
||||
|
||||
template <typename T>
|
||||
uint32_t Log1pCompute(const CpuKernelContext &ctx);
|
||||
|
||||
template <typename T>
|
||||
uint32_t Log1pComputeComplex(const CpuKernelContext &ctx);
|
||||
};
|
||||
} // namespace aicpu
|
||||
#endif
|
|
@@ -0,0 +1,180 @@
|
|||
/**
|
||||
* Copyright 2021 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include "matrix_triangular_solve.h"
|
||||
#include <chrono>
|
||||
#include <fstream>
|
||||
#include <iostream>
|
||||
#include "Eigen/Core"
|
||||
#include "complex"
|
||||
#include "cpu_kernel_utils.h"
|
||||
#include "utils/eigen_tensor.h"
|
||||
#include "utils/kernel_util.h"
|
||||
#include "kernel_log.h"
|
||||
|
||||
using namespace Eigen;
|
||||
using namespace std;
|
||||
|
||||
namespace {
|
||||
const uint32_t kOutputNum = 1;
|
||||
const uint32_t kInputNum = 2;
|
||||
const char *kMatrixTriangularSolve = "MatrixTriangularSolve";
|
||||
constexpr int64_t kParallelDataNums = 16 * 1024;
|
||||
|
||||
#define MATRIXTRIANGULARSOLVE_COMPUTE_CASE(DTYPE, TYPE, CTX) \
|
||||
case (DTYPE): { \
|
||||
uint32_t result = MatrixTriangularSolveCompute<TYPE>(CTX); \
|
||||
if (result != KERNEL_STATUS_OK) { \
|
||||
KERNEL_LOG_ERROR("MatrixTriangularSolve kernel compute failed."); \
|
||||
return result; \
|
||||
} \
|
||||
break; \
|
||||
}
|
||||
} // namespace
|
||||
|
||||
namespace aicpu {
|
||||
uint32_t MatrixTriangularSolveCpuKernel::Compute(CpuKernelContext &ctx) {
|
||||
KERNEL_HANDLE_ERROR(NormalCheck(ctx, kInputNum, kOutputNum),
|
||||
"MatrixTriangularSolve check input and output number failed.");
|
||||
|
||||
KERNEL_HANDLE_ERROR(MatrixTriangularSolveCheck(ctx), "MatrixTriangularSolve check params failed.");
|
||||
// check the data type of the inputs
|
||||
auto data_type = ctx.Input(0)->GetDataType();
|
||||
switch (data_type) {
|
||||
MATRIXTRIANGULARSOLVE_COMPUTE_CASE(DT_FLOAT, float, ctx)
|
||||
MATRIXTRIANGULARSOLVE_COMPUTE_CASE(DT_DOUBLE, double, ctx)
|
||||
MATRIXTRIANGULARSOLVE_COMPUTE_CASE(DT_COMPLEX64, std::complex<float>, ctx)
|
||||
MATRIXTRIANGULARSOLVE_COMPUTE_CASE(DT_COMPLEX128, std::complex<double>, ctx)
|
||||
default:
|
||||
KERNEL_LOG_ERROR("MatrixTriangularSolve kernel data type [%s] not support.", DTypeStr(data_type).c_str());
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
|
||||
uint32_t MatrixTriangularSolveCpuKernel::MatrixTriangularSolveCheck(CpuKernelContext &ctx) {
|
||||
Tensor *in_matrix = ctx.Input(0);
|
||||
Tensor *in_rhs = ctx.Input(1);
|
||||
// check same data type constraint
|
||||
auto in_type0 = in_matrix->GetDataType();
|
||||
auto in_type1 = in_rhs->GetDataType();
|
||||
KERNEL_CHECK_FALSE((in_type0 == in_type1), KERNEL_STATUS_PARAM_INVALID,
|
||||
"The data type of input1 [%s] need be same with "
|
||||
"input0 [%s].",
|
||||
DTypeStr(in_type1).c_str(), DTypeStr(in_type0).c_str())
|
||||
// check the number of matrix
|
||||
auto in_shape0 = in_matrix->GetTensorShape();
|
||||
auto in_shape1 = in_rhs->GetTensorShape();
|
||||
|
||||
std::vector<int64_t> dims0 = in_shape0->GetDimSizes();
|
||||
std::vector<int64_t> dims1 = in_shape1->GetDimSizes();
|
||||
|
||||
// Check the shape of two inputs
|
||||
if (dims0[0] != dims1[0]) {
|
||||
KERNEL_LOG_ERROR("The shapes of two inputs are not matched");
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
|
||||
// check square
|
||||
int m = dims0.size();
|
||||
if (dims0[m - 2] != dims0[m - 1] || dims0[m - 1] == 0) {
|
||||
KERNEL_LOG_ERROR("The input0 must be one or more squares.");
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
uint32_t MatrixTriangularSolveCpuKernel::MatrixTriangularSolveCompute(CpuKernelContext &ctx) {
|
||||
Tensor *matrix_tensor = ctx.Input(0);
|
||||
Tensor *rhs_tensor = ctx.Input(1);
|
||||
Tensor *y_tensor = ctx.Output(0);
|
||||
|
||||
auto input_matrix = reinterpret_cast<T *>(matrix_tensor->GetData());
|
||||
KERNEL_CHECK_NULLPTR(input_matrix, KERNEL_STATUS_PARAM_INVALID, "Get input data0 failed.")
|
||||
auto input_rhs = reinterpret_cast<T *>(rhs_tensor->GetData());
|
||||
KERNEL_CHECK_NULLPTR(input_rhs, KERNEL_STATUS_PARAM_INVALID, "Get input data1 failed.")
|
||||
auto output_y = reinterpret_cast<T *>(y_tensor->GetData());
|
||||
KERNEL_CHECK_NULLPTR(output_y, KERNEL_STATUS_PARAM_INVALID, "Get output data failed.")
|
||||
|
||||
AttrValue *lower_attr = ctx.GetAttr("lower");
|
||||
KERNEL_CHECK_NULLPTR(lower_attr, KERNEL_STATUS_PARAM_INVALID, "Get attr [lower] failed.");
|
||||
AttrValue *adjoint_attr = ctx.GetAttr("adjoint");
|
||||
KERNEL_CHECK_NULLPTR(adjoint_attr, KERNEL_STATUS_PARAM_INVALID, "Get attr [adjoint] failed.");
|
||||
bool lower_data = lower_attr->GetBool();
|
||||
bool adjoint_data = adjoint_attr->GetBool();
|
||||
|
||||
auto matrix_shape = matrix_tensor->GetTensorShape();
|
||||
auto rhs_shape = rhs_tensor->GetTensorShape();
|
||||
auto y_shape = y_tensor->GetTensorShape();
|
||||
|
||||
// Get the number of elements
|
||||
auto input1_num = matrix_tensor->NumElements();
|
||||
|
||||
// slice
|
||||
std::vector<int64_t> matrix_dims = matrix_shape->GetDimSizes();
|
||||
auto last_matrix_dims = *(matrix_dims.end() - 1);
|
||||
size_t matrix_size = last_matrix_dims * last_matrix_dims; // size of a matrix
|
||||
size_t matrix_num = input1_num / matrix_size; // number of matrix
|
||||
|
||||
std::vector<int64_t> rhs_dims = rhs_shape->GetDimSizes();
|
||||
auto last_rhs_dims = *(rhs_dims.end() - 1);
|
||||
size_t rhs_size = last_matrix_dims * last_rhs_dims;
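// Each batch element pairs an n x n coefficient matrix (matrix_size elements) with an n x k
// right-hand side (rhs_size elements), where n = last_matrix_dims and k = last_rhs_dims.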
|
||||
|
||||
auto data_size = matrix_num * matrix_size;
|
||||
|
||||
auto shard_matrix_triangular_solve = [&](size_t start, size_t end) {
|
||||
for (size_t k = start; k < end; ++k) {
|
||||
Eigen::Map<Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>> eigen_input(
|
||||
input_matrix + k * matrix_size, last_matrix_dims, last_matrix_dims);
|
||||
Eigen::Map<Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>> eigen_rhs(
|
||||
input_rhs + k * rhs_size, last_matrix_dims, last_rhs_dims);
|
||||
Eigen::Map<Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>> eigen_output(
|
||||
output_y + k * rhs_size, last_matrix_dims, last_rhs_dims);
|
||||
if (lower_data) {
|
||||
auto triangle = eigen_input.template triangularView<Eigen::Lower>();
|
||||
if (adjoint_data) {
|
||||
eigen_output.noalias() = triangle.adjoint().solve(eigen_rhs);
|
||||
} else {
|
||||
eigen_output.noalias() = triangle.solve(eigen_rhs);
|
||||
}
|
||||
} else {
|
||||
auto triangle = eigen_input.template triangularView<Eigen::Upper>();
|
||||
if (adjoint_data) {
|
||||
eigen_output.noalias() = triangle.adjoint().solve(eigen_rhs);
|
||||
} else {
|
||||
eigen_output.noalias() = triangle.solve(eigen_rhs);
|
||||
}
|
||||
}
|
||||
}
|
||||
};
|
||||
if (data_size < kParallelDataNums) {
|
||||
shard_matrix_triangular_solve(0, matrix_num);
|
||||
} else {
|
||||
uint32_t min_core_num = 1;
|
||||
uint64_t max_core_num = std::max(min_core_num, aicpu::CpuKernelUtils::GetCPUNum(ctx) - 2);
|
||||
if (max_core_num > matrix_num) {
|
||||
max_core_num = matrix_num;
|
||||
}
|
||||
KERNEL_HANDLE_ERROR(
|
||||
CpuKernelUtils::ParallelFor(ctx, matrix_num, matrix_num / max_core_num, shard_matrix_triangular_solve),
|
||||
"MatrixTriangularSolve Compute failed.");
|
||||
}
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
REGISTER_CPU_KERNEL(kMatrixTriangularSolve, MatrixTriangularSolveCpuKernel);
|
||||
} // namespace aicpu
|
|
@@ -0,0 +1,42 @@
|
|||
/**
|
||||
* Copyright 2021 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#ifndef AICPU_KERNELS_NORMALIZED_MATRIXTRIANGULARSOLVE_H_
|
||||
#define AICPU_KERNELS_NORMALIZED_MATRIXTRIANGULARSOLVE_H_
|
||||
|
||||
#include "cpu_ops_kernel.h"
|
||||
#include "Eigen/Core"
|
||||
|
||||
namespace aicpu {
|
||||
class MatrixTriangularSolveCpuKernel : public CpuKernel {
|
||||
public:
|
||||
MatrixTriangularSolveCpuKernel() = default;
|
||||
~MatrixTriangularSolveCpuKernel() override = default;
|
||||
|
||||
protected:
|
||||
uint32_t Compute(CpuKernelContext &ctx) override;
|
||||
|
||||
private:
|
||||
bool lower;
|
||||
bool adjoint;
|
||||
|
||||
template <typename T>
|
||||
static uint32_t MatrixTriangularSolveCompute(CpuKernelContext &ctx);
|
||||
|
||||
static uint32_t MatrixTriangularSolveCheck(CpuKernelContext &ctx);
|
||||
};
|
||||
} // namespace aicpu
|
||||
#endif
|
|
@ -0,0 +1,127 @@
|
|||
/**
|
||||
* Copyright 2021 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
#include "maximum_grad_grad.h"
|
||||
|
||||
#include <fstream>
|
||||
#include <iostream>
|
||||
|
||||
#include "cpu_kernel_utils.h"
|
||||
#include "utils/eigen_tensor.h"
|
||||
#include "utils/kernel_util.h"
|
||||
|
||||
namespace {
|
||||
constexpr uint32_t kMaximumGradGradInputNum = 4;
|
||||
constexpr uint32_t kMaximumGradGradOutputNum = 3;
|
||||
const char *kMaximumGradGrad = "MaximumGradGrad";
|
||||
|
||||
#define MAXIMUMGRADGRAD_COMPUTE_CASE(DTYPE, TYPE, CTX) \
|
||||
case (DTYPE): { \
|
||||
uint32_t result = MaximumGradGradCompute<TYPE>(CTX); \
|
||||
if (result != KERNEL_STATUS_OK) { \
|
||||
KERNEL_LOG_ERROR("MaximumGradGrad kernel compute failed."); \
|
||||
return result; \
|
||||
} \
|
||||
break; \
|
||||
}
|
||||
} // namespace
|
||||
|
||||
namespace aicpu {
|
||||
uint32_t MaximumGradGradCpuKernel::Compute(CpuKernelContext &ctx) {
|
||||
// check params
|
||||
KERNEL_HANDLE_ERROR(NormalCheck(ctx, kMaximumGradGradInputNum, kMaximumGradGradOutputNum),
|
||||
"MaximumGradGrad check input and output number failed.");
|
||||
KERNEL_HANDLE_ERROR(MaximumGradGradParamCheck(ctx), "MaximumGradGrad check params failed.");
|
||||
auto data_type = ctx.Input(0)->GetDataType();
|
||||
switch (data_type) {
|
||||
MAXIMUMGRADGRAD_COMPUTE_CASE(DT_INT32, int32_t, ctx)
|
||||
MAXIMUMGRADGRAD_COMPUTE_CASE(DT_FLOAT, float, ctx)
|
||||
MAXIMUMGRADGRAD_COMPUTE_CASE(DT_FLOAT16, Eigen::half, ctx)
|
||||
default:
|
||||
KERNEL_LOG_ERROR("The data type of input is not support, input data type is [%s].", DTypeStr(data_type).c_str());
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
|
||||
uint32_t MaximumGradGradCpuKernel::MaximumGradGradParamCheck(CpuKernelContext &ctx) {
|
||||
// Non-null inputs and outputs have already been verified in NormalCheck.
|
||||
Tensor *x1 = ctx.Input(0);
|
||||
Tensor *x2 = ctx.Input(1);
|
||||
Tensor *grad_y1 = ctx.Input(2);
|
||||
Tensor *grad_y2 = ctx.Input(3);
|
||||
// type check
|
||||
DataType grad_y1_type = grad_y1->GetDataType();
|
||||
DataType grad_y2_type = grad_y2->GetDataType();
|
||||
DataType x1_type = x1->GetDataType();
|
||||
DataType x2_type = x2->GetDataType();
|
||||
KERNEL_CHECK_FALSE(((grad_y1_type == grad_y2_type) && (grad_y2_type == x1_type) && (x1_type == x2_type)),
|
||||
KERNEL_STATUS_PARAM_INVALID,
|
||||
"The data type of grad_y1 [%s], grad_y2 [%s], x1 [%s] and "
|
||||
"x2 [%s] need to be same.",
|
||||
DTypeStr(grad_y1_type).c_str(), DTypeStr(grad_y2_type).c_str(), DTypeStr(x1_type).c_str(),
|
||||
DTypeStr(x2_type).c_str())
|
||||
// shape check
|
||||
auto grad_y1_shape = grad_y1->GetTensorShape()->GetDimSizes();
|
||||
auto grad_y2_shape = grad_y2->GetTensorShape()->GetDimSizes();
|
||||
auto x1_shape = x1->GetTensorShape()->GetDimSizes();
|
||||
auto x2_shape = x2->GetTensorShape()->GetDimSizes();
|
||||
KERNEL_CHECK_FALSE(grad_y1_shape == x1_shape, KERNEL_STATUS_PARAM_INVALID, "Mismatch in shape of grad_y1 and x1.");
|
||||
KERNEL_CHECK_FALSE(grad_y2_shape == x2_shape, KERNEL_STATUS_PARAM_INVALID, "Mismatch in shape of grad_y2 and x2.");
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
uint32_t MaximumGradGradCpuKernel::MaximumGradGradCompute(CpuKernelContext &ctx) {
|
||||
Tensor *input0_tensor = ctx.Input(0);
|
||||
Tensor *input1_tensor = ctx.Input(1);
|
||||
|
||||
auto input0_shape = input0_tensor->GetTensorShape()->GetDimSizes();
|
||||
auto input1_shape = input1_tensor->GetTensorShape()->GetDimSizes();
|
||||
|
||||
Bcast bcast(input0_shape, input1_shape);
|
||||
if (!bcast.IsValid()) {
|
||||
KERNEL_LOG_ERROR("[%s] broadcast failed.", ctx.GetOpType().c_str());
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
return BcastCompute<T>(ctx, bcast);
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
uint32_t MaximumGradGradCpuKernel::BcastCompute(CpuKernelContext &ctx, Bcast &bcast) {
|
||||
auto in0 = reinterpret_cast<T *>(ctx.Input(0)->GetData());
|
||||
auto in1 = reinterpret_cast<T *>(ctx.Input(1)->GetData());
|
||||
auto in2 = reinterpret_cast<T *>(ctx.Input(2)->GetData());
|
||||
auto in3 = reinterpret_cast<T *>(ctx.Input(3)->GetData());
|
||||
auto out0 = reinterpret_cast<T *>(ctx.Output(0)->GetData());
|
||||
auto out1 = reinterpret_cast<T *>(ctx.Output(1)->GetData());
|
||||
auto out2 = reinterpret_cast<T *>(ctx.Output(2)->GetData());
|
||||
*out0 = static_cast<T>(0);
|
||||
*out1 = static_cast<T>(0);
|
||||
int64_t data_num = ctx.Output(2)->NumElements();
|
||||
|
||||
for (int64_t i = 0; i < data_num; ++i) {
|
||||
if (*(in0 + bcast.GetBroadcastXIndex(i)) >= *(in1 + bcast.GetBroadcastYIndex(i))) {
|
||||
*(out2 + i) = *(in2 + bcast.GetBroadcastXIndex(i));
|
||||
} else {
|
||||
*(out2 + i) = *(in3 + bcast.GetBroadcastYIndex(i));
|
||||
}
|
||||
}
|
||||
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
|
||||
REGISTER_CPU_KERNEL(kMaximumGradGrad, MaximumGradGradCpuKernel);
|
||||
} // namespace aicpu
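// Reading of BcastCompute above (illustrative, values assumed): MaximumGradGrad routes the
// incoming second-order gradients element-wise. Where x1 >= x2 the forward Maximum picked x1,
// so out2 takes grad_y1; otherwise it takes grad_y2. out0 and out1 are written as zeros
// (only their first element is set in this implementation).
//   x1 = [1, 5], x2 = [3, 2], grad_y1 = [10, 20], grad_y2 = [30, 40]
//   -> out2 = [30, 20]   (x1[0] < x2[0] picks grad_y2[0]; x1[1] >= x2[1] picks grad_y1[1])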
|
|
@ -0,0 +1,41 @@
|
|||
/**
|
||||
* Copyright 2021 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
#ifndef AICPU_KERNELS_NORMALIZED_MAXIMUM_GRAD_GRAD_H_
|
||||
#define AICPU_KERNELS_NORMALIZED_MAXIMUM_GRAD_GRAD_H_
|
||||
|
||||
#include "cpu_ops_kernel.h"
|
||||
#include "utils/bcast.h"
|
||||
|
||||
namespace aicpu {
|
||||
class MaximumGradGradCpuKernel : public CpuKernel {
|
||||
public:
|
||||
MaximumGradGradCpuKernel() = default;
|
||||
~MaximumGradGradCpuKernel() override = default;
|
||||
|
||||
protected:
|
||||
uint32_t Compute(CpuKernelContext &ctx) override;
|
||||
|
||||
private:
|
||||
uint32_t MaximumGradGradParamCheck(CpuKernelContext &ctx);
|
||||
|
||||
template <typename T>
|
||||
uint32_t MaximumGradGradCompute(CpuKernelContext &ctx);
|
||||
|
||||
template <typename T>
|
||||
uint32_t BcastCompute(CpuKernelContext &ctx, Bcast &bcast);
|
||||
};
|
||||
} // namespace aicpu
|
||||
#endif
|
|
@ -0,0 +1,286 @@
|
|||
/**
|
||||
* Copyright 2021 Harbin Institute of Technology
|
||||
* Copyright 2021 Huawei Technologies Co., Ltd.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include "maxpool.h"
|
||||
|
||||
#include <Eigen/Dense>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
#include "cpu_kernel_utils.h"
|
||||
#include "utils/eigen_tensor.h"
|
||||
#include "utils/kernel_util.h"
|
||||
namespace {
|
||||
const char *MAXPOOL = "MaxPool";
|
||||
constexpr uint32_t kMaxPoolInputNum = 1;
|
||||
constexpr uint32_t kMaxPoolOutputNum = 1;
|
||||
constexpr int64_t kParallelNum = 64 * 1024;
|
||||
struct PoolParams {
|
||||
int depth;
|
||||
|
||||
int tensor_cols;
|
||||
int tensor_rows;
|
||||
int tensor_batch;
|
||||
|
||||
int ksize_rows;
|
||||
int ksize_cols;
|
||||
int ksize_depth;
|
||||
|
||||
int strides_rows;
|
||||
int strides_cols;
|
||||
int strides_depth;
|
||||
|
||||
int64_t out_height;
|
||||
int64_t out_width;
|
||||
int out_depth;
|
||||
|
||||
int64_t pad_top;
|
||||
int64_t pad_bottom;
|
||||
int64_t pad_left;
|
||||
int64_t pad_right;
|
||||
};
|
||||
} // namespace
|
||||
namespace aicpu {
|
||||
uint32_t GetOutputSize(int input_size, int kernel_size, int stride, const std::string &padding, int64_t *output_size,
|
||||
int64_t *padding_before, int64_t *padding_after) {
|
||||
KERNEL_CHECK_FALSE(stride > 0, KERNEL_STATUS_PARAM_INVALID, "[MaxPool] Stride must be positive.");
|
||||
std::string same("SAME"), valid("VALID");
|
||||
if (valid == padding) {
|
||||
*output_size = (input_size - kernel_size + stride) / stride;
|
||||
*padding_before = 0;
|
||||
*padding_after = 0;
|
||||
} else if (same == padding) {
|
||||
*output_size = (input_size + stride - 1) / stride;
|
||||
const int64_t padding_need =
|
||||
std::max(static_cast<int64_t>(0), (*output_size - 1) * stride + kernel_size - input_size);
|
||||
*padding_before = padding_need / 2;
|
||||
*padding_after = padding_need - *padding_before;
|
||||
} else {
|
||||
KERNEL_LOG_ERROR("[MaxPool] Padding is invalid.");
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
if (*output_size < 0) {
|
||||
KERNEL_LOG_ERROR("[MaxPool] Computed output size is negative.");
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
return KERNEL_STATUS_OK;
|
||||
}
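// Worked example of the formulas above (illustrative values): input_size = 10, kernel_size = 3, stride = 2.
//   VALID: output = (10 - 3 + 2) / 2 = 4, no padding.
//   SAME:  output = (10 + 2 - 1) / 2 = 5, padding_need = max(0, (5 - 1) * 2 + 3 - 10) = 1,
//          padding_before = 0, padding_after = 1.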
|
||||
uint32_t ConstructPoolParams(aicpu::CpuKernelContext &ctx, const aicpu::TensorShape &data_format, PoolParams ¶ms) {
|
||||
Format format = data_format.GetFormat();
|
||||
KERNEL_CHECK_FALSE((format == FORMAT_NHWC || format == FORMAT_NCHW), KERNEL_STATUS_PARAM_INVALID,
|
||||
"[MaxPool] Format is not NHWC or NCHW.");
|
||||
std::vector<int64_t> tensor_in_shapes = data_format.GetDimSizes();
|
||||
if (tensor_in_shapes.size() != 4) {
|
||||
KERNEL_LOG_ERROR("[MaxPool] Input tensor must have 2 spacial dimensions.");
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
std::vector<int64_t> ksize = ctx.GetAttr("ksize")->GetListInt(), strides = ctx.GetAttr("strides")->GetListInt();
|
||||
std::string padding = ctx.GetAttr("padding")->GetString();
|
||||
std::string data_format_str = "";
|
||||
if (ctx.GetAttr("data_format") == nullptr) {
|
||||
KERNEL_LOG_INFO("[MaxPool] Attr data_format is empty, using default value NHWC.");
|
||||
format = FORMAT_NHWC;
|
||||
} else {
|
||||
std::map<std::string, aicpu::Format> format_str_to_enum_map = {{"NHWC", FORMAT_NHWC}, {"NCHW", FORMAT_NCHW}};
|
||||
data_format_str = ctx.GetAttr("data_format")->GetString();
|
||||
|
||||
KERNEL_CHECK_FALSE(format_str_to_enum_map.find(data_format_str) != format_str_to_enum_map.end(),
KERNEL_STATUS_PARAM_INVALID, "[MaxPool] data_format string is invalid.");
|
||||
format = format_str_to_enum_map[data_format_str];
|
||||
}
|
||||
switch (format) {
|
||||
case FORMAT_NHWC:
|
||||
params.depth = tensor_in_shapes[kFormatNHWCIndexC];
|
||||
params.tensor_rows = tensor_in_shapes[kFormatNHWCIndexH];
|
||||
params.tensor_cols = tensor_in_shapes[kFormatNHWCIndexW];
|
||||
params.tensor_batch = tensor_in_shapes[kFormatNHWCIndexN];
|
||||
params.ksize_rows = ksize[kFormatNHWCIndexH];
|
||||
params.ksize_cols = ksize[kFormatNHWCIndexW];
|
||||
params.ksize_depth = ksize[kFormatNHWCIndexC];
|
||||
params.strides_rows = strides[kFormatNHWCIndexH];
|
||||
params.strides_cols = strides[kFormatNHWCIndexW];
|
||||
params.strides_depth = strides[kFormatNHWCIndexC];
|
||||
break;
|
||||
case FORMAT_NCHW:
|
||||
params.depth = tensor_in_shapes[kFormatNCHWIndexC];
|
||||
params.tensor_rows = tensor_in_shapes[kFormatNCHWIndexH];
|
||||
params.tensor_cols = tensor_in_shapes[kFormatNCHWIndexW];
|
||||
params.tensor_batch = tensor_in_shapes[kFormatNCHWIndexN];
|
||||
params.ksize_rows = ksize[kFormatNCHWIndexH];
|
||||
params.ksize_cols = ksize[kFormatNCHWIndexW];
|
||||
params.ksize_depth = ksize[kFormatNCHWIndexC];
|
||||
params.strides_rows = strides[kFormatNCHWIndexH];
|
||||
params.strides_cols = strides[kFormatNCHWIndexW];
|
||||
params.strides_depth = strides[kFormatNCHWIndexC];
|
||||
break;
|
||||
default:
|
||||
KERNEL_LOG_ERROR("[MaxPool] Format is not NHWC or NCHW, current is [%s].", FormatToSerialString(format).c_str());
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
auto ret1 = GetOutputSize(params.tensor_rows, params.ksize_rows, params.strides_rows, padding, ¶ms.out_height,
|
||||
¶ms.pad_top, ¶ms.pad_bottom),
|
||||
ret2 = GetOutputSize(params.tensor_cols, params.ksize_cols, params.strides_cols, padding, ¶ms.out_width,
|
||||
¶ms.pad_left, ¶ms.pad_right);
|
||||
KERNEL_CHECK_FALSE(ret1 == KERNEL_STATUS_OK && ret2 == KERNEL_STATUS_OK, KERNEL_STATUS_PARAM_INVALID,
|
||||
"[MaxPool] An error occurred while calculating output size.");
|
||||
params.out_depth = params.depth;
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
template <class T>
|
||||
uint32_t SpacialMaxPool(CpuKernelContext &ctx, const PoolParams ¶ms) {
|
||||
Tensor *input = ctx.Input(kFirstInputIndex);
|
||||
Tensor *output = ctx.Output(kFirstOutputIndex);
|
||||
|
||||
const T *raw_input_data = static_cast<T *>(input->GetData());
|
||||
T *raw_output_data = static_cast<T *>(output->GetData());
|
||||
auto shard_NCHW = [¶ms, &raw_input_data, &raw_output_data](int64_t start, int64_t limit) {
|
||||
typedef Eigen::Map<const Eigen::Array<T, Eigen::Dynamic, Eigen::Dynamic>> ConstEigenArrayMap;
|
||||
typedef Eigen::Map<Eigen::Array<T, Eigen::Dynamic, Eigen::Dynamic>> EigenArrayMap;
|
||||
const int64_t batch_size = limit;
|
||||
const int64_t X_W = static_cast<int64_t>(params.tensor_cols), X_H = static_cast<int64_t>(params.tensor_rows);
|
||||
const int64_t Y_W = params.out_width, Y_H = params.out_height;
|
||||
const int64_t X_HxW = X_H * X_W, Y_HxW = Y_H * Y_W;
|
||||
const int64_t X_stride = X_HxW, Y_stride = Y_HxW;
|
||||
const int64_t stride_h = static_cast<int64_t>(params.strides_rows),
|
||||
stride_w = static_cast<int64_t>(params.strides_cols);
|
||||
const int64_t pad_t = params.pad_top, pad_l = params.pad_left;
|
||||
const int64_t kernel_h = static_cast<int64_t>(params.ksize_rows),
|
||||
kernel_w = static_cast<int64_t>(params.ksize_cols);
|
||||
const T *x_ptr = raw_input_data + start * X_stride;
|
||||
T *y_ptr = raw_output_data + start * Y_stride;
|
||||
for (int64_t i = start; i < batch_size; ++i) {
|
||||
ConstEigenArrayMap x_arr(x_ptr, X_W, X_H);
|
||||
EigenArrayMap y_arr(y_ptr, Y_W, Y_H);
|
||||
for (int64_t h = 0; h < Y_H; ++h) {
|
||||
const int64_t t = std::max(h * stride_h - pad_t, static_cast<int64_t>(0));
|
||||
const int64_t b = std::min(h * stride_h - pad_t + kernel_h, X_H);
|
||||
for (int64_t w = 0; w < Y_W; ++w) {
|
||||
const int64_t l = std::max(w * stride_w - pad_l, static_cast<int64_t>(0));
|
||||
const int64_t r = std::min(w * stride_w - pad_l + kernel_w, X_W);
|
||||
const int64_t y = h * Y_W + w;
|
||||
y_arr(y) = x_arr.block(l, t, r - l, b - t).maxCoeff();
|
||||
}
|
||||
}
|
||||
x_ptr += X_stride;
|
||||
y_ptr += Y_stride;
|
||||
}
|
||||
};
|
||||
auto shard_NHWC = [¶ms, &raw_input_data, &raw_output_data](int64_t start, int64_t limit) {
|
||||
typedef Eigen::Map<const Eigen::Array<T, Eigen::Dynamic, Eigen::Dynamic>> ConstEigenArrayMap;
|
||||
typedef Eigen::Map<Eigen::Array<T, Eigen::Dynamic, Eigen::Dynamic>> EigenArrayMap;
|
||||
const int64_t batch_size = limit;
|
||||
const int64_t X_W = static_cast<int64_t>(params.tensor_cols), X_H = static_cast<int64_t>(params.tensor_rows);
|
||||
const int64_t Y_W = params.out_width, Y_H = params.out_height;
|
||||
const int64_t X_HxW = X_H * X_W, Y_HxW = Y_H * Y_W;
|
||||
const int64_t C = static_cast<int64_t>(params.depth);
|
||||
const int64_t X_stride = X_HxW * C, Y_stride = Y_HxW * C;
|
||||
const int64_t stride_h = static_cast<int64_t>(params.strides_rows),
|
||||
stride_w = static_cast<int64_t>(params.strides_cols);
|
||||
const int64_t pad_t = params.pad_top, pad_l = params.pad_left;
|
||||
const int64_t kernel_h = static_cast<int64_t>(params.ksize_rows),
|
||||
kernel_w = static_cast<int64_t>(params.ksize_cols);
|
||||
const T *x_ptr = raw_input_data + start * X_stride;
|
||||
T *y_ptr = raw_output_data + start * Y_stride;
|
||||
for (int64_t i = start; i < batch_size; ++i) {
|
||||
ConstEigenArrayMap x_arr(x_ptr, C, X_HxW);
|
||||
EigenArrayMap y_arr(y_ptr, C, Y_HxW);
|
||||
for (int64_t h = 0; h < Y_H; ++h) {
|
||||
const int64_t t = std::max(h * stride_h - pad_t, static_cast<int64_t>(0));
|
||||
const int64_t b = std::min(h * stride_h - pad_t + kernel_h, X_H);
|
||||
for (int64_t w = 0; w < Y_W; ++w) {
|
||||
const int64_t l = std::max(w * stride_w - pad_l, static_cast<int64_t>(0));
|
||||
const int64_t r = std::min(w * stride_w - pad_l + kernel_w, X_W);
|
||||
const int64_t y = h * Y_W + w;
|
||||
y_arr.col(y).setConstant(Eigen::NumTraits<T>::lowest());
|
||||
for (int64_t xi = t; xi < b; ++xi) {
|
||||
for (int64_t yj = l; yj < r; ++yj) {
|
||||
y_arr.col(y) = y_arr.col(y).max(x_arr.col(xi * X_W + yj));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
x_ptr += X_stride;
|
||||
y_ptr += Y_stride;
|
||||
}
|
||||
};
|
||||
int64_t total_elements = params.tensor_batch * params.tensor_cols * params.tensor_rows * params.depth;
|
||||
if (ctx.GetAttr("data_format") != nullptr && ctx.GetAttr("data_format")->GetString() == "NCHW") {
|
||||
int64_t total_images = params.tensor_batch * params.depth;
|
||||
KERNEL_LOG_INFO("[MaxPool] Calling new shard_NCHW");
|
||||
if (total_elements <= kParallelNum) {
|
||||
shard_NCHW(0, total_images);
|
||||
return KERNEL_STATUS_OK;
|
||||
} else {
|
||||
uint32_t max_core_num = aicpu::CpuKernelUtils::GetCPUNum(ctx);
|
||||
max_core_num = std::min(total_images, static_cast<int64_t>(max_core_num));
|
||||
return CpuKernelUtils::ParallelFor(ctx, total_images, total_images / max_core_num, shard_NCHW);
|
||||
}
|
||||
} else {
|
||||
int64_t total_images_with_chann = params.tensor_batch;
|
||||
KERNEL_LOG_INFO("[MaxPool] Calling new shard_NHWC");
|
||||
if (total_elements <= kParallelNum) {
|
||||
shard_NHWC(0, total_images_with_chann);
|
||||
return KERNEL_STATUS_OK;
|
||||
} else {
|
||||
uint32_t max_core_num = aicpu::CpuKernelUtils::GetCPUNum(ctx);
|
||||
max_core_num = std::min(total_images_with_chann, static_cast<int64_t>(max_core_num));
|
||||
return CpuKernelUtils::ParallelFor(ctx, total_images_with_chann, total_images_with_chann / max_core_num,
|
||||
shard_NHWC);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template <class T>
|
||||
uint32_t ComputeMaxPoolImpl(CpuKernelContext &ctx) {
|
||||
TensorShape ts = *(ctx.Input(kFirstInputIndex)->GetTensorShape());
|
||||
PoolParams params;
|
||||
KERNEL_CHECK_FALSE(ConstructPoolParams(ctx, ts, params) == KERNEL_STATUS_OK, KERNEL_STATUS_PARAM_INVALID,
|
||||
"[MaxPool] Pooling parameters construct failed.")
|
||||
return SpacialMaxPool<T>(ctx, params);
|
||||
}
|
||||
uint32_t MaxPoolCpuKernel::Compute(CpuKernelContext &ctx) {
|
||||
const std::vector<std::string> required_attrs = {"ksize", "strides", "padding"};
|
||||
KERNEL_HANDLE_ERROR(NormalCheck(ctx, kMaxPoolInputNum, kMaxPoolOutputNum, required_attrs),
|
||||
"[MaxPool] Check input and output number failed.");
|
||||
DataType input_type = ctx.Input(kFirstInputIndex)->GetDataType();
|
||||
switch (input_type) {
|
||||
case DT_FLOAT16:
|
||||
return ComputeMaxPoolImpl<Eigen::half>(ctx);
|
||||
case DT_FLOAT:
|
||||
return ComputeMaxPoolImpl<float>(ctx);
|
||||
case DT_DOUBLE:
|
||||
return ComputeMaxPoolImpl<double>(ctx);
|
||||
case DT_INT8:
|
||||
return ComputeMaxPoolImpl<int8_t>(ctx);
|
||||
case DT_INT16:
|
||||
return ComputeMaxPoolImpl<int16_t>(ctx);
|
||||
case DT_INT32:
|
||||
return ComputeMaxPoolImpl<int32_t>(ctx);
|
||||
case DT_INT64:
|
||||
return ComputeMaxPoolImpl<int64_t>(ctx);
|
||||
case DT_UINT8:
|
||||
return ComputeMaxPoolImpl<uint8_t>(ctx);
|
||||
case DT_UINT16:
|
||||
return ComputeMaxPoolImpl<uint16_t>(ctx);
|
||||
default:
|
||||
KERNEL_LOG_ERROR("[MaxPool] Data type [%s] is not supported.", DTypeStr(input_type).c_str());
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
REGISTER_CPU_KERNEL(MAXPOOL, MaxPoolCpuKernel);
|
||||
} // namespace aicpu
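// Illustrative sketch of what SpacialMaxPool produces (values assumed, not part of the kernel):
// a 1x4x4x1 NHWC input with ksize = [1, 2, 2, 1], strides = [1, 2, 2, 1] and VALID padding
// yields a 1x2x2x1 output holding the maximum of each non-overlapping 2x2 window.
//   input  = [[ 1,  2,  3,  4],
//             [ 5,  6,  7,  8],
//             [ 9, 10, 11, 12],
//             [13, 14, 15, 16]]
//   output = [[ 6,  8],
//             [14, 16]]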
|
|
@ -0,0 +1,27 @@
|
|||
/**
|
||||
* Copyright 2021 Harbin Institute of Technology
|
||||
* Copyright 2021 Huawei Technologies Co., Ltd.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include "cpu_ops_kernel.h"
|
||||
#include "cpu_types.h"
|
||||
|
||||
namespace aicpu {
|
||||
class MaxPoolCpuKernel : public CpuKernel {
|
||||
public:
|
||||
~MaxPoolCpuKernel() = default;
|
||||
uint32_t Compute(CpuKernelContext &ctx) override;
|
||||
};
|
||||
} // namespace aicpu
#endif
|
|
@ -0,0 +1,129 @@
|
|||
/**
|
||||
* Copyright 2021 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
#include "minimum_grad_grad.h"
|
||||
|
||||
#include <fstream>
|
||||
#include <iostream>
|
||||
|
||||
#include "cpu_kernel_utils.h"
|
||||
#include "utils/eigen_tensor.h"
|
||||
#include "utils/kernel_util.h"
|
||||
|
||||
namespace {
|
||||
constexpr uint32_t kMinimumGradGradInputNum = 4;
|
||||
constexpr uint32_t kMinimumGradGradOutputNum = 3;
|
||||
const char *kMinimumGradGrad = "MinimumGradGrad";
|
||||
|
||||
#define MINIMUMGRADGRAD_COMPUTE_CASE(DTYPE, TYPE, CTX) \
|
||||
case (DTYPE): { \
|
||||
uint32_t result = MinimumGradGradCompute<TYPE>(CTX); \
|
||||
if (result != KERNEL_STATUS_OK) { \
|
||||
KERNEL_LOG_ERROR("MinimumGradGrad kernel compute failed."); \
|
||||
return result; \
|
||||
} \
|
||||
break; \
|
||||
}
|
||||
} // namespace
|
||||
|
||||
namespace aicpu {
|
||||
uint32_t MinimumGradGradCpuKernel::Compute(CpuKernelContext &ctx) {
|
||||
// check params
|
||||
KERNEL_HANDLE_ERROR(NormalCheck(ctx, kMinimumGradGradInputNum, kMinimumGradGradOutputNum),
|
||||
"MinimumGradGrad check input and output number failed.");
|
||||
KERNEL_HANDLE_ERROR(MinimumGradGradParamCheck(ctx), "MinimumGradGrad check params failed.");
|
||||
auto data_type = ctx.Input(0)->GetDataType();
|
||||
switch (data_type) {
|
||||
MINIMUMGRADGRAD_COMPUTE_CASE(DT_INT32, int32_t, ctx)
|
||||
MINIMUMGRADGRAD_COMPUTE_CASE(DT_FLOAT, float, ctx)
|
||||
MINIMUMGRADGRAD_COMPUTE_CASE(DT_FLOAT16, Eigen::half, ctx)
|
||||
default:
|
||||
KERNEL_LOG_ERROR("The data type of input is not support, input data type is [%s].", DTypeStr(data_type).c_str());
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
|
||||
uint32_t MinimumGradGradCpuKernel::MinimumGradGradParamCheck(CpuKernelContext &ctx) {
|
||||
// Non-null inputs and outputs have already been verified in NormalCheck.
|
||||
Tensor *x1 = ctx.Input(0);
|
||||
Tensor *x2 = ctx.Input(1);
|
||||
Tensor *grad_y1 = ctx.Input(2);
|
||||
Tensor *grad_y2 = ctx.Input(3);
|
||||
|
||||
// type check
|
||||
DataType grad_y1_type = grad_y1->GetDataType();
|
||||
DataType grad_y2_type = grad_y2->GetDataType();
|
||||
DataType x1_type = x1->GetDataType();
|
||||
DataType x2_type = x2->GetDataType();
|
||||
KERNEL_CHECK_FALSE(((grad_y1_type == grad_y2_type) && (grad_y2_type == x1_type) && (x1_type == x2_type)),
|
||||
KERNEL_STATUS_PARAM_INVALID,
|
||||
"The data type of grad_y1 [%s], grad_y2 [%s], x1 [%s] and "
|
||||
"x2 [%s] need to be same.",
|
||||
DTypeStr(grad_y1_type).c_str(), DTypeStr(grad_y2_type).c_str(), DTypeStr(x1_type).c_str(),
|
||||
DTypeStr(x2_type).c_str())
|
||||
// shape check
|
||||
auto grad_y1_shape = grad_y1->GetTensorShape()->GetDimSizes();
|
||||
auto grad_y2_shape = grad_y2->GetTensorShape()->GetDimSizes();
|
||||
auto x1_shape = x1->GetTensorShape()->GetDimSizes();
|
||||
auto x2_shape = x2->GetTensorShape()->GetDimSizes();
|
||||
KERNEL_CHECK_FALSE(grad_y1_shape == x1_shape, KERNEL_STATUS_PARAM_INVALID, "Mismatch in shape of grad_y1 and x1.");
|
||||
KERNEL_CHECK_FALSE(grad_y2_shape == x2_shape, KERNEL_STATUS_PARAM_INVALID, "Mismatch in shape of grad_y2 and x2.");
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
uint32_t MinimumGradGradCpuKernel::MinimumGradGradCompute(CpuKernelContext &ctx) {
|
||||
Tensor *input0_tensor = ctx.Input(0);
|
||||
Tensor *input1_tensor = ctx.Input(1);
|
||||
|
||||
auto input0_shape = input0_tensor->GetTensorShape()->GetDimSizes();
|
||||
auto input1_shape = input1_tensor->GetTensorShape()->GetDimSizes();
|
||||
|
||||
Bcast bcast(input0_shape, input1_shape);
|
||||
if (!bcast.IsValid()) {
|
||||
KERNEL_LOG_ERROR("[%s] broadcast failed.", ctx.GetOpType().c_str());
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
return BcastCompute<T>(ctx, bcast);
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
uint32_t MinimumGradGradCpuKernel::BcastCompute(CpuKernelContext &ctx, Bcast &bcast) {
|
||||
auto in0 = reinterpret_cast<T *>(ctx.Input(0)->GetData());
|
||||
auto in1 = reinterpret_cast<T *>(ctx.Input(1)->GetData());
|
||||
auto in2 = reinterpret_cast<T *>(ctx.Input(2)->GetData());
|
||||
auto in3 = reinterpret_cast<T *>(ctx.Input(3)->GetData());
|
||||
auto out0 = reinterpret_cast<T *>(ctx.Output(0)->GetData());
|
||||
auto out1 = reinterpret_cast<T *>(ctx.Output(1)->GetData());
|
||||
auto out2 = reinterpret_cast<T *>(ctx.Output(2)->GetData());
|
||||
*out0 = static_cast<T>(0);
|
||||
*out1 = static_cast<T>(0);
|
||||
int64_t data_num = ctx.Output(2)->NumElements();
|
||||
|
||||
for (int64_t i = 0; i < data_num; ++i) {
|
||||
if (*(in0 + bcast.GetBroadcastXIndex(i)) <= *(in1 + bcast.GetBroadcastYIndex(i))) {
|
||||
*(out2 + i) = *(in2 + bcast.GetBroadcastXIndex(i));
|
||||
} else {
|
||||
*(out2 + i) = *(in3 + bcast.GetBroadcastYIndex(i));
|
||||
}
|
||||
}
|
||||
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
|
||||
REGISTER_CPU_KERNEL(kMinimumGradGrad, MinimumGradGradCpuKernel);
|
||||
} // namespace aicpu
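// Reading of the kernel above (illustrative): MinimumGradGrad mirrors MaximumGradGrad with the
// comparison flipped. Where x1 <= x2 the forward Minimum picked x1, so out2 takes grad_y1,
// otherwise grad_y2; the x1/x2 indices are broadcast through Bcast exactly as in the loop above.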
|
|
@ -0,0 +1,44 @@
|
|||
/**
|
||||
* Copyright 2021 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
#ifndef AICPU_KERNELS_NORMALIZED_MINIMUM_GRAD_GRAD_H_
|
||||
#define AICPU_KERNELS_NORMALIZED_MINIMUM_GRAD_GRAD_H_
|
||||
|
||||
#include "cpu_ops_kernel.h"
|
||||
#include "utils/bcast.h"
|
||||
|
||||
namespace aicpu {
|
||||
class MinimumGradGradCpuKernel : public CpuKernel {
|
||||
public:
|
||||
MinimumGradGradCpuKernel() = default;
|
||||
~MinimumGradGradCpuKernel() override = default;
|
||||
|
||||
protected:
|
||||
uint32_t Compute(CpuKernelContext &ctx) override;
|
||||
|
||||
private:
|
||||
uint32_t MinimumGradGradParamCheck(CpuKernelContext &ctx);
|
||||
|
||||
template <typename T>
|
||||
uint32_t MinimumGradGradCompute(CpuKernelContext &ctx);
|
||||
|
||||
template <typename T>
|
||||
uint32_t NoBcastCompute(CpuKernelContext &ctx);
|
||||
|
||||
template <typename T>
|
||||
uint32_t BcastCompute(CpuKernelContext &ctx, Bcast &bcast);
|
||||
};
|
||||
} // namespace aicpu
|
||||
#endif
|
|
@ -0,0 +1,249 @@
|
|||
/**
|
||||
* Copyright (c) Huawei Technologies Co., Ltd. 2021-2021. All rights reserved.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include "mul_no_nan.h"
|
||||
|
||||
#include "cpu_kernel_utils.h"
|
||||
#include "utils/eigen_tensor.h"
|
||||
#include "utils/kernel_util.h"
|
||||
|
||||
namespace {
|
||||
const uint32_t kOutputNum = 1;
|
||||
const uint32_t kInputNum = 2;
|
||||
const char *kMulNoNan = "MulNoNan";
|
||||
// When the input data size exceeds kParallelDataNum, use the parallel path.
|
||||
const int64_t kParallelDataNum = 8 * 1024;
|
||||
const int64_t kParallelDataNumMid = 64 * 1024;
|
||||
const int64_t kParallelDataNumSameShape = 32 * 1024;
|
||||
const int64_t kParallelDataNumSameShapeMid = 256 * 1024;
|
||||
|
||||
#define MULNONAN_COMPUTE_CASE(DTYPE, TYPE, CTX) \
|
||||
case (DTYPE): { \
|
||||
uint32_t result = MulNoNanCompute<TYPE>(CTX); \
|
||||
if (result != KERNEL_STATUS_OK) { \
|
||||
KERNEL_LOG_ERROR("MulNoNan kernel compute failed."); \
|
||||
return result; \
|
||||
} \
|
||||
break; \
|
||||
}
|
||||
} // namespace
|
||||
|
||||
namespace aicpu {
|
||||
uint32_t MulNoNanCpuKernel::Compute(CpuKernelContext &ctx) {
|
||||
// check params
|
||||
KERNEL_HANDLE_ERROR(NormalCheck(ctx, kInputNum, kOutputNum), "MulNoNan check input and output number failed.");
|
||||
KERNEL_HANDLE_ERROR(MulNoNanParamCheck(ctx), "MulNoNan check params failed.");
|
||||
|
||||
auto data_type = ctx.Input(0)->GetDataType();
|
||||
switch (data_type) {
|
||||
MULNONAN_COMPUTE_CASE(DT_INT8, int8_t, ctx)
|
||||
MULNONAN_COMPUTE_CASE(DT_INT16, int16_t, ctx)
|
||||
MULNONAN_COMPUTE_CASE(DT_INT32, int32_t, ctx)
|
||||
MULNONAN_COMPUTE_CASE(DT_INT64, int64_t, ctx)
|
||||
MULNONAN_COMPUTE_CASE(DT_UINT8, uint8_t, ctx)
|
||||
MULNONAN_COMPUTE_CASE(DT_UINT16, uint16_t, ctx)
|
||||
MULNONAN_COMPUTE_CASE(DT_UINT32, uint32_t, ctx)
|
||||
MULNONAN_COMPUTE_CASE(DT_UINT64, uint64_t, ctx)
|
||||
MULNONAN_COMPUTE_CASE(DT_FLOAT16, Eigen::half, ctx)
|
||||
MULNONAN_COMPUTE_CASE(DT_FLOAT, float, ctx)
|
||||
MULNONAN_COMPUTE_CASE(DT_DOUBLE, double, ctx)
|
||||
MULNONAN_COMPUTE_CASE(DT_COMPLEX64, std::complex<float>, ctx)
|
||||
MULNONAN_COMPUTE_CASE(DT_COMPLEX128, std::complex<double>, ctx)
|
||||
default:
|
||||
KERNEL_LOG_ERROR("MulNoNan kernel data type [%s] not support.", DTypeStr(data_type).c_str());
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
|
||||
uint32_t MulNoNanCpuKernel::MulNoNanParamCheck(CpuKernelContext &ctx) {
|
||||
// Non-null input_0, input_1 and output have already been verified in NormalCheck.
|
||||
Tensor *input_0 = ctx.Input(0);
|
||||
Tensor *input_1 = ctx.Input(1);
|
||||
Tensor *output = ctx.Output(0);
|
||||
DataType input0_type = input_0->GetDataType();
|
||||
DataType input1_type = input_1->GetDataType();
|
||||
KERNEL_CHECK_FALSE((input0_type == input1_type), KERNEL_STATUS_PARAM_INVALID,
|
||||
"The data type of input0 [%s] need be same with "
|
||||
"input1 [%s].",
|
||||
DTypeStr(input0_type).c_str(), DTypeStr(input1_type).c_str())
|
||||
KERNEL_LOG_DEBUG(
|
||||
"LessCpuKernel[%s], input0: size[%llu];"
|
||||
"input1: size[%llu], output: size[%llu].",
|
||||
ctx.GetOpType().c_str(), input_0->GetDataSize(), input_1->GetDataSize(), output->GetDataSize());
|
||||
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
|
||||
// SpecialCompute handles the following situations:
// 1. input1 and input2 have the same shape
// 2. input1 is a scalar or a 1-D tensor with a single element
// 3. input2 is a scalar or a 1-D tensor with a single element
// All other shape combinations are broadcast in BcastCompute instead.
|
||||
template <typename T>
|
||||
void MulNoNanCpuKernel::SpecialCompute(BcastShapeType type, int64_t start, int64_t end, const T *input1,
|
||||
const T *input2, T *output) {
|
||||
switch (type) {
|
||||
case BcastShapeType::SAME_SHAPE:
|
||||
for (int64_t i = start; i < end; ++i) {
|
||||
if (*(input2 + i) == (T)0) {
|
||||
*(output + i) = (T)0;
|
||||
} else {
|
||||
*(output + i) = *(input1 + i) * *(input2 + i);
|
||||
}
|
||||
}
|
||||
break;
|
||||
case BcastShapeType::X_ONE_ELEMENT:
|
||||
for (int64_t i = start; i < end; ++i) {
|
||||
if (*(input2 + i) == (T)0) {
|
||||
*(output + i) = (T)0;
|
||||
} else {
|
||||
*(output + i) = *input1 * *(input2 + i);
|
||||
}
|
||||
}
|
||||
break;
|
||||
case BcastShapeType::Y_ONE_ELEMENT:
|
||||
if (*input2 == (T)0) {
|
||||
for (int64_t i = start; i < end; ++i) {
|
||||
*(output + i) = (T)0;
|
||||
}
|
||||
} else {
|
||||
for (int64_t i = start; i < end; ++i) {
|
||||
*(output + i) = *(input1 + i) * *input2;
|
||||
}
|
||||
}
|
||||
break;
|
||||
default:
|
||||
KERNEL_LOG_WARN("Invalid type [%d]", static_cast<int32_t>(type));
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
uint32_t MulNoNanCpuKernel::NoBcastCompute(CpuKernelContext &ctx) {
|
||||
auto in0 = reinterpret_cast<T *>(ctx.Input(0)->GetData());
|
||||
auto in1 = reinterpret_cast<T *>(ctx.Input(1)->GetData());
|
||||
auto out = reinterpret_cast<T *>(ctx.Output(0)->GetData());
|
||||
int64_t in0_elements_nums = ctx.Input(0)->NumElements();
|
||||
int64_t in1_elements_nums = ctx.Input(1)->NumElements();
|
||||
int64_t data_num = ctx.Output(0)->NumElements();
|
||||
BcastShapeType type = in0_elements_nums == in1_elements_nums
|
||||
? BcastShapeType::SAME_SHAPE
|
||||
: (in0_elements_nums == 1 ? BcastShapeType::X_ONE_ELEMENT : BcastShapeType::Y_ONE_ELEMENT);
|
||||
|
||||
if (data_num >= kParallelDataNumSameShape) {
|
||||
uint32_t min_core_num = 1;
|
||||
uint32_t max_core_num = std::max(min_core_num, aicpu::CpuKernelUtils::GetCPUNum(ctx) - kResvCpuNum);
|
||||
|
||||
if (data_num <= kParallelDataNumSameShapeMid) {
|
||||
max_core_num = std::min(max_core_num, 4U); // up to 4 cpu cores
|
||||
}
|
||||
|
||||
if (max_core_num > data_num) {
|
||||
max_core_num = data_num;
|
||||
}
|
||||
|
||||
auto sharder_mul_no_nan = [&](int64_t start, int64_t end) { SpecialCompute<T>(type, start, end, in0, in1, out); };
|
||||
|
||||
if (max_core_num == 0) {
|
||||
KERNEL_LOG_ERROR("Divisor max_core_num is 0");
|
||||
} else {
|
||||
KERNEL_HANDLE_ERROR(CpuKernelUtils::ParallelFor(ctx, data_num, data_num / max_core_num, sharder_mul_no_nan),
|
||||
"MulNoNan Compute failed.");
|
||||
}
|
||||
} else {
|
||||
SpecialCompute<T>(type, 0, data_num, in0, in1, out);
|
||||
}
|
||||
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
uint32_t MulNoNanCpuKernel::BcastCompute(CpuKernelContext &ctx, Bcast &bcast) {
|
||||
auto in0 = reinterpret_cast<T *>(ctx.Input(0)->GetData());
|
||||
auto in1 = reinterpret_cast<T *>(ctx.Input(1)->GetData());
|
||||
auto out = reinterpret_cast<T *>(ctx.Output(0)->GetData());
|
||||
int64_t data_num = ctx.Output(0)->NumElements();
|
||||
|
||||
if (data_num >= kParallelDataNum) {
|
||||
uint32_t min_core_num = 1;
|
||||
uint32_t max_core_num = std::max(min_core_num, aicpu::CpuKernelUtils::GetCPUNum(ctx) - kResvCpuNum);
|
||||
|
||||
if (data_num <= kParallelDataNumMid) {
|
||||
max_core_num = std::min(max_core_num, 4U); // up to 4 cpu cores
|
||||
}
|
||||
|
||||
if (max_core_num > data_num) {
|
||||
max_core_num = data_num;
|
||||
}
|
||||
|
||||
auto sharder_mul_no_nan = [&](int64_t start, int64_t end) {
|
||||
for (int64_t i = start; i < end; ++i) {
|
||||
if (*(in1 + bcast.GetBroadcastYIndex(i)) == (T)0) {
|
||||
*(out + i) = (T)0;
|
||||
} else {
|
||||
*(out + i) = *(in0 + bcast.GetBroadcastXIndex(i)) * *(in1 + bcast.GetBroadcastYIndex(i));
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
if (max_core_num == 0) {
|
||||
KERNEL_LOG_ERROR("Divisor max_core_num is 0");
|
||||
} else {
|
||||
KERNEL_HANDLE_ERROR(CpuKernelUtils::ParallelFor(ctx, data_num, data_num / max_core_num, sharder_mul_no_nan),
|
||||
"MulNoNan Compute failed.");
|
||||
}
|
||||
|
||||
} else {
|
||||
for (int64_t i = 0; i < data_num; ++i) {
|
||||
if (*(in1 + bcast.GetBroadcastYIndex(i)) == (T)0) {
|
||||
*(out + i) = (T)0;
|
||||
} else {
|
||||
*(out + i) = *(in0 + bcast.GetBroadcastXIndex(i)) * *(in1 + bcast.GetBroadcastYIndex(i));
|
||||
}
|
||||
}
|
||||
}
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
uint32_t MulNoNanCpuKernel::MulNoNanCompute(CpuKernelContext &ctx) {
|
||||
Tensor *input0_tensor = ctx.Input(0);
|
||||
auto input0_shape = input0_tensor->GetTensorShape()->GetDimSizes();
|
||||
int64_t input0_elements_nums = input0_tensor->NumElements();
|
||||
|
||||
Tensor *input1_tensor = ctx.Input(1);
|
||||
auto input1_shape = input1_tensor->GetTensorShape()->GetDimSizes();
|
||||
int64_t input1_elements_nums = input1_tensor->NumElements();
|
||||
|
||||
bool noNeedBcast = (input0_shape == input1_shape) || (input0_elements_nums == 1) || (input1_elements_nums == 1);
|
||||
if (noNeedBcast) {
|
||||
return NoBcastCompute<T>(ctx);
|
||||
} else {
|
||||
Bcast bcast(input0_shape, input1_shape);
|
||||
if (!bcast.IsValid()) {
|
||||
KERNEL_LOG_ERROR("[%s] broadcast failed.", ctx.GetOpType().c_str());
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
return BcastCompute<T>(ctx, bcast);
|
||||
}
|
||||
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
|
||||
REGISTER_CPU_KERNEL(kMulNoNan, MulNoNanCpuKernel);
|
||||
} // namespace aicpu
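// Illustrative sketch of the MulNoNan contract enforced above (not part of the kernel):
// whenever the second operand is exactly zero the product is forced to zero, even if the first
// operand is inf or NaN, so 0 * inf does not poison the output. Names below are assumptions.
//   #include <cmath>
//   #include <limits>
//   float mul_no_nan(float x, float y) { return y == 0.0f ? 0.0f : x * y; }
//   // mul_no_nan(std::numeric_limits<float>::infinity(), 0.0f) == 0.0f
//   // mul_no_nan(std::nanf(""), 0.0f) == 0.0f, mul_no_nan(2.0f, 3.0f) == 6.0f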
|
|
@ -0,0 +1,48 @@
|
|||
/**
|
||||
* Copyright (c) Huawei Technologies Co., Ltd. 2021-2021. All rights reserved.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#ifndef AICPU_KERNELS_NORMALIZED_MUL_NO_NAN_H_
|
||||
#define AICPU_KERNELS_NORMALIZED_MUL_NO_NAN_H_
|
||||
|
||||
#include "cpu_ops_kernel.h"
|
||||
#include "utils/bcast.h"
|
||||
|
||||
namespace aicpu {
|
||||
class MulNoNanCpuKernel : public CpuKernel {
|
||||
public:
|
||||
MulNoNanCpuKernel() = default;
|
||||
~MulNoNanCpuKernel() override = default;
|
||||
|
||||
protected:
|
||||
uint32_t Compute(CpuKernelContext &ctx) override;
|
||||
|
||||
private:
|
||||
uint32_t MulNoNanParamCheck(CpuKernelContext &ctx);
|
||||
|
||||
template <typename T>
|
||||
void SpecialCompute(BcastShapeType type, int64_t start, int64_t end, const T *input1, const T *input2, T *output);
|
||||
|
||||
template <typename T>
|
||||
uint32_t NoBcastCompute(CpuKernelContext &ctx);
|
||||
|
||||
template <typename T>
|
||||
uint32_t BcastCompute(CpuKernelContext &ctx, Bcast &bcast);
|
||||
|
||||
template <typename T>
|
||||
uint32_t MulNoNanCompute(CpuKernelContext &ctx);
|
||||
};
|
||||
} // namespace aicpu
|
||||
#endif
|
|
@ -0,0 +1,196 @@
|
|||
/**
|
||||
* Copyright 2021 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include "multilabel_margin_loss_grad.h"
|
||||
|
||||
#include <Eigen/Dense>
|
||||
#include <algorithm>
|
||||
#include <iostream>
|
||||
|
||||
#include "cpu_kernel_utils.h"
|
||||
#include "utils/eigen_tensor.h"
|
||||
#include "utils/kernel_util.h"
|
||||
|
||||
namespace {
|
||||
const uint32_t kOutputNum = 1;
|
||||
const char *kMultilabelMarginLossGrad = "MultilabelMarginLossGrad";
|
||||
} // namespace
|
||||
|
||||
namespace aicpu {
|
||||
uint32_t MultilabelMarginLossGradCpuKernel::Compute(CpuKernelContext &ctx) {
|
||||
// check params
|
||||
constexpr uint32_t kInputNum = 4;
|
||||
KERNEL_HANDLE_ERROR(NormalCheck(ctx, kInputNum, kOutputNum),
|
||||
"MultilabelMarginLossGrad check input and output number failed.");
|
||||
KERNEL_HANDLE_ERROR(MultilabelMarginLossGradCheck(ctx), "MultilabelMarginLossGrad check params failed.");
|
||||
auto data_type = ctx.Input(1)->GetDataType();
|
||||
switch (data_type) {
|
||||
case DT_FLOAT16:
|
||||
return MultilabelMarginLossGradComputeFP16<Eigen::half>(ctx);
|
||||
case DT_FLOAT:
|
||||
return MultilabelMarginLossGradCompute<float>(ctx);
|
||||
default:
|
||||
KERNEL_LOG_ERROR("MultilabelMarginLossGrad kernel data type [%s] not support.", DTypeStr(data_type).c_str());
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
|
||||
uint32_t MultilabelMarginLossGradCpuKernel::MultilabelMarginLossGradCheck(CpuKernelContext &ctx) {
|
||||
auto target = reinterpret_cast<int32_t *>(ctx.Input(2)->GetData());
|
||||
size_t dims = ctx.Input(1)->GetTensorShape()->GetDims();
|
||||
int64_t batch_size =
|
||||
(dims == 2) ? ctx.Input(1)->GetTensorShape()->GetDimSize(1) : ctx.Input(1)->GetTensorShape()->GetDimSize(0);
|
||||
size_t data_num = ctx.Input(1)->GetTensorShape()->NumElements();
|
||||
AttrValue *Attr_red = ctx.GetAttr("reduction");
|
||||
std::string reduction = (Attr_red == nullptr) ? "mean" : Attr_red->GetString();
|
||||
for (size_t i = 0; i < data_num; i++) {
|
||||
KERNEL_CHECK_FALSE(*(target + i) >= -1 && (*(target + i) < batch_size), KERNEL_STATUS_PARAM_INVALID,
|
||||
"[%s]'s target out of range.", ctx.GetOpType().c_str());
|
||||
}
|
||||
if (reduction == "none") {
|
||||
if (dims == 1) {
|
||||
KERNEL_CHECK_FALSE(ctx.Input(0)->GetTensorShape()->GetDims() == 0, KERNEL_STATUS_PARAM_INVALID,
|
||||
"[%s]'s y_grad should be a scalar "
|
||||
"when rank of x is 1.",
|
||||
ctx.GetOpType().c_str())
|
||||
} else {
|
||||
KERNEL_CHECK_FALSE(
|
||||
ctx.Input(0)->GetTensorShape()->GetDims() == 1 &&
|
||||
ctx.Input(0)->GetTensorShape()->GetDimSize(0) == ctx.Input(1)->GetTensorShape()->GetDimSize(0),
|
||||
KERNEL_STATUS_PARAM_INVALID,
|
||||
"[%s]'s y_grad's shape should be the same as "
|
||||
"{x_shape[0]} when the rank of x is 2 and reduction is none.",
|
||||
ctx.GetOpType().c_str())
|
||||
}
|
||||
} else {
|
||||
KERNEL_CHECK_FALSE(ctx.Input(0)->GetTensorShape()->GetDims() == 0, KERNEL_STATUS_PARAM_INVALID,
|
||||
"[%s]'s y_grad should be a scalar "
|
||||
"when reduction is mean or sum.",
|
||||
ctx.GetOpType().c_str())
|
||||
}
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
uint32_t MultilabelMarginLossGradCpuKernel::MultilabelMarginLossGradCompute(CpuKernelContext &ctx) {
|
||||
auto input_x = reinterpret_cast<T *>(ctx.Input(1)->GetData());
|
||||
auto input_target = reinterpret_cast<int32_t *>(ctx.Input(2)->GetData());
|
||||
auto input_istarget = reinterpret_cast<int32_t *>(ctx.Input(3)->GetData());
|
||||
auto output_x_grad = reinterpret_cast<T *>(ctx.Output(0)->GetData());
|
||||
AttrValue *Attr_red = ctx.GetAttr("reduction");
|
||||
std::string reduction = (Attr_red == nullptr) ? "mean" : Attr_red->GetString();
|
||||
size_t dims = ctx.Input(1)->GetTensorShape()->GetDims();
|
||||
size_t batch_size =
|
||||
(dims == 2) ? ctx.Input(1)->GetTensorShape()->GetDimSize(1) : ctx.Input(1)->GetTensorShape()->GetDimSize(0);
|
||||
size_t data_num = ctx.Input(1)->GetTensorShape()->NumElements();
|
||||
size_t nframe = data_num / batch_size;
|
||||
auto g = static_cast<T>(reduction == "mean" ? 1. / data_num : 1. / batch_size);
|
||||
std::vector<T> output_vector(data_num, 0);
|
||||
for (size_t t = 0; t < nframe; t++) {
|
||||
for (size_t m = 0; m < batch_size; m++) {
|
||||
int32_t target_idx = input_target[m];
|
||||
if (target_idx < 0) {
|
||||
break;
|
||||
}
|
||||
auto calc_target = input_x[target_idx];
|
||||
for (size_t n = 0; n < batch_size; n++) {
|
||||
if (input_istarget[n] == 0) {
|
||||
float z = 1 - calc_target + input_x[n];
|
||||
if (z > 0) {
|
||||
output_vector[t * batch_size + target_idx] -= g;
|
||||
output_vector[t * batch_size + n] += g;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
input_x += batch_size;
|
||||
input_target += batch_size;
|
||||
input_istarget += batch_size;
|
||||
}
|
||||
auto y_grad = ctx.Input(0);
|
||||
auto y_grad_data = reinterpret_cast<T *>(y_grad->GetData());
|
||||
size_t y_grad_dims = y_grad->GetTensorShape()->GetDims();
|
||||
if (reduction != "none" || y_grad_dims == 0) {
|
||||
for (size_t i = 0; i < data_num; i++) {
|
||||
*(output_x_grad + i) = output_vector[i] * (*(y_grad_data));
|
||||
}
|
||||
} else {
|
||||
for (size_t i = 0; i < nframe; i++) {
|
||||
for (size_t j = 0; j < batch_size; j++) {
|
||||
*(output_x_grad + i * batch_size + j) = output_vector[i * batch_size + j] * (*(y_grad_data + i));
|
||||
}
|
||||
}
|
||||
}
|
||||
return KERNEL_STATUS_OK;
|
||||
}
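// Note on the FP16 overload below: it follows the same loops as the generic version above but
// accumulates output_vector and the scaling factor g in float, converting to Eigen::half only
// when writing output_x_grad, which limits rounding error from the repeated +=/-= updates.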
|
||||
|
||||
template <typename T>
|
||||
uint32_t MultilabelMarginLossGradCpuKernel::MultilabelMarginLossGradComputeFP16(CpuKernelContext &ctx) {
|
||||
auto input_x = reinterpret_cast<T *>(ctx.Input(1)->GetData());
|
||||
auto input_target = reinterpret_cast<int32_t *>(ctx.Input(2)->GetData());
|
||||
auto input_istarget = reinterpret_cast<int32_t *>(ctx.Input(3)->GetData());
|
||||
auto output_x_grad = reinterpret_cast<T *>(ctx.Output(0)->GetData());
|
||||
AttrValue *Attr_red = ctx.GetAttr("reduction");
|
||||
std::string reduction = (Attr_red == nullptr) ? "mean" : Attr_red->GetString();
|
||||
size_t dims = ctx.Input(1)->GetTensorShape()->GetDims();
|
||||
size_t batch_size =
|
||||
(dims == 2) ? ctx.Input(1)->GetTensorShape()->GetDimSize(1) : ctx.Input(1)->GetTensorShape()->GetDimSize(0);
|
||||
size_t data_num = ctx.Input(1)->GetTensorShape()->NumElements();
|
||||
size_t nframe = data_num / batch_size;
|
||||
float g = static_cast<float>(reduction == "mean" ? 1. / data_num : 1. / batch_size);
|
||||
std::vector<float> output_vector(data_num, 0);
|
||||
for (size_t t = 0; t < nframe; t++) {
|
||||
for (size_t m = 0; m < batch_size; m++) {
|
||||
int32_t target_idx = input_target[m];
|
||||
if (target_idx < 0) {
|
||||
break;
|
||||
}
|
||||
float calc_target = static_cast<float>(input_x[target_idx]);
|
||||
for (size_t n = 0; n < batch_size; n++) {
|
||||
if (input_istarget[n] == 0) {
|
||||
float z = 1 - calc_target + static_cast<float>(input_x[n]);
|
||||
if (z > 0) {
|
||||
output_vector[t * batch_size + target_idx] -= g;
|
||||
output_vector[t * batch_size + n] += g;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
input_x += batch_size;
|
||||
input_target += batch_size;
|
||||
input_istarget += batch_size;
|
||||
}
|
||||
auto y_grad = ctx.Input(0);
|
||||
auto y_grad_data = reinterpret_cast<T *>(y_grad->GetData());
|
||||
size_t y_grad_dims = y_grad->GetTensorShape()->GetDims();
|
||||
if (reduction != "none" || y_grad_dims == 0) {
|
||||
for (size_t i = 0; i < data_num; i++) {
|
||||
*(output_x_grad + i) = static_cast<T>(output_vector[i] * static_cast<float>(*(y_grad_data)));
|
||||
}
|
||||
} else {
|
||||
for (size_t i = 0; i < nframe; i++) {
|
||||
for (size_t j = 0; j < batch_size; j++) {
|
||||
*(output_x_grad + i * batch_size + j) =
|
||||
static_cast<T>(output_vector[i * batch_size + j] * static_cast<float>(*(y_grad_data + i)));
|
||||
}
|
||||
}
|
||||
}
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
|
||||
REGISTER_CPU_KERNEL(kMultilabelMarginLossGrad, MultilabelMarginLossGradCpuKernel);
|
||||
} // namespace aicpu
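// Reading of the gradient computed above (illustrative): for every sample, every valid target
// class t (target >= 0) and every non-target class n with a violated margin
// 1 - x[t] + x[n] > 0, the kernel does grad[t] -= g and grad[n] += g, where
// g = 1 / total_elements when reduction == "mean" and g = 1 / class_count otherwise,
// and the result is finally scaled by the incoming y_grad (per sample when reduction is "none").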
|
|
@ -0,0 +1,39 @@
|
|||
/**
|
||||
* Copyright 2021 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#ifndef AICPU_KERNELS_NORMALIZED_MULTILABEL_MARGIN_LOSS_GRAD_H_
|
||||
#define AICPU_KERNELS_NORMALIZED_MULTILABEL_MARGIN_LOSS_GRAD_H_
|
||||
|
||||
#include "cpu_ops_kernel.h"
|
||||
|
||||
namespace aicpu {
|
||||
class MultilabelMarginLossGradCpuKernel : public CpuKernel {
|
||||
public:
|
||||
MultilabelMarginLossGradCpuKernel() = default;
|
||||
~MultilabelMarginLossGradCpuKernel() override = default;
|
||||
|
||||
protected:
|
||||
uint32_t Compute(CpuKernelContext &ctx) override;
|
||||
|
||||
private:
|
||||
static uint32_t MultilabelMarginLossGradCheck(CpuKernelContext &ctx);
|
||||
template <typename T>
|
||||
static uint32_t MultilabelMarginLossGradCompute(CpuKernelContext &ctx);
|
||||
template <typename T>
|
||||
static uint32_t MultilabelMarginLossGradComputeFP16(CpuKernelContext &ctx);
|
||||
};
|
||||
} // namespace aicpu
|
||||
#endif
|
|
@ -0,0 +1,168 @@
|
|||
/**
|
||||
* Copyright 2021 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
#include "non_max_suppression_with_overlaps.h"
|
||||
|
||||
#include <algorithm>
|
||||
#include <queue>
|
||||
|
||||
#include "Eigen/Core"
|
||||
#include "unsupported/Eigen/CXX11/Tensor"
|
||||
#include "cpu_attr_value.h"
|
||||
#include "cpu_tensor.h"
|
||||
#include "cpu_tensor_shape.h"
|
||||
#include "kernel_log.h"
|
||||
#include "status.h"
|
||||
#include "utils/allocator_utils.h"
|
||||
#include "utils/kernel_util.h"
|
||||
|
||||
namespace {
|
||||
const char *kNonMaxSuppressionWithOverlaps = "NonMaxSuppressionWithOverlaps";
|
||||
const uint32_t kOutputNum = 1;
|
||||
const uint32_t kInputNum = 5;
|
||||
const uint32_t kFirstInputIndex = 0;
|
||||
const uint32_t kSecondInputIndex = 1;
|
||||
const uint32_t kThirdInputIndex = 2;
|
||||
const uint32_t kFourthInputIndex = 3;
const uint32_t kFifthInputIndex = 4;
|
||||
const uint32_t kFirstOutputIndex = 0;
|
||||
const uint32_t kOverlapsRank = 2;
|
||||
} // namespace
|
||||
|
||||
namespace aicpu {
|
||||
uint32_t NonMaxSuppressionWithOverlapsCpuKernel::GetInputAndCheck(CpuKernelContext &ctx) {
|
||||
KERNEL_HANDLE_ERROR(NormalCheck(ctx, kInputNum, kOutputNum),
|
||||
"NonMaxSuppressionWithOverlaps check input and output number failed.");
|
||||
overlaps_ = ctx.Input(kFirstInputIndex);
|
||||
scores_ = ctx.Input(kSecondInputIndex);
|
||||
Tensor *max_output_size_tensor = ctx.Input(kThirdInputIndex);
|
||||
max_output_size_ = *static_cast<int32_t *>(max_output_size_tensor->GetData());
|
||||
KERNEL_CHECK_FALSE((max_output_size_ >= 0), KERNEL_STATUS_PARAM_INVALID,
|
||||
"The input max_output_size must be non-negative");
|
||||
overlap_threshold_tensor_ = ctx.Input(kFourthInputIndex);
|
||||
score_threshold_tensor_ = ctx.Input(kFifthInputIndex);
|
||||
output_indices_ = ctx.Output(kFirstOutputIndex);
|
||||
|
||||
std::shared_ptr<TensorShape> overlaps_shape = overlaps_->GetTensorShape();
|
||||
int32_t overlaps_rank = overlaps_shape->GetDims();
|
||||
if (overlaps_rank != kOverlapsRank || overlaps_shape->GetDimSize(0) != overlaps_shape->GetDimSize(1)) {
|
||||
KERNEL_LOG_ERROR(
|
||||
"The input dim size of overlaps must be 2-D and must be square, "
|
||||
"while %d, %lld",
|
||||
overlaps_rank, overlaps_shape->GetDimSize(1));
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
num_boxes_ = overlaps_shape->GetDimSize(0);
|
||||
|
||||
std::shared_ptr<TensorShape> scores_shape = scores_->GetTensorShape();
|
||||
int32_t scores_rank = scores_shape->GetDims();
|
||||
KERNEL_CHECK_FALSE((scores_rank == 1), KERNEL_STATUS_PARAM_INVALID,
|
||||
"The input dim size of scores must be 1-D, while %d.", scores_rank);
|
||||
KERNEL_CHECK_FALSE((scores_shape->GetDimSize(0) == num_boxes_), KERNEL_STATUS_PARAM_INVALID,
|
||||
"The len of scores must be equal to the number of boxes, "
|
||||
"while dims[%lld], num_boxes_[%d].",
|
||||
scores_shape->GetDimSize(0), num_boxes_);
|
||||
|
||||
overlaps_dtype_ = static_cast<DataType>(overlaps_->GetDataType());
|
||||
if (overlaps_dtype_ != DT_FLOAT) {
|
||||
KERNEL_LOG_ERROR("The dtype of input[0] overlaps must be float.");
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
scores_dtype_ = static_cast<DataType>(scores_->GetDataType());
|
||||
if (scores_dtype_ != DT_FLOAT) {
|
||||
KERNEL_LOG_ERROR("The dtype of input[1] scores must be float.");
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
overlap_threshold_dtype_ = static_cast<DataType>(overlap_threshold_tensor_->GetDataType());
|
||||
if (overlap_threshold_dtype_ != DT_FLOAT) {
|
||||
KERNEL_LOG_ERROR("The dtype of input[3] overlap_threshold must be float.");
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
score_threshold_dtype_ = static_cast<DataType>(score_threshold_tensor_->GetDataType());
|
||||
if (score_threshold_dtype_ != DT_FLOAT) {
|
||||
KERNEL_LOG_ERROR("The dtype of input[4] score_threshold must be float.");
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
return KERNEL_STATUS_OK;
|
||||
}
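// Shape/type contract checked above (summary): overlaps is an [N, N] float matrix of pairwise
// overlap scores, scores is a length-N float vector, max_output_size is a non-negative int32
// scalar, and overlap_threshold / score_threshold are float scalars.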
|
||||
|
||||
template <typename T, typename T_threshold>
|
||||
uint32_t NonMaxSuppressionWithOverlapsCpuKernel::DoNonMaxSuppressionWithOverlapsOp() {
|
||||
KERNEL_LOG_INFO("DoNonMaxSuppressionWithOverlapsOp start!!");
|
||||
Eigen::TensorMap<Eigen::Tensor<T, kOverlapsRank, Eigen::RowMajor>> overlaps_map(
|
||||
reinterpret_cast<T *>(overlaps_->GetData()), num_boxes_, num_boxes_);
|
||||
std::vector<T> scores_data(num_boxes_);
|
||||
std::copy_n(reinterpret_cast<T *>(scores_->GetData()), num_boxes_, scores_data.begin());
|
||||
auto overlap_threshold = static_cast<T>(*(static_cast<T_threshold *>(overlap_threshold_tensor_->GetData())));
|
||||
auto score_threshold = static_cast<T>(*(static_cast<T_threshold *>(score_threshold_tensor_->GetData())));
|
||||
std::unique_ptr<int32_t[]> indices_data(new int32_t[max_output_size_]);
|
||||
if (indices_data == nullptr) {
|
||||
KERNEL_LOG_ERROR("DoNonMaxSuppressionWithOverlapsOp: new indices_data failed");
|
||||
return KERNEL_STATUS_INNER_ERROR;
|
||||
}
|
||||
struct Candidate {
|
||||
int box_index;
|
||||
T score;
|
||||
int suppress_begin_index;
|
||||
};
|
||||
auto cmp = [](const Candidate &boxes_i, const Candidate &boxes_j) { return boxes_i.score < boxes_j.score; };
|
||||
std::priority_queue<Candidate, std::deque<Candidate>, decltype(cmp)> candidate_priority_queue(cmp);
|
||||
for (uint32_t i = 0; i < scores_data.size(); ++i) {
|
||||
if (scores_data[i] > score_threshold) {
|
||||
candidate_priority_queue.emplace(Candidate({static_cast<int>(i), scores_data[i]}));
|
||||
}
|
||||
}
|
||||
T similarity = static_cast<T>(0.0);
|
||||
Candidate next_candidate = {.box_index = 0, .score = static_cast<T>(0.0), .suppress_begin_index = 0};
|
||||
int32_t cnt = 0;
|
||||
while (cnt < max_output_size_ && !candidate_priority_queue.empty()) {
|
||||
next_candidate = candidate_priority_queue.top();
|
||||
candidate_priority_queue.pop();
|
||||
bool should_suppress = false;
|
||||
for (int j = cnt - 1; j >= next_candidate.suppress_begin_index; --j) {
|
||||
similarity = overlaps_map(next_candidate.box_index, indices_data[j]);
|
||||
if (similarity >= overlap_threshold) {
|
||||
should_suppress = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
next_candidate.suppress_begin_index = cnt;
|
||||
if (!should_suppress) {
|
||||
indices_data[cnt] = next_candidate.box_index;
|
||||
cnt += 1;
|
||||
}
|
||||
}
|
||||
auto value = reinterpret_cast<int32_t *>(output_indices_->GetData());
|
||||
for (int j = 0; j < std::min(cnt, max_output_size_); j++) {
|
||||
*(value + j) = indices_data[j];
|
||||
}
|
||||
output_indices_->GetTensorShape()->SetDimSizes({std::min(cnt, max_output_size_)});
|
||||
KERNEL_LOG_INFO("DoNonMaxSuppressionWithOverlapsOp end!!");
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
|
||||
uint32_t NonMaxSuppressionWithOverlapsCpuKernel::Compute(CpuKernelContext &ctx) {
|
||||
KERNEL_LOG_INFO("NonMaxSuppressionWithOverlaps kernel in.");
|
||||
uint32_t res = GetInputAndCheck(ctx);
|
||||
if (res != KERNEL_STATUS_OK) {
|
||||
return res;
|
||||
}
|
||||
res = DoNonMaxSuppressionWithOverlapsOp<float, float>();
|
||||
KERNEL_CHECK_FALSE((res == KERNEL_STATUS_OK), res, "Compute failed.");
|
||||
KERNEL_LOG_INFO("Compute end!!");
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
REGISTER_CPU_KERNEL(kNonMaxSuppressionWithOverlaps, NonMaxSuppressionWithOverlapsCpuKernel);
|
||||
} // namespace aicpu
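For reference, a minimal standalone sketch (not part of the kernel; the helper name NmsWithOverlaps and the plain-vector interface are assumptions) of the greedy selection performed by DoNonMaxSuppressionWithOverlapsOp: boxes are visited in descending score order, and a candidate is kept only if its overlap with every previously kept box stays below the threshold.

#include <algorithm>
#include <cstddef>
#include <numeric>
#include <vector>

// overlaps is a row-major num_boxes x num_boxes matrix; returns the kept box indices.
std::vector<int> NmsWithOverlaps(const std::vector<float> &overlaps, const std::vector<float> &scores,
                                 int max_output_size, float overlap_threshold, float score_threshold) {
  const std::size_t num_boxes = scores.size();
  std::vector<int> order(num_boxes);
  std::iota(order.begin(), order.end(), 0);
  // Highest score first, mirroring the priority queue used by the kernel.
  std::sort(order.begin(), order.end(), [&](int a, int b) { return scores[a] > scores[b]; });
  std::vector<int> kept;
  for (int idx : order) {
    if (static_cast<int>(kept.size()) >= max_output_size) break;
    if (scores[idx] <= score_threshold) continue;
    bool suppressed = false;
    for (int k : kept) {
      if (overlaps[idx * num_boxes + k] >= overlap_threshold) {
        suppressed = true;
        break;
      }
    }
    if (!suppressed) kept.push_back(idx);
  }
  return kept;
}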
|
|
@ -0,0 +1,47 @@
|
|||
/**
|
||||
* Copyright 2021 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
#ifndef AICPU_KERNELS_NORMALIZED_NON_MAX_SUPPRESSION_WITH_OVERLAPS_H_
|
||||
#define AICPU_KERNELS_NORMALIZED_NON_MAX_SUPPRESSION_WITH_OVERLAPS_H_
|
||||
|
||||
#include "cpu_ops_kernel.h"
|
||||
#include "cpu_types.h"
|
||||
#include "eigen_tensor.h"
|
||||
|
||||
namespace aicpu {
|
||||
class NonMaxSuppressionWithOverlapsCpuKernel : public CpuKernel {
|
||||
public:
|
||||
~NonMaxSuppressionWithOverlapsCpuKernel() = default;
|
||||
uint32_t Compute(CpuKernelContext &ctx) override;
|
||||
|
||||
private:
|
||||
uint32_t GetInputAndCheck(CpuKernelContext &ctx);
|
||||
template <typename T, typename T_threshold>
|
||||
uint32_t DoNonMaxSuppressionWithOverlapsOp();
|
||||
|
||||
const Tensor *overlaps_ = nullptr;
|
||||
Tensor *scores_ = nullptr;
|
||||
Tensor *overlap_threshold_tensor_ = nullptr;
|
||||
Tensor *score_threshold_tensor_ = nullptr;
|
||||
Tensor *output_indices_ = nullptr;
|
||||
int32_t num_boxes_ = 0;
|
||||
int32_t max_output_size_ = 0;
|
||||
DataType overlaps_dtype_ = DT_UINT32;
|
||||
DataType scores_dtype_ = DT_UINT32;
|
||||
DataType overlap_threshold_dtype_ = DT_UINT32;
|
||||
DataType score_threshold_dtype_ = DT_UINT32;
|
||||
};
|
||||
} // namespace aicpu
|
||||
#endif // AICPU_KERNELS_NORMALIZED_NON_MAX_SUPPRESSION_WITH_OVERLAPS_H_
|
|
@ -0,0 +1,138 @@
|
|||
/**
|
||||
* Copyright 2021 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
#include "nth_element.h"
|
||||
|
||||
#include <vector>
|
||||
#include <algorithm>
|
||||
#include "cpu_kernel_utils.h"
|
||||
#include "utils/kernel_util.h"
|
||||
#include "utils/eigen_tensor.h"
|
||||
#include "kernel_log.h"
|
||||
#include "status.h"
|
||||
using namespace std;
|
||||
|
||||
namespace {
|
||||
const char *kNthElement = "NthElement";
|
||||
constexpr uint64_t kParallelDataNums = 32 * 1024;
|
||||
|
||||
#define NTHELEMENT_COMPUTE_CASE(DTYPE, TYPE, X, Y, N, LAST_DIM, CTX) \
|
||||
case (DTYPE): { \
|
||||
uint32_t result = NthElementCompute<TYPE>(X, Y, N, LAST_DIM, CTX); \
|
||||
if (result != KERNEL_STATUS_OK) { \
|
||||
KERNEL_LOG_ERROR("NthElement kernel compute failed."); \
|
||||
return result; \
|
||||
} \
|
||||
break; \
|
||||
}
|
||||
} // namespace
|
||||
|
||||
namespace aicpu {
|
||||
uint32_t NthElement::Compute(CpuKernelContext &ctx) {
|
||||
Tensor *input_n = ctx.Input(1);
|
||||
KERNEL_CHECK_FALSE((input_n->GetTensorShape()->GetDimSizes().empty()), KERNEL_STATUS_PARAM_INVALID,
|
||||
"Input n must be a scalar.");
|
||||
DataType n_type = input_n->GetDataType();
|
||||
KERNEL_CHECK_FALSE((n_type == DT_INT32), KERNEL_STATUS_PARAM_INVALID, "The type of input n must be int32.");
|
||||
KERNEL_CHECK_NULLPTR(input_n->GetData(), KERNEL_STATUS_PARAM_INVALID, "NthElement Get input n failed.");
|
||||
int32_t *n_data = reinterpret_cast<int32_t *>(input_n->GetData());
|
||||
int32_t n = *n_data;
|
||||
KERNEL_CHECK_FALSE((n >= 0), KERNEL_STATUS_PARAM_INVALID, "Input n must be non-negative but is [%d].", n);
|
||||
|
||||
Tensor *x = ctx.Input(0);
|
||||
KERNEL_CHECK_NULLPTR(x, KERNEL_STATUS_PARAM_INVALID, "NthElement Get input x failed.");
|
||||
auto x_shape = x->GetTensorShape();
|
||||
int32_t dims = x_shape->GetDims();
|
||||
KERNEL_CHECK_FALSE((dims >= 1), KERNEL_STATUS_PARAM_INVALID, "Input x must be at least rank 1 but is rank [%d]",
|
||||
dims);
|
||||
const int32_t last_dim = x_shape->GetDimSize(dims - 1);
|
||||
KERNEL_CHECK_FALSE((last_dim > n), KERNEL_STATUS_PARAM_INVALID, "Input x must have last dimension = [%d] > n = [%d]",
|
||||
last_dim, n);
|
||||
|
||||
AttrValue *reverse_attr = ctx.GetAttr("reverse");
|
||||
KERNEL_CHECK_NULLPTR(reverse_attr, KERNEL_STATUS_PARAM_INVALID, "NthElement get attr reverse failed.");
|
||||
bool reverse = reverse_attr->GetBool();
|
||||
if (reverse) {
|
||||
n = last_dim - n - 1;
|
||||
}
|
||||
|
||||
Tensor *y = ctx.Output(0);
|
||||
|
||||
auto x_type = x->GetDataType();
|
||||
switch (x_type) {
|
||||
NTHELEMENT_COMPUTE_CASE(DT_FLOAT, float, x, y, n, last_dim, ctx)
|
||||
NTHELEMENT_COMPUTE_CASE(DT_FLOAT16, Eigen::half, x, y, n, last_dim, ctx)
|
||||
NTHELEMENT_COMPUTE_CASE(DT_UINT8, uint8_t, x, y, n, last_dim, ctx)
|
||||
NTHELEMENT_COMPUTE_CASE(DT_UINT16, uint16_t, x, y, n, last_dim, ctx)
|
||||
NTHELEMENT_COMPUTE_CASE(DT_INT8, int8_t, x, y, n, last_dim, ctx)
|
||||
NTHELEMENT_COMPUTE_CASE(DT_INT16, int16_t, x, y, n, last_dim, ctx)
|
||||
NTHELEMENT_COMPUTE_CASE(DT_INT32, int32_t, x, y, n, last_dim, ctx)
|
||||
NTHELEMENT_COMPUTE_CASE(DT_INT64, int64_t, x, y, n, last_dim, ctx)
|
||||
NTHELEMENT_COMPUTE_CASE(DT_DOUBLE, double, x, y, n, last_dim, ctx)
|
||||
default:
|
||||
KERNEL_LOG_ERROR("[%s] Data type of input is not support, input data type is [%s].", ctx.GetOpType().c_str(),
|
||||
DTypeStr(x_type).c_str());
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
uint32_t NthElement::NthElementCompute(Tensor *x, Tensor *y, const int32_t n, const int32_t last_dim,
|
||||
CpuKernelContext &ctx) {
|
||||
T *x_addrs = reinterpret_cast<T *>(x->GetData());
|
||||
T *y_addrs = reinterpret_cast<T *>(y->GetData());
|
||||
|
||||
const uint64_t num_rows = y->NumElements();
|
||||
const uint64_t num = x->NumElements();
|
||||
|
||||
if (num <= kParallelDataNums) {
|
||||
std::vector<T> buf(last_dim);
|
||||
for (size_t i = 0; i < num_rows; i++) {
|
||||
const T *input_start = x_addrs + i * last_dim;
|
||||
const T *input_end = input_start + last_dim;
|
||||
std::copy(input_start, input_end, buf.begin());
|
||||
std::nth_element(buf.begin(), buf.begin() + n, buf.end());
|
||||
y_addrs[i] = buf[n];
|
||||
}
|
||||
} else {
|
||||
uint32_t min_core_num = 1;
|
||||
uint64_t max_core_num = std::max(min_core_num, aicpu::CpuKernelUtils::GetCPUNum(ctx) - 2);
|
||||
if (max_core_num > num_rows) {
|
||||
max_core_num = num_rows;
|
||||
}
|
||||
auto shard_nth_element = [&](size_t start, size_t end) {
|
||||
std::vector<T> buf(last_dim);
|
||||
for (size_t i = start; i < end; ++i) {
|
||||
const T *input_start = x_addrs + i * last_dim;
|
||||
const T *input_end = input_start + last_dim;
|
||||
std::copy(input_start, input_end, buf.begin());
|
||||
std::nth_element(buf.begin(), buf.begin() + n, buf.end());
|
||||
y_addrs[i] = buf[n];
|
||||
}
|
||||
};
|
||||
if (max_core_num == 0) {
|
||||
KERNEL_LOG_ERROR("max_core_num could not be 0");
|
||||
}
|
||||
KERNEL_HANDLE_ERROR(CpuKernelUtils::ParallelFor(ctx, num_rows, num_rows / max_core_num, shard_nth_element),
|
||||
"NthElement Parallel Compute failed.");
|
||||
}
|
||||
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
|
||||
REGISTER_CPU_KERNEL(kNthElement, NthElement);
|
||||
} // namespace aicpu
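As a usage-level sketch (the helper name NthElementRows and the flattened-vector interface are assumptions, not the kernel API), this is the per-row selection that NthElementCompute performs: each row of length last_dim is copied into a scratch buffer and std::nth_element places the n-th smallest value at index n.

#include <algorithm>
#include <cstdint>
#include <vector>

// For each row of a flattened [num_rows, last_dim] buffer, write the n-th smallest
// element of that row into out[row].
void NthElementRows(const std::vector<float> &x, int64_t num_rows, int64_t last_dim, int64_t n,
                    std::vector<float> *out) {
  std::vector<float> buf(last_dim);
  out->resize(num_rows);
  for (int64_t row = 0; row < num_rows; ++row) {
    const float *begin = x.data() + row * last_dim;
    std::copy(begin, begin + last_dim, buf.begin());
    std::nth_element(buf.begin(), buf.begin() + n, buf.end());
    (*out)[row] = buf[n];
  }
}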
|
|
@ -0,0 +1,34 @@
|
|||
/**
|
||||
* Copyright 2021 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
#ifndef AICPU_KERNELS_NORMALIZED_NTH_ELEMENT_H_
|
||||
#define AICPU_KERNELS_NORMALIZED_NTH_ELEMENT_H_
|
||||
|
||||
#include "cpu_ops_kernel.h"
|
||||
#include "cpu_types.h"
|
||||
|
||||
namespace aicpu {
|
||||
class NthElement : public CpuKernel {
|
||||
public:
|
||||
NthElement() = default;
|
||||
~NthElement() override = default;
|
||||
uint32_t Compute(CpuKernelContext &ctx) override;
|
||||
|
||||
private:
|
||||
template <typename T>
|
||||
uint32_t NthElementCompute(Tensor *x, Tensor *y, const int32_t n, const int32_t last_dim, CpuKernelContext &ctx);
|
||||
};
|
||||
} // namespace aicpu
|
||||
#endif
|
|
@ -0,0 +1,198 @@
|
|||
/**
|
||||
* Copyright 2021 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
/*!
|
||||
* \file one_hot.cc
|
||||
* \brief
|
||||
*/
|
||||
#include "one_hot.h"
|
||||
#include <string>
|
||||
#include "cpu_kernel_utils.h"
|
||||
#include "kernel_log.h"
|
||||
#include "status.h"
|
||||
#include "cpu_types.h"
|
||||
#include "utils/kernel_util.h"
|
||||
#include "utils/eigen_tensor.h"
|
||||
#include "utils/sparse_tensor.h"
|
||||
|
||||
namespace {
|
||||
const uint32_t kInputNum = 4;
|
||||
const uint32_t kOutputNum = 1;
|
||||
const char *kOneHot = "OneHot";
|
||||
const int64_t kParallelDataNumSameShape = 100 * 1024;
|
||||
#define ONE_HOT_INPUT_COMPUTE_CASE(DTYPE, TYPE, ODTYPE, CTX) \
|
||||
case (DTYPE): { \
|
||||
switch (ODTYPE) { \
|
||||
ONE_HOT_OUTPUT_COMPUTE_CASE(DTYPE, TYPE, DT_COMPLEX64, std::complex<float>, CTX) \
|
||||
ONE_HOT_OUTPUT_COMPUTE_CASE(DTYPE, TYPE, DT_COMPLEX128, std::complex<double>, CTX) \
|
||||
ONE_HOT_OUTPUT_COMPUTE_CASE(DTYPE, TYPE, DT_DOUBLE, double, CTX) \
|
||||
ONE_HOT_OUTPUT_COMPUTE_CASE(DTYPE, TYPE, DT_FLOAT, float_t, CTX); \
|
||||
ONE_HOT_OUTPUT_COMPUTE_CASE(DTYPE, TYPE, DT_FLOAT16, Eigen::half, CTX) \
|
||||
ONE_HOT_OUTPUT_COMPUTE_CASE(DTYPE, TYPE, DT_INT8, int8_t, CTX) \
|
||||
ONE_HOT_OUTPUT_COMPUTE_CASE(DTYPE, TYPE, DT_INT16, int16_t, CTX) \
|
||||
ONE_HOT_OUTPUT_COMPUTE_CASE(DTYPE, TYPE, DT_INT32, int32_t, CTX) \
|
||||
ONE_HOT_OUTPUT_COMPUTE_CASE(DTYPE, TYPE, DT_INT64, int64_t, CTX) \
|
||||
ONE_HOT_OUTPUT_COMPUTE_CASE(DTYPE, TYPE, DT_UINT8, uint8_t, CTX) \
|
||||
ONE_HOT_OUTPUT_COMPUTE_CASE(DTYPE, TYPE, DT_UINT16, uint16_t, CTX) \
|
||||
ONE_HOT_OUTPUT_COMPUTE_CASE(DTYPE, TYPE, DT_UINT32, uint32_t, CTX) \
|
||||
ONE_HOT_OUTPUT_COMPUTE_CASE(DTYPE, TYPE, DT_UINT64, uint64_t, CTX) \
|
||||
ONE_HOT_OUTPUT_COMPUTE_CASE(DTYPE, TYPE, DT_BOOL, bool, CTX) \
|
||||
ONE_HOT_OUTPUT_COMPUTE_CASE(DTYPE, TYPE, DT_STRING, std::string, CTX) \
|
||||
default: \
|
||||
KERNEL_LOG_ERROR("OneHot kernel output data type [%s] not support.", DTypeStr(output_data_type).c_str()); \
|
||||
return KERNEL_STATUS_PARAM_INVALID; \
|
||||
} \
|
||||
break; \
|
||||
}
|
||||
|
||||
#define ONE_HOT_OUTPUT_COMPUTE_CASE(DTYPE, TYPE, ODTYPE, OTYPE, CTX) \
|
||||
case (ODTYPE): { \
|
||||
uint32_t result = OneHotCompute<OTYPE, TYPE>(CTX); \
|
||||
if (result != KERNEL_STATUS_OK) { \
|
||||
KERNEL_LOG_ERROR("OneHot kernel compute failed."); \
|
||||
return result; \
|
||||
} \
|
||||
break; \
|
||||
}
|
||||
} // namespace
|
||||
|
||||
namespace aicpu {
|
||||
uint32_t OneHotCpuKernel::Compute(CpuKernelContext &ctx) {
|
||||
KERNEL_HANDLE_ERROR(NormalCheck(ctx, kInputNum, kOutputNum), "OneHot check input and output number failed.");
|
||||
KERNEL_HANDLE_ERROR(OneHotParamCheck(ctx), "OneHot check params failed.");
|
||||
auto input_data_type = ctx.Input(0)->GetDataType();
|
||||
auto output_data_type = ctx.Output(0)->GetDataType();
|
||||
switch (input_data_type) {
|
||||
ONE_HOT_INPUT_COMPUTE_CASE(DT_UINT8, uint8_t, output_data_type, ctx);
|
||||
ONE_HOT_INPUT_COMPUTE_CASE(DT_INT32, int32_t, output_data_type, ctx);
|
||||
ONE_HOT_INPUT_COMPUTE_CASE(DT_INT64, int64_t, output_data_type, ctx);
|
||||
default:
|
||||
KERNEL_LOG_ERROR("OneHot kernel input data type [%s] not support.", DTypeStr(input_data_type).c_str());
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
|
||||
template <typename T, typename TI>
|
||||
uint32_t OneHotCpuKernel::OneHotCompute(CpuKernelContext &ctx) {
|
||||
// Input tensor
|
||||
Tensor *indices = ctx.Input(0);
|
||||
// Output tensor
|
||||
Tensor *output = ctx.Output(0);
|
||||
// Input tensor data
|
||||
auto indices_data = reinterpret_cast<TI *>(indices->GetData());
|
||||
// Output tensor data
|
||||
auto output_data = reinterpret_cast<T *>(output->GetData());
|
||||
// depth value
|
||||
auto depth = reinterpret_cast<int32_t *>(ctx.Input(1)->GetData());
|
||||
// on_value data
|
||||
auto on_value = reinterpret_cast<T *>(ctx.Input(2)->GetData());
|
||||
// off_value data
|
||||
auto off_value = reinterpret_cast<T *>(ctx.Input(3)->GetData());
|
||||
// Input tensor shape
|
||||
auto indices_shape = indices->GetTensorShape();
|
||||
// axis value
|
||||
int64_t axis = ctx.GetAttr("axis") == nullptr ? -1 : ctx.GetAttr("axis")->GetInt();
|
||||
if (axis == -1) {
|
||||
axis = indices_shape->GetDims();
|
||||
}
|
||||
// Output tensor shape
|
||||
auto output_shape = output->GetTensorShape();
|
||||
// Lambda that initializes the whole output tensor with off_value
|
||||
auto init_output_func = [&](int64_t start, int64_t end) -> void {
|
||||
for (int i = start; i < end; ++i) {
|
||||
*(output_data + i) = *(off_value);
|
||||
}
|
||||
};
|
||||
// Compute the product of dimension sizes before axis
|
||||
int64_t prefix_dim_size = 1;
|
||||
for (int i = 0; i < axis; ++i) {
|
||||
prefix_dim_size *= indices_shape->GetDimSize(i);
|
||||
}
|
||||
// Compute the product of dimension sizes after axis
|
||||
int64_t suffix_dim_size = indices_shape->NumElements() / prefix_dim_size;
|
||||
// Total number of elements in the input tensor
|
||||
int64_t data_num = indices_shape->NumElements();
|
||||
// depth_value is the concrete value of depth
|
||||
int32_t depth_value = *(depth);
|
||||
// View the output tensor shape as {prefix_dim_size, depth, suffix_dim_size}.
|
||||
// Compute the position of the hot value as offset = suffix_dim_size == 1 ? (i * depth_value + depth_v)
|
||||
// : (d0 * depth_value * suffix_dim_size + depth_v * suffix_dim_size + d1), then set that position of the output to on_value.
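// Worked example (hypothetical shapes, for illustration only): indices = [[1, 3], [0, 2]] with
// shape {2, 2}, depth_value = 4 and axis = -1 give prefix_dim_size = 4 and suffix_dim_size = 1,
// so the output shape is {2, 2, 4}. For i = 1 the indices value is 3, hence
// offset = i * depth_value + depth_v = 1 * 4 + 3 = 7, and flat output position 7 (element [0][1][3])
// is set to on_value while every other position keeps off_value.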
|
||||
const auto get_output_func = [&](int64_t start, int64_t end) -> void {
|
||||
for (int64_t i = start; i < end; ++i) {
|
||||
int64_t d0 = i / suffix_dim_size;
|
||||
int64_t d1 = i - (d0 * suffix_dim_size);
|
||||
int64_t depth_v = SubtleMustCopy<int64_t>(*(indices_data + d0 * suffix_dim_size + d1));
|
||||
if (depth_v < static_cast<int64_t>(depth_value) && depth_v >= 0) {
|
||||
int64_t offset = suffix_dim_size == 1 ? i * depth_value + depth_v
|
||||
: d0 * depth_value * suffix_dim_size + depth_v * suffix_dim_size + d1;
|
||||
*(output_data + offset) = *(on_value);
|
||||
}
|
||||
}
|
||||
};
|
||||
// Use the CpuKernelUtils::GetCPUNum interface to get the number of AI CPU cores
|
||||
uint32_t max_core_num = std::max(1U, aicpu::CpuKernelUtils::GetCPUNum(ctx));
|
||||
// Multi-thread execution status
|
||||
bool run_state = true;
|
||||
// For data sizes smaller than 100K run on a single core; otherwise use the actual number of AI CPU cores
|
||||
if (data_num >= kParallelDataNumSameShape) {
|
||||
max_core_num = (max_core_num > data_num) ? data_num : max_core_num;
|
||||
max_core_num = max_core_num == 0 ? 1 : max_core_num;
|
||||
uint32_t ret1 = CpuKernelUtils::ParallelFor(ctx, output_shape->NumElements(),
|
||||
(output_shape->NumElements() / max_core_num), init_output_func);
|
||||
uint32_t ret2 = CpuKernelUtils::ParallelFor(ctx, data_num, (data_num / max_core_num), get_output_func);
|
||||
run_state = (ret1 == KERNEL_STATUS_OK) && (ret2 == KERNEL_STATUS_OK);
|
||||
} else {
|
||||
// Input data is smaller than 100K, run on a single core
|
||||
init_output_func(0, output_shape->NumElements());
|
||||
get_output_func(0, data_num);
|
||||
}
|
||||
return run_state ? KERNEL_STATUS_OK : KERNEL_STATUS_INNER_ERROR;
|
||||
}
|
||||
|
||||
// Parameter validation
|
||||
uint32_t OneHotCpuKernel::OneHotParamCheck(CpuKernelContext &ctx) {
|
||||
Tensor *indices = ctx.Input(0);
|
||||
Tensor *depth = ctx.Input(1);
|
||||
Tensor *on_value = ctx.Input(2);
|
||||
Tensor *off_value = ctx.Input(3);
|
||||
int64_t axis = ctx.GetAttr("axis") == nullptr ? -1 : ctx.GetAttr("axis")->GetInt();
|
||||
|
||||
DataType on_value_type = on_value->GetDataType();
|
||||
DataType off_value_type = off_value->GetDataType();
|
||||
KERNEL_CHECK_FALSE((on_value_type == off_value_type), KERNEL_STATUS_PARAM_INVALID,
|
||||
"The data type of on_value [%s] need be same with off_value [%s].",
|
||||
DTypeStr(on_value_type).c_str(), DTypeStr(off_value_type).c_str())
|
||||
auto depth_shape = depth->GetTensorShape();
|
||||
auto on_value_shape = on_value->GetTensorShape();
|
||||
auto off_value_shape = off_value->GetTensorShape();
|
||||
KERNEL_CHECK_FALSE((depth_shape->GetDims() == 0), KERNEL_STATUS_PARAM_INVALID,
|
||||
"Depth must be a scalar, actual dim num is %d.", depth_shape->GetDims())
|
||||
KERNEL_CHECK_FALSE((on_value_shape->GetDims() == 0), KERNEL_STATUS_PARAM_INVALID,
|
||||
"On_value must be a scalar, actual dim num is %d.", on_value_shape->GetDims())
|
||||
KERNEL_CHECK_FALSE((off_value_shape->GetDims() == 0), KERNEL_STATUS_PARAM_INVALID,
|
||||
"Off_value must be a scalar , actual dim num is %d.", off_value_shape->GetDims())
|
||||
int32_t output_dims = indices->GetTensorShape()->GetDims() + 1;
|
||||
KERNEL_CHECK_FALSE(((axis > -2 && axis < output_dims)), KERNEL_STATUS_PARAM_INVALID,
|
||||
"Expected axis value should between [-1, %d]. But received: %d.", output_dims - 1, axis)
|
||||
int32_t depth_value = *(reinterpret_cast<int32_t *>(ctx.Input(1)->GetData()));
|
||||
KERNEL_CHECK_FALSE((depth_value >= 0), KERNEL_STATUS_PARAM_INVALID,
|
||||
"Depth should be a non-negative. But received: %d.", depth_value)
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
|
||||
REGISTER_CPU_KERNEL(kOneHot, OneHotCpuKernel);
|
||||
} // namespace aicpu
|
|
@ -0,0 +1,41 @@
|
|||
/**
|
||||
* Copyright 2021 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
/*!
|
||||
* \file one_hot.h
|
||||
* \brief
|
||||
*/
|
||||
#ifndef AICPU_KERNELS_NORMALIZED_ONE_HOT_H_
|
||||
#define AICPU_KERNELS_NORMALIZED_ONE_HOT_H_
|
||||
|
||||
#include <type_traits>
|
||||
#include "cpu_ops_kernel.h"
|
||||
|
||||
namespace aicpu {
|
||||
class OneHotCpuKernel : public CpuKernel {
|
||||
public:
|
||||
OneHotCpuKernel() = default;
|
||||
~OneHotCpuKernel() = default;
|
||||
uint32_t Compute(CpuKernelContext &ctx) override;
|
||||
|
||||
private:
|
||||
template <typename T, typename TI>
|
||||
uint32_t OneHotCompute(CpuKernelContext &ctx);
|
||||
|
||||
uint32_t OneHotParamCheck(CpuKernelContext &ctx);
|
||||
};
|
||||
} // namespace aicpu
|
||||
#endif
|
|
@ -0,0 +1,228 @@
|
|||
/**
|
||||
* Copyright (c) Huawei Technologies Co., Ltd. 2022-2022. All rights reserved.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include "orgqr.h"
|
||||
|
||||
#include "Eigen/Dense"
|
||||
|
||||
#include "cpu_kernel_utils.h"
|
||||
#include "utils/eigen_tensor.h"
|
||||
#include "utils/kernel_util.h"
|
||||
#include <numeric>
|
||||
#include <iostream>
|
||||
|
||||
using namespace Eigen;
|
||||
namespace {
|
||||
const uint32_t kOutputNum = 1;
|
||||
const uint32_t kInputNum = 2;
|
||||
const char *kOrgqr = "Orgqr";
|
||||
const double ZERO = 0.;
|
||||
const uint32_t kTWO = 2;
|
||||
constexpr int64_t kParallelDataNums = 18 * 1024;
|
||||
constexpr int64_t kParallelDataNumsMid = 32 * 1024;
|
||||
|
||||
#define ORGQR_COMPUTE(DTYPE, TYPE, CTX) \
|
||||
case (DTYPE): { \
|
||||
uint32_t result = OrgqrCompute<TYPE>(CTX); \
|
||||
if (result != KERNEL_STATUS_OK) { \
|
||||
KERNEL_LOG_ERROR("Orgqr kernel compute failed."); \
|
||||
return result; \
|
||||
} \
|
||||
break; \
|
||||
}
|
||||
#define ORGQR_COMPUTE_COMPLEX(DTYPE, TYPE, CTX) \
|
||||
case (DTYPE): { \
|
||||
uint32_t result = OrgqrComputeComplex<TYPE>(CTX); \
|
||||
if (result != KERNEL_STATUS_OK) { \
|
||||
KERNEL_LOG_ERROR("Orgqr kernel compute failed."); \
|
||||
return result; \
|
||||
} \
|
||||
break; \
|
||||
}
|
||||
} // namespace
|
||||
|
||||
namespace aicpu {
|
||||
uint32_t OrgqrCpuKernel::Compute(CpuKernelContext &ctx) {
|
||||
// check params
|
||||
KERNEL_HANDLE_ERROR(NormalCheck(ctx, kInputNum, kOutputNum), "Orgqr check input and output number failed.");
|
||||
KERNEL_HANDLE_ERROR(OrgqrCheck(ctx), "[%s] check params failed.", kOrgqr);
|
||||
auto data_type = ctx.Input(0)->GetDataType();
|
||||
switch (data_type) {
|
||||
ORGQR_COMPUTE(DT_FLOAT, float, ctx)
|
||||
ORGQR_COMPUTE(DT_DOUBLE, double, ctx)
|
||||
ORGQR_COMPUTE_COMPLEX(DT_COMPLEX64, std::complex<float_t>, ctx)
|
||||
ORGQR_COMPUTE_COMPLEX(DT_COMPLEX128, std::complex<double_t>, ctx)
|
||||
default:
|
||||
KERNEL_LOG_ERROR("Orgqr kernel data type [%s] not support.", DTypeStr(data_type).c_str());
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
|
||||
uint32_t OrgqrCpuKernel::OrgqrCheck(CpuKernelContext &ctx) {
|
||||
std::vector<int64_t> shape_x = ctx.Input(0)->GetTensorShape()->GetDimSizes();
|
||||
size_t shape_size = shape_x.size();
|
||||
KERNEL_CHECK_FALSE((shape_size > 1), KERNEL_STATUS_PARAM_INVALID, "Input x must be at least rank 2.")
|
||||
KERNEL_CHECK_FALSE((shape_x[shape_size - kTWO] > 0), KERNEL_STATUS_PARAM_INVALID,
|
||||
"Dimension [%zu] of input x must be at least 1, but [%zu].", shape_size - kTWO,
|
||||
shape_x[shape_size - kTWO])
|
||||
KERNEL_CHECK_FALSE((shape_x[shape_size - 1] > 0), KERNEL_STATUS_PARAM_INVALID,
|
||||
"Dimension [%zu] of input x must be at least 1, but [%zu].", shape_size - 1,
|
||||
shape_x[shape_size - 1])
|
||||
KERNEL_CHECK_FALSE((shape_x[shape_size - kTWO] >= shape_x[shape_size - 1]), KERNEL_STATUS_PARAM_INVALID,
|
||||
"Dimension [%zu] of input x must be bigger than dimension [%zu], when input x has rank [%zu].",
|
||||
shape_size - kTWO, shape_size - 1, shape_size)
|
||||
std::vector<int64_t> shape_tau = ctx.Input(1)->GetTensorShape()->GetDimSizes();
|
||||
size_t shape_tau_size = shape_tau.size();
|
||||
KERNEL_CHECK_FALSE((shape_x[shape_size - 1] >= shape_tau[shape_tau_size - 1]), KERNEL_STATUS_PARAM_INVALID,
|
||||
"Dimension [%zu] of input tau must be less than [%zu], but [%zu].", shape_tau_size - 1,
|
||||
shape_x[shape_size - 1], shape_tau[shape_tau_size - 1])
|
||||
if (shape_size > kTWO) {
|
||||
KERNEL_CHECK_FALSE((shape_x[0] == shape_tau[0]), KERNEL_STATUS_PARAM_INVALID,
|
||||
"Dimension 0 of input tau must equal Dimension 0 of input x when input has batch")
|
||||
}
|
||||
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
uint32_t OrgqrCpuKernel::OrgqrCompute(CpuKernelContext &ctx) {
|
||||
auto *x = reinterpret_cast<T *>(ctx.Input(0)->GetData());
|
||||
auto *tau = reinterpret_cast<T *>(ctx.Input(1)->GetData());
|
||||
auto *y = reinterpret_cast<T *>(ctx.Output(0)->GetData());
|
||||
std::vector<int64_t> shape_x = ctx.Input(0)->GetTensorShape()->GetDimSizes();
|
||||
size_t shape_size = shape_x.size();
|
||||
size_t m = shape_x[shape_size - kTWO];
|
||||
size_t n = shape_x[shape_size - 1];
|
||||
std::vector<int64_t> shape_tau = ctx.Input(1)->GetTensorShape()->GetDimSizes();
|
||||
size_t p = *(shape_tau.end() - 1);
|
||||
size_t size_mn = m * n;
|
||||
size_t matrix_num = ctx.Input(0)->NumElements() / size_mn;
|
||||
int64_t data_size = ctx.Input(0)->NumElements() * sizeof(T);
|
||||
typedef Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor> MartrixXd;
|
||||
typedef Eigen::Matrix<T, Eigen::Dynamic, 1> VectorXd;
|
||||
if (data_size <= kParallelDataNums) {
|
||||
for (size_t i = 0; i < matrix_num; i++) {
|
||||
Eigen::Map<MartrixXd> martrix_y(y + i * m * n, m, n);
|
||||
Eigen::Map<MartrixXd> martrix_x(x + i * m * n, m, n);
|
||||
MartrixXd tmp = MartrixXd::Identity(m, m);
|
||||
Eigen::Map<VectorXd> vector_tau(tau + i * p, p, 1);
|
||||
for (size_t k = 0; k < p; k++) {
|
||||
VectorXd vector_v = martrix_x.block(k, k, m - k, 1);
|
||||
vector_v[0] = 1;
|
||||
tmp.rightCols(m - k) =
|
||||
tmp.rightCols(m - k) - vector_tau(k) * (tmp.rightCols(m - k) * vector_v) * vector_v.transpose();
|
||||
}
|
||||
martrix_y = tmp.leftCols(n);
|
||||
}
|
||||
} else {
|
||||
uint32_t min_core_num = 1;
|
||||
uint32_t max_core_num = std::max(min_core_num, aicpu::CpuKernelUtils::GetCPUNum(ctx) - 2);
|
||||
if (data_size <= kParallelDataNumsMid) {
|
||||
max_core_num = std::min(max_core_num, 4U); // up to 4 cpu cores
|
||||
}
|
||||
if (max_core_num > matrix_num) {
|
||||
max_core_num = matrix_num;
|
||||
}
|
||||
auto shard_qr = [&](size_t start, size_t end) {
|
||||
for (size_t i = start; i < end; i++) {
|
||||
Eigen::Map<MartrixXd> martrix_y(y + i * m * n, m, n);
|
||||
Eigen::Map<MartrixXd> martrix_x(x + i * m * n, m, n);
|
||||
MartrixXd tmp = MartrixXd::Identity(m, m);
|
||||
Eigen::Map<VectorXd> vector_tau(tau + i * p, p, 1);
|
||||
for (size_t k = 0; k < p; k++) {
|
||||
VectorXd vector_v = martrix_x.block(k, k, m - k, 1);
|
||||
vector_v[0] = 1;
|
||||
tmp.rightCols(m - k) =
|
||||
tmp.rightCols(m - k) - vector_tau(k) * (tmp.rightCols(m - k) * vector_v) * vector_v.transpose();
|
||||
}
|
||||
martrix_y = tmp.leftCols(n);
|
||||
}
|
||||
};
|
||||
if (max_core_num == 0) {
|
||||
KERNEL_LOG_ERROR("max_core_num could not be 0.");
|
||||
}
|
||||
KERNEL_HANDLE_ERROR(CpuKernelUtils::ParallelFor(ctx, matrix_num, matrix_num / max_core_num, shard_qr),
|
||||
"Orgqr Compute failed.");
|
||||
}
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
uint32_t OrgqrCpuKernel::OrgqrComputeComplex(CpuKernelContext &ctx) {
|
||||
auto *x = reinterpret_cast<T *>(ctx.Input(0)->GetData());
|
||||
auto *tau = reinterpret_cast<T *>(ctx.Input(1)->GetData());
|
||||
auto *y = reinterpret_cast<T *>(ctx.Output(0)->GetData());
|
||||
std::vector<int64_t> shape_x = ctx.Input(0)->GetTensorShape()->GetDimSizes();
|
||||
size_t shape_size = shape_x.size();
|
||||
size_t m = shape_x[shape_size - kTWO];
|
||||
size_t n = shape_x[shape_size - 1];
|
||||
std::vector<int64_t> shape_tau = ctx.Input(1)->GetTensorShape()->GetDimSizes();
|
||||
size_t p = *(shape_tau.end() - 1);
|
||||
size_t size_mn = m * n;
|
||||
size_t matrix_num = ctx.Input(0)->NumElements() / size_mn;
|
||||
int64_t data_size = ctx.Input(0)->NumElements() * sizeof(T);
|
||||
typedef Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor> MartrixXd;
|
||||
typedef Eigen::Matrix<T, Eigen::Dynamic, 1> VectorXd;
|
||||
if (data_size <= kParallelDataNums) {
|
||||
for (size_t i = 0; i < matrix_num; i++) {
|
||||
Eigen::Map<MartrixXd> martrix_y(y + i * m * n, m, n);
|
||||
Eigen::Map<MartrixXd> martrix_x(x + i * m * n, m, n);
|
||||
MartrixXd tmp = MartrixXd::Identity(m, m);
|
||||
Eigen::Map<VectorXd> vector_tau(tau + i * p, p, 1);
|
||||
for (size_t k = 0; k < p; k++) {
|
||||
VectorXd vector_v = martrix_x.block(k, k, m - k, 1);
|
||||
vector_v[0] = 1;
|
||||
tmp.rightCols(m - k) =
|
||||
tmp.rightCols(m - k) - vector_tau(k) * (tmp.rightCols(m - k) * vector_v) * vector_v.adjoint();
|
||||
}
|
||||
martrix_y = tmp.leftCols(n);
|
||||
}
|
||||
} else {
|
||||
uint32_t min_core_num = 1;
|
||||
uint32_t max_core_num = std::max(min_core_num, aicpu::CpuKernelUtils::GetCPUNum(ctx) - 2);
|
||||
if (data_size <= kParallelDataNumsMid) {
|
||||
max_core_num = std::min(max_core_num, 4U); // up to 4 cpu cores
|
||||
}
|
||||
if (max_core_num > matrix_num) {
|
||||
max_core_num = matrix_num;
|
||||
}
|
||||
auto shard_qr = [&](size_t start, size_t end) {
|
||||
for (size_t i = start; i < end; i++) {
|
||||
Eigen::Map<MartrixXd> martrix_y(y + i * m * n, m, n);
|
||||
Eigen::Map<MartrixXd> martrix_x(x + i * m * n, m, n);
|
||||
MartrixXd tmp = MartrixXd::Identity(m, m);
|
||||
Eigen::Map<VectorXd> vector_tau(tau + i * p, p, 1);
|
||||
for (size_t k = 0; k < p; k++) {
|
||||
VectorXd vector_v = martrix_x.block(k, k, m - k, 1);
|
||||
vector_v[0] = 1;
|
||||
tmp.rightCols(m - k) =
|
||||
tmp.rightCols(m - k) - vector_tau(k) * (tmp.rightCols(m - k) * vector_v) * vector_v.adjoint();
|
||||
}
|
||||
martrix_y = tmp.leftCols(n);
|
||||
}
|
||||
};
|
||||
if (max_core_num == 0) {
|
||||
KERNEL_LOG_ERROR("max_core_num could not be 0.");
|
||||
}
|
||||
KERNEL_HANDLE_ERROR(CpuKernelUtils::ParallelFor(ctx, matrix_num, matrix_num / max_core_num, shard_qr),
|
||||
"Orgqr Compute failed.");
|
||||
}
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
|
||||
REGISTER_CPU_KERNEL(kOrgqr, OrgqrCpuKernel);
|
||||
} // namespace aicpu
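A compact reference sketch (illustrative only; OrgqrReference and the float-only, single-matrix interface are assumptions) of the Householder accumulation used in OrgqrCompute: starting from the identity, each reflector v_k scaled by tau(k) is applied to the trailing columns, and the leading n columns form the orthogonal factor.

#include <Eigen/Dense>

// x holds the reflectors below the diagonal (m x n), tau holds the p scalar factors.
Eigen::MatrixXf OrgqrReference(const Eigen::MatrixXf &x, const Eigen::VectorXf &tau) {
  const Eigen::Index m = x.rows();
  const Eigen::Index n = x.cols();
  const Eigen::Index p = tau.size();
  Eigen::MatrixXf q = Eigen::MatrixXf::Identity(m, m);
  for (Eigen::Index k = 0; k < p; ++k) {
    Eigen::VectorXf v = x.block(k, k, m - k, 1);  // reflector, implicit leading 1
    v(0) = 1.0f;
    // q[:, k:] -= tau(k) * (q[:, k:] * v) * v^T
    q.rightCols(m - k) -= tau(k) * (q.rightCols(m - k) * v) * v.transpose();
  }
  return q.leftCols(n);
}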
|
|
@ -0,0 +1,43 @@
|
|||
/**
|
||||
* Copyright (c) Huawei Technologies Co., Ltd. 2022-2022. All rights reserved.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#ifndef AICPU_KERNELS_NORMALIZED_ORGQR_H_
|
||||
#define AICPU_KERNELS_NORMALIZED_ORGQR_H_
|
||||
|
||||
#include <vector>
|
||||
|
||||
#include "cpu_ops_kernel.h"
|
||||
#include "utils/bcast.h"
|
||||
|
||||
namespace aicpu {
|
||||
class OrgqrCpuKernel : public CpuKernel {
|
||||
public:
|
||||
OrgqrCpuKernel() = default;
|
||||
~OrgqrCpuKernel() = default;
|
||||
|
||||
uint32_t Compute(CpuKernelContext &ctx) override;
|
||||
|
||||
private:
|
||||
uint32_t OrgqrCheck(CpuKernelContext &ctx);
|
||||
|
||||
template <typename T>
|
||||
uint32_t OrgqrCompute(CpuKernelContext &ctx);
|
||||
|
||||
template <typename T>
|
||||
uint32_t OrgqrComputeComplex(CpuKernelContext &ctx);
|
||||
};
|
||||
} // namespace aicpu
|
||||
#endif // AICPU_KERNELS_NORMALIZED_ORGQR_H_
|
|
@ -0,0 +1,140 @@
|
|||
/**
|
||||
* Copyright 2020 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
#include "pack.h"
|
||||
#include <securec.h>
|
||||
#include "cpu_types.h"
|
||||
#include "kernel_log.h"
|
||||
#include "status.h"
|
||||
#include "cpu_kernel_utils.h"
|
||||
#include "utils/eigen_tensor.h"
|
||||
#include "utils/kernel_util.h"
|
||||
#include "unsupported/Eigen/CXX11/Tensor"
|
||||
#include "Eigen/Core"
|
||||
|
||||
namespace {
|
||||
const uint32_t kOutputNum{1u};
|
||||
const uint32_t kInputNum{aicpu::kDynamicInput};
|
||||
const char *kPack = "Pack";
|
||||
// constexpr int64_t kParallelDataNums = 512 * 1024;
|
||||
|
||||
#define PACK_COMPUTE_CASE(DTYPE, TYPE, CTX) \
|
||||
case (DTYPE): { \
|
||||
uint32_t result = PackCompute<TYPE>(CTX); \
|
||||
if (result != KERNEL_STATUS_OK) { \
|
||||
KERNEL_LOG_ERROR("Pack kernel compute failed."); \
|
||||
return result; \
|
||||
} \
|
||||
break; \
|
||||
}
|
||||
} // namespace
|
||||
|
||||
namespace aicpu {
|
||||
uint32_t PackCpuKernel::Compute(CpuKernelContext &ctx) {
|
||||
KERNEL_HANDLE_ERROR(NormalCheck(ctx, kInputNum, kOutputNum), "[%s] check input and output failed.", kPack);
|
||||
KERNEL_HANDLE_ERROR(PackCheck(ctx), "[%s] check params failed.", kPack);
|
||||
DataType data_type = ctx.Input(0)->GetDataType();
|
||||
switch (data_type) {
|
||||
PACK_COMPUTE_CASE(DT_BOOL, bool, ctx)
|
||||
PACK_COMPUTE_CASE(DT_FLOAT16, Eigen::half, ctx)
|
||||
PACK_COMPUTE_CASE(DT_FLOAT, float, ctx)
|
||||
PACK_COMPUTE_CASE(DT_DOUBLE, double, ctx)
|
||||
PACK_COMPUTE_CASE(DT_INT8, int8_t, ctx)
|
||||
PACK_COMPUTE_CASE(DT_INT16, int16_t, ctx)
|
||||
PACK_COMPUTE_CASE(DT_INT32, int32_t, ctx)
|
||||
PACK_COMPUTE_CASE(DT_INT64, int64_t, ctx)
|
||||
PACK_COMPUTE_CASE(DT_UINT8, uint8_t, ctx)
|
||||
PACK_COMPUTE_CASE(DT_UINT16, uint16_t, ctx)
|
||||
PACK_COMPUTE_CASE(DT_UINT32, uint32_t, ctx)
|
||||
PACK_COMPUTE_CASE(DT_UINT64, uint64_t, ctx)
|
||||
PACK_COMPUTE_CASE(DT_COMPLEX64, std::complex<float>, ctx)
|
||||
PACK_COMPUTE_CASE(DT_COMPLEX128, std::complex<double>, ctx)
|
||||
default:
|
||||
KERNEL_LOG_ERROR("Pack kernel data type [%s] not support.", DTypeStr(data_type).c_str());
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
|
||||
uint32_t PackCpuKernel::PackCheck(CpuKernelContext &ctx) {
|
||||
auto *input = ctx.Input(0);
|
||||
AttrValue *n_attr = ctx.GetAttr("N");
|
||||
AttrValue *axis_attr = ctx.GetAttr("axis");
|
||||
int64_t axis = axis_attr->GetInt();
|
||||
auto expanded_num_dims = input->GetTensorShape()->GetDims() + 1; // first_input.dims() + 1;
|
||||
if (axis < 0) axis += expanded_num_dims;
|
||||
|
||||
if (axis < 0 || axis >= expanded_num_dims) {
|
||||
KERNEL_LOG_ERROR("Pack axis error.");
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
|
||||
int64_t input_num = n_attr->GetInt();
|
||||
auto x1_dims = input->GetTensorShape()->GetDims();
|
||||
for (int64_t i = 1; i < input_num; i++) {
|
||||
auto input_dims = ctx.Input(i)->GetTensorShape()->GetDims();
|
||||
if (x1_dims != input_dims) {
|
||||
KERNEL_LOG_ERROR("Pack input dims no equal.");
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
}
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
uint32_t PackCpuKernel::PackCompute(CpuKernelContext &ctx) {
|
||||
AttrValue *axis_attr = ctx.GetAttr("axis");
|
||||
int64_t axis = axis_attr->GetInt();
|
||||
|
||||
AttrValue *n_attr = ctx.GetAttr("N");
|
||||
int64_t input_num = n_attr->GetInt();
|
||||
|
||||
auto *input = ctx.Input(0);
|
||||
auto *output = ctx.Output(0);
|
||||
|
||||
auto expanded_num_dims = input->GetTensorShape()->GetDims() + 1;
|
||||
if (axis < 0) axis += expanded_num_dims;
|
||||
|
||||
std::vector<int64_t> temp_shape = input->GetTensorShape()->GetDimSizes();
|
||||
temp_shape.insert(temp_shape.begin() + axis, input_num);
|
||||
|
||||
auto *y = reinterpret_cast<T *>(output->GetData());
|
||||
int64_t x_NumElements = input->GetTensorShape()->NumElements();
|
||||
|
||||
if (axis == 0) {
|
||||
int64_t num = 0;
|
||||
for (int64_t j = 0; j < input_num; j++) {
|
||||
auto *input_x = reinterpret_cast<T *>(ctx.Input(j)->GetData());
|
||||
auto input_numelements = ctx.Input(j)->GetTensorShape()->NumElements();
|
||||
for (int64_t i = 0; i < input_numelements; i++) {
|
||||
*(y + num) = *(input_x + i);
|
||||
num++;
|
||||
}
|
||||
}
|
||||
} else {
|
||||
int64_t num = 0;
|
||||
for (int64_t j = 0; j < x_NumElements; j++) {
|
||||
for (int64_t i = 0; i < input_num; i++) {
|
||||
auto *input_x = reinterpret_cast<T *>(ctx.Input(i)->GetData());
|
||||
*(y + num) = *(input_x + j);
|
||||
num++;
|
||||
}
|
||||
}
|
||||
}
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
|
||||
REGISTER_CPU_KERNEL(kPack, PackCpuKernel);
|
||||
} // namespace aicpu
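A simplified sketch (plain vectors with equal element counts; the helper name PackFlat is an assumption) of the two copy orders in PackCompute: for axis == 0 the inputs are written one block after another, otherwise the element at each flat position is interleaved across the inputs.

#include <cstdint>
#include <vector>

std::vector<float> PackFlat(const std::vector<std::vector<float>> &inputs, bool axis_is_zero) {
  const int64_t input_num = static_cast<int64_t>(inputs.size());
  const int64_t per_input = static_cast<int64_t>(inputs[0].size());
  std::vector<float> out;
  out.reserve(input_num * per_input);
  if (axis_is_zero) {
    // Block copy: all of input 0, then all of input 1, ...
    for (const auto &in : inputs) {
      out.insert(out.end(), in.begin(), in.end());
    }
  } else {
    // Interleave: element j of every input before moving on to element j + 1.
    for (int64_t j = 0; j < per_input; ++j) {
      for (int64_t i = 0; i < input_num; ++i) {
        out.push_back(inputs[i][j]);
      }
    }
  }
  return out;
}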
|
|
@ -0,0 +1,37 @@
|
|||
/**
|
||||
* Copyright 2022 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
#ifndef AICPU_KERNELS_NORMALIZED_PACK_H_
|
||||
#define AICPU_KERNELS_NORMALIZED_PACK_H_
|
||||
|
||||
#include "cpu_ops_kernel.h"
|
||||
|
||||
namespace aicpu {
|
||||
|
||||
class PackCpuKernel : public CpuKernel {
|
||||
public:
|
||||
~PackCpuKernel() = default;
|
||||
|
||||
protected:
|
||||
uint32_t Compute(CpuKernelContext &ctx);
|
||||
|
||||
uint32_t PackCheck(CpuKernelContext &ctx);
|
||||
|
||||
template <typename T>
|
||||
uint32_t PackCompute(CpuKernelContext &ctx);
|
||||
};
|
||||
|
||||
} // namespace aicpu
|
||||
#endif
|
|
@ -0,0 +1,379 @@
|
|||
/**
|
||||
* Copyright 2021 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include "parameterized_truncated_normal.h"
|
||||
|
||||
#include "cpu_kernel_utils.h"
|
||||
#include "utils/eigen_tensor.h"
|
||||
#include "utils/kernel_util.h"
|
||||
|
||||
#include <Eigen/Dense>
|
||||
#include <algorithm>
|
||||
#include <cmath>
|
||||
#include <iostream>
|
||||
#include <random>
|
||||
|
||||
using namespace std;
|
||||
|
||||
namespace {
|
||||
const uint32_t kOutputNum = 1;
|
||||
const uint32_t kInputNum = 5;
|
||||
const char *kParameterizedTruncatedNormal = "ParameterizedTruncatedNormal";
|
||||
using RNG_Engine = std::mt19937;
|
||||
static constexpr int kMaxIterations = 1000;
|
||||
|
||||
#define BATCH_SIZE_CASE(DTYPE, TYPE, CTX) \
|
||||
case (DTYPE): { \
|
||||
batch_size = int64_t(GetBatchSizeCheckDims<TYPE>(CTX)); \
|
||||
break; \
|
||||
}
|
||||
|
||||
// override functions for half
|
||||
bool isinf(Eigen::half &data) { return Eigen::half_impl::isinf(data); }
|
||||
void swap(Eigen::half &data1, Eigen::half &data2) {
|
||||
Eigen::half tmp = data1;
|
||||
data1 = data2;
|
||||
data2 = tmp;
|
||||
}
|
||||
|
||||
Eigen::half exp(Eigen::half &data) { return Eigen::half_impl::exp(data); }
|
||||
Eigen::half log(Eigen::half &data) { return Eigen::half_impl::log(data); }
|
||||
} // namespace
|
||||
|
||||
namespace aicpu {
|
||||
template <typename T>
|
||||
T GetBatchSizeCheckDims(CpuKernelContext &ctx) {
|
||||
auto output_shape = reinterpret_cast<T *>(ctx.Input(0)->GetData());
|
||||
for (int i = 1; i < ctx.Input(0)->NumElements(); i++) {
|
||||
KERNEL_CHECK_FALSE((output_shape[i] >= 0), KERNEL_STATUS_PARAM_INVALID, "The output dimension must be >= 0.")
|
||||
}
|
||||
return output_shape[0];
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
void Generate(int64_t size, T mean, T stddev, T minval, T maxval, T **output_ptr, RNG_Engine &rng) {
|
||||
auto output = *output_ptr;
|
||||
std::normal_distribution<double> normal_dist(0, 1);
|
||||
std::uniform_real_distribution<double> uniform_dist(0, 1);
|
||||
// Vectorized intermediate calculations for uniform rejection sampling.
|
||||
const T stddev_inside_bound = T(1.3);
|
||||
|
||||
/**
|
||||
* If possible, make the one-sided bound the lower bound, or make both
|
||||
* bounds positive. Otherwise, the bounds are on either side of the
|
||||
* mean.
|
||||
*/
|
||||
if ((isinf(minval) && minval < T(0)) || maxval < mean) {
|
||||
// Reverse all calculations. norm_min and norm_max will be flipped.
|
||||
swap(minval, maxval);
|
||||
stddev = -stddev;
|
||||
}
|
||||
|
||||
auto tmp_num = (stddev == static_cast<T>(0)) ? static_cast<T>(1) : stddev;
|
||||
// Calculate normalized samples, then convert them.
|
||||
const T norm_min = (minval - mean) / tmp_num;
|
||||
const T norm_max = (maxval - mean) / tmp_num;
|
||||
int sample_num = 0;
|
||||
|
||||
// Determine the method to use.
|
||||
const T sqrt_factor = sqrt((norm_min * norm_min) + T(4));
|
||||
const T cutoff = T(2) * exp(T(0.5) + (norm_min * (norm_min - sqrt_factor)) / T(4)) / (norm_min + sqrt_factor);
|
||||
const T diff = norm_max - norm_min;
|
||||
|
||||
if (((norm_min < -stddev_inside_bound) && (norm_max >= T(0.))) ||
|
||||
((norm_max > stddev_inside_bound) && (norm_min <= T(0.)))) {
|
||||
/**
|
||||
* If the bounds are at least 3 standard deviations from the mean
|
||||
* on at least one side then we rejection sample by sampling
|
||||
* from the normal distribution and rejecting samples outside
|
||||
* the bounds.
|
||||
* Under this condition the acceptance rate per iteration should
|
||||
* always be ~ 50%. This sampler is more efficient (and more
|
||||
* numerically stable) when one or both bounds are far from the mean.
|
||||
*/
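// Illustrative numbers (not from the original source): under the branch condition above, with
// norm_min <= -stddev_inside_bound and norm_max >= 0 the acceptance probability per draw is at
// least P(-1.3 <= Z <= 0) ~= 0.40 for Z ~ N(0, 1), so on average an accepted sample is produced
// every two to three iterations and kMaxIterations is effectively never reached.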
|
||||
while (sample_num < size) {
|
||||
for (int iter = 0; iter <= kMaxIterations;) {
|
||||
T normal_sample = T(normal_dist(rng));
|
||||
|
||||
if ((normal_sample >= norm_min) && (normal_sample <= norm_max)) {
|
||||
*output = normal_sample * stddev + mean;
|
||||
if (stddev <= static_cast<T>(0)) {
|
||||
*output = static_cast<T>(INFINITY);
|
||||
} else {
|
||||
output = output + 1;
|
||||
}
|
||||
sample_num++;
|
||||
break;
|
||||
} else {
|
||||
iter++;
|
||||
if (iter > kMaxIterations) {
|
||||
/**
|
||||
* This should never occur because this sampler should
|
||||
* (by the selection criteria above) be used if at least 3
|
||||
* standard deviations of one side of the distribution
|
||||
* is within the limits (so acceptance probability per
|
||||
* iterations >~ 1/2 per iteration).
|
||||
*/
|
||||
KERNEL_LOG_ERROR(
|
||||
"TruncatedNormal randn rejection sampler "
|
||||
"exceeded maximum iterations");
|
||||
*output_ptr = output;
|
||||
return;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
} else if (diff < cutoff) {
|
||||
// Sample from a uniform distribution on [norm_min, norm_max].
|
||||
const T plus_Factor = (norm_min < T(0)) ? T(0) : norm_min * norm_min;
|
||||
|
||||
while (sample_num < size) {
|
||||
for (int iter = 0; iter <= kMaxIterations;) {
|
||||
T uniform_sample = T(uniform_dist(rng));
|
||||
|
||||
T z = uniform_sample * diff + norm_min;
|
||||
T g = (plus_Factor - z * z) / T(2.0);
|
||||
|
||||
bool accept = T(uniform_dist(rng)) <= exp(g);
|
||||
|
||||
if (accept || iter + 1 >= kMaxIterations) {
|
||||
if (!accept) {
|
||||
KERNEL_LOG_ERROR(
|
||||
"TruncatedNormal uniform rejection sampler "
|
||||
"exceeded max iterations. Sample may contain outliers.");
|
||||
*output_ptr = output;
|
||||
return;
|
||||
}
|
||||
|
||||
*output = z * stddev + mean;
|
||||
if (stddev <= static_cast<T>(0)) {
|
||||
*output = static_cast<T>(INFINITY);
|
||||
} else {
|
||||
output = output + 1;
|
||||
}
|
||||
sample_num++;
|
||||
break;
|
||||
|
||||
} else {
|
||||
iter++;
|
||||
}
|
||||
}
|
||||
}
|
||||
} else {
|
||||
/**
|
||||
* Sample from an exponential distribution with alpha maximizing
|
||||
* acceptance probability, offset by norm_min from the origin.
|
||||
* Accept only if less than norm_max.
|
||||
*/
|
||||
const T alpha = (norm_min + sqrt((norm_min * norm_min) + T(4))) / T(2);
|
||||
while (sample_num < size) {
|
||||
for (int iter = 0; iter <= kMaxIterations;) {
|
||||
T uniform_sample = T(uniform_dist(rng));
|
||||
T z = -log(uniform_sample) / alpha + norm_min;
|
||||
const T x = norm_min < alpha ? alpha - z : norm_min - alpha;
|
||||
const T g = exp(-x * x / T(2.0));
|
||||
|
||||
const T u = T(uniform_dist(rng));
|
||||
|
||||
bool accept = (u <= g && z < norm_max);
|
||||
if (accept || iter + 1 >= kMaxIterations) {
|
||||
if (!accept) {
|
||||
KERNEL_LOG_ERROR(
|
||||
"TruncatedNormal exponential distribution "
|
||||
"rejection sampler exceeds max iterations. "
|
||||
"Sample may contain outliers.");
|
||||
*output_ptr = output;
|
||||
return;
|
||||
}
|
||||
*output = z * stddev + mean;
|
||||
output = output + 1;
|
||||
sample_num++;
|
||||
break;
|
||||
} else {
|
||||
iter++;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
*output_ptr = output;
|
||||
return;
|
||||
}
|
||||
|
||||
template <typename T_shape, typename T_val>
|
||||
uint32_t BatchGenerate(CpuKernelContext &ctx) {
|
||||
Tensor *input_0 = ctx.Input(0);
|
||||
auto output_shape = reinterpret_cast<T_shape *>(input_0->GetData());
|
||||
// check shape
|
||||
auto batch_size = output_shape[0];
|
||||
int sample_size = 1;
|
||||
for (int i = 1; i < ctx.Input(0)->NumElements(); i++) {
|
||||
sample_size *= output_shape[i];
|
||||
}
|
||||
|
||||
Tensor *input_3 = ctx.Input(3);
|
||||
Tensor *input_4 = ctx.Input(4);
|
||||
Tensor *input_1 = ctx.Input(1);
|
||||
Tensor *input_2 = ctx.Input(2);
|
||||
Tensor *output = ctx.Output(0);
|
||||
|
||||
auto output_data = reinterpret_cast<T_val *>(output->GetData());
|
||||
auto means = reinterpret_cast<T_val *>(input_1->GetData());
|
||||
auto stdevs = reinterpret_cast<T_val *>(input_2->GetData());
|
||||
auto minvals = reinterpret_cast<T_val *>(input_3->GetData());
|
||||
auto maxvals = reinterpret_cast<T_val *>(input_4->GetData());
|
||||
|
||||
// setup seed
|
||||
int64_t final_seed = 0;
|
||||
auto attr_seed = ctx.GetAttr("seed");
|
||||
if (attr_seed != nullptr) {
|
||||
final_seed = attr_seed->GetInt();
|
||||
}
|
||||
if (final_seed == 0) {
|
||||
auto attr_seed2 = ctx.GetAttr("seed2");
|
||||
if (attr_seed2 != nullptr) {
|
||||
final_seed = attr_seed2->GetInt();
|
||||
}
|
||||
}
|
||||
|
||||
// setup random engine
|
||||
std::random_device r;
|
||||
RNG_Engine rng;
|
||||
final_seed = final_seed ? final_seed : r();
|
||||
rng.seed(final_seed);
|
||||
|
||||
vector<T_val *> params = {means, stdevs, minvals, maxvals};
|
||||
|
||||
vector<int> params_idx;
|
||||
if (input_1->NumElements() > 1) {
|
||||
params_idx.push_back(0);
|
||||
}
|
||||
if (input_2->NumElements() > 1) {
|
||||
params_idx.push_back(1);
|
||||
}
|
||||
if (input_3->NumElements() > 1) {
|
||||
params_idx.push_back(2);
|
||||
}
|
||||
if (input_4->NumElements() > 1) {
|
||||
params_idx.push_back(3);
|
||||
}
|
||||
|
||||
for (int batch = 0; batch < batch_size; batch++) {
|
||||
auto maxval = *params[3];
|
||||
auto minval = *params[2];
|
||||
KERNEL_CHECK_FALSE((maxval > minval), KERNEL_STATUS_PARAM_INVALID,
|
||||
"Max value must be greater than min value in each batch")
|
||||
Generate<T_val>(int64_t(sample_size), *params[0], *params[1], minval, maxval, &output_data, rng);
|
||||
for (auto i : params_idx) {
|
||||
params[i] = params[i] + 1;
|
||||
}
|
||||
}
|
||||
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
|
||||
uint32_t ParameterizedTruncatedNormalCpuKernel::ParameterizedTruncatedNormalCheck(CpuKernelContext &ctx) {
|
||||
DataType val_datatype = ctx.Input(1)->GetDataType();
|
||||
DataType shape_datatype = ctx.Input(0)->GetDataType();
|
||||
|
||||
for (uint32_t i = 0; i < kInputNum; i++) {
|
||||
Tensor *input = ctx.Input(i);
|
||||
|
||||
// check input datatype
|
||||
DataType input_datatype = input->GetDataType();
|
||||
switch (i) {
|
||||
case 0:
|
||||
KERNEL_CHECK_FALSE((input_datatype == DT_INT32 || input_datatype == DT_INT64), KERNEL_STATUS_PARAM_INVALID,
|
||||
"Input[0] data type must DT_INT32 or DT_INT64,"
|
||||
"but got data type[%s].",
|
||||
DTypeStr(input_datatype).c_str());
|
||||
break;
|
||||
case 1:
|
||||
KERNEL_CHECK_FALSE((input_datatype == DT_FLOAT16 || input_datatype == DT_FLOAT || input_datatype == DT_DOUBLE),
|
||||
KERNEL_STATUS_PARAM_INVALID,
|
||||
"Input[1] data type must DT_FLOAT16 or DT_FLOAT or DT_DOUBLE,"
|
||||
"but got data type[%s].",
|
||||
DTypeStr(input_datatype).c_str());
|
||||
break;
|
||||
default:
|
||||
KERNEL_CHECK_FALSE((input_datatype == val_datatype), KERNEL_STATUS_PARAM_INVALID,
|
||||
"The data type of input[%u] [%s] need be same with input[1] [%s].", i,
|
||||
DTypeStr(input_datatype).c_str(), DTypeStr(val_datatype).c_str())
|
||||
}
|
||||
|
||||
// check input dimension
|
||||
auto input_dims = input->GetTensorShape()->GetDims();
|
||||
|
||||
int64_t batch_size = 0;
|
||||
switch (shape_datatype) {
|
||||
BATCH_SIZE_CASE(DT_INT32, int32_t, ctx)
|
||||
BATCH_SIZE_CASE(DT_INT64, int64_t, ctx)
|
||||
default:
|
||||
KERNEL_LOG_ERROR("input0 data type [%u] not support.", shape_datatype);
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
KERNEL_CHECK_FALSE((batch_size >= 0), KERNEL_STATUS_PARAM_INVALID, "The batch size must be >= 0.")
|
||||
|
||||
switch (i) {
|
||||
case 0:
|
||||
KERNEL_CHECK_FALSE((input_dims == 1), KERNEL_STATUS_PARAM_INVALID,
|
||||
"Input[0] should be rank 1, but got rank [%d].", input_dims);
|
||||
break;
|
||||
|
||||
default:
|
||||
KERNEL_CHECK_FALSE((input_dims <= 1), KERNEL_STATUS_PARAM_INVALID,
|
||||
"Input[%u] should be at most rank 1, but got rank [%d].", i, input_dims);
|
||||
if (input_dims == 1) {
|
||||
auto num_of_elems = input->NumElements();
|
||||
|
||||
KERNEL_CHECK_FALSE((num_of_elems == 1 || num_of_elems == batch_size), KERNEL_STATUS_PARAM_INVALID,
|
||||
"Input[%u] length should be 1 or equal to the "
|
||||
"batch size, got %d.",
|
||||
i, num_of_elems);
|
||||
}
|
||||
}
|
||||
}
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
|
||||
void ParameterizedTruncatedNormalCpuKernel::SetMap() {
|
||||
calls_[DT_INT32][DT_FLOAT16] = BatchGenerate<int32_t, Eigen::half>;
|
||||
calls_[DT_INT32][DT_FLOAT] = BatchGenerate<int32_t, float>;
|
||||
calls_[DT_INT32][DT_DOUBLE] = BatchGenerate<int32_t, double>;
|
||||
calls_[DT_INT64][DT_FLOAT16] = BatchGenerate<int64_t, Eigen::half>;
|
||||
calls_[DT_INT64][DT_FLOAT] = BatchGenerate<int64_t, float>;
|
||||
calls_[DT_INT64][DT_DOUBLE] = BatchGenerate<int64_t, double>;
|
||||
}
|
||||
|
||||
uint32_t ParameterizedTruncatedNormalCpuKernel::Compute(CpuKernelContext &ctx) {
|
||||
// check params
|
||||
KERNEL_HANDLE_ERROR(NormalCheck(ctx, kInputNum, kOutputNum),
|
||||
"ParameterizedTruncatedNormal check input and output number failed.");
|
||||
|
||||
KERNEL_HANDLE_ERROR(ParameterizedTruncatedNormalCheck(ctx), "ParameterizedTruncatedNormal check params failed.");
|
||||
|
||||
DataType val_datatype = ctx.Input(1)->GetDataType();
|
||||
DataType shape_datatype = ctx.Input(0)->GetDataType();
|
||||
|
||||
SetMap();
|
||||
calls_[shape_datatype][val_datatype](ctx);
|
||||
calls_.clear();
|
||||
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
|
||||
REGISTER_CPU_KERNEL(kParameterizedTruncatedNormal, ParameterizedTruncatedNormalCpuKernel);
|
||||
} // namespace aicpu
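A minimal standalone sketch (simplified to the plain rejection path only; the function name TruncatedNormal and the double-only interface are assumptions) of the sampling strategy above: draw from N(mean, stddev) and keep samples inside [minval, maxval], capping the attempts per sample the way kMaxIterations does.

#include <cstddef>
#include <random>
#include <vector>

std::vector<double> TruncatedNormal(std::size_t size, double mean, double stddev, double minval, double maxval,
                                    std::mt19937 &rng, int max_iterations = 1000) {
  std::normal_distribution<double> normal(0.0, 1.0);
  std::vector<double> out;
  out.reserve(size);
  while (out.size() < size) {
    bool accepted = false;
    for (int iter = 0; iter < max_iterations; ++iter) {
      const double sample = normal(rng) * stddev + mean;
      if (sample >= minval && sample <= maxval) {
        out.push_back(sample);
        accepted = true;
        break;
      }
    }
    if (!accepted) break;  // give up instead of spinning forever on hopeless bounds
  }
  return out;
}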
|
|
@ -0,0 +1,38 @@
|
|||
/**
|
||||
* Copyright 2021 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
#ifndef AICPU_KERNELS_NORMALIZED_PARAMETERIZEDTRUNCATEDNORMAL_H_
|
||||
#define AICPU_KERNELS_NORMALIZED_PARAMETERIZEDTRUNCATEDNORMAL_H_
|
||||
|
||||
#include "cpu_ops_kernel.h"
|
||||
|
||||
namespace aicpu {
|
||||
class ParameterizedTruncatedNormalCpuKernel : public CpuKernel {
|
||||
public:
|
||||
ParameterizedTruncatedNormalCpuKernel() = default;
|
||||
~ParameterizedTruncatedNormalCpuKernel() override = default;
|
||||
|
||||
protected:
|
||||
uint32_t Compute(CpuKernelContext &ctx) override;
|
||||
|
||||
private:
|
||||
static uint32_t ParameterizedTruncatedNormalCheck(CpuKernelContext &ctx);
|
||||
|
||||
// use map for 2 template parameter functions
|
||||
void SetMap();
|
||||
std::map<int, std::map<int, std::function<void(CpuKernelContext &)>>> calls_;
|
||||
};
|
||||
} // namespace aicpu
|
||||
#endif // AICPU_KERNELS_NORMALIZED_PARAMETERIZEDTRUNCATEDNORMAL_H_
|
|
@ -0,0 +1,185 @@
|
|||
/**
|
||||
* Copyright 2021 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include "pdist_grad.h"
|
||||
|
||||
#include <algorithm>
|
||||
#include <cmath>
|
||||
|
||||
#include "cpu_kernel_utils.h"
|
||||
#include "cpu_types.h"
|
||||
#include "utils/eigen_tensor.h"
|
||||
#include "utils/kernel_util.h"
|
||||
#include "kernel_log.h"
|
||||
#include "status.h"
|
||||
|
||||
namespace {
|
||||
const char *kPdistGrad = "PdistGrad";
|
||||
const uint32_t kOutputNum = 1;
|
||||
const uint32_t kInputNum = 3;
|
||||
constexpr int64_t kParallelDataNums = 16 * 1024;
|
||||
constexpr int64_t kParallelDataNumsMid = 7 * 1024;
|
||||
|
||||
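// SWITCH_PARALLEL(SHARD, end_num, divisor): runs SHARD(0, end_num) serially for
// small workloads; otherwise splits [0, end_num) across up to (CPU count - 2)
// cores (capped at 4 below the kParallelDataNums threshold) via ParallelFor.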
#define SWITCH_PARALLEL(SHARD, end_num, divisor) \
|
||||
if (end_num >= (kParallelDataNumsMid / divisor)) { \
|
||||
uint32_t min_core_num = 1; \
|
||||
int64_t max_core_num = std::max(min_core_num, aicpu::CpuKernelUtils::GetCPUNum(ctx) - 2); \
|
||||
if (end_num < (kParallelDataNums / divisor)) { \
|
||||
max_core_num = std::min(max_core_num, 4L); \
|
||||
} \
|
||||
if (max_core_num > end_num) { \
|
||||
max_core_num = end_num; \
|
||||
} \
|
||||
KERNEL_HANDLE_ERROR(CpuKernelUtils::ParallelFor(ctx, end_num, end_num / max_core_num, SHARD), \
|
||||
"PdistGrad #SHARD Compute failed."); \
|
||||
} else { \
|
||||
SHARD(0, end_num); \
|
||||
}
|
||||
} // namespace
|
||||
|
||||
namespace aicpu {
|
||||
template <typename T>
|
||||
struct Grad {
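// Helper math on T (float or Eigen::half) plus the per-norm backward rules:
// o_grad covers p = 1, t_grad covers p = 2, i_grad covers p = infinity and
// p_grad covers every other p.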
|
||||
static inline T abs(T x) { return static_cast<T>(std::abs(static_cast<float>(x))); }
|
||||
|
||||
static inline T pow(T x, float p) { return static_cast<T>(std::pow(static_cast<float>(x), p)); }
|
||||
|
||||
static inline T sign(T x) { return x > T{0.0f} ? T{1.0f} : T{-1.0f}; }
|
||||
|
||||
struct o_grad {
|
||||
static inline T backward(T diff, T grad, T dist, float p) { return diff > T{0.0f} ? grad : -grad; }
|
||||
};
|
||||
|
||||
struct t_grad {
|
||||
static inline float backward(float diff, float grad, float dist, float p) {
|
||||
return dist == 0.0f ? 0.0f : grad * diff / dist;
|
||||
}
|
||||
|
||||
static inline Eigen::half backward(Eigen::half diff, Eigen::half grad, Eigen::half dist, float p) {
|
||||
return dist == Eigen::half{0.0f} ? Eigen::half{0.0f}
|
||||
: sign(diff) * pow(abs(diff), p - 1) * grad / pow(dist, p - 1);
|
||||
}
|
||||
};
|
||||
|
||||
struct p_grad {
|
||||
static inline T backward(T diff, T grad, T dist, float p) {
|
||||
return dist == T{0.0f} ? T{0.0f} : sign(diff) * pow(abs(diff), p - 1) * grad / pow(dist, p - 1);
|
||||
}
|
||||
};
|
||||
|
||||
struct i_grad {
|
||||
static inline T backward(T diff, T grad, T dist, float p) {
|
||||
return (diff == dist || -diff == dist) ? sign(diff) * grad : T{0.0f};
|
||||
}
|
||||
};
|
||||
|
||||
template <typename S>
|
||||
static uint32_t ParallelForPdistGrad(T *grad, T *x, T *dist, T *y, float p, CpuKernelContext &ctx) {
|
||||
int64_t data_num = ctx.Input(1)->NumElements();
|
||||
int64_t n = ctx.Input(1)->GetTensorShape()->GetDimSize(0);
|
||||
int64_t m = ctx.Input(1)->GetTensorShape()->GetDimSize(1);
|
||||
auto shard_pdistgrad = [&](int64_t start, int64_t end) {
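// Each shard owns a range of feature columns; for a fixed column, (i, j) walk
// every row pair and `index` tracks the matching entry of the flattened
// pairwise-distance vectors grad/dist.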
|
||||
int64_t index;
|
||||
for (int64_t col = start; col < end; col++) {
|
||||
index = 0;
|
||||
for (int64_t i = col; i < data_num; i += m) {
|
||||
for (int64_t j = i + m; j < data_num; j += m) {
|
||||
T diff = x[i] - x[j];
|
||||
if (diff == T{0.0f}) {
|
||||
index++;
|
||||
continue;
|
||||
}
|
||||
T result = S::backward(diff, grad[index], dist[index], p);
|
||||
*(y + i) += result;
|
||||
*(y + j) -= result;
|
||||
index++;
|
||||
}
|
||||
}
|
||||
}
|
||||
};
|
||||
SWITCH_PARALLEL(shard_pdistgrad, m, n);
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
|
||||
static inline uint32_t PdistGradComputeKernel(T *grad, T *x, T *dist, T *y, float p, CpuKernelContext &ctx) {
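// Zero-fills the output, then dispatches on p: p = 0 leaves the gradient at
// zero, p = 1, p = 2 and p = inf use the specialised rules above, and any
// other p falls back to the general p-norm formula.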
|
||||
int64_t data_num = ctx.Input(1)->NumElements();
|
||||
T zero = T{0};
|
||||
auto shard_fill = [&](int64_t start, int64_t end) { std::fill(y + start, y + end, zero); };
|
||||
SWITCH_PARALLEL(shard_fill, data_num, 1);
|
||||
if (p == 0.0) {
|
||||
return KERNEL_STATUS_OK;
|
||||
} else if (p == 1.0) {
|
||||
return ParallelForPdistGrad<o_grad>(grad, x, dist, y, p, ctx);
|
||||
} else if (p == 2.0) {
|
||||
return ParallelForPdistGrad<t_grad>(grad, x, dist, y, p, ctx);
|
||||
} else if (std::isinf(p)) {
|
||||
return ParallelForPdistGrad<i_grad>(grad, x, dist, y, p, ctx);
|
||||
} else {
|
||||
return ParallelForPdistGrad<p_grad>(grad, x, dist, y, p, ctx);
|
||||
}
|
||||
}
|
||||
}; // Grad
|
||||
|
||||
uint32_t PdistGradCpuKernel::Compute(CpuKernelContext &ctx) {
|
||||
KERNEL_HANDLE_ERROR(NormalCheck(ctx, kInputNum, kOutputNum), "PdistGrad check input and output number failed.");
|
||||
DataType input_type = ctx.Input(1)->GetDataType();
|
||||
DataType output_type = ctx.Output(0)->GetDataType();
|
||||
KERNEL_CHECK_FALSE((input_type == output_type), KERNEL_STATUS_PARAM_INVALID,
|
||||
"Input data type[%s] is not equal to output data type[%s].", DTypeStr(input_type).c_str(),
|
||||
DTypeStr(output_type).c_str());
|
||||
uint64_t input_size = ctx.Input(1)->GetDataSize();
|
||||
uint64_t output_size = ctx.Output(0)->GetDataSize();
|
||||
KERNEL_CHECK_FALSE((input_size == output_size), KERNEL_STATUS_PARAM_INVALID,
|
||||
"Input data size[%llu] is not equal to output data size[%llu].", input_size, output_size);
|
||||
switch (input_type) {
|
||||
case DT_FLOAT16:
|
||||
return PdistGradCompute<Eigen::half>(ctx);
|
||||
case DT_FLOAT:
|
||||
return PdistGradCompute<float>(ctx);
|
||||
default:
|
||||
KERNEL_LOG_ERROR("PdistGrad kernel data type [%s] not support.", DTypeStr(input_type).c_str());
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
uint32_t PdistGradCpuKernel::PdistGradCompute(CpuKernelContext &ctx) {
|
||||
Tensor *grad_tensor = ctx.Input(0);
|
||||
Tensor *x_tensor = ctx.Input(1);
|
||||
Tensor *pdist_tensor = ctx.Input(2);
|
||||
Tensor *y_tensor = ctx.Output(0);
|
||||
|
||||
T *grad = reinterpret_cast<T *>(grad_tensor->GetData());
|
||||
T *x = reinterpret_cast<T *>(x_tensor->GetData());
|
||||
T *pdist = reinterpret_cast<T *>(pdist_tensor->GetData());
|
||||
T *y = reinterpret_cast<T *>(y_tensor->GetData());
|
||||
|
||||
float p = 2.0;
|
||||
AttrValue *p_attr = ctx.GetAttr("p");
|
||||
if (p_attr != nullptr) {
|
||||
p = p_attr->GetFloat();
|
||||
}
|
||||
KERNEL_CHECK_FALSE((p >= 0), KERNEL_STATUS_PARAM_INVALID, "Attr[p] data cannot be less than 0.");
|
||||
|
||||
uint32_t ret = Grad<T>::PdistGradComputeKernel(grad, x, pdist, y, p, ctx);
|
||||
if (ret != KERNEL_STATUS_OK) {
|
||||
return ret;
|
||||
}
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
|
||||
REGISTER_CPU_KERNEL(kPdistGrad, PdistGradCpuKernel);
|
||||
} // namespace aicpu
|
|
@ -0,0 +1,35 @@
|
|||
/**
|
||||
* Copyright 2021 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#ifndef AICPU_KERNELS_NORMALIZED_PDIST_GRAD_H_
|
||||
#define AICPU_KERNELS_NORMALIZED_PDIST_GRAD_H_
|
||||
|
||||
#include "cpu_ops_kernel.h"
|
||||
#include "cpu_types.h"
|
||||
|
||||
namespace aicpu {
|
||||
class PdistGradCpuKernel : public CpuKernel {
|
||||
public:
|
||||
PdistGradCpuKernel() = default;
|
||||
~PdistGradCpuKernel() = default;
|
||||
uint32_t Compute(CpuKernelContext &ctx) override;
|
||||
|
||||
private:
|
||||
template <typename T>
|
||||
uint32_t PdistGradCompute(CpuKernelContext &ctx);
|
||||
};
|
||||
} // namespace aicpu
|
||||
#endif
|
|
@ -0,0 +1,82 @@
|
|||
/**
|
||||
* Copyright (c) Huawei Technologies Co., Ltd. 2022-2022. All right reserved.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include "polar.h"
|
||||
|
||||
#include "complex"
|
||||
#include "cpu_kernel_utils.h"
|
||||
#include "iostream"
|
||||
#include "utils/kernel_util.h"
|
||||
|
||||
namespace {
|
||||
const uint32_t kInputNum = 2;
|
||||
const uint32_t kOutputNum = 1;
|
||||
const char *kPolar = "Polar";
|
||||
const int64_t kParallelDataNumMid = 35 * 1024;
|
||||
const int64_t kParallelDataNum = 7 * 1024;
|
||||
} // namespace
|
||||
|
||||
namespace aicpu {
|
||||
uint32_t PolarCpuKernel::Compute(CpuKernelContext &ctx) {
|
||||
DataType abs_type = ctx.Input(0)->GetDataType();
|
||||
DataType angle_type = ctx.Input(1)->GetDataType();
|
||||
KERNEL_HANDLE_ERROR(NormalCheck(ctx, kInputNum, kOutputNum), "Polar check input and output number failed.");
|
||||
if (abs_type == DT_FLOAT && angle_type == DT_FLOAT) {
|
||||
return PolarCompute<float>(ctx);
|
||||
} else if (abs_type == DT_DOUBLE && angle_type == DT_DOUBLE) {
|
||||
return PolarCompute<double>(ctx);
|
||||
} else {
|
||||
KERNEL_LOG_ERROR("Polar kernel data type [%s],[%s] not support.", DTypeStr(abs_type).c_str(),
|
||||
DTypeStr(angle_type).c_str());
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
uint32_t PolarCpuKernel::PolarCompute(CpuKernelContext &ctx) {
|
||||
auto abs = reinterpret_cast<T *>(ctx.Input(0)->GetData());
|
||||
auto angle = reinterpret_cast<T *>(ctx.Input(1)->GetData());
|
||||
auto output = reinterpret_cast<std::complex<T> *>(ctx.Output(0)->GetData());
|
||||
auto input_shape = ctx.Input(0)->GetTensorShape();
|
||||
int64_t elements = input_shape->NumElements();
|
||||
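// Polar-to-Cartesian conversion: output[i] = abs[i] * (cos(angle[i]) + i*sin(angle[i])).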
auto sharder_polar = [&](int64_t start, int64_t end) {
|
||||
for (int64_t i = start; i < end; i++) {
|
||||
output[i].real(abs[i] * cos(angle[i]));
|
||||
output[i].imag(abs[i] * sin(angle[i]));
|
||||
}
|
||||
};
|
||||
if (elements > kParallelDataNum) {
|
||||
uint32_t min_core_num = 1;
|
||||
int64_t max_core_num = std::max(min_core_num, aicpu::CpuKernelUtils::GetCPUNum(ctx) - kResvCpuNum);
|
||||
if (elements <= kParallelDataNumMid) {
|
||||
max_core_num = std::min(max_core_num, static_cast<int64_t>(4)); // up to 4 cpu cores
|
||||
}
|
||||
|
||||
if (max_core_num > elements) {
|
||||
max_core_num = elements;
|
||||
}
|
||||
if (max_core_num > 0) {
|
||||
KERNEL_HANDLE_ERROR(CpuKernelUtils::ParallelFor(ctx, elements, elements / max_core_num, sharder_polar),
|
||||
"Polar Compute failed.");
|
||||
}
|
||||
} else {
|
||||
sharder_polar(0, elements);
|
||||
}
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
REGISTER_CPU_KERNEL(kPolar, PolarCpuKernel);
|
||||
} // namespace aicpu
|
|
@ -0,0 +1,40 @@
|
|||
/**
|
||||
* Copyright (c) Huawei Technologies Co., Ltd. 2022. All rights reserved.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
/*!
|
||||
* \file polar.h
|
||||
* \brief
|
||||
*/
|
||||
#ifndef AICPU_KERNELS_NORMALIZED_POLAR_H_
|
||||
#define AICPU_KERNELS_NORMALIZED_POLAR_H_
|
||||
|
||||
#include "cpu_ops_kernel.h"
|
||||
|
||||
namespace aicpu {
|
||||
class PolarCpuKernel : public CpuKernel {
|
||||
public:
|
||||
PolarCpuKernel() = default;
|
||||
~PolarCpuKernel() override = default;
|
||||
|
||||
protected:
|
||||
uint32_t Compute(CpuKernelContext &ctx) override;
|
||||
|
||||
private:
|
||||
template <typename T>
|
||||
static uint32_t PolarCompute(CpuKernelContext &ctx);
|
||||
};
|
||||
} // namespace aicpu
|
||||
#endif
|
|
@ -0,0 +1,203 @@
|
|||
/**
|
||||
* Copyright 2021 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
#include "ragged_range.h"
|
||||
|
||||
#include <vector>
|
||||
#include <cmath>
|
||||
#include <type_traits>
|
||||
|
||||
#include "cpu_kernel_utils.h"
|
||||
#include "utils/kernel_util.h"
|
||||
#include "utils/eigen_tensor.h"
|
||||
#include "kernel_log.h"
|
||||
#include "status.h"
|
||||
using namespace std;
|
||||
|
||||
namespace {
|
||||
const uint32_t kOutputNum = 2;
|
||||
const uint32_t kInputNum = 3;
|
||||
const char *kRaggedRange = "RaggedRange";
|
||||
constexpr int64_t kParallelDataNums = 16 * 1024;
|
||||
|
||||
#define RAGGEDRANGE_COMPUTE_CASE(DTYPE, TYPE, TSPLITS, NROWS, STARTS, LIMITS, DELTAS, BROADCAST_START, \
|
||||
BROADCAST_LIMITS, BROADCAST_DELTAS, RT_NESTED_SPLITS, RT_DENSE_VALUE, CTX) \
|
||||
case (DTYPE): { \
|
||||
uint32_t result = \
|
||||
RaggedRangeCompute<TYPE, TSPLITS>(NROWS, STARTS, LIMITS, DELTAS, BROADCAST_START, BROADCAST_LIMITS, \
|
||||
BROADCAST_DELTAS, RT_NESTED_SPLITS, RT_DENSE_VALUE, CTX); \
|
||||
if (result != KERNEL_STATUS_OK) { \
|
||||
KERNEL_LOG_ERROR("RaggedRange kernel compute failed."); \
|
||||
return result; \
|
||||
} \
|
||||
break; \
|
||||
}
|
||||
|
||||
} // namespace
|
||||
|
||||
namespace aicpu {
|
||||
uint32_t RaggedRange::Compute(CpuKernelContext &ctx) {
|
||||
KERNEL_HANDLE_ERROR(NormalCheck(ctx, kInputNum, kOutputNum), "RaggedRange check params failed.");
|
||||
Tensor *starts = ctx.Input(0);
|
||||
auto starts_shape = starts->GetTensorShape();
|
||||
int32_t starts_dim = starts_shape->GetDims();
|
||||
|
||||
Tensor *limits = ctx.Input(1);
|
||||
auto limits_shape = limits->GetTensorShape();
|
||||
int32_t limits_dim = limits_shape->GetDims();
|
||||
|
||||
Tensor *deltas = ctx.Input(2);
|
||||
auto deltas_shape = deltas->GetTensorShape();
|
||||
int32_t deltas_dim = deltas_shape->GetDims();
|
||||
|
||||
KERNEL_CHECK_FALSE((starts_dim <= 1), KERNEL_STATUS_PARAM_INVALID, "starts must be a scalar or vector.");
|
||||
KERNEL_CHECK_FALSE((limits_dim <= 1), KERNEL_STATUS_PARAM_INVALID, "limits must be a scalar or vector.");
|
||||
KERNEL_CHECK_FALSE((deltas_dim <= 1), KERNEL_STATUS_PARAM_INVALID, "deltas must be a scalar or vector.");
|
||||
|
||||
bool broadcast_starts = starts_dim == 0;
|
||||
bool broadcast_limits = limits_dim == 0;
|
||||
bool broadcast_deltas = deltas_dim == 0;
|
||||
|
||||
vector<int> in_sizes;
|
||||
if (!broadcast_starts) in_sizes.push_back(starts_shape->GetDimSize(0));
|
||||
if (!broadcast_limits) in_sizes.push_back(limits_shape->GetDimSize(0));
|
||||
if (!broadcast_deltas) in_sizes.push_back(deltas_shape->GetDimSize(0));
|
||||
for (uint32_t i = 1; i < in_sizes.size(); ++i) {
|
||||
KERNEL_CHECK_FALSE((in_sizes[i] == in_sizes[i - 1]), KERNEL_STATUS_PARAM_INVALID,
|
||||
"starts, limits, and deltas must have the same shape.");
|
||||
}
|
||||
|
||||
uint32_t nrows = in_sizes.empty() ? 1 : in_sizes[0];
|
||||
|
||||
AttrValue *attr = ctx.GetAttr("Tsplits");
|
||||
KERNEL_CHECK_NULLPTR(attr, KERNEL_STATUS_PARAM_INVALID, "Get attr[Tsplits] failed.");
|
||||
DataType Tsplits = attr->GetDataType();
|
||||
KERNEL_CHECK_FALSE((Tsplits == DT_INT32 || Tsplits == DT_INT64), KERNEL_STATUS_PARAM_INVALID,
|
||||
"The attr Tsplits must be int32 or int64.");
|
||||
|
||||
Tensor *rt_nested_splits = ctx.Output(0);
|
||||
Tensor *rt_dense_values = ctx.Output(1);
|
||||
|
||||
auto starts_type = starts->GetDataType();
|
||||
auto limits_type = limits->GetDataType();
|
||||
auto deltas_type = deltas->GetDataType();
|
||||
KERNEL_CHECK_FALSE((starts_type == limits_type && limits_type == deltas_type), KERNEL_STATUS_PARAM_INVALID,
|
||||
"starts, limits and deltas must have the same type.");
|
||||
|
||||
if (Tsplits == DT_INT32) {
|
||||
switch (starts_type) {
|
||||
RAGGEDRANGE_COMPUTE_CASE(DT_FLOAT, float, int32_t, nrows, starts, limits, deltas, broadcast_starts,
|
||||
broadcast_limits, broadcast_deltas, rt_nested_splits, rt_dense_values, ctx)
|
||||
RAGGEDRANGE_COMPUTE_CASE(DT_DOUBLE, double, int32_t, nrows, starts, limits, deltas, broadcast_starts,
|
||||
broadcast_limits, broadcast_deltas, rt_nested_splits, rt_dense_values, ctx)
|
||||
RAGGEDRANGE_COMPUTE_CASE(DT_INT32, int32_t, int32_t, nrows, starts, limits, deltas, broadcast_starts,
|
||||
broadcast_limits, broadcast_deltas, rt_nested_splits, rt_dense_values, ctx)
|
||||
RAGGEDRANGE_COMPUTE_CASE(DT_INT64, int64_t, int32_t, nrows, starts, limits, deltas, broadcast_starts,
|
||||
broadcast_limits, broadcast_deltas, rt_nested_splits, rt_dense_values, ctx)
|
||||
default:
|
||||
KERNEL_LOG_ERROR("[%s] Data type of input is not support, input data type is [%s].", ctx.GetOpType().c_str(),
|
||||
DTypeStr(starts_type).c_str());
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
} else {
|
||||
switch (starts_type) {
|
||||
RAGGEDRANGE_COMPUTE_CASE(DT_FLOAT, float, int64_t, nrows, starts, limits, deltas, broadcast_starts,
|
||||
broadcast_limits, broadcast_deltas, rt_nested_splits, rt_dense_values, ctx)
|
||||
RAGGEDRANGE_COMPUTE_CASE(DT_DOUBLE, double, int64_t, nrows, starts, limits, deltas, broadcast_starts,
|
||||
broadcast_limits, broadcast_deltas, rt_nested_splits, rt_dense_values, ctx)
|
||||
RAGGEDRANGE_COMPUTE_CASE(DT_INT32, int32_t, int64_t, nrows, starts, limits, deltas, broadcast_starts,
|
||||
broadcast_limits, broadcast_deltas, rt_nested_splits, rt_dense_values, ctx)
|
||||
RAGGEDRANGE_COMPUTE_CASE(DT_INT64, int64_t, int64_t, nrows, starts, limits, deltas, broadcast_starts,
|
||||
broadcast_limits, broadcast_deltas, rt_nested_splits, rt_dense_values, ctx)
|
||||
default:
|
||||
KERNEL_LOG_ERROR("[%s] Data type of input is not support, input data type is [%s].", ctx.GetOpType().c_str(),
|
||||
DTypeStr(starts_type).c_str());
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
}
|
||||
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
|
||||
template <typename T, typename TSPLITS>
|
||||
uint32_t RaggedRange::RaggedRangeCompute(const uint32_t nrows, Tensor *starts, Tensor *limits, Tensor *deltas,
|
||||
bool broadcast_starts, bool broadcast_limits, bool broadcast_deltas,
|
||||
Tensor *rt_nested_splits, Tensor *rt_dense_values, CpuKernelContext &ctx) {
|
||||
T *starts_addr = reinterpret_cast<T *>(starts->GetData());
|
||||
T *limits_addr = reinterpret_cast<T *>(limits->GetData());
|
||||
T *deltas_addr = reinterpret_cast<T *>(deltas->GetData());
|
||||
|
||||
TSPLITS *rt_nested_splits_addr = reinterpret_cast<TSPLITS *>(rt_nested_splits->GetData());
|
||||
rt_nested_splits_addr[0] = 0;
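// rt_nested_splits is a running prefix sum: entry row + 1 holds the number of
// values generated by rows 0..row, so each row's slice of rt_dense_values is
// [splits[row], splits[row + 1]).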
|
||||
for (uint32_t row = 0; row < nrows; ++row) {
|
||||
T start = broadcast_starts ? starts_addr[0] : starts_addr[row];
|
||||
T limit = broadcast_limits ? limits_addr[0] : limits_addr[row];
|
||||
T delta = broadcast_deltas ? deltas_addr[0] : deltas_addr[row];
|
||||
KERNEL_CHECK_FALSE((delta != 0), KERNEL_STATUS_PARAM_INVALID, "Requires delta != 0.");
|
||||
rt_nested_splits_addr[row + 1] = rt_nested_splits_addr[row] + RangeSize<T, TSPLITS>(start, limit, delta);
|
||||
}
|
||||
|
||||
T *rt_dense_values_addr = reinterpret_cast<T *>(rt_dense_values->GetData());
|
||||
if (nrows <= kParallelDataNums) {
|
||||
int value_index = 0;
|
||||
for (uint32_t row = 0; row < nrows; ++row) {
|
||||
TSPLITS row_size = rt_nested_splits_addr[row + 1] - rt_nested_splits_addr[row];
|
||||
T value = broadcast_starts ? starts_addr[0] : starts_addr[row];
|
||||
T delta = broadcast_deltas ? deltas_addr[0] : deltas_addr[row];
|
||||
for (TSPLITS i = 0; i < row_size; ++i) {
|
||||
rt_dense_values_addr[value_index++] = value;
|
||||
value += delta;
|
||||
}
|
||||
}
|
||||
} else {
|
||||
uint32_t min_core_num = 1;
|
||||
uint32_t max_core_num = std::max(min_core_num, aicpu::CpuKernelUtils::GetCPUNum(ctx));
|
||||
if (max_core_num > nrows) {
|
||||
max_core_num = nrows;
|
||||
}
|
||||
auto shared_rtvalues = [&](size_t start, size_t end) {
|
||||
for (size_t row = start; row < end; row++) {
|
||||
TSPLITS row_size = rt_nested_splits_addr[row + 1] - rt_nested_splits_addr[row];
|
||||
T value = broadcast_starts ? starts_addr[0] : starts_addr[row];
|
||||
T delta = broadcast_deltas ? deltas_addr[0] : deltas_addr[row];
|
||||
TSPLITS y_offset = rt_nested_splits_addr[row];
|
||||
for (TSPLITS i = 0; i < row_size; ++i) {
|
||||
rt_dense_values_addr[y_offset++] = value;
|
||||
value += delta;
|
||||
}
|
||||
}
|
||||
};
|
||||
uint32_t ret = CpuKernelUtils::ParallelFor(ctx, nrows, nrows / max_core_num, shared_rtvalues);
|
||||
if (ret != KERNEL_STATUS_OK) {
|
||||
KERNEL_LOG_ERROR("CpuKernelUtils::ParallelFor failed.");
|
||||
return KERNEL_STATUS_INNER_ERROR;
|
||||
}
|
||||
}
|
||||
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
|
||||
template <typename T, typename TSPLITS>
|
||||
TSPLITS RaggedRange::RangeSize(T start, T limit, T delta) {
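// Number of elements in the arithmetic sequence from start towards limit with
// step delta, e.g. RangeSize(0, 7, 2) == 4 ({0, 2, 4, 6}) and
// RangeSize(5, 5, 1) == 0; the range is empty when delta points away from limit.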
|
||||
if (((delta > 0) && (limit < start)) || ((delta < 0) && (limit > start))) {
|
||||
return 0;
|
||||
}
|
||||
return (std::is_integral<T>::value ? ((std::abs(limit - start) + std::abs(delta) - 1) / std::abs(delta))
|
||||
: std::ceil(std::abs((limit - start) / delta)));
|
||||
}
|
||||
|
||||
REGISTER_CPU_KERNEL(kRaggedRange, RaggedRange);
|
||||
} // namespace aicpu
|
|
@ -0,0 +1,40 @@
|
|||
/**
|
||||
* Copyright 2021 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
#ifndef AICPU_KERNELS_NORMALIZED_RAGGED_RANGE_H_
|
||||
#define AICPU_KERNELS_NORMALIZED_RAGGED_RANGE_H_
|
||||
|
||||
#include "cpu_ops_kernel.h"
|
||||
#include "cpu_types.h"
|
||||
|
||||
namespace aicpu {
|
||||
|
||||
class RaggedRange : public CpuKernel {
|
||||
public:
|
||||
RaggedRange() = default;
|
||||
~RaggedRange() override = default;
|
||||
uint32_t Compute(CpuKernelContext &ctx) override;
|
||||
|
||||
private:
|
||||
template <typename T, typename TSPLITS>
|
||||
uint32_t RaggedRangeCompute(const uint32_t nrows, Tensor *starts, Tensor *limits, Tensor *deltas,
|
||||
bool broadcast_starts, bool broadcast_limits, bool broadcast_deltas,
|
||||
Tensor *rt_nested_splits, Tensor *rt_dense_values, CpuKernelContext &ctx);
|
||||
|
||||
template <typename T, typename TSPLITS>
|
||||
TSPLITS RangeSize(T start, T limit, T delta);
|
||||
};
|
||||
} // namespace aicpu
|
||||
#endif
|
|
@ -0,0 +1,336 @@
|
|||
/**
|
||||
* Copyright (c) Huawei Technologies Co., Ltd. 2021-2022. All rights reserved.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
#include "ragged_tensor_to_sparse.h"
|
||||
|
||||
namespace {
|
||||
const std::uint32_t kInputNum{aicpu::kDynamicInput};
|
||||
const std::uint32_t kOutputNum{3u};
|
||||
const char *kRaggedTensorToSparse = "RaggedTensorToSparse";
|
||||
} // namespace
|
||||
|
||||
namespace aicpu {
|
||||
uint32_t RaggedTensorToSparseCpuKernel::CheckAndInitParams(CpuKernelContext &ctx) {
|
||||
n_ = ctx.GetInputsSize() - 1;
|
||||
KERNEL_CHECK_FALSE((n_ >= 1), KERNEL_STATUS_PARAM_INVALID,
|
||||
"Input num must great equal 1,"
|
||||
"but got input num[%u]",
|
||||
n_);
|
||||
KERNEL_HANDLE_ERROR(NormalCheck(ctx, kInputNum, kOutputNum),
|
||||
"RaggedTensorToSparse check input and output number failed.");
|
||||
Tensor *rt_dense_values_ptr = ctx.Input(n_);
|
||||
KERNEL_CHECK_NULLPTR(rt_dense_values_ptr, KERNEL_STATUS_PARAM_INVALID, "Get input rt_dense_values failed.");
|
||||
auto rt_dense_values_shape_ptr = rt_dense_values_ptr->GetTensorShape();
|
||||
KERNEL_CHECK_NULLPTR(rt_dense_values_shape_ptr, KERNEL_STATUS_PARAM_INVALID,
|
||||
"Get input rt_dense_values shape failed.");
|
||||
DataType rt_dense_values_data_type = rt_dense_values_ptr->GetDataType();
|
||||
KERNEL_CHECK_FALSE((rt_dense_values_data_type == DT_INT32 || rt_dense_values_data_type == DT_INT64 ||
|
||||
rt_dense_values_data_type == DT_BOOL || rt_dense_values_data_type == DT_INT8 ||
|
||||
rt_dense_values_data_type == DT_UINT8 || rt_dense_values_data_type == DT_INT16 ||
|
||||
rt_dense_values_data_type == DT_UINT16 || rt_dense_values_data_type == DT_DOUBLE ||
|
||||
rt_dense_values_data_type == DT_FLOAT || rt_dense_values_data_type == DT_FLOAT16),
|
||||
KERNEL_STATUS_PARAM_INVALID,
|
||||
"Input rt_dense_values data type must {DT_BOOL, DT_INT8, "
|
||||
"DT_UINT8, DT_INT16, DT_UINT16, DT_INT32, DT_INT64, "
|
||||
"DT_DOUBLE, DT_FLOAT, DT_FLOAT16},"
|
||||
"but got data type [%s].",
|
||||
DTypeStr(rt_dense_values_data_type).c_str());
|
||||
auto rt_dense_values_data_ptr = rt_dense_values_ptr->GetData();
|
||||
KERNEL_CHECK_NULLPTR(rt_dense_values_data_ptr, KERNEL_STATUS_PARAM_INVALID, "Get input rt_dense_values data failed.");
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
|
||||
// Validate `rt_nested_splits`
|
||||
template <typename T1>
|
||||
uint32_t RaggedTensorToSparseCpuKernel::ValidateInputs(std::vector<typename TTypes<T1>::Flat> rt_nested_splits,
|
||||
const Tensor *rt_dense_values_in) {
|
||||
for (uint32_t i = 0; i < rt_nested_splits.size(); ++i) {
|
||||
if (rt_nested_splits[i].size() == 0) {
|
||||
KERNEL_LOG_ERROR("ragged splits may not be empty.");
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
if (rt_nested_splits[i](0) != 0) {
|
||||
KERNEL_LOG_ERROR("First value of ragged splits must be 0.");
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
for (uint32_t j = 1; j < rt_nested_splits[i].size(); ++j) {
|
||||
if (rt_nested_splits[i](j) < rt_nested_splits[i](j - 1)) {
|
||||
KERNEL_LOG_ERROR("Ragged splits should be non decreasing.");
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
}
|
||||
if (i > 0) {
|
||||
T1 last_split = rt_nested_splits[i - 1](rt_nested_splits[i - 1].size() - 1);
|
||||
if (rt_nested_splits[i].size() != last_split + 1) {
|
||||
KERNEL_LOG_ERROR(
|
||||
"Final value of ragged splits must match the length "
|
||||
"the corresponding ragged values.");
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
}
|
||||
}
|
||||
if (rt_dense_values_in->GetTensorShape()->GetDimSizes()[0] !=
|
||||
rt_nested_splits.back()(rt_nested_splits.back().size() - 1)) {
|
||||
KERNEL_LOG_ERROR(
|
||||
"Final value of ragged splits must match the length "
|
||||
"the corresponding ragged values.");
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
|
||||
std::vector<std::vector<int64_t>> RaggedTensorToSparseCpuKernel::MakeIndexSuffixes(const TensorShape &values_shape) {
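// Builds the cartesian product of the dense value dimensions (every dim after
// the first), e.g. a values shape of [N, 2, 3] yields the six suffixes
// {0,0}, {0,1}, {0,2}, {1,0}, {1,1}, {1,2}.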
|
||||
std::vector<std::vector<int64_t>> suffixes{{}};
|
||||
for (int32_t dim = 1; dim < values_shape.GetDims(); ++dim) {
|
||||
std::vector<std::vector<int64_t>> new_suffixes;
|
||||
for (const auto &suffix : suffixes) {
|
||||
for (int64_t i = 0; i < values_shape.GetDimSize(dim); ++i) {
|
||||
new_suffixes.push_back(suffix);
|
||||
new_suffixes.back().push_back(i);
|
||||
}
|
||||
}
|
||||
suffixes.swap(new_suffixes);
|
||||
}
|
||||
return suffixes;
|
||||
}
|
||||
|
||||
template <typename T1>
|
||||
bool RaggedTensorToSparseCpuKernel::IsCompleted(const std::vector<int64_t> &pos, int dim,
|
||||
const std::vector<typename TTypes<T1>::Flat> &rt_nested_splits) {
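// A position is completed in dimension `dim` once its child cursor has reached
// the end of that row, i.e. pos[dim + 1] >= rt_nested_splits[dim](pos[dim] + 1).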
|
||||
int64_t current_child = pos[dim + 1];
|
||||
int64_t limit_child = rt_nested_splits[dim](pos[dim] + 1);
|
||||
return current_child >= limit_child;
|
||||
}
|
||||
|
||||
void RaggedTensorToSparseCpuKernel::input_list(CpuKernelContext &ctx, OpInputList *list) {
|
||||
uint32_t start = 0, stop = 0;
|
||||
if (ctx.Input(0)->NumElements() > 0) {
|
||||
stop = start + static_cast<uint32_t>(ctx.Input(0)->NumElements());
|
||||
*list = OpInputList(&ctx, start, stop);
|
||||
}
|
||||
}
|
||||
|
||||
template <typename T1, typename T2>
|
||||
uint32_t RaggedTensorToSparseCpuKernel::DoCompute(CpuKernelContext &ctx) {
|
||||
// Assemble each value in `sparse_indices` using three parts:
|
||||
// - `index_prefix` is the index in dimensions up through the last ragged
|
||||
// dimension.
|
||||
// - `index_middle` is the index in the last ragged dimension.
|
||||
// - `index_suffix` is the index in the dense value dimensions.
|
||||
OpInputList rt_nested_splits_in;
|
||||
input_list(ctx, &rt_nested_splits_in);
|
||||
const int64_t rt_nested_splits_len = n_;
|
||||
std::vector<typename TTypes<T1>::Flat> rt_nested_splits;
|
||||
rt_nested_splits.reserve(n_);
|
||||
for (int i = 0; i < rt_nested_splits_len; ++i) {
|
||||
if (rt_nested_splits_in[i]->NumElements() > 0) {
|
||||
EigenTensor indicesET(rt_nested_splits_in[i], rt_nested_splits_in[i]->GetData());
|
||||
|
||||
rt_nested_splits.push_back(indicesET.flat<T1>());
|
||||
}
|
||||
}
|
||||
|
||||
const Tensor *rt_dense_values_in = ctx.Input(n_);
|
||||
KERNEL_CHECK_FALSE((ValidateInputs<T1>(rt_nested_splits, rt_dense_values_in) == KERNEL_STATUS_OK),
|
||||
KERNEL_STATUS_PARAM_INVALID, "ValidateInputs failed.");
|
||||
KERNEL_CHECK_FALSE((Update<T1>(ctx, rt_nested_splits) == KERNEL_STATUS_OK), KERNEL_STATUS_PARAM_INVALID,
|
||||
"Update failed.");
|
||||
OutPutSparseValues<T2>(ctx);
|
||||
OutPutSparseDenseShape<T1>(ctx, rt_nested_splits_in, rt_nested_splits);
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
|
||||
template <typename T1>
|
||||
uint32_t RaggedTensorToSparseCpuKernel::Update(CpuKernelContext &ctx,
|
||||
std::vector<typename TTypes<T1>::Flat> rt_nested_splits) {
|
||||
const Tensor *rt_dense_values_in = ctx.Input(n_);
|
||||
const int64_t rt_nested_splits_len = n_;
|
||||
|
||||
std::vector<int64_t> index_prefix(n_);
|
||||
std::vector<std::vector<int64_t>> index_suffixes = MakeIndexSuffixes(*rt_dense_values_in->GetTensorShape());
|
||||
|
||||
// Allocate the `sparse_indices` output tensor.
|
||||
const int64_t nvals = (rt_nested_splits.back()(rt_nested_splits.back().size() - 1) * index_suffixes.size());
|
||||
const int64_t indices_len = rt_nested_splits_len + rt_dense_values_in->GetTensorShape()->GetDims();
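// Each sparse index therefore has indices_len components: the n_ entries of
// index_prefix, the offset inside the last ragged row, and one entry per dense
// value dimension beyond the first (index_suffix).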
|
||||
Tensor *sparse_indices = ctx.Output(0);
|
||||
KERNEL_CHECK_NULLPTR((sparse_indices), KERNEL_STATUS_PARAM_INVALID, "Get sparse_indices failed.");
|
||||
sparse_indices->SetDataType(DT_INT64);
|
||||
auto sparse_indices_ptr = reinterpret_cast<int64_t *>(sparse_indices->GetData());
|
||||
KERNEL_CHECK_NULLPTR(sparse_indices_ptr, KERNEL_STATUS_PARAM_INVALID, "Get sparse_indices data failed.");
|
||||
KERNEL_CHECK_NULLPTR(sparse_indices, KERNEL_STATUS_PARAM_INVALID, "Create sparse_indices Flat failed.");
|
||||
|
||||
// pos[i] is the current position in rt_nested_splits[i]. final_pos is a
|
||||
// reference to make it easier to refer to pos[-1].
|
||||
std::vector<int64_t> pos(n_);
|
||||
int64_t &final_pos = pos[n_ - 1];
|
||||
// Each iteration through the loop, we increment pos[-1], and add indices
|
||||
// for all the values corresponding to
|
||||
// rt_nested_splits[-1][pos[-1]:pos[-1]+1].
|
||||
int next_index = 0;
|
||||
int64_t num = 0;
|
||||
int max_final_pos = rt_nested_splits.back().size() - 1;
|
||||
for (; final_pos < max_final_pos; ++final_pos) {
|
||||
// Update `pos` to skip over completed elements (i.e., elements where
|
||||
// we have already generated indices for all contained values).
|
||||
for (int dim = n_ - 2; dim >= 0; --dim) {
|
||||
while (IsCompleted<T1>(pos, dim, rt_nested_splits)) {
|
||||
pos[dim] += 1;
|
||||
}
|
||||
}
|
||||
// Update index_prefix.
|
||||
for (size_t dim = 0; dim < index_prefix.size(); ++dim) {
|
||||
int start = dim > 0 ? rt_nested_splits[dim - 1](pos[dim - 1]) : 0;
|
||||
index_prefix[dim] = pos[dim] - start;
|
||||
}
|
||||
// Get length of the final-ragged-dimension slice.
|
||||
const auto &final_splits = rt_nested_splits[n_ - 1];
|
||||
int64_t slice_len = final_splits(final_pos + 1) - final_splits(final_pos);
|
||||
// Add sparse_indices for this slice.
|
||||
for (int64_t i = 0; i < slice_len; ++i) {
|
||||
for (const auto &index_suffix : index_suffixes) {
|
||||
int dim = 0;
|
||||
for (int64_t index : index_prefix) { // index_prefix
|
||||
sparse_indices_ptr[num++] = index;
|
||||
dim++;
|
||||
}
|
||||
dim++;
|
||||
sparse_indices_ptr[num++] = i;
|
||||
for (int64_t index : index_suffix) { // index_suffix
|
||||
sparse_indices_ptr[num++] = index;
|
||||
dim++;
|
||||
}
|
||||
KERNEL_CHECK_FALSE((dim == indices_len), KERNEL_STATUS_PARAM_INVALID,
|
||||
"dim should be equal to indices_len,but get %d.", dim);
|
||||
++next_index;
|
||||
}
|
||||
}
|
||||
}
|
||||
KERNEL_CHECK_FALSE((next_index == nvals), KERNEL_STATUS_PARAM_INVALID,
|
||||
"next_index should be equal to nvals,but get %d.", next_index);
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
|
||||
template <typename T2>
|
||||
void RaggedTensorToSparseCpuKernel::OutPutSparseValues(CpuKernelContext &ctx) {
|
||||
// Output the `sparse_values` Tensor.
|
||||
const Tensor *rt_dense_values_in = ctx.Input(n_);
|
||||
Tensor *spares_values_out = ctx.Output(1);
|
||||
// The element copy is the same for any rank of rt_dense_values.
spares_values_out->SetDataType(rt_dense_values_in->GetDataType());
auto spares_values_out_ptr = reinterpret_cast<T2 *>(spares_values_out->GetData());
auto rt_dense_values_in_ptr = reinterpret_cast<T2 *>(rt_dense_values_in->GetData());
for (int64_t i = 0; i < rt_dense_values_in->NumElements(); i++) {
spares_values_out_ptr[i] = rt_dense_values_in_ptr[i];
}
|
||||
}
|
||||
|
||||
template <typename T1>
|
||||
void RaggedTensorToSparseCpuKernel::OutPutSparseDenseShape(CpuKernelContext &ctx, OpInputList rt_nested_splits_in,
|
||||
std::vector<typename TTypes<T1>::Flat> rt_nested_splits) {
|
||||
// Output the `sparse_dense_shape` Tensor.
|
||||
const Tensor *rt_dense_values_in = ctx.Input(n_);
|
||||
Tensor *sparse_dense_shape_out = ctx.Output(2);
|
||||
int64_t *sparse_dense_shape = static_cast<int64_t *>(sparse_dense_shape_out->GetData());
|
||||
sparse_dense_shape[0] = rt_nested_splits_in[0]->GetTensorShape()->GetDimSizes()[0] - 1;
|
||||
for (int dim = 0; dim < n_; ++dim) {
|
||||
const auto &splits = rt_nested_splits[dim];
|
||||
T1 max_width = 0;
|
||||
for (int i = 1; i < splits.size(); ++i) {
|
||||
max_width = std::max(max_width, splits(i) - splits(i - 1));
|
||||
}
|
||||
sparse_dense_shape[dim + 1] = max_width;
|
||||
}
|
||||
for (int dim = 1; dim < rt_dense_values_in->GetTensorShape()->GetDims(); ++dim) {
|
||||
sparse_dense_shape[dim + n_] = rt_dense_values_in->GetTensorShape()->GetDimSizes()[dim];
|
||||
}
|
||||
}
|
||||
|
||||
uint32_t RaggedTensorToSparseCpuKernel::Compute(CpuKernelContext &ctx) {
|
||||
KERNEL_CHECK_FALSE((CheckAndInitParams(ctx) == KERNEL_STATUS_OK), KERNEL_STATUS_PARAM_INVALID,
|
||||
"CheckAndInitParams failed.");
|
||||
DataType type1 = ctx.Input(n_)->GetDataType();
|
||||
DataType SplitType = ctx.Input(0)->GetDataType();
|
||||
switch (SplitType) {
|
||||
case DT_INT32:
|
||||
switch (type1) {
|
||||
case DT_DOUBLE:
|
||||
return DoCompute<int32_t, double>(ctx);
|
||||
case DT_FLOAT16:
|
||||
return DoCompute<int32_t, Eigen::half>(ctx);
|
||||
case DT_FLOAT:
|
||||
return DoCompute<int32_t, float>(ctx);
|
||||
case DT_INT8:
|
||||
return DoCompute<int32_t, int8_t>(ctx);
|
||||
case DT_INT16:
|
||||
return DoCompute<int32_t, int16_t>(ctx);
|
||||
case DT_INT32:
|
||||
return DoCompute<int32_t, int32_t>(ctx);
|
||||
case DT_INT64:
|
||||
return DoCompute<int32_t, int64_t>(ctx);
|
||||
case DT_UINT8:
|
||||
return DoCompute<int32_t, uint8_t>(ctx);
|
||||
case DT_UINT16:
|
||||
return DoCompute<int32_t, uint16_t>(ctx);
|
||||
case DT_BOOL:
|
||||
return DoCompute<int32_t, bool>(ctx);
|
||||
default:
|
||||
KERNEL_LOG_ERROR("Unsupported datatype [%s]", DTypeStr(type1).c_str());
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
};
|
||||
break;
|
||||
case DT_INT64:
|
||||
switch (type1) {
|
||||
case DT_DOUBLE:
|
||||
return DoCompute<int64_t, double>(ctx);
|
||||
case DT_FLOAT16:
|
||||
return DoCompute<int64_t, Eigen::half>(ctx);
|
||||
case DT_FLOAT:
|
||||
return DoCompute<int64_t, float>(ctx);
|
||||
case DT_INT8:
|
||||
return DoCompute<int64_t, int8_t>(ctx);
|
||||
case DT_INT16:
|
||||
return DoCompute<int64_t, int16_t>(ctx);
|
||||
case DT_INT32:
|
||||
return DoCompute<int64_t, int32_t>(ctx);
|
||||
case DT_INT64:
|
||||
return DoCompute<int64_t, int64_t>(ctx);
|
||||
case DT_UINT8:
|
||||
return DoCompute<int64_t, uint8_t>(ctx);
|
||||
case DT_UINT16:
|
||||
return DoCompute<int64_t, uint16_t>(ctx);
|
||||
case DT_BOOL:
|
||||
return DoCompute<int64_t, bool>(ctx);
|
||||
default:
|
||||
KERNEL_LOG_ERROR("Unsupported datatype [%s]", DTypeStr(type1).c_str());
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
};
|
||||
break;
|
||||
default:
|
||||
KERNEL_LOG_ERROR("Unsupported datatype [%s]", DTypeStr(SplitType).c_str());
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
}
|
||||
REGISTER_CPU_KERNEL(kRaggedTensorToSparse, RaggedTensorToSparseCpuKernel);
|
||||
} // namespace aicpu
|
|
@ -0,0 +1,87 @@
|
|||
/**
|
||||
* Copyright 2022 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
#ifndef AICPU_KERNELS_NORMALIZED_RAGGED_TENSOR_TO_SPARSE_H_
|
||||
#define AICPU_KERNELS_NORMALIZED_RAGGED_TENSOR_TO_SPARSE_H_
|
||||
|
||||
#include <memory>
|
||||
#include <vector>
|
||||
|
||||
#include "cpu_ops_kernel.h"
|
||||
#include "cpu_kernel_utils.h"
|
||||
#include "kernel_log.h"
|
||||
#include "securec.h"
|
||||
#include "status.h"
|
||||
#include "unsupported/Eigen/CXX11/Tensor"
|
||||
#include "utils/eigen_tensor.h"
|
||||
#include "utils/kernel_util.h"
|
||||
|
||||
namespace aicpu {
|
||||
class OpInputList {
|
||||
public:
|
||||
OpInputList() : ctx_(nullptr), start_(0), stop_(0) {}
|
||||
OpInputList(CpuKernelContext *ctx, uint32_t start, uint32_t stop) : ctx_(ctx), start_(start), stop_(stop) {}
|
||||
OpInputList(const OpInputList &) = default;
|
||||
OpInputList &operator=(const OpInputList &other) = default;
|
||||
Tensor *operator[](uint32_t i) const { return ctx_->Input(start_ + i); }
|
||||
uint32_t size() const { return stop_ - start_; }
|
||||
|
||||
private:
|
||||
CpuKernelContext *ctx_; // not owned
|
||||
uint32_t start_;
|
||||
uint32_t stop_;
|
||||
};
|
||||
|
||||
class RaggedTensorToSparseCpuKernel : public CpuKernel {
|
||||
public:
|
||||
RaggedTensorToSparseCpuKernel() : type1(DT_DOUBLE), n_(1) {}
|
||||
~RaggedTensorToSparseCpuKernel() = default;
|
||||
|
||||
protected:
|
||||
uint32_t Compute(CpuKernelContext &ctx) override;
|
||||
|
||||
private:
|
||||
uint32_t CheckAndInitParams(CpuKernelContext &ctx);
|
||||
|
||||
template <typename T1>
|
||||
uint32_t ValidateInputs(std::vector<typename TTypes<T1>::Flat> rt_nested_splits, const Tensor *rt_dense_values_in);
|
||||
|
||||
std::vector<std::vector<int64_t>> MakeIndexSuffixes(const TensorShape &values_shape);
|
||||
|
||||
template <typename T1>
|
||||
bool IsCompleted(const std::vector<int64_t> &pos, int dim,
|
||||
const std::vector<typename TTypes<T1>::Flat> &rt_nested_splits);
|
||||
|
||||
void input_list(CpuKernelContext &ctx, OpInputList *list);
|
||||
|
||||
template <typename T1, typename T2>
|
||||
uint32_t DoCompute(CpuKernelContext &ctx);
|
||||
|
||||
template <typename T1>
|
||||
uint32_t Update(CpuKernelContext &ctx, std::vector<typename TTypes<T1>::Flat> rt_nested_splits);
|
||||
|
||||
template <typename T2>
|
||||
void OutPutSparseValues(CpuKernelContext &ctx);
|
||||
|
||||
template <typename T1>
|
||||
void OutPutSparseDenseShape(CpuKernelContext &ctx, OpInputList rt_nested_splits_in,
|
||||
std::vector<typename TTypes<T1>::Flat> rt_nested_splits);
|
||||
|
||||
private:
|
||||
DataType type1;
|
||||
int64_t n_;
|
||||
};
|
||||
} // namespace aicpu
|
||||
#endif
|
|
@ -0,0 +1,617 @@
|
|||
/**
|
||||
* Copyright (c) Huawei Technologies Co., Ltd. 2021-2022. All rights reserved.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
#include "ragged_tensor_to_tensor.h"
|
||||
|
||||
namespace {
|
||||
constexpr uint32_t kInputNum = 4;
|
||||
constexpr uint32_t kOutputNum = 1;
|
||||
const char *kRaggedTensorToTensor = "RaggedTensorToTensor";
|
||||
} // namespace
|
||||
|
||||
namespace aicpu {
|
||||
uint32_t RaggedTensorToTensorCpuKernel::Compute(CpuKernelContext &ctx) {
|
||||
KERNEL_HANDLE_ERROR(NormalCheck(ctx, kInputNum, kOutputNum),
|
||||
"RaggedTensorToTensor check input and output number failed.");
|
||||
DataType type1 = ctx.Input(1)->GetDataType();
|
||||
DataType SplitType = ctx.Input(0)->GetDataType();
|
||||
switch (SplitType) {
|
||||
case DT_INT32:
|
||||
switch (type1) {
|
||||
case DT_DOUBLE:
|
||||
return DoCompute<int32_t, double>(ctx);
|
||||
case DT_FLOAT16:
|
||||
return DoCompute<int32_t, Eigen::half>(ctx);
|
||||
case DT_FLOAT:
|
||||
return DoCompute<int32_t, float>(ctx);
|
||||
case DT_INT8:
|
||||
return DoCompute<int32_t, int8_t>(ctx);
|
||||
case DT_INT16:
|
||||
return DoCompute<int32_t, int16_t>(ctx);
|
||||
case DT_INT32:
|
||||
return DoCompute<int32_t, int32_t>(ctx);
|
||||
case DT_INT64:
|
||||
return DoCompute<int32_t, int64_t>(ctx);
|
||||
case DT_UINT8:
|
||||
return DoCompute<int32_t, uint8_t>(ctx);
|
||||
case DT_UINT16:
|
||||
return DoCompute<int32_t, uint16_t>(ctx);
|
||||
case DT_BOOL:
|
||||
return DoCompute<int32_t, bool>(ctx);
|
||||
default: {
|
||||
KERNEL_LOG_ERROR("Unsupported datatype [%s]", DTypeStr(type1).c_str());
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
};
|
||||
break;
|
||||
case DT_INT64:
|
||||
switch (type1) {
|
||||
case DT_DOUBLE:
|
||||
return DoCompute<int64_t, double>(ctx);
|
||||
case DT_FLOAT16:
|
||||
return DoCompute<int64_t, Eigen::half>(ctx);
|
||||
case DT_FLOAT:
|
||||
return DoCompute<int64_t, float>(ctx);
|
||||
case DT_INT8:
|
||||
return DoCompute<int64_t, int8_t>(ctx);
|
||||
case DT_INT16:
|
||||
return DoCompute<int64_t, int16_t>(ctx);
|
||||
case DT_INT32:
|
||||
return DoCompute<int64_t, int32_t>(ctx);
|
||||
case DT_INT64:
|
||||
return DoCompute<int64_t, int64_t>(ctx);
|
||||
case DT_UINT8:
|
||||
return DoCompute<int64_t, uint8_t>(ctx);
|
||||
case DT_UINT16:
|
||||
return DoCompute<int64_t, uint16_t>(ctx);
|
||||
case DT_BOOL:
|
||||
return DoCompute<int64_t, bool>(ctx);
|
||||
default: {
|
||||
KERNEL_LOG_ERROR("Unsupported datatype [%s]", DTypeStr(type1).c_str());
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
};
|
||||
break;
|
||||
default: {
|
||||
KERNEL_LOG_ERROR("Unsupported datatype [%s]", DTypeStr(SplitType).c_str());
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
graphStatus RaggedTensorToTensorCpuKernel::GetRowPartitionTypes(CpuKernelContext &ctx) {
|
||||
std::vector<std::string> partition_types;
|
||||
AttrValue *row_part = ctx.GetAttr("row_partition_types");
|
||||
int64_t N = ctx.Input(0)->GetTensorShape()->GetDims();
|
||||
row_partition_types_.reserve(N);
|
||||
partition_types.reserve(N);
|
||||
if (!row_part) {
|
||||
KERNEL_LOG_ERROR("row_partition_types error.");
|
||||
return GRAPH_FAILED;
|
||||
}
|
||||
partition_types = row_part->GetListString();
|
||||
const std::unordered_map<std::string, RowPartitionType> string_to_type{
{"FIRST_DIM_SIZE", RowPartitionType::FIRST_DIM_SIZE}, {"VALUE_ROWIDS", RowPartitionType::VALUE_ROWIDS},
{"ROW_LENGTHS", RowPartitionType::ROW_LENGTHS},       {"ROW_SPLITS", RowPartitionType::ROW_SPLITS},
{"ROW_LIMITS", RowPartitionType::ROW_LIMITS},         {"ROW_STARTS", RowPartitionType::ROW_STARTS}};

for (const std::string &type_str : partition_types) {
const auto iter = string_to_type.find(type_str);
if (iter == string_to_type.end()) {
KERNEL_LOG_ERROR("Unknown string for partition info type.");
return GRAPH_FAILED;
}
row_partition_types_.push_back(iter->second);
}
|
||||
return GRAPH_SUCCESS;
|
||||
}
|
||||
|
||||
int32_t RaggedTensorToTensorCpuKernel::GetRaggedRank(const std::vector<RowPartitionType> &partition_types) {
|
||||
if (partition_types.empty()) {
|
||||
return 0;
|
||||
}
|
||||
if (partition_types[0] == RowPartitionType::FIRST_DIM_SIZE) {
|
||||
return partition_types.size() - 1;
|
||||
}
|
||||
return partition_types.size();
|
||||
}
|
||||
|
||||
RowPartitionType RaggedTensorToTensorCpuKernel::GetRowPartitionTypeByDimension(int dimension) {
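// FIRST_DIM_SIZE, when present, occupies slot 0 and only describes the
// outermost dimension, so the per-dimension partition entries are shifted by one.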
|
||||
if (row_partition_types_[0] == RowPartitionType::FIRST_DIM_SIZE) {
|
||||
return row_partition_types_[dimension + 1];
|
||||
} else {
|
||||
return row_partition_types_[dimension];
|
||||
}
|
||||
}
|
||||
|
||||
// Returns the relationship between dimension and dimension + 1.
|
||||
template <typename INDEX_TYPE>
|
||||
typename TTypes<INDEX_TYPE>::Flat RaggedTensorToTensorCpuKernel::GetRowPartitionTensor(CpuKernelContext &c,
|
||||
int64_t dimension) {
|
||||
if (row_partition_types_[0] == RowPartitionType::FIRST_DIM_SIZE) {
|
||||
Tensor *row_partition = c.Input(dimension + 1 + kFirstPartitionInputIndex);
|
||||
EigenTensor rowET(row_partition, reinterpret_cast<INDEX_TYPE *>(row_partition->GetData()));
|
||||
typename TTypes<INDEX_TYPE>::Flat flat_tensor = rowET.flat<INDEX_TYPE>();
|
||||
return flat_tensor;
|
||||
} else {
|
||||
Tensor *row_partition = c.Input(dimension + kFirstPartitionInputIndex);
|
||||
EigenTensor rowET(row_partition, reinterpret_cast<INDEX_TYPE *>(row_partition->GetData()));
|
||||
typename TTypes<INDEX_TYPE>::Flat flat_tensor = rowET.flat<INDEX_TYPE>();
|
||||
return flat_tensor;
|
||||
}
|
||||
}
|
||||
|
||||
string RaggedTensorToTensorCpuKernel::RowPartitionTypeToString(RowPartitionType row_partition_type) {
|
||||
switch (row_partition_type) {
|
||||
case RowPartitionType::FIRST_DIM_SIZE:
|
||||
return "FIRST_DIM_SIZE";
|
||||
case RowPartitionType::VALUE_ROWIDS:
|
||||
return "VALUE_ROWIDS";
|
||||
case RowPartitionType::ROW_LENGTHS:
|
||||
return "ROW_LENGTHS";
|
||||
case RowPartitionType::ROW_SPLITS:
|
||||
return "ROW_SPLITS";
|
||||
case RowPartitionType::ROW_LIMITS:
|
||||
return "ROW_LIMITS";
|
||||
case RowPartitionType::ROW_STARTS:
|
||||
return "ROW_STARTS";
|
||||
default:
|
||||
return "UNKNOWN ROW PARTITION TYPE";
|
||||
}
|
||||
}
|
||||
|
||||
graphStatus RaggedTensorToTensorCpuKernel::ValidateDefaultValueShape(const TensorShapeProto &default_value_shape,
|
||||
const TensorShapeProto &value_shape,
|
||||
const char *op_name) {
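// default_value must be broadcastable to a row of values: each of its known
// dimensions must either be 1 or match the corresponding value dimension
// (offset by one, since dim 0 of values is the ragged row axis).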
|
||||
if (default_value_shape.unknown_rank || value_shape.unknown_rank) {
|
||||
return GRAPH_SUCCESS;
|
||||
}
|
||||
if (default_value_shape.dims.size() > value_shape.dims.size()) {
|
||||
KERNEL_LOG_ERROR("default_value must have less dimensions than the values.");
|
||||
return GRAPH_FAILED;
|
||||
}
|
||||
for (size_t i = 0; i < std::min(default_value_shape.dims.size(), value_shape.dims.size() - 1); ++i) {
|
||||
if (default_value_shape.dims[i].size >= 0 && value_shape.dims[i + 1].size >= 0 &&
|
||||
default_value_shape.dims[i].size != 1 && default_value_shape.dims[i].size != value_shape.dims[i + 1].size) {
|
||||
return GRAPH_FAILED;
|
||||
}
|
||||
}
|
||||
return GRAPH_SUCCESS;
|
||||
}
|
||||
|
||||
graphStatus RaggedTensorToTensorCpuKernel::AsProto(Tensor *tshape, TensorShapeProto *proto, std::string name) const {
|
||||
proto->dims.clear();
|
||||
if (name == "shape") {
|
||||
if (tshape->GetTensorShape()) {
|
||||
if ((tshape->GetDataType() == DT_INT32 &&
|
||||
static_cast<int32_t *>(tshape->GetData())[0] == static_cast<int32_t>(-1)) ||
|
||||
(tshape->GetDataType() == DT_INT64 &&
|
||||
static_cast<int64_t *>(tshape->GetData())[0] == static_cast<int64_t>(-1))) {
|
||||
proto->unknown_rank = true;
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
}
|
||||
if (tshape->GetDataType() == DT_INT32) {
|
||||
int64_t dimsnum = tshape->GetTensorShape()->NumElements();
|
||||
Dim tdim;
|
||||
proto->dims.reserve(dimsnum);
|
||||
auto dd = static_cast<int32_t *>(tshape->GetData());
|
||||
for (int64_t i = 0; i < tshape->GetTensorShape()->NumElements(); i++) {
|
||||
tdim.size = dd[i];
|
||||
proto->dims.push_back(tdim);
|
||||
proto->unknown_rank = false;
|
||||
}
|
||||
return KERNEL_STATUS_OK;
|
||||
} else if (tshape->GetDataType() == DT_INT64) {
|
||||
int64_t dimsnum = tshape->GetTensorShape()->NumElements();
|
||||
Dim tdim;
|
||||
proto->dims.reserve(dimsnum);
|
||||
for (int64_t i = 0; i < tshape->GetTensorShape()->NumElements(); i++) {
|
||||
tdim.size = static_cast<int64_t *>(tshape->GetData())[i];
|
||||
proto->dims.push_back(tdim);
|
||||
proto->unknown_rank = false;
|
||||
}
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
KERNEL_LOG_ERROR("Expected an int32 or int64 shape tensor.");
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
} else {
|
||||
if (tshape->GetTensorShape()->GetUnknownRank()) {
|
||||
proto->unknown_rank = true;
|
||||
} else {
|
||||
for (int i = 0; i < tshape->GetTensorShape()->GetDims(); i++) {
|
||||
Dim dim;
|
||||
dim.size = tshape->GetTensorShape()->GetDimSizes()[i];
|
||||
proto->dims.push_back(dim);
|
||||
}
|
||||
}
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
}
|
||||
|
||||
graphStatus RaggedTensorToTensorCpuKernel::CombineRaggedTensorToTensorShapes(int32_t ragged_rank,
|
||||
const TensorShapeProto &shape,
|
||||
const TensorShapeProto &value_shape,
|
||||
TensorShapeProto &output_shape,
|
||||
const char *op_name) {
|
||||
if (value_shape.unknown_rank && shape.unknown_rank) {
|
||||
output_shape.dims.clear();
|
||||
output_shape.unknown_rank = true;
|
||||
return GRAPH_SUCCESS;
|
||||
}
|
||||
if (shape.unknown_rank) {
|
||||
while (output_shape.dims.size() < ragged_rank + value_shape.dims.size()) {
|
||||
Dim temp_dim;
|
||||
temp_dim.size = -1;
|
||||
output_shape.dims.emplace_back(temp_dim);
|
||||
}
|
||||
} else {
|
||||
output_shape = shape;
|
||||
}
|
||||
if (value_shape.unknown_rank) {
|
||||
return GRAPH_SUCCESS;
|
||||
}
|
||||
if (ragged_rank + value_shape.dims.size() != output_shape.dims.size()) {
|
||||
KERNEL_LOG_ERROR(
|
||||
"error:ragged_rank plus value_shape dims should be equal to output dim "
|
||||
"sizes.");
|
||||
return GRAPH_FAILED;
|
||||
}
|
||||
|
||||
for (size_t i = 1; i < value_shape.dims.size(); ++i) {
|
||||
const Dim value_dim = value_shape.dims[i];
|
||||
Dim &output_shape_dim = output_shape.dims.at(output_shape.dims.size() - value_shape.dims.size() + i);
|
||||
if (value_dim.size >= 0) {
|
||||
if (output_shape_dim.size >= 0 && output_shape_dim.size != value_dim.size) {
|
||||
KERNEL_LOG_ERROR("Value and shape dimension are inconsistent.");
|
||||
return GRAPH_FAILED;
|
||||
}
|
||||
if (output_shape_dim.size < 0) {
|
||||
output_shape_dim.size = value_dim.size;
|
||||
}
|
||||
}
|
||||
}
|
||||
return GRAPH_SUCCESS;
|
||||
}
|
||||
|
||||
template <typename INDEX_TYPE>
|
||||
uint32_t RaggedTensorToTensorCpuKernel::CalculateOutputSize(INDEX_TYPE first_dim, CpuKernelContext &c,
|
||||
vector<INDEX_TYPE> *result) {
|
||||
TensorShapeProto value_shape_proto;
|
||||
Tensor *value_ptr = c.Input(kValueInputIndex);
|
||||
AsProto(value_ptr, &value_shape_proto, "value");
|
||||
TensorShapeProto default_value_shape_proto;
|
||||
Tensor *default_value_ptr = c.Input(kDefaultValueInputIndex);
|
||||
AsProto(default_value_ptr, &default_value_shape_proto, "default_value");
|
||||
TensorShapeProto output_shape_proto;
|
||||
Tensor *output_ptr = c.Output(0);
|
||||
KERNEL_CHECK_NULLPTR(output_ptr, KERNEL_STATUS_PARAM_INVALID, "Get output[0] failed.");
|
||||
KERNEL_CHECK_FALSE(
|
||||
(ValidateDefaultValueShape(default_value_shape_proto, value_shape_proto, "RaggedTensorToTensor") != GRAPH_FAILED),
|
||||
KERNEL_STATUS_PARAM_INVALID, "ValidateDefaultValueShape error.");
|
||||
TensorShapeProto shape_proto;
|
||||
{
|
||||
Tensor *shape_ptr = c.Input(kShapeInputIndex);
|
||||
AsProto(shape_ptr, &shape_proto, "shape");
|
||||
}
|
||||
KERNEL_CHECK_FALSE((CombineRaggedTensorToTensorShapes(ragged_rank_, shape_proto, value_shape_proto,
|
||||
output_shape_proto, "RaggedTensorToTensor") != GRAPH_FAILED),
|
||||
KERNEL_STATUS_PARAM_INVALID, "CombineRaggedTensorToTensorShapes error.");
|
||||
result->reserve(output_shape_proto.dims.size());
|
||||
for (unsigned int dim = 0; dim < output_shape_proto.dims.size(); dim++) {
|
||||
// Note that this may be -1 (if dimension size is unknown).
|
||||
result->push_back(output_shape_proto.dims[dim].size);
|
||||
}
|
||||
if ((*result)[0] < 0) {
|
||||
(*result)[0] = first_dim;
|
||||
}
|
||||
for (int i = 1; i <= ragged_rank_; ++i) {
|
||||
KERNEL_CHECK_FALSE(((*result)[i] >= 0), KERNEL_STATUS_PARAM_INVALID, "Output dims within ragged_rank must be non-negative.");
|
||||
}
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
|
||||
/**
|
||||
* The output_index represents the index in the output tensor
|
||||
* where the first element of a particular dimension would be written.
|
||||
* If it is -1, it indicates that the index is out of scope.
|
||||
* Example, given first_dimension = 10, first_dimension_output = 6,
|
||||
* and output_index_multiplier = 100:
|
||||
* result = [0 100 200 300 400 500 -1 -1 -1 -1]
|
||||
* If first_dimension_output = 11 instead, then:
|
||||
* result = [0 100 200 300 400 500 600 700 800 900]
|
||||
*/
|
||||
template <typename INDEX_TYPE>
|
||||
vector<INDEX_TYPE> RaggedTensorToTensorCpuKernel::CalculateFirstParentOutputIndex(INDEX_TYPE first_dimension,
|
||||
INDEX_TYPE output_index_multiplier,
|
||||
INDEX_TYPE first_dimension_output) {
|
||||
const INDEX_TYPE min_dimension = std::min(first_dimension, first_dimension_output);
|
||||
vector<INDEX_TYPE> result;
|
||||
result.reserve(first_dimension);
|
||||
int current_output_index = 0;
|
||||
for (INDEX_TYPE i = 0; i < min_dimension; ++i, current_output_index += output_index_multiplier) {
|
||||
result.push_back(current_output_index);
|
||||
}
|
||||
for (INDEX_TYPE i = min_dimension; i < first_dimension; ++i) {
|
||||
result.push_back(-1);
|
||||
}
|
||||
unsigned int first_dim = static_cast<unsigned int>(first_dimension);
if (result.size() < first_dim) KERNEL_LOG_ERROR("Result size should be greater than or equal to first dim.");
|
||||
return result;
|
||||
}
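// Illustrative sketch (not part of the kernel): replaying the example from the comment above with
// INDEX_TYPE = int32_t would give
//   CalculateFirstParentOutputIndex<int32_t>(10, 100, 6) == {0, 100, 200, 300, 400, 500, -1, -1, -1, -1}
// since only min(first_dimension, first_dimension_output) rows fit inside the output.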
|
||||
|
||||
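// Expands row_split boundaries into one output index per value: within a row, consecutive values
// advance by output_index_multiplier; values beyond output_size columns (or rows whose parent index
// is -1) are marked -1.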
template <typename INDEX_TYPE>
|
||||
uint32_t RaggedTensorToTensorCpuKernel::CalculateOutputIndexRowSplit(const typename TTypes<INDEX_TYPE>::Flat &row_split,
|
||||
const vector<INDEX_TYPE> &parent_output_index,
|
||||
INDEX_TYPE output_index_multiplier,
|
||||
INDEX_TYPE output_size,
|
||||
vector<INDEX_TYPE> *result) {
|
||||
INDEX_TYPE row_split_size = row_split.size();
|
||||
if (row_split_size > 0) {
|
||||
result->reserve(row_split(row_split_size - 1));
|
||||
}
|
||||
for (INDEX_TYPE i = 0; i < row_split_size - 1; ++i) {
|
||||
INDEX_TYPE row_length = row_split(i + 1) - row_split(i);
|
||||
INDEX_TYPE real_length = std::min(output_size, row_length);
|
||||
INDEX_TYPE parent_output_index_current = parent_output_index[i];
|
||||
if (parent_output_index_current == -1) {
|
||||
real_length = 0;
|
||||
}
|
||||
for (INDEX_TYPE j = 0; j < real_length; ++j) {
|
||||
result->push_back(parent_output_index_current);
|
||||
parent_output_index_current += output_index_multiplier;
|
||||
}
|
||||
for (INDEX_TYPE j = 0; j < row_length - real_length; ++j) {
|
||||
result->push_back(-1);
|
||||
}
|
||||
}
|
||||
if (row_split_size > 0) {
|
||||
unsigned int row_split_size1 = row_split(row_split_size - 1);
|
||||
KERNEL_CHECK_FALSE((result->size() >= row_split_size1), KERNEL_STATUS_PARAM_INVALID,
|
||||
"Result size should be greater equal row split size.");
|
||||
}
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
|
||||
// Calculate the output index of the first element of a list.
|
||||
// The parent_output_index is the same computation for the previous list.
|
||||
// -1 indicates an element or list that is out of range.
|
||||
// The output_index_multiplier is the number of output indices one moves
|
||||
// forward for each column.
|
||||
// E.g., given:
|
||||
// value_rowids:[0 1 2 2 2 3 5 5 6]
|
||||
// parent_output_index:[1000 1100 2000 2100 -1 3000 4000]
|
||||
// output_index_multiplier: 10
|
||||
// output_size: 2
|
||||
// You get:
|
||||
// result = [1000 1100 2000 2010 -1 2100 -1 -1 3000]
|
||||
// result[0] = parent_output_index[value_rowids[0]]
|
||||
// result[1] = parent_output_index[value_rowids[1]]
|
||||
// result[2] = parent_output_index[value_rowids[2]]
|
||||
// result[3] = parent_output_index[value_rowids[2] + 10]
|
||||
// result[4] = -1 because it is the third element of its row and output_size is 2.
|
||||
// result[5] = parent_output_index[value_rowids[3]]
|
||||
// result[6] = -1 because parent_output_index[value_rowids[6]] == -1
|
||||
// result[7] = -1 because parent_output_index[value_rowids[6]] == -1
|
||||
// result[8] = parent_output_index[value_rowids[7]]
|
||||
template <typename INDEX_TYPE>
|
||||
uint32_t RaggedTensorToTensorCpuKernel::CalculateOutputIndexValueRowID(
|
||||
const typename TTypes<INDEX_TYPE>::Flat &value_rowids, const vector<INDEX_TYPE> &parent_output_index,
|
||||
INDEX_TYPE output_index_multiplier, INDEX_TYPE output_size, vector<INDEX_TYPE> *result) {
|
||||
const INDEX_TYPE index_size = value_rowids.size();
|
||||
result->reserve(index_size);
|
||||
KERNEL_CHECK_FALSE((index_size != 0), KERNEL_STATUS_PARAM_INVALID, "Index size should not be zero.");
|
||||
INDEX_TYPE current_output_column = 0;
|
||||
unsigned int current_value_rowid = value_rowids(0);
|
||||
KERNEL_CHECK_FALSE((current_value_rowid < parent_output_index.size()), KERNEL_STATUS_PARAM_INVALID,
|
||||
"Current value rowid should be less than parent output index size.");
|
||||
INDEX_TYPE current_output_index = parent_output_index[current_value_rowid];
|
||||
result->push_back(current_output_index);
|
||||
for (INDEX_TYPE i = 1; i < index_size; ++i) {
|
||||
unsigned int next_value_rowid = value_rowids(i);
|
||||
if (next_value_rowid == current_value_rowid && current_output_index >= 0) {
|
||||
++current_output_column;
|
||||
if (current_output_column < output_size) {
|
||||
current_output_index += output_index_multiplier;
|
||||
} else {
|
||||
current_output_index = -1;
|
||||
}
|
||||
}
|
||||
if (next_value_rowid != current_value_rowid) {
|
||||
current_output_column = 0;
|
||||
current_value_rowid = next_value_rowid;
|
||||
if (next_value_rowid >= parent_output_index.size()) {
|
||||
KERNEL_LOG_ERROR("Next value rowid should be less than parent output index size.");
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
current_output_index = parent_output_index[next_value_rowid];
|
||||
}
|
||||
result->push_back(current_output_index);
|
||||
}
|
||||
size_t result_size = result->size();
|
||||
size_t value_rowid_size = value_rowids.size();
|
||||
KERNEL_CHECK_FALSE((result_size == value_rowid_size), KERNEL_STATUS_PARAM_INVALID, "Invalid row ids.");
|
||||
return KERNEL_STATUS_OK;
|
||||
}
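// Illustrative sketch (hypothetical values, mirroring the comment above): with value_rowids = {0, 0, 1},
// parent_output_index = {100, 200}, output_index_multiplier = 10 and output_size = 2, the result is
// {100, 110, 200}: the second value of row 0 advances by the multiplier, and row 1 restarts at its
// parent index.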
|
||||
|
||||
template <typename INDEX_TYPE>
|
||||
uint32_t RaggedTensorToTensorCpuKernel::CalculateOutputIndex(CpuKernelContext &ctx, int64_t dimension,
|
||||
const vector<INDEX_TYPE> &parent_output_index,
|
||||
INDEX_TYPE output_index_multiplier, INDEX_TYPE output_size,
|
||||
vector<INDEX_TYPE> *result) {
|
||||
const typename TTypes<INDEX_TYPE>::Flat row_partition_tensor = GetRowPartitionTensor<INDEX_TYPE>(ctx, dimension);
|
||||
auto partition_type = GetRowPartitionTypeByDimension(dimension);
|
||||
switch (partition_type) {
|
||||
case RowPartitionType::VALUE_ROWIDS:
|
||||
return CalculateOutputIndexValueRowID(row_partition_tensor, parent_output_index, output_index_multiplier,
|
||||
output_size, result);
|
||||
case RowPartitionType::ROW_SPLITS:
|
||||
return CalculateOutputIndexRowSplit(row_partition_tensor, parent_output_index, output_index_multiplier,
|
||||
output_size, result);
|
||||
default:
|
||||
KERNEL_LOG_ERROR("Unsupported partition type:[%s]", RowPartitionTypeToString(partition_type));
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
}
|
||||
|
||||
template <typename INDEX_TYPE>
|
||||
uint32_t RaggedTensorToTensorCpuKernel::GetFirstDimensionSize(CpuKernelContext &ctx, INDEX_TYPE *result) {
|
||||
const Tensor *first_partition_tensor = ctx.Input(kFirstPartitionInputIndex);
|
||||
const RowPartitionType first_partition_type = row_partition_types_[0];
|
||||
|
||||
switch (first_partition_type) {
|
||||
case RowPartitionType::FIRST_DIM_SIZE:
|
||||
*result = static_cast<INDEX_TYPE *>(first_partition_tensor->GetData())[0];
|
||||
return KERNEL_STATUS_OK;
|
||||
case RowPartitionType::VALUE_ROWIDS:
|
||||
KERNEL_LOG_ERROR("Cannot handle VALUE_ROWIDS in first dimension.");
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
case RowPartitionType::ROW_SPLITS:
|
||||
*result = first_partition_tensor->GetTensorShape()->GetDimSizes()[0] - 1;
|
||||
return KERNEL_STATUS_OK;
|
||||
default:
|
||||
KERNEL_LOG_ERROR("Cannot handle type [%s]", RowPartitionTypeToString(first_partition_type));
|
||||
return KERNEL_STATUS_INNER_ERROR;
|
||||
}
|
||||
}
|
||||
|
||||
template <typename INDEX_TYPE, typename VALUE_TYPE>
|
||||
uint32_t RaggedTensorToTensorCpuKernel::DoCompute(CpuKernelContext &ctx) {
|
||||
KERNEL_CHECK_FALSE((GetRowPartitionTypes(ctx) != GRAPH_FAILED), KERNEL_STATUS_PARAM_INVALID,
|
||||
"GetRowPartitionTypes error");
|
||||
ragged_rank_ = GetRaggedRank(row_partition_types_);
|
||||
INDEX_TYPE first_dimension;
|
||||
KERNEL_CHECK_FALSE((GetFirstDimensionSize(ctx, &first_dimension) == 0), KERNEL_STATUS_PARAM_INVALID,
|
||||
"GetFirstDimensionSize error.");
|
||||
vector<INDEX_TYPE> output_size;
|
||||
KERNEL_CHECK_FALSE((CalculateOutputSize(first_dimension, ctx, &output_size) == 0), KERNEL_STATUS_PARAM_INVALID,
|
||||
"CalculateOutputSize error.");
|
||||
|
||||
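// multiplier[i] is the row-major stride of output dimension i, i.e. the product of the sizes of all
// later output dimensions, so a flat output index advances by multiplier[i] per step along dim i.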
vector<INDEX_TYPE> multiplier;
|
||||
multiplier.resize(output_size.size());
|
||||
multiplier[multiplier.size() - 1] = 1;
|
||||
for (int i = output_size.size() - 2; i >= 0; --i) {
|
||||
multiplier[i] = multiplier[i + 1] * output_size[i + 1];
|
||||
}
|
||||
|
||||
Tensor *output_tensor = nullptr;
|
||||
output_tensor = ctx.Output(0);
|
||||
auto output_shape = output_tensor->GetTensorShape();
|
||||
auto output_shape_dims = output_shape->GetDimSizes();
|
||||
for (unsigned int i = 0; i < output_size.size(); i++) {
|
||||
output_shape_dims[i] = output_size[i];
|
||||
}
|
||||
|
||||
const INDEX_TYPE full_size = multiplier[0] * output_size[0];
|
||||
if (full_size > 0) {
|
||||
vector<INDEX_TYPE> output_index = CalculateFirstParentOutputIndex(first_dimension, multiplier[0], output_size[0]);
|
||||
for (int i = 1; i <= ragged_rank_; ++i) {
|
||||
vector<INDEX_TYPE> new_output_index;
|
||||
KERNEL_CHECK_FALSE(
|
||||
(CalculateOutputIndex(ctx, i - 1, output_index, multiplier[i], output_size[i], &new_output_index) == 0),
|
||||
KERNEL_STATUS_PARAM_INVALID, "CalculateOutputIndex error.");
|
||||
output_index = new_output_index;
|
||||
}
|
||||
return SetOutput<INDEX_TYPE, VALUE_TYPE>(ctx, output_index, output_tensor);
|
||||
}
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
|
||||
template <typename INDEX_TYPE, typename VALUE_TYPE>
|
||||
uint32_t RaggedTensorToTensorCpuKernel::SetOutput(CpuKernelContext &ctx, const vector<INDEX_TYPE> &output_index,
|
||||
Tensor *output_tensor) {
|
||||
EigenTensor outputET(output_tensor, reinterpret_cast<INDEX_TYPE *>(output_tensor->GetData()));
|
||||
typename aicpu::TTypes<VALUE_TYPE>::Flat output_flat = outputET.flat<VALUE_TYPE>();
|
||||
const auto value_tensor = ctx.Input(kValueInputIndex);
|
||||
const auto default_value_tensor = ctx.Input(kDefaultValueInputIndex);
|
||||
if (value_tensor->GetTensorShape()->GetDims() == 1) {
|
||||
// Initialize tensor to default_value.
|
||||
VALUE_TYPE *base_output = output_flat.data();
|
||||
VALUE_TYPE *default_value_pt = static_cast<VALUE_TYPE *>(default_value_tensor->GetData());
|
||||
VALUE_TYPE default_value = default_value_pt[0];
|
||||
std::fill(base_output, base_output + output_flat.size(), default_value);
|
||||
EigenTensor valuesET(value_tensor, reinterpret_cast<INDEX_TYPE *>(value_tensor->GetData()));
|
||||
auto values = valuesET.flat<VALUE_TYPE>();
|
||||
unsigned int values_size = values.size();
|
||||
KERNEL_CHECK_FALSE((values_size == output_index.size()), KERNEL_STATUS_PARAM_INVALID,
|
||||
"Values and indices must be equal.");
|
||||
for (unsigned int i = 0; i < values_size; ++i) {
|
||||
if (output_index[i] >= 0) {
|
||||
output_flat(output_index[i]) = values(i);
|
||||
}
|
||||
}
|
||||
} else {
|
||||
auto output_shape = output_tensor->GetTensorShape()->GetDimSizes();
|
||||
auto default_value_shape = default_value_tensor->GetTensorShape()->GetDimSizes();
|
||||
int64_t output_element_size = 1;
|
||||
for (const int64_t &d : output_shape) {
|
||||
output_element_size *= d;
|
||||
}
|
||||
// Initialize tensor to default_value.
|
||||
std::vector<int64_t> broadcast_shape;
|
||||
auto ret = GetBroadcastShape(default_value_shape, output_shape, broadcast_shape);
|
||||
KERNEL_CHECK_FALSE(ret == KERNEL_STATUS_OK, KERNEL_STATUS_PARAM_INVALID, "Broadcast failed.");
|
||||
KERNEL_CHECK_FALSE(broadcast_shape == output_shape, KERNEL_STATUS_PARAM_INVALID,
|
||||
"Unable to broadcast shape of default_value to result.");
|
||||
BroadcastIterator iter(default_value_shape, output_shape, broadcast_shape);
|
||||
auto default_value_addr = reinterpret_cast<VALUE_TYPE *>(default_value_tensor->GetData());
|
||||
auto output_addr = reinterpret_cast<VALUE_TYPE *>(output_tensor->GetData());
|
||||
iter.SetPos(0);
|
||||
for (int i = 0; i < output_element_size; ++i) {
|
||||
output_addr[i] = default_value_addr[iter.GetInputPosA()];
|
||||
iter.GenNextPos();
|
||||
}
|
||||
VALUE_TYPE *base_output = output_flat.data();
|
||||
EigenTensor valuesET(value_tensor, reinterpret_cast<INDEX_TYPE *>(value_tensor->GetData()));
|
||||
auto values = valuesET.flat<VALUE_TYPE>();
|
||||
size_t values_size = values.size();
|
||||
size_t output_index_size = output_index.size();
|
||||
// A value "element" is a group of values that are arranged together.
|
||||
// For example, if the value shape is [3,4,5], then 20 values are in a
|
||||
// value element.
|
||||
unsigned int value_element_size;
|
||||
if (output_index_size != 0) {
|
||||
value_element_size = values_size / output_index_size;
|
||||
} else {
|
||||
KERNEL_LOG_DEBUG("Values and indices must be equal");
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
unsigned int value_element_bytesize = value_element_size * sizeof(VALUE_TYPE);
|
||||
const VALUE_TYPE *values_base = values.data();
|
||||
unsigned int values_dimsize = value_tensor->GetTensorShape()->GetDimSizes()[0];
|
||||
KERNEL_CHECK_FALSE((values_dimsize == output_index_size), KERNEL_STATUS_PARAM_INVALID,
|
||||
"Values and indices must be equal.");
|
||||
KERNEL_CHECK_FALSE((values_size == output_index_size * value_element_size), KERNEL_STATUS_PARAM_INVALID,
|
||||
"Values and indices must be equal.");
|
||||
|
||||
INDEX_TYPE value_index = 0;
|
||||
for (unsigned int i = 0; i < output_index_size; ++i, value_index += value_element_size) {
|
||||
if (output_index[i] >= 0) {
|
||||
VALUE_TYPE *dst = base_output + output_index[i];
|
||||
const VALUE_TYPE *src = values_base + value_index;
|
||||
copy_array<VALUE_TYPE, INDEX_TYPE>(dst, src, value_element_size, value_element_bytesize);
|
||||
}
|
||||
}
|
||||
}
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
|
||||
REGISTER_CPU_KERNEL(kRaggedTensorToTensor, RaggedTensorToTensorCpuKernel);
|
||||
} // namespace aicpu
|
|
@@ -0,0 +1,150 @@
|
|||
/**
|
||||
* Copyright 2021 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
#ifndef AICPU_KERNELS_NORMALIZED_RAGGEDTENSORTOTENSOR_H_
|
||||
#define AICPU_KERNELS_NORMALIZED_RAGGEDTENSORTOTENSOR_H_
|
||||
#include <memory>
|
||||
#include <vector>
|
||||
#include <iostream>
|
||||
#include <string>
|
||||
#include "cpu_ops_kernel.h"
|
||||
#include "cpu_kernel_utils.h"
|
||||
#include "kernel_log.h"
|
||||
#include "securec.h"
|
||||
#include "status.h"
|
||||
#include "utils/eigen_tensor.h"
|
||||
#include "utils/broadcast_iterator.h"
|
||||
#include "utils/kernel_util.h"
|
||||
#include "Eigen/Core"
|
||||
#include "unsupported/Eigen/CXX11/Tensor"
|
||||
#include <unordered_map>
|
||||
using std::string;
|
||||
using std::vector;
|
||||
|
||||
namespace aicpu {
|
||||
struct DimStruct {
|
||||
int64_t size = 1;
|
||||
};
|
||||
using Dim = DimStruct;
|
||||
|
||||
struct TensorShapeProtoStruct {
|
||||
std::vector<Dim> dims;
|
||||
bool unknown_rank = false;
|
||||
};
|
||||
using TensorShapeProto = TensorShapeProtoStruct;
|
||||
|
||||
enum class RowPartitionType { FIRST_DIM_SIZE, VALUE_ROWIDS, ROW_LENGTHS, ROW_SPLITS, ROW_LIMITS, ROW_STARTS };
|
||||
const int kShapeInputIndex = 0;
|
||||
const int kValueInputIndex = 1;
|
||||
const int kDefaultValueInputIndex = 2;
|
||||
const int kFirstPartitionInputIndex = 3;
|
||||
using graphStatus = uint32_t;
|
||||
const graphStatus GRAPH_FAILED = 0xFFFFFFFF;
|
||||
const graphStatus GRAPH_SUCCESS = 0;
|
||||
|
||||
template <typename VALUE_TYPE, typename INDEX_TYPE>
|
||||
void slow_copy_array(VALUE_TYPE *dst, const VALUE_TYPE *src, INDEX_TYPE size) {
|
||||
for (INDEX_TYPE index = 0; index < size; ++index) {
|
||||
dst[index] = src[index];
|
||||
}
|
||||
}
|
||||
|
||||
template <typename VALUE_TYPE, typename INDEX_TYPE>
|
||||
void copy_array(VALUE_TYPE *dst, const VALUE_TYPE *src, INDEX_TYPE size, size_t bytes) {
|
||||
memcpy(dst, src, bytes);
|
||||
}
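// The full specializations below fall back to element-wise copying: std::string is not trivially
// copyable, so memcpy would be unsafe for it, and the same slow path is kept for Eigen::half.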
|
||||
|
||||
template <>
|
||||
inline void copy_array<string, int64_t>(std::string *dst, const string *src, int64_t size, size_t bytes) {
|
||||
slow_copy_array(dst, src, size);
|
||||
}
|
||||
|
||||
template <>
|
||||
inline void copy_array<string, int32_t>(string *dst, const string *src, int32_t size, size_t bytes) {
|
||||
slow_copy_array(dst, src, size);
|
||||
}
|
||||
|
||||
template <>
|
||||
inline void copy_array<Eigen::half, int64_t>(Eigen::half *dst, const Eigen::half *src, int64_t size, size_t bytes) {
|
||||
slow_copy_array(dst, src, size);
|
||||
}
|
||||
|
||||
template <>
|
||||
inline void copy_array<Eigen::half, int32_t>(Eigen::half *dst, const Eigen::half *src, int32_t size, size_t bytes) {
|
||||
slow_copy_array(dst, src, size);
|
||||
}
|
||||
|
||||
class RaggedTensorToTensorCpuKernel : public CpuKernel {
|
||||
public:
|
||||
graphStatus GetRowPartitionTypes(CpuKernelContext &ctx);
|
||||
int32_t GetRaggedRank(const std::vector<RowPartitionType> &partition_types);
|
||||
RowPartitionType GetRowPartitionTypeByDimension(int dimension);
|
||||
|
||||
template <typename INDEX_TYPE>
|
||||
typename TTypes<INDEX_TYPE>::Flat GetRowPartitionTensor(CpuKernelContext &c, int64_t dimension);
|
||||
|
||||
string RowPartitionTypeToString(RowPartitionType row_partition_type);
|
||||
|
||||
graphStatus ValidateDefaultValueShape(const TensorShapeProto &default_value_shape,
|
||||
const TensorShapeProto &value_shape, const char *op_name);
|
||||
|
||||
graphStatus AsProto(Tensor *tshape, TensorShapeProto *proto, std::string name) const;
|
||||
|
||||
graphStatus CombineRaggedTensorToTensorShapes(int32_t ragged_rank, const TensorShapeProto &shape,
|
||||
const TensorShapeProto &value_shape, TensorShapeProto &output_shape,
|
||||
const char *op_name);
|
||||
|
||||
template <typename INDEX_TYPE>
|
||||
uint32_t CalculateOutputSize(INDEX_TYPE first_dim, CpuKernelContext &c, vector<INDEX_TYPE> *result);
|
||||
|
||||
template <typename INDEX_TYPE>
|
||||
vector<INDEX_TYPE> CalculateFirstParentOutputIndex(INDEX_TYPE first_dimension, INDEX_TYPE output_index_multiplier,
|
||||
INDEX_TYPE first_dimension_output);
|
||||
|
||||
template <typename INDEX_TYPE>
|
||||
uint32_t CalculateOutputIndexRowSplit(const typename TTypes<INDEX_TYPE>::Flat &row_split,
|
||||
const vector<INDEX_TYPE> &parent_output_index,
|
||||
INDEX_TYPE output_index_multiplier, INDEX_TYPE output_size,
|
||||
vector<INDEX_TYPE> *result);
|
||||
|
||||
template <typename INDEX_TYPE>
|
||||
uint32_t CalculateOutputIndexValueRowID(const typename TTypes<INDEX_TYPE>::Flat &value_rowids,
|
||||
const vector<INDEX_TYPE> &parent_output_index,
|
||||
INDEX_TYPE output_index_multiplier, INDEX_TYPE output_size,
|
||||
vector<INDEX_TYPE> *result);
|
||||
|
||||
template <typename INDEX_TYPE>
|
||||
uint32_t CalculateOutputIndex(CpuKernelContext &context, int64_t dimension,
|
||||
const vector<INDEX_TYPE> &parent_output_index, INDEX_TYPE output_index_multiplier,
|
||||
INDEX_TYPE output_size, vector<INDEX_TYPE> *result);
|
||||
|
||||
template <typename INDEX_TYPE>
|
||||
uint32_t GetFirstDimensionSize(CpuKernelContext &context, INDEX_TYPE *result);
|
||||
|
||||
template <typename INDEX_TYPE, typename VALUE_TYPE>
|
||||
uint32_t DoCompute(CpuKernelContext &context);
|
||||
|
||||
template <typename INDEX_TYPE, typename VALUE_TYPE>
|
||||
uint32_t SetOutput(CpuKernelContext &context, const vector<INDEX_TYPE> &output_index, Tensor *output_tensor);
|
||||
|
||||
protected:
|
||||
uint32_t Compute(CpuKernelContext &ctx) override;
|
||||
|
||||
private:
|
||||
std::vector<RowPartitionType> row_partition_types_;
|
||||
int ragged_rank_;
|
||||
};
|
||||
} // namespace aicpu
|
||||
#endif
|
|
@@ -0,0 +1,160 @@
|
|||
/**
|
||||
* Copyright 2021 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include "reciprocal.h"
|
||||
|
||||
#include <float.h>
|
||||
#include <complex>
|
||||
|
||||
#include "cpu_kernel_utils.h"
|
||||
#include "cpu_types.h"
|
||||
#include "utils/eigen_tensor.h"
|
||||
#include "utils/kernel_util.h"
|
||||
|
||||
namespace {
|
||||
const char *kReciprocal = "Reciprocal";
|
||||
const size_t kReciprocalInputNum = 1;
|
||||
const size_t kReciprocalOutputNum = 1;
|
||||
constexpr int64_t kParallelDataNums = 32 * 1024;
|
||||
} // namespace
|
||||
|
||||
namespace aicpu {
|
||||
uint32_t ReciprocalCpuKernel::Compute(CpuKernelContext &ctx) {
|
||||
Tensor *x = ctx.Input(0);
|
||||
Tensor *y = ctx.Output(0);
|
||||
KERNEL_HANDLE_ERROR(NormalCheck(ctx, kReciprocalOutputNum, kReciprocalInputNum), "Check Reciprocal params failed.");
|
||||
if (x->GetDataType() != y->GetDataType()) {
|
||||
KERNEL_LOG_ERROR("The data type of the input [%s] need be the same as the output [%s]",
|
||||
DTypeStr(x->GetDataType()).c_str(), DTypeStr(y->GetDataType()).c_str());
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
if (x->GetDataSize() != y->GetDataSize()) {
|
||||
KERNEL_LOG_ERROR(
|
||||
"The data size of the input [%llu] need be the same as the output "
|
||||
"[%llu]",
|
||||
x->GetDataSize(), y->GetDataSize());
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
|
||||
uint64_t data_num = x->NumElements();
|
||||
DataType data_type = x->GetDataType();
|
||||
uint32_t res = KERNEL_STATUS_OK;
|
||||
|
||||
switch (data_type) {
|
||||
case DT_FLOAT:
|
||||
res = ReciprocalCompute<float>(x, y, data_num, ctx);
|
||||
break;
|
||||
case DT_DOUBLE:
|
||||
res = ReciprocalCompute<double>(x, y, data_num, ctx);
|
||||
break;
|
||||
case DT_FLOAT16:
|
||||
res = ReciprocalCompute<Eigen::half>(x, y, data_num, ctx);
|
||||
break;
|
||||
case DT_COMPLEX64:
|
||||
res = ReciprocalComputeComplex<std::complex<float>>(x, y, data_num, ctx);
|
||||
break;
|
||||
case DT_COMPLEX128:
|
||||
res = ReciprocalComputeComplex<std::complex<double>>(x, y, data_num, ctx);
|
||||
break;
|
||||
default:
|
||||
KERNEL_LOG_ERROR("Reciprocal kernel data type [%s] not support", DTypeStr(data_type).c_str());
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
if (res != KERNEL_STATUS_OK) {
|
||||
return KERNEL_STATUS_INNER_ERROR;
|
||||
}
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
uint32_t ReciprocalCpuKernel::ReciprocalCompute(Tensor *x, Tensor *y, uint64_t data_num, CpuKernelContext &ctx) {
|
||||
auto input_x = reinterpret_cast<T *>(x->GetData());
|
||||
auto output_y = reinterpret_cast<T *>(y->GetData());
|
||||
if (data_num <= kParallelDataNums) {
|
||||
for (size_t i = 0; i < data_num; i++) {
|
||||
if (input_x[i] == static_cast<T>(0)) {
|
||||
KERNEL_LOG_ERROR("Reciprocal kernel input[%d] cannot be 0", i);
|
||||
return KERNEL_STATUS_INNER_ERROR;
|
||||
}
|
||||
output_y[i] = static_cast<T>(1) / (input_x[i]);
|
||||
}
|
||||
} else {
|
||||
uint32_t min_core_num = 1;
|
||||
uint64_t max_core_num = std::max(min_core_num, aicpu::CpuKernelUtils::GetCPUNum(ctx) - 2);
|
||||
if (max_core_num > data_num) {
|
||||
max_core_num = data_num;
|
||||
}
|
||||
|
||||
auto shared_reciprocal = [&](size_t start, size_t end) {
|
||||
for (size_t i = start; i < end; i++) {
|
||||
if (input_x[i] == static_cast<T>(0)) {
|
||||
KERNEL_LOG_ERROR("Reciprocal kernel input[%d] cannot be 0", i);
|
||||
return KERNEL_STATUS_INNER_ERROR;
|
||||
}
|
||||
output_y[i] = static_cast<T>(1) / (input_x[i]);
|
||||
}
|
||||
return KERNEL_STATUS_OK;
|
||||
};
|
||||
if (max_core_num == 0) {
KERNEL_LOG_ERROR("max_core_num could not be 0.");
return KERNEL_STATUS_INNER_ERROR;
}
KERNEL_HANDLE_ERROR(CpuKernelUtils::ParallelFor(ctx, data_num, data_num / max_core_num, shared_reciprocal),
"Reciprocal Compute failed.");
|
||||
}
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
uint32_t ReciprocalCpuKernel::ReciprocalComputeComplex(Tensor *x, Tensor *y, uint64_t data_num, CpuKernelContext &ctx) {
|
||||
auto input_x = reinterpret_cast<T *>(x->GetData());
|
||||
auto output_y = reinterpret_cast<T *>(y->GetData());
|
||||
if (data_num <= kParallelDataNums) {
|
||||
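// For a complex z, 1/z = conj(z) / |z|^2, with |z|^2 built from the real and imaginary parts.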
for (size_t i = 0; i < data_num; i++) {
|
||||
output_y[i] = conj(input_x[i]) / (input_x[i].real() * input_x[i].real() + input_x[i].imag() * input_x[i].imag());
|
||||
}
|
||||
} else {
|
||||
uint32_t min_core_num = 1;
|
||||
uint64_t max_core_num = std::max(min_core_num, aicpu::CpuKernelUtils::GetCPUNum(ctx) - 2);
|
||||
if (max_core_num > data_num) {
|
||||
max_core_num = data_num;
|
||||
}
|
||||
auto shared_reciprocal = [&](size_t start, size_t end) {
|
||||
for (size_t i = start; i < end; i++) {
|
||||
output_y[i] =
|
||||
conj(input_x[i]) / (input_x[i].real() * input_x[i].real() + input_x[i].imag() * input_x[i].imag());
|
||||
}
|
||||
};
|
||||
if (max_core_num == 0) {
KERNEL_LOG_ERROR("max_core_num could not be 0.");
return KERNEL_STATUS_INNER_ERROR;
}
KERNEL_HANDLE_ERROR(CpuKernelUtils::ParallelFor(ctx, data_num, data_num / max_core_num, shared_reciprocal),
"Reciprocal Compute failed.");
|
||||
}
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
|
||||
REGISTER_CPU_KERNEL(kReciprocal, ReciprocalCpuKernel);
|
||||
} // namespace aicpu
|
|
@@ -0,0 +1,35 @@
|
|||
/**
|
||||
* Copyright (c) Huawei Technologies Co., Ltd. 2020-2021. All rights reserved.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#ifndef AICPU_KERNELS_NORMALIZED_RECIPROCAL_H_
|
||||
#define AICPU_KERNELS_NORMALIZED_RECIPROCAL_H_
|
||||
|
||||
#include "cpu_ops_kernel.h"
|
||||
|
||||
namespace aicpu {
|
||||
class ReciprocalCpuKernel : public CpuKernel {
|
||||
public:
|
||||
~ReciprocalCpuKernel() = default;
|
||||
uint32_t Compute(CpuKernelContext &ctx) override;
|
||||
|
||||
private:
|
||||
template <typename T>
|
||||
uint32_t ReciprocalCompute(Tensor *x, Tensor *y, uint64_t data_num, CpuKernelContext &ctx);
|
||||
template <typename T>
|
||||
uint32_t ReciprocalComputeComplex(Tensor *x, Tensor *y, uint64_t data_num, CpuKernelContext &ctx);
|
||||
};
|
||||
} // namespace aicpu
|
||||
#endif
|
|
@@ -0,0 +1,155 @@
|
|||
/**
|
||||
* Copyright 2021 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
#include "reciprocal_grad.h"
|
||||
|
||||
#include <float.h>
|
||||
#include <complex>
|
||||
#include <math.h>
|
||||
|
||||
#include "cpu_kernel_utils.h"
|
||||
#include "cpu_types.h"
|
||||
#include "utils/eigen_tensor.h"
|
||||
#include "utils/kernel_util.h"
|
||||
|
||||
namespace {
|
||||
const char *kReciprocalGrad = "ReciprocalGrad";
|
||||
const size_t kReciprocalGradInputNum = 2;
|
||||
const size_t kReciprocalGradOutputNum = 1;
|
||||
constexpr int64_t kParallelDataNums = 64 * 1024;
|
||||
constexpr int64_t kParallelComplexDataNums = 16 * 1024;
|
||||
} // namespace
|
||||
|
||||
namespace aicpu {
|
||||
uint32_t ReciprocalGradCpuKernel::Compute(CpuKernelContext &ctx) {
|
||||
KERNEL_HANDLE_ERROR(NormalCheck(ctx, kReciprocalGradInputNum, kReciprocalGradOutputNum),
|
||||
"Check ReciprocalGrad params failed.");
|
||||
Tensor *y = ctx.Input(0);
|
||||
Tensor *dy = ctx.Input(1);
|
||||
Tensor *z = ctx.Output(0);
|
||||
if (y->GetDataType() != dy->GetDataType()) {
|
||||
KERNEL_LOG_ERROR("The data type of the input2 [%s] need be the same as the input1 [%s]",
|
||||
DTypeStr(dy->GetDataType()).c_str(), DTypeStr(y->GetDataType()).c_str());
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
if (y->GetDataSize() != dy->GetDataSize()) {
|
||||
KERNEL_LOG_ERROR(
|
||||
"The data size of the input2 [%llu] need be the same as the input1 "
|
||||
"[%llu]",
|
||||
dy->GetDataSize(), y->GetDataSize());
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
uint64_t data_num = y->NumElements();
|
||||
DataType data_type = y->GetDataType();
|
||||
uint32_t res = KERNEL_STATUS_OK;
|
||||
|
||||
switch (data_type) {
|
||||
case DT_FLOAT16:
|
||||
res = ReciprocalGradCompute<Eigen::half>(y, dy, z, data_num, ctx);
|
||||
break;
|
||||
case DT_FLOAT:
|
||||
res = ReciprocalGradCompute<float>(y, dy, z, data_num, ctx);
|
||||
break;
|
||||
case DT_DOUBLE:
|
||||
res = ReciprocalGradCompute<double>(y, dy, z, data_num, ctx);
|
||||
break;
|
||||
case DT_COMPLEX64:
|
||||
res = ReciprocalGradComputeComplex<std::complex<float>>(y, dy, z, data_num, ctx);
|
||||
break;
|
||||
case DT_COMPLEX128:
|
||||
res = ReciprocalGradComputeComplex<std::complex<double>>(y, dy, z, data_num, ctx);
|
||||
break;
|
||||
default:
|
||||
KERNEL_LOG_ERROR("ReciprocalGrad invalid input type [%s]", DTypeStr(data_type).c_str());
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
if (res != KERNEL_STATUS_OK) {
|
||||
return KERNEL_STATUS_INNER_ERROR;
|
||||
}
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
uint32_t ReciprocalGradCpuKernel::ReciprocalGradCompute(Tensor *y, Tensor *dy, Tensor *z, uint64_t data_num,
|
||||
CpuKernelContext &ctx) {
|
||||
auto input_y = reinterpret_cast<T *>(y->GetData());
|
||||
auto input_dy = reinterpret_cast<T *>(dy->GetData());
|
||||
auto output_z = reinterpret_cast<T *>(z->GetData());
|
||||
if (data_num <= kParallelDataNums) {
|
||||
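// With y = 1/x, the gradient is dz = dy * d(1/x)/dx = -dy * y^2.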
for (size_t i = 0; i < data_num; i++) {
|
||||
output_z[i] = static_cast<T>(-1) * input_dy[i] * input_y[i] * input_y[i];
|
||||
}
|
||||
} else {
|
||||
uint32_t min_core_num = 1;
|
||||
uint64_t max_core_num = std::max(min_core_num, aicpu::CpuKernelUtils::GetCPUNum(ctx) - 2);
|
||||
if (max_core_num > data_num) {
|
||||
max_core_num = data_num;
|
||||
}
|
||||
auto shard_ReciprocalGrad = [&](size_t start, size_t end) {
|
||||
for (size_t i = start; i < end; i++) {
|
||||
output_z[i] = static_cast<T>(-1) * input_dy[i] * input_y[i] * input_y[i];
|
||||
}
|
||||
};
|
||||
if (max_core_num == 0) {
KERNEL_LOG_ERROR("max_core_num could not be 0.");
return KERNEL_STATUS_INNER_ERROR;
}
KERNEL_HANDLE_ERROR(CpuKernelUtils::ParallelFor(ctx, data_num, data_num / max_core_num, shard_ReciprocalGrad),
"ReciprocalGrad Compute failed.");
|
||||
}
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
uint32_t ReciprocalGradCpuKernel::ReciprocalGradComputeComplex(Tensor *y, Tensor *dy, Tensor *z, uint64_t data_num,
|
||||
CpuKernelContext &ctx) {
|
||||
auto input_y = reinterpret_cast<T *>(y->GetData());
|
||||
auto input_dy = reinterpret_cast<T *>(dy->GetData());
|
||||
auto output_z = reinterpret_cast<T *>(z->GetData());
|
||||
if (data_num <= kParallelComplexDataNums) {
|
||||
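// Complex variant: the gradient is taken against the conjugate, dz = -dy * conj(y^2).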
for (size_t i = 0; i < data_num; i++) {
|
||||
output_z[i] = static_cast<T>(-1) * input_dy[i] * conj(input_y[i] * input_y[i]);
|
||||
}
|
||||
} else {
|
||||
uint32_t min_core_num = 1;
|
||||
uint64_t max_core_num = std::max(min_core_num, aicpu::CpuKernelUtils::GetCPUNum(ctx) - 2);
|
||||
if (max_core_num > data_num) {
|
||||
max_core_num = data_num;
|
||||
}
|
||||
auto shard_ReciprocalGrad = [&](size_t start, size_t end) {
|
||||
for (size_t i = start; i < end; i++) {
|
||||
output_z[i] = static_cast<T>(-1) * input_dy[i] * conj(input_y[i] * input_y[i]);
|
||||
}
|
||||
};
|
||||
if (max_core_num == 0) {
KERNEL_LOG_ERROR("max_core_num could not be 0.");
return KERNEL_STATUS_INNER_ERROR;
}
KERNEL_HANDLE_ERROR(CpuKernelUtils::ParallelFor(ctx, data_num, data_num / max_core_num, shard_ReciprocalGrad),
"ReciprocalGrad Compute failed.");
|
||||
}
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
|
||||
REGISTER_CPU_KERNEL(kReciprocalGrad, ReciprocalGradCpuKernel);
|
||||
} // namespace aicpu
|
|
@@ -0,0 +1,35 @@
|
|||
/**
|
||||
* Copyright 2021 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#ifndef AICPU_KERNELS_NORMALIZED_RECIPROCALGRAD_H_
|
||||
#define AICPU_KERNELS_NORMALIZED_RECIPROCALGRAD_H_
|
||||
|
||||
#include "cpu_ops_kernel.h"
|
||||
|
||||
namespace aicpu {
|
||||
class ReciprocalGradCpuKernel : public CpuKernel {
|
||||
public:
|
||||
~ReciprocalGradCpuKernel() = default;
|
||||
uint32_t Compute(CpuKernelContext &ctx) override;
|
||||
|
||||
private:
|
||||
template <typename T>
|
||||
uint32_t ReciprocalGradCompute(Tensor *y, Tensor *dy, Tensor *z, uint64_t data_num, CpuKernelContext &ctx);
|
||||
template <typename T>
|
||||
uint32_t ReciprocalGradComputeComplex(Tensor *y, Tensor *dy, Tensor *z, uint64_t data_num, CpuKernelContext &ctx);
|
||||
};
|
||||
} // namespace aicpu
|
||||
#endif
|
|
@@ -0,0 +1,487 @@
|
|||
/**
|
||||
* Copyright 2021 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include "reduce_mean.h"
|
||||
|
||||
#include "cpu_kernel_utils.h"
|
||||
#include "utils/eigen_tensor.h"
|
||||
#include "utils/kernel_util.h"
|
||||
#include "algorithm"
|
||||
#include "iostream"
|
||||
|
||||
namespace {
|
||||
const char *kReduceMean = "ReduceMean";
|
||||
|
||||
#define REDUCEMEAN_COMPUTE_CASE(DTYPE, TYPE1, TYPE2, CTX) \
|
||||
case (DTYPE): { \
|
||||
uint32_t result = ReduceMeanCompute<TYPE1, TYPE2>(CTX); \
|
||||
if (result != KERNEL_STATUS_OK) { \
|
||||
KERNEL_LOG_ERROR("ReduceMean kernel compute failed."); \
|
||||
return result; \
|
||||
} \
|
||||
break; \
|
||||
}
|
||||
|
||||
#define REDUCEMEAN_COMPUTE_CASE_CP(DTYPE, TYPE1, TYPE2, CTX) \
|
||||
case (DTYPE): { \
|
||||
uint32_t result = ReduceMeanCompute_Complex<TYPE1, TYPE2>(CTX); \
|
||||
if (result != KERNEL_STATUS_OK) { \
|
||||
KERNEL_LOG_ERROR("ReduceMean kernel compute failed."); \
|
||||
return result; \
|
||||
} \
|
||||
break; \
|
||||
}
|
||||
|
||||
#define REDUCEMEAN_COMPUTE_CASE_ALL(TYPE, CTX) \
|
||||
REDUCEMEAN_COMPUTE_CASE_CP(DT_COMPLEX64, std::complex<float>, TYPE, CTX) \
|
||||
REDUCEMEAN_COMPUTE_CASE_CP(DT_COMPLEX128, std::complex<double>, TYPE, CTX) \
|
||||
REDUCEMEAN_COMPUTE_CASE(DT_DOUBLE, double, TYPE, CTX) \
|
||||
REDUCEMEAN_COMPUTE_CASE(DT_FLOAT, float, TYPE, CTX) \
|
||||
REDUCEMEAN_COMPUTE_CASE(DT_FLOAT16, Eigen::half, TYPE, CTX) \
|
||||
REDUCEMEAN_COMPUTE_CASE(DT_INT8, int8_t, TYPE, CTX) \
|
||||
REDUCEMEAN_COMPUTE_CASE(DT_INT16, int16_t, TYPE, CTX) \
|
||||
REDUCEMEAN_COMPUTE_CASE(DT_INT32, int32_t, TYPE, CTX) \
|
||||
REDUCEMEAN_COMPUTE_CASE(DT_INT64, int64_t, TYPE, CTX) \
|
||||
REDUCEMEAN_COMPUTE_CASE(DT_UINT8, uint8_t, TYPE, CTX) \
|
||||
REDUCEMEAN_COMPUTE_CASE(DT_UINT16, uint16_t, TYPE, CTX) \
|
||||
REDUCEMEAN_COMPUTE_CASE(DT_UINT32, uint32_t, TYPE, CTX) \
|
||||
REDUCEMEAN_COMPUTE_CASE(DT_UINT64, uint64_t, TYPE, CTX)
|
||||
} // namespace
|
||||
|
||||
namespace aicpu {
|
||||
template <typename T>
|
||||
T ComplexDiv(T sum, int64_t num) {
|
||||
T res;
|
||||
auto real = sum.real();
|
||||
auto imag = sum.imag();
|
||||
res.real(real / num);
|
||||
res.imag(imag / num);
|
||||
return res;
|
||||
}
|
||||
|
||||
uint32_t ReduceMeanCpuKernel::Compute(CpuKernelContext &ctx) {
|
||||
// check params
|
||||
uint32_t input_num = ctx.GetInputsSize();
|
||||
uint32_t output_num = ctx.GetOutputsSize();
|
||||
if (input_num != 2 || output_num != 1) {
|
||||
KERNEL_LOG_ERROR("The number of input or output parameters does not match.");
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
Tensor *input_data = ctx.Input(0);
|
||||
KERNEL_CHECK_NULLPTR(input_data->GetData(), KERNEL_STATUS_PARAM_INVALID, "Get input[0] failed.")
|
||||
Tensor *axes_data = ctx.Input(1);
|
||||
KERNEL_CHECK_NULLPTR(axes_data->GetData(), KERNEL_STATUS_PARAM_INVALID, "Get input[1] failed.")
|
||||
Tensor *output_data = ctx.Output(0);
|
||||
KERNEL_CHECK_NULLPTR(output_data->GetData(), KERNEL_STATUS_PARAM_INVALID, "Get output[0] failed.");
|
||||
DataType data_type = ctx.Input(0)->GetDataType();
|
||||
DataType axes_type = ctx.Input(1)->GetDataType();
|
||||
switch (axes_type) {
|
||||
case DT_INT32:
|
||||
switch (data_type) {
|
||||
REDUCEMEAN_COMPUTE_CASE_ALL(int32_t, ctx)
|
||||
default:
|
||||
KERNEL_LOG_ERROR("Input[0] data type[%s] not supported.", DTypeStr(data_type).c_str());
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
break;
|
||||
case DT_INT64:
|
||||
switch (data_type) {
|
||||
REDUCEMEAN_COMPUTE_CASE_ALL(int64_t, ctx)
|
||||
default:
|
||||
KERNEL_LOG_ERROR("Input[0] data type[%s] not supported.", DTypeStr(data_type).c_str());
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
break;
|
||||
default:
|
||||
KERNEL_LOG_ERROR("Input[1] data type[%s] not supported.", DTypeStr(axes_type).c_str());
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
|
||||
/*
|
||||
Calculate the mean of the corresponding dimension data
|
||||
Rule: except for the specified dimension, a set of data with other
|
||||
dimensions unchanged participate in the calculation of a mean.
|
||||
e.g. input_x : float array[2][2][2]={1,2,3,4,5,6,7,8}
|
||||
axes : [1 , 2]
|
||||
output:[2.5, 6.5]
|
||||
2.5 is calculated from array[0][0][0], array[0][0][1],
|
||||
array[0][1][0] and array[0][1][1]
|
||||
The same group of data addresses involved in calculating the
|
||||
mean consists of one same base address and different offset addresses
|
||||
input_data_address = base_address + offset_address
|
||||
*/
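// Continuing the example above (added for clarity): output[0] = 2.5 uses base address 0 with offsets
// {0, 1, 2, 3} (elements 1, 2, 3, 4), and output[1] = 6.5 uses base address 4 with the same offsets
// (elements 5, 6, 7, 8).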
|
||||
template <typename T1, typename T2>
|
||||
uint32_t ReduceMeanCpuKernel::ReduceMeanCompute(CpuKernelContext &ctx) {
|
||||
Tensor *input_data = ctx.Input(0);
|
||||
auto input_data_addr = reinterpret_cast<T1 *>(input_data->GetData());
|
||||
const int64_t input_data_num = input_data->NumElements();
|
||||
auto input_data_shape = input_data->GetTensorShape();
|
||||
const int32_t input_data_dims = input_data_shape->GetDims();
|
||||
std::vector<int64_t> input_data_dimsize = input_data_shape->GetDimSizes();
|
||||
std::vector<int64_t> dims_addr(input_data_dims);
|
||||
dims_addr[input_data_dims - 1] = 1;
|
||||
int64_t addr_tmp = 1;
|
||||
for (int32_t i = input_data_dims - 2; i > -1; i--) {
|
||||
addr_tmp *= input_data_dimsize[i + 1];
|
||||
dims_addr[i] = addr_tmp;
|
||||
}
|
||||
Tensor *output_data = ctx.Output(0);
|
||||
auto output_data_shape = output_data->GetTensorShape();
|
||||
auto output_data_addr = reinterpret_cast<T1 *>(output_data->GetData());
|
||||
const int64_t output_data_num = output_data->NumElements();
|
||||
Tensor *axes_data = ctx.Input(1);
|
||||
auto axes_data_addr = reinterpret_cast<T2 *>(axes_data->GetData());
|
||||
int64_t axes_data_num = axes_data->NumElements();
|
||||
// Check the effectiveness of the value of axes
|
||||
for (int64_t i = 0; i < axes_data_num; i++) {
|
||||
if ((*(axes_data_addr + i) >= input_data_dims) || (*(axes_data_addr + i) < -input_data_dims)) {
|
||||
KERNEL_LOG_ERROR("The value of axes is incorrect.");
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
} else if (*(axes_data_addr + i) < 0) {
|
||||
*(axes_data_addr + i) += input_data_dims;
|
||||
}
|
||||
}
|
||||
std::sort(axes_data_addr, axes_data_addr + axes_data_num);
|
||||
// Collect the sorted axes without duplicates; axes_data_num becomes the number of unique axes.
std::vector<T2> axes_data_norepeat;
for (int64_t i = 0; i < axes_data_num; i++) {
T2 value = axes_data_addr[i];
if (axes_data_norepeat.empty() || axes_data_norepeat.back() != value) {
axes_data_norepeat.push_back(value);
}
}
axes_data_num = static_cast<int64_t>(axes_data_norepeat.size());
|
||||
// deal with attr
|
||||
auto attr_value = ctx.GetAttr("keep_dims");
|
||||
bool keep_dims;
|
||||
if (attr_value == nullptr) {
|
||||
keep_dims = false;
|
||||
} else {
|
||||
keep_dims = static_cast<bool>(attr_value->GetBool());
|
||||
}
|
||||
if (axes_data_num == input_data_dims) {
|
||||
if (keep_dims) {
|
||||
std::vector<int64_t> dims_new(axes_data_num, 1);
|
||||
output_data_shape->SetDimSizes(dims_new);
|
||||
} else {
|
||||
std::vector<int64_t> dims_new(1, 1);
|
||||
output_data_shape->SetDimSizes(dims_new);
|
||||
}
|
||||
T1 data_sum = static_cast<T1>(0);
|
||||
for (int64_t i = 0; i < input_data_num; i++) {
|
||||
data_sum += input_data_addr[i];
|
||||
}
|
||||
output_data_addr[0] = data_sum / input_data_num;
|
||||
} else {
|
||||
std::vector<int64_t> dims_new(input_data_shape->GetDimSizes());
|
||||
if (keep_dims) {
|
||||
for (auto iter = axes_data_norepeat.cbegin(); iter != axes_data_norepeat.cend(); iter++) {
|
||||
dims_new[*iter] = 1;
|
||||
}
|
||||
} else {
|
||||
for (auto iter = axes_data_norepeat.rbegin(); iter != axes_data_norepeat.rend(); iter++) {
|
||||
dims_new.erase(dims_new.begin() + (*iter));
|
||||
}
|
||||
}
|
||||
output_data_shape->SetDimSizes(dims_new);
|
||||
// Extract unspecified dimensions
|
||||
std::vector<T2> dims_base;
|
||||
const int32_t axes_data_num_const = axes_data_num;
|
||||
const int32_t dims_base_num = input_data_dims - axes_data_num_const;
|
||||
for (T2 i = 0; i < (T2)input_data_dims; i++) {
|
||||
bool cflag = true;
|
||||
for (int64_t j = 0; j < axes_data_num_const; j++) {
|
||||
if (axes_data_norepeat[j] == i) {
|
||||
cflag = false;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (cflag) {
|
||||
dims_base.push_back(i);
|
||||
}
|
||||
}
|
||||
std::vector<int64_t> addr_stride(axes_data_num_const);
|
||||
addr_tmp = 1;
|
||||
addr_stride[axes_data_num_const - 1] = addr_tmp;
|
||||
for (int32_t i = axes_data_num_const - 2; i > -1; i--) {
|
||||
addr_tmp *= input_data_dimsize[axes_data_norepeat[i + 1]];
|
||||
addr_stride[i] = addr_tmp;
|
||||
}
|
||||
int64_t offset_num = addr_tmp * input_data_dimsize[axes_data_norepeat[0]];
|
||||
if ((input_data_num > 256 * 1024 && input_data_num / output_data_num > 256) || (output_data_num > 1024)) {
|
||||
uint32_t min_core_num = 1;
|
||||
int64_t max_core_num = std::max(min_core_num, aicpu::CpuKernelUtils::GetCPUNum(ctx) - 2);
|
||||
if (max_core_num > output_data_num) {
|
||||
max_core_num = output_data_num;
|
||||
}
|
||||
auto shard_compute = [&](size_t start, size_t end) {
|
||||
for (size_t i = start; i < end; i++) {
|
||||
int64_t output_i_addr = 0;
|
||||
int64_t seq_tmp = i;
|
||||
for (int32_t j = dims_base_num - 1; j > -1; j--) {
|
||||
int64_t next = seq_tmp / input_data_dimsize[dims_base[j]];
|
||||
int64_t loc = seq_tmp % input_data_dimsize[dims_base[j]];
|
||||
seq_tmp = next;
|
||||
output_i_addr += loc * dims_addr[dims_base[j]];
|
||||
if (seq_tmp == 0) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
T1 data_sum = input_data_addr[output_i_addr];
|
||||
// In the array, the actual address of the element participating in the calculation.
|
||||
int64_t addr_offset = 0;
|
||||
for (int64_t j = 1; j < offset_num; j++) {
|
||||
int32_t stride = axes_data_num_const - 1;
|
||||
for (int32_t k = stride - 1; k > -1; k--) {
|
||||
if (j % addr_stride[k] == 0) {
|
||||
addr_offset -=
|
||||
(input_data_dimsize[axes_data_norepeat[stride]] - 1) * dims_addr[axes_data_norepeat[stride]];
|
||||
stride = k;
|
||||
continue;
|
||||
}
|
||||
break;
|
||||
}
|
||||
addr_offset += dims_addr[axes_data_norepeat[stride]];
|
||||
data_sum += input_data_addr[output_i_addr + addr_offset];
|
||||
}
|
||||
output_data_addr[i] = data_sum / offset_num;
|
||||
}
|
||||
};
|
||||
KERNEL_HANDLE_ERROR(
|
||||
CpuKernelUtils::ParallelFor(ctx, output_data_num, output_data_num / max_core_num, shard_compute),
|
||||
"ReduceMean Compute failed.");
|
||||
} else {
|
||||
for (int64_t i = 0; i < output_data_num; i++) {
|
||||
// In the array, the actual address of the output.
|
||||
int64_t output_i_addr = 0;
|
||||
int64_t seq_tmp = i;
|
||||
for (int32_t j = dims_base_num - 1; j > -1; j--) {
|
||||
int64_t next = seq_tmp / input_data_dimsize[dims_base[j]];
|
||||
int64_t loc = seq_tmp % input_data_dimsize[dims_base[j]];
|
||||
seq_tmp = next;
|
||||
output_i_addr += loc * dims_addr[dims_base[j]];
|
||||
if (seq_tmp == 0) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
T1 data_sum = input_data_addr[output_i_addr];
|
||||
// In the array, the actual address of the element participating in the calculation.
|
||||
int64_t addr_offset = 0;
|
||||
for (int64_t j = 1; j < offset_num; j++) {
|
||||
int32_t stride = axes_data_num_const - 1;
|
||||
for (int32_t k = stride - 1; k > -1; k--) {
|
||||
if (j % addr_stride[k] == 0) {
|
||||
addr_offset -=
|
||||
(input_data_dimsize[axes_data_norepeat[stride]] - 1) * dims_addr[axes_data_norepeat[stride]];
|
||||
stride = k;
|
||||
continue;
|
||||
}
|
||||
break;
|
||||
}
|
||||
addr_offset += dims_addr[axes_data_norepeat[stride]];
|
||||
data_sum += input_data_addr[output_i_addr + addr_offset];
|
||||
}
|
||||
output_data_addr[i] = data_sum / offset_num;
|
||||
}
|
||||
}
|
||||
}
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
|
||||
template <typename T1, typename T2>
|
||||
uint32_t ReduceMeanCpuKernel::ReduceMeanCompute_Complex(CpuKernelContext &ctx) {
|
||||
Tensor *input_data = ctx.Input(0);
|
||||
auto input_data_addr = reinterpret_cast<T1 *>(input_data->GetData());
|
||||
const int64_t input_data_num = input_data->NumElements();
|
||||
auto input_data_shape = input_data->GetTensorShape();
|
||||
const int32_t input_data_dims = input_data_shape->GetDims();
|
||||
std::vector<int64_t> input_data_dimsize = input_data_shape->GetDimSizes();
|
||||
std::vector<int64_t> dims_addr(input_data_dims);
|
||||
dims_addr[input_data_dims - 1] = 1;
|
||||
int64_t addr_tmp = 1;
|
||||
for (int32_t i = input_data_dims - 2; i > -1; i--) {
|
||||
addr_tmp *= input_data_dimsize[i + 1];
|
||||
dims_addr[i] = addr_tmp;
|
||||
}
|
||||
Tensor *output_data = ctx.Output(0);
|
||||
auto output_data_shape = output_data->GetTensorShape();
|
||||
auto output_data_addr = reinterpret_cast<T1 *>(output_data->GetData());
|
||||
const int64_t output_data_num = output_data->NumElements();
|
||||
Tensor *axes_data = ctx.Input(1);
|
||||
auto axes_data_addr = reinterpret_cast<T2 *>(axes_data->GetData());
|
||||
int64_t axes_data_num = axes_data->NumElements();
|
||||
// Check the effectiveness of the value of axes
|
||||
for (int64_t i = 0; i < axes_data_num; i++) {
|
||||
if ((*(axes_data_addr + i) >= input_data_dims) || (*(axes_data_addr + i) < -input_data_dims)) {
|
||||
KERNEL_LOG_ERROR("The value of axes is incorrect.");
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
} else if (*(axes_data_addr + i) < 0) {
|
||||
*(axes_data_addr + i) += input_data_dims;
|
||||
}
|
||||
}
|
||||
std::sort(axes_data_addr, axes_data_addr + axes_data_num);
|
||||
// Collect the sorted axes without duplicates; axes_data_num becomes the number of unique axes.
std::vector<T2> axes_data_norepeat;
for (int64_t i = 0; i < axes_data_num; i++) {
T2 value = axes_data_addr[i];
if (axes_data_norepeat.empty() || axes_data_norepeat.back() != value) {
axes_data_norepeat.push_back(value);
}
}
axes_data_num = static_cast<int64_t>(axes_data_norepeat.size());
|
||||
// deal with attr
|
||||
auto attr_value = ctx.GetAttr("keep_dims");
|
||||
bool keep_dims;
|
||||
if (attr_value == nullptr) {
|
||||
keep_dims = false;
|
||||
} else {
|
||||
keep_dims = static_cast<bool>(attr_value->GetBool());
|
||||
}
|
||||
if (axes_data_num == input_data_dims) {
|
||||
if (keep_dims) {
|
||||
std::vector<int64_t> dims_new(axes_data_num, 1);
|
||||
output_data_shape->SetDimSizes(dims_new);
|
||||
} else {
|
||||
std::vector<int64_t> dims_new(1, 1);
|
||||
output_data_shape->SetDimSizes(dims_new);
|
||||
}
|
||||
T1 data_sum = static_cast<T1>(0);
|
||||
for (int64_t i = 0; i < input_data_num; i++) {
|
||||
data_sum += input_data_addr[i];
|
||||
}
|
||||
output_data_addr[0] = ComplexDiv<T1>(data_sum, input_data_num);
|
||||
} else {
|
||||
std::vector<int64_t> dims_new(input_data_shape->GetDimSizes());
|
||||
if (keep_dims) {
|
||||
for (auto iter = axes_data_norepeat.cbegin(); iter != axes_data_norepeat.cend(); iter++) {
|
||||
dims_new[*iter] = 1;
|
||||
}
|
||||
} else {
|
||||
for (auto iter = axes_data_norepeat.rbegin(); iter != axes_data_norepeat.rend(); iter++) {
|
||||
dims_new.erase(dims_new.begin() + (*iter));
|
||||
}
|
||||
}
|
||||
output_data_shape->SetDimSizes(dims_new);
|
||||
// Extract unspecified dimensions
|
||||
std::vector<T2> dims_base;
|
||||
const int32_t axes_data_num_const = axes_data_num;
|
||||
const int32_t dims_base_num = input_data_dims - axes_data_num_const;
|
||||
for (T2 i = 0; i < (T2)input_data_dims; i++) {
|
||||
bool cflag = true;
|
||||
for (int64_t j = 0; j < axes_data_num_const; j++) {
|
||||
if (axes_data_norepeat[j] == i) {
|
||||
cflag = false;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (cflag) {
|
||||
dims_base.push_back(i);
|
||||
}
|
||||
}
|
||||
std::vector<int64_t> addr_stride(axes_data_num_const);
|
||||
addr_tmp = 1;
|
||||
addr_stride[axes_data_num_const - 1] = addr_tmp;
|
||||
for (int32_t i = axes_data_num_const - 2; i > -1; i--) {
|
||||
addr_tmp *= input_data_dimsize[axes_data_norepeat[i + 1]];
|
||||
addr_stride[i] = addr_tmp;
|
||||
}
|
||||
int64_t offset_num = addr_tmp * input_data_dimsize[axes_data_norepeat[0]];
|
||||
if ((input_data_num > 256 * 1024 && input_data_num / output_data_num > 256) || (output_data_num > 1024)) {
|
||||
uint32_t min_core_num = 1;
|
||||
int64_t max_core_num = std::max(min_core_num, aicpu::CpuKernelUtils::GetCPUNum(ctx) - 2);
|
||||
if (max_core_num > output_data_num) {
|
||||
max_core_num = output_data_num;
|
||||
}
|
||||
auto shard_compute = [&](size_t start, size_t end) {
|
||||
for (size_t i = start; i < end; i++) {
|
||||
int64_t output_i_addr = 0;
|
||||
int64_t seq_tmp = i;
|
||||
for (int32_t j = dims_base_num - 1; j > -1; j--) {
|
||||
int64_t next = seq_tmp / input_data_dimsize[dims_base[j]];
|
||||
int64_t loc = seq_tmp % input_data_dimsize[dims_base[j]];
|
||||
seq_tmp = next;
|
||||
output_i_addr += loc * dims_addr[dims_base[j]];
|
||||
if (seq_tmp == 0) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
T1 data_sum = input_data_addr[output_i_addr];
|
||||
// In the array, the actual address of the element participating in the calculation.
|
||||
int64_t addr_offset = 0;
|
||||
for (int64_t j = 1; j < offset_num; j++) {
|
||||
int32_t stride = axes_data_num_const - 1;
|
||||
for (int32_t k = stride - 1; k > -1; k--) {
|
||||
if (j % addr_stride[k] == 0) {
|
||||
addr_offset -=
|
||||
(input_data_dimsize[axes_data_norepeat[stride]] - 1) * dims_addr[axes_data_norepeat[stride]];
|
||||
stride = k;
|
||||
continue;
|
||||
}
|
||||
break;
|
||||
}
|
||||
addr_offset += dims_addr[axes_data_norepeat[stride]];
|
||||
data_sum += input_data_addr[output_i_addr + addr_offset];
|
||||
}
|
||||
output_data_addr[i] = ComplexDiv<T1>(data_sum, offset_num);
|
||||
}
|
||||
};
|
||||
KERNEL_HANDLE_ERROR(
|
||||
CpuKernelUtils::ParallelFor(ctx, output_data_num, output_data_num / max_core_num, shard_compute),
|
||||
"ReduceMean Compute failed.");
|
||||
} else {
|
||||
for (int64_t i = 0; i < output_data_num; i++) {
|
||||
// In the array, the actual address of the output.
|
||||
int64_t output_i_addr = 0;
|
||||
int64_t seq_tmp = i;
|
||||
for (int32_t j = dims_base_num - 1; j > -1; j--) {
|
||||
int64_t next = seq_tmp / input_data_dimsize[dims_base[j]];
|
||||
int64_t loc = seq_tmp % input_data_dimsize[dims_base[j]];
|
||||
seq_tmp = next;
|
||||
output_i_addr += loc * dims_addr[dims_base[j]];
|
||||
if (seq_tmp == 0) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
T1 data_sum = input_data_addr[output_i_addr];
|
||||
// In the array, the actual address of the element participating in the calculation.
|
||||
int64_t addr_offset = 0;
|
||||
for (int64_t j = 1; j < offset_num; j++) {
|
||||
int32_t stride = axes_data_num_const - 1;
|
||||
for (int32_t k = stride - 1; k > -1; k--) {
|
||||
if (j % addr_stride[k] == 0) {
|
||||
addr_offset -=
|
||||
(input_data_dimsize[axes_data_norepeat[stride]] - 1) * dims_addr[axes_data_norepeat[stride]];
|
||||
stride = k;
|
||||
continue;
|
||||
}
|
||||
break;
|
||||
}
|
||||
addr_offset += dims_addr[axes_data_norepeat[stride]];
|
||||
data_sum += input_data_addr[output_i_addr + addr_offset];
|
||||
}
|
||||
output_data_addr[i] = ComplexDiv<T1>(data_sum, offset_num);
|
||||
}
|
||||
}
|
||||
}
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
|
||||
REGISTER_CPU_KERNEL(kReduceMean, ReduceMeanCpuKernel);
|
||||
} // namespace aicpu
|
|
@@ -0,0 +1,38 @@
|
|||
/**
|
||||
* Copyright 2021 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
#ifndef AICPU_KERNELS_NORMALIZED_REDUCEMEAN_H_
|
||||
#define AICPU_KERNELS_NORMALIZED_REDUCEMEAN_H_
|
||||
|
||||
#include "cpu_ops_kernel.h"
|
||||
|
||||
namespace aicpu {
|
||||
class ReduceMeanCpuKernel : public CpuKernel {
|
||||
public:
|
||||
ReduceMeanCpuKernel() = default;
|
||||
~ReduceMeanCpuKernel() override = default;
|
||||
|
||||
protected:
|
||||
uint32_t Compute(CpuKernelContext &ctx) override;
|
||||
|
||||
private:
|
||||
template <typename T1, typename T2>
|
||||
static uint32_t ReduceMeanCompute(CpuKernelContext &ctx);
|
||||
|
||||
template <typename T1, typename T2>
|
||||
static uint32_t ReduceMeanCompute_Complex(CpuKernelContext &ctx);
|
||||
};
|
||||
} // namespace aicpu
|
||||
#endif
|
|
@@ -0,0 +1,496 @@
|
|||
/**
|
||||
* Copyright 2021 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include "reduce_prod.h"
|
||||
|
||||
#include "cpu_kernel_utils.h"
|
||||
#include "utils/eigen_tensor.h"
|
||||
#include "utils/kernel_util.h"
|
||||
#include "algorithm"
|
||||
#include "iostream"
|
||||
|
||||
namespace {
|
||||
const char *kReduceProd = "ReduceProd";
|
||||
|
||||
#define REDUCEPROD_COMPUTE_CASE(DTYPE, TYPE1, TYPE2, CTX) \
|
||||
case (DTYPE): { \
|
||||
uint32_t result = ReduceProdCompute<TYPE1, TYPE2>(CTX); \
|
||||
if (result != KERNEL_STATUS_OK) { \
|
||||
KERNEL_LOG_ERROR("ReduceProd kernel compute failed."); \
|
||||
return result; \
|
||||
} \
|
||||
break; \
|
||||
}
|
||||
|
||||
#define REDUCEPROD_COMPUTE_CASE_CP(DTYPE, TYPE1, TYPE2, CTX) \
|
||||
case (DTYPE): { \
|
||||
uint32_t result = ReduceProdCompute_Complex<TYPE1, TYPE2>(CTX); \
|
||||
if (result != KERNEL_STATUS_OK) { \
|
||||
KERNEL_LOG_ERROR("ReduceProd kernel compute failed."); \
|
||||
return result; \
|
||||
} \
|
||||
break; \
|
||||
}
|
||||
|
||||
#define REDUCEPROD_COMPUTE_CASE_ALL(TYPE, CTX) \
|
||||
REDUCEPROD_COMPUTE_CASE_CP(DT_COMPLEX64, std::complex<float>, TYPE, CTX) \
|
||||
REDUCEPROD_COMPUTE_CASE_CP(DT_COMPLEX128, std::complex<double>, TYPE, CTX) \
|
||||
REDUCEPROD_COMPUTE_CASE(DT_DOUBLE, double, TYPE, CTX) \
|
||||
REDUCEPROD_COMPUTE_CASE(DT_FLOAT, float, TYPE, CTX) \
|
||||
REDUCEPROD_COMPUTE_CASE(DT_FLOAT16, Eigen::half, TYPE, CTX) \
|
||||
REDUCEPROD_COMPUTE_CASE(DT_INT8, int8_t, TYPE, CTX) \
|
||||
REDUCEPROD_COMPUTE_CASE(DT_INT16, int16_t, TYPE, CTX) \
|
||||
REDUCEPROD_COMPUTE_CASE(DT_INT32, int32_t, TYPE, CTX) \
|
||||
REDUCEPROD_COMPUTE_CASE(DT_INT64, int64_t, TYPE, CTX) \
|
||||
REDUCEPROD_COMPUTE_CASE(DT_UINT8, uint8_t, TYPE, CTX) \
|
||||
REDUCEPROD_COMPUTE_CASE(DT_UINT16, uint16_t, TYPE, CTX) \
|
||||
REDUCEPROD_COMPUTE_CASE(DT_UINT32, uint32_t, TYPE, CTX) \
|
||||
REDUCEPROD_COMPUTE_CASE(DT_UINT64, uint64_t, TYPE, CTX)
|
||||
} // namespace
|
||||
|
||||
namespace aicpu {
|
||||
template <typename T>
|
||||
T ReduceProdCpuKernel::ComputeMul(T num_1, T num_2) {
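// Complex multiplication written out component-wise:
// (a + b*i) * (x + y*i) = (a*x - b*y) + (b*x + a*y)*i.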
|
||||
T res;
|
||||
auto a = num_1.real();
|
||||
auto b = num_1.imag();
|
||||
auto x = num_2.real();
|
||||
auto y = num_2.imag();
|
||||
auto real_res = a * x - b * y;
|
||||
auto imag_res = b * x + a * y;
|
||||
res.real(real_res);
|
||||
res.imag(imag_res);
|
||||
return res;
|
||||
}
|
||||
|
||||
uint32_t ReduceProdCpuKernel::Compute(CpuKernelContext &ctx) {
|
||||
// check params
|
||||
uint32_t input_num = ctx.GetInputsSize();
|
||||
uint32_t output_num = ctx.GetOutputsSize();
|
||||
if (input_num != 2 || output_num != 1) {
|
||||
KERNEL_LOG_ERROR("The number of input or output parameters does not match.");
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
Tensor *input_data = ctx.Input(0);
|
||||
KERNEL_CHECK_NULLPTR(input_data->GetData(), KERNEL_STATUS_PARAM_INVALID, "Get input[0] failed.")
|
||||
Tensor *axes_data = ctx.Input(1);
|
||||
KERNEL_CHECK_NULLPTR(axes_data->GetData(), KERNEL_STATUS_PARAM_INVALID, "Get input[1] failed.")
|
||||
Tensor *output_data = ctx.Output(0);
|
||||
KERNEL_CHECK_NULLPTR(output_data->GetData(), KERNEL_STATUS_PARAM_INVALID, "Get output[0] failed.");
|
||||
DataType data_type = ctx.Input(0)->GetDataType();
|
||||
DataType axes_type = ctx.Input(1)->GetDataType();
|
||||
switch (axes_type) {
|
||||
case DT_INT32:
|
||||
switch (data_type) {
|
||||
REDUCEPROD_COMPUTE_CASE_ALL(int32_t, ctx)
|
||||
default:
|
||||
KERNEL_LOG_ERROR("Input[0] data type[%s] not supported.", DTypeStr(data_type).c_str());
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
break;
|
||||
case DT_INT64:
|
||||
switch (data_type) {
|
||||
REDUCEPROD_COMPUTE_CASE_ALL(int64_t, ctx)
|
||||
default:
|
||||
KERNEL_LOG_ERROR("Input[0] data type[%s] not supported.", DTypeStr(data_type).c_str());
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
break;
|
||||
default:
|
||||
KERNEL_LOG_ERROR("Input[1] data type[%s] not supported.", DTypeStr(axes_type).c_str());
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
|
||||
/*
|
||||
Calculate the prod of the corresponding dimension data
|
||||
Rule: all elements that share the same coordinates on the unspecified
(non-axes) dimensions are multiplied together to form one output element.
|
||||
e.g. input_x : float array[2][2][2]={1,2,3,4,5,6,7,8}
|
||||
axes : [1 , 2]
|
||||
output:[24, 1680]
24 is calculated from array[0][0][0], array[0][0][1],
array[0][1][0] and array[0][1][1]
|
||||
The same group of data addresses involved in calculating the
|
||||
prod consists of one same base address and different offset addresses
|
||||
input_data_address = base_address + offset_address
|
||||
*/
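/*
 A minimal worked example of the rule above (illustrative only, not part of
 the kernel): with the [2][2][2] input {1,2,3,4,5,6,7,8} and axes [1, 2],
 the row-major strides are {4, 2, 1}, the base addresses of the two groups
 are 0 and 4, and the offsets within each group are {0, 1, 2, 3}, giving
   output[0] = 1 * 2 * 3 * 4 = 24
   output[1] = 5 * 6 * 7 * 8 = 1680
*/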
|
||||
template <typename T1, typename T2>
|
||||
uint32_t ReduceProdCpuKernel::ReduceProdCompute(CpuKernelContext &ctx) {
|
||||
Tensor *input_data = ctx.Input(0);
|
||||
auto input_data_addr = reinterpret_cast<T1 *>(input_data->GetData());
|
||||
const int64_t input_data_num = input_data->NumElements();
|
||||
auto input_data_shape = input_data->GetTensorShape();
|
||||
const int32_t input_data_dims = input_data_shape->GetDims();
|
||||
std::vector<int64_t> input_data_dimsize = input_data_shape->GetDimSizes();
|
||||
std::vector<int64_t> dims_addr(input_data_dims);
|
||||
dims_addr[input_data_dims - 1] = 1;
|
||||
int64_t addr_tmp = 1;
|
||||
for (int32_t i = input_data_dims - 2; i > -1; i--) {
|
||||
addr_tmp *= input_data_dimsize[i + 1];
|
||||
dims_addr[i] = addr_tmp;
|
||||
}
|
||||
Tensor *output_data = ctx.Output(0);
|
||||
auto output_data_shape = output_data->GetTensorShape();
|
||||
auto output_data_addr = reinterpret_cast<T1 *>(output_data->GetData());
|
||||
const int64_t output_data_num = output_data->NumElements();
|
||||
Tensor *axes_data = ctx.Input(1);
|
||||
auto axes_data_addr = reinterpret_cast<T2 *>(axes_data->GetData());
|
||||
int64_t axes_data_num = axes_data->NumElements();
|
||||
// Check the effectiveness of the value of axes
|
||||
for (int64_t i = 0; i < axes_data_num; i++) {
|
||||
if ((*(axes_data_addr + i) >= input_data_dims) || (*(axes_data_addr + i) < -input_data_dims)) {
|
||||
KERNEL_LOG_ERROR("The value of axes is incorrect.");
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
} else if (*(axes_data_addr + i) < 0) {
|
||||
*(axes_data_addr + i) += input_data_dims;
|
||||
}
|
||||
}
|
||||
std::sort(axes_data_addr, axes_data_addr + axes_data_num);
|
||||
std::vector<T2> axes_data_norepeat;
|
||||
for (int64_t i = 0; i < axes_data_num - 1; i++) {
|
||||
T2 value = axes_data_addr[i];
|
||||
if (value == axes_data_addr[i + 1]) {
|
||||
axes_data_num--;
|
||||
continue;
|
||||
}
|
||||
axes_data_norepeat.push_back(value);
|
||||
}
|
||||
axes_data_norepeat.push_back(axes_data_addr[axes_data_num - 1]);
|
||||
// deal with attr
|
||||
auto attr_value = ctx.GetAttr("keep_dims");
|
||||
bool keep_dims;
|
||||
if (attr_value == nullptr) {
|
||||
keep_dims = false;
|
||||
} else {
|
||||
keep_dims = static_cast<bool>(attr_value->GetBool());
|
||||
}
|
||||
if (axes_data_num == input_data_dims) {
|
||||
if (keep_dims) {
|
||||
std::vector<int64_t> dims_new(axes_data_num, 1);
|
||||
output_data_shape->SetDimSizes(dims_new);
|
||||
} else {
|
||||
std::vector<int64_t> dims_new(1, 1);
|
||||
output_data_shape->SetDimSizes(dims_new);
|
||||
}
|
||||
T1 data_prod = static_cast<T1>(1);
|
||||
for (int64_t i = 0; i < input_data_num; i++) {
|
||||
data_prod *= input_data_addr[i];
|
||||
}
|
||||
output_data_addr[0] = data_prod;
|
||||
} else {
|
||||
std::vector<int64_t> dims_new(input_data_shape->GetDimSizes());
|
||||
if (keep_dims) {
|
||||
for (auto iter = axes_data_norepeat.cbegin(); iter != axes_data_norepeat.cend(); iter++) {
|
||||
dims_new[*iter] = 1;
|
||||
}
|
||||
} else {
|
||||
for (auto iter = axes_data_norepeat.rbegin(); iter != axes_data_norepeat.rend(); iter++) {
|
||||
dims_new.erase(dims_new.begin() + (*iter));
|
||||
}
|
||||
}
|
||||
output_data_shape->SetDimSizes(dims_new);
|
||||
// Extract unspecified dimensions
|
||||
std::vector<T2> dims_base;
|
||||
const int32_t axes_data_num_const = axes_data_num;
|
||||
const int32_t dims_base_num = input_data_dims - axes_data_num_const;
|
||||
for (T2 i = 0; i < (T2)input_data_dims; i++) {
|
||||
bool cflag = true;
|
||||
for (int64_t j = 0; j < axes_data_num_const; j++) {
|
||||
if (axes_data_norepeat[j] == i) {
|
||||
cflag = false;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (cflag) {
|
||||
dims_base.push_back(i);
|
||||
}
|
||||
}
|
||||
int64_t addr_stride[axes_data_num_const];
|
||||
addr_tmp = 1;
|
||||
addr_stride[axes_data_num_const - 1] = addr_tmp;
|
||||
for (int32_t i = axes_data_num_const - 2; i > -1; i--) {
|
||||
addr_tmp *= input_data_dimsize[axes_data_norepeat[i + 1]];
|
||||
addr_stride[i] = addr_tmp;
|
||||
}
|
||||
int64_t offset_num = addr_tmp * input_data_dimsize[axes_data_norepeat[0]];
|
||||
if ((input_data_num > 256 * 1024 && input_data_num / output_data_num > 256) || (output_data_num > 1024)) {
|
||||
uint32_t min_core_num = 1;
|
||||
int64_t max_core_num = std::max(min_core_num, aicpu::CpuKernelUtils::GetCPUNum(ctx) - 2);
|
||||
if (max_core_num > output_data_num) {
|
||||
max_core_num = output_data_num;
|
||||
}
|
||||
auto shard_compute = [&](size_t start, size_t end) {
|
||||
for (size_t i = start; i < end; i++) {
|
||||
int64_t output_i_addr = 0;
|
||||
int64_t seq_tmp = i;
|
||||
for (int32_t j = dims_base_num - 1; j > -1; j--) {
|
||||
int64_t next = seq_tmp / input_data_dimsize[dims_base[j]];
|
||||
int64_t loc = seq_tmp % input_data_dimsize[dims_base[j]];
|
||||
seq_tmp = next;
|
||||
output_i_addr += loc * dims_addr[dims_base[j]];
|
||||
if (seq_tmp == 0) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
T1 data_prod = input_data_addr[output_i_addr];
|
||||
// In the array, the actual address of the element participating in the calculation.
|
||||
int64_t addr_offset = 0;
|
||||
for (int64_t j = 1; j < offset_num; j++) {
|
||||
int32_t stride = axes_data_num_const - 1;
|
||||
for (int32_t k = stride - 1; k > -1; k--) {
|
||||
if (j % addr_stride[k] == 0) {
|
||||
addr_offset -=
|
||||
(input_data_dimsize[axes_data_norepeat[stride]] - 1) * dims_addr[axes_data_norepeat[stride]];
|
||||
stride = k;
|
||||
continue;
|
||||
}
|
||||
break;
|
||||
}
|
||||
addr_offset += dims_addr[axes_data_norepeat[stride]];
|
||||
data_prod *= input_data_addr[output_i_addr + addr_offset];
|
||||
}
|
||||
output_data_addr[i] = data_prod;
|
||||
}
|
||||
};
|
||||
KERNEL_HANDLE_ERROR(
|
||||
CpuKernelUtils::ParallelFor(ctx, output_data_num, output_data_num / max_core_num, shard_compute),
|
||||
"ReduceProd Compute failed.");
|
||||
} else {
|
||||
for (int64_t i = 0; i < output_data_num; i++) {
|
||||
// In the array, the actual address of the output.
|
||||
int64_t output_i_addr = 0;
|
||||
int64_t seq_tmp = i;
|
||||
for (int32_t j = dims_base_num - 1; j > -1; j--) {
|
||||
int64_t next = seq_tmp / input_data_dimsize[dims_base[j]];
|
||||
int64_t loc = seq_tmp % input_data_dimsize[dims_base[j]];
|
||||
seq_tmp = next;
|
||||
output_i_addr += loc * dims_addr[dims_base[j]];
|
||||
if (seq_tmp == 0) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
T1 data_prod = input_data_addr[output_i_addr];
|
||||
// In the array, the actual address of the element participating in the calculation.
|
||||
int64_t addr_offset = 0;
|
||||
for (int64_t j = 1; j < offset_num; j++) {
|
||||
int32_t stride = axes_data_num_const - 1;
|
||||
for (int32_t k = stride - 1; k > -1; k--) {
|
||||
if (j % addr_stride[k] == 0) {
|
||||
addr_offset -=
|
||||
(input_data_dimsize[axes_data_norepeat[stride]] - 1) * dims_addr[axes_data_norepeat[stride]];
|
||||
stride = k;
|
||||
continue;
|
||||
}
|
||||
break;
|
||||
}
|
||||
addr_offset += dims_addr[axes_data_norepeat[stride]];
|
||||
data_prod *= input_data_addr[output_i_addr + addr_offset];
|
||||
}
|
||||
output_data_addr[i] = data_prod;
|
||||
}
|
||||
}
|
||||
}
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
|
||||
template <typename T1, typename T2>
|
||||
uint32_t ReduceProdCpuKernel::ReduceProdCompute_Complex(CpuKernelContext &ctx) {
|
||||
Tensor *input_data = ctx.Input(0);
|
||||
auto input_data_addr = reinterpret_cast<T1 *>(input_data->GetData());
|
||||
const int64_t input_data_num = input_data->NumElements();
|
||||
auto input_data_shape = input_data->GetTensorShape();
|
||||
const int32_t input_data_dims = input_data_shape->GetDims();
|
||||
std::vector<int64_t> input_data_dimsize = input_data_shape->GetDimSizes();
|
||||
std::vector<int64_t> dims_addr(input_data_dims);
|
||||
dims_addr[input_data_dims - 1] = 1;
|
||||
int64_t addr_tmp = 1;
|
||||
for (int32_t i = input_data_dims - 2; i > -1; i--) {
|
||||
addr_tmp *= input_data_dimsize[i + 1];
|
||||
dims_addr[i] = addr_tmp;
|
||||
}
|
||||
Tensor *output_data = ctx.Output(0);
|
||||
auto output_data_shape = output_data->GetTensorShape();
|
||||
auto output_data_addr = reinterpret_cast<T1 *>(output_data->GetData());
|
||||
const int64_t output_data_num = output_data->NumElements();
|
||||
Tensor *axes_data = ctx.Input(1);
|
||||
auto axes_data_addr = reinterpret_cast<T2 *>(axes_data->GetData());
|
||||
int64_t axes_data_num = axes_data->NumElements();
|
||||
// Check the effectiveness of the value of axes
|
||||
for (int64_t i = 0; i < axes_data_num; i++) {
|
||||
if ((*(axes_data_addr + i) >= input_data_dims) || (*(axes_data_addr + i) < -input_data_dims)) {
|
||||
KERNEL_LOG_ERROR("The value of axes is incorrect.");
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
} else if (*(axes_data_addr + i) < 0) {
|
||||
*(axes_data_addr + i) += input_data_dims;
|
||||
}
|
||||
}
|
||||
std::sort(axes_data_addr, axes_data_addr + axes_data_num);
|
||||
std::vector<T2> axes_data_norepeat;
|
||||
for (int64_t i = 0; i < axes_data_num - 1; i++) {
|
||||
T2 value = axes_data_addr[i];
|
||||
if (value == axes_data_addr[i + 1]) {
|
||||
axes_data_num--;
|
||||
continue;
|
||||
}
|
||||
axes_data_norepeat.push_back(value);
|
||||
}
|
||||
axes_data_norepeat.push_back(axes_data_addr[axes_data_num - 1]);
|
||||
// deal with attr
|
||||
auto attr_value = ctx.GetAttr("keep_dims");
|
||||
bool keep_dims;
|
||||
if (attr_value == nullptr) {
|
||||
keep_dims = false;
|
||||
} else {
|
||||
keep_dims = static_cast<bool>(attr_value->GetBool());
|
||||
}
|
||||
if (axes_data_num == input_data_dims) {
|
||||
if (keep_dims) {
|
||||
std::vector<int64_t> dims_new(axes_data_num, 1);
|
||||
output_data_shape->SetDimSizes(dims_new);
|
||||
} else {
|
||||
std::vector<int64_t> dims_new(1, 1);
|
||||
output_data_shape->SetDimSizes(dims_new);
|
||||
}
|
||||
T1 data_prod;
|
||||
data_prod.real(1);
|
||||
data_prod.imag(0);
|
||||
for (int64_t i = 0; i < input_data_num; i++) {
|
||||
T1 data_cur = input_data_addr[i];
|
||||
data_prod = ComputeMul<T1>(data_prod, data_cur);
|
||||
}
|
||||
output_data_addr[0] = data_prod;
|
||||
} else {
|
||||
std::vector<int64_t> dims_new(input_data_shape->GetDimSizes());
|
||||
if (keep_dims) {
|
||||
for (auto iter = axes_data_norepeat.cbegin(); iter != axes_data_norepeat.cend(); iter++) {
|
||||
dims_new[*iter] = 1;
|
||||
}
|
||||
} else {
|
||||
for (auto iter = axes_data_norepeat.rbegin(); iter != axes_data_norepeat.rend(); iter++) {
|
||||
dims_new.erase(dims_new.begin() + (*iter));
|
||||
}
|
||||
}
|
||||
output_data_shape->SetDimSizes(dims_new);
|
||||
// Extract unspecified dimensions
|
||||
std::vector<T2> dims_base;
|
||||
const int32_t axes_data_num_const = axes_data_num;
|
||||
const int32_t dims_base_num = input_data_dims - axes_data_num_const;
|
||||
for (T2 i = 0; i < (T2)input_data_dims; i++) {
|
||||
bool cflag = true;
|
||||
for (int64_t j = 0; j < axes_data_num_const; j++) {
|
||||
if (axes_data_norepeat[j] == i) {
|
||||
cflag = false;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (cflag) {
|
||||
dims_base.push_back(i);
|
||||
}
|
||||
}
|
||||
int64_t addr_stride[axes_data_num_const];
|
||||
addr_tmp = 1;
|
||||
addr_stride[axes_data_num_const - 1] = addr_tmp;
|
||||
for (int32_t i = axes_data_num_const - 2; i > -1; i--) {
|
||||
addr_tmp *= input_data_dimsize[axes_data_norepeat[i + 1]];
|
||||
addr_stride[i] = addr_tmp;
|
||||
}
|
||||
int64_t offset_num = addr_tmp * input_data_dimsize[axes_data_norepeat[0]];
|
||||
if ((input_data_num > 256 * 1024 && input_data_num / output_data_num > 256) || (output_data_num > 1024)) {
|
||||
uint32_t min_core_num = 1;
|
||||
int64_t max_core_num = std::max(min_core_num, aicpu::CpuKernelUtils::GetCPUNum(ctx) - 2);
|
||||
if (max_core_num > output_data_num) {
|
||||
max_core_num = output_data_num;
|
||||
}
|
||||
auto shard_compute = [&](size_t start, size_t end) {
|
||||
for (size_t i = start; i < end; i++) {
|
||||
int64_t output_i_addr = 0;
|
||||
int64_t seq_tmp = i;
|
||||
for (int32_t j = dims_base_num - 1; j > -1; j--) {
|
||||
int64_t next = seq_tmp / input_data_dimsize[dims_base[j]];
|
||||
int64_t loc = seq_tmp % input_data_dimsize[dims_base[j]];
|
||||
seq_tmp = next;
|
||||
output_i_addr += loc * dims_addr[dims_base[j]];
|
||||
if (seq_tmp == 0) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
T1 data_prod = input_data_addr[output_i_addr];
|
||||
// In the array, the actual address of the element participating in the calculation.
|
||||
int64_t addr_offset = 0;
|
||||
for (int64_t j = 1; j < offset_num; j++) {
|
||||
int32_t stride = axes_data_num_const - 1;
|
||||
for (int32_t k = stride - 1; k > -1; k--) {
|
||||
if (j % addr_stride[k] == 0) {
|
||||
addr_offset -=
|
||||
(input_data_dimsize[axes_data_norepeat[stride]] - 1) * dims_addr[axes_data_norepeat[stride]];
|
||||
stride = k;
|
||||
continue;
|
||||
}
|
||||
break;
|
||||
}
|
||||
addr_offset += dims_addr[axes_data_norepeat[stride]];
|
||||
T1 data_cur = input_data_addr[output_i_addr + addr_offset];
|
||||
data_prod = ComputeMul<T1>(data_prod, data_cur);
|
||||
}
|
||||
output_data_addr[i] = data_prod;
|
||||
}
|
||||
};
|
||||
KERNEL_HANDLE_ERROR(
|
||||
CpuKernelUtils::ParallelFor(ctx, output_data_num, output_data_num / max_core_num, shard_compute),
|
||||
"ReduceProd Compute failed.");
|
||||
} else {
|
||||
for (int64_t i = 0; i < output_data_num; i++) {
|
||||
// In the array, the actual address of the output.
|
||||
int64_t output_i_addr = 0;
|
||||
int64_t seq_tmp = i;
|
||||
for (int32_t j = dims_base_num - 1; j > -1; j--) {
|
||||
int64_t next = seq_tmp / input_data_dimsize[dims_base[j]];
|
||||
int64_t loc = seq_tmp % input_data_dimsize[dims_base[j]];
|
||||
seq_tmp = next;
|
||||
output_i_addr += loc * dims_addr[dims_base[j]];
|
||||
if (seq_tmp == 0) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
T1 data_prod = input_data_addr[output_i_addr];
|
||||
// In the array, the actual address of the element participating in the calculation.
|
||||
int64_t addr_offset = 0;
|
||||
for (int64_t j = 1; j < offset_num; j++) {
|
||||
int32_t stride = axes_data_num_const - 1;
|
||||
for (int32_t k = stride - 1; k > -1; k--) {
|
||||
if (j % addr_stride[k] == 0) {
|
||||
addr_offset -=
|
||||
(input_data_dimsize[axes_data_norepeat[stride]] - 1) * dims_addr[axes_data_norepeat[stride]];
|
||||
stride = k;
|
||||
continue;
|
||||
}
|
||||
break;
|
||||
}
|
||||
addr_offset += dims_addr[axes_data_norepeat[stride]];
|
||||
T1 data_cur = input_data_addr[output_i_addr + addr_offset];
|
||||
data_prod = ComputeMul<T1>(data_prod, data_cur);
|
||||
}
|
||||
output_data_addr[i] = data_prod;
|
||||
}
|
||||
}
|
||||
}
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
|
||||
REGISTER_CPU_KERNEL(kReduceProd, ReduceProdCpuKernel);
|
||||
} // namespace aicpu
|
|
@@ -0,0 +1,41 @@
|
|||
/**
|
||||
* Copyright 2021 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
#ifndef AICPU_KERNELS_NORMALIZED_REDUCEPROD_H_
|
||||
#define AICPU_KERNELS_NORMALIZED_REDUCEPROD_H_
|
||||
|
||||
#include "cpu_ops_kernel.h"
|
||||
|
||||
namespace aicpu {
|
||||
class ReduceProdCpuKernel : public CpuKernel {
|
||||
public:
|
||||
ReduceProdCpuKernel() = default;
|
||||
~ReduceProdCpuKernel() override = default;
|
||||
|
||||
protected:
|
||||
uint32_t Compute(CpuKernelContext &ctx) override;
|
||||
|
||||
private:
|
||||
template <typename T>
|
||||
static T ComputeMul(T num_1, T num_2);
|
||||
|
||||
template <typename T1, typename T2>
|
||||
static uint32_t ReduceProdCompute(CpuKernelContext &ctx);
|
||||
|
||||
template <typename T1, typename T2>
|
||||
static uint32_t ReduceProdCompute_Complex(CpuKernelContext &ctx);
|
||||
};
|
||||
} // namespace aicpu
|
||||
#endif
|
|
@@ -0,0 +1,107 @@
|
|||
/**
|
||||
* Copyright (c) Huawei Technologies Co., Ltd. 2022. All rights reserved.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include "relu.h"
|
||||
|
||||
#include "cpu_kernel_utils.h"
|
||||
#include "utils/eigen_tensor.h"
|
||||
#include "utils/kernel_util.h"
|
||||
|
||||
namespace {
|
||||
const uint32_t kOutputNum = 1;
|
||||
const uint32_t kInputNum = 1;
|
||||
const char *kRelu = "Relu";
|
||||
// when input data size is more than kParallelDataNum, use Parallel func
|
||||
const int64_t kParallelDataNum = 2 * 1024;
|
||||
const int64_t kParallelDataNumMid = 16 * 1024;
|
||||
const int64_t kParallelDataNumSameShape = 7 * 1024;
|
||||
const int64_t kParallelDataNumSameShapeMid = 35 * 1024;
|
||||
|
||||
#define RELU_COMPUTE_CASE(DTYPE, TYPE, CTX) \
|
||||
case (DTYPE): { \
|
||||
uint32_t result = ReluCompute<TYPE>(CTX); \
|
||||
if (result != KERNEL_STATUS_OK) { \
|
||||
KERNEL_LOG_ERROR("Relu kernel compute failed."); \
|
||||
return result; \
|
||||
} \
|
||||
break; \
|
||||
}
|
||||
} // namespace
|
||||
|
||||
namespace aicpu {
|
||||
uint32_t ReluCpuKernel::Compute(CpuKernelContext &ctx) {
|
||||
// check params
|
||||
KERNEL_HANDLE_ERROR(NormalCheck(ctx, kInputNum, kOutputNum), "Relu check input and output number failed.");
|
||||
auto data_type = ctx.Input(0)->GetDataType();
|
||||
switch (data_type) {
|
||||
RELU_COMPUTE_CASE(DT_INT8, int8_t, ctx)
|
||||
RELU_COMPUTE_CASE(DT_INT16, int16_t, ctx)
|
||||
RELU_COMPUTE_CASE(DT_INT32, int32_t, ctx)
|
||||
RELU_COMPUTE_CASE(DT_INT64, int64_t, ctx)
|
||||
RELU_COMPUTE_CASE(DT_UINT8, uint8_t, ctx)
|
||||
RELU_COMPUTE_CASE(DT_UINT16, uint16_t, ctx)
|
||||
RELU_COMPUTE_CASE(DT_FLOAT16, Eigen::half, ctx)
|
||||
RELU_COMPUTE_CASE(DT_FLOAT, float, ctx)
|
||||
RELU_COMPUTE_CASE(DT_DOUBLE, double, ctx)
|
||||
default:
|
||||
KERNEL_LOG_ERROR("Relu kernel data type [%s] not support.", DTypeStr(data_type).c_str());
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
void ReluCpuKernel::DoCompute(int64_t start, int64_t end, const T *input1, T *output) {
|
||||
for (int64_t i = start; i < end; ++i) {
|
||||
T v = *(input1 + i);
|
||||
bool p = v > static_cast<T>(0);
|
||||
*(output + i) = p ? v : static_cast<T>(0);
|
||||
}
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
uint32_t ReluCpuKernel::ReluCompute(CpuKernelContext &ctx) {
|
||||
auto in0 = reinterpret_cast<T *>(ctx.Input(0)->GetData());
|
||||
auto out = reinterpret_cast<T *>(ctx.Output(0)->GetData());
|
||||
int64_t data_num = ctx.Output(0)->NumElements();
|
||||
if (data_num >= kParallelDataNumSameShape) {
|
||||
uint32_t min_core_num = 1;
|
||||
uint32_t max_core_num = std::max(min_core_num, aicpu::CpuKernelUtils::GetCPUNum(ctx) - kResvCpuNum);
|
||||
|
||||
if (data_num <= kParallelDataNumSameShapeMid) {
|
||||
max_core_num = std::min(max_core_num, 4U); // up to 4 cpu cores
|
||||
}
|
||||
|
||||
if (max_core_num > data_num) {
|
||||
max_core_num = data_num;
|
||||
}
|
||||
|
||||
auto sharder_relu = [&](int64_t start, int64_t end) { DoCompute<T>(start, end, in0, out); };
|
||||
if (max_core_num == 0) {
|
||||
KERNEL_LOG_ERROR("max_core_num could not be 0.");
|
||||
}
|
||||
KERNEL_HANDLE_ERROR(CpuKernelUtils::ParallelFor(ctx, data_num, data_num / max_core_num, sharder_relu),
|
||||
"Relu Compute failed.");
|
||||
} else {
|
||||
DoCompute<T>(0, data_num, in0, out);
|
||||
}
|
||||
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
|
||||
REGISTER_CPU_KERNEL(kRelu, ReluCpuKernel);
|
||||
} // namespace aicpu
|
|
@@ -0,0 +1,39 @@
|
|||
/**
|
||||
* Copyright (c) Huawei Technologies Co., Ltd. 2022. All rights reserved.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#ifndef AICPU_KERNELS_NORMALIZED_RELU_H_
|
||||
#define AICPU_KERNELS_NORMALIZED_RELU_H_
|
||||
|
||||
#include "cpu_ops_kernel.h"
|
||||
#include "utils/bcast.h"
|
||||
|
||||
namespace aicpu {
|
||||
class ReluCpuKernel : public CpuKernel {
|
||||
public:
|
||||
ReluCpuKernel() = default;
|
||||
~ReluCpuKernel() override = default;
|
||||
|
||||
protected:
|
||||
uint32_t Compute(CpuKernelContext &ctx) override;
|
||||
|
||||
private:
|
||||
template <typename T>
|
||||
void DoCompute(int64_t start, int64_t end, const T *input1, T *output);
|
||||
template <typename T>
|
||||
uint32_t ReluCompute(CpuKernelContext &ctx);
|
||||
};
|
||||
} // namespace aicpu
|
||||
#endif
|
|
@@ -0,0 +1,186 @@
|
|||
/**
|
||||
* Copyright 2021 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include "reversev2.h"
|
||||
#include <securec.h>
|
||||
#include "Eigen/Core"
|
||||
|
||||
#include "cpu_kernel_utils.h"
|
||||
#include "iostream"
|
||||
#include "utils/eigen_tensor.h"
|
||||
#include "utils/kernel_util.h"
|
||||
using namespace std;
|
||||
namespace {
|
||||
const uint32_t kInputNum = 2;
|
||||
const uint32_t kOutputNum = 1;
|
||||
const char *kReverseV2 = "ReverseV2";
|
||||
} // namespace
|
||||
|
||||
namespace aicpu {
|
||||
uint32_t ReverseV2CpuKernel::Compute(CpuKernelContext &ctx) {
|
||||
int x_max_dim = 8;
|
||||
KERNEL_HANDLE_ERROR(NormalCheck(ctx, kInputNum, kOutputNum), "ReverseV2 check input and output number failed.");
|
||||
DataType axis_type = ctx.Input(1)->GetDataType();
|
||||
KERNEL_CHECK_FALSE((axis_type == DT_INT32 || axis_type == DT_INT64), KERNEL_STATUS_PARAM_INVALID,
|
||||
"The data type of [axis] need be DT_INT32 or DT_INT64.")
|
||||
auto x_shape = ctx.Input(0)->GetTensorShape();
|
||||
auto axis_shape = ctx.Input(1)->GetTensorShape();
|
||||
DataType data_type = DataType(ctx.Input(0)->GetDataType());
|
||||
std::vector<int64_t> reverse_shape;
|
||||
for (int i = 0; i < x_shape->GetDims(); i++) {
|
||||
reverse_shape.push_back(false);
|
||||
}
|
||||
// dims check
|
||||
if (x_shape->GetDims() == 0 || axis_shape->GetDims() == 0) {
|
||||
uint32_t ret = ComputeDiffType(data_type, reverse_shape, ctx);
|
||||
if (ret != KERNEL_STATUS_OK) {
|
||||
return ret;
|
||||
}
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
KERNEL_CHECK_FALSE((x_shape->GetDims() > 0 && x_shape->GetDims() <= x_max_dim), KERNEL_STATUS_PARAM_INVALID,
|
||||
"Shapes of x is not support.")
|
||||
KERNEL_CHECK_FALSE((axis_shape->GetDims() == 1), KERNEL_STATUS_PARAM_INVALID, "Shape of axis is not supported.")
|
||||
|
||||
auto input0_datasize = ctx.Input(0)->GetDataSize();
|
||||
auto output_datasize = ctx.Output(0)->GetDataSize();
|
||||
KERNEL_CHECK_FALSE((input0_datasize == output_datasize), KERNEL_STATUS_PARAM_INVALID,
|
||||
"The data size of input0 [%d] need be same with "
|
||||
"output0 [%d].",
|
||||
input0_datasize, output_datasize)
|
||||
int64_t dim = x_shape->GetDims();
|
||||
auto input_axis = reinterpret_cast<int64_t *>(ctx.Input(1)->GetData());
|
||||
int64_t axis_element = axis_shape->NumElements();
|
||||
for (int j = 0; j < axis_element; j++) {
|
||||
int64_t realdim = *(input_axis + j) < 0 ? dim + *(input_axis + j) : *(input_axis + j);
|
||||
KERNEL_CHECK_FALSE((realdim >= 0 && realdim < dim), KERNEL_STATUS_PARAM_INVALID, "[%d] is invalid", realdim)
|
||||
KERNEL_CHECK_FALSE((!reverse_shape[realdim]), KERNEL_STATUS_PARAM_INVALID, "axis [%d] specified more than once.",
|
||||
realdim)
|
||||
reverse_shape[realdim] = true;
|
||||
}
|
||||
uint32_t ret = ComputeDiffType(data_type, reverse_shape, ctx);
|
||||
if (ret != KERNEL_STATUS_OK) {
|
||||
return ret;
|
||||
}
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
|
||||
uint32_t ReverseV2CpuKernel::ComputeDiffType(DataType data_type, std::vector<int64_t> reverse_shape,
|
||||
CpuKernelContext &ctx) {
|
||||
switch (data_type) {
|
||||
case DT_FLOAT16:
|
||||
return ComputeReverseV2<Eigen::half>(reverse_shape, ctx);
|
||||
case DT_FLOAT:
|
||||
return ComputeReverseV2<float>(reverse_shape, ctx);
|
||||
case DT_DOUBLE:
|
||||
return ComputeReverseV2<double>(reverse_shape, ctx);
|
||||
case DT_UINT8:
|
||||
return ComputeReverseV2<uint8_t>(reverse_shape, ctx);
|
||||
case DT_INT8:
|
||||
return ComputeReverseV2<int8_t>(reverse_shape, ctx);
|
||||
case DT_UINT16:
|
||||
return ComputeReverseV2<uint16_t>(reverse_shape, ctx);
|
||||
case DT_INT16:
|
||||
return ComputeReverseV2<int16_t>(reverse_shape, ctx);
|
||||
case DT_INT32:
|
||||
return ComputeReverseV2<int32_t>(reverse_shape, ctx);
|
||||
case DT_INT64:
|
||||
return ComputeReverseV2<int64_t>(reverse_shape, ctx);
|
||||
case DT_BOOL:
|
||||
return ComputeReverseV2<bool>(reverse_shape, ctx);
|
||||
case DT_COMPLEX64:
|
||||
return ComputeReverseV2<std::complex<float>>(reverse_shape, ctx);
|
||||
case DT_COMPLEX128:
|
||||
return ComputeReverseV2<std::complex<double>>(reverse_shape, ctx);
|
||||
case DT_STRING:
|
||||
return ComputeReverseV2<string>(reverse_shape, ctx);
|
||||
default:
|
||||
KERNEL_LOG_ERROR("ReverseV2 invalid input type[%s]", DTypeStr(data_type).c_str());
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
uint32_t ReverseV2CpuKernel::ComputeReverseV2(std::vector<int64_t> reverse_shape, CpuKernelContext &ctx) {
|
||||
auto x_shape = ctx.Input(0)->GetTensorShape();
|
||||
auto input_data = reinterpret_cast<T *>(ctx.Input(0)->GetData());
|
||||
auto output_data = reinterpret_cast<T *>(ctx.Output(0)->GetData());
|
||||
if (x_shape->GetDims() == 0) {
|
||||
*(output_data) = *(input_data);
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
auto axis_shape = ctx.Input(1)->GetTensorShape();
|
||||
if (axis_shape->GetDims() == 0) {
|
||||
for (int i = 0; i < x_shape->NumElements(); i++) {
|
||||
*(output_data + i) = *(input_data + i);
|
||||
}
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
int64_t front = 1;
|
||||
int64_t shape_element = x_shape->NumElements();
|
||||
int64_t dim = x_shape->GetDims();
|
||||
std::vector<int64_t> dims = x_shape->GetDimSizes();
|
||||
bool redo = false;
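// For each axis flagged in reverse_shape, contiguous blocks of row_size
// elements are copied from the input back-to-front into the output; when a
// later axis also needs reversing, the partial result is first copied back
// into input_data (redo == true) so the passes compose.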
|
||||
for (int j = 0; j < dim; j++) {
|
||||
front = front * dims[j];
|
||||
if (j != dim - 1 && reverse_shape[j] == true) {
|
||||
if (redo == true) {
|
||||
auto copy_size = shape_element * sizeof(T);
|
||||
auto ret_mem = memcpy_s(input_data, copy_size, output_data, copy_size);
|
||||
KERNEL_CHECK_FALSE(ret_mem == EOK, KERNEL_STATUS_INNER_ERROR, "Memcpy failed, size = [%zu].", copy_size);
|
||||
}
|
||||
int64_t row_size = shape_element / front;
|
||||
int64_t input_forward = (dims[j] - 1) * row_size;
|
||||
int64_t save = input_forward;
|
||||
int64_t output_forward = 0;
|
||||
int64_t behind = shape_element / (front / dims[j]);
|
||||
for (int k = 0; k < front / dims[j]; k++) {
|
||||
int64_t remain = dims[j];
|
||||
while (remain > 0) {
|
||||
auto copy_size = row_size * sizeof(T);
|
||||
auto cur_output = output_data + output_forward;
|
||||
auto cur_input = input_data + input_forward;
|
||||
auto ret_mem = memcpy_s(cur_output, copy_size, cur_input, copy_size);
|
||||
KERNEL_CHECK_FALSE(ret_mem == EOK, KERNEL_STATUS_INNER_ERROR, "Memcpy size[%zu] from input to output failed.",
|
||||
copy_size);
|
||||
input_forward = input_forward - row_size;
|
||||
output_forward = output_forward + row_size;
|
||||
remain--;
|
||||
}
|
||||
save = save + behind;
|
||||
input_forward = save;
|
||||
}
|
||||
redo = true;
|
||||
} else if (j == dim - 1 && reverse_shape[j] == true) {
|
||||
if (redo == true) {
|
||||
auto copy_size = shape_element * sizeof(T);
|
||||
auto ret_mem = memcpy_s(input_data, copy_size, output_data, copy_size);
|
||||
KERNEL_CHECK_FALSE(ret_mem == EOK, KERNEL_STATUS_INNER_ERROR, "Memcpy failed, size = [%zu].", copy_size);
|
||||
}
|
||||
int64_t output_forward = 0;
|
||||
for (int k = 0; k < shape_element / dims[j]; k++) {
|
||||
for (int i = dims[j] - 1; i >= 0; i--) {
|
||||
*(output_data + output_forward) = *(input_data + i + k * dims[j]);
|
||||
output_forward++;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
REGISTER_CPU_KERNEL(kReverseV2, ReverseV2CpuKernel);
|
||||
} // namespace aicpu
|
|
@@ -0,0 +1,39 @@
|
|||
/**
|
||||
* Copyright 2021 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#ifndef AICPU_KERNELS_NORMALIZED_REVERSEV2_H_
|
||||
#define AICPU_KERNELS_NORMALIZED_REVERSEV2_H_
|
||||
|
||||
#include "cpu_ops_kernel.h"
|
||||
#include "utils/bcast.h"
|
||||
|
||||
namespace aicpu {
|
||||
class ReverseV2CpuKernel : public CpuKernel {
|
||||
public:
|
||||
ReverseV2CpuKernel() = default;
|
||||
~ReverseV2CpuKernel() override = default;
|
||||
|
||||
protected:
|
||||
uint32_t Compute(CpuKernelContext &ctx) override;
|
||||
|
||||
private:
|
||||
uint32_t ComputeDiffType(DataType data_type, std::vector<int64_t> reverse_shape, CpuKernelContext &ctx);
|
||||
|
||||
template <typename T>
|
||||
uint32_t ComputeReverseV2(std::vector<int64_t> reverse_shape, CpuKernelContext &ctx);
|
||||
};
|
||||
} // namespace aicpu
|
||||
#endif
|
|
@@ -0,0 +1,161 @@
|
|||
/**
|
||||
* Copyright 2021 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
#include "rgb_to_hsv.h"
|
||||
|
||||
#include <iostream>
|
||||
|
||||
#include "cpu_kernel_utils.h"
|
||||
#include "utils/eigen_tensor.h"
|
||||
#include "utils/kernel_util.h"
|
||||
|
||||
namespace {
|
||||
constexpr size_t kInputShapeRank = 3;
|
||||
constexpr size_t kOutputShapeRank = 3;
|
||||
constexpr int64_t kImageChannels = 3;
|
||||
const char *kInputStr = "input";
|
||||
const char *kOutputStr = "output";
|
||||
const char *kRGBToHSV = "RGBToHSV";
|
||||
|
||||
} // namespace
|
||||
|
||||
namespace aicpu {
|
||||
|
||||
const std::map<std::string, RGBToHSVCpuKernel::KernelFunction> RGBToHSVCpuKernel::kernels_ = {
|
||||
{"(DT_FLOAT16,DT_FLOAT16)", &RGBToHSVCpuKernel::DoCompute<Eigen::half, Eigen::half>},
|
||||
{"(DT_FLOAT,DT_FLOAT)", &RGBToHSVCpuKernel::DoCompute<float, float>},
|
||||
{"(DT_DOUBLE,DT_DOUBLE)", &RGBToHSVCpuKernel::DoCompute<double, double>}};
|
||||
|
||||
const std::vector<std::string> RGBToHSVCpuKernel::kernels_name_ = {"(DT_FLOAT16,DT_FLOAT16)", "(DT_FLOAT,DT_FLOAT)",
|
||||
"(DT_DOUBLE,DT_DOUBLE)"};
|
||||
|
||||
template <typename T1, typename T2>
|
||||
uint32_t RGBToHSVCpuKernel::DoCompute(CpuKernelContext &ctx) {
|
||||
Tensor *input_tensor = ctx.Input(0);
|
||||
Tensor *output_tensor = ctx.Output(0);
|
||||
auto input_shape = input_tensor->GetTensorShape()->GetDimSizes();
|
||||
int64_t input0_elements_nums = input_tensor->NumElements();
|
||||
auto output_shape = output_tensor->GetTensorShape()->GetDimSizes();
|
||||
auto input_data = reinterpret_cast<T1 *>(ctx.Input(0)->GetData());
|
||||
auto out = reinterpret_cast<T2 *>(ctx.Output(0)->GetData());
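// The loop below applies the usual per-pixel RGB -> HSV mapping:
// V = max(R, G, B); S = (V - min(R, G, B)) / V (0 when V <= 0);
// H is chosen by which channel equals V and wrapped into [0, 1).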
|
||||
|
||||
for (int64_t i = 0; i < input0_elements_nums; i = i + 3) {
|
||||
auto t_red = *(input_data + i);
|
||||
auto t_green = *(input_data + i + 1);
|
||||
auto t_blue = *(input_data + i + 2);
|
||||
auto t_value = std::max(std::max(t_red, t_blue), t_green);
|
||||
auto t_minimum = std::min(std::min(t_red, t_blue), t_green);
|
||||
auto range = t_value - t_minimum;
|
||||
auto t_saturation = t_value > static_cast<T1>(0) ? (range / t_value) : static_cast<T1>(0);
|
||||
auto norm = static_cast<T1>(1.0) / static_cast<T1>(6.0) / range;
|
||||
auto t_hue = t_green == t_value ? (norm * (t_blue - t_red) + static_cast<T1>(2.0) / static_cast<T1>(6.0))
|
||||
: (norm * (t_red - t_green) + static_cast<T1>(4.0) / static_cast<T1>(6.0));
|
||||
t_hue = t_red == t_value ? (norm * (t_green - t_blue)) : t_hue;
|
||||
t_hue = range > static_cast<T1>(0) ? t_hue : static_cast<T1>(0);
|
||||
t_hue = t_hue < static_cast<T1>(0) ? (t_hue + static_cast<T1>(1)) : t_hue;
|
||||
*(out + i) = t_hue;
|
||||
*(out + i + 1) = t_saturation;
|
||||
*(out + i + 2) = t_value;
|
||||
}
|
||||
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
|
||||
uint32_t RGBToHSVCpuKernel::CheckParam(CpuKernelContext &ctx, const std::string &in_or_out, uint32_t index,
|
||||
size_t rank) {
|
||||
Tensor *param = nullptr;
|
||||
if (in_or_out == kInputStr) {
|
||||
param = ctx.Input(index);
|
||||
} else if (in_or_out == kOutputStr) {
|
||||
param = ctx.Output(index);
|
||||
}
|
||||
std::string err_header = ConcatString(kRGBToHSV, " op ", in_or_out, "[", index, "]");
|
||||
|
||||
KERNEL_CHECK_NULLPTR(param, KERNEL_STATUS_PARAM_INVALID, "%s tensor is nullptr.", err_header.c_str());
|
||||
|
||||
auto param_shape = param->GetTensorShape();
|
||||
KERNEL_CHECK_NULLPTR(param_shape, KERNEL_STATUS_PARAM_INVALID, "%s tensor shape is nullptr.", err_header.c_str());
|
||||
auto param_dim_sizes = param_shape->GetDimSizes();
|
||||
if (param_dim_sizes.size() < 1) {
|
||||
KERNEL_LOG_ERROR("%s shape rank must be at least 1, but got shape[%zu].", err_header.c_str(),
|
||||
VectorToString(param_dim_sizes).c_str());
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
|
||||
if (param->GetData() == nullptr) {
|
||||
KERNEL_CHECK_NULLPTR(param, KERNEL_STATUS_PARAM_INVALID, "%s tensor data is nullptr.", err_header.c_str());
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
|
||||
uint32_t RGBToHSVCpuKernel::CheckShapes(CpuKernelContext &ctx) {
|
||||
auto input0_shape = ctx.Input(kFirstInputIndex)->GetTensorShape()->GetDimSizes();
|
||||
if (input0_shape.back() != kImageChannels) {
|
||||
KERNEL_LOG_ERROR(
|
||||
"%s op input[0] shape last dim should be [%d], but got "
|
||||
"shape[%s].",
|
||||
kRGBToHSV, kImageChannels, VectorToString(input0_shape).c_str());
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
|
||||
uint32_t RGBToHSVCpuKernel::CheckParams(CpuKernelContext &ctx) {
|
||||
auto ret = CheckParam(ctx, kInputStr, kFirstInputIndex, kInputShapeRank);
|
||||
if (ret != KERNEL_STATUS_OK) {
|
||||
return ret;
|
||||
}
|
||||
|
||||
ret = CheckShapes(ctx);
|
||||
if (ret != KERNEL_STATUS_OK) {
|
||||
return ret;
|
||||
}
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
|
||||
uint32_t RGBToHSVCpuKernel::Compute(CpuKernelContext &ctx) {
|
||||
auto input0 = ctx.Input(kFirstInputIndex);
|
||||
KERNEL_CHECK_NULLPTR(input0, KERNEL_STATUS_PARAM_INVALID, "%s input[0] tensor is nullptr.", kRGBToHSV);
|
||||
DataType input0_data_type = input0->GetDataType();
|
||||
KERNEL_LOG_DEBUG("%s op input[0] data type is [%s].", kRGBToHSV, DTypeStr(input0_data_type).c_str());
|
||||
|
||||
auto output = ctx.Output(kFirstOutputIndex);
|
||||
KERNEL_CHECK_NULLPTR(output, KERNEL_STATUS_PARAM_INVALID, "%s output[0] tensor is nullptr.", kRGBToHSV);
|
||||
DataType output_data_type = output->GetDataType();
|
||||
KERNEL_LOG_DEBUG("%s op output[0] data type is [%s].", kRGBToHSV, DTypeStr(output_data_type).c_str());
|
||||
|
||||
std::string kernel_name = ConcatString("(", DTypeStr(input0_data_type), ",", DTypeStr(output_data_type), ")");
|
||||
|
||||
auto it = kernels_.find(kernel_name);
|
||||
if (it != kernels_.end()) {
|
||||
auto ret = CheckParams(ctx);
|
||||
if (ret != KERNEL_STATUS_OK) {
|
||||
return ret;
|
||||
}
|
||||
auto kernel = it->second;
|
||||
ret = kernel(ctx);
|
||||
KERNEL_LOG_DEBUG("%s op end.", kRGBToHSV);
|
||||
return ret;
|
||||
}
|
||||
|
||||
KERNEL_LOG_ERROR("%s op only support data type [%s], but got [%s].", kRGBToHSV, VectorToString(kernels_name_).c_str(),
|
||||
kernel_name.c_str());
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
|
||||
REGISTER_CPU_KERNEL(kRGBToHSV, RGBToHSVCpuKernel);
|
||||
} // namespace aicpu
|
|
@@ -0,0 +1,51 @@
|
|||
/**
|
||||
* Copyright 2021 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#ifndef AICPU_KERNELS_NORMALIZED_RGBToHSV_H_
|
||||
#define AICPU_KERNELS_NORMALIZED_RGBToHSV_H_
|
||||
|
||||
#include <map>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
#include "cpu_ops_kernel.h"
|
||||
|
||||
namespace aicpu {
|
||||
class RGBToHSVCpuKernel : public CpuKernel {
|
||||
public:
|
||||
RGBToHSVCpuKernel() = default;
|
||||
|
||||
~RGBToHSVCpuKernel() = default;
|
||||
|
||||
uint32_t Compute(CpuKernelContext &ctx) override;
|
||||
|
||||
private:
|
||||
template <typename TInput, typename TOutput>
|
||||
static uint32_t DoCompute(CpuKernelContext &ctx);
|
||||
|
||||
uint32_t CheckParams(CpuKernelContext &ctx);
|
||||
|
||||
uint32_t CheckParam(CpuKernelContext &ctx, const std::string &in_or_out, uint32_t index, size_t rank);
|
||||
|
||||
uint32_t CheckShapes(CpuKernelContext &ctx);
|
||||
|
||||
private:
|
||||
using KernelFunction = uint32_t (*)(CpuKernelContext &ctx);
|
||||
static const std::map<std::string, KernelFunction> kernels_;
|
||||
static const std::vector<std::string> kernels_name_;
|
||||
};
|
||||
} // namespace aicpu
|
||||
#endif
|
|
@@ -0,0 +1,163 @@
|
|||
/**
|
||||
* Copyright 2021 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include "rsqrt_grad.h"
|
||||
|
||||
#include <algorithm>
|
||||
#include <complex>
|
||||
#include <iostream>
|
||||
|
||||
#include "utils/eigen_tensor.h"
|
||||
|
||||
namespace {
|
||||
const char *kRsqrtGrad = "RsqrtGrad";
|
||||
constexpr uint32_t kOutputNum = 1;
|
||||
constexpr uint32_t kInputNum = 2;
|
||||
} // namespace
|
||||
|
||||
namespace aicpu {
|
||||
uint32_t RsqrtGradCpuKernel::Compute(CpuKernelContext &ctx) {
|
||||
if (NormalCheck(ctx, kInputNum, kOutputNum) != KERNEL_STATUS_OK) {
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
Tensor *input_0 = ctx.Input(kFirstInputIndex);
|
||||
Tensor *input_1 = ctx.Input(kSecondInputIndex);
|
||||
if ((input_0->GetDataSize() == 0) || (input_1->GetDataSize() == 0)) {
|
||||
KERNEL_LOG_INFO("[%s] Input is empty tensor.", ctx.GetOpType().c_str());
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
// choose compute function depend on dataType
|
||||
auto data_type = ctx.Input(0)->GetDataType();
|
||||
switch (data_type) {
|
||||
case DT_FLOAT16:
|
||||
return RsqrtGradComputeFP16<Eigen::half>(ctx);
|
||||
case DT_FLOAT:
|
||||
return RsqrtGradCompute<float>(ctx);
|
||||
case DT_DOUBLE:
|
||||
return RsqrtGradCompute<double>(ctx);
|
||||
case DT_INT8:
|
||||
return RsqrtGradCompute<int8_t>(ctx);
|
||||
case DT_INT32:
|
||||
return RsqrtGradCompute<int32_t>(ctx);
|
||||
case DT_COMPLEX128:
|
||||
return RsqrtGradComputeComplex<std::complex<double>>(ctx);
|
||||
case DT_COMPLEX64:
|
||||
return RsqrtGradComputeComplex<std::complex<float>>(ctx);
|
||||
default:
|
||||
KERNEL_LOG_ERROR("[%s] Data type of input is not support, input data type is [%s].", ctx.GetOpType().c_str(),
|
||||
aicpu::DTypeStr(data_type).c_str());
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
}
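// All of the element-wise paths below implement the same backward rule for
// y = rsqrt(x): dy/dx = -y^3 / 2, so each output element is computed as
// dy * y^3 / (-2), with conj(y) used in the complex variant.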
|
||||
|
||||
template <typename T>
|
||||
uint32_t RsqrtGradCpuKernel::RsqrtGradComputeFP16(CpuKernelContext &ctx) {
|
||||
Tensor *y = ctx.Input(0);
|
||||
Tensor *dy = ctx.Input(1);
|
||||
Tensor *z = ctx.Output(0);
|
||||
auto y_ptr = reinterpret_cast<T *>(y->GetData());
|
||||
auto dy_ptr = reinterpret_cast<T *>(dy->GetData());
|
||||
auto z_ptr = reinterpret_cast<T *>(z->GetData());
|
||||
int32_t input_0_num = y->GetTensorShape()->NumElements();
|
||||
int32_t input_1_num = dy->GetTensorShape()->NumElements();
|
||||
|
||||
if (input_0_num >= input_1_num) {
|
||||
for (int32_t i = 0; i < input_1_num; i++) {
|
||||
z_ptr[i] =
|
||||
static_cast<T>((static_cast<double>(y_ptr[i]) * static_cast<double>(y_ptr[i]) * static_cast<double>(y_ptr[i])) *
|
||||
(static_cast<double>(dy_ptr[i]) / (static_cast<double>(-2))));
|
||||
}
|
||||
for (int32_t i = input_1_num; i < input_0_num; i++) {
|
||||
z_ptr[i] = (T)(0);
|
||||
}
|
||||
} else {
|
||||
for (int32_t i = 0; i < input_0_num; i++) {
|
||||
z_ptr[i] =
|
||||
static_cast<T>((static_cast<double>(y_ptr[i]) * static_cast<double>(y_ptr[i]) * static_cast<double>(y_ptr[i])) *
|
||||
(static_cast<double>(dy_ptr[i]) / (static_cast<double>(-2))));
|
||||
}
|
||||
}
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
uint32_t RsqrtGradCpuKernel::RsqrtGradCompute(CpuKernelContext &ctx) {
|
||||
Tensor *y = ctx.Input(0);
|
||||
Tensor *dy = ctx.Input(1);
|
||||
Tensor *z = ctx.Output(0);
|
||||
|
||||
KERNEL_CHECK_NULLPTR(z->GetData(), KERNEL_STATUS_PARAM_INVALID, "[%s] Get output data failed",
|
||||
ctx.GetOpType().c_str())
|
||||
KERNEL_LOG_INFO(
|
||||
"[%s] Input[0] data size is [%llu], input[1] data size is [%llu], output "
|
||||
"data size is [%llu].",
|
||||
ctx.GetOpType().c_str(), y->GetDataSize(), dy->GetDataSize(), z->GetDataSize());
|
||||
auto y_ptr = reinterpret_cast<T *>(y->GetData());
|
||||
auto dy_ptr = reinterpret_cast<T *>(dy->GetData());
|
||||
auto z_ptr = reinterpret_cast<T *>(z->GetData());
|
||||
int32_t input_0_num = y->GetTensorShape()->NumElements();
|
||||
int32_t input_1_num = dy->GetTensorShape()->NumElements();
|
||||
|
||||
if (input_0_num >= input_1_num) {
|
||||
for (int32_t i = 0; i < input_1_num; i++) {
|
||||
z_ptr[i] = (dy_ptr[i] * y_ptr[i] * y_ptr[i] * y_ptr[i]) / (static_cast<T>(-2));
|
||||
}
|
||||
for (int32_t i = input_1_num; i < input_0_num; i++) {
|
||||
z_ptr[i] = (T)(0);
|
||||
}
|
||||
} else {
|
||||
for (int32_t i = 0; i < input_0_num; i++) {
|
||||
z_ptr[i] = (dy_ptr[i] * y_ptr[i] * y_ptr[i] * y_ptr[i]) / (static_cast<T>(-2));
|
||||
}
|
||||
}
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
uint32_t RsqrtGradCpuKernel::RsqrtGradComputeComplex(CpuKernelContext &ctx) {
|
||||
Tensor *y = ctx.Input(0);
|
||||
Tensor *dy = ctx.Input(1);
|
||||
Tensor *z = ctx.Output(0);
|
||||
|
||||
KERNEL_CHECK_NULLPTR(z->GetData(), KERNEL_STATUS_PARAM_INVALID, "[%s] Get output data failed",
|
||||
ctx.GetOpType().c_str())
|
||||
KERNEL_LOG_INFO(
|
||||
"[%s] Input[0] data size is [%llu], input[1] data size is [%llu], output "
|
||||
"data size is [%llu].",
|
||||
ctx.GetOpType().c_str(), y->GetDataSize(), dy->GetDataSize(), z->GetDataSize());
|
||||
auto y_ptr = reinterpret_cast<T *>(y->GetData());
|
||||
auto dy_ptr = reinterpret_cast<T *>(dy->GetData());
|
||||
auto z_ptr = reinterpret_cast<T *>(z->GetData());
|
||||
int32_t input_0_num = y->GetTensorShape()->NumElements();
|
||||
int32_t input_1_num = dy->GetTensorShape()->NumElements();
|
||||
if (input_0_num >= input_1_num) {
|
||||
for (int32_t i = 0; i < input_1_num; i++) {
|
||||
z_ptr[i] = (dy_ptr[i] * conj(y_ptr[i]) * conj(y_ptr[i]) * conj(y_ptr[i])) * (static_cast<T>(-0.5));
|
||||
}
|
||||
for (int32_t i = input_1_num; i < input_0_num; i++) {
|
||||
z_ptr[i] = static_cast<T>(0);
|
||||
}
|
||||
} else {
|
||||
for (int32_t i = 0; i < input_0_num; i++) {
|
||||
z_ptr[i] = (dy_ptr[i] * conj(y_ptr[i]) * conj(y_ptr[i]) * conj(y_ptr[i])) * (static_cast<T>(-0.5));
|
||||
}
|
||||
}
|
||||
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
|
||||
REGISTER_CPU_KERNEL(kRsqrtGrad, RsqrtGradCpuKernel);
|
||||
} // namespace aicpu
|
|
@@ -0,0 +1,48 @@
|
|||
/**
|
||||
* Copyright 2021 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#ifndef AICPU_KERNELS_NORMALIZED_RSQRT_GRAD_H_
#define AICPU_KERNELS_NORMALIZED_RSQRT_GRAD_H_
|
||||
#define EIGEN_USE_THREADS
|
||||
#define EIGEN_USE_SIMPLE_THREAD_POOL
|
||||
|
||||
#include "cpu_ops_kernel.h"
|
||||
#include "cpu_types.h"
|
||||
#include "utils/bcast.h"
|
||||
#include "cpu_kernel_utils.h"
|
||||
#include "utils/eigen_tensor.h"
|
||||
#include "utils/kernel_util.h"
|
||||
#include <Eigen/Dense>
|
||||
|
||||
namespace aicpu {
|
||||
class RsqrtGradCpuKernel : public CpuKernel {
|
||||
public:
|
||||
RsqrtGradCpuKernel() = default;
|
||||
~RsqrtGradCpuKernel() = default;
|
||||
uint32_t Compute(CpuKernelContext &ctx) override;
|
||||
|
||||
private:
|
||||
template <typename T>
|
||||
uint32_t RsqrtGradCompute(CpuKernelContext &ctx);
|
||||
|
||||
template <typename T>
|
||||
uint32_t RsqrtGradComputeComplex(CpuKernelContext &ctx);
|
||||
|
||||
template <typename T>
|
||||
uint32_t RsqrtGradComputeFP16(CpuKernelContext &ctx);
|
||||
};
|
||||
} // namespace aicpu
|
||||
#endif  // AICPU_KERNELS_NORMALIZED_RSQRT_GRAD_H_
|
|
@@ -0,0 +1,421 @@
|
|||
/**
|
||||
* Copyright 2021 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
#include "sample_distorted_bounding_box_ext2.h"
|
||||
|
||||
#include <random>
|
||||
#include "cpu_kernel_utils.h"
|
||||
#include "utils/eigen_tensor.h"
|
||||
#include "utils/kernel_util.h"
|
||||
|
||||
namespace {
|
||||
const uint32_t kOutputNum = 3;
|
||||
const uint32_t kInputNum = 3;
|
||||
const char *kSDBBExt2 = "SampleDistortedBoundingBoxExt2";
|
||||
|
||||
#define SDBBExt2CpuKernel_COMPUTE_CASE(DTYPE, TYPE, CTX) \
|
||||
case (DTYPE): { \
|
||||
uint32_t result = SDBBExt2Compute<TYPE>(CTX); \
|
||||
if (result != KERNEL_STATUS_OK) { \
|
||||
KERNEL_LOG_ERROR("SampleDistortedBoundingBoxExt2 kernel compute failed."); \
|
||||
return result; \
|
||||
} \
|
||||
break; \
|
||||
}
|
||||
} // namespace
|
||||
|
||||
namespace aicpu {
|
||||
uint64_t SDBBExt2CpuKernel::New64() {
|
||||
std::random_device device("/dev/urandom");
|
||||
static std::mt19937_64 rng = std::mt19937_64(device());
|
||||
return (rng)();
|
||||
}
|
||||
|
||||
void SDBBExt2CpuKernel::InitPhiloxRandom(int64_t seed, int64_t seed2) {
|
||||
if (seed == 0 && seed2 == 0) {
|
||||
seed = New64();
|
||||
seed2 = New64();
|
||||
}
|
||||
generator_ = PhiloxRandom(seed, seed2);
|
||||
}
|
||||
|
||||
float SDBBExt2CpuKernel::RandFloat() {
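// Pair 23 random mantissa bits with exponent 127 to build a float in [1, 2), then subtract 1.0f to get a uniform value in [0, 1).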
|
||||
uint32_t x = GenerateSingle();
|
||||
const uint32_t man = x & 0x7fffffu; // 23 bit mantissa
|
||||
const uint32_t exp = static_cast<uint32_t>(127);
|
||||
const uint32_t val = (exp << 23) | man;
|
||||
|
||||
float result;
|
||||
memcpy(&result, &val, sizeof(val));
|
||||
return result - 1.0f;
|
||||
}
|
||||
|
||||
uint32_t SDBBExt2CpuKernel::Uniform(uint32_t n) {
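// Unbiased draw from [0, n): powers of two are masked directly; other values reject samples below the remainder bound before taking the modulus.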
|
||||
if (n == 0) {
|
||||
return GenerateSingle() * n;
|
||||
} else if (0 == (n & (n - 1))) {
|
||||
return GenerateSingle() & (n - 1);
|
||||
} else {
|
||||
const uint32_t range = ~static_cast<uint32_t>(0);
|
||||
const uint32_t rem = (range % n) + 1;
|
||||
uint32_t rnd;
|
||||
do {
|
||||
rnd = GenerateSingle();
|
||||
} while (rnd < rem);
|
||||
return rnd % n;
|
||||
}
|
||||
}
|
||||
|
||||
SDBBExt2CpuKernel::ResultElementType SDBBExt2CpuKernel::GenerateSingle() {
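// Philox yields kResultElementCount values per invocation; cache one block and hand the elements out one at a time.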
|
||||
if (used_result_index_ == PhiloxRandom::kResultElementCount) {
|
||||
unused_results_ = generator_();
|
||||
used_result_index_ = 0;
|
||||
}
|
||||
return unused_results_[used_result_index_++];
|
||||
}
|
||||
|
||||
bool SDBBExt2CpuKernel::SatisfiesOverlapConstraints(const Rectangle &crop, float minimum_object_covered,
|
||||
const std::vector<Rectangle> &bounding_boxes) {
|
||||
const float kMinArea = 1.0;
|
||||
if (crop.Area() < kMinArea) {
|
||||
return false;
|
||||
}
|
||||
|
||||
bool is_object_covered = false;
|
||||
for (const auto &bbox : bounding_boxes) {
|
||||
const float object_area = bbox.Area();
|
||||
if (object_area < kMinArea) {
|
||||
continue;
|
||||
}
|
||||
|
||||
if (object_area == 0) {
|
||||
continue;
|
||||
}
|
||||
const float object_covered = crop.Intersect(bbox).Area() / object_area;
|
||||
if (object_covered >= minimum_object_covered) {
|
||||
is_object_covered = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
return is_object_covered;
|
||||
}
|
||||
|
||||
bool SDBBExt2CpuKernel::GenerateRandomCrop(int original_width, int original_height, float min_relative_crop_area,
|
||||
float max_relative_crop_area, float aspect_ratio, Rectangle *crop_rect) {
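// Sample a crop height whose area lies in [min, max] * image_area at the requested aspect ratio, derive the width,
// nudge the pair back inside the area bounds, then place the crop at a random offset within the image.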
|
||||
if (max_relative_crop_area <= 0.0 || aspect_ratio <= 0.0 || original_width <= 0 || original_height <= 0 ||
|
||||
min_relative_crop_area > max_relative_crop_area) {
|
||||
return false;
|
||||
}
|
||||
|
||||
const float min_area = min_relative_crop_area * original_width * original_height;
|
||||
const float max_area = max_relative_crop_area * original_width * original_height;
|
||||
|
||||
if (aspect_ratio == 0) {
|
||||
return false;
|
||||
}
|
||||
int height = static_cast<int>(lrintf(std::sqrt(min_area / aspect_ratio)));
|
||||
if (aspect_ratio == 0) {
|
||||
return false;
|
||||
}
|
||||
int max_height = static_cast<int>(lrintf(std::sqrt(max_area / aspect_ratio)));
|
||||
if (lrintf(max_height * aspect_ratio) > original_width) {
|
||||
const float kEps = 0.0000001;
|
||||
const float kBias = 0.5;
|
||||
if (aspect_ratio == 0) {
|
||||
return false;
|
||||
}
|
||||
max_height = static_cast<int>((original_width + kBias - kEps) / aspect_ratio);
|
||||
if (lrintf(max_height * aspect_ratio) > original_width) {
|
||||
max_height -= 1;
|
||||
}
|
||||
}
|
||||
|
||||
if (max_height > original_height) {
|
||||
max_height = original_height;
|
||||
}
|
||||
|
||||
if (height >= max_height) {
|
||||
height = max_height;
|
||||
}
|
||||
|
||||
if (height < max_height) {
|
||||
height += Uniform(max_height - height + 1);
|
||||
}
|
||||
int width = static_cast<int>(lrintf(height * aspect_ratio));
|
||||
float area = static_cast<float>(width * height);
|
||||
if (area < min_area) {
|
||||
height += 1;
|
||||
width = static_cast<int>(lrintf(height * aspect_ratio));
|
||||
area = width * height;
|
||||
}
|
||||
|
||||
if (area > max_area) {
|
||||
height -= 1;
|
||||
width = static_cast<int>(lrintf(height * aspect_ratio));
|
||||
area = width * height;
|
||||
}
|
||||
|
||||
if (area < min_area || area > max_area || width > original_width || height > original_height || width <= 0 ||
|
||||
height <= 0) {
|
||||
return false;
|
||||
}
|
||||
|
||||
int y = 0;
|
||||
if (height < original_height) {
|
||||
y = Uniform(original_height - height);
|
||||
}
|
||||
int x = 0;
|
||||
if (width < original_width) {
|
||||
x = Uniform(original_width - width);
|
||||
}
|
||||
|
||||
crop_rect->min_x_ = x;
|
||||
crop_rect->min_y_ = y;
|
||||
crop_rect->max_x_ = x + width;
|
||||
crop_rect->max_y_ = y + height;
|
||||
return true;
|
||||
}
|
||||
|
||||
uint32_t SDBBExt2CpuKernel::Compute(CpuKernelContext &ctx) {
|
||||
// check params
|
||||
KERNEL_HANDLE_ERROR(NormalCheck(ctx, kInputNum, kOutputNum),
|
||||
"SampleDistortedBoundingBoxExt2 check input and output number failed.");
|
||||
KERNEL_HANDLE_ERROR(SDBBExt2Check(ctx), "SampleDistortedBoundingBoxExt2 check params or bcast failed.");
|
||||
auto data_type = ctx.Input(0)->GetDataType();
|
||||
switch (data_type) {
|
||||
SDBBExt2CpuKernel_COMPUTE_CASE(DT_UINT8, uint8_t, ctx)
SDBBExt2CpuKernel_COMPUTE_CASE(DT_INT8, int8_t, ctx)
SDBBExt2CpuKernel_COMPUTE_CASE(DT_INT16, int16_t, ctx)
SDBBExt2CpuKernel_COMPUTE_CASE(DT_INT32, int32_t, ctx)
SDBBExt2CpuKernel_COMPUTE_CASE(DT_INT64, int64_t, ctx)
default:
KERNEL_LOG_ERROR("SampleDistortedBoundingBoxExt2 kernel data type [%s] not support.", DTypeStr(data_type).c_str());
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
|
||||
uint32_t SDBBExt2CpuKernel::SDBBExt2Check(CpuKernelContext &ctx) {
|
||||
auto image_size = ctx.Input(0);
|
||||
auto bounding_boxes = ctx.Input(1);
|
||||
auto min_object_covered = ctx.Input(2);
|
||||
auto begin = ctx.Output(0);
|
||||
auto size = ctx.Output(1);
|
||||
auto bboxes = ctx.Output(2);
|
||||
KERNEL_CHECK_NULLPTR(image_size->GetData(), KERNEL_STATUS_PARAM_INVALID, "Get input 0 data failed.")
|
||||
KERNEL_CHECK_NULLPTR(bounding_boxes->GetData(), KERNEL_STATUS_PARAM_INVALID, "Get input 1 data failed.")
|
||||
KERNEL_CHECK_NULLPTR(min_object_covered->GetData(), KERNEL_STATUS_PARAM_INVALID, "Get input 2 data failed.")
|
||||
KERNEL_CHECK_NULLPTR(begin->GetData(), KERNEL_STATUS_PARAM_INVALID, "Get output 0 data failed")
|
||||
KERNEL_CHECK_NULLPTR(size->GetData(), KERNEL_STATUS_PARAM_INVALID, "Get output 1 data failed")
|
||||
KERNEL_CHECK_NULLPTR(bboxes->GetData(), KERNEL_STATUS_PARAM_INVALID, "Get output 2 data failed")
|
||||
|
||||
auto attr_seed = ctx.GetAttr("seed");
|
||||
KERNEL_CHECK_NULLPTR(attr_seed, KERNEL_STATUS_PARAM_INVALID, "Get seed attr failed.")
|
||||
seed = attr_seed->GetInt();
|
||||
|
||||
auto attr_seed2 = ctx.GetAttr("seed2");
|
||||
KERNEL_CHECK_NULLPTR(attr_seed2, KERNEL_STATUS_PARAM_INVALID, "Get seed2 attr failed.")
|
||||
seed2 = attr_seed2->GetInt();
|
||||
|
||||
auto attr_aspect_ratio_range = ctx.GetAttr("aspect_ratio_range");
|
||||
KERNEL_CHECK_NULLPTR(attr_aspect_ratio_range, KERNEL_STATUS_PARAM_INVALID, "Get aspect_ratio_range attr failed.")
|
||||
aspect_ratio_range = attr_aspect_ratio_range->GetListFloat();
|
||||
|
||||
auto attr_area_range = ctx.GetAttr("area_range");
|
||||
KERNEL_CHECK_NULLPTR(attr_area_range, KERNEL_STATUS_PARAM_INVALID, "Get area_range attr failed.")
|
||||
area_range = attr_area_range->GetListFloat();
|
||||
|
||||
auto attr_max_attempts = ctx.GetAttr("max_attempts");
|
||||
KERNEL_CHECK_NULLPTR(attr_max_attempts, KERNEL_STATUS_PARAM_INVALID, "Get max_attempts attr failed.")
|
||||
max_attempts = attr_max_attempts->GetInt();
|
||||
|
||||
auto attr_use_image_if_no_bounding_boxes = ctx.GetAttr("use_image_if_no_bounding_boxes");
|
||||
KERNEL_CHECK_NULLPTR(attr_use_image_if_no_bounding_boxes, KERNEL_STATUS_PARAM_INVALID,
|
||||
"Get use_image_if_no_bounding_boxes attr failed.")
|
||||
use_image_if_no_bounding_boxes = attr_use_image_if_no_bounding_boxes->GetBool();
|
||||
|
||||
KERNEL_CHECK_NULLPTR(image_size->GetTensorShape(), KERNEL_STATUS_PARAM_INVALID, "Get input image_size shape failed.")
|
||||
KERNEL_CHECK_NULLPTR(bounding_boxes->GetTensorShape(), KERNEL_STATUS_PARAM_INVALID,
|
||||
"Get input bounding_boxes shape failed.")
|
||||
KERNEL_CHECK_NULLPTR(min_object_covered->GetTensorShape(), KERNEL_STATUS_PARAM_INVALID,
|
||||
"Get input min_object_covered shape failed.")
|
||||
|
||||
std::vector<int64_t> shape_image_size = image_size->GetTensorShape()->GetDimSizes();
|
||||
std::vector<int64_t> shape_bounding_boxes = bounding_boxes->GetTensorShape()->GetDimSizes();
|
||||
|
||||
KERNEL_CHECK_FALSE((shape_image_size.size() == 1), KERNEL_STATUS_PARAM_INVALID,
|
||||
"image_size must be 1-dimensional, got: [%d].", shape_image_size.size())
|
||||
const int image_size_num = 3;
|
||||
KERNEL_CHECK_FALSE((shape_image_size.at(0) == image_size_num), KERNEL_STATUS_PARAM_INVALID,
"image_size must contain 3 elements, got: [%d].", shape_image_size.at(0))
|
||||
|
||||
const int shape_bounding_boxes_size = 3;
|
||||
KERNEL_CHECK_FALSE((shape_bounding_boxes.size() == shape_bounding_boxes_size), KERNEL_STATUS_PARAM_INVALID,
|
||||
"input boxes must be 3-dimensional [batch, num_boxes, "
|
||||
"coords], got: [%d].",
|
||||
shape_bounding_boxes.size())
|
||||
const int bounding_boxes_size = 4;
|
||||
KERNEL_CHECK_FALSE((shape_bounding_boxes.at(shape_bounding_boxes.size() - 1) == bounding_boxes_size),
|
||||
KERNEL_STATUS_PARAM_INVALID, "bounding boxes must have shape [4], got: [%d].",
|
||||
shape_bounding_boxes.at(shape_bounding_boxes.size() - 1))
|
||||
|
||||
const int aspect_ratio_range_size = 2;
|
||||
KERNEL_CHECK_FALSE((aspect_ratio_range.size() == aspect_ratio_range_size), KERNEL_STATUS_PARAM_INVALID,
|
||||
"Aspect ratio range field must specify 2 dimensions.")
|
||||
KERNEL_CHECK_FALSE((aspect_ratio_range[0] > 0 && aspect_ratio_range[1] > 0), KERNEL_STATUS_PARAM_INVALID,
|
||||
"Aspect ratio range must be positive: [%f], [%f].", aspect_ratio_range[0], aspect_ratio_range[1])
|
||||
|
||||
const int area_range_size = 2;
|
||||
KERNEL_CHECK_FALSE((area_range.size() == area_range_size), KERNEL_STATUS_PARAM_INVALID,
|
||||
"Area range field must specify 2 dimensions.")
|
||||
KERNEL_CHECK_FALSE((area_range[0] > 0 && area_range[1] > 0), KERNEL_STATUS_PARAM_INVALID,
|
||||
"Area range must be positive: [%f], [%f].", area_range[0], area_range[1])
|
||||
KERNEL_CHECK_FALSE((area_range[0] <= 1 && area_range[1] <= 1), KERNEL_STATUS_PARAM_INVALID,
|
||||
"Area range must be less then or equal to 1.0: [%f], [%f].", area_range[0], area_range[1])
|
||||
|
||||
KERNEL_CHECK_FALSE((max_attempts > 0), KERNEL_STATUS_PARAM_INVALID, "Max attempts must be positive: [%d]",
|
||||
max_attempts)
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
uint32_t SDBBExt2CpuKernel::SDBBExt2Compute(CpuKernelContext &ctx) {
|
||||
auto image_size = reinterpret_cast<T *>(ctx.Input(0)->GetData());
|
||||
auto bounding_boxes = reinterpret_cast<float *>(ctx.Input(1)->GetData());
|
||||
auto min_object_covered = reinterpret_cast<float *>(ctx.Input(2)->GetData());
|
||||
auto begin = reinterpret_cast<T *>(ctx.Output(0)->GetData());
|
||||
auto size = reinterpret_cast<T *>(ctx.Output(1)->GetData());
|
||||
auto bboxes = reinterpret_cast<float *>(ctx.Output(2)->GetData());
|
||||
|
||||
const int32_t height = static_cast<int32_t>(image_size[0]);
|
||||
const int32_t width = static_cast<int32_t>(image_size[1]);
|
||||
if (!(height > 0 && width > 0)) {
|
||||
KERNEL_LOG_ERROR("Image height and width must be positive, got: [%d] and [%d]", height, width);
|
||||
return KERNEL_STATUS_INNER_ERROR;
|
||||
}
|
||||
float min_object_covered_val = *min_object_covered;
|
||||
if (min_object_covered_val < 0.0 || min_object_covered_val > 1.0) {
|
||||
KERNEL_LOG_ERROR("min_object_covered must be in [0.0, 1.0], got: [%f]", min_object_covered_val);
|
||||
return KERNEL_STATUS_INNER_ERROR;
|
||||
}
|
||||
const int index_y_min = 0;
|
||||
const int index_x_min = 1;
|
||||
const int index_y_max = 2;
|
||||
const int index_x_max = 3;
|
||||
const int kBBoxSize = 4;
|
||||
std::vector<Rectangle> boxes;
|
||||
int64_t size_bounding_boxes = ctx.Input(1)->NumElements();
|
||||
if (size_bounding_boxes > 0) {
|
||||
for (int b = 0; b < size_bounding_boxes / kBBoxSize; ++b) {
|
||||
if (!(bounding_boxes[b * kBBoxSize + index_x_min] < bounding_boxes[b * kBBoxSize + index_x_max])) {
|
||||
KERNEL_LOG_ERROR("x_min must be less than x_max, got: [%f] and [%f]",
|
||||
bounding_boxes[b * kBBoxSize + index_x_min], bounding_boxes[b * kBBoxSize + index_x_max]);
|
||||
return KERNEL_STATUS_INNER_ERROR;
|
||||
}
|
||||
if (!(bounding_boxes[b * kBBoxSize + index_y_min] < bounding_boxes[b * kBBoxSize + index_y_max])) {
|
||||
KERNEL_LOG_ERROR("y_min must be less than y_max, got: [%f] and [%f]",
|
||||
bounding_boxes[b * kBBoxSize + index_y_min], bounding_boxes[b * kBBoxSize + index_y_max]);
|
||||
return KERNEL_STATUS_INNER_ERROR;
|
||||
}
|
||||
for (int i = 0; i < kBBoxSize; ++i) {
|
||||
if (bounding_boxes[b * kBBoxSize + i] < 0.0 || bounding_boxes[b * kBBoxSize + i] > 1.0) {
|
||||
KERNEL_LOG_ERROR("All bounding box coordinates must be in [0.0, 1.0], got: [%f]",
|
||||
bounding_boxes[b * kBBoxSize + i]);
|
||||
return KERNEL_STATUS_INNER_ERROR;
|
||||
}
|
||||
}
|
||||
const int32_t x_min = static_cast<int32_t>(bounding_boxes[b * kBBoxSize + index_x_min] * width);
|
||||
const int32_t y_min = static_cast<int32_t>(bounding_boxes[b * kBBoxSize + index_y_min] * height);
|
||||
const int32_t x_max = static_cast<int32_t>(bounding_boxes[b * kBBoxSize + index_x_max] * width);
|
||||
const int32_t y_max = static_cast<int32_t>(bounding_boxes[b * kBBoxSize + index_y_max] * height);
|
||||
boxes.push_back(Rectangle(x_min, y_min, x_max, y_max));
|
||||
}
|
||||
}
|
||||
|
||||
const Rectangle image_rect(0, 0, width, height);
|
||||
if (boxes.empty()) {
|
||||
if (!use_image_if_no_bounding_boxes) {
|
||||
KERNEL_LOG_ERROR(
|
||||
"No bounding boxes provided as input. One must "
|
||||
"enable use_image_if_no_bounding_boxes if you wish "
|
||||
"to not provide any bounding boxes.");
|
||||
return KERNEL_STATUS_INNER_ERROR;
|
||||
}
|
||||
|
||||
boxes.push_back(image_rect);
|
||||
}
|
||||
|
||||
const float min_sample_area = area_range[0];
|
||||
const float max_sample_area = area_range[1];
|
||||
const float min_sample_aspect_ratio = aspect_ratio_range[0];
|
||||
const float max_sample_aspect_ratio = aspect_ratio_range[1];
|
||||
|
||||
InitPhiloxRandom(seed, seed2);
|
||||
|
||||
Rectangle crop_rect;
|
||||
bool sample_generated = false;
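// Try up to max_attempts random crops and keep the first one that also meets the minimum object coverage; otherwise fall back to the whole image.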
|
||||
for (int i = 0; i < max_attempts; ++i) {
|
||||
const float sample_aspect_ratio =
|
||||
RandFloat() * (max_sample_aspect_ratio - min_sample_aspect_ratio) + min_sample_aspect_ratio;
|
||||
if (GenerateRandomCrop(width, height, min_sample_area, max_sample_area, sample_aspect_ratio, &crop_rect)) {
|
||||
if (SatisfiesOverlapConstraints(crop_rect, min_object_covered_val, boxes)) {
|
||||
sample_generated = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (!sample_generated) {
|
||||
crop_rect = image_rect;
|
||||
}
|
||||
|
||||
// Determine the cropping parameters from the bounding box.
|
||||
const int target_width = crop_rect.max_x_ - crop_rect.min_x_;
|
||||
const int target_height = crop_rect.max_y_ - crop_rect.min_y_;
|
||||
const int offset_width = crop_rect.min_x_;
|
||||
const int offset_height = crop_rect.min_y_;
|
||||
|
||||
if (width < target_width + offset_width) {
|
||||
KERNEL_LOG_ERROR("width must be >= target_width + offset_width: [%d] vs [%d] + [%d]", width, target_width,
|
||||
offset_width);
|
||||
return KERNEL_STATUS_INNER_ERROR;
|
||||
}
|
||||
|
||||
if (height < target_height + offset_height) {
|
||||
KERNEL_LOG_ERROR("height must be >= target_height + offset_height: [%d] vs [%d] + [%d]", height, target_height,
|
||||
offset_height);
|
||||
return KERNEL_STATUS_INNER_ERROR;
|
||||
}
|
||||
|
||||
begin[0] = static_cast<T>(offset_height);
|
||||
size[0] = static_cast<T>(target_height);
|
||||
begin[1] = static_cast<T>(offset_width);
|
||||
size[1] = static_cast<T>(target_width);
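// Also report the sampled window as normalized [y_min, x_min, y_max, x_max] coordinates relative to the image.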
|
||||
|
||||
bboxes[index_y_min] = static_cast<float>(crop_rect.min_y_) / static_cast<float>(height);
|
||||
bboxes[index_x_min] = static_cast<float>(crop_rect.min_x_) / static_cast<float>(width);
|
||||
bboxes[index_y_max] = static_cast<float>(crop_rect.max_y_) / static_cast<float>(height);
|
||||
bboxes[index_x_max] = static_cast<float>(crop_rect.max_x_) / static_cast<float>(width);
|
||||
|
||||
// Retain all of the channels.
|
||||
const int32_t begin_channels = 3;
|
||||
const int32_t size_channels = 3;
|
||||
begin[begin_channels - 1] = static_cast<T>(0);
|
||||
size[size_channels - 1] = static_cast<T>(-1);
|
||||
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
|
||||
REGISTER_CPU_KERNEL(kSDBBExt2, SDBBExt2CpuKernel);
|
||||
} // namespace aicpu
|
|
@@ -0,0 +1,101 @@
|
|||
/**
|
||||
* Copyright 2021 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
#ifndef AICPU_KERNELS_NORMALIZED_SAMPLE_DISTORTED_BOUNDING_BOX_EXT2_H_
|
||||
#define AICPU_KERNELS_NORMALIZED_SAMPLE_DISTORTED_BOUNDING_BOX_EXT2_H_
|
||||
|
||||
#include "cpu_ops_kernel.h"
|
||||
#include "utils/philox_random.h"
|
||||
|
||||
class Rectangle {
|
||||
public:
|
||||
Rectangle() { Set(0, 0, 0, 0); }
|
||||
Rectangle(int xmin, int ymin, int xmax, int ymax) { Set(xmin, ymin, xmax, ymax); }
|
||||
|
||||
void Set(int xmin, int ymin, int xmax, int ymax) {
|
||||
min_x_ = xmin;
|
||||
min_y_ = ymin;
|
||||
max_x_ = xmax;
|
||||
max_y_ = ymax;
|
||||
}
|
||||
|
||||
bool IsEmpty() const { return min_x_ > max_x_ || min_y_ > max_y_; }
|
||||
float Area() const { return static_cast<float>((max_x_ - min_x_) * (max_y_ - min_y_)); }
|
||||
|
||||
Rectangle Intersect(const Rectangle &r) const {
|
||||
const int pmin_x = std::max(min_x_, r.min_x_);
|
||||
const int pmin_y = std::max(min_y_, r.min_y_);
|
||||
const int pmax_x = std::min(max_x_, r.max_x_);
|
||||
const int pmax_y = std::min(max_y_, r.max_y_);
|
||||
if (pmin_x > pmax_x || pmin_y > pmax_y) {
|
||||
return Rectangle();
|
||||
} else {
|
||||
return Rectangle(pmin_x, pmin_y, pmax_x, pmax_y);
|
||||
}
|
||||
}
|
||||
|
||||
int min_x_;
|
||||
int min_y_;
|
||||
int max_x_;
|
||||
int max_y_;
|
||||
};
|
||||
|
||||
namespace aicpu {
|
||||
class SDBBExt2CpuKernel : public CpuKernel {
|
||||
public:
|
||||
SDBBExt2CpuKernel() = default;
|
||||
~SDBBExt2CpuKernel() override = default;
|
||||
|
||||
static const int kResultTypeNum = 4;
|
||||
static const int kKeyNum = 2;
|
||||
using ResultType = Array<uint32_t, kResultTypeNum>;
|
||||
using ResultElementType = uint32_t;
|
||||
using Key = Array<uint32_t, kKeyNum>;
|
||||
|
||||
protected:
|
||||
uint32_t Compute(CpuKernelContext &ctx) override;
|
||||
|
||||
private:
|
||||
int seed;
|
||||
int seed2;
|
||||
std::vector<float> aspect_ratio_range;
|
||||
std::vector<float> area_range;
|
||||
int max_attempts;
|
||||
bool use_image_if_no_bounding_boxes;
|
||||
|
||||
PhiloxRandom generator_;
|
||||
|
||||
float RandFloat();
|
||||
uint32_t Uniform(uint32_t n);
|
||||
|
||||
uint64_t New64();
|
||||
void InitPhiloxRandom(int64_t seed, int64_t seed2);
|
||||
ResultType unused_results_;
|
||||
int used_result_index_ = PhiloxRandom::kResultElementCount;
|
||||
ResultElementType GenerateSingle();
|
||||
|
||||
// Image
|
||||
bool SatisfiesOverlapConstraints(const Rectangle &crop, float minimum_object_covered,
|
||||
const std::vector<Rectangle> &bounding_boxes);
|
||||
bool GenerateRandomCrop(int original_width, int original_height, float min_relative_crop_area,
|
||||
float max_relative_crop_area, float aspect_ratio, Rectangle *crop_rect);
|
||||
|
||||
uint32_t SDBBExt2Check(CpuKernelContext &ctx);
|
||||
|
||||
template <typename T>
|
||||
uint32_t SDBBExt2Compute(CpuKernelContext &ctx);
|
||||
};
|
||||
} // namespace aicpu
|
||||
#endif
|
|
@@ -0,0 +1,196 @@
|
|||
/**
|
||||
* Copyright 2021 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include "scatter_nd.h"
|
||||
|
||||
#include <complex>
|
||||
|
||||
#include "eigen_tensor.h"
|
||||
#include "utils/kernel_util.h"
|
||||
|
||||
namespace {
|
||||
const uint32_t kInputNum = 3;
|
||||
const uint32_t kOutputNum = 1;
|
||||
const char *kScatterNd = "ScatterNd";
|
||||
} // namespace
|
||||
|
||||
namespace aicpu {
|
||||
uint32_t ScatterNdCpuKernel::Compute(CpuKernelContext &ctx) {
|
||||
KERNEL_HANDLE_ERROR(NormalCheck(ctx, kInputNum, kOutputNum), "Check ScatterNd Input and Output failed.");
|
||||
|
||||
Tensor *input_indices = ctx.Input(0);
|
||||
Tensor *input_x = ctx.Input(1);
|
||||
Tensor *input_shape = ctx.Input(2);
|
||||
|
||||
auto shape_x = input_x->GetTensorShape();
|
||||
auto shape_indices = input_indices->GetTensorShape();
|
||||
auto shape_shape = input_shape->GetTensorShape();
|
||||
int64_t indices_shape_m = shape_indices->GetDimSize(shape_indices->GetDims() - 1);
|
||||
|
||||
if (shape_x->GetDims() < 1) {
|
||||
KERNEL_LOG_ERROR("[%s] Tensor input_x's rank less than 1.", ctx.GetOpType().c_str());
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
if (shape_indices->GetDims() < 1) {
|
||||
KERNEL_LOG_ERROR("[%s] Tensor input_indices's rank less than 1.", ctx.GetOpType().c_str());
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
if (shape_shape->GetDims() < 1) {
|
||||
KERNEL_LOG_ERROR("[%s] Tensor input_shape's rank less than 1.", ctx.GetOpType().c_str());
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
|
||||
if (indices_shape_m > shape_shape->NumElements()) {
|
||||
KERNEL_LOG_ERROR("[%s] Tensor input_shape&input_indices ranks mismatch.", ctx.GetOpType().c_str());
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
|
||||
for (int64_t i = 0; i < shape_indices->GetDims() - 1; i++) {
|
||||
if (shape_indices->GetDimSize(i) != shape_x->GetDimSize(i)) {
|
||||
KERNEL_LOG_ERROR("[%s], shape_indices and shape_updates mismatch.", ctx.GetOpType().c_str());
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
}
|
||||
|
||||
auto data_type_x = input_x->GetDataType();
|
||||
auto data_type_indices = input_indices->GetDataType();
|
||||
auto data_type_shape = input_shape->GetDataType();
|
||||
if (data_type_shape != DT_INT32 && data_type_shape != DT_INT64) {
|
||||
KERNEL_LOG_ERROR("ScatterNd kernel data type [%s] not support.", DTypeStr(data_type_shape).c_str());
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
if (data_type_indices != DT_INT32 && data_type_indices != DT_INT64) {
|
||||
KERNEL_LOG_ERROR("ScatterNd kernel data type [%s] not support.", DTypeStr(data_type_indices).c_str());
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
if (data_type_indices != data_type_shape) {
|
||||
KERNEL_LOG_ERROR("Indices and shape must have the same type.");
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
|
||||
switch (data_type_x) {
|
||||
case DT_INT8:
|
||||
return DTYPE_CHOOSE<int8_t>(ctx);
|
||||
case DT_INT16:
|
||||
return DTYPE_CHOOSE<int16_t>(ctx);
|
||||
case DT_INT32:
|
||||
return DTYPE_CHOOSE<int32_t>(ctx);
|
||||
case DT_INT64:
|
||||
return DTYPE_CHOOSE<int64_t>(ctx);
|
||||
case DT_UINT8:
|
||||
return DTYPE_CHOOSE<uint8_t>(ctx);
|
||||
case DT_UINT16:
|
||||
return DTYPE_CHOOSE<uint16_t>(ctx);
|
||||
case DT_UINT32:
|
||||
return DTYPE_CHOOSE<uint32_t>(ctx);
|
||||
case DT_UINT64:
|
||||
return DTYPE_CHOOSE<uint64_t>(ctx);
|
||||
case DT_FLOAT16:
|
||||
return DTYPE_CHOOSE<Eigen::half>(ctx);
|
||||
case DT_FLOAT:
|
||||
return DTYPE_CHOOSE<float>(ctx);
|
||||
case DT_DOUBLE:
|
||||
return DTYPE_CHOOSE<double>(ctx);
|
||||
case DT_COMPLEX64:
|
||||
return DTYPE_CHOOSE<std::complex<float>>(ctx);
|
||||
case DT_COMPLEX128:
|
||||
return DTYPE_CHOOSE<std::complex<double>>(ctx);
|
||||
default:
|
||||
KERNEL_LOG_ERROR("ScatterNd kernel data type [%s] not support.", DTypeStr(data_type_x).c_str());
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
}
|
||||
|
||||
template <typename data_type_x>
|
||||
uint32_t ScatterNdCpuKernel::DTYPE_CHOOSE(CpuKernelContext &ctx) {
|
||||
auto indices_type = static_cast<DataType>(ctx.Input(0)->GetDataType());
|
||||
switch (indices_type) {
|
||||
case DT_INT32:
|
||||
return ScatterNdComputeRealKernel<int32_t, data_type_x>(ctx);
|
||||
case DT_INT64:
|
||||
return ScatterNdComputeRealKernel<int64_t, data_type_x>(ctx);
|
||||
default:
|
||||
KERNEL_LOG_ERROR("[%s] Data type of input is not supported, input data type is [%s].", ctx.GetOpType().c_str(),
|
||||
DTypeStr(indices_type).c_str());
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
|
||||
template <typename indices_type, typename data_type_x>
|
||||
uint32_t ScatterNdCpuKernel::ScatterNdComputeRealKernel(CpuKernelContext &ctx) {
|
||||
int64_t n_slices = 1;
|
||||
int64_t slice_size = 1;
|
||||
|
||||
const int64_t outer_dims = ctx.Input(0)->GetTensorShape()->GetDims() - 1;
|
||||
const int64_t indices_nd = ctx.Input(0)->GetTensorShape()->GetDimSize(outer_dims);
|
||||
const int64_t updates_dims = ctx.Input(1)->GetTensorShape()->GetDims();
|
||||
|
||||
auto shape_indices = ctx.Input(0)->GetTensorShape();
|
||||
auto data_shape = reinterpret_cast<indices_type *>(ctx.Input(2)->GetData());
|
||||
auto dims_shape = ctx.Input(2)->GetTensorShape()->NumElements();
|
||||
auto updates_shape = ctx.Input(1)->GetTensorShape();
|
||||
for (int64_t i = 0; i < dims_shape - indices_nd; i++) {
|
||||
if (updates_shape->GetDimSize(i + shape_indices->GetDims() - 1) != data_shape[i + indices_nd]) {
|
||||
KERNEL_LOG_ERROR("[%s], shape_indices and shape_updates mismatch.", ctx.GetOpType().c_str());
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
}
|
||||
|
||||
for (int64_t i = 0; i < outer_dims; ++i) {
|
||||
n_slices *= ctx.Input(0)->GetTensorShape()->GetDimSize(i);
|
||||
}
|
||||
for (int64_t i = outer_dims; i < updates_dims; ++i) {
|
||||
slice_size *= ctx.Input(1)->GetTensorShape()->GetDimSize(i);
|
||||
}
|
||||
const int kNumberInputTwo = 2;
|
||||
int64_t output_flat_size = 1;
|
||||
int64_t num_shape = ctx.Input(kNumberInputTwo)->NumElements();
|
||||
for (int64_t i = 0; i < num_shape; i++) {
|
||||
output_flat_size *= data_shape[i];
|
||||
}
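// dims_to_count[j] is the flat stride of output dimension j: an N-d index maps to a flat offset via sum(idx[j] * dims_to_count[j]).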
|
||||
int64_t remain_flat_size = output_flat_size;
|
||||
std::vector<int64_t> dims_to_count(indices_nd, 0);
|
||||
for (int64_t i = 0; i < indices_nd; ++i) {
|
||||
dims_to_count[i] = remain_flat_size / data_shape[i];
|
||||
remain_flat_size = dims_to_count[i];
|
||||
}
|
||||
|
||||
auto Indices_data = reinterpret_cast<indices_type *>(ctx.Input(0)->GetData());
|
||||
auto Updates_data = reinterpret_cast<data_type_x *>(ctx.Input(1)->GetData());
|
||||
auto Output_data = reinterpret_cast<data_type_x *>(ctx.Output(0)->GetData());
|
||||
|
||||
memset(Output_data, 0, sizeof(data_type_x) * output_flat_size);
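// The output starts at zero and every indexed slice accumulates its update, so duplicate indices add together.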
|
||||
for (int64_t i = 0; i < n_slices; ++i) {
|
||||
int64_t to_pos = 0;
|
||||
for (int64_t j = 0; j < indices_nd; ++j) {
|
||||
int64_t idx = Indices_data[i * indices_nd + j];
|
||||
|
||||
if (idx < 0 || idx >= data_shape[j]) {
|
||||
KERNEL_LOG_ERROR("The indices[%d] is so big or small", idx);
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
|
||||
to_pos += idx * dims_to_count[j];
|
||||
}
|
||||
for (int64_t j = 0; j < slice_size; j++) {
|
||||
Output_data[to_pos + j] += Updates_data[i * slice_size + j];
|
||||
}
|
||||
}
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
REGISTER_CPU_KERNEL(kScatterNd, ScatterNdCpuKernel);
|
||||
} // namespace aicpu
|
|
@@ -0,0 +1,41 @@
|
|||
/**
|
||||
* Copyright 2021 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#ifndef AICPU_KERNELS_NORMALIZED_SCATTERND_H_
|
||||
#define AICPU_KERNELS_NORMALIZED_SCATTERND_H_
|
||||
|
||||
#include <string.h>
|
||||
|
||||
#include "cpu_ops_kernel.h"
|
||||
#include "cpu_types.h"
|
||||
#include "utils/bcast.h"
|
||||
|
||||
namespace aicpu {
|
||||
class ScatterNdCpuKernel : public CpuKernel {
|
||||
public:
|
||||
ScatterNdCpuKernel() = default;
|
||||
~ScatterNdCpuKernel() override = default;
|
||||
uint32_t Compute(CpuKernelContext &ctx) override;
|
||||
|
||||
private:
|
||||
template <typename data_type0>
|
||||
uint32_t DTYPE_CHOOSE(CpuKernelContext &ctx);
|
||||
|
||||
template <typename indices_type, typename data_type0>
|
||||
uint32_t ScatterNdComputeRealKernel(CpuKernelContext &ctx);
|
||||
};
|
||||
} // namespace aicpu
|
||||
#endif
|
|
@@ -0,0 +1,211 @@
|
|||
/**
|
||||
* Copyright 2021 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include "scatter_nd_update.h"
|
||||
|
||||
#include <string.h>
|
||||
|
||||
#include <algorithm>
|
||||
#include <complex>
|
||||
#include <iostream>
|
||||
#include <map>
|
||||
|
||||
#include "eigen_tensor.h"
|
||||
#include "utils/kernel_util.h"
|
||||
|
||||
namespace {
|
||||
const uint32_t kInputNum = 3;
|
||||
const uint32_t kOutputNum = 1;
|
||||
const char *kScatterNdUpdate = "ScatterNdUpdate";
|
||||
} // namespace
|
||||
|
||||
namespace aicpu {
|
||||
uint32_t ScatterNdUpdateCpuKernel::Compute(CpuKernelContext &ctx) {
|
||||
KERNEL_HANDLE_ERROR(NormalCheck(ctx, kInputNum, kOutputNum), "Check ScatterNdUpdate Input and Output failed.");
|
||||
|
||||
Tensor *input_var = ctx.Input(0);
|
||||
Tensor *input_indices = ctx.Input(1);
|
||||
Tensor *input_updates = ctx.Input(2);
|
||||
|
||||
auto shape_var = input_var->GetTensorShape();
|
||||
auto shape_indices = input_indices->GetTensorShape();
|
||||
auto shape_updates = input_updates->GetTensorShape();
|
||||
|
||||
if (shape_var->GetDims() < 1) {
|
||||
KERNEL_LOG_ERROR("[%s] Tensor input_var's rank less than 1.", ctx.GetOpType().c_str());
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
if (shape_indices->GetDims() < 2) {
|
||||
KERNEL_LOG_ERROR("[%s] Tensor input_indices's rank less than 2.", ctx.GetOpType().c_str());
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
if (shape_updates->GetDims() < 1) {
|
||||
KERNEL_LOG_ERROR("[%s] Tensor input_updates's rank less than 1.", ctx.GetOpType().c_str());
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
|
||||
auto index_size = shape_indices->GetDims() - 1;
|
||||
auto index_depth = shape_indices->GetDimSize(index_size);
|
||||
|
||||
if (index_depth > shape_var->GetDims()) {
|
||||
KERNEL_LOG_ERROR("[%s] Tensor input_var&input_indices ranks mismatch.", ctx.GetOpType().c_str());
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
|
||||
std::vector<int64_t> batch_shape;
|
||||
for (int64_t i = 0; i < index_size; ++i) {
|
||||
batch_shape.push_back(shape_indices->GetDimSize(i));
|
||||
}
|
||||
|
||||
for (int64_t i = index_depth; i <= shape_var->GetDims() - 1; ++i) {
|
||||
batch_shape.push_back(shape_var->GetDimSize(i));
|
||||
}
|
||||
|
||||
if (batch_shape != shape_updates->GetDimSizes()) {
|
||||
KERNEL_LOG_ERROR("[%s] Tensor indices's & updates' and var's shape are dismatch .", ctx.GetOpType().c_str());
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
|
||||
for (int64_t i = 0; i < index_size; i++) {
|
||||
if (shape_indices->GetDimSize(i) != shape_updates->GetDimSize(i)) {
|
||||
KERNEL_LOG_ERROR("[%s], Tensor indices and updates should have the same batch number.", ctx.GetOpType().c_str());
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
}
|
||||
|
||||
auto data_type_var = input_var->GetDataType();
|
||||
auto data_type_indices = input_indices->GetDataType();
|
||||
|
||||
if (data_type_indices != DT_INT32 && data_type_indices != DT_INT64) {
|
||||
KERNEL_LOG_ERROR("ScatterNdUpdate kernel data type [%s] not support.", DTypeStr(data_type_indices).c_str());
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
|
||||
switch (data_type_var) {
|
||||
case DT_INT8:
|
||||
return DTYPE_CHOOSE<int8_t>(ctx);
|
||||
case DT_INT16:
|
||||
return DTYPE_CHOOSE<int16_t>(ctx);
|
||||
case DT_INT32:
|
||||
return DTYPE_CHOOSE<int32_t>(ctx);
|
||||
case DT_INT64:
|
||||
return DTYPE_CHOOSE<int64_t>(ctx);
|
||||
case DT_UINT8:
|
||||
return DTYPE_CHOOSE<uint8_t>(ctx);
|
||||
case DT_UINT16:
|
||||
return DTYPE_CHOOSE<uint16_t>(ctx);
|
||||
case DT_UINT32:
|
||||
return DTYPE_CHOOSE<uint32_t>(ctx);
|
||||
case DT_UINT64:
|
||||
return DTYPE_CHOOSE<uint64_t>(ctx);
|
||||
case DT_FLOAT16:
|
||||
return DTYPE_CHOOSE<Eigen::half>(ctx);
|
||||
case DT_FLOAT:
|
||||
return DTYPE_CHOOSE<float>(ctx);
|
||||
case DT_DOUBLE:
|
||||
return DTYPE_CHOOSE<double>(ctx);
|
||||
case DT_COMPLEX64:
|
||||
return DTYPE_CHOOSE<std::complex<float>>(ctx);
|
||||
case DT_COMPLEX128:
|
||||
return DTYPE_CHOOSE<std::complex<double>>(ctx);
|
||||
default:
|
||||
KERNEL_LOG_ERROR("ScatterNdUpdate kernel data type [%s] not support.", DTypeStr(data_type_var).c_str());
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
|
||||
template <typename var_type>
|
||||
uint32_t ScatterNdUpdateCpuKernel::DTYPE_CHOOSE(CpuKernelContext &ctx) {
|
||||
auto indices_type = static_cast<DataType>(ctx.Input(1)->GetDataType());
|
||||
switch (indices_type) {
|
||||
case DT_INT32:
|
||||
return ScatterNdUpdateComputeRealKernel<var_type, int32_t>(ctx);
|
||||
case DT_INT64:
|
||||
return ScatterNdUpdateComputeRealKernel<var_type, int64_t>(ctx);
|
||||
default:
|
||||
KERNEL_LOG_ERROR("[%s] Data type of input is not supported, input data type is [%s].", ctx.GetOpType().c_str(),
|
||||
DTypeStr(indices_type).c_str());
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
}
|
||||
|
||||
template <typename var_type, typename indices_type>
|
||||
uint32_t ScatterNdUpdateCpuKernel::ScatterNdUpdateComputeRealKernel(CpuKernelContext &ctx) {
|
||||
int64_t n_slices = 1;
|
||||
int64_t slice_size = 1;
|
||||
|
||||
const int64_t indices_dims = ctx.Input(1)->GetTensorShape()->GetDims() - 1;
|
||||
const int64_t indices_nd = ctx.Input(1)->GetTensorShape()->GetDimSize(indices_dims);
|
||||
const int64_t updates_dims = ctx.Input(2)->GetTensorShape()->GetDims();
|
||||
|
||||
auto shape_var = ctx.Input(0)->GetTensorShape()->GetDimSizes();
|
||||
auto shape_indices = ctx.Input(1)->GetTensorShape();
|
||||
auto dims_shape = ctx.Input(0)->GetTensorShape()->GetDims();
|
||||
for (int64_t i = 0; i < dims_shape - indices_nd; i++) {
|
||||
if (ctx.Input(2)->GetTensorShape()->GetDimSize(i + shape_indices->GetDims() - 1) != shape_var[i + indices_nd]) {
|
||||
KERNEL_LOG_ERROR("[%s] shape_indices and shape_updates mismatch.", ctx.GetOpType().c_str());
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
}
|
||||
|
||||
for (int64_t i = 0; i < indices_dims; ++i) {
|
||||
n_slices *= ctx.Input(1)->GetTensorShape()->GetDimSize(i);
|
||||
}
|
||||
for (int i = indices_dims; i < updates_dims; ++i) {
|
||||
slice_size *= ctx.Input(2)->GetTensorShape()->GetDimSize(i);
|
||||
}
|
||||
|
||||
const int64_t var_flat_size = ctx.Input(0)->GetTensorShape()->NumElements();
|
||||
std::vector<int64_t> output_shape = ctx.Input(0)->GetTensorShape()->GetDimSizes();
|
||||
|
||||
int64_t remain_flat_size = var_flat_size;
|
||||
std::vector<int64_t> dims_to_count(indices_nd, 0);
|
||||
for (int64_t i = 0; i < indices_nd; ++i) {
|
||||
dims_to_count[i] = remain_flat_size / output_shape[i];
|
||||
remain_flat_size = dims_to_count[i];
|
||||
}
|
||||
|
||||
auto Var_data = reinterpret_cast<var_type *>(ctx.Input(0)->GetData());
|
||||
auto Indices_data = reinterpret_cast<indices_type *>(ctx.Input(1)->GetData());
|
||||
auto Updates_data = reinterpret_cast<var_type *>(ctx.Input(2)->GetData());
|
||||
auto Output_data = reinterpret_cast<var_type *>(ctx.Output(0)->GetData());
|
||||
|
||||
for (int64_t i = 0; i < var_flat_size; ++i) {
|
||||
Output_data[i] = Var_data[i];
|
||||
}
|
||||
for (int64_t i = 0; i < n_slices; ++i) {
|
||||
int64_t to_pos = 0;
|
||||
for (int64_t j = 0; j < indices_nd; ++j) {
|
||||
int64_t idx = Indices_data[i * indices_nd + j];
|
||||
|
||||
if (idx < 0 || idx >= output_shape[j]) {
|
||||
KERNEL_LOG_ERROR("The indices[%d] is so big or small", idx);
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
|
||||
to_pos += idx * dims_to_count[j];
|
||||
}
|
||||
for (int64_t j = 0; j < slice_size; j++) {
|
||||
Output_data[to_pos + j] = Updates_data[i * slice_size + j];
|
||||
}
|
||||
}
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
REGISTER_CPU_KERNEL(kScatterNdUpdate, ScatterNdUpdateCpuKernel);
|
||||
|
||||
} // namespace aicpu
|
|
@@ -0,0 +1,40 @@
|
|||
/**
|
||||
* Copyright 2021 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#ifndef AICPU_KERNELS_NORMALIZED_SCATTERNDUPDATE_H_
|
||||
#define AICPU_KERNELS_NORMALIZED_SCATTERNDUPDATE_H_
|
||||
|
||||
#include "cpu_ops_kernel.h"
|
||||
#include "cpu_types.h"
|
||||
#include "utils/bcast.h"
|
||||
#include <string.h>
|
||||
|
||||
namespace aicpu {
|
||||
class ScatterNdUpdateCpuKernel : public CpuKernel {
|
||||
public:
|
||||
ScatterNdUpdateCpuKernel() = default;
|
||||
~ScatterNdUpdateCpuKernel() override = default;
|
||||
uint32_t Compute(CpuKernelContext &ctx) override;
|
||||
|
||||
private:
|
||||
template <typename var_type>
|
||||
uint32_t DTYPE_CHOOSE(CpuKernelContext &ctx);
|
||||
|
||||
template <typename var_type, typename indices_type>
|
||||
uint32_t ScatterNdUpdateComputeRealKernel(CpuKernelContext &ctx);
|
||||
};
|
||||
} // namespace aicpu
|
||||
#endif
|
|
@@ -0,0 +1,151 @@
|
|||
/**
|
||||
* Copyright 2021 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include "select.h"
|
||||
|
||||
#include "cpu_kernel_utils.h"
|
||||
#include "utils/broadcast_iterator.h"
|
||||
#include "utils/eigen_tensor.h"
|
||||
#include "utils/kernel_util.h"
|
||||
|
||||
namespace {
|
||||
const uint32_t kOutputNum = 1;
|
||||
const uint32_t kInputNum = 3;
|
||||
const char *kSelect = "Select";
|
||||
|
||||
#define SELECT_COMPUTE_CASE(DTYPE, TYPE, CTX) \
|
||||
case (DTYPE): { \
|
||||
uint32_t result = SelectCompute<TYPE>(CTX); \
|
||||
if (result != KERNEL_STATUS_OK) { \
|
||||
KERNEL_LOG_ERROR("Select kernel compute failed."); \
|
||||
return result; \
|
||||
} \
|
||||
break; \
|
||||
}
|
||||
} // namespace
|
||||
|
||||
namespace aicpu {
|
||||
uint32_t SelectCpuKernel::Compute(CpuKernelContext &ctx) {
|
||||
// check params
|
||||
KERNEL_HANDLE_ERROR(NormalCheck(ctx, kInputNum, kOutputNum), "Select check input and output number failed.");
|
||||
KERNEL_HANDLE_ERROR(SelectParamCheck(ctx), "Select check params failed.");
|
||||
auto data_type = ctx.Input(1)->GetDataType();
|
||||
switch (data_type) {
|
||||
SELECT_COMPUTE_CASE(DT_INT8, int8_t, ctx)
|
||||
SELECT_COMPUTE_CASE(DT_INT16, int16_t, ctx)
|
||||
SELECT_COMPUTE_CASE(DT_INT32, int32_t, ctx)
|
||||
SELECT_COMPUTE_CASE(DT_INT64, int64_t, ctx)
|
||||
SELECT_COMPUTE_CASE(DT_UINT8, uint8_t, ctx)
|
||||
SELECT_COMPUTE_CASE(DT_UINT16, uint16_t, ctx)
|
||||
SELECT_COMPUTE_CASE(DT_UINT32, uint32_t, ctx)
|
||||
SELECT_COMPUTE_CASE(DT_UINT64, uint64_t, ctx)
|
||||
SELECT_COMPUTE_CASE(DT_BOOL, bool, ctx)
|
||||
SELECT_COMPUTE_CASE(DT_FLOAT16, Eigen::half, ctx)
|
||||
SELECT_COMPUTE_CASE(DT_FLOAT, float, ctx)
|
||||
SELECT_COMPUTE_CASE(DT_DOUBLE, double, ctx)
|
||||
SELECT_COMPUTE_CASE(DT_COMPLEX128, std::complex<double>, ctx)
SELECT_COMPUTE_CASE(DT_COMPLEX64, std::complex<float>, ctx)
|
||||
default:
|
||||
KERNEL_LOG_ERROR("Select kernel data type [%s] not support.", DTypeStr(data_type).c_str());
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
|
||||
uint32_t SelectCpuKernel::SelectParamCheck(CpuKernelContext &ctx) {
|
||||
// the non null of input_0, input_1, output has been verified in NormalCheck
|
||||
Tensor *input_0 = ctx.Input(0);
|
||||
Tensor *input_1 = ctx.Input(1);
|
||||
Tensor *input_2 = ctx.Input(2);
|
||||
Tensor *output = ctx.Output(0);
|
||||
DataType input0_type = input_0->GetDataType();
|
||||
DataType input1_type = input_1->GetDataType();
|
||||
DataType input2_type = input_2->GetDataType();
|
||||
|
||||
auto input_shape_a = ctx.Input(1)->GetTensorShape()->GetDimSizes();
|
||||
auto input_shape_b = ctx.Input(2)->GetTensorShape()->GetDimSizes();
|
||||
|
||||
if (input0_type != DT_BOOL) {
|
||||
KERNEL_LOG_ERROR("[%s] Data type of mask requires bool, but got data type [%s].", ctx.GetOpType().c_str(),
|
||||
DTypeStr(input0_type).c_str());
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
|
||||
KERNEL_CHECK_FALSE((input1_type == input2_type), KERNEL_STATUS_PARAM_INVALID,
|
||||
"The data type of input1 [%s] need be same with "
|
||||
"input2 [%s].",
|
||||
DTypeStr(input1_type).c_str(), DTypeStr(input2_type).c_str())
|
||||
|
||||
if (input_shape_a != input_shape_b) {
|
||||
KERNEL_LOG_ERROR("The shape of X1 must equal X2.");
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
KERNEL_LOG_DEBUG(
|
||||
"SelectCpuKernel[%s], input0: size[%llu];"
|
||||
"input1: size[%llu], input2: size[%llu], output: size[%llu].",
|
||||
ctx.GetOpType().c_str(), input_0->GetDataSize(), input_1->GetDataSize(), input_2->GetDataSize(),
|
||||
output->GetDataSize());
|
||||
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
uint32_t SelectCpuKernel::SelectCompute(CpuKernelContext &ctx) {
|
||||
bool *condition = static_cast<bool *>(ctx.Input(0)->GetData());
|
||||
T *x1 = static_cast<T *>(ctx.Input(1)->GetData());
|
||||
T *x2 = static_cast<T *>(ctx.Input(2)->GetData());
|
||||
T *y = static_cast<T *>(ctx.Output(0)->GetData());
|
||||
auto input_shape_a = ctx.Input(1)->GetTensorShape()->GetDimSizes();
|
||||
auto input_shape_b = ctx.Input(2)->GetTensorShape()->GetDimSizes();
|
||||
auto input_shape_mask = ctx.Input(0)->GetTensorShape()->GetDimSizes();
|
||||
std::vector<int64_t> output_shape;
|
||||
int64_t tensor_size = 1;
|
||||
int64_t position = 0;
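// Fast path: when the mask shares x1's shape, select element-wise; otherwise broadcast the mask against x1's shape.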
|
||||
if (input_shape_a == input_shape_mask) {
|
||||
for (const int64_t &d : input_shape_a) {
|
||||
tensor_size *= d;
|
||||
}
|
||||
for (int64_t i = 0; i < tensor_size; ++i) {
|
||||
if (condition[i]) {
|
||||
y[position++] = x1[i];
|
||||
} else {
|
||||
y[position++] = x2[i];
|
||||
}
|
||||
}
|
||||
} else {
|
||||
auto ret = GetBroadcastShape(input_shape_a, input_shape_mask, output_shape);
|
||||
KERNEL_CHECK_FALSE(ret == KERNEL_STATUS_OK, KERNEL_STATUS_PARAM_INVALID, "Shape of x and mask can't be broadcast.");
|
||||
for (const int64_t &d : output_shape) {
|
||||
tensor_size *= d;
|
||||
}
|
||||
BroadcastIterator iter(input_shape_a, input_shape_mask, output_shape);
|
||||
iter.SetPos(0);
|
||||
for (int64_t i = 0; i < tensor_size; ++i) {
|
||||
if (condition[iter.GetInputPosB()]) {
y[position++] = x1[iter.GetInputPosA()];
} else {
y[position++] = x2[iter.GetInputPosA()];
|
||||
}
|
||||
iter.GenNextPos();
|
||||
}
|
||||
}
|
||||
ctx.Output(0)->GetTensorShape()->SetDimSizes({position});
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
|
||||
REGISTER_CPU_KERNEL(kSelect, SelectCpuKernel);
|
||||
} // namespace aicpu
|
|
@@ -0,0 +1,39 @@
|
|||
/**
|
||||
* Copyright 2022 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#ifndef AICPU_KERNELS_NORMALIZED_SELECT_H_
|
||||
#define AICPU_KERNELS_NORMALIZED_SELECT_H_
|
||||
|
||||
#include "cpu_ops_kernel.h"
|
||||
#include "utils/bcast.h"
|
||||
|
||||
namespace aicpu {
|
||||
class SelectCpuKernel : public CpuKernel {
|
||||
public:
|
||||
SelectCpuKernel() = default;
|
||||
~SelectCpuKernel() override = default;
|
||||
|
||||
protected:
|
||||
uint32_t Compute(CpuKernelContext &ctx) override;
|
||||
|
||||
private:
|
||||
uint32_t SelectParamCheck(CpuKernelContext &ctx);
|
||||
|
||||
template <typename T>
|
||||
uint32_t SelectCompute(CpuKernelContext &ctx);
|
||||
};
|
||||
} // namespace aicpu
|
||||
#endif
|
|
@@ -0,0 +1,127 @@
|
|||
/**
|
||||
* Copyright 2022 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
#include "self_adjoint_eig.h"
|
||||
#include "cpu_kernel_utils.h"
|
||||
#include "kernel_util.h"
|
||||
#include <complex>
|
||||
#include "utils/kernel_util.h"
|
||||
#include "Eigen/Core"
|
||||
#include <iostream>
|
||||
#include <Eigen/Dense>
|
||||
|
||||
using namespace std;
|
||||
namespace {
|
||||
const char *kSelfAdjointEig = "SelfAdjointEig";
|
||||
const uint32_t kInputNum = 1;
|
||||
const uint32_t kOutputNum = 2;
|
||||
} // namespace
|
||||
namespace aicpu {
|
||||
uint32_t SelfAdjointEigCpuKernel::Compute(CpuKernelContext &ctx) {
|
||||
if (NormalCheck(ctx, kInputNum, kOutputNum) != KERNEL_STATUS_OK) {
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
Tensor *input0 = ctx.Input(0);
|
||||
if ((input0->GetDataSize() == 0)) {
|
||||
KERNEL_LOG_INFO("[%s] Input is empty tensor.", ctx.GetOpType().c_str());
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
uint32_t ret = KERNEL_STATUS_OK;
|
||||
auto data_type = ctx.Input(0)->GetDataType();
|
||||
switch (data_type) {
|
||||
case DT_FLOAT:
|
||||
ret = SelfAdjointEigCompute<float>(ctx);
|
||||
break;
|
||||
case DT_DOUBLE:
|
||||
ret = SelfAdjointEigCompute<double>(ctx);
|
||||
break;
|
||||
case DT_COMPLEX64:
|
||||
ret = SelfAdjointEigCompute<complex<float>>(ctx);
|
||||
break;
|
||||
case DT_COMPLEX128:
|
||||
ret = SelfAdjointEigCompute<complex<double>>(ctx);
|
||||
break;
|
||||
default:
|
||||
KERNEL_LOG_ERROR("[%s] Data type of input is not support, input data type is [%s].", ctx.GetOpType().c_str(),
|
||||
DTypeStr(data_type).c_str());
|
||||
ret = KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
uint32_t SelfAdjointEigCpuKernel::SelfAdjointEigCompute(CpuKernelContext &ctx) {
|
||||
auto input_tensor = ctx.Input(0);
|
||||
auto output_tensor0 = ctx.Output(0);
|
||||
auto output_tensor1 = ctx.Output(1);
|
||||
auto input_tensor_shape = input_tensor->GetTensorShape();
|
||||
auto inputData = reinterpret_cast<T *>(input_tensor->GetData());
|
||||
int64_t rank = input_tensor_shape->GetDims();
|
||||
std::vector<int64_t> input_dims = input_tensor_shape->GetDimSizes();
|
||||
const int32_t m = input_dims[rank - 1];
|
||||
int64_t num_array = input_tensor_shape->NumElements() / (m * m);
|
||||
using MatrixMap = Eigen::Map<Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>>;
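// rank <= 2 maps the single m x m matrix in place; larger ranks copy each of the num_array batches out, solve it, and write the results back.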
|
||||
|
||||
if (rank <= 2) {
|
||||
MatrixMap input0(inputData, m, m);
|
||||
MatrixMap output0(reinterpret_cast<T *>(output_tensor0->GetData()), m, 1);
|
||||
MatrixMap output1(reinterpret_cast<T *>(output_tensor1->GetData()), m, m);
|
||||
AttrValue *attr = ctx.GetAttr("compute_v");
|
||||
bool attr_ = (attr == nullptr) ? true : attr->GetBool();
|
||||
Eigen::SelfAdjointEigenSolver<Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic>> es(
|
||||
input0, attr_ ? Eigen::ComputeEigenvectors : Eigen::EigenvaluesOnly);
|
||||
output0 = es.eigenvalues().template cast<T>();
|
||||
if (attr_) {
|
||||
output1 = es.eigenvectors();
|
||||
}
|
||||
} else {
|
||||
auto outputData0 = reinterpret_cast<T *>(output_tensor0->GetData());
|
||||
auto outputData1 = reinterpret_cast<T *>(output_tensor1->GetData());
|
||||
for (int64_t batch = 0; batch < num_array; ++batch) {
|
||||
AttrValue *attr = ctx.GetAttr("compute_v");
|
||||
bool attr_ = (attr == nullptr) ? true : attr->GetBool();
|
||||
T *inputDataMap = reinterpret_cast<T *>(new T[m * m]);
|
||||
T *outputDataMap0 = reinterpret_cast<T *>(new T[m]);
|
||||
T *outputDataMap1 = reinterpret_cast<T *>(new T[m * m]);
|
||||
for (int64_t i = 0; i < m * m; ++i) {
|
||||
inputDataMap[i] = inputData[batch * m * m + i];
|
||||
outputDataMap1[i] = outputData1[batch * m * m + i];
|
||||
}
|
||||
for (int64_t i = 0; i < m; ++i) {
|
||||
outputDataMap0[i] = outputData0[batch * m + i];
|
||||
}
|
||||
MatrixMap input0(inputDataMap, m, m);
|
||||
MatrixMap output0(outputDataMap0, m, 1);
|
||||
MatrixMap output1(outputDataMap1, m, m);
|
||||
Eigen::SelfAdjointEigenSolver<Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic>> es(
|
||||
input0, attr_ ? Eigen::ComputeEigenvectors : Eigen::EigenvaluesOnly);
|
||||
output0 = es.eigenvalues().template cast<T>();
|
||||
for (int64_t i = 0; i < m; i++) {
|
||||
*(outputData0 + batch * m + i) = output0(i, 0);
|
||||
}
|
||||
if (attr_) {
|
||||
output1 = es.eigenvectors();
|
||||
for (int64_t i = 0; i < m; i++) {
|
||||
for (int64_t j = 0; j < m; j++) {
|
||||
*(outputData1 + batch * m * m + i * m + j) = output1(i, j);
|
||||
}
|
||||
}
|
||||
}
// Free the per-batch scratch buffers allocated with new[] above.
delete[] inputDataMap;
delete[] outputDataMap0;
delete[] outputDataMap1;
}
|
||||
}
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
REGISTER_CPU_KERNEL(kSelfAdjointEig, SelfAdjointEigCpuKernel);
|
||||
} // namespace aicpu
|
|
@@ -0,0 +1,35 @@
|
|||
/**
|
||||
* Copyright 2022 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#ifndef AICPU_KERNELS_NORMALIZED_SELFADJOINTEIG_H_
|
||||
#define AICPU_KERNELS_NORMALIZED_SELFADJOINTEIG_H_
|
||||
#include "cpu_ops_kernel.h"
|
||||
#include "Eigen/Eigenvalues"
|
||||
#include <iostream>
|
||||
namespace aicpu {
|
||||
|
||||
class SelfAdjointEigCpuKernel : public CpuKernel {
|
||||
public:
|
||||
SelfAdjointEigCpuKernel() = default;
|
||||
~SelfAdjointEigCpuKernel() = default;
|
||||
uint32_t Compute(CpuKernelContext &ctx) override;
|
||||
|
||||
private:
|
||||
template <typename T>
|
||||
uint32_t SelfAdjointEigCompute(CpuKernelContext &ctx);
|
||||
};
|
||||
} // namespace aicpu
|
||||
#endif  // AICPU_KERNELS_NORMALIZED_SELFADJOINTEIG_H_
@@ -0,0 +1,152 @@
/**
|
||||
* Copyright (c) Huawei Technologies Co., Ltd. 2022. All rights reserved.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
#include "sign.h"
|
||||
|
||||
#include "cpu_kernel_utils.h"
|
||||
#include "utils/eigen_tensor.h"
|
||||
#include "utils/kernel_util.h"
|
||||
|
||||
namespace {
|
||||
const uint32_t kOutputNum = 1;
|
||||
const uint32_t kInputNum = 1;
|
||||
const char *const kSign = "Sign";
|
||||
constexpr int64_t kParallelDataNums = 128 * 1024;
|
||||
|
||||
#define SIGN_COMPUTE_CASE(DTYPE, TYPE, CTX) \
|
||||
case (DTYPE): { \
|
||||
uint32_t result = SignCompute<TYPE>(CTX); \
|
||||
if (result != static_cast<uint32_t>(KERNEL_STATUS_OK)) { \
|
||||
KERNEL_LOG_ERROR("Sign kernel compute failed."); \
|
||||
return result; \
|
||||
} \
|
||||
break; \
|
||||
}
|
||||
|
||||
#define SIGN_COMPUTE_CASE2(DTYPE, TYPE, CTX) \
|
||||
case (DTYPE): { \
|
||||
uint32_t result = SignComputeComplex<TYPE>(CTX); \
|
||||
if (result != static_cast<uint32_t>(KERNEL_STATUS_OK)) { \
|
||||
KERNEL_LOG_ERROR("Sign kernel compute failed."); \
|
||||
return result; \
|
||||
} \
|
||||
break; \
|
||||
}
|
||||
} // namespace
|
||||
|
||||
namespace aicpu {
|
||||
uint32_t SignCpuKernel::Compute(CpuKernelContext &ctx) {
|
||||
KERNEL_HANDLE_ERROR(NormalCheck(ctx, kInputNum, kOutputNum), "[%s] check input and output failed.", kSign);
|
||||
KERNEL_HANDLE_ERROR(static_cast<uint32_t>(SignCheck(ctx)), "[%s] check params failed.", kSign);
|
||||
DataType data_type = ctx.Input(0)->GetDataType();
|
||||
switch (data_type) {
|
||||
SIGN_COMPUTE_CASE(DT_FLOAT16, Eigen::half, ctx)
|
||||
SIGN_COMPUTE_CASE(DT_FLOAT, float, ctx)
|
||||
SIGN_COMPUTE_CASE(DT_DOUBLE, double, ctx)
|
||||
SIGN_COMPUTE_CASE(DT_INT32, int32_t, ctx)
|
||||
SIGN_COMPUTE_CASE(DT_INT64, int64_t, ctx)
|
||||
SIGN_COMPUTE_CASE2(DT_COMPLEX64, std::complex<float>, ctx)
|
||||
SIGN_COMPUTE_CASE2(DT_COMPLEX128, std::complex<double>, ctx)
|
||||
default:
|
||||
KERNEL_LOG_ERROR("Sign kernel data type [%s] not support.", DTypeStr(data_type).c_str());
|
||||
return static_cast<uint32_t>(KERNEL_STATUS_PARAM_INVALID);
|
||||
}
|
||||
return static_cast<uint32_t>(KERNEL_STATUS_OK);
|
||||
}
|
||||
|
||||
KernelStatus SignCpuKernel::SignCheck(const CpuKernelContext &ctx) const {
|
||||
auto input_0 = ctx.Input(0);
|
||||
auto output_0 = ctx.Output(0);
|
||||
KERNEL_CHECK_NULLPTR(input_0->GetData(), KERNEL_STATUS_PARAM_INVALID, "Get input data failed.")
|
||||
KERNEL_CHECK_NULLPTR(output_0->GetData(), KERNEL_STATUS_PARAM_INVALID, "Get output data failed")
|
||||
KERNEL_CHECK_NULLPTR(input_0->GetTensorShape(), KERNEL_STATUS_PARAM_INVALID, "Get input tensor shape failed.")
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
uint32_t SignCpuKernel::SignCompute(const CpuKernelContext &ctx) {
|
||||
auto input_x = reinterpret_cast<T *>(ctx.Input(0)->GetData());
|
||||
auto output_y = reinterpret_cast<T *>(ctx.Output(0)->GetData());
|
||||
int64_t data_num = ctx.Input(0)->NumElements();
|
||||
int64_t data_size = data_num * static_cast<int64_t>(sizeof(T));
|
||||
if (data_size <= kParallelDataNums) {
|
||||
for (int64_t i = 0; i < data_num; i++) {
|
||||
if (*(input_x + i) > static_cast<T>(0)) {
|
||||
*(output_y + i) = static_cast<T>(1);
|
||||
} else if (*(input_x + i) == static_cast<T>(0)) {
|
||||
*(output_y + i) = static_cast<T>(0);
|
||||
} else {
|
||||
*(output_y + i) = static_cast<T>(-1);
|
||||
}
|
||||
}
|
||||
} else {
|
||||
uint32_t min_core_num = 1;
|
||||
int64_t max_core_num = std::max(min_core_num, aicpu::CpuKernelUtils::GetCPUNum(ctx) - kResvCpuNum);
|
||||
if (max_core_num > data_num) {
|
||||
max_core_num = data_num;
|
||||
}
|
||||
auto shard_sign = [&](size_t start, size_t end) {
|
||||
for (size_t i = start; i < end; i++) {
|
||||
if (*(input_x + i) > static_cast<T>(0)) {
|
||||
*(output_y + i) = static_cast<T>(1);
|
||||
} else if (*(input_x + i) == static_cast<T>(0)) {
|
||||
*(output_y + i) = static_cast<T>(0);
|
||||
} else {
|
||||
*(output_y + i) = static_cast<T>(-1);
|
||||
}
|
||||
}
|
||||
};
|
||||
KERNEL_HANDLE_ERROR(CpuKernelUtils::ParallelFor(ctx, data_num, data_num / max_core_num, shard_sign),
|
||||
"Sign Compute failed.");
|
||||
}
|
||||
return static_cast<uint32_t>(KERNEL_STATUS_OK);
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
uint32_t SignCpuKernel::SignComputeComplex(const CpuKernelContext &ctx) {
|
||||
auto input_x = reinterpret_cast<T *>(ctx.Input(0)->GetData());
|
||||
auto output_y = reinterpret_cast<T *>(ctx.Output(0)->GetData());
|
||||
int64_t data_num = ctx.Input(0)->NumElements();
|
||||
int64_t data_size = data_num * static_cast<int64_t>(sizeof(T));
|
||||
if (data_size <= kParallelDataNums) {
|
||||
for (int64_t i = 0; i < data_num; i++) {
|
||||
if (*(input_x + i) != static_cast<T>(0)) {
|
||||
*(output_y + i) = (*(input_x + i) / Eigen::numext::abs(*(input_x + i)));
|
||||
} else {
|
||||
*(output_y + i) = static_cast<T>(0);
|
||||
}
|
||||
}
|
||||
} else {
|
||||
uint32_t min_num = 1;
|
||||
int64_t max_core_num = std::max(min_num, aicpu::CpuKernelUtils::GetCPUNum(ctx) - kResvCpuNum);
|
||||
if (max_core_num > data_num) {
|
||||
max_core_num = data_num;
|
||||
}
|
||||
auto shard_sign = [&](size_t start, size_t end) {
|
||||
for (size_t i = start; i < end; i++) {
|
||||
if (*(input_x + i) != static_cast<T>(0)) {
|
||||
*(output_y + i) = (*(input_x + i) / Eigen::numext::abs(*(input_x + i)));
|
||||
} else {
|
||||
*(output_y + i) = static_cast<T>(0);
|
||||
}
|
||||
}
|
||||
};
|
||||
KERNEL_HANDLE_ERROR(CpuKernelUtils::ParallelFor(ctx, data_num, data_num / max_core_num, shard_sign),
|
||||
"Sign Compute failed.");
|
||||
}
|
||||
return static_cast<uint32_t>(KERNEL_STATUS_OK);
|
||||
}
|
||||
REGISTER_CPU_KERNEL(kSign, SignCpuKernel);
|
||||
} // namespace aicpu
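For the complex branch above, the sign is the input divided by its modulus (and 0 for a zero input); a minimal illustrative check, not part of the kernel:

#include <complex>
#include <cstdio>

int main() {
  std::complex<float> z(3.0f, 4.0f);
  std::complex<float> s = z / std::abs(z);     // same formula as SignComputeComplex for a non-zero input
  std::printf("%f %f\n", s.real(), s.imag());  // 0.600000 0.800000, a point on the unit circle
  return 0;
}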
@@ -0,0 +1,40 @@
/**
|
||||
* Copyright (c) Huawei Technologies Co., Ltd. 2022. All rights reserved.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
#ifndef AICPU_KERNELS_NORMALIZED_SIGN_H
|
||||
#define AICPU_KERNELS_NORMALIZED_SIGN_H
|
||||
|
||||
#include "cpu_ops_kernel.h"
|
||||
#include "cpu_kernel/common/status.h"
|
||||
|
||||
namespace aicpu {
|
||||
class SignCpuKernel : public CpuKernel {
|
||||
public:
|
||||
SignCpuKernel() = default;
|
||||
~SignCpuKernel() override = default;
|
||||
|
||||
uint32_t Compute(CpuKernelContext &ctx) override;
|
||||
|
||||
private:
|
||||
KernelStatus SignCheck(const CpuKernelContext &ctx) const;
|
||||
|
||||
template <typename T>
|
||||
uint32_t SignCompute(const CpuKernelContext &ctx);
|
||||
|
||||
template <typename T>
|
||||
uint32_t SignComputeComplex(const CpuKernelContext &ctx);
|
||||
};
|
||||
} // namespace aicpu
|
||||
#endif
@@ -0,0 +1,148 @@
/**
|
||||
* Copyright 2021 Huawei Technologies Co., Ltd.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include "sin.h"
|
||||
|
||||
#include <complex>
|
||||
#include <unsupported/Eigen/CXX11/Tensor>
|
||||
|
||||
#include "cpu_kernel_utils.h"
|
||||
#include "cpu_types.h"
|
||||
#include "kernel_log.h"
|
||||
#include "status.h"
|
||||
#include "utils/kernel_util.h"
|
||||
|
||||
namespace {
|
||||
const std::uint32_t kSinInputNum{1};
|
||||
const std::uint32_t kSinOutputNum{1};
|
||||
const char *kSin{"Sin"};
|
||||
} // namespace
|
||||
|
||||
namespace internal {
|
||||
template <typename T>
|
||||
inline T ScalarSin(T x) {
|
||||
return std::sin(x);
|
||||
}
|
||||
|
||||
template <>
|
||||
inline Eigen::half ScalarSin(Eigen::half x) {
|
||||
const Eigen::half val{static_cast<Eigen::half>(Eigen::numext::sin(x))};
|
||||
return val;
|
||||
}
|
||||
} // namespace internal
|
||||
|
||||
namespace aicpu {
|
||||
namespace detail {
|
||||
template <typename T>
|
||||
inline std::uint32_t ComputeSinKernel(const CpuKernelContext &ctx) {
|
||||
using i64 = std::int64_t;
|
||||
const auto ParallelFor = aicpu::CpuKernelUtils::ParallelFor;
|
||||
const auto ScalarSin = internal::ScalarSin<T>;
|
||||
auto input = static_cast<T *>(ctx.Input(0)->GetData());
|
||||
auto output = static_cast<T *>(ctx.Output(0)->GetData());
|
||||
i64 total = ctx.Input(0)->NumElements();
|
||||
uint32_t cores = aicpu::CpuKernelUtils::GetCPUNum(ctx);
|
||||
i64 num = 1024;
|
||||
if (total > num) {
|
||||
i64 per_unit_size{total / std::min(std::max(1L, cores - 2L), total)};
|
||||
return ParallelFor(ctx, total, per_unit_size, [&](i64 begin, i64 end) {
|
||||
std::transform(input + begin, input + end, output + begin, ScalarSin);
|
||||
});
|
||||
} else if (cores != 0) {
|
||||
std::transform(input, input + total, output, ScalarSin);
|
||||
} else {
|
||||
return KERNEL_STATUS_INNER_ERROR;
|
||||
}
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
inline std::uint32_t ComputeSin(const CpuKernelContext &ctx) {
|
||||
uint32_t result = ComputeSinKernel<T>(ctx);
|
||||
if (result != 0) {
|
||||
KERNEL_LOG_ERROR("Sin compute failed.");
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
inline std::uint32_t SinExtraCheck(const CpuKernelContext &ctx) {
|
||||
if (ctx.Input(0)->GetData() == nullptr) {
|
||||
KERNEL_LOG_ERROR("Get input data failed.");
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
if (ctx.Output(0)->GetData() == nullptr) {
|
||||
KERNEL_LOG_ERROR("Get output data failed.");
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
if (ctx.Input(0)->GetDataType() != ctx.Output(0)->GetDataType()) {
|
||||
KERNEL_LOG_ERROR("The data type of the input [%s] need be the same as the output [%s].",
|
||||
DTypeStr(ctx.Input(0)->GetDataType()).c_str(), DTypeStr(ctx.Output(0)->GetDataType()).c_str());
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
if (ctx.Input(0)->GetDataSize() != ctx.Output(0)->GetDataSize()) {
|
||||
KERNEL_LOG_ERROR(
|
||||
"The data size of the input [%llu] need be the same as the output "
|
||||
"[%llu].",
|
||||
ctx.Input(0)->GetDataSize(), ctx.Output(0)->GetDataSize());
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
std::vector<int64_t> input_dims = ctx.Input(0)->GetTensorShape()->GetDimSizes();
|
||||
std::vector<int64_t> output_dims = ctx.Output(0)->GetTensorShape()->GetDimSizes();
|
||||
if (input_dims.size() != output_dims.size()) {
|
||||
KERNEL_LOG_ERROR(
|
||||
"The data dim size of the input [%llu] need be the same as the output "
|
||||
"[%llu].",
|
||||
input_dims.size(), output_dims.size());
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
for (size_t index = 0; index < input_dims.size(); index++) {
|
||||
if (input_dims[index] != output_dims[index]) {
|
||||
KERNEL_LOG_ERROR("The data dim of the input need be the same as the output.");
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
}
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
|
||||
std::uint32_t SinCheck(CpuKernelContext &ctx, uint32_t inputs_num, uint32_t outputs_num) {
|
||||
return NormalCheck(ctx, inputs_num, outputs_num) ? KERNEL_STATUS_PARAM_INVALID : SinExtraCheck(ctx);
|
||||
}
|
||||
|
||||
std::uint32_t SinCompute(const CpuKernelContext &ctx) {
|
||||
DataType input_type{ctx.Input(0)->GetDataType()};
|
||||
switch (input_type) {
|
||||
case DT_FLOAT16:
|
||||
return ComputeSin<Eigen::half>(ctx);
|
||||
case DT_FLOAT:
|
||||
return ComputeSin<std::float_t>(ctx);
|
||||
case DT_DOUBLE:
|
||||
return ComputeSin<std::double_t>(ctx);
|
||||
case DT_COMPLEX64:
|
||||
return ComputeSin<std::complex<std::float_t>>(ctx);
|
||||
case DT_COMPLEX128:
|
||||
return ComputeSin<std::complex<std::double_t>>(ctx);
|
||||
default:
|
||||
KERNEL_LOG_ERROR("Unsupported input data type [%s].", DTypeStr(input_type).c_str());
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
}
|
||||
} // namespace detail
|
||||
|
||||
std::uint32_t SinCpuKernel::Compute(CpuKernelContext &ctx) {
|
||||
return detail::SinCheck(ctx, kSinInputNum, kSinOutputNum) ? KERNEL_STATUS_PARAM_INVALID : detail::SinCompute(ctx);
|
||||
}
|
||||
REGISTER_CPU_KERNEL(kSin, SinCpuKernel);
|
||||
} // namespace aicpu
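The sharding above reserves two cores and splits the remaining work evenly; a minimal sketch of that arithmetic with hypothetical counts (total = 10000 elements, cores = 8), illustrative only:

#include <algorithm>
#include <cstdint>
#include <iostream>

int main() {
  std::int64_t total = 10000;  // hypothetical element count
  std::int64_t cores = 8;      // hypothetical CPU count
  // Mirrors total / std::min(std::max(1L, cores - 2L), total) from ComputeSinKernel.
  std::int64_t per_unit_size = total / std::min(std::max(std::int64_t{1}, cores - 2), total);
  std::cout << per_unit_size << std::endl;  // 1666 elements per parallel unit
  return 0;
}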
@@ -0,0 +1,26 @@
/**
|
||||
* Copyright 2021 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
#ifndef AICPU_KERNELS_NORMALIZED_SIN_H_
#define AICPU_KERNELS_NORMALIZED_SIN_H_
|
||||
|
||||
#include "cpu_ops_kernel.h"
|
||||
|
||||
namespace aicpu {
|
||||
class SinCpuKernel final : public CpuKernel {
|
||||
 public:
  std::uint32_t Compute(CpuKernelContext &ctx) override;
|
||||
};
|
||||
} // namespace aicpu
|
||||
#endif
@@ -0,0 +1,267 @@
/**
|
||||
* Copyright (c) Huawei Technologies Co., Ltd. 2022. All rights reserved.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
#include "sinc.h"
|
||||
|
||||
#include <complex>
|
||||
#include <set>
|
||||
#include "cpu_kernel_utils.h"
|
||||
#include "utils/eigen_tensor.h"
|
||||
#include "utils/kernel_util.h"
|
||||
|
||||
namespace {
|
||||
constexpr double kPI = 3.14159265358979323846L;
|
||||
constexpr uint32_t kSincInputNum = 1;
|
||||
constexpr uint32_t kSincOutputNum = 1;
|
||||
const int64_t paralled_data_size = 64 * 1024;
|
||||
const char *kSinc = "Sinc";
|
||||
} // namespace
|
||||
|
||||
namespace aicpu {
|
||||
template <typename T>
|
||||
uint32_t SincCpuKernel::SincTypeSameCompute(CpuKernelContext &ctx) {
|
||||
T *x_addr = static_cast<T *>(ctx.Input(0)->GetData());
|
||||
auto y_addr = static_cast<T *>(ctx.Output(0)->GetData());
|
||||
size_t x_size = ctx.Input(0)->NumElements();
|
||||
size_t date_size = x_size * sizeof(T);
|
||||
if (date_size <= paralled_data_size) {
|
||||
for (size_t i = 0; i < x_size; i++) {
|
||||
if (x_addr[i] == T(0.0f)) {
|
||||
y_addr[i] = T(1.0f);
|
||||
} else {
|
||||
T product = T(kPI) * x_addr[i];
|
||||
y_addr[i] = sin(product) / product;
|
||||
}
|
||||
}
|
||||
} else {
|
||||
auto shard_sinc = [&](size_t start, size_t end) {
|
||||
for (size_t i = start; i < end; i++) {
|
||||
if (x_addr[i] == T(0.0f)) {
|
||||
y_addr[i] = T(1.0f);
|
||||
} else {
|
||||
T product = T(kPI) * x_addr[i];
|
||||
y_addr[i] = sin(product) / product;
|
||||
}
|
||||
}
|
||||
};
|
||||
uint32_t min_core_num = 1;
|
||||
size_t max_core_num = std::max(min_core_num, aicpu::CpuKernelUtils::GetCPUNum(ctx) - kResvCpuNum);
|
||||
if (max_core_num == 0) {
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
if (max_core_num > date_size) {
|
||||
max_core_num = date_size;
|
||||
}
|
||||
KERNEL_HANDLE_ERROR(CpuKernelUtils::ParallelFor(ctx, x_size, x_size / max_core_num, shard_sinc),
|
||||
"Sinc Compute failed.");
|
||||
}
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
uint32_t SincCpuKernel::SincTypeChangeCompute(CpuKernelContext &ctx) {
|
||||
T *x_addr = static_cast<T *>(ctx.Input(0)->GetData());
|
||||
auto y_addr = static_cast<float *>(ctx.Output(0)->GetData());
|
||||
size_t x_size = ctx.Input(0)->NumElements();
|
||||
size_t date_size = x_size * sizeof(T);
|
||||
if (date_size <= paralled_data_size) {
|
||||
for (size_t i = 0; i < x_size; i++) {
|
||||
if (x_addr[i] == T(0.0f)) {
|
||||
y_addr[i] = float(1.0f);
|
||||
} else {
|
||||
float product = static_cast<float>(kPI) * x_addr[i];
|
||||
y_addr[i] = sin(product) / product;
|
||||
}
|
||||
}
|
||||
} else {
|
||||
auto shard_sinc = [&](size_t start, size_t end) {
|
||||
for (size_t i = start; i < end; i++) {
|
||||
if (x_addr[i] == T(0.0f)) {
|
||||
y_addr[i] = float(1.0f);
|
||||
} else {
|
||||
float product = static_cast<float>(kPI) * x_addr[i];
|
||||
y_addr[i] = sin(product) / product;
|
||||
}
|
||||
}
|
||||
};
|
||||
uint32_t min_core_num = 1;
|
||||
size_t max_core_num = std::max(min_core_num, aicpu::CpuKernelUtils::GetCPUNum(ctx) - kResvCpuNum);
|
||||
if (max_core_num == 0) {
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
if (max_core_num > date_size) {
|
||||
max_core_num = date_size;
|
||||
}
|
||||
KERNEL_HANDLE_ERROR(CpuKernelUtils::ParallelFor(ctx, x_size, x_size / max_core_num, shard_sinc),
|
||||
"Sinc Compute failed.");
|
||||
}
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
uint32_t SincCpuKernel::SincBoolCompute(CpuKernelContext &ctx) {
|
||||
bool *x_addr = static_cast<bool *>(ctx.Input(0)->GetData());
|
||||
auto y_addr = static_cast<float *>(ctx.Output(0)->GetData());
|
||||
size_t x_size = ctx.Input(0)->NumElements();
|
||||
size_t date_size = x_size * sizeof(T);
|
||||
if (date_size <= paralled_data_size) {
|
||||
for (size_t i = 0; i < x_size; i++) {
  if (x_addr[i]) {
    // sinc(1) = sin(pi) / pi
    float product = static_cast<float>(kPI);
    y_addr[i] = sin(product) / product;
  } else {
    // sinc(0) is defined as 1; computing sin(0) / 0 here would yield NaN.
    y_addr[i] = 1.0f;
  }
}
|
||||
} else {
|
||||
auto shard_sinc = [&](size_t start, size_t end) {
|
||||
for (size_t i = start; i < end; i++) {
  if (x_addr[i]) {
    float product = static_cast<float>(kPI);
    y_addr[i] = sin(product) / product;
  } else {
    y_addr[i] = 1.0f;  // sinc(0) = 1; avoids the 0/0 division
  }
}
|
||||
};
|
||||
uint32_t min_core_num = 1;
|
||||
size_t max_core_num = std::max(min_core_num, aicpu::CpuKernelUtils::GetCPUNum(ctx) - kResvCpuNum);
|
||||
if (max_core_num == 0) {
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
if (max_core_num > date_size) {
|
||||
max_core_num = date_size;
|
||||
}
|
||||
KERNEL_HANDLE_ERROR(CpuKernelUtils::ParallelFor(ctx, x_size, x_size / max_core_num, shard_sinc),
|
||||
"Sinc Compute failed.");
|
||||
}
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
|
||||
inline std::uint32_t SincExtraCheck(const CpuKernelContext &ctx) {
|
||||
if (ctx.Input(0)->GetData() == nullptr) {
|
||||
KERNEL_LOG_ERROR("Get input data failed.");
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
if (ctx.Output(0)->GetData() == nullptr) {
|
||||
KERNEL_LOG_ERROR("Get output data failed.");
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
DataType in_dtype = ctx.Input(0)->GetDataType();
|
||||
DataType out_dtype = ctx.Output(0)->GetDataType();
|
||||
std::set<DataType> dtypes;
|
||||
dtypes.insert(DT_FLOAT16);
|
||||
dtypes.insert(DT_FLOAT);
|
||||
dtypes.insert(DT_DOUBLE);
|
||||
dtypes.insert(DT_COMPLEX64);
|
||||
dtypes.insert(DT_COMPLEX128);
|
||||
if (dtypes.count(in_dtype) == 1) {
|
||||
if (out_dtype != in_dtype) {
|
||||
KERNEL_LOG_ERROR("The data type of the output need be the same as the input when input is [%s], but got [%s].",
|
||||
DTypeStr(in_dtype).c_str(), DTypeStr(out_dtype).c_str());
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
} else {
|
||||
if (out_dtype != DT_FLOAT) {
|
||||
KERNEL_LOG_ERROR("The data type of the output must be float32 when the dtype of input is [%s], but got [%s].",
|
||||
DTypeStr(in_dtype).c_str(), DTypeStr(out_dtype).c_str());
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
}
|
||||
std::vector<int64_t> input_dims = ctx.Input(0)->GetTensorShape()->GetDimSizes();
|
||||
std::vector<int64_t> output_dims = ctx.Output(0)->GetTensorShape()->GetDimSizes();
|
||||
if (input_dims.size() != output_dims.size()) {
|
||||
KERNEL_LOG_ERROR(
|
||||
"The data dim size of the input [%llu] need be the same as the output "
|
||||
"[%llu].",
|
||||
input_dims.size(), output_dims.size());
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
for (size_t index = 0; index < input_dims.size(); index++) {
|
||||
if (input_dims[index] != output_dims[index]) {
|
||||
KERNEL_LOG_ERROR("The data dim of the input need be the same as the output.");
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
}
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
|
||||
uint32_t SincCpuKernel::Compute(CpuKernelContext &ctx) {
|
||||
KERNEL_HANDLE_ERROR(NormalCheck(ctx, kSincInputNum, kSincOutputNum), "[%s] check params failed.", kSinc);
|
||||
uint32_t res = KERNEL_STATUS_OK;
|
||||
res = SincExtraCheck(ctx);
|
||||
if (res != KERNEL_STATUS_OK) {
|
||||
return res;
|
||||
}
|
||||
auto data_type = ctx.Input(0)->GetDataType();
|
||||
switch (data_type) {
|
||||
case DT_FLOAT16:
|
||||
res = SincTypeSameCompute<Eigen::half>(ctx);
|
||||
break;
|
||||
case DT_FLOAT:
|
||||
res = SincTypeSameCompute<float>(ctx);
|
||||
break;
|
||||
case DT_DOUBLE:
|
||||
res = SincTypeSameCompute<double>(ctx);
|
||||
break;
|
||||
case DT_INT8:
|
||||
res = SincTypeChangeCompute<int8_t>(ctx);
|
||||
break;
|
||||
case DT_UINT8:
|
||||
res = SincTypeChangeCompute<uint8_t>(ctx);
|
||||
break;
|
||||
case DT_INT16:
|
||||
res = SincTypeChangeCompute<int16_t>(ctx);
|
||||
break;
|
||||
case DT_UINT16:
|
||||
res = SincTypeChangeCompute<uint16_t>(ctx);
|
||||
break;
|
||||
case DT_INT32:
|
||||
res = SincTypeChangeCompute<int32_t>(ctx);
|
||||
break;
|
||||
case DT_UINT32:
|
||||
res = SincTypeChangeCompute<uint32_t>(ctx);
|
||||
break;
|
||||
case DT_INT64:
|
||||
res = SincTypeChangeCompute<int64_t>(ctx);
|
||||
break;
|
||||
case DT_UINT64:
|
||||
res = SincTypeChangeCompute<uint64_t>(ctx);
|
||||
break;
|
||||
case DT_COMPLEX64:
|
||||
res = SincTypeSameCompute<std::complex<float>>(ctx);
|
||||
break;
|
||||
case DT_COMPLEX128:
|
||||
res = SincTypeSameCompute<std::complex<double>>(ctx);
|
||||
break;
|
||||
case DT_BOOL:
|
||||
res = SincBoolCompute<bool>(ctx);
|
||||
break;
|
||||
default:
|
||||
KERNEL_LOG_ERROR("Sinc invalid input type [%s]", DTypeStr(data_type).c_str());
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
if (res != KERNEL_STATUS_OK) {
|
||||
return KERNEL_STATUS_INNER_ERROR;
|
||||
}
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
|
||||
REGISTER_CPU_KERNEL(kSinc, SincCpuKernel);
|
||||
} // namespace aicpu
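The kernel computes the normalized sinc, sin(pi * x) / (pi * x) with sinc(0) defined as 1; a scalar reference sketch, illustrative only and not part of the kernel:

#include <cmath>
#include <cstdio>

const double kPi = 3.14159265358979323846;

double Sinc(double x) {
  if (x == 0.0) {
    return 1.0;  // removable singularity at x = 0
  }
  double product = kPi * x;
  return std::sin(product) / product;
}

int main() {
  std::printf("%f %f\n", Sinc(0.0), Sinc(0.5));  // 1.000000 0.636620
  return 0;
}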
@@ -0,0 +1,41 @@
/**
|
||||
* Copyright (c) Huawei Technologies Co., Ltd. 2022. All rights reserved.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
#ifndef AICPU_KERNELS_NORMALIZED_SINC_H_
|
||||
#define AICPU_KERNELS_NORMALIZED_SINC_H_
|
||||
|
||||
#include "cpu_ops_kernel.h"
|
||||
namespace aicpu {
|
||||
class SincCpuKernel : public CpuKernel {
|
||||
public:
|
||||
SincCpuKernel() = default;
|
||||
|
||||
~SincCpuKernel() override = default;
|
||||
|
||||
protected:
|
||||
uint32_t Compute(CpuKernelContext &ctx) override;
|
||||
|
||||
private:
|
||||
template <typename T>
|
||||
uint32_t SincTypeSameCompute(CpuKernelContext &ctx);
|
||||
|
||||
template <typename T>
|
||||
uint32_t SincTypeChangeCompute(CpuKernelContext &ctx);
|
||||
|
||||
template <typename T>
|
||||
uint32_t SincBoolCompute(CpuKernelContext &ctx);
|
||||
};
|
||||
} // namespace aicpu
|
||||
#endif
@@ -0,0 +1,148 @@
/**
|
||||
* Copyright 2021 Huawei Technologies Co., Ltd.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include "sinh.h"
|
||||
|
||||
#include <complex>
|
||||
#include <unsupported/Eigen/CXX11/Tensor>
|
||||
#include "cpu_kernel_utils.h"
|
||||
#include "cpu_types.h"
|
||||
#include "kernel_log.h"
|
||||
#include "status.h"
|
||||
#include "utils/kernel_util.h"
|
||||
|
||||
namespace {
|
||||
const std::uint32_t kSinhInputNum{1};
|
||||
const std::uint32_t kSinhOutputNum{1};
|
||||
const std::uint32_t ParallelNum{4096};
|
||||
const char *kSinh{"Sinh"};
|
||||
} // namespace
|
||||
|
||||
namespace internal {
|
||||
template <typename T>
|
||||
inline T ScalarSinh(T x) {
|
||||
return Eigen::numext::sinh(x);
|
||||
}
|
||||
|
||||
template <>
|
||||
inline Eigen::half ScalarSinh(Eigen::half x) {
|
||||
const Eigen::half val{Eigen::numext::sinh(static_cast<float>(x))};
|
||||
return Eigen::half_impl::isnan(val) ? Eigen::half{0.0f} : val;
|
||||
}
|
||||
} // namespace internal
|
||||
|
||||
namespace aicpu {
|
||||
namespace detail {
|
||||
template <typename T>
|
||||
inline std::uint32_t ComputeSinhKernel(const CpuKernelContext &ctx) {
|
||||
const auto ParallelFor = aicpu::CpuKernelUtils::ParallelFor;
|
||||
const auto ScalarSinh = internal::ScalarSinh<T>;
|
||||
auto input = static_cast<T *>(ctx.Input(0)->GetData());
|
||||
auto output = static_cast<T *>(ctx.Output(0)->GetData());
|
||||
std::int64_t total = ctx.Input(0)->NumElements();
|
||||
std::uint64_t total_size = ctx.Input(0)->GetDataSize();
|
||||
uint32_t cores = aicpu::CpuKernelUtils::GetCPUNum(ctx);
|
||||
if (total_size > ParallelNum * sizeof(T)) {
|
||||
std::int64_t per_unit_size{total / std::min(std::max(1L, cores - 2L), total)};
|
||||
return ParallelFor(ctx, total, per_unit_size, [&](std::int64_t begin, std::int64_t end) {
|
||||
std::transform(input + begin, input + end, output + begin, ScalarSinh);
|
||||
});
|
||||
} else if (cores != 0) {
|
||||
std::transform(input, input + total, output, ScalarSinh);
|
||||
} else {
|
||||
return KERNEL_STATUS_INNER_ERROR;
|
||||
}
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
inline std::uint32_t ComputeSinh(const CpuKernelContext &ctx) {
|
||||
uint32_t result = ComputeSinhKernel<T>(ctx);
|
||||
if (result != 0) {
|
||||
KERNEL_LOG_ERROR("Sinh compute failed.");
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
inline std::uint32_t SinhExtraCheck(const CpuKernelContext &ctx) {
|
||||
if (ctx.Input(0)->GetData() == nullptr) {
|
||||
KERNEL_LOG_ERROR("Get input data failed.");
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
if (ctx.Output(0)->GetData() == nullptr) {
|
||||
KERNEL_LOG_ERROR("Get output data failed.");
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
if (ctx.Input(0)->GetDataType() != ctx.Output(0)->GetDataType()) {
|
||||
KERNEL_LOG_ERROR("The data type of the input [%s] need be the same as the output [%s].",
|
||||
DTypeStr(ctx.Input(0)->GetDataType()).c_str(), DTypeStr(ctx.Output(0)->GetDataType()).c_str());
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
if (ctx.Input(0)->GetDataSize() != ctx.Output(0)->GetDataSize()) {
|
||||
KERNEL_LOG_ERROR(
|
||||
"The data size of the input [%llu] need be the same as the output "
|
||||
"[%llu].",
|
||||
ctx.Input(0)->GetDataSize(), ctx.Output(0)->GetDataSize());
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
std::vector<int64_t> input_dims = ctx.Input(0)->GetTensorShape()->GetDimSizes();
|
||||
std::vector<int64_t> output_dims = ctx.Output(0)->GetTensorShape()->GetDimSizes();
|
||||
if (input_dims.size() != output_dims.size()) {
|
||||
KERNEL_LOG_ERROR(
|
||||
"The data dim size of the input [%llu] need be the same as the output "
|
||||
"[%llu].",
|
||||
input_dims.size(), output_dims.size());
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
for (size_t index = 0; index < input_dims.size(); index++) {
|
||||
if (input_dims[index] != output_dims[index]) {
|
||||
KERNEL_LOG_ERROR("The data dim of the input need be the same as the output.");
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
}
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
|
||||
std::uint32_t SinhCheck(CpuKernelContext &ctx, uint32_t inputs_num, uint32_t outputs_num) {
|
||||
return NormalCheck(ctx, inputs_num, outputs_num) ? KERNEL_STATUS_PARAM_INVALID : SinhExtraCheck(ctx);
|
||||
}
|
||||
|
||||
std::uint32_t SinhCompute(const CpuKernelContext &ctx) {
|
||||
DataType input_type{ctx.Input(0)->GetDataType()};
|
||||
switch (input_type) {
|
||||
case DT_FLOAT16:
|
||||
return ComputeSinh<Eigen::half>(ctx);
|
||||
case DT_FLOAT:
|
||||
return ComputeSinh<std::float_t>(ctx);
|
||||
case DT_DOUBLE:
|
||||
return ComputeSinh<std::double_t>(ctx);
|
||||
case DT_COMPLEX64:
|
||||
return ComputeSinh<std::complex<std::float_t> >(ctx);
|
||||
case DT_COMPLEX128:
|
||||
return ComputeSinh<std::complex<std::double_t> >(ctx);
|
||||
default:
|
||||
KERNEL_LOG_ERROR("Unsupported input data type [%s].", DTypeStr(input_type).c_str());
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
}
|
||||
} // namespace detail
|
||||
|
||||
std::uint32_t SinhCpuKernel::Compute(CpuKernelContext &ctx) {
|
||||
return detail::SinhCheck(ctx, kSinhInputNum, kSinhOutputNum) ? KERNEL_STATUS_PARAM_INVALID : detail::SinhCompute(ctx);
|
||||
}
|
||||
|
||||
REGISTER_CPU_KERNEL(kSinh, SinhCpuKernel);
|
||||
} // namespace aicpu
@@ -0,0 +1,26 @@
/**
|
||||
* Copyright 2021 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
#ifndef AICPU_KERNELS_NORMALIZED_SINH_H_
|
||||
#define AICPU_KERNELS_NORMALIZED_SINH_H_
|
||||
|
||||
#include "cpu_ops_kernel.h"
|
||||
|
||||
namespace aicpu {
|
||||
class SinhCpuKernel final : public CpuKernel {
|
||||
 public:
  std::uint32_t Compute(CpuKernelContext &ctx) override;
|
||||
};
|
||||
} // namespace aicpu
|
||||
#endif
@@ -0,0 +1,387 @@
/**
|
||||
* Copyright (c) Huawei Technologies Co., Ltd. 2022-2022. All rights reserved.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include "smooth_l1_loss_grad_v2.h"
|
||||
|
||||
#include <mutex>
|
||||
|
||||
#include "Eigen/Core"
|
||||
#include "cpu_kernel_utils.h"
|
||||
#include "kernel_log.h"
|
||||
#include "utils/kernel_util.h"
|
||||
|
||||
namespace {
|
||||
const uint32_t kInputNum = 3;
|
||||
const uint32_t kOutputNum = 1;
|
||||
const char *kSmoothL1LossGradV2 = "SmoothL1LossGradV2";
|
||||
const int64_t kParallelDataNum = 2 * 1024;
|
||||
const int64_t kParallelDataNumMid = 16 * 1024;
|
||||
const int64_t kParallelDataNumSameShape = 7 * 1024;
|
||||
const int64_t kParallelDataNumSameShapeMid = 35 * 1024;
|
||||
float sigma = 1.0;
|
||||
std::string reduction = "mean";
|
||||
std::mutex mtx;
|
||||
|
||||
#define SmoothL1LossGradV2_COMPUTE_CASE(DTYPE, REDUCTION, TYPE, CTX) \
|
||||
case (DTYPE): { \
|
||||
KERNEL_LOG_INFO("Compute [%s]", DTypeStr(data_type).c_str()); \
|
||||
uint32_t result = KERNEL_STATUS_PARAM_INVALID; \
|
||||
if ((REDUCTION) == "mean") { \
|
||||
result = ComputeMean<TYPE>(CTX); \
|
||||
} else if ((REDUCTION) == "sum") { \
|
||||
result = ComputeSum<TYPE>(CTX); \
|
||||
} else if ((REDUCTION) == "none") { \
|
||||
result = ComputeNone<TYPE>(CTX); \
|
||||
} \
|
||||
if (result != KERNEL_STATUS_OK) { \
|
||||
KERNEL_LOG_ERROR("SmoothL1LossGradV2 kernel compute failed."); \
|
||||
return result; \
|
||||
} \
|
||||
break; \
|
||||
}
|
||||
} // namespace
|
||||
|
||||
namespace aicpu {
|
||||
uint32_t SmoothL1LossGradV2CpuKernel::Compute(CpuKernelContext &ctx) {
|
||||
KERNEL_HANDLE_ERROR(NormalCheck(ctx, kInputNum, kOutputNum),
|
||||
"SmoothL1LossGradV2 check input and output number failed.");
|
||||
KERNEL_HANDLE_ERROR(ParamCheck(ctx), "SmoothL1LossGradV2 check params failed.");
|
||||
|
||||
auto data_type = ctx.Input(0)->GetDataType();
|
||||
switch (data_type) {
|
||||
SmoothL1LossGradV2_COMPUTE_CASE(DT_FLOAT16, reduction, Eigen::half, ctx)
|
||||
SmoothL1LossGradV2_COMPUTE_CASE(DT_FLOAT, reduction, float, ctx)
|
||||
    SmoothL1LossGradV2_COMPUTE_CASE(DT_DOUBLE, reduction, double, ctx)
    default:
      KERNEL_LOG_ERROR("SmoothL1LossGradV2 kernel data type [%s] not support.", DTypeStr(data_type).c_str());
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
|
||||
uint32_t SmoothL1LossGradV2CpuKernel::ParamCheck(CpuKernelContext &ctx) {
|
||||
Tensor *predict_tensor = ctx.Input(0);
|
||||
Tensor *label_tensor = ctx.Input(1);
|
||||
Tensor *dout_tensor = ctx.Input(2);
|
||||
Tensor *gradient_tensor = ctx.Output(0);
|
||||
DataType predict_type = predict_tensor->GetDataType();
|
||||
DataType label_type = label_tensor->GetDataType();
|
||||
DataType dout_type = dout_tensor->GetDataType();
|
||||
DataType gradient_type = gradient_tensor->GetDataType();
|
||||
KERNEL_CHECK_FALSE((predict_type == label_type), KERNEL_STATUS_PARAM_INVALID,
|
||||
"The data type of predict [%s] need be same with "
|
||||
"label [%s].",
|
||||
DTypeStr(predict_type).c_str(), DTypeStr(label_type).c_str());
|
||||
KERNEL_CHECK_FALSE((predict_type == dout_type), KERNEL_STATUS_PARAM_INVALID,
|
||||
"The data type of predict [%s] need be same with "
|
||||
"dout [%s].",
|
||||
DTypeStr(predict_type).c_str(), DTypeStr(dout_type).c_str());
|
||||
KERNEL_CHECK_FALSE((predict_type == gradient_type), KERNEL_STATUS_PARAM_INVALID,
|
||||
"The data type of predict [%s] need be same with "
|
||||
"gradient [%s].",
|
||||
DTypeStr(predict_type).c_str(), DTypeStr(gradient_type).c_str());
|
||||
auto predict_shape = predict_tensor->GetTensorShape();
|
||||
auto label_shape = label_tensor->GetTensorShape();
|
||||
auto gradient_shape = gradient_tensor->GetTensorShape();
|
||||
int32_t predict_dims = predict_shape->GetDims();
|
||||
int32_t label_dims = label_shape->GetDims();
|
||||
int32_t gradient_dims = gradient_shape->GetDims();
|
||||
KERNEL_CHECK_FALSE((predict_dims == label_dims), KERNEL_STATUS_PARAM_INVALID,
|
||||
"the input shape dim of predict [%d] need be same with "
|
||||
"label [%d].",
|
||||
predict_dims, label_dims);
|
||||
KERNEL_CHECK_FALSE((predict_dims == gradient_dims), KERNEL_STATUS_PARAM_INVALID,
|
||||
"the input shape dim of predict [%d] need be same with "
|
||||
"gradient [%d].",
|
||||
predict_dims, gradient_dims);
|
||||
for (int32_t i = 0; i < predict_dims; i++) {
|
||||
KERNEL_CHECK_FALSE((predict_shape->GetDimSize(i) == label_shape->GetDimSize(i)), KERNEL_STATUS_PARAM_INVALID,
|
||||
"the every input shape dim of predict [%d] need be same with "
|
||||
"label [%d] where dim in [%d].",
|
||||
predict_shape->GetDimSize(i), label_shape->GetDimSize(i), i);
|
||||
KERNEL_CHECK_FALSE((predict_shape->GetDimSize(i) == gradient_shape->GetDimSize(i)), KERNEL_STATUS_PARAM_INVALID,
|
||||
"the every input shape dim of predict [%d] need be same with "
|
||||
"gradient [%d] where dim in [%d].",
|
||||
predict_shape->GetDimSize(i), gradient_shape->GetDimSize(i), i);
|
||||
}
|
||||
KERNEL_LOG_DEBUG(
|
||||
"SmoothL1LossGradV2CpuKernel[%s], predict: size[%llu];"
|
||||
"label: size[%llu], dout: size[%llu], gradient: size[%llu].",
|
||||
ctx.GetOpType().c_str(), predict_tensor->GetDataSize(), label_tensor->GetDataSize(), dout_tensor->GetDataSize(),
|
||||
gradient_tensor->GetDataSize());
|
||||
return AttributesCheck(ctx);
|
||||
}
|
||||
|
||||
uint32_t SmoothL1LossGradV2CpuKernel::AttributesCheck(CpuKernelContext &ctx) {
|
||||
Tensor *predict_tensor = ctx.Input(0);
|
||||
Tensor *dout_tensor = ctx.Input(2);
|
||||
Tensor *gradient_tensor = ctx.Output(0);
|
||||
auto predict_shape = predict_tensor->GetTensorShape();
|
||||
auto dout_shape = dout_tensor->GetTensorShape();
|
||||
auto gradient_shape = gradient_tensor->GetTensorShape();
|
||||
int32_t predict_dims = predict_shape->GetDims();
|
||||
int32_t dout_dims = dout_shape->GetDims();
|
||||
int32_t gradient_dims = gradient_shape->GetDims();
|
||||
auto sigma_attr = ctx.GetAttr("sigma");
|
||||
auto reduction_attr = ctx.GetAttr("reduction");
|
||||
sigma = sigma_attr == nullptr ? 1.0 : sigma_attr->GetFloat();
|
||||
reduction = reduction_attr == nullptr ? "mean" : reduction_attr->GetString();
|
||||
KERNEL_CHECK_FALSE(sigma >= 0, KERNEL_STATUS_PARAM_INVALID,
|
||||
"the sigma value must greater than or equal to 0 "
|
||||
"when value of input sigma is [%f].",
|
||||
sigma);
|
||||
KERNEL_CHECK_FALSE((reduction == "none" || reduction == "mean" || reduction == "sum"), KERNEL_STATUS_PARAM_INVALID,
|
||||
"the reduction value must be a value in a range of ['none','mean','sum'].", reduction);
|
||||
if (reduction == "none" || reduction == "mean" || reduction == "sum") {
|
||||
KERNEL_CHECK_FALSE((predict_dims == gradient_dims), KERNEL_STATUS_PARAM_INVALID,
|
||||
"the input shape dim of predict [%d] need be same with "
|
||||
"gradient [%d].",
|
||||
predict_dims, gradient_dims);
|
||||
for (int32_t i = 0; i < predict_dims; i++) {
|
||||
KERNEL_CHECK_FALSE((predict_shape->GetDimSize(i) == gradient_shape->GetDimSize(i)), KERNEL_STATUS_PARAM_INVALID,
|
||||
"the input shape dim of predict [%d] must be same with "
|
||||
"gradient [%d] where dim in [%d].",
|
||||
predict_shape->GetDimSize(i), gradient_shape->GetDimSize(i), i);
|
||||
}
|
||||
}
|
||||
if (reduction == "none") {
|
||||
KERNEL_CHECK_FALSE((predict_dims == dout_dims), KERNEL_STATUS_PARAM_INVALID,
|
||||
"the input shape dim of predict [%d] need be same with "
|
||||
"dout [%d].",
|
||||
predict_dims, dout_dims);
|
||||
for (int32_t i = 0; i < predict_dims; i++) {
|
||||
KERNEL_CHECK_FALSE((predict_shape->GetDimSize(i) == dout_shape->GetDimSize(i)), KERNEL_STATUS_PARAM_INVALID,
|
||||
"the every input shape dim of predict [%d] need be same with "
|
||||
"dout [%d] where dim in [%d].",
|
||||
predict_shape->GetDimSize(i), dout_shape->GetDimSize(i), i);
|
||||
}
|
||||
} else if (reduction == "sum" || reduction == "mean") {
|
||||
KERNEL_CHECK_FALSE((dout_dims == 0) || ((dout_dims == 1) && (dout_tensor->NumElements() == 1)),
|
||||
KERNEL_STATUS_PARAM_INVALID, "the dout shape dim of dout [%d] need be a scalar.", dout_dims);
|
||||
}
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
|
||||
// 1 * dout if x >= sigma
|
||||
// -1 * dout if x <= -sigma
|
||||
// x / sigma * dout if |x| < sigma
|
||||
template <typename T>
|
||||
uint32_t SmoothL1LossGradV2CpuKernel::ComputeSum(CpuKernelContext &ctx) {
|
||||
KERNEL_LOG_INFO("SmoothL1LossGradV2CpuKernel::ComputeSum start");
|
||||
Tensor *predict_tensor = ctx.Input(0);
|
||||
Tensor *label_tensor = ctx.Input(1);
|
||||
Tensor *dout_tensor = ctx.Input(2);
|
||||
Tensor *gradient_tensor = ctx.Output(0);
|
||||
T *predict_val = static_cast<T *>(predict_tensor->GetData());
|
||||
T *label_val = static_cast<T *>(label_tensor->GetData());
|
||||
T *dout_val = static_cast<T *>(dout_tensor->GetData());
|
||||
T *gradient_val = static_cast<T *>(gradient_tensor->GetData());
|
||||
int64_t data_num = predict_tensor->NumElements();
|
||||
int64_t data_size = data_num * sizeof(T);
|
||||
T *result = gradient_val;
|
||||
if (data_size <= kParallelDataNum) {
|
||||
for (int64_t i = 0; i < data_num; i++) {
|
||||
T predict = *(predict_val + i);
|
||||
T label = *(label_val + i);
|
||||
T dout = *dout_val;
|
||||
T x = predict - label;
|
||||
if (x == T(0)) {
|
||||
*(result + i) = T(0) * dout;
|
||||
} else if (x <= -T(sigma)) {
|
||||
*(result + i) = T(-1) * dout;
|
||||
} else if (x >= T(sigma)) {
|
||||
*(result + i) = T(1) * dout;
|
||||
} else if (sigma == 0) {
|
||||
KERNEL_LOG_ERROR("attribute sigma could not be 0.");
|
||||
} else {
|
||||
*(result + i) = x / T(sigma) * dout;
|
||||
}
|
||||
}
|
||||
return KERNEL_STATUS_OK;
|
||||
} else {
|
||||
uint32_t min_core_num = 1;
|
||||
uint32_t max_core_num = std::max(min_core_num, aicpu::CpuKernelUtils::GetCPUNum(ctx));
|
||||
if (max_core_num == 0) {
|
||||
KERNEL_LOG_ERROR("max_core_num cannot be 0.");
|
||||
}
|
||||
if (max_core_num > data_num) {
|
||||
max_core_num = data_num;
|
||||
}
|
||||
auto shared_smoothl1lossgradv2 = [&](size_t start, size_t end) {
|
||||
for (size_t i = start; i < end; i++) {
|
||||
T predict = *(predict_val + i);
|
||||
T label = *(label_val + i);
|
||||
T dout = *dout_val;
|
||||
T x = predict - label;
|
||||
if (x == T(0)) {
|
||||
*(result + i) = T(0) * dout;
|
||||
} else if (x <= -T(sigma)) {
|
||||
*(result + i) = T(-1) * dout;
|
||||
} else if (x >= T(sigma)) {
|
||||
*(result + i) = T(1) * dout;
|
||||
} else if (sigma == 0) {
|
||||
KERNEL_LOG_ERROR("attribute sigma could not be 0.");
|
||||
} else {
|
||||
*(result + i) = x / T(sigma) * dout;
|
||||
}
|
||||
}
|
||||
};
|
||||
return CpuKernelUtils::ParallelFor(ctx, data_num, data_num / max_core_num, shared_smoothl1lossgradv2);
|
||||
}
|
||||
KERNEL_LOG_INFO("SmoothL1LossGradV2CpuKernel::ComputeSum end");
|
||||
}
|
||||
|
||||
// Mean's result is Sum's result divided by the total number of elements per
|
||||
// element
|
||||
template <typename T>
|
||||
uint32_t SmoothL1LossGradV2CpuKernel::ComputeMean(CpuKernelContext &ctx) {
|
||||
KERNEL_LOG_INFO("SmoothL1LossGradV2CpuKernel::ComputeMean start");
|
||||
Tensor *predict_tensor = ctx.Input(0);
|
||||
Tensor *label_tensor = ctx.Input(1);
|
||||
Tensor *dout_tensor = ctx.Input(2);
|
||||
Tensor *gradient_tensor = ctx.Output(0);
|
||||
T *predict_val = static_cast<T *>(predict_tensor->GetData());
|
||||
T *label_val = static_cast<T *>(label_tensor->GetData());
|
||||
T *dout_val = static_cast<T *>(dout_tensor->GetData());
|
||||
T *gradient_val = static_cast<T *>(gradient_tensor->GetData());
|
||||
int64_t data_num = predict_tensor->NumElements();
|
||||
if (data_num == 0) {
|
||||
KERNEL_LOG_ERROR("data_num cannot be 0.");
|
||||
}
|
||||
int64_t data_size = data_num * sizeof(T);
|
||||
T *result = gradient_val;
|
||||
if (data_size <= kParallelDataNum) {
|
||||
for (int64_t i = 0; i < data_num; i++) {
|
||||
T predict = *(predict_val + i);
|
||||
T label = *(label_val + i);
|
||||
T dout = *dout_val;
|
||||
T x = predict - label;
|
||||
if (x == T(0)) {
|
||||
*(result + i) = T(0) * dout;
|
||||
} else if (x <= -T(sigma)) {
|
||||
*(result + i) = T(-1) / data_num * dout;
|
||||
} else if (x >= T(sigma)) {
|
||||
*(result + i) = T(1) / data_num * dout;
|
||||
} else if (sigma == 0) {
|
||||
KERNEL_LOG_ERROR("attribute sigma could not be 0.");
|
||||
} else {
|
||||
*(result + i) = x / T(sigma) / data_num * dout;
|
||||
}
|
||||
}
|
||||
return KERNEL_STATUS_OK;
|
||||
} else {
|
||||
uint32_t min_core_num = 1;
|
||||
uint32_t max_core_num = std::max(min_core_num, aicpu::CpuKernelUtils::GetCPUNum(ctx));
|
||||
if (max_core_num == 0) {
|
||||
KERNEL_LOG_ERROR("max_core_num cannot be 0.");
|
||||
}
|
||||
if (max_core_num > data_num) {
|
||||
max_core_num = data_num;
|
||||
}
|
||||
auto shared_smoothl1lossgradv2 = [&](size_t start, size_t end) {
|
||||
for (size_t i = start; i < end; i++) {
|
||||
T predict = *(predict_val + i);
|
||||
T label = *(label_val + i);
|
||||
T dout = *dout_val;
|
||||
T x = predict - label;
|
||||
if (x == T(0)) {
|
||||
*(result + i) = T(0) * dout;
|
||||
} else if (x <= -T(sigma)) {
|
||||
*(result + i) = T(-1) / data_num * dout;
|
||||
} else if (x >= T(sigma)) {
|
||||
*(result + i) = T(1) / data_num * dout;
|
||||
} else if (sigma == 0) {
|
||||
KERNEL_LOG_ERROR("attribute sigma could not be 0.");
|
||||
} else {
|
||||
*(result + i) = x / T(sigma) / data_num * dout;
|
||||
}
|
||||
}
|
||||
};
|
||||
return CpuKernelUtils::ParallelFor(ctx, data_num, data_num / max_core_num, shared_smoothl1lossgradv2);
|
||||
}
|
||||
KERNEL_LOG_INFO("SmoothL1LossGradV2CpuKernel::ComputeMean end");
|
||||
}
|
||||
|
||||
// "None" takes grad_output as a parameter,
|
||||
// and the end result is that result of "Sum" is multiplied by the grad_output
|
||||
// one by one, that is, the weight is increased
|
||||
template <typename T>
|
||||
uint32_t SmoothL1LossGradV2CpuKernel::ComputeNone(CpuKernelContext &ctx) {
|
||||
KERNEL_LOG_INFO("SmoothL1LossGradV2CpuKernel::ComputeNone start");
|
||||
Tensor *predict_tensor = ctx.Input(0);
|
||||
Tensor *label_tensor = ctx.Input(1);
|
||||
Tensor *dout_tensor = ctx.Input(2);
|
||||
Tensor *gradient_tensor = ctx.Output(0);
|
||||
T *predict_val = static_cast<T *>(predict_tensor->GetData());
|
||||
T *label_val = static_cast<T *>(label_tensor->GetData());
|
||||
T *dout_val = static_cast<T *>(dout_tensor->GetData());
|
||||
T *gradient_val = static_cast<T *>(gradient_tensor->GetData());
|
||||
int64_t data_num = predict_tensor->NumElements();
|
||||
int64_t data_size = data_num * sizeof(T);
|
||||
T *result = gradient_val;
|
||||
if (data_size <= kParallelDataNum) {
|
||||
for (int64_t i = 0; i < data_num; i++) {
|
||||
T predict = *(predict_val + i);
|
||||
T label = *(label_val + i);
|
||||
T x = predict - label;
|
||||
T dout = *(dout_val + i);
|
||||
if (x == T(0)) {
|
||||
*(result + i) = T(0) * dout;
|
||||
} else if (x <= -T(sigma)) {
|
||||
*(result + i) = T(-1) * dout;
|
||||
} else if (x >= T(sigma)) {
|
||||
*(result + i) = T(1) * dout;
|
||||
} else if (sigma == 0) {
|
||||
KERNEL_LOG_ERROR("attribute sigma could not be 0.");
|
||||
} else {
|
||||
*(result + i) = dout * x / T(sigma);
|
||||
}
|
||||
}
|
||||
return KERNEL_STATUS_OK;
|
||||
} else {
|
||||
uint32_t min_core_num = 1;
|
||||
uint32_t max_core_num = std::max(min_core_num, aicpu::CpuKernelUtils::GetCPUNum(ctx));
|
||||
if (max_core_num == 0) {
|
||||
KERNEL_LOG_ERROR("max_core_num cannot be 0.");
|
||||
}
|
||||
if (max_core_num > data_num) {
|
||||
max_core_num = data_num;
|
||||
}
|
||||
auto shared_smoothl1lossgradv2 = [&](size_t start, size_t end) {
|
||||
for (size_t i = start; i < end; i++) {
|
||||
T predict = *(predict_val + i);
|
||||
T label = *(label_val + i);
|
||||
T x = predict - label;
|
||||
T dout = *(dout_val + i);
|
||||
if (x == T(0)) {
|
||||
*(result + i) = T(0) * dout;
|
||||
} else if (x <= -T(sigma)) {
|
||||
*(result + i) = T(-1) * dout;
|
||||
} else if (x >= T(sigma)) {
|
||||
*(result + i) = T(1) * dout;
|
||||
} else if (sigma == 0) {
|
||||
KERNEL_LOG_ERROR("attribute sigma could not be 0.");
|
||||
} else {
|
||||
*(result + i) = dout * x / T(sigma);
|
||||
}
|
||||
}
|
||||
};
|
||||
return CpuKernelUtils::ParallelFor(ctx, data_num, data_num / max_core_num, shared_smoothl1lossgradv2);
|
||||
}
|
||||
KERNEL_LOG_INFO("SmoothL1LossGradV2CpuKernel::ComputeNone end");
|
||||
}
|
||||
|
||||
REGISTER_CPU_KERNEL(kSmoothL1LossGradV2, SmoothL1LossGradV2CpuKernel);
|
||||
} // namespace aicpu
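A scalar sketch of the piecewise gradient described in the comments above, for reduction "none" and with illustrative values (mean divides the same expression by the element count, sum broadcasts a scalar dout); not part of the kernel:

#include <cstdio>

// x = predict - label; dout is the incoming gradient.
double SmoothL1Grad(double x, double sigma, double dout) {
  if (x <= -sigma) {
    return -1.0 * dout;
  }
  if (x >= sigma) {
    return 1.0 * dout;
  }
  return x / sigma * dout;  // |x| < sigma
}

int main() {
  std::printf("%f %f %f\n", SmoothL1Grad(-2.0, 1.0, 1.0), SmoothL1Grad(0.25, 1.0, 1.0),
              SmoothL1Grad(3.0, 1.0, 1.0));  // -1.000000 0.250000 1.000000
  return 0;
}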
@@ -0,0 +1,44 @@
/**
|
||||
* Copyright (c) Huawei Technologies Co., Ltd. 2022-2022. All rights reserved.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#ifndef AICPU_KERNELS_NORMALIZED_SMOOTH_L1_LOSS_GRAD_V2_H_
|
||||
#define AICPU_KERNELS_NORMALIZED_SMOOTH_L1_LOSS_GRAD_V2_H_
|
||||
|
||||
#include <string>
|
||||
#include "cpu_ops_kernel.h"
|
||||
#include "utils/bcast.h"
|
||||
|
||||
namespace aicpu {
|
||||
class SmoothL1LossGradV2CpuKernel : public CpuKernel {
|
||||
public:
|
||||
SmoothL1LossGradV2CpuKernel() = default;
|
||||
~SmoothL1LossGradV2CpuKernel() = default;
|
||||
|
||||
protected:
|
||||
uint32_t Compute(CpuKernelContext &ctx) override;
|
||||
|
||||
private:
|
||||
uint32_t ParamCheck(CpuKernelContext &ctx);
|
||||
uint32_t AttributesCheck(CpuKernelContext &ctx);
|
||||
template <typename T>
|
||||
uint32_t ComputeMean(CpuKernelContext &ctx);
|
||||
template <typename T>
|
||||
uint32_t ComputeSum(CpuKernelContext &ctx);
|
||||
template <typename T>
|
||||
uint32_t ComputeNone(CpuKernelContext &ctx);
|
||||
};
|
||||
} // namespace aicpu
|
||||
#endif // AICPU_KERNELS_NORMALIZED_SMOOTH_L1_LOSS_GRAD_V2_H_
@@ -0,0 +1,278 @@
/**
|
||||
* Copyright (c) Huawei Technologies Co., Ltd. 2022-2022. All rights reserved.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include "smooth_l1_loss_v2.h"
|
||||
|
||||
#include <mutex>
|
||||
|
||||
#include "Eigen/Core"
|
||||
#include "cpu_kernel_utils.h"
|
||||
#include "kernel_log.h"
|
||||
#include "utils/kernel_util.h"
|
||||
|
||||
namespace {
|
||||
const char *SmoothL1LossV2 = "SmoothL1LossV2";
|
||||
const uint32_t kInputNum = 2;
|
||||
const uint32_t kOutputNum = 1;
|
||||
constexpr int64_t kParallelDataNums = 16 * 1024;
|
||||
const float opHalf = 0.5;
|
||||
float sigma = 1.0;
|
||||
std::string reduction = "mean";
|
||||
std::mutex mtx;
|
||||
|
||||
#define COMPUTE_CASE(DTYPE, REDUCTION, TYPE, CTX) \
|
||||
case (DTYPE): { \
|
||||
KERNEL_LOG_DEBUG("Compute [%s]", DTypeStr(data_type).c_str()); \
|
||||
uint32_t result = KERNEL_STATUS_PARAM_INVALID; \
|
||||
if ((REDUCTION) == "mean") { \
|
||||
result = ComputeMean<TYPE>(CTX); \
|
||||
} else if ((REDUCTION) == "sum") { \
|
||||
result = ComputeSum<TYPE>(CTX); \
|
||||
} else if ((REDUCTION) == "none") { \
|
||||
result = ComputeNone<TYPE>(CTX); \
|
||||
} \
|
||||
if (result != KERNEL_STATUS_OK) { \
|
||||
KERNEL_LOG_ERROR("SmoothL1LossV2 compute failed."); \
|
||||
return result; \
|
||||
} \
|
||||
break; \
|
||||
}
|
||||
} // namespace
|
||||
|
||||
namespace aicpu {
|
||||
uint32_t SmoothL1LossV2CpuKernel::Compute(CpuKernelContext &ctx) {
|
||||
KERNEL_HANDLE_ERROR(NormalCheck(ctx, kInputNum, kOutputNum), "Check SmoothL1LossV2 params failed.");
|
||||
KERNEL_HANDLE_ERROR(ParamCheck(ctx), "Check SmoothL1LossV2 params failed.");
|
||||
|
||||
auto data_type = ctx.Input(0)->GetDataType();
|
||||
switch (data_type) {
|
||||
COMPUTE_CASE(DT_FLOAT16, reduction, Eigen::half, ctx)
|
||||
COMPUTE_CASE(DT_FLOAT, reduction, float, ctx)
|
||||
COMPUTE_CASE(DT_DOUBLE, reduction, double, ctx)
|
||||
default:
|
||||
KERNEL_LOG_ERROR("SmoothL1LossV2 data type [%s] not support.", DTypeStr(data_type).c_str());
|
||||
return KERNEL_STATUS_PARAM_INVALID;
|
||||
}
|
||||
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
|
||||
uint32_t SmoothL1LossV2CpuKernel::ParamCheck(CpuKernelContext &ctx) {
|
||||
Tensor *input_0 = ctx.Input(0);
|
||||
Tensor *input_1 = ctx.Input(1);
|
||||
Tensor *output_0 = ctx.Output(0);
|
||||
DataType input0_type = input_0->GetDataType();
|
||||
DataType input1_type = input_1->GetDataType();
|
||||
DataType output0_type = output_0->GetDataType();
|
||||
|
||||
KERNEL_CHECK_FALSE((input0_type == input1_type), KERNEL_STATUS_PARAM_INVALID,
|
||||
"The data type of input0 [%s] need be same with "
|
||||
"input1 [%s].",
|
||||
DTypeStr(input0_type).c_str(), DTypeStr(input1_type).c_str());
|
||||
KERNEL_CHECK_FALSE((input0_type == output0_type), KERNEL_STATUS_PARAM_INVALID,
|
||||
"The data type of input0 [%s] need be same with "
|
||||
"output0 [%s].",
|
||||
DTypeStr(input0_type).c_str(), DTypeStr(output0_type).c_str());
|
||||
auto input0_shape = input_0->GetTensorShape();
|
||||
auto input1_shape = input_1->GetTensorShape();
|
||||
int32_t input0_dims = input0_shape->GetDims();
|
||||
int32_t input1_dims = input1_shape->GetDims();
|
||||
KERNEL_CHECK_FALSE((input0_dims == input1_dims), KERNEL_STATUS_PARAM_INVALID,
|
||||
"the input shape dim of input0 [%d] need be same with "
|
||||
"input1 [%d].",
|
||||
input0_dims, input1_dims);
|
||||
for (int32_t i = 0; i < input0_dims; i++) {
|
||||
KERNEL_CHECK_FALSE((input0_shape->GetDimSize(i) == input1_shape->GetDimSize(i)), KERNEL_STATUS_PARAM_INVALID,
|
||||
"the every input shape dim of input0 [%d] need be same with "
|
||||
"input1 [%d] where dim in [%d].",
|
||||
input0_shape->GetDimSize(i), input1_shape->GetDimSize(i), i);
|
||||
}
|
||||
KERNEL_LOG_DEBUG(
|
||||
"SmoothL1LossV2CpuKernel[%s], input0: size[%llu];"
|
||||
"input1: size[%llu], output: size[%llu].",
|
||||
ctx.GetOpType().c_str(), input_0->GetDataSize(), input_1->GetDataSize(), output_0->GetDataSize());
|
||||
|
||||
return AttributeCheck(ctx);
|
||||
}
|
||||
|
||||
uint32_t SmoothL1LossV2CpuKernel::AttributeCheck(CpuKernelContext &ctx) {
|
||||
Tensor *input_0 = ctx.Input(0);
|
||||
Tensor *output_0 = ctx.Output(0);
|
||||
auto input0_shape = input_0->GetTensorShape();
|
||||
auto output0_shape = output_0->GetTensorShape();
|
||||
int32_t input0_dims = input0_shape->GetDims();
|
||||
int32_t output0_dims = output0_shape->GetDims();
|
||||
|
||||
auto sigma_attr = ctx.GetAttr("sigma");
|
||||
auto reduction_attr = ctx.GetAttr("reduction");
|
||||
sigma = sigma_attr == nullptr ? 1.0 : sigma_attr->GetFloat();
|
||||
reduction = reduction_attr == nullptr ? "mean" : reduction_attr->GetString();
|
||||
KERNEL_CHECK_FALSE(sigma >= 0, KERNEL_STATUS_PARAM_INVALID,
|
||||
"the sigma value need to greater than or equal to 0 "
|
||||
"when input sigma value is [%f].",
|
||||
sigma);
|
||||
KERNEL_CHECK_FALSE((reduction == "none" || reduction == "mean" || reduction == "sum"), KERNEL_STATUS_PARAM_INVALID,
|
||||
"the reduction value need be the member of ['none','mean','sum'] "
|
||||
"when input reduction value is [%s].",
|
||||
reduction.c_str());
|
||||
if (reduction == "none") {
|
||||
KERNEL_CHECK_FALSE((input0_dims == output0_dims), KERNEL_STATUS_PARAM_INVALID,
|
||||
"the input shape dim of input0 [%d] need be same with "
|
||||
"output0 [%d].",
|
||||
input0_dims, output0_dims);
|
||||
for (int32_t i = 0; i < input0_dims; i++) {
|
||||
KERNEL_CHECK_FALSE((input0_shape->GetDimSize(i) == output0_shape->GetDimSize(i)), KERNEL_STATUS_PARAM_INVALID,
|
||||
"the every input shape dim of input0 [%d] need be same with "
|
||||
"output0 [%d] where dim in [%d].",
|
||||
input0_shape->GetDimSize(i), output0_shape->GetDimSize(i), i);
|
||||
}
|
||||
} else if (reduction == "sum" || reduction == "mean") {
|
||||
KERNEL_CHECK_FALSE((output0_dims == 0) || ((output0_dims == 1) && (output_0->NumElements() == 1)),
|
||||
KERNEL_STATUS_PARAM_INVALID, "the output shape dim of output0 [%d] need be [1] or a scalar.",
|
||||
output0_dims);
|
||||
}
|
||||
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
uint32_t SmoothL1LossV2CpuKernel::ComputeMean(CpuKernelContext &ctx) {
|
||||
uint32_t compute_sum_res = ComputeSum<T>(ctx);
|
||||
if (compute_sum_res != KERNEL_STATUS_OK) {
|
||||
return compute_sum_res;
|
||||
}
|
||||
Tensor *predict_tensor = ctx.Input(0);
|
||||
int64_t data_num = predict_tensor->NumElements();
|
||||
Tensor *loss_tensor = ctx.Output(0);
|
||||
T *loss_val = reinterpret_cast<T *>(loss_tensor->GetData());
|
||||
T *res = loss_val;
|
||||
if (data_num == 0) {
|
||||
*(res) = T(0);
return KERNEL_STATUS_OK;
|
||||
}
|
||||
*(res) = *(res) / static_cast<T>(data_num);
|
||||
|
||||
return KERNEL_STATUS_OK;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
uint32_t SmoothL1LossV2CpuKernel::ComputeSum(CpuKernelContext &ctx) {
|
||||
Tensor *predict_tensor = ctx.Input(0);
|
||||
Tensor *label_tensor = ctx.Input(1);
|
||||
Tensor *loss_tensor = ctx.Output(0);
|
||||
T *predict_val = reinterpret_cast<T *>(predict_tensor->GetData());
|
||||
T *label_val = reinterpret_cast<T *>(label_tensor->GetData());
|
||||
T *loss_val = reinterpret_cast<T *>(loss_tensor->GetData());
|
||||
int64_t data_num = predict_tensor->NumElements();
|
||||
int64_t data_size = data_num * sizeof(T);
|
||||
|
||||
double res = 0;
|
||||
if (data_size <= kParallelDataNums) {
|
||||
for (int64_t i = 0; i < data_num; i++) {
|
||||
T predict = *(predict_val + i);
|
||||
T label = *(label_val + i);
|
||||
T z = predict - label > T(0) ? predict - label : label - predict;
|
||||
if (sigma == 0) {
|
||||
res += static_cast<double>(z);
|
||||
} else {
|
||||
res += static_cast<double>(z < T(sigma) ? T(opHalf) * z * z / T(sigma) : z - T(opHalf) * T(sigma));
|
||||
}
|
||||
}
|
||||
*(loss_val) = static_cast<T>(res);
|
||||
return KERNEL_STATUS_OK;
|
||||
} else {
|
||||
auto shared_smoothl1lossv2 = [&](size_t start, size_t end) -> double {
|
||||
double sum = 0;
|
||||
for (size_t i = start; i < end; i++) {
|
||||
T predict = *(predict_val + i);
|
||||
T label = *(label_val + i);
|
||||
T z = predict - label > T(0) ? predict - label : label - predict;
|
||||
if (sigma == 0) {
|
||||
sum += static_cast<double>(z);
|
||||
} else {
|
||||
sum += static_cast<double>(z < T(sigma) ? T(opHalf) * z * z / T(sigma) : z - T(opHalf) * T(sigma));
|
||||
}
|
||||
}
|
||||
mtx.lock();
|
||||
res = res + sum;
|
||||
mtx.unlock();
|
||||
|
||||
};
|
||||
|
||||
uint32_t min_core_num = 1;
|
||||
uint32_t max_core_num = std::max(min_core_num, aicpu::CpuKernelUtils::GetCPUNum(ctx));
|
||||
if (max_core_num > data_num) {
|
||||
max_core_num = data_num;
|
||||
}
|
||||
if (max_core_num == 0) {
|
||||
KERNEL_LOG_ERROR("max_core_num could not be 0.");
|
||||
}
|
||||
auto result = CpuKernelUtils::ParallelFor(ctx, data_num, data_num / max_core_num, shared_smoothl1lossv2);
|
||||
*(loss_val) = static_cast<T>(res);
|
||||
return result;
|
||||
}
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
uint32_t SmoothL1LossV2CpuKernel::ComputeNone(CpuKernelContext &ctx) {
|
||||
Tensor *predict_tensor = ctx.Input(0);
|
||||
Tensor *label_tensor = ctx.Input(1);
|
||||
Tensor *loss_tensor = ctx.Output(0);
|
||||
T *predict_val = reinterpret_cast<T *>(predict_tensor->GetData());
|
||||
T *label_val = reinterpret_cast<T *>(label_tensor->GetData());
|
||||
T *loss_val = reinterpret_cast<T *>(loss_tensor->GetData());
|
||||
int64_t data_num = predict_tensor->NumElements();
|
||||
|
||||
T *res = loss_val;
|
||||
int64_t data_size = data_num * sizeof(T);
|
||||
if (data_size <= kParallelDataNums) {
|
||||
for (int64_t i = 0; i < data_num; i++) {
|
||||
T predict = *(predict_val + i);
|
||||
T label = *(label_val + i);
|
||||
T z = predict - label > T(0) ? predict - label : label - predict;
|
||||
if (sigma == 0) {
|
||||
*(res + i) = z;
|
||||
} else {
|
||||
*(res + i) = z < T(sigma) ? T(opHalf) * z * z / T(sigma) : z - T(opHalf) * T(sigma);
|
||||
}
|
||||
}
|
||||
return KERNEL_STATUS_OK;
|
||||
} else {
|
||||
auto shared_smoothl1lossv2 = [&](size_t start, size_t end) {
|
||||
for (size_t i = start; i < end; i++) {
|
||||
T predict = *(predict_val + i);
|
||||
T label = *(label_val + i);
|
||||
T z = predict - label > T(0) ? predict - label : label - predict;
|
||||
if (sigma == 0) {
|
||||
*(res + i) = z;
|
||||
} else {
|
||||
*(res + i) = z < T(sigma) ? T(opHalf) * z * z / T(sigma) : z - T(opHalf) * T(sigma);
|
||||
}
|
||||
}
|
||||
};
|
||||
uint32_t min_core_num = 1;
|
||||
uint32_t max_core_num = std::max(min_core_num, aicpu::CpuKernelUtils::GetCPUNum(ctx));
|
||||
if (max_core_num > data_num) {
|
||||
max_core_num = data_num;
|
||||
}
|
||||
if (max_core_num == 0) {
|
||||
KERNEL_LOG_ERROR("max_core_num could not be 0.");
|
||||
}
|
||||
return CpuKernelUtils::ParallelFor(ctx, data_num, data_num / max_core_num, shared_smoothl1lossv2);
|
||||
}
|
||||
}
|
||||
|
||||
REGISTER_CPU_KERNEL(SmoothL1LossV2, SmoothL1LossV2CpuKernel);
|
||||
} // namespace aicpu
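For cross-checking outside the AICPU framework, the sketch below restates the elementwise rule implemented by ComputeSum/ComputeNone above: with z = |predict - label|, the loss is 0.5 * z * z / sigma when z < sigma and z - 0.5 * sigma otherwise (plain z when sigma is 0), followed by the requested reduction. It is a minimal standalone reference for float inputs; the function name and the use of std::vector are illustrative and not part of this change.

// Minimal reference implementation of SmoothL1LossV2 (illustrative, not kernel code).
#include <cmath>
#include <cstddef>
#include <string>
#include <vector>

std::vector<float> SmoothL1LossV2Reference(const std::vector<float> &predict, const std::vector<float> &label,
                                           float sigma, const std::string &reduction) {
  std::vector<float> loss(predict.size());
  for (std::size_t i = 0; i < predict.size(); ++i) {
    const float z = std::fabs(predict[i] - label[i]);
    // Same branch structure as the kernel: quadratic inside the sigma band, linear outside.
    loss[i] = (sigma == 0.0f || z >= sigma) ? z - 0.5f * sigma : 0.5f * z * z / sigma;
  }
  if (reduction == "none") {
    return loss;
  }
  double sum = 0.0;
  for (const float v : loss) {
    sum += v;
  }
  if (reduction == "mean" && !loss.empty()) {
    sum /= static_cast<double>(loss.size());
  }
  return {static_cast<float>(sum)};  // "sum" and "mean" reduce to a single element
}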
|
|
@@ -0,0 +1,42 @@
|
|||
/**
|
||||
* Copyright (c) Huawei Technologies Co., Ltd. 2022-2022. All rights reserved.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#ifndef AICPU_KERNELS_NORMALIZED_SMOOTH_L1_LOSS_V2_H_
|
||||
#define AICPU_KERNELS_NORMALIZED_SMOOTH_L1_LOSS_V2_H_
|
||||
|
||||
#include "cpu_ops_kernel.h"
|
||||
|
||||
namespace aicpu {
|
||||
class SmoothL1LossV2CpuKernel : public CpuKernel {
|
||||
public:
|
||||
SmoothL1LossV2CpuKernel() = default;
|
||||
~SmoothL1LossV2CpuKernel() = default;
|
||||
|
||||
protected:
|
||||
uint32_t Compute(CpuKernelContext &ctx) override;
|
||||
|
||||
private:
|
||||
uint32_t ParamCheck(CpuKernelContext &ctx);
|
||||
uint32_t AttributeCheck(CpuKernelContext &ctx);
|
||||
template <typename T>
|
||||
uint32_t ComputeMean(CpuKernelContext &ctx);
|
||||
template <typename T>
|
||||
uint32_t ComputeSum(CpuKernelContext &ctx);
|
||||
template <typename T>
|
||||
uint32_t ComputeNone(CpuKernelContext &ctx);
|
||||
};
|
||||
} // namespace aicpu
|
||||
#endif // AICPU_KERNELS_NORMALIZED_SMOOTH_L1_LOSS_V2_H_
|
|
@@ -0,0 +1,185 @@
|
|||
/**
|
||||
* Copyright (c) Huawei Technologies Co., Ltd. 2020-2021. All rights reserved.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#ifndef _AICPU_AICPU_DEVICE_CPU_KERNELS_UTILS_PHILOX_RANDOM_H
|
||||
#define _AICPU_AICPU_DEVICE_CPU_KERNELS_UTILS_PHILOX_RANDOM_H
|
||||
|
||||
#include <stdint.h>
|
||||
#include "cpu_kernel/common/status.h"
|
||||
|
||||
/**
|
||||
* A class that represents an inline array.
|
||||
* Arguments:
|
||||
* T: the array element type;
|
||||
* ElementCount: the fixed size of the array;
|
||||
*/
|
||||
template <typename T, int ElementCount>
|
||||
class Array {
|
||||
public:
|
||||
static constexpr int kElementCount = ElementCount;
|
||||
Array() {
|
||||
for (int i = 0; i < ElementCount; ++i) {
|
||||
data_[i] = T(0);
|
||||
}
|
||||
}
|
||||
|
||||
const T &operator[](int index) const { return data_[index]; }
|
||||
|
||||
T &operator[](int index) { return data_[index]; }
|
||||
|
||||
size_t size() const { return ElementCount; }
|
||||
|
||||
private:
|
||||
T data_[ElementCount];
|
||||
};
|
||||
|
||||
class PhiloxRandom {
|
||||
public:
|
||||
using ResultType = Array<uint32_t, 4>;
|
||||
using ResultElementType = uint32_t;
|
||||
// The number of elements that will be returned.
|
||||
static constexpr int kResultElementCount = 4;
|
||||
// Cost of generation of a single element (in cycles).
|
||||
static constexpr int kElementCost = 10;
|
||||
/*
|
||||
* The type for the 64-bit key stored in the form of two 32-bit uint
|
||||
* that are used in the diffusion process.
|
||||
*/
|
||||
using Key = Array<uint32_t, 2>;
|
||||
|
||||
PhiloxRandom() {}
|
||||
|
||||
PhiloxRandom(int64_t seed, uint64_t offset) {
|
||||
const uint32_t seed_low_index = 0;
|
||||
const uint32_t seed_high_index = 1;
|
||||
const uint32_t offset_low_index = 2;
|
||||
const uint32_t offset_high_index = 3;
|
||||
key_[seed_low_index] = static_cast<uint32_t>(seed);
|
||||
key_[seed_high_index] = static_cast<uint32_t>(seed >> 32);
|
||||
counter_[offset_low_index] = static_cast<uint32_t>(offset);
|
||||
counter_[offset_high_index] = static_cast<uint32_t>(offset >> 32);
|
||||
}
|
||||
|
||||
ResultType const &counter() const { return counter_; }
|
||||
|
||||
Key const &key() const { return key_; }
|
||||
|
||||
// Skip the specified number of 128-bit samples in the current stream.
|
||||
void Skip(uint64_t count) {
|
||||
const uint32_t count_lo = static_cast<uint32_t>(count);
|
||||
uint32_t count_hi = static_cast<uint32_t>(count >> 32);
|
||||
|
||||
counter_[0] += count_lo;
|
||||
if (counter_[0] < count_lo) {
|
||||
++count_hi;
|
||||
}
|
||||
|
||||
counter_[1] += count_hi;
|
||||
if (counter_[1] < count_hi) {
|
||||
if (++counter_[2] == 0) {
|
||||
++counter_[3];
|
||||
}
|
||||
}
|
||||
}
|
||||
/*
|
||||
* Returns a group of four random numbers using the underlying Philox
|
||||
* algorithm.
|
||||
*/
|
||||
ResultType operator()() {
|
||||
ResultType counter = counter_;
|
||||
Key key = key_;
|
||||
/*
|
||||
* Run the single round ten times. The loop is manually unrolled
|
||||
* for better performance.
|
||||
*/
|
||||
counter = ComputeSingleRound(counter, key);
|
||||
RaiseKey(&key);
|
||||
counter = ComputeSingleRound(counter, key);
|
||||
RaiseKey(&key);
|
||||
counter = ComputeSingleRound(counter, key);
|
||||
RaiseKey(&key);
|
||||
counter = ComputeSingleRound(counter, key);
|
||||
RaiseKey(&key);
|
||||
counter = ComputeSingleRound(counter, key);
|
||||
RaiseKey(&key);
|
||||
counter = ComputeSingleRound(counter, key);
|
||||
RaiseKey(&key);
|
||||
counter = ComputeSingleRound(counter, key);
|
||||
RaiseKey(&key);
|
||||
counter = ComputeSingleRound(counter, key);
|
||||
RaiseKey(&key);
|
||||
counter = ComputeSingleRound(counter, key);
|
||||
RaiseKey(&key);
|
||||
counter = ComputeSingleRound(counter, key);
|
||||
SkipOne();
|
||||
return counter;
|
||||
}
|
||||
|
||||
private:
|
||||
// We use the same constants as recommended by the original paper.
|
||||
static constexpr uint32_t kPhiloxW32A = 0x9E3779B9;
|
||||
static constexpr uint32_t kPhiloxW32B = 0xBB67AE85;
|
||||
static constexpr uint32_t kPhiloxM4x32A = 0xD2511F53;
|
||||
static constexpr uint32_t kPhiloxM4x32B = 0xCD9E8D57;
|
||||
|
||||
// Helper function to skip the next 128-bit sample in the current stream.
|
||||
void SkipOne() {
|
||||
if (++counter_[0] == 0) {
|
||||
if (++counter_[1] == 0) {
|
||||
if (++counter_[2] == 0) {
|
||||
++counter_[3];
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
/*
|
||||
* Helper function to return the lower and higher 32-bits from two 32-bit
|
||||
* integer multiplications.
|
||||
*/
|
||||
static void MultiplyHighLow(uint32_t a, uint32_t b, uint32_t *result_low, uint32_t *result_high) {
|
||||
const uint64_t product = static_cast<uint64_t>(a) * b;
|
||||
*result_low = static_cast<uint32_t>(product);
|
||||
*result_high = static_cast<uint32_t>(product >> 32);
|
||||
}
|
||||
|
||||
// Helper function for a single round of the underlying Philox algorithm.
|
||||
static ResultType ComputeSingleRound(const ResultType &counter, const Key &key) {
|
||||
uint32_t lo0;
|
||||
uint32_t hi0;
|
||||
MultiplyHighLow(kPhiloxM4x32A, counter[0], &lo0, &hi0);
|
||||
|
||||
uint32_t lo1;
|
||||
uint32_t hi1;
|
||||
MultiplyHighLow(kPhiloxM4x32B, counter[2], &lo1, &hi1);
|
||||
|
||||
ResultType result;
|
||||
result[0] = hi1 ^ counter[1] ^ key[0];
|
||||
result[1] = lo1;
|
||||
result[2] = hi0 ^ counter[3] ^ key[1];
|
||||
result[3] = lo0;
|
||||
return result;
|
||||
}
|
||||
|
||||
void RaiseKey(Key *key) {
|
||||
(*key)[0] += kPhiloxW32A;
|
||||
(*key)[1] += kPhiloxW32B;
|
||||
}
|
||||
|
||||
private:
|
||||
ResultType counter_;
|
||||
Key key_;
|
||||
};
|
||||
#endif  // _AICPU_AICPU_DEVICE_CPU_KERNELS_UTILS_PHILOX_RANDOM_H
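A brief usage sketch of the PhiloxRandom class above; the demo function name and the seed/offset values are arbitrary. Each call to operator() returns four 32-bit samples and advances the counter, while Skip() jumps ahead in the stream without generating output.

#include <cstdint>
#include <cstdio>

// Assumes the Array/PhiloxRandom declarations above are visible in this translation unit.
void PhiloxRandomDemo() {
  PhiloxRandom gen(/*seed=*/2023, /*offset=*/0);
  gen.Skip(10);  // jump over the first ten 128-bit blocks of the stream
  PhiloxRandom::ResultType block = gen();  // four uint32_t samples
  for (int i = 0; i < PhiloxRandom::kResultElementCount; ++i) {
    std::printf("sample[%d] = %u\n", i, block[i]);
  }
}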
|
|
@@ -0,0 +1,35 @@
|
|||
/**
|
||||
* Copyright 2021 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include "sampling_kernels.h"
|
||||
#include <algorithm>
|
||||
#include "kernel_log.h"
|
||||
#include "status.h"
|
||||
using namespace std;
|
||||
|
||||
namespace aicpu {
|
||||
SamplingKernelType SamplingKernelTypeFromString(std::string str) {
|
||||
if (str == "lanczos1") return Lanczos1Kernel;
|
||||
if (str == "lanczos3") return Lanczos3Kernel;
|
||||
if (str == "lanczos5") return Lanczos5Kernel;
|
||||
if (str == "gaussian") return GaussianKernel;
|
||||
if (str == "box") return BoxKernel;
|
||||
if (str == "triangle") return TriangleKernel;
|
||||
if (str == "keyscubic") return KeysCubicKernel;
|
||||
if (str == "mitchellcubic") return MitchellCubicKernel;
|
||||
return SamplingKernelTypeEnd;
|
||||
}
|
||||
} // namespace aicpu
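A hedged example of how the SamplingKernelTypeEnd sentinel returned above is typically consumed; the helper name is illustrative and assumes the declarations from sampling_kernels.h.

#include <string>

// Returns true only when the attribute string names a known sampling kernel.
bool IsSupportedSamplingKernel(const std::string &name) {
  return aicpu::SamplingKernelTypeFromString(name) != aicpu::SamplingKernelTypeEnd;
}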
|
|
@@ -0,0 +1,199 @@
|
|||
/**
|
||||
* Copyright 2021 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#ifndef AICPU_UTILS_SAMPLING_KERNELS_H_
|
||||
#define AICPU_UTILS_SAMPLING_KERNELS_H_
|
||||
|
||||
#include <cmath>
|
||||
#include <stdio.h>
|
||||
#include "cpu_context.h"
|
||||
|
||||
namespace aicpu {
|
||||
// Defines functions for different types of sampling kernels.
|
||||
enum SamplingKernelType {
|
||||
// Lanczos kernel with radius 1. Aliases but does not ring.
|
||||
Lanczos1Kernel,
|
||||
|
||||
/**
|
||||
* Lanczos kernel with radius 3. High-quality practical filter but may have
|
||||
* some ringing especially on synthetic images.
|
||||
*/
|
||||
Lanczos3Kernel,
|
||||
|
||||
/**
|
||||
* Lanczos kernel with radius 5. Very-high-quality filter but may have
|
||||
* stronger ringing.
|
||||
*/
|
||||
Lanczos5Kernel,
|
||||
|
||||
// Gaussian kernel with radius 3, sigma = 1.5 / 3. Less commonly used.
|
||||
GaussianKernel,
|
||||
|
||||
/**
|
||||
* Rectangle function. Equivalent to "nearest" sampling when upscaling.
|
||||
* Has value 1 in interval (-0.5, 0.5), value 0.5 on edge, and 0 elsewhere.
|
||||
*/
|
||||
BoxKernel,
|
||||
|
||||
/**
|
||||
* Hat/tent function with radius 1. Equivalent to "bilinear" reconstruction
|
||||
* when upsampling.
|
||||
* Has value zero at -1.0 and 1.0.
|
||||
*/
|
||||
TriangleKernel,
|
||||
|
||||
/**
|
||||
* Cubic interpolant of Keys. Equivalent to Catmull-Rom kernel. Reasonably
|
||||
* good quality and faster than Lanczos3Kernel.
|
||||
*/
|
||||
KeysCubicKernel,
|
||||
|
||||
/**
|
||||
* Cubic non-interpolating scheme. For synthetic images (especially those
|
||||
* lacking proper prefiltering), less ringing than Keys cubic kernel but less
|
||||
* sharp.
|
||||
*/
|
||||
MitchellCubicKernel,
|
||||
|
||||
// Always insert new kernel types before this.
|
||||
SamplingKernelTypeEnd
|
||||
};
|
||||
|
||||
/**
|
||||
* Converts a string into the corresponding kernel type.
|
||||
* Returns SamplingKernelTypeEnd if the string couldn't be converted.
|
||||
*/
|
||||
SamplingKernelType SamplingKernelTypeFromString(std::string str);
|
||||
|
||||
// A function object for a Lanczos kernel.
|
||||
struct LanczosKernelFunc {
|
||||
// Pass 1 for Lanczos1 kernel, 3 for Lanczos3 etc.
|
||||
explicit LanczosKernelFunc(float _radius) : radius(_radius) {}
|
||||
float operator()(float x) const {
|
||||
constexpr float kPI = 3.14159265359;
|
||||
x = std::abs(x);
|
||||
if (x > radius) {
|
||||
return 0.0;
|
||||
}
|
||||
// Need to special case the limit case of sin(x) / x when x is zero.
|
||||
if (x <= 1e-3) {
|
||||
return 1.0;
|
||||
}
|
||||
return radius * std::sin(kPI * x) * std::sin(kPI * x / radius) / (kPI * kPI * x * x);
|
||||
}
|
||||
float Radius() const { return radius; }
|
||||
const float radius;
|
||||
};
|
||||
|
||||
struct GaussianKernelFunc {
|
||||
static constexpr float kRadiusMultiplier = 3.0f;
|
||||
/**
|
||||
* https://en.wikipedia.org/wiki/Gaussian_function
|
||||
* We use sigma = 0.5, as suggested on p. 4 of Ken Turkowski's "Filters
|
||||
* for Common Resampling Tasks" for kernels with a support of 3 pixels:
|
||||
* www.realitypixels.com/turk/computergraphics/ResamplingFilters.pdf
|
||||
* This implies a radius of 1.5,
|
||||
*/
|
||||
explicit GaussianKernelFunc(float _radius = 1.5f) : radius(_radius), sigma(_radius / kRadiusMultiplier) {}
|
||||
float operator()(float x) const {
|
||||
x = std::abs(x);
|
||||
if (x >= radius) {
|
||||
return 0.0;
|
||||
}
|
||||
return std::exp(-x * x / (2.0 * sigma * sigma));
|
||||
}
|
||||
float Radius() const { return radius; }
|
||||
const float radius;
|
||||
// Gaussian standard deviation
|
||||
const float sigma;
|
||||
};
|
||||
|
||||
struct BoxKernelFunc {
|
||||
float operator()(float x) const {
|
||||
x = std::abs(x);
|
||||
return x < 0.5f ? 1.0f : x == 0.5f ? 0.5f : 0.0f;
|
||||
}
|
||||
float Radius() const { return 1.f; }
|
||||
};
|
||||
|
||||
struct TriangleKernelFunc {
|
||||
// https://en.wikipedia.org/wiki/Triangle_function
|
||||
float operator()(float x) const {
|
||||
x = std::abs(x);
|
||||
return x < 1.0f ? 1.0f - x : 0.0f;
|
||||
}
|
||||
float Radius() const { return 1.f; }
|
||||
};
|
||||
|
||||
struct KeysCubicKernelFunc {
|
||||
/**
|
||||
* http://ieeexplore.ieee.org/document/1163711/
|
||||
* R. G. Keys. Cubic convolution interpolation for digital image
|
||||
* processing. IEEE Transactions on Acoustics, Speech, and Signal
|
||||
* Processing, 29(6):1153–1160, 1981.
|
||||
*/
|
||||
float operator()(float x) const {
|
||||
x = std::abs(x);
|
||||
if (x >= 2.0f) {
|
||||
return 0.0f;
|
||||
} else if (x >= 1.0f) {
|
||||
return ((-0.5f * x + 2.5f) * x - 4.0f) * x + 2.0f;
|
||||
} else {
|
||||
return ((1.5f * x - 2.5f) * x) * x + 1.0f;
|
||||
}
|
||||
}
|
||||
float Radius() const { return 2.f; }
|
||||
};
|
||||
|
||||
struct MitchellCubicKernelFunc {
|
||||
/**
|
||||
* https://doi.org/10.1145/378456.378514
|
||||
* D. P. Mitchell and A. N. Netravali. Reconstruction filters in computer
|
||||
* graphics. Computer Graphics (Proceedings of ACM SIGGRAPH 1988),
|
||||
* 22(4):221–228, 1988.
|
||||
*/
|
||||
float operator()(float x) const {
|
||||
x = std::abs(x);
|
||||
if (x >= 2.0f) {
|
||||
return 0.0f;
|
||||
} else if (x >= 1.0f) {
|
||||
return (((-7.0f / 18.0f) * x + 2.0f) * x - 10.0f / 3.0f) * x + 16.0f / 9.0f;
|
||||
} else {
|
||||
return (((7.0f / 6.0f) * x - 2.0f) * x) * x + 8.0f / 9.0f;
|
||||
}
|
||||
}
|
||||
float Radius() const { return 2.f; }
|
||||
};
|
||||
|
||||
inline LanczosKernelFunc CreateLanczos1Kernel() { return LanczosKernelFunc(1.0); }
|
||||
|
||||
inline LanczosKernelFunc CreateLanczos3Kernel() { return LanczosKernelFunc(3.0); }
|
||||
|
||||
inline LanczosKernelFunc CreateLanczos5Kernel() { return LanczosKernelFunc(5.0); }
|
||||
|
||||
inline GaussianKernelFunc CreateGaussianKernel() { return GaussianKernelFunc(1.5); }
|
||||
|
||||
inline BoxKernelFunc CreateBoxKernel() { return BoxKernelFunc(); }
|
||||
|
||||
inline TriangleKernelFunc CreateTriangleKernel() { return TriangleKernelFunc(); }
|
||||
|
||||
inline KeysCubicKernelFunc CreateKeysCubicKernel() { return KeysCubicKernelFunc(); }
|
||||
|
||||
inline MitchellCubicKernelFunc CreateMitchellCubicKernel() { return MitchellCubicKernelFunc(); }
|
||||
|
||||
} // namespace aicpu
|
||||
|
||||
#endif  // AICPU_UTILS_SAMPLING_KERNELS_H_
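For context, a small sketch of how these functors are typically used to build resampling weights (illustrative helper, not code from this commit): evaluate the kernel at each contributing input position's distance from the sample center, then normalize so the weights sum to one.

#include <cmath>
#include <vector>

// Computes normalized Lanczos3 weights for one output sample centered at
// `center` (in input-pixel coordinates). Illustrative only.
std::vector<float> Lanczos3Weights(float center) {
  aicpu::LanczosKernelFunc kernel = aicpu::CreateLanczos3Kernel();
  const int lo = static_cast<int>(std::ceil(center - kernel.Radius()));
  const int hi = static_cast<int>(std::floor(center + kernel.Radius()));
  std::vector<float> weights;
  float total = 0.0f;
  for (int x = lo; x <= hi; ++x) {
    const float w = kernel(static_cast<float>(x) - center);
    weights.push_back(w);
    total += w;
  }
  if (total != 0.0f) {
    for (float &w : weights) {
      w /= total;  // weights now sum to 1
    }
  }
  return weights;
}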
|
|
@@ -81,8 +81,49 @@ const AnfNodePtr AICpuLibSelectPass::Process(const FuncGraphPtr &graph, const An
|
|||
mindspore::kQuantileOpName,
|
||||
mindspore::kSparseSegmentSqrtNOpName,
|
||||
mindspore::kUnsortedSegmentProdOpName,
|
||||
mindspore::kMulOpName,
|
||||
mindspore::kExpOpName};
|
||||
mindspore::kExpOpName,
|
||||
mindspore::kMatrixTriangularSolveOpName,
|
||||
mindspore::kMaximumGradGradOpName,
|
||||
mindspore::kMaxPoolOpName,
|
||||
mindspore::kMinimumGradGradOpName,
|
||||
mindspore::kMulNoNanOpName,
|
||||
mindspore::kMultilabelMarginLossGradOpName,
|
||||
mindspore::kNthElementOpName,
|
||||
mindspore::kNonMaxSuppressionWithOverlapsOpName,
|
||||
mindspore::kOneHotOpName,
|
||||
mindspore::kOrgqrOpName,
|
||||
mindspore::kPackOpName,
|
||||
mindspore::kParameterizedTruncatedNormalOpName,
|
||||
mindspore::kPolarOpName,
|
||||
mindspore::kPdistGradOpName,
|
||||
mindspore::kRaggedRangeOpName,
|
||||
mindspore::kRaggedTensorToSparseOpName,
|
||||
mindspore::kRaggedTensorToTensorOpName,
|
||||
mindspore::kReciprocalOpName,
|
||||
mindspore::kReciprocalGradOpName,
|
||||
mindspore::kReduceMeanOpName,
|
||||
mindspore::kReduceProdOpName,
|
||||
mindspore::kReluOpName,
|
||||
mindspore::kReverseV2OpName,
|
||||
mindspore::kRGBToHSVOpName,
|
||||
mindspore::kRsqrtGradOpName,
|
||||
mindspore::kSampleDistortedBoundingBoxExt2OpName,
|
||||
mindspore::kScaleAndTranslateGradOpName,
|
||||
mindspore::kScatterNdOpName,
|
||||
mindspore::kScatterNdUpdateOpName,
|
||||
mindspore::kSelectOpName,
|
||||
mindspore::kSelfAdjointEigOpName,
|
||||
mindspore::kSinOpName,
|
||||
mindspore::kSincOpName,
|
||||
mindspore::kSinhOpName,
|
||||
mindspore::kSmoothL1LossGradV2OpName,
|
||||
mindspore::kSmoothL1LossV2OpName,
|
||||
mindspore::kSignOpName,
|
||||
mindspore::kCheckNumericsOpName,
|
||||
mindspore::kFloorDivOpName,
|
||||
mindspore::kLog1pOpName,
|
||||
mindspore::kMulOpName};
|
||||
|
||||
static const std::string kEnvOpSoNames = "mindspore_aicpu_kernels";
|
||||
static const std::string kCpuKernelSoName = "mindspore_cpu_kernels";
|
||||
|
||||
|
|
|
@@ -185,3 +185,38 @@ from .qr import _qr_aicpu
|
|||
from .col2im import _col2im_aicpu
|
||||
from .matrix_solve_ls import _matrix_solve_ls_aicpu
|
||||
from .exp import _exp_aicpu
|
||||
from .matrix_triangular_solve import _matrix_triangular_solve_aicpu
|
||||
from .maximum_grad_grad import _maximum_grad_grad_aicpu
|
||||
from .maxpool_v1 import _maxpool_v1_aicpu
|
||||
from .minimum_grad_grad import _minimum_grad_grad_aicpu
|
||||
from .mul_no_nan import _mul_no_nan_aicpu
|
||||
from .multilabel_margin_loss_grad import _multilabel_margin_loss_grad_aicpu
|
||||
from .nth_element import _nth_element_aicpu
|
||||
from .non_max_suppression_with_overlaps import _non_max_suppression_with_overlaps_aicpu
|
||||
from .one_hot import _one_hot_aicpu
|
||||
from .orgqr import _orgqr_aicpu
|
||||
from .parameterized_truncated_normal import _parameterized_truncated_normal_aicpu
|
||||
from .polar import _polar_aicpu
|
||||
from .pdist_grad import _pdist_grad_aicpu
|
||||
from .ragged_range import _raggedrange_aicpu
|
||||
from .ragged_tensor_to_sparse import _ragged_tensor_to_sparse_aicpu
|
||||
from .ragged_tensor_to_tensor import _ragged_tensor_to_tensor_aicpu
|
||||
from .reciprocal import _reciprocal_aicpu
|
||||
from .reciprocal_grad import _reciprocal_grad_aicpu
|
||||
from .reduce_mean import _reduce_mean_aicpu
|
||||
from .reduce_prod import _reduce_prod_aicpu
|
||||
from .relu_v3 import _relu_v3_aicpu
|
||||
from .reversev2 import _reversev2_aicpu
|
||||
from .rgb_to_hsv import _rgb_to_hsv_aicpu
|
||||
from .rsqrt_grad import _rsqrt_grad_aicpu
|
||||
from .sample_distorted_bounding_box_v2 import _sample_distorted_bounding_box_v2_aicpu
|
||||
from .scale_and_translate_grad import _scale_and_translate_grad_aicpu
|
||||
from .scatter_nd import _scatter_nd_aicpu
|
||||
from .scatter_nd_update import _scatter_nd_update_aicpu
|
||||
from .select import _select_aicpu
|
||||
from .self_adjoint_eig import _self_adjoint_eig_aicpu
|
||||
from .sin import _sin_aicpu
|
||||
from .sinc import _sinc_aicpu
|
||||
from .sinh import _sinh_aicpu
|
||||
from .smooth_l1_loss_grad import _smooth_l1_loss_grad_aicpu
|
||||
from .smooth_l1_loss import _smooth_l1_loss_aicpu
|
||||
|
|