aicpu migration from sjx del first 9 ops

This commit is contained in:
lilinjie 2023-01-18 19:07:06 +08:00
parent 08aa1515d3
commit c137e34989
65 changed files with 7276 additions and 1 deletion

View File

@@ -96,6 +96,8 @@
"mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/" "truncLongCastAssignment"
"mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/" "knownConditionTrueFalse"
"mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/" "passedByValue"
"mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/" "uninitMemberVar"
"mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/" "unsignedPositive"
"mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/" "uninitvar"
"mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/" "shadowVariable"
"mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/" "unsignedPositive"

View File

@@ -129,6 +129,7 @@
"mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/" "legal/copyright"
"mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/" "readability/inheritance"
"mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/" "runtime/int"
"mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/" "whitespace/empty_if_body"
"mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/" "whitespace/newline"
"mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/" "whitespace/operators"
"mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/" "whitespace/comma"

View File

@@ -282,6 +282,14 @@ mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel
mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/multinomial.cc:aicpu::Generate
mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/matrix_set_diag_v3.cc:aicpu::MatrixSetDiagV3CpuKernel::DoCompute
mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/max_unpool_2d.cc:aicpu::MaxUnpool2DCpuKernel::MaxUnpool2DCompute
mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/sparseaddmm.cc:aicpu::SparseAddmmCpuKernel::SparseAddmmCompute
mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/triplet_margin_loss.cc:aicpu::TripletMarginLossCpuKernel::Compute
mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/triplet_margin_loss.cc:aicpu::TripletMarginLossCpuKernel::TripletMarginLossComputeRealType
mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/triplet_margin_loss.cc:aicpu::TripletMarginLossCpuKernel::TripletMarginLossComputeComplexType
mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/triplet_margin_loss.cc:aicpu::TripletMarginLossCpuKernel::TripletMarginLossComputeRealTypeFloat16
mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/sparse_matrix_transpose.cc:aicpu::SparseMatrixTransposeCpuKernel::Compute
mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/sparse_reshape.cc:aicpu::SparseReshapeCpuKernel::Compute
mindspore/mindspore/ccsrc/plugin/device/ascend/optimizer/mindir/aicpu_lib_select.cc:mindspore::opt::AICpuLibSelectPass::Process
mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/matrix_solve_ls.cc:aicpu::MatrixSolveLsCpuKernel::Compute
mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/tensor_scatter_update.cc:aicpu::TensorScatterUpdateCpuKernel::Compute
mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/scatter_nd.cc:aicpu::ScatterNdCpuKernel::Compute

View File

@@ -35,6 +35,7 @@
namespace mindspore {
// op name. Ops which do not exist in operator/ops.h, so define their names here
constexpr auto kSparseApplyCenteredRMSPropOpName = "SparseApplyCenteredRMSProp";
constexpr auto kSparseApplyMomentumOpName = "SparseApplyMomentum";
constexpr auto kAbsOpName = "Abs";
constexpr auto kAccumulateNV2OpName = "AccumulateNV2";
constexpr auto kAdamApplyOneAssignOpName = "AdamApplyOneAssign";
@@ -49,7 +50,17 @@ constexpr auto kAdaptiveAvgPool2dOpName = "AdaptiveAvgPool2d";
constexpr auto kAdaptiveAvgPool2dGradOpName = "AdaptiveAvgPool2dGrad";
constexpr auto kAdaptiveMaxPool3DGradOpName = "AdaptiveMaxPool3DGrad";
constexpr auto kAddNOpName = "AddN";
constexpr auto kAddV2OpName = "AddV2";
constexpr auto kAddOpName = "Add";
constexpr auto kAdaptiveAvgPool3DOpName = "AdaptiveAvgPool3D";
constexpr auto kAdaptiveMaxPool3DOpName = "AdaptiveMaxPool3D";
constexpr auto kAdaptiveAvgPool3DGradOpName = "AdaptiveAvgPool3DGrad";
constexpr auto kAdaptiveMaxPool2DGradOpName = "AdaptiveMaxPool2DGrad";
constexpr auto kAdjustContrastv2OpName = "AdjustContrastv2";
constexpr auto kAdjustHueOpName = "AdjustHue";
constexpr auto kAdjustSaturationOpName = "AdjustSaturation";
constexpr auto kAngleOpName = "Angle";
constexpr auto kAffineGridGradOpName = "AffineGridGrad";
constexpr auto kApplyAdadeltaDOpName = "ApplyAdadeltaD";
constexpr auto kApplyAdadeltaOpName = "ApplyAdadelta";
constexpr auto kApplyAdagradDADOpName = "ApplyAdagradDAD";
@@ -92,7 +103,10 @@ constexpr auto kArgMinDOpName = "ArgMinD";
constexpr auto kArgminOpName = "Argmin";
constexpr auto kArgMinOpName = "ArgMin";
constexpr auto kArgminV2OpName = "ArgminV2";
constexpr auto kArgMinWithValueOpName = "ArgMinWithValue";
constexpr auto kArgMaxWithValueOpName = "ArgMaxWithValue";
constexpr auto KAsinGradOpName = "AsinGrad";
constexpr auto KAsinhGradOpName = "AsinhGrad";
constexpr auto kAssignAddOpName = "AssignAdd";
constexpr auto kAssignOpName = "Assign";
constexpr auto kAssignSubOpName = "AssignSub";
@@ -103,6 +117,8 @@ constexpr auto kAvgPool3DOpName = "AvgPool3D";
constexpr auto kACosOpName = "ACos";
constexpr auto kACosGradOpName = "ACosGrad";
constexpr auto kAcosGradOpName = "AcosGrad";
constexpr auto kACoshOpName = "ACosh";
constexpr auto kAcoshGradOpName = "ACoshGrad";
constexpr auto kAvgPool3DDOpName = "AvgPool3DD";
constexpr auto kAvgPoolGradOpName = "AvgPoolGrad";
constexpr auto kAvgPoolGradDOpName = "AvgPoolGradD";
@@ -113,10 +129,12 @@ constexpr auto kBasicLSTMCellCStateGradV2OpName = "BasicLSTMCellCStateGradV2";
constexpr auto kBasicLSTMCellInputGradOpName = "BasicLSTMCellInputGrad";
constexpr auto kBasicLSTMCellOpName = "BasicLSTMCell";
constexpr auto kBasicLSTMCellWeightGradOpName = "BasicLSTMCellWeightGrad";
constexpr auto kBartlettWindowOpName = "BartlettWindow";
constexpr auto kBatchMatMulOpName = "BatchMatMul";
constexpr auto kBatchMatMulV2OpName = "BatchMatMulV2";
constexpr auto kBatchNormOpName = "BatchNorm";
constexpr auto kBatchNormGradOpName = "BatchNormGrad";
constexpr auto kBatchNormGradGradOpName = "BatchNormGradGrad";
constexpr auto kBatchNormGradWithActivation = "BatchNormGradWithActivation";
constexpr auto kBatchNormGradWithAddAndActivation = "BatchNormGradWithAddAndActivation";
constexpr auto kBatchNormWithActivation = "BatchNormWithActivation";
@@ -130,7 +148,9 @@ constexpr auto kBiasAddOpName = "BiasAdd";
constexpr auto kBiasAddGradOpName = "BiasAddGrad";
constexpr auto kIndexAddOpName = "IndexAdd";
constexpr auto kBitwiseOrOpName = "BitwiseOr";
constexpr auto kBincountOpName = "Bincount";
constexpr auto kBCEWithLogitsLossOpName = "BCEWithLogitsLoss";
constexpr auto kBlackmanWindowOpName = "BlackmanWindow";
constexpr auto kBN2AddReluOpName = "BN2AddRelu";
constexpr auto kBN2OpName = "BN2";
constexpr auto kBN2ReLUOpName = "BN2Relu";
@@ -214,6 +234,13 @@ constexpr auto kCSRMVOpName = "CSRMV";
constexpr auto kCSRReduceSumOpName = "CSRReduceSum";
constexpr auto kCSRSparseMatrixToDenseOpName = "CSRSparseMatrixToDense";
constexpr auto kCSRSparseMatrixToSparseTensorOpName = "CSRSparseMatrixToSparseTensor";
constexpr auto kSparseMatrixMatMulOpName = "SparseMatrixMatMul";
constexpr auto kSparseMatrixNNZOpName = "SparseMatrixNNZ";
constexpr auto kSparseMatrixTransposeOpName = "SparseMatrixTranspose";
constexpr auto kSparseReshapeOpName = "SparseReshape";
constexpr auto kSparseSegmentSqrtNGradOpName = "SparseSegmentSqrtNGrad";
constexpr auto kSparseSegmentSumWithNumSegmentsOpName = "SparseSegmentSumWithNumSegments";
constexpr auto kSparseSegmentSqrtNWithNumSegmentsOpName = "SparseSegmentSqrtNWithNumSegments";
constexpr auto kCTCGreedyDecoderOpName = "CTCGreedyDecoder";
constexpr auto kCumprodOpName = "Cumprod";
constexpr auto kCumprodDOpName = "CumprodD";
@@ -610,6 +637,7 @@ constexpr auto kRpcSendOpName = "RpcSend";
constexpr auto kRpnProposalsOpName = "RpnProposals";
constexpr auto kRpnProposalsDOpName = "RpnProposalsD";
constexpr auto kRsqrtGradOpName = "RsqrtGrad";
constexpr auto kSqrtGradOpName = "SqrtGrad";
constexpr auto kRsqrtOpName = "Rsqrt";
constexpr auto kSampleDistortedBoundingBoxExt2OpName = "SampleDistortedBoundingBoxExt2";
constexpr auto kScaleAndTranslateOpName = "ScaleAndTranslate";
@@ -659,9 +687,11 @@ constexpr auto kSpaceToBatchNDDOpName = "SpaceToBatchNDD";
constexpr auto kSpaceToDepthOpName = "SpaceToDepth";
constexpr auto kSparseApplyAdadeltaOpName = "SparseApplyAdadelta";
constexpr auto kSparseFillEmptyRows = "SparseFillEmptyRows";
constexpr auto kSparseFillEmptyRowsGradOpName = "SparseFillEmptyRowsGrad";
constexpr auto kSparseApplyAdadeltaDOpName = "SparseApplyAdadeltaD";
constexpr auto kSparseApplyAdagradOpName = "SparseApplyAdagrad";
constexpr auto kSparseApplyAdagradDOpName = "SparseApplyAdagradD";
constexpr auto kSparseApplyAdagradDAOpName = "SparseApplyAdagradDA";
constexpr auto kSparseApplyAdagradV2OpName = "SparseApplyAdagradV2";
constexpr auto kSparseApplyAdagradV2DOpName = "SparseApplyAdagradV2D";
constexpr auto kSparseApplyFtrlOpName = "SparseApplyFtrl";
@@ -670,9 +700,15 @@ constexpr auto kSparseApplyFtrlV2OpName = "SparseApplyFtrlV2";
constexpr auto kSparseApplyFtrlV2DOpName = "SparseApplyFtrlV2D";
constexpr auto kSparseApplyProximalAdagradDOpName = "SparseApplyProximalAdagradD";
constexpr auto kSparseApplyProximalAdagradOpName = "SparseApplyProximalAdagrad";
constexpr auto kSparseApplyProximalGradientDescentOpName = "SparseApplyProximalGradientDescent";
constexpr auto kSparseApplyRMSPropOpName = "SparseApplyRMSProp";
constexpr auto kSparseApplyRMSPropDOpName = "SparseApplyRMSPropD";
constexpr auto kSparseAddmmOpName = "SparseAddmm";
constexpr auto kSparseCrossOpName = "SparseCross";
constexpr auto kSparseDenseCwiseMulOpName = "SparseDenseCwiseMul";
constexpr auto kSparseDenseCwiseDivOpName = "SparseDenseCwiseDiv";
constexpr auto kSparseDenseCwiseAddOpName = "SparseDenseCwiseAdd";
constexpr auto kSparseConcatOpName = "SparseConcat";
constexpr auto kSparseGatherV2OpName = "SparseGatherV2";
constexpr auto kSparseSliceOpName = "SparseSlice";
constexpr auto kSparseSoftmaxCrossEntropyWithLogitsOpName = "SparseSoftmaxCrossEntropyWithLogits";
@@ -711,6 +747,7 @@ constexpr auto kSubAndFilterOpName = "SubAndFilter";
constexpr auto kSubOpName = "Sub";
constexpr auto kSubscalarOpName = "Subscalar";
constexpr auto kSwitchOpName = "Switch";
constexpr auto kTanhOpName = "Tanh";
constexpr auto kTensorAddOpName = "Add";
constexpr auto kTensorCopySlicesOpName = "TensorCopySlices";
constexpr auto kTensorMoveOpName = "TensorMove";
@@ -725,6 +762,10 @@ constexpr auto kTransposeDOpName = "TransposeD";
constexpr auto kTruncatedNormal = "TruncatedNormal";
constexpr auto kTruncateDivOpName = "TruncateDiv";
constexpr auto kTruncOpName = "Trunc";
constexpr auto kTridiagonalMatMulOpName = "TridiagonalMatMul";
constexpr auto kTrilIndicesOpName = "TrilIndices";
constexpr auto kTriuIndicesOpName = "TriuIndices";
constexpr auto kTripletMarginLossOpName = "TripletMarginLoss";
constexpr auto kUniformCandidateSamplerOpName = "UniformCandidateSampler";
constexpr auto kLogUniformCandidateSamplerOpName = "LogUniformCandidateSampler";
constexpr auto kUniformIntOpName = "UniformInt";
@@ -743,8 +784,12 @@ constexpr auto kUnsortedSegmentProdOpName = "UnsortedSegmentProd";
constexpr auto kUnsortedSegmentProdDOpName = "UnsortedSegmentProdD";
constexpr auto kUnsortedSegmentSumOpName = "UnsortedSegmentSum";
constexpr auto kUnsortedSegmentSumDOpName = "UnsortedSegmentSumD";
constexpr auto kUnravelIndexOpName = "UnravelIndex";
constexpr auto kUpdateCacheOpName = "UpdateCache";
constexpr auto kUpdateStateOpName = "UpdateState";
constexpr auto kUpperBoundOpName = "UpperBound";
constexpr auto kXlogyOpName = "Xlogy";
constexpr auto kXdivyOpName = "Xdivy";
constexpr auto kDynamicBroadcastToOpName = "DynamicBroadcastTo";
constexpr auto kCheckValidOpName = "CheckValid";
constexpr auto kSoftmaxGradFusionOpName = "SoftmaxGradFusion";

View File

@@ -0,0 +1,283 @@
/**
* Copyright (c) Huawei Technologies Co., Ltd. 2022-2022. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "sparse_matrix_mat_mul.h"
#include <securec.h>
#include <complex>
#include <numeric>
#include <string>
#include <vector>
#include "cpu_kernel_utils.h"
#include "cpu_types.h"
#include "kernel_log.h"
#include "status.h"
#include "utils/allocator_utils.h"
#include "utils/kernel_util.h"
using namespace std;
namespace aicpu {
const char *SparseMatrixMatMul = "SparseMatrixMatMul";
const int INPUT_PARAMS_NUM = 6;
const int OUTPUT_PARAMS_NUM = 1;
} // namespace aicpu
namespace aicpu {
uint32_t SparseMatrixMatMulCpuKernel::Compute(CpuKernelContext &ctx) {
if (ValidParam(ctx) != KERNEL_STATUS_OK) {
KERNEL_LOG_ERROR("valid sparse matrix mat mul param error.");
return KERNEL_STATUS_PARAM_INVALID;
}
DataType indice_type = ctx.Input(0)->GetDataType();
DataType value_type = ctx.Input(4)->GetDataType();
uint32_t status;
switch (indice_type) {
case DT_INT32:
switch (value_type) {
case DT_FLOAT:
status = DoCompute<int32_t, float_t>(ctx);
break;
case DT_DOUBLE:
status = DoCompute<int32_t, double_t>(ctx);
break;
case DT_COMPLEX64:
status = DoCompute<int32_t, complex<float_t> >(ctx);
break;
case DT_COMPLEX128:
status = DoCompute<int32_t, complex<double_t> >(ctx);
break;
default:
KERNEL_LOG_ERROR("data type of dense shape is not int32 or int64");
return KERNEL_STATUS_PARAM_INVALID;
}
break;
case DT_INT64:
switch (value_type) {
case DT_FLOAT:
status = DoCompute<int64_t, float_t>(ctx);
break;
case DT_DOUBLE:
status = DoCompute<int64_t, double_t>(ctx);
break;
case DT_COMPLEX64:
status = DoCompute<int64_t, complex<float_t> >(ctx);
break;
case DT_COMPLEX128:
status = DoCompute<int64_t, complex<double_t> >(ctx);
break;
default:
KERNEL_LOG_ERROR("data type of dense shape is not int32 or int64");
return KERNEL_STATUS_PARAM_INVALID;
}
break;
default:
KERNEL_LOG_ERROR("data type of dense shape is not int32 or int64");
return KERNEL_STATUS_PARAM_INVALID;
}
if (status != KERNEL_STATUS_OK) {
KERNEL_LOG_ERROR("error in do the actual compute!");
return KERNEL_STATUS_PARAM_INVALID;
}
return KERNEL_STATUS_OK;
}
template <typename indiceT, typename valueT>
Eigen::Ref<const Eigen::SparseMatrix<valueT, Eigen::RowMajor, indiceT> >
SparseMatrixMatMulCpuKernel::CreateEigenSparseMatrix(indiceT rows, indiceT cols, int64_t nnz, indiceT *row_pointers,
indiceT *col_indices, valueT *values, bool transpose,
bool adjoint) {
Eigen::Map<const Eigen::SparseMatrix<valueT, Eigen::RowMajor, indiceT> > sparse_matrix(rows, cols, nnz, row_pointers,
col_indices, values);
// The transpose/adjoint expressions are not actually evaluated until
// necessary. Hence we don't create copies or modify the input matrix
// inplace.
if (transpose) {
return sparse_matrix.transpose();
}
if (adjoint) {
return sparse_matrix.adjoint();
}
return sparse_matrix;
}
uint32_t SparseMatrixMatMulCpuKernel::ValidParam(CpuKernelContext &ctx) {
KERNEL_LOG_DEBUG("Start to execute ValidParam.");
// valid input and output nullptr
if (NormalCheck(ctx, INPUT_PARAMS_NUM, OUTPUT_PARAMS_NUM) != KERNEL_STATUS_OK) {
return KERNEL_STATUS_PARAM_INVALID;
}
// check if the two matrices can be multiplied
DataType dt = ctx.Input(0)->GetDataType(); // dense shape x1
uint32_t checkStatus;
switch (dt) {
case DT_INT32:
checkStatus = CheckMatMul<int32_t>(ctx);
break;
case DT_INT64:
checkStatus = CheckMatMul<int64_t>(ctx);
break;
default:
KERNEL_LOG_ERROR("data type of dense shape is not int32 or int64");
return KERNEL_STATUS_PARAM_INVALID;
}
if (checkStatus != KERNEL_STATUS_OK) {
KERNEL_LOG_ERROR("the two input matrixs cannot mul cause their dim!");
return KERNEL_STATUS_PARAM_INVALID;
}
return KERNEL_STATUS_OK;
}
template <typename T>
uint32_t SparseMatrixMatMulCpuKernel::CheckMatMul(CpuKernelContext &ctx) {
KERNEL_LOG_DEBUG("check if the matrix can mul");
const int rank = ctx.Input(0)->GetTensorShape()->GetDimSize(0);
const int row_dim = (rank == 2) ? 0 : 1;
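// dense_shape of x1 holds two entries for a single matrix or three for a batched matrix,
// so row_dim skips the leading batch dimension when it is present.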
Tensor *dense_shape_x1 = ctx.Input(0);
T *shape_x1 = static_cast<T *>(dense_shape_x1->GetData());
std::vector<int64_t> shape_x2 = ctx.Input(5)->GetTensorShape()->GetDimSizes();
bool transpose_a = false;
bool transpose_b = false;
bool adjoint_a = false;
bool adjoint_b = false;
if (ctx.GetAttr("transpose_x1") != nullptr) {
transpose_a = ctx.GetAttr("transpose_x1")->GetBool();
}
if (ctx.GetAttr("transpose_x2") != nullptr) {
transpose_b = ctx.GetAttr("transpose_x2")->GetBool();
}
if (ctx.GetAttr("adjoint_x1") != nullptr) {
adjoint_a = ctx.GetAttr("adjoint_x1")->GetBool();
}
if (ctx.GetAttr("adjoint_x2") != nullptr) {
adjoint_b = ctx.GetAttr("adjoint_x2")->GetBool();
}
T x1_col = (transpose_a || adjoint_a) ? shape_x1[row_dim] : shape_x1[row_dim + 1];
T x2_row = (transpose_b || adjoint_b) ? shape_x2[row_dim + 1] : shape_x2[row_dim];
if (x1_col != x2_row) {
KERNEL_LOG_ERROR("x1's col is no equal x2's row, cannot do mat mul!");
return KERNEL_STATUS_PARAM_INVALID;
}
return KERNEL_STATUS_OK;
}
template <typename indiceT, typename valueT>
uint32_t SparseMatrixMatMulCpuKernel::DoCompute(CpuKernelContext &ctx) {
using Matrix = Eigen::Matrix<valueT, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>;
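// x1 arrives in batched CSR form: dense_shape (Input 0), batch_pointers (Input 1), row_pointers (Input 2),
// col_indices (Input 3) and values (Input 4); x2 (Input 5) is an ordinary dense matrix.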
indiceT batch_size = ctx.Input(1)->NumElements() - 1;
std::vector<Matrix> results(batch_size);
int shift = (ctx.Input(0)->NumElements() == 2) ? 0 : 1;
indiceT row_x1 = *(static_cast<indiceT *>(ctx.Input(0)->GetData()) + shift);
indiceT col_x1 = *(static_cast<indiceT *>(ctx.Input(0)->GetData()) + shift + 1);
indiceT *batch_pointers_x1 = static_cast<indiceT *>(ctx.Input(1)->GetData());
indiceT *row_pointers_x1 = static_cast<indiceT *>(ctx.Input(2)->GetData());
indiceT *col_indices_x1 = static_cast<indiceT *>(ctx.Input(3)->GetData());
valueT *value_x1 = static_cast<valueT *>(ctx.Input(4)->GetData());
std::vector<int64_t> shape_x2 = ctx.Input(5)->GetTensorShape()->GetDimSizes();
const int rank = ctx.Input(0)->GetTensorShape()->GetDimSize(0);
const int row_dim = (rank == 2) ? 0 : 1;
indiceT row_x2 = shape_x2[row_dim];
indiceT col_x2 = shape_x2[row_dim + 1];
valueT *value_x2 = static_cast<valueT *>(ctx.Input(5)->GetData());
bool transpose_a = false;
bool transpose_b = false;
bool adjoint_a = false;
bool adjoint_b = false;
bool transpose_output = false;
bool conjugate_output = false;
if (ctx.GetAttr("transpose_x1") != nullptr) {
transpose_a = ctx.GetAttr("transpose_x1")->GetBool();
}
if (ctx.GetAttr("transpose_x2") != nullptr) {
transpose_b = ctx.GetAttr("transpose_x2")->GetBool();
}
if (ctx.GetAttr("adjoint_x1") != nullptr) {
adjoint_a = ctx.GetAttr("adjoint_x1")->GetBool();
}
if (ctx.GetAttr("adjoint_x2") != nullptr) {
adjoint_b = ctx.GetAttr("adjoint_x2")->GetBool();
}
if (ctx.GetAttr("transpose_output") != nullptr) {
transpose_output = ctx.GetAttr("transpose_output")->GetBool();
}
if (ctx.GetAttr("conjugate_output") != nullptr) {
conjugate_output = ctx.GetAttr("conjugate_output")->GetBool();
}
uint32_t min_core_num = 1;
uint32_t max_core_num = std::max(min_core_num, aicpu::CpuKernelUtils::GetCPUNum(ctx) - 2);
max_core_num = std::min(max_core_num, (uint32_t)batch_size);
if (max_core_num == 0) {
KERNEL_LOG_ERROR("max core num cannot be zero");
return KERNEL_STATUS_PARAM_INVALID;
}
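// Process the batches in parallel: each batch multiplies its CSR block of x1 with the matching dense slice of x2.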
KERNEL_HANDLE_ERROR(
CpuKernelUtils::ParallelFor(ctx, batch_size, batch_size / max_core_num,
[&](int64_t start, int64_t end) {
for (int64_t i = start; i < end; i++) {
int64_t nnz_x1 = batch_pointers_x1[i + 1] - batch_pointers_x1[i];
indiceT *row_pointers_x1_batch_i = row_pointers_x1 + (row_x1 + 1) * i;
indiceT *col_indices_x1_batch_i = col_indices_x1 + batch_pointers_x1[i];
valueT *value_x1_batch_i = value_x1 + batch_pointers_x1[i];
auto x1_sparse_matrix = CreateEigenSparseMatrix<indiceT, valueT>(
row_x1, col_x1, nnz_x1, row_pointers_x1_batch_i, col_indices_x1_batch_i,
value_x1_batch_i, transpose_a, adjoint_a);
Eigen::Map<Matrix> x2_dense_matrix(value_x2 + col_x2 * row_x2 * i, row_x2, col_x2);
Matrix temp;
if (transpose_b) {
temp = x1_sparse_matrix * x2_dense_matrix.transpose();
} else if (adjoint_b) {
temp = x1_sparse_matrix * x2_dense_matrix.adjoint();
} else {
temp = x1_sparse_matrix * x2_dense_matrix;
}
if (transpose_output) {
results[i] = temp.transpose();
} else if (conjugate_output) {
results[i] = temp.conjugate();
} else {
results[i] = temp;
}
}
}),
"SparseMatrixMatMul Compute failed.");
// compute result_row_pointers|result_col_indices|result_values data
indiceT row_output, col_output;
row_output = results[0].rows();
col_output = results[0].cols();
for (int i = 0; i < batch_size; i++) {
valueT *output_values_data = static_cast<valueT *>(ctx.Output(0)->GetData());
std::copy(results[i].data(), results[i].data() + row_output * col_output,
output_values_data + i * row_output * col_output);
}
KERNEL_LOG_DEBUG("DoCompute end!!");
return KERNEL_STATUS_OK;
}
// register the operator
REGISTER_CPU_KERNEL(SparseMatrixMatMul, SparseMatrixMatMulCpuKernel);
} // namespace aicpu

View File

@@ -0,0 +1,46 @@
/**
* Copyright (c) Huawei Technologies Co., Ltd. 2022-2022. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef AICPU_KERNELS_NORMALIZED_SPARSEMATRIXSPARSEMATMUL_H_
#define AICPU_KERNELS_NORMALIZED_SPARSEMATRIXSPARSEMATMUL_H_
#include "Eigen/Core"
#include "Eigen/SparseCore"
#include "cpu_ops_kernel.h"
namespace aicpu {
class SparseMatrixMatMulCpuKernel : public CpuKernel {
public:
~SparseMatrixMatMulCpuKernel() = default;
uint32_t Compute(CpuKernelContext &ctx) override;
private:
uint32_t ValidParam(CpuKernelContext &ctx);
// check if the two matrices can be multiplied
template <typename T>
uint32_t CheckMatMul(CpuKernelContext &ctx);
// create eigen sparsematrix with eigen::map
template <typename indiceT, typename valueT>
Eigen::Ref<const Eigen::SparseMatrix<valueT, Eigen::RowMajor, indiceT> > CreateEigenSparseMatrix(
indiceT rows, indiceT cols, int64_t nnz, indiceT *row_pointers, indiceT *col_indices, valueT *values,
bool transpose, bool adjoint);
// do the actual compute
template <typename indiceT, typename valueT>
uint32_t DoCompute(CpuKernelContext &ctx);
};
} // namespace aicpu
#endif

View File

@@ -0,0 +1,86 @@
/**
* Copyright (c) Huawei Technologies Co., Ltd. 2022-2022. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "sparse_matrix_nnz.h"
#include <securec.h>
#include <complex>
#include <numeric>
#include <string>
#include "cpu_kernel_utils.h"
#include "cpu_types.h"
#include "kernel_log.h"
#include "status.h"
#include "utils/allocator_utils.h"
#include "utils/kernel_util.h"
using namespace std;
namespace aicpu {
const char *SparseMatrixNNZ = "SparseMatrixNNZ";
const int INPUT_PARAMS_NUM = 5;
const int OUTPUT_PARAMS_NUM = 1;
} // namespace aicpu
namespace aicpu {
uint32_t SparseMatrixNNZCpuKernel::Compute(CpuKernelContext &ctx) {
if (NormalCheck(ctx, INPUT_PARAMS_NUM, OUTPUT_PARAMS_NUM) != KERNEL_STATUS_OK) {
return KERNEL_STATUS_PARAM_INVALID;
}
DataType indice_type = ctx.Input(1)->GetDataType();
uint32_t status;
switch (indice_type) {
case DT_INT32:
status = DoCompute<int32_t>(ctx);
break;
case DT_INT64:
status = DoCompute<int64_t>(ctx);
break;
default:
KERNEL_LOG_ERROR("data type of batch pointers is not int32 or int64");
status = KERNEL_STATUS_PARAM_INVALID;
}
if (status != KERNEL_STATUS_OK) {
KERNEL_LOG_ERROR("error in do the actual compute!");
return KERNEL_STATUS_PARAM_INVALID;
}
return KERNEL_STATUS_OK;
}
template <typename indiceT>
uint32_t SparseMatrixNNZCpuKernel::DoCompute(CpuKernelContext &ctx) {
const indiceT batch_size = ctx.Input(1)->NumElements() - 1;
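// batch_pointers (Input 1) has batch_size + 1 entries; the nnz of each batch is the difference
// between consecutive batch pointers.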
// define some temp arrays to store the output tensor data
int32_t result_nnz[batch_size];
// do compute
indiceT *batch_pointers_x = static_cast<indiceT *>(ctx.Input(1)->GetData());
indiceT curr = 0;
for (int i = 1; i < batch_size + 1; i++) {
result_nnz[i - 1] = batch_pointers_x[i] - curr;
// update curr
curr = batch_pointers_x[i];
}
// write result
int32_t *output_y = static_cast<int32_t *>(ctx.Output(0)->GetData());
std::copy(result_nnz, result_nnz + (int32_t)batch_size, output_y);
KERNEL_LOG_DEBUG("DoCompute end!!");
return KERNEL_STATUS_OK;
}
// register the operator
REGISTER_CPU_KERNEL(SparseMatrixNNZ, SparseMatrixNNZCpuKernel);
} // namespace aicpu

View File

@@ -0,0 +1,35 @@
/**
* Copyright (c) Huawei Technologies Co., Ltd. 2022-2022. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef AICPU_KERNELS_NORMALIZED_SPARSEMATRIXNNZ_H_
#define AICPU_KERNELS_NORMALIZED_SPARSEMATRIXNNZ_H_
#include "cpu_ops_kernel.h"
namespace aicpu {
class SparseMatrixNNZCpuKernel : public CpuKernel {
public:
~SparseMatrixNNZCpuKernel() = default;
uint32_t Compute(CpuKernelContext &ctx) override;
private:
// do the actual compute
template <typename indiceT>
uint32_t DoCompute(CpuKernelContext &ctx);
};
} // namespace aicpu
#endif

View File

@@ -0,0 +1,337 @@
/**
* Copyright (c) Huawei Technologies Co., Ltd. 2022-2022. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "sparse_matrix_transpose.h"
#include "cpu_kernel_utils.h"
#include "utils/eigen_tensor.h"
#include "utils/kernel_util.h"
#include <numeric>
#include <iostream>
using namespace std;
namespace aicpu {
const uint32_t kInputNum = 5;
const uint32_t kOutputNum = 5;
const uint32_t kzero = 0;
const uint32_t kone = 1;
const uint32_t ktwo = 2;
const uint32_t kthree = 3;
const uint32_t kfour = 4;
const uint32_t krankwithbatch = 3;
const char *SPARSEMATRIXTRANSPOSE = "SparseMatrixTranspose";
} // namespace aicpu
namespace aicpu {
uint32_t SparseMatrixTransposeCpuKernel::Compute(CpuKernelContext &ctx) {
KERNEL_HANDLE_ERROR(NormalCheck(ctx, kInputNum, kOutputNum), "SparseMatrixTranspose normal check failed.");
DataType indice_type = ctx.Input(0)->GetDataType();
DataType value_type = ctx.Input(4)->GetDataType();
uint32_t status;
switch (indice_type) {
case DT_INT32:
switch (value_type) {
case DT_UINT8:
status = SparseMatrixTransposeCompute<int32_t, uint8_t>(ctx);
break;
case DT_UINT16:
status = SparseMatrixTransposeCompute<int32_t, uint16_t>(ctx);
break;
case DT_UINT32:
status = SparseMatrixTransposeCompute<int32_t, uint32_t>(ctx);
break;
case DT_UINT64:
status = SparseMatrixTransposeCompute<int32_t, uint64_t>(ctx);
break;
case DT_INT8:
status = SparseMatrixTransposeCompute<int32_t, int8_t>(ctx);
break;
case DT_INT16:
status = SparseMatrixTransposeCompute<int32_t, int16_t>(ctx);
break;
case DT_INT32:
status = SparseMatrixTransposeCompute<int32_t, int32_t>(ctx);
break;
case DT_INT64:
status = SparseMatrixTransposeCompute<int32_t, int64_t>(ctx);
break;
case DT_FLOAT16:
status = SparseMatrixTransposeCompute<int32_t, Eigen::half>(ctx);
break;
case DT_FLOAT:
status = SparseMatrixTransposeCompute<int32_t, float_t>(ctx);
break;
case DT_DOUBLE:
status = SparseMatrixTransposeCompute<int32_t, double_t>(ctx);
break;
case DT_COMPLEX64:
status = SparseMatrixTransposeComputecomplex<int32_t, complex<float_t>>(ctx);
break;
case DT_COMPLEX128:
status = SparseMatrixTransposeComputecomplex<int32_t, complex<double_t>>(ctx);
break;
default:
KERNEL_LOG_ERROR("data type of x_value is not required");
return KERNEL_STATUS_PARAM_INVALID;
}
break;
case DT_INT64:
switch (value_type) {
case DT_UINT8:
status = SparseMatrixTransposeCompute<int64_t, uint8_t>(ctx);
break;
case DT_UINT16:
status = SparseMatrixTransposeCompute<int64_t, uint16_t>(ctx);
break;
case DT_UINT32:
status = SparseMatrixTransposeCompute<int64_t, uint32_t>(ctx);
break;
case DT_UINT64:
status = SparseMatrixTransposeCompute<int64_t, uint64_t>(ctx);
break;
case DT_INT8:
status = SparseMatrixTransposeCompute<int64_t, int8_t>(ctx);
break;
case DT_INT16:
status = SparseMatrixTransposeCompute<int64_t, int16_t>(ctx);
break;
case DT_INT32:
status = SparseMatrixTransposeCompute<int64_t, int32_t>(ctx);
break;
case DT_INT64:
status = SparseMatrixTransposeCompute<int64_t, int64_t>(ctx);
break;
case DT_FLOAT16:
status = SparseMatrixTransposeCompute<int64_t, Eigen::half>(ctx);
break;
case DT_FLOAT:
status = SparseMatrixTransposeCompute<int64_t, float_t>(ctx);
break;
case DT_DOUBLE:
status = SparseMatrixTransposeCompute<int64_t, double_t>(ctx);
break;
case DT_COMPLEX64:
status = SparseMatrixTransposeComputecomplex<int64_t, complex<float_t>>(ctx);
break;
case DT_COMPLEX128:
status = SparseMatrixTransposeComputecomplex<int64_t, complex<double_t>>(ctx);
break;
default:
KERNEL_LOG_ERROR("data type of x_value is not required");
return KERNEL_STATUS_PARAM_INVALID;
}
break;
default:
KERNEL_LOG_ERROR("data type of dense shape is not int32 or int64");
return KERNEL_STATUS_PARAM_INVALID;
}
if (status != KERNEL_STATUS_OK) {
KERNEL_LOG_ERROR("error in do the actual compute!");
return KERNEL_STATUS_PARAM_INVALID;
}
return KERNEL_STATUS_OK;
}
template <typename indiceT, typename valueT>
uint32_t SparseMatrixTransposeCpuKernel::SparseMatrixTransposeCompute(CpuKernelContext &ctx) {
indiceT *x_dense_shape = static_cast<indiceT *>(ctx.Input(0)->GetData());
indiceT *x_batch_pointers = static_cast<indiceT *>(ctx.Input(1)->GetData());
indiceT *x_row_pointers = static_cast<indiceT *>(ctx.Input(2)->GetData());
indiceT *x_col_indices = static_cast<indiceT *>(ctx.Input(3)->GetData());
valueT *x_values = static_cast<valueT *>(ctx.Input(4)->GetData());
bool conjugate = (ctx.GetAttr("conjugate") == nullptr) ? false : ctx.GetAttr("conjugate")->GetBool();
indiceT *y_dense_shape = static_cast<indiceT *>(ctx.Output(0)->GetData());
indiceT *y_batch_pointers = static_cast<indiceT *>(ctx.Output(1)->GetData());
indiceT *y_row_pointers = static_cast<indiceT *>(ctx.Output(2)->GetData());
indiceT *y_col_indices = static_cast<indiceT *>(ctx.Output(3)->GetData());
valueT *y_values = static_cast<valueT *>(ctx.Output(4)->GetData());
auto rank = ctx.Input(0)->NumElements();
if (rank == krankwithbatch) {
y_dense_shape[0] = x_dense_shape[0];
y_dense_shape[1] = x_dense_shape[ktwo];
y_dense_shape[ktwo] = x_dense_shape[1];
} else {
y_dense_shape[0] = x_dense_shape[1];
y_dense_shape[1] = x_dense_shape[0];
}
auto batch_pointers = ctx.Input(1)->NumElements();
for (int i = 0; i < batch_pointers; ++i) {
y_batch_pointers[i] = x_batch_pointers[i];
}
auto num_rows = x_dense_shape[rank - 2];
auto num_cols = x_dense_shape[rank - 1];
auto num_batch = ctx.Input(1)->NumElements() - 1;
int y_part_row_pointers[num_cols + 1];
int part_row_pointers[num_rows + 1];
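// For every batch: count the entries falling into each column to build the transposed row pointers
// (via a prefix sum), then scatter the values and their row indices into the transposed CSR arrays.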
for (int j = 0; j < num_batch; ++j) {
int n = x_batch_pointers[j + 1] - x_batch_pointers[j];
valueT part_values[n];
indiceT part_col_indices[n];
indiceT y_part_col_indices[n];
valueT y_part_values[n];
for (int i = 0; i < num_cols + 1; ++i) {
y_part_row_pointers[i] = 0;
}
for (int k = 0; k < num_rows + 1; ++k) {
part_row_pointers[k] = x_row_pointers[(num_rows + 1) * j + k];
}
for (int k = 0; k < n; ++k) {
part_values[k] = x_values[x_batch_pointers[j] + k];
part_col_indices[k] = x_col_indices[x_batch_pointers[j] + k];
}
for (int64_t i = 0; i < n; ++i) {
y_part_row_pointers[part_col_indices[i] + 1] += 1;
}
std::partial_sum(y_part_row_pointers, y_part_row_pointers + num_cols + 1, y_part_row_pointers);
for (int k = 0; k < num_cols + 1; ++k) {
y_row_pointers[(num_cols + 1) * j + k] = y_part_row_pointers[k];
}
for (int k = 0; k < n; ++k) {
part_values[k] = x_values[x_batch_pointers[j] + k];
part_col_indices[k] = x_col_indices[x_batch_pointers[j] + k];
}
std::vector<int> current_col_count(num_cols);
for (int row_idx = 0; row_idx < num_rows; ++row_idx) {
const int64_t row_begin = part_row_pointers[row_idx];
const int64_t row_end = part_row_pointers[row_idx + 1];
for (int64_t i = row_begin; i < row_end; ++i) {
const int col_idx = part_col_indices[i];
const int64_t offset = y_part_row_pointers[col_idx] + current_col_count[col_idx];
y_part_col_indices[offset] = row_idx;
y_part_values[offset] = part_values[i];
current_col_count[col_idx] += 1;
}
}
for (int k = 0; k < n; ++k) {
y_values[x_batch_pointers[j] + k] = y_part_values[k];
y_col_indices[x_batch_pointers[j] + k] = y_part_col_indices[k];
}
}
// conjugation is a no-op for non-complex value types, so there is nothing more to do here
auto output = ctx.Output(2);
auto output_shape = output->GetTensorShape();
if (rank == ktwo) {
output_shape->SetDimSizes({num_cols + 1});
} else {
output_shape->SetDimSizes({x_dense_shape[0] * (num_cols + 1)});
}
output->SetTensorShape(output_shape.get());
return KERNEL_STATUS_OK;
}
template <typename indiceT, typename valueT>
uint32_t SparseMatrixTransposeCpuKernel::SparseMatrixTransposeComputecomplex(CpuKernelContext &ctx) {
indiceT *x_dense_shape = static_cast<indiceT *>(ctx.Input(0)->GetData());
indiceT *x_batch_pointers = static_cast<indiceT *>(ctx.Input(1)->GetData());
indiceT *x_row_pointers = static_cast<indiceT *>(ctx.Input(2)->GetData());
indiceT *x_col_indices = static_cast<indiceT *>(ctx.Input(3)->GetData());
valueT *x_values = static_cast<valueT *>(ctx.Input(4)->GetData());
bool conjugate = (ctx.GetAttr("conjugate") == nullptr) ? false : ctx.GetAttr("conjugate")->GetBool();
indiceT *y_dense_shape = static_cast<indiceT *>(ctx.Output(0)->GetData());
indiceT *y_batch_pointers = static_cast<indiceT *>(ctx.Output(1)->GetData());
indiceT *y_row_pointers = static_cast<indiceT *>(ctx.Output(2)->GetData());
indiceT *y_col_indices = static_cast<indiceT *>(ctx.Output(3)->GetData());
valueT *y_values = static_cast<valueT *>(ctx.Output(4)->GetData());
auto rank = ctx.Input(0)->NumElements();
if (rank == krankwithbatch) {
y_dense_shape[0] = x_dense_shape[0];
y_dense_shape[1] = x_dense_shape[ktwo];
y_dense_shape[ktwo] = x_dense_shape[1];
} else {
y_dense_shape[0] = x_dense_shape[1];
y_dense_shape[1] = x_dense_shape[0];
}
auto batch_pointers = ctx.Input(1)->NumElements();
for (int i = 0; i < batch_pointers; ++i) {
y_batch_pointers[i] = x_batch_pointers[i];
}
auto num_rows = x_dense_shape[rank - 2];
auto num_cols = x_dense_shape[rank - 1];
auto num_batch = ctx.Input(1)->NumElements() - 1;
int y_part_row_pointers[num_cols + 1];
int part_row_pointers[num_rows + 1];
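// Same per-batch CSR transpose as the real-typed path; the values are additionally conjugated
// afterwards when the "conjugate" attribute is set.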
for (int j = 0; j < num_batch; ++j) {
int n = x_batch_pointers[j + 1] - x_batch_pointers[j];
valueT part_values[n];
indiceT part_col_indices[n];
indiceT y_part_col_indices[n];
valueT y_part_values[n];
for (int i = 0; i < num_cols + 1; ++i) {
y_part_row_pointers[i] = 0;
}
for (int k = 0; k < num_rows + 1; ++k) {
part_row_pointers[k] = x_row_pointers[(num_rows + 1) * j + k];
}
for (int k = 0; k < n; ++k) {
part_values[k] = x_values[x_batch_pointers[j] + k];
part_col_indices[k] = x_col_indices[x_batch_pointers[j] + k];
}
for (int64_t i = 0; i < n; ++i) {
y_part_row_pointers[part_col_indices[i] + 1] += 1;
}
std::partial_sum(y_part_row_pointers, y_part_row_pointers + num_cols + 1, y_part_row_pointers);
for (int k = 0; k < num_cols + 1; ++k) {
y_row_pointers[(num_cols + 1) * j + k] = y_part_row_pointers[k];
}
for (int k = 0; k < n; ++k) {
part_values[k] = x_values[x_batch_pointers[j] + k];
part_col_indices[k] = x_col_indices[x_batch_pointers[j] + k];
}
std::vector<int> current_col_count(num_cols);
for (int row_idx = 0; row_idx < num_rows; ++row_idx) {
const int64_t row_begin = part_row_pointers[row_idx];
const int64_t row_end = part_row_pointers[row_idx + 1];
for (int64_t i = row_begin; i < row_end; ++i) {
const int col_idx = part_col_indices[i];
const int64_t offset = y_part_row_pointers[col_idx] + current_col_count[col_idx];
y_part_col_indices[offset] = row_idx;
y_part_values[offset] = part_values[i];
current_col_count[col_idx] += 1;
}
}
for (int k = 0; k < n; ++k) {
y_values[x_batch_pointers[j] + k] = y_part_values[k];
y_col_indices[x_batch_pointers[j] + k] = y_part_col_indices[k];
}
}
if (conjugate == true) {
for (int i = 0; i < ctx.Input(kfour)->GetTensorShape()->NumElements(); ++i) {
y_values[i] = std::conj(y_values[i]);
}
}
auto output = ctx.Output(2);
auto output_shape = output->GetTensorShape();
if (rank == ktwo) {
output_shape->SetDimSizes({num_cols + 1});
} else {
output_shape->SetDimSizes({x_dense_shape[0] * (num_cols + 1)});
}
output->SetTensorShape(output_shape.get());
return KERNEL_STATUS_OK;
}
REGISTER_CPU_KERNEL(SPARSEMATRIXTRANSPOSE, SparseMatrixTransposeCpuKernel);
} // namespace aicpu

View File

@@ -0,0 +1,37 @@
/**
* Copyright (c) Huawei Technologies Co., Ltd. 2022-2022. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef AICPU_KERNELS_NORMALIZED_SPARSEMATRIXTRANSPOSE_H_
#define AICPU_KERNELS_NORMALIZED_SPARSEMATRIXTRANSPOSE_H_
#include "cpu_ops_kernel.h"
#include "utils/sparse_tensor.h"
#include "Eigen/SparseCore"
namespace aicpu {
class SparseMatrixTransposeCpuKernel : public CpuKernel {
public:
~SparseMatrixTransposeCpuKernel() = default;
uint32_t Compute(CpuKernelContext &ctx) override;
private:
uint32_t SparseMatrixTransposeParamCheck(CpuKernelContext &ctx);
template <typename indiceT, typename valueT>
uint32_t SparseMatrixTransposeCompute(CpuKernelContext &ctx);
template <typename indiceT, typename valueT>
uint32_t SparseMatrixTransposeComputecomplex(CpuKernelContext &ctx);
};
} // namespace aicpu
#endif

View File

@@ -0,0 +1,180 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "sparse_reshape.h"
#include <vector>
#include "cpu_kernel_utils.h"
#include "securec.h"
#include "utils/eigen_tensor.h"
#include "utils/kernel_util.h"
namespace {
constexpr uint32_t kSparseReshapeInputNum = 3;
constexpr uint32_t kSparseReshapeOutputNum = 2;
const char *kSparseReshape = "SparseReshape";
// when the input data size is more than kParallelDataNumSameShape, use the parallel path
const int64_t kParallelDataNumSameShape = 24 * 1024;
const int64_t kParallelDataNumSameShapeMid = 35 * 1024;
} // namespace
namespace aicpu {
void SparseReshapeCpuKernel::SpecialCompute(int64_t start, int64_t end, const int64_t *in0, int64_t *out0,
const int64_t *input_strides, const int64_t *output_strides,
const int64_t input_rank, const int64_t output_rank) {
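// Flatten every sparse index into a linear offset with the input strides, then decompose that
// offset back into coordinates of the requested shape with the output strides.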
for (int i = start; i < end; i++) {
int64_t id = 0;
for (int j = 0; j < input_rank; j++) {
id += *(in0 + i * input_rank + j) * input_strides[j];
}
for (int j = 0; j < output_rank; j++) {
*(out0 + i * output_rank + j) = id / output_strides[j];
id %= output_strides[j];
}
}
}
uint32_t SparseReshapeCpuKernel::Compute(CpuKernelContext &ctx) {
// check params
KERNEL_HANDLE_ERROR(NormalCheck(ctx, kSparseReshapeInputNum, kSparseReshapeOutputNum), "[%s] check params failed.",
kSparseReshape);
Tensor *input_0 = ctx.Input(0);
Tensor *input_1 = ctx.Input(1);
Tensor *input_2 = ctx.Input(2);
Tensor *output_0 = ctx.Output(0);
Tensor *output_1 = ctx.Output(1);
KERNEL_CHECK_FALSE(
(input_0->GetDataType() == DT_INT64 && input_1->GetDataType() == DT_INT64 && input_2->GetDataType() == DT_INT64 &&
output_0->GetDataType() == DT_INT64 && output_1->GetDataType() == DT_INT64),
KERNEL_STATUS_INNER_ERROR, "the data of SparseReshape kernel must be DT_INT64.");
KERNEL_CHECK_FALSE((input_0->GetTensorShape()->GetDimSize(1) == input_1->GetTensorShape()->GetDimSize(0)),
KERNEL_STATUS_INNER_ERROR, "Input tensor rank must match input shape length.");
int64_t *in0 = reinterpret_cast<int64_t *>(input_0->GetData());
int64_t *in1 = reinterpret_cast<int64_t *>(input_1->GetData());
int64_t *in2 = reinterpret_cast<int64_t *>(input_2->GetData());
int64_t *out0 = reinterpret_cast<int64_t *>(output_0->GetData());
int64_t *out1 = reinterpret_cast<int64_t *>(output_1->GetData());
const int64_t input_rank = input_1->NumElements();
const int64_t output_rank = input_2->NumElements();
const int64_t nnz = input_0->GetTensorShape()->GetDimSize(0);
int64_t dense_size = 1;
int64_t product = 1;
int64_t out_num = 1;
int unknown_index = -1;
for (int i = 0; i < input_rank; i++) {
dense_size *= *(in1 + i);
}
for (int d = 0; d < output_rank; d++) {
const int64_t size = *(in2 + d);
if (size == -1) {
KERNEL_CHECK_FALSE((unknown_index == -1), KERNEL_STATUS_INNER_ERROR,
"only one output dimension may be -1, "
"not both [%d] and [%d]",
unknown_index, d);
unknown_index = d;
} else {
KERNEL_CHECK_FALSE((size >= 0), KERNEL_STATUS_INNER_ERROR, "size [%d] must be non-negative, not [%ld]", d, size);
product *= size;
*(out1 + d) = size;
out_num *= size;
}
}
if (unknown_index != -1) {
KERNEL_CHECK_FALSE((product >= 0), KERNEL_STATUS_INNER_ERROR,
"reshape cannot infer the missing "
"input size for an empty tensor unless all "
"specified input sizes are non-zero");
const int64_t missing = dense_size / product;
KERNEL_CHECK_FALSE((product * missing == dense_size), KERNEL_STATUS_INNER_ERROR,
"Input to reshape is a SparseTensor with [%ld]"
" dense values, but the requested shape requires"
" a multiple of [%ld].",
dense_size, product);
out_num *= missing;
*(out1 + unknown_index) = missing;
}
KERNEL_CHECK_FALSE((out_num == dense_size), KERNEL_STATUS_INNER_ERROR,
"Input to reshape is a tensor with [%ld]"
" dense values, but the requested shape has [%ld].",
dense_size, out_num);
int64_t input_size = input_0->GetDataSize();
int64_t output_size = output_0->GetDataSize();
if (input_size == output_size && input_rank == output_rank) {
bool flag = true;
for (int64_t i = 0; i < input_rank; ++i) {
if (*(in1 + i) != *(out1 + i)) {
flag = false;
break;
}
}
if (flag) {
auto mem_ret = memcpy_s(out0, output_size, in0, input_size);
KERNEL_CHECK_FALSE(mem_ret == EOK, KERNEL_STATUS_INNER_ERROR,
"[%s] memcpy_s to output failed, destMax [%ld], count [%ld].", kSparseReshape, output_size,
input_size);
return KERNEL_STATUS_OK;
}
}
if (nnz <= 0) return KERNEL_STATUS_OK;
int64_t *input_strides = new int64_t[input_rank];
int64_t *output_strides = new int64_t[output_rank];
if (input_rank > 0) {
input_strides[input_rank - 1] = 1;
for (int d = input_rank - 2; d >= 0; d--) {
input_strides[d] = input_strides[d + 1] * *(in1 + d + 1);
}
}
if (output_rank > 0) {
output_strides[output_rank - 1] = 1;
for (int d = output_rank - 2; d >= 0; d--) {
output_strides[d] = output_strides[d + 1] * *(out1 + d + 1);
}
}
if (nnz * input_rank >= kParallelDataNumSameShape) {
uint32_t min_core_num = 1;
uint32_t max_core_num = std::max(min_core_num, aicpu::CpuKernelUtils::GetCPUNum(ctx) - kResvCpuNum);
KERNEL_CHECK_FALSE(max_core_num != 0, KERNEL_STATUS_INNER_ERROR, "core num should not be 0.");
if (nnz * input_rank <= kParallelDataNumSameShapeMid) {
max_core_num = std::min(max_core_num, 4U); // up to 4 cpu cores
}
if (max_core_num > nnz) {
max_core_num = nnz;
}
auto sharder_sparse_reshape = [&](int64_t start, int64_t end) {
SpecialCompute(start, end, in0, out0, input_strides, output_strides, input_rank, output_rank);
};
KERNEL_HANDLE_ERROR(CpuKernelUtils::ParallelFor(ctx, nnz, nnz / max_core_num, sharder_sparse_reshape),
"SparseReshape Compute failed.");
} else {
SpecialCompute(0, nnz, in0, out0, input_strides, output_strides, input_rank, output_rank);
}
delete[] input_strides;
delete[] output_strides;
return KERNEL_STATUS_OK;
}
REGISTER_CPU_KERNEL(kSparseReshape, SparseReshapeCpuKernel);
} // namespace aicpu

View File

@@ -0,0 +1,33 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef AICPU_KERNELS_NORMALIZED_SPARSE_RESHAPE_H_
#define AICPU_KERNELS_NORMALIZED_SPARSE_RESHAPE_H_
#include "cpu_ops_kernel.h"
#include "utils/bcast.h"
namespace aicpu {
class SparseReshapeCpuKernel : public CpuKernel {
public:
~SparseReshapeCpuKernel() override = default;
uint32_t Compute(CpuKernelContext &ctx) override;
private:
void SpecialCompute(int64_t start, int64_t end, const int64_t *in0, int64_t *out0, const int64_t *input_strides,
const int64_t *output_strides, const int64_t input_rank, const int64_t output_rank);
};
} // namespace aicpu
#endif

View File

@@ -0,0 +1,136 @@
/**
* Copyright (c) Huawei Technologies Co., Ltd. 2022-2022. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "sparse_segment_sqrt_n_grad.h"
#include "Eigen/Core"
#include "utils/eigen_tensor.h"
#include "utils/kernel_util.h"
namespace {
const uint32_t kInputNum = 4;
const uint32_t kOutputNum = 1;
const char *SparseSegmentSqrtNGrad = "SparseSegmentSqrtNGrad";
} // namespace
namespace aicpu {
uint32_t SparseSegmentSqrtNGradCpuKernel::Compute(CpuKernelContext &ctx) {
KERNEL_HANDLE_ERROR(NormalCheck(ctx, kInputNum, kOutputNum),
"SparseSegmentSqrtNGrad check input and output number failed.");
Tensor *inputx = ctx.Input(0);
Tensor *input_indices = ctx.Input(1);
Tensor *input_segment_ids = ctx.Input(2);
Tensor *input_output_dim = ctx.Input(3);
auto data_type0 = inputx->GetDataType();
auto data_type1 = input_indices->GetDataType();
auto data_type2 = input_segment_ids->GetDataType();
auto data_type3 = input_output_dim->GetDataType();
if (data_type0 != DT_FLOAT && data_type0 != DT_DOUBLE && data_type0 != DT_FLOAT16) {
KERNEL_LOG_ERROR("SparseSegmentSqrtNGrad kernel data type [%u] not support.");
return KERNEL_STATUS_PARAM_INVALID;
}
if (data_type1 != data_type2 || data_type1 != data_type3 || data_type1 != DT_INT32) {
KERNEL_LOG_ERROR("SparseSegmentSqrtNGrad kernel data type [%u] not support.", data_type1);
return KERNEL_STATUS_PARAM_INVALID;
}
auto shape0 = inputx->GetTensorShape();
auto shape1 = input_indices->GetTensorShape();
auto shape2 = input_segment_ids->GetTensorShape();
auto scalarshape = input_output_dim->GetTensorShape();
if (shape0->GetDims() < 1) {
KERNEL_LOG_ERROR("[%s] Tensor input0's rank less than 1.", ctx.GetOpType().c_str());
return KERNEL_STATUS_PARAM_INVALID;
}
if (shape1->NumElements() != shape2->NumElements()) {
KERNEL_LOG_ERROR("[%s] Tensor input1&input2's ranks mismatch.", ctx.GetOpType().c_str());
return KERNEL_STATUS_PARAM_INVALID;
}
if (data_type0 == DT_FLOAT) {
return ComputeKernal<float>(ctx);
} else if (data_type0 == DT_DOUBLE) {
return ComputeKernal<double>(ctx);
} else {
return ComputeKernal<Eigen::half>(ctx);
}
return KERNEL_STATUS_OK;
}
template <typename T>
uint32_t SparseSegmentSqrtNGradCpuKernel::ComputeKernal(CpuKernelContext &ctx) {
size_t n = ctx.Input(0)->GetTensorShape()->NumElements() / ctx.Input(0)->GetTensorShape()->GetDimSize(0);
size_t m = ctx.Input(2)->GetTensorShape()->NumElements();
int l = ctx.Output(0)->GetTensorShape()->GetDimSize(0);
auto x_addr = reinterpret_cast<T *>(ctx.Input(0)->GetData());
auto indices_addr = reinterpret_cast<int32_t *>(ctx.Input(1)->GetData());
auto segment_ids_addr = reinterpret_cast<int32_t *>(ctx.Input(2)->GetData());
int k = *reinterpret_cast<int32_t *>(ctx.Input(3)->GetData());
auto y_addr = reinterpret_cast<T *>(ctx.Output(0)->GetData());
std::vector<int64_t> y_shape_values = ctx.Input(0)->GetTensorShape()->GetDimSizes();
y_shape_values[0] = k;
ctx.Output(0)->GetTensorShape()->SetDimSizes(y_shape_values);
const size_t tensor_dim = 2;
Eigen::TensorMap<Eigen::Tensor<T, tensor_dim>, Eigen::Aligned> res_map(y_addr, l, n);
res_map.setZero();
for (size_t i = 1; i < m; i++) {
if (segment_ids_addr[i] < segment_ids_addr[i - 1]) {
KERNEL_LOG_ERROR("Segment_ids should be sorted.");
return KERNEL_STATUS_PARAM_INVALID;
}
}
for (size_t i = 0; i < m; i++) {
if (indices_addr[i] >= ctx.Input(0)->GetTensorShape()->GetDimSize(0)) {
KERNEL_LOG_ERROR("Indices out of range.");
return KERNEL_STATUS_PARAM_INVALID;
}
if (segment_ids_addr[i] >= k) {
KERNEL_LOG_ERROR("Segment_ids out of range.");
return KERNEL_STATUS_PARAM_INVALID;
}
}
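// Distribute the incoming gradient: each entry of a finished segment adds x[segment_id] / sqrt(segment_size) into the output row selected by its index.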
int beginindex = segment_ids_addr[0];
size_t countnum = 1;
for (size_t i = 1; i < m; i++) {
if (segment_ids_addr[i] == beginindex) {
countnum++;
continue;
}
for (size_t j = 1; j <= countnum; j++) {
for (size_t l = 0; l < n; l++) {
y_addr[indices_addr[i - j] * n + l] += x_addr[beginindex * n + l] / (T)(sqrt(countnum));
}
}
beginindex = segment_ids_addr[i];
countnum = 1;
}
int i = m;
for (size_t j = 1; j <= countnum; j++) {
for (size_t l = 0; l < n; l++) {
y_addr[indices_addr[i - j] * n + l] += x_addr[beginindex * n + l] / (T)(sqrt(countnum));
}
}
return KERNEL_STATUS_OK;
};
REGISTER_CPU_KERNEL(SparseSegmentSqrtNGrad, SparseSegmentSqrtNGradCpuKernel);
} // namespace aicpu

View File

@ -0,0 +1,37 @@
/**
* Copyright (c) Huawei Technologies Co., Ltd. 2022-2022. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef AICPU_KERNELS_NORMALIZED_SPARSE_SEGMENT_SQRT_N_GRAD_H_
#define AICPU_KERNELS_NORMALIZED_SPARSE_SEGMENT_SQRT_N_GRAD_H_
#include "cpu_ops_kernel.h"
#include "cpu_types.h"
#include "utils/bcast.h"
namespace aicpu {
class SparseSegmentSqrtNGradCpuKernel : public CpuKernel {
public:
SparseSegmentSqrtNGradCpuKernel() = default;
~SparseSegmentSqrtNGradCpuKernel() override = default;
protected:
uint32_t Compute(CpuKernelContext &ctx) override;
private:
template <typename T>
static uint32_t ComputeKernal(CpuKernelContext &ctx);
};
} // namespace aicpu
#endif

View File

@ -0,0 +1,186 @@
/**
* Copyright (c) Huawei Technologies Co., Ltd. 2022-2022. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "sparse_segment_sqrt_n_with_num_segments.h"
#include <math.h>
#include "Eigen/Core"
#include "utils/eigen_tensor.h"
#include "utils/kernel_util.h"
namespace {
const uint32_t kInputNum = 4;
const uint32_t kOutputNum = 1;
const char *SparseSegmentSqrtNWithNumSegments = "SparseSegmentSqrtNWithNumSegments";
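// Dispatch on the value type plus the three index-like inputs (indices, segment_ids, num_segments), each of which may be int32 or int64.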
#define COMPUTE_CASE(DTYPE, TYPE, DTYPE_1, DTYPE_2, DTYPE_3, CTX) \
case (DTYPE): \
if ((DTYPE_1) == DT_INT32) { \
if ((DTYPE_2) == DT_INT32) { \
if ((DTYPE_3) == DT_INT32) { \
return Computekernel<TYPE, int32_t, int32_t, int32_t>(CTX); \
} else { \
return Computekernel<TYPE, int32_t, int32_t, int64_t>(CTX); \
} \
} else { \
if ((DTYPE_3) == DT_INT32) { \
return Computekernel<TYPE, int32_t, int64_t, int32_t>(CTX); \
} else { \
return Computekernel<TYPE, int32_t, int64_t, int64_t>(CTX); \
} \
} \
} else { \
if ((DTYPE_2) == DT_INT32) { \
if ((DTYPE_3) == DT_INT32) { \
return Computekernel<TYPE, int64_t, int32_t, int32_t>(CTX); \
} else { \
return Computekernel<TYPE, int64_t, int32_t, int64_t>(CTX); \
} \
} else { \
if ((DTYPE_3) == DT_INT32) { \
return Computekernel<TYPE, int64_t, int64_t, int32_t>(CTX); \
} else { \
return Computekernel<TYPE, int64_t, int64_t, int64_t>(CTX); \
} \
} \
} \
break;
} // namespace
namespace aicpu {
uint32_t SparseSegmentSqrtNWithNumSegmentsCpuKernel::Compute(CpuKernelContext &ctx) {
KERNEL_HANDLE_ERROR(NormalCheck(ctx, kInputNum, kOutputNum), "SparseSegmentSqrtNWithNumSegments normalcheck failed.");
Tensor *x = ctx.Input(0);
Tensor *indices = ctx.Input(1);
Tensor *segment_ids = ctx.Input(2);
Tensor *num_segments = ctx.Input(3);
auto x_shape = x->GetTensorShape();
auto indices_shape = indices->GetTensorShape();
auto segment_ids_shape = segment_ids->GetTensorShape();
auto num_segments_shape = num_segments->GetTensorShape();
if (x_shape->GetDims() < 1) {
KERNEL_LOG_ERROR("[%s] Tensor x's rank less than 1.", ctx.GetOpType().c_str());
return KERNEL_STATUS_PARAM_INVALID;
}
if (indices->NumElements() != segment_ids->NumElements()) {
KERNEL_LOG_ERROR("[%s] Tensor indices&segment_ids's ranks mismatch.", ctx.GetOpType().c_str());
return KERNEL_STATUS_PARAM_INVALID;
}
auto x_data_type = x->GetDataType();
auto indices_data_type = indices->GetDataType();
auto segment_ids_data_type = segment_ids->GetDataType();
auto num_segments_data_type = num_segments->GetDataType();
if (indices_data_type != DT_INT32 && indices_data_type != DT_INT64) {
KERNEL_LOG_ERROR("SparseSegmentSqrtNWithNumSegments kernel data type [%s] not support.",
DTypeStr(indices_data_type).c_str());
return KERNEL_STATUS_PARAM_INVALID;
}
if (segment_ids_data_type != DT_INT32 && segment_ids_data_type != DT_INT64) {
KERNEL_LOG_ERROR("SparseSegmentSqrtNWithNumSegments kernel data type [%s] not support.",
DTypeStr(segment_ids_data_type).c_str());
return KERNEL_STATUS_PARAM_INVALID;
}
if (num_segments_data_type != DT_INT32 && num_segments_data_type != DT_INT64) {
KERNEL_LOG_ERROR("SparseSegmentSqrtNWithNumSegments kernel data type [%s] not support.",
DTypeStr(num_segments_data_type).c_str());
return KERNEL_STATUS_PARAM_INVALID;
}
switch (x_data_type) {
COMPUTE_CASE(DT_FLOAT16, Eigen::half, indices_data_type, segment_ids_data_type, num_segments_data_type, ctx)
COMPUTE_CASE(DT_FLOAT, float, indices_data_type, segment_ids_data_type, num_segments_data_type, ctx)
COMPUTE_CASE(DT_DOUBLE, double, indices_data_type, segment_ids_data_type, num_segments_data_type, ctx)
default:
KERNEL_LOG_ERROR(
"SparseSegmentSqrtNWithNumSegments kernel data type [%s] not "
"support.",
DTypeStr(x_data_type).c_str());
return KERNEL_STATUS_PARAM_INVALID;
}
}
REGISTER_CPU_KERNEL(SparseSegmentSqrtNWithNumSegments, SparseSegmentSqrtNWithNumSegmentsCpuKernel);
template <typename T1, typename T2, typename T3, typename T4>
uint32_t SparseSegmentSqrtNWithNumSegmentsCpuKernel::Computekernel(CpuKernelContext &ctx) {
int n = ctx.Input(0)->GetTensorShape()->NumElements() / ctx.Input(0)->GetTensorShape()->GetDimSize(0);
int m = ctx.Input(2)->GetTensorShape()->NumElements();
auto x_ptr = reinterpret_cast<T1 *>(ctx.Input(0)->GetData());
auto indices_ptr = reinterpret_cast<T2 *>(ctx.Input(1)->GetData());
auto segment_ids_ptr = reinterpret_cast<T3 *>(ctx.Input(2)->GetData());
auto num_segments_ptr = reinterpret_cast<T4 *>(ctx.Input(3)->GetData());
auto y_ptr = reinterpret_cast<T1 *>(ctx.Output(0)->GetData());
std::vector<int64_t> y_shape_values = ctx.Input(0)->GetTensorShape()->GetDimSizes();
y_shape_values[0] = num_segments_ptr[0];
ctx.Output(0)->GetTensorShape()->SetDimSizes(y_shape_values);
for (int64_t i = 1; i < m; i++) {
if (segment_ids_ptr[i] < segment_ids_ptr[i - 1]) {
KERNEL_LOG_ERROR("segment_ids should be sorted.");
return KERNEL_STATUS_PARAM_INVALID;
}
}
for (int64_t i = 0; i < m; i++) {
if (indices_ptr[i] >= ctx.Input(0)->GetTensorShape()->GetDimSize(0)) {
KERNEL_LOG_ERROR("indices out of range.");
return KERNEL_STATUS_PARAM_INVALID;
}
if (segment_ids_ptr[i] >= num_segments_ptr[0]) {
KERNEL_LOG_ERROR("segment_ids out of range.");
return KERNEL_STATUS_PARAM_INVALID;
}
}
for (int64_t i = 0; i < ctx.Output(0)->GetTensorShape()->NumElements(); i++) {
y_ptr[i] = (T1)0;
}
int oldindex = -1;
int countnum = 0;
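// Accumulate the x rows selected by indices into their segment's output row, then scale each finished segment by 1 / sqrt(segment length).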
for (int64_t i = 0; i < m; i++) {
if (oldindex == segment_ids_ptr[i]) {
countnum++;
} else if (countnum != 0) {
for (int64_t j = 0; j < n; j++) {
y_ptr[j + oldindex * n] /= (static_cast<T1>(sqrt(countnum)));
}
countnum = 1;
oldindex = segment_ids_ptr[i];
} else {
countnum = 1;
oldindex = segment_ids_ptr[i];
}
for (int64_t j = 0; j < n; j++) {
y_ptr[j + oldindex * n] += x_ptr[j + indices_ptr[i] * n];
}
}
if (countnum != 0) {
for (int64_t j = 0; j < n; j++) {
y_ptr[j + oldindex * n] /= (static_cast<T1>(sqrt(countnum)));
}
}
return KERNEL_STATUS_OK;
}
} // namespace aicpu

View File

@ -0,0 +1,38 @@
/**
* Copyright (c) Huawei Technologies Co., Ltd. 2022-2022. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef AICPU_KERNELS_NORMALIZED_SPARSE_SEGMENT_SQRT_N_WITH_NUM_SEGMENTS_H_
#define AICPU_KERNELS_NORMALIZED_SPARSE_SEGMENT_SQRT_N_WITH_NUM_SEGMENTS_H_
#include "cpu_ops_kernel.h"
#include "cpu_types.h"
#include "utils/bcast.h"
#include "utils/sparse_tensor.h"
namespace aicpu {
class SparseSegmentSqrtNWithNumSegmentsCpuKernel : public CpuKernel {
public:
SparseSegmentSqrtNWithNumSegmentsCpuKernel() = default;
~SparseSegmentSqrtNWithNumSegmentsCpuKernel() override = default;
protected:
uint32_t Compute(CpuKernelContext &ctx) override;
private:
template <typename T1, typename T2, typename T3, typename T4>
uint32_t Computekernel(CpuKernelContext &ctx);
};
} // namespace aicpu
#endif

View File

@ -0,0 +1,152 @@
/**
* Copyright (c) Huawei Technologies Co., Ltd. 2022-2022. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "sparse_segment_sum_with_num_segments.h"
#include "utils/eigen_tensor.h"
#include "utils/kernel_util.h"
namespace {
const uint32_t kInputNum = 4;
const uint32_t kOutputNum = 1;
const char *SparseSegmentSumWithNumSegments = "SparseSegmentSumWithNumSegments";
#define COMPUTE_CASE(DTYPE, TYPE, ITYPE, CTX) \
case (DTYPE): \
if ((ITYPE) == DT_INT32) { \
return ComputeKernel<TYPE, int32_t>(CTX); \
} else { \
return ComputeKernel<TYPE, int64_t>(CTX); \
} \
break;
} // namespace
namespace aicpu {
uint32_t SparseSegmentSumWithNumSegmentsCpuKernel::Compute(CpuKernelContext &ctx) {
KERNEL_HANDLE_ERROR(NormalCheck(ctx, kInputNum, kOutputNum), "SparseSegmentSumWithNumSegments normalcheck failed.");
Tensor *x = ctx.Input(0);
Tensor *indices = ctx.Input(1);
Tensor *segment_ids = ctx.Input(2);
Tensor *num_segments = ctx.Input(3);
if (x->GetDataSize() == 0 || indices->GetDataSize() == 0 || segment_ids->GetDataSize() == 0 ||
num_segments->GetDataSize() == 0) {
KERNEL_LOG_ERROR("[%s] Input is empty tensor.", ctx.GetOpType().c_str());
return KERNEL_STATUS_PARAM_INVALID;
}
auto x_shape = x->GetTensorShape();
auto indices_shape = indices->GetTensorShape();
auto segment_ids_shape = segment_ids->GetTensorShape();
auto num_segments_shape = num_segments->GetTensorShape();
if (x_shape->GetDims() < 1) {
KERNEL_LOG_ERROR("[%s] Tensor x's rank less than 1.", ctx.GetOpType().c_str());
return KERNEL_STATUS_PARAM_INVALID;
}
if (indices_shape->NumElements() != segment_ids_shape->NumElements()) {
KERNEL_LOG_ERROR("[%s] Tensor indices&segment_ids's ranks mismatch.", ctx.GetOpType().c_str());
return KERNEL_STATUS_PARAM_INVALID;
}
auto x_data_type = x->GetDataType();
auto indices_data_type = indices->GetDataType();
auto segment_ids_data_type = segment_ids->GetDataType();
auto num_segments_data_type = num_segments->GetDataType();
if (indices_data_type != DT_INT32 && indices_data_type != DT_INT64) {
KERNEL_LOG_ERROR("SparseSegmentSumWithNumSegments kernel data type [%s] not support.",
DTypeStr(indices_data_type).c_str());
return KERNEL_STATUS_PARAM_INVALID;
}
if (segment_ids_data_type != indices_data_type || num_segments_data_type != indices_data_type) {
KERNEL_LOG_ERROR("SparseSegmentSumWithNumSegments kernel data type mismatch.");
return KERNEL_STATUS_PARAM_INVALID;
}
switch (x_data_type) {
COMPUTE_CASE(DT_INT8, int8_t, indices_data_type, ctx)
COMPUTE_CASE(DT_INT16, int16_t, indices_data_type, ctx)
COMPUTE_CASE(DT_INT32, int32_t, indices_data_type, ctx)
COMPUTE_CASE(DT_INT64, int64_t, indices_data_type, ctx)
COMPUTE_CASE(DT_UINT8, uint8_t, indices_data_type, ctx)
COMPUTE_CASE(DT_UINT16, uint16_t, indices_data_type, ctx)
COMPUTE_CASE(DT_FLOAT16, Eigen::half, indices_data_type, ctx)
COMPUTE_CASE(DT_FLOAT, float, indices_data_type, ctx)
COMPUTE_CASE(DT_DOUBLE, double, indices_data_type, ctx)
default:
KERNEL_LOG_ERROR("SparseSegmentSumWithNumSegments kernel data type [%s] not support.",
DTypeStr(x_data_type).c_str());
return KERNEL_STATUS_PARAM_INVALID;
}
return KERNEL_STATUS_OK;
}
REGISTER_CPU_KERNEL(SparseSegmentSumWithNumSegments, SparseSegmentSumWithNumSegmentsCpuKernel);
template <typename dataT, typename indicesT>
uint32_t SparseSegmentSumWithNumSegmentsCpuKernel::ComputeKernel(CpuKernelContext &ctx) {
size_t n = ctx.Input(0)->GetTensorShape()->NumElements() / ctx.Input(0)->GetTensorShape()->GetDimSize(0);
size_t m = ctx.Input(2)->GetTensorShape()->NumElements();
size_t num_elements = ctx.Output(0)->GetTensorShape()->NumElements();
auto x_ptr = reinterpret_cast<dataT *>(ctx.Input(0)->GetData());
auto indices_ptr = reinterpret_cast<indicesT *>(ctx.Input(1)->GetData());
auto segment_ids_ptr = reinterpret_cast<indicesT *>(ctx.Input(2)->GetData());
auto num_segments_ptr = reinterpret_cast<indicesT *>(ctx.Input(3)->GetData());
auto y_ptr = reinterpret_cast<dataT *>(ctx.Output(0)->GetData());
std::vector<int64_t> y_shape_values = ctx.Input(0)->GetTensorShape()->GetDimSizes();
y_shape_values[0] = num_segments_ptr[0];
ctx.Output(0)->GetTensorShape()->SetDimSizes(y_shape_values);
for (size_t i = 1; i < m; i++) {
if (segment_ids_ptr[i] < segment_ids_ptr[i - 1]) {
KERNEL_LOG_ERROR("segment_ids should be sorted.");
return KERNEL_STATUS_PARAM_INVALID;
}
}
for (size_t i = 0; i < m; i++) {
if (indices_ptr[i] >= ctx.Input(0)->GetTensorShape()->GetDimSize(0)) {
KERNEL_LOG_ERROR("indices out of range.");
return KERNEL_STATUS_PARAM_INVALID;
}
if (segment_ids_ptr[i] >= num_segments_ptr[0]) {
KERNEL_LOG_ERROR("segment_ids out of range.");
return KERNEL_STATUS_PARAM_INVALID;
}
}
for (size_t i = 0; i < num_elements; i++) {
y_ptr[i] = (dataT)0;
}
int oldindex = -1;
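// Sum the x rows selected by indices into the output row given by the current segment id.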
for (size_t i = 0; i < m; i++) {
if (oldindex != segment_ids_ptr[i]) {
oldindex = segment_ids_ptr[i];
for (size_t j = 0; j < n; j++) {
y_ptr[j + oldindex * n] = (dataT)0;
}
}
for (size_t j = 0; j < n; j++) {
y_ptr[j + oldindex * n] += x_ptr[j + indices_ptr[i] * n];
}
}
return KERNEL_STATUS_OK;
};
} // namespace aicpu

View File

@ -0,0 +1,38 @@
/**
* Copyright (c) Huawei Technologies Co., Ltd. 2022-2022. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef AICPU_KERNELS_NORMALIZED_SPARSE_SEGMENT_SUM_H_
#define AICPU_KERNELS_NORMALIZED_SPARSE_SEGMENT_SUM_H_
#include "cpu_ops_kernel.h"
#include "cpu_types.h"
#include "utils/bcast.h"
namespace aicpu {
class SparseSegmentSumWithNumSegmentsCpuKernel : public CpuKernel {
public:
SparseSegmentSumWithNumSegmentsCpuKernel() = default;
~SparseSegmentSumWithNumSegmentsCpuKernel() override = default;
protected:
uint32_t Compute(CpuKernelContext &ctx) override;
private:
template <typename dataT, typename indicesT>
static uint32_t ComputeKernel(CpuKernelContext &ctx);
};
} // namespace aicpu
#endif

View File

@ -0,0 +1,253 @@
/**
* Copyright (c) Huawei Technologies Co., Ltd. 2021-2021. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "sparse_softmax_cross_entropy_with_logits.h"
#include <iostream>
#include <unsupported/Eigen/CXX11/Tensor>
#include "cpu_kernel_utils.h"
#include "cpu_types.h"
#include "kernel_log.h"
#include "status.h"
#include "utils/eigen_tensor.h"
#include "utils/kernel_util.h"
namespace {
const char *kSparseSoftmaxCrossEntropyWithLogits = "SparseSoftmaxCrossEntropyWithLogits";
const uint32_t kOutputNum{2};
const uint32_t kInputNum{2};
const uint32_t kDimSizeTwo{2};
const uint32_t kDimSizeOne{1};
const uint32_t kParallelDataNum{2048};
} // namespace
namespace aicpu {
template <typename data_type, typename label_type>
void SparseSoftmaxCrossEntropyWithLogitsSingleOp(data_type *input_features, label_type *input_labels,
data_type *output_loss, data_type *output_backprop, int64_t batch_size,
int64_t classes_num, size_t features_total) {
double_t *dims_exp_sum = static_cast<double_t *>(malloc(batch_size * sizeof(double_t)));
data_type *dims_maximum = static_cast<data_type *>(malloc(batch_size * sizeof(data_type)));
memset(dims_exp_sum, 0, batch_size * sizeof(double_t));
Eigen::TensorMap<Eigen::Tensor<data_type, kDimSizeTwo>, Eigen::Aligned> logits(input_features, batch_size,
classes_num);
Eigen::TensorMap<Eigen::Tensor<double_t, 1>, Eigen::Aligned> dims_sum(dims_exp_sum, batch_size);
Eigen::TensorMap<Eigen::Tensor<data_type, 1>, Eigen::Aligned> dims_max(dims_maximum, batch_size);
Eigen::array<int, 1> axes{{1}};
// compute softmax
dims_max = logits.maximum(axes);
const data_type constant_one(1.0);
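// Exponentiate with the per-row maximum subtracted (log-sum-exp trick) to avoid overflow while accumulating the per-row sums.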
for (size_t index = 0, batch_idx = 0; index < features_total; index++) {
output_backprop[index] = Eigen::numext::exp(input_features[index] - dims_maximum[batch_idx]);
dims_exp_sum[batch_idx] += static_cast<double_t>(output_backprop[index]);
if ((index + 1) % classes_num == 0) {
batch_idx++;
}
}
dims_sum = dims_sum.inverse();
for (size_t index = 0, batch_idx = 0; index < features_total; index++) {
*(output_backprop + index) =
static_cast<data_type>(static_cast<double_t>(*(output_backprop + index)) * dims_exp_sum[batch_idx]);
if ((index + 1) % classes_num == 0) {
batch_idx++;
}
}
label_type offset = 0;
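// Per-example loss is -log(softmax[label]); the backprop buffer becomes softmax minus the one-hot label.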
for (int64_t index = 0, batch_base = 0; index < batch_size; ++index, batch_base += classes_num) {
offset = input_labels[index];
*(output_loss + index) = -Eigen::numext::log(*(output_backprop + batch_base + offset));
*(output_backprop + batch_base + offset) = *(output_backprop + batch_base + offset) - constant_one;
}
free(dims_exp_sum);
free(dims_maximum);
}
template <typename data_type, typename label_type>
void SparseSoftmaxCrossEntropyWithLogitsMultiOp(data_type *input_features, label_type *input_labels,
data_type *output_loss, data_type *output_backprop, size_t begin,
size_t end, int64_t classes_num, size_t features_total) {
for (size_t index = begin; index < end; index++) {
size_t batch_begin = index * classes_num;
size_t batch_end = batch_begin + classes_num;
data_type max_value = input_features[batch_begin];
double_t sum_value{0};
data_type constant_one{1};
for (size_t idx = batch_begin; idx < batch_end; idx++) {
if (max_value < input_features[idx]) {
max_value = input_features[idx];
}
}
for (size_t idx = batch_begin; idx < batch_end; idx++) {
output_backprop[idx] = Eigen::numext::exp(input_features[idx] - max_value);
sum_value += static_cast<double_t>(output_backprop[idx]);
}
sum_value = double_t(1.0) / sum_value;
for (size_t idx = batch_begin; idx < batch_end; idx++) {
output_backprop[idx] = static_cast<data_type>(static_cast<double_t>(output_backprop[idx]) * sum_value);
if (idx % classes_num == static_cast<size_t>(input_labels[index])) {
output_loss[index] = -Eigen::numext::log(output_backprop[idx]);
output_backprop[idx] = output_backprop[idx] - constant_one;
}
}
}
}
std::uint32_t SparseSoftmaxCrossEntropyWithLogitsExtraCheck(CpuKernelContext &ctx) {
Tensor *input_features = ctx.Input(0);
Tensor *input_labels = ctx.Input(1);
Tensor *output_loss = ctx.Output(0);
Tensor *output_backprop = ctx.Output(1);
std::vector<int64_t> features_dims = input_features->GetTensorShape()->GetDimSizes();
std::vector<int64_t> labels_dims = input_labels->GetTensorShape()->GetDimSizes();
std::vector<int64_t> loss_dims = output_loss->GetTensorShape()->GetDimSizes();
std::vector<int64_t> backprop_dims = output_backprop->GetTensorShape()->GetDimSizes();
if ((input_features->GetDataSize() == 0) || (input_labels->GetDataSize() == 0)) {
KERNEL_LOG_INFO("[%s] Input is empty tensor.", ctx.GetOpType().c_str());
return KERNEL_STATUS_PARAM_INVALID;
}
if (input_features->GetDataType() != output_loss->GetDataType() ||
input_features->GetDataType() != output_backprop->GetDataType()) {
KERNEL_LOG_ERROR(
"The data type of the input features [%s], output loss [%s], output "
"backprop [%s] must be the same type.",
DTypeStr(ctx.Input(0)->GetDataType()).c_str(), DTypeStr(ctx.Output(0)->GetDataType()).c_str(),
DTypeStr(ctx.Output(1)->GetDataType()).c_str());
return KERNEL_STATUS_PARAM_INVALID;
}
if (input_labels->GetDataType() != DT_INT32 && input_labels->GetDataType() != DT_INT64) {
KERNEL_LOG_ERROR(
"The data type of the input labels [%s], must be the int32 or int64 "
"type.",
DTypeStr(ctx.Input(1)->GetDataType()).c_str());
return KERNEL_STATUS_PARAM_INVALID;
}
if (features_dims.size() != kDimSizeTwo || labels_dims.size() != kDimSizeOne || loss_dims.size() != kDimSizeOne ||
backprop_dims.size() != kDimSizeTwo) {
KERNEL_LOG_ERROR(
"The dims of the input features [%d], output backprop [%d] must be "
"[batch_size x num_classes]. the dims of input labels [%d], output "
"loss [%d] must be [batch_size].",
features_dims.size(), backprop_dims.size(), labels_dims.size(), loss_dims.size());
return KERNEL_STATUS_PARAM_INVALID;
}
int64_t batch_size = features_dims[0];
int64_t num_classes = features_dims[1];
if (labels_dims[0] != batch_size) {
KERNEL_LOG_ERROR("the size of label must be equal with batch_size[%d]", batch_size);
return KERNEL_STATUS_PARAM_INVALID;
}
if (loss_dims[0] != batch_size) {
KERNEL_LOG_ERROR("the size of loss must be equal with batch_size[%d]", batch_size);
return KERNEL_STATUS_PARAM_INVALID;
}
if (backprop_dims[0] != batch_size || backprop_dims[1] != num_classes) {
KERNEL_LOG_ERROR("the size of label must be equal with [%d x %d], but get [%d x %d]", batch_size, num_classes,
backprop_dims[0], backprop_dims[1]);
return KERNEL_STATUS_PARAM_INVALID;
}
return KERNEL_STATUS_OK;
}
template <typename data_type, typename label_type>
inline uint32_t SparseSoftmaxCrossEntropyWithLogitsCompute(const CpuKernelContext &ctx) {
size_t features_total = static_cast<size_t>(ctx.Input(0)->NumElements());
uint64_t total_size = ctx.Input(0)->GetDataSize();
uint32_t cores = aicpu::CpuKernelUtils::GetCPUNum(ctx);
auto *input_features = static_cast<data_type *>(ctx.Input(0)->GetData());
auto *input_labels = static_cast<label_type *>(ctx.Input(1)->GetData());
auto *output_loss = static_cast<data_type *>(ctx.Output(0)->GetData());
auto *output_backprop = static_cast<data_type *>(ctx.Output(1)->GetData());
bool multi_core_flag = false;
if (total_size > kParallelDataNum * sizeof(data_type)) {
multi_core_flag = true;
}
std::vector<std::int64_t> dims = ctx.Input(0)->GetTensorShape()->GetDimSizes();
std::vector<std::int64_t> labels_dims = ctx.Input(1)->GetTensorShape()->GetDimSizes();
for (int64_t idx = 0; idx < labels_dims[0]; idx++) {
if (input_labels[idx] >= dims[1]) {
KERNEL_LOG_ERROR(
"Received a label value of [%d] which is outside the valid range of "
"[0, %d).",
input_labels[idx], dims[1]);
return KERNEL_STATUS_PARAM_INVALID;
}
}
// Determine whether to enable multi-core parallel computing
size_t pivot, classes_num;
int64_t batch_size{1};
pivot = dims.size() - 1;
classes_num = dims[pivot];
for (size_t index = 0; index < dims.size(); index++) {
if (index < pivot) {
batch_size *= dims[index];
}
}
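// batch_size is the product of all leading dimensions; the trailing dimension holds the class count.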
// Eigen::Array
if (multi_core_flag) {
std::int64_t per_unit_size{batch_size / std::min(std::max(1L, cores - 2L), batch_size)};
auto shard = [&](size_t begin, size_t end) {
SparseSoftmaxCrossEntropyWithLogitsMultiOp(input_features, input_labels, output_loss, output_backprop, begin, end,
classes_num, features_total);
};
CpuKernelUtils::ParallelFor(ctx, batch_size, per_unit_size, shard);
} else if (cores != 0) {
SparseSoftmaxCrossEntropyWithLogitsSingleOp<data_type, label_type>(
input_features, input_labels, output_loss, output_backprop, batch_size, classes_num, features_total);
} else {
KERNEL_LOG_ERROR("SparseSoftmaxCrossEntropyWithLogits compute failed.");
return KERNEL_STATUS_INNER_ERROR;
}
return KERNEL_STATUS_OK;
}
uint32_t SparseSoftmaxCrossEntropyWithLogitsCpuKernel::Compute(CpuKernelContext &ctx) {
if (NormalCheck(ctx, kInputNum, kOutputNum) == KERNEL_STATUS_PARAM_INVALID) {
return KERNEL_STATUS_PARAM_INVALID;
}
if (SparseSoftmaxCrossEntropyWithLogitsExtraCheck(ctx) == KERNEL_STATUS_PARAM_INVALID) {
return KERNEL_STATUS_PARAM_INVALID;
}
// choose compute function depend on dataType
auto data_type = static_cast<DataType>(ctx.Input(0)->GetDataType());
auto labels_type = static_cast<DataType>(ctx.Input(1)->GetDataType());
switch (data_type) {
case DT_FLOAT16: {
if (labels_type == DT_INT32) {
return SparseSoftmaxCrossEntropyWithLogitsCompute<Eigen::half, std::int32_t>(ctx);
} else if (labels_type == DT_INT64) {
return SparseSoftmaxCrossEntropyWithLogitsCompute<Eigen::half, std::int64_t>(ctx);
}
}
case DT_FLOAT: {
if (labels_type == DT_INT32) {
return SparseSoftmaxCrossEntropyWithLogitsCompute<std::float_t, std::int32_t>(ctx);
} else if (labels_type == DT_INT64) {
return SparseSoftmaxCrossEntropyWithLogitsCompute<std::float_t, std::int64_t>(ctx);
}
}
case DT_DOUBLE: {
if (labels_type == DT_INT32) {
return SparseSoftmaxCrossEntropyWithLogitsCompute<std::double_t, std::int32_t>(ctx);
} else if (labels_type == DT_INT64) {
return SparseSoftmaxCrossEntropyWithLogitsCompute<std::double_t, std::int64_t>(ctx);
}
}
default:
KERNEL_LOG_ERROR("[%s] Data type of input is not support, input data type is [%s].", ctx.GetOpType().c_str(),
DTypeStr(data_type).c_str());
return KERNEL_STATUS_PARAM_INVALID;
}
}
REGISTER_CPU_KERNEL(kSparseSoftmaxCrossEntropyWithLogits, SparseSoftmaxCrossEntropyWithLogitsCpuKernel);
} // namespace aicpu

View File

@ -0,0 +1,27 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef AICPU_KERNELS_NORMALIZED_SPARSESOFTMAXENTROPYWITHLOGITS_H_
#define AICPU_KERNELS_NORMALIZED_SPARSESOFTMAXENTROPYWITHLOGITS_H_
#include "cpu_ops_kernel.h"
namespace aicpu {
class SparseSoftmaxCrossEntropyWithLogitsCpuKernel final : public CpuKernel {
virtual std::uint32_t Compute(CpuKernelContext &ctx) override final;
};
} // namespace aicpu
#endif

View File

@ -0,0 +1,241 @@
/**
* Copyright (c) Huawei Technologies Co., Ltd. 2022. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "sparse_sparse_maximum.h"
#include <unsupported/Eigen/CXX11/Tensor>
#include "utils/eigen_tensor.h"
#include "utils/kernel_util.h"
namespace {
const char *kSparseSparseMaximum = "SparseSparseMaximum";
const uint32_t kOutputNum = 2;
const uint32_t kInputNum = 6;
constexpr int64_t kIndex0 = 0;
constexpr int64_t kIndex1 = 1;
constexpr int64_t kIndex2 = 2;
constexpr int64_t kIndex3 = 3;
constexpr int64_t kIndex4 = 4;
constexpr int64_t kIndex5 = 5;
bool isMatrix(const std::shared_ptr<aicpu::TensorShape> shape) { return shape->GetDims() == 2; }
bool isVector(const std::shared_ptr<aicpu::TensorShape> shape) { return shape->GetDims() == 1; }
} // namespace
// Define namespace aicpu
namespace aicpu {
uint32_t SparseMaximumCpuKernel::NullptrAndMatVecCheck(CpuKernelContext &ctx, DataBank &databank) {
databank.a_indices_t = ctx.Input(kIndex0);
databank.a_values_t = ctx.Input(kIndex1);
databank.a_shape_t = ctx.Input(kIndex2);
databank.b_indices_t = ctx.Input(kIndex3);
databank.b_values_t = ctx.Input(kIndex4);
databank.b_shape_t = ctx.Input(kIndex5);
databank.output_indices_t = ctx.Output(kIndex0);
databank.output_values_t = ctx.Output(kIndex1);
KERNEL_CHECK_FALSE(
isMatrix(databank.a_indices_t->GetTensorShape()) && isMatrix(databank.b_indices_t->GetTensorShape()),
KERNEL_STATUS_PARAM_INVALID,
"Inputs a_indices and b_indices should be "
"matrices but received shapes: [%d], [%d]",
databank.a_indices_t->GetTensorShape()->GetDims(), databank.b_indices_t->GetTensorShape()->GetDims());
KERNEL_CHECK_FALSE(isVector(databank.a_values_t->GetTensorShape()) && isVector(databank.b_values_t->GetTensorShape()),
KERNEL_STATUS_PARAM_INVALID,
"Inputs a_values and b_values should be vectors "
"but received shapes: [%d] and [%d]",
databank.a_values_t->GetTensorShape()->GetDims(),
databank.b_values_t->GetTensorShape()->GetDims());
KERNEL_CHECK_FALSE(isVector(databank.a_shape_t->GetTensorShape()) && isVector(databank.b_shape_t->GetTensorShape()),
KERNEL_STATUS_PARAM_INVALID, "Input shapes should be a vector but received shapes [%d] and [%d]",
databank.a_shape_t->GetTensorShape()->GetDims(), databank.b_shape_t->GetTensorShape()->GetDims());
return KERNEL_STATUS_OK;
}
inline static int64_t cmp(const TTypes<int64_t>::Matrix &a_idx, const TTypes<int64_t>::Matrix &b_idx,
const int64_t a_row, const int64_t b_row, const int64_t dims) {
for (int d = 0; d < dims; ++d) {
const int64_t a = a_idx(a_row, d);
const int64_t b = b_idx(b_row, d);
if (a < b) {
return -1;
} else if (a > b) {
return 1;
}
}
return 0;
}
template <typename T>
void SparseMaximumCpuKernel::UnionSparseIndicesAndValues(typename TTypes<int64_t>::Matrix a_indices_mat,
typename TTypes<T>::Flat a_values, int64_t a_nnz,
typename TTypes<int64_t>::Matrix b_indices_mat,
typename TTypes<T>::Flat b_values, int64_t b_nnz,
int64_t num_dims, std::vector<T> *a_augmented_values,
std::vector<T> *b_augmented_values,
std::vector<std::pair<bool, int64_t>> *entries_to_copy) {
entries_to_copy->reserve(a_nnz + b_nnz);
a_augmented_values->reserve(a_nnz);
b_augmented_values->reserve(b_nnz);
int64_t i = 0, j = 0;
const T kZero = T(0);
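// Merge the two lexicographically sorted index lists; an entry present in only one operand contributes an implicit zero for the other.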
while (i < a_nnz && j < b_nnz) {
switch (cmp(a_indices_mat, b_indices_mat, i, j, num_dims)) {
case -1:
entries_to_copy->emplace_back(true, i);
a_augmented_values->push_back(a_values(i));
b_augmented_values->push_back(kZero);
++i;
break;
case 0:
entries_to_copy->emplace_back(true, i);
a_augmented_values->push_back(a_values(i));
b_augmented_values->push_back(b_values(j));
++i;
++j;
break;
case 1:
entries_to_copy->emplace_back(false, j);
a_augmented_values->push_back(kZero);
b_augmented_values->push_back(b_values(j));
++j;
break;
}
}
// Handles leftovers; at most one loop runs.
while (i < a_nnz) {
entries_to_copy->emplace_back(true, i);
a_augmented_values->push_back(a_values(i++));
b_augmented_values->push_back(kZero);
}
while (j < b_nnz) {
entries_to_copy->emplace_back(false, j);
a_augmented_values->push_back(kZero);
b_augmented_values->push_back(b_values(j++));
}
}
template <typename T>
uint32_t SparseMaximumCpuKernel::EigenedSparseMax(DataBank &databank) {
const int64_t a_nnz = databank.a_indices_t->GetTensorShape()->GetDimSize(0);
const int64_t b_nnz = databank.b_indices_t->GetTensorShape()->GetDimSize(0);
EigenTensor a_values_t(databank.a_values_t, databank.a_values_t->GetData());
const auto a_values = a_values_t.vec<T>();
EigenTensor b_values_t(databank.b_values_t, databank.b_values_t->GetData());
const auto b_values = b_values_t.vec<T>();
EigenTensor a_indices_t(databank.a_indices_t, databank.a_indices_t->GetData());
const auto a_indices_mat = a_indices_t.matrix<int64_t>();
EigenTensor b_indices_t(databank.b_indices_t, databank.b_indices_t->GetData());
const auto b_indices_mat = b_indices_t.matrix<int64_t>();
const int64_t num_dims = databank.a_indices_t->GetTensorShape()->GetDimSize(1);
EigenTensor a_shape_t(databank.a_shape_t, databank.a_shape_t->GetData());
const auto a_shape = a_shape_t.flat<int64_t>();
EigenTensor b_shape_t(databank.b_shape_t, databank.b_shape_t->GetData());
const auto b_shape = b_shape_t.flat<int64_t>();
KERNEL_CHECK_FALSE(a_values.size() == a_nnz && b_values.size() == b_nnz, KERNEL_STATUS_PARAM_INVALID,
"Expected [%d] and [%d] non-empty input values, got [%d] and [%d]", a_nnz, b_nnz, a_values.size(),
b_values.size());
KERNEL_CHECK_FALSE(databank.a_shape_t->GetTensorShape()->NumElements() == num_dims, KERNEL_STATUS_PARAM_INVALID,
"Second dimension of a_indices and length of "
"a_shape must match, got [%d] and [%d]",
databank.a_shape_t->GetTensorShape()->NumElements(), num_dims);
KERNEL_CHECK_FALSE(num_dims > 0, KERNEL_STATUS_PARAM_INVALID, "Tensors must not be empty");
KERNEL_CHECK_FALSE(
databank.a_shape_t->GetTensorShape()->NumElements() == databank.b_shape_t->GetTensorShape()->NumElements(),
KERNEL_STATUS_PARAM_INVALID, "Operands do not have the same ranks; got shapes: [%d] and [%d]",
databank.a_shape_t->GetTensorShape()->NumElements(), databank.b_shape_t->GetTensorShape()->NumElements());
for (int i = 0; i < num_dims; ++i) {
KERNEL_CHECK_FALSE(a_shape(i) == b_shape(i), KERNEL_STATUS_PARAM_INVALID,
"Operands' shapes do not match: got [%d] and [%d] for dimension [%d]", a_shape(i), b_shape(i), i)
}
std::vector<T> a_augmented_values, b_augmented_values;
std::vector<std::pair<bool, int64_t>> entries_to_copy; // from_a?, idx
UnionSparseIndicesAndValues(a_indices_mat, a_values, a_nnz, b_indices_mat, b_values, b_nnz, num_dims,
&a_augmented_values, &b_augmented_values, &entries_to_copy);
const int64_t sum_nnz = a_augmented_values.size();
EigenTensor output_values_t(databank.output_values_t, databank.output_values_t->GetData());
EigenTensor output_indices_t(databank.output_indices_t, databank.output_indices_t->GetData());
auto output_indices_mat = output_indices_t.matrix<int64_t>();
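// For each merged entry, copy the index row from whichever operand originally supplied it.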
for (int64_t i = 0; i < sum_nnz; ++i) {
const bool from_a = entries_to_copy[i].first;
const int64_t idx = entries_to_copy[i].second;
output_indices_mat.chip<0>(i) = from_a ? a_indices_mat.chip<0>(idx) : b_indices_mat.chip<0>(idx);
}
using UnalignedTensorMap = Eigen::TensorMap<Eigen::Tensor<const T, 1, Eigen::RowMajor>, Eigen::Unaligned>;
auto a_augmented_values_t = UnalignedTensorMap(a_augmented_values.data(), sum_nnz);
auto b_augmented_values_t = UnalignedTensorMap(b_augmented_values.data(), sum_nnz);
output_values_t.flat<T>() =
a_augmented_values_t.binaryExpr(b_augmented_values_t, Eigen::internal::scalar_max_op<T, T>());
databank.output_indices_t->GetTensorShape()->SetDimSizes({sum_nnz, num_dims});
databank.output_values_t->GetTensorShape()->SetDimSizes({sum_nnz});
return KERNEL_STATUS_OK;
}
uint32_t SparseMaximumCpuKernel::Compute(CpuKernelContext &ctx) {
KERNEL_HANDLE_ERROR(NormalCheck(ctx, kInputNum, kOutputNum),
"SparseSparseMaximum check input and output number failed.");
DataBank databank;
KERNEL_HANDLE_ERROR(NullptrAndMatVecCheck(ctx, databank), "SparseSparseMaximum check params failed.");
DataType dt = static_cast<DataType>(databank.output_values_t->GetDataType());
uint32_t KERNEL_STATUS;
switch (dt) {
case DT_INT8:
KERNEL_STATUS = EigenedSparseMax<int8_t>(databank);
break;
case DT_UINT8:
KERNEL_STATUS = EigenedSparseMax<uint8_t>(databank);
break;
case DT_INT16:
KERNEL_STATUS = EigenedSparseMax<int16_t>(databank);
break;
case DT_UINT16:
KERNEL_STATUS = EigenedSparseMax<uint16_t>(databank);
break;
case DT_INT32:
KERNEL_STATUS = EigenedSparseMax<int32_t>(databank);
break;
case DT_INT64:
KERNEL_STATUS = EigenedSparseMax<int64_t>(databank);
break;
case DT_FLOAT16:
KERNEL_STATUS = EigenedSparseMax<Eigen::half>(databank);
break;
case DT_FLOAT:
KERNEL_STATUS = EigenedSparseMax<float>(databank);
break;
case DT_DOUBLE:
KERNEL_STATUS = EigenedSparseMax<double>(databank);
break;
default:
KERNEL_LOG_ERROR("SparseSparseMaximum can't support this data type [%d].", dt);
return KERNEL_STATUS_PARAM_INVALID;
}
if (KERNEL_STATUS != KERNEL_STATUS_OK) {
KERNEL_LOG_ERROR("SparseSparseMaximum failed.");
return KERNEL_STATUS_PARAM_INVALID;
}
return KERNEL_STATUS_OK;
}
// Register the kernel implementation
REGISTER_CPU_KERNEL(kSparseSparseMaximum, SparseMaximumCpuKernel);
} // namespace aicpu

View File

@ -0,0 +1,59 @@
/**
* Copyright (c) Huawei Technologies Co., Ltd. 2022. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "cpu_ops_kernel.h"
#include "utils/eigen_tensor.h"
namespace aicpu {
struct DataBank {
DataBank()
: a_indices_t(nullptr),
a_values_t(nullptr),
a_shape_t(nullptr),
b_indices_t(nullptr),
b_values_t(nullptr),
b_shape_t(nullptr) {}
Tensor *a_indices_t;
Tensor *a_values_t;
Tensor *a_shape_t;
Tensor *b_indices_t;
Tensor *b_values_t;
Tensor *b_shape_t;
Tensor *output_indices_t;
Tensor *output_values_t;
};
class SparseMaximumCpuKernel : public CpuKernel {
public:
~SparseMaximumCpuKernel() = default;
SparseMaximumCpuKernel() = default;
uint32_t Compute(CpuKernelContext &ctx) override;
private:
template <typename T>
static void UnionSparseIndicesAndValues(typename TTypes<int64_t>::Matrix a_indices_mat,
typename TTypes<T>::Flat a_values, int64_t a_nnz,
typename TTypes<int64_t>::Matrix b_indices_mat,
typename TTypes<T>::Flat b_values, int64_t b_nnz, int64_t num_dims,
std::vector<T> *a_augmented_values, std::vector<T> *b_augmented_values,
std::vector<std::pair<bool, int64_t>> *entries_to_copy);
template <typename T>
uint32_t EigenedSparseMax(DataBank &databank);
static uint32_t NullptrAndMatVecCheck(CpuKernelContext &ctx, DataBank &calc_info);
};
} // namespace aicpu

View File

@ -0,0 +1,207 @@
/**
* Copyright (c) Huawei Technologies Co., Ltd. 2022-2022. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "sparse_sparse_minimum.h"
#include <algorithm>
#include "cpu_kernel_utils.h"
#include "utils/kernel_util.h"
namespace {
const uint32_t kOutputNum = 2;
const uint32_t kInputNum = 6;
const char *kSparseSparseMinimum = "SparseSparseMinimum";
#define SPARSE_SPARSE_MINIMUM_COMPUTE_CASE(DTYPE, TYPE, CTX) \
case (DTYPE): { \
uint32_t result = SparseSparseMinimumCompute<TYPE>(CTX); \
if (result != KERNEL_STATUS_OK) { \
KERNEL_LOG_ERROR("SparseSparseMinimum kernel compute failed."); \
return result; \
} \
break; \
}
} // namespace
namespace aicpu {
uint32_t SparseSparseMinimumCpuKernel::Compute(CpuKernelContext &ctx) {
KERNEL_HANDLE_ERROR(NormalCheck(ctx, kInputNum, kOutputNum), "SparseSparseMinimum normal check failed.");
const Tensor *x1_indices = ctx.Input(0);
const Tensor *x1_values_t = ctx.Input(1);
const Tensor *x1_shape = ctx.Input(2);
const Tensor *x2_indices = ctx.Input(3);
const Tensor *x2_values_t = ctx.Input(4);
const Tensor *x2_shape = ctx.Input(5);
auto x1_indices_shape = x1_indices->GetTensorShape();
auto x2_indices_shape = x2_indices->GetTensorShape();
KERNEL_CHECK_FALSE(((x1_indices_shape->GetDims() == 2) && (x2_indices_shape->GetDims() == 2)),
KERNEL_STATUS_PARAM_INVALID, "Input indices should be matrices but received dims: %d and %d.",
x1_indices_shape->GetDims(), x2_indices_shape->GetDims())
const int64_t x1_nnz = x1_indices_shape->GetDimSize(0);
const int64_t x2_nnz = x2_indices_shape->GetDimSize(0);
auto x1_values_shape = x1_values_t->GetTensorShape();
auto x2_values_shape = x2_values_t->GetTensorShape();
KERNEL_CHECK_FALSE(((x1_values_shape->GetDims() == 1) && (x2_values_shape->GetDims() == 1)),
KERNEL_STATUS_PARAM_INVALID, "Input values should be vectors but received dims: %d and %d.",
x1_values_shape->GetDims(), x2_values_shape->GetDims())
KERNEL_CHECK_FALSE(((x1_values_t->NumElements() == x1_nnz) && (x2_values_t->NumElements() == x2_nnz)),
KERNEL_STATUS_PARAM_INVALID,
"Expected %d and %d non-empty input values, but received : %d and %d.", x1_nnz, x2_nnz,
x1_values_t->NumElements(), x2_values_t->NumElements())
KERNEL_CHECK_FALSE((x1_values_t->GetDataType() == x2_values_t->GetDataType()), KERNEL_STATUS_PARAM_INVALID,
"Data types of the input values should be the same, but "
"received %d-th and %d-th data type in the DataType enum.",
x1_values_t->GetDataType(), x2_values_t->GetDataType())
auto x1_shape_shape = x1_shape->GetTensorShape();
auto x2_shape_shape = x2_shape->GetTensorShape();
KERNEL_CHECK_FALSE(((x1_shape_shape->GetDims() == 1) && (x2_shape_shape->GetDims() == 1)),
KERNEL_STATUS_PARAM_INVALID, "Input shapes should be vectors but received dims: %d and %d.",
x1_shape_shape->GetDims(), x2_shape_shape->GetDims())
KERNEL_CHECK_FALSE((x1_shape_shape->GetDimSize(0) == x2_shape_shape->GetDimSize(0)), KERNEL_STATUS_PARAM_INVALID,
"Operands' should have the same ranks but received: %d and %d.", x1_shape_shape->GetDimSize(0),
x2_shape_shape->GetDimSize(0))
auto shape_x1 = reinterpret_cast<int64_t *>(x1_shape->GetData());
auto shape_x2 = reinterpret_cast<int64_t *>(x2_shape->GetData());
for (int i = 0; i < x1_shape->NumElements(); ++i) {
KERNEL_CHECK_FALSE(shape_x1[i] == shape_x2[i], KERNEL_STATUS_PARAM_INVALID,
"Operands' shapes do not match: got %d and %d for dimension %d", shape_x1[i], shape_x2[i], i)
}
auto data_type = ctx.Input(1)->GetDataType();
switch (data_type) {
SPARSE_SPARSE_MINIMUM_COMPUTE_CASE(DT_UINT8, uint8_t, ctx)
SPARSE_SPARSE_MINIMUM_COMPUTE_CASE(DT_UINT16, uint16_t, ctx)
SPARSE_SPARSE_MINIMUM_COMPUTE_CASE(DT_INT8, int8_t, ctx)
SPARSE_SPARSE_MINIMUM_COMPUTE_CASE(DT_INT16, int16_t, ctx)
SPARSE_SPARSE_MINIMUM_COMPUTE_CASE(DT_INT32, int32_t, ctx)
SPARSE_SPARSE_MINIMUM_COMPUTE_CASE(DT_INT64, int64_t, ctx)
SPARSE_SPARSE_MINIMUM_COMPUTE_CASE(DT_FLOAT16, Eigen::half, ctx)
SPARSE_SPARSE_MINIMUM_COMPUTE_CASE(DT_FLOAT, float, ctx)
SPARSE_SPARSE_MINIMUM_COMPUTE_CASE(DT_DOUBLE, double, ctx)
default:
KERNEL_LOG_ERROR("SparseSparseMinimum kernel data type [%s] not support.", DTypeStr(data_type).c_str());
return KERNEL_STATUS_PARAM_INVALID;
}
return KERNEL_STATUS_OK;
}
int SparseSparseMinimumCpuKernel::cmp(const TTypes<int64_t>::ConstMatrix &x_idx, const int64_t x_row, const int dims,
const TTypes<int64_t>::ConstMatrix &y_idx, const int64_t y_row) {
for (int d = 0; d < dims; ++d) {
const int64_t x = x_idx(x_row, d);
const int64_t y = y_idx(y_row, d);
if (x < y) {
return -1;
} else if (x > y) {
return 1;
}
}
return 0;
}
template <typename T>
uint32_t SparseSparseMinimumCpuKernel::SparseSparseMinimumCompute(CpuKernelContext &ctx) {
const EigenTensor x1_indices_ET(ctx.Input(0), ctx.Input(0)->GetData());
const EigenTensor x2_indices_ET(ctx.Input(3), ctx.Input(3)->GetData());
auto x1_indices_mat = x1_indices_ET.matrix<int64_t>();
auto x2_indices_mat = x2_indices_ET.matrix<int64_t>();
const int64_t x1_nnz = x1_indices_mat.dimension(0);
const int64_t x2_nnz = x2_indices_mat.dimension(0);
std::vector<std::pair<bool, int64_t>> entries_to_copy;
entries_to_copy.reserve(x1_nnz + x2_nnz);
std::vector<T> out_values;
const int num_dims = ctx.Input(2)->GetTensorShape()->GetDimSize(0);
EigenTensor x1_values_ET(ctx.Input(1), ctx.Input(1)->GetData());
EigenTensor x2_values_ET(ctx.Input(4), ctx.Input(4)->GetData());
auto x1_values = x1_values_ET.vec<T>();
auto x2_values = x2_values_ET.vec<T>();
int64_t i = 0, j = 0;
T s;
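// Merge the two sorted index lists; a coordinate present in only one input is compared against an implicit zero.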
while (i < x1_nnz && j < x2_nnz) {
switch (cmp(x1_indices_mat, i, num_dims, x2_indices_mat, j)) {
case -1:
s = std::min(x1_values(i), T(0));
entries_to_copy.emplace_back(true, i);
out_values.push_back(s);
++i;
break;
case 0:
s = std::min(x1_values(i), x2_values(j));
entries_to_copy.emplace_back(true, i);
out_values.push_back(s);
++i;
++j;
break;
case 1:
s = std::min(T(0), x2_values(j));
entries_to_copy.emplace_back(false, j);
out_values.push_back(s);
++j;
break;
default:
KERNEL_LOG_ERROR("Some inner error happens in the SparseSparseMinimum computation.");
return KERNEL_STATUS_INNER_ERROR;
}
}
#define HANDLE_LEFTOVERS(X1_OR_X2, IDX, IS_A) \
while ((IDX) < X1_OR_X2##_nnz) { \
entries_to_copy.emplace_back(IS_A, IDX); \
s = std::min((X1_OR_X2##_values)((IDX)), T(0)); \
out_values.push_back(s); \
++(IDX); \
}
HANDLE_LEFTOVERS(x1, i, true);
HANDLE_LEFTOVERS(x2, j, false);
#undef HANDLE_LEFTOVERS
const int64_t y_nnz = out_values.size();
Tensor *out_indices_t = ctx.Output(0);
EigenTensor out_indices_ET(out_indices_t, out_indices_t->GetData());
auto out_indices_mat = out_indices_ET.matrix<int64_t>();
for (int64_t i = 0; i < y_nnz; ++i) {
const bool from_x1 = entries_to_copy[i].first;
const int64_t idx = entries_to_copy[i].second;
out_indices_mat.chip<0>(i) = from_x1 ? x1_indices_mat.chip<0>(idx) : x2_indices_mat.chip<0>(idx);
}
std::vector<int64_t> indices_dims = {y_nnz, num_dims};
auto out_indices_shape = out_indices_t->GetTensorShape();
out_indices_shape->SetDimSizes(indices_dims);
out_indices_t->SetTensorShape(out_indices_shape.get());
Tensor *out_values_t = ctx.Output(1);
EigenTensor out_values_ET(out_values_t, out_values_t->GetData());
auto out_values_flat = out_values_ET.vec<T>();
if (y_nnz > 0) {
std::copy_n(out_values.begin(), y_nnz, &out_values_flat(0));
}
std::vector<int64_t> values_dims = {y_nnz};
auto out_values_shape = out_values_t->GetTensorShape();
out_values_shape->SetDimSizes(values_dims);
out_values_t->SetTensorShape(out_values_shape.get());
return KERNEL_STATUS_OK;
}
REGISTER_CPU_KERNEL(kSparseSparseMinimum, SparseSparseMinimumCpuKernel);
} // namespace aicpu

View File

@ -0,0 +1,41 @@
/**
* Copyright (c) Huawei Technologies Co., Ltd. 2022-2022. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef AICPU_KERNELS_NORMALIZED_SPARSE_SPARSE_MINIMUM_H_
#define AICPU_KERNELS_NORMALIZED_SPARSE_SPARSE_MINIMUM_H_
#include "cpu_ops_kernel.h"
#include "eigen_tensor.h"
namespace aicpu {
class SparseSparseMinimumCpuKernel : public CpuKernel {
public:
SparseSparseMinimumCpuKernel() = default;
~SparseSparseMinimumCpuKernel() override = default;
protected:
uint32_t Compute(CpuKernelContext &ctx) override;
private:
template <typename T>
static uint32_t SparseSparseMinimumCompute(CpuKernelContext &ctx);
static int cmp(const TTypes<int64_t>::ConstMatrix &x_idx, const int64_t x_row, const int dims,
const TTypes<int64_t>::ConstMatrix &y_idx, const int64_t y_row);
};
} // namespace aicpu
#endif

View File

@ -0,0 +1,301 @@
/**
* Copyright 2022 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "sparseaddmm.h"
#include <securec.h>
#include "cpu_kernel_utils.h"
#include "cpu_types.h"
#include "kernel_log.h"
#include "status.h"
#include "unsupported/Eigen/CXX11/Tensor"
#include "utils/eigen_tensor.h"
#include "utils/kernel_util.h"
namespace {
const uint32_t kOutputNum = 1;
const uint32_t kInputNum = 7;
const char *kSparseAddmm = "SparseAddmm";
constexpr int64_t kParallelDataNums = 16;
#define SPARSEADDMM_COMPUTE_CASE(DTYPE, TYPE, CTX) \
case (DTYPE): { \
if (indices_type == DT_INT64) { \
uint32_t result = SparseAddmmCompute<TYPE, int64_t>(CTX); \
if (result != KERNEL_STATUS_OK) { \
KERNEL_LOG_ERROR("SparseAddmm kernel compute failed."); \
return result; \
} \
break; \
} else { \
uint32_t result = SparseAddmmCompute<TYPE, int32_t>(CTX); \
if (result != KERNEL_STATUS_OK) { \
KERNEL_LOG_ERROR("SparseAddmm kernel compute failed."); \
return result; \
} \
break; \
} \
}
} // namespace
namespace aicpu {
uint32_t SparseAddmmCpuKernel::Compute(CpuKernelContext &ctx) {
KERNEL_HANDLE_ERROR(NormalCheck(ctx, kInputNum, kOutputNum), "[%s] check input and output failed.", kSparseAddmm);
KERNEL_HANDLE_ERROR(SparseAddmmCheck(ctx), "[%s] check params failed.", kSparseAddmm);
DataType data_type = ctx.Input(1)->GetDataType();
DataType data_type1 = ctx.Input(3)->GetDataType();
DataType indices_type = ctx.Input(0)->GetDataType();
if (data_type != data_type1) {
KERNEL_LOG_ERROR(
"sparse data type is no equal dense data type, sparsetype [%d], "
"densetype [%d].",
data_type, data_type1);
return KERNEL_STATUS_PARAM_INVALID;
}
switch (data_type) {
SPARSEADDMM_COMPUTE_CASE(DT_FLOAT16, Eigen::half, ctx)
SPARSEADDMM_COMPUTE_CASE(DT_FLOAT, float, ctx)
SPARSEADDMM_COMPUTE_CASE(DT_DOUBLE, double, ctx)
SPARSEADDMM_COMPUTE_CASE(DT_INT8, int8_t, ctx)
SPARSEADDMM_COMPUTE_CASE(DT_INT16, int16_t, ctx)
SPARSEADDMM_COMPUTE_CASE(DT_INT32, int32_t, ctx)
SPARSEADDMM_COMPUTE_CASE(DT_INT64, int64_t, ctx)
SPARSEADDMM_COMPUTE_CASE(DT_UINT8, uint8_t, ctx)
SPARSEADDMM_COMPUTE_CASE(DT_UINT16, uint16_t, ctx)
SPARSEADDMM_COMPUTE_CASE(DT_UINT32, uint32_t, ctx)
SPARSEADDMM_COMPUTE_CASE(DT_UINT64, uint64_t, ctx)
SPARSEADDMM_COMPUTE_CASE(DT_COMPLEX64, std::complex<float>, ctx)
SPARSEADDMM_COMPUTE_CASE(DT_COMPLEX128, std::complex<double>, ctx)
default:
KERNEL_LOG_ERROR("SparseAddmm kernel data type [%s] not support.", DTypeStr(data_type).c_str());
return KERNEL_STATUS_PARAM_INVALID;
}
return KERNEL_STATUS_OK;
}
uint32_t SparseAddmmCpuKernel::SparseAddmmCheck(CpuKernelContext &ctx) {
Tensor *indices_tensor = ctx.Input(0);
Tensor *values_tensor = ctx.Input(1);
Tensor *shape_tensor = ctx.Input(2);
Tensor *dense_tensor = ctx.Input(3);
Tensor *alpha_tensor = ctx.Input(5);
Tensor *beta_tensor = ctx.Input(6);
if (alpha_tensor->GetTensorShape()->NumElements() != 1) {
KERNEL_LOG_ERROR(
"alpha_tensor should be a number,but got NumElements "
"[%d].",
alpha_tensor->GetTensorShape()->NumElements());
return KERNEL_STATUS_PARAM_INVALID;
}
if (beta_tensor->GetTensorShape()->NumElements() != 1) {
KERNEL_LOG_ERROR(
"beta_tensor should be a number,but got NumElements "
"[%d].",
beta_tensor->GetTensorShape()->NumElements());
return KERNEL_STATUS_PARAM_INVALID;
}
// valid shape nullptr
auto sparse_shape = shape_tensor->GetTensorShape();
auto values_shape = values_tensor->GetTensorShape();
auto dense_tensor_shape = dense_tensor->GetTensorShape();
auto indices_shape = indices_tensor->GetTensorShape();
// sparse_indices
if (indices_shape->GetDims() > 2) {
KERNEL_LOG_ERROR(
"Sparse_indices should be a scalar, vector, or matrix, got dim "
"size [%d].",
indices_shape->GetDims());
return KERNEL_STATUS_PARAM_INVALID;
}
const int64_t elems_num = indices_shape->GetDims() > 0 ? indices_shape->GetDimSize(0) : 1;
const int64_t dims_num = indices_shape->GetDims() > 1 ? indices_shape->GetDimSize(1) : 1;
// output_shape
if (sparse_shape->GetDims() != 1) {
KERNEL_LOG_ERROR("Sparse_shape should be a vector, got dim size [%d].", sparse_shape->GetDims());
return KERNEL_STATUS_PARAM_INVALID;
}
if (shape_tensor->NumElements() != dims_num) {
KERNEL_LOG_ERROR("Sparse_shape has incorrect number of elements [%lld], should be [%lld]",
shape_tensor->NumElements(), dims_num);
return KERNEL_STATUS_PARAM_INVALID;
}
// valid data type
int32_t IndiceType = indices_tensor->GetDataType();
int32_t ShapeType = shape_tensor->GetDataType();
  bool invalidIndiceType = ((IndiceType != DT_INT32) && (IndiceType != DT_INT64));
  bool invalidShapeType = ((ShapeType != DT_INT32) && (ShapeType != DT_INT64));
  if (invalidShapeType || invalidIndiceType) {
    KERNEL_LOG_ERROR(
      "Indices or sparse shape data type must be DT_INT32 or DT_INT64, got indice type [%d], "
      "shape type [%d].",
      IndiceType, ShapeType);
return KERNEL_STATUS_PARAM_INVALID;
}
// sparse_values
int32_t values_dims_size = values_shape->GetDims();
if ((values_dims_size != 0) && (values_dims_size != 1)) {
KERNEL_LOG_ERROR("Values_shape should be a scalar or a vector, got dim size [%d].", values_shape->GetDims());
return KERNEL_STATUS_PARAM_INVALID;
}
if ((values_dims_size == 1) && (values_tensor->NumElements() != elems_num)) {
KERNEL_LOG_ERROR("Values_shape has incorrect number of elements [%lld], should be [%lld]",
values_tensor->NumElements(), elems_num);
return KERNEL_STATUS_PARAM_INVALID;
}
return KERNEL_STATUS_OK;
}
template <typename T, typename T1>
uint32_t SparseAddmmCpuKernel::SparseAddmmCompute(CpuKernelContext &ctx) {
auto *indices_tensor = ctx.Input(0);
auto *values_tensor = ctx.Input(1);
auto *shape_tensor = ctx.Input(2);
auto *dense_tensor = ctx.Input(3);
auto *x3_dense_tensor = ctx.Input(4);
auto *alpha_tensor = ctx.Input(5);
auto *beta_tensor = ctx.Input(6);
auto *output_tensor = ctx.Output(0);
auto values = reinterpret_cast<T *>(values_tensor->GetData());
auto dense_data = reinterpret_cast<T *>(dense_tensor->GetData());
auto x3_dense_data = reinterpret_cast<T *>(x3_dense_tensor->GetData());
auto alpha = reinterpret_cast<T *>(alpha_tensor->GetData());
auto beta = reinterpret_cast<T *>(beta_tensor->GetData());
auto y = reinterpret_cast<T *>(output_tensor->GetData());
std::vector<int64_t> temp_shape;
for (int32_t index = 0; index < shape_tensor->GetTensorShape()->GetDimSize(0); ++index) {
if (shape_tensor->GetDataType() == DT_INT32) {
int32_t *temp_dim = reinterpret_cast<int32_t *>(shape_tensor->GetData());
temp_shape.emplace_back(static_cast<int64_t>(temp_dim[index]));
} else {
int64_t *temp_dim = reinterpret_cast<int64_t *>(shape_tensor->GetData());
temp_shape.emplace_back(temp_dim[index]);
}
}
const int64_t row_x1 = temp_shape[0];
const int64_t col_x1 = temp_shape[1];
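  // Materialize the sparse COO input x1 (indices/values) into a dense Eigen row-major matrix.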
Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor> sparse(row_x1, col_x1);
sparse.setZero(row_x1, col_x1);
std::vector<int64_t> temp_indices;
auto indices_one = indices_tensor->GetTensorShape()->GetDimSize(0);
auto indices_two = indices_tensor->GetTensorShape()->GetDimSize(1);
for (int32_t index = 0; index < indices_one; ++index) {
if (indices_tensor->GetDataType() == DT_INT32) {
int32_t *temp_dim = reinterpret_cast<int32_t *>(indices_tensor->GetData());
temp_indices.emplace_back(static_cast<int64_t>(temp_dim[index * indices_two + 0]));
temp_indices.emplace_back(static_cast<int64_t>(temp_dim[index * indices_two + 1]));
} else {
int64_t *temp_dim = reinterpret_cast<int64_t *>(indices_tensor->GetData());
temp_indices.emplace_back(temp_dim[index * indices_two + 0]);
temp_indices.emplace_back(temp_dim[index * indices_two + 1]);
}
}
if (indices_one <= kParallelDataNums) {
for (int64_t i = 0; i < indices_one; i++) {
int64_t row = temp_indices[i * indices_two + 0];
int64_t col = temp_indices[i * indices_two + 1];
sparse(row, col) = *(values + i);
}
} else {
uint32_t minCoreNum = 1;
int64_t maxCoreNum = std::max(minCoreNum, aicpu::CpuKernelUtils::GetCPUNum(ctx) - kResvCpuNum);
auto shardSparse = [&](size_t start, size_t end) {
for (size_t i = start; i < end; i++) {
int64_t row = temp_indices[i * indices_two + 0];
int64_t col = temp_indices[i * indices_two + 1];
sparse(row, col) = *(values + i);
}
};
CpuKernelUtils::ParallelFor(ctx, indices_one, indices_one / maxCoreNum, shardSparse);
}
std::vector<int64_t> shape_x2 = dense_tensor->GetTensorShape()->GetDimSizes();
const int64_t row_x2 = shape_x2[0];
const int64_t col_x2 = shape_x2[1];
Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor> dense(row_x2, col_x2);
std::vector<int64_t> shape_x3 = x3_dense_tensor->GetTensorShape()->GetDimSizes();
const int64_t row_x3 = shape_x3[0];
const int64_t col_x3 = shape_x3[1];
if (row_x3 != row_x1) {
KERNEL_LOG_ERROR("x1's row is no equal x3's row, cannot do add!");
return KERNEL_STATUS_PARAM_INVALID;
}
if (col_x3 != col_x2) {
KERNEL_LOG_ERROR("x2's col is no equal x3's col, cannot do add!");
return KERNEL_STATUS_PARAM_INVALID;
}
if (row_x2 <= kParallelDataNums) {
for (int64_t i = 0; i < row_x2; i++) {
for (int64_t j = 0; j < col_x2; j++) {
dense(i, j) = *(dense_data + i * col_x2 + j);
}
}
} else {
uint32_t minCoreNum = 1;
int64_t maxCoreNum = std::max(minCoreNum, aicpu::CpuKernelUtils::GetCPUNum(ctx) - kResvCpuNum);
auto shardDense = [&](size_t start, size_t end) {
for (size_t i = start; i < end; i++) {
for (int64_t j = 0; j < col_x2; j++) {
dense(i, j) = *(dense_data + i * col_x2 + j);
}
}
};
CpuKernelUtils::ParallelFor(ctx, row_x2, row_x2 / maxCoreNum, shardDense);
}
if (col_x1 != row_x2) {
KERNEL_LOG_ERROR("x1's col is no equal x2's row, cannot do mat mul!");
return KERNEL_STATUS_PARAM_INVALID;
}
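  // Compute y = alpha * (x1 @ x2) + beta * x3, where x1 has been densified above.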
Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor> temp;
temp = sparse * dense;
if (row_x1 <= kParallelDataNums) {
for (int64_t i = 0; i < row_x1; i++) {
for (int64_t j = 0; j < col_x2; j++) {
*(y + i * col_x2 + j) = *(alpha + 0) * temp(i, j);
*(y + i * col_x2 + j) += *(beta + 0) * (*(x3_dense_data + i * col_x2 + j));
}
}
} else {
uint32_t minCoreNum = 1;
int64_t maxCoreNum = std::max(minCoreNum, aicpu::CpuKernelUtils::GetCPUNum(ctx) - kResvCpuNum);
auto shardMatMul = [&](size_t start, size_t end) {
for (size_t i = start; i < end; i++) {
for (int64_t j = 0; j < col_x2; j++) {
*(y + i * col_x2 + j) = *(alpha + 0) * temp(i, j);
*(y + i * col_x2 + j) += *(beta + 0) * (*(x3_dense_data + i * col_x2 + j));
}
}
};
CpuKernelUtils::ParallelFor(ctx, row_x1, row_x1 / maxCoreNum, shardMatMul);
}
return KERNEL_STATUS_OK;
}
REGISTER_CPU_KERNEL(kSparseAddmm, SparseAddmmCpuKernel);
} // namespace aicpu

View File

@ -0,0 +1,38 @@
/**
* Copyright 2022 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef AICPU_KERNELS_NORMALIZED_SPARSEADDMM_H_
#define AICPU_KERNELS_NORMALIZED_SPARSEADDMM_H_
#include "cpu_ops_kernel.h"
#include "utils/sparse_tensor.h"
namespace aicpu {
class SparseAddmmCpuKernel : public CpuKernel {
public:
~SparseAddmmCpuKernel() = default;
protected:
uint32_t Compute(CpuKernelContext &ctx);
uint32_t SparseAddmmCheck(CpuKernelContext &ctx);
template <typename T, typename T1>
uint32_t SparseAddmmCompute(CpuKernelContext &ctx);
};
} // namespace aicpu
#endif

View File

@ -0,0 +1,169 @@
/**
* Copyright (c) Huawei Technologies Co., Ltd. 2022. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "sparsefillemptyrowsgrad.h"
#include <algorithm>
#include <atomic>
#include <mutex>
#include <numeric>
#include <set>
#include <string>
#include <vector>
#include "cpu_kernel_utils.h"
#include "utils/allocator_utils.h"
#include "utils/eigen_tensor.h"
#include "utils/kernel_util.h"
#include "kernel_log.h"
#include "status.h"
namespace {
const char *kSparseFillEmptyRowsGrad = "SparseFillEmptyRowsGrad";
const uint32_t kOutputNum = 2;
const uint32_t kInputNum = 2;
const int64_t kParallelNum{16384};
bool isVector(const std::shared_ptr<aicpu::TensorShape> shape) { return shape->GetDims() == 1; }
} // namespace
namespace aicpu {
template <typename T>
uint32_t SparseFillEmptyRowsGradCpuKernel::ComputeSparseFillEmptyRowsGrad(CpuKernelContext &ctx, DataBank &databank) {
EigenTensor reverse_index_map_e(databank.reverse_index_map, databank.reverse_index_map->GetData());
EigenTensor grad_values_e(databank.grad_values, databank.grad_values->GetData());
EigenTensor y_value_e(databank.y_value, databank.y_value->GetData());
auto reverse_index_map = reverse_index_map_e.vec<int64_t>();
auto grad_values = grad_values_e.vec<T>();
auto y_value = y_value_e.vec<T>();
const int64_t N = databank.reverse_index_map->GetTensorShape()->GetDimSize(0);
const int64_t N_full = databank.grad_values->GetTensorShape()->GetDimSize(0);
std::vector<bool> visited(N_full, false);
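  // d_values[i] = grad_values[reverse_index_map[i]]; rows of grad_values that are never referenced
  // (i.e. rows filled with the default value in the forward pass) accumulate into d_default_value.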
T *y_default_value = reinterpret_cast<T *>(databank.y_default_value->GetData());
*y_default_value = static_cast<T>(0);
if (N <= kParallelNum) {
for (int64_t i = 0; i < N; ++i) {
int64_t reverse_index = reverse_index_map(i);
KERNEL_CHECK_FALSE(0 <= reverse_index && reverse_index < N_full, KERNEL_STATUS_PARAM_INVALID,
"Elements in reverse index must be in [0, [%d]) but got [%d]", N_full, reverse_index)
y_value(i) = grad_values(reverse_index);
visited[reverse_index] = true;
}
} else {
int64_t total = N;
uint32_t cores = CpuKernelUtils::GetCPUNum(ctx);
int64_t per_unit_size = (total / std::min(std::max(1L, cores - 2L), total));
uint32_t ret = CpuKernelUtils::ParallelFor(ctx, total, per_unit_size, [&](int64_t begin, int64_t end) {
for (int64_t i = begin; i < end; ++i) {
int64_t reverse_index = reverse_index_map(i);
KERNEL_CHECK_FALSE_VOID(0 <= reverse_index && reverse_index < N_full,
"Elements in reverse index must be in [0, [%d]) but got [%d]", N_full, reverse_index);
y_value(i) = grad_values(reverse_index);
visited[reverse_index] = true;
}
});
KERNEL_CHECK_FALSE((ret == KERNEL_STATUS_OK), KERNEL_STATUS_INNER_ERROR, "SparseFillEmptyRowsGrad compute failed.");
}
for (int64_t j = 0; j < N_full; ++j) {
if (!visited[j]) {
(*y_default_value) += grad_values(j);
}
}
databank.y_default_value->GetTensorShape()->SetDimSizes({});
databank.y_value->GetTensorShape()->SetDimSizes({N});
return KERNEL_STATUS_OK;
}
uint32_t SparseFillEmptyRowsGradCpuKernel::NullptrAndMatVecCheck(CpuKernelContext &ctx, DataBank &databank) {
databank.reverse_index_map = ctx.Input(0);
databank.grad_values = ctx.Input(1);
databank.y_value = ctx.Output(0);
databank.y_default_value = ctx.Output(1);
KERNEL_CHECK_FALSE(isVector(databank.reverse_index_map->GetTensorShape()), KERNEL_STATUS_PARAM_INVALID,
"Inputs reverse_index_map should be vectors")
KERNEL_CHECK_FALSE(isVector(databank.grad_values->GetTensorShape()), KERNEL_STATUS_PARAM_INVALID,
"Inputs grad_values should be vectors")
return KERNEL_STATUS_OK;
}
uint32_t SparseFillEmptyRowsGradCpuKernel::Compute(CpuKernelContext &ctx) {
KERNEL_HANDLE_ERROR(NormalCheck(ctx, kInputNum, kOutputNum),
"SparseFillEmptyRowsGrad check input and output number failed.");
DataBank databank;
KERNEL_HANDLE_ERROR(NullptrAndMatVecCheck(ctx, databank), "SparseFillEmptyRowsGrad check params failed.");
DataType dt = static_cast<DataType>(databank.y_value->GetDataType());
uint32_t KERNEL_STATUS;
switch (dt) {
case DT_INT8:
KERNEL_STATUS = ComputeSparseFillEmptyRowsGrad<int8_t>(ctx, databank);
break;
case DT_UINT8:
KERNEL_STATUS = ComputeSparseFillEmptyRowsGrad<uint8_t>(ctx, databank);
break;
case DT_INT16:
KERNEL_STATUS = ComputeSparseFillEmptyRowsGrad<int16_t>(ctx, databank);
break;
case DT_UINT16:
KERNEL_STATUS = ComputeSparseFillEmptyRowsGrad<uint16_t>(ctx, databank);
break;
case DT_INT32:
KERNEL_STATUS = ComputeSparseFillEmptyRowsGrad<int32_t>(ctx, databank);
break;
case DT_UINT32:
KERNEL_STATUS = ComputeSparseFillEmptyRowsGrad<uint32_t>(ctx, databank);
break;
case DT_INT64:
KERNEL_STATUS = ComputeSparseFillEmptyRowsGrad<int64_t>(ctx, databank);
break;
case DT_UINT64:
KERNEL_STATUS = ComputeSparseFillEmptyRowsGrad<uint64_t>(ctx, databank);
break;
case DT_BOOL:
KERNEL_STATUS = ComputeSparseFillEmptyRowsGrad<bool>(ctx, databank);
break;
case DT_STRING:
KERNEL_STATUS = ComputeSparseFillEmptyRowsGrad<std::string>(ctx, databank);
break;
case DT_FLOAT16:
KERNEL_STATUS = ComputeSparseFillEmptyRowsGrad<Eigen::half>(ctx, databank);
break;
case DT_FLOAT:
KERNEL_STATUS = ComputeSparseFillEmptyRowsGrad<float>(ctx, databank);
break;
case DT_DOUBLE:
KERNEL_STATUS = ComputeSparseFillEmptyRowsGrad<double>(ctx, databank);
break;
case DT_COMPLEX64:
KERNEL_STATUS = ComputeSparseFillEmptyRowsGrad<std::complex<float>>(ctx, databank);
break;
case DT_COMPLEX128:
KERNEL_STATUS = ComputeSparseFillEmptyRowsGrad<std::complex<double>>(ctx, databank);
break;
default:
KERNEL_LOG_ERROR("SparseFillEmptyRowsGrad can't support this data type [%s].", DTypeStr(dt).c_str());
return KERNEL_STATUS_PARAM_INVALID;
}
if (KERNEL_STATUS != KERNEL_STATUS_OK) {
KERNEL_LOG_ERROR("SparseFillEmptyRowsGrad failed.");
return KERNEL_STATUS;
}
return KERNEL_STATUS_OK;
}
REGISTER_CPU_KERNEL(kSparseFillEmptyRowsGrad, SparseFillEmptyRowsGradCpuKernel);
} // namespace aicpu

View File

@ -0,0 +1,45 @@
/**
* Copyright (c) Huawei Technologies Co., Ltd. 2022. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include <set>
#include "cpu_ops_kernel.h"
#include "utils/sparse_group.h"
#include "utils/sparse_tensor.h"
// Define namespace aicpu
namespace aicpu {
struct DataBank {
DataBank() : reverse_index_map(nullptr), grad_values(nullptr), y_value(nullptr), y_default_value(nullptr) {}
Tensor *reverse_index_map;
Tensor *grad_values;
Tensor *y_value;
Tensor *y_default_value;
};
// The operator class inherits from the CpuKernel base class
class SparseFillEmptyRowsGradCpuKernel : public CpuKernel {
public:
~SparseFillEmptyRowsGradCpuKernel() = default;
SparseFillEmptyRowsGradCpuKernel() = default;
  // Declare the Compute function, which must be overridden
uint32_t Compute(CpuKernelContext &ctx) override;
private:
uint32_t NullptrAndMatVecCheck(CpuKernelContext &ctx, DataBank &calc_info);
template <typename T>
uint32_t ComputeSparseFillEmptyRowsGrad(CpuKernelContext &ctx, DataBank &databank);
};
} // namespace aicpu

View File

@ -0,0 +1,190 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "split.h"
#include "utils/kernel_util.h"
namespace {
const char *kSplit = "Split";
constexpr uint32_t kSplitInputNum = 2;
std::vector<std::string> attr_names;
} // namespace
namespace aicpu {
uint32_t SplitCpuKernel::CheckAndInitParams(CpuKernelContext &ctx) {
// check params
  AttrValue *num_split_ptr = ctx.GetAttr("num_split");
  KERNEL_CHECK_NULLPTR(num_split_ptr, KERNEL_STATUS_PARAM_INVALID, "Get attr num_split failed.");
  num_split_ = num_split_ptr->GetInt();
  uint32_t kSplitOutputNum = static_cast<uint32_t>(num_split_ptr->GetInt());
attr_names.emplace_back("num_split");
KERNEL_HANDLE_ERROR(NormalCheck(ctx, kSplitInputNum, kSplitOutputNum, attr_names), "[%s] check params failed.",
kSplit);
KERNEL_CHECK_FALSE((num_split_ >= 1), KERNEL_STATUS_PARAM_INVALID,
"Attr num_split must >= 1, but got attr num_split[%lld]", num_split_);
Tensor *split_dim_ptr = ctx.Input(0);
auto split_dim_shape_ptr = split_dim_ptr->GetTensorShape();
KERNEL_CHECK_FALSE((split_dim_shape_ptr->GetDims() == 0), KERNEL_STATUS_PARAM_INVALID,
"Input split_dim should be a scalar integer, but got rank[%lld]", split_dim_shape_ptr->GetDims());
KERNEL_CHECK_FALSE((split_dim_ptr->GetDataType() == DT_INT32), KERNEL_STATUS_PARAM_INVALID,
"Input split_dim data type must be DT_INT32, but got data type[%s]",
DTypeStr(split_dim_ptr->GetDataType()).c_str());
auto split_dim_data_ptr = split_dim_ptr->GetData();
KERNEL_CHECK_NULLPTR(split_dim_data_ptr, KERNEL_STATUS_PARAM_INVALID, "Get input split_dim data failed.");
split_dim_ = *(reinterpret_cast<int32_t *>(split_dim_data_ptr));
Tensor *value_ptr = ctx.Input(1);
value_data_ptr_ = value_ptr->GetData();
auto value_shape_ptr = value_ptr->GetTensorShape();
int64_t value_dim = value_shape_ptr->GetDims();
if (split_dim_ < 0) {
split_dim_ += value_dim;
}
  KERNEL_CHECK_FALSE(value_dim > split_dim_, KERNEL_STATUS_PARAM_INVALID,
                     "Dim of input value must be greater than split_dim, value dim is [%d], split_dim is [%d].",
                     value_dim, split_dim_);
value_shape_vec_ = value_shape_ptr->GetDimSizes();
data_type_ = value_ptr->GetDataType();
value_num_ = value_ptr->NumElements();
KERNEL_CHECK_FALSE((value_shape_ptr->GetDimSize(split_dim_) % num_split_ == 0), KERNEL_STATUS_PARAM_INVALID,
"Number of ways to split should evenly divide the split "
"dimension, but got split_dim [%d] (size = [%lld]) and num_split is [%lld]",
split_dim_, value_shape_ptr->GetDimSize(split_dim_), num_split_);
output_ptr_vec_.resize(num_split_);
for (int64_t i = 0; i < num_split_; i++) {
Tensor *output_ptr = ctx.Output(i);
auto output_data_ptr = output_ptr->GetData();
output_ptr_vec_[i] = output_data_ptr;
}
return KERNEL_STATUS_OK;
}
template <typename T>
uint32_t SplitCpuKernel::DoCompute(CpuKernelContext &ctx) {
T *input_data_ptr = static_cast<T *>(value_data_ptr_);
std::vector<T *> output_data_vec;
output_data_vec.resize(num_split_);
for (int64_t i = 0; i < num_split_; i++) {
output_data_vec[i] = reinterpret_cast<T *>(output_ptr_vec_[i]);
}
if (num_split_ == 1) {
KERNEL_CHECK_FALSE((SplitWithOneOutput<T>(input_data_ptr, output_data_vec) == KERNEL_STATUS_OK),
KERNEL_STATUS_PARAM_INVALID, "SplitWithOneOutput failed.");
return KERNEL_STATUS_OK;
}
if (split_dim_ == 0) {
KERNEL_CHECK_FALSE((SplitWithDimZero<T>(input_data_ptr, output_data_vec) == KERNEL_STATUS_OK),
KERNEL_STATUS_PARAM_INVALID, "SplitWithDimZero failed.");
return KERNEL_STATUS_OK;
}
KERNEL_CHECK_FALSE((SplitCompute<T>(input_data_ptr, output_data_vec) == KERNEL_STATUS_OK),
KERNEL_STATUS_PARAM_INVALID, "Split Compute failed.");
return KERNEL_STATUS_OK;
}
template <typename T>
uint32_t SplitCpuKernel::SplitWithOneOutput(T *input_data_ptr, std::vector<T *> output_data_vec) {
int64_t copy_size = value_num_ * sizeof(T);
auto mem_ret = memcpy_s(output_data_vec[0], copy_size, input_data_ptr, copy_size);
KERNEL_CHECK_FALSE((mem_ret == EOK), KERNEL_STATUS_PARAM_INVALID,
"Memcpy size[%zu] from input value to output[0] failed.", copy_size);
return KERNEL_STATUS_OK;
}
template <typename T>
uint32_t SplitCpuKernel::SplitWithDimZero(T *input_data_ptr, std::vector<T *> output_data_vec) {
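  // Splitting along dim 0 keeps each output contiguous in memory, so every output
  // is produced with a single memcpy of copy_num * split_dim_output_size elements.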
int64_t copy_num = value_num_ / value_shape_vec_[0];
T *input_copy_ptr = input_data_ptr;
const int64_t split_dim_output_size = value_shape_vec_[0] / num_split_;
for (int32_t i = 0; i < num_split_; i++) {
int64_t copy_size_per = copy_num * split_dim_output_size;
int64_t copy_size = copy_size_per * sizeof(T);
auto mem_ret = memcpy_s(output_data_vec[i], copy_size, input_copy_ptr, copy_size);
KERNEL_CHECK_FALSE((mem_ret == EOK), KERNEL_STATUS_PARAM_INVALID,
"Memcpy size[%zu] from input value to output[%d] failed.", copy_size, i);
input_copy_ptr += copy_size_per;
}
return KERNEL_STATUS_OK;
}
template <typename T>
uint32_t SplitCpuKernel::SplitCompute(T *input_data_ptr, std::vector<T *> output_data_vec) {
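  // View the input as [prefix, midfix, subfix]: prefix = product of dims before split_dim_,
  // midfix = size of split_dim_, subfix = product of dims after it. Each output copies one
  // contiguous block of (midfix / num_split_) * subfix elements from every prefix slice.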
int64_t prefix = 1;
for (int32_t i = 0; i < split_dim_; ++i) {
prefix *= value_shape_vec_[i];
}
int64_t midfix = value_shape_vec_[split_dim_];
int64_t subfix = 1;
for (size_t i = split_dim_ + 1; i < value_shape_vec_.size(); i++) {
subfix *= value_shape_vec_[i];
}
const int64_t split_dim_output_size = midfix / num_split_;
int64_t offset = 0;
for (int64_t i = 0; i < num_split_; ++i) {
T *output_data_ptr = output_data_vec[i];
T *input_copy_ptr = input_data_ptr + offset;
int64_t copy_num = subfix * split_dim_output_size;
int64_t copy_size = copy_num * sizeof(T);
for (int64_t j = 0; j < prefix; j++) {
auto mem_ret = memcpy_s(output_data_ptr, copy_size, input_copy_ptr, copy_size);
KERNEL_CHECK_FALSE((mem_ret == EOK), KERNEL_STATUS_PARAM_INVALID,
"Memcpy size[%zu] from input value to output[%d] failed.", copy_size, i);
input_copy_ptr += (subfix * midfix);
output_data_ptr += copy_num;
}
offset += copy_num;
}
return KERNEL_STATUS_OK;
}
uint32_t SplitCpuKernel::Compute(CpuKernelContext &ctx) {
KERNEL_CHECK_FALSE((CheckAndInitParams(ctx) == KERNEL_STATUS_OK), KERNEL_STATUS_PARAM_INVALID,
"CheckAndInitParams failed.");
switch (data_type_) {
case DT_FLOAT16:
return DoCompute<Eigen::half>(ctx);
case DT_FLOAT:
return DoCompute<float>(ctx);
case DT_DOUBLE:
return DoCompute<double>(ctx);
case DT_BOOL:
return DoCompute<bool>(ctx);
case DT_INT8:
return DoCompute<int8_t>(ctx);
case DT_INT16:
return DoCompute<int16_t>(ctx);
case DT_INT32:
return DoCompute<int32_t>(ctx);
case DT_INT64:
return DoCompute<int64_t>(ctx);
case DT_UINT8:
return DoCompute<uint8_t>(ctx);
case DT_UINT16:
return DoCompute<uint16_t>(ctx);
case DT_UINT32:
return DoCompute<uint32_t>(ctx);
case DT_UINT64:
return DoCompute<uint64_t>(ctx);
case DT_COMPLEX64:
return DoCompute<std::complex<float>>(ctx);
case DT_COMPLEX128:
return DoCompute<std::complex<double>>(ctx);
default:
KERNEL_LOG_ERROR("Unsupported datatype[%s]", DTypeStr(data_type_).c_str());
return KERNEL_STATUS_PARAM_INVALID;
}
}
REGISTER_CPU_KERNEL(kSplit, SplitCpuKernel);
} // namespace aicpu

View File

@ -0,0 +1,84 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef AICPU_KERNELS_NORMALIZED_SPLIT_H_
#define AICPU_KERNELS_NORMALIZED_SPLIT_H_
#include <memory>
#include <vector>
#include "unsupported/Eigen/CXX11/Tensor"
#include "securec.h"
#include "cpu_ops_kernel.h"
#include "cpu_kernel_utils.h"
#include "kernel_log.h"
#include "status.h"
namespace aicpu {
class SplitCpuKernel : public CpuKernel {
public:
SplitCpuKernel() = default;
~SplitCpuKernel() override = default;
protected:
uint32_t Compute(CpuKernelContext &ctx) override;
private:
/**
* @brief Init params
* @param ctx cpu kernel context
* @return status if success
*/
uint32_t CheckAndInitParams(CpuKernelContext &ctx);
/**
* @brief split data when split num is 1
* @param input_data_ptr ptr which store input data
* @param output_data_vec vector which store all output data ptr
* @return status if success
*/
template <typename T>
uint32_t SplitWithOneOutput(T *input_data_ptr, std::vector<T *> output_data_vec);
/**
* @brief split data when split dim is 0
* @param input_data_ptr ptr which store input data
* @param output_data_vec vector which store all output data ptr
* @return status if success
*/
template <typename T>
uint32_t SplitWithDimZero(T *input_data_ptr, std::vector<T *> output_data_vec);
/**
* @brief split data
* @param input_data_ptr ptr which store input data
* @param output_data_vec vector which store all output data ptr
* @return status if success
*/
template <typename T>
uint32_t SplitCompute(T *input_data_ptr, std::vector<T *> output_data_vec);
template <typename T>
uint32_t DoCompute(CpuKernelContext &ctx);
private:
DataType data_type_;
int32_t split_dim_;
int64_t num_split_;
int64_t value_num_;
void *value_data_ptr_;
std::vector<void *> output_ptr_vec_;
std::vector<int64_t> value_shape_vec_;
};
} // namespace aicpu
#endif

View File

@ -0,0 +1,142 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "sqrt.h"
#include <complex>
#include <unsupported/Eigen/CXX11/Tensor>
#include "cpu_kernel_utils.h"
#include "cpu_types.h"
#include "kernel_log.h"
#include "status.h"
#include "utils/kernel_util.h"
namespace {
const std::uint32_t kSqrtInputNum{1};
const std::uint32_t kSqrtOutputNum{1};
const std::uint32_t Parallel4ThreadNum{4096};
const std::uint32_t Parallel6ThreadNum{8192};
const std::uint32_t ParallelNum{16384};
const char *kSqrt{"Sqrt"};
} // namespace
namespace aicpu {
namespace detail {
template <typename T>
inline std::uint32_t ComputeSqrtKernel(const CpuKernelContext &ctx) {
const auto ParallelFor = aicpu::CpuKernelUtils::ParallelFor;
auto input = static_cast<T *>(ctx.Input(0)->GetData());
auto output = static_cast<T *>(ctx.Output(0)->GetData());
std::int64_t total = ctx.Input(0)->NumElements();
std::uint64_t total_size = ctx.Input(0)->GetDataSize();
uint32_t cores = aicpu::CpuKernelUtils::GetCPUNum(ctx);
bool parallel_flag = false;
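  // Choose the degree of parallelism from the payload size: small inputs run on the current
  // thread, mid-sized inputs are capped at a few cores, large inputs use all available cores.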
if (total_size > ParallelNum * sizeof(T)) {
parallel_flag = true;
} else if (total_size > Parallel6ThreadNum * sizeof(T)) {
parallel_flag = true;
cores = 8;
} else if (total_size > Parallel4ThreadNum * sizeof(T)) {
parallel_flag = true;
cores = 6;
}
if (parallel_flag) {
std::int64_t per_unit_size{total / std::min(std::max(1L, cores - 2L), total)};
return ParallelFor(ctx, total, per_unit_size, [&](std::int64_t begin, std::int64_t end) {
std::transform(input + begin, input + end, output + begin, Eigen::numext::sqrt<T>);
});
} else if (cores != 0) {
std::transform(input, input + total, output, Eigen::numext::sqrt<T>);
} else {
return KERNEL_STATUS_INNER_ERROR;
}
return KERNEL_STATUS_OK;
}
template <typename T>
inline std::uint32_t ComputeSqrt(const CpuKernelContext &ctx) {
uint32_t result = ComputeSqrtKernel<T>(ctx);
if (result != 0) {
KERNEL_LOG_ERROR("Sqrt compute failed.");
}
return result;
}
inline std::uint32_t SqrtExtraCheck(const CpuKernelContext &ctx) {
if (ctx.Input(0)->GetDataType() != ctx.Output(0)->GetDataType()) {
KERNEL_LOG_ERROR("The data type of the input [%s] need be the same as the output [%s].",
DTypeStr(ctx.Input(0)->GetDataType()).c_str(), DTypeStr(ctx.Output(0)->GetDataType()).c_str());
return KERNEL_STATUS_PARAM_INVALID;
}
if (ctx.Input(0)->GetData() == nullptr) {
KERNEL_LOG_ERROR("Get input data failed.");
return KERNEL_STATUS_PARAM_INVALID;
}
if (ctx.Output(0)->GetData() == nullptr) {
KERNEL_LOG_ERROR("Get output data failed.");
return KERNEL_STATUS_PARAM_INVALID;
}
std::vector<int64_t> input_dims = ctx.Input(0)->GetTensorShape()->GetDimSizes();
std::vector<int64_t> output_dims = ctx.Output(0)->GetTensorShape()->GetDimSizes();
if (input_dims.size() != output_dims.size()) {
KERNEL_LOG_ERROR(
"The data dim of the input size [%llu] need be the same as the output "
"size [%llu].",
input_dims.size(), output_dims.size());
return KERNEL_STATUS_PARAM_INVALID;
}
for (size_t index = 0; index < input_dims.size(); index++) {
if (input_dims[index] != output_dims[index]) {
KERNEL_LOG_ERROR(
"The data dim[%llu]=%lld of the input need be the same as the output "
"dim[%llu]=%lld.",
index, input_dims[index], index, output_dims[index]);
return KERNEL_STATUS_PARAM_INVALID;
}
}
return KERNEL_STATUS_OK;
}
std::uint32_t SqrtCheck(CpuKernelContext &ctx, uint32_t inputs_num, uint32_t outputs_num) {
  return NormalCheck(ctx, inputs_num, outputs_num) ? KERNEL_STATUS_PARAM_INVALID : SqrtExtraCheck(ctx);
}
std::uint32_t SqrtCompute(const CpuKernelContext &ctx) {
DataType input_type{ctx.Input(0)->GetDataType()};
switch (input_type) {
case DT_FLOAT16:
return ComputeSqrt<Eigen::half>(ctx);
case DT_FLOAT:
return ComputeSqrt<std::float_t>(ctx);
case DT_DOUBLE:
return ComputeSqrt<std::double_t>(ctx);
case DT_COMPLEX64:
return ComputeSqrt<std::complex<std::float_t> >(ctx);
case DT_COMPLEX128:
return ComputeSqrt<std::complex<std::double_t> >(ctx);
default:
KERNEL_LOG_ERROR("Unsupported input data type [%s].", DTypeStr(input_type).c_str());
return KERNEL_STATUS_PARAM_INVALID;
}
}
} // namespace detail
std::uint32_t SqrtCpuKernel::Compute(CpuKernelContext &ctx) {
return detail::SqrtCheck(ctx, kSqrtInputNum, kSqrtOutputNum) ? KERNEL_STATUS_PARAM_INVALID : detail::SqrtCompute(ctx);
}
REGISTER_CPU_KERNEL(kSqrt, SqrtCpuKernel);
} // namespace aicpu

View File

@ -0,0 +1,25 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef AICPU_KERNELS_NORMALIZED_SQRT_H_
#define AICPU_KERNELS_NORMALIZED_SQRT_H_
#include "cpu_ops_kernel.h"
namespace aicpu {
class SqrtCpuKernel final : public CpuKernel {
virtual std::uint32_t Compute(CpuKernelContext &ctx) override final;
};
} // namespace aicpu
#endif

View File

@ -0,0 +1,248 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "sqrtgrad.h"
#include <complex>
#include <cstdint>
#include <typeinfo>
#include "Eigen/Dense"
#include <iostream>
#include "cpu_kernel_utils.h"
#include "cpu_types.h"
#include "utils/kernel_util.h"
#include "kernel_log.h"
#include "securec.h"
#include "status.h"
namespace {
const uint32_t kOutputNum = 1;
const uint32_t kInputNum = 2;
const char *kSqrtGrad = "SqrtGrad";
const int64_t kParallelDataNum = 2 * 1024;
const int64_t kParallelDataNumMid = 16 * 1024;
const int64_t kParallelDataNumSameShape = 7 * 1024;
const int64_t kParallelDataNumSameShapeMid = 35 * 1024;
#define SQRTGRAD_COMPUTE_CASE(DTYPE, TYPE, CTX) \
case (DTYPE): { \
uint32_t result = SqrtGradCompute<TYPE>(CTX); \
if (result != KERNEL_STATUS_OK) { \
KERNEL_LOG_ERROR("SqrtGrad kernel compute failed."); \
return result; \
} \
break; \
}
#define SQRTGRAD_COMPUTE_COMPLEX_CASE(DTYPE, TYPE, CTX) \
case (DTYPE): { \
uint32_t result = SqrtGradComputeComplex<TYPE>(CTX); \
if (result != KERNEL_STATUS_OK) { \
KERNEL_LOG_ERROR("SqrtGrad kernel compute failed."); \
return result; \
} \
break; \
}
} // namespace
namespace aicpu {
uint32_t SqrtGradCpuKernel::Compute(CpuKernelContext &ctx) {
// check params
KERNEL_HANDLE_ERROR(NormalCheck(ctx, kInputNum, kOutputNum), "[%s] check input and output failed.", kSqrtGrad);
KERNEL_HANDLE_ERROR(SqrtGradParamCheck(ctx), "[%s] check params failed.", kSqrtGrad);
auto data_type = ctx.Input(0)->GetDataType();
switch (data_type) {
SQRTGRAD_COMPUTE_COMPLEX_CASE(DT_COMPLEX64, std::complex<float>, ctx)
SQRTGRAD_COMPUTE_COMPLEX_CASE(DT_COMPLEX128, std::complex<double>, ctx)
SQRTGRAD_COMPUTE_CASE(DT_FLOAT16, Eigen::half, ctx)
SQRTGRAD_COMPUTE_CASE(DT_FLOAT, float, ctx)
SQRTGRAD_COMPUTE_CASE(DT_DOUBLE, double, ctx)
default:
KERNEL_LOG_ERROR("SqrtGrad kernel data type [%s] not support.", DTypeStr(data_type).c_str());
return KERNEL_STATUS_PARAM_INVALID;
}
return KERNEL_STATUS_OK;
}
uint32_t SqrtGradCpuKernel::SqrtGradParamCheck(CpuKernelContext &ctx) {
// the non null of input_0, input_1, output has been verified in NormalCheck
Tensor *input_0 = ctx.Input(0);
Tensor *input_1 = ctx.Input(1);
Tensor *output = ctx.Output(0);
DataType input0_type = input_0->GetDataType();
DataType input1_type = input_1->GetDataType();
KERNEL_CHECK_FALSE((input0_type == input1_type), KERNEL_STATUS_PARAM_INVALID,
"The data type of input0 [%s] need be same with "
"input1 [%s].",
DTypeStr(input0_type).c_str(), DTypeStr(input1_type).c_str())
KERNEL_LOG_DEBUG(
"SqrtGradCpuKernel[%s], input0: size[%llu];"
"input1: size[%llu], output: size[%llu].",
ctx.GetOpType().c_str(), input_0->GetDataSize(), input_1->GetDataSize(), output->GetDataSize());
return KERNEL_STATUS_OK;
}
/**
 special compute is used when input1 and input2 have the same number of
 elements, i.e. no broadcasting is required (see NoBcastCompute).
 */
template <typename T>
void SqrtGradCpuKernel::SpecialCompute(int64_t start, int64_t end, T *input1, T *input2, T *output) {
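  // SqrtGrad: input1 = y = sqrt(x), input2 = dy; the gradient is dx = dy * 0.5 / y.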
int flag = 0;
for (int64_t i = start; i < end; ++i) {
    if (*(input1 + i) == static_cast<T>(0)) {
flag = 1;
break;
}
}
for (int64_t i = start; i < end; ++i) {
*(output + i) = *(input2 + i) * static_cast<T>(0.5) / *(input1 + i);
}
if (flag == 1) KERNEL_LOG_WARN("divide by zero encountered");
}
template <typename T>
void SqrtGradCpuKernel::SpecialComputeComplex(int64_t start, int64_t end, T *input1, T *input2, T *output) {
int flag = 0;
for (int64_t i = start; i < end; ++i) {
if (*(input2 + i) == static_cast<T>(0)) {
flag = 1;
break;
}
}
for (int64_t i = start; i < end; ++i) {
T in1 = *(input1 + i);
T in1_conj = std::conj(in1);
if (in1_conj == static_cast<T>(0)) {
*(output + i) = INFINITY;
} else {
*(output + i) = *(input2 + i) * static_cast<T>(0.5) / in1_conj;
}
}
if (flag == 1) KERNEL_LOG_WARN("divide by zero encountered");
}
template <typename T>
uint32_t SqrtGradCpuKernel::NoBcastCompute(CpuKernelContext &ctx) {
auto in0 = reinterpret_cast<T *>(ctx.Input(0)->GetData());
auto in1 = reinterpret_cast<T *>(ctx.Input(1)->GetData());
auto out = reinterpret_cast<T *>(ctx.Output(0)->GetData());
int64_t in0_elements_nums = ctx.Input(0)->NumElements();
int64_t data_num = in0_elements_nums;
if (data_num >= kParallelDataNumSameShape) {
uint32_t min_core_num = 1;
uint32_t max_core_num = std::max(min_core_num, aicpu::CpuKernelUtils::GetCPUNum(ctx) - kResvCpuNum);
if (data_num <= kParallelDataNumSameShapeMid) {
max_core_num = std::min(max_core_num, 4U); // up to 4 cpu cores
}
if (max_core_num > data_num) {
max_core_num = data_num;
}
    auto sharder_sqrtgrad = [&](size_t start, size_t end) { SpecialCompute<T>(start, end, in0, in1, out); };
KERNEL_HANDLE_ERROR(CpuKernelUtils::ParallelFor(ctx, data_num, data_num / max_core_num, sharder_sqrtgrad),
"SqrtGrad Compute failed.");
} else {
SpecialCompute<T>(0, data_num, in0, in1, out);
}
return KERNEL_STATUS_OK;
}
template <typename T>
uint32_t SqrtGradCpuKernel::NoBcastComputeComplex(CpuKernelContext &ctx) {
auto in0 = reinterpret_cast<T *>(ctx.Input(0)->GetData());
auto in1 = reinterpret_cast<T *>(ctx.Input(1)->GetData());
auto out = reinterpret_cast<T *>(ctx.Output(0)->GetData());
int64_t in0_elements_nums = ctx.Input(0)->NumElements();
int64_t data_num = in0_elements_nums;
if (data_num >= kParallelDataNumSameShape) {
uint32_t min_core_num = 1;
uint32_t max_core_num = std::max(min_core_num, aicpu::CpuKernelUtils::GetCPUNum(ctx) - kResvCpuNum);
if (data_num <= kParallelDataNumSameShapeMid) {
max_core_num = std::min(max_core_num, 4U); // up to 4 cpu cores
}
if (max_core_num > data_num) {
max_core_num = data_num;
}
    auto sharder_sqrtgrad = [&](size_t start, size_t end) { SpecialComputeComplex<T>(start, end, in0, in1, out); };
KERNEL_HANDLE_ERROR(CpuKernelUtils::ParallelFor(ctx, data_num, data_num / max_core_num, sharder_sqrtgrad),
"SqrtGrad Compute failed.");
} else {
SpecialComputeComplex<T>(0, data_num, in0, in1, out);
}
return KERNEL_STATUS_OK;
}
template <typename T>
uint32_t SqrtGradCpuKernel::SqrtGradCompute(CpuKernelContext &ctx) {
Tensor *input0_tensor = ctx.Input(0);
auto input0_shape = input0_tensor->GetTensorShape()->GetDimSizes();
int64_t input0_elements_nums = input0_tensor->NumElements();
Tensor *input1_tensor = ctx.Input(1);
auto input1_shape = input1_tensor->GetTensorShape()->GetDimSizes();
int64_t input1_elements_nums = input1_tensor->NumElements();
if (input0_elements_nums != input1_elements_nums) {
KERNEL_LOG_WARN("Invalid element numbers, got[%d] and [%d]", static_cast<int32_t>(input0_elements_nums),
static_cast<int32_t>(input1_elements_nums));
return KERNEL_STATUS_PARAM_INVALID;
} else {
return NoBcastCompute<T>(ctx);
}
return KERNEL_STATUS_OK;
}
template <typename T>
uint32_t SqrtGradCpuKernel::SqrtGradComputeComplex(CpuKernelContext &ctx) {
Tensor *input0_tensor = ctx.Input(0);
auto input0_shape = input0_tensor->GetTensorShape()->GetDimSizes();
int64_t input0_elements_nums = input0_tensor->NumElements();
Tensor *input1_tensor = ctx.Input(1);
auto input1_shape = input1_tensor->GetTensorShape()->GetDimSizes();
int64_t input1_elements_nums = input1_tensor->NumElements();
if (input0_elements_nums != input1_elements_nums) {
KERNEL_LOG_WARN("Invalid element numbers, got[%d] and [%d]", static_cast<int32_t>(input0_elements_nums),
static_cast<int32_t>(input1_elements_nums));
return KERNEL_STATUS_PARAM_INVALID;
} else {
return NoBcastComputeComplex<T>(ctx);
}
return KERNEL_STATUS_OK;
}
REGISTER_CPU_KERNEL(kSqrtGrad, SqrtGradCpuKernel);
} // namespace aicpu

View File

@ -0,0 +1,50 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef AICPU_KERNELS_NORMALIZED_SQRTGRAD_H_
#define AICPU_KERNELS_NORMALIZED_SQRTGRAD_H_
#include "cpu_ops_kernel.h"
#include "utils/bcast.h"
namespace aicpu {
class SqrtGradCpuKernel : public CpuKernel {
public:
SqrtGradCpuKernel() = default;
uint32_t Compute(CpuKernelContext &ctx) override;
private:
uint32_t SqrtGradParamCheck(CpuKernelContext &ctx);
template <typename T>
void SpecialCompute(int64_t start, int64_t end, T *input1, T *input2, T *output);
template <typename T>
void SpecialComputeComplex(int64_t start, int64_t end, T *input1, T *input2, T *output);
template <typename T>
uint32_t NoBcastCompute(CpuKernelContext &ctx);
template <typename T>
uint32_t NoBcastComputeComplex(CpuKernelContext &ctx);
template <typename T>
uint32_t SqrtGradCompute(CpuKernelContext &ctx);
template <typename T>
uint32_t SqrtGradComputeComplex(CpuKernelContext &ctx);
};
} // namespace aicpu
#endif

View File

@ -0,0 +1,85 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "tanh.h"
#include "Eigen/Dense"
#include "cpu_kernel_utils.h"
#include "utils/eigen_tensor.h"
#include "utils/kernel_util.h"
#include "cmath"
#include <complex>
namespace {
const uint32_t kOutputNum = 1;
const uint32_t kInputNum = 1;
const char *kTanh = "Tanh";
constexpr int64_t kParallelDataNums = 128 * 1024;
#define Tanh_COMPUTE_CASE(DTYPE, TYPE, CTX) \
case (DTYPE): { \
uint32_t result = TanhCompute<TYPE>(CTX); \
if (result != KERNEL_STATUS_OK) { \
KERNEL_LOG_ERROR("Tanh kernel compute failed."); \
return result; \
} \
break; \
}
} // namespace
namespace aicpu {
uint32_t TanhCpuKernel::Compute(CpuKernelContext &ctx) {
// check params
KERNEL_HANDLE_ERROR(NormalCheck(ctx, kInputNum, kOutputNum), "[%s] check input and output failed.", kTanh);
auto data_type = ctx.Input(0)->GetDataType();
switch (data_type) {
Tanh_COMPUTE_CASE(DT_COMPLEX64, std::complex<float>, ctx)
    Tanh_COMPUTE_CASE(DT_COMPLEX128, std::complex<double>, ctx)
    Tanh_COMPUTE_CASE(DT_FLOAT, float, ctx)
    Tanh_COMPUTE_CASE(DT_FLOAT16, Eigen::half, ctx)
    Tanh_COMPUTE_CASE(DT_DOUBLE, double, ctx)
    default:
      KERNEL_LOG_ERROR("Tanh kernel data type [%s] not support.", DTypeStr(data_type).c_str());
      return KERNEL_STATUS_PARAM_INVALID;
}
return KERNEL_STATUS_OK;
}
template <typename T>
uint32_t TanhCpuKernel::TanhCompute(CpuKernelContext &ctx) {
Eigen::internal::scalar_tanh_op<T> tanh_op;
auto input_x = reinterpret_cast<T *>(ctx.Input(0)->GetData());
auto output_y = reinterpret_cast<T *>(ctx.Output(0)->GetData());
size_t data_num = ctx.Input(0)->NumElements();
int64_t data_size = data_num * sizeof(T);
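  // Small payloads are computed serially; larger ones are sharded across the available CPU cores.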
if (data_size <= kParallelDataNums) {
for (size_t i = 0; i < data_num; i++) {
auto x_idx = input_x + i; // i-th value of input0
*(output_y + i) = tanh_op((*x_idx));
}
} else {
uint32_t min_core_num = 1;
int64_t max_core_num = std::max(min_core_num, aicpu::CpuKernelUtils::GetCPUNum(ctx) - 2);
auto shard_Tanh = [&](size_t start, size_t end) {
for (size_t i = start; i < end; i++) {
auto x_idx = input_x + i; // i-th value of input0
*(output_y + i) = tanh_op((*x_idx));
}
};
KERNEL_HANDLE_ERROR(CpuKernelUtils::ParallelFor(ctx, data_num, data_num / max_core_num, shard_Tanh),
"Tanh Compute failed.");
}
return KERNEL_STATUS_OK;
}
REGISTER_CPU_KERNEL(kTanh, TanhCpuKernel);
} // namespace aicpu

View File

@ -0,0 +1,34 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef AICPU_KERNELS_NORMALIZED_TANH_H_
#define AICPU_KERNELS_NORMALIZED_TANH_H_
#include "cpu_ops_kernel.h"
namespace aicpu {
class TanhCpuKernel : public CpuKernel {
public:
TanhCpuKernel() = default;
~TanhCpuKernel() override = default;
protected:
uint32_t Compute(CpuKernelContext &ctx) override;
private:
template <typename T>
uint32_t TanhCompute(CpuKernelContext &ctx);
};
} // namespace aicpu
#endif

View File

@ -0,0 +1,156 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include <stdint.h>
#include <algorithm>
#include <tuple>
#include <utility>
#include "tile.h"
#include "utils/eigen_tensor.h"
#include "utils/kernel_util.h"
#include "Eigen/Core"
namespace {
const uint32_t kOutputNum = 1;
const uint32_t kInputNum = 2;
const char *kTile = "Tile";
#define TILE_COMPUTE_CASE(DTYPE, TYPE1, TYPE2, CTX) \
case (DTYPE): { \
uint32_t result = TileCompute<TYPE1, TYPE2>(CTX); \
if (result != KERNEL_STATUS_OK) { \
KERNEL_LOG_ERROR("Tile kernel compute failed."); \
return result; \
} \
break; \
}
#define TILE_COMPUTE_CASE_ALL(TYPE, CTX) \
TILE_COMPUTE_CASE(DT_COMPLEX64, std::complex<float>, TYPE, CTX) \
TILE_COMPUTE_CASE(DT_COMPLEX128, std::complex<double>, TYPE, CTX) \
TILE_COMPUTE_CASE(DT_DOUBLE, double, TYPE, CTX) \
TILE_COMPUTE_CASE(DT_FLOAT, float, TYPE, CTX) \
TILE_COMPUTE_CASE(DT_FLOAT16, Eigen::half, TYPE, CTX) \
TILE_COMPUTE_CASE(DT_INT8, int8_t, TYPE, CTX) \
TILE_COMPUTE_CASE(DT_INT16, int16_t, TYPE, CTX) \
TILE_COMPUTE_CASE(DT_INT32, int32_t, TYPE, CTX) \
TILE_COMPUTE_CASE(DT_INT64, int64_t, TYPE, CTX) \
TILE_COMPUTE_CASE(DT_UINT8, uint8_t, TYPE, CTX) \
TILE_COMPUTE_CASE(DT_UINT16, uint16_t, TYPE, CTX) \
TILE_COMPUTE_CASE(DT_UINT32, uint32_t, TYPE, CTX) \
TILE_COMPUTE_CASE(DT_UINT64, uint64_t, TYPE, CTX)
} // namespace
namespace aicpu {
uint32_t TileCpuKernel::Compute(CpuKernelContext &ctx) {
// check params
KERNEL_HANDLE_ERROR(NormalCheck(ctx, kInputNum, kOutputNum), "Tile check input and output number failed.");
Tensor *input_x0 = ctx.Input(0);
Tensor *input_x1 = ctx.Input(1);
Tensor *output = ctx.Output(0);
auto size_0 = ctx.Input(0)->GetTensorShape()->GetDims();
auto size_1 = ctx.Input(1)->GetTensorShape()->GetDims();
KERNEL_CHECK_FALSE((size_0 >= 1), KERNEL_STATUS_PARAM_INVALID, "Dimension of x must be 1 or higher, but got[%zu].",
size_0);
KERNEL_CHECK_FALSE((size_1 == 1), KERNEL_STATUS_PARAM_INVALID, "Dimension of multiples must be 1, but got[%zu].",
size_1);
KERNEL_CHECK_FALSE((size_0 == input_x1->NumElements()), KERNEL_STATUS_PARAM_INVALID,
"Multiples length must be the same as the number of dimensions in x.");
KERNEL_LOG_DEBUG(
"TileCpuKernel[%s], inputx0: size[%llu];"
"inputx1: size[%llu], output: size[%llu].",
ctx.GetOpType().c_str(), input_x0->GetDataSize(), input_x1->GetDataSize(), output->GetDataSize());
DataType data_type = ctx.Input(0)->GetDataType();
DataType multiples_type = ctx.Input(1)->GetDataType();
switch (multiples_type) {
case DT_INT32:
switch (data_type) {
TILE_COMPUTE_CASE_ALL(int32_t, ctx)
default:
KERNEL_LOG_ERROR("Input[0] data type[%s] not supported.", DTypeStr(data_type).c_str());
return KERNEL_STATUS_PARAM_INVALID;
}
break;
case DT_INT64:
switch (data_type) {
TILE_COMPUTE_CASE_ALL(int64_t, ctx)
default:
KERNEL_LOG_ERROR("Input[0] data type[%s] not supported.", DTypeStr(data_type).c_str());
return KERNEL_STATUS_PARAM_INVALID;
}
break;
default:
KERNEL_LOG_ERROR("Input[1] data type[%s] not supported.", DTypeStr(multiples_type).c_str());
return KERNEL_STATUS_PARAM_INVALID;
}
return KERNEL_STATUS_OK;
}
template <typename T, typename M>
void TileCpuKernel::CopyMultipleTimes(const T *in_data, int64_t in_size, M multiplier, T *out_data) {
for (M i = 0; i < multiplier; ++i) {
const T *in_end = in_data + in_size;
T *new_out_data = std::copy(in_data, in_end, out_data);
in_data = out_data;
out_data = new_out_data;
}
}
template <typename T, typename M>
std::pair<int64_t, int64_t> TileCpuKernel::TileOneDimension(const std::vector<int64_t> &in_dimensions, const T *in_data,
const M *multipliers, T *out_data, int64_t dimension) {
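  // Recursively tile one dimension at a time: tile the sub-tensor for each slice of this
  // dimension, then replicate the concatenated result (multiplier - 1) more times.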
if (in_dimensions.size() == 0) {
// If input tensor is a scalar, then just copy it to output (no need to
// multiply).
*out_data = *in_data;
return std::make_pair(0, 0);
}
const int64_t dimension_size = in_dimensions[dimension];
if (dimension == static_cast<int64_t>(in_dimensions.size() - 1)) {
CopyMultipleTimes(in_data, dimension_size, multipliers[dimension], out_data);
return std::make_pair(dimension_size, dimension_size * static_cast<int64_t>(multipliers[dimension]));
}
int64_t total_stride_size = 0, total_tiled_stride_size = 0;
const T *copy_from_data = in_data;
T *copy_to_data = out_data;
for (int64_t i = 0; i < dimension_size; ++i) {
int64_t stride_size = 0, tiled_stride_size = 0;
std::tie(stride_size, tiled_stride_size) =
TileOneDimension(in_dimensions, copy_from_data, multipliers, copy_to_data, dimension + 1);
copy_from_data += stride_size;
copy_to_data += tiled_stride_size;
total_stride_size += stride_size;
total_tiled_stride_size += tiled_stride_size;
}
CopyMultipleTimes(out_data, total_tiled_stride_size, multipliers[dimension] - 1, out_data + total_tiled_stride_size);
return std::make_pair(total_stride_size, static_cast<int64_t>(total_tiled_stride_size * multipliers[dimension]));
}
template <typename T, typename M>
uint32_t TileCpuKernel::TileCompute(CpuKernelContext &ctx) {
auto x = reinterpret_cast<T *>(ctx.Input(0)->GetData());
auto multiples = reinterpret_cast<M *>(ctx.Input(1)->GetData());
auto y = reinterpret_cast<T *>(ctx.Output(0)->GetData());
std::vector<int64_t> in_dimensions = ctx.Input(0)->GetTensorShape()->GetDimSizes();
TileOneDimension(in_dimensions, x, multiples, y, 0);
return KERNEL_STATUS_OK;
}
REGISTER_CPU_KERNEL(kTile, TileCpuKernel);
} // namespace aicpu

View File

@ -0,0 +1,43 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef AICPU_KERNELS_NORMALIZED_TILE_H_
#define AICPU_KERNELS_NORMALIZED_TILE_H_
#include "cpu_ops_kernel.h"
namespace aicpu {
class TileCpuKernel : public CpuKernel {
public:
TileCpuKernel() = default;
~TileCpuKernel() override = default;
protected:
uint32_t Compute(CpuKernelContext &ctx) override;
private:
template <typename T, typename M>
void CopyMultipleTimes(const T *in_data, int64_t in_size, M multiplier, T *out_data);
template <typename T, typename M>
std::pair<int64_t, int64_t> TileOneDimension(const std::vector<int64_t> &in_dimensions, const T *in_data,
const M *multipliers, T *out_data, int64_t dimension);
template <typename T, typename M>
uint32_t TileCompute(CpuKernelContext &ctx);
};
} // namespace aicpu
#endif

View File

@ -0,0 +1,220 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "transpose.h"
#include "cpu_kernel_utils.h"
#include "securec.h"
#include "unsupported/Eigen/CXX11/Tensor"
#include "utils/kernel_util.h"
namespace {
const uint32_t kOutputNum = 1;
const uint32_t kInputNum = 2;
const char *kTranspose = "Transpose";
#define TRANSPOSE_COMPUTE_CASE(DTYPE, TYPE, CTX) \
case (DTYPE): { \
uint32_t result = TransposeCompute<TYPE>(CTX); \
if (result != KERNEL_STATUS_OK) { \
KERNEL_LOG_ERROR("Transpose kernel compute failed."); \
return result; \
} \
break; \
}
} // namespace
namespace aicpu {
uint32_t TransposeCpuKernel::GetTransposeValue(Tensor *tensor, std::vector<int64_t> &value) {
auto type = tensor->GetDataType();
if (type == DT_INT32) {
auto data = reinterpret_cast<int32_t *>(tensor->GetData());
for (unsigned int i = 0; i < tensor->NumElements(); i++) {
value.push_back(static_cast<int64_t>(*(data + i)));
}
} else if (type == DT_INT64) {
auto data = reinterpret_cast<int64_t *>(tensor->GetData());
for (unsigned int i = 0; i < tensor->NumElements(); i++) {
value.push_back(*(data + i));
}
} else {
return KERNEL_STATUS_PARAM_INVALID;
}
return KERNEL_STATUS_OK;
}
uint32_t TransposeCpuKernel::Compute(CpuKernelContext &ctx) {
KERNEL_HANDLE_ERROR(NormalCheck(ctx, kInputNum, kOutputNum), "[%s] check input and output failed.", kTranspose);
KERNEL_HANDLE_ERROR(TransposeParamCheck(ctx), "[%s] check params failed.", kTranspose);
auto x_type = ctx.Input(0)->GetDataType();
switch (x_type) {
TRANSPOSE_COMPUTE_CASE(DT_BOOL, bool, ctx)
TRANSPOSE_COMPUTE_CASE(DT_DOUBLE, double, ctx)
TRANSPOSE_COMPUTE_CASE(DT_UINT8, uint8_t, ctx)
TRANSPOSE_COMPUTE_CASE(DT_UINT16, uint16_t, ctx)
TRANSPOSE_COMPUTE_CASE(DT_UINT32, uint32_t, ctx)
TRANSPOSE_COMPUTE_CASE(DT_UINT64, uint64_t, ctx)
TRANSPOSE_COMPUTE_CASE(DT_INT8, int8_t, ctx)
TRANSPOSE_COMPUTE_CASE(DT_INT16, int16_t, ctx)
TRANSPOSE_COMPUTE_CASE(DT_INT32, int32_t, ctx)
TRANSPOSE_COMPUTE_CASE(DT_INT64, int64_t, ctx)
TRANSPOSE_COMPUTE_CASE(DT_FLOAT, float, ctx)
TRANSPOSE_COMPUTE_CASE(DT_FLOAT16, Eigen::half, ctx)
TRANSPOSE_COMPUTE_CASE(DT_COMPLEX64, std::complex<float>, ctx)
TRANSPOSE_COMPUTE_CASE(DT_COMPLEX128, std::complex<double>, ctx)
default:
KERNEL_LOG_ERROR("Transpose kernel data type [%s] not support.", DTypeStr(x_type).c_str());
return KERNEL_STATUS_PARAM_INVALID;
}
return KERNEL_STATUS_OK;
}
uint32_t TransposeCpuKernel::TransposeParamCheck(CpuKernelContext &ctx) {
std::vector<int64_t> shape_x = ctx.Input(0)->GetTensorShape()->GetDimSizes();
std::vector<int64_t> shape_perm = ctx.Input(1)->GetTensorShape()->GetDimSizes();
auto perm_tensor = ctx.Input(1);
auto y_tensor = ctx.Output(0);
  KERNEL_CHECK_FALSE((shape_perm.size() == 1), KERNEL_STATUS_PARAM_INVALID,
                     "Expected perm to be a 1-D tensor, but got a [%zu]-D tensor.", shape_perm.size())
KERNEL_CHECK_FALSE((perm_tensor->NumElements() == (unsigned int)shape_x.size()), KERNEL_STATUS_PARAM_INVALID,
"Expected the size of perm to be [%zu], but got [%zu].", shape_x.size(),
perm_tensor->NumElements())
KERNEL_CHECK_FALSE((GetTransposeValue(perm_tensor, perm) == KERNEL_STATUS_OK), KERNEL_STATUS_PARAM_INVALID,
"perm must be either int32 or int64, but got [%s].", DTypeStr(perm_tensor->GetDataType()).c_str())
KERNEL_CHECK_FALSE((shape_x.size() > 1), KERNEL_STATUS_PARAM_INVALID,
"Expected the dimension of x to be greater than 1-D, but got [%zu].", shape_x.size())
std::vector<int64_t> shape_y;
for (size_t i = 0; i < shape_x.size(); ++i) {
int64_t perm_value = perm.at(i);
if (shape_x.at(i) == 0) {
KERNEL_CHECK_FALSE((perm_value == 0), KERNEL_STATUS_PARAM_INVALID,
"Expected perm[%zu] == 0 (got %zu), when x shape[%zu] == 0.", i, perm_value, i)
} else {
KERNEL_CHECK_FALSE((0 <= perm_value && perm_value <= (unsigned int)shape_x.size() - 1),
KERNEL_STATUS_PARAM_INVALID, "Expected perm[%zu] in [0, %zu], but got %zu.", i, shape_x.size(),
perm_value)
}
int64_t temp_value = 0;
for (size_t j = 0; j < shape_x.size(); ++j) {
if ((unsigned int)perm.at(j) == i) {
break;
} else {
temp_value = j + 1;
KERNEL_CHECK_FALSE((temp_value < (unsigned int)shape_x.size()), KERNEL_STATUS_PARAM_INVALID,
"Expected perm value is unique.")
}
}
shape_y.push_back(shape_x.at(perm_value));
}
y_tensor->GetTensorShape()->SetDimSizes(shape_y);
return KERNEL_STATUS_OK;
}
template <typename T>
uint32_t TransposeCpuKernel::TransposeCompute(CpuKernelContext &ctx) {
auto x_data = ctx.Input(0)->GetData();
auto y_data = ctx.Output(0)->GetData();
std::vector<int64_t> shape_x = ctx.Input(0)->GetTensorShape()->GetDimSizes();
std::vector<int64_t> shape_y = ctx.Output(0)->GetTensorShape()->GetDimSizes();
auto input_data = reinterpret_cast<T *>(x_data);
auto output_data = reinterpret_cast<T *>(y_data);
int64_t input_dims = shape_x.size();
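  // Eigen's shuffle() needs the tensor rank as a compile-time constant, so dispatch on the
  // runtime rank and handle 2-D through 7-D inputs explicitly.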
switch (input_dims) {
case 2: {
typedef Eigen::TensorMap<Eigen::Tensor<T, 2, Eigen::RowMajor>, Eigen::Aligned> Eigen_Tensor_2D;
Eigen_Tensor_2D input_2D(input_data, shape_x.at(0), shape_x.at(1));
Eigen_Tensor_2D output_2D(output_data, shape_y.at(0), shape_y.at(1));
Eigen::array<Eigen::DenseIndex, 2> perm_2D;
for (size_t i = 0; i < 2; ++i) {
perm_2D[i] = perm.at(i);
}
output_2D = input_2D.shuffle(perm_2D);
break;
}
case 3: {
typedef Eigen::TensorMap<Eigen::Tensor<T, 3, Eigen::RowMajor>, Eigen::Aligned> Eigen_Tensor_3D;
Eigen_Tensor_3D input_3D(input_data, shape_x.at(0), shape_x.at(1), shape_x.at(2));
Eigen_Tensor_3D output_3D(output_data, shape_y.at(0), shape_y.at(1), shape_y.at(2));
Eigen::array<Eigen::DenseIndex, 3> perm_3D;
for (size_t i = 0; i < 3; ++i) {
perm_3D[i] = perm.at(i);
}
output_3D = input_3D.shuffle(perm_3D);
break;
}
case 4: {
typedef Eigen::TensorMap<Eigen::Tensor<T, 4, Eigen::RowMajor>, Eigen::Aligned> Eigen_Tensor_4D;
Eigen_Tensor_4D input_4D(input_data, shape_x.at(0), shape_x.at(1), shape_x.at(2), shape_x.at(3));
Eigen_Tensor_4D output_4D(output_data, shape_y.at(0), shape_y.at(1), shape_y.at(2), shape_y.at(3));
Eigen::array<Eigen::DenseIndex, 4> perm_4D;
for (size_t i = 0; i < 4; ++i) {
perm_4D[i] = perm.at(i);
}
output_4D = input_4D.shuffle(perm_4D);
break;
}
case 5: {
typedef Eigen::TensorMap<Eigen::Tensor<T, 5, Eigen::RowMajor>, Eigen::Aligned> Eigen_Tensor_5D;
Eigen_Tensor_5D input_5D(input_data, shape_x.at(0), shape_x.at(1), shape_x.at(2), shape_x.at(3), shape_x.at(4));
Eigen_Tensor_5D output_5D(output_data, shape_y.at(0), shape_y.at(1), shape_y.at(2), shape_y.at(3), shape_y.at(4));
Eigen::array<Eigen::DenseIndex, 5> perm_5D;
for (size_t i = 0; i < 5; ++i) {
perm_5D[i] = perm.at(i);
}
output_5D = input_5D.shuffle(perm_5D);
break;
}
case 6: {
typedef Eigen::TensorMap<Eigen::Tensor<T, 6, Eigen::RowMajor>, Eigen::Aligned> Eigen_Tensor_6D;
Eigen_Tensor_6D input_6D(input_data, shape_x.at(0), shape_x.at(1), shape_x.at(2), shape_x.at(3), shape_x.at(4),
shape_x.at(5));
Eigen_Tensor_6D output_6D(output_data, shape_y.at(0), shape_y.at(1), shape_y.at(2), shape_y.at(3), shape_y.at(4),
shape_y.at(5));
Eigen::array<Eigen::DenseIndex, 6> perm_6D;
for (size_t i = 0; i < 6; ++i) {
perm_6D[i] = perm.at(i);
}
output_6D = input_6D.shuffle(perm_6D);
break;
}
case 7: {
typedef Eigen::TensorMap<Eigen::Tensor<T, 7, Eigen::RowMajor>, Eigen::Aligned> Eigen_Tensor_7D;
Eigen_Tensor_7D input_7D(input_data, shape_x.at(0), shape_x.at(1), shape_x.at(2), shape_x.at(3), shape_x.at(4),
shape_x.at(5), shape_x.at(6));
Eigen_Tensor_7D output_7D(output_data, shape_y.at(0), shape_y.at(1), shape_y.at(2), shape_y.at(3), shape_y.at(4),
shape_y.at(5), shape_y.at(6));
Eigen::array<Eigen::DenseIndex, 7> perm_7D;
for (size_t i = 0; i < 7; ++i) {
perm_7D[i] = perm.at(i);
}
output_7D = input_7D.shuffle(perm_7D);
break;
}
default:
KERNEL_LOG_ERROR("[%s] : Unhandled input dimensions [%zu].", kTranspose, input_dims);
return KERNEL_STATUS_INNER_ERROR;
}
return KERNEL_STATUS_OK;
}
REGISTER_CPU_KERNEL(kTranspose, TransposeCpuKernel);
} // namespace aicpu

View File

@ -0,0 +1,39 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef AICPU_KERNELS_NORMALIZED_TRANSPOSE_H_
#define AICPU_KERNELS_NORMALIZED_TRANSPOSE_H_
#include <vector>
#include "cpu_ops_kernel.h"
namespace aicpu {
class TransposeCpuKernel : public CpuKernel {
public:
~TransposeCpuKernel() = default;
protected:
uint32_t Compute(CpuKernelContext &ctx) override;
private:
std::vector<int64_t> perm;
uint32_t TransposeParamCheck(CpuKernelContext &ctx);
uint32_t GetTransposeValue(Tensor *tensor, std::vector<int64_t> &value);
template <typename T>
uint32_t TransposeCompute(CpuKernelContext &ctx);
};
} // namespace aicpu
#endif  // AICPU_KERNELS_NORMALIZED_TRANSPOSE_H_

View File

@ -0,0 +1,127 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "tridiagonal_matmul.h"
#include <complex>
#include "Eigen/Core"
#include "Eigen/Dense"
#include "Eigen/LU"
#include "unsupported/Eigen/CXX11/Tensor"
#include "cpu_kernel_utils.h"
#include "kernel_log.h"
#include "status.h"
#include "utils/eigen_tensor.h"
#include "utils/kernel_util.h"
namespace {
constexpr uint32_t kInputNum = 4;
constexpr uint32_t kOutputNum = 1;
const char *kTridiagonalMatMul = "TridiagonalMatMul";
} // namespace
namespace aicpu {
uint32_t TridiagonalMatMulCpuKernel::Compute(CpuKernelContext &ctx) {
KERNEL_HANDLE_ERROR(NormalCheck(ctx, kInputNum, kOutputNum), "TridiagonalMatMul check input and output num failed.");
KERNEL_HANDLE_ERROR(TridiagonalMatMulDataAndTypeCheck(ctx),
"TridiagonalMatMul check input and output params failed.");
auto data_type = ctx.Input(0)->GetDataType();
switch (data_type) {
case DT_FLOAT16:
return TridiagonalMatMulCompute<Eigen::half>(ctx);
case DT_FLOAT:
return TridiagonalMatMulCompute<float>(ctx);
case DT_DOUBLE:
return TridiagonalMatMulCompute<double>(ctx);
case DT_COMPLEX64:
return TridiagonalMatMulCompute<std::complex<float>>(ctx);
case DT_COMPLEX128:
return TridiagonalMatMulCompute<std::complex<double>>(ctx);
default:
KERNEL_LOG_ERROR("Unsupported input data type[%s]", DTypeStr(data_type).c_str());
return KERNEL_STATUS_PARAM_INVALID;
}
return KERNEL_STATUS_OK;
}
uint32_t TridiagonalMatMulCpuKernel::TridiagonalMatMulDataAndTypeCheck(CpuKernelContext &ctx) {
DataType superdiag_type = ctx.Input(0)->GetDataType();
DataType maindiag_type = ctx.Input(1)->GetDataType();
DataType subdiag_type = ctx.Input(2)->GetDataType();
DataType rhs_type = ctx.Input(3)->GetDataType();
KERNEL_CHECK_FALSE((superdiag_type == maindiag_type && maindiag_type == subdiag_type && subdiag_type == rhs_type),
KERNEL_STATUS_PARAM_INVALID,
"The data type of input0 [%s], input1 [%s],input2 [%s] and input3 [%s] "
"need be same.",
DTypeStr(superdiag_type).c_str(), DTypeStr(maindiag_type).c_str(), DTypeStr(subdiag_type).c_str(),
DTypeStr(rhs_type).c_str())
return KERNEL_STATUS_OK;
}
template <typename T>
uint32_t TridiagonalMatMulCpuKernel::TridiagonalMatMulCompute(CpuKernelContext &ctx) {
auto superdiag_tensor = ctx.Input(0);
auto superdiag_tensor_shape = superdiag_tensor->GetTensorShape();
KERNEL_CHECK_FALSE((IsVector(superdiag_tensor_shape->GetDimSizes())), KERNEL_STATUS_PARAM_INVALID,
"invalid Input[superdiag]")
auto maindiag_tensor = ctx.Input(1);
auto maindiag_tensor_shape = maindiag_tensor->GetTensorShape();
KERNEL_CHECK_FALSE((IsVector(maindiag_tensor_shape->GetDimSizes())), KERNEL_STATUS_PARAM_INVALID,
"invalid Input[maindiag]")
auto subdiag_tensor = ctx.Input(2);
auto subdiag_tensor_shape = subdiag_tensor->GetTensorShape();
KERNEL_CHECK_FALSE((IsVector(subdiag_tensor_shape->GetDimSizes())), KERNEL_STATUS_PARAM_INVALID,
"invalid Input[subdiag]")
auto rhs_tensor = ctx.Input(3);
auto rhs_tensor_shape = rhs_tensor->GetTensorShape();
KERNEL_CHECK_FALSE((IsMatrix(rhs_tensor_shape->GetDimSizes())), KERNEL_STATUS_PARAM_INVALID, "invalid Input[rhs]")
auto superdiag_shape = superdiag_tensor_shape->GetDimSizes();
auto maindiag_shape = maindiag_tensor_shape->GetDimSizes();
auto subdiag_shape = subdiag_tensor_shape->GetDimSizes();
auto rhs_shape = rhs_tensor_shape->GetDimSizes();
int32_t superdiag_dims = superdiag_tensor_shape->GetDims();
int32_t maindiag_dims = maindiag_tensor_shape->GetDims();
int32_t subdiag_dims = subdiag_tensor_shape->GetDims();
int32_t rhs_dims = rhs_tensor_shape->GetDims();
int64_t length = rhs_shape[rhs_dims - 2];
KERNEL_CHECK_FALSE((superdiag_shape[superdiag_dims - 1] == length), KERNEL_STATUS_PARAM_INVALID,
"invalid Input superdiag length")
KERNEL_CHECK_FALSE((maindiag_shape[maindiag_dims - 1] == length), KERNEL_STATUS_PARAM_INVALID,
"invalid Input maindiag length")
KERNEL_CHECK_FALSE((subdiag_shape[subdiag_dims - 1] == length), KERNEL_STATUS_PARAM_INVALID,
"invalid Input subdiag length")
using VectorMap = Eigen::Map<Eigen::Matrix<T, Eigen::Dynamic, 1>>;
using MatrixMap = Eigen::Map<Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>>;
VectorMap superdiag(reinterpret_cast<T *>(superdiag_tensor->GetData()), superdiag_shape[superdiag_dims - 1], 1);
VectorMap maindiag(reinterpret_cast<T *>(maindiag_tensor->GetData()), maindiag_shape[maindiag_dims - 1], 1);
VectorMap subdiag(reinterpret_cast<T *>(subdiag_tensor->GetData()), subdiag_shape[subdiag_dims - 1], 1);
MatrixMap rhs(reinterpret_cast<T *>(rhs_tensor->GetData()), rhs_shape[rhs_dims - 2], rhs_shape[rhs_dims - 1]);
auto y_tensor = ctx.Output(0);
auto y_shape = y_tensor->GetTensorShape()->GetDimSizes();
int32_t y_dims = y_tensor->GetTensorShape()->GetDims();
MatrixMap y(reinterpret_cast<T *>(y_tensor->GetData()), y_shape[y_dims - 2], y_shape[y_dims - 1]);
y.array() = rhs.array().colwise() * maindiag.array();
for (int64_t i = 0; i < length - 1; i++) {
y.array().row(i) += rhs.array().row(i + 1) * superdiag(i);
y.array().row(i + 1) += rhs.array().row(i) * subdiag(i + 1);
}
return KERNEL_STATUS_OK;
}
REGISTER_CPU_KERNEL(kTridiagonalMatMul, TridiagonalMatMulCpuKernel);
} // namespace aicpu

View File

@ -0,0 +1,37 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef AICPU_KERNELS_NORMALIZED_TRIDIAGONALMATMUL_H_
#define AICPU_KERNELS_NORMALIZED_TRIDIAGONALMATMUL_H_
#include "cpu_ops_kernel.h"
namespace aicpu {
class TridiagonalMatMulCpuKernel : public CpuKernel {
public:
TridiagonalMatMulCpuKernel() = default;
~TridiagonalMatMulCpuKernel() = default;
protected:
uint32_t Compute(CpuKernelContext &ctx) override;
private:
template <typename T>
uint32_t TridiagonalMatMulCompute(CpuKernelContext &ctx);
uint32_t TridiagonalMatMulDataAndTypeCheck(CpuKernelContext &ctx);
};
} // namespace aicpu
#endif // AICPU_KERNELS_NORMALIZED_TRIDIAGONALMATMUL_H_

View File

@ -0,0 +1,93 @@
/**
* Copyright (c) Huawei Technologies Co., Ltd. 2022. All right reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "tril_indices.h"
#include <Eigen/Dense>
#include <algorithm>
#include <iostream>
#include <map>
#include "cpu_kernel_utils.h"
#include "utils/eigen_tensor.h"
#include "utils/kernel_util.h"
namespace {
const char *kTrilIndices = "TrilIndices";
#define TRIL_INDICES_COMPUTE_CASE(DTYPE, TYPE, CTX) \
case (DTYPE): { \
uint32_t result = DoCompute<TYPE>(CTX); \
if (result != KERNEL_STATUS_OK) { \
KERNEL_LOG_ERROR("TrilIndices kernel compute failed."); \
return result; \
} \
break; \
}
} // namespace
namespace aicpu {
uint32_t TrilIndicesCpuKernel::Compute(CpuKernelContext &ctx) {
Tensor *output = ctx.Output(0);
KERNEL_CHECK_NULLPTR(output, KERNEL_STATUS_PARAM_INVALID, "Get output failed.")
auto data_type = ctx.Output(0)->GetDataType();
switch (data_type) {
TRIL_INDICES_COMPUTE_CASE(DT_INT32, int32_t, ctx)
TRIL_INDICES_COMPUTE_CASE(DT_INT64, int64_t, ctx)
default:
KERNEL_LOG_ERROR("TrilIndices kernel data type [%s] not support.", DTypeStr(data_type).c_str());
return KERNEL_STATUS_PARAM_INVALID;
}
return KERNEL_STATUS_OK;
}
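// tril_size counts the lower-triangle entries as a trapezoid (the partially filled rows)
// plus col entries for every row below it that is completely filled.
// Example (row = 3, col = 3, offset = 0): tril_size = 6 and the flat output is
// [0, 1, 1, 2, 2, 2, 0, 0, 1, 0, 1, 2] -- row indices first, then column indices.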
template <typename T>
uint32_t TrilIndicesCpuKernel::DoCompute(CpuKernelContext &ctx) {
AttrValue *row_ptr = ctx.GetAttr("row");
AttrValue *col_ptr = ctx.GetAttr("col");
  AttrValue *offset_ptr = ctx.GetAttr("offset");
  KERNEL_CHECK_NULLPTR(row_ptr, KERNEL_STATUS_PARAM_INVALID, "Get attr [row] failed.")
  KERNEL_CHECK_NULLPTR(col_ptr, KERNEL_STATUS_PARAM_INVALID, "Get attr [col] failed.")
  int64_t row = row_ptr->GetInt();
  int64_t col = col_ptr->GetInt();
int64_t offset = (offset_ptr == nullptr) ? 0 : (offset_ptr->GetInt());
auto m_first_row = offset > 0 ? std::min<int64_t>(col, 1 + offset) : row + offset > 0;
auto m_last_row = std::max<int64_t>(0, std::min<int64_t>(col, row + offset));
auto n_row_all = std::max<int64_t>(0, std::min<int64_t>(row, row + offset));
auto n_row_trapezoid = (m_last_row - m_first_row + 1);
auto tril_size = (m_first_row + m_last_row) * n_row_trapezoid >> 1;
auto diff_row = n_row_all - n_row_trapezoid;
if (diff_row > 0) {
tril_size += diff_row * col;
}
T *output{static_cast<T *>(ctx.Output(0)->GetData())};
int64_t i = 0;
int64_t r = std::max<int64_t>(0, -offset), c = 0;
while (i < tril_size) {
output[i] = r;
output[tril_size + i++] = c;
c += 1;
if (c > r + offset || c >= col) {
r += 1;
c = 0;
}
}
return KERNEL_STATUS_OK;
}
REGISTER_CPU_KERNEL(kTrilIndices, TrilIndicesCpuKernel);
} // namespace aicpu

View File

@ -0,0 +1,40 @@
/**
* Copyright (c) Huawei Technologies Co., Ltd. 2022. All right reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef AICPU_KERNELS_NORMALIZED_TRIL_INDICES_H_
#define AICPU_KERNELS_NORMALIZED_TRIL_INDICES_H_
#include "cpu_ops_kernel.h"
#include "cpu_types.h"
#include "utils/bcast.h"
#include "utils/sparse_tensor.h"
namespace aicpu {
class TrilIndicesCpuKernel : public CpuKernel {
public:
TrilIndicesCpuKernel() = default;
~TrilIndicesCpuKernel() override = default;
protected:
uint32_t Compute(CpuKernelContext &ctx) override;
private:
template <typename T>
uint32_t DoCompute(CpuKernelContext &ctx);
int32_t offset = 0;
};
} // namespace aicpu
#endif  // AICPU_KERNELS_NORMALIZED_TRIL_INDICES_H_

View File

@ -0,0 +1,874 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "triplet_margin_loss.h"
#include <Eigen/Dense>
#include <algorithm>
#include <cmath>
#include <iostream>
#include <map>
#include "cpu_kernel_utils.h"
#include "unsupported/Eigen/CXX11/Tensor"
#include "utils/broadcast_iterator.h"
#include "utils/eigen_tensor.h"
#include "utils/kernel_util.h"
namespace {
const uint32_t kInputNum = 4;
const uint32_t kOutputNum = 1;
const int64_t kNoBroadcastValue = 1;
const char *kTripletMarginLoss = "TripletMarginLoss";
// when input data size is more than kParallelDataNum, use Parallel func
const int64_t kParallelDataNum = 28 * 1024;
const int64_t kParallelDataNumMid = 56 * 1024;
} // namespace
namespace aicpu {
uint32_t TripletMarginLossCpuKernel::Compute(CpuKernelContext &ctx) {
KERNEL_HANDLE_ERROR(NormalCheck(ctx, kInputNum, kOutputNum),
" TripletMarginLoss check input and output number failed.");
auto data_type_x = static_cast<DataType>(ctx.Input(0)->GetDataType());
auto data_type_positive = static_cast<DataType>(ctx.Input(1)->GetDataType());
auto data_type_negative = static_cast<DataType>(ctx.Input(2)->GetDataType());
if (data_type_x != data_type_negative || data_type_positive != data_type_negative ||
data_type_x != data_type_positive) {
    KERNEL_LOG_ERROR(
      "[%s] Data types of the inputs are required to be the same, but got "
      "[%s], [%s] and [%s].",
      ctx.GetOpType().c_str(), DTypeStr(data_type_x).c_str(), DTypeStr(data_type_positive).c_str(),
      DTypeStr(data_type_negative).c_str());
return KERNEL_STATUS_PARAM_INVALID;
}
AttrValue *Attr_p = ctx.GetAttr("p");
int p_value = (Attr_p == nullptr) ? 2 : Attr_p->GetInt();
float margin_value = *(reinterpret_cast<float *>(ctx.Input(3)->GetData()));
AttrValue *Attr_eps = ctx.GetAttr("eps");
float eps_value = (Attr_eps == nullptr) ? 1e-6 : Attr_eps->GetFloat();
AttrValue *Attr_swap = ctx.GetAttr("swap");
bool swap_value = (Attr_swap == nullptr) ? false : Attr_swap->GetBool();
AttrValue *Attr_red = ctx.GetAttr("reduction");
std::string reduction_value = (Attr_red == nullptr) ? "mean" : Attr_red->GetString();
Tensor *input_x = (ctx.Input(0));
Tensor *input_positive = (ctx.Input(1));
Tensor *input_negative = (ctx.Input(2));
const std::vector<int64_t> &shape_x = input_x->GetTensorShape()->GetDimSizes();
const std::vector<int64_t> &shape_positive = input_positive->GetTensorShape()->GetDimSizes();
const std::vector<int64_t> &shape_negative = input_negative->GetTensorShape()->GetDimSizes();
std::vector<int64_t> broadcast_shape;
std::vector<int64_t> broadcast_shape_x_and_positive;
(void)GetBroadcastShape(shape_x, shape_positive, broadcast_shape_x_and_positive);
(void)GetBroadcastShape(broadcast_shape_x_and_positive, shape_negative, broadcast_shape);
int64_t num_elements = 1;
for (size_t i = 0; i < broadcast_shape.size(); i++) {
num_elements *= broadcast_shape[i];
}
int64_t data_num_output_reduction_none = (num_elements) / (broadcast_shape[1]);
int64_t data_num_each_batch_input = (num_elements) / (broadcast_shape[0]);
int64_t data_num_each_batch_output_reduction_none = data_num_output_reduction_none / (broadcast_shape[0]);
int64_t batch_size = broadcast_shape[0];
int64_t once_compute_size = broadcast_shape[1];
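  // When the input shapes differ, each shape is reversed, padded with 1s up to the common rank
  // and reversed back, so the distance corrections below can tell along which dimensions an
  // input was broadcast.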
bool broadcast = false;
std::vector<int64_t> x_reshape_vector = shape_x;
std::vector<int64_t> positive_reshape_vector = shape_positive;
std::vector<int64_t> negative_reshape_vector = shape_negative;
if (shape_x != shape_positive || shape_x != shape_negative || shape_positive != shape_negative) {
broadcast = true;
std::reverse(x_reshape_vector.begin(), x_reshape_vector.end());
std::reverse(positive_reshape_vector.begin(), positive_reshape_vector.end());
std::reverse(negative_reshape_vector.begin(), negative_reshape_vector.end());
int64_t dim_num_x = input_x->GetTensorShape()->GetDims();
int64_t dim_num_positive = input_positive->GetTensorShape()->GetDims();
int64_t dim_num_negative = input_negative->GetTensorShape()->GetDims();
auto dims = std::max(dim_num_x, std::max(dim_num_positive, dim_num_negative));
if (dim_num_x < dims) x_reshape_vector.resize(dims, kNoBroadcastValue);
if (dim_num_positive < dims) positive_reshape_vector.resize(dims, kNoBroadcastValue);
if (dim_num_negative < dims) negative_reshape_vector.resize(dims, kNoBroadcastValue);
std::reverse(x_reshape_vector.begin(), x_reshape_vector.end());
std::reverse(positive_reshape_vector.begin(), positive_reshape_vector.end());
std::reverse(negative_reshape_vector.begin(), negative_reshape_vector.end());
}
switch (data_type_x) {
case DT_FLOAT16:
return TripletMarginLossComputeRealTypeFloat16<Eigen::half>(
ctx, p_value, margin_value, eps_value, swap_value, reduction_value, num_elements,
data_num_output_reduction_none, data_num_each_batch_input, data_num_each_batch_output_reduction_none,
batch_size, once_compute_size, broadcast, x_reshape_vector, positive_reshape_vector, negative_reshape_vector);
case DT_FLOAT:
return TripletMarginLossComputeRealType<float>(
ctx, p_value, margin_value, eps_value, swap_value, reduction_value, num_elements,
data_num_output_reduction_none, data_num_each_batch_input, data_num_each_batch_output_reduction_none,
batch_size, once_compute_size, broadcast, x_reshape_vector, positive_reshape_vector, negative_reshape_vector);
case DT_DOUBLE:
return TripletMarginLossComputeRealType<double>(
ctx, p_value, margin_value, eps_value, swap_value, reduction_value, num_elements,
data_num_output_reduction_none, data_num_each_batch_input, data_num_each_batch_output_reduction_none,
batch_size, once_compute_size, broadcast, x_reshape_vector, positive_reshape_vector, negative_reshape_vector);
case DT_INT8:
return TripletMarginLossComputeRealType<int8_t>(
ctx, p_value, margin_value, eps_value, swap_value, reduction_value, num_elements,
data_num_output_reduction_none, data_num_each_batch_input, data_num_each_batch_output_reduction_none,
batch_size, once_compute_size, broadcast, x_reshape_vector, positive_reshape_vector, negative_reshape_vector);
case DT_INT16:
return TripletMarginLossComputeRealType<int16_t>(
ctx, p_value, margin_value, eps_value, swap_value, reduction_value, num_elements,
data_num_output_reduction_none, data_num_each_batch_input, data_num_each_batch_output_reduction_none,
batch_size, once_compute_size, broadcast, x_reshape_vector, positive_reshape_vector, negative_reshape_vector);
case DT_INT32:
return TripletMarginLossComputeRealType<int32_t>(
ctx, p_value, margin_value, eps_value, swap_value, reduction_value, num_elements,
data_num_output_reduction_none, data_num_each_batch_input, data_num_each_batch_output_reduction_none,
batch_size, once_compute_size, broadcast, x_reshape_vector, positive_reshape_vector, negative_reshape_vector);
case DT_INT64:
return TripletMarginLossComputeRealType<int64_t>(
ctx, p_value, margin_value, eps_value, swap_value, reduction_value, num_elements,
data_num_output_reduction_none, data_num_each_batch_input, data_num_each_batch_output_reduction_none,
batch_size, once_compute_size, broadcast, x_reshape_vector, positive_reshape_vector, negative_reshape_vector);
case DT_UINT8:
return TripletMarginLossComputeRealType<uint8_t>(
ctx, p_value, margin_value, eps_value, swap_value, reduction_value, num_elements,
data_num_output_reduction_none, data_num_each_batch_input, data_num_each_batch_output_reduction_none,
batch_size, once_compute_size, broadcast, x_reshape_vector, positive_reshape_vector, negative_reshape_vector);
case DT_UINT16:
return TripletMarginLossComputeRealType<uint16_t>(
ctx, p_value, margin_value, eps_value, swap_value, reduction_value, num_elements,
data_num_output_reduction_none, data_num_each_batch_input, data_num_each_batch_output_reduction_none,
batch_size, once_compute_size, broadcast, x_reshape_vector, positive_reshape_vector, negative_reshape_vector);
case DT_UINT32:
return TripletMarginLossComputeRealType<uint32_t>(
ctx, p_value, margin_value, eps_value, swap_value, reduction_value, num_elements,
data_num_output_reduction_none, data_num_each_batch_input, data_num_each_batch_output_reduction_none,
batch_size, once_compute_size, broadcast, x_reshape_vector, positive_reshape_vector, negative_reshape_vector);
case DT_UINT64:
return TripletMarginLossComputeRealType<uint64_t>(
ctx, p_value, margin_value, eps_value, swap_value, reduction_value, num_elements,
data_num_output_reduction_none, data_num_each_batch_input, data_num_each_batch_output_reduction_none,
batch_size, once_compute_size, broadcast, x_reshape_vector, positive_reshape_vector, negative_reshape_vector);
case DT_COMPLEX128:
return TripletMarginLossComputeComplexType<std::complex<double>>(
ctx, p_value, margin_value, eps_value, swap_value, reduction_value, num_elements,
data_num_output_reduction_none, data_num_each_batch_input, data_num_each_batch_output_reduction_none,
batch_size, once_compute_size, broadcast, x_reshape_vector, positive_reshape_vector, negative_reshape_vector);
case DT_COMPLEX64:
return TripletMarginLossComputeComplexType<std::complex<float>>(
ctx, p_value, margin_value, eps_value, swap_value, reduction_value, num_elements,
data_num_output_reduction_none, data_num_each_batch_input, data_num_each_batch_output_reduction_none,
batch_size, once_compute_size, broadcast, x_reshape_vector, positive_reshape_vector, negative_reshape_vector);
default:
KERNEL_LOG_ERROR("[%s] Data type of input is not supported, input data type is [%s].", ctx.GetOpType().c_str(),
DTypeStr(data_type_x).c_str());
return KERNEL_STATUS_PARAM_INVALID;
}
return KERNEL_STATUS_OK;
}
template <typename T>
uint32_t TripletMarginLossCpuKernel::TripletMarginLossComputeRealType(
CpuKernelContext &ctx, int p_value, float margin_value, float eps_value, bool swap_value, std::string reduction_value,
int64_t num_elements, int64_t data_num_output_reduction_none, int64_t data_num_each_batch_input,
int64_t data_num_each_batch_output_reduction_none, int64_t batch_size, int64_t once_compute_size, bool broadcast,
std::vector<int64_t> x_reshape_vector, std::vector<int64_t> positive_reshape_vector,
std::vector<int64_t> negative_reshape_vector) {
constexpr int ADULT_AGE = 4;
Tensor *input_x = (ctx.Input(0));
Tensor *input_positive = (ctx.Input(1));
Tensor *input_negative = (ctx.Input(2));
Tensor *output = (ctx.Output(0));
const std::vector<int64_t> &shape_x = input_x->GetTensorShape()->GetDimSizes();
const std::vector<int64_t> &shape_positive = input_positive->GetTensorShape()->GetDimSizes();
const std::vector<int64_t> &shape_negative = input_negative->GetTensorShape()->GetDimSizes();
T *x_data = reinterpret_cast<T *>(input_x->GetData());
T *positive_data = reinterpret_cast<T *>(input_positive->GetData());
T *negative_data = reinterpret_cast<T *>(input_negative->GetData());
std::vector<int64_t> broadcast_shape;
std::vector<int64_t> broadcast_shape_x_and_positive;
(void)GetBroadcastShape(shape_x, shape_positive, broadcast_shape_x_and_positive);
(void)GetBroadcastShape(broadcast_shape_x_and_positive, shape_negative, broadcast_shape);
std::vector<T> x_broadcast_tensor;
std::vector<T> positive_broadcast_tensor;
std::vector<T> negative_broadcast_tensor;
if (broadcast == true) {
auto shape_x1 = shape_x;
auto shape_x2 = shape_x;
auto shape_positive1 = shape_positive;
auto shape_negative1 = shape_negative;
auto broadcast_shape1 = broadcast_shape;
auto broadcast_shape2 = broadcast_shape;
BroadcastIterator iter1(shape_x1, shape_positive1, broadcast_shape1);
BroadcastIterator iter2(shape_x2, shape_negative1, broadcast_shape2);
iter1.SetPos(0);
iter2.SetPos(0);
for (int64_t i = 0; i < num_elements; i++) {
x_broadcast_tensor.push_back(x_data[iter1.GetInputPosA()]);
positive_broadcast_tensor.push_back(positive_data[iter1.GetInputPosB()]);
negative_broadcast_tensor.push_back(negative_data[iter2.GetInputPosB()]);
iter1.GenNextPos();
iter2.GenNextPos();
}
x_data = x_broadcast_tensor.data();
positive_data = positive_broadcast_tensor.data();
negative_data = negative_broadcast_tensor.data();
}
auto output_data = reinterpret_cast<float *>(output->GetData());
Eigen::Array<float, Eigen::Dynamic, 1> output_reduction_none(data_num_output_reduction_none, 1);
float *output_reduction_none_data = reinterpret_cast<float *>(output_reduction_none.data());
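  // Per sample: d(a, p) and d(a, n) are p-norm distances of the eps-shifted differences along
  // dim 1; with swap enabled the smaller of d(a, n) and d(p, n) is used, and the unreduced
  // loss is max(d(a, p) + margin - d(a, n), 0).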
auto shard_triplet_margin_loss = [&](int64_t start, int64_t end) {
Eigen::Array<float, Eigen::Dynamic, 1> calculate_positive_distance(once_compute_size, 1);
Eigen::Array<float, Eigen::Dynamic, 1> calculate_negative_distance(once_compute_size, 1);
Eigen::Array<float, Eigen::Dynamic, 1> calculate_swap_distance(once_compute_size, 1);
float *calculate_positive_distance_data = reinterpret_cast<float *>(calculate_positive_distance.data());
float *calculate_negative_distance_data = reinterpret_cast<float *>(calculate_negative_distance.data());
float *calculate_swap_distance_data = reinterpret_cast<float *>(calculate_swap_distance.data());
int64_t once_compute_thread_size = (end - start);
float positive_distance;
float negative_distance;
float swap_distance;
float temp1;
float temp2;
float temp3;
if (data_num_each_batch_input == 0) {
KERNEL_LOG_ERROR("data_num_each_batch_input could not be 0.");
}
for (int64_t n = 0; n < once_compute_thread_size / data_num_each_batch_input; n++) {
int64_t i = start / data_num_each_batch_input;
for (int64_t j = 0; j < data_num_each_batch_output_reduction_none; j++) {
for (int64_t k = 0; k < once_compute_size; k++) {
*(calculate_positive_distance_data + k) =
eps_value +
static_cast<float>(
*(x_data + i * data_num_each_batch_input + j + k * data_num_each_batch_output_reduction_none)) -
static_cast<float>(
*(positive_data + i * data_num_each_batch_input + j + k * data_num_each_batch_output_reduction_none));
*(calculate_negative_distance_data + k) =
eps_value +
static_cast<float>(
*(x_data + i * data_num_each_batch_input + j + k * data_num_each_batch_output_reduction_none)) -
static_cast<float>(
*(negative_data + i * data_num_each_batch_input + j + k * data_num_each_batch_output_reduction_none));
if (swap_value == true) {
*(calculate_swap_distance_data + k) =
eps_value +
static_cast<float>(
*(positive_data + i * data_num_each_batch_input + j + k * data_num_each_batch_output_reduction_none)) -
static_cast<float>(
*(negative_data + i * data_num_each_batch_input + j + k * data_num_each_batch_output_reduction_none));
}
}
calculate_positive_distance = (calculate_positive_distance).abs();
calculate_negative_distance = (calculate_negative_distance).abs();
for (int64_t n = 0; n < once_compute_size; n++) {
temp1 = *(calculate_positive_distance_data + n);
temp2 = *(calculate_negative_distance_data + n);
for (int64_t l = 1; l < p_value; l++) {
*(calculate_positive_distance_data + n) = *(calculate_positive_distance_data + n) * temp1;
*(calculate_negative_distance_data + n) = *(calculate_negative_distance_data + n) * temp2;
}
}
positive_distance =
std::pow(static_cast<double>(calculate_positive_distance.sum()), (1 / static_cast<float>(p_value)));
negative_distance =
std::pow(static_cast<double>(calculate_negative_distance.sum()), (1 / static_cast<float>(p_value)));
if (broadcast == true) {
if (x_reshape_vector[1] == 1 && positive_reshape_vector[1] == 1 && broadcast_shape[1] != 1) {
positive_distance /= std::pow(broadcast_shape[1], (1 / static_cast<float>(p_value)));
}
if (x_reshape_vector[1] == 1 && negative_reshape_vector[1] == 1 && broadcast_shape[1] != 1) {
negative_distance /= std::pow(broadcast_shape[1], (1 / static_cast<float>(p_value)));
}
}
if (swap_value == true) {
calculate_swap_distance = ((calculate_swap_distance)).abs();
for (int64_t n = 0; n < once_compute_size; n++) {
temp3 = *(calculate_swap_distance_data + n);
for (int64_t l = 1; l < p_value; l++) {
*(calculate_swap_distance_data + n) = *(calculate_swap_distance_data + n) * temp3;
}
}
swap_distance =
std::pow(static_cast<double>(calculate_swap_distance.sum()), (1 / static_cast<float>(p_value)));
if (broadcast == true) {
if (positive_reshape_vector[1] == 1 && negative_reshape_vector[1] == 1 && broadcast_shape[1] != 1) {
swap_distance /= std::pow(broadcast_shape[1], (1 / static_cast<float>(p_value)));
}
}
negative_distance = (negative_distance < swap_distance) ? negative_distance : swap_distance;
}
*(output_reduction_none_data + data_num_each_batch_output_reduction_none * i + j) =
(positive_distance + margin_value - negative_distance > 0)
? (positive_distance + margin_value - negative_distance)
: 0;
}
start += data_num_each_batch_input;
}
};
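  // Inputs above kParallelDataNum bytes are sharded across the CPU cores; the shard length is
  // a multiple of data_num_each_batch_input so each shard processes whole batches.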
if (num_elements * sizeof(T) > kParallelDataNum) {
uint32_t min_core_num = 1;
uint32_t max_core_num = std::max(min_core_num, aicpu::CpuKernelUtils::GetCPUNum(ctx) - kResvCpuNum);
if (num_elements * sizeof(T) <= kParallelDataNumMid) {
max_core_num = std::min(max_core_num, 4U); // up to 4 cpu cores
}
if (max_core_num == 0) {
KERNEL_LOG_ERROR("max_core_num could not be 0.");
}
CpuKernelUtils::ParallelFor(ctx, num_elements,
data_num_each_batch_input * ADULT_AGE * (batch_size / max_core_num + 1),
shard_triplet_margin_loss);
} else {
Eigen::Array<float, Eigen::Dynamic, 1> calculate_positive_distance(once_compute_size, 1);
Eigen::Array<float, Eigen::Dynamic, 1> calculate_negative_distance(once_compute_size, 1);
Eigen::Array<float, Eigen::Dynamic, 1> calculate_swap_distance(once_compute_size, 1);
float *calculate_positive_distance_data = reinterpret_cast<float *>(calculate_positive_distance.data());
float *calculate_negative_distance_data = reinterpret_cast<float *>(calculate_negative_distance.data());
float *calculate_swap_distance_data = reinterpret_cast<float *>(calculate_swap_distance.data());
float positive_distance;
float negative_distance;
float swap_distance;
float temp1;
float temp2;
float temp3;
for (int64_t i = 0; i < batch_size; i++) {
for (int64_t j = 0; j < data_num_each_batch_output_reduction_none; j++) {
for (int64_t k = 0; k < once_compute_size; k++) {
*(calculate_positive_distance_data + k) =
eps_value +
static_cast<float>(
*(x_data + i * data_num_each_batch_input + j + k * data_num_each_batch_output_reduction_none)) -
static_cast<float>(
*(positive_data + i * data_num_each_batch_input + j + k * data_num_each_batch_output_reduction_none));
*(calculate_negative_distance_data + k) =
eps_value +
static_cast<float>(
*(x_data + i * data_num_each_batch_input + j + k * data_num_each_batch_output_reduction_none)) -
static_cast<float>(
*(negative_data + i * data_num_each_batch_input + j + k * data_num_each_batch_output_reduction_none));
if (swap_value == true) {
*(calculate_swap_distance_data + k) =
eps_value +
static_cast<float>(
*(positive_data + i * data_num_each_batch_input + j + k * data_num_each_batch_output_reduction_none)) -
static_cast<float>(
*(negative_data + i * data_num_each_batch_input + j + k * data_num_each_batch_output_reduction_none));
}
}
calculate_positive_distance = (calculate_positive_distance).abs();
calculate_negative_distance = (calculate_negative_distance).abs();
for (int64_t n = 0; n < once_compute_size; n++) {
temp1 = *(calculate_positive_distance_data + n);
temp2 = *(calculate_negative_distance_data + n);
for (int64_t l = 1; l < p_value; l++) {
*(calculate_positive_distance_data + n) = *(calculate_positive_distance_data + n) * temp1;
*(calculate_negative_distance_data + n) = *(calculate_negative_distance_data + n) * temp2;
}
}
positive_distance =
std::pow(static_cast<double>(calculate_positive_distance.sum()), (1 / static_cast<float>(p_value)));
negative_distance =
std::pow(static_cast<double>(calculate_negative_distance.sum()), (1 / static_cast<float>(p_value)));
if (broadcast == true) {
if (x_reshape_vector[1] == 1 && positive_reshape_vector[1] == 1 && broadcast_shape[1] != 1) {
positive_distance /= std::pow(broadcast_shape[1], (1 / static_cast<float>(p_value)));
}
if (x_reshape_vector[1] == 1 && negative_reshape_vector[1] == 1 && broadcast_shape[1] != 1) {
negative_distance /= std::pow(broadcast_shape[1], (1 / static_cast<float>(p_value)));
}
}
if (swap_value == true) {
calculate_swap_distance = ((calculate_swap_distance)).abs();
for (int64_t n = 0; n < once_compute_size; n++) {
temp3 = *(calculate_swap_distance_data + n);
for (int64_t l = 1; l < p_value; l++) {
*(calculate_swap_distance_data + n) = *(calculate_swap_distance_data + n) * temp3;
}
}
swap_distance =
std::pow(static_cast<double>(calculate_swap_distance.sum()), (1 / static_cast<float>(p_value)));
if (broadcast == true) {
if (positive_reshape_vector[1] == 1 && negative_reshape_vector[1] == 1 && broadcast_shape[1] != 1) {
swap_distance /= std::pow(broadcast_shape[1], (1 / static_cast<float>(p_value)));
}
}
negative_distance = (negative_distance < swap_distance) ? negative_distance : swap_distance;
}
*(output_reduction_none_data + data_num_each_batch_output_reduction_none * i + j) =
(positive_distance + margin_value - negative_distance > 0)
? (positive_distance + margin_value - negative_distance)
: 0;
}
}
}
if (reduction_value == "none") {
for (int64_t i = 0; i < data_num_output_reduction_none; i++) {
*(output_data + i) = *(output_reduction_none_data + i);
}
}
if (reduction_value == "mean") {
*(output_data) = (output_reduction_none.mean());
}
if (reduction_value == "sum") {
*(output_data) = (output_reduction_none.sum());
}
return KERNEL_STATUS_OK;
}
template <typename T>
uint32_t TripletMarginLossCpuKernel::TripletMarginLossComputeComplexType(
CpuKernelContext &ctx, int p_value, float margin_value, float eps_value, bool swap_value, std::string reduction_value,
int64_t num_elements, int64_t data_num_output_reduction_none, int64_t data_num_each_batch_input,
int64_t data_num_each_batch_output_reduction_none, int64_t batch_size, int64_t once_compute_size, bool broadcast,
std::vector<int64_t> x_reshape_vector, std::vector<int64_t> positive_reshape_vector,
std::vector<int64_t> negative_reshape_vector) {
constexpr int ADULT_AGE = 4;
Tensor *input_x = (ctx.Input(0));
Tensor *input_positive = (ctx.Input(1));
Tensor *input_negative = (ctx.Input(2));
Tensor *output = (ctx.Output(0));
const std::vector<int64_t> &shape_x = input_x->GetTensorShape()->GetDimSizes();
const std::vector<int64_t> &shape_positive = input_positive->GetTensorShape()->GetDimSizes();
const std::vector<int64_t> &shape_negative = input_negative->GetTensorShape()->GetDimSizes();
T *x_data = reinterpret_cast<T *>(input_x->GetData());
T *positive_data = reinterpret_cast<T *>(input_positive->GetData());
T *negative_data = reinterpret_cast<T *>(input_negative->GetData());
std::vector<int64_t> broadcast_shape;
std::vector<int64_t> broadcast_shape_x_and_positive;
(void)GetBroadcastShape(shape_x, shape_positive, broadcast_shape_x_and_positive);
(void)GetBroadcastShape(broadcast_shape_x_and_positive, shape_negative, broadcast_shape);
std::vector<T> x_broadcast_tensor;
std::vector<T> positive_broadcast_tensor;
std::vector<T> negative_broadcast_tensor;
if (broadcast == true) {
auto shape_x1 = shape_x;
auto shape_x2 = shape_x;
auto shape_positive1 = shape_positive;
auto shape_negative1 = shape_negative;
auto broadcast_shape1 = broadcast_shape;
auto broadcast_shape2 = broadcast_shape;
BroadcastIterator iter1(shape_x1, shape_positive1, broadcast_shape1);
BroadcastIterator iter2(shape_x2, shape_negative1, broadcast_shape2);
iter1.SetPos(0);
iter2.SetPos(0);
for (int64_t i = 0; i < num_elements; i++) {
x_broadcast_tensor.push_back(x_data[iter1.GetInputPosA()]);
positive_broadcast_tensor.push_back(positive_data[iter1.GetInputPosB()]);
negative_broadcast_tensor.push_back(negative_data[iter2.GetInputPosB()]);
iter1.GenNextPos();
iter2.GenNextPos();
}
x_data = x_broadcast_tensor.data();
positive_data = positive_broadcast_tensor.data();
negative_data = negative_broadcast_tensor.data();
}
auto output_data = reinterpret_cast<float *>(output->GetData());
Eigen::Array<float, Eigen::Dynamic, 1> output_reduction_none(data_num_output_reduction_none, 1);
float *output_reduction_none_data = reinterpret_cast<float *>(output_reduction_none.data());
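  // For complex inputs the elementwise modulus sqrt(z * conj(z)) is taken before the p-norm
  // reduction, so the resulting distances are real-valued.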
auto shard_triplet_margin_loss = [&](int64_t start, int64_t end) {
Eigen::Array<T, Eigen::Dynamic, 1> calculate_positive_distance(once_compute_size, 1);
Eigen::Array<T, Eigen::Dynamic, 1> calculate_negative_distance(once_compute_size, 1);
Eigen::Array<T, Eigen::Dynamic, 1> calculate_swap_distance(once_compute_size, 1);
T *calculate_positive_distance_data = reinterpret_cast<T *>(calculate_positive_distance.data());
T *calculate_negative_distance_data = reinterpret_cast<T *>(calculate_negative_distance.data());
T *calculate_swap_distance_data = reinterpret_cast<T *>(calculate_swap_distance.data());
int64_t once_compute_thread_size = end - start;
float positive_distance;
float negative_distance;
float swap_distance;
if (data_num_each_batch_input == 0) {
KERNEL_LOG_ERROR("data_num_each_batch_input could not be 0.");
}
for (int64_t n = 0; n < (once_compute_thread_size) / data_num_each_batch_input; n++) {
int64_t i = start / data_num_each_batch_input;
for (int64_t j = 0; j < data_num_each_batch_output_reduction_none; j++) {
for (int64_t k = 0; k < once_compute_size; k++) {
*(calculate_positive_distance_data + k) =
static_cast<T>(eps_value) +
(*(x_data + i * data_num_each_batch_input + j + k * data_num_each_batch_output_reduction_none)) -
(*(positive_data + i * data_num_each_batch_input + j + k * data_num_each_batch_output_reduction_none));
*(calculate_negative_distance_data + k) =
static_cast<T>(eps_value) +
(*(x_data + i * data_num_each_batch_input + j + k * data_num_each_batch_output_reduction_none)) -
(*(negative_data + i * data_num_each_batch_input + j + k * data_num_each_batch_output_reduction_none));
if (swap_value == true) {
*(calculate_swap_distance_data + k) =
static_cast<T>(eps_value) +
(*(positive_data + i * data_num_each_batch_input + j + k * data_num_each_batch_output_reduction_none)) -
(*(negative_data + i * data_num_each_batch_input + j + k * data_num_each_batch_output_reduction_none));
}
}
auto calculate_positive_distance_float =
(calculate_positive_distance * (calculate_positive_distance.matrix().conjugate().array())).real().sqrt();
auto calculate_negative_distance_float =
(calculate_negative_distance * (calculate_negative_distance.matrix().conjugate().array())).real().sqrt();
positive_distance =
std::pow(calculate_positive_distance_float.pow(p_value).sum(), 1 / static_cast<float>(p_value));
negative_distance =
std::pow(calculate_negative_distance_float.pow(p_value).sum(), 1 / static_cast<float>(p_value));
if (broadcast == true) {
if (x_reshape_vector[1] == 1 && positive_reshape_vector[1] == 1 && broadcast_shape[1] != 1) {
positive_distance /= std::pow(broadcast_shape[1], (1 / static_cast<float>(p_value)));
}
if (x_reshape_vector[1] == 1 && negative_reshape_vector[1] == 1 && broadcast_shape[1] != 1) {
negative_distance /= std::pow(broadcast_shape[1], (1 / static_cast<float>(p_value)));
}
}
if (swap_value == true) {
auto calculate_swap_distance_float =
(calculate_swap_distance * (calculate_swap_distance.matrix().conjugate().array())).real().sqrt();
swap_distance = std::pow(calculate_swap_distance_float.pow(p_value).sum(), 1 / static_cast<float>(p_value));
if (broadcast == true) {
if (positive_reshape_vector[1] == 1 && negative_reshape_vector[1] == 1 && broadcast_shape[1] != 1) {
swap_distance /= std::pow(broadcast_shape[1], (1 / static_cast<float>(p_value)));
}
}
negative_distance = (negative_distance < swap_distance) ? negative_distance : swap_distance;
}
*(output_reduction_none_data + data_num_each_batch_output_reduction_none * i + j) =
(positive_distance + margin_value - negative_distance > 0)
? (positive_distance + margin_value - negative_distance)
: 0;
}
start += data_num_each_batch_input;
}
};
if (num_elements * sizeof(T) > kParallelDataNum) {
uint32_t min_core_num = 1;
uint32_t max_core_num = std::max(min_core_num, aicpu::CpuKernelUtils::GetCPUNum(ctx) - kResvCpuNum);
if (num_elements * sizeof(T) <= kParallelDataNumMid) {
max_core_num = std::min(max_core_num, 4U); // up to 4 cpu cores
}
if (max_core_num == 0) {
KERNEL_LOG_ERROR("max_core_num could not be 0.");
}
CpuKernelUtils::ParallelFor(ctx, num_elements,
data_num_each_batch_input * ADULT_AGE * (batch_size / max_core_num + 1),
shard_triplet_margin_loss);
} else {
Eigen::Array<T, Eigen::Dynamic, 1> calculate_positive_distance(once_compute_size, 1);
Eigen::Array<T, Eigen::Dynamic, 1> calculate_negative_distance(once_compute_size, 1);
Eigen::Array<T, Eigen::Dynamic, 1> calculate_swap_distance(once_compute_size, 1);
T *calculate_positive_distance_data = reinterpret_cast<T *>(calculate_positive_distance.data());
T *calculate_negative_distance_data = reinterpret_cast<T *>(calculate_negative_distance.data());
T *calculate_swap_distance_data = reinterpret_cast<T *>(calculate_swap_distance.data());
for (int64_t i = 0; i < batch_size; i++) {
for (int64_t j = 0; j < data_num_each_batch_output_reduction_none; j++) {
for (int64_t k = 0; k < once_compute_size; k++) {
*(calculate_positive_distance_data + k) =
static_cast<T>(eps_value) +
(*(x_data + i * data_num_each_batch_input + j + k * data_num_each_batch_output_reduction_none)) -
(*(positive_data + i * data_num_each_batch_input + j + k * data_num_each_batch_output_reduction_none));
*(calculate_negative_distance_data + k) =
static_cast<T>(eps_value) +
(*(x_data + i * data_num_each_batch_input + j + k * data_num_each_batch_output_reduction_none)) -
(*(negative_data + i * data_num_each_batch_input + j + k * data_num_each_batch_output_reduction_none));
if (swap_value == true) {
*(calculate_swap_distance_data + k) =
static_cast<T>(eps_value) +
(*(positive_data + i * data_num_each_batch_input + j + k * data_num_each_batch_output_reduction_none)) -
(*(negative_data + i * data_num_each_batch_input + j + k * data_num_each_batch_output_reduction_none));
}
}
float positive_distance;
float negative_distance;
float swap_distance;
auto calculate_positive_distance_float =
(calculate_positive_distance * (calculate_positive_distance.matrix().conjugate().array())).real().sqrt();
auto calculate_negative_distance_float =
(calculate_negative_distance * (calculate_negative_distance.matrix().conjugate().array())).real().sqrt();
positive_distance =
std::pow(calculate_positive_distance_float.pow(p_value).sum(), 1 / static_cast<float>(p_value));
negative_distance =
std::pow(calculate_negative_distance_float.pow(p_value).sum(), 1 / static_cast<float>(p_value));
if (broadcast == true) {
if (x_reshape_vector[1] == 1 && positive_reshape_vector[1] == 1 && broadcast_shape[1] != 1) {
positive_distance /= std::pow(broadcast_shape[1], (1 / static_cast<float>(p_value)));
}
if (x_reshape_vector[1] == 1 && negative_reshape_vector[1] == 1 && broadcast_shape[1] != 1) {
negative_distance /= std::pow(broadcast_shape[1], (1 / static_cast<float>(p_value)));
}
}
if (swap_value == true) {
auto calculate_swap_distance_float =
(calculate_swap_distance * (calculate_swap_distance.matrix().conjugate().array())).real().sqrt();
swap_distance = std::pow(calculate_swap_distance_float.pow(p_value).sum(), 1 / static_cast<float>(p_value));
if (broadcast == true) {
if (positive_reshape_vector[1] == 1 && negative_reshape_vector[1] == 1 && broadcast_shape[1] != 1) {
swap_distance /= std::pow(broadcast_shape[1], (1 / static_cast<float>(p_value)));
}
}
negative_distance = (negative_distance < swap_distance) ? negative_distance : swap_distance;
}
*(output_reduction_none_data + data_num_each_batch_output_reduction_none * i + j) =
(positive_distance + margin_value - negative_distance > 0)
? positive_distance + margin_value - negative_distance
: 0;
}
}
}
if (reduction_value == "none") {
for (int64_t i = 0; i < data_num_output_reduction_none; i++) {
*(output_data + i) = *(output_reduction_none_data + i);
}
}
if (reduction_value == "mean") {
*(output_data) = (output_reduction_none.mean());
}
if (reduction_value == "sum") {
*(output_data) = (output_reduction_none.sum());
}
return KERNEL_STATUS_OK;
}
template <typename T>
uint32_t TripletMarginLossCpuKernel::TripletMarginLossComputeRealTypeFloat16(
CpuKernelContext &ctx, int p_value, float margin_value, float eps_value, bool swap_value, std::string reduction_value,
int64_t num_elements, int64_t data_num_output_reduction_none, int64_t data_num_each_batch_input,
int64_t data_num_each_batch_output_reduction_none, int64_t batch_size, int64_t once_compute_size, bool broadcast,
std::vector<int64_t> x_reshape_vector, std::vector<int64_t> positive_reshape_vector,
std::vector<int64_t> negative_reshape_vector) {
constexpr int ADULT_AGE = 4;
Tensor *input_x = (ctx.Input(0));
Tensor *input_positive = (ctx.Input(1));
Tensor *input_negative = (ctx.Input(2));
Tensor *output = (ctx.Output(0));
const std::vector<int64_t> &shape_x = input_x->GetTensorShape()->GetDimSizes();
const std::vector<int64_t> &shape_positive = input_positive->GetTensorShape()->GetDimSizes();
const std::vector<int64_t> &shape_negative = input_negative->GetTensorShape()->GetDimSizes();
T *x_data = reinterpret_cast<T *>(input_x->GetData());
T *positive_data = reinterpret_cast<T *>(input_positive->GetData());
T *negative_data = reinterpret_cast<T *>(input_negative->GetData());
std::vector<int64_t> broadcast_shape;
std::vector<int64_t> broadcast_shape_x_and_positive;
(void)GetBroadcastShape(shape_x, shape_positive, broadcast_shape_x_and_positive);
(void)GetBroadcastShape(broadcast_shape_x_and_positive, shape_negative, broadcast_shape);
std::vector<T> x_broadcast_tensor;
std::vector<T> positive_broadcast_tensor;
std::vector<T> negative_broadcast_tensor;
if (broadcast == true) {
auto shape_x1 = shape_x;
auto shape_x2 = shape_x;
auto shape_positive1 = shape_positive;
auto shape_negative1 = shape_negative;
auto broadcast_shape1 = broadcast_shape;
auto broadcast_shape2 = broadcast_shape;
BroadcastIterator iter1(shape_x1, shape_positive1, broadcast_shape1);
BroadcastIterator iter2(shape_x2, shape_negative1, broadcast_shape2);
iter1.SetPos(0);
iter2.SetPos(0);
for (int64_t i = 0; i < num_elements; i++) {
x_broadcast_tensor.push_back(x_data[iter1.GetInputPosA()]);
positive_broadcast_tensor.push_back(positive_data[iter1.GetInputPosB()]);
negative_broadcast_tensor.push_back(negative_data[iter2.GetInputPosB()]);
iter1.GenNextPos();
iter2.GenNextPos();
}
x_data = x_broadcast_tensor.data();
positive_data = positive_broadcast_tensor.data();
negative_data = negative_broadcast_tensor.data();
}
auto output_data = reinterpret_cast<T *>(output->GetData());
Eigen::Array<float, Eigen::Dynamic, 1> output_reduction_none(data_num_output_reduction_none, 1);
float *output_reduction_none_data = reinterpret_cast<float *>(output_reduction_none.data());
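  // Half-precision inputs are widened to float for the distance accumulation and cast back to
  // Eigen::half only when the reduced output is written, presumably to limit rounding error.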
auto shard_triplet_margin_loss = [&](int64_t start, int64_t end) {
Eigen::Array<float, Eigen::Dynamic, 1> calculate_positive_distance(once_compute_size, 1);
Eigen::Array<float, Eigen::Dynamic, 1> calculate_negative_distance(once_compute_size, 1);
Eigen::Array<float, Eigen::Dynamic, 1> calculate_swap_distance(once_compute_size, 1);
float *calculate_positive_distance_data = reinterpret_cast<float *>(calculate_positive_distance.data());
float *calculate_negative_distance_data = reinterpret_cast<float *>(calculate_negative_distance.data());
float *calculate_swap_distance_data = reinterpret_cast<float *>(calculate_swap_distance.data());
int64_t once_compute_thread_size = end - start;
float positive_distance;
float negative_distance;
float swap_distance;
float temp1;
float temp2;
float temp3;
if (data_num_each_batch_input == 0) {
KERNEL_LOG_ERROR("data_num_each_batch_input could not be 0.");
}
for (int64_t n = 0; n < (once_compute_thread_size) / data_num_each_batch_input; n++) {
int64_t i = start / data_num_each_batch_input;
for (int64_t j = 0; j < data_num_each_batch_output_reduction_none; j++) {
for (int64_t k = 0; k < once_compute_size; k++) {
*(calculate_positive_distance_data + k) =
eps_value + (static_cast<float>(*(x_data + i * data_num_each_batch_input + j +
k * data_num_each_batch_output_reduction_none)) -
static_cast<float>(*(positive_data + i * data_num_each_batch_input + j +
k * data_num_each_batch_output_reduction_none)));
*(calculate_negative_distance_data + k) =
eps_value + (static_cast<float>(*(x_data + i * data_num_each_batch_input + j +
k * data_num_each_batch_output_reduction_none)) -
static_cast<float>(*(negative_data + i * data_num_each_batch_input + j +
k * data_num_each_batch_output_reduction_none)));
if (swap_value == true) {
*(calculate_swap_distance_data + k) =
eps_value + (static_cast<float>(*(positive_data + i * data_num_each_batch_input + j +
k * data_num_each_batch_output_reduction_none)) -
static_cast<float>(*(negative_data + i * data_num_each_batch_input + j +
k * data_num_each_batch_output_reduction_none)));
}
}
calculate_positive_distance = (calculate_positive_distance).abs();
calculate_negative_distance = (calculate_negative_distance).abs();
for (int64_t n = 0; n < once_compute_size; n++) {
temp1 = *(calculate_positive_distance_data + n);
temp2 = *(calculate_negative_distance_data + n);
for (int64_t l = 1; l < p_value; l++) {
*(calculate_positive_distance_data + n) = *(calculate_positive_distance_data + n) * temp1;
*(calculate_negative_distance_data + n) = *(calculate_negative_distance_data + n) * temp2;
}
}
positive_distance = static_cast<float>(
std::pow(static_cast<double>(calculate_positive_distance.sum()), (1 / static_cast<float>(p_value))));
negative_distance = static_cast<float>(
std::pow(static_cast<double>(calculate_negative_distance.sum()), (1 / static_cast<float>(p_value))));
        if (broadcast == true) {
          if (x_reshape_vector[1] == 1 && positive_reshape_vector[1] == 1 && broadcast_shape[1] != 1) {
            positive_distance /= std::pow(broadcast_shape[1], (1 / static_cast<float>(p_value)));
          }
          if (x_reshape_vector[1] == 1 && negative_reshape_vector[1] == 1 && broadcast_shape[1] != 1) {
            negative_distance /= std::pow(broadcast_shape[1], (1 / static_cast<float>(p_value)));
          }
        }
if (swap_value == true) {
calculate_swap_distance = ((calculate_swap_distance)).abs();
for (int64_t n = 0; n < once_compute_size; n++) {
temp3 = *(calculate_swap_distance_data + n);
for (int64_t l = 1; l < p_value; l++) {
*(calculate_swap_distance_data + n) = *(calculate_swap_distance_data + n) * temp3;
}
}
swap_distance = static_cast<float>(
std::pow(static_cast<double>(calculate_swap_distance.sum()), (1 / static_cast<float>(p_value))));
          if (broadcast == true) {
            if (positive_reshape_vector[1] == 1 && negative_reshape_vector[1] == 1 && broadcast_shape[1] != 1) {
              swap_distance /= std::pow(broadcast_shape[1], (1 / static_cast<float>(p_value)));
            }
          }
negative_distance = (negative_distance < swap_distance) ? negative_distance : swap_distance;
}
*(output_reduction_none_data + data_num_each_batch_output_reduction_none * i + j) =
(positive_distance + margin_value - negative_distance > static_cast<float>(0))
? ((positive_distance + margin_value - negative_distance))
: static_cast<float>(0);
}
start += data_num_each_batch_input;
}
};
if (num_elements * sizeof(T) > kParallelDataNum) {
uint32_t min_core_num = 1;
uint32_t max_core_num = std::max(min_core_num, aicpu::CpuKernelUtils::GetCPUNum(ctx) - kResvCpuNum);
if (num_elements * sizeof(T) <= kParallelDataNumMid) {
max_core_num = std::min(max_core_num, 4U); // up to 4 cpu cores
}
if (max_core_num == 0) {
KERNEL_LOG_ERROR("max_core_num could not be 0.");
}
CpuKernelUtils::ParallelFor(ctx, num_elements,
data_num_each_batch_input * ADULT_AGE * (batch_size / max_core_num + 1),
shard_triplet_margin_loss);
} else {
Eigen::Array<float, Eigen::Dynamic, 1> calculate_positive_distance(once_compute_size, 1);
Eigen::Array<float, Eigen::Dynamic, 1> calculate_negative_distance(once_compute_size, 1);
Eigen::Array<float, Eigen::Dynamic, 1> calculate_swap_distance(once_compute_size, 1);
float *calculate_positive_distance_data = reinterpret_cast<float *>(calculate_positive_distance.data());
float *calculate_negative_distance_data = reinterpret_cast<float *>(calculate_negative_distance.data());
float *calculate_swap_distance_data = reinterpret_cast<float *>(calculate_swap_distance.data());
for (int64_t i = 0; i < batch_size; i++) {
for (int64_t j = 0; j < data_num_each_batch_output_reduction_none; j++) {
float positive_distance;
float negative_distance;
float swap_distance;
for (int64_t k = 0; k < once_compute_size; k++) {
*(calculate_positive_distance_data + k) =
eps_value + (static_cast<float>(*(x_data + i * data_num_each_batch_input + j +
k * data_num_each_batch_output_reduction_none)) -
static_cast<float>(*(positive_data + i * data_num_each_batch_input + j +
k * data_num_each_batch_output_reduction_none)));
*(calculate_negative_distance_data + k) =
eps_value + (static_cast<float>(*(x_data + i * data_num_each_batch_input + j +
k * data_num_each_batch_output_reduction_none)) -
static_cast<float>(*(negative_data + i * data_num_each_batch_input + j +
k * data_num_each_batch_output_reduction_none)));
if (swap_value == true) {
*(calculate_swap_distance_data + k) =
eps_value + (static_cast<float>(*(positive_data + i * data_num_each_batch_input + j +
k * data_num_each_batch_output_reduction_none)) -
static_cast<float>(*(negative_data + i * data_num_each_batch_input + j +
k * data_num_each_batch_output_reduction_none)));
}
}
calculate_positive_distance = (calculate_positive_distance).abs();
calculate_negative_distance = (calculate_negative_distance).abs();
float temp1;
float temp2;
float temp3;
for (int64_t n = 0; n < once_compute_size; n++) {
temp1 = *(calculate_positive_distance_data + n);
temp2 = *(calculate_negative_distance_data + n);
for (int64_t l = 1; l < p_value; l++) {
*(calculate_positive_distance_data + n) = *(calculate_positive_distance_data + n) * temp1;
*(calculate_negative_distance_data + n) = *(calculate_negative_distance_data + n) * temp2;
}
}
positive_distance = static_cast<float>(
std::pow(static_cast<double>(calculate_positive_distance.sum()), (1 / static_cast<float>(p_value))));
negative_distance = static_cast<float>(
std::pow(static_cast<double>(calculate_negative_distance.sum()), (1 / static_cast<float>(p_value))));
if (broadcast == true) {
if (x_reshape_vector[1] == 1 && positive_reshape_vector[1] == 1 && broadcast_shape[1] != 1) {
positive_distance /= std::pow(broadcast_shape[1], (1 / static_cast<float>(p_value)));
}
if (x_reshape_vector[1] == 1 && negative_reshape_vector[1] == 1 && broadcast_shape[1] != 1) {
negative_distance /= std::pow(broadcast_shape[1], (1 / static_cast<float>(p_value)));
}
}
if (swap_value == true) {
calculate_swap_distance = ((calculate_swap_distance)).abs();
for (int64_t n = 0; n < once_compute_size; n++) {
temp3 = *(calculate_swap_distance_data + n);
for (int64_t l = 1; l < p_value; l++) {
*(calculate_swap_distance_data + n) = *(calculate_swap_distance_data + n) * temp3;
}
}
swap_distance = static_cast<float>(
std::pow(static_cast<double>(calculate_swap_distance.sum()), (1 / static_cast<float>(p_value))));
if (broadcast == true) {
if (positive_reshape_vector[1] == 1 && negative_reshape_vector[1] == 1 && broadcast_shape[1] != 1) {
swap_distance /= std::pow(broadcast_shape[1], (1 / static_cast<float>(p_value)));
}
}
negative_distance = (negative_distance < swap_distance) ? negative_distance : swap_distance;
}
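// Hinge: the per-sample loss is max(d(anchor, positive) + margin - d(anchor, negative), 0).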
*(output_reduction_none_data + data_num_each_batch_output_reduction_none * i + j) =
(positive_distance + margin_value - negative_distance > static_cast<float>(0))
? ((positive_distance + margin_value - negative_distance))
: static_cast<float>(0);
}
}
}
if (reduction_value == "none") {
for (int64_t i = 0; i < data_num_output_reduction_none; i++) {
*(output_data + i) = static_cast<T>(*(output_reduction_none_data + i));
}
}
if (reduction_value == "mean") {
*(output_data) = static_cast<T>(output_reduction_none.mean());
}
if (reduction_value == "sum") {
*(output_data) = static_cast<T>(output_reduction_none.sum());
}
return KERNEL_STATUS_OK;
}
REGISTER_CPU_KERNEL(kTripletMarginLoss, TripletMarginLossCpuKernel);
} // namespace aicpu

View File

@ -0,0 +1,57 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef AICPU_KERNELS_NORMALIZED_TRIPLET_MARGIN_LOSS_H_
#define AICPU_KERNELS_NORMALIZED_TRIPLET_MARGIN_LOSS_H_
#include "cpu_ops_kernel.h"
namespace aicpu {
class TripletMarginLossCpuKernel : public CpuKernel {
public:
TripletMarginLossCpuKernel() = default;
~TripletMarginLossCpuKernel() override = default;
protected:
uint32_t Compute(CpuKernelContext &ctx) override;
private:
template <typename T>
static uint32_t TripletMarginLossComputeRealType(
CpuKernelContext &ctx, int p_value, float margin_value, float eps_value, bool swap_value,
std::string reduction_value, int64_t num_elements, int64_t data_num_output_reduction_none,
int64_t data_num_each_batch_input, int64_t data_num_each_batch_output_reduction_none, int64_t batch_size,
int64_t once_compute_size, bool broadcast, std::vector<int64_t> x_reshape_vector,
std::vector<int64_t> positive_reshape_vector, std::vector<int64_t> negative_reshape_vector);
template <typename T>
static uint32_t TripletMarginLossComputeRealTypeFloat16(
CpuKernelContext &ctx, int p_value, float margin_value, float eps_value, bool swap_value,
std::string reduction_value, int64_t num_elements, int64_t data_num_output_reduction_none,
int64_t data_num_each_batch_input, int64_t data_num_each_batch_output_reduction_none, int64_t batch_size,
int64_t once_compute_size, bool broadcast, std::vector<int64_t> x_reshape_vector,
std::vector<int64_t> positive_reshape_vector, std::vector<int64_t> negative_reshape_vector);
template <typename T>
static uint32_t TripletMarginLossComputeComplexType(
CpuKernelContext &ctx, int p_value, float margin_value, float eps_value, bool swap_value,
std::string reduction_value, int64_t num_elements, int64_t data_num_output_reduction_none,
int64_t data_num_each_batch_input, int64_t data_num_each_batch_output_reduction_none, int64_t batch_size,
int64_t once_compute_size, bool broadcast, std::vector<int64_t> x_reshape_vector,
std::vector<int64_t> positive_reshape_vector, std::vector<int64_t> negative_reshape_vector);
};
} // namespace aicpu
#endif  // AICPU_KERNELS_NORMALIZED_TRIPLET_MARGIN_LOSS_H_

View File

@ -0,0 +1,95 @@
/**
* Copyright (c) Huawei Technologies Co., Ltd. 2022. All right reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "triu_indices.h"
#include <Eigen/Dense>
#include <algorithm>
#include <iostream>
#include <map>
#include "cpu_kernel_utils.h"
#include "utils/eigen_tensor.h"
#include "utils/kernel_util.h"
namespace {
const char *kTriuIndices = "TriuIndices";
#define TRIU_INDICES_COMPUTE_CASE(DTYPE, TYPE, CTX) \
case (DTYPE): { \
uint32_t result = DoCompute<TYPE>(CTX); \
if (result != KERNEL_STATUS_OK) { \
KERNEL_LOG_ERROR("TriuIndices kernel compute failed."); \
return result; \
} \
break; \
}
} // namespace
namespace aicpu {
uint32_t TriuIndicesCpuKernel::Compute(CpuKernelContext &ctx) {
Tensor *output = ctx.Output(0);
KERNEL_CHECK_NULLPTR(output, KERNEL_STATUS_PARAM_INVALID, "Get output failed.")
auto data_type = ctx.Output(0)->GetDataType();
switch (data_type) {
TRIU_INDICES_COMPUTE_CASE(DT_INT32, int32_t, ctx)
TRIU_INDICES_COMPUTE_CASE(DT_INT64, int64_t, ctx)
default:
KERNEL_LOG_ERROR("TriuIndices kernel data type [%s] not support.", DTypeStr(data_type).c_str());
return KERNEL_STATUS_PARAM_INVALID;
}
return KERNEL_STATUS_OK;
}
template <typename T>
uint32_t TriuIndicesCpuKernel::DoCompute(CpuKernelContext &ctx) {
AttrValue *row_ptr = ctx.GetAttr("row");
AttrValue *col_ptr = ctx.GetAttr("col");
AttrValue *offset_ptr = ctx.GetAttr("offset");
int64_t row = row_ptr->GetInt();
int64_t col = col_ptr->GetInt();
int64_t offset = (offset_ptr == nullptr) ? 0 : (offset_ptr->GetInt());
auto offset1 = offset - 1;
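// Count the lower-triangle elements (diagonal offset - 1) as a trapezoid of row lengths from
// m_first_row to m_last_row, plus any full rows below it; the upper triangle is the remainder.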
auto m_first_row = offset1 > 0 ? std::min<int64_t>(col, 1 + offset1) : row + offset1 > 0;
auto m_last_row = std::max<int64_t>(0, std::min<int64_t>(col, row + offset1));
auto n_row_all = std::max<int64_t>(0, std::min<int64_t>(row, row + offset1));
auto n_row_trapezoid = (m_last_row - m_first_row + 1);
auto tril_size = (m_first_row + m_last_row) * n_row_trapezoid >> 1;
auto diff_row = n_row_all - n_row_trapezoid;
if (diff_row > 0) {
tril_size += diff_row * col;
}
auto triu_size = row * col - tril_size;
T *output{static_cast<T *>(ctx.Output(0)->GetData())};
int64_t i = 0;
int64_t c = std::max<int64_t>(0, offset), r = 0;
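// Walk the upper triangle in row-major order: the first triu_size outputs hold row indices,
// the next triu_size outputs hold the matching column indices.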
while (i < triu_size) {
output[i] = r;
output[triu_size + i++] = c;
c += 1;
if (c >= col) {
r += 1;
c = std::max<int64_t>(0, r + offset);
}
}
return KERNEL_STATUS_OK;
}
REGISTER_CPU_KERNEL(kTriuIndices, TriuIndicesCpuKernel);
} // namespace aicpu

View File

@ -0,0 +1,41 @@
/**
* Copyright (c) Huawei Technologies Co., Ltd. 2022. All right reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef AICPU_KERNELS_NORMALIZED_TRIU_INDICES_H_
#define AICPU_KERNELS_NORMALIZED_TRIU_INDICES_H_
#include "cpu_ops_kernel.h"
#include "cpu_types.h"
#include "utils/bcast.h"
#include "utils/sparse_tensor.h"
namespace aicpu {
class TriuIndicesCpuKernel : public CpuKernel {
public:
TriuIndicesCpuKernel() = default;
~TriuIndicesCpuKernel() override = default;
protected:
uint32_t Compute(CpuKernelContext &ctx) override;
private:
template <typename T>
uint32_t DoCompute(CpuKernelContext &ctx);
int32_t offset = 0;
int32_t offset1 = 0;
};
} // namespace aicpu
#endif

View File

@ -0,0 +1,209 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "unpack.h"
#include "utils/kernel_util.h"
namespace {
const char *kUnpack = "Unpack";
}
namespace aicpu {
uint32_t UnpackCpuKernel::CheckAndInitParams(CpuKernelContext &ctx) {
Tensor *value_ptr = ctx.Input(0);
KERNEL_CHECK_NULLPTR(value_ptr, KERNEL_STATUS_PARAM_INVALID, "Get input value failed.");
value_data_ptr = value_ptr->GetData();
KERNEL_CHECK_NULLPTR(value_data_ptr, KERNEL_STATUS_PARAM_INVALID, "Get input value data failed.");
auto value_shape_ptr = value_ptr->GetTensorShape();
KERNEL_CHECK_NULLPTR(value_shape_ptr, KERNEL_STATUS_PARAM_INVALID, "Get input value shape failed.");
int64_t value_dim = value_shape_ptr->GetDims();
AttrValue *unpack_axis_ptr = ctx.GetAttr("axis");
int64_t real_unpack_axis = 0;
KERNEL_CHECK_FALSE(unpack_axis_ptr, KERNEL_STATUS_PARAM_INVALID, "get axis failed!");
unpack_axis = unpack_axis_ptr->GetInt();
real_unpack_axis = unpack_axis >= 0 ? unpack_axis : unpack_axis + value_dim;
KERNEL_CHECK_FALSE(value_dim > real_unpack_axis, KERNEL_STATUS_PARAM_INVALID,
"The axis value range should be [-value_dim, value_dim), "
"value dim is [%d], axis is [%d].",
value_dim, unpack_axis);
unpack_axis = real_unpack_axis;
AttrValue *unpack_num_ptr = ctx.GetAttr("num");
KERNEL_CHECK_FALSE(unpack_num_ptr, KERNEL_STATUS_PARAM_INVALID, "get num failed!");
int64_t axis_size = value_shape_ptr->GetDimSize(unpack_axis);
unpack_num = unpack_num_ptr->GetInt();
KERNEL_CHECK_FALSE(unpack_num == axis_size, KERNEL_STATUS_PARAM_INVALID,
"The num you want to unpack to should be equal to the "
"size of the specified dimension. "
"The num you want to unpack to is [%d], while the [%d] "
"dim's size is [%d].",
unpack_num, unpack_axis, axis_size);
value_shape_vec = value_shape_ptr->GetDimSizes();
data_type = value_ptr->GetDataType();
value_num = value_ptr->NumElements();
output_ptr_vec.resize(unpack_num);
for (int64_t i = 0; i < unpack_num; i++) {
Tensor *output_ptr = ctx.Output(i);
KERNEL_CHECK_NULLPTR(output_ptr, KERNEL_STATUS_PARAM_INVALID, "Get output [%d] failed.", i);
auto output_data_ptr = output_ptr->GetData();
KERNEL_CHECK_NULLPTR(output_data_ptr, KERNEL_STATUS_PARAM_INVALID, "Get output data [%d] failed.", i);
output_ptr_vec[i] = output_data_ptr;
}
return KERNEL_STATUS_OK;
}
template <typename T>
uint32_t UnpackCpuKernel::UnpackWithOneOutput(T *input_data_ptr, std::vector<T *> output_data_vec) {
int64_t copy_size = value_num * sizeof(T);
auto mem_ret = memcpy_s(output_data_vec[0], copy_size, input_data_ptr, copy_size);
KERNEL_CHECK_FALSE((mem_ret == EOK), KERNEL_STATUS_PARAM_INVALID,
"Memcpy size[%zu] from input value to output[0] failed.", copy_size);
return KERNEL_STATUS_OK;
}
template <typename T>
uint32_t UnpackCpuKernel::UnpackWithDimZero(T *input_data_ptr, std::vector<T *> output_data_vec) {
KERNEL_CHECK_FALSE(value_shape_vec[0] > 0, KERNEL_STATUS_PARAM_INVALID, "The shape of input tensor is invalid.");
int64_t copy_num = value_num / value_shape_vec[0];
T *input_copy_ptr = input_data_ptr;
for (int64_t i = 0; i < unpack_num; i++) {
int64_t copy_size_per = copy_num;
int64_t copy_size = copy_size_per * sizeof(T);
auto mem_ret = memcpy_s(output_data_vec[i], copy_size, input_copy_ptr, copy_size);
KERNEL_CHECK_FALSE((mem_ret == EOK), KERNEL_STATUS_PARAM_INVALID,
"Memcpy size[%zu] from input value to output[%d] failed.", copy_size, i);
input_copy_ptr += copy_size_per;
}
return KERNEL_STATUS_OK;
}
template <typename T>
uint32_t UnpackCpuKernel::UnpackCompute(T *input_data_ptr, std::vector<T *> output_data_vec, CpuKernelContext &ctx) {
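// View the input as a 3-D tensor [prefix, midfix, subfix], where midfix is the size of the
// unpack axis; output i receives every slice taken at index i along that axis.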
int64_t prefix = 1;
for (uint64_t i = 0; i < unpack_axis; i++) {
KERNEL_CHECK_FALSE(value_shape_vec[i] > 0, KERNEL_STATUS_PARAM_INVALID, "The shape of input tensor is invalid.");
prefix *= value_shape_vec[i];
}
KERNEL_CHECK_FALSE(unpack_axis < value_shape_vec.size(), KERNEL_STATUS_PARAM_INVALID,
"input attr axis is invalid.");
int64_t midfix = value_shape_vec[unpack_axis];
int64_t subfix = 1;
for (size_t i = unpack_axis + 1; i < value_shape_vec.size(); i++) {
KERNEL_CHECK_FALSE(value_shape_vec[i] > 0, KERNEL_STATUS_PARAM_INVALID, "The shape of input tensor is invalid.");
subfix *= value_shape_vec[i];
}
uint32_t min_core_num = 1;
int64_t max_core_num = std::max(min_core_num, aicpu::CpuKernelUtils::GetCPUNum(ctx) - 2);
if (max_core_num > unpack_num) {
max_core_num = unpack_num;
}
auto shard_unpack = [&](size_t start, size_t end) {
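// Each output gathers prefix contiguous blocks of subfix elements, stepping over the other
// midfix - 1 slices (stride subfix * midfix) in the input between copies.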
int64_t offset = 0;
for (uint64_t i = start; i < end; i++) {
offset = i * subfix;
T *output_data_ptr = output_data_vec[i];
T *input_copy_ptr = input_data_ptr + offset;
int64_t copy_size = subfix * sizeof(T);
for (int64_t j = 0; j < prefix; j++) {
auto mem_ret = memcpy_s(output_data_ptr, copy_size, input_copy_ptr, copy_size);
KERNEL_CHECK_FALSE((mem_ret == EOK), KERNEL_STATUS_PARAM_INVALID,
"Memcpy size[%zu] from input value to output[%d] failed.", copy_size, i);
input_copy_ptr += (subfix * midfix);
output_data_ptr += subfix;
}
}
return KERNEL_STATUS_OK;
};
KERNEL_HANDLE_ERROR(CpuKernelUtils::ParallelFor(ctx, unpack_num, unpack_num / max_core_num, shard_unpack),
"Unpack Compute failed.");
return KERNEL_STATUS_OK;
}
template <typename T>
uint32_t UnpackCpuKernel::DoCompute(CpuKernelContext &ctx) {
T *input_data_ptr = reinterpret_cast<T *>(value_data_ptr);
std::vector<T *> output_data_vec;
output_data_vec.resize(unpack_num);
for (int64_t i = 0; i < unpack_num; i++) {
output_data_vec[i] = reinterpret_cast<T *>(output_ptr_vec[i]);
}
if (unpack_num == 1) {
KERNEL_CHECK_FALSE((UnpackWithOneOutput<T>(input_data_ptr, output_data_vec) == KERNEL_STATUS_OK),
KERNEL_STATUS_PARAM_INVALID, "UnpackWithOneOutput failed.");
return KERNEL_STATUS_OK;
}
if (unpack_axis == 0) {
KERNEL_CHECK_FALSE((UnpackWithDimZero<T>(input_data_ptr, output_data_vec) == KERNEL_STATUS_OK),
KERNEL_STATUS_PARAM_INVALID, "UnpackWithDimZero failed.");
return KERNEL_STATUS_OK;
}
KERNEL_CHECK_FALSE((UnpackCompute<T>(input_data_ptr, output_data_vec, ctx) == KERNEL_STATUS_OK),
KERNEL_STATUS_PARAM_INVALID, "Unpack Compute failed.");
return KERNEL_STATUS_OK;
}
uint32_t UnpackCpuKernel::Compute(CpuKernelContext &ctx) {
KERNEL_CHECK_FALSE((CheckAndInitParams(ctx) == KERNEL_STATUS_OK), KERNEL_STATUS_PARAM_INVALID,
"CheckAndInitParams failed.");
switch (data_type) {
case DT_FLOAT16:
return DoCompute<Eigen::half>(ctx);
case DT_FLOAT:
return DoCompute<float>(ctx);
case DT_DOUBLE:
return DoCompute<double>(ctx);
case DT_BOOL:
return DoCompute<bool>(ctx);
case DT_INT8:
return DoCompute<int8_t>(ctx);
case DT_INT16:
return DoCompute<int16_t>(ctx);
case DT_INT32:
return DoCompute<int32_t>(ctx);
case DT_INT64:
return DoCompute<int64_t>(ctx);
case DT_UINT8:
return DoCompute<uint8_t>(ctx);
case DT_UINT16:
return DoCompute<uint16_t>(ctx);
case DT_UINT32:
return DoCompute<uint32_t>(ctx);
case DT_UINT64:
return DoCompute<uint64_t>(ctx);
case DT_COMPLEX64:
return DoCompute<std::complex<float>>(ctx);
case DT_COMPLEX128:
return DoCompute<std::complex<double>>(ctx);
default:
KERNEL_LOG_ERROR("Unsupported data type [%s]", DTypeStr(data_type).c_str());
return KERNEL_STATUS_PARAM_INVALID;
}
}
REGISTER_CPU_KERNEL(kUnpack, UnpackCpuKernel);
} // namespace aicpu

View File

@ -0,0 +1,65 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef AICPU_KERNELS_NORMALIZED_UNPACK_H_
#define AICPU_KERNELS_NORMALIZED_UNPACK_H_
#include <memory>
#include <vector>
#include "cpu_types.h"
#include "utils/bcast.h"
#include "unsupported/Eigen/CXX11/Tensor"
#include "securec.h"
#include "cpu_ops_kernel.h"
#include "cpu_kernel_utils.h"
#include "kernel_log.h"
#include "status.h"
namespace aicpu {
class UnpackCpuKernel : public CpuKernel {
public:
UnpackCpuKernel() : data_type(DT_DOUBLE), unpack_axis(0), unpack_num(0), value_num(0), value_data_ptr(nullptr) {
output_ptr_vec.clear();
value_shape_vec.clear();
}
~UnpackCpuKernel() = default;
uint32_t Compute(CpuKernelContext &ctx) override;
private:
uint32_t CheckAndInitParams(CpuKernelContext &ctx);
template <typename T>
uint32_t UnpackWithOneOutput(T *input_data_ptr, std::vector<T *> output_data_vec);
template <typename T>
uint32_t UnpackWithDimZero(T *input_data_ptr, std::vector<T *> output_data_vec);
template <typename T>
uint32_t UnpackCompute(T *input_data_ptr, std::vector<T *> output_data_vec, CpuKernelContext &ctx);
template <typename T>
uint32_t DoCompute(CpuKernelContext &ctx);
private:
DataType data_type;
uint64_t unpack_axis;
int64_t unpack_num;
int64_t value_num;
void *value_data_ptr;
std::vector<void *> output_ptr_vec;
std::vector<int64_t> value_shape_vec;
};
} // namespace aicpu
#endif

View File

@ -0,0 +1,120 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "unravel_index.h"
#include "cpu_kernel_utils.h"
#include "utils/kernel_util.h"
namespace {
const char *KUnravelIndex = "UnravelIndex";
const uint32_t kOutputNum = 1;
const uint32_t kInputNum = 2;
const int64_t kParallelDataNumSameShape = 1000;
} // namespace
namespace aicpu {
uint32_t UnravelIndexCpuKernel::Compute(CpuKernelContext &ctx) {
auto data_type = ctx.Input(0)->GetDataType();
switch (data_type) {
case DT_INT32: {
KERNEL_HANDLE_ERROR(DataAndTypeCheck<int32_t>(ctx), " data or type check failed.");
UnravelCompute<int32_t>(ctx);
break;
}
case DT_INT64: {
KERNEL_HANDLE_ERROR(DataAndTypeCheck<int64_t>(ctx), " data or type check failed.");
UnravelCompute<int64_t>(ctx);
break;
}
default: {
KERNEL_LOG_ERROR("UnravelIndex kernel data type [%s] not support.", DTypeStr(data_type).c_str());
return KERNEL_STATUS_PARAM_INVALID;
}
}
return KERNEL_STATUS_OK;
}
template <typename T>
uint32_t UnravelIndexCpuKernel::DataAndTypeCheck(CpuKernelContext &ctx) {
KERNEL_HANDLE_ERROR(NormalCheck(ctx, kInputNum, kOutputNum), "Unravel_Index check input and output number failed.");
Tensor *indices = ctx.Input(0);
Tensor *dims = ctx.Input(1);
auto dims_number = ctx.Input(1)->NumElements();
auto indices_number = ctx.Input(0)->NumElements();
auto dims_data = reinterpret_cast<T *>(ctx.Input(1)->GetData());
auto indices_data = reinterpret_cast<T *>(ctx.Input(0)->GetData());
auto indices_type = indices->GetDataType();
auto dims_type = dims->GetDataType();
T dims_multi = 1;
KERNEL_CHECK_FALSE((indices_type == dims_type), KERNEL_STATUS_PARAM_INVALID,
"The data type of input0 [%s] need be same with "
"input1 [%s].",
DTypeStr(indices_type).c_str(), DTypeStr(dims_type).c_str())
for (auto i = 0; i < dims_number; i++) {
KERNEL_CHECK_FALSE((*(dims_data + i) > 0), KERNEL_STATUS_PARAM_INVALID, "Dimension number must be more than 0.")
dims_multi = dims_multi * (*(dims_data + i));
}
for (auto i = 0; i < indices_number; i++) {
KERNEL_CHECK_FALSE((*(indices_data + i) >= 0), KERNEL_STATUS_PARAM_INVALID, "Indices must be non-negative.")
KERNEL_CHECK_FALSE((*(indices_data + i) < dims_multi), KERNEL_STATUS_PARAM_INVALID,
"Index is out of bounds with respect to the given dims.");
}
return KERNEL_STATUS_OK;
}
template <typename T>
uint32_t UnravelIndexCpuKernel ::UnravelCompute(CpuKernelContext &ctx) {
auto indices_data = reinterpret_cast<T *>(ctx.Input(0)->GetData());
auto dims_data = reinterpret_cast<T *>(ctx.Input(1)->GetData());
auto output_data = reinterpret_cast<T *>(ctx.Output(0)->GetData());
auto dims_number = ctx.Input(1)->NumElements();
auto indices_number = ctx.Input(0)->NumElements();
auto data_num = indices_number;
if (data_num >= kParallelDataNumSameShape) {
uint32_t min_core_num = 1;
uint32_t max_core_num = std::max(min_core_num, aicpu::CpuKernelUtils::GetCPUNum(ctx));
if (max_core_num > data_num) {
max_core_num = data_num;
}
auto sharder_unravel_index = [&](size_t start, size_t end) {
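// Convert each flat index to coordinates by peeling dimensions from the innermost one outward:
// the remainder is the coordinate, the quotient carries on to the next dimension.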
for (auto j = start; j < end; j++) {
T Quotient = *(indices_data + j);
for (auto i = (dims_number - 1); i >= 0; i--) {
*(output_data + i + j * dims_number) = Quotient % *(dims_data + i);
Quotient = Quotient / *(dims_data + i);
}
}
};
KERNEL_HANDLE_ERROR(CpuKernelUtils::ParallelFor(ctx, data_num, data_num / max_core_num, sharder_unravel_index),
"Unravel Index Compute failed.");
} else {
for (auto j = 0; j < indices_number; j++) {
T Quotient = *(indices_data + j);
for (auto i = (dims_number - 1); i >= 0; i--) {
*(output_data + i + j * dims_number) = Quotient % *(dims_data + i);
Quotient = Quotient / *(dims_data + i);
}
}
}
return KERNEL_STATUS_OK;
}
REGISTER_CPU_KERNEL(KUnravelIndex, UnravelIndexCpuKernel);
} // namespace aicpu

View File

@ -0,0 +1,37 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef AICPU_KERNELS_NORMALIZED_UNRAVEL_INDEX_
#define AICPU_KERNELS_NORMALIZED_UNRAVEL_INDEX_
#include "cpu_ops_kernel.h"
namespace aicpu {
class UnravelIndexCpuKernel : public CpuKernel {
public:
~UnravelIndexCpuKernel() = default;
uint32_t Compute(CpuKernelContext &ctx) override;
private:
template <typename T>
uint32_t DataAndTypeCheck(CpuKernelContext &ctx);
template <typename T>
uint32_t UnravelCompute(CpuKernelContext &ctx);
};
} // namespace aicpu
#endif

View File

@ -0,0 +1,167 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "unsorted_segment_sum.h"
#include <string>
#include "cpu_kernel_utils.h"
#include "cpu_types.h"
#include "utils/eigen_tensor.h"
#include "utils/kernel_util.h"
namespace {
const char *kUnsortedSegmentSum = "UnsortedSegmentSum";
const uint32_t input_num = 3;
const uint32_t output_num = 1;
constexpr int64_t kParallelDataNums = 64 * 1024;
} // namespace
namespace aicpu {
template <typename input_t, typename segment_ids_t, typename num_segments_t>
uint32_t UnsortedSegmentSumCpuKernel::UnsortedSegmentSumComputeTemplate(CpuKernelContext &ctx) {
KERNEL_HANDLE_ERROR(NormalCheck(ctx, input_num, output_num), " node input size should be [%llu], get [%llu]",
input_num, ctx.GetInputsSize(), " node output size should be [%llu], get [%llu]", output_num,
ctx.GetOutputsSize());
if (ctx.Input(0)->GetDataType() != ctx.Output(0)->GetDataType()) {
KERNEL_LOG_ERROR("The data type of the input [%s] need be the same as the output [%s]",
DTypeStr(ctx.Input(0)->GetDataType()).c_str(), DTypeStr(ctx.Output(0)->GetDataType()).c_str());
return KERNEL_STATUS_PARAM_INVALID;
}
if (ctx.Input(0)->GetDataSize() != ctx.Output(0)->GetDataSize()) {
KERNEL_LOG_ERROR(
"The data size of the input [%llu] need be the same as the output "
"[%llu]",
ctx.Input(0)->GetDataSize(), ctx.Output(0)->GetDataSize());
return KERNEL_STATUS_PARAM_INVALID;
}
int64_t data_size = ctx.Input(0)->NumElements();
int64_t id_size = ctx.Input(1)->NumElements();
auto input_x = reinterpret_cast<input_t *>(ctx.Input(0)->GetData());
KERNEL_CHECK_NULLPTR(input_x, KERNEL_STATUS_PARAM_INVALID, "Get input data failed")
auto output_y = reinterpret_cast<input_t *>(ctx.Output(0)->GetData());
KERNEL_CHECK_NULLPTR(output_y, KERNEL_STATUS_PARAM_INVALID, "Get output data failed")
auto segmentids = reinterpret_cast<segment_ids_t *>(ctx.Input(1)->GetData());
KERNEL_CHECK_NULLPTR(segmentids, KERNEL_STATUS_PARAM_INVALID, "Get segment_ids failed")
auto numsegments = reinterpret_cast<num_segments_t *>(ctx.Input(2)->GetData());
KERNEL_CHECK_NULLPTR(numsegments, KERNEL_STATUS_PARAM_INVALID, "Get num_segments failed")
if (id_size <= 0) {
KERNEL_LOG_ERROR("segment_ids num elements should great than 0");
return KERNEL_STATUS_PARAM_INVALID;
}
int64_t reshapesize = data_size / id_size;
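// reshapesize is the number of elements in one inner slice; every segment id scatter-adds its
// slice of the input into the output row selected by that id.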
// Initialized to 0
memset(output_y, 0, ctx.Output(0)->GetDataSize());
if (data_size <= kParallelDataNums) {
// calculation process
for (int64_t i = 0; i < id_size; i++) {
if (*(segmentids + i) < *numsegments) {
for (int64_t j = 0; j < reshapesize; j++) {
*(output_y + *(segmentids + i) * reshapesize + j) += *(input_x + i * reshapesize + j);
}
}
}
} else {
uint32_t min_core_num = 1;
uint32_t max_core_num = std::max(min_core_num, aicpu::CpuKernelUtils::GetCPUNum(ctx) - 2);
if (max_core_num > reshapesize) {
max_core_num = reshapesize;
}
// calculation process
auto shard_unsorted_segment_sum = [&](int64_t start, int64_t end) {
for (int64_t i = 0; i < id_size; i++) {
if (*(segmentids + i) < *numsegments) {
for (int64_t j = start; j < end; j++) {
*(output_y + *(segmentids + i) * reshapesize + j) += *(input_x + i * reshapesize + j);
}
}
}
};
KERNEL_HANDLE_ERROR(
CpuKernelUtils::ParallelFor(ctx, reshapesize, reshapesize / max_core_num, shard_unsorted_segment_sum),
"CpuKernelUtils::ParallelFor failed.");
}
return KERNEL_STATUS_OK;
}
template <typename input_t, typename segment_ids_t>
uint32_t UnsortedSegmentSumCpuKernel::DoComputeWithNumSegmentsType(CpuKernelContext &ctx, DataType num_segments_type) {
switch (num_segments_type) {
case DT_INT32:
return UnsortedSegmentSumComputeTemplate<input_t, segment_ids_t, int32_t>(ctx);
case DT_INT64:
return UnsortedSegmentSumComputeTemplate<input_t, segment_ids_t, int64_t>(ctx);
default:
KERNEL_LOG_ERROR("UnsortedSegmentSum invalid num_segments_type type [%s]", DTypeStr(num_segments_type).c_str());
return KERNEL_STATUS_PARAM_INVALID;
}
}
template <typename input_t>
uint32_t UnsortedSegmentSumCpuKernel::DoComputeWithSegmentIdsType(CpuKernelContext &ctx, DataType segment_ids_type) {
auto num_segments_type = ctx.Input(2)->GetDataType();
switch (segment_ids_type) {
case DT_INT32:
return DoComputeWithNumSegmentsType<input_t, int32_t>(ctx, num_segments_type);
case DT_INT64:
return DoComputeWithNumSegmentsType<input_t, int64_t>(ctx, num_segments_type);
default:
KERNEL_LOG_ERROR("UnsortedSegmentSum invalid segment_ids_type type [%s]", DTypeStr(segment_ids_type).c_str());
return KERNEL_STATUS_PARAM_INVALID;
}
}
uint32_t UnsortedSegmentSumCpuKernel::Compute(CpuKernelContext &ctx) {
auto input_type = ctx.Input(0)->GetDataType();
auto segment_ids_type = ctx.Input(1)->GetDataType();
switch (input_type) {
case DT_INT32:
return DoComputeWithSegmentIdsType<int32_t>(ctx, segment_ids_type);
case DT_INT16:
return DoComputeWithSegmentIdsType<int16_t>(ctx, segment_ids_type);
case DT_FLOAT:
return DoComputeWithSegmentIdsType<float>(ctx, segment_ids_type);
case DT_DOUBLE:
return DoComputeWithSegmentIdsType<double>(ctx, segment_ids_type);
case DT_FLOAT16:
return DoComputeWithSegmentIdsType<Eigen::half>(ctx, segment_ids_type);
case DT_INT8:
return DoComputeWithSegmentIdsType<int8_t>(ctx, segment_ids_type);
case DT_INT64:
return DoComputeWithSegmentIdsType<int64_t>(ctx, segment_ids_type);
case DT_UINT8:
return DoComputeWithSegmentIdsType<uint8_t>(ctx, segment_ids_type);
case DT_UINT16:
return DoComputeWithSegmentIdsType<uint16_t>(ctx, segment_ids_type);
case DT_UINT32:
return DoComputeWithSegmentIdsType<uint32_t>(ctx, segment_ids_type);
case DT_UINT64:
return DoComputeWithSegmentIdsType<uint64_t>(ctx, segment_ids_type);
case DT_COMPLEX64:
return DoComputeWithSegmentIdsType<std::complex<float>>(ctx, segment_ids_type);
case DT_COMPLEX128:
return DoComputeWithSegmentIdsType<std::complex<double>>(ctx, segment_ids_type);
default:
KERNEL_LOG_ERROR("UnsortedSegmentSum invalid input type [%s]", DTypeStr(input_type).c_str());
return KERNEL_STATUS_PARAM_INVALID;
}
return KERNEL_STATUS_OK;
}
REGISTER_CPU_KERNEL(kUnsortedSegmentSum, UnsortedSegmentSumCpuKernel);
} // namespace aicpu

View File

@ -0,0 +1,38 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef AICPU_KERNELS_NORMALIZED_UNSORTED_SEGMENT_SUM_H
#define AICPU_KERNELS_NORMALIZED_UNSORTED_SEGMENT_SUM_H
#include "cpu_ops_kernel.h"
namespace aicpu {
class UnsortedSegmentSumCpuKernel : public CpuKernel {
public:
~UnsortedSegmentSumCpuKernel() = default;
uint32_t Compute(CpuKernelContext &ctx) override;
private:
template <typename input_t, typename segment_ids_t, typename num_segments_t>
uint32_t UnsortedSegmentSumComputeTemplate(CpuKernelContext &ctx);
template <typename input_t, typename segment_ids_t>
uint32_t DoComputeWithNumSegmentsType(CpuKernelContext &ctx, DataType num_segments_type);
template <typename input_t>
uint32_t DoComputeWithSegmentIdsType(CpuKernelContext &ctx, DataType segment_ids_type);
};
} // namespace aicpu
#endif

View File

@ -0,0 +1,153 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "upper_bound.h"
#include "cpu_kernel_utils.h"
#include "utils/eigen_tensor.h"
#include "utils/kernel_util.h"
namespace {
const uint32_t kInputNum = 2;
const uint32_t kOutputNum = 1;
const char *kUpperBound = "UpperBound";
#define UPPERBOUND_COMPUTE_CASE(DTYPE, TYPE1, TYPE2, CTX) \
case (DTYPE): { \
uint32_t result = UpperBoundCompute<TYPE1, TYPE2>(CTX); \
if (result != KERNEL_STATUS_OK) { \
KERNEL_LOG_ERROR("UpperBound kernel compute failed."); \
return result; \
} \
break; \
}
#define UPPERBOUND_COMPUTE_CASE_ALL(TYPE, CTX) \
UPPERBOUND_COMPUTE_CASE(DT_INT8, int8_t, TYPE, CTX) \
UPPERBOUND_COMPUTE_CASE(DT_INT16, int16_t, TYPE, CTX) \
UPPERBOUND_COMPUTE_CASE(DT_INT32, int32_t, TYPE, CTX) \
UPPERBOUND_COMPUTE_CASE(DT_INT64, int64_t, TYPE, CTX) \
UPPERBOUND_COMPUTE_CASE(DT_UINT8, uint8_t, TYPE, CTX) \
UPPERBOUND_COMPUTE_CASE(DT_UINT16, uint16_t, TYPE, CTX) \
UPPERBOUND_COMPUTE_CASE(DT_FLOAT16, Eigen::half, TYPE, CTX) \
UPPERBOUND_COMPUTE_CASE(DT_FLOAT, float, TYPE, CTX) \
UPPERBOUND_COMPUTE_CASE(DT_DOUBLE, double, TYPE, CTX)
} // namespace
namespace aicpu {
uint32_t UpperBoundCpuKernel::Compute(CpuKernelContext &ctx) {
KERNEL_HANDLE_ERROR(NormalCheck(ctx, kInputNum, kOutputNum), "UpperBound check input and output number failed.");
Tensor *sorted_x_data = ctx.Input(0);
Tensor *values_data = ctx.Input(1);
Tensor *output_data = ctx.Output(0);
auto output_type = output_data->GetDataType();
auto sorted_x_type = sorted_x_data->GetDataType();
auto values_type = values_data->GetDataType();
if (sorted_x_type != values_type) {
KERNEL_LOG_ERROR("Input[0] data type[%s] must be same with Input[1] data type[%s]", DTypeStr(sorted_x_type).c_str(),
DTypeStr(values_type).c_str());
return KERNEL_STATUS_PARAM_INVALID;
}
switch (output_type) {
case DT_INT32:
switch (sorted_x_type) {
UPPERBOUND_COMPUTE_CASE_ALL(int32_t, ctx)
default:
KERNEL_LOG_ERROR("Input data type[%s] not supported.", DTypeStr(sorted_x_type).c_str());
return KERNEL_STATUS_PARAM_INVALID;
}
break;
case DT_INT64:
switch (sorted_x_type) {
UPPERBOUND_COMPUTE_CASE_ALL(int64_t, ctx)
default:
KERNEL_LOG_ERROR("Input data type[%s] not supported.", DTypeStr(sorted_x_type).c_str());
return KERNEL_STATUS_PARAM_INVALID;
}
break;
default:
KERNEL_LOG_ERROR("Output data type[%s] not supported.", DTypeStr(output_type).c_str());
return KERNEL_STATUS_PARAM_INVALID;
}
return KERNEL_STATUS_OK;
}
template <typename T1, typename T2>
uint32_t UpperBoundCpuKernel::UpperBoundCompute(CpuKernelContext &ctx) {
Tensor *sorted_x_data = ctx.Input(0);
auto sorted_x_data_addr = reinterpret_cast<T1 *>(sorted_x_data->GetData());
auto sorted_x_data_shape = sorted_x_data->GetTensorShape();
std::vector<int64_t> sorted_x_data_shape_dims = sorted_x_data_shape->GetDimSizes();
Tensor *values_data = ctx.Input(1);
auto values_data_addr = reinterpret_cast<T1 *>(values_data->GetData());
auto values_data_shape = values_data->GetTensorShape();
int64_t values_data_num = values_data_shape->NumElements();
std::vector<int64_t> values_data_shape_dims = values_data_shape->GetDimSizes();
Tensor *output_data = ctx.Output(0);
auto output_data_addr = reinterpret_cast<T2 *>(output_data->GetData());
if (sorted_x_data_shape_dims[0] != values_data_shape_dims[0]) {
KERNEL_LOG_ERROR("The number of rows of Input[0]:([%d]) should be consistent with that of Input[1]:([%d]).",
sorted_x_data_shape_dims[0], values_data_shape_dims[0]);
return KERNEL_STATUS_PARAM_INVALID;
}
int64_t sorted_x_data_column = sorted_x_data_shape_dims[1];
int64_t values_data_column = values_data_shape_dims[1];
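// For every value, binary-search its row of sorted_x for the first element strictly greater
// than the value; the result is that position relative to the start of the row.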
if (values_data_num < 1024) {
for (int64_t i = 0; i < values_data_num; i++) {
int64_t seq_row = i / values_data_column;
int64_t low = seq_row * sorted_x_data_column;
int64_t up = (seq_row + 1) * sorted_x_data_column - 1;
int64_t mid;
while (low <= up) {
mid = (low + up) / 2;
if (values_data_addr[i] < sorted_x_data_addr[mid]) {
up = mid - 1;
} else {
low = mid + 1;
}
}
output_data_addr[i] = low - seq_row * sorted_x_data_column;
}
} else {
uint32_t min_core_num = 1;
int64_t sum_core_num = std::max(min_core_num, aicpu::CpuKernelUtils::GetCPUNum(ctx) - 2);
if (sum_core_num > values_data_num) {
sum_core_num = values_data_num;
}
auto shard_compute = [&](size_t start, size_t end) {
for (size_t i = start; i < end; i++) {
int64_t seq_row = i / values_data_column;
int64_t low = seq_row * sorted_x_data_column;
int64_t up = (seq_row + 1) * sorted_x_data_column - 1;
int64_t mid;
while (low <= up) {
mid = (low + up) / 2;
if (values_data_addr[i] < sorted_x_data_addr[mid]) {
up = mid - 1;
} else {
low = mid + 1;
}
}
output_data_addr[i] = low - seq_row * sorted_x_data_column;
}
};
KERNEL_HANDLE_ERROR(
CpuKernelUtils::ParallelFor(ctx, values_data_num, values_data_num / sum_core_num, shard_compute),
"UpperBound Compute failed.");
}
return KERNEL_STATUS_OK;
}
REGISTER_CPU_KERNEL(kUpperBound, UpperBoundCpuKernel);
} // namespace aicpu

View File

@ -0,0 +1,35 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef AICPU_KERNELS_NORMALIZED_UPPERBOUND_H_
#define AICPU_KERNELS_NORMALIZED_UPPERBOUND_H_
#include "cpu_ops_kernel.h"
namespace aicpu {
class UpperBoundCpuKernel : public CpuKernel {
public:
UpperBoundCpuKernel() = default;
~UpperBoundCpuKernel() override = default;
protected:
uint32_t Compute(CpuKernelContext &ctx) override;
private:
template <typename T1, typename T2>
static uint32_t UpperBoundCompute(CpuKernelContext &ctx);
};
} // namespace aicpu
#endif

View File

@ -0,0 +1,192 @@
/**
* Copyright (c) Huawei Technologies Co., Ltd. 2022. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "xdivy.h"
#include <complex>
#include "cmath"
#include "cpu_kernel_utils.h"
#include "utils/eigen_tensor.h"
#include "utils/kernel_util.h"
namespace {
const uint32_t kOutputNum = 1;
const uint32_t kInputNum = 2;
const char *kXdivy = "Xdivy";
const int64_t kParallelDataNum = 2 * 1024;
const int64_t kParallelDataNumMid = 16 * 1024;
const int64_t kParallelDataNumSameShape = 7 * 1024;
const int64_t kParallelDataNumSameShapeMid = 35 * 1024;
constexpr double EPSLON = 1e-15;
#define XDIVY_COMPUTE_CASE(DTYPE, TYPE, CTX) \
case (DTYPE): { \
uint32_t result = XdivyCompute<TYPE>(CTX); \
if (result != KERNEL_STATUS_OK) { \
KERNEL_LOG_ERROR("Xdivy kernel compute failed."); \
return result; \
} \
break; \
}
} // namespace
namespace aicpu {
uint32_t XdivyCpuKernel::Compute(CpuKernelContext &ctx) {
KERNEL_HANDLE_ERROR(NormalCheck(ctx, kInputNum, kOutputNum), "[%s] check input and output failed.", kXdivy);
BCalcInfo calc_info;
KERNEL_HANDLE_ERROR(XdivyParamCheck(ctx), "Xdivy check params failed.");
auto data_type = ctx.Input(0)->GetDataType();
switch (data_type) {
XDIVY_COMPUTE_CASE(DT_FLOAT16, Eigen::half, ctx)
XDIVY_COMPUTE_CASE(DT_FLOAT, float, ctx)
XDIVY_COMPUTE_CASE(DT_DOUBLE, double, ctx)
XDIVY_COMPUTE_CASE(DT_COMPLEX64, std::complex<float>, ctx)
XDIVY_COMPUTE_CASE(DT_COMPLEX128, std::complex<double>, ctx)
default:
KERNEL_LOG_ERROR("Xdivy kernel data type [%s] not support.", DTypeStr(data_type).c_str());
return KERNEL_STATUS_PARAM_INVALID;
}
return KERNEL_STATUS_OK;
}
uint32_t XdivyCpuKernel::XdivyParamCheck(CpuKernelContext &ctx) {
// the non null of input_0, input_1, output has been verified in NormalCheck
Tensor *input_0 = ctx.Input(0);
Tensor *input_1 = ctx.Input(1);
Tensor *output = ctx.Output(0);
DataType input0_type = input_0->GetDataType();
DataType input1_type = input_1->GetDataType();
KERNEL_CHECK_FALSE((input0_type == input1_type), KERNEL_STATUS_PARAM_INVALID,
"The data type of input0 [%s] need be same with "
"input1 [%s].",
DTypeStr(input0_type).c_str(), DTypeStr(input1_type).c_str())
KERNEL_LOG_DEBUG(
"XdivyCpuKernel[%s], input0: size[%llu];"
"input1: size[%llu], output: size[%llu].",
ctx.GetOpType().c_str(), input_0->GetDataSize(), input_1->GetDataSize(), output->GetDataSize());
return KERNEL_STATUS_OK;
}
template <typename T>
uint32_t XdivyCpuKernel::SpecialCompute(BcastShapeType type, int64_t start, int64_t end, CpuKernelContext &ctx) {
auto input1 = static_cast<T *>(ctx.Input(0)->GetData());
auto input2 = static_cast<T *>(ctx.Input(1)->GetData());
auto output = static_cast<T *>(ctx.Output(0)->GetData());
switch (type) {
case BcastShapeType::SAME_SHAPE:
for (int64_t i = start; i < end; ++i) {
*(output + i) = *(input1 + i) / *(input2 + i) + static_cast<T>(EPSLON);
}
break;
case BcastShapeType::X_ONE_ELEMENT:
for (int64_t i = start; i < end; ++i) {
*(output + i) = (*input1) / *(input2 + i) + static_cast<T>(EPSLON);
}
break;
case BcastShapeType::Y_ONE_ELEMENT:
for (int64_t i = start; i < end; ++i) {
*(output + i) = *(input1 + i) / (*input2) + static_cast<T>(EPSLON);
}
break;
default:
KERNEL_LOG_WARN("Invalid type [%d]", static_cast<int32_t>(type));
break;
}
return KERNEL_STATUS_OK;
}
template <typename T>
uint32_t XdivyCpuKernel::NoBcastCompute(CpuKernelContext &ctx) {
int64_t in0_elements_nums = ctx.Input(0)->NumElements();
int64_t in1_elements_nums = ctx.Input(1)->NumElements();
int64_t data_num = ctx.Output(0)->NumElements();
BcastShapeType type = in0_elements_nums == in1_elements_nums
? BcastShapeType::SAME_SHAPE
: (in0_elements_nums == 1 ? BcastShapeType::X_ONE_ELEMENT : BcastShapeType::Y_ONE_ELEMENT);
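// Three fast paths: identical shapes do a plain element-wise divide, while a single-element
// x or y is broadcast as a scalar against the other operand.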
if (data_num >= kParallelDataNumSameShape) {
uint32_t min_core_num = 1;
uint32_t max_core_num = std::max(min_core_num, aicpu::CpuKernelUtils::GetCPUNum(ctx) - kResvCpuNum);
if (data_num <= kParallelDataNumSameShapeMid) {
max_core_num = std::min(max_core_num, 4U); // up to 4 cpu cores
}
auto sharder_div = [&](int64_t start, int64_t end) { SpecialCompute<T>(type, start, end, ctx); };
if (max_core_num == 0) {
KERNEL_LOG_ERROR("max_core_num could not be 0.");
return KERNEL_STATUS_PARAM_INVALID;
}
KERNEL_HANDLE_ERROR(CpuKernelUtils::ParallelFor(ctx, data_num, data_num / max_core_num, sharder_div),
"Xdivy Compute failed.");
} else {
SpecialCompute<T>(type, 0, data_num, ctx);
}
return KERNEL_STATUS_OK;
}
template <typename T>
uint32_t XdivyCpuKernel::BcastCompute(CpuKernelContext &ctx, Bcast &bcast) {
auto in0 = static_cast<T *>(ctx.Input(0)->GetData());
auto in1 = static_cast<T *>(ctx.Input(1)->GetData());
auto out = static_cast<T *>(ctx.Output(0)->GetData());
int64_t data_num = ctx.Output(0)->NumElements();
if (data_num >= kParallelDataNum) {
uint32_t min_core_num = 1;
uint32_t max_core_num = std::max(min_core_num, aicpu::CpuKernelUtils::GetCPUNum(ctx) - 2);
if (data_num <= kParallelDataNumMid) {
max_core_num = std::min(max_core_num, 4U); // up to 4 cpu cores
}
auto sharder_div = [&](int64_t start, int64_t end) {
for (int64_t i = start; i < end; ++i) {
*(out + i) =
*(in0 + bcast.GetBroadcastXIndex(i)) / *(in1 + bcast.GetBroadcastYIndex(i)) + static_cast<T>(EPSLON);
}
};
if (max_core_num == 0) {
KERNEL_LOG_ERROR("max_core_num could not be 0.");
return KERNEL_STATUS_PARAM_INVALID;
}
KERNEL_HANDLE_ERROR(CpuKernelUtils::ParallelFor(ctx, data_num, data_num / max_core_num, sharder_div),
"Xdivy Compute failed.");
} else {
for (int64_t i = 0; i < data_num; ++i) {
*(out + i) = *(in0 + bcast.GetBroadcastXIndex(i)) / *(in1 + bcast.GetBroadcastYIndex(i)) + static_cast<T>(EPSLON);
}
}
return KERNEL_STATUS_OK;
}
template <typename T>
uint32_t XdivyCpuKernel::XdivyCompute(CpuKernelContext &ctx) {
Tensor *input0_tensor = ctx.Input(0);
auto input0_shape = input0_tensor->GetTensorShape()->GetDimSizes();
int64_t input0_elements_nums = input0_tensor->NumElements();
Tensor *input1_tensor = ctx.Input(1);
auto input1_shape = input1_tensor->GetTensorShape()->GetDimSizes();
int64_t input1_elements_nums = input1_tensor->NumElements();
bool noNeedBcast = (input0_shape == input1_shape) || (input0_elements_nums == 1) || (input1_elements_nums == 1);
if (noNeedBcast) {
return NoBcastCompute<T>(ctx);
} else {
Bcast bcast(input0_shape, input1_shape);
if (!bcast.IsValid()) {
KERNEL_LOG_ERROR("[%s] broadcast failed.", ctx.GetOpType().c_str());
return KERNEL_STATUS_PARAM_INVALID;
}
return BcastCompute<T>(ctx, bcast);
}
return KERNEL_STATUS_OK;
}
REGISTER_CPU_KERNEL(kXdivy, XdivyCpuKernel);
} // namespace aicpu

View File

@ -0,0 +1,52 @@
/**
* Copyright (c) Huawei Technologies Co., Ltd. 2022. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef AICPU_KERNELS_NORMALIZED_XDIVY_H_
#define AICPU_KERNELS_NORMALIZED_XDIVY_H_
#define EIGEN_USE_THREADS
#define EIGEN_USE_SIMPLE_THREAD_POOL
#include "cpu_ops_kernel.h"
#include "cpu_types.h"
#include "utils/bcast.h"
namespace aicpu {
class XdivyCpuKernel : public CpuKernel {
public:
XdivyCpuKernel() = default;
~XdivyCpuKernel() override = default;
protected:
uint32_t Compute(CpuKernelContext &ctx) override;
private:
uint32_t XdivyParamCheck(CpuKernelContext &ctx);
private:
template <typename T>
uint32_t SpecialCompute(BcastShapeType type, int64_t start, int64_t end, CpuKernelContext &ctx);
template <typename T>
uint32_t NoBcastCompute(CpuKernelContext &ctx);
template <typename T>
uint32_t BcastCompute(CpuKernelContext &ctx, Bcast &bcast);
template <typename T>
uint32_t XdivyCompute(CpuKernelContext &ctx);
};
} // namespace aicpu
#endif

View File

@ -0,0 +1,216 @@
/**
* Copyright (c) Huawei Technologies Co., Ltd. 2022. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "xlogy.h"
#include "cmath"
#include "cpu_kernel_utils.h"
#include "utils/eigen_tensor.h"
#include "utils/kernel_util.h"
namespace {
const uint32_t kOutputNum = 1;
const uint32_t kInputNum = 2;
const char *kXlogy = "Xlogy";
const int64_t kParallelDataNum = 8 * 1024;
const int64_t kParallelDataNumMid = 16 * 1024;
const int64_t kParallelDataNumSameShape = 7 * 1024;
const int64_t kParallelDataNumSameShapeMid = 35 * 1024;
#define XLOGY_COMPUTE_CASE(DTYPE, TYPE, CTX) \
case (DTYPE): { \
uint32_t result = XlogyCompute<TYPE>(CTX); \
if (result != KERNEL_STATUS_OK) { \
KERNEL_LOG_ERROR("Xlogy kernel compute failed."); \
return result; \
} \
break; \
}
} // namespace
namespace aicpu {
uint32_t XlogyCpuKernel::Compute(CpuKernelContext &ctx) {
KERNEL_HANDLE_ERROR(NormalCheck(ctx, kInputNum, kOutputNum), "[%s] check input and output failed.", kXlogy);
BCalcInfo calc_info;
KERNEL_HANDLE_ERROR(XlogyParamCheck(ctx), "Xlogy check params failed.");
auto data_type = ctx.Input(0)->GetDataType();
switch (data_type) {
XLOGY_COMPUTE_CASE(DT_FLOAT16, Eigen::half, ctx)
XLOGY_COMPUTE_CASE(DT_FLOAT, float, ctx)
XLOGY_COMPUTE_CASE(DT_DOUBLE, double, ctx)
default:
KERNEL_LOG_ERROR("Xlogy kernel data type [%s] not support.", DTypeStr(data_type).c_str());
return KERNEL_STATUS_PARAM_INVALID;
}
return KERNEL_STATUS_OK;
}
uint32_t XlogyCpuKernel::XlogyParamCheck(CpuKernelContext &ctx) {
// the non null of input_0, input_1, output has been verified in NormalCheck
Tensor *input_0 = ctx.Input(0);
Tensor *input_1 = ctx.Input(1);
Tensor *output = ctx.Output(0);
DataType input0_type = input_0->GetDataType();
DataType input1_type = input_1->GetDataType();
KERNEL_CHECK_FALSE((input0_type == input1_type), KERNEL_STATUS_PARAM_INVALID,
"The data type of input0 [%s] need be same with "
"input1 [%s].",
DTypeStr(input0_type).c_str(), DTypeStr(input1_type).c_str())
KERNEL_LOG_DEBUG(
"XlogyCpuKernel[%s], input0: size[%llu];"
"input1: size[%llu], output: size[%llu].",
ctx.GetOpType().c_str(), input_0->GetDataSize(), input_1->GetDataSize(), output->GetDataSize());
return KERNEL_STATUS_OK;
}
template <typename T>
uint32_t XlogyCpuKernel::SpecialCompute(BcastShapeType type, int64_t start, int64_t end, const T *input1,
const T *input2, T *output) {
auto zero = T(0);
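// xlogy semantics: x == 0 yields 0 regardless of y, a negative y yields NaN,
// otherwise the result is x * log(y).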
switch (type) {
case BcastShapeType::SAME_SHAPE:
for (int64_t i = start; i < end; ++i) {
if (*(input1 + i) == zero) {
*(output + i) = zero;
continue;
}
if (*(input2 + i) < zero) {
*(output + i) = std::numeric_limits<T>::quiet_NaN();
continue;
}
*(output + i) = *(input1 + i) * log(*(input2 + i));
}
break;
case BcastShapeType::X_ONE_ELEMENT:
for (int64_t i = start; i < end; ++i) {
if (*(input1) == zero) {
*(output + i) = zero;
continue;
}
if (*(input2 + i) < zero) {
*(output + i) = std::numeric_limits<T>::quiet_NaN();
continue;
}
*(output + i) = (*input1) * log(*(input2 + i));
}
break;
case BcastShapeType::Y_ONE_ELEMENT:
for (int64_t i = start; i < end; ++i) {
if (*(input1 + i) == zero) {
*(output + i) = zero;
continue;
}
if (*(input2) < zero) {
*(output + i) = std::numeric_limits<T>::quiet_NaN();
continue;
}
*(output + i) = *(input1 + i) * log(*(input2));
}
break;
default:
KERNEL_LOG_WARN("Invalid type [%d]", static_cast<int32_t>(type));
break;
}
return KERNEL_STATUS_OK;
}
template <typename T>
uint32_t XlogyCpuKernel::NoBcastCompute(CpuKernelContext &ctx) {
auto in0 = static_cast<T *>(ctx.Input(0)->GetData());
auto in1 = static_cast<T *>(ctx.Input(1)->GetData());
auto out = static_cast<T *>(ctx.Output(0)->GetData());
int64_t in0_elements_nums = ctx.Input(0)->NumElements();
int64_t in1_elements_nums = ctx.Input(1)->NumElements();
int64_t data_num = ctx.Output(0)->NumElements();
BcastShapeType type = in0_elements_nums == in1_elements_nums
? BcastShapeType::SAME_SHAPE
: (in0_elements_nums == 1 ? BcastShapeType::X_ONE_ELEMENT : BcastShapeType::Y_ONE_ELEMENT);
if (data_num >= kParallelDataNumSameShape) {
uint32_t min_core_num = 1;
uint32_t max_core_num = std::max(min_core_num, aicpu::CpuKernelUtils::GetCPUNum(ctx) - 2);
if (data_num <= kParallelDataNumSameShapeMid) {
max_core_num = std::min(max_core_num, 4U); // up to 4 cpu cores
}
auto sharder_div = [&](int64_t start, int64_t end) { SpecialCompute<T>(type, start, end, in0, in1, out); };
if (max_core_num == 0) {
KERNEL_LOG_ERROR("max_core_num could not be 0.");
return KERNEL_STATUS_PARAM_INVALID;
}
KERNEL_HANDLE_ERROR(CpuKernelUtils::ParallelFor(ctx, data_num, data_num / max_core_num, sharder_div),
"Xlogy Compute failed.");
} else {
SpecialCompute<T>(type, 0, data_num, in0, in1, out);
}
return KERNEL_STATUS_OK;
}
template <typename T>
uint32_t XlogyCpuKernel::BcastCompute(CpuKernelContext &ctx, Bcast &bcast) {
auto in0 = static_cast<T *>(ctx.Input(0)->GetData());
auto in1 = static_cast<T *>(ctx.Input(1)->GetData());
auto out = static_cast<T *>(ctx.Output(0)->GetData());
int64_t data_num = ctx.Output(0)->NumElements();
auto zero = T(0);
if (data_num >= kParallelDataNum) {
uint32_t min_core_num = 1;
uint32_t max_core_num = std::max(min_core_num, aicpu::CpuKernelUtils::GetCPUNum(ctx) - 2);
if (data_num <= kParallelDataNumMid) {
max_core_num = std::min(max_core_num, 4U); // up to 4 cpu cores
}
auto sharder_div = [&](int64_t start, int64_t end) {
for (int64_t i = start; i < end; ++i) {
*(out + i) = *(in1 + i) >= zero
? *(in0 + bcast.GetBroadcastXIndex(i)) * log(*(in1 + bcast.GetBroadcastYIndex(i)))
: std::numeric_limits<T>::quiet_NaN();
}
};
if (max_core_num == 0) {
KERNEL_LOG_ERROR("max_core_num could not be 0.");
return KERNEL_STATUS_PARAM_INVALID;
}
KERNEL_HANDLE_ERROR(CpuKernelUtils::ParallelFor(ctx, data_num, data_num / max_core_num, sharder_div),
"Xlogy Compute failed.");
} else {
for (int64_t i = 0; i < data_num; ++i) {
*(out + i) = *(in1 + i) >= zero ? *(in0 + bcast.GetBroadcastXIndex(i)) * log(*(in1 + bcast.GetBroadcastYIndex(i)))
: std::numeric_limits<T>::quiet_NaN();
}
}
return KERNEL_STATUS_OK;
}
template <typename T>
uint32_t XlogyCpuKernel::XlogyCompute(CpuKernelContext &ctx) {
Tensor *input0_tensor = ctx.Input(0);
auto input0_shape = input0_tensor->GetTensorShape()->GetDimSizes();
int64_t input0_elements_nums = input0_tensor->NumElements();
Tensor *input1_tensor = ctx.Input(1);
auto input1_shape = input1_tensor->GetTensorShape()->GetDimSizes();
int64_t input1_elements_nums = input1_tensor->NumElements();
bool noNeedBcast = (input0_shape == input1_shape) || (input0_elements_nums == 1) || (input1_elements_nums == 1);
if (noNeedBcast) {
return NoBcastCompute<T>(ctx);
} else {
Bcast bcast(input0_shape, input1_shape);
if (!bcast.IsValid()) {
KERNEL_LOG_ERROR("[%s] broadcast failed.", ctx.GetOpType().c_str());
return KERNEL_STATUS_PARAM_INVALID;
}
return BcastCompute<T>(ctx, bcast);
}
return KERNEL_STATUS_OK;
}
REGISTER_CPU_KERNEL(kXlogy, XlogyCpuKernel);
} // namespace aicpu

View File

@ -0,0 +1,52 @@
/**
* Copyright (c) Huawei Technologies Co., Ltd. 2022. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef AICPU_KERNELS_NORMALIZED_XLOGY_H_
#define AICPU_KERNELS_NORMALIZED_XLOGY_H_
#define EIGEN_USE_THREADS
#define EIGEN_USE_SIMPLE_THREAD_POOL
#include "cpu_ops_kernel.h"
#include "cpu_types.h"
#include "utils/bcast.h"
namespace aicpu {
class XlogyCpuKernel : public CpuKernel {
public:
XlogyCpuKernel() = default;
~XlogyCpuKernel() override = default;
protected:
uint32_t Compute(CpuKernelContext &ctx) override;
private:
uint32_t XlogyParamCheck(CpuKernelContext &ctx);
private:
template <typename T>
uint32_t SpecialCompute(BcastShapeType type, int64_t start, int64_t end, const T *input1, const T *input2, T *output);
template <typename T>
uint32_t NoBcastCompute(CpuKernelContext &ctx);
template <typename T>
uint32_t BcastCompute(CpuKernelContext &ctx, Bcast &bcast);
template <typename T>
uint32_t XlogyCompute(CpuKernelContext &ctx);
};
} // namespace aicpu
#endif

View File

@ -0,0 +1,61 @@
#ifndef AICPU_UTILS_SPARSE_DENSE_CWISE_UTILS_H_
#define AICPU_UTILS_SPARSE_DENSE_CWISE_UTILS_H_
#include "cpu_ops_kernel.h"
#include "utils/bcast.h"
#include "utils/eigen_tensor.h"
namespace aicpu {
struct AddOp {
static std::string Name() { return "Add"; }
};
struct DivOp {
static std::string Name() { return "Div"; }
};
struct MulOp {
static std::string Name() { return "Mul"; }
};
template <typename Op>
class SparseDenseCwiseOpKernel : public CpuKernel {
public:
SparseDenseCwiseOpKernel() = default;
~SparseDenseCwiseOpKernel() override = default;
protected:
virtual uint32_t Compute(CpuKernelContext &ctx) override = 0;
static uint32_t CheckParams(CpuKernelContext &ctx);
template <typename T>
uint32_t SparseDenseCwiseOpSpecialCompute(BcastShapeType type, CpuKernelContext &ctx);
template <typename T>
uint32_t SparseDenseCwiseOpSpecialComputeComplex(BcastShapeType type, CpuKernelContext &ctx);
template <typename T>
uint32_t ComputeOp(CpuKernelContext &ctx);
template <typename T>
uint32_t ComputeOpComplex(CpuKernelContext &ctx);
template <typename T>
uint32_t SparseDenseCwiseOpNoBcastCompute(CpuKernelContext &ctx);
template <typename T>
uint32_t SparseDenseCwiseOpNoBcastComputeComplex(CpuKernelContext &ctx);
template <typename T>
uint32_t SparseDenseCwiseOpBcastCompute(CpuKernelContext &ctx);
template <typename T>
uint32_t SparseDenseCwiseOpBcastComputeComplex(CpuKernelContext &ctx);
template <typename T>
uint32_t SparseDenseCwiseOpCompute(CpuKernelContext &ctx);
};
} // namespace aicpu
#endif
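For intuition, a small NumPy model of what these sparse-dense cwise kernels compute: the element-wise op is evaluated only at the sparse tensor's stored positions, with the dense operand read at the matching coordinates (shapes and values below are illustrative, not the kernel API):

import numpy as np

# COO sparse operand: indices into a 3x4 tensor plus the stored values.
indices = np.array([[0, 1], [1, 3], [2, 0]])            # one row per non-zero
values = np.array([2.0, 5.0, -1.0], dtype=np.float32)
dense = np.arange(12, dtype=np.float32).reshape(3, 4)   # dense operand of the same shape

# "Add" variant: apply the op only where the sparse tensor has stored values.
out_values = np.array([v + dense[tuple(ix)] for ix, v in zip(indices, values)],
                      dtype=np.float32)
print(out_values)  # values of the resulting sparse tensor; indices are unchanged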

View File

@ -67,6 +67,14 @@ class SparseTensor {
 * sparse eigen tensor indices valid
 * @return uint32_t: 0->success other->failed
 */
int dims() const { return dims_; }
std::shared_ptr<EigenTensor> indices() const { return ix_; }
std::shared_ptr<EigenTensor> values() const { return vals_; }
std::vector<int64_t> shape() const { return shape_; }
  template <typename T>
  uint32_t EigenTensorIndicesValidCheck(int64_t dims_size) const {
    const auto ix_t = ix_->matrix<T>();

View File

@ -78,9 +78,75 @@ const AnfNodePtr AICpuLibSelectPass::Process(const FuncGraphPtr &graph, const An
mindspore::kMaskedSelectOpName,
mindspore::kMaskedSelectGradOpName,
mindspore::kMedianOpName,
mindspore::kACosGradOpName,
mindspore::kAcoshGradOpName,
mindspore::kAdaptiveAvgPool3DOpName,
mindspore::kAdaptiveAvgPool3DGradOpName,
mindspore::kAdaptiveMaxPool2DGradOpName,
mindspore::kAdaptiveMaxPool3DOpName,
mindspore::kAdaptiveMaxPool3DGradOpName,
mindspore::kAddNOpName,
mindspore::kAddV2OpName,
mindspore::kAdjustContrastv2OpName,
mindspore::kAdjustHueOpName,
mindspore::kAdjustSaturationOpName,
mindspore::kAffineGridGradOpName,
mindspore::kAngleOpName,
mindspore::kArgmaxOpName,
mindspore::kArgMaxWithValueOpName,
mindspore::kArgMinOpName,
mindspore::kArgMinWithValueOpName,
mindspore::KAsinGradOpName,
mindspore::KAsinhGradOpName,
mindspore::kAvgPoolOpName,
mindspore::kAvgPoolGradOpName,
mindspore::kBartlettWindowOpName,
mindspore::kBatchNormGradGradOpName,
mindspore::kBiasAddOpName,
mindspore::kBiasAddGradOpName,
mindspore::kBincountOpName,
mindspore::kBlackmanWindowOpName,
mindspore::kBroadcastOpName,
mindspore::kMedianGradOpName,
mindspore::kNMSWithMaskOpName,
mindspore::kReduceSumOpName,
mindspore::kSpaceToDepthOpName,
mindspore::kSparseAddmmOpName,
mindspore::kSparseApplyAdagradDAOpName,
mindspore::kSparseApplyCenteredRMSPropOpName,
mindspore::kSparseApplyMomentumOpName,
mindspore::kSparseApplyProximalGradientDescentOpName,
mindspore::kSparseConcatOpName,
mindspore::kSparseDenseCwiseAddOpName,
mindspore::kSparseDenseCwiseDivOpName,
mindspore::kSparseDenseCwiseMulOpName,
mindspore::kSparseMatrixMatMulOpName,
mindspore::kSparseMatrixNNZOpName,
mindspore::kSparseMatrixTransposeOpName,
mindspore::kSparseFillEmptyRowsGradOpName,
mindspore::kSparseReshapeOpName,
mindspore::kSparseSegmentSqrtNGradOpName,
mindspore::kSparseSegmentSqrtNWithNumSegmentsOpName,
mindspore::kSparseSoftmaxCrossEntropyWithLogitsOpName,
mindspore::kSparseSparseMaximumOpName,
mindspore::kSparseSparseMinimumOpName,
mindspore::kSparseSegmentSumWithNumSegmentsOpName,
mindspore::kSplitOpName,
mindspore::kSqrtOpName,
mindspore::kSqrtGradOpName,
mindspore::kTanhOpName,
mindspore::kTileOpName,
mindspore::kTridiagonalMatMulOpName,
mindspore::kTripletMarginLossOpName,
mindspore::kTransposeOpName,
mindspore::kTriuIndicesOpName,
mindspore::kTrilIndicesOpName,
mindspore::kUnpackOpName,
mindspore::kUnravelIndexOpName,
mindspore::kUnsortedSegmentSumOpName,
mindspore::kUpperBoundOpName,
mindspore::kXlogyOpName,
mindspore::kXdivyOpName,
mindspore::kFFTWithSizeOpName,
mindspore::kHistogramDOpName,
mindspore::kIm2colOpName,

View File

@ -410,7 +410,7 @@ class _MindsporeFunctionExecutor:
# Case: If the shape of input args is dynamic, get dynamic shape tensor from context and use it to compile.
compile_args = args_list
# Case: The `set_inputs()` of Cell object has been set, using these dynamic shape args as compile args.
if isinstance(self.obj, ms.nn.Cell) and self.obj.get_inputs(): if self.fn.__name__ == 'construct' and isinstance(self.obj, ms.nn.Cell) and self.obj.get_inputs():
compile_args = self.obj.get_inputs()
for args in compile_args:
    Validator.check_isinstance("args set in `set_inputs()` of Cell", args, PythonTensor)
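The added `self.fn.__name__ == 'construct'` guard ensures the shapes registered via Cell.set_inputs() are only reused as compile args when `construct` itself is being jit-compiled. A hedged sketch of the set_inputs() pattern this code path serves (shapes illustrative):

import numpy as np
import mindspore as ms
from mindspore import Tensor, nn, ops

ms.set_context(mode=ms.GRAPH_MODE)

class Net(nn.Cell):
    def construct(self, x):
        return ops.ReLU()(x)

net = Net()
# Declare a dynamic first dimension; compilation keys off this symbolic shape.
net.set_inputs(Tensor(shape=[None, 3], dtype=ms.float32))
out = net(Tensor(np.ones((5, 3), np.float32)))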

View File

@ -13,9 +13,48 @@
# limitations under the License.
"""aicpu ops"""
from .adaptive_max_pool_3d_grad import _adaptive_max_pool_3d_grad_aicpu
from .adaptive_max_pool_3d import _adaptive_max_pool_3d_aicpu
from .adaptive_max_pool_2d_grad import _adaptive_max_pool_2d_grad_aicpu
from .adaptive_avg_pool_3d_grad import _adaptiveavgpool3d_grad_aicpu
from .adaptive_avg_pool_3d import _adaptiveavgpool3d_aicpu
from .tile import _tile_aicpu
from .tanh import _tanh_aicpu
from .space_to_depth import _space_to_depth_aicpu
from .sparse_matrix_transpose import _sparse_matrix_transpose_aicpu
from .sparse_matrix_nnz import _sparse_matrix_nnz_aicpu
from .sparse_matrix_mat_mul import _sparse_matrix_mat_mul_aicpu
from .sparse_dense_cwise_mul import _sparse_dense_cwise_mul_aicpu
from .sparse_dense_cwise_div import _sparse_dense_cwise_div_aicpu
from .sparse_dense_cwise_add import _sparse_dense_cwise_add_aicpu
from .sparse_concat import _sparse_concat_aicpu
from .sparse_apply_proximal_gradient_descent import _sparse_apply_proximal_gradient_descent_aicpu
from .sparse_apply_momentum import _sparse_apply_momentum_aicpu
from .sparse_apply_centered_rms_prop import _sparse_apply_centered_rms_prop_aicpu
from .sparse_apply_adagrad_da import _sparse_apply_adagrad_da_aicpu
from .sparseaddmm import _sparse_addmm_aicpu
from .broadcast_to import _broadcast_to_aicpu
from .blackman_window import _blackman_window_aicpu
from .bincount import _bincount_aicpu
from .asinh_grad import _asinh_grad_aicpu
from .unique import _unique_aicpu
from .add_n import _add_n_aicpu
from .add_v2 import _add_v2_aicpu
from .adjust_contrastv2 import _adjust_contrastv2_aicpu
from .adjust_hue import _adjust_hue_aicpu
from .adjust_saturation import _adjust_saturation_aicpu
from .affine_grid_grad import _affine_grid_grad_aicpu
from .angle import _angle_aicpu
from .arg_max import _arg_max_aicpu
from .argmax_with_value import _argmax_with_value_aicpu
from .arg_min import _arg_min_aicpu
from .argmin_with_value import _argmin_with_value_aicpu
from .avgpool_v1 import _avgpool_v1_aicpu
from .avgpool_grad_v1 import _avgpool_grad_v1_aicpu
from .matrix_solve import _matrix_solve_aicpu
from .betainc import _betainc_aicpu
from .bartlett_window import _bartlett_window_aicpu
from .batch_norm_grad_grad import _batch_norm_grad_grad_aicpu
from .no_repeat_ngram import _no_repeat_ngram_aicpu
from .init_data_set_queue import _init_data_set_queue_aicpu
from .embedding_lookup import _embedding_lookup_aicpu
@ -43,6 +82,7 @@ from .topk import _top_k_aicpu
from .tensor_scatter_update import _tensor_scatter_update_aicpu
from .log1p import _log1p_aicpu
from .asin import _asin_aicpu
from .asin_grad import _asin_grad_aicpu
from .is_finite import _is_finite_aicpu
from .is_inf import _is_inf_aicpu
from .is_nan import _is_nan_aicpu
@ -52,14 +92,18 @@ from .cosh import _cosh_aicpu
from .sign import _sign_aicpu
from .squeeze import _squeeze_aicpu
from .acos import _acos_aicpu
from .acos_grad import _acos_grad_aicpu
from .expand import _expand_aicpu
from .expand_dims import _expand_dims_aicpu
from .randperm import _randperm_aicpu
from .random_choice_with_mask import _random_choice_with_mask_aicpu
from .rsqrt import _rsqrt_aicpu
from .sqrt import _sqrt_aicpu
from .sqrt_grad import _sqrt_grad_aicpu
from .search_sorted import _search_sorted_aicpu
from .stack import _stack_aicpu
from .unstack import _unstack_aicpu
from .unsorted_segment_sum import _unsorted_segment_sum_aicpu
from .addcmul import _addcmul_aicpu
from .uniform_candidate_sampler import _uniform_candidate_sampler_aicpu
from .log_uniform_candidate_sampler import _log_uniform_candidate_sampler_aicpu
@ -69,6 +113,7 @@ from .reverse_sequence import _reverse_sequence_aicpu
from .log_matrix_determinant import _log_matrix_determinant_aicpu
from .crop_and_resize import _crop_and_resize_aicpu
from .acosh import _acosh_aicpu
from .acosh_grad import _acosh_grad_aicpu
from .rnnt_loss import _rnnt_loss_aicpu
from .random_categorical import _random_categorical_aicpu
from .tanh_grad import _tanh_grad_aicpu
@ -86,6 +131,7 @@ from .sub import _sub_aicpu
from .not_equal import _not_equal_aicpu
from .poisson import _poisson_aicpu
from .update_cache import _update_cache_aicpu
from .upper_bound import _upper_bound_aicpu
from .cache_swap_table import _cache_swap_table_aicpu
from .uniform_int import _uniform_int_aicpu
from .uniform_real import _uniform_real_aicpu
@ -97,6 +143,23 @@ from .end_of_sequence import _end_of_sequence_aicpu
from .fused_sparse_adam import _fused_sparse_adam_aicpu
from .fused_sparse_lazy_adam import _fused_sparse_lazy_adam_aicpu
from .fused_sparse_ftrl import _fused_sparse_ftrl_aicpu
from .sparse_fill_empty_rows_grad import _sparse_fill_empty_rows_grad_aicpu
from .sparse_reshape import _sparse_reshape_aicpu
from .sparse_segment_sqrt_n_grad import _sparse_segment_sqrt_n_grad_aicpu
from .sparse_segment_mean_with_num_segments import _sparse_segment_mean_with_num_segments_aicpu
from .sparse_segment_sum_with_num_segments import _sparse_segment_sum_with_num_segments_aicpu
from .sparse_softmax_cross_entropy_with_logits_v2 import _sparse_softmax_cross_entropy_with_logits_v2_aicpu
from .sparsesparsemaximum import _sparsesparsemaximum_aicpu
from .sparse_sparse_minimum import _sparse_sparse_minimum_aicpu
from .split import _split_aicpu
from .transpose import _transpose_aicpu
from .tridiagonal_matmul import _tridiagonal_matmul_aicpu
from .tril_indices import _tril_indices_aicpu
from .triu_indices import _triu_indices_aicpu
from .triplet_margin_loss import _triplet_margin_loss_aicpu
from .unravel_index import _unravel_index_aicpu
from .xlogy import _xlogy_aicpu
from .xdivy import _xdivy_aicpu
from .fused_sparse_proximal_adagrad import _fused_sparse_proximal_adagrad_aicpu
from .meshgrid import _meshgrid_aicpu
from .div import _div_aicpu